From e7e61bc5ffcd85f7e561209daaa9db6928797112 Mon Sep 17 00:00:00 2001 From: GeiserX <9169332+GeiserX@users.noreply.github.com> Date: Mon, 15 Jun 2026 19:23:04 +0200 Subject: [PATCH] fix: poll for admin approval instead of dying when no auth URL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A registration that comes back not-authorized with no interactive URL means the node holds a valid key and is awaiting admin approval on an approval-gated tailnet, a transient and recoverable state. It was being mapped to the generic Internal MachineAuthorization error, which the control runner check_auth loop caught in its catch-all terminal arm and turned into a permanent Failed AuthRejected that stopped the runner for good. On an approval-gated tailnet a freshly-registered node whose key control had not yet approved would die where Go comes up automatically once the admin approves. Surface the no-URL await-approval case as a distinct typed error and poll for approval, matching Go's NeedsMachineAuth to Starting auto transition with no re-registration. - ts_control Error gains a NeedsMachineAuth variant, distinct from Internal MachineAuthorization and from the URL-carrying MachineNotAuthorized. register.rs maps MachineNotAuthorized None to it. - ts_runtime DeviceState gains a transient NeedsMachineAuth variant treated like Connecting by wait_for_running and carrying no browse_to_url. The From conversion maps the control error to a non-permanent NetworkUnreachable. - The check_auth loop and the connect bring-up loop both gain a transient NeedsMachineAuth arm that publishes the state, polls 5s, and retries instead of stopping. A genuine Rejected bad-key still routes to the terminal Failed arm, so real auth failures are not turned into infinite polls. Signed-off-by: Sergio Fernández <9169332+GeiserX@users.noreply.github.com> --- ts_control/src/lib.rs | 10 +++++ ts_control/src/tokio/client.rs | 19 +++++---- ts_control/src/tokio/register.rs | 56 ++++++++++++++++++++++++-- ts_runtime/src/control_runner.rs | 36 +++++++++++++++++ ts_runtime/src/device_state.rs | 68 ++++++++++++++++++++++++++++++-- 5 files changed, 172 insertions(+), 17 deletions(-) diff --git a/ts_control/src/lib.rs b/ts_control/src/lib.rs index f2793fe..db93862 100644 --- a/ts_control/src/lib.rs +++ b/ts_control/src/lib.rs @@ -112,6 +112,16 @@ pub enum Error { #[error("machine was not authorized by control to join tailnet, authorize at {0}")] MachineNotAuthorized(url::Url), + /// A machine is not yet authorized and control offered **no** interactive auth URL — it is + /// awaiting admin approval on an approval-gated tailnet. **Transient and recoverable**: the node + /// holds a valid key and must poll-and-retry registration until an admin approves, then it comes + /// up with no re-registration (Go's `ipn.State::NeedsMachineAuth` → `Starting` auto-transition). + /// Distinct from [`Internal`](Self::Internal)`(MachineAuthorization, _)` so the control runner can + /// tell "awaiting approval" (poll) apart from a hard internal failure (stop), and from + /// [`MachineNotAuthorized`](Self::MachineNotAuthorized) which carries a URL for interactive login. + #[error("machine awaiting admin approval to join tailnet (no interactive auth URL)")] + NeedsMachineAuth, + /// The user supplied an invalid URL. #[error("invalid URL: {0}")] InvalidUrl(url::Url), diff --git a/ts_control/src/tokio/client.rs b/ts_control/src/tokio/client.rs index c472ce3..9daf020 100644 --- a/ts_control/src/tokio/client.rs +++ b/ts_control/src/tokio/client.rs @@ -471,8 +471,8 @@ fn reconnect_delay_after_poll( /// re-register clears the state for free (Go's `authRoutine` keeps `urlToVisit` and keeps polling). /// /// **Only `MachineNotAuthorized` sets the cell.** `MachineNotAuthorized(None)` (no auth URL on -/// offer) maps upstream to [`Error::Internal`]`(MachineAuthorization, _)`, not this variant, so it -/// correctly does *not* set a (nonexistent) URL. The write is sticky via +/// offer) maps upstream to [`Error::NeedsMachineAuth`], not this variant, so it correctly does +/// *not* set a (nonexistent) URL. The write is sticky via /// [`send_if_modified`](watch::Sender::send_if_modified): the cell is updated only when the URL /// actually differs from its current value, so a re-auth URL that persists across several failed /// re-register attempts does not thrash the cell or wake the runtime's bridge spuriously. @@ -1207,19 +1207,18 @@ mod tests { } /// `MachineNotAuthorized(None)` (control offered no auth URL) maps upstream to - /// `Error::Internal(MachineAuthorization, _)`, NOT `Error::MachineNotAuthorized`, so the helper - /// must leave the cell untouched. Built from the *exact* upstream mapping (register.rs - /// `From for Error`) so this stays honest if that mapping ever changes. + /// `Error::NeedsMachineAuth`, NOT `Error::MachineNotAuthorized`, so the helper must leave the + /// cell untouched (there is no URL to surface). Built from the *exact* upstream mapping + /// (register.rs `From for Error`) so this stays honest if that mapping ever + /// changes. #[test] fn machine_not_authorized_none_does_not_set_url_cell() { let (tx, rx) = watch::channel(None); let err = Error::from(crate::tokio::register::RegistrationError::MachineNotAuthorized(None)); - // Confirm the mapping is the non-URL internal variant (the precondition for the assertion). - assert!(matches!( - err, - Error::Internal(crate::InternalErrorKind::MachineAuthorization, _) - )); + // Confirm the mapping is the no-URL await-approval variant (the precondition for the + // assertion): it is the distinct `NeedsMachineAuth`, not a URL-carrying re-auth signal. + assert!(matches!(err, Error::NeedsMachineAuth)); surface_reauth_url(&err, &tx); diff --git a/ts_control/src/tokio/register.rs b/ts_control/src/tokio/register.rs index a5f2d3f..a6fc8dc 100644 --- a/ts_control/src/tokio/register.rs +++ b/ts_control/src/tokio/register.rs @@ -90,10 +90,13 @@ impl From for crate::Error { RegistrationError::MachineNotAuthorized(Some(u)) => { crate::Error::MachineNotAuthorized(u) } - RegistrationError::MachineNotAuthorized(None) => crate::Error::Internal( - crate::InternalErrorKind::MachineAuthorization, - crate::Operation::Registration, - ), + // "Not authorized, no URL, no error" = awaiting admin approval on an approval-gated + // tailnet (a TRANSIENT state — the node holds a valid key and must poll until approved, + // then comes up with no re-registration; Go's `NeedsMachineAuth`). Surface it as the + // distinct `NeedsMachineAuth` so the control runner polls instead of stopping, rather + // than collapsing it into the generic `Internal(MachineAuthorization, _)` (which the + // runner treats as a hard failure → terminal `Failed`). + RegistrationError::MachineNotAuthorized(None) => crate::Error::NeedsMachineAuth, RegistrationError::Rejected(msg) => crate::Error::Registration(msg), RegistrationError::RateLimited(d) => crate::Error::RateLimited(d), RegistrationError::Internal(k) => { @@ -478,4 +481,49 @@ mod tests { assert!(classify_register_response(&resp).is_ok()); } + + /// `MachineNotAuthorized(None)` — the await-admin-approval case — must map to the DISTINCT + /// `crate::Error::NeedsMachineAuth`, NOT the generic `Internal(MachineAuthorization, _)` (tsr-dvu). + /// This is what lets the control runner tell "awaiting approval, poll" apart from a hard internal + /// failure: the runner matches `NeedsMachineAuth` as a transient poll arm, while `Internal` would + /// fall into its terminal `Failed` arm and permanently stop the runner. + #[test] + fn machine_not_authorized_none_maps_to_needs_machine_auth() { + let err = crate::Error::from(RegistrationError::MachineNotAuthorized(None)); + assert!( + matches!(err, crate::Error::NeedsMachineAuth), + "MachineNotAuthorized(None) must map to the distinct NeedsMachineAuth, got {err:?}" + ); + // Specifically NOT the generic internal machine-auth error (the pre-tsr-dvu mapping that the + // runner treated as a hard failure). + assert!(!matches!( + err, + crate::Error::Internal(crate::InternalErrorKind::MachineAuthorization, _) + )); + } + + /// `MachineNotAuthorized(Some(url))` — the interactive-login case — is UNCHANGED by tsr-dvu: it + /// still maps to the URL-carrying `crate::Error::MachineNotAuthorized(url)` (the runner surfaces + /// `NeedsLogin`). + #[test] + fn machine_not_authorized_some_maps_to_machine_not_authorized_url() { + let url: Url = "https://login.example.com/a/abc123".parse().unwrap(); + let err = crate::Error::from(RegistrationError::MachineNotAuthorized(Some(url.clone()))); + assert_eq!(err, crate::Error::MachineNotAuthorized(url)); + } + + /// Regression: a hard `Rejected(msg)` (bad/expired/unknown key — what `classify_register_response` + /// returns for not-authorized WITH an error reason) maps to `crate::Error::Registration(msg)`, + /// which is DISTINCT from the transient `NeedsMachineAuth`. The runner routes `Registration` to + /// its terminal `Failed` arm, so a genuine auth failure still terminates — tsr-dvu must NOT turn + /// real rejections into infinite polls. + #[test] + fn rejected_maps_to_registration_not_needs_machine_auth() { + let err = crate::Error::from(RegistrationError::Rejected("invalid key".to_string())); + assert_eq!(err, crate::Error::Registration("invalid key".to_string())); + assert!( + !matches!(err, crate::Error::NeedsMachineAuth), + "a hard rejection must NOT be the transient await-approval variant" + ); + } } diff --git a/ts_runtime/src/control_runner.rs b/ts_runtime/src/control_runner.rs index 3fb81d6..1f57b60 100644 --- a/ts_runtime/src/control_runner.rs +++ b/ts_runtime/src/control_runner.rs @@ -204,6 +204,23 @@ impl kameo::Actor for ControlRunner { .send_replace(crate::DeviceState::NeedsLogin(u.clone())); tokio::time::sleep(Duration::from_secs(5)).await; } + Err(ControlError::NeedsMachineAuth) => { + // The node is registered with a valid key but awaiting ADMIN APPROVAL on an + // approval-gated tailnet, and control offered NO interactive URL. This is + // TRANSIENT (Go's `NeedsMachineAuth`): poll registration until an admin approves, + // then `check_auth` returns `Ok(())` → the loop breaks and the node comes up with + // no re-registration. Publishing the (no-URL) `NeedsMachineAuth` state — NOT a + // terminal `Failed` and NOT `NeedsLogin` (there is no URL to open) — lets a + // watcher / `wait_until_running` see "awaiting approval" instead of an opaque + // timeout. Same 5s poll cadence as the `MachineNotAuthorized(url)` arm. + tracing::info!( + "machine awaiting admin approval to join the tailnet; polling until approved" + ); + params + .state_tx + .send_replace(crate::DeviceState::NeedsMachineAuth); + tokio::time::sleep(Duration::from_secs(5)).await; + } Err(ControlError::RateLimited(retry_after)) => { // Control asked us to slow down (HTTP 429). Wait exactly the server-requested // cooldown and retry — this is transient, NOT a terminal `Failed`, so we must @@ -278,6 +295,25 @@ impl kameo::Actor for ControlRunner { ); tokio::time::sleep(retry_after).await; } + Err(ControlRunnerError::Control(ControlError::NeedsMachineAuth)) => { + // `connect` issues its OWN `machine/register` POST (a second one after + // `check_auth`'s), so it too can come back "awaiting admin approval, no URL". In + // the normal flow the `check_auth` loop above already gated this (it only breaks + // once registration returns `Ok`, and approval is monotonic), so this arm is the + // defensive twin for a node de-authorized in the race window between the two POSTs: + // treat it as TRANSIENT exactly like the `check_auth` arm — publish the (no-URL) + // `NeedsMachineAuth` state, poll the same 5s, and retry the bring-up — rather than + // collapsing into the terminal `Failed` arm below (which would permanently stop + // the runner on a recoverable await-approval). Mirrors Go's `NeedsMachineAuth`. + tracing::info!( + "machine awaiting admin approval during session bring-up; polling until \ + approved" + ); + params + .state_tx + .send_replace(crate::DeviceState::NeedsMachineAuth); + tokio::time::sleep(Duration::from_secs(5)).await; + } Err(e) => { tracing::error!(error = %e, "bringing up the control session failed"); // The control session never came up; surface it as a transient registration diff --git a/ts_runtime/src/device_state.rs b/ts_runtime/src/device_state.rs index 3fe71ae..eeddc2f 100644 --- a/ts_runtime/src/device_state.rs +++ b/ts_runtime/src/device_state.rs @@ -30,6 +30,15 @@ pub enum DeviceState { /// Control requires interactive authentication (no usable auth key): the node is waiting for a /// human to authorize it at the carried URL. Transient — registration retries until authorized. NeedsLogin(url::Url), + /// The node is registered with a valid key but awaiting **admin approval** on an approval-gated + /// tailnet, and control offered **no** interactive auth URL (so, unlike + /// [`NeedsLogin`](Self::NeedsLogin), there is nothing for a human to open — an admin must approve + /// the node out of band). **Transient** — treated like [`Connecting`](Self::Connecting) by the + /// waiters ([`wait_until_running`](crate::Runtime::wait_until_running) keeps waiting, never + /// settling on it); the runtime polls registration and, once an admin approves, auto-transitions + /// to [`Running`](Self::Running) with no re-registration (Go's `ipn.State::NeedsMachineAuth` → + /// `Starting`). No `browse_to_url` is derived from it (there is no URL). + NeedsMachineAuth, /// The node key expired and an automatic, non-interactive re-authentication is in progress: the /// runtime is rotating the node key and re-registering with the stored auth key (Go `doLogin`). /// **Transient** — treated like [`Connecting`](Self::Connecting) by the waiters @@ -116,6 +125,13 @@ impl From<&ts_control::Error> for RegistrationError { // is reached; classifying it as `NetworkUnreachable` here keeps any other caller of this // conversion on the correct (non-permanent, retry-may-succeed) branch. ts_control::Error::RateLimited(_) => RegistrationError::NetworkUnreachable, + // "Awaiting admin approval, no URL" is **transient** — the node holds a valid key and the + // runtime polls until an admin approves, then comes up (Go `NeedsMachineAuth → Starting`), + // so it must NOT become a permanent `AuthRejected`. The control runner's `check_auth` and + // `connect` loops already intercept `NeedsMachineAuth` and poll before this mapping is + // reached; classifying it as `NetworkUnreachable` here keeps any other caller of this + // conversion on the correct (non-permanent, retry-may-succeed) branch. + ts_control::Error::NeedsMachineAuth => RegistrationError::NetworkUnreachable, // InvalidUrl / Internal: not a transient network condition and not an auth decision — // treat as a (permanent-ish) auth rejection carrying the display reason so the caller // sees something actionable rather than an opaque "timeout". @@ -143,9 +159,13 @@ pub(crate) async fn wait_for_running( DeviceState::Failed(e) => Some(Err(e.clone())), DeviceState::Expired => Some(Err(RegistrationError::KeyExpired)), DeviceState::NeedsLogin(u) => Some(Err(RegistrationError::NeedsLogin(u.clone()))), - // Transient, like `Connecting`: an auto-reauth is in flight and the next good - // self-node flips back to `Running`, so keep waiting rather than settling. - DeviceState::Connecting | DeviceState::Reauthenticating => None, + // Transient, like `Connecting`: keep waiting rather than settling. `Reauthenticating` + // — an auto-reauth is in flight and the next good self-node flips back to `Running`. + // `NeedsMachineAuth` — awaiting admin approval (no URL); the runtime polls and + // auto-transitions to `Running` once approved (Go `NeedsMachineAuth → Starting`). + DeviceState::Connecting + | DeviceState::Reauthenticating + | DeviceState::NeedsMachineAuth => None, }; if let Some(result) = settled { return result; @@ -216,6 +236,16 @@ mod tests { !rl.is_permanent(), "a rate-limit must be a transient (non-permanent) failure" ); + // "Awaiting admin approval, no URL" (tsr-dvu) is TRANSIENT and must map to a non-permanent + // state, never the `AuthRejected` catch-all (which would wrongly mark an approval-gated node + // as a hard failure). Pins the explicit arm: if a refactor drops it and lets `NeedsMachineAuth` + // fall into `other => AuthRejected`, this fails. + let nma = RegistrationError::from(&ts_control::Error::NeedsMachineAuth); + assert_eq!(nma, RegistrationError::NetworkUnreachable); + assert!( + !nma.is_permanent(), + "awaiting admin approval must be a transient (non-permanent) failure" + ); } // --- wait_for_running loop --- @@ -308,6 +338,38 @@ mod tests { ); } + /// `NeedsMachineAuth` is transient (a waiter never settles on it): like `Connecting`, it times + /// out rather than resolving to a terminal error, because the runtime polls registration and the + /// next good self-node flips the state back to `Running` once an admin approves. This is the + /// behavioral guard for tsr-dvu: an approval-gated node awaiting admin approval never surfaces as + /// a permanent failure (the bug was it dying terminally instead of polling). + #[tokio::test] + async fn wait_does_not_settle_on_needs_machine_auth() { + let (_tx, rx) = watch::channel(DeviceState::NeedsMachineAuth); + assert_eq!( + wait_for_running(rx, Some(Duration::from_millis(30))).await, + Err(RegistrationError::Timeout), + "NeedsMachineAuth is transient — a waiter keeps waiting, it does not settle" + ); + } + + /// The full await-approval recovery as a waiter sees it: `NeedsMachineAuth` (awaiting admin + /// approval) → `Running` (the admin approved; `check_auth` returned `Ok`, the netmap self-node + /// arrived) resolves `Ok(())`. Proves the transient state is observed and then recovered with no + /// re-registration — Go's `NeedsMachineAuth → Starting → Running` auto-transition. + #[tokio::test] + async fn wait_resolves_on_needs_machine_auth_then_running() { + let (tx, rx) = watch::channel(DeviceState::NeedsMachineAuth); + tokio::spawn(async move { + tokio::time::sleep(Duration::from_millis(20)).await; + tx.send_replace(DeviceState::Running); + }); + assert_eq!( + wait_for_running(rx, Some(Duration::from_secs(1))).await, + Ok(()) + ); + } + /// If the sender is dropped while still `Connecting`, the wait ends as `NetworkUnreachable` /// rather than hanging forever. #[tokio::test]