Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions ts_control/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,16 @@ pub enum Error {
#[error("machine was not authorized by control to join tailnet, authorize at {0}")]
MachineNotAuthorized(url::Url),

/// A machine is not yet authorized and control offered **no** interactive auth URL — it is
/// awaiting admin approval on an approval-gated tailnet. **Transient and recoverable**: the node
/// holds a valid key and must poll-and-retry registration until an admin approves, then it comes
/// up with no re-registration (Go's `ipn.State::NeedsMachineAuth` → `Starting` auto-transition).
/// Distinct from [`Internal`](Self::Internal)`(MachineAuthorization, _)` so the control runner can
/// tell "awaiting approval" (poll) apart from a hard internal failure (stop), and from
/// [`MachineNotAuthorized`](Self::MachineNotAuthorized) which carries a URL for interactive login.
#[error("machine awaiting admin approval to join tailnet (no interactive auth URL)")]
NeedsMachineAuth,

/// The user supplied an invalid URL.
#[error("invalid URL: {0}")]
InvalidUrl(url::Url),
Expand Down
19 changes: 9 additions & 10 deletions ts_control/src/tokio/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -471,8 +471,8 @@ fn reconnect_delay_after_poll(
/// re-register clears the state for free (Go's `authRoutine` keeps `urlToVisit` and keeps polling).
///
/// **Only `MachineNotAuthorized` sets the cell.** `MachineNotAuthorized(None)` (no auth URL on
/// offer) maps upstream to [`Error::Internal`]`(MachineAuthorization, _)`, not this variant, so it
/// correctly does *not* set a (nonexistent) URL. The write is sticky via
/// offer) maps upstream to [`Error::NeedsMachineAuth`], not this variant, so it correctly does
/// *not* set a (nonexistent) URL. The write is sticky via
/// [`send_if_modified`](watch::Sender::send_if_modified): the cell is updated only when the URL
/// actually differs from its current value, so a re-auth URL that persists across several failed
/// re-register attempts does not thrash the cell or wake the runtime's bridge spuriously.
Expand Down Expand Up @@ -1207,19 +1207,18 @@ mod tests {
}

/// `MachineNotAuthorized(None)` (control offered no auth URL) maps upstream to
/// `Error::Internal(MachineAuthorization, _)`, NOT `Error::MachineNotAuthorized`, so the helper
/// must leave the cell untouched. Built from the *exact* upstream mapping (register.rs
/// `From<RegistrationError> for Error`) so this stays honest if that mapping ever changes.
/// `Error::NeedsMachineAuth`, NOT `Error::MachineNotAuthorized`, so the helper must leave the
/// cell untouched (there is no URL to surface). Built from the *exact* upstream mapping
/// (register.rs `From<RegistrationError> for Error`) so this stays honest if that mapping ever
/// changes.
#[test]
fn machine_not_authorized_none_does_not_set_url_cell() {
let (tx, rx) = watch::channel(None);
let err =
Error::from(crate::tokio::register::RegistrationError::MachineNotAuthorized(None));
// Confirm the mapping is the non-URL internal variant (the precondition for the assertion).
assert!(matches!(
err,
Error::Internal(crate::InternalErrorKind::MachineAuthorization, _)
));
// Confirm the mapping is the no-URL await-approval variant (the precondition for the
// assertion): it is the distinct `NeedsMachineAuth`, not a URL-carrying re-auth signal.
assert!(matches!(err, Error::NeedsMachineAuth));

surface_reauth_url(&err, &tx);

Expand Down
56 changes: 52 additions & 4 deletions ts_control/src/tokio/register.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,13 @@ impl From<RegistrationError> for crate::Error {
RegistrationError::MachineNotAuthorized(Some(u)) => {
crate::Error::MachineNotAuthorized(u)
}
RegistrationError::MachineNotAuthorized(None) => crate::Error::Internal(
crate::InternalErrorKind::MachineAuthorization,
crate::Operation::Registration,
),
// "Not authorized, no URL, no error" = awaiting admin approval on an approval-gated
// tailnet (a TRANSIENT state — the node holds a valid key and must poll until approved,
// then comes up with no re-registration; Go's `NeedsMachineAuth`). Surface it as the
// distinct `NeedsMachineAuth` so the control runner polls instead of stopping, rather
// than collapsing it into the generic `Internal(MachineAuthorization, _)` (which the
// runner treats as a hard failure → terminal `Failed`).
RegistrationError::MachineNotAuthorized(None) => crate::Error::NeedsMachineAuth,
RegistrationError::Rejected(msg) => crate::Error::Registration(msg),
RegistrationError::RateLimited(d) => crate::Error::RateLimited(d),
RegistrationError::Internal(k) => {
Expand Down Expand Up @@ -478,4 +481,49 @@ mod tests {

assert!(classify_register_response(&resp).is_ok());
}

/// `MachineNotAuthorized(None)` — the await-admin-approval case — must map to the DISTINCT
/// `crate::Error::NeedsMachineAuth`, NOT the generic `Internal(MachineAuthorization, _)` (tsr-dvu).
/// This is what lets the control runner tell "awaiting approval, poll" apart from a hard internal
/// failure: the runner matches `NeedsMachineAuth` as a transient poll arm, while `Internal` would
/// fall into its terminal `Failed` arm and permanently stop the runner.
#[test]
fn machine_not_authorized_none_maps_to_needs_machine_auth() {
let err = crate::Error::from(RegistrationError::MachineNotAuthorized(None));
assert!(
matches!(err, crate::Error::NeedsMachineAuth),
"MachineNotAuthorized(None) must map to the distinct NeedsMachineAuth, got {err:?}"
);
// Specifically NOT the generic internal machine-auth error (the pre-tsr-dvu mapping that the
// runner treated as a hard failure).
assert!(!matches!(
err,
crate::Error::Internal(crate::InternalErrorKind::MachineAuthorization, _)
));
}

/// `MachineNotAuthorized(Some(url))` — the interactive-login case — is UNCHANGED by tsr-dvu: it
/// still maps to the URL-carrying `crate::Error::MachineNotAuthorized(url)` (the runner surfaces
/// `NeedsLogin`).
#[test]
fn machine_not_authorized_some_maps_to_machine_not_authorized_url() {
let url: Url = "https://login.example.com/a/abc123".parse().unwrap();
let err = crate::Error::from(RegistrationError::MachineNotAuthorized(Some(url.clone())));
assert_eq!(err, crate::Error::MachineNotAuthorized(url));
}

/// Regression: a hard `Rejected(msg)` (bad/expired/unknown key — what `classify_register_response`
/// returns for not-authorized WITH an error reason) maps to `crate::Error::Registration(msg)`,
/// which is DISTINCT from the transient `NeedsMachineAuth`. The runner routes `Registration` to
/// its terminal `Failed` arm, so a genuine auth failure still terminates — tsr-dvu must NOT turn
/// real rejections into infinite polls.
#[test]
fn rejected_maps_to_registration_not_needs_machine_auth() {
let err = crate::Error::from(RegistrationError::Rejected("invalid key".to_string()));
assert_eq!(err, crate::Error::Registration("invalid key".to_string()));
assert!(
!matches!(err, crate::Error::NeedsMachineAuth),
"a hard rejection must NOT be the transient await-approval variant"
);
}
}
36 changes: 36 additions & 0 deletions ts_runtime/src/control_runner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,23 @@ impl kameo::Actor for ControlRunner {
.send_replace(crate::DeviceState::NeedsLogin(u.clone()));
tokio::time::sleep(Duration::from_secs(5)).await;
}
Err(ControlError::NeedsMachineAuth) => {
// The node is registered with a valid key but awaiting ADMIN APPROVAL on an
// approval-gated tailnet, and control offered NO interactive URL. This is
// TRANSIENT (Go's `NeedsMachineAuth`): poll registration until an admin approves,
// then `check_auth` returns `Ok(())` → the loop breaks and the node comes up with
// no re-registration. Publishing the (no-URL) `NeedsMachineAuth` state — NOT a
// terminal `Failed` and NOT `NeedsLogin` (there is no URL to open) — lets a
// watcher / `wait_until_running` see "awaiting approval" instead of an opaque
// timeout. Same 5s poll cadence as the `MachineNotAuthorized(url)` arm.
tracing::info!(
"machine awaiting admin approval to join the tailnet; polling until approved"
);
params
.state_tx
.send_replace(crate::DeviceState::NeedsMachineAuth);
tokio::time::sleep(Duration::from_secs(5)).await;
}
Err(ControlError::RateLimited(retry_after)) => {
// Control asked us to slow down (HTTP 429). Wait exactly the server-requested
// cooldown and retry — this is transient, NOT a terminal `Failed`, so we must
Expand Down Expand Up @@ -278,6 +295,25 @@ impl kameo::Actor for ControlRunner {
);
tokio::time::sleep(retry_after).await;
}
Err(ControlRunnerError::Control(ControlError::NeedsMachineAuth)) => {
// `connect` issues its OWN `machine/register` POST (a second one after
// `check_auth`'s), so it too can come back "awaiting admin approval, no URL". In
// the normal flow the `check_auth` loop above already gated this (it only breaks
// once registration returns `Ok`, and approval is monotonic), so this arm is the
// defensive twin for a node de-authorized in the race window between the two POSTs:
// treat it as TRANSIENT exactly like the `check_auth` arm — publish the (no-URL)
// `NeedsMachineAuth` state, poll the same 5s, and retry the bring-up — rather than
// collapsing into the terminal `Failed` arm below (which would permanently stop
// the runner on a recoverable await-approval). Mirrors Go's `NeedsMachineAuth`.
tracing::info!(
"machine awaiting admin approval during session bring-up; polling until \
approved"
);
params
.state_tx
.send_replace(crate::DeviceState::NeedsMachineAuth);
tokio::time::sleep(Duration::from_secs(5)).await;
}
Err(e) => {
tracing::error!(error = %e, "bringing up the control session failed");
// The control session never came up; surface it as a transient registration
Expand Down
68 changes: 65 additions & 3 deletions ts_runtime/src/device_state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@ pub enum DeviceState {
/// Control requires interactive authentication (no usable auth key): the node is waiting for a
/// human to authorize it at the carried URL. Transient — registration retries until authorized.
NeedsLogin(url::Url),
/// The node is registered with a valid key but awaiting **admin approval** on an approval-gated
/// tailnet, and control offered **no** interactive auth URL (so, unlike
/// [`NeedsLogin`](Self::NeedsLogin), there is nothing for a human to open — an admin must approve
/// the node out of band). **Transient** — treated like [`Connecting`](Self::Connecting) by the
/// waiters ([`wait_until_running`](crate::Runtime::wait_until_running) keeps waiting, never
/// settling on it); the runtime polls registration and, once an admin approves, auto-transitions
/// to [`Running`](Self::Running) with no re-registration (Go's `ipn.State::NeedsMachineAuth` →
/// `Starting`). No `browse_to_url` is derived from it (there is no URL).
NeedsMachineAuth,
/// The node key expired and an automatic, non-interactive re-authentication is in progress: the
/// runtime is rotating the node key and re-registering with the stored auth key (Go `doLogin`).
/// **Transient** — treated like [`Connecting`](Self::Connecting) by the waiters
Expand Down Expand Up @@ -116,6 +125,13 @@ impl From<&ts_control::Error> for RegistrationError {
// is reached; classifying it as `NetworkUnreachable` here keeps any other caller of this
// conversion on the correct (non-permanent, retry-may-succeed) branch.
ts_control::Error::RateLimited(_) => RegistrationError::NetworkUnreachable,
// "Awaiting admin approval, no URL" is **transient** — the node holds a valid key and the
// runtime polls until an admin approves, then comes up (Go `NeedsMachineAuth → Starting`),
// so it must NOT become a permanent `AuthRejected`. The control runner's `check_auth` and
// `connect` loops already intercept `NeedsMachineAuth` and poll before this mapping is
// reached; classifying it as `NetworkUnreachable` here keeps any other caller of this
// conversion on the correct (non-permanent, retry-may-succeed) branch.
ts_control::Error::NeedsMachineAuth => RegistrationError::NetworkUnreachable,
// InvalidUrl / Internal: not a transient network condition and not an auth decision —
// treat as a (permanent-ish) auth rejection carrying the display reason so the caller
// sees something actionable rather than an opaque "timeout".
Expand Down Expand Up @@ -143,9 +159,13 @@ pub(crate) async fn wait_for_running(
DeviceState::Failed(e) => Some(Err(e.clone())),
DeviceState::Expired => Some(Err(RegistrationError::KeyExpired)),
DeviceState::NeedsLogin(u) => Some(Err(RegistrationError::NeedsLogin(u.clone()))),
// Transient, like `Connecting`: an auto-reauth is in flight and the next good
// self-node flips back to `Running`, so keep waiting rather than settling.
DeviceState::Connecting | DeviceState::Reauthenticating => None,
// Transient, like `Connecting`: keep waiting rather than settling. `Reauthenticating`
// — an auto-reauth is in flight and the next good self-node flips back to `Running`.
// `NeedsMachineAuth` — awaiting admin approval (no URL); the runtime polls and
// auto-transitions to `Running` once approved (Go `NeedsMachineAuth → Starting`).
DeviceState::Connecting
| DeviceState::Reauthenticating
| DeviceState::NeedsMachineAuth => None,
};
if let Some(result) = settled {
return result;
Expand Down Expand Up @@ -216,6 +236,16 @@ mod tests {
!rl.is_permanent(),
"a rate-limit must be a transient (non-permanent) failure"
);
// "Awaiting admin approval, no URL" (tsr-dvu) is TRANSIENT and must map to a non-permanent
// state, never the `AuthRejected` catch-all (which would wrongly mark an approval-gated node
// as a hard failure). Pins the explicit arm: if a refactor drops it and lets `NeedsMachineAuth`
// fall into `other => AuthRejected`, this fails.
let nma = RegistrationError::from(&ts_control::Error::NeedsMachineAuth);
assert_eq!(nma, RegistrationError::NetworkUnreachable);
assert!(
!nma.is_permanent(),
"awaiting admin approval must be a transient (non-permanent) failure"
);
}

// --- wait_for_running loop ---
Expand Down Expand Up @@ -308,6 +338,38 @@ mod tests {
);
}

/// `NeedsMachineAuth` is transient (a waiter never settles on it): like `Connecting`, it times
/// out rather than resolving to a terminal error, because the runtime polls registration and the
/// next good self-node flips the state back to `Running` once an admin approves. This is the
/// behavioral guard for tsr-dvu: an approval-gated node awaiting admin approval never surfaces as
/// a permanent failure (the bug was it dying terminally instead of polling).
#[tokio::test]
async fn wait_does_not_settle_on_needs_machine_auth() {
let (_tx, rx) = watch::channel(DeviceState::NeedsMachineAuth);
assert_eq!(
wait_for_running(rx, Some(Duration::from_millis(30))).await,
Err(RegistrationError::Timeout),
"NeedsMachineAuth is transient — a waiter keeps waiting, it does not settle"
);
}

/// The full await-approval recovery as a waiter sees it: `NeedsMachineAuth` (awaiting admin
/// approval) → `Running` (the admin approved; `check_auth` returned `Ok`, the netmap self-node
/// arrived) resolves `Ok(())`. Proves the transient state is observed and then recovered with no
/// re-registration — Go's `NeedsMachineAuth → Starting → Running` auto-transition.
#[tokio::test]
async fn wait_resolves_on_needs_machine_auth_then_running() {
let (tx, rx) = watch::channel(DeviceState::NeedsMachineAuth);
tokio::spawn(async move {
tokio::time::sleep(Duration::from_millis(20)).await;
tx.send_replace(DeviceState::Running);
});
assert_eq!(
wait_for_running(rx, Some(Duration::from_secs(1))).await,
Ok(())
);
}

/// If the sender is dropped while still `Connecting`, the wait ends as `NetworkUnreachable`
/// rather than hanging forever.
#[tokio::test]
Expand Down