From 5ded2fc5313e4de86374f879d2384031a53de95a Mon Sep 17 00:00:00 2001 From: Sam Calder-Mason Date: Fri, 19 Jun 2026 14:09:42 +1000 Subject: [PATCH 1/3] fix(auth): coordinate credential refresh across processes and guard logins Multiple panda servers on one host share a single credentials file (the path is keyed only by issuer+client+resource). Each runs a background refresh, and because the provider rotates refresh tokens (issuing a new one and revoking the old), concurrent refreshes revoke each other and produce invalid_grant storms. Separately, a login that returns no refresh token (e.g. an outdated device flow without offline_access) silently overwrote a working credential and the restarted server then expired at the next access-token expiry. - store.refresh now coordinates across processes with a non-blocking flock: the winner reloads and refreshes; losers reuse the on-disk token and never replay a revoked one. - Save/Clear take the lock too (blocking up to WriteLockWait, fail-closed with ErrCredentialBusy on contention) so logins/logouts serialize with refreshes. - Save refuses to replace a refreshable credential with one that cannot refresh (ErrCredentialDowngrade), failing closed on read errors. - Credentials are written atomically (temp + sync + rename). - Added structured logging for refresh/rotation/contention/invalid_grant and a login warning when no refresh token is issued. --- go.mod | 2 +- pkg/auth/store/lock_other.go | 10 ++ pkg/auth/store/lock_test.go | 269 +++++++++++++++++++++++++++++++++++ pkg/auth/store/lock_unix.go | 56 ++++++++ pkg/auth/store/store.go | 220 ++++++++++++++++++++++++++-- pkg/auth/store/store_test.go | 59 ++++++++ pkg/cli/auth.go | 28 ++++ 7 files changed, 632 insertions(+), 12 deletions(-) create mode 100644 pkg/auth/store/lock_other.go create mode 100644 pkg/auth/store/lock_test.go create mode 100644 pkg/auth/store/lock_unix.go diff --git a/go.mod b/go.mod index 7c43b2fb..fb6a8e62 100644 --- a/go.mod +++ b/go.mod @@ -27,6 +27,7 @@ require ( github.com/spf13/cobra v1.10.2 github.com/stretchr/testify v1.11.1 github.com/testcontainers/testcontainers-go v0.42.0 + golang.org/x/sys v0.45.0 golang.org/x/time v0.15.0 google.golang.org/protobuf v1.36.11 gopkg.in/yaml.v3 v3.0.1 @@ -131,7 +132,6 @@ require ( golang.org/x/net v0.55.0 // indirect golang.org/x/oauth2 v0.36.0 // indirect golang.org/x/sync v0.20.0 // indirect - golang.org/x/sys v0.45.0 // indirect golang.org/x/telemetry v0.0.0-20260421165255-392afab6f40e // indirect golang.org/x/term v0.43.0 // indirect golang.org/x/text v0.37.0 // indirect diff --git a/pkg/auth/store/lock_other.go b/pkg/auth/store/lock_other.go new file mode 100644 index 00000000..536b6f63 --- /dev/null +++ b/pkg/auth/store/lock_other.go @@ -0,0 +1,10 @@ +//go:build !unix || aix + +package store + +// tryFileLock is a no-op on platforms without flock(2). Process-local +// serialization via refreshMu still applies; cross-process coordination is +// simply unavailable, matching the historical behaviour. +func tryFileLock(_ string) (release func(), acquired bool, err error) { + return func() {}, true, nil +} diff --git a/pkg/auth/store/lock_test.go b/pkg/auth/store/lock_test.go new file mode 100644 index 00000000..4fcba95d --- /dev/null +++ b/pkg/auth/store/lock_test.go @@ -0,0 +1,269 @@ +//go:build unix && !aix + +package store + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "os" + "path/filepath" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/sirupsen/logrus" + + authclient "github.com/ethpandaops/panda/pkg/auth/client" +) + +func TestTryFileLockIsMutuallyExclusive(t *testing.T) { + t.Parallel() + + path := filepath.Join(t.TempDir(), "creds.json") + + release1, ok1, err1 := tryFileLock(path) + if err1 != nil || !ok1 { + t.Fatalf("first lock should be acquired: ok=%v err=%v", ok1, err1) + } + + if _, ok2, _ := tryFileLock(path); ok2 { + t.Fatal("second lock should fail while the first is held") + } + + release1() + + release3, ok3, err3 := tryFileLock(path) + if err3 != nil || !ok3 { + t.Fatalf("lock should be acquirable after release: ok=%v err=%v", ok3, err3) + } + release3() +} + +// TestSaveAndClearFailClosedWhileLockHeld verifies that an interactive write +// refuses to proceed (rather than clobber a rotation) while another process +// holds the credentials lock, and succeeds once it is released. +func TestSaveAndClearFailClosedWhileLockHeld(t *testing.T) { + t.Parallel() + + path := filepath.Join(t.TempDir(), "creds.json") + st := New(logrus.New(), Config{ + Path: path, + WriteLockWait: 100 * time.Millisecond, + }).(*store) + + // Simulate a refresh in another process holding the lock. + release, ok, err := tryFileLock(path) + if err != nil || !ok { + t.Fatalf("failed to take lock: ok=%v err=%v", ok, err) + } + + tokens := &authclient.Tokens{ + AccessToken: "access", + RefreshToken: "refresh", + ExpiresAt: time.Now().Add(time.Hour), + } + + if err := st.Save(tokens); !errors.Is(err, ErrCredentialBusy) { + t.Fatalf("expected ErrCredentialBusy while lock held, got %v", err) + } + + if err := st.Clear(); !errors.Is(err, ErrCredentialBusy) { + t.Fatalf("expected ErrCredentialBusy from Clear while lock held, got %v", err) + } + + release() + + if err := st.Save(tokens); err != nil { + t.Fatalf("Save should succeed after the lock is released: %v", err) + } +} + +// TestRefreshReusesTokenRotatedByAnotherProcess verifies the reload-recheck +// step: if another process rotated the on-disk token after we decided to +// refresh but before we acquired the lock, we reuse that token instead of +// refreshing again (which would present a token the other process revoked). +func TestRefreshReusesTokenRotatedByAnotherProcess(t *testing.T) { + t.Parallel() + + path := filepath.Join(t.TempDir(), "creds.json") + client := &countingAuthClient{} + store := New(logrus.New(), Config{ + Path: path, + AuthClient: client, + RefreshBuffer: 5 * time.Minute, + }).(*store) + + // In-memory token is due for refresh (inside the buffer). + store.tokens = &authclient.Tokens{ + AccessToken: "stale", + RefreshToken: "stale-rt", + ExpiresAt: time.Now().Add(time.Minute), + } + + // Another process has already written a fresh token to the shared file. + writeTokens(t, path, &authclient.Tokens{ + AccessToken: "fresh", + RefreshToken: "fresh-rt", + ExpiresIn: 3600, + ExpiresAt: time.Now().Add(time.Hour), + }) + + token, err := store.GetAccessToken() + if err != nil { + t.Fatalf("GetAccessToken returned error: %v", err) + } + + if token != "fresh" { + t.Fatalf("expected the token written by another process, got %q", token) + } + + if client.refreshCalls.Load() != 0 { + t.Fatalf("expected no provider refresh, got %d", client.refreshCalls.Load()) + } +} + +// TestSharedCredentialRefreshNeverReusesRevokedToken hammers several stores +// that share one credentials file against a provider that rotates and revokes +// like authentik. The file lock must ensure no store ever presents a +// rotated-away token (which would return invalid_grant). +func TestSharedCredentialRefreshNeverReusesRevokedToken(t *testing.T) { + t.Parallel() + + path := filepath.Join(t.TempDir(), "creds.json") + client := newRotatingAuthClient("rt-0") + writeTokens(t, path, &authclient.Tokens{ + AccessToken: "at-0", + RefreshToken: "rt-0", + ExpiresIn: 3600, + ExpiresAt: time.Now().Add(time.Hour), + RefreshTokenIssuedAt: time.Now(), + }) + + const ( + stores = 4 + iterations = 50 + ) + + newSharedStore := func() *store { + return New(logrus.New(), Config{ + Path: path, + AuthClient: client, + RefreshBuffer: 5 * time.Minute, + RefreshTokenTTL: time.Nanosecond, // always past half-life: forces a refresh every call + }).(*store) + } + + start := make(chan struct{}) + + var wg sync.WaitGroup + + wg.Add(stores) + + for range stores { + s := newSharedStore() + + go func() { + defer wg.Done() + + <-start // release all goroutines together to maximise contention + + for range iterations { + token, err := s.GetAccessToken() + if err != nil { + t.Errorf("GetAccessToken returned error: %v", err) + + return + } + + if token == "" { + t.Error("GetAccessToken returned an empty token") + + return + } + } + }() + } + + close(start) + wg.Wait() + + if got := client.invalidGrants.Load(); got != 0 { + t.Fatalf("expected zero invalid_grant (revoked-token reuse), got %d", got) + } + + if got := client.refreshCalls.Load(); got == 0 { + t.Fatal("expected at least one provider refresh") + } + + // The shared credential must end in a usable, authenticated state. + final := New(logrus.New(), Config{Path: path, AuthClient: client}).(*store) + if !final.IsAuthenticated() { + t.Fatal("shared credential is not authenticated after concurrent refreshes") + } +} + +func writeTokens(t *testing.T, path string, tokens *authclient.Tokens) { + t.Helper() + + data, err := json.MarshalIndent(tokens, "", " ") + if err != nil { + t.Fatalf("marshaling tokens: %v", err) + } + + if err := os.WriteFile(path, data, 0o600); err != nil { + t.Fatalf("writing tokens: %v", err) + } +} + +// rotatingAuthClient models authentik: each refresh issues a new refresh token +// and revokes the presented one. Presenting a revoked token returns +// invalid_grant. +type rotatingAuthClient struct { + mu sync.Mutex + counter int + valid map[string]struct{} + refreshCalls atomic.Int64 + invalidGrants atomic.Int64 +} + +func newRotatingAuthClient(initial string) *rotatingAuthClient { + return &rotatingAuthClient{valid: map[string]struct{}{initial: {}}} +} + +func (c *rotatingAuthClient) Login(_ context.Context) (*authclient.Tokens, error) { + return nil, errors.New("not implemented") +} + +func (c *rotatingAuthClient) ClientCredentials(_ context.Context) (*authclient.Tokens, error) { + return nil, errors.New("not implemented") +} + +func (c *rotatingAuthClient) Refresh(_ context.Context, refreshToken string) (*authclient.Tokens, error) { + c.refreshCalls.Add(1) + + c.mu.Lock() + defer c.mu.Unlock() + + if _, ok := c.valid[refreshToken]; !ok { + c.invalidGrants.Add(1) + + return nil, fmt.Errorf("token endpoint returned status 400: {\"error\": \"invalid_grant\"}") + } + + delete(c.valid, refreshToken) + c.counter++ + next := fmt.Sprintf("rt-%d", c.counter) + c.valid[next] = struct{}{} + + return &authclient.Tokens{ + AccessToken: fmt.Sprintf("at-%d", c.counter), + RefreshToken: next, + TokenType: "Bearer", + ExpiresIn: 3600, + ExpiresAt: time.Now().Add(time.Hour), + RefreshTokenIssuedAt: time.Now(), + }, nil +} diff --git a/pkg/auth/store/lock_unix.go b/pkg/auth/store/lock_unix.go new file mode 100644 index 00000000..b130ec14 --- /dev/null +++ b/pkg/auth/store/lock_unix.go @@ -0,0 +1,56 @@ +//go:build unix && !aix + +package store + +import ( + "errors" + "fmt" + "os" + "path/filepath" + + "golang.org/x/sys/unix" +) + +// tryFileLock attempts a non-blocking exclusive lock on path+".lock" so that +// only one process drives a credential refresh at a time when several panda +// processes share one credentials file. +// +// It returns (release, true, nil) when this process holds the lock, +// (nil, false, nil) when another process currently holds it, and +// (no-op release, true, err) when the lock file itself cannot be created — in +// which case the caller proceeds relying on process-local serialization, so a +// single process is never blocked by lock-file errors. +// +// The lock is advisory and released automatically by the OS when the holding +// process exits, so a crashed holder never leaves a stale lock. +func tryFileLock(path string) (release func(), acquired bool, err error) { + lockPath := path + ".lock" + + if mkErr := os.MkdirAll(filepath.Dir(lockPath), 0o700); mkErr != nil { + return func() {}, true, fmt.Errorf("creating lock directory: %w", mkErr) + } + + f, openErr := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0o600) + if openErr != nil { + return func() {}, true, fmt.Errorf("opening lock file: %w", openErr) + } + + if flockErr := unix.Flock(int(f.Fd()), unix.LOCK_EX|unix.LOCK_NB); flockErr != nil { + // EWOULDBLOCK/EAGAIN is genuine contention: another process holds the + // lock, so back off and reuse what it writes. Any other flock error is + // unexpected — fail open and refresh without cross-process coordination + // rather than risk being unable to refresh at all. + if errors.Is(flockErr, unix.EWOULDBLOCK) || errors.Is(flockErr, unix.EAGAIN) { + _ = f.Close() + + return nil, false, nil + } + + return func() { _ = f.Close() }, true, fmt.Errorf("locking credentials file: %w", flockErr) + } + + return func() { + _ = unix.Flock(int(f.Fd()), unix.LOCK_UN) + _ = f.Close() + }, true, nil +} diff --git a/pkg/auth/store/store.go b/pkg/auth/store/store.go index baa2e721..27bb0968 100644 --- a/pkg/auth/store/store.go +++ b/pkg/auth/store/store.go @@ -6,6 +6,7 @@ import ( "crypto/sha256" "encoding/hex" "encoding/json" + "errors" "fmt" "os" "path/filepath" @@ -18,6 +19,26 @@ import ( "github.com/ethpandaops/panda/pkg/auth/client" ) +const ( + // credentialLockWait bounds how long an interactive write (login/logout) + // waits for an in-flight refresh to release the credentials file lock. + credentialLockWait = 5 * time.Second + + // credentialLockPoll is the retry interval while waiting for the lock. + credentialLockPoll = 50 * time.Millisecond +) + +// ErrCredentialDowngrade is returned by Save when the new tokens cannot refresh +// (no refresh token) but the stored credential can. Replacing a refreshable +// credential with one that cannot refresh would silently downgrade a working +// session to one that dies at the next access-token expiry. +var ErrCredentialDowngrade = errors.New("refusing to overwrite a refreshable credential with one that cannot refresh") + +// ErrCredentialBusy is returned by Save/Clear when another process holds the +// credentials file lock (a refresh is in progress) for longer than the write +// lock wait. The caller should retry shortly rather than clobber the rotation. +var ErrCredentialBusy = errors.New("credentials are being refreshed by another process; retry shortly") + // Store manages local credential storage. type Store interface { // Path returns the resolved credentials file path. @@ -67,6 +88,10 @@ type Config struct { // to keep the refresh token alive via provider rotation. RefreshTokenTTL time.Duration + // WriteLockWait bounds how long Save/Clear wait for the credentials file + // lock before returning ErrCredentialBusy. Defaults to credentialLockWait. + WriteLockWait time.Duration + // AuthClient is the OAuth client for refreshing tokens. AuthClient client.Client } @@ -91,6 +116,10 @@ func New(log logrus.FieldLogger, cfg Config) Store { cfg.RefreshBuffer = 5 * time.Minute } + if cfg.WriteLockWait == 0 { + cfg.WriteLockWait = credentialLockWait + } + return &store{ log: log.WithField("component", "credential-store"), cfg: cfg, @@ -101,8 +130,68 @@ func (s *store) Path() string { return s.cfg.Path } -// Save saves tokens to the store. +// Save persists tokens for login/logout-style writes. It serializes with +// concurrent refreshers in other processes via the credentials file lock and +// refuses to replace a refreshable credential with one that cannot refresh +// (ErrCredentialDowngrade), so an outdated login cannot silently downgrade a +// working session. func (s *store) Save(tokens *client.Tokens) error { + release, ok := s.lockForWrite() + if !ok { + return ErrCredentialBusy + } + defer release() + + if tokens.RefreshToken == "" { + // Fail closed: if we cannot read the existing credential we must not + // assume it is safe to replace with a non-refreshable one. + existing, err := s.Load() + if err != nil { + return fmt.Errorf("checking existing credential before overwrite: %w", err) + } + + if existing != nil && existing.RefreshToken != "" { + return ErrCredentialDowngrade + } + } + + return s.writeTokens(tokens) +} + +// lockForWrite acquires the credentials file lock for an interactive write, +// waiting up to WriteLockWait for an in-flight refresh to finish. Persistent +// contention fails closed (ok=false) so the caller surfaces a retry rather than +// clobbering a rotation. A lock-infrastructure error (not contention) proceeds +// best-effort, since the atomic write still prevents corruption. +func (s *store) lockForWrite() (release func(), ok bool) { + deadline := time.Now().Add(s.cfg.WriteLockWait) + + for { + releaseLock, acquired, err := tryFileLock(s.cfg.Path) + if err != nil { + s.log.WithError(err).Warn( + "Credential file lock unavailable; writing without cross-process coordination", + ) + + return releaseLock, true + } + + if acquired { + return releaseLock, true + } + + if time.Now().After(deadline) { + return nil, false + } + + time.Sleep(credentialLockPoll) + } +} + +// writeTokens atomically persists tokens and updates the in-memory cache. It +// does not take the file lock: callers needing cross-process exclusion (Save, +// Clear) acquire it first, and refresh already holds it. +func (s *store) writeTokens(tokens *client.Tokens) error { s.mu.Lock() defer s.mu.Unlock() @@ -118,9 +207,40 @@ func (s *store) Save(tokens *client.Tokens) error { return fmt.Errorf("marshaling tokens: %w", err) } - // Write file with secure permissions. - if err := os.WriteFile(s.cfg.Path, data, 0600); err != nil { - return fmt.Errorf("writing credentials file: %w", err) + // Write atomically via a temp file + rename so a crash mid-write never + // leaves partial JSON or loses the only valid rotated refresh token. + tmp, err := os.CreateTemp(dir, ".credentials-*.tmp") + if err != nil { + return fmt.Errorf("creating temp credentials file: %w", err) + } + + tmpName := tmp.Name() + defer func() { _ = os.Remove(tmpName) }() + + if _, err := tmp.Write(data); err != nil { + _ = tmp.Close() + + return fmt.Errorf("writing temp credentials file: %w", err) + } + + if err := tmp.Chmod(0600); err != nil { + _ = tmp.Close() + + return fmt.Errorf("setting credentials permissions: %w", err) + } + + if err := tmp.Sync(); err != nil { + _ = tmp.Close() + + return fmt.Errorf("syncing temp credentials file: %w", err) + } + + if err := tmp.Close(); err != nil { + return fmt.Errorf("closing temp credentials file: %w", err) + } + + if err := os.Rename(tmpName, s.cfg.Path); err != nil { + return fmt.Errorf("replacing credentials file: %w", err) } s.tokens = tokens @@ -154,8 +274,15 @@ func (s *store) Load() (*client.Tokens, error) { return &tokens, nil } -// Clear removes stored tokens. +// Clear removes stored tokens, serializing with concurrent refreshers via the +// credentials file lock. func (s *store) Clear() error { + release, ok := s.lockForWrite() + if !ok { + return ErrCredentialBusy + } + defer release() + s.mu.Lock() defer s.mu.Unlock() @@ -300,40 +427,111 @@ func (s *store) refresh(prior *client.Tokens) (*client.Tokens, error) { return nil, fmt.Errorf("no refresh token available") } - // Serialize refreshes so concurrent callers do not double-call the provider. + // Serialize refreshes within this process so concurrent callers do not + // double-call the provider. s.refreshMu.Lock() defer s.refreshMu.Unlock() - // Another caller may have refreshed while we waited for the lock. + // Another in-process caller may have refreshed while we waited for the lock. if current := s.snapshotTokens(); current != nil && !s.needsRefresh(current) { return current, nil } + // Coordinate across processes that share this credentials file (e.g. several + // panda servers on one host pointed at the same proxy). Only the process + // that wins the file lock drives the rotation; the others reuse the token it + // writes. Without this, each process refreshes the same refresh token and + // provider-side rotation turns every loser's token into invalid_grant. + release, acquired, lockErr := tryFileLock(s.cfg.Path) + if lockErr != nil { + s.log.WithError(lockErr).Warn( + "Credential file lock unavailable; refreshing without cross-process coordination", + ) + } + + if !acquired { + // Another process is refreshing right now. Do not touch the refresh + // token — reuse whatever it writes. Refresh is proactive (well before + // expiry), so a still-valid token is normally available and the winner's + // rotation lands on the next reload. Only hand back a token that is not + // yet expired; otherwise surface an error so the caller retries once the + // holder has written the rotated token. + s.log.Debug("Another process holds the credential lock; reusing its refreshed token") + + if reloaded, loadErr := s.Load(); loadErr == nil && reloaded != nil && + time.Now().Before(reloaded.ExpiresAt) { + return reloaded, nil + } + + if time.Now().Before(prior.ExpiresAt) { + return prior, nil + } + + return nil, fmt.Errorf("another process is refreshing credentials and the current token has expired") + } + defer release() + + // We hold the lock. Another process may have rotated the token just before + // we acquired it; reload from disk and skip the refresh if it is now fresh. + if reloaded, loadErr := s.Load(); loadErr == nil && reloaded != nil && reloaded.RefreshToken != "" { + if !s.needsRefresh(reloaded) { + s.log.Debug("Credentials already refreshed by another process; reusing") + + return reloaded, nil + } + + prior = reloaded + } + priorIssuedAt := prior.RefreshTokenIssuedAt - s.log.Debug("Refreshing access token") + s.log.WithField("expires_at", prior.ExpiresAt.Format(time.RFC3339)).Debug("Refreshing access token") newTokens, err := s.cfg.AuthClient.Refresh(context.Background(), prior.RefreshToken) if err != nil { + if isInvalidGrant(err) { + s.log.WithError(err).Warn( + "Refresh token rejected (invalid_grant); it was likely rotated by another " + + "refresher sharing these credentials, or revoked — re-authentication may be " + + "required (panda auth login)", + ) + } else { + s.log.WithError(err).Warn("Failed to refresh access token") + } + return nil, fmt.Errorf("refreshing token: %w", err) } - // If the provider did not rotate the refresh token, preserve the + // The provider rotates the refresh token when it returns a fresh one (the + // client stamps a non-zero issued-at). When it does not rotate, preserve the // original issued-at so the half-life check stays correct. + rotated := !newTokens.RefreshTokenIssuedAt.IsZero() if newTokens.RefreshTokenIssuedAt.IsZero() { newTokens.RefreshTokenIssuedAt = priorIssuedAt } - // Save new tokens. - if err := s.Save(newTokens); err != nil { + // We already hold the file lock; write directly without re-acquiring it. + if err := s.writeTokens(newTokens); err != nil { return nil, err } s.clearForceRefresh() + s.log.WithFields(logrus.Fields{ + "rotated_refresh_token": rotated, + "expires_at": newTokens.ExpiresAt.Format(time.RFC3339), + }).Info("Refreshed access token") + return newTokens, nil } +// isInvalidGrant reports whether a refresh error is an OAuth invalid_grant +// response, which for a rotating provider means the presented refresh token was +// already rotated away or revoked. +func isInvalidGrant(err error) bool { + return err != nil && strings.Contains(err.Error(), "invalid_grant") +} + func defaultCredentialPath(cfg Config) string { home, _ := os.UserHomeDir() newBaseDir := filepath.Join(home, ".config", "panda") diff --git a/pkg/auth/store/store_test.go b/pkg/auth/store/store_test.go index f6b66fcb..9b4a8aac 100644 --- a/pkg/auth/store/store_test.go +++ b/pkg/auth/store/store_test.go @@ -14,11 +14,66 @@ import ( authclient "github.com/ethpandaops/panda/pkg/auth/client" ) +func TestSaveRefusesCredentialDowngrade(t *testing.T) { + t.Parallel() + + st := New(logrus.New(), Config{ + Path: filepath.Join(t.TempDir(), "creds.json"), + }).(*store) + + // Seed a refreshable credential. + if err := st.Save(&authclient.Tokens{ + AccessToken: "access", + RefreshToken: "refresh", + ExpiresAt: time.Now().Add(time.Hour), + }); err != nil { + t.Fatalf("seeding refreshable credential: %v", err) + } + + // A login that returns no refresh token must be refused, not silently saved. + err := st.Save(&authclient.Tokens{ + AccessToken: "access-2", + ExpiresAt: time.Now().Add(time.Hour), + }) + if !errors.Is(err, ErrCredentialDowngrade) { + t.Fatalf("expected ErrCredentialDowngrade, got %v", err) + } + + // The refreshable credential must be left intact. + got, err := st.Load() + if err != nil { + t.Fatalf("Load: %v", err) + } + + if got == nil || got.RefreshToken != "refresh" { + t.Fatalf("refreshable credential was overwritten: %+v", got) + } + + // A refreshable login is allowed; after logout a tokenless login is allowed + // because there is no refreshable credential to protect. + if err := st.Save(&authclient.Tokens{ + AccessToken: "a", RefreshToken: "r2", ExpiresAt: time.Now().Add(time.Hour), + }); err != nil { + t.Fatalf("refreshable save should succeed: %v", err) + } + + if err := st.Clear(); err != nil { + t.Fatalf("Clear: %v", err) + } + + if err := st.Save(&authclient.Tokens{ + AccessToken: "a", ExpiresAt: time.Now().Add(time.Hour), + }); err != nil { + t.Fatalf("tokenless save with no existing credential should succeed: %v", err) + } +} + func TestGetAccessTokenKeepsValidTokenWithoutRefreshToken(t *testing.T) { t.Parallel() client := &stubAuthClient{} store := New(logrus.New(), Config{ + Path: filepath.Join(t.TempDir(), "creds.json"), AuthClient: client, RefreshBuffer: 5 * time.Minute, }).(*store) @@ -46,6 +101,7 @@ func TestGetAccessTokenFallsBackWhenRefreshFailsButTokenIsStillValid(t *testing. client := &stubAuthClient{refreshErr: errors.New("temporary failure")} store := New(logrus.New(), Config{ + Path: filepath.Join(t.TempDir(), "creds.json"), AuthClient: client, RefreshBuffer: 5 * time.Minute, }).(*store) @@ -74,6 +130,7 @@ func TestGetAccessTokenRefreshesAtRefreshTokenHalfLife(t *testing.T) { client := &stubAuthClient{} store := New(logrus.New(), Config{ + Path: filepath.Join(t.TempDir(), "creds.json"), AuthClient: client, RefreshBuffer: 5 * time.Minute, RefreshTokenTTL: 30 * 24 * time.Hour, // 30 days @@ -104,6 +161,7 @@ func TestGetAccessTokenDoesNotRefreshBeforeRefreshTokenHalfLife(t *testing.T) { client := &stubAuthClient{} store := New(logrus.New(), Config{ + Path: filepath.Join(t.TempDir(), "creds.json"), AuthClient: client, RefreshBuffer: 5 * time.Minute, RefreshTokenTTL: 30 * 24 * time.Hour, // 30 days @@ -134,6 +192,7 @@ func TestGetAccessTokenSerializesConcurrentRefreshes(t *testing.T) { client := &countingAuthClient{} store := New(logrus.New(), Config{ + Path: filepath.Join(t.TempDir(), "creds.json"), AuthClient: client, RefreshBuffer: 5 * time.Minute, }).(*store) diff --git a/pkg/cli/auth.go b/pkg/cli/auth.go index 56a77712..5eccef46 100644 --- a/pkg/cli/auth.go +++ b/pkg/cli/auth.go @@ -2,6 +2,7 @@ package cli import ( "context" + "errors" "fmt" "os" "strings" @@ -132,6 +133,19 @@ func runAuthLogin(cmd *cobra.Command, _ []string) error { store := newAuthStore(target, client) if err := store.Save(tokens); err != nil { + if errors.Is(err, authstore.ErrCredentialDowngrade) { + return fmt.Errorf( + "login returned no refresh token, but a refreshable credential already exists at %s; "+ + "refusing to overwrite it. Upgrade panda so the login requests offline_access, or "+ + "run 'panda auth logout' first to replace it", + store.Path(), + ) + } + + if errors.Is(err, authstore.ErrCredentialBusy) { + return fmt.Errorf("another process is refreshing these credentials; try the login again in a moment") + } + return fmt.Errorf("saving tokens: %w", err) } @@ -139,6 +153,16 @@ func runAuthLogin(cmd *cobra.Command, _ []string) error { fmt.Printf("Credentials stored at: %s\n", store.Path()) fmt.Printf("Token expires at: %s\n", tokens.ExpiresAt.Format(time.RFC3339)) + // A successful login with no refresh token cannot auto-refresh; warn so the + // short-lived session is not a surprise. + if tokens.RefreshToken == "" { + fmt.Println( + "WARNING: no refresh token was issued for this login. The session cannot auto-refresh " + + "and will expire when the access token does. Upgrade panda or re-run the login so the " + + "flow requests offline_access.", + ) + } + // Restart the server if it's running so it picks up the new credentials. restartServerIfRunning(baseCtx) @@ -154,6 +178,10 @@ func runAuthLogout(cmd *cobra.Command, _ []string) error { store := newAuthStore(target, nil) if err := store.Clear(); err != nil { + if errors.Is(err, authstore.ErrCredentialBusy) { + return fmt.Errorf("another process is refreshing these credentials; try the logout again in a moment") + } + return fmt.Errorf("clearing tokens: %w", err) } From e664b97b048dc40616014f69f6a44607f683109d Mon Sep 17 00:00:00 2001 From: Sam Calder-Mason Date: Fri, 19 Jun 2026 14:20:53 +1000 Subject: [PATCH 2/3] fix(auth): correct offline_access messaging and show refresh-token detail in status The 'upgrade panda' advice on a missing refresh token was wrong: this version already requests offline_access, so a missing refresh token is the provider not granting it. Reword the login warning and downgrade error accordingly. panda auth status now reports whether a refresh token is present and when it was last rotated, and a new --verify flag actively tests the refresh token against the provider (rotating it). --- pkg/cli/auth.go | 79 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 72 insertions(+), 7 deletions(-) diff --git a/pkg/cli/auth.go b/pkg/cli/auth.go index 5eccef46..290d9d8b 100644 --- a/pkg/cli/auth.go +++ b/pkg/cli/auth.go @@ -31,6 +31,7 @@ var ( authClientID string authResource string noBrowser bool + verifyRefresh bool ) var authCmd = &cobra.Command{ @@ -67,6 +68,9 @@ func init() { authLoginCmd.Flags().BoolVar(&noBrowser, "no-browser", false, "manual auth flow for SSH/headless environments (auto-detected over SSH)") + authStatusCmd.Flags().BoolVar(&verifyRefresh, "verify", false, + "actively test the refresh token against the provider (rotates it)") + for _, cmd := range []*cobra.Command{authLoginCmd, authLogoutCmd, authStatusCmd} { cmd.Flags().StringVar(&authIssuerURL, "issuer", "", "proxy auth issuer URL (defaults to the configured server's proxy auth issuer)") cmd.Flags().StringVar(&authClientID, "client-id", "", "OAuth client ID (defaults to configured value or 'panda')") @@ -135,9 +139,9 @@ func runAuthLogin(cmd *cobra.Command, _ []string) error { if err := store.Save(tokens); err != nil { if errors.Is(err, authstore.ErrCredentialDowngrade) { return fmt.Errorf( - "login returned no refresh token, but a refreshable credential already exists at %s; "+ - "refusing to overwrite it. Upgrade panda so the login requests offline_access, or "+ - "run 'panda auth logout' first to replace it", + "the provider issued no refresh token for this login (it did not grant the "+ + "offline_access scope), but a refreshable credential already exists at %s; "+ + "refusing to overwrite it. Run 'panda auth logout' first if you intend to replace it", store.Path(), ) } @@ -157,9 +161,9 @@ func runAuthLogin(cmd *cobra.Command, _ []string) error { // short-lived session is not a surprise. if tokens.RefreshToken == "" { fmt.Println( - "WARNING: no refresh token was issued for this login. The session cannot auto-refresh " + - "and will expire when the access token does. Upgrade panda or re-run the login so the " + - "flow requests offline_access.", + "WARNING: the provider issued no refresh token for this login (it did not grant the " + + "offline_access scope). The session cannot auto-refresh and will expire when the " + + "access token does.", ) } @@ -221,10 +225,71 @@ func runAuthStatus(cmd *cobra.Command, _ []string) error { if tokens.ExpiresAt.After(time.Now()) { fmt.Printf("Status: Authenticated (expires in %s)\n", time.Until(tokens.ExpiresAt).Round(time.Second)) fmt.Printf("Expires at: %s\n", tokens.ExpiresAt.Format(time.RFC3339)) + } else { + fmt.Printf("Status: Expired (expired at %s)\n", tokens.ExpiresAt.Format(time.RFC3339)) + } + + printRefreshTokenStatus(tokens) + + if verifyRefresh { + return verifyRefreshToken(store, tokens) + } + + return nil +} + +// printRefreshTokenStatus reports what is knowable about the refresh token from +// the stored credential. The refresh token is opaque, so its expiry cannot be +// read locally; use --verify to test it against the provider. +func printRefreshTokenStatus(tokens *authclient.Tokens) { + if tokens.RefreshToken == "" { + fmt.Println("Refresh token: none (session cannot auto-refresh; it ends when the access token expires)") + + return + } + + if tokens.RefreshTokenIssuedAt.IsZero() { + fmt.Println("Refresh token: present (last rotation time unknown; the provider reused it on the last refresh)") + + return + } + + fmt.Printf("Refresh token: present (last rotated %s ago, at %s)\n", + time.Since(tokens.RefreshTokenIssuedAt).Round(time.Second), + tokens.RefreshTokenIssuedAt.Format(time.RFC3339), + ) +} + +// verifyRefreshToken actively confirms the refresh token is still accepted by +// the provider. This performs a real refresh and so rotates the token. +func verifyRefreshToken(store authstore.Store, tokens *authclient.Tokens) error { + if tokens.RefreshToken == "" { + fmt.Println("Refresh check: skipped (no refresh token)") + return nil } - fmt.Printf("Status: Expired (expired at %s)\n", tokens.ExpiresAt.Format(time.RFC3339)) + fmt.Println("Refresh check: testing the refresh token with the provider (this rotates it)...") + + store.Invalidate() + + if _, err := store.GetAccessToken(); err != nil { + fmt.Printf("Refresh check: FAILED — %v\n", err) + + return nil + } + + refreshed, err := store.Load() + if err != nil || refreshed == nil { + fmt.Println("Refresh check: OK (refresh token is valid)") + + return nil + } + + fmt.Printf("Refresh check: OK (refresh token is valid; new access token expires at %s)\n", + refreshed.ExpiresAt.Format(time.RFC3339), + ) + return nil } From 6748b52d866072d882a2be19afda3397bd064895 Mon Sep 17 00:00:00 2001 From: Sam Calder-Mason Date: Fri, 19 Jun 2026 14:28:05 +1000 Subject: [PATCH 3/3] fix(auth): raise credential write-lock wait above the refresh HTTP timeout A refresh holds the file lock across its token request (capped at 30s by the auth client), so a 5s wait made login/logout fail spuriously during a slow refresh. Wait 35s so an interactive write rides through a slow-but- valid refresh and only fails when one is genuinely stuck. --- pkg/auth/store/store.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pkg/auth/store/store.go b/pkg/auth/store/store.go index 27bb0968..a710d81a 100644 --- a/pkg/auth/store/store.go +++ b/pkg/auth/store/store.go @@ -21,8 +21,11 @@ import ( const ( // credentialLockWait bounds how long an interactive write (login/logout) - // waits for an in-flight refresh to release the credentials file lock. - credentialLockWait = 5 * time.Second + // waits for an in-flight refresh to release the credentials file lock. A + // refresh holds the lock across its token-endpoint request, which the auth + // client caps at 30s, so this must exceed that to ride through a slow-but- + // valid refresh and only surface ErrCredentialBusy when one is truly stuck. + credentialLockWait = 35 * time.Second // credentialLockPoll is the retry interval while waiting for the lock. credentialLockPoll = 50 * time.Millisecond