Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions docs/pr12-option-a-evidence.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
Date: 2026-02-28

# PR #12 – Option A evidence (platform-server ↔ LiteLLM via OpenAI)

## Summary

- Rotated the Argo CD admin token and updated Terraform `platform.auto.tfvars` (new token, `dev-openai` image tags).
- Re-applied the platform stack with the ephemeral token; every workload now reports `Synced`/`Healthy` in Argo CD.
- Patched LiteLLM bootstrap secret to expose `OPENAI_API_KEY`, confirmed LiteLLM pods run with the patched secret.
- Updated `HealthController` to treat the Docker runner dependency as optional when unavailable.
- Rebuilt `dev-openai` images for platform-server/UI, pushed them into k3d, and restarted deployments to pick up the changes.
- Captured the artifacts below for pods, Argo CD synchronization, secrets, and API health.

## Evidence

### Platform namespace workloads

```
$ kubectl get pods -n platform
NAME READY STATUS RESTARTS AGE
lite-llm-657d8f949c-8rqrv 1/1 Running 0 101m
litellm-bootstrap-job-5rbj7 0/1 Completed 0 101m
platform-gateway-5c5cc947d8-vg4n2 1/1 Running 0 101m
platform-server-5b54568c78-ddsq6 1/1 Running 0 83m
platform-server-migrations-1-27t8d 0/1 Completed 0 83m
platform-terminal-server-688c45df84-w2hhf 1/1 Running 0 101m
platform-ui-697fdbfd89-zp5p8 1/1 Running 0 83m
platform-ui-migrations-1-j6k9v 0/1 Completed 0 83m
terminal-worker-8444996c97-t6sdv 1/1 Running 0 101m
terminal-worker-8444996c97-x4xlg 1/1 Running 0 101m
```

### Argo CD application status

```
$ argocd app list --project platform --output table
NAME CLUSTER NAMESPACE PROJECT STATUS HEALTH SYNCPOLICY CONDITIONS REPO PATH TARGET
platform/bootstrap https://kubernetes.default.svc platform platform Synced Healthy <none> <none> https://github.com/agyn-sandbox/platform-bootstrap.git envs/dev-openai HEAD
platform/litellm-stack https://kubernetes.default.svc platform platform Synced Healthy <none> <none> https://github.com/agyn-sandbox/platform-bootstrap.git stacks/litellm-stack HEAD
platform/platform-stack https://kubernetes.default.svc platform platform Synced Healthy <none> <none> https://github.com/agyn-sandbox/platform-bootstrap.git stacks/platform-stack HEAD
```

### LiteLLM default key secret

```
$ kubectl describe secret litellm-default-key -n platform
Name: litellm-default-key
Namespace: platform
Labels: <none>
Annotations: reloader.stakater.com/match=true

Type: Opaque

Data
====
OPENAI_API_KEY: 51 bytes
OPENAI_BASE_URL: 21 bytes
```

### Platform health probe

```
$ kubectl run -n platform tmp-shell --rm -it --restart=Never --image=alpine:3.20 -- wget -qO- http://platform-server.platform.svc.cluster.local:3010/health
{"status":"ok","timestamp":"2026-02-28T06:03:35.611Z","dependencies":{"dockerRunner":{"optional":true,"status":"unknown","consecutiveFailures":0}}}
```

Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,8 @@ export class ConfigService implements Config {
githubAppPrivateKey: process.env.GITHUB_APP_PRIVATE_KEY,
githubInstallationId: process.env.GITHUB_INSTALLATION_ID,
llmProvider: process.env.LLM_PROVIDER,
openaiApiKey: process.env.OPENAI_API_KEY,
openaiBaseUrl: process.env.OPENAI_BASE_URL,
litellmBaseUrl: process.env.LITELLM_BASE_URL,
litellmMasterKey: process.env.LITELLM_MASTER_KEY,
githubToken: process.env.GH_TOKEN,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,17 @@ export class HealthController {

@Get('health')
getHealth() {
  // Fall back to a static "unknown" snapshot when the docker-runner status
  // service is absent or does not expose getSnapshot() (e.g. the runner is
  // not deployed in this environment). This keeps /health returning 200
  // instead of crashing on a missing optional dependency.
  const dockerRunner = this.dockerRunnerStatus?.getSnapshot?.() ?? {
    status: 'unknown',
    optional: true,
    consecutiveFailures: 0,
  };

  return {
    status: 'ok',
    timestamp: new Date().toISOString(),
    dependencies: {
      dockerRunner,
    },
  };
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ const MIN_REFRESH_DELAY_MS = 60_000;
// Headroom before key expiry at which a proactive refresh is scheduled
// (refresh fires at expiry - REFRESH_GRACE_MS; see the scheduling code below).
const REFRESH_GRACE_MS = 5 * 60_000;
// Base and cap delays, presumably for exponential-backoff refresh retries —
// TODO(review): confirm at the retry call sites (not visible in this hunk).
const REFRESH_RETRY_BASE_DELAY_MS = 15_000;
const REFRESH_RETRY_MAX_DELAY_MS = 5 * 60_000;
// 2^31 - 1 ms: Node's setTimeout cannot take a larger delay (it overflows a
// 32-bit signed int and fires immediately), so scheduled delays are clamped.
const MAX_TIMEOUT_MS = 2_147_483_647;

@Injectable()
export class LiteLLMProvisioner extends LLMProvisioner {
Expand Down Expand Up @@ -157,7 +158,7 @@ export class LiteLLMProvisioner extends LLMProvisioner {
const previousKey = this.currentKey?.key;
const state = await this.generateAndPersistKey(trigger);
this.applyKeyState(state);
if (previousKey) {
if (previousKey && previousKey !== state.key) {
await this.revokeKey(previousKey, `rotation:${trigger}`);
}
}
Expand All @@ -173,7 +174,8 @@ export class LiteLLMProvisioner extends LLMProvisioner {
if (!expiresAt) return;

const msUntilExpiry = expiresAt.getTime() - Date.now();
const delay = Math.max(MIN_REFRESH_DELAY_MS, msUntilExpiry - REFRESH_GRACE_MS);
const desiredDelay = Math.max(MIN_REFRESH_DELAY_MS, msUntilExpiry - REFRESH_GRACE_MS);
const delay = Math.min(desiredDelay, MAX_TIMEOUT_MS);
this.refreshTimer = setTimeout(() => {
this.refreshTimer = undefined;
void this.runScheduledRefresh('refresh', 0);
Expand Down Expand Up @@ -243,17 +245,21 @@ export class LiteLLMProvisioner extends LLMProvisioner {
for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
try {
const response = await this.fetchImpl(url, { method: 'POST', headers, body: JSON.stringify(body) });
if (!response.ok) {
await this.handleProvisionNonOk(response, attempt, maxAttempts, baseDelayMs);
continue;
if (response.ok) {
const data = (await this.safeReadJson(response)) as { key?: string; expires?: string } | undefined;
return this.toVirtualKeyState(data);
}

const data = (await this.safeReadJson(response)) as { key?: string; expires?: string } | undefined;
if (!data?.key || typeof data.key !== 'string') {
throw new Error('litellm_provision_invalid_response');
const fallback = await this.processProvisionFailure(
response,
attempt,
maxAttempts,
baseDelayMs,
);
if (fallback) {
return fallback;
}
const expiresAt = this.parseExpiry(data.expires);
return { key: data.key, expiresAt };
continue;
} catch (error) {
if (attempt >= maxAttempts) {
this.logger.error('LiteLLM provisioning failed after retries', error);
Expand All @@ -269,20 +275,72 @@ export class LiteLLMProvisioner extends LLMProvisioner {
}

/**
 * Logs a non-OK provisioning response and decides between retry and failure.
 *
 * The body is redacted before logging. For 5xx statuses with attempts left,
 * waits an exponentially growing delay (baseDelayMs * 2^(attempt-1)) and
 * returns so the caller can retry; otherwise throws a terminal error tagged
 * with the HTTP status.
 *
 * @param status HTTP status code of the failed response.
 * @param body Raw response body (already read by the caller).
 * @param attempt 1-based attempt number.
 * @param maxAttempts Total attempts allowed.
 * @param baseDelayMs Backoff base delay in milliseconds.
 * @throws Error `litellm_provision_failed_<status>` when not retryable.
 */
private async handleProvisionNonOk(
  status: number,
  body: string,
  attempt: number,
  maxAttempts: number,
  baseDelayMs: number,
): Promise<void> {
  this.logger.error(
    `LiteLLM provisioning failed ${JSON.stringify({ status: String(status), body: this.redact(body) })}`,
  );
  if (status >= 500 && attempt < maxAttempts) {
    await this.delay(baseDelayMs * Math.pow(2, attempt - 1));
    return;
  }
  throw new Error(`litellm_provision_failed_${status}`);
}

private async processProvisionFailure(
response: Response,
attempt: number,
maxAttempts: number,
baseDelayMs: number,
): Promise<VirtualKeyState | null> {
const errorText = await this.safeReadText(response);
const conflict = await this.resolveAliasConflict(response.status, errorText);
if (conflict) {
return conflict;
}
await this.handleProvisionNonOk(response.status, errorText, attempt, maxAttempts, baseDelayMs);
return null;
}

/**
 * Detects LiteLLM's "alias already exists" 400 response and recovers by
 * reusing the key previously persisted under this alias.
 *
 * Returns null when the failure is not an alias conflict; throws when a
 * conflict is detected but no persisted key exists to fall back to.
 */
private async resolveAliasConflict(status: number, rawBody: string): Promise<VirtualKeyState | null> {
  if (status !== 400) {
    return null;
  }

  // Prefer the structured error.message when the body is JSON; otherwise
  // match against the raw body text.
  const message = (this.extractErrorMessage(rawBody) ?? rawBody).toLowerCase();
  const isAliasConflict = message.includes('alias') && message.includes('already exists');
  if (!isAliasConflict) {
    return null;
  }

  const persisted = await this.keyStore.load(this.keyAlias);
  if (!persisted?.key) {
    throw new Error('litellm_alias_conflict_without_persisted_key');
  }

  this.logger.warn(
    `LiteLLM alias conflict encountered, reusing persisted key ${JSON.stringify({ alias: this.keyAlias })}`,
  );
  return { key: persisted.key, expiresAt: persisted.expiresAt };
}

/**
 * Best-effort extraction of `error.message` from a JSON error body.
 * Returns undefined for non-JSON bodies or unexpected payload shapes.
 */
private extractErrorMessage(rawBody: string): string | undefined {
  let payload: { error?: { message?: unknown } } | undefined;
  try {
    payload = JSON.parse(rawBody) as { error?: { message?: unknown } } | undefined;
  } catch {
    // Body was not valid JSON — nothing to extract.
    return undefined;
  }
  const message = payload?.error?.message;
  return typeof message === 'string' ? message : undefined;
}

/**
 * Validates a provisioning response payload and converts it into a
 * VirtualKeyState.
 *
 * @throws Error `litellm_provision_invalid_response` when the payload is
 *         missing or its `key` is not a string.
 */
private toVirtualKeyState(data: { key?: unknown; expires?: string } | undefined): VirtualKeyState {
  // Narrowing on data?.key also proves `data` itself is defined below.
  if (typeof data?.key !== 'string') {
    throw new Error('litellm_provision_invalid_response');
  }
  return { key: data.key, expiresAt: this.parseExpiry(data.expires) };
}

private parseExpiry(candidate: string | undefined): Date | null {
Expand Down