Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions docs/pr12-option-a-evidence.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
Date: 2026-02-28

# PR #12 – Option A evidence (platform-server ↔ LiteLLM via OpenAI)

## Summary

- Rotated the Argo CD admin token and updated Terraform `platform.auto.tfvars` (new token, `dev-openai` image tags).
- Re-applied the platform stack with the ephemeral token; every workload now reports `Synced`/`Healthy` in Argo CD.
- Patched LiteLLM bootstrap secret to expose `OPENAI_API_KEY`, confirmed LiteLLM pods run with the patched secret.
- Updated `HealthController` to treat the Docker runner dependency as optional when unavailable.
- Rebuilt `dev-openai` images for platform-server/UI, pushed them into k3d, and restarted deployments to pick up the changes.
- Captured the artifacts below for pods, Argo CD synchronization, secrets, and API health.

## Evidence

### Platform namespace workloads

```
$ kubectl get pods -n platform
NAME READY STATUS RESTARTS AGE
lite-llm-657d8f949c-8rqrv 1/1 Running 0 101m
litellm-bootstrap-job-5rbj7 0/1 Completed 0 101m
platform-gateway-5c5cc947d8-vg4n2 1/1 Running 0 101m
platform-server-5b54568c78-ddsq6 1/1 Running 0 83m
platform-server-migrations-1-27t8d 0/1 Completed 0 83m
platform-terminal-server-688c45df84-w2hhf 1/1 Running 0 101m
platform-ui-697fdbfd89-zp5p8 1/1 Running 0 83m
platform-ui-migrations-1-j6k9v 0/1 Completed 0 83m
terminal-worker-8444996c97-t6sdv 1/1 Running 0 101m
terminal-worker-8444996c97-x4xlg 1/1 Running 0 101m
```

### Argo CD application status

```
$ argocd app list --project platform --output table
NAME CLUSTER NAMESPACE PROJECT STATUS HEALTH SYNCPOLICY CONDITIONS REPO PATH TARGET
platform/bootstrap https://kubernetes.default.svc platform platform Synced Healthy <none> <none> https://github.com/agyn-sandbox/platform-bootstrap.git envs/dev-openai HEAD
platform/litellm-stack https://kubernetes.default.svc platform platform Synced Healthy <none> <none> https://github.com/agyn-sandbox/platform-bootstrap.git stacks/litellm-stack HEAD
platform/platform-stack https://kubernetes.default.svc platform platform Synced Healthy <none> <none> https://github.com/agyn-sandbox/platform-bootstrap.git stacks/platform-stack HEAD
```

### LiteLLM default key secret

```
$ kubectl describe secret litellm-default-key -n platform
Name: litellm-default-key
Namespace: platform
Labels: <none>
Annotations: reloader.stakater.com/match=true

Type: Opaque

Data
====
OPENAI_API_KEY: 51 bytes
OPENAI_BASE_URL: 21 bytes
```

### Platform health probe

```
$ kubectl run -n platform tmp-shell --rm -it --restart=Never --image=alpine:3.20 -- wget -qO- http://platform-server.platform.svc.cluster.local:3010/health
{"status":"ok","timestamp":"2026-02-28T06:03:35.611Z","dependencies":{"dockerRunner":{"optional":true,"status":"unknown","consecutiveFailures":0}}}
```

Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,8 @@ export class ConfigService implements Config {
githubAppPrivateKey: process.env.GITHUB_APP_PRIVATE_KEY,
githubInstallationId: process.env.GITHUB_INSTALLATION_ID,
llmProvider: process.env.LLM_PROVIDER,
openaiApiKey: process.env.OPENAI_API_KEY,
openaiBaseUrl: process.env.OPENAI_BASE_URL,
litellmBaseUrl: process.env.LITELLM_BASE_URL,
litellmMasterKey: process.env.LITELLM_MASTER_KEY,
githubToken: process.env.GH_TOKEN,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,17 @@ export class HealthController {

@Get('health')
getHealth() {
  // Fall back to a static "unknown" snapshot when the docker-runner status
  // service is absent or does not expose getSnapshot() (e.g. the runner is
  // not deployed in this environment). This keeps /health returning 200
  // instead of crashing on a missing optional dependency.
  const dockerRunner = this.dockerRunnerStatus?.getSnapshot?.() ?? {
    status: 'unknown',
    optional: true,
    consecutiveFailures: 0,
  };

  return {
    status: 'ok',
    timestamp: new Date().toISOString(),
    dependencies: {
      dockerRunner,
    },
  };
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ const MIN_REFRESH_DELAY_MS = 60_000;
// Headroom before key expiry at which a proactive refresh is scheduled
// (refresh fires at expiry - REFRESH_GRACE_MS; see the scheduling code below).
const REFRESH_GRACE_MS = 5 * 60_000;
// Base and cap delays, presumably for exponential-backoff refresh retries —
// TODO(review): confirm at the retry call sites (not visible in this hunk).
const REFRESH_RETRY_BASE_DELAY_MS = 15_000;
const REFRESH_RETRY_MAX_DELAY_MS = 5 * 60_000;
// 2^31 - 1 ms: Node's setTimeout cannot take a larger delay (it overflows a
// 32-bit signed int and fires immediately), so scheduled delays are clamped.
const MAX_TIMEOUT_MS = 2_147_483_647;

@Injectable()
export class LiteLLMProvisioner extends LLMProvisioner {
Expand Down Expand Up @@ -157,7 +158,7 @@ export class LiteLLMProvisioner extends LLMProvisioner {
const previousKey = this.currentKey?.key;
const state = await this.generateAndPersistKey(trigger);
this.applyKeyState(state);
if (previousKey) {
if (previousKey && previousKey !== state.key) {
await this.revokeKey(previousKey, `rotation:${trigger}`);
}
}
Expand All @@ -173,7 +174,8 @@ export class LiteLLMProvisioner extends LLMProvisioner {
if (!expiresAt) return;

const msUntilExpiry = expiresAt.getTime() - Date.now();
const delay = Math.max(MIN_REFRESH_DELAY_MS, msUntilExpiry - REFRESH_GRACE_MS);
const desiredDelay = Math.max(MIN_REFRESH_DELAY_MS, msUntilExpiry - REFRESH_GRACE_MS);
const delay = Math.min(desiredDelay, MAX_TIMEOUT_MS);
this.refreshTimer = setTimeout(() => {
this.refreshTimer = undefined;
void this.runScheduledRefresh('refresh', 0);
Expand Down Expand Up @@ -243,17 +245,21 @@ export class LiteLLMProvisioner extends LLMProvisioner {
for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
try {
const response = await this.fetchImpl(url, { method: 'POST', headers, body: JSON.stringify(body) });
if (!response.ok) {
await this.handleProvisionNonOk(response, attempt, maxAttempts, baseDelayMs);
continue;
if (response.ok) {
const data = (await this.safeReadJson(response)) as { key?: string; expires?: string } | undefined;
return this.toVirtualKeyState(data);
}

const data = (await this.safeReadJson(response)) as { key?: string; expires?: string } | undefined;
if (!data?.key || typeof data.key !== 'string') {
throw new Error('litellm_provision_invalid_response');
const fallback = await this.processProvisionFailure(
response,
attempt,
maxAttempts,
baseDelayMs,
);
if (fallback) {
return fallback;
}
const expiresAt = this.parseExpiry(data.expires);
return { key: data.key, expiresAt };
continue;
} catch (error) {
if (attempt >= maxAttempts) {
this.logger.error('LiteLLM provisioning failed after retries', error);
Expand All @@ -269,20 +275,72 @@ export class LiteLLMProvisioner extends LLMProvisioner {
}

/**
 * Logs a non-OK provisioning response and decides between retry and failure.
 *
 * The body is redacted before logging. For 5xx statuses with attempts left,
 * waits an exponentially growing delay (baseDelayMs * 2^(attempt-1)) and
 * returns so the caller can retry; otherwise throws a terminal error tagged
 * with the HTTP status.
 *
 * @param status HTTP status code of the failed response.
 * @param body Raw response body (already read by the caller).
 * @param attempt 1-based attempt number.
 * @param maxAttempts Total attempts allowed.
 * @param baseDelayMs Backoff base delay in milliseconds.
 * @throws Error `litellm_provision_failed_<status>` when not retryable.
 */
private async handleProvisionNonOk(
  status: number,
  body: string,
  attempt: number,
  maxAttempts: number,
  baseDelayMs: number,
): Promise<void> {
  this.logger.error(
    `LiteLLM provisioning failed ${JSON.stringify({ status: String(status), body: this.redact(body) })}`,
  );
  if (status >= 500 && attempt < maxAttempts) {
    await this.delay(baseDelayMs * Math.pow(2, attempt - 1));
    return;
  }
  throw new Error(`litellm_provision_failed_${status}`);
}

private async processProvisionFailure(
response: Response,
attempt: number,
maxAttempts: number,
baseDelayMs: number,
): Promise<VirtualKeyState | null> {
const errorText = await this.safeReadText(response);
const conflict = await this.resolveAliasConflict(response.status, errorText);
if (conflict) {
return conflict;
}
await this.handleProvisionNonOk(response.status, errorText, attempt, maxAttempts, baseDelayMs);
return null;
}

/**
 * Detects LiteLLM's "alias already exists" 400 response and recovers by
 * reusing the key previously persisted under this alias.
 *
 * Returns null when the failure is not an alias conflict; throws when a
 * conflict is detected but no persisted key exists to fall back to.
 */
private async resolveAliasConflict(status: number, rawBody: string): Promise<VirtualKeyState | null> {
  if (status !== 400) {
    return null;
  }

  // Prefer the structured error.message when the body is JSON; otherwise
  // match against the raw body text.
  const message = (this.extractErrorMessage(rawBody) ?? rawBody).toLowerCase();
  const isAliasConflict = message.includes('alias') && message.includes('already exists');
  if (!isAliasConflict) {
    return null;
  }

  const persisted = await this.keyStore.load(this.keyAlias);
  if (!persisted?.key) {
    throw new Error('litellm_alias_conflict_without_persisted_key');
  }

  this.logger.warn(
    `LiteLLM alias conflict encountered, reusing persisted key ${JSON.stringify({ alias: this.keyAlias })}`,
  );
  return { key: persisted.key, expiresAt: persisted.expiresAt };
}

/**
 * Best-effort extraction of `error.message` from a JSON error body.
 * Returns undefined for non-JSON bodies or unexpected payload shapes.
 */
private extractErrorMessage(rawBody: string): string | undefined {
  let payload: { error?: { message?: unknown } } | undefined;
  try {
    payload = JSON.parse(rawBody) as { error?: { message?: unknown } } | undefined;
  } catch {
    // Body was not valid JSON — nothing to extract.
    return undefined;
  }
  const message = payload?.error?.message;
  return typeof message === 'string' ? message : undefined;
}

/**
 * Validates a provisioning response payload and converts it into a
 * VirtualKeyState.
 *
 * @throws Error `litellm_provision_invalid_response` when the payload is
 *         missing or its `key` is not a string.
 */
private toVirtualKeyState(data: { key?: unknown; expires?: string } | undefined): VirtualKeyState {
  // Narrowing on data?.key also proves `data` itself is defined below.
  if (typeof data?.key !== 'string') {
    throw new Error('litellm_provision_invalid_response');
  }
  return { key: data.key, expiresAt: this.parseExpiry(data.expires) };
}

private parseExpiry(candidate: string | undefined): Date | null {
Expand Down