AgentMeter · adamhenson · Mar 19, 2026 · Mar 19, 2026 · Mar 19, 2026 · Mar 19, 2026
diff --git a/.github/workflows/agent-review-codex.yml b/.github/workflows/agent-review-codex.yml
@@ -39,7 +39,32 @@ jobs:
           prompt-file: .github/codex/prompts/review.md
           model: ${{ vars.GH_AW_MODEL_AGENT_CODEX || 'gpt-5.4-mini' }}
           sandbox: workspace-write
-          output-file: /tmp/codex-output.md
+          codex-home: /tmp/codex-home
+
+      - name: Extract Codex token usage  # publishes token totals from the rollout JSONL as step outputs
+        id: codex-tokens
+        if: always()  # run regardless of earlier step results
+        run: |
+          rollout=$(find /tmp/codex-home/sessions -name "rollout-*.jsonl" 2>/dev/null -printf "%T@ %p\n" | sort -rn | head -1 | cut -d' ' -f2-)  # newest rollout by mtime; cut drops the timestamp column
+          if [ -z "$rollout" ]; then  # nothing found: publish empty outputs and exit 0
+            echo "No rollout JSONL found — token counts unavailable"
+            echo "input_tokens=" >> "$GITHUB_OUTPUT"
+            echo "output_tokens=" >> "$GITHUB_OUTPUT"
+            echo "cache_read_tokens=" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+          echo "Parsing rollout: $rollout"
+          token_line=$(grep '"token_count"' "$rollout" | tail -1)  # last line mentioning "token_count"
+          if [ -z "$token_line" ]; then  # no such line: publish empty outputs and exit 0
+            echo "No token_count event in rollout — token counts unavailable"
+            echo "input_tokens=" >> "$GITHUB_OUTPUT"
+            echo "output_tokens=" >> "$GITHUB_OUTPUT"
+            echo "cache_read_tokens=" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+          echo "input_tokens=$(echo "$token_line" | jq -r '.payload.info.total_token_usage.input_tokens // empty')" >> "$GITHUB_OUTPUT"  # "// empty" yields an empty value when the field is absent
+          echo "output_tokens=$(echo "$token_line" | jq -r '.payload.info.total_token_usage.output_tokens // empty')" >> "$GITHUB_OUTPUT"
+          echo "cache_read_tokens=$(echo "$token_line" | jq -r '.payload.info.total_token_usage.cached_input_tokens // empty')" >> "$GITHUB_OUTPUT"  # output name differs from the JSON field name
 
       - name: Post review comment
         if: steps.codex.outputs.final-message != ''
@@ -56,45 +81,6 @@ jobs:
         env:
           CODEX_REVIEW: ${{ steps.codex.outputs.final-message }}
 
-      - name: Extract Codex token usage
-        id: extract_tokens
-        if: always()
-        env:
-          CODEX_HOME: /home/runner/.codex
-        run: |
-          echo "=== CODEX_HOME contents ==="
-          find "$CODEX_HOME" -type f 2>/dev/null || echo "(empty or missing)"
-          echo "=== end ==="
-
-          # Try session files in CODEX_HOME/sessions/ first, then ~/.codex/sessions/
-          SESSION_FILE=$(ls -t "$CODEX_HOME"/sessions/*.jsonl 2>/dev/null | head -1 || \
-                         ls -t ~/.codex/sessions/*.jsonl 2>/dev/null | head -1 || true)
-
-          echo "SESSION_FILE=$SESSION_FILE"
-
-          if [ -n "$SESSION_FILE" ] && [ -f "$SESSION_FILE" ]; then
-            echo "=== last 5 lines of session file ==="
-            tail -5 "$SESSION_FILE"
-            echo "=== end ==="
-            TOKEN_LINE=$(grep '"token_count"' "$SESSION_FILE" 2>/dev/null | tail -1 || true)
-            if [ -n "$TOKEN_LINE" ]; then
-              INPUT=$(echo "$TOKEN_LINE" | jq -r '.payload.info.total_token_usage.input_tokens // 0')
-              OUTPUT=$(echo "$TOKEN_LINE" | jq -r '.payload.info.total_token_usage.output_tokens // 0')
-              CACHE_READ=$(echo "$TOKEN_LINE" | jq -r '.payload.info.total_token_usage.cached_input_tokens // 0')
-            else
-              echo "No token_count event found in session file"
-              INPUT=0; OUTPUT=0; CACHE_READ=0
-            fi
-          else
-            echo "No session file found"
-            INPUT=0; OUTPUT=0; CACHE_READ=0
-          fi
-          {
-            echo "input_tokens=$INPUT"
-            echo "output_tokens=$OUTPUT"
-            echo "cache_read_tokens=$CACHE_READ"
-          } >> "$GITHUB_OUTPUT"
-
       - name: Track with AgentMeter
         if: always()
         uses: foo-software/agentmeter-action@main
@@ -104,8 +90,8 @@ jobs:
           engine: codex
           model: ${{ vars.GH_AW_MODEL_AGENT_CODEX || 'gpt-5.4-mini' }}
           status: ${{ job.status == 'success' && 'success' || 'failed' }}
-          input_tokens: ${{ steps.extract_tokens.outputs.input_tokens }}
-          output_tokens: ${{ steps.extract_tokens.outputs.output_tokens }}
-          cache_read_tokens: ${{ steps.extract_tokens.outputs.cache_read_tokens }}
           started_at: ${{ steps.timer.outputs.started_at }}
           post_comment: 'true'
+          input_tokens: ${{ steps.codex-tokens.outputs.input_tokens }}
+          output_tokens: ${{ steps.codex-tokens.outputs.output_tokens }}
+          cache_read_tokens: ${{ steps.codex-tokens.outputs.cache_read_tokens }}
diff --git a/.github/workflows/codex-compat-check.yml b/.github/workflows/codex-compat-check.yml
@@ -0,0 +1,112 @@
+name: "Codex: Rollout JSONL Compatibility Check"
+
+on:
+  schedule:
+    - cron: "0 6 * * *"  # once a day at 06:00 (GitHub evaluates cron schedules in UTC)
+  workflow_dispatch:  # also allow manual runs
+
+jobs:
+  verify:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      issues: write  # needed by the "Open issue on failure" step
+    steps:
+      - name: Run minimal Codex exec  # smallest possible run, just to produce a rollout file
+        id: codex
+        uses: openai/codex-action@v1
+        with:
+          openai-api-key: ${{ secrets.OPENAI_API_KEY }}
+          prompt: "Reply with only the single word: hello"
+          model: gpt-5.4-mini
+          sandbox: read-only
+          codex-home: /tmp/codex-check  # known location that the verify step searches
+
+      - name: Verify rollout JSONL structure  # fails the job when the expected file, event, or field is missing
+        run: |
+          rollout=$(find /tmp/codex-check/sessions -name "rollout-*.jsonl" 2>/dev/null | sort | tail -1)  # last path in sort order
+
+          if [ -z "$rollout" ]; then  # check 1: a rollout file exists
+            echo "::error::No rollout JSONL found — codex may have changed its session file layout"
+            echo "Contents of /tmp/codex-check:"
+            find /tmp/codex-check -type f 2>/dev/null || echo "(empty)"
+            exit 1
+          fi
+
+          echo "Found rollout: $rollout"
+
+          token_line=$(grep '"token_count"' "$rollout" | tail -1)  # last line mentioning "token_count"
+
+          if [ -z "$token_line" ]; then  # check 2: a token_count line exists
+            echo "::error::No token_count event in rollout JSONL — codex may have changed its event format"
+            echo "Rollout file contents (last 20 lines):"
+            tail -20 "$rollout"
+            exit 1
+          fi
+
+          input_tokens=$(echo "$token_line" | jq -r '.payload.info.total_token_usage.input_tokens // empty')
+
+          if [ -z "$input_tokens" ]; then  # check 3: the field is present at the expected JSON path
+            echo "::error::input_tokens field missing from total_token_usage — codex may have changed the token_count schema"
+            echo "token_count event: $token_line"
+            exit 1
+          fi
+
+          if ! [[ "$input_tokens" =~ ^[0-9]+$ ]] || [ "$input_tokens" -eq 0 ]; then  # check 4: all digits and non-zero
+            echo "::error::input_tokens is not a positive integer ($input_tokens) — something unexpected in the rollout"
+            echo "token_count event: $token_line"
+            exit 1
+          fi
+
+          output_tokens=$(echo "$token_line" | jq -r '.payload.info.total_token_usage.output_tokens // empty')  # reported only, not validated
+          echo "✅ Rollout JSONL verified — input_tokens=$input_tokens output_tokens=$output_tokens"
+
+      - name: Open issue on failure  # files one tracking issue labelled codex-compat
+        if: failure()  # only when an earlier step in this job failed
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const date = new Date().toISOString().split('T')[0];  // YYYY-MM-DD (UTC)
+            const title = `⚠️ Codex rollout JSONL compat check failed (${date})`;
+            const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
+
+            const { data: existing } = await github.rest.issues.listForRepo({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              state: 'open',
+              labels: 'codex-compat',
+            });  // open issues that already carry the codex-compat label
+
+            if (existing.length > 0) {
+              console.log(`Open codex-compat issue already exists (#${existing[0].number}), skipping.`);
+              return;  // avoid filing a duplicate while one is still open
+            }
+
+            await github.rest.issues.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              title,
+              labels: ['codex-compat'],
+              body: [
+                '## Codex rollout JSONL compatibility check failed',
+                '',
+                'The nightly check that verifies `openai/codex-action` still writes token counts',
+                'to the rollout JSONL file has failed. This likely means the `@openai/codex` CLI',
+                'changed its internal session file format.',
+                '',
+                '**Impact:** Codex runs tracked by agentmeter-action will show `—` for cost instead',
+                'of a real value until this is fixed.',
+                '',
+                `**Failed run:** ${runUrl}`,
+                '',
+                '## What to check',
+                '',
+                '1. Look at the failed step logs — it will say which assertion failed',
+                '2. Run `codex exec "say hello"` locally without `--ephemeral` (ephemeral runs write no rollout file) and inspect `~/.codex/sessions/`',
+                '3. If the format changed, update:',
+                '   - The `Extract Codex token usage` step in `.github/workflows/agent-review-codex.yml`',
+                '   - `tryExtractFromCodexJsonl()` in `src/token-extractor.ts`',
+                '   - `CodexTokenEvent` in `src/types.ts`',
+                '   - `docs/challenges.md` section 6',
+              ].join('\n'),
+            });
diff --git a/__tests__/comment.test.ts b/__tests__/comment.test.ts
@@ -196,6 +196,58 @@ describe('buildCommentBody', () => {
     expect(updatedBody).toContain('$0.01');
   });
 
+  it('shows newest run first (row #1)', () => {
+    const firstBody = buildCommentBody({
+      apiPricing: testPricing,
+      existingBody: null, // no previous comment body
+      runData: { ...baseRun, workflowName: 'first-run' },
+    });
+    const secondBody = buildCommentBody({
+      apiPricing: testPricing,
+      existingBody: firstBody, // build on top of the first result
+      runData: { ...baseRun, workflowName: 'second-run' },
+    });
+    const rows = secondBody.match(/\| \d+ \| .+? \|/g) ?? []; // matches in document order; each begins with a numeric cell
+    expect(rows[0]).toContain('second-run'); // latest run comes first
+    expect(rows[1]).toContain('first-run');
+  });
+
+  it('shows all runs inline when count is at or below the limit', () => {
+    let body: string | null = null; // null on the first iteration, i.e. no existing comment
+    for (let i = 0; i < 5; i++) { // five runs, each fed the body from the previous one
+      body = buildCommentBody({
+        apiPricing: testPricing,
+        existingBody: body,
+        runData: { ...baseRun, workflowName: `run-${i}` },
+      });
+    }
+    // No "All N runs" collapsible should appear
+    expect(body).not.toContain('All 5 runs');
+    expect(body).not.toContain('All 6 runs');
+  });
+
+  it('shows only 5 most recent runs and adds collapsible when over limit', () => {
+    let body: string | null = null; // null on the first iteration, i.e. no existing comment
+    for (let i = 1; i <= 7; i++) { // seven runs, named run-1 through run-7
+      body = buildCommentBody({
+        apiPricing: testPricing,
+        existingBody: body,
+        runData: { ...baseRun, workflowName: `run-${i}` },
+      });
+    }
+    // Collapsible should exist
+    expect(body).toContain('All 7 runs');
+    // Latest 5 visible in main table (runs 7, 6, 5, 4, 3)
+    const mainTableSection = body!.split('<details>')[0]; // everything before the first <details> tag
+    expect(mainTableSection).toContain('run-7');
+    expect(mainTableSection).toContain('run-3');
+    expect(mainTableSection).not.toContain('run-2');
+    expect(mainTableSection).not.toContain('run-1');
+    // All runs present inside collapsible
+    expect(body).toContain('run-1'); // searched in the whole body; the main section was shown above not to contain it
+    expect(body).toContain('run-2');
+  });
+
   it('appends new run to existing comment and shows total', () => {
     const firstBody = buildCommentBody({
       apiPricing: testPricing,

diff --git a/__tests__/token-extractor.test.ts b/__tests__/token-extractor.test.ts
@@ -130,6 +130,60 @@ describe('extractTokensFromOutput', () => {
     expect(extractTokensFromOutput(jsonlOutput)).toBeNull();
   });
 
+  it('parses codex exec --json turn.completed event', () => {
+    const jsonlOutput = [ // one JSON object per line
+      JSON.stringify({ type: 'thread.started', thread_id: 'abc' }),
+      JSON.stringify({ type: 'turn.started' }),
+      JSON.stringify({
+        type: 'turn.completed',
+        usage: { input_tokens: 24763, cached_input_tokens: 24448, output_tokens: 122 },
+      }),
+    ].join('\n');
+
+    const result = extractTokensFromOutput(jsonlOutput);
+    expect(result).not.toBeNull();
+    expect(result!.tokens.inputTokens).toBe(24763);
+    expect(result!.tokens.outputTokens).toBe(122);
+    expect(result!.tokens.cacheReadTokens).toBe(24448); // taken from cached_input_tokens
+    expect(result!.tokens.cacheWriteTokens).toBe(0); // the event carries no cache-write field
+    expect(result!.isApproximate).toBe(false);
+  });
+
+  it('sums multiple turn.completed events across turns', () => {
+    const jsonlOutput = [ // two turn.completed events, one per line
+      JSON.stringify({
+        type: 'turn.completed',
+        usage: { input_tokens: 1000, cached_input_tokens: 800, output_tokens: 100 },
+      }),
+      JSON.stringify({
+        type: 'turn.completed',
+        usage: { input_tokens: 500, cached_input_tokens: 200, output_tokens: 50 },
+      }),
+    ].join('\n');
+
+    const result = extractTokensFromOutput(jsonlOutput);
+    expect(result!.tokens.inputTokens).toBe(1500); // 1000 + 500
+    expect(result!.tokens.outputTokens).toBe(150); // 100 + 50
+    expect(result!.tokens.cacheReadTokens).toBe(1000); // 800 + 200
+  });
+
+  it('returns null for --json output with no turn.completed events', () => {
+    const jsonlOutput = [ // valid JSONL, but no line has type 'turn.completed'
+      JSON.stringify({ type: 'thread.started', thread_id: 'abc' }),
+      JSON.stringify({ type: 'item.started', item: { type: 'command_execution' } }),
+    ].join('\n');
+    expect(extractTokensFromOutput(jsonlOutput)).toBeNull();
+  });
+
+  it('handles missing usage fields in turn.completed gracefully', () => {
+    const jsonlOutput = JSON.stringify({ type: 'turn.completed', usage: {} }); // usage present but empty
+    const result = extractTokensFromOutput(jsonlOutput);
+    expect(result).not.toBeNull(); // the event still produces a result
+    expect(result!.tokens.inputTokens).toBe(0); // each absent field is expected to read as 0
+    expect(result!.tokens.outputTokens).toBe(0);
+    expect(result!.tokens.cacheReadTokens).toBe(0);
+  });
+
   it('defaults missing cache fields to zero in JSON', () => {
     const output = JSON.stringify({
       usage: { input_tokens: 100, output_tokens: 50 },

diff --git a/docs/challenges.md b/docs/challenges.md
@@ -94,6 +94,48 @@ If the user omits `if: always()` on the AgentMeter step, failed agent runs won't
 
 ---
 
+### 6. Codex token counts rely on an internal rollout file format
+
+`codex exec` (via `openai/codex-action`) does not expose token usage through any documented public API. However, when running without `--ephemeral`, the Codex CLI writes a rollout JSONL file to:
+
+```
+$CODEX_HOME/sessions/YYYY/MM/DD/rollout-<timestamp>-<uuid>.jsonl
+```
+
+Each line is a JSON event. Token totals appear in `token_count` events:
+
+```json
+{
+  "type": "event_msg",
+  "payload": {
+    "type": "token_count",
+    "info": {
+      "total_token_usage": {
+        "input_tokens": 479565,
+        "output_tokens": 7489,
+        "cached_input_tokens": 444416
+      }
+    },
+    "rate_limits": null
+  }
+}
+```
+
+The last `token_count` event in the file contains cumulative totals for the full run.
+
+**How the workflow extracts tokens:**
+
+1. Set `codex-home: /tmp/codex-home` on `openai/codex-action` so the rollout path is known
+2. After the codex step, find the most recently modified rollout file with `find /tmp/codex-home/sessions -name "rollout-*.jsonl" -printf "%T@ %p\n" | sort -rn | head -1 | cut -d' ' -f2-`
+3. Grep for `"token_count"`, take the last line, extract fields with `jq`
+4. Pass `input_tokens`, `output_tokens`, `cache_read_tokens` as explicit inputs to the AgentMeter step
+
+**Stability caveat:** The rollout format is an internal Codex CLI implementation detail, not a versioned public API. A future `@openai/codex` release could rename fields or restructure events. Since `codex-version` in `openai/codex-action` defaults to latest, this could silently break on a CLI upgrade. Failure is graceful — costs show as `—` if the rollout file is missing or unparseable.
+
+**Alternative path (`codex exec --json`):** Running with `--json` writes JSONL to stdout with `turn.completed` events containing a `usage` field. However, `openai/codex-action`'s `final-message` output reads from the output file, not stdout — so the JSONL stream is not accessible from within the action's step outputs. The `tryExtractFromCodexExecJsonl` function in `token-extractor.ts` handles this format for consumers who capture `codex exec --json` stdout directly.
+
+---
+
 ## What works regardless of mode
 
 - The action **never fails the workflow** — all errors are `core.warning()`, not `core.setFailed()`.
@@ -123,4 +165,5 @@ If the user omits `if: always()` on the AgentMeter step, failed agent runs won't
 | Comment posting | ✅ | Upsert by marker, correct PR/issue number |
 | `GITHUB_TOKEN` availability | ✅ | `github_token` input with `default: ${{ github.token }}` |
 | Node.js version | ✅ | node24 |
-| Pricing table | ✅ | Fetched from `/api/models/pricing`; built-in prefix fallback |
+| Pricing table | ✅ | Fetched from `/api/models/pricing`; shows `—` if unreachable |
+| Codex token counts | ✅ with caveat | Parsed from rollout JSONL at `$CODEX_HOME/sessions/YYYY/MM/DD/rollout-*.jsonl`. Works in production. Rollout format is internal (not a public API) — see section 6 above. |