Stackbilt-dev · stackbilt-admin · May 29, 2026 · May 29, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,10 +16,10 @@ Groq built-in tools (issue #69), landing across stacked PRs. Additive only.
 - **`RESEARCH` `ModelRecommendationUseCase`** — new use case with `scoreUseCase` weights and a `MODEL_RECOMMENDATIONS.RESEARCH` list; honored by `factory.resolveUseCase()` via `metadata.useCase`. Not inferred from request shape (opt-in only).
 - **Capability-aware built-in-tools routing** — `openai/gpt-oss-120b` is hosted by both Cerebras and Groq; a `builtInTools` request is steered to Groq (the capable host) while plain requests keep the prior default. Resolves the catalog collision via `getProvidersForCatalogModel`.
 - **Groq built-in-tools request fork + boundary gating** — Compound systems send tools on `compound_custom.tools.enabled_tools` (identifiers verbatim); `openai/gpt-oss-120b` sends OpenAI-style `tools: [{ type }]` with `web_search` → `browser_search` translation, merged alongside function tools. Unsupported `(model, tool)` pairs throw `ConfigurationError` naming the capable models.
+- **Groq built-in tool result parsing** — `message.executed_tools[]` is parsed into `LLMResponse.metadata.builtInToolResults` (`Array<{ type, name?, arguments?, results: [{ title, url, content, score }] }>`). Only executions carrying a non-empty `search_results.results` surface; non-search runs (e.g. `code_interpreter`) and searches that return no results are omitted, and the field is absent when no execution qualifies. The model's internal reasoning surfaces on `metadata.reasoning` when present. `GROQ_RESPONSE_SCHEMA` is extended with an optional, shallow `executed_tools` entry (validates `type` only — citation sub-fields are intentionally unguarded so that a citation shape observed in only one sample cannot trigger a false `SchemaDriftError` fallback).
 
 ### Notes
 - Built-in tool surcharges are billed by the provider and are **not** attributed per-call in `TokenUsage`; use `CreditLedger` for accounting.
-- Structured result surfacing (`metadata.builtInToolResults`) for the Groq adapter is being wired in a follow-up PR; the request path and gating are complete.
 
 ## [1.9.0] — 2026-05-22
 

diff --git a/README.md b/README.md
@@ -426,7 +426,13 @@ Notes:
 - **Capability-aware routing.** `openai/gpt-oss-120b` is hosted by both Cerebras and Groq; only Groq runs built-in tools, so a `builtInTools` request is steered to Groq automatically. Plain requests keep the default routing.
 - **Provenance.** The Compound systems are tagged `RESEARCH`-only in the catalog and are not auto-selected for generic use cases — pin the model (or request the `RESEARCH` use case) to use them, since selecting a Compound model can incur per-search surcharges.
 - **Cost.** Built-in tool surcharges (e.g. web search ~$5/1k requests) are billed by the provider and are **not** attributed per-call in `TokenUsage`; track them via `CreditLedger` if needed.
-- **Citations.** Structured search results surface on `LLMResponse.metadata.builtInToolResults` (`{ type, name?, arguments?, results: [{ title, url, content, score }] }`). Result parsing for the Groq adapter is being wired in a follow-up; the request path and gating described here are live today.
+- **Citations.** Structured search results surface on `LLMResponse.metadata.builtInToolResults` — `Array<{ type, name?, arguments?, results: [{ title, url, content, score }] }>`. Only executions that returned at least one search result appear (`code_interpreter` runs, which carry no citations, and searches with zero results are omitted); the field is absent when no execution returned results. Citation sub-fields are passed through as the provider returns them — treat them as best-effort and validate URLs before use.
+- **Reasoning.** When the model exposes its internal reasoning (which reveals the search queries it issued), it surfaces on `LLMResponse.metadata.reasoning` as a string. Absent when the model doesn't emit it.
+
+```typescript
+const citations = res.metadata?.builtInToolResults?.[0]?.results ?? [];
+// → [{ title, url, content, score }, …] for the first search execution; iterate the array to collect the rest
+```
 
 ## Prompt Cache Hints
 

diff --git a/src/__tests__/groq.test.ts b/src/__tests__/groq.test.ts
@@ -588,6 +588,118 @@ describe('GroqProvider', () => {
     });
   });
 
+  describe('built-in tool results (S5)', () => {
+    // Builds a mocked fetch response for `model`; `message` fields are merged into the assistant message.
+    // Wire shape locked from the S0 spike: executed_tools[].search_results = { results: [{title,url,content,score}] }.
+    const searchResponse = (model: string, message: Record<string, unknown>) => ({
+      ok: true,
+      json: async () => ({
+        id: 'chatcmpl-bi-res',
+        object: 'chat.completion',
+        created: 1700000000,
+        model,
+        choices: [{ index: 0, message: { role: 'assistant', content: 'answer', ...message }, finish_reason: 'stop' }], // caller fields override the defaults
+        usage: { prompt_tokens: 30, completion_tokens: 40, total_tokens: 70 }
+      }),
+      headers: new Headers({ 'content-type': 'application/json' })
+    });
+
+    it('flattens compound executed_tools into metadata.builtInToolResults (all four citation fields)', async () => {
+      mockFetch.mockResolvedValueOnce(searchResponse('groq/compound', {
+        reasoning: 'I should search the web.',
+        executed_tools: [{
+          index: 0,
+          type: 'search',
+          arguments: '{"query":"authoritative sources on X"}',
+          search_results: {
+            results: [
+              { title: 'Source A', url: 'https://a.example/x', content: 'snippet A', score: 0.91 },
+              { title: 'Source B', url: 'https://b.example/x', content: 'snippet B', score: 0.84 },
+            ]
+          }
+        }]
+      }));
+
+      const res = await provider.generateResponse({
+        messages: [{ role: 'user', content: 'Find sources.' }],
+        model: 'groq/compound',
+        builtInTools: [{ type: 'web_search' }],
+        maxTokens: 100,
+      });
+
+      const results = res.metadata?.builtInToolResults as Array<Record<string, unknown>>;
+      expect(results).toHaveLength(1);
+      expect(results[0].type).toBe('search');
+      expect(results[0].arguments).toBe('{"query":"authoritative sources on X"}');
+      expect(results[0].name).toBeUndefined(); // compound omits name
+      const citations = results[0].results as Array<Record<string, unknown>>;
+      expect(citations).toHaveLength(2);
+      // Direct assertion on the four citation fields (the binding S5 note).
+      expect(citations[0]).toEqual({ title: 'Source A', url: 'https://a.example/x', content: 'snippet A', score: 0.91 });
+      // reasoning surfaces too
+      expect(res.metadata?.reasoning).toBe('I should search the web.');
+    });
+
+    it('keeps only search executions and preserves gpt-oss name/arguments', async () => {
+      mockFetch.mockResolvedValueOnce(searchResponse('openai/gpt-oss-120b', {
+        executed_tools: [
+          {
+            index: 0,
+            type: 'browser_search',
+            name: 'browser.search',
+            arguments: '{"query":"X"}',
+            search_results: { results: [{ title: 'T', url: 'https://t.example', content: 'c', score: 0.5 }] }
+          },
+          // Non-search execution (no search_results) — dropped by design.
+          { index: 1, type: 'browser.open', name: 'browser.open', arguments: '{"id":1}' },
+        ]
+      }));
+
+      const res = await provider.generateResponse({
+        messages: [{ role: 'user', content: 'Find.' }],
+        model: 'openai/gpt-oss-120b',
+        builtInTools: [{ type: 'web_search' }],
+        maxTokens: 100,
+      });
+
+      const results = res.metadata?.builtInToolResults as Array<Record<string, unknown>>;
+      expect(results).toHaveLength(1);
+      expect(results[0].type).toBe('browser_search');
+      expect(results[0].name).toBe('browser.search');
+    });
+
+    it('omits builtInToolResults when no execution carries results', async () => {
+      mockFetch.mockResolvedValueOnce(searchResponse('groq/compound', {
+        executed_tools: [
+          { index: 0, type: 'code_interpreter', arguments: '{}', output: '42' },
+          { index: 1, type: 'search', search_results: { results: [] } },
+        ]
+      }));
+
+      const res = await provider.generateResponse({
+        messages: [{ role: 'user', content: 'Compute.' }],
+        model: 'groq/compound',
+        builtInTools: [{ type: 'code_interpreter' }],
+        maxTokens: 100,
+      });
+
+      expect(res.metadata?.builtInToolResults).toBeUndefined();
+    });
+
+    it('omits builtInToolResults entirely for a plain response (no executed_tools)', async () => {
+      mockFetch.mockResolvedValueOnce(searchResponse('llama-3.3-70b-versatile', {}));
+
+      const res = await provider.generateResponse({
+        messages: [{ role: 'user', content: 'hi' }],
+        model: 'llama-3.3-70b-versatile',
+        maxTokens: 100,
+      });
+
+      expect(res.metadata?.builtInToolResults).toBeUndefined();
+      expect(res.metadata?.reasoning).toBeUndefined();
+    });
+  });
+
   describe('healthCheck', () => {
     it('should return true when API is healthy', async () => {
       mockFetch.mockResolvedValueOnce({

diff --git a/src/providers/groq.ts b/src/providers/groq.ts
@@ -3,7 +3,7 @@
  * Implementation for Groq fast inference models (OpenAI-compatible API)
  */
 
-import type { LLMRequest, LLMResponse, GroqConfig, ModelCapabilities, ProviderBalance, ToolCall, TokenUsage, BuiltInTool, BuiltInToolType } from '../types.js';
+import type { LLMRequest, LLMResponse, GroqConfig, ModelCapabilities, ProviderBalance, ToolCall, TokenUsage, BuiltInTool, BuiltInToolType, BuiltInToolResult } from '../types.js';
 import { BaseProvider } from './base.js';
 import {
   LLMErrorFactory,
@@ -44,6 +44,25 @@ const GROQ_RESPONSE_SCHEMA: SchemaField[] = [
             },
           },
         },
+        // Built-in tool executions (issue #69 S5). Validated SHALLOW on purpose:
+        // only the always-present `type` is checked. `search_results.results`
+        // sub-fields ({title,url,content,score}) are NOT validated here —
+        // SchemaDriftError routes through the fallback chain, and the fallback
+        // host (Cerebras gpt-oss) doesn't run built-in tools, so a false drift
+        // on a citation sub-field (sampled n=1 in the S0 spike) would silently
+        // degrade a working search response into a tool-less one. The parser
+        // soft-degrades instead; citation-field coverage lives in a parser unit
+        // test (the binding note's accepted alternative to a deep fixture).
+        {
+          path: 'message.executed_tools',
+          type: 'array',
+          optional: true,
+          items: {
+            shape: [
+              { path: 'type', type: 'string' },
+            ],
+          },
+        },
       ],
     },
   },
@@ -106,11 +125,27 @@ interface GroqResponse {
     message: {
       role: string;
       content: string | null;
+      // The model's internal reasoning (exposes built-in search queries).
+      // Present on both compound and gpt-oss when built-in tools run.
+      reasoning?: string;
       tool_calls?: Array<{
         id: string;
         type: 'function';
         function: { name: string; arguments: string };
       }>;
+      // Server-side built-in tool executions (issue #69). Open-ended `type`
+      // (compound: 'search'; gpt-oss: 'browser_search'/'browser.open'/…); only
+      // search executions carry `search_results.results`. Verified live in S0.
+      executed_tools?: Array<{
+        index?: number;
+        type: string;
+        name?: string;
+        arguments?: string;
+        output?: string;
+        search_results?: {
+          results?: Array<{ title: string; url: string; content: string; score: number }>;
+        };
+      }>;
     };
     finish_reason: 'stop' | 'length' | 'content_filter' | 'tool_calls';
   }>;
@@ -613,6 +648,8 @@ export class GroqProvider extends BaseProvider {
       toolCalls = this.validateToolCalls(raw);
     }
 
+    const builtInToolResults = this.extractBuiltInToolResults(choice.message.executed_tools);
+
     return {
       id: data.id,
       message: content,
@@ -625,11 +662,53 @@ export class GroqProvider extends BaseProvider {
       toolCalls,
       metadata: {
         systemFingerprint: data.system_fingerprint,
-        created: data.created
+        created: data.created,
+        // Surface only when present, to keep metadata clean for plain responses.
+        ...(builtInToolResults ? { builtInToolResults } : {}),
+        ...(choice.message.reasoning ? { reasoning: choice.message.reasoning } : {}),
       }
     };
   }
 
+  /**
+   * Map Groq's `message.executed_tools[]` → normalized `BuiltInToolResult[]`
+   * (issue #69, verified live in S0).
+   *
+   * @param executed Raw `executed_tools` array from the response message, if any.
+   * @returns One entry per execution carrying a non-empty `search_results.results`
+   *   (with its `type` and, when defined, `name` / `arguments`), or `undefined`
+   *   when none qualifies. Non-search executions (e.g. `code_interpreter`) drop
+   *   out by design — that's the locked spec, not a bug. Citation fields the
+   *   provider omits surface as `undefined`; the schema doesn't guard them.
+   *   NOTE(review): a null/non-object item in `results` would throw — confirm.
+   */
+  private extractBuiltInToolResults(
+    executed: GroqResponse['choices'][number]['message']['executed_tools']
+  ): BuiltInToolResult[] | undefined {
+    if (!executed || executed.length === 0) return undefined; // no built-in tools ran
+
+    const out: BuiltInToolResult[] = [];
+    for (const exec of executed) {
+      const results = exec.search_results?.results;
+      if (!Array.isArray(results) || results.length === 0) continue; // skip non-search and empty-result executions
+
+      const entry: BuiltInToolResult = {
+        type: exec.type,
+        results: results.map(r => ({ // copy only the four citation fields
+          title: r.title,
+          url: r.url,
+          content: r.content,
+          score: r.score,
+        })),
+      };
+      if (exec.name !== undefined) entry.name = exec.name; // set only when the provider sent it
+      if (exec.arguments !== undefined) entry.arguments = exec.arguments; // kept as the raw string
+      out.push(entry);
+    }
+
+    return out.length > 0 ? out : undefined; // never return an empty array
+  }
+
   private getDefaultModel(request: LLMRequest): string {
     return getProviderDefaultModel('groq', request);
   }