From 4876614cbeb3078e02524642d4d3f366a04806e0 Mon Sep 17 00:00:00 2001 From: arzafran Date: Fri, 19 Jun 2026 16:27:40 -0300 Subject: [PATCH] =?UTF-8?q?feat(knowledge):=20TTL-cached=20team-knowledge?= =?UTF-8?q?=20index=20=E2=80=94=20no=20clone=20required?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makes the session-start awareness ping (and a future /knowledge consult) work for anyone with gh auth, not just people who cloned the repo and set KNOWLEDGE_REPO_PATH. Mirrors the codex-verdict cache pattern: - src/lib/knowledge-index.ts: refreshKnowledgeIndex() fetches the corpus listing via gh api at most once per 6h TTL, writes ~/.claude/tmp/knowledge-index.json; readKnowledgeIndex() reads it instantly. Fully fail-open — no gh / network error returns the existing cache, never clobbers good data, never throws. - src/scripts/refresh-knowledge-index.ts: detached entrypoint session-start spawns fire-and-forget (TTL-gated internally, so it only calls gh when stale). - teamKnowledgeAwareness() now prefers the cache (no clone needed); an explicit repoPath still uses the local clone, so it falls back for offline/no-gh users. No second source of truth, no cross-repo token, no vendored index — the cache is per-user, self-refreshing, and never drifts. parseContentsListing + isStale are pure and unit-tested (no network in tests). 
--- src/lib/knowledge-index.ts | 118 +++++++++++++++++++++++++ src/lib/team-knowledge.ts | 67 +++++++++----- src/scripts/refresh-knowledge-index.ts | 9 ++ src/scripts/session-start.ts | 12 +++ tests/knowledge-index.test.ts | 110 +++++++++++++++++++++++ tests/team-knowledge.test.ts | 17 ++++ 6 files changed, 311 insertions(+), 22 deletions(-) create mode 100644 src/lib/knowledge-index.ts create mode 100644 src/scripts/refresh-knowledge-index.ts create mode 100644 tests/knowledge-index.test.ts diff --git a/src/lib/knowledge-index.ts b/src/lib/knowledge-index.ts new file mode 100644 index 0000000..0a6aa14 --- /dev/null +++ b/src/lib/knowledge-index.ts @@ -0,0 +1,118 @@ +// TTL-cached index of the shared team-knowledge corpus fetched from the GitHub +// contents API via `gh api`. The cache lives in ~/.claude/tmp/knowledge-index.json +// and is warmed by refresh-knowledge-index.ts (spawned detached at SessionStart). +// +// Fail-open contract (same as codex.ts): +// - Never throw into a hot path. +// - A network/gh failure returns the existing cache (or null). Never clobbers +// a good cache entry with empty/failed data. +// - `gh` not on PATH → return existing cache silently. +// +// The corpus repo defaults to darkroomengineering/team-knowledge, overridable +// via $KNOWLEDGE_REPO env var. + +import { z } from "zod"; +import { readState, writeState } from "./hook-runtime.ts"; +import { hasCommand } from "./platform.ts"; + +// Generated/meta files in the corpus that are not knowledge notes. Defined here +// (the lower-level lib) so team-knowledge.ts can re-export it without a circular +// dependency. Mirrors the SKIP_FILES set in lint-knowledge.ts. +export const NON_NOTE_FILES = new Set(["README.md", "INDEX.md", "CONTRIBUTING.md"]); + +const KNOWLEDGE_INDEX_TTL_MS = 6 * 60 * 60 * 1000; // 6 hours +const CACHE_FILE = "knowledge-index.json"; +const KNOWLEDGE_REPO = process.env.KNOWLEDGE_REPO ?? 
"darkroomengineering/team-knowledge"; + +// ── Schema ───────────────────────────────────────────────────────────────────── + +export const KnowledgeIndexSchema = z.object({ + notes: z.array(z.string()), + checkedAt: z.string(), +}); + +export type KnowledgeIndex = z.infer<typeof KnowledgeIndexSchema>; + +// ── Pure helpers ─────────────────────────────────────────────────────────────── + +/** Map a GitHub contents-API listing to sorted slugs. + * Keeps entries where `type === "file"`, name ends with `.md`, and name is + * not in NON_NOTE_FILES. Returns name without the `.md` suffix, sorted. */ +export function parseContentsListing(entries: Array<{ name: string; type: string }>): string[] { + return entries + .filter((e) => e.type === "file" && e.name.endsWith(".md") && !NON_NOTE_FILES.has(e.name)) + .map((e) => e.name.slice(0, -".md".length)) + .sort(); +} + +/** True when the cache is missing or older than the TTL. */ +export function isStale(checkedAt: string | undefined): boolean { + if (checkedAt === undefined) return true; + const t = Date.parse(checkedAt); + if (Number.isNaN(t)) return true; + return Date.now() - t > KNOWLEDGE_INDEX_TTL_MS; +} + +// ── Cache I/O ────────────────────────────────────────────────────────────────── + +/** Read the on-disk cache. Returns null on any error or schema mismatch. */ +export async function readKnowledgeIndex(): Promise<KnowledgeIndex | null> { + const raw = await readState(CACHE_FILE, null); + const parsed = KnowledgeIndexSchema.safeParse(raw); + return parsed.success ? parsed.data : null; +} + +// ── Refresh (TTL-gated, fail-open) ───────────────────────────────────────────── + +/** Refresh the knowledge index if stale, otherwise return the cached value. + * Network/gh failures return the existing cache (or null) without overwriting it. */ +export async function refreshKnowledgeIndex(): Promise<KnowledgeIndex | null> { + // 1. Read current cache; return early if still fresh. 
+ const existing = await readKnowledgeIndex(); + if (existing && !isStale(existing.checkedAt)) { + return existing; + } + + // 2. `gh` required for network fetch — bail without clobbering existing cache. + if (!hasCommand("gh")) { + return existing ?? null; + } + + // 3. Fetch via `gh api`. + try { + const proc = Bun.spawn(["gh", "api", `repos/${KNOWLEDGE_REPO}/contents`], { + stdout: "pipe", + stderr: "ignore", + timeout: 10_000, + }); + + const [text, exit] = await Promise.all([new Response(proc.stdout).text(), proc.exited]); + + if (exit !== 0) { + // Non-zero exit — don't clobber good cache. + return existing ?? null; + } + + let rawEntries: unknown; + try { + rawEntries = JSON.parse(text); + } catch { + return existing ?? null; + } + + if (!Array.isArray(rawEntries)) { + return existing ?? null; + } + + const notes = parseContentsListing(rawEntries as Array<{ name: string; type: string }>); + const index: KnowledgeIndex = { + notes, + checkedAt: new Date().toISOString(), + }; + await writeState(CACHE_FILE, index); + return index; + } catch { + // timeout, spawn error, write error — return existing cache (or null). + return existing ?? null; + } +} diff --git a/src/lib/team-knowledge.ts b/src/lib/team-knowledge.ts index aecebb8..c5c5863 100644 --- a/src/lib/team-knowledge.ts +++ b/src/lib/team-knowledge.ts @@ -2,22 +2,25 @@ // counterpart to the /share-learning write path). The corpus was write-only: // /share-learning posts notes, but nothing surfaced them at the moment an agent // would use them, so they rarely got consulted. This emits a one-line ping when -// a local clone is configured and non-empty. +// a local clone is configured and non-empty — or from a TTL-cached remote index +// when no local clone is available. // // Hot-path constraint: this runs inside the SessionStart hook, so it does ZERO -// network I/O (no `gh api`) and is fully fail-open — any error → no output, -// never block session start. 
Live retrieval stays on-demand elsewhere; this only -// makes the agent aware the corpus exists and worth an `rg`. +// blocking network I/O and is fully fail-open — any error → no output, never +// block session start. Network warming happens in a detached background script +// (refresh-knowledge-index.ts) and the result is read from a TTL cache. // -// Opt-in: keyed on KNOWLEDGE_REPO_PATH — the same env var lint:knowledge and -// new-note already use to find a local clone. If it's unset, this is a no-op. +// Priority: +// 1. Explicit repoPath (local clone) — used by tests and $KNOWLEDGE_REPO_PATH. +// 2. TTL cache written by refresh-knowledge-index.ts (no local clone needed). +// 3. No output. import { readdir } from "node:fs/promises"; +import { NON_NOTE_FILES, readKnowledgeIndex } from "./knowledge-index.ts"; -// Generated/meta files in the corpus that are not knowledge notes. Mirrors the -// SKIP_FILES set in lint-knowledge.ts; kept local so the hot path doesn't import -// the linter module. -const NON_NOTE_FILES = new Set(["README.md", "INDEX.md", "CONTRIBUTING.md"]); +// Re-export so external consumers (lint-knowledge, tests) can import from +// team-knowledge.ts as before without knowing the source moved. +export { NON_NOTE_FILES } from "./knowledge-index.ts"; /** Lines to print at session start advertising the shared corpus, or [] when * there's nothing to surface (no clone configured, empty, or unreadable). @@ -25,19 +28,39 @@ const NON_NOTE_FILES = new Set(["README.md", "INDEX.md", "CONTRIBUTING.md"]); export async function teamKnowledgeAwareness( repoPath: string | undefined = process.env.KNOWLEDGE_REPO_PATH, ): Promise<string[]> { - if (!repoPath) return []; + // Branch A: explicit repoPath — use the local clone directly (existing behavior, + // preserves all current tests). Cache is bypassed in this branch. 
+ if (repoPath) { + try { + const entries = await readdir(repoPath); + const notes = entries.filter((n) => n.endsWith(".md") && !NON_NOTE_FILES.has(n)); + if (notes.length === 0) return []; + const label = notes.length === 1 ? "note" : "notes"; + return [ + "", + `team-knowledge: ${notes.length} shared ${label} at ${repoPath}`, + ` consult before architecture / convention / gotcha calls — rg "" "${repoPath}"`, + ]; + } catch { + // missing dir, permission error, etc. — stay silent, never disrupt startup. + return []; + } + } + + // Branch B: no local clone — try the TTL cache written by refresh-knowledge-index.ts. try { - const entries = await readdir(repoPath); - const notes = entries.filter((n) => n.endsWith(".md") && !NON_NOTE_FILES.has(n)); - if (notes.length === 0) return []; - const label = notes.length === 1 ? "note" : "notes"; - return [ - "", - `team-knowledge: ${notes.length} shared ${label} at ${repoPath}`, - ` consult before architecture / convention / gotcha calls — rg "" "${repoPath}"`, - ]; + const index = await readKnowledgeIndex(); + if (index && index.notes.length > 0) { + const count = index.notes.length; + const label = count === 1 ? "note" : "notes"; + return [ + "", + `team-knowledge: ${count} shared ${label} — consult before architecture / convention / gotcha calls`, + ]; + } } catch { - // missing dir, permission error, etc. — stay silent, never disrupt startup. - return []; + // cache unreadable — fall through silently } + + return []; } diff --git a/src/scripts/refresh-knowledge-index.ts b/src/scripts/refresh-knowledge-index.ts new file mode 100644 index 0000000..f2dce0c --- /dev/null +++ b/src/scripts/refresh-knowledge-index.ts @@ -0,0 +1,9 @@ +#!/usr/bin/env bun +// Detached entrypoint for warming the knowledge-index TTL cache. +// Spawned fire-and-forget from session-start.ts (Phase 1 background tasks). 
+// The TTL gate lives in refreshKnowledgeIndex — spawning this every session +// is cheap; it only calls `gh api` when the cache is stale (>6h). + +import { refreshKnowledgeIndex } from "../lib/knowledge-index.ts"; + +await refreshKnowledgeIndex().catch(() => {}); diff --git a/src/scripts/session-start.ts b/src/scripts/session-start.ts index 7efc5df..beb3f1d 100644 --- a/src/scripts/session-start.ts +++ b/src/scripts/session-start.ts @@ -188,6 +188,18 @@ const mcpPrune = Bun.spawn(["bun", join(CLAUDE_DIR, "src", "scripts", "prune-mcp }); mcpPrune.unref?.(); +// Warm the team-knowledge TTL cache in the background. The TTL gate inside +// refreshKnowledgeIndex means this only calls `gh api` when the cache is stale +// (>6h), so spawning it every session is cheap. +const knowledgeRefresh = Bun.spawn( + ["bun", join(CLAUDE_DIR, "src", "scripts", "refresh-knowledge-index.ts")], + { + stdout: "ignore", + stderr: "ignore", + }, +); +knowledgeRefresh.unref?.(); + const logRotations = [ rotateLog(join(CLAUDE_DIR, "sessions.log")), rotateLog(join(CLAUDE_DIR, "hooks.log")), diff --git a/tests/knowledge-index.test.ts b/tests/knowledge-index.test.ts new file mode 100644 index 0000000..285bf5f --- /dev/null +++ b/tests/knowledge-index.test.ts @@ -0,0 +1,110 @@ +// Tests for knowledge-index.ts — covers the two pure exported functions. +// No network, no real cache I/O. 
+ +import { describe, expect, test } from "bun:test"; +import { isStale, parseContentsListing } from "../src/lib/knowledge-index.ts"; + +// ── parseContentsListing ─────────────────────────────────────────────────────── + +describe("parseContentsListing", () => { + test("drops entries with type !== 'file'", () => { + const entries = [ + { name: "notes", type: "dir" }, + { name: "foo.md", type: "file" }, + ]; + expect(parseContentsListing(entries)).toEqual(["foo"]); + }); + + test("drops entries whose name does not end with .md", () => { + const entries = [ + { name: "foo.ts", type: "file" }, + { name: "bar.json", type: "file" }, + { name: "baz.md", type: "file" }, + ]; + expect(parseContentsListing(entries)).toEqual(["baz"]); + }); + + test("drops NON_NOTE_FILES (README.md, INDEX.md, CONTRIBUTING.md)", () => { + const entries = [ + { name: "README.md", type: "file" }, + { name: "INDEX.md", type: "file" }, + { name: "CONTRIBUTING.md", type: "file" }, + { name: "gotcha.md", type: "file" }, + ]; + expect(parseContentsListing(entries)).toEqual(["gotcha"]); + }); + + test("strips .md suffix → returns slug", () => { + const entries = [{ name: "my-note.md", type: "file" }]; + expect(parseContentsListing(entries)).toEqual(["my-note"]); + }); + + test("returns slugs sorted alphabetically", () => { + const entries = [ + { name: "zebra.md", type: "file" }, + { name: "apple.md", type: "file" }, + { name: "mango.md", type: "file" }, + ]; + expect(parseContentsListing(entries)).toEqual(["apple", "mango", "zebra"]); + }); + + test("returns [] for an empty listing", () => { + expect(parseContentsListing([])).toEqual([]); + }); + + test("returns [] when listing has only dirs and non-.md files", () => { + const entries = [ + { name: "README.md", type: "file" }, + { name: "scripts", type: "dir" }, + { name: "config.json", type: "file" }, + ]; + expect(parseContentsListing(entries)).toEqual([]); + }); + + test("mixed realistic listing", () => { + const entries = [ + { name: 
"README.md", type: "file" }, + { name: "INDEX.md", type: "file" }, + { name: "CONTRIBUTING.md", type: "file" }, + { name: "scripts", type: "dir" }, + { name: "deployment.md", type: "file" }, + { name: "auth-patterns.md", type: "file" }, + { name: ".github", type: "dir" }, + ]; + expect(parseContentsListing(entries)).toEqual(["auth-patterns", "deployment"]); + }); +}); + +// ── isStale ──────────────────────────────────────────────────────────────────── + +describe("isStale", () => { + test("undefined → stale", () => { + expect(isStale(undefined)).toBe(true); + }); + + test("garbage string → stale", () => { + expect(isStale("not-a-date")).toBe(true); + }); + + test("epoch (very old) → stale", () => { + expect(isStale(new Date(0).toISOString())).toBe(true); + }); + + test("fresh timestamp (just now) → not stale", () => { + expect(isStale(new Date().toISOString())).toBe(false); + }); + + test("timestamp 5 hours ago → not stale (TTL is 6h)", () => { + const fiveHoursAgo = new Date(Date.now() - 5 * 60 * 60 * 1000).toISOString(); + expect(isStale(fiveHoursAgo)).toBe(false); + }); + + test("timestamp 7 hours ago → stale (TTL is 6h)", () => { + const sevenHoursAgo = new Date(Date.now() - 7 * 60 * 60 * 1000).toISOString(); + expect(isStale(sevenHoursAgo)).toBe(true); + }); + + test("empty string → stale (Date.parse returns NaN)", () => { + expect(isStale("")).toBe(true); + }); +}); diff --git a/tests/team-knowledge.test.ts b/tests/team-knowledge.test.ts index cd814bb..e5525c2 100644 --- a/tests/team-knowledge.test.ts +++ b/tests/team-knowledge.test.ts @@ -39,6 +39,23 @@ describe("teamKnowledgeAwareness — no-op cases", () => { }); }); +describe("teamKnowledgeAwareness — explicit repoPath bypasses cache", () => { + test("explicit repoPath uses local clone, not the TTL cache", async () => { + // Even if KNOWLEDGE_REPO_PATH is unset and the cache is cold, an explicit + // repoPath argument drives the local-clone code path (Branch A). 
+ const dir = await sandbox(); + try { + await writeFile(join(dir, "concept.md"), "# Concept"); + const result = await teamKnowledgeAwareness(dir); + // Output must reference the local path (clone branch), not just a count. + expect(result.join("\n")).toContain(dir); + expect(result.join("\n")).toContain("1 shared note"); + } finally { + await rm(dir, { recursive: true, force: true }); + } + }); +}); + describe("teamKnowledgeAwareness — non-empty corpus", () => { test("1 note → output contains 'shared note' (singular) and the repo path", async () => { const dir = await sandbox();