getagentseal · iamtoruk · Jun 18, 2026 · Jun 18, 2026
diff --git a/src/providers/cursor.ts b/src/providers/cursor.ts
@@ -53,6 +53,9 @@ type BubbleRow = {
   text_length: number | null
   bubble_type: number | null
   code_blocks: Uint8Array | string | null
+  /// Only populated on the paged scan path (BUBBLE_QUERY_PAGE) used for very
+  /// large databases; undefined on the un-paged BUBBLE_QUERY_SINCE path.
+  rid?: number
 }
 
 type AgentKvRow = {
@@ -352,6 +355,29 @@ const BUBBLE_QUERY_SINCE_TAIL = `
 `
 const BUBBLE_QUERY_SINCE = BUBBLE_QUERY_SINCE_HEAD + BUBBLE_QUERY_SINCE_TAIL
 
+// Paged variant for very large DBs: fetches one ROWID-descending page of bubble
+// rows. Bind parameters, in order: exclusive ROWID upper bound, then page size.
+// Also selects ROWID (as rid) and createdAt so the caller can advance its cursor
+// and filter by date in JS — the SQL deliberately carries no date predicate.
+const BUBBLE_QUERY_PAGE = `
+  SELECT
+    key as bubble_key,
+    ROWID as rid,
+    json_extract(value, '$.tokenCount.inputTokens') as input_tokens,
+    json_extract(value, '$.tokenCount.outputTokens') as output_tokens,
+    json_extract(value, '$.modelInfo.modelName') as model,
+    json_extract(value, '$.createdAt') as created_at,
+    json_extract(value, '$.conversationId') as conversation_id,
+    CAST(substr(json_extract(value, '$.text'), 1, 500) AS BLOB) as user_text,
+    length(json_extract(value, '$.text')) as text_length,
+    json_extract(value, '$.type') as bubble_type,
+    CAST(json_extract(value, '$.codeBlocks') AS BLOB) as code_blocks
+  FROM cursorDiskKV
+  WHERE key LIKE 'bubbleId:%' AND ROWID < ?
+  ORDER BY ROWID DESC
+  LIMIT ?
+`
+
 function validateSchema(db: SqliteDatabase): boolean {
   try {
     const rows = db.query<{ cnt: number }>(
@@ -400,6 +426,50 @@ function takeUserMessage(queues: Map<string, UserMessageQueue>, conversationId:
   return msg
 }
 
+/// Scans bubbles for very large DBs by paging ROWID-descending (newest first)
+/// and keeping only rows inside the window (createdAt > timeFloor). Paging
+/// ends when the table is exhausted, a query throws (earlier pages are still
+/// returned), or a full page ends at or below the floor. `budget` caps the
+/// in-range bubbles collected; `truncated` is set only when an in-range bubble
+/// is actually dropped because the budget is full, so out-of-range rows seen
+/// after the budget fills never trigger it. Rows come back ROWID-ascending.
+function scanBubblesPaged(
+  db: SqliteDatabase,
+  timeFloor: string,
+  budget: number,
+): { rows: BubbleRow[]; truncated: boolean } {
+  const BATCH = 25_000
+  const collected: BubbleRow[] = []
+  let beforeRowId = Number.MAX_SAFE_INTEGER
+  let truncated = false
+
+  paging: while (true) {
+    let batch: BubbleRow[]
+    try {
+      batch = db.query<BubbleRow>(BUBBLE_QUERY_PAGE, [beforeRowId, BATCH])
+    } catch {
+      break // best-effort: keep whatever earlier pages produced
+    }
+    const oldest = batch[batch.length - 1]
+    if (oldest === undefined) break // empty page: nothing left below the cursor
+    for (const row of batch) {
+      // Check the window first so only in-range rows count against the budget.
+      if (row.created_at == null || !(row.created_at > timeFloor)) continue
+      if (collected.length >= budget) { truncated = true; break paging }
+      collected.push(row)
+    }
+
+    beforeRowId = oldest.rid ?? 0
+    if (beforeRowId <= 0 || batch.length < BATCH) break // no cursor, or table exhausted
+    // Assumes ROWID order roughly tracks createdAt: once a full page ends at or
+    // below the floor, older pages are treated as out of range and skipped.
+    if (oldest.created_at != null && oldest.created_at <= timeFloor) break
+  }
+
+  collected.sort((a, b) => (a.rid ?? 0) - (b.rid ?? 0)) // restore ROWID-ascending
+  return { rows: collected, truncated }
+}
+
 function parseBubbles(
   db: SqliteDatabase,
   seenKeys: Set<string>,
@@ -408,53 +478,42 @@ function parseBubbles(
   const results: ParsedProviderCall[] = []
   let skipped = 0
 
-  // Hard cap on rows to scan. The BUBBLE_QUERY_SINCE filter relies on
-  // json_extract over the value BLOB, which SQLite cannot serve from an
-  // index — every row is JSON-decoded. Multi-GB Cursor DBs (power users,
-  // years of usage) regularly exceed 500k bubble rows and were producing
-  // 30s+ parse stalls. Compute a ROWID cutoff that limits the scan to the
-  // MAX_BUBBLES most-recent bubbles when the user is over the cap, and
-  // warn so they know older sessions may be missing.
-  const MAX_BUBBLES = 250_000
-  let rowIdCutoff = 0
+  // The bubble timestamp lives inside the JSON value (no index), so the date
+  // filter forces a full JSON decode per row. Multi-GB Cursor DBs (500k+
+  // bubbles) were producing 30s+ parse stalls, so the scan is bounded. The old
+  // approach kept only the most-recent MAX_BUBBLES by ROWID, which dropped
+  // in-range older sessions and warned even when the requested window fit
+  // comfortably. Instead, for large DBs we page the requested window
+  // (ROWID-descending, stopping past the window floor) and only fall back to a
+  // hard budget — warning — when the in-range scan genuinely exceeds it.
+  // Override the budget in tests via CODEBURN_CURSOR_MAX_BUBBLES.
+  const MAX_BUBBLES = Number(process.env['CODEBURN_CURSOR_MAX_BUBBLES']) || 250_000
+
+  let total = 0
   try {
     const countRows = db.query<{ cnt: number }>(
       "SELECT COUNT(*) as cnt FROM cursorDiskKV WHERE key LIKE 'bubbleId:%'"
     )
-    const total = countRows[0]?.cnt ?? 0
-    if (total > MAX_BUBBLES) {
-      // Find the ROWID of the (MAX_BUBBLES)th most-recent bubble. Anything
-      // below this rowid is older and gets skipped. Bubbles are written
-      // chronologically so ROWID order ≈ insertion order.
-      const cutoffRows = db.query<{ rid: number }>(
-        `SELECT MIN(rid) as rid FROM (
-           SELECT ROWID as rid FROM cursorDiskKV
-           WHERE key LIKE 'bubbleId:%'
-           ORDER BY ROWID DESC
-           LIMIT ?
-         )`,
-        [MAX_BUBBLES]
-      )
-      rowIdCutoff = cutoffRows[0]?.rid ?? 0
-      process.stderr.write(
-        `codeburn: Cursor database has ${total.toLocaleString()} bubbles, ` +
-        `scanning the most recent ${MAX_BUBBLES.toLocaleString()}. ` +
-        `Older sessions may be missing from this report.\n`
-      )
-    }
-  } catch { /* best-effort diagnostic */ }
+    total = countRows[0]?.cnt ?? 0
+  } catch { /* best-effort */ }
 
   const userMessages = buildUserMessageMap(db, timeFloor)
 
-  // Append the rowid cutoff when active. Empty string when not capped so the
-  // query string compares identically to the un-capped version on small DBs.
-  const rowIdFilter = rowIdCutoff > 0 ? ' AND ROWID >= ?' : ''
-  const params: unknown[] = rowIdCutoff > 0 ? [timeFloor, rowIdCutoff] : [timeFloor]
-  const cappedQuery = BUBBLE_QUERY_SINCE_HEAD + rowIdFilter + BUBBLE_QUERY_SINCE_TAIL
-
   let rows: BubbleRow[]
   try {
-    rows = db.query<BubbleRow>(cappedQuery, params)
+    if (total > MAX_BUBBLES) {
+      const scan = scanBubblesPaged(db, timeFloor, MAX_BUBBLES)
+      rows = scan.rows
+      if (scan.truncated) {
+        process.stderr.write(
+          `codeburn: Cursor database has ${total.toLocaleString()} bubbles and the ` +
+          `requested range exceeds the ${MAX_BUBBLES.toLocaleString()}-bubble scan budget; ` +
+          `the oldest sessions in range may be missing from this report.\n`
+        )
+      }
+    } else {
+      rows = db.query<BubbleRow>(BUBBLE_QUERY_SINCE, [timeFloor])
+    }
   } catch {
     return { calls: results }
   }

diff --git a/tests/providers/cursor-large-db-cap.test.ts b/tests/providers/cursor-large-db-cap.test.ts
@@ -0,0 +1,133 @@
+import { describe, it, expect, beforeEach, afterEach } from 'vitest'
+import { mkdtemp, rm, writeFile } from 'fs/promises'
+import { tmpdir } from 'os'
+import { join } from 'path'
+
+import { isSqliteAvailable } from '../../src/sqlite.js'
+import { getAllProviders } from '../../src/providers/index.js'
+import type { Provider, ParsedProviderCall } from '../../src/providers/types.js'
+import type { DateRange } from '../../src/types.js'
+
+/// Regression for #482: the Cursor scan must not drop in-range sessions just
+/// because the DB has more bubbles than the scan budget. The old code kept the
+/// most-recent MAX_BUBBLES rows *by ROWID* and warned unconditionally; the new
+/// code pages the requested time window and only truncates (with a warning)
+/// when the in-range scan genuinely exceeds the budget. We shrink the budget
+/// via CODEBURN_CURSOR_MAX_BUBBLES so a tiny fixture exercises the capped path.
+
+const skipReason = isSqliteAvailable() ? null : 'node:sqlite not available — needs Node 22+; skipping'
+
+let tmpDir: string // scratch directory holding the current test's fixture DB
+let savedBudget: string | undefined // CODEBURN_CURSOR_MAX_BUBBLES as found before the test
+
+beforeEach(async () => {
+  tmpDir = await mkdtemp(join(tmpdir(), 'cursor-cap-'))
+  savedBudget = process.env['CODEBURN_CURSOR_MAX_BUBBLES']
+})
+
+afterEach(async () => {
+  if (savedBudget !== undefined) process.env['CODEBURN_CURSOR_MAX_BUBBLES'] = savedBudget
+  else delete process.env['CODEBURN_CURSOR_MAX_BUBBLES'] // was unset before the test: unset it again
+  await rm(tmpDir, { recursive: true, force: true })
+})
+
+type Bubble = { conversationId: string; createdAt: string; model: string; tokens: number }
+
+/// Builds a throwaway Cursor state DB under tmpDir and returns its path. Bubbles
+/// are inserted in array order, so ROWID follows array index.
+async function createDb(bubbles: Bubble[]): Promise<string> {
+  const dbPath = join(tmpDir, 'state.vscdb')
+  await writeFile(dbPath, '')
+  type Db = {
+    exec(sql: string): void
+    prepare(sql: string): { run(...p: unknown[]): unknown }
+    close(): void
+  }
+  const { createRequire } = await import('node:module')
+  const { DatabaseSync } = createRequire(import.meta.url)('node:sqlite') as { DatabaseSync: new (path: string) => Db }
+  const db = new DatabaseSync(dbPath)
+  try {
+    db.exec('CREATE TABLE cursorDiskKV (key TEXT PRIMARY KEY, value TEXT)')
+    const insert = db.prepare('INSERT INTO cursorDiskKV (key, value) VALUES (?, ?)')
+    bubbles.forEach((b, i) => {
+      const value = {
+        type: 2,
+        conversationId: b.conversationId,
+        text: 'def hello(): pass',
+        tokenCount: { inputTokens: b.tokens, outputTokens: b.tokens },
+        createdAt: b.createdAt,
+        modelInfo: { modelName: b.model },
+      }
+      insert.run(`bubbleId:${b.conversationId}:bubble-${i}`, JSON.stringify(value))
+    })
+  } finally {
+    db.close() // always release the handle, even when table setup or an insert throws
+  }
+  return dbPath
+}
+
+/// Resolves the registered 'cursor' provider; rejects when none is registered.
+async function getCursorProvider(): Promise<Provider> {
+  const cursor = (await getAllProviders()).find(candidate => candidate.name === 'cursor')
+  return cursor ?? Promise.reject(new Error('cursor provider not registered'))
+}
+
+/// Runs the cursor provider's session parser over `dbPath` for `range`, passing
+/// an empty string Set, and drains the async stream of calls into an array.
+async function parse(dbPath: string, range: DateRange): Promise<ParsedProviderCall[]> {
+  const source = { path: dbPath, project: 'test', provider: 'cursor' }
+  const parser = (await getCursorProvider()).createSessionParser(source, new Set<string>(), range)
+  const collected: ParsedProviderCall[] = []
+  for await (const call of parser.parse()) collected.push(call)
+  return collected
+}
+
+const iso = (daysAgo: number) => { const ms = daysAgo * 86_400_000; return new Date(Date.now() - ms).toISOString() } // ISO timestamp `daysAgo` days before now
+const last30Days = (): DateRange => { const start = new Date(Date.now() - 30 * 86_400_000); return { start, end: new Date() } }
+const last120Days = (): DateRange => { const start = new Date(Date.now() - 120 * 86_400_000); return { start, end: new Date() } }
+
+describe.skipIf(skipReason !== null)('cursor large-DB scan cap (#482)', () => {
+  it('keeps in-range sessions even when they have low ROWIDs and the DB is over budget', async () => {
+    // In-range bubbles are inserted FIRST (low ROWID) and out-of-range bubbles
+    // LATER (high ROWID). The old "most-recent N by ROWID" cap would scan only
+    // the high-ROWID out-of-range rows and drop the in-range ones entirely.
+    const dbPath = await createDb([
+      { conversationId: 'recent-A', createdAt: iso(1), model: 'gpt-5', tokens: 100 }, // in range, inserted 1st
+      { conversationId: 'recent-B', createdAt: iso(2), model: 'gpt-5', tokens: 100 }, // in range, inserted 2nd
+      { conversationId: 'old-C', createdAt: iso(300), model: 'gpt-5', tokens: 100 }, // out of range, inserted 3rd
+      { conversationId: 'old-D', createdAt: iso(300), model: 'gpt-5', tokens: 100 }, // out of range, inserted 4th
+    ])
+    process.env['CODEBURN_CURSOR_MAX_BUBBLES'] = '2' // total 4 > budget 2 -> paged (capped) path
+
+    const calls = await parse(dbPath, last30Days())
+    // Both in-range sessions are present (the old ROWID cap returned 0 here).
+    expect(calls.length).toBe(2)
+  })
+
+  it('returns the whole window when in-range bubbles fit the budget (over-budget DB)', async () => {
+    const dbPath = await createDb([
+      { conversationId: 'A', createdAt: iso(1), model: 'gpt-5', tokens: 100 }, // in range
+      { conversationId: 'B', createdAt: iso(2), model: 'gpt-5', tokens: 100 }, // in range
+      { conversationId: 'old', createdAt: iso(300), model: 'gpt-5', tokens: 100 }, // out of range
+      { conversationId: 'older', createdAt: iso(301), model: 'gpt-5', tokens: 100 }, // out of range
+    ])
+    process.env['CODEBURN_CURSOR_MAX_BUBBLES'] = '3' // total 4 > budget 3, but in-range 2 <= 3
+    const calls = await parse(dbPath, last30Days())
+    expect(calls.length).toBe(2) // both in-range rows returned; the stderr warning itself is not asserted
+  })
+
+  it('truncates to the budget and keeps the newest in-range bubbles when over budget', async () => {
+    // Four bubbles, all inside the 120-day window, oldest->newest by ROWID; budget 2 keeps the two newest.
+    const dbPath = await createDb([
+      { conversationId: 'd1', createdAt: iso(40), model: 'old-model', tokens: 100 },
+      { conversationId: 'd2', createdAt: iso(30), model: 'old-model', tokens: 100 },
+      { conversationId: 'd3', createdAt: iso(2), model: 'new-model', tokens: 100 },
+      { conversationId: 'd4', createdAt: iso(1), model: 'new-model', tokens: 100 },
+    ])
+    process.env['CODEBURN_CURSOR_MAX_BUBBLES'] = '2' // total 4 > budget 2, and in-range 4 > 2
+    const calls = await parse(dbPath, last120Days())
+    expect(calls.length).toBe(2)
+    // The budget keeps the highest-ROWID (last-inserted) bubbles, tagged 'new-model' here.
+    expect(calls.every(c => c.model === 'new-model')).toBe(true)
+  })
+})