Skip to content

Commit 42a32af

Browse files
authored
fix: harden search reliability and indexing hygiene (#22)
* fix: harden search reliability and indexing hygiene
* chore: format indexer for quality checks
* fix: restore accurate stats for no-op incremental indexing
1 parent a6b65f1 commit 42a32af

24 files changed

Lines changed: 1486 additions & 137 deletions

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ dist/
77
*.log
88
.DS_Store
99
.env
10+
opencode.jsonc
11+
nul
1012
.vscode/
1113
*.swp
1214
*.swo

src/constants/codebase-context.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,6 @@ export const CODEBASE_CONTEXT_DIRNAME = '.codebase-context' as const;
66
export const MEMORY_FILENAME = 'memory.json' as const;
77
export const INTELLIGENCE_FILENAME = 'intelligence.json' as const;
88
export const KEYWORD_INDEX_FILENAME = 'index.json' as const;
9+
export const INDEXING_STATS_FILENAME = 'indexing-stats.json' as const;
910
export const VECTOR_DB_DIRNAME = 'index' as const;
1011
export const MANIFEST_FILENAME = 'manifest.json' as const;

src/core/analyzer-registry.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,9 @@ export class AnalyzerRegistry {
7070
const analyzer = this.findAnalyzer(filePath, content);
7171

7272
if (!analyzer) {
73-
console.warn(`No analyzer found for file: ${filePath}`);
73+
if (process.env.CODEBASE_CONTEXT_DEBUG) {
74+
console.error(`[DEBUG] No analyzer found for file: ${filePath}`);
75+
}
7476
return null;
7577
}
7678

src/core/indexer.ts

Lines changed: 73 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import {
3030
import { getFileCommitDates } from '../utils/git-dates.js';
3131
import {
3232
CODEBASE_CONTEXT_DIRNAME,
33+
INDEXING_STATS_FILENAME,
3334
INTELLIGENCE_FILENAME,
3435
KEYWORD_INDEX_FILENAME,
3536
MANIFEST_FILENAME,
@@ -51,6 +52,13 @@ export interface IndexerOptions {
5152
incrementalOnly?: boolean;
5253
}
5354

55+
/**
 * Snapshot of indexing counters that the indexer serialises as JSON to
 * `INDEXING_STATS_FILENAME` inside the context directory, so that a later run
 * can restore the counts from disk instead of recomputing them.
 *
 * - `indexedFiles`, `totalChunks`, `totalFiles`: copied from the run's stats
 *   object when the file is written; a reader only trusts them when all three
 *   are numbers.
 * - `generatedAt`: ISO-8601 timestamp (`new Date().toISOString()`) taken at
 *   the moment the snapshot is written.
 */
interface PersistedIndexingStats {
56+
indexedFiles: number;
57+
totalChunks: number;
58+
totalFiles: number;
59+
generatedAt: string;
60+
}
61+
5462
export class CodebaseIndexer {
5563
private rootPath: string;
5664
private config: CodebaseConfig;
@@ -181,16 +189,18 @@ export class CodebaseIndexer {
181189
// Phase 1b: Incremental diff (if incremental mode)
182190
const contextDir = path.join(this.rootPath, CODEBASE_CONTEXT_DIRNAME);
183191
const manifestPath = path.join(contextDir, MANIFEST_FILENAME);
192+
const indexingStatsPath = path.join(contextDir, INDEXING_STATS_FILENAME);
184193
let diff: ManifestDiff | null = null;
185194
let currentHashes: Record<string, string> | null = null;
195+
let previousManifest: FileManifest | null = null;
186196

187197
if (this.incrementalOnly) {
188198
this.updateProgress('scanning', 10);
189199
console.error('Computing file hashes for incremental diff...');
190200
currentHashes = await computeFileHashes(files, this.rootPath);
191201

192-
const oldManifest = await readManifest(manifestPath);
193-
diff = diffManifest(oldManifest, currentHashes);
202+
previousManifest = await readManifest(manifestPath);
203+
diff = diffManifest(previousManifest, currentHashes);
194204

195205
console.error(
196206
`Incremental diff: ${diff.added.length} added, ${diff.changed.length} changed, ` +
@@ -210,6 +220,52 @@ export class CodebaseIndexer {
210220
this.updateProgress('complete', 100);
211221
stats.duration = Date.now() - startTime;
212222
stats.completedAt = new Date();
223+
224+
let restoredFromPersistedStats = false;
225+
226+
try {
227+
const persisted = JSON.parse(
228+
await fs.readFile(indexingStatsPath, 'utf-8')
229+
) as Partial<PersistedIndexingStats>;
230+
231+
if (
232+
typeof persisted.indexedFiles === 'number' &&
233+
typeof persisted.totalChunks === 'number' &&
234+
typeof persisted.totalFiles === 'number'
235+
) {
236+
stats.indexedFiles = persisted.indexedFiles;
237+
stats.totalChunks = persisted.totalChunks;
238+
stats.totalFiles = persisted.totalFiles;
239+
restoredFromPersistedStats = true;
240+
}
241+
} catch {
242+
// No persisted stats yet — fall back below
243+
}
244+
245+
if (!restoredFromPersistedStats) {
246+
if (previousManifest) {
247+
stats.indexedFiles = Object.keys(previousManifest.files).length;
248+
}
249+
250+
try {
251+
const existingIndexPath = path.join(contextDir, KEYWORD_INDEX_FILENAME);
252+
const existingChunks = JSON.parse(await fs.readFile(existingIndexPath, 'utf-8'));
253+
if (Array.isArray(existingChunks)) {
254+
stats.totalChunks = existingChunks.length;
255+
if (stats.indexedFiles === 0) {
256+
const uniqueFiles = new Set(
257+
existingChunks.map((c: { filePath?: string }) => c.filePath)
258+
);
259+
stats.indexedFiles = uniqueFiles.size;
260+
}
261+
}
262+
} catch {
263+
// Keyword index doesn't exist yet — keep best-known counts
264+
}
265+
}
266+
267+
stats.totalFiles = files.length;
268+
213269
return stats;
214270
}
215271
}
@@ -559,6 +615,14 @@ export class CodebaseIndexer {
559615
};
560616
await writeManifest(manifestPath, manifest);
561617

618+
const persistedStats: PersistedIndexingStats = {
619+
indexedFiles: stats.indexedFiles,
620+
totalChunks: stats.totalChunks,
621+
totalFiles: stats.totalFiles,
622+
generatedAt: new Date().toISOString()
623+
};
624+
await fs.writeFile(indexingStatsPath, JSON.stringify(persistedStats, null, 2));
625+
562626
// Phase 5: Complete
563627
this.updateProgress('complete', 100);
564628

@@ -591,6 +655,7 @@ export class CodebaseIndexer {
591655

592656
private async scanFiles(): Promise<string[]> {
593657
const files: string[] = [];
658+
const seen = new Set<string>();
594659

595660
// Read .gitignore if respecting it
596661
let ig: ReturnType<typeof ignore.default> | null = null;
@@ -617,6 +682,12 @@ export class CodebaseIndexer {
617682
});
618683

619684
for (const file of matches) {
685+
const normalizedFile = file.replace(/\\/g, '/');
686+
if (seen.has(normalizedFile)) {
687+
continue;
688+
}
689+
seen.add(normalizedFile);
690+
620691
const relativePath = path.relative(this.rootPath, file);
621692

622693
// Check gitignore

src/core/search-quality.ts

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
import type { SearchResult } from '../types/index.js';
2+
import { isTestingRelatedQuery } from '../preflight/query-scope.js';
3+
4+
/**
 * Verdict produced by `assessSearchQuality` for one query and its result list.
 *
 * - `status`: `'low_confidence'` when there are no results, when two or more
 *   warning signals fired, or when `confidence` is below 0.35; `'ok'` otherwise.
 * - `confidence`: top result score minus penalties, rounded to two decimals
 *   and clamped to the range 0..1 (0 when there are no results).
 * - `signals`: human-readable descriptions of each warning that fired.
 * - `nextSteps`: suggestions for refining the search; only present when
 *   `status` is `'low_confidence'`.
 */
export interface SearchQualityAssessment {
5+
status: 'ok' | 'low_confidence';
6+
confidence: number;
7+
signals: string[];
8+
nextSteps?: string[];
9+
}
10+
11+
/**
 * Reports whether a file path looks like a test artifact.
 *
 * A path qualifies when, compared case-insensitively and with Windows
 * separators treated as `/`, it either contains `.spec.` or `.test.`, or has
 * a directory segment named `e2e` or `__tests__`.
 *
 * @param filePath - Absolute or relative path, using `/` or `\` separators.
 * @returns `true` when the path matches one of the test-artifact patterns.
 */
export function isTestArtifactPath(filePath: string): boolean {
  // Lower-case and unify separators so one set of checks covers every platform.
  const normalized = filePath.toLowerCase().replace(/\\/g, '/');

  if (normalized.includes('.spec.') || normalized.includes('.test.')) {
    return true;
  }

  // Anchor with a leading slash so a root-relative path such as
  // `e2e/login.ts` or `__tests__/util.ts` matches the same way a nested
  // `src/e2e/login.ts` does; without it the first segment is never detected.
  const anchored = normalized.startsWith('/') ? normalized : `/${normalized}`;
  return anchored.includes('/e2e/') || anchored.includes('/__tests__/');
}
20+
21+
/**
 * Estimates how trustworthy a search result list is for the given query.
 *
 * Only the first three results (fewer when fewer exist) are inspected; the
 * first entry is treated as the best match, so callers are presumably
 * expected to pass results sorted by descending score. Four warning signals
 * are evaluated:
 *
 * - the top score is below 0.3;
 * - the average score of the inspected results is below 0.32;
 * - the gap between the first and second score is below 0.03;
 * - at least two thirds of the inspected results are test artifacts while
 *   the query itself is not about testing.
 *
 * Confidence starts at the top score, is reduced by a fixed penalty for each
 * of the last three signals, rounded to two decimals and clamped to 0..1.
 *
 * @param query - The search query as entered by the caller.
 * @param results - Search results to assess.
 * @returns The assessment; `nextSteps` is included only for low confidence.
 */
export function assessSearchQuality(
  query: string,
  results: SearchResult[]
): SearchQualityAssessment {
  if (results.length === 0) {
    return {
      status: 'low_confidence',
      confidence: 0,
      signals: ['no results returned'],
      nextSteps: [
        'Try a narrower query with one concrete symbol, route, or file hint.',
        'Apply search filters (framework/language/componentType/layer).',
        'Use get_component_usage for dependency or wiring lookups.'
      ]
    };
  }

  const topSlice = results.slice(0, 3);
  const topScore = results[0].score;
  // With a single result there is no runner-up; reuse the top score so the spread is 0.
  const secondScore = results[1]?.score ?? topScore;
  const topAverage = topSlice.reduce((sum, result) => sum + result.score, 0) / topSlice.length;
  const topSeparation = Math.max(0, topScore - secondScore);
  const testCount = topSlice.filter((result) => isTestArtifactPath(result.filePath)).length;
  const testRatio = testCount / topSlice.length;
  const queryIsTesting = isTestingRelatedQuery(query);

  // Evaluate every condition once so the reported signals and the confidence
  // penalties below can never disagree.
  const lowTopScore = topScore < 0.3;
  const weakAverage = topAverage < 0.32;
  const tightSpread = topSlice.length > 1 && topSeparation < 0.03;
  // Integer comparison for "at least two thirds": comparing the float ratio
  // against 0.67 rejects 2 of 3 (0.666…), which made the signal fire only
  // when every inspected result was a test.
  const testsDominate = !queryIsTesting && testCount * 3 >= topSlice.length * 2;

  const signals: string[] = [];
  if (lowTopScore) {
    signals.push(`low top score (${topScore.toFixed(2)})`);
  }
  if (weakAverage) {
    signals.push(`weak top-${topSlice.length} average (${topAverage.toFixed(2)})`);
  }
  if (tightSpread) {
    signals.push(`tight top spread (${topSeparation.toFixed(2)})`);
  }
  if (testsDominate) {
    signals.push(
      `test artifacts dominate top-${topSlice.length} (${Math.round(testRatio * 100)}%)`
    );
  }

  let confidence = topScore;
  if (weakAverage) confidence -= 0.08;
  if (tightSpread) confidence -= 0.05;
  if (testsDominate) confidence -= 0.15;
  // Round first, then clamp, so the reported value is always a two-decimal number in 0..1.
  confidence = Math.max(0, Math.min(1, Number(confidence.toFixed(2))));

  const lowConfidence = signals.length >= 2 || confidence < 0.35;

  return {
    status: lowConfidence ? 'low_confidence' : 'ok',
    confidence,
    signals,
    ...(lowConfidence && {
      nextSteps: [
        'Add one or two concrete symbols, routes, or file hints to the query.',
        'Apply filters (framework/language/componentType/layer) to narrow candidates.',
        'Use get_component_usage when the question is about wiring or usages.'
      ]
    })
  };
}

0 commit comments

Comments
 (0)