From 6b75c6daef36d45f79a1c815570a612540554ca5 Mon Sep 17 00:00:00 2001 From: Ravi Tharuma Date: Fri, 27 Mar 2026 23:23:36 +0100 Subject: [PATCH 1/5] Avoid blocking startup on corpus embedding rebuild --- src/memory/observations.ts | 30 ++++- src/server.ts | 14 +-- tests/memory/prepare-search-index.test.ts | 139 ++++++++++++++++++++++ 3 files changed, 174 insertions(+), 9 deletions(-) create mode 100644 tests/memory/prepare-search-index.test.ts diff --git a/src/memory/observations.ts b/src/memory/observations.ts index e523cda..2037449 100644 --- a/src/memory/observations.ts +++ b/src/memory/observations.ts @@ -17,6 +17,8 @@ import { generateEmbedding, batchGenerateEmbeddings, getVectorDimensions, + hydrateIndex, + isEmbeddingEnabled, makeOramaObservationId, } from '../store/orama-store.js'; import { getObservationStore, initObservationStore } from '../store/obs-store.js'; @@ -623,8 +625,8 @@ export function suggestTopicKey(type: string, title: string): string { } /** - * Reload observations into the Orama index. - * Called during server startup to restore the search index. + * Reload observations into the Orama index with full corpus embeddings. + * Intended for explicit heavy rebuilds, not normal MCP startup. * * Optimization: uses batch embedding (ONNX processes 64 texts at a time) * instead of individual embed calls. This reduces startup CPU from minutes @@ -704,6 +706,30 @@ export async function reindexObservations(): Promise { return count; } +/** + * Prepare the search index for startup and hot-reload without blocking on + * corpus-wide embedding generation. + * + * This hydrates the lexical/BM25 index immediately so MCP availability is not + * coupled to embedding provider throughput. Missing vectors are queued for the + * existing background backfill cycle. + */ +export async function prepareSearchIndex(): Promise { + await resetDb(); + const count = await hydrateIndex(observations as unknown as any[]); + + vectorMissingIds.clear(); + if (isEmbeddingEnabled()) { + for (const obs of observations) { + if ((obs.status ?? 'active') === 'active') { + vectorMissingIds.add(obs.id); + } + } + } + + return count; +} + // ── Vector-missing observability & backfill ───────────────────────── /** diff --git a/src/server.ts b/src/server.ts index c7f5cae..8f6b33d 100644 --- a/src/server.ts +++ b/src/server.ts @@ -20,13 +20,12 @@ import { watchFile } from 'node:fs'; import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js'; import { z } from 'zod'; import { KnowledgeGraphManager } from './memory/graph.js'; -import { initObservations, storeObservation, reindexObservations, migrateProjectIds, getObservation, getAllObservations } from './memory/observations.js'; +import { initObservations, storeObservation, prepareSearchIndex, migrateProjectIds, getObservation, getAllObservations } from './memory/observations.js'; import { withFreshIndex } from './memory/freshness.js'; import { initObservationStore, getObservationStore } from './store/obs-store.js'; import { initMiniSkillStore } from './store/mini-skill-store.js'; import { initSessionStore } from './store/session-store.js'; import { checkProjectAttribution, auditProjectObservations } from './memory/attribution-guard.js'; -import { resetDb } from './store/orama-store.js'; import { createAutoRelations } from './memory/auto-relations.js'; import { extractEntities } from './memory/entity-extractor.js'; import { compactSearch, compactTimeline, compactDetail } from './compact/engine.js'; @@ -279,9 +278,9 @@ export async function createMemorixServer( await graphManager.init(); await initObservations(projectDir); - const reindexed = await reindexObservations(); - if (reindexed > 0) { - console.error(`[memorix] Reindexed ${reindexed} observations for project: ${project.id}`); + const indexed = await prepareSearchIndex(); + if (indexed > 0) { + console.error(`[memorix] Prepared search index for ${indexed} observations in project: ${project.id}`); } const llmConfig = initLLM(); @@ -3523,12 +3522,13 @@ export async function createMemorixServer( if (reloading) return; reloading = true; try { +<<<<<<< HEAD await resetDb(); await initObservationStore(projectDir); await initObservations(projectDir); - const count = await reindexObservations(); + const count = await prepareSearchIndex(); if (count > 0) { - console.error(`[memorix] Hot-reloaded ${count} observations (external write detected)`); + console.error(`[memorix] Hot-reloaded search index for ${count} observations (external write detected)`); } } catch { /* silent */ } reloading = false; diff --git a/tests/memory/prepare-search-index.test.ts b/tests/memory/prepare-search-index.test.ts new file mode 100644 index 0000000..99bfef2 --- /dev/null +++ b/tests/memory/prepare-search-index.test.ts @@ -0,0 +1,139 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest'; + +const mockResetDb = vi.fn(); +const mockBatchGenerateEmbeddings = vi.fn(); +const mockHydrateIndex = vi.fn(); +const mockInsertObservation = vi.fn(); +const mockLoadObservationsJson = vi.fn(); +const mockLoadIdCounter = vi.fn(); +const mockIsEmbeddingEnabled = vi.fn(); + +vi.mock('../../src/store/orama-store.js', () => ({ + insertObservation: mockInsertObservation, + removeObservation: vi.fn(), + resetDb: mockResetDb, + generateEmbedding: vi.fn(), + batchGenerateEmbeddings: mockBatchGenerateEmbeddings, + hydrateIndex: mockHydrateIndex, + isEmbeddingEnabled: mockIsEmbeddingEnabled, + makeOramaObservationId: (projectId: string, observationId: number) => `${projectId}:${observationId}`, +})); + +vi.mock('../../src/store/persistence.js', () => ({ + saveObservationsJson: vi.fn(), + loadObservationsJson: mockLoadObservationsJson, + saveIdCounter: vi.fn(), + loadIdCounter: mockLoadIdCounter, +})); + +vi.mock('../../src/store/file-lock.js', () => ({ + withFileLock: async (_dir: string, fn: () => Promise) => fn(), +})); + +vi.mock('../../src/compact/token-budget.js', () => ({ + countTextTokens: () => 0, +})); + +vi.mock('../../src/memory/entity-extractor.js', () => ({ + extractEntities: () => [], + enrichConcepts: (concepts: string[]) => concepts, +})); + +describe('prepareSearchIndex', () => { + beforeEach(() => { + vi.resetModules(); + mockResetDb.mockReset(); + mockBatchGenerateEmbeddings.mockReset(); + mockHydrateIndex.mockReset(); + mockInsertObservation.mockReset(); + mockLoadObservationsJson.mockReset(); + mockLoadIdCounter.mockReset(); + mockIsEmbeddingEnabled.mockReset(); + }); + + it('hydrates the lexical index without triggering batch embeddings and queues active docs for backfill', async () => { + mockLoadObservationsJson.mockResolvedValue([ + { + id: 1, + projectId: 'AVIDS2/memorix', + entityName: 'search-layer', + type: 'what-changed', + title: 'Prepared startup index', + narrative: 'Build lexical index first, defer vectors.', + facts: ['Startup should not block on embeddings'], + filesModified: ['src/server.ts'], + concepts: ['startup-index'], + tokens: 42, + createdAt: '2026-03-18T00:00:00.000Z', + status: 'active', + source: 'agent', + }, + { + id: 2, + projectId: 'AVIDS2/memorix', + entityName: 'history', + type: 'decision', + title: 'Resolved old note', + narrative: 'Should stay out of the backfill queue.', + facts: [], + filesModified: [], + concepts: ['resolved'], + tokens: 12, + createdAt: '2026-03-18T00:00:01.000Z', + status: 'resolved', + source: 'agent', + }, + ]); + mockLoadIdCounter.mockResolvedValue(3); + mockHydrateIndex.mockResolvedValue(2); + mockIsEmbeddingEnabled.mockReturnValue(true); + + const { initObservations, prepareSearchIndex, getVectorMissingIds } = await import('../../src/memory/observations.js'); + + await initObservations('E:/tmp/project'); + const count = await prepareSearchIndex(); + + expect(count).toBe(2); + expect(mockResetDb).toHaveBeenCalledOnce(); + expect(mockHydrateIndex).toHaveBeenCalledOnce(); + expect(mockHydrateIndex).toHaveBeenCalledWith( + expect.arrayContaining([ + expect.objectContaining({ id: 1, title: 'Prepared startup index' }), + expect.objectContaining({ id: 2, title: 'Resolved old note' }), + ]), + ); + expect(mockBatchGenerateEmbeddings).not.toHaveBeenCalled(); + expect(getVectorMissingIds()).toEqual([1]); + }); + + it('leaves the backfill queue empty when vector search is not enabled', async () => { + mockLoadObservationsJson.mockResolvedValue([ + { + id: 7, + projectId: 'AVIDS2/memorix', + entityName: 'fallback', + type: 'discovery', + title: 'Fulltext only startup', + narrative: 'Embedding provider disabled.', + facts: [], + filesModified: [], + concepts: ['bm25'], + tokens: 9, + createdAt: '2026-03-18T00:00:00.000Z', + status: 'active', + source: 'agent', + }, + ]); + mockLoadIdCounter.mockResolvedValue(8); + mockHydrateIndex.mockResolvedValue(1); + mockIsEmbeddingEnabled.mockReturnValue(false); + + const { initObservations, prepareSearchIndex, getVectorMissingIds } = await import('../../src/memory/observations.js'); + + await initObservations('E:/tmp/project'); + await prepareSearchIndex(); + + expect(mockBatchGenerateEmbeddings).not.toHaveBeenCalled(); + expect(getVectorMissingIds()).toEqual([]); + }); +}); From 810d92e79ac5426d1740177c085dad77043db9e9 Mon Sep 17 00:00:00 2001 From: Ravi Tharuma Date: Sun, 29 Mar 2026 23:51:11 +0200 Subject: [PATCH 2/5] fix: hydrate all observations regardless of status Remove the active-only filter in hydrateIndex() so resolved and archived observations are indexed at startup. Status filtering belongs at query time (searchObservations), not index time. Add 4-case hydrate-index test covering: - indexes active + resolved + archived observations - stores status field faithfully for per-status queries - skips malformed observations gracefully - idempotent re-hydration is a no-op --- src/store/orama-store.ts | 1 - tests/store/hydrate-index.test.ts | 94 +++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 tests/store/hydrate-index.test.ts diff --git a/src/store/orama-store.ts b/src/store/orama-store.ts index 604618f..939b37f 100644 --- a/src/store/orama-store.ts +++ b/src/store/orama-store.ts @@ -242,7 +242,6 @@ export async function hydrateIndex(observations: any[]): Promise { let inserted = 0; for (const obs of observations) { if (!obs || !obs.id || !obs.projectId) continue; - if ((obs.status ?? 'active') !== 'active') continue; try { const doc: MemorixDocument = { id: makeOramaObservationId(obs.projectId, obs.id), diff --git a/tests/store/hydrate-index.test.ts b/tests/store/hydrate-index.test.ts new file mode 100644 index 0000000..c207169 --- /dev/null +++ b/tests/store/hydrate-index.test.ts @@ -0,0 +1,94 @@ +import { describe, it, expect, beforeEach } from 'vitest'; +import { resetDb, hydrateIndex, makeOramaObservationId } from '../../src/store/orama-store.js'; +import { count, search } from '@orama/orama'; + +// Minimal observation shape matching what hydrateIndex expects +function makeObs(id: number, status: string, title: string) { + return { + id, + projectId: 'test/hydrate-project', + entityName: `entity-${id}`, + type: 'discovery', + title, + narrative: `Narrative for observation ${id}`, + facts: ['fact-a'], + filesModified: [], + concepts: ['test'], + tokens: 100, + createdAt: new Date().toISOString(), + accessCount: 0, + lastAccessedAt: '', + status, + source: 'agent', + }; +} + +describe('hydrateIndex – status handling', () => { + beforeEach(async () => { + await resetDb(); + }); + + it('indexes active, resolved, AND archived observations', async () => { + const observations = [ + makeObs(1, 'active', 'Active observation'), + makeObs(2, 'resolved', 'Resolved observation'), + makeObs(3, 'archived', 'Archived observation'), + ]; + + const inserted = await hydrateIndex(observations); + expect(inserted).toBe(3); + }); + + it('stores the status field faithfully in the index', async () => { + const observations = [ + makeObs(10, 'active', 'Status active entry'), + makeObs(11, 'resolved', 'Status resolved entry'), + makeObs(12, 'archived', 'Status archived entry'), + ]; + + await hydrateIndex(observations); + + // Import getDb dynamically to access the raw database for verification + const { getDb } = await import('../../src/store/orama-store.js'); + const db = await getDb(); + + // Search for each status value to confirm they're indexed + const activeHits = await search(db, { term: 'Status active entry', properties: ['title'] }); + const resolvedHits = await search(db, { term: 'Status resolved entry', properties: ['title'] }); + const archivedHits = await search(db, { term: 'Status archived entry', properties: ['title'] }); + + expect(activeHits.count).toBeGreaterThanOrEqual(1); + expect(resolvedHits.count).toBeGreaterThanOrEqual(1); + expect(archivedHits.count).toBeGreaterThanOrEqual(1); + }); + + it('skips malformed observations without crashing', async () => { + const observations = [ + makeObs(20, 'active', 'Good observation'), + null, + { id: null, projectId: 'x' }, + { id: 21 }, // missing projectId + makeObs(22, 'resolved', 'Another good one'), + ]; + + const inserted = await hydrateIndex(observations as any[]); + expect(inserted).toBe(2); + }); + + it('is idempotent – second call is a no-op', async () => { + const observations = [ + makeObs(30, 'active', 'First hydration'), + makeObs(31, 'resolved', 'First hydration resolved'), + ]; + + const first = await hydrateIndex(observations); + expect(first).toBe(2); + + // Second call with more observations should return 0 (already hydrated) + const second = await hydrateIndex([ + ...observations, + makeObs(32, 'archived', 'Late arrival'), + ]); + expect(second).toBe(0); + }); +}); From 4bae5fd7863ee2bc6fb497e5bff51667462ab0a5 Mon Sep 17 00:00:00 2001 From: Ravi Tharuma Date: Wed, 8 Apr 2026 19:17:43 +0200 Subject: [PATCH 3/5] fix(startup): include all observation statuses in vectorMissingIds backfill prepareSearchIndex() was queueing only active observations for post-startup vector recovery. This left resolved and archived observations permanently excluded from hybrid/vector search after a restart, creating an asymmetry between active memories (which regain vector behavior) and non-active ones (which remain lexical-only indefinitely). Status filtering is a query-time concern (searchObservations applies it via the status post-filter), not an index-time concern. The backfill queue must mirror the full corpus so all statuses remain eligible for hybrid search. --- src/memory/observations.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/memory/observations.ts b/src/memory/observations.ts index 2037449..af51734 100644 --- a/src/memory/observations.ts +++ b/src/memory/observations.ts @@ -721,9 +721,10 @@ export async function prepareSearchIndex(): Promise { vectorMissingIds.clear(); if (isEmbeddingEnabled()) { for (const obs of observations) { - if ((obs.status ?? 'active') === 'active') { - vectorMissingIds.add(obs.id); - } + // Queue ALL statuses for vector backfill — status filtering happens at query time, + // not at index time. Omitting non-active observations here would permanently + // exclude resolved/archived memories from hybrid search after restart. + vectorMissingIds.add(obs.id); } } From 711969d0c157239edfbb69fd533adb9cc4afbd75 Mon Sep 17 00:00:00 2001 From: Ravi Tharuma Date: Wed, 8 Apr 2026 21:16:09 +0200 Subject: [PATCH 4/5] fix: remove accidental merge conflict marker from server.ts --- src/server.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server.ts b/src/server.ts index 8f6b33d..2616c92 100644 --- a/src/server.ts +++ b/src/server.ts @@ -23,6 +23,7 @@ import { KnowledgeGraphManager } from './memory/graph.js'; import { initObservations, storeObservation, prepareSearchIndex, migrateProjectIds, getObservation, getAllObservations } from './memory/observations.js'; import { withFreshIndex } from './memory/freshness.js'; import { initObservationStore, getObservationStore } from './store/obs-store.js'; +import { resetDb } from './store/orama-store.js'; import { initMiniSkillStore } from './store/mini-skill-store.js'; import { initSessionStore } from './store/session-store.js'; import { checkProjectAttribution, auditProjectObservations } from './memory/attribution-guard.js'; @@ -3522,7 +3523,6 @@ export async function createMemorixServer( if (reloading) return; reloading = true; try { -<<<<<<< HEAD await resetDb(); await initObservationStore(projectDir); await initObservations(projectDir); From afdbfe206d7fe25ce09d106eb0f5a83b7bd43943 Mon Sep 17 00:00:00 2001 From: Ravi Tharuma Date: Wed, 8 Apr 2026 21:20:46 +0200 Subject: [PATCH 5/5] fix(tests): add obs-store mock and correct assertions in prepare-search-index test - Add vi.mock for obs-store.js so initObservations can load mocked data without hitting real SQLite/JSON backends during unit tests - Update getVectorMissingIds assertion from [1] to [1, 2] to match the intentional behavior from 4bae5fd which queues all statuses for backfill - Add 30s timeout to http-embedding-fallback integration test to match beforeAll/afterAll timeouts (test was timing out at default 5s) --- tests/integration/http-embedding-fallback.test.ts | 2 +- tests/memory/prepare-search-index.test.ts | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/integration/http-embedding-fallback.test.ts b/tests/integration/http-embedding-fallback.test.ts index f84fbcf..b94cc62 100644 --- a/tests/integration/http-embedding-fallback.test.ts +++ b/tests/integration/http-embedding-fallback.test.ts @@ -436,5 +436,5 @@ describe('HTTP embedding fallback regression', () => { } finally { errorSpy.mockRestore(); } - }); + }, 30_000); }); diff --git a/tests/memory/prepare-search-index.test.ts b/tests/memory/prepare-search-index.test.ts index 99bfef2..cd7f0cb 100644 --- a/tests/memory/prepare-search-index.test.ts +++ b/tests/memory/prepare-search-index.test.ts @@ -26,6 +26,18 @@ vi.mock('../../src/store/persistence.js', () => ({ loadIdCounter: mockLoadIdCounter, })); +vi.mock('../../src/store/obs-store.js', () => ({ + initObservationStore: vi.fn().mockResolvedValue(undefined), + getObservationStore: () => ({ + loadAll: mockLoadObservationsJson, + loadIdCounter: mockLoadIdCounter, + ensureFresh: vi.fn().mockResolvedValue(false), + close: vi.fn(), + getBackendName: () => 'json', + getGeneration: () => 0, + }), +})); + vi.mock('../../src/store/file-lock.js', () => ({ withFileLock: async (_dir: string, fn: () => Promise) => fn(), })); @@ -103,7 +115,7 @@ describe('prepareSearchIndex', () => { ]), ); expect(mockBatchGenerateEmbeddings).not.toHaveBeenCalled(); - expect(getVectorMissingIds()).toEqual([1]); + expect(getVectorMissingIds()).toEqual([1, 2]); }); it('leaves the backfill queue empty when vector search is not enabled', async () => {