From 6b75c6daef36d45f79a1c815570a612540554ca5 Mon Sep 17 00:00:00 2001
From: Ravi Tharuma <RaviTharuma@users.noreply.github.com>
Date: Fri, 27 Mar 2026 23:23:36 +0100
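Subject: [PATCH 1/5] Avoid blocking startup on corpus embedding rebuild

Server startup and the hot-reload watcher now call prepareSearchIndex()
instead of reindexObservations(). prepareSearchIndex() resets the Orama
index, hydrates it from the loaded observations via hydrateIndex(), and,
when embeddings are enabled, queues the active observation ids in
vectorMissingIds for the background backfill instead of generating
embeddings inline. reindexObservations() is kept for explicit full rebuilds.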

---
 src/memory/observations.ts                |  30 ++++-
 src/server.ts                             |  14 +--
 tests/memory/prepare-search-index.test.ts | 139 ++++++++++++++++++++++
 3 files changed, 174 insertions(+), 9 deletions(-)
 create mode 100644 tests/memory/prepare-search-index.test.ts
diff --git a/src/memory/observations.ts b/src/memory/observations.ts
index e523cda..2037449 100644
--- a/src/memory/observations.ts
+++ b/src/memory/observations.ts
@@ -17,6 +17,8 @@ import {
   generateEmbedding,
   batchGenerateEmbeddings,
   getVectorDimensions,
+  hydrateIndex,
+  isEmbeddingEnabled,
   makeOramaObservationId,
 } from '../store/orama-store.js';
 import { getObservationStore, initObservationStore } from '../store/obs-store.js';
@@ -623,8 +625,8 @@ export function suggestTopicKey(type: string, title: string): string {
 }
 
 /**
- * Reload observations into the Orama index.
- * Called during server startup to restore the search index.
+ * Reload observations into the Orama index with full corpus embeddings.
+ * Intended for explicit heavy rebuilds, not normal MCP startup.
  *
  * Optimization: uses batch embedding (ONNX processes 64 texts at a time)
  * instead of individual embed calls. This reduces startup CPU from minutes
@@ -704,6 +706,30 @@ export async function reindexObservations(): Promise<number> {
   return count;
 }
 
+/**
+ * Prepare the search index for startup and hot-reload without blocking on
+ * corpus-wide embedding generation.
+ *
+ * This hydrates the lexical/BM25 index immediately so MCP availability is not
+ * coupled to embedding provider throughput. Missing vectors are queued for the
+ * existing background backfill cycle.
+ */
+export async function prepareSearchIndex(): Promise<number> {
+  await resetDb();
+  const count = await hydrateIndex(observations as unknown as any[]);
+
+  vectorMissingIds.clear();
+  if (isEmbeddingEnabled()) {
+    for (const obs of observations) {
+      if ((obs.status ?? 'active') === 'active') {
+        vectorMissingIds.add(obs.id);
+      }
+    }
+  }
+
+  return count;
+}
+
 // ── Vector-missing observability & backfill ─────────────────────────
 
 /**
diff --git a/src/server.ts b/src/server.ts
index c7f5cae..8f6b33d 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -20,13 +20,12 @@ import { watchFile } from 'node:fs';
 import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
 import { z } from 'zod';
 import { KnowledgeGraphManager } from './memory/graph.js';
-import { initObservations, storeObservation, reindexObservations, migrateProjectIds, getObservation, getAllObservations } from './memory/observations.js';
+import { initObservations, storeObservation, prepareSearchIndex, migrateProjectIds, getObservation, getAllObservations } from './memory/observations.js';
 import { withFreshIndex } from './memory/freshness.js';
 import { initObservationStore, getObservationStore } from './store/obs-store.js';
 import { initMiniSkillStore } from './store/mini-skill-store.js';
 import { initSessionStore } from './store/session-store.js';
 import { checkProjectAttribution, auditProjectObservations } from './memory/attribution-guard.js';
-import { resetDb } from './store/orama-store.js';
 import { createAutoRelations } from './memory/auto-relations.js';
 import { extractEntities } from './memory/entity-extractor.js';
 import { compactSearch, compactTimeline, compactDetail } from './compact/engine.js';
@@ -279,9 +278,9 @@ export async function createMemorixServer(
     await graphManager.init();
     await initObservations(projectDir);
 
-    const reindexed = await reindexObservations();
-    if (reindexed > 0) {
-      console.error(`[memorix] Reindexed ${reindexed} observations for project: ${project.id}`);
+    const indexed = await prepareSearchIndex();
+    if (indexed > 0) {
+      console.error(`[memorix] Prepared search index for ${indexed} observations in project: ${project.id}`);
     }
 
     const llmConfig = initLLM();
@@ -3523,12 +3522,13 @@ export async function createMemorixServer(
           if (reloading) return;
           reloading = true;
           try {
+<<<<<<< HEAD
             await resetDb();
             await initObservationStore(projectDir);
             await initObservations(projectDir);
-            const count = await reindexObservations();
+            const count = await prepareSearchIndex();
             if (count > 0) {
-              console.error(`[memorix] Hot-reloaded ${count} observations (external write detected)`);
+              console.error(`[memorix] Hot-reloaded search index for ${count} observations (external write detected)`);
             }
           } catch { /* silent */ }
           reloading = false;
diff --git a/tests/memory/prepare-search-index.test.ts b/tests/memory/prepare-search-index.test.ts
new file mode 100644
index 0000000..99bfef2
--- /dev/null
+++ b/tests/memory/prepare-search-index.test.ts
@@ -0,0 +1,139 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+const mockResetDb = vi.fn();
+const mockBatchGenerateEmbeddings = vi.fn();
+const mockHydrateIndex = vi.fn();
+const mockInsertObservation = vi.fn();
+const mockLoadObservationsJson = vi.fn();
+const mockLoadIdCounter = vi.fn();
+const mockIsEmbeddingEnabled = vi.fn();
+
+vi.mock('../../src/store/orama-store.js', () => ({
+  insertObservation: mockInsertObservation,
+  removeObservation: vi.fn(),
+  resetDb: mockResetDb,
+  generateEmbedding: vi.fn(),
+  batchGenerateEmbeddings: mockBatchGenerateEmbeddings,
+  hydrateIndex: mockHydrateIndex,
+  isEmbeddingEnabled: mockIsEmbeddingEnabled,
+  makeOramaObservationId: (projectId: string, observationId: number) => `${projectId}:${observationId}`,
+}));
+
+vi.mock('../../src/store/persistence.js', () => ({
+  saveObservationsJson: vi.fn(),
+  loadObservationsJson: mockLoadObservationsJson,
+  saveIdCounter: vi.fn(),
+  loadIdCounter: mockLoadIdCounter,
+}));
+
+vi.mock('../../src/store/file-lock.js', () => ({
+  withFileLock: async (_dir: string, fn: () => Promise<unknown>) => fn(),
+}));
+
+vi.mock('../../src/compact/token-budget.js', () => ({
+  countTextTokens: () => 0,
+}));
+
+vi.mock('../../src/memory/entity-extractor.js', () => ({
+  extractEntities: () => [],
+  enrichConcepts: (concepts: string[]) => concepts,
+}));
+
+describe('prepareSearchIndex', () => {
+  beforeEach(() => {
+    vi.resetModules();
+    mockResetDb.mockReset();
+    mockBatchGenerateEmbeddings.mockReset();
+    mockHydrateIndex.mockReset();
+    mockInsertObservation.mockReset();
+    mockLoadObservationsJson.mockReset();
+    mockLoadIdCounter.mockReset();
+    mockIsEmbeddingEnabled.mockReset();
+  });
+
+  it('hydrates the lexical index without triggering batch embeddings and queues active docs for backfill', async () => {
+    mockLoadObservationsJson.mockResolvedValue([
+      {
+        id: 1,
+        projectId: 'AVIDS2/memorix',
+        entityName: 'search-layer',
+        type: 'what-changed',
+        title: 'Prepared startup index',
+        narrative: 'Build lexical index first, defer vectors.',
+        facts: ['Startup should not block on embeddings'],
+        filesModified: ['src/server.ts'],
+        concepts: ['startup-index'],
+        tokens: 42,
+        createdAt: '2026-03-18T00:00:00.000Z',
+        status: 'active',
+        source: 'agent',
+      },
+      {
+        id: 2,
+        projectId: 'AVIDS2/memorix',
+        entityName: 'history',
+        type: 'decision',
+        title: 'Resolved old note',
+        narrative: 'Should stay out of the backfill queue.',
+        facts: [],
+        filesModified: [],
+        concepts: ['resolved'],
+        tokens: 12,
+        createdAt: '2026-03-18T00:00:01.000Z',
+        status: 'resolved',
+        source: 'agent',
+      },
+    ]);
+    mockLoadIdCounter.mockResolvedValue(3);
+    mockHydrateIndex.mockResolvedValue(2);
+    mockIsEmbeddingEnabled.mockReturnValue(true);
+
+    const { initObservations, prepareSearchIndex, getVectorMissingIds } = await import('../../src/memory/observations.js');
+
+    await initObservations('E:/tmp/project');
+    const count = await prepareSearchIndex();
+
+    expect(count).toBe(2);
+    expect(mockResetDb).toHaveBeenCalledOnce();
+    expect(mockHydrateIndex).toHaveBeenCalledOnce();
+    expect(mockHydrateIndex).toHaveBeenCalledWith(
+      expect.arrayContaining([
+        expect.objectContaining({ id: 1, title: 'Prepared startup index' }),
+        expect.objectContaining({ id: 2, title: 'Resolved old note' }),
+      ]),
+    );
+    expect(mockBatchGenerateEmbeddings).not.toHaveBeenCalled();
+    expect(getVectorMissingIds()).toEqual([1]);
+  });
+
+  it('leaves the backfill queue empty when vector search is not enabled', async () => {
+    mockLoadObservationsJson.mockResolvedValue([
+      {
+        id: 7,
+        projectId: 'AVIDS2/memorix',
+        entityName: 'fallback',
+        type: 'discovery',
+        title: 'Fulltext only startup',
+        narrative: 'Embedding provider disabled.',
+        facts: [],
+        filesModified: [],
+        concepts: ['bm25'],
+        tokens: 9,
+        createdAt: '2026-03-18T00:00:00.000Z',
+        status: 'active',
+        source: 'agent',
+      },
+    ]);
+    mockLoadIdCounter.mockResolvedValue(8);
+    mockHydrateIndex.mockResolvedValue(1);
+    mockIsEmbeddingEnabled.mockReturnValue(false);
+
+    const { initObservations, prepareSearchIndex, getVectorMissingIds } = await import('../../src/memory/observations.js');
+
+    await initObservations('E:/tmp/project');
+    await prepareSearchIndex();
+
+    expect(mockBatchGenerateEmbeddings).not.toHaveBeenCalled();
+    expect(getVectorMissingIds()).toEqual([]);
+  });
+});

From 810d92e79ac5426d1740177c085dad77043db9e9 Mon Sep 17 00:00:00 2001
From: Ravi Tharuma <RaviTharuma@users.noreply.github.com>
Date: Sun, 29 Mar 2026 23:51:11 +0200
Subject: [PATCH 2/5] fix: hydrate all observations regardless of status

Remove the active-only filter in hydrateIndex() so resolved and
archived observations are indexed at startup. Status filtering
belongs at query time (searchObservations), not index time.

Add 4-case hydrate-index test covering:
- indexes active + resolved + archived observations
- active, resolved, and archived entries are each retrievable by title
  search (the stored status value itself is not asserted)
- skips malformed observations gracefully
- idempotent re-hydration is a no-op
---
 src/store/orama-store.ts          |  1 -
 tests/store/hydrate-index.test.ts | 94 +++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 tests/store/hydrate-index.test.ts

diff --git a/src/store/orama-store.ts b/src/store/orama-store.ts
index 604618f..939b37f 100644
--- a/src/store/orama-store.ts
+++ b/src/store/orama-store.ts
@@ -242,7 +242,6 @@ export async function hydrateIndex(observations: any[]): Promise<number> {
   let inserted = 0;
   for (const obs of observations) {
     if (!obs || !obs.id || !obs.projectId) continue;
-    if ((obs.status ?? 'active') !== 'active') continue;
     try {
       const doc: MemorixDocument = {
         id: makeOramaObservationId(obs.projectId, obs.id),
diff --git a/tests/store/hydrate-index.test.ts b/tests/store/hydrate-index.test.ts
new file mode 100644
index 0000000..c207169
--- /dev/null
+++ b/tests/store/hydrate-index.test.ts
@@ -0,0 +1,94 @@
+import { describe, it, expect, beforeEach } from 'vitest';
+import { resetDb, hydrateIndex, makeOramaObservationId } from '../../src/store/orama-store.js';
+import { count, search } from '@orama/orama';
+
+// Minimal observation shape matching what hydrateIndex expects
+function makeObs(id: number, status: string, title: string) {
+  return {
+    id,
+    projectId: 'test/hydrate-project',
+    entityName: `entity-${id}`,
+    type: 'discovery',
+    title,
+    narrative: `Narrative for observation ${id}`,
+    facts: ['fact-a'],
+    filesModified: [],
+    concepts: ['test'],
+    tokens: 100,
+    createdAt: new Date().toISOString(),
+    accessCount: 0,
+    lastAccessedAt: '',
+    status,
+    source: 'agent',
+  };
+}
+
+describe('hydrateIndex – status handling', () => {
+  beforeEach(async () => {
+    await resetDb();
+  });
+
+  it('indexes active, resolved, AND archived observations', async () => {
+    const observations = [
+      makeObs(1, 'active', 'Active observation'),
+      makeObs(2, 'resolved', 'Resolved observation'),
+      makeObs(3, 'archived', 'Archived observation'),
+    ];
+
+    const inserted = await hydrateIndex(observations);
+    expect(inserted).toBe(3);
+  });
+
+  it('stores the status field faithfully in the index', async () => {
+    const observations = [
+      makeObs(10, 'active', 'Status active entry'),
+      makeObs(11, 'resolved', 'Status resolved entry'),
+      makeObs(12, 'archived', 'Status archived entry'),
+    ];
+
+    await hydrateIndex(observations);
+
+    // Import getDb dynamically to access the raw database for verification
+    const { getDb } = await import('../../src/store/orama-store.js');
+    const db = await getDb();
+
+    // Search by title to confirm an entry of each status was indexed (the status value itself is not asserted)
+    const activeHits = await search(db, { term: 'Status active entry', properties: ['title'] });
+    const resolvedHits = await search(db, { term: 'Status resolved entry', properties: ['title'] });
+    const archivedHits = await search(db, { term: 'Status archived entry', properties: ['title'] });
+
+    expect(activeHits.count).toBeGreaterThanOrEqual(1);
+    expect(resolvedHits.count).toBeGreaterThanOrEqual(1);
+    expect(archivedHits.count).toBeGreaterThanOrEqual(1);
+  });
+
+  it('skips malformed observations without crashing', async () => {
+    const observations = [
+      makeObs(20, 'active', 'Good observation'),
+      null,
+      { id: null, projectId: 'x' },
+      { id: 21 }, // missing projectId
+      makeObs(22, 'resolved', 'Another good one'),
+    ];
+
+    const inserted = await hydrateIndex(observations as any[]);
+    expect(inserted).toBe(2);
+  });
+
+  it('is idempotent – second call is a no-op', async () => {
+    const observations = [
+      makeObs(30, 'active', 'First hydration'),
+      makeObs(31, 'resolved', 'First hydration resolved'),
+    ];
+
+    const first = await hydrateIndex(observations);
+    expect(first).toBe(2);
+
+    // Second call with more observations should return 0 (already hydrated)
+    const second = await hydrateIndex([
+      ...observations,
+      makeObs(32, 'archived', 'Late arrival'),
+    ]);
+    expect(second).toBe(0);
+  });
+});

From 4bae5fd7863ee2bc6fb497e5bff51667462ab0a5 Mon Sep 17 00:00:00 2001
From: Ravi Tharuma <RaviTharuma@users.noreply.github.com>
Date: Wed, 8 Apr 2026 19:17:43 +0200
Subject: [PATCH 3/5] fix(startup): include all observation statuses in
 vectorMissingIds backfill

prepareSearchIndex() was queueing only active observations for post-startup
vector recovery. This left resolved and archived observations permanently
excluded from hybrid/vector search after a restart, creating an asymmetry
between active memories (which regain vector behavior) and non-active ones
(which remain lexical-only indefinitely).

Status filtering is a query-time concern (searchObservations applies it via
the status post-filter), not an index-time concern. The backfill queue must
mirror the full corpus so all statuses remain eligible for hybrid search.
---
 src/memory/observations.ts | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/memory/observations.ts b/src/memory/observations.ts
index 2037449..af51734 100644
--- a/src/memory/observations.ts
+++ b/src/memory/observations.ts
@@ -721,9 +721,10 @@ export async function prepareSearchIndex(): Promise<number> {
   vectorMissingIds.clear();
   if (isEmbeddingEnabled()) {
     for (const obs of observations) {
-      if ((obs.status ?? 'active') === 'active') {
-        vectorMissingIds.add(obs.id);
-      }
+      // Queue ALL statuses for vector backfill — status filtering happens at query time,
+      // not at index time. Omitting non-active observations here would permanently
+      // exclude resolved/archived memories from hybrid search after restart.
+      vectorMissingIds.add(obs.id);
     }
   }
 

From 711969d0c157239edfbb69fd533adb9cc4afbd75 Mon Sep 17 00:00:00 2001
From: Ravi Tharuma <RaviTharuma@users.noreply.github.com>
Date: Wed, 8 Apr 2026 21:16:09 +0200
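Subject: [PATCH 4/5] fix: remove accidental merge conflict marker from
 server.ts

Also restore the `resetDb` import from ./store/orama-store.js. Patch 1/5
dropped it even though the hot-reload watcher still calls resetDb(), which
left src/server.ts referencing a name that was no longer imported.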

---
 src/server.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/server.ts b/src/server.ts
index 8f6b33d..2616c92 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -23,6 +23,7 @@ import { KnowledgeGraphManager } from './memory/graph.js';
 import { initObservations, storeObservation, prepareSearchIndex, migrateProjectIds, getObservation, getAllObservations } from './memory/observations.js';
 import { withFreshIndex } from './memory/freshness.js';
 import { initObservationStore, getObservationStore } from './store/obs-store.js';
+import { resetDb } from './store/orama-store.js';
 import { initMiniSkillStore } from './store/mini-skill-store.js';
 import { initSessionStore } from './store/session-store.js';
 import { checkProjectAttribution, auditProjectObservations } from './memory/attribution-guard.js';
@@ -3522,7 +3523,6 @@ export async function createMemorixServer(
           if (reloading) return;
           reloading = true;
           try {
-<<<<<<< HEAD
             await resetDb();
             await initObservationStore(projectDir);
             await initObservations(projectDir);

From afdbfe206d7fe25ce09d106eb0f5a83b7bd43943 Mon Sep 17 00:00:00 2001
From: Ravi Tharuma <RaviTharuma@users.noreply.github.com>
Date: Wed, 8 Apr 2026 21:20:46 +0200
Subject: [PATCH 5/5] fix(tests): add obs-store mock and correct assertions in
 prepare-search-index test

- Add vi.mock for obs-store.js so initObservations can load mocked data
  without hitting real SQLite/JSON backends during unit tests
- Update getVectorMissingIds assertion from [1] to [1, 2] to match the
  intentional behavior from 4bae5fd which queues all statuses for backfill
- Add 30s timeout to http-embedding-fallback integration test to match
  beforeAll/afterAll timeouts (test was timing out at default 5s)
---
 tests/integration/http-embedding-fallback.test.ts |  2 +-
 tests/memory/prepare-search-index.test.ts         | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/tests/integration/http-embedding-fallback.test.ts b/tests/integration/http-embedding-fallback.test.ts
index f84fbcf..b94cc62 100644
--- a/tests/integration/http-embedding-fallback.test.ts
+++ b/tests/integration/http-embedding-fallback.test.ts
@@ -436,5 +436,5 @@ describe('HTTP embedding fallback regression', () => {
     } finally {
       errorSpy.mockRestore();
     }
-  });
+  }, 30_000);
 });
diff --git a/tests/memory/prepare-search-index.test.ts b/tests/memory/prepare-search-index.test.ts
index 99bfef2..cd7f0cb 100644
--- a/tests/memory/prepare-search-index.test.ts
+++ b/tests/memory/prepare-search-index.test.ts
@@ -26,6 +26,18 @@ vi.mock('../../src/store/persistence.js', () => ({
   loadIdCounter: mockLoadIdCounter,
 }));
 
+vi.mock('../../src/store/obs-store.js', () => ({
+  initObservationStore: vi.fn().mockResolvedValue(undefined),
+  getObservationStore: () => ({
+    loadAll: mockLoadObservationsJson,
+    loadIdCounter: mockLoadIdCounter,
+    ensureFresh: vi.fn().mockResolvedValue(false),
+    close: vi.fn(),
+    getBackendName: () => 'json',
+    getGeneration: () => 0,
+  }),
+}));
+
 vi.mock('../../src/store/file-lock.js', () => ({
   withFileLock: async (_dir: string, fn: () => Promise<unknown>) => fn(),
 }));
@@ -103,7 +115,7 @@ describe('prepareSearchIndex', () => {
       ]),
     );
     expect(mockBatchGenerateEmbeddings).not.toHaveBeenCalled();
-    expect(getVectorMissingIds()).toEqual([1]);
+    expect(getVectorMissingIds()).toEqual([1, 2]);
   });
 
   it('leaves the backfill queue empty when vector search is not enabled', async () => {