From 6e1369a5569a574438a558154c05dd872fb3ac20 Mon Sep 17 00:00:00 2001 From: Ravi Tharuma Date: Sun, 29 Mar 2026 01:11:12 +0100 Subject: [PATCH 1/3] feat: audio ingestion via Whisper API (memorix-ek1) --- src/multimodal/audio-loader.ts | 145 +++++++++++++++++++++ src/multimodal/index.ts | 12 ++ src/server.ts | 45 +++++++ tests/multimodal/audio-loader.test.ts | 173 ++++++++++++++++++++++++++ 4 files changed, 375 insertions(+) create mode 100644 src/multimodal/audio-loader.ts create mode 100644 src/multimodal/index.ts create mode 100644 tests/multimodal/audio-loader.test.ts diff --git a/src/multimodal/audio-loader.ts b/src/multimodal/audio-loader.ts new file mode 100644 index 0000000..0c5f1ba --- /dev/null +++ b/src/multimodal/audio-loader.ts @@ -0,0 +1,145 @@ +/** + * Audio Loader — Whisper API Integration + * + * Transcribes audio files via OpenAI Whisper or Groq Whisper API, + * then stores the transcript as a Memorix observation. + * + * Supports: mp3, wav, m4a, webm, mp4, ogg, flac + * Providers: OpenAI (whisper-1), Groq (whisper-large-v3) + */ + +import { getLLMApiKey } from '../config.js'; + +// ── Types ──────────────────────────────────────────────────────────── + +export interface AudioInput { + /** Base64-encoded audio data */ + base64: string; + /** Audio MIME type (default: audio/mp3) */ + mimeType?: string; + /** Original filename */ + filename?: string; + /** ISO language code for transcription hint */ + language?: string; + /** Whisper provider: openai or groq */ + provider?: 'openai' | 'groq'; +} + +export interface TranscriptionResult { + /** Transcribed text */ + text: string; + /** Audio duration in seconds */ + duration?: number; + /** Detected language */ + language?: string; + /** Provider used */ + provider: string; +} + +// ── Provider Config ────────────────────────────────────────────────── + +const PROVIDERS = { + openai: { + baseUrl: 'https://api.openai.com/v1', + model: 'whisper-1', + }, + groq: { + baseUrl: 'https://api.groq.com/openai/v1', + model: 'whisper-large-v3', + }, +} as const; + +// ── Core Functions ─────────────────────────────────────────────────── + +/** + * Transcribe audio via Whisper API. + * + * @throws Error if no API key configured or API returns error. + */ +export async function transcribeAudio(input: AudioInput): Promise { + const apiKey = getLLMApiKey(); + if (!apiKey) { + throw new Error( + 'No API key configured for audio transcription. ' + + 'Set MEMORIX_LLM_API_KEY, MEMORIX_API_KEY, or OPENAI_API_KEY.', + ); + } + + const providerName = input.provider + ?? (process.env.MEMORIX_AUDIO_PROVIDER as 'openai' | 'groq' | undefined) + ?? 'openai'; + const config = PROVIDERS[providerName] ?? PROVIDERS.openai; + + // Build multipart form + const audioBuffer = Buffer.from(input.base64, 'base64'); + const blob = new Blob([audioBuffer], { type: input.mimeType ?? 'audio/mp3' }); + const form = new FormData(); + form.append('file', blob, input.filename ?? 'audio.mp3'); + form.append('model', config.model); + form.append('response_format', 'json'); + if (input.language) { + form.append('language', input.language); + } + + const response = await fetch(`${config.baseUrl}/audio/transcriptions`, { + method: 'POST', + headers: { 'Authorization': `Bearer ${apiKey}` }, + body: form, + signal: AbortSignal.timeout(120_000), // 2 min timeout for large files + }); + + if (!response.ok) { + const errorText = await response.text().catch(() => 'unknown error'); + throw new Error(`Whisper API error (${response.status}): ${errorText}`); + } + + const data = await response.json() as { + text: string; + duration?: number; + language?: string; + }; + + return { + text: data.text, + duration: data.duration, + language: data.language, + provider: providerName, + }; +} + +/** + * Transcribe audio and store as a Memorix observation. + */ +export async function ingestAudio( + input: AudioInput, + storeFn: (obs: { + entityName: string; + type: string; + title: string; + narrative: string; + concepts: string[]; + projectId: string; + }) => Promise<{ observation: { id: number }; upserted: boolean }>, + projectId: string, +): Promise<{ observationId: number; text: string; duration?: number }> { + const result = await transcribeAudio(input); + + const entityName = input.filename + ? input.filename.replace(/\.[^.]+$/, '') + : `audio-${Date.now()}`; + + const { observation } = await storeFn({ + entityName, + type: 'discovery', + title: `Audio transcript: ${entityName}`, + narrative: result.text, + concepts: ['audio', 'transcript', ...(result.language ? [result.language] : [])], + projectId, + }); + + return { + observationId: observation.id, + text: result.text, + duration: result.duration, + }; +} diff --git a/src/multimodal/index.ts b/src/multimodal/index.ts new file mode 100644 index 0000000..823b7fc --- /dev/null +++ b/src/multimodal/index.ts @@ -0,0 +1,12 @@ +/** + * Multimodal Ingestion — Unified Entry Point + * + * Re-exports all multimodal loaders for convenient access. + */ + +export { + transcribeAudio, + ingestAudio, + type AudioInput, + type TranscriptionResult, +} from './audio-loader.js'; diff --git a/src/server.ts b/src/server.ts index 69162fd..8b78565 100644 --- a/src/server.ts +++ b/src/server.ts @@ -3055,6 +3055,51 @@ export async function createMemorixServer( }, ); + // ── Multimodal Ingestion Tools ───────────────────────────────────── + + server.registerTool( + 'memorix_ingest_audio', + { + title: 'Ingest Audio', + description: + 'Transcribe audio via Whisper API (OpenAI or Groq) and store the transcript as a memory observation. ' + + 'Supports mp3, wav, m4a, webm, ogg, flac formats.', + inputSchema: { + base64: z.string().describe('Base64-encoded audio data'), + mimeType: z.string().optional().describe('Audio MIME type (e.g. audio/mp3)'), + filename: z.string().optional().describe('Original filename'), + language: z.string().optional().describe('ISO language code for transcription hint'), + provider: z.enum(['openai', 'groq']).optional().describe('Whisper provider (default: openai)'), + }, + }, + async (args) => { + try { + const { ingestAudio } = await import('./multimodal/index.js'); + markInternalWrite(); + const result = await ingestAudio( + args, + (obs) => storeObservation(obs), + project.id, + ); + return { + content: [{ + type: 'text' as const, + text: `🎤 Audio transcribed (${result.duration ? result.duration.toFixed(1) + 's' : 'unknown duration'})\n` + + `Observation #${result.observationId}\n` + + `Preview: ${result.text.slice(0, 300)}${result.text.length > 300 ? '…' : ''}`, + }], + }; + } catch (err: unknown) { + return { + content: [{ + type: 'text' as const, + text: `❌ Audio ingestion failed: ${err instanceof Error ? err.message : String(err)}`, + }], + isError: true, + }; + } + }, + ); // Deferred initialization — runs AFTER transport connect so MCP handshake isn't blocked. // Sync advisory scan and file watcher are non-essential for tool functionality. const deferredInit = async () => { diff --git a/tests/multimodal/audio-loader.test.ts b/tests/multimodal/audio-loader.test.ts new file mode 100644 index 0000000..ce98f68 --- /dev/null +++ b/tests/multimodal/audio-loader.test.ts @@ -0,0 +1,173 @@ +import { describe, it, expect, afterEach, beforeEach } from 'bun:test'; +import { transcribeAudio, ingestAudio } from '../../src/multimodal/audio-loader.js'; +import { resetConfigCache } from '../../src/config.js'; + +describe('audio-loader', () => { + const originalFetch = globalThis.fetch; + + beforeEach(() => { + resetConfigCache(); + }); + + afterEach(() => { + globalThis.fetch = originalFetch; + // Clean up env vars + delete process.env.OPENAI_API_KEY; + delete process.env.MEMORIX_LLM_API_KEY; + delete process.env.MEMORIX_API_KEY; + delete process.env.MEMORIX_AUDIO_PROVIDER; + resetConfigCache(); + }); + + it('calls OpenAI Whisper endpoint by default', async () => { + let calledUrl = ''; + let calledHeaders: Record = {}; + globalThis.fetch = (async (url: any, opts: any) => { + calledUrl = String(url); + calledHeaders = opts?.headers ?? {}; + return new Response(JSON.stringify({ text: 'hello world', duration: 5.2 }), { status: 200 }); + }) as typeof fetch; + + process.env.OPENAI_API_KEY = 'test-key-123'; + const result = await transcribeAudio({ + base64: Buffer.from('fake audio data').toString('base64'), + }); + + expect(calledUrl).toContain('api.openai.com'); + expect(calledUrl).toContain('/audio/transcriptions'); + expect(calledHeaders['Authorization']).toBe('Bearer test-key-123'); + expect(result.text).toBe('hello world'); + expect(result.duration).toBe(5.2); + expect(result.provider).toBe('openai'); + }); + + it('calls Groq endpoint when provider=groq', async () => { + let calledUrl = ''; + globalThis.fetch = (async (url: any) => { + calledUrl = String(url); + return new Response(JSON.stringify({ text: 'groq result' }), { status: 200 }); + }) as typeof fetch; + + process.env.OPENAI_API_KEY = 'test-key'; + const result = await transcribeAudio({ + base64: Buffer.from('fake').toString('base64'), + provider: 'groq', + }); + + expect(calledUrl).toContain('api.groq.com'); + expect(result.provider).toBe('groq'); + }); + + it('uses MEMORIX_AUDIO_PROVIDER env var', async () => { + let calledUrl = ''; + globalThis.fetch = (async (url: any) => { + calledUrl = String(url); + return new Response(JSON.stringify({ text: 'env result' }), { status: 200 }); + }) as typeof fetch; + + process.env.OPENAI_API_KEY = 'test-key'; + process.env.MEMORIX_AUDIO_PROVIDER = 'groq'; + const result = await transcribeAudio({ + base64: Buffer.from('fake').toString('base64'), + }); + + expect(calledUrl).toContain('api.groq.com'); + expect(result.provider).toBe('groq'); + }); + + it('throws when no API key configured', async () => { + // Ensure no API keys are set + delete process.env.OPENAI_API_KEY; + delete process.env.MEMORIX_LLM_API_KEY; + delete process.env.MEMORIX_API_KEY; + delete process.env.ANTHROPIC_API_KEY; + delete process.env.OPENROUTER_API_KEY; + + await expect( + transcribeAudio({ base64: 'dGVzdA==' }), + ).rejects.toThrow('No API key configured'); + }); + + it('throws on API error response', async () => { + globalThis.fetch = (async () => { + return new Response('Rate limit exceeded', { status: 429 }); + }) as typeof fetch; + + process.env.OPENAI_API_KEY = 'test-key'; + await expect( + transcribeAudio({ base64: Buffer.from('audio').toString('base64') }), + ).rejects.toThrow('Whisper API error (429)'); + }); + + it('passes language parameter', async () => { + let formData: FormData | null = null; + globalThis.fetch = (async (_url: any, opts: any) => { + formData = opts?.body; + return new Response(JSON.stringify({ text: 'bonjour', language: 'fr' }), { status: 200 }); + }) as typeof fetch; + + process.env.OPENAI_API_KEY = 'test-key'; + const result = await transcribeAudio({ + base64: Buffer.from('french audio').toString('base64'), + language: 'fr', + }); + + expect(result.text).toBe('bonjour'); + expect(result.language).toBe('fr'); + // FormData should have language field + expect(formData).toBeTruthy(); + }); + + it('ingestAudio stores observation with correct fields', async () => { + globalThis.fetch = (async () => { + return new Response(JSON.stringify({ text: 'transcribed content', duration: 30 }), { status: 200 }); + }) as typeof fetch; + + process.env.OPENAI_API_KEY = 'test-key'; + + let storedObs: Record | null = null; + const storeFn = async (obs: Record) => { + storedObs = obs; + return { observation: { id: 42 }, upserted: false }; + }; + + const result = await ingestAudio( + { base64: Buffer.from('audio data').toString('base64'), filename: 'meeting-notes.mp3' }, + storeFn as any, + 'project-123', + ); + + expect(result.observationId).toBe(42); + expect(result.text).toBe('transcribed content'); + expect(result.duration).toBe(30); + expect(storedObs).toBeTruthy(); + expect(storedObs!.entityName).toBe('meeting-notes'); + expect(storedObs!.type).toBe('discovery'); + expect(storedObs!.projectId).toBe('project-123'); + expect((storedObs!.concepts as string[])).toContain('audio'); + expect((storedObs!.concepts as string[])).toContain('transcript'); + }); + + it('ingestAudio uses timestamp for unnamed files', async () => { + globalThis.fetch = (async () => { + return new Response(JSON.stringify({ text: 'text' }), { status: 200 }); + }) as typeof fetch; + + process.env.OPENAI_API_KEY = 'test-key'; + + let storedObs: Record | null = null; + const storeFn = async (obs: Record) => { + storedObs = obs; + return { observation: { id: 1 }, upserted: false }; + }; + + await ingestAudio( + { base64: Buffer.from('data').toString('base64') }, + storeFn as any, + 'proj', + ); + + expect(storedObs).toBeTruthy(); + expect((storedObs!.entityName as string)).toMatch(/^audio-\d+$/); + }); +}); From c57784c5bc5922dae310964a4e24ebfc532b8a63 Mon Sep 17 00:00:00 2001 From: Ravi Tharuma Date: Mon, 30 Mar 2026 09:22:16 +0200 Subject: [PATCH 2/3] =?UTF-8?q?fix:=20address=20PR=20review=20=E2=80=94=20?= =?UTF-8?q?move=20markInternalWrite=20to=20write=20callback,=20use=20vites?= =?UTF-8?q?t=20imports?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/server.ts | 3 +-- tests/multimodal/audio-loader.test.ts | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/server.ts b/src/server.ts index 8b78565..1beec63 100644 --- a/src/server.ts +++ b/src/server.ts @@ -3075,10 +3075,9 @@ export async function createMemorixServer( async (args) => { try { const { ingestAudio } = await import('./multimodal/index.js'); - markInternalWrite(); const result = await ingestAudio( args, - (obs) => storeObservation(obs), + (obs) => { markInternalWrite(); return storeObservation(obs); }, project.id, ); return { diff --git a/tests/multimodal/audio-loader.test.ts b/tests/multimodal/audio-loader.test.ts index ce98f68..bcd4151 100644 --- a/tests/multimodal/audio-loader.test.ts +++ b/tests/multimodal/audio-loader.test.ts @@ -1,4 +1,4 @@ -import { describe, it, expect, afterEach, beforeEach } from 'bun:test'; +import { describe, it, expect, afterEach, beforeEach } from 'vitest'; import { transcribeAudio, ingestAudio } from '../../src/multimodal/audio-loader.js'; import { resetConfigCache } from '../../src/config.js'; From 55a196d132d6130077d33b66f9294b1b22b60bc7 Mon Sep 17 00:00:00 2001 From: Ravi Tharuma Date: Wed, 8 Apr 2026 19:21:28 +0200 Subject: [PATCH 3/3] =?UTF-8?q?fix(audio):=20address=20PR=20review=20block?= =?UTF-8?q?ers=20=E2=80=94=20ObservationType,=20dedicated=20Whisper=20conf?= =?UTF-8?q?ig?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three changes: 1. storeFn parameter type: type: string -> type: ObservationType so the callback signature is compatible with storeObservation() without a cast. 2. Dedicated Whisper key resolution: transcribeAudio() now checks MEMORIX_WHISPER_API_KEY first, then falls back to OPENAI_API_KEY. Does NOT fall through to MEMORIX_LLM_API_KEY — that key may point to Anthropic or another non-Whisper provider, which would send the wrong credential to the /audio/transcriptions endpoint. 3. MEMORIX_WHISPER_BASE_URL: optional override for self-hosted or alternative Whisper-compatible endpoints (vLLM, LocalAI, etc.). Tests: added coverage for MEMORIX_WHISPER_API_KEY priority and MEMORIX_WHISPER_BASE_URL custom endpoint; cleanup ensures both vars are cleared in afterEach. --- src/multimodal/audio-loader.ts | 59 +++++++++++++++++++++++---- tests/multimodal/audio-loader.test.ts | 34 +++++++++++++++ 2 files changed, 85 insertions(+), 8 deletions(-) diff --git a/src/multimodal/audio-loader.ts b/src/multimodal/audio-loader.ts index 0c5f1ba..e8d1490 100644 --- a/src/multimodal/audio-loader.ts +++ b/src/multimodal/audio-loader.ts @@ -6,9 +6,19 @@ * * Supports: mp3, wav, m4a, webm, mp4, ogg, flac * Providers: OpenAI (whisper-1), Groq (whisper-large-v3) + * + * Configuration: + * MEMORIX_WHISPER_API_KEY — API key for Whisper transcription (preferred) + * MEMORIX_WHISPER_BASE_URL — Custom Whisper-compatible endpoint (optional) + * MEMORIX_AUDIO_PROVIDER — Provider preset: 'openai' | 'groq' (default: openai) + * + * Falls back to OPENAI_API_KEY when MEMORIX_WHISPER_API_KEY is not set, + * since OpenAI Whisper is the default provider. Does NOT fall back to + * generic MEMORIX_LLM_API_KEY to avoid sending the wrong credential to + * a non-Whisper provider (e.g. Anthropic). */ -import { getLLMApiKey } from '../config.js'; +import type { ObservationType } from '../types.js'; // ── Types ──────────────────────────────────────────────────────────── @@ -49,6 +59,38 @@ const PROVIDERS = { }, } as const; +// ── Key Resolution ─────────────────────────────────────────────────── + +/** + * Resolve the API key for Whisper transcription. + * + * Resolution order: + * 1. MEMORIX_WHISPER_API_KEY (dedicated Whisper key — always correct) + * 2. OPENAI_API_KEY (OpenAI Whisper default) + * + * Intentionally does NOT fall back to MEMORIX_LLM_API_KEY or MEMORIX_API_KEY: + * those may point to Anthropic or another non-Whisper provider, which would + * send the wrong credential to the /audio/transcriptions endpoint. + */ +function getWhisperApiKey(): string | undefined { + return ( + process.env.MEMORIX_WHISPER_API_KEY || + process.env.OPENAI_API_KEY + ); +} + +/** + * Resolve the base URL for the Whisper endpoint. + * + * Resolution order: + * 1. MEMORIX_WHISPER_BASE_URL (custom / self-hosted endpoint) + * 2. Provider preset default (api.openai.com or api.groq.com) + */ +function getWhisperBaseUrl(providerDefault: string): string { + const custom = process.env.MEMORIX_WHISPER_BASE_URL; + return custom ? custom.replace(/\/+$/, '') : providerDefault; +} + // ── Core Functions ─────────────────────────────────────────────────── /** @@ -57,31 +99,32 @@ const PROVIDERS = { * @throws Error if no API key configured or API returns error. */ export async function transcribeAudio(input: AudioInput): Promise { - const apiKey = getLLMApiKey(); + const apiKey = getWhisperApiKey(); if (!apiKey) { throw new Error( 'No API key configured for audio transcription. ' + - 'Set MEMORIX_LLM_API_KEY, MEMORIX_API_KEY, or OPENAI_API_KEY.', + 'Set MEMORIX_WHISPER_API_KEY or OPENAI_API_KEY.', ); } const providerName = input.provider ?? (process.env.MEMORIX_AUDIO_PROVIDER as 'openai' | 'groq' | undefined) ?? 'openai'; - const config = PROVIDERS[providerName] ?? PROVIDERS.openai; + const preset = PROVIDERS[providerName] ?? PROVIDERS.openai; + const baseUrl = getWhisperBaseUrl(preset.baseUrl); // Build multipart form const audioBuffer = Buffer.from(input.base64, 'base64'); const blob = new Blob([audioBuffer], { type: input.mimeType ?? 'audio/mp3' }); const form = new FormData(); form.append('file', blob, input.filename ?? 'audio.mp3'); - form.append('model', config.model); + form.append('model', preset.model); form.append('response_format', 'json'); if (input.language) { form.append('language', input.language); } - const response = await fetch(`${config.baseUrl}/audio/transcriptions`, { + const response = await fetch(`${baseUrl}/audio/transcriptions`, { method: 'POST', headers: { 'Authorization': `Bearer ${apiKey}` }, body: form, @@ -114,7 +157,7 @@ export async function ingestAudio( input: AudioInput, storeFn: (obs: { entityName: string; - type: string; + type: ObservationType; title: string; narrative: string; concepts: string[]; @@ -130,7 +173,7 @@ export async function ingestAudio( const { observation } = await storeFn({ entityName, - type: 'discovery', + type: 'discovery' as ObservationType, title: `Audio transcript: ${entityName}`, narrative: result.text, concepts: ['audio', 'transcript', ...(result.language ? [result.language] : [])], diff --git a/tests/multimodal/audio-loader.test.ts b/tests/multimodal/audio-loader.test.ts index bcd4151..023e2d6 100644 --- a/tests/multimodal/audio-loader.test.ts +++ b/tests/multimodal/audio-loader.test.ts @@ -16,6 +16,8 @@ describe('audio-loader', () => { delete process.env.MEMORIX_LLM_API_KEY; delete process.env.MEMORIX_API_KEY; delete process.env.MEMORIX_AUDIO_PROVIDER; + delete process.env.MEMORIX_WHISPER_API_KEY; + delete process.env.MEMORIX_WHISPER_BASE_URL; resetConfigCache(); }); @@ -75,11 +77,43 @@ describe('audio-loader', () => { expect(result.provider).toBe('groq'); }); + it('uses MEMORIX_WHISPER_API_KEY over OPENAI_API_KEY', async () => { + let calledHeaders: Record = {}; + globalThis.fetch = (async (_url: any, opts: any) => { + calledHeaders = opts?.headers ?? {}; + return new Response(JSON.stringify({ text: 'result' }), { status: 200 }); + }) as typeof fetch; + + process.env.OPENAI_API_KEY = 'wrong-key'; + process.env.MEMORIX_WHISPER_API_KEY = 'whisper-specific-key'; + const result = await transcribeAudio({ base64: Buffer.from('data').toString('base64') }); + + expect(calledHeaders['Authorization']).toBe('Bearer whisper-specific-key'); + expect(result.text).toBe('result'); + }); + + it('uses MEMORIX_WHISPER_BASE_URL for custom endpoint', async () => { + let calledUrl = ''; + globalThis.fetch = (async (url: any) => { + calledUrl = String(url); + return new Response(JSON.stringify({ text: 'custom result' }), { status: 200 }); + }) as typeof fetch; + + process.env.OPENAI_API_KEY = 'test-key'; + process.env.MEMORIX_WHISPER_BASE_URL = 'https://my-whisper.example.com/v1'; + const result = await transcribeAudio({ base64: Buffer.from('data').toString('base64') }); + + expect(calledUrl).toContain('my-whisper.example.com'); + expect(calledUrl).toContain('/audio/transcriptions'); + expect(result.text).toBe('custom result'); + }); + it('throws when no API key configured', async () => { // Ensure no API keys are set delete process.env.OPENAI_API_KEY; delete process.env.MEMORIX_LLM_API_KEY; delete process.env.MEMORIX_API_KEY; + delete process.env.MEMORIX_WHISPER_API_KEY; delete process.env.ANTHROPIC_API_KEY; delete process.env.OPENROUTER_API_KEY;