diff --git a/CLAUDE.md b/CLAUDE.md index 364704a..abf44a3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -9,7 +9,7 @@ Single binary with 20 modules behind a lib crate: - `config.rs` — loads `~/.engraph/config.toml` and `vault.toml`, merges CLI args, provides `data_dir()` - `chunker.rs` — smart chunking with break-point scoring algorithm. Finds optimal split points considering headings, code fences, blank lines, and thematic breaks. `split_oversized_chunks()` handles token-aware secondary splitting with overlap - `docid.rs` — deterministic 6-char hex IDs for files (SHA-256 of path, truncated). Shown in search results for quick reference -- `embedder.rs` — downloads and runs `all-MiniLM-L6-v2` ONNX model (384-dim). SHA256-verified on download. Uses `ort` for inference, `tokenizers` for tokenization. Implements `ModelBackend` trait. **Not `Send`** — all embedding is serial +- `embedder.rs` — downloads and runs `bge-small-en-v1.5` ONNX model (384-dim, 512 token context). SHA256-verified on download. Uses `ort` for inference, `tokenizers` for tokenization. Implements `ModelBackend` trait. **Not `Send`** — all embedding is serial - `model.rs` — pluggable `ModelBackend` trait, model registry, and `parse_model_spec()`. Enables future model swapping without changing consumer code - `fts.rs` — FTS5 full-text search support. Re-exports `FtsResult` from store. BM25-ranked keyword search - `fusion.rs` — Reciprocal Rank Fusion (RRF) engine. Merges semantic + FTS5 + graph results. Supports lane weighting, `--explain` output with per-lane detail diff --git a/README.md b/README.md index ea2ebd9..2235f2e 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Plain vector search treats your notes as isolated documents. But knowledge isn't - **MCP server for AI agents** — `engraph serve` exposes 13 tools (search, read, context bundles, note creation) that Claude, Cursor, or any MCP client can call directly. 
- **Real-time sync** — file watcher keeps the index fresh as you edit in Obsidian. No manual re-indexing needed. - **Smart write pipeline** — AI agents can create notes with automatic tag resolution, wikilink discovery, and folder placement based on semantic similarity. -- **Fully local** — ONNX embeddings (`all-MiniLM-L6-v2`, 23MB), SQLite storage, no network required after initial model download. +- **Fully local** — ONNX embeddings (`bge-small-en-v1.5`, 127MB), SQLite storage, no network required after initial model download. ## What problem it solves @@ -80,7 +80,7 @@ cargo install --git https://github.com/devwhodevs/engraph ```bash engraph index ~/path/to/vault -# Downloads embedding model on first run (~23MB) +# Downloads embedding model on first run (~127MB) # Incremental — only re-embeds changed files on subsequent runs ``` diff --git a/src/embedder.rs b/src/embedder.rs index 5770904..40dd57f 100644 --- a/src/embedder.rs +++ b/src/embedder.rs @@ -11,12 +11,11 @@ use tokenizers::Tokenizer; use tracing::info; const MODEL_URL: &str = - "https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/onnx/model.onnx"; + "https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/onnx/model.onnx"; const TOKENIZER_URL: &str = - "https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/tokenizer.json"; -/// SHA-256 of the ONNX model file. Set to empty string to skip verification -/// until we can compute the real hash from a download. -const MODEL_SHA256: &str = "6fd5d72fe4589f189f8ebc006442dbb529bb7ce38f8082112682524616046452"; + "https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.json"; +/// SHA-256 of the ONNX model file. 
+const MODEL_SHA256: &str = "828e1496d7fabb79cfa4dcd84fa38625c0d3d21da474a00f08db0f559940cf35"; pub const EMBEDDING_DIM: usize = 384; pub struct Embedder { @@ -175,7 +174,7 @@ impl crate::model::ModelBackend for Embedder { } fn name(&self) -> &str { - "onnx:all-MiniLM-L6-v2" + "onnx:bge-small-en-v1.5" } } diff --git a/src/model.rs b/src/model.rs index b8c966c..485d377 100644 --- a/src/model.rs +++ b/src/model.rs @@ -42,12 +42,12 @@ impl Default for ModelRegistry { fn default() -> Self { Self { entries: vec![ModelRegistryEntry { - name: "onnx:all-MiniLM-L6-v2".to_string(), + name: "onnx:bge-small-en-v1.5".to_string(), format: ModelFormat::Onnx, - url: "https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/onnx/model.onnx".to_string(), - sha256: "6fd5d72fe4589f189f8ebc006442dbb529bb7ce38f8082112682524616046452".to_string(), + url: "https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/onnx/model.onnx".to_string(), + sha256: "828e1496d7fabb79cfa4dcd84fa38625c0d3d21da474a00f08db0f559940cf35".to_string(), dim: 384, - description: "Lightweight general-purpose sentence embeddings".to_string(), + description: "High-quality English embeddings, 512 token context".to_string(), }], } } @@ -96,16 +96,16 @@ mod tests { let registry = ModelRegistry::default(); assert_eq!(registry.entries.len(), 1); let entry = &registry.entries[0]; - assert_eq!(entry.name, "onnx:all-MiniLM-L6-v2"); + assert_eq!(entry.name, "onnx:bge-small-en-v1.5"); assert_eq!(entry.dim, 384); assert_eq!(entry.format, ModelFormat::Onnx); } #[test] fn test_parse_model_spec_onnx() { - let spec = parse_model_spec("onnx:all-MiniLM-L6-v2"); + let spec = parse_model_spec("onnx:bge-small-en-v1.5"); assert_eq!(spec.format, ModelFormat::Onnx); - assert_eq!(spec.name, "all-MiniLM-L6-v2"); + assert_eq!(spec.name, "bge-small-en-v1.5"); assert!(spec.path.is_empty()); } @@ -128,7 +128,7 @@ mod tests { #[test] fn test_registry_get_existing() { let registry = ModelRegistry::default(); - let entry = 
registry.get("onnx:all-MiniLM-L6-v2"); + let entry = registry.get("onnx:bge-small-en-v1.5"); assert!(entry.is_some()); assert_eq!(entry.unwrap().dim, 384); } diff --git a/src/search.rs b/src/search.rs index f776cff..84d002a 100644 --- a/src/search.rs +++ b/src/search.rs @@ -233,7 +233,7 @@ pub fn run_status(json: bool, data_dir: &Path) -> Result<()> { // Compute index size on disk (sqlite db file). let index_size = std::fs::metadata(&db_path).map(|m| m.len()).unwrap_or(0); - let model_name = "all-MiniLM-L6-v2"; + let model_name = "bge-small-en-v1.5"; let output = format_status(&stats, index_size, model_name, json); print!("{output}"); @@ -451,7 +451,7 @@ mod tests { wikilink_count: None, mention_count: None, }; - let output = format_status(&stats, 2_516_582, "all-MiniLM-L6-v2", false); + let output = format_status(&stats, 2_516_582, "bge-small-en-v1.5", false); assert!(output.contains("/path/to/vault"), "missing vault path"); assert!(output.contains("42"), "missing file count"); @@ -459,7 +459,7 @@ mod tests { assert!(output.contains("3"), "missing tombstone count"); assert!(output.contains("2026-03-19 14:30:00"), "missing last index"); assert!(output.contains("2.4 MB"), "missing index size"); - assert!(output.contains("all-MiniLM-L6-v2"), "missing model"); + assert!(output.contains("bge-small-en-v1.5"), "missing model"); } #[test] @@ -474,7 +474,7 @@ mod tests { wikilink_count: None, mention_count: None, }; - let output = format_status(&stats, 2_516_582, "all-MiniLM-L6-v2", true); + let output = format_status(&stats, 2_516_582, "bge-small-en-v1.5", true); let parsed: serde_json::Value = serde_json::from_str(&output).unwrap(); assert_eq!(parsed["vault"], "/path/to/vault"); @@ -483,7 +483,7 @@ mod tests { assert_eq!(parsed["tombstones"], 3); assert_eq!(parsed["last_indexed"], "2026-03-19 14:30:00"); assert_eq!(parsed["index_size"], 2_516_582); - assert_eq!(parsed["model"], "all-MiniLM-L6-v2"); + assert_eq!(parsed["model"], "bge-small-en-v1.5"); } #[test] diff --git 
a/tests/integration.rs b/tests/integration.rs index 89923d1..3bae40b 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -1,6 +1,6 @@ //! Integration tests for engraph. //! -//! All tests are `#[ignore]` because they require the ONNX model download (~23MB). +//! All tests are `#[ignore]` because they require the ONNX model download (~127MB). //! Run with: `cargo test --test integration -- --ignored` use std::path::{Path, PathBuf};