diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ce4818b..c5f4d07 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,6 +14,9 @@ jobs: runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 + - name: Install CMake (Ubuntu) + if: runner.os == 'Linux' + run: sudo apt-get update && sudo apt-get install -y cmake - uses: dtolnay/rust-toolchain@stable with: components: rustfmt, clippy diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index d10be69..d63db36 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -19,6 +19,9 @@ jobs: contents: write steps: - uses: actions/checkout@v4 + - name: Install CMake (Ubuntu) + if: runner.os == 'Linux' + run: sudo apt-get update && sudo apt-get install -y cmake - uses: dtolnay/rust-toolchain@stable - run: cargo build --release - name: Archive binary @@ -60,6 +63,7 @@ jobs: sha256 "SHA256" license "MIT" + depends_on "cmake" => :build depends_on "rust" => :build def install diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e71f03..703e2b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,29 @@ # Changelog +## [1.0.1] - 2026-03-26 + +### Changed +- **Inference backend switched from candle to llama.cpp** — via `llama-cpp-2` Rust bindings. Gets full Metal GPU acceleration on macOS (88 files indexed in 70s vs 37+ minutes on CPU with candle). Same backend as [qmd](https://github.com/tobi/qmd). +- Default embedding model produces 256-dim vectors via embeddinggemma-300M (Matryoshka truncation) +- BERT GGUF architecture support added alongside Gemma (future model flexibility) +- Progress bar during indexing via indicatif (was silent for minutes) +- CI and release workflows install CMake on Ubuntu (required for llama.cpp build) + +### Fixed +- **Prompt format applied during embedding** — `embed_one` uses search_query prefix, `embed_batch` uses search_document prefix. Without this, embeddinggemma operated in the wrong (symmetric) mode. 
+- **GGUF tokenizer fallback** — added `shimmytok` crate to extract tokenizer from GGUF metadata when tokenizer.json is unavailable (Google Gemma repos are gated) +- **LlamaBackend singleton** — global `OnceLock` prevents double-initialization crash when loading multiple models +- **Orchestrator/reranker use built-in tokenizer** — llama.cpp reads tokenizer from GGUF metadata, no external tokenizer.json needed +- **Dimension migration clears FTS** — `reset_for_reindex` now also clears `chunks_fts` to prevent duplicate entries +- **LLM cache wired into search** — `search_with_intelligence` checks/populates `llm_cache` table +- **MCP server wires intelligence** — search handler passes orchestrator + reranker via `SearchConfig` +- **CLI search wires intelligence** — `run_search` loads models when intelligence enabled +- **Qwen3 GGUF filename** — fixed case sensitivity (was 404) +- **Embedding batch params** — `n_ubatch >= n_tokens` assertion, use `encode()` not `decode()`, `AddBos::Never` (PromptFormat adds `<bos>`) + +### Removed +- `candle-core`, `candle-nn`, `candle-transformers` dependencies (replaced by `llama-cpp-2`) + ## [1.0.0] - 2026-03-25 ### Added diff --git a/CLAUDE.md b/CLAUDE.md index 86ae2e3..f5739b8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -9,7 +9,7 @@ Single binary with 19 modules behind a lib crate: - `config.rs` — loads `~/.engraph/config.toml` and `vault.toml`, merges CLI args, provides `data_dir()`. Includes `intelligence: Option` and `[models]` section for model overrides. `Config::save()` writes back to disk. - `chunker.rs` — smart chunking with break-point scoring algorithm. Finds optimal split points considering headings, code fences, blank lines, and thematic breaks. `split_oversized_chunks()` handles token-aware secondary splitting with overlap - `docid.rs` — deterministic 6-char hex IDs for files (SHA-256 of path, truncated). Shown in search results for quick reference -- `llm.rs` — candle model management. 
Three traits: `EmbedModel` (embeddings), `RerankModel` (cross-encoder scoring), `OrchestratorModel` (query intent + expansion). Three candle implementations: `CandleEmbed` (custom bidirectional transformer from GGUF for embeddinggemma), `CandleOrchestrator` (quantized_qwen3 for query analysis), `CandleRerank` (quantized_qwen3 for relevance scoring). Also: `MockLlm` for testing, `HfModelUri` for model download, `PromptFormat` for model-family prompt templates, `heuristic_orchestrate()` fast path, `LaneWeights` per query intent +- `llm.rs` — ML inference via llama.cpp (Rust bindings: `llama-cpp-2`). Three traits: `EmbedModel` (embeddings), `RerankModel` (cross-encoder scoring), `OrchestratorModel` (query intent + expansion). Three llama.cpp implementations: `LlamaEmbed` (embeddinggemma-300M GGUF on Metal GPU), `LlamaOrchestrator` (Qwen3-0.6B for query analysis + expansion), `LlamaRerank` (Qwen3-Reranker-0.6B for relevance scoring). Global `LlamaBackend` via `OnceLock`. Also: `MockLlm` for testing, `HfModelUri` for model download, `FlexTokenizer` (HuggingFace tokenizers + shimmytok GGUF fallback), `PromptFormat` for model-family prompt templates, `heuristic_orchestrate()` fast path, `LaneWeights` per query intent - `fts.rs` — FTS5 full-text search support. Re-exports `FtsResult` from store. BM25-ranked keyword search - `fusion.rs` — Reciprocal Rank Fusion (RRF) engine. Merges semantic + FTS5 + graph + reranker results. Supports per-lane weighting, `--explain` output with intent + per-lane detail - `context.rs` — context engine. Six functions: `read` (full note content + metadata), `list` (filtered note listing with `created_by` filter), `vault_map` (structure overview), `who` (person context bundle), `project` (project context bundle), `context_topic` (rich topic context with budget trimming). Pure functions taking `ContextParams` — no model loading except `context_topic` which reuses `search_internal` @@ -52,14 +52,13 @@ Single vault only. 
Re-indexing a different vault path triggers a confirmation pr ## Dependencies to be aware of -- `candle-core` (0.9) — HuggingFace pure Rust ML framework. GGUF model loading, tensor ops. `metal` feature for macOS GPU acceleration -- `candle-nn` (0.9) — neural network building blocks (RmsNorm, rotary embeddings, etc.) -- `candle-transformers` (0.9) — pre-built transformer model architectures. Used: `quantized_qwen3` for orchestrator + reranker +- `llama-cpp-2` (0.1) — Rust bindings to llama.cpp. GGUF model loading + inference. Metal GPU on macOS, CUDA on Linux. Compiles llama.cpp C++ via build script (requires CMake) +- `shimmytok` (0.7) — pure Rust tokenizer that reads from GGUF metadata. Fallback when tokenizer.json is unavailable (gated HuggingFace repos) +- `tokenizers` (0.22) — HuggingFace tokenizer. Kept for FlexTokenizer HuggingFace backend - `sqlite-vec` (0.1.8-alpha.1) — SQLite extension for vector search. Provides vec0 virtual tables with KNN via `vec_distance_cosine()` - `zerocopy` (0.7) — zero-copy serialization for vector data passed to sqlite-vec - `strsim` (0.11) — string similarity for fuzzy tag matching and fuzzy link matching - `time` (0.3) — date/time handling for frontmatter timestamps -- `tokenizers` (0.22) — HuggingFace tokenizer. Needs `fancy-regex` feature. Used for all three GGUF models - `ignore` (0.4) — vault walking with `.gitignore` support - `rusqlite` (0.32) — bundled SQLite with FTS5 support - `rmcp` (1.2) — MCP server SDK for stdio transport @@ -68,12 +67,13 @@ Single vault only. 
Re-indexing a different vault path triggers a confirmation pr ## Testing -- Unit tests in each module (`cargo test --lib`) — 271 tests, no network required +- Unit tests in each module (`cargo test --lib`) — 270 tests, no network required - Integration tests (`cargo test --test integration -- --ignored`) — require GGUF model download +- Build requires CMake (for llama.cpp C++ compilation) ## CI/CD -- CI: `cargo fmt --check` + `cargo clippy -- -D warnings` + `cargo test --lib` on macOS + Ubuntu +- CI: `cargo fmt --check` + `cargo clippy -- -D warnings` + `cargo test --lib` on macOS + Ubuntu. Ubuntu step installs CMake. - Release: native builds on macOS arm64 (macos-14) + Linux x86_64 (ubuntu-latest). Triggered by `v*` tags - Homebrew: `devwhodevs/homebrew-tap` — formula builds from source tarball diff --git a/Cargo.lock b/Cargo.lock index d6c3ef7..22e8525 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -31,12 +31,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "allocator-api2" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" - [[package]] name = "android_system_properties" version = "0.1.5" @@ -132,14 +126,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] -name = "bindgen_cuda" -version = "0.1.6" +name = "bindgen" +version = "0.72.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "282be55fb326843bb67cccceeeaf21c961ef303f60018f9a2ab69494dad8eaf9" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" dependencies = [ - "glob", - "num_cpus", - "rayon", + "bitflags 2.11.0", + "cexpr", + "clang-sys", + "itertools 0.13.0", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn", +] + +[[package]] +name = "bit-set" +version = "0.5.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec 0.6.3", ] [[package]] @@ -148,9 +160,15 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" dependencies = [ - "bit-vec", + "bit-vec 0.8.0", ] +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + [[package]] name = "bit-vec" version = "0.8.0" @@ -169,12 +187,6 @@ version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" -[[package]] -name = "block" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d8c1fef690941d3e7788d328517591fecc684c084084702d6ff1641e993699a" - [[package]] name = "block-buffer" version = "0.10.4" @@ -184,15 +196,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "block2" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdeb9d870516001442e364c5220d3574d2da8dc765554b4a617230d33fa58ef5" -dependencies = [ - "objc2", -] - [[package]] name = "bstr" version = "1.12.1" @@ -209,26 +212,6 @@ version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" -[[package]] -name = "bytemuck" -version = "1.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" -dependencies = [ - "bytemuck_derive", -] - -[[package]] -name = "bytemuck_derive" -version = "1.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "byteorder" version = "1.5.0" @@ -241,105 +224,6 @@ version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" -[[package]] -name = "candle-core" -version = "0.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c15b675b80d994b2eadb20a4bbe434eabeb454eac3ee5e2b4cf6f147ee9be091" -dependencies = [ - "byteorder", - "candle-kernels", - "candle-metal-kernels", - "candle-ug", - "cudarc 0.19.4", - "float8 0.6.1", - "gemm 0.19.0", - "half", - "libm", - "memmap2", - "num-traits", - "num_cpus", - "objc2-foundation", - "objc2-metal", - "rand", - "rand_distr", - "rayon", - "safetensors 0.7.0", - "thiserror 2.0.18", - "yoke 0.8.1", - "zip", -] - -[[package]] -name = "candle-kernels" -version = "0.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8455f84bd810047c7c41216683c1020c915a9f8a740b3b0eabdd4fb2fbaa660" -dependencies = [ - "bindgen_cuda", -] - -[[package]] -name = "candle-metal-kernels" -version = "0.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fdfe9d06de16ce49961e49084e5b79a75a9bdf157246e7c7b6328e87a7aa25d" -dependencies = [ - "half", - "objc2", - "objc2-foundation", - "objc2-metal", - "once_cell", - "thiserror 2.0.18", - "tracing", -] - -[[package]] -name = "candle-nn" -version = "0.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3045fa9e7aef8567d209a27d56b692f60b96f4d0569f4c3011f8ca6715c65e03" -dependencies = [ - "candle-core", - "half", - "libc", - "num-traits", - "rayon", - "safetensors 0.7.0", - "serde", - "thiserror 2.0.18", -] - -[[package]] -name = "candle-transformers" -version = "0.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b538ec4aa807c416a2ddd3621044888f188827862e2a6fcacba4738e89795d01" -dependencies = [ - "byteorder", - "candle-core", - "candle-nn", - "fancy-regex 0.17.0", - "num-traits", - "rand", - "rayon", - "serde", - "serde_json", - "serde_plain", - "tracing", -] - -[[package]] -name = "candle-ug" -version = "0.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c22d62be69068bf58987a45f690612739d8d2ea1bf508c1b87dc6815a019575d" -dependencies = [ - "ug", - "ug-cuda", - "ug-metal", -] - [[package]] name = "castaway" version = "0.2.4" @@ -356,9 +240,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423" dependencies = [ "find-msvc-tools", + "jobserver", + "libc", "shlex", ] +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + [[package]] name = "cfg-if" version = "1.0.4" @@ -379,6 +274,17 @@ dependencies = [ "windows-link", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "4.6.0" @@ -419,6 +325,15 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" +[[package]] +name = "cmake" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.5" @@ -453,33 +368,12 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "core-foundation" -version = "0.9.4" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" -dependencies = [ - "core-foundation-sys", - "libc", -] - [[package]] name = "core-foundation-sys" version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" -[[package]] -name = "core-graphics-types" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45390e6114f68f718cc7a830514a96f903cccd70d02a8f6d9f643ac4ba45afaf" -dependencies = [ - "bitflags 1.3.2", - "core-foundation", - "libc", -] - [[package]] name = "cpufeatures" version = "0.2.17" @@ -523,12 +417,6 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" -[[package]] -name = "crunchy" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" - [[package]] name = "crypto-common" version = "0.1.7" @@ -539,27 +427,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "cudarc" -version = "0.17.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bf99ab37ee7072d64d906aa2dada9a3422f1d975cdf8c8055a573bc84897ed8" -dependencies = [ - "half", - "libloading 0.8.9", -] - -[[package]] -name = "cudarc" -version = "0.19.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f071cd6a7b5d51607df76aa2d426aaabc7a74bc6bdb885b8afa63a880572ad9b" -dependencies = [ - "float8 0.7.0", - "half", - "libloading 0.9.0", -] - [[package]] name = "darling" version = "0.20.11" @@ -709,16 +576,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "dispatch2" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38" -dependencies = [ - "bitflags 2.11.0", - "objc2", -] - [[package]] name = "displaydoc" version = "0.2.5" @@ -736,22 +593,6 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" -[[package]] -name = "dyn-stack" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c4713e43e2886ba72b8271aa66c93d722116acf7a75555cce11dcde84388fe8" -dependencies = [ - "bytemuck", - "dyn-stack-macros", -] - -[[package]] -name = "dyn-stack-macros" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1d926b4d407d372f141f93bb444696142c29d32962ccbd3531117cf3aa0bfa9" - [[package]] name = "either" version = "1.15.0" @@ -764,18 +605,26 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + [[package]] name = "engraph" version = "1.0.0" dependencies = [ "anyhow", - "candle-core", - "candle-nn", - "candle-transformers", "clap", "dirs", + "encoding_rs", "ignore", "indicatif", + "llama-cpp-2", "notify", "notify-debouncer-full", "rayon", @@ -784,6 +633,7 @@ dependencies = [ "serde", "serde_json", "sha2", + "shimmytok", "sqlite-vec", "strsim", "tempfile", @@ -798,12 +648,20 @@ dependencies = [ ] [[package]] -name = "enum-as-inner" -version = "0.6.1" +name = "enumflags2" +version = "0.7.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1e6a265c649f3f5979b601d26f1d05ada116434c87741c9493cb56218f76cbc" +checksum = 
"1027f7680c853e056ebcec683615fb6fbbc07dbaa13b4d5d9442b146ded4ecef" +dependencies = [ + "enumflags2_derive", +] + +[[package]] +name = "enumflags2_derive" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67c78a4d8fdf9953a5c9d458f9efe940fd97a0cab0941c075a813ac594733827" dependencies = [ - "heck", "proc-macro2", "quote", "syn", @@ -845,22 +703,22 @@ checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" [[package]] name = "fancy-regex" -version = "0.14.0" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298" +checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2" dependencies = [ - "bit-set", + "bit-set 0.5.3", "regex-automata", "regex-syntax", ] [[package]] name = "fancy-regex" -version = "0.17.0" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72cf461f865c862bb7dc573f643dd6a2b6842f7c30b07882b56bd148cc2761b8" +checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298" dependencies = [ - "bit-set", + "bit-set 0.8.0", "regex-automata", "regex-syntax", ] @@ -898,35 +756,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" [[package]] -name = "flate2" -version = "1.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" -dependencies = [ - "crc32fast", - "miniz_oxide", -] - -[[package]] -name = "float8" -version = "0.6.1" +name = "find_cuda_helper" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "719a903cc23e4a89e87962c2a80fdb45cdaad0983a89bd150bb57b4c8571a7d5" +checksum = "f9f9e65c593dd01ac77daad909ea4ad17f0d6d1776193fc8ea766356177abdad" dependencies = [ - "cudarc 
0.19.4", - "half", - "num-traits", - "rand", - "rand_distr", + "glob", ] [[package]] -name = "float8" -version = "0.7.0" +name = "flate2" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2d1f04709a8ac06e8e8042875a3c466cc4832d3c1a18dbcb9dba3c6e83046bc" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ - "half", + "crc32fast", + "miniz_oxide", ] [[package]] @@ -941,39 +786,6 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" -[[package]] -name = "foldhash" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" - -[[package]] -name = "foreign-types" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965" -dependencies = [ - "foreign-types-macros", - "foreign-types-shared", -] - -[[package]] -name = "foreign-types-macros" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "foreign-types-shared" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa9a19cbb55df58761df49b23516a86d432839add4af60fc256da840f66ed35b" - [[package]] name = "form_urlencoded" version = "1.2.2" @@ -1080,244 +892,6 @@ dependencies = [ "slab", ] -[[package]] -name = "gemm" -version = "0.18.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab96b703d31950f1aeddded248bc95543c9efc7ac9c4a21fda8703a83ee35451" -dependencies = [ - "dyn-stack", - "gemm-c32 0.18.2", - "gemm-c64 0.18.2", - "gemm-common 0.18.2", - "gemm-f16 
0.18.2", - "gemm-f32 0.18.2", - "gemm-f64 0.18.2", - "num-complex", - "num-traits", - "paste", - "raw-cpuid", - "seq-macro", -] - -[[package]] -name = "gemm" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa0673db364b12263d103b68337a68fbecc541d6f6b61ba72fe438654709eacb" -dependencies = [ - "dyn-stack", - "gemm-c32 0.19.0", - "gemm-c64 0.19.0", - "gemm-common 0.19.0", - "gemm-f16 0.19.0", - "gemm-f32 0.19.0", - "gemm-f64 0.19.0", - "num-complex", - "num-traits", - "paste", - "raw-cpuid", - "seq-macro", -] - -[[package]] -name = "gemm-c32" -version = "0.18.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6db9fd9f40421d00eea9dd0770045a5603b8d684654816637732463f4073847" -dependencies = [ - "dyn-stack", - "gemm-common 0.18.2", - "num-complex", - "num-traits", - "paste", - "raw-cpuid", - "seq-macro", -] - -[[package]] -name = "gemm-c32" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "086936dbdcb99e37aad81d320f98f670e53c1e55a98bee70573e83f95beb128c" -dependencies = [ - "dyn-stack", - "gemm-common 0.19.0", - "num-complex", - "num-traits", - "paste", - "raw-cpuid", - "seq-macro", -] - -[[package]] -name = "gemm-c64" -version = "0.18.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfcad8a3d35a43758330b635d02edad980c1e143dc2f21e6fd25f9e4eada8edf" -dependencies = [ - "dyn-stack", - "gemm-common 0.18.2", - "num-complex", - "num-traits", - "paste", - "raw-cpuid", - "seq-macro", -] - -[[package]] -name = "gemm-c64" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20c8aeeeec425959bda4d9827664029ba1501a90a0d1e6228e48bef741db3a3f" -dependencies = [ - "dyn-stack", - "gemm-common 0.19.0", - "num-complex", - "num-traits", - "paste", - "raw-cpuid", - "seq-macro", -] - -[[package]] -name = "gemm-common" -version = "0.18.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a352d4a69cbe938b9e2a9cb7a3a63b7e72f9349174a2752a558a8a563510d0f3" -dependencies = [ - "bytemuck", - "dyn-stack", - "half", - "libm", - "num-complex", - "num-traits", - "once_cell", - "paste", - "pulp 0.21.5", - "raw-cpuid", - "rayon", - "seq-macro", - "sysctl", -] - -[[package]] -name = "gemm-common" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88027625910cc9b1085aaaa1c4bc46bb3a36aad323452b33c25b5e4e7c8e2a3e" -dependencies = [ - "bytemuck", - "dyn-stack", - "half", - "libm", - "num-complex", - "num-traits", - "once_cell", - "paste", - "pulp 0.22.2", - "raw-cpuid", - "rayon", - "seq-macro", - "sysctl", -] - -[[package]] -name = "gemm-f16" -version = "0.18.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cff95ae3259432f3c3410eaa919033cd03791d81cebd18018393dc147952e109" -dependencies = [ - "dyn-stack", - "gemm-common 0.18.2", - "gemm-f32 0.18.2", - "half", - "num-complex", - "num-traits", - "paste", - "raw-cpuid", - "rayon", - "seq-macro", -] - -[[package]] -name = "gemm-f16" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3df7a55202e6cd6739d82ae3399c8e0c7e1402859b30e4cb780e61525d9486e" -dependencies = [ - "dyn-stack", - "gemm-common 0.19.0", - "gemm-f32 0.19.0", - "half", - "num-complex", - "num-traits", - "paste", - "raw-cpuid", - "rayon", - "seq-macro", -] - -[[package]] -name = "gemm-f32" -version = "0.18.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc8d3d4385393304f407392f754cd2dc4b315d05063f62cf09f47b58de276864" -dependencies = [ - "dyn-stack", - "gemm-common 0.18.2", - "num-complex", - "num-traits", - "paste", - "raw-cpuid", - "seq-macro", -] - -[[package]] -name = "gemm-f32" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"02e0b8c9da1fbec6e3e3ab2ce6bc259ef18eb5f6f0d3e4edf54b75f9fd41a81c" -dependencies = [ - "dyn-stack", - "gemm-common 0.19.0", - "num-complex", - "num-traits", - "paste", - "raw-cpuid", - "seq-macro", -] - -[[package]] -name = "gemm-f64" -version = "0.18.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35b2a4f76ce4b8b16eadc11ccf2e083252d8237c1b589558a49b0183545015bd" -dependencies = [ - "dyn-stack", - "gemm-common 0.18.2", - "num-complex", - "num-traits", - "paste", - "raw-cpuid", - "seq-macro", -] - -[[package]] -name = "gemm-f64" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "056131e8f2a521bfab322f804ccd652520c79700d81209e9d9275bbdecaadc6a" -dependencies = [ - "dyn-stack", - "gemm-common 0.19.0", - "num-complex", - "num-traits", - "paste", - "raw-cpuid", - "seq-macro", -] - [[package]] name = "generic-array" version = "0.14.7" @@ -1383,21 +957,6 @@ dependencies = [ "regex-syntax", ] -[[package]] -name = "half" -version = "2.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" -dependencies = [ - "bytemuck", - "cfg-if", - "crunchy", - "num-traits", - "rand", - "rand_distr", - "zerocopy 0.8.42", -] - [[package]] name = "hashbrown" version = "0.14.5" @@ -1413,7 +972,7 @@ version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "foldhash 0.1.5", + "foldhash", ] [[package]] @@ -1421,13 +980,6 @@ name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" -dependencies = [ - "allocator-api2", - "equivalent", - "foldhash 0.2.0", - "serde", - "serde_core", -] [[package]] name = "hashlink" @@ -1444,12 +996,6 @@ version = "0.5.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hermit-abi" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" - [[package]] name = "iana-time-zone" version = "0.1.65" @@ -1482,7 +1028,7 @@ checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" dependencies = [ "displaydoc", "potential_utf", - "yoke 0.8.1", + "yoke", "zerofrom", "zerovec", ] @@ -1549,7 +1095,7 @@ dependencies = [ "displaydoc", "icu_locale_core", "writeable", - "yoke 0.8.1", + "yoke", "zerofrom", "zerotrie", "zerovec", @@ -1664,6 +1210,15 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.14.0" @@ -1679,6 +1234,16 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + [[package]] name = "js-sys" version = "0.3.91" @@ -1737,22 +1302,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "libloading" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60" -dependencies = [ - "cfg-if", - "windows-link", -] - 
-[[package]] -name = "libm" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" - [[package]] name = "libredox" version = "0.1.14" @@ -1783,10 +1332,38 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" [[package]] -name = "litemap" -version = "0.8.1" +name = "litemap" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" + +[[package]] +name = "llama-cpp-2" +version = "0.1.140" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5604c13b9c847157470479a64d1d7c94f3089709309f82f2fdcbcd43510f2f2" +dependencies = [ + "encoding_rs", + "enumflags2", + "llama-cpp-sys-2", + "thiserror 2.0.18", + "tracing", + "tracing-core", +] + +[[package]] +name = "llama-cpp-sys-2" +version = "0.1.140" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" +checksum = "cbdd3e2c06f3a9a47466a631735946e9ad47fef565b88bc8766a3794474a66f3" +dependencies = [ + "bindgen", + "cc", + "cmake", + "find_cuda_helper", + "glob", + "walkdir", +] [[package]] name = "log" @@ -1810,15 +1387,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30" -[[package]] -name = "malloc_buf" -version = "0.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb907fe88d54d8d9ce32a3cceab4218ed2f6b7d35617cafe9adf84e43919cb" -dependencies = [ - "libc", -] - [[package]] name = "matchers" version = "0.2.0" @@ -1834,31 +1402,6 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" -[[package]] -name = "memmap2" -version = "0.9.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3" -dependencies = [ - "libc", - "stable_deref_trait", -] - -[[package]] -name = "metal" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ecfd3296f8c56b7c1f6fbac3c71cefa9d78ce009850c45000015f206dc7fa21" -dependencies = [ - "bitflags 2.11.0", - "block", - "core-graphics-types", - "foreign-types", - "log", - "objc", - "paste", -] - [[package]] name = "minimal-lexical" version = "0.2.1" @@ -1969,77 +1512,12 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "num" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" -dependencies = [ - "num-bigint", - "num-complex", - "num-integer", - "num-iter", - "num-rational", - "num-traits", -] - -[[package]] -name = "num-bigint" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" -dependencies = [ - "num-integer", - "num-traits", -] - -[[package]] -name = "num-complex" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" -dependencies = [ - "bytemuck", - "num-traits", -] - [[package]] name = "num-conv" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" -[[package]] -name = "num-integer" -version = "0.1.46" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" -dependencies = [ - 
"num-traits", -] - -[[package]] -name = "num-iter" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" -dependencies = [ - "autocfg", - "num-integer", - "num-traits", -] - -[[package]] -name = "num-rational" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" -dependencies = [ - "num-bigint", - "num-integer", - "num-traits", -] - [[package]] name = "num-traits" version = "0.2.19" @@ -2047,17 +1525,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", - "libm", -] - -[[package]] -name = "num_cpus" -version = "1.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" -dependencies = [ - "hermit-abi", - "libc", ] [[package]] @@ -2066,68 +1533,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" -[[package]] -name = "objc" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "915b1b472bc21c53464d6c8461c9d3af805ba1ef837e1cac254428f4a77177b1" -dependencies = [ - "malloc_buf", -] - -[[package]] -name = "objc2" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a12a8ed07aefc768292f076dc3ac8c48f3781c8f2d5851dd3d98950e8c5a89f" -dependencies = [ - "objc2-encode", -] - -[[package]] -name = "objc2-core-foundation" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" -dependencies = [ - "bitflags 2.11.0", - "dispatch2", - "objc2", -] - 
-[[package]] -name = "objc2-encode" -version = "4.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef25abbcd74fb2609453eb695bd2f860d389e457f67dc17cafc8b8cbc89d0c33" - -[[package]] -name = "objc2-foundation" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3e0adef53c21f888deb4fa59fc59f7eb17404926ee8a6f59f5df0fd7f9f3272" -dependencies = [ - "bitflags 2.11.0", - "block2", - "libc", - "objc2", - "objc2-core-foundation", -] - -[[package]] -name = "objc2-metal" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0125f776a10d00af4152d74616409f0d4a2053a6f57fa5b7d6aa2854ac04794" -dependencies = [ - "bitflags 2.11.0", - "block2", - "dispatch2", - "objc2", - "objc2-core-foundation", - "objc2-foundation", -] - [[package]] name = "once_cell" version = "1.21.4" @@ -2231,43 +1636,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "pulp" -version = "0.21.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96b86df24f0a7ddd5e4b95c94fc9ed8a98f1ca94d3b01bdce2824097e7835907" -dependencies = [ - "bytemuck", - "cfg-if", - "libm", - "num-complex", - "reborrow", - "version_check", -] - -[[package]] -name = "pulp" -version = "0.22.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e205bb30d5b916c55e584c22201771bcf2bad9aabd5d4127f38387140c38632" -dependencies = [ - "bytemuck", - "cfg-if", - "libm", - "num-complex", - "paste", - "pulp-wasm-simd-flag", - "raw-cpuid", - "reborrow", - "version_check", -] - -[[package]] -name = "pulp-wasm-simd-flag" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40e24eee682d89fb193496edf918a7f407d30175b2e785fe057e4392dfd182e0" - [[package]] name = "quote" version = "1.0.45" @@ -2318,25 +1686,6 @@ dependencies = [ "getrandom 0.3.4", ] -[[package]] -name = "rand_distr" -version = "0.5.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463" -dependencies = [ - "num-traits", - "rand", -] - -[[package]] -name = "raw-cpuid" -version = "11.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186" -dependencies = [ - "bitflags 2.11.0", -] - [[package]] name = "rayon" version = "1.11.0" @@ -2354,7 +1703,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f" dependencies = [ "either", - "itertools", + "itertools 0.14.0", "rayon", ] @@ -2368,12 +1717,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "reborrow" -version = "0.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03251193000f4bd3b042892be858ee50e8b3719f2b08e5833ac4353724632430" - [[package]] name = "redox_syscall" version = "0.7.3" @@ -2506,6 +1849,12 @@ dependencies = [ "smallvec", ] +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + [[package]] name = "rustix" version = "1.1.4" @@ -2566,27 +1915,6 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" -[[package]] -name = "safetensors" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44560c11236a6130a46ce36c836a62936dc81ebf8c36a37947423571be0e55b6" -dependencies = [ - "serde", - "serde_json", -] - -[[package]] -name = "safetensors" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "675656c1eabb620b921efea4f9199f97fc86e36dd6ffd1fbbe48d0f59a4987f5" -dependencies = [ - "hashbrown 0.16.1", - 
"serde", - "serde_json", -] - [[package]] name = "same-file" version = "1.0.6" @@ -2628,12 +1956,6 @@ version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" -[[package]] -name = "seq-macro" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" - [[package]] name = "serde" version = "1.0.228" @@ -2688,15 +2010,6 @@ dependencies = [ "zmij", ] -[[package]] -name = "serde_plain" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce1fc6db65a611022b23a0dec6975d63fb80a302cb3388835ff02c097258d50" -dependencies = [ - "serde", -] - [[package]] name = "serde_spanned" version = "0.6.9" @@ -2726,6 +2039,18 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "shimmytok" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f2381f12d5c3f475feaf705354294803f350c13d0788f3ab367ac5979df9021" +dependencies = [ + "fancy-regex 0.13.0", + "rayon", + "regex", + "thiserror 2.0.18", +] + [[package]] name = "shlex" version = "1.3.0" @@ -2817,20 +2142,6 @@ dependencies = [ "syn", ] -[[package]] -name = "sysctl" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01198a2debb237c62b6826ec7081082d951f46dbb64b0e8c7649a452230d1dfc" -dependencies = [ - "bitflags 2.11.0", - "byteorder", - "enum-as-inner", - "libc", - "thiserror 1.0.69", - "walkdir", -] - [[package]] name = "tempfile" version = "3.27.0" @@ -2936,7 +2247,7 @@ dependencies = [ "esaxx-rs", "fancy-regex 0.14.0", "getrandom 0.3.4", - "itertools", + "itertools 0.14.0", "log", "macro_rules_attribute", "monostate", @@ -3092,66 +2403,12 @@ dependencies = [ "tracing-log", ] -[[package]] -name = "typed-path" -version = "0.12.3" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e28f89b80c87b8fb0cf04ab448d5dd0dd0ade2f8891bae878de66a75a28600e" - [[package]] name = "typenum" version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" -[[package]] -name = "ug" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76b761acf8af3494640d826a8609e2265e19778fb43306c7f15379c78c9b05b0" -dependencies = [ - "gemm 0.18.2", - "half", - "libloading 0.8.9", - "memmap2", - "num", - "num-traits", - "num_cpus", - "rayon", - "safetensors 0.4.5", - "serde", - "thiserror 1.0.69", - "tracing", - "yoke 0.7.5", -] - -[[package]] -name = "ug-cuda" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f0a1fa748f26166778c33b8498255ebb7c6bffb472bcc0a72839e07ebb1d9b5" -dependencies = [ - "cudarc 0.17.8", - "half", - "serde", - "thiserror 1.0.69", - "ug", -] - -[[package]] -name = "ug-metal" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f7adf545a99a086d362efc739e7cf4317c18cbeda22706000fd434d70ea3d95" -dependencies = [ - "half", - "metal", - "objc", - "serde", - "thiserror 1.0.69", - "ug", -] - [[package]] name = "unicode-ident" version = "1.0.24" @@ -3798,18 +3055,6 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" -[[package]] -name = "yoke" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" -dependencies = [ - "serde", - "stable_deref_trait", - "yoke-derive 0.7.5", - "zerofrom", -] - [[package]] name = "yoke" version = "0.8.1" @@ -3817,22 +3062,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" dependencies = [ "stable_deref_trait", - "yoke-derive 0.8.1", + "yoke-derive", "zerofrom", ] -[[package]] -name = "yoke-derive" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "synstructure", -] - [[package]] name = "yoke-derive" version = "0.8.1" @@ -3920,7 +3153,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" dependencies = [ "displaydoc", - "yoke 0.8.1", + "yoke", "zerofrom", ] @@ -3930,7 +3163,7 @@ version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" dependencies = [ - "yoke 0.8.1", + "yoke", "zerofrom", "zerovec-derive", ] @@ -3946,18 +3179,6 @@ dependencies = [ "syn", ] -[[package]] -name = "zip" -version = "7.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c42e33efc22a0650c311c2ef19115ce232583abbe80850bc8b66509ebef02de0" -dependencies = [ - "crc32fast", - "indexmap", - "memchr", - "typed-path", -] - [[package]] name = "zmij" version = "1.0.21" diff --git a/Cargo.toml b/Cargo.toml index 93a2bc4..6733237 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,14 +34,12 @@ rmcp = { version = "1.2", features = ["transport-io"] } tokio = { version = "1", features = ["macros", "rt-multi-thread"] } notify = "7.0" notify-debouncer-full = "0.4" -candle-core = "0.9" -candle-nn = "0.9" -candle-transformers = "0.9" +llama-cpp-2 = "0.1" +encoding_rs = "0.8" +shimmytok = "0.7" [features] default = [] -metal = ["candle-core/metal"] -cuda = ["candle-core/cuda"] [dev-dependencies] tempfile = "3" diff --git a/README.md b/README.md index ce82aba..69da14d 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 
@@ Plain vector search treats your notes as isolated documents. But knowledge isn't - **MCP server for AI agents** — `engraph serve` exposes 13 tools (search, read, context bundles, note creation) that Claude, Cursor, or any MCP client can call directly. - **Real-time sync** — file watcher keeps the index fresh as you edit in Obsidian. No manual re-indexing needed. - **Smart write pipeline** — AI agents can create notes with automatic tag resolution, wikilink discovery, and folder placement based on semantic similarity. -- **Fully local** — pure Rust ML via [candle](https://github.com/huggingface/candle) with GGUF models (~300MB mandatory, ~1.3GB optional for intelligence). Metal-accelerated on macOS. No API keys, no cloud. +- **Fully local** — [llama.cpp](https://github.com/ggml-org/llama.cpp) inference with GGUF models (~300MB mandatory, ~1.3GB optional for intelligence). Metal GPU-accelerated on macOS (88 files indexed in 70s). No API keys, no cloud. ## What problem it solves @@ -57,7 +57,7 @@ Your vault (markdown files) Claude / Cursor / any MCP client ``` -1. **Index** — walks your vault, chunks markdown by headings, embeds with a local GGUF model (candle), stores everything in SQLite with FTS5 + sqlite-vec + a wikilink graph +1. **Index** — walks your vault, chunks markdown by headings, embeds with a local GGUF model via llama.cpp (Metal GPU on macOS), stores everything in SQLite with FTS5 + sqlite-vec + a wikilink graph 2. **Search** — an orchestrator classifies the query and sets lane weights, then runs up to four lanes (semantic KNN, BM25 keyword, graph expansion, cross-encoder reranking), fused via RRF 3. 
**Serve** — starts an MCP server that AI agents connect to, with a file watcher that re-indexes changes in real time @@ -190,7 +190,7 @@ engraph resolves tags against the registry (fuzzy matching), discovers potential | AI agent access | MCP server (13 tools) | Custom API needed | No | | Write capability | Create/append/move with smart filing | No | Manual | | Real-time sync | File watcher, 2s debounce | Manual re-index | N/A | -| Runs locally | Yes, pure Rust + Metal acceleration | Depends | Yes | +| Runs locally | Yes, llama.cpp + Metal GPU | Depends | Yes | | Setup | One binary, one command | Framework + code | Built-in | engraph is not a replacement for Obsidian — it's the intelligence layer that sits between your vault and your AI tools. @@ -199,7 +199,7 @@ engraph is not a replacement for Obsidian — it's the intelligence layer that s - 4-lane hybrid search (semantic + FTS5 + graph + cross-encoder reranker) with two-pass RRF fusion - LLM research orchestrator: query intent classification + query expansion + adaptive lane weights -- Pure Rust ML via candle (GGUF models, Metal acceleration on macOS) +- llama.cpp inference via Rust bindings (GGUF models, Metal GPU on macOS, CUDA on Linux) - Intelligence opt-in: heuristic fallback when disabled, LLM-powered when enabled - MCP server with 13 tools (7 read, 6 write) via stdio - Real-time file watching with 2s debounce and startup reconciliation @@ -242,7 +242,7 @@ All data stored in `~/.engraph/` — single SQLite database (~10MB typical), GGU ## Development ```bash -cargo test --lib # 271 unit tests, no network +cargo test --lib # 270 unit tests, no network (requires CMake for llama.cpp) cargo clippy -- -D warnings cargo fmt --check diff --git a/src/indexer.rs b/src/indexer.rs index d382945..774fc98 100644 --- a/src/indexer.rs +++ b/src/indexer.rs @@ -4,6 +4,7 @@ use std::time::{Duration, Instant}; use anyhow::{Context, Result, anyhow}; use ignore::WalkBuilder; +use indicatif::{ProgressBar, ProgressStyle}; use 
sha2::{Digest, Sha256}; use tracing::info; @@ -440,7 +441,7 @@ pub fn run_index(vault_path: &Path, config: &Config, rebuild: bool) -> Result = Vec::new(); + let pb = ProgressBar::new(file_contents.len() as u64); + pb.set_style( + ProgressStyle::with_template(" [{bar:40.cyan/blue}] {pos}/{len} {msg} ({eta})") + .unwrap() + .progress_chars("=>-"), + ); + store.conn().execute_batch("BEGIN DEFERRED")?; for (rel_str, content, hash) in &file_contents { + pb.set_message(rel_str.clone()); let result = index_file(rel_str, content, hash, store, embedder, vault_path, config)?; total_chunks += result.total_chunks; indexed_rel_paths.push(rel_str.clone()); + pb.inc(1); } + pb.finish_with_message("done"); store.commit()?; // Step 9: Build vault graph edges. diff --git a/src/llm.rs b/src/llm.rs index f4c818f..30d913a 100644 --- a/src/llm.rs +++ b/src/llm.rs @@ -1,25 +1,36 @@ use std::io::Read; use std::path::{Path, PathBuf}; +use std::sync::{Mutex, OnceLock}; -use anyhow::{Result, bail}; +use anyhow::{Context as _, Result, bail}; use indicatif::{ProgressBar, ProgressStyle}; use sha2::{Digest, Sha256}; -use anyhow::Context as _; -use candle_core::{D, DType, Device, IndexOp, Tensor}; -use candle_nn::{Embedding, Module}; - -// ── Device selection ───────────────────────────────────────────────────────── - -/// Select best available device: Metal on macOS (with `metal` feature), CPU elsewhere. -fn select_device() -> Result { - #[cfg(feature = "metal")] - { - if let Ok(device) = Device::new_metal(0) { - return Ok(device); - } - } - Ok(Device::Cpu) +use llama_cpp_2::context::params::LlamaContextParams; +use llama_cpp_2::llama_backend::LlamaBackend; +use llama_cpp_2::llama_batch::LlamaBatch; +use llama_cpp_2::model::params::LlamaModelParams; +use llama_cpp_2::model::{AddBos, LlamaModel}; +use llama_cpp_2::sampling::LlamaSampler; + +static BACKEND: OnceLock = OnceLock::new(); +/// Mutex used only during the first initialization of `BACKEND`. 
+static BACKEND_INIT: Mutex<()> = Mutex::new(()); + +/// Get or initialize the global llama.cpp backend. +/// Safe to call from multiple places — the backend is initialized at most once. +pub fn llama_backend() -> Result<&'static LlamaBackend> { + if let Some(b) = BACKEND.get() { + return Ok(b); + } + let _guard = BACKEND_INIT.lock().unwrap(); + // Double-checked: another thread may have initialized while we waited. + if let Some(b) = BACKEND.get() { + return Ok(b); + } + let backend = + LlamaBackend::init().map_err(|e| anyhow::anyhow!("initializing llama backend: {e}"))?; + Ok(BACKEND.get_or_init(|| backend)) } // ── Prompt format ──────────────────────────────────────────────────────────── @@ -71,7 +82,7 @@ impl PromptFormat { // ── Types ──────────────────────────────────────────────────────────────────── /// Classified intent of an incoming search query. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum QueryIntent { /// User wants a precise fact or term match. Exact, @@ -84,7 +95,7 @@ pub enum QueryIntent { } /// Output produced by an orchestrator model for a query. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] pub struct OrchestrationResult { /// Classified query intent. pub intent: QueryIntent, @@ -444,6 +455,134 @@ pub fn ensure_model(uri: &HfModelUri, models_dir: &Path) -> Result { Ok(path) } +/// Tokenizer that can be backed by either HuggingFace tokenizers crate or shimmytok (GGUF-embedded). +pub enum FlexTokenizer { + HuggingFace(Box), + Gguf(Box), +} + +impl FlexTokenizer { + /// Encode text into token IDs. 
+ pub fn encode(&self, text: &str, add_special: bool) -> Result> { + match self { + Self::HuggingFace(t) => { + let enc = t + .encode(text, add_special) + .map_err(|e| anyhow::anyhow!("tokenization: {e}"))?; + Ok(enc.get_ids().to_vec()) + } + Self::Gguf(t) => { + let ids = t + .encode(text, add_special) + .map_err(|e| anyhow::anyhow!("tokenization: {e}"))?; + Ok(ids) + } + } + } + + /// Count tokens in text. + pub fn token_count(&self, text: &str) -> usize { + self.encode(text, false).map(|ids| ids.len()).unwrap_or(0) + } + + /// Look up a token's ID by string (only available with HuggingFace backend). + pub fn token_to_id(&self, token: &str) -> Option { + match self { + Self::HuggingFace(t) => t.token_to_id(token), + Self::Gguf(_) => None, + } + } + + /// Decode token IDs back to text (only available with HuggingFace backend). + pub fn decode(&self, ids: &[u32], skip_special: bool) -> Result { + match self { + Self::HuggingFace(t) => t + .decode(ids, skip_special) + .map_err(|e| anyhow::anyhow!("decode: {e}")), + Self::Gguf(_) => bail!("decode not supported with GGUF tokenizer"), + } + } +} + +/// Load tokenizer for a model. Tries external tokenizer.json first, falls back to GGUF-embedded. +fn load_tokenizer_for_model(uri: &HfModelUri, models_dir: &Path) -> Result { + // First try: external tokenizer.json from candidate repos. + if let Some(tok) = try_external_tokenizer(uri, models_dir) { + return Ok(FlexTokenizer::HuggingFace(Box::new(tok))); + } + + // Fallback: load tokenizer from GGUF file metadata. 
+ let model_path = uri.cache_path(models_dir); + if model_path.exists() { + tracing::info!( + "no external tokenizer found, loading from GGUF: {}", + model_path.display() + ); + let tok = shimmytok::Tokenizer::from_gguf_file(&model_path) + .map_err(|e| anyhow::anyhow!("loading tokenizer from GGUF metadata: {e}"))?; + return Ok(FlexTokenizer::Gguf(Box::new(tok))); + } + + bail!( + "could not find tokenizer for model '{}': no external tokenizer.json \ + and GGUF file not yet downloaded", + uri.repo + ) +} + +/// Try downloading tokenizer.json from candidate HuggingFace repos. +fn try_external_tokenizer(uri: &HfModelUri, models_dir: &Path) -> Option { + let mut candidates: Vec = vec![uri.repo.clone()]; + + // Non-GGUF variant: "org/model-GGUF" → "org/model" + let base_repo = uri.repo.trim_end_matches("-GGUF").to_string(); + if base_repo != uri.repo { + candidates.push(base_repo); + } + + // Known upstream repos for default models (GGUF repos rarely ship tokenizers). + let model_lower = uri.repo.to_lowercase(); + if model_lower.contains("all-minilm") { + candidates.push("sentence-transformers/all-MiniLM-L6-v2".to_string()); + } else if model_lower.contains("embeddinggemma") { + candidates.push("google/embeddinggemma-300m".to_string()); + candidates.push("google/gemma-2b".to_string()); + } else if model_lower.contains("qwen3") { + let base_name = uri + .repo + .rsplit('/') + .next() + .unwrap_or("") + .trim_end_matches("-GGUF") + .trim_end_matches("-Q8_0-GGUF"); + if !base_name.is_empty() { + candidates.push(format!("Qwen/{base_name}")); + } + } + + for repo in &candidates { + let tok_uri = HfModelUri { + repo: repo.clone(), + filename: "tokenizer.json".to_string(), + }; + let tok_path = tok_uri.cache_path(models_dir); + + if tok_path.exists() + && let Ok(tok) = tokenizers::Tokenizer::from_file(&tok_path) + { + return Some(tok); + } + + if let Ok(p) = ensure_model(&tok_uri, models_dir) + && let Ok(tok) = tokenizers::Tokenizer::from_file(&p) + { + return Some(tok); + } + 
} + + None +} + /// Default model URIs for the intelligence layer. pub struct ModelDefaults { pub embed_uri: String, @@ -459,167 +598,51 @@ impl Default for ModelDefaults { embed_dim: 256, rerank_uri: "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf" .into(), - expand_uri: "hf:Qwen/Qwen3-0.6B-GGUF/qwen3-0.6b-q8_0.gguf".into(), + expand_uri: "hf:Qwen/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf".into(), } } } -// ── CandleEmbed — GGUF embedding model via candle ────────────────────────── - -/// Quantized matrix multiplication wrapper (mirrors candle-transformers pattern). -#[derive(Debug, Clone)] -struct CandleQMatMul { - inner: candle_core::quantized::QMatMul, -} - -impl CandleQMatMul { - fn from_qtensor(qtensor: candle_core::quantized::QTensor) -> candle_core::Result { - let inner = candle_core::quantized::QMatMul::from_qtensor(qtensor)?; - Ok(Self { inner }) - } - - fn forward(&self, xs: &Tensor) -> candle_core::Result { - self.inner.forward(xs) - } -} - -/// Single transformer layer for the embedding model. -#[derive(Debug, Clone)] -struct EmbedLayer { - attention_wq: CandleQMatMul, - attention_wk: CandleQMatMul, - attention_wv: CandleQMatMul, - attention_wo: CandleQMatMul, - attention_q_norm: candle_transformers::quantized_nn::RmsNorm, - attention_k_norm: candle_transformers::quantized_nn::RmsNorm, - attention_norm: candle_transformers::quantized_nn::RmsNorm, - post_attention_norm: candle_transformers::quantized_nn::RmsNorm, - ffn_norm: candle_transformers::quantized_nn::RmsNorm, - post_ffn_norm: candle_transformers::quantized_nn::RmsNorm, - ffn_gate: CandleQMatMul, - ffn_up: CandleQMatMul, - ffn_down: CandleQMatMul, - n_head: usize, - n_kv_head: usize, - head_dim: usize, - q_dim: usize, - rotary_sin: Tensor, - rotary_cos: Tensor, -} - -impl EmbedLayer { - /// Bidirectional forward pass — no causal mask, no KV cache. 
- fn forward(&self, x: &Tensor) -> candle_core::Result { - let (b_sz, seq_len, _) = x.dims3()?; - - // --- Attention block --- - let residual = x; - let x = self.attention_norm.forward(x)?; - - let q = self.attention_wq.forward(&x)?; - let k = self.attention_wk.forward(&x)?; - let v = self.attention_wv.forward(&x)?; - - let q = q - .reshape((b_sz, seq_len, self.n_head, self.head_dim))? - .transpose(1, 2)?; - let k = k - .reshape((b_sz, seq_len, self.n_kv_head, self.head_dim))? - .transpose(1, 2)?; - let v = v - .reshape((b_sz, seq_len, self.n_kv_head, self.head_dim))? - .transpose(1, 2)?; - - let q = self.attention_q_norm.forward(&q.contiguous()?)?; - let k = self.attention_k_norm.forward(&k.contiguous()?)?; - - // Apply rotary embeddings (truncated to seq_len). - let q = Self::apply_rotary(&q, &self.rotary_cos, &self.rotary_sin, seq_len)?; - let k = Self::apply_rotary(&k, &self.rotary_cos, &self.rotary_sin, seq_len)?; - - // Repeat KV heads for GQA. - let n_rep = self.n_head / self.n_kv_head; - let k = candle_transformers::utils::repeat_kv(k, n_rep)?; - let v = candle_transformers::utils::repeat_kv(v, n_rep)?; - - // Scaled dot-product attention — BIDIRECTIONAL (no mask). - let scale = 1.0 / (self.head_dim as f64).sqrt(); - let attn_weights = (q.matmul(&k.transpose(2, 3)?)? * scale)?; - let attn_weights = candle_nn::ops::softmax_last_dim(&attn_weights)?; - let attn_output = attn_weights.matmul(&v)?; - - let attn_output = attn_output - .transpose(1, 2)? - .reshape((b_sz, seq_len, self.q_dim))?; - let attn_output = self.attention_wo.forward(&attn_output)?; - let x = self.post_attention_norm.forward(&attn_output)?; - let x = (x + residual)?; - - // --- FFN block --- - let residual = &x; - let h = self.ffn_norm.forward(&x)?; - let gate = self.ffn_gate.forward(&h)?; - let up = self.ffn_up.forward(&h)?; - let h = (candle_nn::ops::silu(&gate)? 
* up)?; - let h = self.ffn_down.forward(&h)?; - let h = self.post_ffn_norm.forward(&h)?; - h + residual - } - - /// Apply rotary embeddings to a [batch, heads, seq, dim] tensor. - fn apply_rotary( - x: &Tensor, - cos: &Tensor, - sin: &Tensor, - seq_len: usize, - ) -> candle_core::Result { - let cos = cos.i(..seq_len)?.unsqueeze(0)?.unsqueeze(0)?; - let sin = sin.i(..seq_len)?.unsqueeze(0)?.unsqueeze(0)?; - let dim = x.dim(D::Minus1)?; - let half = dim / 2; - let x1 = x.narrow(D::Minus1, 0, half)?; - let x2 = x.narrow(D::Minus1, half, half)?; - let rotated = Tensor::cat(&[&x2.neg()?, &x1], D::Minus1)?; - let out = (x.broadcast_mul(&cos)? + rotated.broadcast_mul(&sin)?)?; - Ok(out) - } -} +// ── LlamaEmbed — GGUF embedding model via llama.cpp ────────────────────────── -/// GGUF embedding model loaded via candle. +/// GGUF embedding model loaded via llama.cpp. /// -/// Loads a quantized Gemma-family embedding model (e.g., embeddinggemma-300M) -/// from a GGUF file and produces dense float vectors via bidirectional attention -/// + mean pooling + L2 normalization. -pub struct CandleEmbed { - layers: Vec, - tok_embeddings: Embedding, - norm: candle_transformers::quantized_nn::RmsNorm, - embedding_length: usize, - tokenizer: tokenizers::Tokenizer, - device: Device, +/// Loads a quantized embedding model from a GGUF file and produces dense float +/// vectors via llama.cpp's built-in embedding support with mean pooling + L2 +/// normalization. Supports Metal acceleration on macOS automatically. +/// +/// `LlamaModel` is `Send + Sync`, so this struct is `Send`. `LlamaContext` is +/// `!Send`, so we create it per-call. The global `LlamaBackend` is referenced +/// via `llama_backend()` — no need to store it per-struct. +pub struct LlamaEmbed { + model: LlamaModel, + tokenizer: FlexTokenizer, dim: usize, prompt_format: PromptFormat, } -impl std::fmt::Debug for CandleEmbed { +// Safety: LlamaModel is Send+Sync per llama-cpp-2 docs. 
+// FlexTokenizer contains only Send types (tokenizers::Tokenizer is Send, shimmytok::Tokenizer is Send). +// We never store a LlamaContext (which is !Send) — it is created per-call. +unsafe impl Send for LlamaEmbed {} + +impl std::fmt::Debug for LlamaEmbed { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("CandleEmbed") + f.debug_struct("LlamaEmbed") .field("dim", &self.dim) - .field("embedding_length", &self.embedding_length) - .field("num_layers", &self.layers.len()) .field("prompt_format", &self.prompt_format) .finish() } } -impl CandleEmbed { +impl LlamaEmbed { /// Load a GGUF embedding model from `models_dir`. /// /// Steps: /// 1. Resolve model URI (from config override or `ModelDefaults`) /// 2. `ensure_model()` to download if needed /// 3. Load tokenizer (try same repo's tokenizer.json, then repo without -GGUF suffix) - /// 4. Load GGUF and build layer structs for bidirectional embedding + /// 4. Load GGUF model via llama.cpp /// 5. Detect prompt format from filename pub fn new(models_dir: &Path, config: &crate::config::Config) -> Result { let defaults = ModelDefaults::default(); @@ -632,7 +655,7 @@ impl CandleEmbed { let model_path = ensure_model(&uri, models_dir)?; // Load tokenizer: try from the same HF repo, then from the non-GGUF variant. - let tokenizer = Self::load_tokenizer(&uri, models_dir)?; + let tokenizer = load_tokenizer_for_model(&uri, models_dir)?; // Detect prompt format from filename. let prompt_format = PromptFormat::detect(&uri.filename); @@ -640,341 +663,104 @@ impl CandleEmbed { // Target output dimensionality. let dim = defaults.embed_dim; - // Load GGUF and build model. - let device = select_device()?; - let (layers, tok_embeddings, norm, embedding_length) = - Self::load_gguf(&model_path, &device)?; + // Get or initialize the global llama.cpp backend, then load model. 
+ let backend = llama_backend()?; + let model_params = LlamaModelParams::default(); + let model = LlamaModel::load_from_file(backend, &model_path, &model_params) + .map_err(|e| anyhow::anyhow!("loading GGUF model {}: {e}", model_path.display()))?; - tracing::info!( - "loaded CandleEmbed: {} layers, embedding_length={}, target_dim={}, device={:?}", - layers.len(), - embedding_length, - dim, - device - ); + tracing::info!("loaded LlamaEmbed from {}, target_dim={}", uri_str, dim); Ok(Self { - layers, - tok_embeddings, - norm, - embedding_length, + model, tokenizer, - device, dim, prompt_format, }) } - /// Try to load tokenizer.json from the same HF repo, or from repo without "-GGUF" suffix. - fn load_tokenizer(uri: &HfModelUri, models_dir: &Path) -> Result { - // Try 1: tokenizer.json from the same repo. - let tok_uri = HfModelUri { - repo: uri.repo.clone(), - filename: "tokenizer.json".to_string(), - }; - let tok_path = tok_uri.cache_path(models_dir); - if tok_path.exists() { - return tokenizers::Tokenizer::from_file(&tok_path).map_err(|e| { - anyhow::anyhow!("loading tokenizer from {}: {e}", tok_path.display()) - }); - } - - // Try 2: download from the same repo. - if let Ok(p) = ensure_model(&tok_uri, models_dir) { - return tokenizers::Tokenizer::from_file(&p) - .map_err(|e| anyhow::anyhow!("loading tokenizer from {}: {e}", p.display())); - } - - // Try 3: non-GGUF variant of the repo (e.g., "org/model-GGUF" -> "org/model"). 
- let base_repo = uri.repo.trim_end_matches("-GGUF").to_string(); - if base_repo != uri.repo { - let base_tok_uri = HfModelUri { - repo: base_repo, - filename: "tokenizer.json".to_string(), - }; - if let Ok(p) = ensure_model(&base_tok_uri, models_dir) { - return tokenizers::Tokenizer::from_file(&p) - .map_err(|e| anyhow::anyhow!("loading tokenizer from {}: {e}", p.display())); - } - } - - bail!( - "could not find or download tokenizer for model repo '{}'", - uri.repo - ); - } - - /// Load GGUF file and construct layer structs for bidirectional embedding. - fn load_gguf( - path: &Path, - device: &Device, - ) -> Result<( - Vec, - Embedding, - candle_transformers::quantized_nn::RmsNorm, - usize, - )> { - use candle_core::quantized::gguf_file; - - let mut file = std::fs::File::open(path) - .map_err(|e| anyhow::anyhow!("opening GGUF {}: {e}", path.display()))?; - let ct = gguf_file::Content::read(&mut file) - .map_err(|e| anyhow::anyhow!("reading GGUF {}: {e}", path.display()))?; - - // Detect architecture prefix (same probe as candle-transformers quantized_gemma3). - let prefix = ["gemma3", "gemma2", "gemma", "gemma-embedding"] - .iter() - .find(|p| { - ct.metadata - .contains_key(&format!("{}.attention.head_count", p)) - }) - .copied() - .unwrap_or("gemma3"); - - let md_get = |s: &str| -> Result<&gguf_file::Value> { - let key = format!("{prefix}.{s}"); - ct.metadata - .get(&key) - .ok_or_else(|| anyhow::anyhow!("cannot find {key} in GGUF metadata")) - }; - - let head_count = md_get("attention.head_count")? - .to_u32() - .map_err(|e| anyhow::anyhow!("{e}"))? as usize; - let head_count_kv = md_get("attention.head_count_kv")? - .to_u32() - .map_err(|e| anyhow::anyhow!("{e}"))? as usize; - let block_count = md_get("block_count")? - .to_u32() - .map_err(|e| anyhow::anyhow!("{e}"))? as usize; - let embedding_length = md_get("embedding_length")? - .to_u32() - .map_err(|e| anyhow::anyhow!("{e}"))? as usize; - let key_length = md_get("attention.key_length")? 
- .to_u32() - .map_err(|e| anyhow::anyhow!("{e}"))? as usize; - let rms_norm_eps = md_get("attention.layer_norm_rms_epsilon")? - .to_f32() - .map_err(|e| anyhow::anyhow!("{e}"))? as f64; - let rope_freq_base = md_get("rope.freq_base") - .and_then(|v| v.to_f32().map_err(|e| anyhow::anyhow!("{e}"))) - .unwrap_or(10_000.0); - - let q_dim = head_count * key_length; - - // Build rotary embedding tables (shared by all layers for the base freq). - let max_seq_len: usize = 8192; // Sufficient for embedding inputs. - let (rotary_sin, rotary_cos) = - Self::build_rotary_tables(key_length, rope_freq_base, max_seq_len, device)?; - - // Load token embeddings. - let tok_embd = ct - .tensor(&mut file, "token_embd.weight", device) - .map_err(|e| anyhow::anyhow!("loading token_embd.weight: {e}"))?; - let tok_embd_deq = tok_embd - .dequantize(device) - .map_err(|e| anyhow::anyhow!("dequantizing token_embd: {e}"))?; - let tok_embeddings = Embedding::new(tok_embd_deq, embedding_length); - - // Final norm. - let norm_qt = ct - .tensor(&mut file, "output_norm.weight", device) - .map_err(|e| anyhow::anyhow!("loading output_norm.weight: {e}"))?; - let norm = candle_transformers::quantized_nn::RmsNorm::from_qtensor(norm_qt, rms_norm_eps) - .map_err(|e| anyhow::anyhow!("creating RmsNorm: {e}"))?; - - // Load transformer layers. - let mut layers = Vec::with_capacity(block_count); - for idx in 0..block_count { - let p = format!("blk.{idx}"); - - // Helper: load a quantized weight tensor as QMatMul. - macro_rules! load_q { - ($name:expr) => {{ - let full = format!("{}.{}", p, $name); - let qt = ct - .tensor(&mut file, &full, device) - .map_err(|e| anyhow::anyhow!("loading {full}: {e}"))?; - CandleQMatMul::from_qtensor(qt) - .map_err(|e| anyhow::anyhow!("QMatMul for {full}: {e}"))? - }}; - } - - // Helper: load a norm weight tensor as RmsNorm. - macro_rules! 
load_norm { - ($name:expr) => {{ - let full = format!("{}.{}", p, $name); - let qt = ct - .tensor(&mut file, &full, device) - .map_err(|e| anyhow::anyhow!("loading {full}: {e}"))?; - candle_transformers::quantized_nn::RmsNorm::from_qtensor(qt, rms_norm_eps) - .map_err(|e| anyhow::anyhow!("RmsNorm for {full}: {e}"))? - }}; - } - - layers.push(EmbedLayer { - attention_wq: load_q!("attn_q.weight"), - attention_wk: load_q!("attn_k.weight"), - attention_wv: load_q!("attn_v.weight"), - attention_wo: load_q!("attn_output.weight"), - attention_q_norm: load_norm!("attn_q_norm.weight"), - attention_k_norm: load_norm!("attn_k_norm.weight"), - attention_norm: load_norm!("attn_norm.weight"), - post_attention_norm: load_norm!("post_attention_norm.weight"), - ffn_norm: load_norm!("ffn_norm.weight"), - post_ffn_norm: load_norm!("post_ffw_norm.weight"), - ffn_gate: load_q!("ffn_gate.weight"), - ffn_up: load_q!("ffn_up.weight"), - ffn_down: load_q!("ffn_down.weight"), - n_head: head_count, - n_kv_head: head_count_kv, - head_dim: key_length, - q_dim, - rotary_sin: rotary_sin.clone(), - rotary_cos: rotary_cos.clone(), - }); - } - - Ok((layers, tok_embeddings, norm, embedding_length)) - } - - /// Build sin/cos rotary embedding tables of shape [max_seq_len, head_dim]. - fn build_rotary_tables( - head_dim: usize, - freq_base: f32, - max_seq_len: usize, - device: &Device, - ) -> Result<(Tensor, Tensor)> { - let half = head_dim / 2; - let theta: Vec = (0..half) - .map(|i| 1.0 / freq_base.powf(i as f32 / half as f32)) - .collect(); - let theta = Tensor::new(theta.as_slice(), device) - .map_err(|e| anyhow::anyhow!("rotary theta: {e}"))?; - let positions = Tensor::arange(0, max_seq_len as u32, device) - .map_err(|e| anyhow::anyhow!("rotary positions: {e}"))? - .to_dtype(DType::F32) - .map_err(|e| anyhow::anyhow!("rotary positions dtype: {e}"))?; - // [max_seq_len, half] - let freqs = positions - .unsqueeze(1) - .map_err(|e| anyhow::anyhow!("rotary unsqueeze: {e}"))? 
- .broadcast_mul(&theta.unsqueeze(0).map_err(|e| anyhow::anyhow!("{e}"))?) - .map_err(|e| anyhow::anyhow!("rotary freqs: {e}"))?; - // Duplicate to [max_seq_len, head_dim] to match x1,x2 concatenation. - let freqs = Tensor::cat(&[&freqs, &freqs], D::Minus1) - .map_err(|e| anyhow::anyhow!("rotary cat: {e}"))?; - let sin = freqs - .sin() - .map_err(|e| anyhow::anyhow!("rotary sin: {e}"))?; - let cos = freqs - .cos() - .map_err(|e| anyhow::anyhow!("rotary cos: {e}"))?; - Ok((sin, cos)) - } - - /// Run a bidirectional forward pass and return the mean-pooled, truncated, - /// L2-normalized embedding. + /// Run embedding inference and return the truncated, L2-normalized embedding. fn embed_text(&self, text: &str) -> Result> { - let encoding = self - .tokenizer - .encode(text, true) + // Tokenize using llama.cpp's built-in tokenizer. + // Use AddBos::Never because PromptFormat already adds for embeddinggemma. + let tokens = self + .model + .str_to_token(text, AddBos::Never) .map_err(|e| anyhow::anyhow!("tokenization failed: {e}"))?; - let token_ids = encoding.get_ids(); - if token_ids.is_empty() { + if tokens.is_empty() { bail!("tokenizer returned empty token sequence"); } - let input = Tensor::new(token_ids, &self.device) - .map_err(|e| anyhow::anyhow!("creating input tensor: {e}"))? - .unsqueeze(0) - .map_err(|e| anyhow::anyhow!("unsqueeze: {e}"))?; - - // Token embeddings, scaled by sqrt(embedding_length) (Gemma convention). - let mut hidden = self - .tok_embeddings - .forward(&input) - .map_err(|e| anyhow::anyhow!("token embedding forward: {e}"))?; - hidden = (hidden * (self.embedding_length as f64).sqrt()) - .map_err(|e| anyhow::anyhow!("scaling embeddings: {e}"))?; - - // Forward through all transformer layers (bidirectional — no causal mask). - for layer in &self.layers { - hidden = layer - .forward(&hidden) - .map_err(|e| anyhow::anyhow!("layer forward: {e}"))?; - } + // Create a context with embeddings enabled (per-call, since LlamaContext is !Send). 
+ // n_ubatch must be >= n_tokens for the encoder, and n_ctx must fit all tokens. + let n_tokens = tokens.len() as u32; + let n_ctx = std::num::NonZeroU32::new(n_tokens.max(64) + 16); + let ctx_params = LlamaContextParams::default() + .with_embeddings(true) + .with_n_ctx(n_ctx) + .with_n_ubatch(n_tokens.max(512)) + .with_n_batch(n_tokens.max(512)); + let mut ctx = self + .model + .new_context(llama_backend()?, ctx_params) + .map_err(|e| anyhow::anyhow!("creating embedding context: {e}"))?; - // Final layer norm. - hidden = self - .norm - .forward(&hidden) - .map_err(|e| anyhow::anyhow!("final norm: {e}"))?; + // Create batch and add tokens — mark all as outputs for embedding. + let mut batch = LlamaBatch::new(tokens.len() + 16, 1); + batch + .add_sequence(&tokens, 0, true) + .map_err(|e| anyhow::anyhow!("adding sequence to batch: {e}"))?; - // Mean pool across sequence dimension: [1, seq_len, hidden] -> [1, hidden]. - let seq_len = hidden - .dim(1) - .map_err(|e| anyhow::anyhow!("getting seq dim: {e}"))?; - let pooled = (hidden.sum(1).map_err(|e| anyhow::anyhow!("sum: {e}"))? / (seq_len as f64)) - .map_err(|e| anyhow::anyhow!("mean div: {e}"))?; + // Encode (compute embeddings). Use encode() for embedding models. + ctx.encode(&mut batch) + .map_err(|e| anyhow::anyhow!("embedding encode failed: {e}"))?; - // Squeeze batch dimension: [1, hidden] -> [hidden]. - let pooled = pooled - .squeeze(0) - .map_err(|e| anyhow::anyhow!("squeeze: {e}"))?; + // Get embeddings for sequence 0 (mean pooled by llama.cpp). + let embeddings = ctx + .embeddings_seq_ith(0) + .map_err(|e| anyhow::anyhow!("getting embeddings: {e}"))?; // Truncate to target dimensionality. - let full_dim = pooled - .dim(0) - .map_err(|e| anyhow::anyhow!("dim check: {e}"))?; - let truncated = if full_dim > self.dim { - pooled - .narrow(0, 0, self.dim) - .map_err(|e| anyhow::anyhow!("truncate: {e}"))? 
+ let full_dim = embeddings.len(); + let truncated: Vec = if full_dim > self.dim { + embeddings[..self.dim].to_vec() } else { - pooled + embeddings.to_vec() }; // L2 normalize. - let norm_val = truncated - .sqr() - .map_err(|e| anyhow::anyhow!("sqr: {e}"))? - .sum_all() - .map_err(|e| anyhow::anyhow!("sum_all: {e}"))? - .sqrt() - .map_err(|e| anyhow::anyhow!("sqrt: {e}"))?; - let norm_scalar: f32 = norm_val - .to_scalar() - .map_err(|e| anyhow::anyhow!("norm scalar: {e}"))?; - - let normalized = if norm_scalar > 0.0 { - (truncated / norm_scalar as f64).map_err(|e| anyhow::anyhow!("normalize: {e}"))? + let norm: f32 = truncated.iter().map(|x| x * x).sum::().sqrt(); + let normalized = if norm > 0.0 { + truncated.iter().map(|x| x / norm).collect() } else { truncated }; - let vec: Vec = normalized - .to_vec1() - .map_err(|e| anyhow::anyhow!("to_vec1: {e}"))?; - Ok(vec) + Ok(normalized) } } -impl EmbedModel for CandleEmbed { +impl EmbedModel for LlamaEmbed { fn embed_batch(&mut self, texts: &[&str]) -> Result>> { - // Process texts sequentially — candle quantized ops are single-threaded. - texts.iter().map(|t| self.embed_text(t)).collect() + // Process texts sequentially — llama.cpp context is per-call. + // Apply document prompt format for indexing (asymmetric models need this). + texts + .iter() + .map(|t| { + let formatted = self.prompt_format.format_document("", t); + self.embed_text(&formatted) + }) + .collect() } fn embed_one(&mut self, text: &str) -> Result> { - self.embed_text(text) + // Apply query prompt format (asymmetric models like embeddinggemma need this). 
+ let formatted = self.prompt_format.format_query(text); + self.embed_text(&formatted) } fn token_count(&self, text: &str) -> usize { - self.tokenizer - .encode(text, false) - .map(|enc| enc.get_ids().len()) - .unwrap_or(text.len() / 4 + 1) + self.tokenizer.token_count(text) } fn dim(&self) -> usize { @@ -1093,7 +879,7 @@ fn extract_json_object(text: &str) -> Option<&str> { None } -// ── CandleOrchestrator — GGUF text generation via candle ───────────────────── +// ── LlamaOrchestrator — GGUF text generation via llama.cpp ───────────────────── const ORCHESTRATOR_SYSTEM_PROMPT: &str = r#"You are a search query analyzer. Given a user's search query, classify it and expand it. @@ -1103,33 +889,36 @@ Return JSON with: Be concise. Only return the JSON object."#; -/// Quantized Qwen3 model for query orchestration and expansion. +/// Quantized Qwen3 model for query orchestration and expansion via llama.cpp. /// /// Loads a Qwen3 GGUF model and performs autoregressive generation to classify /// queries and produce expansions. Falls back to `heuristic_orchestrate` if -/// generation or JSON parsing fails. -pub struct CandleOrchestrator { - model: candle_transformers::models::quantized_qwen3::ModelWeights, - tokenizer: tokenizers::Tokenizer, - device: Device, +/// generation or JSON parsing fails. Uses Metal acceleration on macOS automatically. +/// +/// Uses llama.cpp's built-in tokenizer for both encoding and decoding — no +/// external tokenizer.json required. The global `LlamaBackend` is used via +/// `llama_backend()`. +pub struct LlamaOrchestrator { + model: LlamaModel, } -impl std::fmt::Debug for CandleOrchestrator { +// Safety: LlamaModel is Send+Sync per llama-cpp-2 docs. +// LlamaContext is created per-call and never stored. 
+unsafe impl Send for LlamaOrchestrator {} + +impl std::fmt::Debug for LlamaOrchestrator { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("CandleOrchestrator") - .field("device", &self.device) - .finish() + f.debug_struct("LlamaOrchestrator").finish() } } -impl CandleOrchestrator { +impl LlamaOrchestrator { /// Load a Qwen3 GGUF model for orchestration from `models_dir`. /// /// Steps: /// 1. Resolve model URI (from config override or `ModelDefaults`) /// 2. `ensure_model()` to download if needed - /// 3. Load tokenizer from the model repo (or the non-GGUF base repo) - /// 4. Load GGUF via `ModelWeights::from_gguf()` + /// 3. Load GGUF model via llama.cpp (uses built-in tokenizer — no tokenizer.json needed) pub fn new(models_dir: &Path, config: &crate::config::Config) -> Result { let defaults = ModelDefaults::default(); let uri_str = config @@ -1140,71 +929,17 @@ impl CandleOrchestrator { let uri = HfModelUri::parse(uri_str)?; let model_path = ensure_model(&uri, models_dir)?; - // Load tokenizer (same strategy as CandleEmbed). - let tokenizer = Self::load_tokenizer(&uri, models_dir)?; + // Use global backend and llama.cpp's built-in tokenizer (no tokenizer.json required). + let backend = llama_backend()?; + let model_params = LlamaModelParams::default(); + let model = + LlamaModel::load_from_file(backend, &model_path, &model_params).map_err(|e| { + anyhow::anyhow!("loading orchestrator model {}: {e}", model_path.display()) + })?; - let device = select_device()?; - - // Load GGUF model. 
- let mut file = std::fs::File::open(&model_path) - .map_err(|e| anyhow::anyhow!("opening GGUF {}: {e}", model_path.display()))?; - let ct = candle_core::quantized::gguf_file::Content::read(&mut file) - .map_err(|e| anyhow::anyhow!("reading GGUF {}: {e}", model_path.display()))?; - let model = candle_transformers::models::quantized_qwen3::ModelWeights::from_gguf( - ct, &mut file, &device, - ) - .map_err(|e| anyhow::anyhow!("loading Qwen3 model weights: {e}"))?; - - tracing::info!( - "loaded CandleOrchestrator from {}, device={:?}", - uri_str, - device - ); - - Ok(Self { - model, - tokenizer, - device, - }) - } - - /// Try to load tokenizer.json from the same HF repo, or from the non-GGUF base repo. - fn load_tokenizer(uri: &HfModelUri, models_dir: &Path) -> Result { - // Try 1: tokenizer.json from the same repo. - let tok_uri = HfModelUri { - repo: uri.repo.clone(), - filename: "tokenizer.json".to_string(), - }; - let tok_path = tok_uri.cache_path(models_dir); - if tok_path.exists() { - return tokenizers::Tokenizer::from_file(&tok_path).map_err(|e| { - anyhow::anyhow!("loading tokenizer from {}: {e}", tok_path.display()) - }); - } - - // Try 2: download from the same repo. - if let Ok(p) = ensure_model(&tok_uri, models_dir) { - return tokenizers::Tokenizer::from_file(&p) - .map_err(|e| anyhow::anyhow!("loading tokenizer from {}: {e}", p.display())); - } - - // Try 3: non-GGUF variant of the repo (e.g., "Qwen/Qwen3-0.6B-GGUF" -> "Qwen/Qwen3-0.6B"). 
- let base_repo = uri.repo.trim_end_matches("-GGUF").to_string(); - if base_repo != uri.repo { - let base_tok_uri = HfModelUri { - repo: base_repo, - filename: "tokenizer.json".to_string(), - }; - if let Ok(p) = ensure_model(&base_tok_uri, models_dir) { - return tokenizers::Tokenizer::from_file(&p) - .map_err(|e| anyhow::anyhow!("loading tokenizer from {}: {e}", p.display())); - } - } + tracing::info!("loaded LlamaOrchestrator from {}", uri_str); - bail!( - "could not find or download tokenizer for model repo '{}'", - uri.repo - ); + Ok(Self { model }) } /// Format a chat prompt in Qwen3 ChatML format. @@ -1218,89 +953,76 @@ impl CandleOrchestrator { /// Run autoregressive generation (greedy decode) up to `max_tokens`. /// Returns the generated text (excluding the prompt). - fn generate(&mut self, prompt: &str, max_tokens: usize) -> Result { - self.model.clear_kv_cache(); - - let encoding = self - .tokenizer - .encode(prompt, true) + fn generate(&self, prompt: &str, max_tokens: usize) -> Result { + // Tokenize using llama.cpp's built-in tokenizer. + let tokens = self + .model + .str_to_token(prompt, AddBos::Always) .map_err(|e| anyhow::anyhow!("tokenization failed: {e}"))?; - let prompt_tokens = encoding.get_ids(); - if prompt_tokens.is_empty() { + if tokens.is_empty() { bail!("tokenizer returned empty token sequence"); } - // Determine EOS token ID. - let eos_token_id = self - .tokenizer - .token_to_id("<|im_end|>") - .or_else(|| self.tokenizer.token_to_id("<|endoftext|>")) - .unwrap_or(151643); // Qwen3 default EOS - - // Process the prompt in a single forward pass. - let input = Tensor::new(prompt_tokens, &self.device)? - .unsqueeze(0) - .map_err(|e| anyhow::anyhow!("unsqueeze prompt: {e}"))?; - let logits = self + // Create context per-call (LlamaContext is !Send). 
+ let n_ctx = (tokens.len() + max_tokens + 16) as u32; + let ctx_params = LlamaContextParams::default().with_n_ctx(std::num::NonZeroU32::new(n_ctx)); + let mut ctx = self .model - .forward(&input, 0) - .map_err(|e| anyhow::anyhow!("forward pass (prompt): {e}"))?; - - // Get the last token's logits and pick argmax. - let logits = logits - .to_dtype(DType::F32) - .map_err(|e| anyhow::anyhow!("logits dtype: {e}"))?; - let next_token = logits - .i(0)? - .argmax(D::Minus1) - .map_err(|e| anyhow::anyhow!("argmax: {e}"))? - .to_scalar::() - .map_err(|e| anyhow::anyhow!("scalar: {e}"))?; - - let mut generated_tokens: Vec = vec![next_token]; - let mut offset = prompt_tokens.len(); - - if next_token == eos_token_id { - // Model produced EOS immediately. - return Ok(String::new()); + .new_context(llama_backend()?, ctx_params) + .map_err(|e| anyhow::anyhow!("creating orchestrator context: {e}"))?; + + // Process prompt tokens in a batch. + let mut batch = LlamaBatch::new(tokens.len() + max_tokens + 16, 1); + for (i, token) in tokens.iter().enumerate() { + let is_last = i == tokens.len() - 1; + batch + .add(*token, i as i32, &[0], is_last) + .map_err(|e| anyhow::anyhow!("adding prompt token to batch: {e}"))?; } - // Autoregressive loop. - for _ in 1..max_tokens { - let input = Tensor::new(&[*generated_tokens.last().unwrap()], &self.device)? - .unsqueeze(0) - .map_err(|e| anyhow::anyhow!("unsqueeze step: {e}"))?; - let logits = self - .model - .forward(&input, offset) - .map_err(|e| anyhow::anyhow!("forward pass (step): {e}"))?; - offset += 1; - - let logits = logits - .to_dtype(DType::F32) - .map_err(|e| anyhow::anyhow!("logits dtype: {e}"))?; - let token = logits - .i(0)? - .argmax(D::Minus1) - .map_err(|e| anyhow::anyhow!("argmax: {e}"))? - .to_scalar::() - .map_err(|e| anyhow::anyhow!("scalar: {e}"))?; - - if token == eos_token_id { + ctx.decode(&mut batch) + .map_err(|e| anyhow::anyhow!("prompt decode failed: {e}"))?; + + // Autoregressive generation loop. 
+ let mut sampler = LlamaSampler::greedy(); + let mut output = String::new(); + // Each token may produce multi-byte UTF-8 sequences; use an encoding_rs decoder + // to correctly reassemble them across token boundaries. + let mut decoder = encoding_rs::UTF_8.new_decoder(); + let mut n_cur = tokens.len(); + + for _ in 0..max_tokens { + let new_token = sampler.sample(&ctx, batch.n_tokens() - 1); + sampler.accept(new_token); + + // Check for end-of-generation. + if self.model.is_eog_token(new_token) { break; } - generated_tokens.push(token); + + // Decode this token to text using llama.cpp's built-in tokenizer. + let piece = self + .model + .token_to_piece(new_token, &mut decoder, false, None) + .map_err(|e| anyhow::anyhow!("token_to_piece failed: {e}"))?; + output.push_str(&piece); + + // Add token to batch for next iteration. + batch.clear(); + batch + .add(new_token, n_cur as i32, &[0], true) + .map_err(|e| anyhow::anyhow!("adding generated token to batch: {e}"))?; + n_cur += 1; + + ctx.decode(&mut batch) + .map_err(|e| anyhow::anyhow!("generation decode failed: {e}"))?; } - let text = self - .tokenizer - .decode(&generated_tokens, true) - .map_err(|e| anyhow::anyhow!("decoding generated tokens: {e}"))?; - Ok(text) + Ok(output) } } -impl OrchestratorModel for CandleOrchestrator { +impl OrchestratorModel for LlamaOrchestrator { fn orchestrate(&mut self, query: &str) -> Result { let prompt = Self::format_prompt(query); @@ -1322,7 +1044,7 @@ impl OrchestratorModel for CandleOrchestrator { } } -// ── CandleRerank — GGUF cross-encoder reranker via candle ───────────────────── +// ── LlamaRerank — GGUF cross-encoder reranker via llama.cpp ───────────────────── /// Format query+document for cross-encoder reranking. pub fn format_reranker_input(query: &str, document: &str) -> String { @@ -1334,39 +1056,43 @@ pub fn format_reranker_input(query: &str, document: &str) -> String { ) } -/// Quantized Qwen3 cross-encoder for reranking search results. 
+/// Quantized Qwen3 cross-encoder for reranking search results via llama.cpp. /// /// Loads a Qwen3-Reranker GGUF model and scores (query, document) pairs by /// running a single forward pass and extracting Yes/No logit probabilities. -/// Unlike `CandleOrchestrator`, this does NOT do autoregressive generation — +/// Unlike `LlamaOrchestrator`, this does NOT do autoregressive generation — /// just one pass through the full input to get logits at the last position. -pub struct CandleRerank { - model: candle_transformers::models::quantized_qwen3::ModelWeights, - tokenizer: tokenizers::Tokenizer, - device: Device, - yes_token_id: u32, - no_token_id: u32, +/// +/// Uses llama.cpp's built-in tokenizer to look up Yes/No token IDs — no +/// external tokenizer.json required. The global `LlamaBackend` is used via +/// `llama_backend()`. +pub struct LlamaRerank { + model: LlamaModel, + yes_token_id: i32, + no_token_id: i32, } -impl std::fmt::Debug for CandleRerank { +// Safety: LlamaModel is Send+Sync per llama-cpp-2 docs. +// LlamaContext is created per-call and never stored. +unsafe impl Send for LlamaRerank {} + +impl std::fmt::Debug for LlamaRerank { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("CandleRerank") - .field("device", &self.device) + f.debug_struct("LlamaRerank") .field("yes_token_id", &self.yes_token_id) .field("no_token_id", &self.no_token_id) .finish() } } -impl CandleRerank { +impl LlamaRerank { /// Load a Qwen3-Reranker GGUF model from `models_dir`. /// /// Steps: /// 1. Resolve model URI (from config override or `ModelDefaults::default().rerank_uri`) /// 2. `ensure_model()` to download if needed - /// 3. Load tokenizer from the model repo (or the non-GGUF base repo) - /// 4. Load GGUF via `ModelWeights::from_gguf()` - /// 5. Look up "Yes" and "No" token IDs from the tokenizer + /// 3. Load GGUF model via llama.cpp + /// 4. 
Look up Yes/No token IDs using the model's built-in tokenizer (no tokenizer.json needed) pub fn new(models_dir: &Path, config: &crate::config::Config) -> Result { let defaults = ModelDefaults::default(); let uri_str = config @@ -1377,132 +1103,86 @@ impl CandleRerank { let uri = HfModelUri::parse(uri_str)?; let model_path = ensure_model(&uri, models_dir)?; - // Load tokenizer (same strategy as CandleOrchestrator). - let tokenizer = Self::load_tokenizer(&uri, models_dir)?; - - // Look up Yes/No token IDs. - let yes_token_id = tokenizer - .token_to_id("Yes") - .ok_or_else(|| anyhow::anyhow!("tokenizer has no 'Yes' token"))?; - let no_token_id = tokenizer - .token_to_id("No") - .ok_or_else(|| anyhow::anyhow!("tokenizer has no 'No' token"))?; - - let device = select_device()?; - - // Load GGUF model. - let mut file = std::fs::File::open(&model_path) - .map_err(|e| anyhow::anyhow!("opening GGUF {}: {e}", model_path.display()))?; - let ct = candle_core::quantized::gguf_file::Content::read(&mut file) - .map_err(|e| anyhow::anyhow!("reading GGUF {}: {e}", model_path.display()))?; - let model = candle_transformers::models::quantized_qwen3::ModelWeights::from_gguf( - ct, &mut file, &device, - ) - .map_err(|e| anyhow::anyhow!("loading Qwen3 reranker model weights: {e}"))?; + // Use global backend and llama.cpp's built-in tokenizer (no tokenizer.json required). + let backend = llama_backend()?; + let model_params = LlamaModelParams::default(); + let model = LlamaModel::load_from_file(backend, &model_path, &model_params) + .map_err(|e| anyhow::anyhow!("loading reranker model {}: {e}", model_path.display()))?; + + // Look up Yes/No token IDs via the model's built-in tokenizer. + // str_to_token returns Vec; we take the first token ID (skip BOS). 
+ let yes_tokens = model + .str_to_token("Yes", AddBos::Never) + .map_err(|e| anyhow::anyhow!("tokenizing 'Yes': {e}"))?; + let yes_token_id = yes_tokens + .first() + .map(|t| t.0) + .ok_or_else(|| anyhow::anyhow!("model tokenizer returned no tokens for 'Yes'"))?; + + let no_tokens = model + .str_to_token("No", AddBos::Never) + .map_err(|e| anyhow::anyhow!("tokenizing 'No': {e}"))?; + let no_token_id = no_tokens + .first() + .map(|t| t.0) + .ok_or_else(|| anyhow::anyhow!("model tokenizer returned no tokens for 'No'"))?; tracing::info!( - "loaded CandleRerank from {}, device={:?}, yes_id={}, no_id={}", + "loaded LlamaRerank from {}, yes_id={}, no_id={}", uri_str, - device, yes_token_id, no_token_id ); Ok(Self { model, - tokenizer, - device, yes_token_id, no_token_id, }) } - - /// Try to load tokenizer.json from the same HF repo, or from the non-GGUF base repo. - fn load_tokenizer(uri: &HfModelUri, models_dir: &Path) -> Result { - // Try 1: tokenizer.json from the same repo. - let tok_uri = HfModelUri { - repo: uri.repo.clone(), - filename: "tokenizer.json".to_string(), - }; - let tok_path = tok_uri.cache_path(models_dir); - if tok_path.exists() { - return tokenizers::Tokenizer::from_file(&tok_path).map_err(|e| { - anyhow::anyhow!("loading tokenizer from {}: {e}", tok_path.display()) - }); - } - - // Try 2: download from the same repo. - if let Ok(p) = ensure_model(&tok_uri, models_dir) { - return tokenizers::Tokenizer::from_file(&p) - .map_err(|e| anyhow::anyhow!("loading tokenizer from {}: {e}", p.display())); - } - - // Try 3: non-GGUF variant of the repo. 
- let base_repo = uri.repo.trim_end_matches("-GGUF").to_string(); - if base_repo != uri.repo { - let base_tok_uri = HfModelUri { - repo: base_repo, - filename: "tokenizer.json".to_string(), - }; - if let Ok(p) = ensure_model(&base_tok_uri, models_dir) { - return tokenizers::Tokenizer::from_file(&p) - .map_err(|e| anyhow::anyhow!("loading tokenizer from {}: {e}", p.display())); - } - } - - bail!( - "could not find or download tokenizer for model repo '{}'", - uri.repo - ); - } } -impl RerankModel for CandleRerank { +impl RerankModel for LlamaRerank { fn rerank_score(&mut self, query: &str, document: &str) -> Result { - self.model.clear_kv_cache(); - let input_text = format_reranker_input(query, document); - let encoding = self - .tokenizer - .encode(input_text.as_str(), true) + // Tokenize using llama.cpp's built-in tokenizer. + let tokens = self + .model + .str_to_token(&input_text, AddBos::Always) .map_err(|e| anyhow::anyhow!("tokenization failed: {e}"))?; - let token_ids = encoding.get_ids(); - if token_ids.is_empty() { + if tokens.is_empty() { bail!("tokenizer returned empty token sequence"); } - // Single forward pass through the full input (no autoregressive generation). - let input = Tensor::new(token_ids, &self.device)? - .unsqueeze(0) - .map_err(|e| anyhow::anyhow!("unsqueeze input: {e}"))?; - let logits = self + // Create context per-call (LlamaContext is !Send). + let n_ctx = (tokens.len() + 16) as u32; + let ctx_params = LlamaContextParams::default().with_n_ctx(std::num::NonZeroU32::new(n_ctx)); + let mut ctx = self .model - .forward(&input, 0) - .map_err(|e| anyhow::anyhow!("forward pass: {e}"))?; - - // logits shape: [1, seq_len, vocab_size] or [1, vocab_size] (last position). - // Extract logits for the last position. - let logits = logits - .to_dtype(DType::F32) - .map_err(|e| anyhow::anyhow!("logits dtype: {e}"))?; - let last_logits = logits - .i(0) - .map_err(|e| anyhow::anyhow!("batch index: {e}"))?; - - // Extract Yes/No logits. 
- let yes_logit: f32 = last_logits - .i(self.yes_token_id as usize) - .map_err(|e| anyhow::anyhow!("yes logit index: {e}"))? - .to_scalar() - .map_err(|e| anyhow::anyhow!("yes logit scalar: {e}"))?; - let no_logit: f32 = last_logits - .i(self.no_token_id as usize) - .map_err(|e| anyhow::anyhow!("no logit index: {e}"))? - .to_scalar() - .map_err(|e| anyhow::anyhow!("no logit scalar: {e}"))?; - - // Softmax over Yes/No to get probability. + .new_context(llama_backend()?, ctx_params) + .map_err(|e| anyhow::anyhow!("creating reranker context: {e}"))?; + + // Create batch with all tokens; mark last as logit-producing. + let mut batch = LlamaBatch::new(tokens.len() + 16, 1); + for (i, token) in tokens.iter().enumerate() { + let is_last = i == tokens.len() - 1; + batch + .add(*token, i as i32, &[0], is_last) + .map_err(|e| anyhow::anyhow!("adding token to reranker batch: {e}"))?; + } + + // Single forward pass through the full input. + ctx.decode(&mut batch) + .map_err(|e| anyhow::anyhow!("reranker decode failed: {e}"))?; + + // Get logits for the last token position. + let logits = ctx.get_logits_ith(batch.n_tokens() - 1); + + // Extract Yes/No logits and compute softmax probability. 
+ let yes_logit = logits[self.yes_token_id as usize]; + let no_logit = logits[self.no_token_id as usize]; + let max_logit = yes_logit.max(no_logit); let yes_exp = (yes_logit - max_logit).exp(); let no_exp = (no_logit - max_logit).exp(); @@ -1629,17 +1309,21 @@ mod tests { let defaults = ModelDefaults::default(); assert!(defaults.embed_uri.starts_with("hf:")); assert_eq!(defaults.embed_dim, 256); + assert!( + defaults.embed_uri.contains("embeddinggemma"), + "default embed model should be embeddinggemma" + ); } - // ── CandleEmbed / PromptFormat tests ──────────────────────────────────── + // ── LlamaEmbed / PromptFormat tests ──────────────────────────────────── #[test] - fn test_candle_embed_struct_exists() { + fn test_llama_embed_struct_exists() { fn assert_embed_model(_e: &E) {} let mock = MockLlm::new(256); assert_embed_model(&mock); - // CandleEmbed also implements EmbedModel — verified at compile time. - // We can't instantiate CandleEmbed without a real GGUF model, + // LlamaEmbed also implements EmbedModel — verified at compile time. + // We can't instantiate LlamaEmbed without a real GGUF model, // but the trait bound compiles. } @@ -1690,15 +1374,6 @@ mod tests { assert_eq!(formatted, "Title\nBody"); } - #[test] - fn test_select_device_returns_cpu_by_default() { - // Without the `metal` feature, select_device should return CPU. - let device = select_device().unwrap(); - // On CI/test without metal feature, this should be CPU. - // With metal feature on macOS, it could be Metal — both are valid. - let _ = device; // Just verify it doesn't error. 
- } - // ── heuristic_orchestrate tests ────────────────────────────────────────── #[test] @@ -1799,11 +1474,11 @@ mod tests { assert!(parse_orchestration_json(json).is_err()); } - // ── CandleOrchestrator tests ───────────────────────────────────────────── + // ── LlamaOrchestrator tests ───────────────────────────────────────────── #[test] - fn test_candle_orchestrator_format_prompt() { - let prompt = CandleOrchestrator::format_prompt("how does auth work"); + fn test_llama_orchestrator_format_prompt() { + let prompt = LlamaOrchestrator::format_prompt("how does auth work"); assert!(prompt.contains("<|im_start|>system")); assert!(prompt.contains("<|im_end|>")); assert!(prompt.contains("<|im_start|>user")); @@ -1812,13 +1487,13 @@ mod tests { } #[test] - fn test_candle_orchestrator_implements_trait() { - // Compile-time check: CandleOrchestrator implements OrchestratorModel. + fn test_llama_orchestrator_implements_trait() { + // Compile-time check: LlamaOrchestrator implements OrchestratorModel. fn assert_orchestrator() {} - assert_orchestrator::(); + assert_orchestrator::(); } - // ── CandleRerank tests ────────────────────────────────────────────────── + // ── LlamaRerank tests ────────────────────────────────────────────────── #[test] fn test_format_reranker_input() { @@ -1829,7 +1504,7 @@ mod tests { } #[test] - fn test_candle_rerank_trait_compliance() { + fn test_llama_rerank_trait_compliance() { // Verify MockLlm still satisfies RerankModel. 
fn assert_rerank(_r: &R) {} let mock = MockLlm::new(256); diff --git a/src/main.rs b/src/main.rs index 7d2872b..10a20f3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -829,7 +829,7 @@ async fn main() -> Result<()> { } ContextAction::Topic { query, budget } => { let models_dir = data_dir.join("models"); - let mut embedder = engraph::llm::CandleEmbed::new(&models_dir, &cfg)?; + let mut embedder = engraph::llm::LlamaEmbed::new(&models_dir, &cfg)?; let bundle = engraph::context::context_topic_with_search( &params, @@ -882,7 +882,7 @@ async fn main() -> Result<()> { .ok_or_else(|| anyhow::anyhow!("No vault path in index."))?; let vault_path = PathBuf::from(&vault_path_str); let models_dir = data_dir.join("models"); - let mut embedder = engraph::llm::CandleEmbed::new(&models_dir, &cfg)?; + let mut embedder = engraph::llm::LlamaEmbed::new(&models_dir, &cfg)?; let profile = config::Config::load_vault_profile().ok().flatten(); match action { diff --git a/src/search.rs b/src/search.rs index 430d56d..eb41be5 100644 --- a/src/search.rs +++ b/src/search.rs @@ -10,7 +10,6 @@ use crate::llm::{self, EmbedModel, OrchestratorModel, RerankModel}; use crate::store::{Store, StoreStats}; /// Compute cache key for orchestration results (SHA256 of query). -#[allow(dead_code)] fn orchestration_cache_key(query: &str) -> String { use sha2::{Digest, Sha256}; let hash = Sha256::digest(query.as_bytes()); @@ -85,11 +84,32 @@ pub fn search_with_intelligence( embedder: &mut impl EmbedModel, config: &mut SearchConfig<'_>, ) -> Result { - // --- Step 1: Orchestrate --- + // --- Step 1: Orchestrate (with LLM cache when orchestrator is present) --- let orchestration = match &mut config.orchestrator { - Some(orch) => orch.orchestrate(query)?, + Some(orch) => { + let cache_key = orchestration_cache_key(query); + if let Some(cached_json) = config.store.get_llm_cache(&cache_key)? 
{ + serde_json::from_str(&cached_json).unwrap_or_else(|_| { + orch.orchestrate(query) + .unwrap_or_else(|_| llm::heuristic_orchestrate(query)) + }) + } else { + let result = orch.orchestrate(query)?; + if let Ok(json) = serde_json::to_string(&result) { + let _ = config + .store + .set_llm_cache(&cache_key, &json, "orchestrator"); + } + result + } + } None => llm::heuristic_orchestrate(query), }; + tracing::debug!( + intent = ?orchestration.intent, + expansions = orchestration.expansions.len(), + "orchestration complete" + ); let weights = llm::LaneWeights::from_intent(&orchestration.intent); // --- Step 2: Run 3-lane retrieval for EACH expanded query --- @@ -302,12 +322,49 @@ pub fn run_search( ) -> Result<()> { let models_dir = data_dir.join("models"); let mut embedder = - crate::llm::CandleEmbed::new(&models_dir, config).context("loading embedder")?; + crate::llm::LlamaEmbed::new(&models_dir, config).context("loading embedder")?; let db_path = data_dir.join("engraph.db"); let store = Store::open(&db_path).context("opening store")?; - let output = search_internal(query, top_n, &store, &mut embedder)?; + // Load intelligence models if enabled. 
+ let mut orchestrator_model: Option> = + if config.intelligence_enabled() { + match crate::llm::LlamaOrchestrator::new(&models_dir, config) { + Ok(o) => Some(Box::new(o)), + Err(e) => { + tracing::warn!("failed to load orchestrator: {e}"); + None + } + } + } else { + None + }; + let mut reranker_model: Option> = if config.intelligence_enabled() { + match crate::llm::LlamaRerank::new(&models_dir, config) { + Ok(r) => Some(Box::new(r)), + Err(e) => { + tracing::warn!("failed to load reranker: {e}"); + None + } + } + } else { + None + }; + + let output = { + let mut search_config = SearchConfig { + orchestrator: orchestrator_model + .as_mut() + .map(|o| o.as_mut() as &mut dyn llm::OrchestratorModel), + reranker: reranker_model + .as_mut() + .map(|r| r.as_mut() as &mut dyn llm::RerankModel), + store: &store, + rerank_candidates: 30, + }; + search_with_intelligence(query, top_n, &mut embedder, &mut search_config)? + }; let results: Vec = output .results diff --git a/src/serve.rs b/src/serve.rs index 48ba85b..6a91346 100644 --- a/src/serve.rs +++ b/src/serve.rs @@ -132,10 +132,8 @@ pub struct EngraphServer { profile: Arc>, tool_router: ToolRouter, /// Query expansion orchestrator (None when intelligence is disabled or failed to load). - #[allow(dead_code)] orchestrator: Option>>>, /// Result reranker (None when intelligence is disabled or failed to load). - #[allow(dead_code)] reranker: Option>>>, } @@ -168,8 +166,31 @@ impl EngraphServer { let top_n = params.0.top_n.unwrap_or(10); let store = self.store.lock().await; let mut embedder = self.embedder.lock().await; - let output = search::search_internal(&params.0.query, top_n, &store, &mut *embedder) - .map_err(|e| mcp_err(&e))?; + + // Lock orchestrator and reranker if available for intelligence-enhanced search. 
+ let mut orch_guard = match &self.orchestrator { + Some(o) => Some(o.lock().await), + None => None, + }; + let mut rerank_guard = match &self.reranker { + Some(r) => Some(r.lock().await), + None => None, + }; + + let mut config = search::SearchConfig { + orchestrator: orch_guard + .as_mut() + .map(|g| g.as_mut() as &mut dyn OrchestratorModel), + reranker: rerank_guard + .as_mut() + .map(|g| g.as_mut() as &mut dyn RerankModel), + store: &store, + rerank_candidates: 30, + }; + + let output = + search::search_with_intelligence(&params.0.query, top_n, &mut *embedder, &mut config) + .map_err(|e| mcp_err(&e))?; to_json_result(&output.results) } @@ -416,7 +437,7 @@ pub async fn run_serve(data_dir: &Path) -> Result<()> { let store = Store::open(&db_path)?; let config = Config::load()?; - let embedder = crate::llm::CandleEmbed::new(&models_dir, &config)?; + let embedder = crate::llm::LlamaEmbed::new(&models_dir, &config)?; let vault_path_str = store.get_meta("vault_path")?.ok_or_else(|| { anyhow::anyhow!("No vault path in index. 
Run 'engraph index ' first.") @@ -441,7 +462,7 @@ pub async fn run_serve(data_dir: &Path) -> Result<()> { // Load intelligence models if enabled let orchestrator: Option>>> = if config.intelligence_enabled() { - match crate::llm::CandleOrchestrator::new(&models_dir, &config) { + match crate::llm::LlamaOrchestrator::new(&models_dir, &config) { Ok(orch) => Some(Arc::new(Mutex::new( Box::new(orch) as Box ))), @@ -456,7 +477,7 @@ pub async fn run_serve(data_dir: &Path) -> Result<()> { let reranker: Option>>> = if config.intelligence_enabled() { - match crate::llm::CandleRerank::new(&models_dir, &config) { + match crate::llm::LlamaRerank::new(&models_dir, &config) { Ok(rerank) => Some(Arc::new(Mutex::new( Box::new(rerank) as Box ))), diff --git a/src/store.rs b/src/store.rs index 4485bf3..30f0088 100644 --- a/src/store.rs +++ b/src/store.rs @@ -141,7 +141,12 @@ impl Store { .context("failed to initialize schema")?; self.migrate()?; self.ensure_fts_table()?; - crate::vecstore::init_vec_table(&self.conn, 256)?; + // Use stored embedding dimension if available, defaulting to 256 for new databases. + let dim = self + .get_meta("embedding_dim")? + .and_then(|s| s.parse::<usize>().ok()) + .unwrap_or(256); + crate::vecstore::init_vec_table(&self.conn, dim)?; self.migrate_vectors_to_vec0()?; Ok(()) } @@ -1165,11 +1170,12 @@ impl Store { } } - /// Drop the vec table and all chunk records. Used during dimension migration. + /// Drop the vec table and all chunk/FTS records. Used during dimension migration. pub fn reset_for_reindex(&self, new_dim: usize) -> Result<()> { self.conn.execute("DROP TABLE IF EXISTS chunks_vec", [])?; crate::vecstore::init_vec_table(&self.conn, new_dim)?; self.conn.execute("DELETE FROM chunks", [])?; + self.conn.execute("DELETE FROM chunks_fts", [])?; Ok(()) }