diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ce4818b..c5f4d07 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -14,6 +14,9 @@ jobs:
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
+      - name: Install CMake (Ubuntu)
+        if: runner.os == 'Linux'
+        run: sudo apt-get update && sudo apt-get install -y cmake
       - uses: dtolnay/rust-toolchain@stable
         with:
           components: rustfmt, clippy
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index d10be69..d63db36 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -19,6 +19,9 @@ jobs:
       contents: write
     steps:
       - uses: actions/checkout@v4
+      - name: Install CMake (Ubuntu)
+        if: runner.os == 'Linux'
+        run: sudo apt-get update && sudo apt-get install -y cmake
       - uses: dtolnay/rust-toolchain@stable
       - run: cargo build --release
       - name: Archive binary
@@ -60,6 +63,7 @@ jobs:
             sha256 "SHA256"
             license "MIT"
 
+            depends_on "cmake" => :build
             depends_on "rust" => :build
 
             def install
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5e71f03..703e2b7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,29 @@
 # Changelog
 
+## [1.0.1] - 2026-03-26
+
+### Changed
+- **Inference backend switched from candle to llama.cpp** — via `llama-cpp-2` Rust bindings. Gets full Metal GPU acceleration on macOS (88 files indexed in 70s vs 37+ minutes on CPU with candle). Same backend as [qmd](https://github.com/tobi/qmd).
+- Default embedding model produces 256-dim vectors via embeddinggemma-300M (Matryoshka truncation)
+- BERT GGUF architecture support added alongside Gemma (future model flexibility)
+- Progress bar during indexing via indicatif (was silent for minutes)
+- CI workflow installs CMake on Ubuntu (required for llama.cpp build)
+
+### Fixed
+- **Prompt format applied during embedding** — `embed_one` uses search_query prefix, `embed_batch` uses search_document prefix. Without this, embeddinggemma operated in wrong symmetric mode.
+- **GGUF tokenizer fallback** — added `shimmytok` crate to extract tokenizer from GGUF metadata when tokenizer.json is unavailable (Google Gemma repos are gated)
+- **LlamaBackend singleton** — global `OnceLock` prevents double-initialization crash when loading multiple models
+- **Orchestrator/reranker use built-in tokenizer** — llama.cpp reads tokenizer from GGUF metadata, no external tokenizer.json needed
+- **Dimension migration clears FTS** — `reset_for_reindex` now also clears `chunks_fts` to prevent duplicate entries
+- **LLM cache wired into search** — `search_with_intelligence` checks/populates `llm_cache` table
+- **MCP server wires intelligence** — search handler passes orchestrator + reranker via `SearchConfig`
+- **CLI search wires intelligence** — `run_search` loads models when intelligence enabled
+- **Qwen3 GGUF filename** — fixed case sensitivity (was 404)
+- **Embedding batch params** — `n_ubatch >= n_tokens` assertion, use `encode()` not `decode()`, `AddBos::Never` (PromptFormat adds `<bos>`)
+
+### Removed
+- `candle-core`, `candle-nn`, `candle-transformers` dependencies (replaced by `llama-cpp-2`)
+
 ## [1.0.0] - 2026-03-25
 
 ### Added
diff --git a/CLAUDE.md b/CLAUDE.md
index 86ae2e3..f5739b8 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -9,7 +9,7 @@ Single binary with 19 modules behind a lib crate:
 - `config.rs` — loads `~/.engraph/config.toml` and `vault.toml`, merges CLI args, provides `data_dir()`. Includes `intelligence: Option<bool>` and `[models]` section for model overrides. `Config::save()` writes back to disk.
 - `chunker.rs` — smart chunking with break-point scoring algorithm. Finds optimal split points considering headings, code fences, blank lines, and thematic breaks. `split_oversized_chunks()` handles token-aware secondary splitting with overlap
 - `docid.rs` — deterministic 6-char hex IDs for files (SHA-256 of path, truncated). Shown in search results for quick reference
-- `llm.rs` — candle model management. Three traits: `EmbedModel` (embeddings), `RerankModel` (cross-encoder scoring), `OrchestratorModel` (query intent + expansion). Three candle implementations: `CandleEmbed` (custom bidirectional transformer from GGUF for embeddinggemma), `CandleOrchestrator` (quantized_qwen3 for query analysis), `CandleRerank` (quantized_qwen3 for relevance scoring). Also: `MockLlm` for testing, `HfModelUri` for model download, `PromptFormat` for model-family prompt templates, `heuristic_orchestrate()` fast path, `LaneWeights` per query intent
+- `llm.rs` — ML inference via llama.cpp (Rust bindings: `llama-cpp-2`). Three traits: `EmbedModel` (embeddings), `RerankModel` (cross-encoder scoring), `OrchestratorModel` (query intent + expansion). Three llama.cpp implementations: `LlamaEmbed` (embeddinggemma-300M GGUF on Metal GPU), `LlamaOrchestrator` (Qwen3-0.6B for query analysis + expansion), `LlamaRerank` (Qwen3-Reranker-0.6B for relevance scoring). Global `LlamaBackend` via `OnceLock`. Also: `MockLlm` for testing, `HfModelUri` for model download, `FlexTokenizer` (HuggingFace tokenizers + shimmytok GGUF fallback), `PromptFormat` for model-family prompt templates, `heuristic_orchestrate()` fast path, `LaneWeights` per query intent
 - `fts.rs` — FTS5 full-text search support. Re-exports `FtsResult` from store. BM25-ranked keyword search
 - `fusion.rs` — Reciprocal Rank Fusion (RRF) engine. Merges semantic + FTS5 + graph + reranker results. Supports per-lane weighting, `--explain` output with intent + per-lane detail
 - `context.rs` — context engine. Six functions: `read` (full note content + metadata), `list` (filtered note listing with `created_by` filter), `vault_map` (structure overview), `who` (person context bundle), `project` (project context bundle), `context_topic` (rich topic context with budget trimming). Pure functions taking `ContextParams` — no model loading except `context_topic` which reuses `search_internal`
@@ -52,14 +52,13 @@ Single vault only. Re-indexing a different vault path triggers a confirmation pr
 
 ## Dependencies to be aware of
 
-- `candle-core` (0.9) — HuggingFace pure Rust ML framework. GGUF model loading, tensor ops. `metal` feature for macOS GPU acceleration
-- `candle-nn` (0.9) — neural network building blocks (RmsNorm, rotary embeddings, etc.)
-- `candle-transformers` (0.9) — pre-built transformer model architectures. Used: `quantized_qwen3` for orchestrator + reranker
+- `llama-cpp-2` (0.1) — Rust bindings to llama.cpp. GGUF model loading + inference. Metal GPU on macOS, CUDA on Linux. Compiles llama.cpp C++ via build script (requires CMake)
+- `shimmytok` (0.7) — pure Rust tokenizer that reads from GGUF metadata. Fallback when tokenizer.json is unavailable (gated HuggingFace repos)
+- `tokenizers` (0.22) — HuggingFace tokenizer. Kept for FlexTokenizer HuggingFace backend
 - `sqlite-vec` (0.1.8-alpha.1) — SQLite extension for vector search. Provides vec0 virtual tables with KNN via `vec_distance_cosine()`
 - `zerocopy` (0.7) — zero-copy serialization for vector data passed to sqlite-vec
 - `strsim` (0.11) — string similarity for fuzzy tag matching and fuzzy link matching
 - `time` (0.3) — date/time handling for frontmatter timestamps
-- `tokenizers` (0.22) — HuggingFace tokenizer. Needs `fancy-regex` feature. Used for all three GGUF models
 - `ignore` (0.4) — vault walking with `.gitignore` support
 - `rusqlite` (0.32) — bundled SQLite with FTS5 support
 - `rmcp` (1.2) — MCP server SDK for stdio transport
@@ -68,12 +67,13 @@ Single vault only. Re-indexing a different vault path triggers a confirmation pr
 
 ## Testing
 
-- Unit tests in each module (`cargo test --lib`) — 271 tests, no network required
+- Unit tests in each module (`cargo test --lib`) — 270 tests, no network required
 - Integration tests (`cargo test --test integration -- --ignored`) — require GGUF model download
+- Build requires CMake (for llama.cpp C++ compilation)
 
 ## CI/CD
 
-- CI: `cargo fmt --check` + `cargo clippy -- -D warnings` + `cargo test --lib` on macOS + Ubuntu
+- CI: `cargo fmt --check` + `cargo clippy -- -D warnings` + `cargo test --lib` on macOS + Ubuntu. Ubuntu step installs CMake.
 - Release: native builds on macOS arm64 (macos-14) + Linux x86_64 (ubuntu-latest). Triggered by `v*` tags
 - Homebrew: `devwhodevs/homebrew-tap` — formula builds from source tarball
 
diff --git a/Cargo.lock b/Cargo.lock
index d6c3ef7..22e8525 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -31,12 +31,6 @@ dependencies = [
  "memchr",
 ]
 
-[[package]]
-name = "allocator-api2"
-version = "0.2.21"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
-
 [[package]]
 name = "android_system_properties"
 version = "0.1.5"
@@ -132,14 +126,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
 
 [[package]]
-name = "bindgen_cuda"
-version = "0.1.6"
+name = "bindgen"
+version = "0.72.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "282be55fb326843bb67cccceeeaf21c961ef303f60018f9a2ab69494dad8eaf9"
+checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
 dependencies = [
- "glob",
- "num_cpus",
- "rayon",
+ "bitflags 2.11.0",
+ "cexpr",
+ "clang-sys",
+ "itertools 0.13.0",
+ "log",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash",
+ "shlex",
+ "syn",
+]
+
+[[package]]
+name = "bit-set"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1"
+dependencies = [
+ "bit-vec 0.6.3",
 ]
 
 [[package]]
@@ -148,9 +160,15 @@ version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
 dependencies = [
- "bit-vec",
+ "bit-vec 0.8.0",
 ]
 
+[[package]]
+name = "bit-vec"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
+
 [[package]]
 name = "bit-vec"
 version = "0.8.0"
@@ -169,12 +187,6 @@ version = "2.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af"
 
-[[package]]
-name = "block"
-version = "0.1.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d8c1fef690941d3e7788d328517591fecc684c084084702d6ff1641e993699a"
-
 [[package]]
 name = "block-buffer"
 version = "0.10.4"
@@ -184,15 +196,6 @@ dependencies = [
  "generic-array",
 ]
 
-[[package]]
-name = "block2"
-version = "0.6.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cdeb9d870516001442e364c5220d3574d2da8dc765554b4a617230d33fa58ef5"
-dependencies = [
- "objc2",
-]
-
 [[package]]
 name = "bstr"
 version = "1.12.1"
@@ -209,26 +212,6 @@ version = "3.20.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
 
-[[package]]
-name = "bytemuck"
-version = "1.25.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec"
-dependencies = [
- "bytemuck_derive",
-]
-
-[[package]]
-name = "bytemuck_derive"
-version = "1.10.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
 [[package]]
 name = "byteorder"
 version = "1.5.0"
@@ -241,105 +224,6 @@ version = "1.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
 
-[[package]]
-name = "candle-core"
-version = "0.9.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c15b675b80d994b2eadb20a4bbe434eabeb454eac3ee5e2b4cf6f147ee9be091"
-dependencies = [
- "byteorder",
- "candle-kernels",
- "candle-metal-kernels",
- "candle-ug",
- "cudarc 0.19.4",
- "float8 0.6.1",
- "gemm 0.19.0",
- "half",
- "libm",
- "memmap2",
- "num-traits",
- "num_cpus",
- "objc2-foundation",
- "objc2-metal",
- "rand",
- "rand_distr",
- "rayon",
- "safetensors 0.7.0",
- "thiserror 2.0.18",
- "yoke 0.8.1",
- "zip",
-]
-
-[[package]]
-name = "candle-kernels"
-version = "0.9.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b8455f84bd810047c7c41216683c1020c915a9f8a740b3b0eabdd4fb2fbaa660"
-dependencies = [
- "bindgen_cuda",
-]
-
-[[package]]
-name = "candle-metal-kernels"
-version = "0.9.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2fdfe9d06de16ce49961e49084e5b79a75a9bdf157246e7c7b6328e87a7aa25d"
-dependencies = [
- "half",
- "objc2",
- "objc2-foundation",
- "objc2-metal",
- "once_cell",
- "thiserror 2.0.18",
- "tracing",
-]
-
-[[package]]
-name = "candle-nn"
-version = "0.9.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3045fa9e7aef8567d209a27d56b692f60b96f4d0569f4c3011f8ca6715c65e03"
-dependencies = [
- "candle-core",
- "half",
- "libc",
- "num-traits",
- "rayon",
- "safetensors 0.7.0",
- "serde",
- "thiserror 2.0.18",
-]
-
-[[package]]
-name = "candle-transformers"
-version = "0.9.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b538ec4aa807c416a2ddd3621044888f188827862e2a6fcacba4738e89795d01"
-dependencies = [
- "byteorder",
- "candle-core",
- "candle-nn",
- "fancy-regex 0.17.0",
- "num-traits",
- "rand",
- "rayon",
- "serde",
- "serde_json",
- "serde_plain",
- "tracing",
-]
-
-[[package]]
-name = "candle-ug"
-version = "0.9.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c22d62be69068bf58987a45f690612739d8d2ea1bf508c1b87dc6815a019575d"
-dependencies = [
- "ug",
- "ug-cuda",
- "ug-metal",
-]
-
 [[package]]
 name = "castaway"
 version = "0.2.4"
@@ -356,9 +240,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423"
 dependencies = [
  "find-msvc-tools",
+ "jobserver",
+ "libc",
  "shlex",
 ]
 
+[[package]]
+name = "cexpr"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
+dependencies = [
+ "nom",
+]
+
 [[package]]
 name = "cfg-if"
 version = "1.0.4"
@@ -379,6 +274,17 @@ dependencies = [
  "windows-link",
 ]
 
+[[package]]
+name = "clang-sys"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
+dependencies = [
+ "glob",
+ "libc",
+ "libloading",
+]
+
 [[package]]
 name = "clap"
 version = "4.6.0"
@@ -419,6 +325,15 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
 
+[[package]]
+name = "cmake"
+version = "0.1.57"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "colorchoice"
 version = "1.0.5"
@@ -453,33 +368,12 @@ dependencies = [
  "windows-sys 0.59.0",
 ]
 
-[[package]]
-name = "core-foundation"
-version = "0.9.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f"
-dependencies = [
- "core-foundation-sys",
- "libc",
-]
-
 [[package]]
 name = "core-foundation-sys"
 version = "0.8.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
 
-[[package]]
-name = "core-graphics-types"
-version = "0.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "45390e6114f68f718cc7a830514a96f903cccd70d02a8f6d9f643ac4ba45afaf"
-dependencies = [
- "bitflags 1.3.2",
- "core-foundation",
- "libc",
-]
-
 [[package]]
 name = "cpufeatures"
 version = "0.2.17"
@@ -523,12 +417,6 @@ version = "0.8.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
 
-[[package]]
-name = "crunchy"
-version = "0.2.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
-
 [[package]]
 name = "crypto-common"
 version = "0.1.7"
@@ -539,27 +427,6 @@ dependencies = [
  "typenum",
 ]
 
-[[package]]
-name = "cudarc"
-version = "0.17.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0bf99ab37ee7072d64d906aa2dada9a3422f1d975cdf8c8055a573bc84897ed8"
-dependencies = [
- "half",
- "libloading 0.8.9",
-]
-
-[[package]]
-name = "cudarc"
-version = "0.19.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f071cd6a7b5d51607df76aa2d426aaabc7a74bc6bdb885b8afa63a880572ad9b"
-dependencies = [
- "float8 0.7.0",
- "half",
- "libloading 0.9.0",
-]
-
 [[package]]
 name = "darling"
 version = "0.20.11"
@@ -709,16 +576,6 @@ dependencies = [
  "windows-sys 0.48.0",
 ]
 
-[[package]]
-name = "dispatch2"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38"
-dependencies = [
- "bitflags 2.11.0",
- "objc2",
-]
-
 [[package]]
 name = "displaydoc"
 version = "0.2.5"
@@ -736,22 +593,6 @@ version = "1.0.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555"
 
-[[package]]
-name = "dyn-stack"
-version = "0.13.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1c4713e43e2886ba72b8271aa66c93d722116acf7a75555cce11dcde84388fe8"
-dependencies = [
- "bytemuck",
- "dyn-stack-macros",
-]
-
-[[package]]
-name = "dyn-stack-macros"
-version = "0.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e1d926b4d407d372f141f93bb444696142c29d32962ccbd3531117cf3aa0bfa9"
-
 [[package]]
 name = "either"
 version = "1.15.0"
@@ -764,18 +605,26 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
 
+[[package]]
+name = "encoding_rs"
+version = "0.8.35"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
+dependencies = [
+ "cfg-if",
+]
+
 [[package]]
 name = "engraph"
 version = "1.0.0"
 dependencies = [
  "anyhow",
- "candle-core",
- "candle-nn",
- "candle-transformers",
  "clap",
  "dirs",
+ "encoding_rs",
  "ignore",
  "indicatif",
+ "llama-cpp-2",
  "notify",
  "notify-debouncer-full",
  "rayon",
@@ -784,6 +633,7 @@ dependencies = [
  "serde",
  "serde_json",
  "sha2",
+ "shimmytok",
  "sqlite-vec",
  "strsim",
  "tempfile",
@@ -798,12 +648,20 @@ dependencies = [
 ]
 
 [[package]]
-name = "enum-as-inner"
-version = "0.6.1"
+name = "enumflags2"
+version = "0.7.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1e6a265c649f3f5979b601d26f1d05ada116434c87741c9493cb56218f76cbc"
+checksum = "1027f7680c853e056ebcec683615fb6fbbc07dbaa13b4d5d9442b146ded4ecef"
+dependencies = [
+ "enumflags2_derive",
+]
+
+[[package]]
+name = "enumflags2_derive"
+version = "0.7.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67c78a4d8fdf9953a5c9d458f9efe940fd97a0cab0941c075a813ac594733827"
 dependencies = [
- "heck",
  "proc-macro2",
  "quote",
  "syn",
@@ -845,22 +703,22 @@ checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
 
 [[package]]
 name = "fancy-regex"
-version = "0.14.0"
+version = "0.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298"
+checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
 dependencies = [
- "bit-set",
+ "bit-set 0.5.3",
  "regex-automata",
  "regex-syntax",
 ]
 
 [[package]]
 name = "fancy-regex"
-version = "0.17.0"
+version = "0.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "72cf461f865c862bb7dc573f643dd6a2b6842f7c30b07882b56bd148cc2761b8"
+checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298"
 dependencies = [
- "bit-set",
+ "bit-set 0.8.0",
  "regex-automata",
  "regex-syntax",
 ]
@@ -898,35 +756,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
 
 [[package]]
-name = "flate2"
-version = "1.1.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
-dependencies = [
- "crc32fast",
- "miniz_oxide",
-]
-
-[[package]]
-name = "float8"
-version = "0.6.1"
+name = "find_cuda_helper"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "719a903cc23e4a89e87962c2a80fdb45cdaad0983a89bd150bb57b4c8571a7d5"
+checksum = "f9f9e65c593dd01ac77daad909ea4ad17f0d6d1776193fc8ea766356177abdad"
 dependencies = [
- "cudarc 0.19.4",
- "half",
- "num-traits",
- "rand",
- "rand_distr",
+ "glob",
 ]
 
 [[package]]
-name = "float8"
-version = "0.7.0"
+name = "flate2"
+version = "1.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c2d1f04709a8ac06e8e8042875a3c466cc4832d3c1a18dbcb9dba3c6e83046bc"
+checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
 dependencies = [
- "half",
+ "crc32fast",
+ "miniz_oxide",
 ]
 
 [[package]]
@@ -941,39 +786,6 @@ version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
 
-[[package]]
-name = "foldhash"
-version = "0.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
-
-[[package]]
-name = "foreign-types"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965"
-dependencies = [
- "foreign-types-macros",
- "foreign-types-shared",
-]
-
-[[package]]
-name = "foreign-types-macros"
-version = "0.2.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
-[[package]]
-name = "foreign-types-shared"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aa9a19cbb55df58761df49b23516a86d432839add4af60fc256da840f66ed35b"
-
 [[package]]
 name = "form_urlencoded"
 version = "1.2.2"
@@ -1080,244 +892,6 @@ dependencies = [
  "slab",
 ]
 
-[[package]]
-name = "gemm"
-version = "0.18.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab96b703d31950f1aeddded248bc95543c9efc7ac9c4a21fda8703a83ee35451"
-dependencies = [
- "dyn-stack",
- "gemm-c32 0.18.2",
- "gemm-c64 0.18.2",
- "gemm-common 0.18.2",
- "gemm-f16 0.18.2",
- "gemm-f32 0.18.2",
- "gemm-f64 0.18.2",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aa0673db364b12263d103b68337a68fbecc541d6f6b61ba72fe438654709eacb"
-dependencies = [
- "dyn-stack",
- "gemm-c32 0.19.0",
- "gemm-c64 0.19.0",
- "gemm-common 0.19.0",
- "gemm-f16 0.19.0",
- "gemm-f32 0.19.0",
- "gemm-f64 0.19.0",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-c32"
-version = "0.18.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6db9fd9f40421d00eea9dd0770045a5603b8d684654816637732463f4073847"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.18.2",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-c32"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "086936dbdcb99e37aad81d320f98f670e53c1e55a98bee70573e83f95beb128c"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.19.0",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-c64"
-version = "0.18.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dfcad8a3d35a43758330b635d02edad980c1e143dc2f21e6fd25f9e4eada8edf"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.18.2",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-c64"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "20c8aeeeec425959bda4d9827664029ba1501a90a0d1e6228e48bef741db3a3f"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.19.0",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-common"
-version = "0.18.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a352d4a69cbe938b9e2a9cb7a3a63b7e72f9349174a2752a558a8a563510d0f3"
-dependencies = [
- "bytemuck",
- "dyn-stack",
- "half",
- "libm",
- "num-complex",
- "num-traits",
- "once_cell",
- "paste",
- "pulp 0.21.5",
- "raw-cpuid",
- "rayon",
- "seq-macro",
- "sysctl",
-]
-
-[[package]]
-name = "gemm-common"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "88027625910cc9b1085aaaa1c4bc46bb3a36aad323452b33c25b5e4e7c8e2a3e"
-dependencies = [
- "bytemuck",
- "dyn-stack",
- "half",
- "libm",
- "num-complex",
- "num-traits",
- "once_cell",
- "paste",
- "pulp 0.22.2",
- "raw-cpuid",
- "rayon",
- "seq-macro",
- "sysctl",
-]
-
-[[package]]
-name = "gemm-f16"
-version = "0.18.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cff95ae3259432f3c3410eaa919033cd03791d81cebd18018393dc147952e109"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.18.2",
- "gemm-f32 0.18.2",
- "half",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "rayon",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-f16"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e3df7a55202e6cd6739d82ae3399c8e0c7e1402859b30e4cb780e61525d9486e"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.19.0",
- "gemm-f32 0.19.0",
- "half",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "rayon",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-f32"
-version = "0.18.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc8d3d4385393304f407392f754cd2dc4b315d05063f62cf09f47b58de276864"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.18.2",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-f32"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "02e0b8c9da1fbec6e3e3ab2ce6bc259ef18eb5f6f0d3e4edf54b75f9fd41a81c"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.19.0",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-f64"
-version = "0.18.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "35b2a4f76ce4b8b16eadc11ccf2e083252d8237c1b589558a49b0183545015bd"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.18.2",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-f64"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "056131e8f2a521bfab322f804ccd652520c79700d81209e9d9275bbdecaadc6a"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.19.0",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
 [[package]]
 name = "generic-array"
 version = "0.14.7"
@@ -1383,21 +957,6 @@ dependencies = [
  "regex-syntax",
 ]
 
-[[package]]
-name = "half"
-version = "2.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b"
-dependencies = [
- "bytemuck",
- "cfg-if",
- "crunchy",
- "num-traits",
- "rand",
- "rand_distr",
- "zerocopy 0.8.42",
-]
-
 [[package]]
 name = "hashbrown"
 version = "0.14.5"
@@ -1413,7 +972,7 @@ version = "0.15.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
 dependencies = [
- "foldhash 0.1.5",
+ "foldhash",
 ]
 
 [[package]]
@@ -1421,13 +980,6 @@ name = "hashbrown"
 version = "0.16.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
-dependencies = [
- "allocator-api2",
- "equivalent",
- "foldhash 0.2.0",
- "serde",
- "serde_core",
-]
 
 [[package]]
 name = "hashlink"
@@ -1444,12 +996,6 @@ version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
 
-[[package]]
-name = "hermit-abi"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
-
 [[package]]
 name = "iana-time-zone"
 version = "0.1.65"
@@ -1482,7 +1028,7 @@ checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43"
 dependencies = [
  "displaydoc",
  "potential_utf",
- "yoke 0.8.1",
+ "yoke",
  "zerofrom",
  "zerovec",
 ]
@@ -1549,7 +1095,7 @@ dependencies = [
  "displaydoc",
  "icu_locale_core",
  "writeable",
- "yoke 0.8.1",
+ "yoke",
  "zerofrom",
  "zerotrie",
  "zerovec",
@@ -1664,6 +1210,15 @@ version = "1.70.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
 
+[[package]]
+name = "itertools"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
+dependencies = [
+ "either",
+]
+
 [[package]]
 name = "itertools"
 version = "0.14.0"
@@ -1679,6 +1234,16 @@ version = "1.0.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"
 
+[[package]]
+name = "jobserver"
+version = "0.1.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
+dependencies = [
+ "getrandom 0.3.4",
+ "libc",
+]
+
 [[package]]
 name = "js-sys"
 version = "0.3.91"
@@ -1737,22 +1302,6 @@ dependencies = [
  "windows-link",
 ]
 
-[[package]]
-name = "libloading"
-version = "0.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60"
-dependencies = [
- "cfg-if",
- "windows-link",
-]
-
-[[package]]
-name = "libm"
-version = "0.2.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
-
 [[package]]
 name = "libredox"
 version = "0.1.14"
@@ -1783,10 +1332,38 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
 
 [[package]]
-name = "litemap"
-version = "0.8.1"
+name = "litemap"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
+
+[[package]]
+name = "llama-cpp-2"
+version = "0.1.140"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5604c13b9c847157470479a64d1d7c94f3089709309f82f2fdcbcd43510f2f2"
+dependencies = [
+ "encoding_rs",
+ "enumflags2",
+ "llama-cpp-sys-2",
+ "thiserror 2.0.18",
+ "tracing",
+ "tracing-core",
+]
+
+[[package]]
+name = "llama-cpp-sys-2"
+version = "0.1.140"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
+checksum = "cbdd3e2c06f3a9a47466a631735946e9ad47fef565b88bc8766a3794474a66f3"
+dependencies = [
+ "bindgen",
+ "cc",
+ "cmake",
+ "find_cuda_helper",
+ "glob",
+ "walkdir",
+]
 
 [[package]]
 name = "log"
@@ -1810,15 +1387,6 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30"
 
-[[package]]
-name = "malloc_buf"
-version = "0.0.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62bb907fe88d54d8d9ce32a3cceab4218ed2f6b7d35617cafe9adf84e43919cb"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "matchers"
 version = "0.2.0"
@@ -1834,31 +1402,6 @@ version = "2.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
 
-[[package]]
-name = "memmap2"
-version = "0.9.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3"
-dependencies = [
- "libc",
- "stable_deref_trait",
-]
-
-[[package]]
-name = "metal"
-version = "0.29.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ecfd3296f8c56b7c1f6fbac3c71cefa9d78ce009850c45000015f206dc7fa21"
-dependencies = [
- "bitflags 2.11.0",
- "block",
- "core-graphics-types",
- "foreign-types",
- "log",
- "objc",
- "paste",
-]
-
 [[package]]
 name = "minimal-lexical"
 version = "0.2.1"
@@ -1969,77 +1512,12 @@ dependencies = [
  "windows-sys 0.61.2",
 ]
 
-[[package]]
-name = "num"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
-dependencies = [
- "num-bigint",
- "num-complex",
- "num-integer",
- "num-iter",
- "num-rational",
- "num-traits",
-]
-
-[[package]]
-name = "num-bigint"
-version = "0.4.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
-dependencies = [
- "num-integer",
- "num-traits",
-]
-
-[[package]]
-name = "num-complex"
-version = "0.4.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
-dependencies = [
- "bytemuck",
- "num-traits",
-]
-
 [[package]]
 name = "num-conv"
 version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967"
 
-[[package]]
-name = "num-integer"
-version = "0.1.46"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
-dependencies = [
- "num-traits",
-]
-
-[[package]]
-name = "num-iter"
-version = "0.1.45"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf"
-dependencies = [
- "autocfg",
- "num-integer",
- "num-traits",
-]
-
-[[package]]
-name = "num-rational"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
-dependencies = [
- "num-bigint",
- "num-integer",
- "num-traits",
-]
-
 [[package]]
 name = "num-traits"
 version = "0.2.19"
@@ -2047,17 +1525,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
 dependencies = [
  "autocfg",
- "libm",
-]
-
-[[package]]
-name = "num_cpus"
-version = "1.17.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
-dependencies = [
- "hermit-abi",
- "libc",
 ]
 
 [[package]]
@@ -2066,68 +1533,6 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
 
-[[package]]
-name = "objc"
-version = "0.2.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "915b1b472bc21c53464d6c8461c9d3af805ba1ef837e1cac254428f4a77177b1"
-dependencies = [
- "malloc_buf",
-]
-
-[[package]]
-name = "objc2"
-version = "0.6.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3a12a8ed07aefc768292f076dc3ac8c48f3781c8f2d5851dd3d98950e8c5a89f"
-dependencies = [
- "objc2-encode",
-]
-
-[[package]]
-name = "objc2-core-foundation"
-version = "0.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536"
-dependencies = [
- "bitflags 2.11.0",
- "dispatch2",
- "objc2",
-]
-
-[[package]]
-name = "objc2-encode"
-version = "4.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ef25abbcd74fb2609453eb695bd2f860d389e457f67dc17cafc8b8cbc89d0c33"
-
-[[package]]
-name = "objc2-foundation"
-version = "0.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e3e0adef53c21f888deb4fa59fc59f7eb17404926ee8a6f59f5df0fd7f9f3272"
-dependencies = [
- "bitflags 2.11.0",
- "block2",
- "libc",
- "objc2",
- "objc2-core-foundation",
-]
-
-[[package]]
-name = "objc2-metal"
-version = "0.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a0125f776a10d00af4152d74616409f0d4a2053a6f57fa5b7d6aa2854ac04794"
-dependencies = [
- "bitflags 2.11.0",
- "block2",
- "dispatch2",
- "objc2",
- "objc2-core-foundation",
- "objc2-foundation",
-]
-
 [[package]]
 name = "once_cell"
 version = "1.21.4"
@@ -2231,43 +1636,6 @@ dependencies = [
  "unicode-ident",
 ]
 
-[[package]]
-name = "pulp"
-version = "0.21.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "96b86df24f0a7ddd5e4b95c94fc9ed8a98f1ca94d3b01bdce2824097e7835907"
-dependencies = [
- "bytemuck",
- "cfg-if",
- "libm",
- "num-complex",
- "reborrow",
- "version_check",
-]
-
-[[package]]
-name = "pulp"
-version = "0.22.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2e205bb30d5b916c55e584c22201771bcf2bad9aabd5d4127f38387140c38632"
-dependencies = [
- "bytemuck",
- "cfg-if",
- "libm",
- "num-complex",
- "paste",
- "pulp-wasm-simd-flag",
- "raw-cpuid",
- "reborrow",
- "version_check",
-]
-
-[[package]]
-name = "pulp-wasm-simd-flag"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40e24eee682d89fb193496edf918a7f407d30175b2e785fe057e4392dfd182e0"
-
 [[package]]
 name = "quote"
 version = "1.0.45"
@@ -2318,25 +1686,6 @@ dependencies = [
  "getrandom 0.3.4",
 ]
 
-[[package]]
-name = "rand_distr"
-version = "0.5.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
-dependencies = [
- "num-traits",
- "rand",
-]
-
-[[package]]
-name = "raw-cpuid"
-version = "11.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186"
-dependencies = [
- "bitflags 2.11.0",
-]
-
 [[package]]
 name = "rayon"
 version = "1.11.0"
@@ -2354,7 +1703,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f"
 dependencies = [
  "either",
- "itertools",
+ "itertools 0.14.0",
  "rayon",
 ]
 
@@ -2368,12 +1717,6 @@ dependencies = [
  "crossbeam-utils",
 ]
 
-[[package]]
-name = "reborrow"
-version = "0.5.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "03251193000f4bd3b042892be858ee50e8b3719f2b08e5833ac4353724632430"
-
 [[package]]
 name = "redox_syscall"
 version = "0.7.3"
@@ -2506,6 +1849,12 @@ dependencies = [
  "smallvec",
 ]
 
+[[package]]
+name = "rustc-hash"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
+
 [[package]]
 name = "rustix"
 version = "1.1.4"
@@ -2566,27 +1915,6 @@ version = "1.0.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f"
 
-[[package]]
-name = "safetensors"
-version = "0.4.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "44560c11236a6130a46ce36c836a62936dc81ebf8c36a37947423571be0e55b6"
-dependencies = [
- "serde",
- "serde_json",
-]
-
-[[package]]
-name = "safetensors"
-version = "0.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "675656c1eabb620b921efea4f9199f97fc86e36dd6ffd1fbbe48d0f59a4987f5"
-dependencies = [
- "hashbrown 0.16.1",
- "serde",
- "serde_json",
-]
-
 [[package]]
 name = "same-file"
 version = "1.0.6"
@@ -2628,12 +1956,6 @@ version = "1.0.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2"
 
-[[package]]
-name = "seq-macro"
-version = "0.3.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc"
-
 [[package]]
 name = "serde"
 version = "1.0.228"
@@ -2688,15 +2010,6 @@ dependencies = [
  "zmij",
 ]
 
-[[package]]
-name = "serde_plain"
-version = "1.0.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ce1fc6db65a611022b23a0dec6975d63fb80a302cb3388835ff02c097258d50"
-dependencies = [
- "serde",
-]
-
 [[package]]
 name = "serde_spanned"
 version = "0.6.9"
@@ -2726,6 +2039,18 @@ dependencies = [
  "lazy_static",
 ]
 
+[[package]]
+name = "shimmytok"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f2381f12d5c3f475feaf705354294803f350c13d0788f3ab367ac5979df9021"
+dependencies = [
+ "fancy-regex 0.13.0",
+ "rayon",
+ "regex",
+ "thiserror 2.0.18",
+]
+
 [[package]]
 name = "shlex"
 version = "1.3.0"
@@ -2817,20 +2142,6 @@ dependencies = [
  "syn",
 ]
 
-[[package]]
-name = "sysctl"
-version = "0.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01198a2debb237c62b6826ec7081082d951f46dbb64b0e8c7649a452230d1dfc"
-dependencies = [
- "bitflags 2.11.0",
- "byteorder",
- "enum-as-inner",
- "libc",
- "thiserror 1.0.69",
- "walkdir",
-]
-
 [[package]]
 name = "tempfile"
 version = "3.27.0"
@@ -2936,7 +2247,7 @@ dependencies = [
  "esaxx-rs",
  "fancy-regex 0.14.0",
  "getrandom 0.3.4",
- "itertools",
+ "itertools 0.14.0",
  "log",
  "macro_rules_attribute",
  "monostate",
@@ -3092,66 +2403,12 @@ dependencies = [
  "tracing-log",
 ]
 
-[[package]]
-name = "typed-path"
-version = "0.12.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e28f89b80c87b8fb0cf04ab448d5dd0dd0ade2f8891bae878de66a75a28600e"
-
 [[package]]
 name = "typenum"
 version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
 
-[[package]]
-name = "ug"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "76b761acf8af3494640d826a8609e2265e19778fb43306c7f15379c78c9b05b0"
-dependencies = [
- "gemm 0.18.2",
- "half",
- "libloading 0.8.9",
- "memmap2",
- "num",
- "num-traits",
- "num_cpus",
- "rayon",
- "safetensors 0.4.5",
- "serde",
- "thiserror 1.0.69",
- "tracing",
- "yoke 0.7.5",
-]
-
-[[package]]
-name = "ug-cuda"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9f0a1fa748f26166778c33b8498255ebb7c6bffb472bcc0a72839e07ebb1d9b5"
-dependencies = [
- "cudarc 0.17.8",
- "half",
- "serde",
- "thiserror 1.0.69",
- "ug",
-]
-
-[[package]]
-name = "ug-metal"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9f7adf545a99a086d362efc739e7cf4317c18cbeda22706000fd434d70ea3d95"
-dependencies = [
- "half",
- "metal",
- "objc",
- "serde",
- "thiserror 1.0.69",
- "ug",
-]
-
 [[package]]
 name = "unicode-ident"
 version = "1.0.24"
@@ -3798,18 +3055,6 @@ version = "0.6.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
 
-[[package]]
-name = "yoke"
-version = "0.7.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40"
-dependencies = [
- "serde",
- "stable_deref_trait",
- "yoke-derive 0.7.5",
- "zerofrom",
-]
-
 [[package]]
 name = "yoke"
 version = "0.8.1"
@@ -3817,22 +3062,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954"
 dependencies = [
  "stable_deref_trait",
- "yoke-derive 0.8.1",
+ "yoke-derive",
  "zerofrom",
 ]
 
-[[package]]
-name = "yoke-derive"
-version = "0.7.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
- "synstructure",
-]
-
 [[package]]
 name = "yoke-derive"
 version = "0.8.1"
@@ -3920,7 +3153,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851"
 dependencies = [
  "displaydoc",
- "yoke 0.8.1",
+ "yoke",
  "zerofrom",
 ]
 
@@ -3930,7 +3163,7 @@ version = "0.11.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002"
 dependencies = [
- "yoke 0.8.1",
+ "yoke",
  "zerofrom",
  "zerovec-derive",
 ]
@@ -3946,18 +3179,6 @@ dependencies = [
  "syn",
 ]
 
-[[package]]
-name = "zip"
-version = "7.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c42e33efc22a0650c311c2ef19115ce232583abbe80850bc8b66509ebef02de0"
-dependencies = [
- "crc32fast",
- "indexmap",
- "memchr",
- "typed-path",
-]
-
 [[package]]
 name = "zmij"
 version = "1.0.21"
diff --git a/Cargo.toml b/Cargo.toml
index 93a2bc4..6733237 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -34,14 +34,12 @@ rmcp = { version = "1.2", features = ["transport-io"] }
 tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
 notify = "7.0"
 notify-debouncer-full = "0.4"
-candle-core = "0.9"
-candle-nn = "0.9"
-candle-transformers = "0.9"
+llama-cpp-2 = "0.1"
+encoding_rs = "0.8"
+shimmytok = "0.7"
 
 [features]
 default = []
-metal = ["candle-core/metal"]
-cuda = ["candle-core/cuda"]
 
 [dev-dependencies]
 tempfile = "3"
diff --git a/README.md b/README.md
index ce82aba..69da14d 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Plain vector search treats your notes as isolated documents. But knowledge isn't
 - **MCP server for AI agents** — `engraph serve` exposes 13 tools (search, read, context bundles, note creation) that Claude, Cursor, or any MCP client can call directly.
 - **Real-time sync** — file watcher keeps the index fresh as you edit in Obsidian. No manual re-indexing needed.
 - **Smart write pipeline** — AI agents can create notes with automatic tag resolution, wikilink discovery, and folder placement based on semantic similarity.
-- **Fully local** — pure Rust ML via [candle](https://github.com/huggingface/candle) with GGUF models (~300MB mandatory, ~1.3GB optional for intelligence). Metal-accelerated on macOS. No API keys, no cloud.
+- **Fully local** — [llama.cpp](https://github.com/ggml-org/llama.cpp) inference with GGUF models (~300MB mandatory, ~1.3GB optional for intelligence). Metal GPU-accelerated on macOS (88 files indexed in 70s). No API keys, no cloud.
 
 ## What problem it solves
 
@@ -57,7 +57,7 @@ Your vault (markdown files)
   Claude / Cursor / any MCP client
 ```
 
-1. **Index** — walks your vault, chunks markdown by headings, embeds with a local GGUF model (candle), stores everything in SQLite with FTS5 + sqlite-vec + a wikilink graph
+1. **Index** — walks your vault, chunks markdown by headings, embeds with a local GGUF model via llama.cpp (Metal GPU on macOS), stores everything in SQLite with FTS5 + sqlite-vec + a wikilink graph
 2. **Search** — an orchestrator classifies the query and sets lane weights, then runs up to four lanes (semantic KNN, BM25 keyword, graph expansion, cross-encoder reranking), fused via RRF
 3. **Serve** — starts an MCP server that AI agents connect to, with a file watcher that re-indexes changes in real time
 
@@ -190,7 +190,7 @@ engraph resolves tags against the registry (fuzzy matching), discovers potential
 | AI agent access | MCP server (13 tools) | Custom API needed | No |
 | Write capability | Create/append/move with smart filing | No | Manual |
 | Real-time sync | File watcher, 2s debounce | Manual re-index | N/A |
-| Runs locally | Yes, pure Rust + Metal acceleration | Depends | Yes |
+| Runs locally | Yes, llama.cpp + Metal GPU | Depends | Yes |
 | Setup | One binary, one command | Framework + code | Built-in |
 
 engraph is not a replacement for Obsidian — it's the intelligence layer that sits between your vault and your AI tools.
@@ -199,7 +199,7 @@ engraph is not a replacement for Obsidian — it's the intelligence layer that s
 
 - 4-lane hybrid search (semantic + FTS5 + graph + cross-encoder reranker) with two-pass RRF fusion
 - LLM research orchestrator: query intent classification + query expansion + adaptive lane weights
-- Pure Rust ML via candle (GGUF models, Metal acceleration on macOS)
+- llama.cpp inference via Rust bindings (GGUF models, Metal GPU on macOS, CUDA on Linux)
 - Intelligence opt-in: heuristic fallback when disabled, LLM-powered when enabled
 - MCP server with 13 tools (7 read, 6 write) via stdio
 - Real-time file watching with 2s debounce and startup reconciliation
@@ -242,7 +242,7 @@ All data stored in `~/.engraph/` — single SQLite database (~10MB typical), GGU
 ## Development
 
 ```bash
-cargo test --lib          # 271 unit tests, no network
+cargo test --lib          # 270 unit tests, no network (requires CMake for llama.cpp)
 cargo clippy -- -D warnings
 cargo fmt --check
 
diff --git a/src/indexer.rs b/src/indexer.rs
index d382945..774fc98 100644
--- a/src/indexer.rs
+++ b/src/indexer.rs
@@ -4,6 +4,7 @@ use std::time::{Duration, Instant};
 
 use anyhow::{Context, Result, anyhow};
 use ignore::WalkBuilder;
+use indicatif::{ProgressBar, ProgressStyle};
 use sha2::{Digest, Sha256};
 use tracing::info;
 
@@ -440,7 +441,7 @@ pub fn run_index(vault_path: &Path, config: &Config, rebuild: bool) -> Result<In
     let store = Store::open(&db_path)?;
 
     let models_dir = data_dir.join("models");
-    let mut embedder = crate::llm::CandleEmbed::new(&models_dir, config)?;
+    let mut embedder = crate::llm::LlamaEmbed::new(&models_dir, config)?;
 
     // Check for embedding dimension change
     let model_dim = embedder.dim();
@@ -561,12 +562,22 @@ fn run_index_inner(
     let mut total_chunks = 0usize;
     let mut indexed_rel_paths: Vec<String> = Vec::new();
 
+    let pb = ProgressBar::new(file_contents.len() as u64);
+    pb.set_style(
+        ProgressStyle::with_template("  [{bar:40.cyan/blue}] {pos}/{len} {msg} ({eta})")
+            .unwrap()
+            .progress_chars("=>-"),
+    );
+
     store.conn().execute_batch("BEGIN DEFERRED")?;
     for (rel_str, content, hash) in &file_contents {
+        pb.set_message(rel_str.clone());
         let result = index_file(rel_str, content, hash, store, embedder, vault_path, config)?;
         total_chunks += result.total_chunks;
         indexed_rel_paths.push(rel_str.clone());
+        pb.inc(1);
     }
+    pb.finish_with_message("done");
     store.commit()?;
 
     // Step 9: Build vault graph edges.
diff --git a/src/llm.rs b/src/llm.rs
index f4c818f..30d913a 100644
--- a/src/llm.rs
+++ b/src/llm.rs
@@ -1,25 +1,36 @@
 use std::io::Read;
 use std::path::{Path, PathBuf};
+use std::sync::{Mutex, OnceLock};
 
-use anyhow::{Result, bail};
+use anyhow::{Context as _, Result, bail};
 use indicatif::{ProgressBar, ProgressStyle};
 use sha2::{Digest, Sha256};
 
-use anyhow::Context as _;
-use candle_core::{D, DType, Device, IndexOp, Tensor};
-use candle_nn::{Embedding, Module};
-
-// ── Device selection ─────────────────────────────────────────────────────────
-
-/// Select best available device: Metal on macOS (with `metal` feature), CPU elsewhere.
-fn select_device() -> Result<Device> {
-    #[cfg(feature = "metal")]
-    {
-        if let Ok(device) = Device::new_metal(0) {
-            return Ok(device);
-        }
-    }
-    Ok(Device::Cpu)
+use llama_cpp_2::context::params::LlamaContextParams;
+use llama_cpp_2::llama_backend::LlamaBackend;
+use llama_cpp_2::llama_batch::LlamaBatch;
+use llama_cpp_2::model::params::LlamaModelParams;
+use llama_cpp_2::model::{AddBos, LlamaModel};
+use llama_cpp_2::sampling::LlamaSampler;
+
+static BACKEND: OnceLock<LlamaBackend> = OnceLock::new();
+/// Mutex used only during the first initialization of `BACKEND`.
+static BACKEND_INIT: Mutex<()> = Mutex::new(());
+
+/// Get or initialize the global llama.cpp backend.
+/// Safe to call from multiple places — the backend is initialized at most once.
+pub fn llama_backend() -> Result<&'static LlamaBackend> {
+    if let Some(b) = BACKEND.get() {
+        return Ok(b);
+    }
+    let _guard = BACKEND_INIT.lock().unwrap();
+    // Double-checked: another thread may have initialized while we waited.
+    if let Some(b) = BACKEND.get() {
+        return Ok(b);
+    }
+    let backend =
+        LlamaBackend::init().map_err(|e| anyhow::anyhow!("initializing llama backend: {e}"))?;
+    Ok(BACKEND.get_or_init(|| backend))
 }
 
 // ── Prompt format ────────────────────────────────────────────────────────────
@@ -71,7 +82,7 @@ impl PromptFormat {
 // ── Types ────────────────────────────────────────────────────────────────────
 
 /// Classified intent of an incoming search query.
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum QueryIntent {
     /// User wants a precise fact or term match.
     Exact,
@@ -84,7 +95,7 @@ pub enum QueryIntent {
 }
 
 /// Output produced by an orchestrator model for a query.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 pub struct OrchestrationResult {
     /// Classified query intent.
     pub intent: QueryIntent,
@@ -444,6 +455,134 @@ pub fn ensure_model(uri: &HfModelUri, models_dir: &Path) -> Result<PathBuf> {
     Ok(path)
 }
 
+/// Tokenizer that can be backed by either HuggingFace tokenizers crate or shimmytok (GGUF-embedded).
+pub enum FlexTokenizer {
+    HuggingFace(Box<tokenizers::Tokenizer>),
+    Gguf(Box<shimmytok::Tokenizer>),
+}
+
+impl FlexTokenizer {
+    /// Encode text into token IDs.
+    pub fn encode(&self, text: &str, add_special: bool) -> Result<Vec<u32>> {
+        match self {
+            Self::HuggingFace(t) => {
+                let enc = t
+                    .encode(text, add_special)
+                    .map_err(|e| anyhow::anyhow!("tokenization: {e}"))?;
+                Ok(enc.get_ids().to_vec())
+            }
+            Self::Gguf(t) => {
+                let ids = t
+                    .encode(text, add_special)
+                    .map_err(|e| anyhow::anyhow!("tokenization: {e}"))?;
+                Ok(ids)
+            }
+        }
+    }
+
+    /// Count tokens in text.
+    pub fn token_count(&self, text: &str) -> usize {
+        self.encode(text, false).map(|ids| ids.len()).unwrap_or(0)
+    }
+
+    /// Look up a token's ID by string (only available with HuggingFace backend).
+    pub fn token_to_id(&self, token: &str) -> Option<u32> {
+        match self {
+            Self::HuggingFace(t) => t.token_to_id(token),
+            Self::Gguf(_) => None,
+        }
+    }
+
+    /// Decode token IDs back to text (only available with HuggingFace backend).
+    pub fn decode(&self, ids: &[u32], skip_special: bool) -> Result<String> {
+        match self {
+            Self::HuggingFace(t) => t
+                .decode(ids, skip_special)
+                .map_err(|e| anyhow::anyhow!("decode: {e}")),
+            Self::Gguf(_) => bail!("decode not supported with GGUF tokenizer"),
+        }
+    }
+}
+
+/// Load tokenizer for a model. Tries external tokenizer.json first, falls back to GGUF-embedded.
+fn load_tokenizer_for_model(uri: &HfModelUri, models_dir: &Path) -> Result<FlexTokenizer> {
+    // First try: external tokenizer.json from candidate repos.
+    if let Some(tok) = try_external_tokenizer(uri, models_dir) {
+        return Ok(FlexTokenizer::HuggingFace(Box::new(tok)));
+    }
+
+    // Fallback: load tokenizer from GGUF file metadata.
+    let model_path = uri.cache_path(models_dir);
+    if model_path.exists() {
+        tracing::info!(
+            "no external tokenizer found, loading from GGUF: {}",
+            model_path.display()
+        );
+        let tok = shimmytok::Tokenizer::from_gguf_file(&model_path)
+            .map_err(|e| anyhow::anyhow!("loading tokenizer from GGUF metadata: {e}"))?;
+        return Ok(FlexTokenizer::Gguf(Box::new(tok)));
+    }
+
+    bail!(
+        "could not find tokenizer for model '{}': no external tokenizer.json \
+         and GGUF file not yet downloaded",
+        uri.repo
+    )
+}
+
+/// Try downloading tokenizer.json from candidate HuggingFace repos.
+fn try_external_tokenizer(uri: &HfModelUri, models_dir: &Path) -> Option<tokenizers::Tokenizer> {
+    let mut candidates: Vec<String> = vec![uri.repo.clone()];
+
+    // Non-GGUF variant: "org/model-GGUF" → "org/model"
+    let base_repo = uri.repo.trim_end_matches("-GGUF").to_string();
+    if base_repo != uri.repo {
+        candidates.push(base_repo);
+    }
+
+    // Known upstream repos for default models (GGUF repos rarely ship tokenizers).
+    let model_lower = uri.repo.to_lowercase();
+    if model_lower.contains("all-minilm") {
+        candidates.push("sentence-transformers/all-MiniLM-L6-v2".to_string());
+    } else if model_lower.contains("embeddinggemma") {
+        candidates.push("google/embeddinggemma-300m".to_string());
+        candidates.push("google/gemma-2b".to_string());
+    } else if model_lower.contains("qwen3") {
+        let base_name = uri
+            .repo
+            .rsplit('/')
+            .next()
+            .unwrap_or("")
+            .trim_end_matches("-GGUF")
+            .trim_end_matches("-Q8_0-GGUF");
+        if !base_name.is_empty() {
+            candidates.push(format!("Qwen/{base_name}"));
+        }
+    }
+
+    for repo in &candidates {
+        let tok_uri = HfModelUri {
+            repo: repo.clone(),
+            filename: "tokenizer.json".to_string(),
+        };
+        let tok_path = tok_uri.cache_path(models_dir);
+
+        if tok_path.exists()
+            && let Ok(tok) = tokenizers::Tokenizer::from_file(&tok_path)
+        {
+            return Some(tok);
+        }
+
+        if let Ok(p) = ensure_model(&tok_uri, models_dir)
+            && let Ok(tok) = tokenizers::Tokenizer::from_file(&p)
+        {
+            return Some(tok);
+        }
+    }
+
+    None
+}
+
 /// Default model URIs for the intelligence layer.
 pub struct ModelDefaults {
     pub embed_uri: String,
@@ -459,167 +598,51 @@ impl Default for ModelDefaults {
             embed_dim: 256,
             rerank_uri: "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf"
                 .into(),
-            expand_uri: "hf:Qwen/Qwen3-0.6B-GGUF/qwen3-0.6b-q8_0.gguf".into(),
+            expand_uri: "hf:Qwen/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf".into(),
         }
     }
 }
 
-// ── CandleEmbed — GGUF embedding model via candle ──────────────────────────
-
-/// Quantized matrix multiplication wrapper (mirrors candle-transformers pattern).
-#[derive(Debug, Clone)]
-struct CandleQMatMul {
-    inner: candle_core::quantized::QMatMul,
-}
-
-impl CandleQMatMul {
-    fn from_qtensor(qtensor: candle_core::quantized::QTensor) -> candle_core::Result<Self> {
-        let inner = candle_core::quantized::QMatMul::from_qtensor(qtensor)?;
-        Ok(Self { inner })
-    }
-
-    fn forward(&self, xs: &Tensor) -> candle_core::Result<Tensor> {
-        self.inner.forward(xs)
-    }
-}
-
-/// Single transformer layer for the embedding model.
-#[derive(Debug, Clone)]
-struct EmbedLayer {
-    attention_wq: CandleQMatMul,
-    attention_wk: CandleQMatMul,
-    attention_wv: CandleQMatMul,
-    attention_wo: CandleQMatMul,
-    attention_q_norm: candle_transformers::quantized_nn::RmsNorm,
-    attention_k_norm: candle_transformers::quantized_nn::RmsNorm,
-    attention_norm: candle_transformers::quantized_nn::RmsNorm,
-    post_attention_norm: candle_transformers::quantized_nn::RmsNorm,
-    ffn_norm: candle_transformers::quantized_nn::RmsNorm,
-    post_ffn_norm: candle_transformers::quantized_nn::RmsNorm,
-    ffn_gate: CandleQMatMul,
-    ffn_up: CandleQMatMul,
-    ffn_down: CandleQMatMul,
-    n_head: usize,
-    n_kv_head: usize,
-    head_dim: usize,
-    q_dim: usize,
-    rotary_sin: Tensor,
-    rotary_cos: Tensor,
-}
-
-impl EmbedLayer {
-    /// Bidirectional forward pass — no causal mask, no KV cache.
-    fn forward(&self, x: &Tensor) -> candle_core::Result<Tensor> {
-        let (b_sz, seq_len, _) = x.dims3()?;
-
-        // --- Attention block ---
-        let residual = x;
-        let x = self.attention_norm.forward(x)?;
-
-        let q = self.attention_wq.forward(&x)?;
-        let k = self.attention_wk.forward(&x)?;
-        let v = self.attention_wv.forward(&x)?;
-
-        let q = q
-            .reshape((b_sz, seq_len, self.n_head, self.head_dim))?
-            .transpose(1, 2)?;
-        let k = k
-            .reshape((b_sz, seq_len, self.n_kv_head, self.head_dim))?
-            .transpose(1, 2)?;
-        let v = v
-            .reshape((b_sz, seq_len, self.n_kv_head, self.head_dim))?
-            .transpose(1, 2)?;
-
-        let q = self.attention_q_norm.forward(&q.contiguous()?)?;
-        let k = self.attention_k_norm.forward(&k.contiguous()?)?;
-
-        // Apply rotary embeddings (truncated to seq_len).
-        let q = Self::apply_rotary(&q, &self.rotary_cos, &self.rotary_sin, seq_len)?;
-        let k = Self::apply_rotary(&k, &self.rotary_cos, &self.rotary_sin, seq_len)?;
-
-        // Repeat KV heads for GQA.
-        let n_rep = self.n_head / self.n_kv_head;
-        let k = candle_transformers::utils::repeat_kv(k, n_rep)?;
-        let v = candle_transformers::utils::repeat_kv(v, n_rep)?;
-
-        // Scaled dot-product attention — BIDIRECTIONAL (no mask).
-        let scale = 1.0 / (self.head_dim as f64).sqrt();
-        let attn_weights = (q.matmul(&k.transpose(2, 3)?)? * scale)?;
-        let attn_weights = candle_nn::ops::softmax_last_dim(&attn_weights)?;
-        let attn_output = attn_weights.matmul(&v)?;
-
-        let attn_output = attn_output
-            .transpose(1, 2)?
-            .reshape((b_sz, seq_len, self.q_dim))?;
-        let attn_output = self.attention_wo.forward(&attn_output)?;
-        let x = self.post_attention_norm.forward(&attn_output)?;
-        let x = (x + residual)?;
-
-        // --- FFN block ---
-        let residual = &x;
-        let h = self.ffn_norm.forward(&x)?;
-        let gate = self.ffn_gate.forward(&h)?;
-        let up = self.ffn_up.forward(&h)?;
-        let h = (candle_nn::ops::silu(&gate)? * up)?;
-        let h = self.ffn_down.forward(&h)?;
-        let h = self.post_ffn_norm.forward(&h)?;
-        h + residual
-    }
-
-    /// Apply rotary embeddings to a [batch, heads, seq, dim] tensor.
-    fn apply_rotary(
-        x: &Tensor,
-        cos: &Tensor,
-        sin: &Tensor,
-        seq_len: usize,
-    ) -> candle_core::Result<Tensor> {
-        let cos = cos.i(..seq_len)?.unsqueeze(0)?.unsqueeze(0)?;
-        let sin = sin.i(..seq_len)?.unsqueeze(0)?.unsqueeze(0)?;
-        let dim = x.dim(D::Minus1)?;
-        let half = dim / 2;
-        let x1 = x.narrow(D::Minus1, 0, half)?;
-        let x2 = x.narrow(D::Minus1, half, half)?;
-        let rotated = Tensor::cat(&[&x2.neg()?, &x1], D::Minus1)?;
-        let out = (x.broadcast_mul(&cos)? + rotated.broadcast_mul(&sin)?)?;
-        Ok(out)
-    }
-}
+// ── LlamaEmbed — GGUF embedding model via llama.cpp ──────────────────────────
 
-/// GGUF embedding model loaded via candle.
+/// GGUF embedding model loaded via llama.cpp.
 ///
-/// Loads a quantized Gemma-family embedding model (e.g., embeddinggemma-300M)
-/// from a GGUF file and produces dense float vectors via bidirectional attention
-/// + mean pooling + L2 normalization.
-pub struct CandleEmbed {
-    layers: Vec<EmbedLayer>,
-    tok_embeddings: Embedding,
-    norm: candle_transformers::quantized_nn::RmsNorm,
-    embedding_length: usize,
-    tokenizer: tokenizers::Tokenizer,
-    device: Device,
+/// Loads a quantized embedding model from a GGUF file and produces dense float
+/// vectors via llama.cpp's built-in embedding support with mean pooling + L2
+/// normalization. Supports Metal acceleration on macOS automatically.
+///
+/// `LlamaModel` is `Send + Sync`, so this struct is `Send`. `LlamaContext` is
+/// `!Send`, so we create it per-call. The global `LlamaBackend` is referenced
+/// via `llama_backend()` — no need to store it per-struct.
+pub struct LlamaEmbed {
+    model: LlamaModel,
+    tokenizer: FlexTokenizer,
     dim: usize,
     prompt_format: PromptFormat,
 }
 
-impl std::fmt::Debug for CandleEmbed {
+// Safety: LlamaModel is Send+Sync per llama-cpp-2 docs.
+// FlexTokenizer contains only Send types (tokenizers::Tokenizer is Send, shimmytok::Tokenizer is Send).
+// We never store a LlamaContext (which is !Send) — it is created per-call.
+unsafe impl Send for LlamaEmbed {}
+
+impl std::fmt::Debug for LlamaEmbed {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("CandleEmbed")
+        f.debug_struct("LlamaEmbed")
             .field("dim", &self.dim)
-            .field("embedding_length", &self.embedding_length)
-            .field("num_layers", &self.layers.len())
             .field("prompt_format", &self.prompt_format)
             .finish()
     }
 }
 
-impl CandleEmbed {
+impl LlamaEmbed {
     /// Load a GGUF embedding model from `models_dir`.
     ///
     /// Steps:
     /// 1. Resolve model URI (from config override or `ModelDefaults`)
     /// 2. `ensure_model()` to download if needed
     /// 3. Load tokenizer (try same repo's tokenizer.json, then repo without -GGUF suffix)
-    /// 4. Load GGUF and build layer structs for bidirectional embedding
+    /// 4. Load GGUF model via llama.cpp
     /// 5. Detect prompt format from filename
     pub fn new(models_dir: &Path, config: &crate::config::Config) -> Result<Self> {
         let defaults = ModelDefaults::default();
@@ -632,7 +655,7 @@ impl CandleEmbed {
         let model_path = ensure_model(&uri, models_dir)?;
 
         // Load tokenizer: try from the same HF repo, then from the non-GGUF variant.
-        let tokenizer = Self::load_tokenizer(&uri, models_dir)?;
+        let tokenizer = load_tokenizer_for_model(&uri, models_dir)?;
 
         // Detect prompt format from filename.
         let prompt_format = PromptFormat::detect(&uri.filename);
@@ -640,341 +663,104 @@ impl CandleEmbed {
         // Target output dimensionality.
         let dim = defaults.embed_dim;
 
-        // Load GGUF and build model.
-        let device = select_device()?;
-        let (layers, tok_embeddings, norm, embedding_length) =
-            Self::load_gguf(&model_path, &device)?;
+        // Get or initialize the global llama.cpp backend, then load model.
+        let backend = llama_backend()?;
+        let model_params = LlamaModelParams::default();
+        let model = LlamaModel::load_from_file(backend, &model_path, &model_params)
+            .map_err(|e| anyhow::anyhow!("loading GGUF model {}: {e}", model_path.display()))?;
 
-        tracing::info!(
-            "loaded CandleEmbed: {} layers, embedding_length={}, target_dim={}, device={:?}",
-            layers.len(),
-            embedding_length,
-            dim,
-            device
-        );
+        tracing::info!("loaded LlamaEmbed from {}, target_dim={}", uri_str, dim);
 
         Ok(Self {
-            layers,
-            tok_embeddings,
-            norm,
-            embedding_length,
+            model,
             tokenizer,
-            device,
             dim,
             prompt_format,
         })
     }
 
-    /// Try to load tokenizer.json from the same HF repo, or from repo without "-GGUF" suffix.
-    fn load_tokenizer(uri: &HfModelUri, models_dir: &Path) -> Result<tokenizers::Tokenizer> {
-        // Try 1: tokenizer.json from the same repo.
-        let tok_uri = HfModelUri {
-            repo: uri.repo.clone(),
-            filename: "tokenizer.json".to_string(),
-        };
-        let tok_path = tok_uri.cache_path(models_dir);
-        if tok_path.exists() {
-            return tokenizers::Tokenizer::from_file(&tok_path).map_err(|e| {
-                anyhow::anyhow!("loading tokenizer from {}: {e}", tok_path.display())
-            });
-        }
-
-        // Try 2: download from the same repo.
-        if let Ok(p) = ensure_model(&tok_uri, models_dir) {
-            return tokenizers::Tokenizer::from_file(&p)
-                .map_err(|e| anyhow::anyhow!("loading tokenizer from {}: {e}", p.display()));
-        }
-
-        // Try 3: non-GGUF variant of the repo (e.g., "org/model-GGUF" -> "org/model").
-        let base_repo = uri.repo.trim_end_matches("-GGUF").to_string();
-        if base_repo != uri.repo {
-            let base_tok_uri = HfModelUri {
-                repo: base_repo,
-                filename: "tokenizer.json".to_string(),
-            };
-            if let Ok(p) = ensure_model(&base_tok_uri, models_dir) {
-                return tokenizers::Tokenizer::from_file(&p)
-                    .map_err(|e| anyhow::anyhow!("loading tokenizer from {}: {e}", p.display()));
-            }
-        }
-
-        bail!(
-            "could not find or download tokenizer for model repo '{}'",
-            uri.repo
-        );
-    }
-
-    /// Load GGUF file and construct layer structs for bidirectional embedding.
-    fn load_gguf(
-        path: &Path,
-        device: &Device,
-    ) -> Result<(
-        Vec<EmbedLayer>,
-        Embedding,
-        candle_transformers::quantized_nn::RmsNorm,
-        usize,
-    )> {
-        use candle_core::quantized::gguf_file;
-
-        let mut file = std::fs::File::open(path)
-            .map_err(|e| anyhow::anyhow!("opening GGUF {}: {e}", path.display()))?;
-        let ct = gguf_file::Content::read(&mut file)
-            .map_err(|e| anyhow::anyhow!("reading GGUF {}: {e}", path.display()))?;
-
-        // Detect architecture prefix (same probe as candle-transformers quantized_gemma3).
-        let prefix = ["gemma3", "gemma2", "gemma", "gemma-embedding"]
-            .iter()
-            .find(|p| {
-                ct.metadata
-                    .contains_key(&format!("{}.attention.head_count", p))
-            })
-            .copied()
-            .unwrap_or("gemma3");
-
-        let md_get = |s: &str| -> Result<&gguf_file::Value> {
-            let key = format!("{prefix}.{s}");
-            ct.metadata
-                .get(&key)
-                .ok_or_else(|| anyhow::anyhow!("cannot find {key} in GGUF metadata"))
-        };
-
-        let head_count = md_get("attention.head_count")?
-            .to_u32()
-            .map_err(|e| anyhow::anyhow!("{e}"))? as usize;
-        let head_count_kv = md_get("attention.head_count_kv")?
-            .to_u32()
-            .map_err(|e| anyhow::anyhow!("{e}"))? as usize;
-        let block_count = md_get("block_count")?
-            .to_u32()
-            .map_err(|e| anyhow::anyhow!("{e}"))? as usize;
-        let embedding_length = md_get("embedding_length")?
-            .to_u32()
-            .map_err(|e| anyhow::anyhow!("{e}"))? as usize;
-        let key_length = md_get("attention.key_length")?
-            .to_u32()
-            .map_err(|e| anyhow::anyhow!("{e}"))? as usize;
-        let rms_norm_eps = md_get("attention.layer_norm_rms_epsilon")?
-            .to_f32()
-            .map_err(|e| anyhow::anyhow!("{e}"))? as f64;
-        let rope_freq_base = md_get("rope.freq_base")
-            .and_then(|v| v.to_f32().map_err(|e| anyhow::anyhow!("{e}")))
-            .unwrap_or(10_000.0);
-
-        let q_dim = head_count * key_length;
-
-        // Build rotary embedding tables (shared by all layers for the base freq).
-        let max_seq_len: usize = 8192; // Sufficient for embedding inputs.
-        let (rotary_sin, rotary_cos) =
-            Self::build_rotary_tables(key_length, rope_freq_base, max_seq_len, device)?;
-
-        // Load token embeddings.
-        let tok_embd = ct
-            .tensor(&mut file, "token_embd.weight", device)
-            .map_err(|e| anyhow::anyhow!("loading token_embd.weight: {e}"))?;
-        let tok_embd_deq = tok_embd
-            .dequantize(device)
-            .map_err(|e| anyhow::anyhow!("dequantizing token_embd: {e}"))?;
-        let tok_embeddings = Embedding::new(tok_embd_deq, embedding_length);
-
-        // Final norm.
-        let norm_qt = ct
-            .tensor(&mut file, "output_norm.weight", device)
-            .map_err(|e| anyhow::anyhow!("loading output_norm.weight: {e}"))?;
-        let norm = candle_transformers::quantized_nn::RmsNorm::from_qtensor(norm_qt, rms_norm_eps)
-            .map_err(|e| anyhow::anyhow!("creating RmsNorm: {e}"))?;
-
-        // Load transformer layers.
-        let mut layers = Vec::with_capacity(block_count);
-        for idx in 0..block_count {
-            let p = format!("blk.{idx}");
-
-            // Helper: load a quantized weight tensor as QMatMul.
-            macro_rules! load_q {
-                ($name:expr) => {{
-                    let full = format!("{}.{}", p, $name);
-                    let qt = ct
-                        .tensor(&mut file, &full, device)
-                        .map_err(|e| anyhow::anyhow!("loading {full}: {e}"))?;
-                    CandleQMatMul::from_qtensor(qt)
-                        .map_err(|e| anyhow::anyhow!("QMatMul for {full}: {e}"))?
-                }};
-            }
-
-            // Helper: load a norm weight tensor as RmsNorm.
-            macro_rules! load_norm {
-                ($name:expr) => {{
-                    let full = format!("{}.{}", p, $name);
-                    let qt = ct
-                        .tensor(&mut file, &full, device)
-                        .map_err(|e| anyhow::anyhow!("loading {full}: {e}"))?;
-                    candle_transformers::quantized_nn::RmsNorm::from_qtensor(qt, rms_norm_eps)
-                        .map_err(|e| anyhow::anyhow!("RmsNorm for {full}: {e}"))?
-                }};
-            }
-
-            layers.push(EmbedLayer {
-                attention_wq: load_q!("attn_q.weight"),
-                attention_wk: load_q!("attn_k.weight"),
-                attention_wv: load_q!("attn_v.weight"),
-                attention_wo: load_q!("attn_output.weight"),
-                attention_q_norm: load_norm!("attn_q_norm.weight"),
-                attention_k_norm: load_norm!("attn_k_norm.weight"),
-                attention_norm: load_norm!("attn_norm.weight"),
-                post_attention_norm: load_norm!("post_attention_norm.weight"),
-                ffn_norm: load_norm!("ffn_norm.weight"),
-                post_ffn_norm: load_norm!("post_ffw_norm.weight"),
-                ffn_gate: load_q!("ffn_gate.weight"),
-                ffn_up: load_q!("ffn_up.weight"),
-                ffn_down: load_q!("ffn_down.weight"),
-                n_head: head_count,
-                n_kv_head: head_count_kv,
-                head_dim: key_length,
-                q_dim,
-                rotary_sin: rotary_sin.clone(),
-                rotary_cos: rotary_cos.clone(),
-            });
-        }
-
-        Ok((layers, tok_embeddings, norm, embedding_length))
-    }
-
-    /// Build sin/cos rotary embedding tables of shape [max_seq_len, head_dim].
-    fn build_rotary_tables(
-        head_dim: usize,
-        freq_base: f32,
-        max_seq_len: usize,
-        device: &Device,
-    ) -> Result<(Tensor, Tensor)> {
-        let half = head_dim / 2;
-        let theta: Vec<f32> = (0..half)
-            .map(|i| 1.0 / freq_base.powf(i as f32 / half as f32))
-            .collect();
-        let theta = Tensor::new(theta.as_slice(), device)
-            .map_err(|e| anyhow::anyhow!("rotary theta: {e}"))?;
-        let positions = Tensor::arange(0, max_seq_len as u32, device)
-            .map_err(|e| anyhow::anyhow!("rotary positions: {e}"))?
-            .to_dtype(DType::F32)
-            .map_err(|e| anyhow::anyhow!("rotary positions dtype: {e}"))?;
-        // [max_seq_len, half]
-        let freqs = positions
-            .unsqueeze(1)
-            .map_err(|e| anyhow::anyhow!("rotary unsqueeze: {e}"))?
-            .broadcast_mul(&theta.unsqueeze(0).map_err(|e| anyhow::anyhow!("{e}"))?)
-            .map_err(|e| anyhow::anyhow!("rotary freqs: {e}"))?;
-        // Duplicate to [max_seq_len, head_dim] to match x1,x2 concatenation.
-        let freqs = Tensor::cat(&[&freqs, &freqs], D::Minus1)
-            .map_err(|e| anyhow::anyhow!("rotary cat: {e}"))?;
-        let sin = freqs
-            .sin()
-            .map_err(|e| anyhow::anyhow!("rotary sin: {e}"))?;
-        let cos = freqs
-            .cos()
-            .map_err(|e| anyhow::anyhow!("rotary cos: {e}"))?;
-        Ok((sin, cos))
-    }
-
-    /// Run a bidirectional forward pass and return the mean-pooled, truncated,
-    /// L2-normalized embedding.
+    /// Run embedding inference and return the truncated, L2-normalized embedding.
     fn embed_text(&self, text: &str) -> Result<Vec<f32>> {
-        let encoding = self
-            .tokenizer
-            .encode(text, true)
+        // Tokenize using llama.cpp's built-in tokenizer.
+        // Use AddBos::Never because PromptFormat already adds <bos> for embeddinggemma.
+        let tokens = self
+            .model
+            .str_to_token(text, AddBos::Never)
             .map_err(|e| anyhow::anyhow!("tokenization failed: {e}"))?;
-        let token_ids = encoding.get_ids();
-        if token_ids.is_empty() {
+        if tokens.is_empty() {
             bail!("tokenizer returned empty token sequence");
         }
 
-        let input = Tensor::new(token_ids, &self.device)
-            .map_err(|e| anyhow::anyhow!("creating input tensor: {e}"))?
-            .unsqueeze(0)
-            .map_err(|e| anyhow::anyhow!("unsqueeze: {e}"))?;
-
-        // Token embeddings, scaled by sqrt(embedding_length) (Gemma convention).
-        let mut hidden = self
-            .tok_embeddings
-            .forward(&input)
-            .map_err(|e| anyhow::anyhow!("token embedding forward: {e}"))?;
-        hidden = (hidden * (self.embedding_length as f64).sqrt())
-            .map_err(|e| anyhow::anyhow!("scaling embeddings: {e}"))?;
-
-        // Forward through all transformer layers (bidirectional — no causal mask).
-        for layer in &self.layers {
-            hidden = layer
-                .forward(&hidden)
-                .map_err(|e| anyhow::anyhow!("layer forward: {e}"))?;
-        }
+        // Create a context with embeddings enabled (per-call, since LlamaContext is !Send).
+        // n_ubatch must be >= n_tokens for the encoder, and n_ctx must fit all tokens.
+        let n_tokens = tokens.len() as u32;
+        let n_ctx = std::num::NonZeroU32::new(n_tokens.max(64) + 16);
+        let ctx_params = LlamaContextParams::default()
+            .with_embeddings(true)
+            .with_n_ctx(n_ctx)
+            .with_n_ubatch(n_tokens.max(512))
+            .with_n_batch(n_tokens.max(512));
+        let mut ctx = self
+            .model
+            .new_context(llama_backend()?, ctx_params)
+            .map_err(|e| anyhow::anyhow!("creating embedding context: {e}"))?;
 
-        // Final layer norm.
-        hidden = self
-            .norm
-            .forward(&hidden)
-            .map_err(|e| anyhow::anyhow!("final norm: {e}"))?;
+        // Create batch and add tokens — mark all as outputs for embedding.
+        let mut batch = LlamaBatch::new(tokens.len() + 16, 1);
+        batch
+            .add_sequence(&tokens, 0, true)
+            .map_err(|e| anyhow::anyhow!("adding sequence to batch: {e}"))?;
 
-        // Mean pool across sequence dimension: [1, seq_len, hidden] -> [1, hidden].
-        let seq_len = hidden
-            .dim(1)
-            .map_err(|e| anyhow::anyhow!("getting seq dim: {e}"))?;
-        let pooled = (hidden.sum(1).map_err(|e| anyhow::anyhow!("sum: {e}"))? / (seq_len as f64))
-            .map_err(|e| anyhow::anyhow!("mean div: {e}"))?;
+        // Encode (compute embeddings). Use encode() for embedding models.
+        ctx.encode(&mut batch)
+            .map_err(|e| anyhow::anyhow!("embedding encode failed: {e}"))?;
 
-        // Squeeze batch dimension: [1, hidden] -> [hidden].
-        let pooled = pooled
-            .squeeze(0)
-            .map_err(|e| anyhow::anyhow!("squeeze: {e}"))?;
+        // Get embeddings for sequence 0 (mean pooled by llama.cpp).
+        let embeddings = ctx
+            .embeddings_seq_ith(0)
+            .map_err(|e| anyhow::anyhow!("getting embeddings: {e}"))?;
 
         // Truncate to target dimensionality.
-        let full_dim = pooled
-            .dim(0)
-            .map_err(|e| anyhow::anyhow!("dim check: {e}"))?;
-        let truncated = if full_dim > self.dim {
-            pooled
-                .narrow(0, 0, self.dim)
-                .map_err(|e| anyhow::anyhow!("truncate: {e}"))?
+        let full_dim = embeddings.len();
+        let truncated: Vec<f32> = if full_dim > self.dim {
+            embeddings[..self.dim].to_vec()
         } else {
-            pooled
+            embeddings.to_vec()
         };
 
         // L2 normalize.
-        let norm_val = truncated
-            .sqr()
-            .map_err(|e| anyhow::anyhow!("sqr: {e}"))?
-            .sum_all()
-            .map_err(|e| anyhow::anyhow!("sum_all: {e}"))?
-            .sqrt()
-            .map_err(|e| anyhow::anyhow!("sqrt: {e}"))?;
-        let norm_scalar: f32 = norm_val
-            .to_scalar()
-            .map_err(|e| anyhow::anyhow!("norm scalar: {e}"))?;
-
-        let normalized = if norm_scalar > 0.0 {
-            (truncated / norm_scalar as f64).map_err(|e| anyhow::anyhow!("normalize: {e}"))?
+        let norm: f32 = truncated.iter().map(|x| x * x).sum::<f32>().sqrt();
+        let normalized = if norm > 0.0 {
+            truncated.iter().map(|x| x / norm).collect()
         } else {
             truncated
         };
 
-        let vec: Vec<f32> = normalized
-            .to_vec1()
-            .map_err(|e| anyhow::anyhow!("to_vec1: {e}"))?;
-        Ok(vec)
+        Ok(normalized)
     }
 }
 
-impl EmbedModel for CandleEmbed {
+impl EmbedModel for LlamaEmbed {
     fn embed_batch(&mut self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
-        // Process texts sequentially — candle quantized ops are single-threaded.
-        texts.iter().map(|t| self.embed_text(t)).collect()
+        // Process texts sequentially — llama.cpp context is per-call.
+        // Apply document prompt format for indexing (asymmetric models need this).
+        texts
+            .iter()
+            .map(|t| {
+                let formatted = self.prompt_format.format_document("", t);
+                self.embed_text(&formatted)
+            })
+            .collect()
     }
 
     fn embed_one(&mut self, text: &str) -> Result<Vec<f32>> {
-        self.embed_text(text)
+        // Apply query prompt format (asymmetric models like embeddinggemma need this).
+        let formatted = self.prompt_format.format_query(text);
+        self.embed_text(&formatted)
     }
 
     fn token_count(&self, text: &str) -> usize {
-        self.tokenizer
-            .encode(text, false)
-            .map(|enc| enc.get_ids().len())
-            .unwrap_or(text.len() / 4 + 1)
+        self.tokenizer.token_count(text)
     }
 
     fn dim(&self) -> usize {
@@ -1093,7 +879,7 @@ fn extract_json_object(text: &str) -> Option<&str> {
     None
 }
 
-// ── CandleOrchestrator — GGUF text generation via candle ─────────────────────
+// ── LlamaOrchestrator — GGUF text generation via llama.cpp ─────────────────────
 
 const ORCHESTRATOR_SYSTEM_PROMPT: &str = r#"You are a search query analyzer. Given a user's search query, classify it and expand it.
 
@@ -1103,33 +889,36 @@ Return JSON with:
 
 Be concise. Only return the JSON object."#;
 
-/// Quantized Qwen3 model for query orchestration and expansion.
+/// Quantized Qwen3 model for query orchestration and expansion via llama.cpp.
 ///
 /// Loads a Qwen3 GGUF model and performs autoregressive generation to classify
 /// queries and produce expansions. Falls back to `heuristic_orchestrate` if
-/// generation or JSON parsing fails.
-pub struct CandleOrchestrator {
-    model: candle_transformers::models::quantized_qwen3::ModelWeights,
-    tokenizer: tokenizers::Tokenizer,
-    device: Device,
+/// generation or JSON parsing fails. Uses Metal acceleration on macOS automatically.
+///
+/// Uses llama.cpp's built-in tokenizer for both encoding and decoding — no
+/// external tokenizer.json required. The global `LlamaBackend` is used via
+/// `llama_backend()`.
+pub struct LlamaOrchestrator {
+    model: LlamaModel,
 }
 
-impl std::fmt::Debug for CandleOrchestrator {
+// Safety: LlamaModel is Send+Sync per llama-cpp-2 docs.
+// LlamaContext is created per-call and never stored.
+unsafe impl Send for LlamaOrchestrator {}
+
+impl std::fmt::Debug for LlamaOrchestrator {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("CandleOrchestrator")
-            .field("device", &self.device)
-            .finish()
+        f.debug_struct("LlamaOrchestrator").finish()
     }
 }
 
-impl CandleOrchestrator {
+impl LlamaOrchestrator {
     /// Load a Qwen3 GGUF model for orchestration from `models_dir`.
     ///
     /// Steps:
     /// 1. Resolve model URI (from config override or `ModelDefaults`)
     /// 2. `ensure_model()` to download if needed
-    /// 3. Load tokenizer from the model repo (or the non-GGUF base repo)
-    /// 4. Load GGUF via `ModelWeights::from_gguf()`
+    /// 3. Load GGUF model via llama.cpp (uses built-in tokenizer — no tokenizer.json needed)
     pub fn new(models_dir: &Path, config: &crate::config::Config) -> Result<Self> {
         let defaults = ModelDefaults::default();
         let uri_str = config
@@ -1140,71 +929,17 @@ impl CandleOrchestrator {
         let uri = HfModelUri::parse(uri_str)?;
         let model_path = ensure_model(&uri, models_dir)?;
 
-        // Load tokenizer (same strategy as CandleEmbed).
-        let tokenizer = Self::load_tokenizer(&uri, models_dir)?;
+        // Use global backend and llama.cpp's built-in tokenizer (no tokenizer.json required).
+        let backend = llama_backend()?;
+        let model_params = LlamaModelParams::default();
+        let model =
+            LlamaModel::load_from_file(backend, &model_path, &model_params).map_err(|e| {
+                anyhow::anyhow!("loading orchestrator model {}: {e}", model_path.display())
+            })?;
 
-        let device = select_device()?;
-
-        // Load GGUF model.
-        let mut file = std::fs::File::open(&model_path)
-            .map_err(|e| anyhow::anyhow!("opening GGUF {}: {e}", model_path.display()))?;
-        let ct = candle_core::quantized::gguf_file::Content::read(&mut file)
-            .map_err(|e| anyhow::anyhow!("reading GGUF {}: {e}", model_path.display()))?;
-        let model = candle_transformers::models::quantized_qwen3::ModelWeights::from_gguf(
-            ct, &mut file, &device,
-        )
-        .map_err(|e| anyhow::anyhow!("loading Qwen3 model weights: {e}"))?;
-
-        tracing::info!(
-            "loaded CandleOrchestrator from {}, device={:?}",
-            uri_str,
-            device
-        );
-
-        Ok(Self {
-            model,
-            tokenizer,
-            device,
-        })
-    }
-
-    /// Try to load tokenizer.json from the same HF repo, or from the non-GGUF base repo.
-    fn load_tokenizer(uri: &HfModelUri, models_dir: &Path) -> Result<tokenizers::Tokenizer> {
-        // Try 1: tokenizer.json from the same repo.
-        let tok_uri = HfModelUri {
-            repo: uri.repo.clone(),
-            filename: "tokenizer.json".to_string(),
-        };
-        let tok_path = tok_uri.cache_path(models_dir);
-        if tok_path.exists() {
-            return tokenizers::Tokenizer::from_file(&tok_path).map_err(|e| {
-                anyhow::anyhow!("loading tokenizer from {}: {e}", tok_path.display())
-            });
-        }
-
-        // Try 2: download from the same repo.
-        if let Ok(p) = ensure_model(&tok_uri, models_dir) {
-            return tokenizers::Tokenizer::from_file(&p)
-                .map_err(|e| anyhow::anyhow!("loading tokenizer from {}: {e}", p.display()));
-        }
-
-        // Try 3: non-GGUF variant of the repo (e.g., "Qwen/Qwen3-0.6B-GGUF" -> "Qwen/Qwen3-0.6B").
-        let base_repo = uri.repo.trim_end_matches("-GGUF").to_string();
-        if base_repo != uri.repo {
-            let base_tok_uri = HfModelUri {
-                repo: base_repo,
-                filename: "tokenizer.json".to_string(),
-            };
-            if let Ok(p) = ensure_model(&base_tok_uri, models_dir) {
-                return tokenizers::Tokenizer::from_file(&p)
-                    .map_err(|e| anyhow::anyhow!("loading tokenizer from {}: {e}", p.display()));
-            }
-        }
+        tracing::info!("loaded LlamaOrchestrator from {}", uri_str);
 
-        bail!(
-            "could not find or download tokenizer for model repo '{}'",
-            uri.repo
-        );
+        Ok(Self { model })
     }
 
     /// Format a chat prompt in Qwen3 ChatML format.
@@ -1218,89 +953,76 @@ impl CandleOrchestrator {
 
     /// Run autoregressive generation (greedy decode) up to `max_tokens`.
     /// Returns the generated text (excluding the prompt).
-    fn generate(&mut self, prompt: &str, max_tokens: usize) -> Result<String> {
-        self.model.clear_kv_cache();
-
-        let encoding = self
-            .tokenizer
-            .encode(prompt, true)
+    fn generate(&self, prompt: &str, max_tokens: usize) -> Result<String> {
+        // Tokenize using llama.cpp's built-in tokenizer.
+        let tokens = self
+            .model
+            .str_to_token(prompt, AddBos::Always)
             .map_err(|e| anyhow::anyhow!("tokenization failed: {e}"))?;
-        let prompt_tokens = encoding.get_ids();
-        if prompt_tokens.is_empty() {
+        if tokens.is_empty() {
             bail!("tokenizer returned empty token sequence");
         }
 
-        // Determine EOS token ID.
-        let eos_token_id = self
-            .tokenizer
-            .token_to_id("<|im_end|>")
-            .or_else(|| self.tokenizer.token_to_id("<|endoftext|>"))
-            .unwrap_or(151643); // Qwen3 default EOS
-
-        // Process the prompt in a single forward pass.
-        let input = Tensor::new(prompt_tokens, &self.device)?
-            .unsqueeze(0)
-            .map_err(|e| anyhow::anyhow!("unsqueeze prompt: {e}"))?;
-        let logits = self
+        // Create context per-call (LlamaContext is !Send).
+        let n_ctx = (tokens.len() + max_tokens + 16) as u32;
+        let ctx_params = LlamaContextParams::default().with_n_ctx(std::num::NonZeroU32::new(n_ctx));
+        let mut ctx = self
             .model
-            .forward(&input, 0)
-            .map_err(|e| anyhow::anyhow!("forward pass (prompt): {e}"))?;
-
-        // Get the last token's logits and pick argmax.
-        let logits = logits
-            .to_dtype(DType::F32)
-            .map_err(|e| anyhow::anyhow!("logits dtype: {e}"))?;
-        let next_token = logits
-            .i(0)?
-            .argmax(D::Minus1)
-            .map_err(|e| anyhow::anyhow!("argmax: {e}"))?
-            .to_scalar::<u32>()
-            .map_err(|e| anyhow::anyhow!("scalar: {e}"))?;
-
-        let mut generated_tokens: Vec<u32> = vec![next_token];
-        let mut offset = prompt_tokens.len();
-
-        if next_token == eos_token_id {
-            // Model produced EOS immediately.
-            return Ok(String::new());
+            .new_context(llama_backend()?, ctx_params)
+            .map_err(|e| anyhow::anyhow!("creating orchestrator context: {e}"))?;
+
+        // Process prompt tokens in a batch.
+        let mut batch = LlamaBatch::new(tokens.len() + max_tokens + 16, 1);
+        for (i, token) in tokens.iter().enumerate() {
+            let is_last = i == tokens.len() - 1;
+            batch
+                .add(*token, i as i32, &[0], is_last)
+                .map_err(|e| anyhow::anyhow!("adding prompt token to batch: {e}"))?;
         }
 
-        // Autoregressive loop.
-        for _ in 1..max_tokens {
-            let input = Tensor::new(&[*generated_tokens.last().unwrap()], &self.device)?
-                .unsqueeze(0)
-                .map_err(|e| anyhow::anyhow!("unsqueeze step: {e}"))?;
-            let logits = self
-                .model
-                .forward(&input, offset)
-                .map_err(|e| anyhow::anyhow!("forward pass (step): {e}"))?;
-            offset += 1;
-
-            let logits = logits
-                .to_dtype(DType::F32)
-                .map_err(|e| anyhow::anyhow!("logits dtype: {e}"))?;
-            let token = logits
-                .i(0)?
-                .argmax(D::Minus1)
-                .map_err(|e| anyhow::anyhow!("argmax: {e}"))?
-                .to_scalar::<u32>()
-                .map_err(|e| anyhow::anyhow!("scalar: {e}"))?;
-
-            if token == eos_token_id {
+        ctx.decode(&mut batch)
+            .map_err(|e| anyhow::anyhow!("prompt decode failed: {e}"))?;
+
+        // Autoregressive generation loop.
+        let mut sampler = LlamaSampler::greedy();
+        let mut output = String::new();
+        // Each token may produce multi-byte UTF-8 sequences; use an encoding_rs decoder
+        // to correctly reassemble them across token boundaries.
+        let mut decoder = encoding_rs::UTF_8.new_decoder();
+        let mut n_cur = tokens.len();
+
+        for _ in 0..max_tokens {
+            let new_token = sampler.sample(&ctx, batch.n_tokens() - 1);
+            sampler.accept(new_token);
+
+            // Check for end-of-generation.
+            if self.model.is_eog_token(new_token) {
                 break;
             }
-            generated_tokens.push(token);
+
+            // Decode this token to text using llama.cpp's built-in tokenizer.
+            let piece = self
+                .model
+                .token_to_piece(new_token, &mut decoder, false, None)
+                .map_err(|e| anyhow::anyhow!("token_to_piece failed: {e}"))?;
+            output.push_str(&piece);
+
+            // Add token to batch for next iteration.
+            batch.clear();
+            batch
+                .add(new_token, n_cur as i32, &[0], true)
+                .map_err(|e| anyhow::anyhow!("adding generated token to batch: {e}"))?;
+            n_cur += 1;
+
+            ctx.decode(&mut batch)
+                .map_err(|e| anyhow::anyhow!("generation decode failed: {e}"))?;
         }
 
-        let text = self
-            .tokenizer
-            .decode(&generated_tokens, true)
-            .map_err(|e| anyhow::anyhow!("decoding generated tokens: {e}"))?;
-        Ok(text)
+        Ok(output)
     }
 }
 
-impl OrchestratorModel for CandleOrchestrator {
+impl OrchestratorModel for LlamaOrchestrator {
     fn orchestrate(&mut self, query: &str) -> Result<OrchestrationResult> {
         let prompt = Self::format_prompt(query);
 
@@ -1322,7 +1044,7 @@ impl OrchestratorModel for CandleOrchestrator {
     }
 }
 
-// ── CandleRerank — GGUF cross-encoder reranker via candle ─────────────────────
+// ── LlamaRerank — GGUF cross-encoder reranker via llama.cpp ─────────────────────
 
 /// Format query+document for cross-encoder reranking.
 pub fn format_reranker_input(query: &str, document: &str) -> String {
@@ -1334,39 +1056,43 @@ pub fn format_reranker_input(query: &str, document: &str) -> String {
     )
 }
 
-/// Quantized Qwen3 cross-encoder for reranking search results.
+/// Quantized Qwen3 cross-encoder for reranking search results via llama.cpp.
 ///
 /// Loads a Qwen3-Reranker GGUF model and scores (query, document) pairs by
 /// running a single forward pass and extracting Yes/No logit probabilities.
-/// Unlike `CandleOrchestrator`, this does NOT do autoregressive generation —
+/// Unlike `LlamaOrchestrator`, this does NOT do autoregressive generation —
 /// just one pass through the full input to get logits at the last position.
-pub struct CandleRerank {
-    model: candle_transformers::models::quantized_qwen3::ModelWeights,
-    tokenizer: tokenizers::Tokenizer,
-    device: Device,
-    yes_token_id: u32,
-    no_token_id: u32,
+///
+/// Uses llama.cpp's built-in tokenizer to look up Yes/No token IDs — no
+/// external tokenizer.json required. The global `LlamaBackend` is used via
+/// `llama_backend()`.
+pub struct LlamaRerank {
+    model: LlamaModel,
+    yes_token_id: i32,
+    no_token_id: i32,
 }
 
-impl std::fmt::Debug for CandleRerank {
+// Safety: LlamaModel is Send+Sync per llama-cpp-2 docs.
+// LlamaContext is created per-call and never stored.
+unsafe impl Send for LlamaRerank {}
+
+impl std::fmt::Debug for LlamaRerank {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("CandleRerank")
-            .field("device", &self.device)
+        f.debug_struct("LlamaRerank")
             .field("yes_token_id", &self.yes_token_id)
             .field("no_token_id", &self.no_token_id)
             .finish()
     }
 }
 
-impl CandleRerank {
+impl LlamaRerank {
     /// Load a Qwen3-Reranker GGUF model from `models_dir`.
     ///
     /// Steps:
     /// 1. Resolve model URI (from config override or `ModelDefaults::default().rerank_uri`)
     /// 2. `ensure_model()` to download if needed
-    /// 3. Load tokenizer from the model repo (or the non-GGUF base repo)
-    /// 4. Load GGUF via `ModelWeights::from_gguf()`
-    /// 5. Look up "Yes" and "No" token IDs from the tokenizer
+    /// 3. Load GGUF model via llama.cpp
+    /// 4. Look up Yes/No token IDs using the model's built-in tokenizer (no tokenizer.json needed)
     pub fn new(models_dir: &Path, config: &crate::config::Config) -> Result<Self> {
         let defaults = ModelDefaults::default();
         let uri_str = config
@@ -1377,132 +1103,86 @@ impl CandleRerank {
         let uri = HfModelUri::parse(uri_str)?;
         let model_path = ensure_model(&uri, models_dir)?;
 
-        // Load tokenizer (same strategy as CandleOrchestrator).
-        let tokenizer = Self::load_tokenizer(&uri, models_dir)?;
-
-        // Look up Yes/No token IDs.
-        let yes_token_id = tokenizer
-            .token_to_id("Yes")
-            .ok_or_else(|| anyhow::anyhow!("tokenizer has no 'Yes' token"))?;
-        let no_token_id = tokenizer
-            .token_to_id("No")
-            .ok_or_else(|| anyhow::anyhow!("tokenizer has no 'No' token"))?;
-
-        let device = select_device()?;
-
-        // Load GGUF model.
-        let mut file = std::fs::File::open(&model_path)
-            .map_err(|e| anyhow::anyhow!("opening GGUF {}: {e}", model_path.display()))?;
-        let ct = candle_core::quantized::gguf_file::Content::read(&mut file)
-            .map_err(|e| anyhow::anyhow!("reading GGUF {}: {e}", model_path.display()))?;
-        let model = candle_transformers::models::quantized_qwen3::ModelWeights::from_gguf(
-            ct, &mut file, &device,
-        )
-        .map_err(|e| anyhow::anyhow!("loading Qwen3 reranker model weights: {e}"))?;
+        // Use global backend and llama.cpp's built-in tokenizer (no tokenizer.json required).
+        let backend = llama_backend()?;
+        let model_params = LlamaModelParams::default();
+        let model = LlamaModel::load_from_file(backend, &model_path, &model_params)
+            .map_err(|e| anyhow::anyhow!("loading reranker model {}: {e}", model_path.display()))?;
+
+        // Look up Yes/No token IDs via the model's built-in tokenizer.
+        // str_to_token returns Vec<LlamaToken>; we take the first token ID (skip BOS).
+        let yes_tokens = model
+            .str_to_token("Yes", AddBos::Never)
+            .map_err(|e| anyhow::anyhow!("tokenizing 'Yes': {e}"))?;
+        let yes_token_id = yes_tokens
+            .first()
+            .map(|t| t.0)
+            .ok_or_else(|| anyhow::anyhow!("model tokenizer returned no tokens for 'Yes'"))?;
+
+        let no_tokens = model
+            .str_to_token("No", AddBos::Never)
+            .map_err(|e| anyhow::anyhow!("tokenizing 'No': {e}"))?;
+        let no_token_id = no_tokens
+            .first()
+            .map(|t| t.0)
+            .ok_or_else(|| anyhow::anyhow!("model tokenizer returned no tokens for 'No'"))?;
 
         tracing::info!(
-            "loaded CandleRerank from {}, device={:?}, yes_id={}, no_id={}",
+            "loaded LlamaRerank from {}, yes_id={}, no_id={}",
             uri_str,
-            device,
             yes_token_id,
             no_token_id
         );
 
         Ok(Self {
             model,
-            tokenizer,
-            device,
             yes_token_id,
             no_token_id,
         })
     }
-
-    /// Try to load tokenizer.json from the same HF repo, or from the non-GGUF base repo.
-    fn load_tokenizer(uri: &HfModelUri, models_dir: &Path) -> Result<tokenizers::Tokenizer> {
-        // Try 1: tokenizer.json from the same repo.
-        let tok_uri = HfModelUri {
-            repo: uri.repo.clone(),
-            filename: "tokenizer.json".to_string(),
-        };
-        let tok_path = tok_uri.cache_path(models_dir);
-        if tok_path.exists() {
-            return tokenizers::Tokenizer::from_file(&tok_path).map_err(|e| {
-                anyhow::anyhow!("loading tokenizer from {}: {e}", tok_path.display())
-            });
-        }
-
-        // Try 2: download from the same repo.
-        if let Ok(p) = ensure_model(&tok_uri, models_dir) {
-            return tokenizers::Tokenizer::from_file(&p)
-                .map_err(|e| anyhow::anyhow!("loading tokenizer from {}: {e}", p.display()));
-        }
-
-        // Try 3: non-GGUF variant of the repo.
-        let base_repo = uri.repo.trim_end_matches("-GGUF").to_string();
-        if base_repo != uri.repo {
-            let base_tok_uri = HfModelUri {
-                repo: base_repo,
-                filename: "tokenizer.json".to_string(),
-            };
-            if let Ok(p) = ensure_model(&base_tok_uri, models_dir) {
-                return tokenizers::Tokenizer::from_file(&p)
-                    .map_err(|e| anyhow::anyhow!("loading tokenizer from {}: {e}", p.display()));
-            }
-        }
-
-        bail!(
-            "could not find or download tokenizer for model repo '{}'",
-            uri.repo
-        );
-    }
 }
 
-impl RerankModel for CandleRerank {
+impl RerankModel for LlamaRerank {
     fn rerank_score(&mut self, query: &str, document: &str) -> Result<f32> {
-        self.model.clear_kv_cache();
-
         let input_text = format_reranker_input(query, document);
 
-        let encoding = self
-            .tokenizer
-            .encode(input_text.as_str(), true)
+        // Tokenize using llama.cpp's built-in tokenizer.
+        let tokens = self
+            .model
+            .str_to_token(&input_text, AddBos::Always)
             .map_err(|e| anyhow::anyhow!("tokenization failed: {e}"))?;
-        let token_ids = encoding.get_ids();
-        if token_ids.is_empty() {
+        if tokens.is_empty() {
             bail!("tokenizer returned empty token sequence");
         }
 
-        // Single forward pass through the full input (no autoregressive generation).
-        let input = Tensor::new(token_ids, &self.device)?
-            .unsqueeze(0)
-            .map_err(|e| anyhow::anyhow!("unsqueeze input: {e}"))?;
-        let logits = self
+        // Create context per-call (LlamaContext is !Send).
+        let n_ctx = (tokens.len() + 16) as u32;
+        let ctx_params = LlamaContextParams::default().with_n_ctx(std::num::NonZeroU32::new(n_ctx));
+        let mut ctx = self
             .model
-            .forward(&input, 0)
-            .map_err(|e| anyhow::anyhow!("forward pass: {e}"))?;
-
-        // logits shape: [1, seq_len, vocab_size] or [1, vocab_size] (last position).
-        // Extract logits for the last position.
-        let logits = logits
-            .to_dtype(DType::F32)
-            .map_err(|e| anyhow::anyhow!("logits dtype: {e}"))?;
-        let last_logits = logits
-            .i(0)
-            .map_err(|e| anyhow::anyhow!("batch index: {e}"))?;
-
-        // Extract Yes/No logits.
-        let yes_logit: f32 = last_logits
-            .i(self.yes_token_id as usize)
-            .map_err(|e| anyhow::anyhow!("yes logit index: {e}"))?
-            .to_scalar()
-            .map_err(|e| anyhow::anyhow!("yes logit scalar: {e}"))?;
-        let no_logit: f32 = last_logits
-            .i(self.no_token_id as usize)
-            .map_err(|e| anyhow::anyhow!("no logit index: {e}"))?
-            .to_scalar()
-            .map_err(|e| anyhow::anyhow!("no logit scalar: {e}"))?;
-
-        // Softmax over Yes/No to get probability.
+            .new_context(llama_backend()?, ctx_params)
+            .map_err(|e| anyhow::anyhow!("creating reranker context: {e}"))?;
+
+        // Create batch with all tokens; mark last as logit-producing.
+        let mut batch = LlamaBatch::new(tokens.len() + 16, 1);
+        for (i, token) in tokens.iter().enumerate() {
+            let is_last = i == tokens.len() - 1;
+            batch
+                .add(*token, i as i32, &[0], is_last)
+                .map_err(|e| anyhow::anyhow!("adding token to reranker batch: {e}"))?;
+        }
+
+        // Single forward pass through the full input.
+        ctx.decode(&mut batch)
+            .map_err(|e| anyhow::anyhow!("reranker decode failed: {e}"))?;
+
+        // Get logits for the last token position.
+        let logits = ctx.get_logits_ith(batch.n_tokens() - 1);
+
+        // Extract Yes/No logits and compute softmax probability.
+        let yes_logit = logits[self.yes_token_id as usize];
+        let no_logit = logits[self.no_token_id as usize];
+
         let max_logit = yes_logit.max(no_logit);
         let yes_exp = (yes_logit - max_logit).exp();
         let no_exp = (no_logit - max_logit).exp();
@@ -1629,17 +1309,21 @@ mod tests {
         let defaults = ModelDefaults::default();
         assert!(defaults.embed_uri.starts_with("hf:"));
         assert_eq!(defaults.embed_dim, 256);
+        assert!(
+            defaults.embed_uri.contains("embeddinggemma"),
+            "default embed model should be embeddinggemma"
+        );
     }
 
-    // ── CandleEmbed / PromptFormat tests ────────────────────────────────────
+    // ── LlamaEmbed / PromptFormat tests ────────────────────────────────────
 
     #[test]
-    fn test_candle_embed_struct_exists() {
+    fn test_llama_embed_struct_exists() {
         fn assert_embed_model<E: EmbedModel>(_e: &E) {}
         let mock = MockLlm::new(256);
         assert_embed_model(&mock);
-        // CandleEmbed also implements EmbedModel — verified at compile time.
-        // We can't instantiate CandleEmbed without a real GGUF model,
+        // LlamaEmbed also implements EmbedModel — verified at compile time.
+        // We can't instantiate LlamaEmbed without a real GGUF model,
         // but the trait bound compiles.
     }
 
@@ -1690,15 +1374,6 @@ mod tests {
         assert_eq!(formatted, "Title\nBody");
     }
 
-    #[test]
-    fn test_select_device_returns_cpu_by_default() {
-        // Without the `metal` feature, select_device should return CPU.
-        let device = select_device().unwrap();
-        // On CI/test without metal feature, this should be CPU.
-        // With metal feature on macOS, it could be Metal — both are valid.
-        let _ = device; // Just verify it doesn't error.
-    }
-
     // ── heuristic_orchestrate tests ──────────────────────────────────────────
 
     #[test]
@@ -1799,11 +1474,11 @@ mod tests {
         assert!(parse_orchestration_json(json).is_err());
     }
 
-    // ── CandleOrchestrator tests ─────────────────────────────────────────────
+    // ── LlamaOrchestrator tests ─────────────────────────────────────────────
 
     #[test]
-    fn test_candle_orchestrator_format_prompt() {
-        let prompt = CandleOrchestrator::format_prompt("how does auth work");
+    fn test_llama_orchestrator_format_prompt() {
+        let prompt = LlamaOrchestrator::format_prompt("how does auth work");
         assert!(prompt.contains("<|im_start|>system"));
         assert!(prompt.contains("<|im_end|>"));
         assert!(prompt.contains("<|im_start|>user"));
@@ -1812,13 +1487,13 @@ mod tests {
     }
 
     #[test]
-    fn test_candle_orchestrator_implements_trait() {
-        // Compile-time check: CandleOrchestrator implements OrchestratorModel.
+    fn test_llama_orchestrator_implements_trait() {
+        // Compile-time check: LlamaOrchestrator implements OrchestratorModel.
         fn assert_orchestrator<O: OrchestratorModel>() {}
-        assert_orchestrator::<CandleOrchestrator>();
+        assert_orchestrator::<LlamaOrchestrator>();
     }
 
-    // ── CandleRerank tests ──────────────────────────────────────────────────
+    // ── LlamaRerank tests ──────────────────────────────────────────────────
 
     #[test]
     fn test_format_reranker_input() {
@@ -1829,7 +1504,7 @@ mod tests {
     }
 
     #[test]
-    fn test_candle_rerank_trait_compliance() {
+    fn test_llama_rerank_trait_compliance() {
         // Verify MockLlm still satisfies RerankModel.
         fn assert_rerank<R: RerankModel>(_r: &R) {}
         let mock = MockLlm::new(256);
diff --git a/src/main.rs b/src/main.rs
index 7d2872b..10a20f3 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -829,7 +829,7 @@ async fn main() -> Result<()> {
                 }
                 ContextAction::Topic { query, budget } => {
                     let models_dir = data_dir.join("models");
-                    let mut embedder = engraph::llm::CandleEmbed::new(&models_dir, &cfg)?;
+                    let mut embedder = engraph::llm::LlamaEmbed::new(&models_dir, &cfg)?;
 
                     let bundle = engraph::context::context_topic_with_search(
                         &params,
@@ -882,7 +882,7 @@ async fn main() -> Result<()> {
                 .ok_or_else(|| anyhow::anyhow!("No vault path in index."))?;
             let vault_path = PathBuf::from(&vault_path_str);
             let models_dir = data_dir.join("models");
-            let mut embedder = engraph::llm::CandleEmbed::new(&models_dir, &cfg)?;
+            let mut embedder = engraph::llm::LlamaEmbed::new(&models_dir, &cfg)?;
             let profile = config::Config::load_vault_profile().ok().flatten();
 
             match action {
diff --git a/src/search.rs b/src/search.rs
index 430d56d..eb41be5 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -10,7 +10,6 @@ use crate::llm::{self, EmbedModel, OrchestratorModel, RerankModel};
 use crate::store::{Store, StoreStats};
 
 /// Compute cache key for orchestration results (SHA256 of query).
-#[allow(dead_code)]
 fn orchestration_cache_key(query: &str) -> String {
     use sha2::{Digest, Sha256};
     let hash = Sha256::digest(query.as_bytes());
@@ -85,11 +84,32 @@ pub fn search_with_intelligence(
     embedder: &mut impl EmbedModel,
     config: &mut SearchConfig<'_>,
 ) -> Result<SearchOutput> {
-    // --- Step 1: Orchestrate ---
+    // --- Step 1: Orchestrate (with LLM cache when orchestrator is present) ---
     let orchestration = match &mut config.orchestrator {
-        Some(orch) => orch.orchestrate(query)?,
+        Some(orch) => {
+            let cache_key = orchestration_cache_key(query);
+            if let Some(cached_json) = config.store.get_llm_cache(&cache_key)? {
+                serde_json::from_str(&cached_json).unwrap_or_else(|_| {
+                    orch.orchestrate(query)
+                        .unwrap_or_else(|_| llm::heuristic_orchestrate(query))
+                })
+            } else {
+                let result = orch.orchestrate(query)?;
+                if let Ok(json) = serde_json::to_string(&result) {
+                    let _ = config
+                        .store
+                        .set_llm_cache(&cache_key, &json, "orchestrator");
+                }
+                result
+            }
+        }
         None => llm::heuristic_orchestrate(query),
     };
+    tracing::debug!(
+        intent = ?orchestration.intent,
+        expansions = orchestration.expansions.len(),
+        "orchestration complete"
+    );
     let weights = llm::LaneWeights::from_intent(&orchestration.intent);
 
     // --- Step 2: Run 3-lane retrieval for EACH expanded query ---
@@ -302,12 +322,49 @@ pub fn run_search(
 ) -> Result<()> {
     let models_dir = data_dir.join("models");
     let mut embedder =
-        crate::llm::CandleEmbed::new(&models_dir, config).context("loading embedder")?;
+        crate::llm::LlamaEmbed::new(&models_dir, config).context("loading embedder")?;
 
     let db_path = data_dir.join("engraph.db");
     let store = Store::open(&db_path).context("opening store")?;
 
-    let output = search_internal(query, top_n, &store, &mut embedder)?;
+    // Load intelligence models if enabled.
+    let mut orchestrator_model: Option<Box<dyn llm::OrchestratorModel>> =
+        if config.intelligence_enabled() {
+            match crate::llm::LlamaOrchestrator::new(&models_dir, config) {
+                Ok(o) => Some(Box::new(o)),
+                Err(e) => {
+                    tracing::warn!("failed to load orchestrator: {e}");
+                    None
+                }
+            }
+        } else {
+            None
+        };
+    let mut reranker_model: Option<Box<dyn llm::RerankModel>> = if config.intelligence_enabled() {
+        match crate::llm::LlamaRerank::new(&models_dir, config) {
+            Ok(r) => Some(Box::new(r)),
+            Err(e) => {
+                tracing::warn!("failed to load reranker: {e}");
+                None
+            }
+        }
+    } else {
+        None
+    };
+
+    let output = {
+        let mut search_config = SearchConfig {
+            orchestrator: orchestrator_model
+                .as_mut()
+                .map(|o| o.as_mut() as &mut dyn llm::OrchestratorModel),
+            reranker: reranker_model
+                .as_mut()
+                .map(|r| r.as_mut() as &mut dyn llm::RerankModel),
+            store: &store,
+            rerank_candidates: 30,
+        };
+        search_with_intelligence(query, top_n, &mut embedder, &mut search_config)?
+    };
 
     let results: Vec<SearchResult> = output
         .results
diff --git a/src/serve.rs b/src/serve.rs
index 48ba85b..6a91346 100644
--- a/src/serve.rs
+++ b/src/serve.rs
@@ -132,10 +132,8 @@ pub struct EngraphServer {
     profile: Arc<Option<VaultProfile>>,
     tool_router: ToolRouter<Self>,
     /// Query expansion orchestrator (None when intelligence is disabled or failed to load).
-    #[allow(dead_code)]
     orchestrator: Option<Arc<Mutex<Box<dyn OrchestratorModel + Send>>>>,
     /// Result reranker (None when intelligence is disabled or failed to load).
-    #[allow(dead_code)]
     reranker: Option<Arc<Mutex<Box<dyn RerankModel + Send>>>>,
 }
 
@@ -168,8 +166,31 @@ impl EngraphServer {
         let top_n = params.0.top_n.unwrap_or(10);
         let store = self.store.lock().await;
         let mut embedder = self.embedder.lock().await;
-        let output = search::search_internal(&params.0.query, top_n, &store, &mut *embedder)
-            .map_err(|e| mcp_err(&e))?;
+
+        // Lock orchestrator and reranker if available for intelligence-enhanced search.
+        let mut orch_guard = match &self.orchestrator {
+            Some(o) => Some(o.lock().await),
+            None => None,
+        };
+        let mut rerank_guard = match &self.reranker {
+            Some(r) => Some(r.lock().await),
+            None => None,
+        };
+
+        let mut config = search::SearchConfig {
+            orchestrator: orch_guard
+                .as_mut()
+                .map(|g| g.as_mut() as &mut dyn OrchestratorModel),
+            reranker: rerank_guard
+                .as_mut()
+                .map(|g| g.as_mut() as &mut dyn RerankModel),
+            store: &store,
+            rerank_candidates: 30,
+        };
+
+        let output =
+            search::search_with_intelligence(&params.0.query, top_n, &mut *embedder, &mut config)
+                .map_err(|e| mcp_err(&e))?;
         to_json_result(&output.results)
     }
 
@@ -416,7 +437,7 @@ pub async fn run_serve(data_dir: &Path) -> Result<()> {
 
     let store = Store::open(&db_path)?;
     let config = Config::load()?;
-    let embedder = crate::llm::CandleEmbed::new(&models_dir, &config)?;
+    let embedder = crate::llm::LlamaEmbed::new(&models_dir, &config)?;
 
     let vault_path_str = store.get_meta("vault_path")?.ok_or_else(|| {
         anyhow::anyhow!("No vault path in index. Run 'engraph index <path>' first.")
@@ -441,7 +462,7 @@ pub async fn run_serve(data_dir: &Path) -> Result<()> {
     // Load intelligence models if enabled
     let orchestrator: Option<Arc<Mutex<Box<dyn OrchestratorModel + Send>>>> =
         if config.intelligence_enabled() {
-            match crate::llm::CandleOrchestrator::new(&models_dir, &config) {
+            match crate::llm::LlamaOrchestrator::new(&models_dir, &config) {
                 Ok(orch) => Some(Arc::new(Mutex::new(
                     Box::new(orch) as Box<dyn OrchestratorModel + Send>
                 ))),
@@ -456,7 +477,7 @@ pub async fn run_serve(data_dir: &Path) -> Result<()> {
 
     let reranker: Option<Arc<Mutex<Box<dyn RerankModel + Send>>>> = if config.intelligence_enabled()
     {
-        match crate::llm::CandleRerank::new(&models_dir, &config) {
+        match crate::llm::LlamaRerank::new(&models_dir, &config) {
             Ok(rerank) => Some(Arc::new(Mutex::new(
                 Box::new(rerank) as Box<dyn RerankModel + Send>
             ))),
diff --git a/src/store.rs b/src/store.rs
index 4485bf3..30f0088 100644
--- a/src/store.rs
+++ b/src/store.rs
@@ -141,7 +141,12 @@ impl Store {
             .context("failed to initialize schema")?;
         self.migrate()?;
         self.ensure_fts_table()?;
-        crate::vecstore::init_vec_table(&self.conn, 256)?;
+        // Use stored embedding dimension if available, defaulting to 384 for new databases.
+        let dim = self
+            .get_meta("embedding_dim")?
+            .and_then(|s| s.parse::<usize>().ok())
+            .unwrap_or(256);
+        crate::vecstore::init_vec_table(&self.conn, dim)?;
         self.migrate_vectors_to_vec0()?;
         Ok(())
     }
@@ -1165,11 +1170,12 @@ impl Store {
         }
     }
 
-    /// Drop the vec table and all chunk records. Used during dimension migration.
+    /// Drop the vec table and all chunk/FTS records. Used during dimension migration.
     pub fn reset_for_reindex(&self, new_dim: usize) -> Result<()> {
         self.conn.execute("DROP TABLE IF EXISTS chunks_vec", [])?;
         crate::vecstore::init_vec_table(&self.conn, new_dim)?;
         self.conn.execute("DELETE FROM chunks", [])?;
+        self.conn.execute("DELETE FROM chunks_fts", [])?;
         Ok(())
     }