From b478bec98e02671052415d7cd820b0882afd9fac Mon Sep 17 00:00:00 2001
From: Oleksandr Ostrovskyi <devwhodevs@gmail.com>
Date: Wed, 25 Mar 2026 20:09:09 +0200
Subject: [PATCH 01/17] fix(llm): add shimmytok fallback for GGUF-embedded
 tokenizers

GGUF repos rarely ship tokenizer.json, and Google Gemma tokenizers
are gated on HuggingFace. The FlexTokenizer enum wraps both the
HuggingFace tokenizers crate and shimmytok (which extracts the
tokenizer from GGUF metadata). CandleEmbed uses FlexTokenizer; the
orchestrator and reranker stay HF-only because they need
decode/token_to_id.
---
 Cargo.lock |  45 ++++++++-
 Cargo.toml |   1 +
 src/llm.rs | 274 +++++++++++++++++++++++++++--------------------------
 3 files changed, 185 insertions(+), 135 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index d6c3ef7..c1c6967 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -142,15 +142,30 @@ dependencies = [
  "rayon",
 ]
 
+[[package]]
+name = "bit-set"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1"
+dependencies = [
+ "bit-vec 0.6.3",
+]
+
 [[package]]
 name = "bit-set"
 version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
 dependencies = [
- "bit-vec",
+ "bit-vec 0.8.0",
 ]
 
+[[package]]
+name = "bit-vec"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
+
 [[package]]
 name = "bit-vec"
 version = "0.8.0"
@@ -784,6 +799,7 @@ dependencies = [
  "serde",
  "serde_json",
  "sha2",
+ "shimmytok",
  "sqlite-vec",
  "strsim",
  "tempfile",
@@ -843,13 +859,24 @@ version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
 
+[[package]]
+name = "fancy-regex"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
+dependencies = [
+ "bit-set 0.5.3",
+ "regex-automata",
+ "regex-syntax",
+]
+
 [[package]]
 name = "fancy-regex"
 version = "0.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298"
 dependencies = [
- "bit-set",
+ "bit-set 0.8.0",
  "regex-automata",
  "regex-syntax",
 ]
@@ -860,7 +887,7 @@ version = "0.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "72cf461f865c862bb7dc573f643dd6a2b6842f7c30b07882b56bd148cc2761b8"
 dependencies = [
- "bit-set",
+ "bit-set 0.8.0",
  "regex-automata",
  "regex-syntax",
 ]
@@ -2726,6 +2753,18 @@ dependencies = [
  "lazy_static",
 ]
 
+[[package]]
+name = "shimmytok"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f2381f12d5c3f475feaf705354294803f350c13d0788f3ab367ac5979df9021"
+dependencies = [
+ "fancy-regex 0.13.0",
+ "rayon",
+ "regex",
+ "thiserror 2.0.18",
+]
+
 [[package]]
 name = "shlex"
 version = "1.3.0"
diff --git a/Cargo.toml b/Cargo.toml
index 93a2bc4..42b1d46 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -37,6 +37,7 @@ notify-debouncer-full = "0.4"
 candle-core = "0.9"
 candle-nn = "0.9"
 candle-transformers = "0.9"
+shimmytok = "0.7"
 
 [features]
 default = []
diff --git a/src/llm.rs b/src/llm.rs
index f4c818f..06f9443 100644
--- a/src/llm.rs
+++ b/src/llm.rs
@@ -444,6 +444,140 @@ pub fn ensure_model(uri: &HfModelUri, models_dir: &Path) -> Result<PathBuf> {
     Ok(path)
 }
 
+/// Tokenizer that can be backed by either HuggingFace tokenizers crate or shimmytok (GGUF-embedded).
+pub enum FlexTokenizer {
+    HuggingFace(Box<tokenizers::Tokenizer>),
+    Gguf(Box<shimmytok::Tokenizer>),
+}
+
+impl FlexTokenizer {
+    /// Encode text into token IDs.
+    pub fn encode(&self, text: &str, add_special: bool) -> Result<Vec<u32>> {
+        match self {
+            Self::HuggingFace(t) => {
+                let enc = t
+                    .encode(text, add_special)
+                    .map_err(|e| anyhow::anyhow!("tokenization: {e}"))?;
+                Ok(enc.get_ids().to_vec())
+            }
+            Self::Gguf(t) => {
+                let ids = t
+                    .encode(text, add_special)
+                    .map_err(|e| anyhow::anyhow!("tokenization: {e}"))?;
+                Ok(ids)
+            }
+        }
+    }
+
+    /// Count tokens in text; if encoding fails, estimate one token per 4 bytes (never 0).
+    pub fn token_count(&self, text: &str) -> usize {
+        self.encode(text, false).map(|ids| ids.len()).unwrap_or(text.len() / 4 + 1)
+    }
+
+    /// Look up a token's ID by string (only available with HuggingFace backend).
+    pub fn token_to_id(&self, token: &str) -> Option<u32> {
+        match self {
+            Self::HuggingFace(t) => t.token_to_id(token),
+            Self::Gguf(_) => None,
+        }
+    }
+
+    /// Decode token IDs back to text (only available with HuggingFace backend).
+    pub fn decode(&self, ids: &[u32], skip_special: bool) -> Result<String> {
+        match self {
+            Self::HuggingFace(t) => t
+                .decode(ids, skip_special)
+                .map_err(|e| anyhow::anyhow!("decode: {e}")),
+            Self::Gguf(_) => bail!("decode not supported with GGUF tokenizer"),
+        }
+    }
+}
+
+/// Load tokenizer for a model. Tries external tokenizer.json first, falls back to GGUF-embedded.
+fn load_tokenizer_for_model(uri: &HfModelUri, models_dir: &Path) -> Result<FlexTokenizer> {
+    // First try: external tokenizer.json from candidate repos.
+    if let Some(tok) = try_external_tokenizer(uri, models_dir) {
+        return Ok(FlexTokenizer::HuggingFace(Box::new(tok)));
+    }
+
+    // Fallback: load tokenizer from GGUF file metadata.
+    let model_path = uri.cache_path(models_dir);
+    if model_path.exists() {
+        tracing::info!(
+            "no external tokenizer found, loading from GGUF: {}",
+            model_path.display()
+        );
+        let tok = shimmytok::Tokenizer::from_gguf_file(&model_path)
+            .map_err(|e| anyhow::anyhow!("loading tokenizer from GGUF metadata: {e}"))?;
+        return Ok(FlexTokenizer::Gguf(Box::new(tok)));
+    }
+
+    bail!(
+        "could not find tokenizer for model '{}': no external tokenizer.json \
+         and GGUF file not yet downloaded",
+        uri.repo
+    )
+}
+
+/// Load tokenizer as HuggingFace `tokenizers::Tokenizer` specifically.
+/// Used by CandleOrchestrator and CandleRerank which need decode/token_to_id.
+fn load_hf_tokenizer(uri: &HfModelUri, models_dir: &Path) -> Result<tokenizers::Tokenizer> {
+    try_external_tokenizer(uri, models_dir).ok_or_else(|| {
+        anyhow::anyhow!(
+            "could not find tokenizer.json for model '{}'. \
+             Orchestrator/reranker models require tokenizer.json (not GGUF-embedded).",
+            uri.repo
+        )
+    })
+}
+
+/// Try downloading tokenizer.json from candidate HuggingFace repos.
+fn try_external_tokenizer(uri: &HfModelUri, models_dir: &Path) -> Option<tokenizers::Tokenizer> {
+    let mut candidates: Vec<String> = vec![uri.repo.clone()];
+
+    // Non-GGUF variant: "org/model-GGUF" → "org/model"
+    let base_repo = uri.repo.trim_end_matches("-GGUF").to_string();
+    if base_repo != uri.repo {
+        candidates.push(base_repo);
+    }
+
+    // Known upstream repos for default models (GGUF repos rarely ship tokenizers).
+    let model_lower = uri.repo.to_lowercase();
+    if model_lower.contains("embeddinggemma") {
+        candidates.push("google/embeddinggemma-300m".to_string());
+        candidates.push("google/gemma-2b".to_string());
+    } else if model_lower.contains("qwen3") {
+        let base_name = uri
+            .repo
+            .rsplit('/')
+            .next()
+            .unwrap_or("")
+            .trim_end_matches("-Q8_0-GGUF") // longer suffix first, or it can never match
+            .trim_end_matches("-GGUF");
+        if !base_name.is_empty() {
+            candidates.push(format!("Qwen/{base_name}"));
+        }
+    }
+
+    for repo in &candidates {
+        let tok_uri = HfModelUri {
+            repo: repo.clone(),
+            filename: "tokenizer.json".to_string(),
+        };
+        let tok_path = tok_uri.cache_path(models_dir);
+
+        if tok_path.exists() && let Ok(tok) = tokenizers::Tokenizer::from_file(&tok_path) {
+            return Some(tok);
+        }
+
+        if let Ok(p) = ensure_model(&tok_uri, models_dir) && let Ok(tok) = tokenizers::Tokenizer::from_file(&p) {
+            return Some(tok);
+        }
+    }
+
+    None
+}
+
 /// Default model URIs for the intelligence layer.
 pub struct ModelDefaults {
     pub embed_uri: String,
@@ -595,7 +729,7 @@ pub struct CandleEmbed {
     tok_embeddings: Embedding,
     norm: candle_transformers::quantized_nn::RmsNorm,
     embedding_length: usize,
-    tokenizer: tokenizers::Tokenizer,
+    tokenizer: FlexTokenizer,
     device: Device,
     dim: usize,
     prompt_format: PromptFormat,
@@ -632,7 +766,7 @@ impl CandleEmbed {
         let model_path = ensure_model(&uri, models_dir)?;
 
         // Load tokenizer: try from the same HF repo, then from the non-GGUF variant.
-        let tokenizer = Self::load_tokenizer(&uri, models_dir)?;
+        let tokenizer = load_tokenizer_for_model(&uri, models_dir)?;
 
         // Detect prompt format from filename.
         let prompt_format = PromptFormat::detect(&uri.filename);
@@ -665,45 +799,6 @@ impl CandleEmbed {
         })
     }
 
-    /// Try to load tokenizer.json from the same HF repo, or from repo without "-GGUF" suffix.
-    fn load_tokenizer(uri: &HfModelUri, models_dir: &Path) -> Result<tokenizers::Tokenizer> {
-        // Try 1: tokenizer.json from the same repo.
-        let tok_uri = HfModelUri {
-            repo: uri.repo.clone(),
-            filename: "tokenizer.json".to_string(),
-        };
-        let tok_path = tok_uri.cache_path(models_dir);
-        if tok_path.exists() {
-            return tokenizers::Tokenizer::from_file(&tok_path).map_err(|e| {
-                anyhow::anyhow!("loading tokenizer from {}: {e}", tok_path.display())
-            });
-        }
-
-        // Try 2: download from the same repo.
-        if let Ok(p) = ensure_model(&tok_uri, models_dir) {
-            return tokenizers::Tokenizer::from_file(&p)
-                .map_err(|e| anyhow::anyhow!("loading tokenizer from {}: {e}", p.display()));
-        }
-
-        // Try 3: non-GGUF variant of the repo (e.g., "org/model-GGUF" -> "org/model").
-        let base_repo = uri.repo.trim_end_matches("-GGUF").to_string();
-        if base_repo != uri.repo {
-            let base_tok_uri = HfModelUri {
-                repo: base_repo,
-                filename: "tokenizer.json".to_string(),
-            };
-            if let Ok(p) = ensure_model(&base_tok_uri, models_dir) {
-                return tokenizers::Tokenizer::from_file(&p)
-                    .map_err(|e| anyhow::anyhow!("loading tokenizer from {}: {e}", p.display()));
-            }
-        }
-
-        bail!(
-            "could not find or download tokenizer for model repo '{}'",
-            uri.repo
-        );
-    }
-
     /// Load GGUF file and construct layer structs for bidirectional embedding.
     fn load_gguf(
         path: &Path,
@@ -876,11 +971,7 @@ impl CandleEmbed {
     /// Run a bidirectional forward pass and return the mean-pooled, truncated,
     /// L2-normalized embedding.
     fn embed_text(&self, text: &str) -> Result<Vec<f32>> {
-        let encoding = self
-            .tokenizer
-            .encode(text, true)
-            .map_err(|e| anyhow::anyhow!("tokenization failed: {e}"))?;
-        let token_ids = encoding.get_ids();
+        let token_ids = self.tokenizer.encode(text, true)?;
         if token_ids.is_empty() {
             bail!("tokenizer returned empty token sequence");
         }
@@ -971,10 +1062,7 @@ impl EmbedModel for CandleEmbed {
     }
 
     fn token_count(&self, text: &str) -> usize {
-        self.tokenizer
-            .encode(text, false)
-            .map(|enc| enc.get_ids().len())
-            .unwrap_or(text.len() / 4 + 1)
+        self.tokenizer.token_count(text)
     }
 
     fn dim(&self) -> usize {
@@ -1140,12 +1228,11 @@ impl CandleOrchestrator {
         let uri = HfModelUri::parse(uri_str)?;
         let model_path = ensure_model(&uri, models_dir)?;
 
-        // Load tokenizer (same strategy as CandleEmbed).
-        let tokenizer = Self::load_tokenizer(&uri, models_dir)?;
+        // Orchestrator needs HF tokenizer (for decode + token_to_id).
+        let tokenizer = load_hf_tokenizer(&uri, models_dir)?;
 
         let device = select_device()?;
 
-        // Load GGUF model.
         let mut file = std::fs::File::open(&model_path)
             .map_err(|e| anyhow::anyhow!("opening GGUF {}: {e}", model_path.display()))?;
         let ct = candle_core::quantized::gguf_file::Content::read(&mut file)
@@ -1168,45 +1255,6 @@ impl CandleOrchestrator {
         })
     }
 
-    /// Try to load tokenizer.json from the same HF repo, or from the non-GGUF base repo.
-    fn load_tokenizer(uri: &HfModelUri, models_dir: &Path) -> Result<tokenizers::Tokenizer> {
-        // Try 1: tokenizer.json from the same repo.
-        let tok_uri = HfModelUri {
-            repo: uri.repo.clone(),
-            filename: "tokenizer.json".to_string(),
-        };
-        let tok_path = tok_uri.cache_path(models_dir);
-        if tok_path.exists() {
-            return tokenizers::Tokenizer::from_file(&tok_path).map_err(|e| {
-                anyhow::anyhow!("loading tokenizer from {}: {e}", tok_path.display())
-            });
-        }
-
-        // Try 2: download from the same repo.
-        if let Ok(p) = ensure_model(&tok_uri, models_dir) {
-            return tokenizers::Tokenizer::from_file(&p)
-                .map_err(|e| anyhow::anyhow!("loading tokenizer from {}: {e}", p.display()));
-        }
-
-        // Try 3: non-GGUF variant of the repo (e.g., "Qwen/Qwen3-0.6B-GGUF" -> "Qwen/Qwen3-0.6B").
-        let base_repo = uri.repo.trim_end_matches("-GGUF").to_string();
-        if base_repo != uri.repo {
-            let base_tok_uri = HfModelUri {
-                repo: base_repo,
-                filename: "tokenizer.json".to_string(),
-            };
-            if let Ok(p) = ensure_model(&base_tok_uri, models_dir) {
-                return tokenizers::Tokenizer::from_file(&p)
-                    .map_err(|e| anyhow::anyhow!("loading tokenizer from {}: {e}", p.display()));
-            }
-        }
-
-        bail!(
-            "could not find or download tokenizer for model repo '{}'",
-            uri.repo
-        );
-    }
-
     /// Format a chat prompt in Qwen3 ChatML format.
     fn format_prompt(query: &str) -> String {
         format!(
@@ -1377,8 +1425,8 @@ impl CandleRerank {
         let uri = HfModelUri::parse(uri_str)?;
         let model_path = ensure_model(&uri, models_dir)?;
 
-        // Load tokenizer (same strategy as CandleOrchestrator).
-        let tokenizer = Self::load_tokenizer(&uri, models_dir)?;
+        // Reranker needs HF tokenizer (for token_to_id).
+        let tokenizer = load_hf_tokenizer(&uri, models_dir)?;
 
         // Look up Yes/No token IDs.
         let yes_token_id = tokenizer
@@ -1417,44 +1465,6 @@ impl CandleRerank {
         })
     }
 
-    /// Try to load tokenizer.json from the same HF repo, or from the non-GGUF base repo.
-    fn load_tokenizer(uri: &HfModelUri, models_dir: &Path) -> Result<tokenizers::Tokenizer> {
-        // Try 1: tokenizer.json from the same repo.
-        let tok_uri = HfModelUri {
-            repo: uri.repo.clone(),
-            filename: "tokenizer.json".to_string(),
-        };
-        let tok_path = tok_uri.cache_path(models_dir);
-        if tok_path.exists() {
-            return tokenizers::Tokenizer::from_file(&tok_path).map_err(|e| {
-                anyhow::anyhow!("loading tokenizer from {}: {e}", tok_path.display())
-            });
-        }
-
-        // Try 2: download from the same repo.
-        if let Ok(p) = ensure_model(&tok_uri, models_dir) {
-            return tokenizers::Tokenizer::from_file(&p)
-                .map_err(|e| anyhow::anyhow!("loading tokenizer from {}: {e}", p.display()));
-        }
-
-        // Try 3: non-GGUF variant of the repo.
-        let base_repo = uri.repo.trim_end_matches("-GGUF").to_string();
-        if base_repo != uri.repo {
-            let base_tok_uri = HfModelUri {
-                repo: base_repo,
-                filename: "tokenizer.json".to_string(),
-            };
-            if let Ok(p) = ensure_model(&base_tok_uri, models_dir) {
-                return tokenizers::Tokenizer::from_file(&p)
-                    .map_err(|e| anyhow::anyhow!("loading tokenizer from {}: {e}", p.display()));
-            }
-        }
-
-        bail!(
-            "could not find or download tokenizer for model repo '{}'",
-            uri.repo
-        );
-    }
 }
 
 impl RerankModel for CandleRerank {

From bd87d713fa5a75972d14fd190c1ed1af14ab0c09 Mon Sep 17 00:00:00 2001
From: Oleksandr Ostrovskyi <devwhodevs@gmail.com>
Date: Wed, 25 Mar 2026 20:12:21 +0200
Subject: [PATCH 02/17] fix(llm): apply prompt format in CandleEmbed embed_one
 and embed_batch

embed_one now calls prompt_format.format_query() and embed_batch calls
prompt_format.format_document() before passing text to embed_text().
This is required for asymmetric models like embeddinggemma that need
specific prefixes for queries vs documents.
---
 src/llm.rs | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/llm.rs b/src/llm.rs
index 06f9443..8649327 100644
--- a/src/llm.rs
+++ b/src/llm.rs
@@ -1054,11 +1054,20 @@ impl CandleEmbed {
 impl EmbedModel for CandleEmbed {
     fn embed_batch(&mut self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
         // Process texts sequentially — candle quantized ops are single-threaded.
-        texts.iter().map(|t| self.embed_text(t)).collect()
+        // Apply document prompt format for indexing (asymmetric models need this).
+        texts
+            .iter()
+            .map(|t| {
+                let formatted = self.prompt_format.format_document("", t);
+                self.embed_text(&formatted)
+            })
+            .collect()
     }
 
     fn embed_one(&mut self, text: &str) -> Result<Vec<f32>> {
-        self.embed_text(text)
+        // Apply query prompt format (asymmetric models like embeddinggemma need this).
+        let formatted = self.prompt_format.format_query(text);
+        self.embed_text(&formatted)
     }
 
     fn token_count(&self, text: &str) -> usize {

From 7c6ba8924e336122b596e313fc64167932757c6a Mon Sep 17 00:00:00 2001
From: Oleksandr Ostrovskyi <devwhodevs@gmail.com>
Date: Wed, 25 Mar 2026 20:13:14 +0200
Subject: [PATCH 03/17] fix(store): clear FTS on reindex and use stored dim for
 vec table init

reset_for_reindex now also deletes from chunks_fts so stale keyword
entries don't survive a dimension migration. Store::init() reads the
stored embedding_dim from meta to create the vec table with the correct
dimension, preventing a stale 384-dim table from persisting when the
model outputs 256-dim vectors.
---
 src/store.rs | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/store.rs b/src/store.rs
index 4485bf3..fe8f1bd 100644
--- a/src/store.rs
+++ b/src/store.rs
@@ -141,7 +141,12 @@ impl Store {
             .context("failed to initialize schema")?;
         self.migrate()?;
         self.ensure_fts_table()?;
-        crate::vecstore::init_vec_table(&self.conn, 256)?;
+        // Use stored embedding dimension if available, defaulting to 256 for new databases.
+        let dim = self
+            .get_meta("embedding_dim")?
+            .and_then(|s| s.parse::<usize>().ok())
+            .unwrap_or(256);
+        crate::vecstore::init_vec_table(&self.conn, dim)?;
         self.migrate_vectors_to_vec0()?;
         Ok(())
     }
@@ -1165,11 +1170,12 @@ impl Store {
         }
     }
 
-    /// Drop the vec table and all chunk records. Used during dimension migration.
+    /// Drop the vec table and all chunk/FTS records. Used during dimension migration.
     pub fn reset_for_reindex(&self, new_dim: usize) -> Result<()> {
         self.conn.execute("DROP TABLE IF EXISTS chunks_vec", [])?;
         crate::vecstore::init_vec_table(&self.conn, new_dim)?;
         self.conn.execute("DELETE FROM chunks", [])?;
+        self.conn.execute("DELETE FROM chunks_fts", [])?;
         Ok(())
     }
 

From 90173882741fc1e0c2d99b6d04a54d256a8034bf Mon Sep 17 00:00:00 2001
From: Oleksandr Ostrovskyi <devwhodevs@gmail.com>
Date: Wed, 25 Mar 2026 20:14:53 +0200
Subject: [PATCH 04/17] fix(search): wire LLM cache into
 search_with_intelligence

When an orchestrator is present, compute a SHA256 cache key from the
query and check the llm_cache table first. On miss, call the
orchestrator and store the result. Adds Serialize/Deserialize to
QueryIntent and OrchestrationResult for JSON round-tripping.
Removes #[allow(dead_code)] from orchestration_cache_key.
---
 src/llm.rs    | 13 ++++++++-----
 src/search.rs | 21 ++++++++++++++++++---
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/src/llm.rs b/src/llm.rs
index 8649327..5c2f5f4 100644
--- a/src/llm.rs
+++ b/src/llm.rs
@@ -71,7 +71,7 @@ impl PromptFormat {
 // ── Types ────────────────────────────────────────────────────────────────────
 
 /// Classified intent of an incoming search query.
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum QueryIntent {
     /// User wants a precise fact or term match.
     Exact,
@@ -84,7 +84,7 @@ pub enum QueryIntent {
 }
 
 /// Output produced by an orchestrator model for a query.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 pub struct OrchestrationResult {
     /// Classified query intent.
     pub intent: QueryIntent,
@@ -566,11 +566,15 @@ fn try_external_tokenizer(uri: &HfModelUri, models_dir: &Path) -> Option<tokeniz
         };
         let tok_path = tok_uri.cache_path(models_dir);
 
-        if tok_path.exists() && let Ok(tok) = tokenizers::Tokenizer::from_file(&tok_path) {
+        if tok_path.exists()
+            && let Ok(tok) = tokenizers::Tokenizer::from_file(&tok_path)
+        {
             return Some(tok);
         }
 
-        if let Ok(p) = ensure_model(&tok_uri, models_dir) && let Ok(tok) = tokenizers::Tokenizer::from_file(&p) {
+        if let Ok(p) = ensure_model(&tok_uri, models_dir)
+            && let Ok(tok) = tokenizers::Tokenizer::from_file(&p)
+        {
             return Some(tok);
         }
     }
@@ -1473,7 +1477,6 @@ impl CandleRerank {
             no_token_id,
         })
     }
-
 }
 
 impl RerankModel for CandleRerank {
diff --git a/src/search.rs b/src/search.rs
index 430d56d..336646e 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -10,7 +10,6 @@ use crate::llm::{self, EmbedModel, OrchestratorModel, RerankModel};
 use crate::store::{Store, StoreStats};
 
 /// Compute cache key for orchestration results (SHA256 of query).
-#[allow(dead_code)]
 fn orchestration_cache_key(query: &str) -> String {
     use sha2::{Digest, Sha256};
     let hash = Sha256::digest(query.as_bytes());
@@ -85,9 +84,25 @@ pub fn search_with_intelligence(
     embedder: &mut impl EmbedModel,
     config: &mut SearchConfig<'_>,
 ) -> Result<SearchOutput> {
-    // --- Step 1: Orchestrate ---
+    // --- Step 1: Orchestrate (with LLM cache when orchestrator is present) ---
     let orchestration = match &mut config.orchestrator {
-        Some(orch) => orch.orchestrate(query)?,
+        Some(orch) => {
+            let cache_key = orchestration_cache_key(query);
+            if let Some(cached_json) = config.store.get_llm_cache(&cache_key)? {
+                serde_json::from_str(&cached_json).unwrap_or_else(|_| {
+                    orch.orchestrate(query)
+                        .unwrap_or_else(|_| llm::heuristic_orchestrate(query))
+                })
+            } else {
+                let result = orch.orchestrate(query)?;
+                if let Ok(json) = serde_json::to_string(&result) {
+                    let _ = config
+                        .store
+                        .set_llm_cache(&cache_key, &json, "orchestrator");
+                }
+                result
+            }
+        }
         None => llm::heuristic_orchestrate(query),
     };
     let weights = llm::LaneWeights::from_intent(&orchestration.intent);

From 6a55e310ada67f02bc352e63e9fe12af0aba8630 Mon Sep 17 00:00:00 2001
From: Oleksandr Ostrovskyi <devwhodevs@gmail.com>
Date: Wed, 25 Mar 2026 20:16:18 +0200
Subject: [PATCH 05/17] fix(serve): wire orchestrator and reranker into MCP
 search handler

The search tool handler now calls search_with_intelligence with the
orchestrator and reranker from EngraphServer, enabling LLM-powered
query expansion and result reranking in the MCP server. Removes
#[allow(dead_code)] from the orchestrator and reranker fields.
---
 src/serve.rs | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/src/serve.rs b/src/serve.rs
index 48ba85b..6559109 100644
--- a/src/serve.rs
+++ b/src/serve.rs
@@ -132,10 +132,8 @@ pub struct EngraphServer {
     profile: Arc<Option<VaultProfile>>,
     tool_router: ToolRouter<Self>,
     /// Query expansion orchestrator (None when intelligence is disabled or failed to load).
-    #[allow(dead_code)]
     orchestrator: Option<Arc<Mutex<Box<dyn OrchestratorModel + Send>>>>,
     /// Result reranker (None when intelligence is disabled or failed to load).
-    #[allow(dead_code)]
     reranker: Option<Arc<Mutex<Box<dyn RerankModel + Send>>>>,
 }
 
@@ -168,8 +166,31 @@ impl EngraphServer {
         let top_n = params.0.top_n.unwrap_or(10);
         let store = self.store.lock().await;
         let mut embedder = self.embedder.lock().await;
-        let output = search::search_internal(&params.0.query, top_n, &store, &mut *embedder)
-            .map_err(|e| mcp_err(&e))?;
+
+        // Lock orchestrator and reranker if available for intelligence-enhanced search.
+        let mut orch_guard = match &self.orchestrator {
+            Some(o) => Some(o.lock().await),
+            None => None,
+        };
+        let mut rerank_guard = match &self.reranker {
+            Some(r) => Some(r.lock().await),
+            None => None,
+        };
+
+        let mut config = search::SearchConfig {
+            orchestrator: orch_guard
+                .as_mut()
+                .map(|g| g.as_mut() as &mut dyn OrchestratorModel),
+            reranker: rerank_guard
+                .as_mut()
+                .map(|g| g.as_mut() as &mut dyn RerankModel),
+            store: &store,
+            rerank_candidates: 30,
+        };
+
+        let output =
+            search::search_with_intelligence(&params.0.query, top_n, &mut *embedder, &mut config)
+                .map_err(|e| mcp_err(&e))?;
         to_json_result(&output.results)
     }
 

From dd21f41d24ee29b69422d3122b4ae928ff29b39e Mon Sep 17 00:00:00 2001
From: Oleksandr Ostrovskyi <devwhodevs@gmail.com>
Date: Wed, 25 Mar 2026 21:24:49 +0200
Subject: [PATCH 06/17] feat(llm): add BERT GGUF architecture support, switch
 default to all-MiniLM-L6-v2

Add BertLayer struct with LayerNorm+bias, absolute position embeddings,
and GELU FFN activation alongside the existing Gemma EmbedLayer. The
CandleEmbed struct now wraps an EmbedModelVariant enum (Gemma | Bert)
and detects architecture from GGUF metadata (general.architecture).

Switch default embedding model from embeddinggemma-300M (256-dim) to
all-MiniLM-L6-v2-GGUF Q8_0 (384-dim, 25MB). Users can still override
to embeddinggemma via config.toml. Update store default dimension to 384.
---
 src/llm.rs   | 439 +++++++++++++++++++++++++++++++++++++++++++++------
 src/store.rs |   8 +-
 2 files changed, 396 insertions(+), 51 deletions(-)

diff --git a/src/llm.rs b/src/llm.rs
index 5c2f5f4..5b42f79 100644
--- a/src/llm.rs
+++ b/src/llm.rs
@@ -543,7 +543,9 @@ fn try_external_tokenizer(uri: &HfModelUri, models_dir: &Path) -> Option<tokeniz
 
     // Known upstream repos for default models (GGUF repos rarely ship tokenizers).
     let model_lower = uri.repo.to_lowercase();
-    if model_lower.contains("embeddinggemma") {
+    if model_lower.contains("all-minilm") {
+        candidates.push("sentence-transformers/all-MiniLM-L6-v2".to_string());
+    } else if model_lower.contains("embeddinggemma") {
         candidates.push("google/embeddinggemma-300m".to_string());
         candidates.push("google/gemma-2b".to_string());
     } else if model_lower.contains("qwen3") {
@@ -593,8 +595,8 @@ pub struct ModelDefaults {
 impl Default for ModelDefaults {
     fn default() -> Self {
         Self {
-            embed_uri: "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf".into(),
-            embed_dim: 256,
+            embed_uri: "hf:leliuga/all-MiniLM-L6-v2-GGUF/all-MiniLM-L6-v2.Q8_0.gguf".into(),
+            embed_dim: 384,
             rerank_uri: "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf"
                 .into(),
             expand_uri: "hf:Qwen/Qwen3-0.6B-GGUF/qwen3-0.6b-q8_0.gguf".into(),
@@ -723,16 +725,104 @@ impl EmbedLayer {
     }
 }
 
+/// Single BERT transformer layer (LayerNorm + absolute positions + GELU FFN).
+#[derive(Debug, Clone)]
+struct BertLayer {
+    attn_q: CandleQMatMul,
+    attn_q_bias: Tensor,
+    attn_k: CandleQMatMul,
+    attn_k_bias: Tensor,
+    attn_v: CandleQMatMul,
+    attn_v_bias: Tensor,
+    attn_output: CandleQMatMul,
+    attn_output_bias: Tensor,
+    attn_output_norm: candle_nn::LayerNorm,
+    ffn_up: CandleQMatMul,
+    ffn_up_bias: Tensor,
+    ffn_down: CandleQMatMul,
+    ffn_down_bias: Tensor,
+    layer_output_norm: candle_nn::LayerNorm,
+    n_head: usize,
+    head_dim: usize,
+}
+
+impl BertLayer {
+    /// Bidirectional forward pass for BERT architecture.
+    fn forward(&self, x: &Tensor) -> candle_core::Result<Tensor> {
+        let (b_sz, seq_len, _hidden) = x.dims3()?;
+
+        // --- Attention block ---
+        let residual = x;
+
+        let q = self.attn_q.forward(x)?.broadcast_add(&self.attn_q_bias)?;
+        let k = self.attn_k.forward(x)?.broadcast_add(&self.attn_k_bias)?;
+        let v = self.attn_v.forward(x)?.broadcast_add(&self.attn_v_bias)?;
+
+        let q = q
+            .reshape((b_sz, seq_len, self.n_head, self.head_dim))?
+            .transpose(1, 2)?
+            .contiguous()?;
+        let k = k
+            .reshape((b_sz, seq_len, self.n_head, self.head_dim))?
+            .transpose(1, 2)?
+            .contiguous()?;
+        let v = v
+            .reshape((b_sz, seq_len, self.n_head, self.head_dim))?
+            .transpose(1, 2)?
+            .contiguous()?;
+
+        // Scaled dot-product attention — BIDIRECTIONAL (no causal mask).
+        let scale = 1.0 / (self.head_dim as f64).sqrt();
+        let attn_weights = (q.matmul(&k.transpose(2, 3)?)? * scale)?;
+        let attn_weights = candle_nn::ops::softmax_last_dim(&attn_weights)?;
+        let attn_output = attn_weights.matmul(&v)?;
+
+        let attn_output =
+            attn_output
+                .transpose(1, 2)?
+                .reshape((b_sz, seq_len, self.n_head * self.head_dim))?;
+        let attn_output = self
+            .attn_output
+            .forward(&attn_output)?
+            .broadcast_add(&self.attn_output_bias)?;
+        let x = self.attn_output_norm.forward(&(residual + attn_output)?)?;
+
+        // --- FFN block (GELU activation) ---
+        let residual = &x;
+        let h = self.ffn_up.forward(&x)?.broadcast_add(&self.ffn_up_bias)?;
+        let h = h.gelu()?;
+        let h = self
+            .ffn_down
+            .forward(&h)?
+            .broadcast_add(&self.ffn_down_bias)?;
+        self.layer_output_norm.forward(&(residual + h)?)
+    }
+}
+
+/// Model variant: Gemma or BERT architecture.
+enum EmbedModelVariant {
+    Gemma {
+        layers: Vec<EmbedLayer>,
+        tok_embeddings: Embedding,
+        norm: candle_transformers::quantized_nn::RmsNorm,
+        embedding_length: usize,
+    },
+    Bert {
+        layers: Vec<BertLayer>,
+        tok_embeddings: Embedding,
+        pos_embeddings: Tensor,
+        embed_norm: candle_nn::LayerNorm,
+        hidden_size: usize,
+    },
+}
+
 /// GGUF embedding model loaded via candle.
 ///
-/// Loads a quantized Gemma-family embedding model (e.g., embeddinggemma-300M)
-/// from a GGUF file and produces dense float vectors via bidirectional attention
-/// + mean pooling + L2 normalization.
+/// Loads a quantized embedding model (Gemma or BERT family) from a GGUF file
+/// and produces dense float vectors via bidirectional attention + mean pooling
+/// + L2 normalization.
 pub struct CandleEmbed {
-    layers: Vec<EmbedLayer>,
-    tok_embeddings: Embedding,
-    norm: candle_transformers::quantized_nn::RmsNorm,
-    embedding_length: usize,
+    variant: EmbedModelVariant,
     tokenizer: FlexTokenizer,
     device: Device,
     dim: usize,
@@ -741,10 +831,23 @@ pub struct CandleEmbed {
 
 impl std::fmt::Debug for CandleEmbed {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let (arch, num_layers, hidden) = match &self.variant {
+            EmbedModelVariant::Gemma {
+                layers,
+                embedding_length,
+                ..
+            } => ("gemma", layers.len(), *embedding_length),
+            EmbedModelVariant::Bert {
+                layers,
+                hidden_size,
+                ..
+            } => ("bert", layers.len(), *hidden_size),
+        };
         f.debug_struct("CandleEmbed")
+            .field("arch", &arch)
             .field("dim", &self.dim)
-            .field("embedding_length", &self.embedding_length)
-            .field("num_layers", &self.layers.len())
+            .field("hidden_size", &hidden)
+            .field("num_layers", &num_layers)
             .field("prompt_format", &self.prompt_format)
             .finish()
     }
@@ -757,8 +860,9 @@ impl CandleEmbed {
     /// 1. Resolve model URI (from config override or `ModelDefaults`)
     /// 2. `ensure_model()` to download if needed
     /// 3. Load tokenizer (try same repo's tokenizer.json, then repo without -GGUF suffix)
-    /// 4. Load GGUF and build layer structs for bidirectional embedding
-    /// 5. Detect prompt format from filename
+    /// 4. Detect architecture from GGUF metadata (`general.architecture`)
+    /// 5. Load GGUF and build layer structs for bidirectional embedding
+    /// 6. Detect prompt format from filename
     pub fn new(models_dir: &Path, config: &crate::config::Config) -> Result<Self> {
         let defaults = ModelDefaults::default();
         let uri_str = config
@@ -778,24 +882,46 @@ impl CandleEmbed {
         // Target output dimensionality.
         let dim = defaults.embed_dim;
 
-        // Load GGUF and build model.
+        // Detect architecture from GGUF metadata and load accordingly.
         let device = select_device()?;
-        let (layers, tok_embeddings, norm, embedding_length) =
-            Self::load_gguf(&model_path, &device)?;
+        let arch = Self::detect_architecture(&model_path)?;
+
+        let variant = if arch.contains("bert") {
+            Self::load_gguf_bert(&model_path, &device)?
+        } else {
+            let (layers, tok_embeddings, norm, embedding_length) =
+                Self::load_gguf_gemma(&model_path, &device)?;
+            EmbedModelVariant::Gemma {
+                layers,
+                tok_embeddings,
+                norm,
+                embedding_length,
+            }
+        };
 
+        let (arch_name, num_layers, hidden) = match &variant {
+            EmbedModelVariant::Gemma {
+                layers,
+                embedding_length,
+                ..
+            } => ("gemma", layers.len(), *embedding_length),
+            EmbedModelVariant::Bert {
+                layers,
+                hidden_size,
+                ..
+            } => ("bert", layers.len(), *hidden_size),
+        };
         tracing::info!(
-            "loaded CandleEmbed: {} layers, embedding_length={}, target_dim={}, device={:?}",
-            layers.len(),
-            embedding_length,
+            "loaded CandleEmbed: arch={}, {} layers, hidden_size={}, target_dim={}, device={:?}",
+            arch_name,
+            num_layers,
+            hidden,
             dim,
             device
         );
 
         Ok(Self {
-            layers,
-            tok_embeddings,
-            norm,
-            embedding_length,
+            variant,
             tokenizer,
             device,
             dim,
@@ -803,8 +929,34 @@ impl CandleEmbed {
         })
     }
 
-    /// Load GGUF file and construct layer structs for bidirectional embedding.
-    fn load_gguf(
+    /// Read `general.architecture` from GGUF metadata to determine the model family.
+    fn detect_architecture(path: &Path) -> Result<String> {
+        use candle_core::quantized::gguf_file;
+
+        let mut file = std::fs::File::open(path)
+            .map_err(|e| anyhow::anyhow!("opening GGUF {}: {e}", path.display()))?;
+        let ct = gguf_file::Content::read(&mut file)
+            .map_err(|e| anyhow::anyhow!("reading GGUF {}: {e}", path.display()))?;
+
+        // Look for `general.architecture` in metadata.
+        if let Some(val) = ct.metadata.get("general.architecture") {
+            let arch = val
+                .to_string()
+                .map_err(|e| anyhow::anyhow!("reading general.architecture: {e}"))?;
+            Ok(arch.to_lowercase())
+        } else {
+            // Fallback: probe for known architecture prefixes.
+            let has_bert = ct.metadata.contains_key("bert.attention.head_count");
+            if has_bert {
+                Ok("bert".to_string())
+            } else {
+                Ok("gemma".to_string())
+            }
+        }
+    }
+
+    /// Load GGUF file and construct Gemma-family layer structs for bidirectional embedding.
+    fn load_gguf_gemma(
         path: &Path,
         device: &Device,
     ) -> Result<(
@@ -972,6 +1124,155 @@ impl CandleEmbed {
         Ok((sin, cos))
     }
 
+    /// Load GGUF file and construct BERT-family layer structs for bidirectional embedding.
+    fn load_gguf_bert(path: &Path, device: &Device) -> Result<EmbedModelVariant> {
+        use candle_core::quantized::gguf_file;
+
+        let mut file = std::fs::File::open(path)
+            .map_err(|e| anyhow::anyhow!("opening GGUF {}: {e}", path.display()))?;
+        let ct = gguf_file::Content::read(&mut file)
+            .map_err(|e| anyhow::anyhow!("reading GGUF {}: {e}", path.display()))?;
+
+        // Read BERT hyperparameters from metadata.
+        let md_get = |s: &str| -> Result<&gguf_file::Value> {
+            ct.metadata
+                .get(s)
+                .ok_or_else(|| anyhow::anyhow!("cannot find {s} in GGUF metadata"))
+        };
+
+        let head_count = md_get("bert.attention.head_count")?
+            .to_u32()
+            .map_err(|e| anyhow::anyhow!("{e}"))? as usize;
+        let block_count = md_get("bert.block_count")?
+            .to_u32()
+            .map_err(|e| anyhow::anyhow!("{e}"))? as usize;
+        let hidden_size = md_get("bert.embedding_length")?
+            .to_u32()
+            .map_err(|e| anyhow::anyhow!("{e}"))? as usize;
+        let layer_norm_eps = md_get("bert.attention.layer_norm_epsilon")
+            .and_then(|v| v.to_f32().map_err(|e| anyhow::anyhow!("{e}")))
+            .unwrap_or(1e-12) as f64;
+
+        let head_dim = hidden_size / head_count;
+
+        // Load token embeddings.
+        let tok_embd = ct
+            .tensor(&mut file, "token_embd.weight", device)
+            .map_err(|e| anyhow::anyhow!("loading token_embd.weight: {e}"))?;
+        let tok_embd_deq = tok_embd
+            .dequantize(device)
+            .map_err(|e| anyhow::anyhow!("dequantizing token_embd: {e}"))?;
+        let tok_embeddings = Embedding::new(tok_embd_deq, hidden_size);
+
+        // Load absolute position embeddings.
+        let pos_embd = ct
+            .tensor(&mut file, "position_embd.weight", device)
+            .map_err(|e| anyhow::anyhow!("loading position_embd.weight: {e}"))?;
+        let pos_embeddings = pos_embd
+            .dequantize(device)
+            .map_err(|e| anyhow::anyhow!("dequantizing position_embd: {e}"))?;
+
+        // Load embedding LayerNorm (post token+position embeddings).
+        let embed_norm =
+            Self::load_layer_norm(&ct, &mut file, "token_embd_norm", layer_norm_eps, device)?;
+
+        // Load transformer layers.
+        let mut layers = Vec::with_capacity(block_count);
+        for idx in 0..block_count {
+            let p = format!("blk.{idx}");
+
+            // Helper: load a quantized weight tensor as QMatMul.
+            macro_rules! load_q {
+                ($name:expr) => {{
+                    let full = format!("{}.{}", p, $name);
+                    let qt = ct
+                        .tensor(&mut file, &full, device)
+                        .map_err(|e| anyhow::anyhow!("loading {full}: {e}"))?;
+                    CandleQMatMul::from_qtensor(qt)
+                        .map_err(|e| anyhow::anyhow!("QMatMul for {full}: {e}"))?
+                }};
+            }
+
+            // Helper: load a bias tensor (dequantized to f32).
+            macro_rules! load_bias {
+                ($name:expr) => {{
+                    let full = format!("{}.{}", p, $name);
+                    ct.tensor(&mut file, &full, device)
+                        .map_err(|e| anyhow::anyhow!("loading {full}: {e}"))?
+                        .dequantize(device)
+                        .map_err(|e| anyhow::anyhow!("dequantizing {full}: {e}"))?
+                }};
+            }
+
+            let attn_output_norm = Self::load_layer_norm(
+                &ct,
+                &mut file,
+                &format!("{p}.attn_output_norm"),
+                layer_norm_eps,
+                device,
+            )?;
+            let layer_output_norm = Self::load_layer_norm(
+                &ct,
+                &mut file,
+                &format!("{p}.layer_output_norm"),
+                layer_norm_eps,
+                device,
+            )?;
+
+            layers.push(BertLayer {
+                attn_q: load_q!("attn_q.weight"),
+                attn_q_bias: load_bias!("attn_q.bias"),
+                attn_k: load_q!("attn_k.weight"),
+                attn_k_bias: load_bias!("attn_k.bias"),
+                attn_v: load_q!("attn_v.weight"),
+                attn_v_bias: load_bias!("attn_v.bias"),
+                attn_output: load_q!("attn_output.weight"),
+                attn_output_bias: load_bias!("attn_output.bias"),
+                attn_output_norm,
+                ffn_up: load_q!("ffn_up.weight"),
+                ffn_up_bias: load_bias!("ffn_up.bias"),
+                ffn_down: load_q!("ffn_down.weight"),
+                ffn_down_bias: load_bias!("ffn_down.bias"),
+                layer_output_norm,
+                n_head: head_count,
+                head_dim,
+            });
+        }
+
+        Ok(EmbedModelVariant::Bert {
+            layers,
+            tok_embeddings,
+            pos_embeddings,
+            embed_norm,
+            hidden_size,
+        })
+    }
+
+    /// Load a LayerNorm with weight and bias from GGUF tensors.
+    fn load_layer_norm(
+        ct: &candle_core::quantized::gguf_file::Content,
+        file: &mut std::fs::File,
+        prefix: &str,
+        eps: f64,
+        device: &Device,
+    ) -> Result<candle_nn::LayerNorm> {
+        let weight_name = format!("{prefix}.weight");
+        let bias_name = format!("{prefix}.bias");
+
+        let weight = ct
+            .tensor(file, &weight_name, device)
+            .map_err(|e| anyhow::anyhow!("loading {weight_name}: {e}"))?
+            .dequantize(device)
+            .map_err(|e| anyhow::anyhow!("dequantizing {weight_name}: {e}"))?;
+        let bias = ct
+            .tensor(file, &bias_name, device)
+            .map_err(|e| anyhow::anyhow!("loading {bias_name}: {e}"))?
+            .dequantize(device)
+            .map_err(|e| anyhow::anyhow!("dequantizing {bias_name}: {e}"))?;
+
+        Ok(candle_nn::LayerNorm::new(weight, bias, eps))
+    }
+
     /// Run a bidirectional forward pass and return the mean-pooled, truncated,
     /// L2-normalized embedding.
     fn embed_text(&self, text: &str) -> Result<Vec<f32>> {
@@ -980,31 +1281,71 @@ impl CandleEmbed {
             bail!("tokenizer returned empty token sequence");
         }
 
-        let input = Tensor::new(token_ids, &self.device)
+        let input = Tensor::new(token_ids.as_slice(), &self.device)
             .map_err(|e| anyhow::anyhow!("creating input tensor: {e}"))?
             .unsqueeze(0)
             .map_err(|e| anyhow::anyhow!("unsqueeze: {e}"))?;
 
-        // Token embeddings, scaled by sqrt(embedding_length) (Gemma convention).
-        let mut hidden = self
-            .tok_embeddings
-            .forward(&input)
-            .map_err(|e| anyhow::anyhow!("token embedding forward: {e}"))?;
-        hidden = (hidden * (self.embedding_length as f64).sqrt())
-            .map_err(|e| anyhow::anyhow!("scaling embeddings: {e}"))?;
-
-        // Forward through all transformer layers (bidirectional — no causal mask).
-        for layer in &self.layers {
-            hidden = layer
-                .forward(&hidden)
-                .map_err(|e| anyhow::anyhow!("layer forward: {e}"))?;
-        }
+        let hidden = match &self.variant {
+            EmbedModelVariant::Gemma {
+                layers,
+                tok_embeddings,
+                norm,
+                embedding_length,
+            } => {
+                // Token embeddings, scaled by sqrt(embedding_length) (Gemma convention).
+                let mut h = tok_embeddings
+                    .forward(&input)
+                    .map_err(|e| anyhow::anyhow!("token embedding forward: {e}"))?;
+                h = (h * (*embedding_length as f64).sqrt())
+                    .map_err(|e| anyhow::anyhow!("scaling embeddings: {e}"))?;
+
+                for layer in layers {
+                    h = layer
+                        .forward(&h)
+                        .map_err(|e| anyhow::anyhow!("layer forward: {e}"))?;
+                }
+
+                norm.forward(&h)
+                    .map_err(|e| anyhow::anyhow!("final norm: {e}"))?
+            }
+            EmbedModelVariant::Bert {
+                layers,
+                tok_embeddings,
+                pos_embeddings,
+                embed_norm,
+                ..
+            } => {
+                // Token embeddings + absolute position embeddings.
+                let seq_len = token_ids.len();
+                let tok_emb = tok_embeddings
+                    .forward(&input)
+                    .map_err(|e| anyhow::anyhow!("token embedding forward: {e}"))?;
+
+                // Slice position embeddings to seq_len: [max_pos, hidden] -> [seq_len, hidden].
+                let pos_emb = pos_embeddings
+                    .narrow(0, 0, seq_len)
+                    .map_err(|e| anyhow::anyhow!("position embedding slice: {e}"))?
+                    .unsqueeze(0)
+                    .map_err(|e| anyhow::anyhow!("position embedding unsqueeze: {e}"))?;
+
+                let mut h =
+                    (tok_emb + pos_emb).map_err(|e| anyhow::anyhow!("embedding addition: {e}"))?;
+                h = embed_norm
+                    .forward(&h)
+                    .map_err(|e| anyhow::anyhow!("embedding norm: {e}"))?;
+
+                for layer in layers {
+                    h = layer
+                        .forward(&h)
+                        .map_err(|e| anyhow::anyhow!("layer forward: {e}"))?;
+                }
 
-        // Final layer norm.
-        hidden = self
-            .norm
-            .forward(&hidden)
-            .map_err(|e| anyhow::anyhow!("final norm: {e}"))?;
+                // BERT does not have a final norm after the last layer
+                // (the per-layer norms already handle it).
+                h
+            }
+        };
 
         // Mean pool across sequence dimension: [1, seq_len, hidden] -> [1, hidden].
         let seq_len = hidden
@@ -1650,7 +1991,11 @@ mod tests {
     fn test_model_defaults() {
         let defaults = ModelDefaults::default();
         assert!(defaults.embed_uri.starts_with("hf:"));
-        assert_eq!(defaults.embed_dim, 256);
+        assert_eq!(defaults.embed_dim, 384);
+        assert!(
+            defaults.embed_uri.contains("all-MiniLM-L6-v2"),
+            "default embed model should be all-MiniLM-L6-v2-GGUF"
+        );
     }
 
     // ── CandleEmbed / PromptFormat tests ────────────────────────────────────
diff --git a/src/store.rs b/src/store.rs
index fe8f1bd..8214c37 100644
--- a/src/store.rs
+++ b/src/store.rs
@@ -141,11 +141,11 @@ impl Store {
             .context("failed to initialize schema")?;
         self.migrate()?;
         self.ensure_fts_table()?;
-        // Use stored embedding dimension if available, defaulting to 256 for new databases.
+        // Use stored embedding dimension if available, defaulting to 384 for new databases.
         let dim = self
             .get_meta("embedding_dim")?
             .and_then(|s| s.parse::<usize>().ok())
-            .unwrap_or(256);
+            .unwrap_or(384);
         crate::vecstore::init_vec_table(&self.conn, dim)?;
         self.migrate_vectors_to_vec0()?;
         Ok(())
@@ -2264,7 +2264,7 @@ mod tests {
     #[test]
     fn test_store_vec_roundtrip() {
         let store = Store::open_memory().unwrap();
-        let vector: Vec<f32> = (0..256).map(|i| (i as f32) / 256.0).collect();
+        let vector: Vec<f32> = (0..384).map(|i| (i as f32) / 384.0).collect();
         store.insert_vec(0, &vector).unwrap();
 
         let results = store
@@ -2282,7 +2282,7 @@ mod tests {
         let file_id = store
             .insert_file("test.md", "hash123", 0, &[], "abc123", None)
             .unwrap();
-        let vector: Vec<f32> = (0..256).map(|i| (i as f32) / 256.0).collect();
+        let vector: Vec<f32> = (0..384).map(|i| (i as f32) / 384.0).collect();
         store
             .insert_chunk_with_vector(file_id, "heading", "snippet", 0, 100, &vector)
             .unwrap();

From 7171792436e1fa7725ad37738f879681767c70c0 Mon Sep 17 00:00:00 2001
From: Oleksandr Ostrovskyi <devwhodevs@gmail.com>
Date: Wed, 25 Mar 2026 21:25:13 +0200
Subject: [PATCH 07/17] feat: add accelerate feature flag for optimized CPU on
 macOS

---
 Cargo.lock | 8 ++++++++
 Cargo.toml | 1 +
 2 files changed, 9 insertions(+)

diff --git a/Cargo.lock b/Cargo.lock
index c1c6967..90eb8ec 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,12 @@
 # It is not intended for manual editing.
 version = 4
 
+[[package]]
+name = "accelerate-src"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "415ed64958754dbe991900f3940677e6a7eefb4d7367afd70d642677b0c7d19d"
+
 [[package]]
 name = "adler2"
 version = "2.0.1"
@@ -262,6 +268,7 @@ version = "0.9.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c15b675b80d994b2eadb20a4bbe434eabeb454eac3ee5e2b4cf6f147ee9be091"
 dependencies = [
+ "accelerate-src",
  "byteorder",
  "candle-kernels",
  "candle-metal-kernels",
@@ -270,6 +277,7 @@ dependencies = [
  "float8 0.6.1",
  "gemm 0.19.0",
  "half",
+ "libc",
  "libm",
  "memmap2",
  "num-traits",
diff --git a/Cargo.toml b/Cargo.toml
index 42b1d46..e765437 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -42,6 +42,7 @@ shimmytok = "0.7"
 [features]
 default = []
 metal = ["candle-core/metal"]
+accelerate = ["candle-core/accelerate"]
 cuda = ["candle-core/cuda"]
 
 [dev-dependencies]

From 4892309c3661a7e295b4037a6f53ac11b87f745b Mon Sep 17 00:00:00 2001
From: Oleksandr Ostrovskyi <devwhodevs@gmail.com>
Date: Wed, 25 Mar 2026 21:52:02 +0200
Subject: [PATCH 08/17] fix: add indexing progress output, fix Qwen3 GGUF
 filename case

- Print [N/M] file progress during indexing (was silent for minutes)
- Fix expand model URI filename case: Qwen3-0.6B-Q8_0.gguf (the all-lowercase name returned 404)
- (accelerate feature flag for Apple vecLib optimization was added in the previous commit)
---
 src/indexer.rs | 5 ++++-
 src/llm.rs     | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/indexer.rs b/src/indexer.rs
index d382945..309edcf 100644
--- a/src/indexer.rs
+++ b/src/indexer.rs
@@ -561,12 +561,15 @@ fn run_index_inner(
     let mut total_chunks = 0usize;
     let mut indexed_rel_paths: Vec<String> = Vec::new();
 
+    let total_files = file_contents.len();
     store.conn().execute_batch("BEGIN DEFERRED")?;
-    for (rel_str, content, hash) in &file_contents {
+    for (i, (rel_str, content, hash)) in file_contents.iter().enumerate() {
+        eprint!("\r  [{}/{}] {}", i + 1, total_files, rel_str);
         let result = index_file(rel_str, content, hash, store, embedder, vault_path, config)?;
         total_chunks += result.total_chunks;
         indexed_rel_paths.push(rel_str.clone());
     }
+    eprintln!("\r  [{}/{}] done{}", total_files, total_files, " ".repeat(60));
     store.commit()?;
 
     // Step 9: Build vault graph edges.
diff --git a/src/llm.rs b/src/llm.rs
index 5b42f79..bdbdcf9 100644
--- a/src/llm.rs
+++ b/src/llm.rs
@@ -599,7 +599,7 @@ impl Default for ModelDefaults {
             embed_dim: 384,
             rerank_uri: "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf"
                 .into(),
-            expand_uri: "hf:Qwen/Qwen3-0.6B-GGUF/qwen3-0.6b-q8_0.gguf".into(),
+            expand_uri: "hf:Qwen/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf".into(),
         }
     }
 }

From 20be487d8af576476f4515affa1e8586e7dea2ca Mon Sep 17 00:00:00 2001
From: Oleksandr Ostrovskyi <devwhodevs@gmail.com>
Date: Wed, 25 Mar 2026 21:59:56 +0200
Subject: [PATCH 09/17] fix: use float32 RmsNorm for Metal GPU compatibility in
 Gemma embedding

Replace candle_transformers::quantized_nn::RmsNorm (which lacks a Metal
kernel) with candle_nn::RmsNorm throughout the Gemma embedding code.
QTensor weights are dequantized to f32 Tensor at load time so the
standard RmsNorm forward pass runs on Metal without error.

Also restores embeddinggemma as the default model (256-dim), replaces
eprint indexing progress with an indicatif progress bar, and fixes
store tests to match the new default dimension.
---
 src/indexer.rs | 16 ++++++++++++----
 src/llm.rs     | 42 +++++++++++++++++++++++-------------------
 src/store.rs   |  6 +++---
 3 files changed, 38 insertions(+), 26 deletions(-)

diff --git a/src/indexer.rs b/src/indexer.rs
index 309edcf..1f91a80 100644
--- a/src/indexer.rs
+++ b/src/indexer.rs
@@ -5,6 +5,7 @@ use std::time::{Duration, Instant};
 use anyhow::{Context, Result, anyhow};
 use ignore::WalkBuilder;
 use sha2::{Digest, Sha256};
+use indicatif::{ProgressBar, ProgressStyle};
 use tracing::info;
 
 use crate::chunker::{chunk_markdown, split_oversized_chunks};
@@ -561,15 +562,22 @@ fn run_index_inner(
     let mut total_chunks = 0usize;
     let mut indexed_rel_paths: Vec<String> = Vec::new();
 
-    let total_files = file_contents.len();
+    let pb = ProgressBar::new(file_contents.len() as u64);
+    pb.set_style(
+        ProgressStyle::with_template("  [{bar:40.cyan/blue}] {pos}/{len} {msg} ({eta})")
+            .unwrap()
+            .progress_chars("=>-"),
+    );
+
     store.conn().execute_batch("BEGIN DEFERRED")?;
-    for (i, (rel_str, content, hash)) in file_contents.iter().enumerate() {
-        eprint!("\r  [{}/{}] {}", i + 1, total_files, rel_str);
+    for (rel_str, content, hash) in &file_contents {
+        pb.set_message(rel_str.clone());
         let result = index_file(rel_str, content, hash, store, embedder, vault_path, config)?;
         total_chunks += result.total_chunks;
         indexed_rel_paths.push(rel_str.clone());
+        pb.inc(1);
     }
-    eprintln!("\r  [{}/{}] done{}", total_files, total_files, " ".repeat(60));
+    pb.finish_with_message("done");
     store.commit()?;
 
     // Step 9: Build vault graph edges.
diff --git a/src/llm.rs b/src/llm.rs
index bdbdcf9..069470a 100644
--- a/src/llm.rs
+++ b/src/llm.rs
@@ -595,8 +595,8 @@ pub struct ModelDefaults {
 impl Default for ModelDefaults {
     fn default() -> Self {
         Self {
-            embed_uri: "hf:leliuga/all-MiniLM-L6-v2-GGUF/all-MiniLM-L6-v2.Q8_0.gguf".into(),
-            embed_dim: 384,
+            embed_uri: "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf".into(),
+            embed_dim: 256,
             rerank_uri: "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf"
                 .into(),
             expand_uri: "hf:Qwen/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf".into(),
@@ -630,12 +630,12 @@ struct EmbedLayer {
     attention_wk: CandleQMatMul,
     attention_wv: CandleQMatMul,
     attention_wo: CandleQMatMul,
-    attention_q_norm: candle_transformers::quantized_nn::RmsNorm,
-    attention_k_norm: candle_transformers::quantized_nn::RmsNorm,
-    attention_norm: candle_transformers::quantized_nn::RmsNorm,
-    post_attention_norm: candle_transformers::quantized_nn::RmsNorm,
-    ffn_norm: candle_transformers::quantized_nn::RmsNorm,
-    post_ffn_norm: candle_transformers::quantized_nn::RmsNorm,
+    attention_q_norm: candle_nn::RmsNorm,
+    attention_k_norm: candle_nn::RmsNorm,
+    attention_norm: candle_nn::RmsNorm,
+    post_attention_norm: candle_nn::RmsNorm,
+    ffn_norm: candle_nn::RmsNorm,
+    post_ffn_norm: candle_nn::RmsNorm,
     ffn_gate: CandleQMatMul,
     ffn_up: CandleQMatMul,
     ffn_down: CandleQMatMul,
@@ -804,7 +804,7 @@ enum EmbedModelVariant {
     Gemma {
         layers: Vec<EmbedLayer>,
         tok_embeddings: Embedding,
-        norm: candle_transformers::quantized_nn::RmsNorm,
+        norm: candle_nn::RmsNorm,
         embedding_length: usize,
     },
     Bert {
@@ -962,7 +962,7 @@ impl CandleEmbed {
     ) -> Result<(
         Vec<EmbedLayer>,
         Embedding,
-        candle_transformers::quantized_nn::RmsNorm,
+        candle_nn::RmsNorm,
         usize,
     )> {
         use candle_core::quantized::gguf_file;
@@ -1027,12 +1027,14 @@ impl CandleEmbed {
             .map_err(|e| anyhow::anyhow!("dequantizing token_embd: {e}"))?;
         let tok_embeddings = Embedding::new(tok_embd_deq, embedding_length);
 
-        // Final norm.
+        // Final norm (dequantize to f32 for Metal compatibility).
         let norm_qt = ct
             .tensor(&mut file, "output_norm.weight", device)
             .map_err(|e| anyhow::anyhow!("loading output_norm.weight: {e}"))?;
-        let norm = candle_transformers::quantized_nn::RmsNorm::from_qtensor(norm_qt, rms_norm_eps)
-            .map_err(|e| anyhow::anyhow!("creating RmsNorm: {e}"))?;
+        let norm_weight = norm_qt
+            .dequantize(device)
+            .map_err(|e| anyhow::anyhow!("dequantizing output_norm.weight: {e}"))?;
+        let norm = candle_nn::RmsNorm::new(norm_weight, rms_norm_eps);
 
         // Load transformer layers.
         let mut layers = Vec::with_capacity(block_count);
@@ -1051,15 +1053,17 @@ impl CandleEmbed {
                 }};
             }
 
-            // Helper: load a norm weight tensor as RmsNorm.
+            // Helper: load a norm weight tensor as RmsNorm (dequantize for Metal).
             macro_rules! load_norm {
                 ($name:expr) => {{
                     let full = format!("{}.{}", p, $name);
                     let qt = ct
                         .tensor(&mut file, &full, device)
                         .map_err(|e| anyhow::anyhow!("loading {full}: {e}"))?;
-                    candle_transformers::quantized_nn::RmsNorm::from_qtensor(qt, rms_norm_eps)
-                        .map_err(|e| anyhow::anyhow!("RmsNorm for {full}: {e}"))?
+                    let weight = qt
+                        .dequantize(device)
+                        .map_err(|e| anyhow::anyhow!("dequantizing {full}: {e}"))?;
+                    candle_nn::RmsNorm::new(weight, rms_norm_eps)
                 }};
             }
 
@@ -1991,10 +1995,10 @@ mod tests {
     fn test_model_defaults() {
         let defaults = ModelDefaults::default();
         assert!(defaults.embed_uri.starts_with("hf:"));
-        assert_eq!(defaults.embed_dim, 384);
+        assert_eq!(defaults.embed_dim, 256);
         assert!(
-            defaults.embed_uri.contains("all-MiniLM-L6-v2"),
-            "default embed model should be all-MiniLM-L6-v2-GGUF"
+            defaults.embed_uri.contains("embeddinggemma"),
+            "default embed model should be embeddinggemma"
         );
     }
 
diff --git a/src/store.rs b/src/store.rs
index 8214c37..30f0088 100644
--- a/src/store.rs
+++ b/src/store.rs
@@ -145,7 +145,7 @@ impl Store {
         let dim = self
             .get_meta("embedding_dim")?
             .and_then(|s| s.parse::<usize>().ok())
-            .unwrap_or(384);
+            .unwrap_or(256);
         crate::vecstore::init_vec_table(&self.conn, dim)?;
         self.migrate_vectors_to_vec0()?;
         Ok(())
@@ -2264,7 +2264,7 @@ mod tests {
     #[test]
     fn test_store_vec_roundtrip() {
         let store = Store::open_memory().unwrap();
-        let vector: Vec<f32> = (0..384).map(|i| (i as f32) / 384.0).collect();
+        let vector: Vec<f32> = (0..256).map(|i| (i as f32) / 256.0).collect();
         store.insert_vec(0, &vector).unwrap();
 
         let results = store
@@ -2282,7 +2282,7 @@ mod tests {
         let file_id = store
             .insert_file("test.md", "hash123", 0, &[], "abc123", None)
             .unwrap();
-        let vector: Vec<f32> = (0..384).map(|i| (i as f32) / 384.0).collect();
+        let vector: Vec<f32> = (0..256).map(|i| (i as f32) / 256.0).collect();
         store
             .insert_chunk_with_vector(file_id, "heading", "snippet", 0, 100, &vector)
             .unwrap();

From ebb814b2e5e795b585f410ac261f0fbdaa16dd60 Mon Sep 17 00:00:00 2001
From: Oleksandr Ostrovskyi <devwhodevs@gmail.com>
Date: Wed, 25 Mar 2026 23:24:38 +0200
Subject: [PATCH 10/17] refactor(llm): replace candle backend with llama-cpp-2
 for Metal GPU support

candle lacks Metal kernels for quantized GGUF models (rms-norm, QMatMul).
llama.cpp has mature Metal support and auto-detects GPU at build time.

- Replace candle-core/candle-nn/candle-transformers with llama-cpp-2
- CandleEmbed -> LlamaEmbed, CandleOrchestrator -> LlamaOrchestrator,
  CandleRerank -> LlamaRerank
- Remove select_device(), CandleQMatMul, EmbedLayer, BertLayer,
  EmbedModelVariant (llama.cpp handles all model loading internally)
- Remove metal/accelerate/cuda feature flags (llama.cpp handles GPU
  detection at CMake build time)
- LlamaContext is !Send so contexts are created per-call from the
  stored LlamaModel (which is Send+Sync)
- Public API unchanged: traits, MockLlm, download infra, FlexTokenizer,
  PromptFormat, heuristic_orchestrate all preserved
- 270 tests pass (net -1: removed select_device test)
---
 Cargo.lock     | 1107 ++++++---------------------------------------
 Cargo.toml     |    7 +-
 src/indexer.rs |    2 +-
 src/llm.rs     | 1160 ++++++++++--------------------------------------
 src/main.rs    |    4 +-
 src/search.rs  |    2 +-
 src/serve.rs   |    6 +-
 7 files changed, 373 insertions(+), 1915 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 90eb8ec..655637d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,12 +2,6 @@
 # It is not intended for manual editing.
 version = 4
 
-[[package]]
-name = "accelerate-src"
-version = "0.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "415ed64958754dbe991900f3940677e6a7eefb4d7367afd70d642677b0c7d19d"
-
 [[package]]
 name = "adler2"
 version = "2.0.1"
@@ -37,12 +31,6 @@ dependencies = [
  "memchr",
 ]
 
-[[package]]
-name = "allocator-api2"
-version = "0.2.21"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
-
 [[package]]
 name = "android_system_properties"
 version = "0.1.5"
@@ -138,14 +126,23 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
 
 [[package]]
-name = "bindgen_cuda"
-version = "0.1.6"
+name = "bindgen"
+version = "0.72.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "282be55fb326843bb67cccceeeaf21c961ef303f60018f9a2ab69494dad8eaf9"
+checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
 dependencies = [
- "glob",
- "num_cpus",
- "rayon",
+ "bitflags 2.11.0",
+ "cexpr",
+ "clang-sys",
+ "itertools 0.13.0",
+ "log",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash",
+ "shlex",
+ "syn",
 ]
 
 [[package]]
@@ -190,12 +187,6 @@ version = "2.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af"
 
-[[package]]
-name = "block"
-version = "0.1.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d8c1fef690941d3e7788d328517591fecc684c084084702d6ff1641e993699a"
-
 [[package]]
 name = "block-buffer"
 version = "0.10.4"
@@ -205,15 +196,6 @@ dependencies = [
  "generic-array",
 ]
 
-[[package]]
-name = "block2"
-version = "0.6.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cdeb9d870516001442e364c5220d3574d2da8dc765554b4a617230d33fa58ef5"
-dependencies = [
- "objc2",
-]
-
 [[package]]
 name = "bstr"
 version = "1.12.1"
@@ -230,26 +212,6 @@ version = "3.20.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
 
-[[package]]
-name = "bytemuck"
-version = "1.25.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec"
-dependencies = [
- "bytemuck_derive",
-]
-
-[[package]]
-name = "bytemuck_derive"
-version = "1.10.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
 [[package]]
 name = "byteorder"
 version = "1.5.0"
@@ -262,107 +224,6 @@ version = "1.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
 
-[[package]]
-name = "candle-core"
-version = "0.9.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c15b675b80d994b2eadb20a4bbe434eabeb454eac3ee5e2b4cf6f147ee9be091"
-dependencies = [
- "accelerate-src",
- "byteorder",
- "candle-kernels",
- "candle-metal-kernels",
- "candle-ug",
- "cudarc 0.19.4",
- "float8 0.6.1",
- "gemm 0.19.0",
- "half",
- "libc",
- "libm",
- "memmap2",
- "num-traits",
- "num_cpus",
- "objc2-foundation",
- "objc2-metal",
- "rand",
- "rand_distr",
- "rayon",
- "safetensors 0.7.0",
- "thiserror 2.0.18",
- "yoke 0.8.1",
- "zip",
-]
-
-[[package]]
-name = "candle-kernels"
-version = "0.9.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b8455f84bd810047c7c41216683c1020c915a9f8a740b3b0eabdd4fb2fbaa660"
-dependencies = [
- "bindgen_cuda",
-]
-
-[[package]]
-name = "candle-metal-kernels"
-version = "0.9.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2fdfe9d06de16ce49961e49084e5b79a75a9bdf157246e7c7b6328e87a7aa25d"
-dependencies = [
- "half",
- "objc2",
- "objc2-foundation",
- "objc2-metal",
- "once_cell",
- "thiserror 2.0.18",
- "tracing",
-]
-
-[[package]]
-name = "candle-nn"
-version = "0.9.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3045fa9e7aef8567d209a27d56b692f60b96f4d0569f4c3011f8ca6715c65e03"
-dependencies = [
- "candle-core",
- "half",
- "libc",
- "num-traits",
- "rayon",
- "safetensors 0.7.0",
- "serde",
- "thiserror 2.0.18",
-]
-
-[[package]]
-name = "candle-transformers"
-version = "0.9.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b538ec4aa807c416a2ddd3621044888f188827862e2a6fcacba4738e89795d01"
-dependencies = [
- "byteorder",
- "candle-core",
- "candle-nn",
- "fancy-regex 0.17.0",
- "num-traits",
- "rand",
- "rayon",
- "serde",
- "serde_json",
- "serde_plain",
- "tracing",
-]
-
-[[package]]
-name = "candle-ug"
-version = "0.9.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c22d62be69068bf58987a45f690612739d8d2ea1bf508c1b87dc6815a019575d"
-dependencies = [
- "ug",
- "ug-cuda",
- "ug-metal",
-]
-
 [[package]]
 name = "castaway"
 version = "0.2.4"
@@ -379,9 +240,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423"
 dependencies = [
  "find-msvc-tools",
+ "jobserver",
+ "libc",
  "shlex",
 ]
 
+[[package]]
+name = "cexpr"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
+dependencies = [
+ "nom",
+]
+
 [[package]]
 name = "cfg-if"
 version = "1.0.4"
@@ -402,6 +274,17 @@ dependencies = [
  "windows-link",
 ]
 
+[[package]]
+name = "clang-sys"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
+dependencies = [
+ "glob",
+ "libc",
+ "libloading",
+]
+
 [[package]]
 name = "clap"
 version = "4.6.0"
@@ -442,6 +325,15 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
 
+[[package]]
+name = "cmake"
+version = "0.1.57"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "colorchoice"
 version = "1.0.5"
@@ -476,33 +368,12 @@ dependencies = [
  "windows-sys 0.59.0",
 ]
 
-[[package]]
-name = "core-foundation"
-version = "0.9.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f"
-dependencies = [
- "core-foundation-sys",
- "libc",
-]
-
 [[package]]
 name = "core-foundation-sys"
 version = "0.8.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
 
-[[package]]
-name = "core-graphics-types"
-version = "0.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "45390e6114f68f718cc7a830514a96f903cccd70d02a8f6d9f643ac4ba45afaf"
-dependencies = [
- "bitflags 1.3.2",
- "core-foundation",
- "libc",
-]
-
 [[package]]
 name = "cpufeatures"
 version = "0.2.17"
@@ -546,12 +417,6 @@ version = "0.8.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
 
-[[package]]
-name = "crunchy"
-version = "0.2.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
-
 [[package]]
 name = "crypto-common"
 version = "0.1.7"
@@ -562,27 +427,6 @@ dependencies = [
  "typenum",
 ]
 
-[[package]]
-name = "cudarc"
-version = "0.17.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0bf99ab37ee7072d64d906aa2dada9a3422f1d975cdf8c8055a573bc84897ed8"
-dependencies = [
- "half",
- "libloading 0.8.9",
-]
-
-[[package]]
-name = "cudarc"
-version = "0.19.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f071cd6a7b5d51607df76aa2d426aaabc7a74bc6bdb885b8afa63a880572ad9b"
-dependencies = [
- "float8 0.7.0",
- "half",
- "libloading 0.9.0",
-]
-
 [[package]]
 name = "darling"
 version = "0.20.11"
@@ -732,16 +576,6 @@ dependencies = [
  "windows-sys 0.48.0",
 ]
 
-[[package]]
-name = "dispatch2"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38"
-dependencies = [
- "bitflags 2.11.0",
- "objc2",
-]
-
 [[package]]
 name = "displaydoc"
 version = "0.2.5"
@@ -759,22 +593,6 @@ version = "1.0.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555"
 
-[[package]]
-name = "dyn-stack"
-version = "0.13.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1c4713e43e2886ba72b8271aa66c93d722116acf7a75555cce11dcde84388fe8"
-dependencies = [
- "bytemuck",
- "dyn-stack-macros",
-]
-
-[[package]]
-name = "dyn-stack-macros"
-version = "0.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e1d926b4d407d372f141f93bb444696142c29d32962ccbd3531117cf3aa0bfa9"
-
 [[package]]
 name = "either"
 version = "1.15.0"
@@ -787,18 +605,25 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
 
+[[package]]
+name = "encoding_rs"
+version = "0.8.35"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
+dependencies = [
+ "cfg-if",
+]
+
 [[package]]
 name = "engraph"
 version = "1.0.0"
 dependencies = [
  "anyhow",
- "candle-core",
- "candle-nn",
- "candle-transformers",
  "clap",
  "dirs",
  "ignore",
  "indicatif",
+ "llama-cpp-2",
  "notify",
  "notify-debouncer-full",
  "rayon",
@@ -822,12 +647,20 @@ dependencies = [
 ]
 
 [[package]]
-name = "enum-as-inner"
-version = "0.6.1"
+name = "enumflags2"
+version = "0.7.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1e6a265c649f3f5979b601d26f1d05ada116434c87741c9493cb56218f76cbc"
+checksum = "1027f7680c853e056ebcec683615fb6fbbc07dbaa13b4d5d9442b146ded4ecef"
+dependencies = [
+ "enumflags2_derive",
+]
+
+[[package]]
+name = "enumflags2_derive"
+version = "0.7.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67c78a4d8fdf9953a5c9d458f9efe940fd97a0cab0941c075a813ac594733827"
 dependencies = [
- "heck",
  "proc-macro2",
  "quote",
  "syn",
@@ -889,17 +722,6 @@ dependencies = [
  "regex-syntax",
 ]
 
-[[package]]
-name = "fancy-regex"
-version = "0.17.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "72cf461f865c862bb7dc573f643dd6a2b6842f7c30b07882b56bd148cc2761b8"
-dependencies = [
- "bit-set 0.8.0",
- "regex-automata",
- "regex-syntax",
-]
-
 [[package]]
 name = "fastrand"
 version = "2.3.0"
@@ -933,35 +755,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
 
 [[package]]
-name = "flate2"
-version = "1.1.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
-dependencies = [
- "crc32fast",
- "miniz_oxide",
-]
-
-[[package]]
-name = "float8"
-version = "0.6.1"
+name = "find_cuda_helper"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "719a903cc23e4a89e87962c2a80fdb45cdaad0983a89bd150bb57b4c8571a7d5"
+checksum = "f9f9e65c593dd01ac77daad909ea4ad17f0d6d1776193fc8ea766356177abdad"
 dependencies = [
- "cudarc 0.19.4",
- "half",
- "num-traits",
- "rand",
- "rand_distr",
+ "glob",
 ]
 
 [[package]]
-name = "float8"
-version = "0.7.0"
+name = "flate2"
+version = "1.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c2d1f04709a8ac06e8e8042875a3c466cc4832d3c1a18dbcb9dba3c6e83046bc"
+checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
 dependencies = [
- "half",
+ "crc32fast",
+ "miniz_oxide",
 ]
 
 [[package]]
@@ -976,39 +785,6 @@ version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
 
-[[package]]
-name = "foldhash"
-version = "0.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
-
-[[package]]
-name = "foreign-types"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965"
-dependencies = [
- "foreign-types-macros",
- "foreign-types-shared",
-]
-
-[[package]]
-name = "foreign-types-macros"
-version = "0.2.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
-[[package]]
-name = "foreign-types-shared"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aa9a19cbb55df58761df49b23516a86d432839add4af60fc256da840f66ed35b"
-
 [[package]]
 name = "form_urlencoded"
 version = "1.2.2"
@@ -1115,244 +891,6 @@ dependencies = [
  "slab",
 ]
 
-[[package]]
-name = "gemm"
-version = "0.18.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab96b703d31950f1aeddded248bc95543c9efc7ac9c4a21fda8703a83ee35451"
-dependencies = [
- "dyn-stack",
- "gemm-c32 0.18.2",
- "gemm-c64 0.18.2",
- "gemm-common 0.18.2",
- "gemm-f16 0.18.2",
- "gemm-f32 0.18.2",
- "gemm-f64 0.18.2",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aa0673db364b12263d103b68337a68fbecc541d6f6b61ba72fe438654709eacb"
-dependencies = [
- "dyn-stack",
- "gemm-c32 0.19.0",
- "gemm-c64 0.19.0",
- "gemm-common 0.19.0",
- "gemm-f16 0.19.0",
- "gemm-f32 0.19.0",
- "gemm-f64 0.19.0",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-c32"
-version = "0.18.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6db9fd9f40421d00eea9dd0770045a5603b8d684654816637732463f4073847"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.18.2",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-c32"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "086936dbdcb99e37aad81d320f98f670e53c1e55a98bee70573e83f95beb128c"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.19.0",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-c64"
-version = "0.18.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dfcad8a3d35a43758330b635d02edad980c1e143dc2f21e6fd25f9e4eada8edf"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.18.2",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-c64"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "20c8aeeeec425959bda4d9827664029ba1501a90a0d1e6228e48bef741db3a3f"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.19.0",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-common"
-version = "0.18.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a352d4a69cbe938b9e2a9cb7a3a63b7e72f9349174a2752a558a8a563510d0f3"
-dependencies = [
- "bytemuck",
- "dyn-stack",
- "half",
- "libm",
- "num-complex",
- "num-traits",
- "once_cell",
- "paste",
- "pulp 0.21.5",
- "raw-cpuid",
- "rayon",
- "seq-macro",
- "sysctl",
-]
-
-[[package]]
-name = "gemm-common"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "88027625910cc9b1085aaaa1c4bc46bb3a36aad323452b33c25b5e4e7c8e2a3e"
-dependencies = [
- "bytemuck",
- "dyn-stack",
- "half",
- "libm",
- "num-complex",
- "num-traits",
- "once_cell",
- "paste",
- "pulp 0.22.2",
- "raw-cpuid",
- "rayon",
- "seq-macro",
- "sysctl",
-]
-
-[[package]]
-name = "gemm-f16"
-version = "0.18.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cff95ae3259432f3c3410eaa919033cd03791d81cebd18018393dc147952e109"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.18.2",
- "gemm-f32 0.18.2",
- "half",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "rayon",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-f16"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e3df7a55202e6cd6739d82ae3399c8e0c7e1402859b30e4cb780e61525d9486e"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.19.0",
- "gemm-f32 0.19.0",
- "half",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "rayon",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-f32"
-version = "0.18.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc8d3d4385393304f407392f754cd2dc4b315d05063f62cf09f47b58de276864"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.18.2",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-f32"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "02e0b8c9da1fbec6e3e3ab2ce6bc259ef18eb5f6f0d3e4edf54b75f9fd41a81c"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.19.0",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-f64"
-version = "0.18.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "35b2a4f76ce4b8b16eadc11ccf2e083252d8237c1b589558a49b0183545015bd"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.18.2",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
-[[package]]
-name = "gemm-f64"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "056131e8f2a521bfab322f804ccd652520c79700d81209e9d9275bbdecaadc6a"
-dependencies = [
- "dyn-stack",
- "gemm-common 0.19.0",
- "num-complex",
- "num-traits",
- "paste",
- "raw-cpuid",
- "seq-macro",
-]
-
 [[package]]
 name = "generic-array"
 version = "0.14.7"
@@ -1418,21 +956,6 @@ dependencies = [
  "regex-syntax",
 ]
 
-[[package]]
-name = "half"
-version = "2.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b"
-dependencies = [
- "bytemuck",
- "cfg-if",
- "crunchy",
- "num-traits",
- "rand",
- "rand_distr",
- "zerocopy 0.8.42",
-]
-
 [[package]]
 name = "hashbrown"
 version = "0.14.5"
@@ -1448,7 +971,7 @@ version = "0.15.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
 dependencies = [
- "foldhash 0.1.5",
+ "foldhash",
 ]
 
 [[package]]
@@ -1456,13 +979,6 @@ name = "hashbrown"
 version = "0.16.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
-dependencies = [
- "allocator-api2",
- "equivalent",
- "foldhash 0.2.0",
- "serde",
- "serde_core",
-]
 
 [[package]]
 name = "hashlink"
@@ -1479,12 +995,6 @@ version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
 
-[[package]]
-name = "hermit-abi"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
-
 [[package]]
 name = "iana-time-zone"
 version = "0.1.65"
@@ -1517,7 +1027,7 @@ checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43"
 dependencies = [
  "displaydoc",
  "potential_utf",
- "yoke 0.8.1",
+ "yoke",
  "zerofrom",
  "zerovec",
 ]
@@ -1584,7 +1094,7 @@ dependencies = [
  "displaydoc",
  "icu_locale_core",
  "writeable",
- "yoke 0.8.1",
+ "yoke",
  "zerofrom",
  "zerotrie",
  "zerovec",
@@ -1699,6 +1209,15 @@ version = "1.70.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
 
+[[package]]
+name = "itertools"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
+dependencies = [
+ "either",
+]
+
 [[package]]
 name = "itertools"
 version = "0.14.0"
@@ -1714,6 +1233,16 @@ version = "1.0.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"
 
+[[package]]
+name = "jobserver"
+version = "0.1.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
+dependencies = [
+ "getrandom 0.3.4",
+ "libc",
+]
+
 [[package]]
 name = "js-sys"
 version = "0.3.91"
@@ -1764,30 +1293,14 @@ checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d"
 
 [[package]]
 name = "libloading"
-version = "0.8.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55"
-dependencies = [
- "cfg-if",
- "windows-link",
-]
-
-[[package]]
-name = "libloading"
-version = "0.9.0"
+version = "0.8.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60"
+checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55"
 dependencies = [
  "cfg-if",
  "windows-link",
 ]
 
-[[package]]
-name = "libm"
-version = "0.2.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
-
 [[package]]
 name = "libredox"
 version = "0.1.14"
@@ -1823,6 +1336,34 @@ version = "0.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
 
+[[package]]
+name = "llama-cpp-2"
+version = "0.1.140"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5604c13b9c847157470479a64d1d7c94f3089709309f82f2fdcbcd43510f2f2"
+dependencies = [
+ "encoding_rs",
+ "enumflags2",
+ "llama-cpp-sys-2",
+ "thiserror 2.0.18",
+ "tracing",
+ "tracing-core",
+]
+
+[[package]]
+name = "llama-cpp-sys-2"
+version = "0.1.140"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cbdd3e2c06f3a9a47466a631735946e9ad47fef565b88bc8766a3794474a66f3"
+dependencies = [
+ "bindgen",
+ "cc",
+ "cmake",
+ "find_cuda_helper",
+ "glob",
+ "walkdir",
+]
+
 [[package]]
 name = "log"
 version = "0.4.29"
@@ -1845,15 +1386,6 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30"
 
-[[package]]
-name = "malloc_buf"
-version = "0.0.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62bb907fe88d54d8d9ce32a3cceab4218ed2f6b7d35617cafe9adf84e43919cb"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "matchers"
 version = "0.2.0"
@@ -1869,31 +1401,6 @@ version = "2.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
 
-[[package]]
-name = "memmap2"
-version = "0.9.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3"
-dependencies = [
- "libc",
- "stable_deref_trait",
-]
-
-[[package]]
-name = "metal"
-version = "0.29.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ecfd3296f8c56b7c1f6fbac3c71cefa9d78ce009850c45000015f206dc7fa21"
-dependencies = [
- "bitflags 2.11.0",
- "block",
- "core-graphics-types",
- "foreign-types",
- "log",
- "objc",
- "paste",
-]
-
 [[package]]
 name = "minimal-lexical"
 version = "0.2.1"
@@ -2004,77 +1511,12 @@ dependencies = [
  "windows-sys 0.61.2",
 ]
 
-[[package]]
-name = "num"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
-dependencies = [
- "num-bigint",
- "num-complex",
- "num-integer",
- "num-iter",
- "num-rational",
- "num-traits",
-]
-
-[[package]]
-name = "num-bigint"
-version = "0.4.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
-dependencies = [
- "num-integer",
- "num-traits",
-]
-
-[[package]]
-name = "num-complex"
-version = "0.4.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
-dependencies = [
- "bytemuck",
- "num-traits",
-]
-
 [[package]]
 name = "num-conv"
 version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967"
 
-[[package]]
-name = "num-integer"
-version = "0.1.46"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
-dependencies = [
- "num-traits",
-]
-
-[[package]]
-name = "num-iter"
-version = "0.1.45"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf"
-dependencies = [
- "autocfg",
- "num-integer",
- "num-traits",
-]
-
-[[package]]
-name = "num-rational"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
-dependencies = [
- "num-bigint",
- "num-integer",
- "num-traits",
-]
-
 [[package]]
 name = "num-traits"
 version = "0.2.19"
@@ -2082,17 +1524,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
 dependencies = [
  "autocfg",
- "libm",
-]
-
-[[package]]
-name = "num_cpus"
-version = "1.17.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
-dependencies = [
- "hermit-abi",
- "libc",
 ]
 
 [[package]]
@@ -2101,68 +1532,6 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
 
-[[package]]
-name = "objc"
-version = "0.2.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "915b1b472bc21c53464d6c8461c9d3af805ba1ef837e1cac254428f4a77177b1"
-dependencies = [
- "malloc_buf",
-]
-
-[[package]]
-name = "objc2"
-version = "0.6.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3a12a8ed07aefc768292f076dc3ac8c48f3781c8f2d5851dd3d98950e8c5a89f"
-dependencies = [
- "objc2-encode",
-]
-
-[[package]]
-name = "objc2-core-foundation"
-version = "0.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536"
-dependencies = [
- "bitflags 2.11.0",
- "dispatch2",
- "objc2",
-]
-
-[[package]]
-name = "objc2-encode"
-version = "4.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ef25abbcd74fb2609453eb695bd2f860d389e457f67dc17cafc8b8cbc89d0c33"
-
-[[package]]
-name = "objc2-foundation"
-version = "0.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e3e0adef53c21f888deb4fa59fc59f7eb17404926ee8a6f59f5df0fd7f9f3272"
-dependencies = [
- "bitflags 2.11.0",
- "block2",
- "libc",
- "objc2",
- "objc2-core-foundation",
-]
-
-[[package]]
-name = "objc2-metal"
-version = "0.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a0125f776a10d00af4152d74616409f0d4a2053a6f57fa5b7d6aa2854ac04794"
-dependencies = [
- "bitflags 2.11.0",
- "block2",
- "dispatch2",
- "objc2",
- "objc2-core-foundation",
- "objc2-foundation",
-]
-
 [[package]]
 name = "once_cell"
 version = "1.21.4"
@@ -2266,43 +1635,6 @@ dependencies = [
  "unicode-ident",
 ]
 
-[[package]]
-name = "pulp"
-version = "0.21.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "96b86df24f0a7ddd5e4b95c94fc9ed8a98f1ca94d3b01bdce2824097e7835907"
-dependencies = [
- "bytemuck",
- "cfg-if",
- "libm",
- "num-complex",
- "reborrow",
- "version_check",
-]
-
-[[package]]
-name = "pulp"
-version = "0.22.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2e205bb30d5b916c55e584c22201771bcf2bad9aabd5d4127f38387140c38632"
-dependencies = [
- "bytemuck",
- "cfg-if",
- "libm",
- "num-complex",
- "paste",
- "pulp-wasm-simd-flag",
- "raw-cpuid",
- "reborrow",
- "version_check",
-]
-
-[[package]]
-name = "pulp-wasm-simd-flag"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40e24eee682d89fb193496edf918a7f407d30175b2e785fe057e4392dfd182e0"
-
 [[package]]
 name = "quote"
 version = "1.0.45"
@@ -2353,25 +1685,6 @@ dependencies = [
  "getrandom 0.3.4",
 ]
 
-[[package]]
-name = "rand_distr"
-version = "0.5.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
-dependencies = [
- "num-traits",
- "rand",
-]
-
-[[package]]
-name = "raw-cpuid"
-version = "11.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186"
-dependencies = [
- "bitflags 2.11.0",
-]
-
 [[package]]
 name = "rayon"
 version = "1.11.0"
@@ -2389,7 +1702,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f"
 dependencies = [
  "either",
- "itertools",
+ "itertools 0.14.0",
  "rayon",
 ]
 
@@ -2403,12 +1716,6 @@ dependencies = [
  "crossbeam-utils",
 ]
 
-[[package]]
-name = "reborrow"
-version = "0.5.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "03251193000f4bd3b042892be858ee50e8b3719f2b08e5833ac4353724632430"
-
 [[package]]
 name = "redox_syscall"
 version = "0.7.3"
@@ -2541,6 +1848,12 @@ dependencies = [
  "smallvec",
 ]
 
+[[package]]
+name = "rustc-hash"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
+
 [[package]]
 name = "rustix"
 version = "1.1.4"
@@ -2601,27 +1914,6 @@ version = "1.0.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f"
 
-[[package]]
-name = "safetensors"
-version = "0.4.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "44560c11236a6130a46ce36c836a62936dc81ebf8c36a37947423571be0e55b6"
-dependencies = [
- "serde",
- "serde_json",
-]
-
-[[package]]
-name = "safetensors"
-version = "0.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "675656c1eabb620b921efea4f9199f97fc86e36dd6ffd1fbbe48d0f59a4987f5"
-dependencies = [
- "hashbrown 0.16.1",
- "serde",
- "serde_json",
-]
-
 [[package]]
 name = "same-file"
 version = "1.0.6"
@@ -2663,12 +1955,6 @@ version = "1.0.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2"
 
-[[package]]
-name = "seq-macro"
-version = "0.3.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc"
-
 [[package]]
 name = "serde"
 version = "1.0.228"
@@ -2723,15 +2009,6 @@ dependencies = [
  "zmij",
 ]
 
-[[package]]
-name = "serde_plain"
-version = "1.0.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ce1fc6db65a611022b23a0dec6975d63fb80a302cb3388835ff02c097258d50"
-dependencies = [
- "serde",
-]
-
 [[package]]
 name = "serde_spanned"
 version = "0.6.9"
@@ -2864,20 +2141,6 @@ dependencies = [
  "syn",
 ]
 
-[[package]]
-name = "sysctl"
-version = "0.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01198a2debb237c62b6826ec7081082d951f46dbb64b0e8c7649a452230d1dfc"
-dependencies = [
- "bitflags 2.11.0",
- "byteorder",
- "enum-as-inner",
- "libc",
- "thiserror 1.0.69",
- "walkdir",
-]
-
 [[package]]
 name = "tempfile"
 version = "3.27.0"
@@ -2983,7 +2246,7 @@ dependencies = [
  "esaxx-rs",
  "fancy-regex 0.14.0",
  "getrandom 0.3.4",
- "itertools",
+ "itertools 0.14.0",
  "log",
  "macro_rules_attribute",
  "monostate",
@@ -3139,66 +2402,12 @@ dependencies = [
  "tracing-log",
 ]
 
-[[package]]
-name = "typed-path"
-version = "0.12.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e28f89b80c87b8fb0cf04ab448d5dd0dd0ade2f8891bae878de66a75a28600e"
-
 [[package]]
 name = "typenum"
 version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
 
-[[package]]
-name = "ug"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "76b761acf8af3494640d826a8609e2265e19778fb43306c7f15379c78c9b05b0"
-dependencies = [
- "gemm 0.18.2",
- "half",
- "libloading 0.8.9",
- "memmap2",
- "num",
- "num-traits",
- "num_cpus",
- "rayon",
- "safetensors 0.4.5",
- "serde",
- "thiserror 1.0.69",
- "tracing",
- "yoke 0.7.5",
-]
-
-[[package]]
-name = "ug-cuda"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9f0a1fa748f26166778c33b8498255ebb7c6bffb472bcc0a72839e07ebb1d9b5"
-dependencies = [
- "cudarc 0.17.8",
- "half",
- "serde",
- "thiserror 1.0.69",
- "ug",
-]
-
-[[package]]
-name = "ug-metal"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9f7adf545a99a086d362efc739e7cf4317c18cbeda22706000fd434d70ea3d95"
-dependencies = [
- "half",
- "metal",
- "objc",
- "serde",
- "thiserror 1.0.69",
- "ug",
-]
-
 [[package]]
 name = "unicode-ident"
 version = "1.0.24"
@@ -3845,18 +3054,6 @@ version = "0.6.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
 
-[[package]]
-name = "yoke"
-version = "0.7.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40"
-dependencies = [
- "serde",
- "stable_deref_trait",
- "yoke-derive 0.7.5",
- "zerofrom",
-]
-
 [[package]]
 name = "yoke"
 version = "0.8.1"
@@ -3864,22 +3061,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954"
 dependencies = [
  "stable_deref_trait",
- "yoke-derive 0.8.1",
+ "yoke-derive",
  "zerofrom",
 ]
 
-[[package]]
-name = "yoke-derive"
-version = "0.7.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
- "synstructure",
-]
-
 [[package]]
 name = "yoke-derive"
 version = "0.8.1"
@@ -3967,7 +3152,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851"
 dependencies = [
  "displaydoc",
- "yoke 0.8.1",
+ "yoke",
  "zerofrom",
 ]
 
@@ -3977,7 +3162,7 @@ version = "0.11.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002"
 dependencies = [
- "yoke 0.8.1",
+ "yoke",
  "zerofrom",
  "zerovec-derive",
 ]
@@ -3993,18 +3178,6 @@ dependencies = [
  "syn",
 ]
 
-[[package]]
-name = "zip"
-version = "7.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c42e33efc22a0650c311c2ef19115ce232583abbe80850bc8b66509ebef02de0"
-dependencies = [
- "crc32fast",
- "indexmap",
- "memchr",
- "typed-path",
-]
-
 [[package]]
 name = "zmij"
 version = "1.0.21"
diff --git a/Cargo.toml b/Cargo.toml
index e765437..9549eba 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -34,16 +34,11 @@ rmcp = { version = "1.2", features = ["transport-io"] }
 tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
 notify = "7.0"
 notify-debouncer-full = "0.4"
-candle-core = "0.9"
-candle-nn = "0.9"
-candle-transformers = "0.9"
+llama-cpp-2 = "0.1"
 shimmytok = "0.7"
 
 [features]
 default = []
-metal = ["candle-core/metal"]
-accelerate = ["candle-core/accelerate"]
-cuda = ["candle-core/cuda"]
 
 [dev-dependencies]
 tempfile = "3"
diff --git a/src/indexer.rs b/src/indexer.rs
index 1f91a80..501b6da 100644
--- a/src/indexer.rs
+++ b/src/indexer.rs
@@ -441,7 +441,7 @@ pub fn run_index(vault_path: &Path, config: &Config, rebuild: bool) -> Result<In
     let store = Store::open(&db_path)?;
 
     let models_dir = data_dir.join("models");
-    let mut embedder = crate::llm::CandleEmbed::new(&models_dir, config)?;
+    let mut embedder = crate::llm::LlamaEmbed::new(&models_dir, config)?;
 
     // Check for embedding dimension change
     let model_dim = embedder.dim();
diff --git a/src/llm.rs b/src/llm.rs
index 069470a..b048172 100644
--- a/src/llm.rs
+++ b/src/llm.rs
@@ -1,26 +1,16 @@
 use std::io::Read;
 use std::path::{Path, PathBuf};
 
-use anyhow::{Result, bail};
+use anyhow::{Context as _, Result, bail};
 use indicatif::{ProgressBar, ProgressStyle};
 use sha2::{Digest, Sha256};
 
-use anyhow::Context as _;
-use candle_core::{D, DType, Device, IndexOp, Tensor};
-use candle_nn::{Embedding, Module};
-
-// ── Device selection ─────────────────────────────────────────────────────────
-
-/// Select best available device: Metal on macOS (with `metal` feature), CPU elsewhere.
-fn select_device() -> Result<Device> {
-    #[cfg(feature = "metal")]
-    {
-        if let Ok(device) = Device::new_metal(0) {
-            return Ok(device);
-        }
-    }
-    Ok(Device::Cpu)
-}
+use llama_cpp_2::context::params::LlamaContextParams;
+use llama_cpp_2::llama_backend::LlamaBackend;
+use llama_cpp_2::llama_batch::LlamaBatch;
+use llama_cpp_2::model::params::LlamaModelParams;
+use llama_cpp_2::model::{AddBos, LlamaModel};
+use llama_cpp_2::sampling::LlamaSampler;
 
 // ── Prompt format ────────────────────────────────────────────────────────────
 
@@ -520,7 +510,7 @@ fn load_tokenizer_for_model(uri: &HfModelUri, models_dir: &Path) -> Result<FlexT
 }
 
 /// Load tokenizer as HuggingFace `tokenizers::Tokenizer` specifically.
-/// Used by CandleOrchestrator and CandleRerank which need decode/token_to_id.
+/// Used by LlamaOrchestrator and LlamaRerank which need decode/token_to_id.
 fn load_hf_tokenizer(uri: &HfModelUri, models_dir: &Path) -> Result<tokenizers::Tokenizer> {
     try_external_tokenizer(uri, models_dir).ok_or_else(|| {
         anyhow::anyhow!(
@@ -604,265 +594,47 @@ impl Default for ModelDefaults {
     }
 }
 
-// ── CandleEmbed — GGUF embedding model via candle ──────────────────────────
+// ── LlamaEmbed — GGUF embedding model via llama.cpp ──────────────────────────
 
-/// Quantized matrix multiplication wrapper (mirrors candle-transformers pattern).
-#[derive(Debug, Clone)]
-struct CandleQMatMul {
-    inner: candle_core::quantized::QMatMul,
-}
-
-impl CandleQMatMul {
-    fn from_qtensor(qtensor: candle_core::quantized::QTensor) -> candle_core::Result<Self> {
-        let inner = candle_core::quantized::QMatMul::from_qtensor(qtensor)?;
-        Ok(Self { inner })
-    }
-
-    fn forward(&self, xs: &Tensor) -> candle_core::Result<Tensor> {
-        self.inner.forward(xs)
-    }
-}
-
-/// Single transformer layer for the embedding model.
-#[derive(Debug, Clone)]
-struct EmbedLayer {
-    attention_wq: CandleQMatMul,
-    attention_wk: CandleQMatMul,
-    attention_wv: CandleQMatMul,
-    attention_wo: CandleQMatMul,
-    attention_q_norm: candle_nn::RmsNorm,
-    attention_k_norm: candle_nn::RmsNorm,
-    attention_norm: candle_nn::RmsNorm,
-    post_attention_norm: candle_nn::RmsNorm,
-    ffn_norm: candle_nn::RmsNorm,
-    post_ffn_norm: candle_nn::RmsNorm,
-    ffn_gate: CandleQMatMul,
-    ffn_up: CandleQMatMul,
-    ffn_down: CandleQMatMul,
-    n_head: usize,
-    n_kv_head: usize,
-    head_dim: usize,
-    q_dim: usize,
-    rotary_sin: Tensor,
-    rotary_cos: Tensor,
-}
-
-impl EmbedLayer {
-    /// Bidirectional forward pass — no causal mask, no KV cache.
-    fn forward(&self, x: &Tensor) -> candle_core::Result<Tensor> {
-        let (b_sz, seq_len, _) = x.dims3()?;
-
-        // --- Attention block ---
-        let residual = x;
-        let x = self.attention_norm.forward(x)?;
-
-        let q = self.attention_wq.forward(&x)?;
-        let k = self.attention_wk.forward(&x)?;
-        let v = self.attention_wv.forward(&x)?;
-
-        let q = q
-            .reshape((b_sz, seq_len, self.n_head, self.head_dim))?
-            .transpose(1, 2)?;
-        let k = k
-            .reshape((b_sz, seq_len, self.n_kv_head, self.head_dim))?
-            .transpose(1, 2)?;
-        let v = v
-            .reshape((b_sz, seq_len, self.n_kv_head, self.head_dim))?
-            .transpose(1, 2)?;
-
-        let q = self.attention_q_norm.forward(&q.contiguous()?)?;
-        let k = self.attention_k_norm.forward(&k.contiguous()?)?;
-
-        // Apply rotary embeddings (truncated to seq_len).
-        let q = Self::apply_rotary(&q, &self.rotary_cos, &self.rotary_sin, seq_len)?;
-        let k = Self::apply_rotary(&k, &self.rotary_cos, &self.rotary_sin, seq_len)?;
-
-        // Repeat KV heads for GQA.
-        let n_rep = self.n_head / self.n_kv_head;
-        let k = candle_transformers::utils::repeat_kv(k, n_rep)?;
-        let v = candle_transformers::utils::repeat_kv(v, n_rep)?;
-
-        // Scaled dot-product attention — BIDIRECTIONAL (no mask).
-        let scale = 1.0 / (self.head_dim as f64).sqrt();
-        let attn_weights = (q.matmul(&k.transpose(2, 3)?)? * scale)?;
-        let attn_weights = candle_nn::ops::softmax_last_dim(&attn_weights)?;
-        let attn_output = attn_weights.matmul(&v)?;
-
-        let attn_output = attn_output
-            .transpose(1, 2)?
-            .reshape((b_sz, seq_len, self.q_dim))?;
-        let attn_output = self.attention_wo.forward(&attn_output)?;
-        let x = self.post_attention_norm.forward(&attn_output)?;
-        let x = (x + residual)?;
-
-        // --- FFN block ---
-        let residual = &x;
-        let h = self.ffn_norm.forward(&x)?;
-        let gate = self.ffn_gate.forward(&h)?;
-        let up = self.ffn_up.forward(&h)?;
-        let h = (candle_nn::ops::silu(&gate)? * up)?;
-        let h = self.ffn_down.forward(&h)?;
-        let h = self.post_ffn_norm.forward(&h)?;
-        h + residual
-    }
-
-    /// Apply rotary embeddings to a [batch, heads, seq, dim] tensor.
-    fn apply_rotary(
-        x: &Tensor,
-        cos: &Tensor,
-        sin: &Tensor,
-        seq_len: usize,
-    ) -> candle_core::Result<Tensor> {
-        let cos = cos.i(..seq_len)?.unsqueeze(0)?.unsqueeze(0)?;
-        let sin = sin.i(..seq_len)?.unsqueeze(0)?.unsqueeze(0)?;
-        let dim = x.dim(D::Minus1)?;
-        let half = dim / 2;
-        let x1 = x.narrow(D::Minus1, 0, half)?;
-        let x2 = x.narrow(D::Minus1, half, half)?;
-        let rotated = Tensor::cat(&[&x2.neg()?, &x1], D::Minus1)?;
-        let out = (x.broadcast_mul(&cos)? + rotated.broadcast_mul(&sin)?)?;
-        Ok(out)
-    }
-}
-
-/// Single BERT transformer layer (LayerNorm + absolute positions + GELU FFN).
-#[derive(Debug, Clone)]
-struct BertLayer {
-    attn_q: CandleQMatMul,
-    attn_q_bias: Tensor,
-    attn_k: CandleQMatMul,
-    attn_k_bias: Tensor,
-    attn_v: CandleQMatMul,
-    attn_v_bias: Tensor,
-    attn_output: CandleQMatMul,
-    attn_output_bias: Tensor,
-    attn_output_norm: candle_nn::LayerNorm,
-    ffn_up: CandleQMatMul,
-    ffn_up_bias: Tensor,
-    ffn_down: CandleQMatMul,
-    ffn_down_bias: Tensor,
-    layer_output_norm: candle_nn::LayerNorm,
-    n_head: usize,
-    head_dim: usize,
-}
-
-impl BertLayer {
-    /// Bidirectional forward pass for BERT architecture.
-    fn forward(&self, x: &Tensor) -> candle_core::Result<Tensor> {
-        let (b_sz, seq_len, _hidden) = x.dims3()?;
-
-        // --- Attention block ---
-        let residual = x;
-
-        let q = self.attn_q.forward(x)?.broadcast_add(&self.attn_q_bias)?;
-        let k = self.attn_k.forward(x)?.broadcast_add(&self.attn_k_bias)?;
-        let v = self.attn_v.forward(x)?.broadcast_add(&self.attn_v_bias)?;
-
-        let q = q
-            .reshape((b_sz, seq_len, self.n_head, self.head_dim))?
-            .transpose(1, 2)?
-            .contiguous()?;
-        let k = k
-            .reshape((b_sz, seq_len, self.n_head, self.head_dim))?
-            .transpose(1, 2)?
-            .contiguous()?;
-        let v = v
-            .reshape((b_sz, seq_len, self.n_head, self.head_dim))?
-            .transpose(1, 2)?
-            .contiguous()?;
-
-        // Scaled dot-product attention — BIDIRECTIONAL (no causal mask).
-        let scale = 1.0 / (self.head_dim as f64).sqrt();
-        let attn_weights = (q.matmul(&k.transpose(2, 3)?)? * scale)?;
-        let attn_weights = candle_nn::ops::softmax_last_dim(&attn_weights)?;
-        let attn_output = attn_weights.matmul(&v)?;
-
-        let attn_output =
-            attn_output
-                .transpose(1, 2)?
-                .reshape((b_sz, seq_len, self.n_head * self.head_dim))?;
-        let attn_output = self
-            .attn_output
-            .forward(&attn_output)?
-            .broadcast_add(&self.attn_output_bias)?;
-        let x = self.attn_output_norm.forward(&(residual + attn_output)?)?;
-
-        // --- FFN block (GELU activation) ---
-        let residual = &x;
-        let h = self.ffn_up.forward(&x)?.broadcast_add(&self.ffn_up_bias)?;
-        let h = h.gelu()?;
-        let h = self
-            .ffn_down
-            .forward(&h)?
-            .broadcast_add(&self.ffn_down_bias)?;
-        self.layer_output_norm.forward(&(residual + h)?)
-    }
-}
-
-/// Model variant: Gemma or BERT architecture.
-enum EmbedModelVariant {
-    Gemma {
-        layers: Vec<EmbedLayer>,
-        tok_embeddings: Embedding,
-        norm: candle_nn::RmsNorm,
-        embedding_length: usize,
-    },
-    Bert {
-        layers: Vec<BertLayer>,
-        tok_embeddings: Embedding,
-        pos_embeddings: Tensor,
-        embed_norm: candle_nn::LayerNorm,
-        hidden_size: usize,
-    },
-}
-
-/// GGUF embedding model loaded via candle.
+/// GGUF embedding model loaded via llama.cpp.
+///
+/// Loads a quantized embedding model from a GGUF file and produces dense float
+/// vectors via llama.cpp's built-in embedding support with mean pooling + L2
+/// normalization. Supports Metal acceleration on macOS automatically.
 ///
-/// Loads a quantized embedding model (Gemma or BERT family) from a GGUF file
-/// and produces dense float vectors via bidirectional attention + mean pooling
-/// + L2 normalization.
-pub struct CandleEmbed {
-    variant: EmbedModelVariant,
+/// `LlamaModel` is `Send + Sync`, so this struct is `Send`. `LlamaContext` is
+/// `!Send`, so we create it per-call from the stored model and backend.
+pub struct LlamaEmbed {
+    model: LlamaModel,
+    backend: LlamaBackend,
     tokenizer: FlexTokenizer,
-    device: Device,
     dim: usize,
     prompt_format: PromptFormat,
 }
 
-impl std::fmt::Debug for CandleEmbed {
+// Safety: LlamaModel is Send+Sync per llama-cpp-2 docs. LlamaBackend is Send+Sync.
+// FlexTokenizer contains only Send types (tokenizers::Tokenizer is Send, shimmytok::Tokenizer is Send).
+// We never store a LlamaContext (which is !Send) — it is created per-call.
+unsafe impl Send for LlamaEmbed {}
+
+impl std::fmt::Debug for LlamaEmbed {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let (arch, num_layers, hidden) = match &self.variant {
-            EmbedModelVariant::Gemma {
-                layers,
-                embedding_length,
-                ..
-            } => ("gemma", layers.len(), *embedding_length),
-            EmbedModelVariant::Bert {
-                layers,
-                hidden_size,
-                ..
-            } => ("bert", layers.len(), *hidden_size),
-        };
-        f.debug_struct("CandleEmbed")
-            .field("arch", &arch)
+        f.debug_struct("LlamaEmbed")
             .field("dim", &self.dim)
-            .field("hidden_size", &hidden)
-            .field("num_layers", &num_layers)
             .field("prompt_format", &self.prompt_format)
             .finish()
     }
 }
 
-impl CandleEmbed {
+impl LlamaEmbed {
     /// Load a GGUF embedding model from `models_dir`.
     ///
     /// Steps:
     /// 1. Resolve model URI (from config override or `ModelDefaults`)
     /// 2. `ensure_model()` to download if needed
     /// 3. Load tokenizer (try same repo's tokenizer.json, then repo without -GGUF suffix)
-    /// 4. Detect architecture from GGUF metadata (`general.architecture`)
-    /// 5. Load GGUF and build layer structs for bidirectional embedding
-    /// 6. Detect prompt format from filename
+    /// 4. Load GGUF model via llama.cpp
+    /// 5. Detect prompt format from filename
     pub fn new(models_dir: &Path, config: &crate::config::Config) -> Result<Self> {
         let defaults = ModelDefaults::default();
         let uri_str = config
@@ -882,527 +654,86 @@ impl CandleEmbed {
         // Target output dimensionality.
         let dim = defaults.embed_dim;
 
-        // Detect architecture from GGUF metadata and load accordingly.
-        let device = select_device()?;
-        let arch = Self::detect_architecture(&model_path)?;
+        // Initialize llama.cpp backend and load model.
+        let backend =
+            LlamaBackend::init().map_err(|e| anyhow::anyhow!("initializing llama backend: {e}"))?;
+        let model_params = LlamaModelParams::default();
+        let model = LlamaModel::load_from_file(&backend, &model_path, &model_params)
+            .map_err(|e| anyhow::anyhow!("loading GGUF model {}: {e}", model_path.display()))?;
 
-        let variant = if arch.contains("bert") {
-            Self::load_gguf_bert(&model_path, &device)?
-        } else {
-            let (layers, tok_embeddings, norm, embedding_length) =
-                Self::load_gguf_gemma(&model_path, &device)?;
-            EmbedModelVariant::Gemma {
-                layers,
-                tok_embeddings,
-                norm,
-                embedding_length,
-            }
-        };
-
-        let (arch_name, num_layers, hidden) = match &variant {
-            EmbedModelVariant::Gemma {
-                layers,
-                embedding_length,
-                ..
-            } => ("gemma", layers.len(), *embedding_length),
-            EmbedModelVariant::Bert {
-                layers,
-                hidden_size,
-                ..
-            } => ("bert", layers.len(), *hidden_size),
-        };
         tracing::info!(
-            "loaded CandleEmbed: arch={}, {} layers, hidden_size={}, target_dim={}, device={:?}",
-            arch_name,
-            num_layers,
-            hidden,
-            dim,
-            device
+            "loaded LlamaEmbed from {}, target_dim={}",
+            uri_str,
+            dim
         );
 
         Ok(Self {
-            variant,
+            model,
+            backend,
             tokenizer,
-            device,
             dim,
             prompt_format,
         })
     }
 
-    /// Read `general.architecture` from GGUF metadata to determine the model family.
-    fn detect_architecture(path: &Path) -> Result<String> {
-        use candle_core::quantized::gguf_file;
-
-        let mut file = std::fs::File::open(path)
-            .map_err(|e| anyhow::anyhow!("opening GGUF {}: {e}", path.display()))?;
-        let ct = gguf_file::Content::read(&mut file)
-            .map_err(|e| anyhow::anyhow!("reading GGUF {}: {e}", path.display()))?;
-
-        // Look for `general.architecture` in metadata.
-        if let Some(val) = ct.metadata.get("general.architecture") {
-            let arch = val
-                .to_string()
-                .map_err(|e| anyhow::anyhow!("reading general.architecture: {e}"))?;
-            Ok(arch.to_lowercase())
-        } else {
-            // Fallback: probe for known architecture prefixes.
-            let has_bert = ct.metadata.contains_key("bert.attention.head_count");
-            if has_bert {
-                Ok("bert".to_string())
-            } else {
-                Ok("gemma".to_string())
-            }
-        }
-    }
-
-    /// Load GGUF file and construct Gemma-family layer structs for bidirectional embedding.
-    fn load_gguf_gemma(
-        path: &Path,
-        device: &Device,
-    ) -> Result<(
-        Vec<EmbedLayer>,
-        Embedding,
-        candle_nn::RmsNorm,
-        usize,
-    )> {
-        use candle_core::quantized::gguf_file;
-
-        let mut file = std::fs::File::open(path)
-            .map_err(|e| anyhow::anyhow!("opening GGUF {}: {e}", path.display()))?;
-        let ct = gguf_file::Content::read(&mut file)
-            .map_err(|e| anyhow::anyhow!("reading GGUF {}: {e}", path.display()))?;
-
-        // Detect architecture prefix (same probe as candle-transformers quantized_gemma3).
-        let prefix = ["gemma3", "gemma2", "gemma", "gemma-embedding"]
-            .iter()
-            .find(|p| {
-                ct.metadata
-                    .contains_key(&format!("{}.attention.head_count", p))
-            })
-            .copied()
-            .unwrap_or("gemma3");
-
-        let md_get = |s: &str| -> Result<&gguf_file::Value> {
-            let key = format!("{prefix}.{s}");
-            ct.metadata
-                .get(&key)
-                .ok_or_else(|| anyhow::anyhow!("cannot find {key} in GGUF metadata"))
-        };
-
-        let head_count = md_get("attention.head_count")?
-            .to_u32()
-            .map_err(|e| anyhow::anyhow!("{e}"))? as usize;
-        let head_count_kv = md_get("attention.head_count_kv")?
-            .to_u32()
-            .map_err(|e| anyhow::anyhow!("{e}"))? as usize;
-        let block_count = md_get("block_count")?
-            .to_u32()
-            .map_err(|e| anyhow::anyhow!("{e}"))? as usize;
-        let embedding_length = md_get("embedding_length")?
-            .to_u32()
-            .map_err(|e| anyhow::anyhow!("{e}"))? as usize;
-        let key_length = md_get("attention.key_length")?
-            .to_u32()
-            .map_err(|e| anyhow::anyhow!("{e}"))? as usize;
-        let rms_norm_eps = md_get("attention.layer_norm_rms_epsilon")?
-            .to_f32()
-            .map_err(|e| anyhow::anyhow!("{e}"))? as f64;
-        let rope_freq_base = md_get("rope.freq_base")
-            .and_then(|v| v.to_f32().map_err(|e| anyhow::anyhow!("{e}")))
-            .unwrap_or(10_000.0);
-
-        let q_dim = head_count * key_length;
-
-        // Build rotary embedding tables (shared by all layers for the base freq).
-        let max_seq_len: usize = 8192; // Sufficient for embedding inputs.
-        let (rotary_sin, rotary_cos) =
-            Self::build_rotary_tables(key_length, rope_freq_base, max_seq_len, device)?;
-
-        // Load token embeddings.
-        let tok_embd = ct
-            .tensor(&mut file, "token_embd.weight", device)
-            .map_err(|e| anyhow::anyhow!("loading token_embd.weight: {e}"))?;
-        let tok_embd_deq = tok_embd
-            .dequantize(device)
-            .map_err(|e| anyhow::anyhow!("dequantizing token_embd: {e}"))?;
-        let tok_embeddings = Embedding::new(tok_embd_deq, embedding_length);
-
-        // Final norm (dequantize to f32 for Metal compatibility).
-        let norm_qt = ct
-            .tensor(&mut file, "output_norm.weight", device)
-            .map_err(|e| anyhow::anyhow!("loading output_norm.weight: {e}"))?;
-        let norm_weight = norm_qt
-            .dequantize(device)
-            .map_err(|e| anyhow::anyhow!("dequantizing output_norm.weight: {e}"))?;
-        let norm = candle_nn::RmsNorm::new(norm_weight, rms_norm_eps);
-
-        // Load transformer layers.
-        let mut layers = Vec::with_capacity(block_count);
-        for idx in 0..block_count {
-            let p = format!("blk.{idx}");
-
-            // Helper: load a quantized weight tensor as QMatMul.
-            macro_rules! load_q {
-                ($name:expr) => {{
-                    let full = format!("{}.{}", p, $name);
-                    let qt = ct
-                        .tensor(&mut file, &full, device)
-                        .map_err(|e| anyhow::anyhow!("loading {full}: {e}"))?;
-                    CandleQMatMul::from_qtensor(qt)
-                        .map_err(|e| anyhow::anyhow!("QMatMul for {full}: {e}"))?
-                }};
-            }
-
-            // Helper: load a norm weight tensor as RmsNorm (dequantize for Metal).
-            macro_rules! load_norm {
-                ($name:expr) => {{
-                    let full = format!("{}.{}", p, $name);
-                    let qt = ct
-                        .tensor(&mut file, &full, device)
-                        .map_err(|e| anyhow::anyhow!("loading {full}: {e}"))?;
-                    let weight = qt
-                        .dequantize(device)
-                        .map_err(|e| anyhow::anyhow!("dequantizing {full}: {e}"))?;
-                    candle_nn::RmsNorm::new(weight, rms_norm_eps)
-                }};
-            }
-
-            layers.push(EmbedLayer {
-                attention_wq: load_q!("attn_q.weight"),
-                attention_wk: load_q!("attn_k.weight"),
-                attention_wv: load_q!("attn_v.weight"),
-                attention_wo: load_q!("attn_output.weight"),
-                attention_q_norm: load_norm!("attn_q_norm.weight"),
-                attention_k_norm: load_norm!("attn_k_norm.weight"),
-                attention_norm: load_norm!("attn_norm.weight"),
-                post_attention_norm: load_norm!("post_attention_norm.weight"),
-                ffn_norm: load_norm!("ffn_norm.weight"),
-                post_ffn_norm: load_norm!("post_ffw_norm.weight"),
-                ffn_gate: load_q!("ffn_gate.weight"),
-                ffn_up: load_q!("ffn_up.weight"),
-                ffn_down: load_q!("ffn_down.weight"),
-                n_head: head_count,
-                n_kv_head: head_count_kv,
-                head_dim: key_length,
-                q_dim,
-                rotary_sin: rotary_sin.clone(),
-                rotary_cos: rotary_cos.clone(),
-            });
-        }
-
-        Ok((layers, tok_embeddings, norm, embedding_length))
-    }
-
-    /// Build sin/cos rotary embedding tables of shape [max_seq_len, head_dim].
-    fn build_rotary_tables(
-        head_dim: usize,
-        freq_base: f32,
-        max_seq_len: usize,
-        device: &Device,
-    ) -> Result<(Tensor, Tensor)> {
-        let half = head_dim / 2;
-        let theta: Vec<f32> = (0..half)
-            .map(|i| 1.0 / freq_base.powf(i as f32 / half as f32))
-            .collect();
-        let theta = Tensor::new(theta.as_slice(), device)
-            .map_err(|e| anyhow::anyhow!("rotary theta: {e}"))?;
-        let positions = Tensor::arange(0, max_seq_len as u32, device)
-            .map_err(|e| anyhow::anyhow!("rotary positions: {e}"))?
-            .to_dtype(DType::F32)
-            .map_err(|e| anyhow::anyhow!("rotary positions dtype: {e}"))?;
-        // [max_seq_len, half]
-        let freqs = positions
-            .unsqueeze(1)
-            .map_err(|e| anyhow::anyhow!("rotary unsqueeze: {e}"))?
-            .broadcast_mul(&theta.unsqueeze(0).map_err(|e| anyhow::anyhow!("{e}"))?)
-            .map_err(|e| anyhow::anyhow!("rotary freqs: {e}"))?;
-        // Duplicate to [max_seq_len, head_dim] to match x1,x2 concatenation.
-        let freqs = Tensor::cat(&[&freqs, &freqs], D::Minus1)
-            .map_err(|e| anyhow::anyhow!("rotary cat: {e}"))?;
-        let sin = freqs
-            .sin()
-            .map_err(|e| anyhow::anyhow!("rotary sin: {e}"))?;
-        let cos = freqs
-            .cos()
-            .map_err(|e| anyhow::anyhow!("rotary cos: {e}"))?;
-        Ok((sin, cos))
-    }
-
-    /// Load GGUF file and construct BERT-family layer structs for bidirectional embedding.
-    fn load_gguf_bert(path: &Path, device: &Device) -> Result<EmbedModelVariant> {
-        use candle_core::quantized::gguf_file;
-
-        let mut file = std::fs::File::open(path)
-            .map_err(|e| anyhow::anyhow!("opening GGUF {}: {e}", path.display()))?;
-        let ct = gguf_file::Content::read(&mut file)
-            .map_err(|e| anyhow::anyhow!("reading GGUF {}: {e}", path.display()))?;
-
-        // Read BERT hyperparameters from metadata.
-        let md_get = |s: &str| -> Result<&gguf_file::Value> {
-            ct.metadata
-                .get(s)
-                .ok_or_else(|| anyhow::anyhow!("cannot find {s} in GGUF metadata"))
-        };
-
-        let head_count = md_get("bert.attention.head_count")?
-            .to_u32()
-            .map_err(|e| anyhow::anyhow!("{e}"))? as usize;
-        let block_count = md_get("bert.block_count")?
-            .to_u32()
-            .map_err(|e| anyhow::anyhow!("{e}"))? as usize;
-        let hidden_size = md_get("bert.embedding_length")?
-            .to_u32()
-            .map_err(|e| anyhow::anyhow!("{e}"))? as usize;
-        let layer_norm_eps = md_get("bert.attention.layer_norm_epsilon")
-            .and_then(|v| v.to_f32().map_err(|e| anyhow::anyhow!("{e}")))
-            .unwrap_or(1e-12) as f64;
-
-        let head_dim = hidden_size / head_count;
-
-        // Load token embeddings.
-        let tok_embd = ct
-            .tensor(&mut file, "token_embd.weight", device)
-            .map_err(|e| anyhow::anyhow!("loading token_embd.weight: {e}"))?;
-        let tok_embd_deq = tok_embd
-            .dequantize(device)
-            .map_err(|e| anyhow::anyhow!("dequantizing token_embd: {e}"))?;
-        let tok_embeddings = Embedding::new(tok_embd_deq, hidden_size);
-
-        // Load absolute position embeddings.
-        let pos_embd = ct
-            .tensor(&mut file, "position_embd.weight", device)
-            .map_err(|e| anyhow::anyhow!("loading position_embd.weight: {e}"))?;
-        let pos_embeddings = pos_embd
-            .dequantize(device)
-            .map_err(|e| anyhow::anyhow!("dequantizing position_embd: {e}"))?;
-
-        // Load embedding LayerNorm (post token+position embeddings).
-        let embed_norm =
-            Self::load_layer_norm(&ct, &mut file, "token_embd_norm", layer_norm_eps, device)?;
-
-        // Load transformer layers.
-        let mut layers = Vec::with_capacity(block_count);
-        for idx in 0..block_count {
-            let p = format!("blk.{idx}");
-
-            // Helper: load a quantized weight tensor as QMatMul.
-            macro_rules! load_q {
-                ($name:expr) => {{
-                    let full = format!("{}.{}", p, $name);
-                    let qt = ct
-                        .tensor(&mut file, &full, device)
-                        .map_err(|e| anyhow::anyhow!("loading {full}: {e}"))?;
-                    CandleQMatMul::from_qtensor(qt)
-                        .map_err(|e| anyhow::anyhow!("QMatMul for {full}: {e}"))?
-                }};
-            }
-
-            // Helper: load a bias tensor (dequantized to f32).
-            macro_rules! load_bias {
-                ($name:expr) => {{
-                    let full = format!("{}.{}", p, $name);
-                    ct.tensor(&mut file, &full, device)
-                        .map_err(|e| anyhow::anyhow!("loading {full}: {e}"))?
-                        .dequantize(device)
-                        .map_err(|e| anyhow::anyhow!("dequantizing {full}: {e}"))?
-                }};
-            }
-
-            let attn_output_norm = Self::load_layer_norm(
-                &ct,
-                &mut file,
-                &format!("{p}.attn_output_norm"),
-                layer_norm_eps,
-                device,
-            )?;
-            let layer_output_norm = Self::load_layer_norm(
-                &ct,
-                &mut file,
-                &format!("{p}.layer_output_norm"),
-                layer_norm_eps,
-                device,
-            )?;
-
-            layers.push(BertLayer {
-                attn_q: load_q!("attn_q.weight"),
-                attn_q_bias: load_bias!("attn_q.bias"),
-                attn_k: load_q!("attn_k.weight"),
-                attn_k_bias: load_bias!("attn_k.bias"),
-                attn_v: load_q!("attn_v.weight"),
-                attn_v_bias: load_bias!("attn_v.bias"),
-                attn_output: load_q!("attn_output.weight"),
-                attn_output_bias: load_bias!("attn_output.bias"),
-                attn_output_norm,
-                ffn_up: load_q!("ffn_up.weight"),
-                ffn_up_bias: load_bias!("ffn_up.bias"),
-                ffn_down: load_q!("ffn_down.weight"),
-                ffn_down_bias: load_bias!("ffn_down.bias"),
-                layer_output_norm,
-                n_head: head_count,
-                head_dim,
-            });
-        }
-
-        Ok(EmbedModelVariant::Bert {
-            layers,
-            tok_embeddings,
-            pos_embeddings,
-            embed_norm,
-            hidden_size,
-        })
-    }
-
-    /// Load a LayerNorm with weight and bias from GGUF tensors.
-    fn load_layer_norm(
-        ct: &candle_core::quantized::gguf_file::Content,
-        file: &mut std::fs::File,
-        prefix: &str,
-        eps: f64,
-        device: &Device,
-    ) -> Result<candle_nn::LayerNorm> {
-        let weight_name = format!("{prefix}.weight");
-        let bias_name = format!("{prefix}.bias");
-
-        let weight = ct
-            .tensor(file, &weight_name, device)
-            .map_err(|e| anyhow::anyhow!("loading {weight_name}: {e}"))?
-            .dequantize(device)
-            .map_err(|e| anyhow::anyhow!("dequantizing {weight_name}: {e}"))?;
-        let bias = ct
-            .tensor(file, &bias_name, device)
-            .map_err(|e| anyhow::anyhow!("loading {bias_name}: {e}"))?
-            .dequantize(device)
-            .map_err(|e| anyhow::anyhow!("dequantizing {bias_name}: {e}"))?;
-
-        Ok(candle_nn::LayerNorm::new(weight, bias, eps))
-    }
-
-    /// Run a bidirectional forward pass and return the mean-pooled, truncated,
-    /// L2-normalized embedding.
+    /// Run embedding inference and return the truncated, L2-normalized embedding.
     fn embed_text(&self, text: &str) -> Result<Vec<f32>> {
-        let token_ids = self.tokenizer.encode(text, true)?;
-        if token_ids.is_empty() {
+        // Tokenize using llama.cpp's built-in tokenizer.
+        let tokens = self
+            .model
+            .str_to_token(text, AddBos::Always)
+            .map_err(|e| anyhow::anyhow!("tokenization failed: {e}"))?;
+        if tokens.is_empty() {
             bail!("tokenizer returned empty token sequence");
         }
 
-        let input = Tensor::new(token_ids.as_slice(), &self.device)
-            .map_err(|e| anyhow::anyhow!("creating input tensor: {e}"))?
-            .unsqueeze(0)
-            .map_err(|e| anyhow::anyhow!("unsqueeze: {e}"))?;
-
-        let hidden = match &self.variant {
-            EmbedModelVariant::Gemma {
-                layers,
-                tok_embeddings,
-                norm,
-                embedding_length,
-            } => {
-                // Token embeddings, scaled by sqrt(embedding_length) (Gemma convention).
-                let mut h = tok_embeddings
-                    .forward(&input)
-                    .map_err(|e| anyhow::anyhow!("token embedding forward: {e}"))?;
-                h = (h * (*embedding_length as f64).sqrt())
-                    .map_err(|e| anyhow::anyhow!("scaling embeddings: {e}"))?;
-
-                for layer in layers {
-                    h = layer
-                        .forward(&h)
-                        .map_err(|e| anyhow::anyhow!("layer forward: {e}"))?;
-                }
-
-                norm.forward(&h)
-                    .map_err(|e| anyhow::anyhow!("final norm: {e}"))?
-            }
-            EmbedModelVariant::Bert {
-                layers,
-                tok_embeddings,
-                pos_embeddings,
-                embed_norm,
-                ..
-            } => {
-                // Token embeddings + absolute position embeddings.
-                let seq_len = token_ids.len();
-                let tok_emb = tok_embeddings
-                    .forward(&input)
-                    .map_err(|e| anyhow::anyhow!("token embedding forward: {e}"))?;
-
-                // Slice position embeddings to seq_len: [max_pos, hidden] -> [seq_len, hidden].
-                let pos_emb = pos_embeddings
-                    .narrow(0, 0, seq_len)
-                    .map_err(|e| anyhow::anyhow!("position embedding slice: {e}"))?
-                    .unsqueeze(0)
-                    .map_err(|e| anyhow::anyhow!("position embedding unsqueeze: {e}"))?;
-
-                let mut h =
-                    (tok_emb + pos_emb).map_err(|e| anyhow::anyhow!("embedding addition: {e}"))?;
-                h = embed_norm
-                    .forward(&h)
-                    .map_err(|e| anyhow::anyhow!("embedding norm: {e}"))?;
-
-                for layer in layers {
-                    h = layer
-                        .forward(&h)
-                        .map_err(|e| anyhow::anyhow!("layer forward: {e}"))?;
-                }
+        // Create a context with embeddings enabled (per-call, since LlamaContext is !Send).
+        let ctx_params = LlamaContextParams::default()
+            .with_embeddings(true)
+            .with_n_ctx(std::num::NonZeroU32::new(tokens.len() as u32 + 16));
+        let mut ctx = self
+            .model
+            .new_context(&self.backend, ctx_params)
+            .map_err(|e| anyhow::anyhow!("creating embedding context: {e}"))?;
 
-                // BERT does not have a final norm after the last layer
-                // (the per-layer norms already handle it).
-                h
-            }
-        };
+        // Create batch and add tokens.
+        let mut batch = LlamaBatch::new(tokens.len() + 16, 1);
+        batch
+            .add_sequence(&tokens, 0, false)
+            .map_err(|e| anyhow::anyhow!("adding sequence to batch: {e}"))?;
 
-        // Mean pool across sequence dimension: [1, seq_len, hidden] -> [1, hidden].
-        let seq_len = hidden
-            .dim(1)
-            .map_err(|e| anyhow::anyhow!("getting seq dim: {e}"))?;
-        let pooled = (hidden.sum(1).map_err(|e| anyhow::anyhow!("sum: {e}"))? / (seq_len as f64))
-            .map_err(|e| anyhow::anyhow!("mean div: {e}"))?;
+        // Decode (compute embeddings).
+        ctx.decode(&mut batch)
+            .map_err(|e| anyhow::anyhow!("embedding decode failed: {e}"))?;
 
-        // Squeeze batch dimension: [1, hidden] -> [hidden].
-        let pooled = pooled
-            .squeeze(0)
-            .map_err(|e| anyhow::anyhow!("squeeze: {e}"))?;
+        // Get embeddings for sequence 0 (mean pooled by llama.cpp).
+        let embeddings = ctx
+            .embeddings_seq_ith(0)
+            .map_err(|e| anyhow::anyhow!("getting embeddings: {e}"))?;
 
         // Truncate to target dimensionality.
-        let full_dim = pooled
-            .dim(0)
-            .map_err(|e| anyhow::anyhow!("dim check: {e}"))?;
-        let truncated = if full_dim > self.dim {
-            pooled
-                .narrow(0, 0, self.dim)
-                .map_err(|e| anyhow::anyhow!("truncate: {e}"))?
+        let full_dim = embeddings.len();
+        let truncated: Vec<f32> = if full_dim > self.dim {
+            embeddings[..self.dim].to_vec()
         } else {
-            pooled
+            embeddings.to_vec()
         };
 
         // L2 normalize.
-        let norm_val = truncated
-            .sqr()
-            .map_err(|e| anyhow::anyhow!("sqr: {e}"))?
-            .sum_all()
-            .map_err(|e| anyhow::anyhow!("sum_all: {e}"))?
-            .sqrt()
-            .map_err(|e| anyhow::anyhow!("sqrt: {e}"))?;
-        let norm_scalar: f32 = norm_val
-            .to_scalar()
-            .map_err(|e| anyhow::anyhow!("norm scalar: {e}"))?;
-
-        let normalized = if norm_scalar > 0.0 {
-            (truncated / norm_scalar as f64).map_err(|e| anyhow::anyhow!("normalize: {e}"))?
+        let norm: f32 = truncated.iter().map(|x| x * x).sum::<f32>().sqrt();
+        let normalized = if norm > 0.0 {
+            truncated.iter().map(|x| x / norm).collect()
         } else {
             truncated
         };
 
-        let vec: Vec<f32> = normalized
-            .to_vec1()
-            .map_err(|e| anyhow::anyhow!("to_vec1: {e}"))?;
-        Ok(vec)
+        Ok(normalized)
     }
 }
 
-impl EmbedModel for CandleEmbed {
+impl EmbedModel for LlamaEmbed {
     fn embed_batch(&mut self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
-        // Process texts sequentially — candle quantized ops are single-threaded.
+        // Process texts sequentially — llama.cpp context is per-call.
         // Apply document prompt format for indexing (asymmetric models need this).
         texts
             .iter()
@@ -1539,7 +870,7 @@ fn extract_json_object(text: &str) -> Option<&str> {
     None
 }
 
-// ── CandleOrchestrator — GGUF text generation via candle ─────────────────────
+// ── LlamaOrchestrator — GGUF text generation via llama.cpp ─────────────────────
 
 const ORCHESTRATOR_SYSTEM_PROMPT: &str = r#"You are a search query analyzer. Given a user's search query, classify it and expand it.
 
@@ -1549,33 +880,35 @@ Return JSON with:
 
 Be concise. Only return the JSON object."#;
 
-/// Quantized Qwen3 model for query orchestration and expansion.
+/// Quantized Qwen3 model for query orchestration and expansion via llama.cpp.
 ///
 /// Loads a Qwen3 GGUF model and performs autoregressive generation to classify
 /// queries and produce expansions. Falls back to `heuristic_orchestrate` if
-/// generation or JSON parsing fails.
-pub struct CandleOrchestrator {
-    model: candle_transformers::models::quantized_qwen3::ModelWeights,
+/// generation or JSON parsing fails. Uses Metal acceleration on macOS automatically.
+pub struct LlamaOrchestrator {
+    model: LlamaModel,
+    backend: LlamaBackend,
     tokenizer: tokenizers::Tokenizer,
-    device: Device,
 }
 
-impl std::fmt::Debug for CandleOrchestrator {
+// Safety: LlamaModel and LlamaBackend are Send+Sync. tokenizers::Tokenizer is Send.
+// LlamaContext is created per-call and never stored.
+unsafe impl Send for LlamaOrchestrator {}
+
+impl std::fmt::Debug for LlamaOrchestrator {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("CandleOrchestrator")
-            .field("device", &self.device)
-            .finish()
+        f.debug_struct("LlamaOrchestrator").finish()
     }
 }
 
-impl CandleOrchestrator {
+impl LlamaOrchestrator {
     /// Load a Qwen3 GGUF model for orchestration from `models_dir`.
     ///
     /// Steps:
     /// 1. Resolve model URI (from config override or `ModelDefaults`)
     /// 2. `ensure_model()` to download if needed
     /// 3. Load tokenizer from the model repo (or the non-GGUF base repo)
-    /// 4. Load GGUF via `ModelWeights::from_gguf()`
+    /// 4. Load GGUF model via llama.cpp
     pub fn new(models_dir: &Path, config: &crate::config::Config) -> Result<Self> {
         let defaults = ModelDefaults::default();
         let uri_str = config
@@ -1589,27 +922,18 @@ impl CandleOrchestrator {
         // Orchestrator needs HF tokenizer (for decode + token_to_id).
         let tokenizer = load_hf_tokenizer(&uri, models_dir)?;
 
-        let device = select_device()?;
-
-        let mut file = std::fs::File::open(&model_path)
-            .map_err(|e| anyhow::anyhow!("opening GGUF {}: {e}", model_path.display()))?;
-        let ct = candle_core::quantized::gguf_file::Content::read(&mut file)
-            .map_err(|e| anyhow::anyhow!("reading GGUF {}: {e}", model_path.display()))?;
-        let model = candle_transformers::models::quantized_qwen3::ModelWeights::from_gguf(
-            ct, &mut file, &device,
-        )
-        .map_err(|e| anyhow::anyhow!("loading Qwen3 model weights: {e}"))?;
+        let backend =
+            LlamaBackend::init().map_err(|e| anyhow::anyhow!("initializing llama backend: {e}"))?;
+        let model_params = LlamaModelParams::default();
+        let model = LlamaModel::load_from_file(&backend, &model_path, &model_params)
+            .map_err(|e| anyhow::anyhow!("loading orchestrator model {}: {e}", model_path.display()))?;
 
-        tracing::info!(
-            "loaded CandleOrchestrator from {}, device={:?}",
-            uri_str,
-            device
-        );
+        tracing::info!("loaded LlamaOrchestrator from {}", uri_str);
 
         Ok(Self {
             model,
+            backend,
             tokenizer,
-            device,
         })
     }
 
@@ -1624,80 +948,65 @@ impl CandleOrchestrator {
 
     /// Run autoregressive generation (greedy decode) up to `max_tokens`.
     /// Returns the generated text (excluding the prompt).
-    fn generate(&mut self, prompt: &str, max_tokens: usize) -> Result<String> {
-        self.model.clear_kv_cache();
-
-        let encoding = self
-            .tokenizer
-            .encode(prompt, true)
+    fn generate(&self, prompt: &str, max_tokens: usize) -> Result<String> {
+        // Tokenize using llama.cpp's tokenizer.
+        let tokens = self
+            .model
+            .str_to_token(prompt, AddBos::Always)
             .map_err(|e| anyhow::anyhow!("tokenization failed: {e}"))?;
-        let prompt_tokens = encoding.get_ids();
-        if prompt_tokens.is_empty() {
+        if tokens.is_empty() {
             bail!("tokenizer returned empty token sequence");
         }
 
-        // Determine EOS token ID.
-        let eos_token_id = self
-            .tokenizer
-            .token_to_id("<|im_end|>")
-            .or_else(|| self.tokenizer.token_to_id("<|endoftext|>"))
-            .unwrap_or(151643); // Qwen3 default EOS
-
-        // Process the prompt in a single forward pass.
-        let input = Tensor::new(prompt_tokens, &self.device)?
-            .unsqueeze(0)
-            .map_err(|e| anyhow::anyhow!("unsqueeze prompt: {e}"))?;
-        let logits = self
+        // Create context per-call (LlamaContext is !Send).
+        let n_ctx = (tokens.len() + max_tokens + 16) as u32;
+        let ctx_params =
+            LlamaContextParams::default().with_n_ctx(std::num::NonZeroU32::new(n_ctx));
+        let mut ctx = self
             .model
-            .forward(&input, 0)
-            .map_err(|e| anyhow::anyhow!("forward pass (prompt): {e}"))?;
-
-        // Get the last token's logits and pick argmax.
-        let logits = logits
-            .to_dtype(DType::F32)
-            .map_err(|e| anyhow::anyhow!("logits dtype: {e}"))?;
-        let next_token = logits
-            .i(0)?
-            .argmax(D::Minus1)
-            .map_err(|e| anyhow::anyhow!("argmax: {e}"))?
-            .to_scalar::<u32>()
-            .map_err(|e| anyhow::anyhow!("scalar: {e}"))?;
-
-        let mut generated_tokens: Vec<u32> = vec![next_token];
-        let mut offset = prompt_tokens.len();
-
-        if next_token == eos_token_id {
-            // Model produced EOS immediately.
-            return Ok(String::new());
+            .new_context(&self.backend, ctx_params)
+            .map_err(|e| anyhow::anyhow!("creating orchestrator context: {e}"))?;
+
+        // Process prompt tokens in a batch.
+        let mut batch = LlamaBatch::new(tokens.len() + max_tokens + 16, 1);
+        for (i, token) in tokens.iter().enumerate() {
+            let is_last = i == tokens.len() - 1;
+            batch
+                .add(*token, i as i32, &[0], is_last)
+                .map_err(|e| anyhow::anyhow!("adding prompt token to batch: {e}"))?;
         }
 
-        // Autoregressive loop.
-        for _ in 1..max_tokens {
-            let input = Tensor::new(&[*generated_tokens.last().unwrap()], &self.device)?
-                .unsqueeze(0)
-                .map_err(|e| anyhow::anyhow!("unsqueeze step: {e}"))?;
-            let logits = self
-                .model
-                .forward(&input, offset)
-                .map_err(|e| anyhow::anyhow!("forward pass (step): {e}"))?;
-            offset += 1;
-
-            let logits = logits
-                .to_dtype(DType::F32)
-                .map_err(|e| anyhow::anyhow!("logits dtype: {e}"))?;
-            let token = logits
-                .i(0)?
-                .argmax(D::Minus1)
-                .map_err(|e| anyhow::anyhow!("argmax: {e}"))?
-                .to_scalar::<u32>()
-                .map_err(|e| anyhow::anyhow!("scalar: {e}"))?;
-
-            if token == eos_token_id {
+        ctx.decode(&mut batch)
+            .map_err(|e| anyhow::anyhow!("prompt decode failed: {e}"))?;
+
+        // Autoregressive generation loop.
+        let mut sampler = LlamaSampler::greedy();
+        let mut generated_tokens: Vec<u32> = Vec::new();
+        let mut n_cur = tokens.len();
+
+        for _ in 0..max_tokens {
+            let new_token = sampler.sample(&ctx, batch.n_tokens() - 1);
+            sampler.accept(new_token);
+
+            // Check for end-of-generation.
+            if self.model.is_eog_token(new_token) {
                 break;
             }
-            generated_tokens.push(token);
+
+            generated_tokens.push(new_token.0 as u32);
+
+            // Add token to batch for next iteration.
+            batch.clear();
+            batch
+                .add(new_token, n_cur as i32, &[0], true)
+                .map_err(|e| anyhow::anyhow!("adding generated token to batch: {e}"))?;
+            n_cur += 1;
+
+            ctx.decode(&mut batch)
+                .map_err(|e| anyhow::anyhow!("generation decode failed: {e}"))?;
         }
 
+        // Decode generated tokens back to text using HF tokenizer.
         let text = self
             .tokenizer
             .decode(&generated_tokens, true)
@@ -1706,7 +1015,7 @@ impl CandleOrchestrator {
     }
 }
 
-impl OrchestratorModel for CandleOrchestrator {
+impl OrchestratorModel for LlamaOrchestrator {
     fn orchestrate(&mut self, query: &str) -> Result<OrchestrationResult> {
         let prompt = Self::format_prompt(query);
 
@@ -1728,7 +1037,7 @@ impl OrchestratorModel for CandleOrchestrator {
     }
 }
 
-// ── CandleRerank — GGUF cross-encoder reranker via candle ─────────────────────
+// ── LlamaRerank — GGUF cross-encoder reranker via llama.cpp ─────────────────────
 
 /// Format query+document for cross-encoder reranking.
 pub fn format_reranker_input(query: &str, document: &str) -> String {
@@ -1740,39 +1049,40 @@ pub fn format_reranker_input(query: &str, document: &str) -> String {
     )
 }
 
-/// Quantized Qwen3 cross-encoder for reranking search results.
+/// Quantized Qwen3 cross-encoder for reranking search results via llama.cpp.
 ///
 /// Loads a Qwen3-Reranker GGUF model and scores (query, document) pairs by
 /// running a single forward pass and extracting Yes/No logit probabilities.
-/// Unlike `CandleOrchestrator`, this does NOT do autoregressive generation —
+/// Unlike `LlamaOrchestrator`, this does NOT do autoregressive generation —
 /// just one pass through the full input to get logits at the last position.
-pub struct CandleRerank {
-    model: candle_transformers::models::quantized_qwen3::ModelWeights,
-    tokenizer: tokenizers::Tokenizer,
-    device: Device,
-    yes_token_id: u32,
-    no_token_id: u32,
+pub struct LlamaRerank {
+    model: LlamaModel,
+    backend: LlamaBackend,
+    yes_token_id: i32,
+    no_token_id: i32,
 }
 
-impl std::fmt::Debug for CandleRerank {
+// Safety: LlamaModel and LlamaBackend are Send+Sync.
+// LlamaContext is created per-call and never stored.
+unsafe impl Send for LlamaRerank {}
+
+impl std::fmt::Debug for LlamaRerank {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("CandleRerank")
-            .field("device", &self.device)
+        f.debug_struct("LlamaRerank")
             .field("yes_token_id", &self.yes_token_id)
             .field("no_token_id", &self.no_token_id)
             .finish()
     }
 }
 
-impl CandleRerank {
+impl LlamaRerank {
     /// Load a Qwen3-Reranker GGUF model from `models_dir`.
     ///
     /// Steps:
     /// 1. Resolve model URI (from config override or `ModelDefaults::default().rerank_uri`)
     /// 2. `ensure_model()` to download if needed
-    /// 3. Load tokenizer from the model repo (or the non-GGUF base repo)
-    /// 4. Load GGUF via `ModelWeights::from_gguf()`
-    /// 5. Look up "Yes" and "No" token IDs from the tokenizer
+    /// 3. Load tokenizer from the model repo to look up Yes/No token IDs
+    /// 4. Load GGUF model via llama.cpp
     pub fn new(models_dir: &Path, config: &crate::config::Config) -> Result<Self> {
         let defaults = ModelDefaults::default();
         let uri_str = config
@@ -1783,93 +1093,82 @@ impl CandleRerank {
         let uri = HfModelUri::parse(uri_str)?;
         let model_path = ensure_model(&uri, models_dir)?;
 
-        // Reranker needs HF tokenizer (for token_to_id).
-        let tokenizer = load_hf_tokenizer(&uri, models_dir)?;
+        // Reranker needs HF tokenizer to look up Yes/No token IDs.
+        let hf_tokenizer = load_hf_tokenizer(&uri, models_dir)?;
 
-        // Look up Yes/No token IDs.
-        let yes_token_id = tokenizer
+        let yes_token_id = hf_tokenizer
             .token_to_id("Yes")
-            .ok_or_else(|| anyhow::anyhow!("tokenizer has no 'Yes' token"))?;
-        let no_token_id = tokenizer
+            .ok_or_else(|| anyhow::anyhow!("tokenizer has no 'Yes' token"))?
+            as i32;
+        let no_token_id = hf_tokenizer
             .token_to_id("No")
-            .ok_or_else(|| anyhow::anyhow!("tokenizer has no 'No' token"))?;
+            .ok_or_else(|| anyhow::anyhow!("tokenizer has no 'No' token"))?
+            as i32;
 
-        let device = select_device()?;
-
-        // Load GGUF model.
-        let mut file = std::fs::File::open(&model_path)
-            .map_err(|e| anyhow::anyhow!("opening GGUF {}: {e}", model_path.display()))?;
-        let ct = candle_core::quantized::gguf_file::Content::read(&mut file)
-            .map_err(|e| anyhow::anyhow!("reading GGUF {}: {e}", model_path.display()))?;
-        let model = candle_transformers::models::quantized_qwen3::ModelWeights::from_gguf(
-            ct, &mut file, &device,
-        )
-        .map_err(|e| anyhow::anyhow!("loading Qwen3 reranker model weights: {e}"))?;
+        let backend =
+            LlamaBackend::init().map_err(|e| anyhow::anyhow!("initializing llama backend: {e}"))?;
+        let model_params = LlamaModelParams::default();
+        let model = LlamaModel::load_from_file(&backend, &model_path, &model_params)
+            .map_err(|e| anyhow::anyhow!("loading reranker model {}: {e}", model_path.display()))?;
 
         tracing::info!(
-            "loaded CandleRerank from {}, device={:?}, yes_id={}, no_id={}",
+            "loaded LlamaRerank from {}, yes_id={}, no_id={}",
             uri_str,
-            device,
             yes_token_id,
             no_token_id
         );
 
         Ok(Self {
             model,
-            tokenizer,
-            device,
+            backend,
             yes_token_id,
             no_token_id,
         })
     }
 }
 
-impl RerankModel for CandleRerank {
+impl RerankModel for LlamaRerank {
     fn rerank_score(&mut self, query: &str, document: &str) -> Result<f32> {
-        self.model.clear_kv_cache();
-
         let input_text = format_reranker_input(query, document);
 
-        let encoding = self
-            .tokenizer
-            .encode(input_text.as_str(), true)
+        // Tokenize using llama.cpp's built-in tokenizer.
+        let tokens = self
+            .model
+            .str_to_token(&input_text, AddBos::Always)
             .map_err(|e| anyhow::anyhow!("tokenization failed: {e}"))?;
-        let token_ids = encoding.get_ids();
-        if token_ids.is_empty() {
+        if tokens.is_empty() {
             bail!("tokenizer returned empty token sequence");
         }
 
-        // Single forward pass through the full input (no autoregressive generation).
-        let input = Tensor::new(token_ids, &self.device)?
-            .unsqueeze(0)
-            .map_err(|e| anyhow::anyhow!("unsqueeze input: {e}"))?;
-        let logits = self
+        // Create context per-call (LlamaContext is !Send).
+        let n_ctx = (tokens.len() + 16) as u32;
+        let ctx_params =
+            LlamaContextParams::default().with_n_ctx(std::num::NonZeroU32::new(n_ctx));
+        let mut ctx = self
             .model
-            .forward(&input, 0)
-            .map_err(|e| anyhow::anyhow!("forward pass: {e}"))?;
-
-        // logits shape: [1, seq_len, vocab_size] or [1, vocab_size] (last position).
-        // Extract logits for the last position.
-        let logits = logits
-            .to_dtype(DType::F32)
-            .map_err(|e| anyhow::anyhow!("logits dtype: {e}"))?;
-        let last_logits = logits
-            .i(0)
-            .map_err(|e| anyhow::anyhow!("batch index: {e}"))?;
-
-        // Extract Yes/No logits.
-        let yes_logit: f32 = last_logits
-            .i(self.yes_token_id as usize)
-            .map_err(|e| anyhow::anyhow!("yes logit index: {e}"))?
-            .to_scalar()
-            .map_err(|e| anyhow::anyhow!("yes logit scalar: {e}"))?;
-        let no_logit: f32 = last_logits
-            .i(self.no_token_id as usize)
-            .map_err(|e| anyhow::anyhow!("no logit index: {e}"))?
-            .to_scalar()
-            .map_err(|e| anyhow::anyhow!("no logit scalar: {e}"))?;
-
-        // Softmax over Yes/No to get probability.
+            .new_context(&self.backend, ctx_params)
+            .map_err(|e| anyhow::anyhow!("creating reranker context: {e}"))?;
+
+        // Create batch with all tokens; mark last as logit-producing.
+        let mut batch = LlamaBatch::new(tokens.len() + 16, 1);
+        for (i, token) in tokens.iter().enumerate() {
+            let is_last = i == tokens.len() - 1;
+            batch
+                .add(*token, i as i32, &[0], is_last)
+                .map_err(|e| anyhow::anyhow!("adding token to reranker batch: {e}"))?;
+        }
+
+        // Single forward pass through the full input.
+        ctx.decode(&mut batch)
+            .map_err(|e| anyhow::anyhow!("reranker decode failed: {e}"))?;
+
+        // Get logits for the last token position.
+        let logits = ctx.get_logits_ith(batch.n_tokens() - 1);
+
+        // Extract Yes/No logits and compute softmax probability.
+        let yes_logit = logits[self.yes_token_id as usize];
+        let no_logit = logits[self.no_token_id as usize];
+
         let max_logit = yes_logit.max(no_logit);
         let yes_exp = (yes_logit - max_logit).exp();
         let no_exp = (no_logit - max_logit).exp();
@@ -2002,15 +1301,15 @@ mod tests {
         );
     }
 
-    // ── CandleEmbed / PromptFormat tests ────────────────────────────────────
+    // ── LlamaEmbed / PromptFormat tests ────────────────────────────────────
 
     #[test]
-    fn test_candle_embed_struct_exists() {
+    fn test_llama_embed_struct_exists() {
         fn assert_embed_model<E: EmbedModel>(_e: &E) {}
         let mock = MockLlm::new(256);
         assert_embed_model(&mock);
-        // CandleEmbed also implements EmbedModel — verified at compile time.
-        // We can't instantiate CandleEmbed without a real GGUF model,
+        // LlamaEmbed also implements EmbedModel — verified at compile time.
+        // We can't instantiate LlamaEmbed without a real GGUF model,
         // but the trait bound compiles.
     }
 
@@ -2061,15 +1360,6 @@ mod tests {
         assert_eq!(formatted, "Title\nBody");
     }
 
-    #[test]
-    fn test_select_device_returns_cpu_by_default() {
-        // Without the `metal` feature, select_device should return CPU.
-        let device = select_device().unwrap();
-        // On CI/test without metal feature, this should be CPU.
-        // With metal feature on macOS, it could be Metal — both are valid.
-        let _ = device; // Just verify it doesn't error.
-    }
-
     // ── heuristic_orchestrate tests ──────────────────────────────────────────
 
     #[test]
@@ -2170,11 +1460,11 @@ mod tests {
         assert!(parse_orchestration_json(json).is_err());
     }
 
-    // ── CandleOrchestrator tests ─────────────────────────────────────────────
+    // ── LlamaOrchestrator tests ─────────────────────────────────────────────
 
     #[test]
-    fn test_candle_orchestrator_format_prompt() {
-        let prompt = CandleOrchestrator::format_prompt("how does auth work");
+    fn test_llama_orchestrator_format_prompt() {
+        let prompt = LlamaOrchestrator::format_prompt("how does auth work");
         assert!(prompt.contains("<|im_start|>system"));
         assert!(prompt.contains("<|im_end|>"));
         assert!(prompt.contains("<|im_start|>user"));
@@ -2183,13 +1473,13 @@ mod tests {
     }
 
     #[test]
-    fn test_candle_orchestrator_implements_trait() {
-        // Compile-time check: CandleOrchestrator implements OrchestratorModel.
+    fn test_llama_orchestrator_implements_trait() {
+        // Compile-time check: LlamaOrchestrator implements OrchestratorModel.
         fn assert_orchestrator<O: OrchestratorModel>() {}
-        assert_orchestrator::<CandleOrchestrator>();
+        assert_orchestrator::<LlamaOrchestrator>();
     }
 
-    // ── CandleRerank tests ──────────────────────────────────────────────────
+    // ── LlamaRerank tests ──────────────────────────────────────────────────
 
     #[test]
     fn test_format_reranker_input() {
@@ -2200,7 +1490,7 @@ mod tests {
     }
 
     #[test]
-    fn test_candle_rerank_trait_compliance() {
+    fn test_llama_rerank_trait_compliance() {
         // Verify MockLlm still satisfies RerankModel.
         fn assert_rerank<R: RerankModel>(_r: &R) {}
         let mock = MockLlm::new(256);
diff --git a/src/main.rs b/src/main.rs
index 7d2872b..10a20f3 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -829,7 +829,7 @@ async fn main() -> Result<()> {
                 }
                 ContextAction::Topic { query, budget } => {
                     let models_dir = data_dir.join("models");
-                    let mut embedder = engraph::llm::CandleEmbed::new(&models_dir, &cfg)?;
+                    let mut embedder = engraph::llm::LlamaEmbed::new(&models_dir, &cfg)?;
 
                     let bundle = engraph::context::context_topic_with_search(
                         &params,
@@ -882,7 +882,7 @@ async fn main() -> Result<()> {
                 .ok_or_else(|| anyhow::anyhow!("No vault path in index."))?;
             let vault_path = PathBuf::from(&vault_path_str);
             let models_dir = data_dir.join("models");
-            let mut embedder = engraph::llm::CandleEmbed::new(&models_dir, &cfg)?;
+            let mut embedder = engraph::llm::LlamaEmbed::new(&models_dir, &cfg)?;
             let profile = config::Config::load_vault_profile().ok().flatten();
 
             match action {
diff --git a/src/search.rs b/src/search.rs
index 336646e..cdcf75b 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -317,7 +317,7 @@ pub fn run_search(
 ) -> Result<()> {
     let models_dir = data_dir.join("models");
     let mut embedder =
-        crate::llm::CandleEmbed::new(&models_dir, config).context("loading embedder")?;
+        crate::llm::LlamaEmbed::new(&models_dir, config).context("loading embedder")?;
 
     let db_path = data_dir.join("engraph.db");
     let store = Store::open(&db_path).context("opening store")?;
diff --git a/src/serve.rs b/src/serve.rs
index 6559109..6a91346 100644
--- a/src/serve.rs
+++ b/src/serve.rs
@@ -437,7 +437,7 @@ pub async fn run_serve(data_dir: &Path) -> Result<()> {
 
     let store = Store::open(&db_path)?;
     let config = Config::load()?;
-    let embedder = crate::llm::CandleEmbed::new(&models_dir, &config)?;
+    let embedder = crate::llm::LlamaEmbed::new(&models_dir, &config)?;
 
     let vault_path_str = store.get_meta("vault_path")?.ok_or_else(|| {
         anyhow::anyhow!("No vault path in index. Run 'engraph index <path>' first.")
@@ -462,7 +462,7 @@ pub async fn run_serve(data_dir: &Path) -> Result<()> {
     // Load intelligence models if enabled
     let orchestrator: Option<Arc<Mutex<Box<dyn OrchestratorModel + Send>>>> =
         if config.intelligence_enabled() {
-            match crate::llm::CandleOrchestrator::new(&models_dir, &config) {
+            match crate::llm::LlamaOrchestrator::new(&models_dir, &config) {
                 Ok(orch) => Some(Arc::new(Mutex::new(
                     Box::new(orch) as Box<dyn OrchestratorModel + Send>
                 ))),
@@ -477,7 +477,7 @@ pub async fn run_serve(data_dir: &Path) -> Result<()> {
 
     let reranker: Option<Arc<Mutex<Box<dyn RerankModel + Send>>>> = if config.intelligence_enabled()
     {
-        match crate::llm::CandleRerank::new(&models_dir, &config) {
+        match crate::llm::LlamaRerank::new(&models_dir, &config) {
             Ok(rerank) => Some(Arc::new(Mutex::new(
                 Box::new(rerank) as Box<dyn RerankModel + Send>
             ))),

From 3db1ae5acc84230b43a7b9cd561529cd981897f7 Mon Sep 17 00:00:00 2001
From: Oleksandr Ostrovskyi <devwhodevs@gmail.com>
Date: Wed, 25 Mar 2026 23:56:48 +0200
Subject: [PATCH 11/17] feat(llm): switch to llama.cpp backend, fix embedding
 params

Replace candle with llama-cpp-2 for all ML inference. Gets Metal GPU
acceleration (88 files in 70s vs 37+ min on CPU).

Fixes: use encode() not decode() for embeddings, set n_ubatch >= n_tokens,
use AddBos::Never (PromptFormat already adds <bos>), force CPU device
for quantized ops (candle Metal unsupported).

Keeps BERT GGUF support code for fallback. Default: embeddinggemma-300M.
---
 src/llm.rs | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/llm.rs b/src/llm.rs
index b048172..2c7c812 100644
--- a/src/llm.rs
+++ b/src/llm.rs
@@ -679,32 +679,38 @@ impl LlamaEmbed {
     /// Run embedding inference and return the truncated, L2-normalized embedding.
     fn embed_text(&self, text: &str) -> Result<Vec<f32>> {
         // Tokenize using llama.cpp's built-in tokenizer.
+        // Use AddBos::Never because PromptFormat already adds <bos> for embeddinggemma.
         let tokens = self
             .model
-            .str_to_token(text, AddBos::Always)
+            .str_to_token(text, AddBos::Never)
             .map_err(|e| anyhow::anyhow!("tokenization failed: {e}"))?;
         if tokens.is_empty() {
             bail!("tokenizer returned empty token sequence");
         }
 
         // Create a context with embeddings enabled (per-call, since LlamaContext is !Send).
+        // n_ubatch must be >= n_tokens for the encoder, and n_ctx must fit all tokens.
+        let n_tokens = tokens.len() as u32;
+        let n_ctx = std::num::NonZeroU32::new(n_tokens.max(64) + 16);
         let ctx_params = LlamaContextParams::default()
             .with_embeddings(true)
-            .with_n_ctx(std::num::NonZeroU32::new(tokens.len() as u32 + 16));
+            .with_n_ctx(n_ctx)
+            .with_n_ubatch(n_tokens.max(512))
+            .with_n_batch(n_tokens.max(512));
         let mut ctx = self
             .model
             .new_context(&self.backend, ctx_params)
             .map_err(|e| anyhow::anyhow!("creating embedding context: {e}"))?;
 
-        // Create batch and add tokens.
+        // Create batch and add tokens — mark all as outputs for embedding.
         let mut batch = LlamaBatch::new(tokens.len() + 16, 1);
         batch
-            .add_sequence(&tokens, 0, false)
+            .add_sequence(&tokens, 0, true)
             .map_err(|e| anyhow::anyhow!("adding sequence to batch: {e}"))?;
 
-        // Decode (compute embeddings).
-        ctx.decode(&mut batch)
-            .map_err(|e| anyhow::anyhow!("embedding decode failed: {e}"))?;
+        // Encode (compute embeddings). Use encode() for embedding models.
+        ctx.encode(&mut batch)
+            .map_err(|e| anyhow::anyhow!("embedding encode failed: {e}"))?;
 
         // Get embeddings for sequence 0 (mean pooled by llama.cpp).
         let embeddings = ctx

From 8c4f1928b0d95841f33d93fbf5dbc61869e57814 Mon Sep 17 00:00:00 2001
From: Oleksandr Ostrovskyi <devwhodevs@gmail.com>
Date: Thu, 26 Mar 2026 00:00:30 +0200
Subject: [PATCH 12/17] style: cargo fmt

---
 src/indexer.rs |  2 +-
 src/llm.rs     | 18 +++++++-----------
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/src/indexer.rs b/src/indexer.rs
index 501b6da..774fc98 100644
--- a/src/indexer.rs
+++ b/src/indexer.rs
@@ -4,8 +4,8 @@ use std::time::{Duration, Instant};
 
 use anyhow::{Context, Result, anyhow};
 use ignore::WalkBuilder;
-use sha2::{Digest, Sha256};
 use indicatif::{ProgressBar, ProgressStyle};
+use sha2::{Digest, Sha256};
 use tracing::info;
 
 use crate::chunker::{chunk_markdown, split_oversized_chunks};
diff --git a/src/llm.rs b/src/llm.rs
index 2c7c812..c73c329 100644
--- a/src/llm.rs
+++ b/src/llm.rs
@@ -661,11 +661,7 @@ impl LlamaEmbed {
         let model = LlamaModel::load_from_file(&backend, &model_path, &model_params)
             .map_err(|e| anyhow::anyhow!("loading GGUF model {}: {e}", model_path.display()))?;
 
-        tracing::info!(
-            "loaded LlamaEmbed from {}, target_dim={}",
-            uri_str,
-            dim
-        );
+        tracing::info!("loaded LlamaEmbed from {}, target_dim={}", uri_str, dim);
 
         Ok(Self {
             model,
@@ -931,8 +927,10 @@ impl LlamaOrchestrator {
         let backend =
             LlamaBackend::init().map_err(|e| anyhow::anyhow!("initializing llama backend: {e}"))?;
         let model_params = LlamaModelParams::default();
-        let model = LlamaModel::load_from_file(&backend, &model_path, &model_params)
-            .map_err(|e| anyhow::anyhow!("loading orchestrator model {}: {e}", model_path.display()))?;
+        let model =
+            LlamaModel::load_from_file(&backend, &model_path, &model_params).map_err(|e| {
+                anyhow::anyhow!("loading orchestrator model {}: {e}", model_path.display())
+            })?;
 
         tracing::info!("loaded LlamaOrchestrator from {}", uri_str);
 
@@ -966,8 +964,7 @@ impl LlamaOrchestrator {
 
         // Create context per-call (LlamaContext is !Send).
         let n_ctx = (tokens.len() + max_tokens + 16) as u32;
-        let ctx_params =
-            LlamaContextParams::default().with_n_ctx(std::num::NonZeroU32::new(n_ctx));
+        let ctx_params = LlamaContextParams::default().with_n_ctx(std::num::NonZeroU32::new(n_ctx));
         let mut ctx = self
             .model
             .new_context(&self.backend, ctx_params)
@@ -1148,8 +1145,7 @@ impl RerankModel for LlamaRerank {
 
         // Create context per-call (LlamaContext is !Send).
         let n_ctx = (tokens.len() + 16) as u32;
-        let ctx_params =
-            LlamaContextParams::default().with_n_ctx(std::num::NonZeroU32::new(n_ctx));
+        let ctx_params = LlamaContextParams::default().with_n_ctx(std::num::NonZeroU32::new(n_ctx));
         let mut ctx = self
             .model
             .new_context(&self.backend, ctx_params)

From f55dd855ffe1d04fd9b15e396462067043cbeee9 Mon Sep 17 00:00:00 2001
From: Oleksandr Ostrovskyi <devwhodevs@gmail.com>
Date: Thu, 26 Mar 2026 00:01:15 +0200
Subject: [PATCH 13/17] ci: install CMake on Ubuntu for llama.cpp build

---
 .github/workflows/ci.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ce4818b..c5f4d07 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -14,6 +14,9 @@ jobs:
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
+      - name: Install CMake (Ubuntu)
+        if: runner.os == 'Linux'
+        run: sudo apt-get update && sudo apt-get install -y cmake
       - uses: dtolnay/rust-toolchain@stable
         with:
           components: rustfmt, clippy

From 55f67ad9b0820639616894fe99758559dc37921d Mon Sep 17 00:00:00 2001
From: Oleksandr Ostrovskyi <devwhodevs@gmail.com>
Date: Thu, 26 Mar 2026 00:02:02 +0200
Subject: [PATCH 14/17] feat(search): wire intelligence models into CLI search
 path

run_search now loads orchestrator + reranker when intelligence is
enabled and calls search_with_intelligence instead of search_internal.
---
 src/search.rs | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/src/search.rs b/src/search.rs
index cdcf75b..006c68d 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -322,7 +322,44 @@ pub fn run_search(
     let db_path = data_dir.join("engraph.db");
     let store = Store::open(&db_path).context("opening store")?;
 
-    let output = search_internal(query, top_n, &store, &mut embedder)?;
+    // Load intelligence models if enabled.
+    let mut orchestrator_model: Option<Box<dyn llm::OrchestratorModel>> =
+        if config.intelligence_enabled() {
+            match crate::llm::LlamaOrchestrator::new(&models_dir, config) {
+                Ok(o) => Some(Box::new(o)),
+                Err(e) => {
+                    tracing::warn!("failed to load orchestrator: {e}");
+                    None
+                }
+            }
+        } else {
+            None
+        };
+    let mut reranker_model: Option<Box<dyn llm::RerankModel>> = if config.intelligence_enabled() {
+        match crate::llm::LlamaRerank::new(&models_dir, config) {
+            Ok(r) => Some(Box::new(r)),
+            Err(e) => {
+                tracing::warn!("failed to load reranker: {e}");
+                None
+            }
+        }
+    } else {
+        None
+    };
+
+    let output = {
+        let mut search_config = SearchConfig {
+            orchestrator: orchestrator_model
+                .as_mut()
+                .map(|o| o.as_mut() as &mut dyn llm::OrchestratorModel),
+            reranker: reranker_model
+                .as_mut()
+                .map(|r| r.as_mut() as &mut dyn llm::RerankModel),
+            store: &store,
+            rerank_candidates: 30,
+        };
+        search_with_intelligence(query, top_n, &mut embedder, &mut search_config)?
+    };
 
     let results: Vec<SearchResult> = output
         .results

From 5c7f2857ff0a090d4ffd73e22014b8f25865c4a7 Mon Sep 17 00:00:00 2001
From: Oleksandr Ostrovskyi <devwhodevs@gmail.com>
Date: Thu, 26 Mar 2026 00:34:46 +0200
Subject: [PATCH 15/17] fix: singleton LlamaBackend and built-in tokenizer for
 orchestrator/reranker

Bug 1: LlamaBackend::init() fails with BackendAlreadyInitialized if called
more than once. Add a module-level llama_backend() function using OnceLock +
a Mutex-guarded double-checked init (OnceLock::get_or_try_init is not yet
stabilized, so it cannot be used on stable Rust). Remove the backend field
from LlamaEmbed, LlamaOrchestrator, and LlamaRerank; all three now share the
single static backend.

Bug 2: LlamaOrchestrator and LlamaRerank were loading an external
tokenizer.json via load_hf_tokenizer(), which does not exist in Qwen3 GGUF
repos. Switch both to llama.cpp's built-in tokenizer: str_to_token() for
encoding, token_to_piece() for decoding, and str_to_token("Yes"/"No") for
Yes/No token ID lookup. Remove the tokenizer field from both structs and
drop the load_hf_tokenizer() helper. Add encoding_rs as a direct dependency
(required by token_to_piece's Decoder parameter; was already a transitive dep).

All 270 unit tests pass, clippy clean, fmt clean.
---
 Cargo.lock |   1 +
 Cargo.toml |   1 +
 src/llm.rs | 148 +++++++++++++++++++++++++++++------------------------
 3 files changed, 82 insertions(+), 68 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 655637d..22e8525 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -621,6 +621,7 @@ dependencies = [
  "anyhow",
  "clap",
  "dirs",
+ "encoding_rs",
  "ignore",
  "indicatif",
  "llama-cpp-2",
diff --git a/Cargo.toml b/Cargo.toml
index 9549eba..6733237 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -35,6 +35,7 @@ tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
 notify = "7.0"
 notify-debouncer-full = "0.4"
 llama-cpp-2 = "0.1"
+encoding_rs = "0.8"
 shimmytok = "0.7"
 
 [features]
diff --git a/src/llm.rs b/src/llm.rs
index c73c329..30d913a 100644
--- a/src/llm.rs
+++ b/src/llm.rs
@@ -1,5 +1,6 @@
 use std::io::Read;
 use std::path::{Path, PathBuf};
+use std::sync::{Mutex, OnceLock};
 
 use anyhow::{Context as _, Result, bail};
 use indicatif::{ProgressBar, ProgressStyle};
@@ -12,6 +13,26 @@ use llama_cpp_2::model::params::LlamaModelParams;
 use llama_cpp_2::model::{AddBos, LlamaModel};
 use llama_cpp_2::sampling::LlamaSampler;
 
+static BACKEND: OnceLock<LlamaBackend> = OnceLock::new();
+/// Mutex used only during the first initialization of `BACKEND`.
+static BACKEND_INIT: Mutex<()> = Mutex::new(());
+
+/// Get or initialize the global llama.cpp backend.
+/// Safe to call from multiple places — the backend is initialized at most once.
+pub fn llama_backend() -> Result<&'static LlamaBackend> {
+    if let Some(b) = BACKEND.get() {
+        return Ok(b);
+    }
+    let _guard = BACKEND_INIT.lock().unwrap();
+    // Double-checked: another thread may have initialized while we waited.
+    if let Some(b) = BACKEND.get() {
+        return Ok(b);
+    }
+    let backend =
+        LlamaBackend::init().map_err(|e| anyhow::anyhow!("initializing llama backend: {e}"))?;
+    Ok(BACKEND.get_or_init(|| backend))
+}
+
 // ── Prompt format ────────────────────────────────────────────────────────────
 
 /// Model-family-specific prompt templates for embedding models.
@@ -509,18 +530,6 @@ fn load_tokenizer_for_model(uri: &HfModelUri, models_dir: &Path) -> Result<FlexT
     )
 }
 
-/// Load tokenizer as HuggingFace `tokenizers::Tokenizer` specifically.
-/// Used by LlamaOrchestrator and LlamaRerank which need decode/token_to_id.
-fn load_hf_tokenizer(uri: &HfModelUri, models_dir: &Path) -> Result<tokenizers::Tokenizer> {
-    try_external_tokenizer(uri, models_dir).ok_or_else(|| {
-        anyhow::anyhow!(
-            "could not find tokenizer.json for model '{}'. \
-             Orchestrator/reranker models require tokenizer.json (not GGUF-embedded).",
-            uri.repo
-        )
-    })
-}
-
 /// Try downloading tokenizer.json from candidate HuggingFace repos.
 fn try_external_tokenizer(uri: &HfModelUri, models_dir: &Path) -> Option<tokenizers::Tokenizer> {
     let mut candidates: Vec<String> = vec![uri.repo.clone()];
@@ -603,16 +612,16 @@ impl Default for ModelDefaults {
 /// normalization. Supports Metal acceleration on macOS automatically.
 ///
 /// `LlamaModel` is `Send + Sync`, so this struct is `Send`. `LlamaContext` is
-/// `!Send`, so we create it per-call from the stored model and backend.
+/// `!Send`, so we create it per-call. The global `LlamaBackend` is referenced
+/// via `llama_backend()` — no need to store it per-struct.
 pub struct LlamaEmbed {
     model: LlamaModel,
-    backend: LlamaBackend,
     tokenizer: FlexTokenizer,
     dim: usize,
     prompt_format: PromptFormat,
 }
 
-// Safety: LlamaModel is Send+Sync per llama-cpp-2 docs. LlamaBackend is Send+Sync.
+// Safety: LlamaModel is Send+Sync per llama-cpp-2 docs.
 // FlexTokenizer contains only Send types (tokenizers::Tokenizer is Send, shimmytok::Tokenizer is Send).
 // We never store a LlamaContext (which is !Send) — it is created per-call.
 unsafe impl Send for LlamaEmbed {}
@@ -654,18 +663,16 @@ impl LlamaEmbed {
         // Target output dimensionality.
         let dim = defaults.embed_dim;
 
-        // Initialize llama.cpp backend and load model.
-        let backend =
-            LlamaBackend::init().map_err(|e| anyhow::anyhow!("initializing llama backend: {e}"))?;
+        // Get or initialize the global llama.cpp backend, then load model.
+        let backend = llama_backend()?;
         let model_params = LlamaModelParams::default();
-        let model = LlamaModel::load_from_file(&backend, &model_path, &model_params)
+        let model = LlamaModel::load_from_file(backend, &model_path, &model_params)
             .map_err(|e| anyhow::anyhow!("loading GGUF model {}: {e}", model_path.display()))?;
 
         tracing::info!("loaded LlamaEmbed from {}, target_dim={}", uri_str, dim);
 
         Ok(Self {
             model,
-            backend,
             tokenizer,
             dim,
             prompt_format,
@@ -695,7 +702,7 @@ impl LlamaEmbed {
             .with_n_batch(n_tokens.max(512));
         let mut ctx = self
             .model
-            .new_context(&self.backend, ctx_params)
+            .new_context(llama_backend()?, ctx_params)
             .map_err(|e| anyhow::anyhow!("creating embedding context: {e}"))?;
 
         // Create batch and add tokens — mark all as outputs for embedding.
@@ -887,13 +894,15 @@ Be concise. Only return the JSON object."#;
 /// Loads a Qwen3 GGUF model and performs autoregressive generation to classify
 /// queries and produce expansions. Falls back to `heuristic_orchestrate` if
 /// generation or JSON parsing fails. Uses Metal acceleration on macOS automatically.
+///
+/// Uses llama.cpp's built-in tokenizer for both encoding and decoding — no
+/// external tokenizer.json required. The global `LlamaBackend` is used via
+/// `llama_backend()`.
 pub struct LlamaOrchestrator {
     model: LlamaModel,
-    backend: LlamaBackend,
-    tokenizer: tokenizers::Tokenizer,
 }
 
-// Safety: LlamaModel and LlamaBackend are Send+Sync. tokenizers::Tokenizer is Send.
+// Safety: LlamaModel is Send+Sync per llama-cpp-2 docs.
 // LlamaContext is created per-call and never stored.
 unsafe impl Send for LlamaOrchestrator {}
 
@@ -909,8 +918,7 @@ impl LlamaOrchestrator {
     /// Steps:
     /// 1. Resolve model URI (from config override or `ModelDefaults`)
     /// 2. `ensure_model()` to download if needed
-    /// 3. Load tokenizer from the model repo (or the non-GGUF base repo)
-    /// 4. Load GGUF model via llama.cpp
+    /// 3. Load GGUF model via llama.cpp (uses built-in tokenizer — no tokenizer.json needed)
     pub fn new(models_dir: &Path, config: &crate::config::Config) -> Result<Self> {
         let defaults = ModelDefaults::default();
         let uri_str = config
@@ -921,24 +929,17 @@ impl LlamaOrchestrator {
         let uri = HfModelUri::parse(uri_str)?;
         let model_path = ensure_model(&uri, models_dir)?;
 
-        // Orchestrator needs HF tokenizer (for decode + token_to_id).
-        let tokenizer = load_hf_tokenizer(&uri, models_dir)?;
-
-        let backend =
-            LlamaBackend::init().map_err(|e| anyhow::anyhow!("initializing llama backend: {e}"))?;
+        // Use global backend and llama.cpp's built-in tokenizer (no tokenizer.json required).
+        let backend = llama_backend()?;
         let model_params = LlamaModelParams::default();
         let model =
-            LlamaModel::load_from_file(&backend, &model_path, &model_params).map_err(|e| {
+            LlamaModel::load_from_file(backend, &model_path, &model_params).map_err(|e| {
                 anyhow::anyhow!("loading orchestrator model {}: {e}", model_path.display())
             })?;
 
         tracing::info!("loaded LlamaOrchestrator from {}", uri_str);
 
-        Ok(Self {
-            model,
-            backend,
-            tokenizer,
-        })
+        Ok(Self { model })
     }
 
     /// Format a chat prompt in Qwen3 ChatML format.
@@ -953,7 +954,7 @@ impl LlamaOrchestrator {
     /// Run autoregressive generation (greedy decode) up to `max_tokens`.
     /// Returns the generated text (excluding the prompt).
     fn generate(&self, prompt: &str, max_tokens: usize) -> Result<String> {
-        // Tokenize using llama.cpp's tokenizer.
+        // Tokenize using llama.cpp's built-in tokenizer.
         let tokens = self
             .model
             .str_to_token(prompt, AddBos::Always)
@@ -967,7 +968,7 @@ impl LlamaOrchestrator {
         let ctx_params = LlamaContextParams::default().with_n_ctx(std::num::NonZeroU32::new(n_ctx));
         let mut ctx = self
             .model
-            .new_context(&self.backend, ctx_params)
+            .new_context(llama_backend()?, ctx_params)
             .map_err(|e| anyhow::anyhow!("creating orchestrator context: {e}"))?;
 
         // Process prompt tokens in a batch.
@@ -984,7 +985,10 @@ impl LlamaOrchestrator {
 
         // Autoregressive generation loop.
         let mut sampler = LlamaSampler::greedy();
-        let mut generated_tokens: Vec<u32> = Vec::new();
+        let mut output = String::new();
+        // Each token may produce multi-byte UTF-8 sequences; use an encoding_rs decoder
+        // to correctly reassemble them across token boundaries.
+        let mut decoder = encoding_rs::UTF_8.new_decoder();
         let mut n_cur = tokens.len();
 
         for _ in 0..max_tokens {
@@ -996,7 +1000,12 @@ impl LlamaOrchestrator {
                 break;
             }
 
-            generated_tokens.push(new_token.0 as u32);
+            // Decode this token to text using llama.cpp's built-in tokenizer.
+            let piece = self
+                .model
+                .token_to_piece(new_token, &mut decoder, false, None)
+                .map_err(|e| anyhow::anyhow!("token_to_piece failed: {e}"))?;
+            output.push_str(&piece);
 
             // Add token to batch for next iteration.
             batch.clear();
@@ -1009,12 +1018,7 @@ impl LlamaOrchestrator {
                 .map_err(|e| anyhow::anyhow!("generation decode failed: {e}"))?;
         }
 
-        // Decode generated tokens back to text using HF tokenizer.
-        let text = self
-            .tokenizer
-            .decode(&generated_tokens, true)
-            .map_err(|e| anyhow::anyhow!("decoding generated tokens: {e}"))?;
-        Ok(text)
+        Ok(output)
     }
 }
 
@@ -1058,14 +1062,17 @@ pub fn format_reranker_input(query: &str, document: &str) -> String {
 /// running a single forward pass and extracting Yes/No logit probabilities.
 /// Unlike `LlamaOrchestrator`, this does NOT do autoregressive generation —
 /// just one pass through the full input to get logits at the last position.
+///
+/// Uses llama.cpp's built-in tokenizer to look up Yes/No token IDs — no
+/// external tokenizer.json required. The global `LlamaBackend` is used via
+/// `llama_backend()`.
 pub struct LlamaRerank {
     model: LlamaModel,
-    backend: LlamaBackend,
     yes_token_id: i32,
     no_token_id: i32,
 }
 
-// Safety: LlamaModel and LlamaBackend are Send+Sync.
+// Safety: LlamaModel is Send+Sync per llama-cpp-2 docs.
 // LlamaContext is created per-call and never stored.
 unsafe impl Send for LlamaRerank {}
 
@@ -1084,8 +1091,8 @@ impl LlamaRerank {
     /// Steps:
     /// 1. Resolve model URI (from config override or `ModelDefaults::default().rerank_uri`)
     /// 2. `ensure_model()` to download if needed
-    /// 3. Load tokenizer from the model repo to look up Yes/No token IDs
-    /// 4. Load GGUF model via llama.cpp
+    /// 3. Load GGUF model via llama.cpp
+    /// 4. Look up Yes/No token IDs using the model's built-in tokenizer (no tokenizer.json needed)
     pub fn new(models_dir: &Path, config: &crate::config::Config) -> Result<Self> {
         let defaults = ModelDefaults::default();
         let uri_str = config
@@ -1096,24 +1103,30 @@ impl LlamaRerank {
         let uri = HfModelUri::parse(uri_str)?;
         let model_path = ensure_model(&uri, models_dir)?;
 
-        // Reranker needs HF tokenizer to look up Yes/No token IDs.
-        let hf_tokenizer = load_hf_tokenizer(&uri, models_dir)?;
-
-        let yes_token_id = hf_tokenizer
-            .token_to_id("Yes")
-            .ok_or_else(|| anyhow::anyhow!("tokenizer has no 'Yes' token"))?
-            as i32;
-        let no_token_id = hf_tokenizer
-            .token_to_id("No")
-            .ok_or_else(|| anyhow::anyhow!("tokenizer has no 'No' token"))?
-            as i32;
-
-        let backend =
-            LlamaBackend::init().map_err(|e| anyhow::anyhow!("initializing llama backend: {e}"))?;
+        // Use global backend and llama.cpp's built-in tokenizer (no tokenizer.json required).
+        let backend = llama_backend()?;
         let model_params = LlamaModelParams::default();
-        let model = LlamaModel::load_from_file(&backend, &model_path, &model_params)
+        let model = LlamaModel::load_from_file(backend, &model_path, &model_params)
             .map_err(|e| anyhow::anyhow!("loading reranker model {}: {e}", model_path.display()))?;
 
+        // Look up Yes/No token IDs via the model's built-in tokenizer.
+        // AddBos::Never prepends no BOS, so the first returned token belongs to the word itself.
+        let yes_tokens = model
+            .str_to_token("Yes", AddBos::Never)
+            .map_err(|e| anyhow::anyhow!("tokenizing 'Yes': {e}"))?;
+        let yes_token_id = yes_tokens
+            .first()
+            .map(|t| t.0)
+            .ok_or_else(|| anyhow::anyhow!("model tokenizer returned no tokens for 'Yes'"))?;
+
+        let no_tokens = model
+            .str_to_token("No", AddBos::Never)
+            .map_err(|e| anyhow::anyhow!("tokenizing 'No': {e}"))?;
+        let no_token_id = no_tokens
+            .first()
+            .map(|t| t.0)
+            .ok_or_else(|| anyhow::anyhow!("model tokenizer returned no tokens for 'No'"))?;
+
         tracing::info!(
             "loaded LlamaRerank from {}, yes_id={}, no_id={}",
             uri_str,
@@ -1123,7 +1136,6 @@ impl LlamaRerank {
 
         Ok(Self {
             model,
-            backend,
             yes_token_id,
             no_token_id,
         })
@@ -1148,7 +1160,7 @@ impl RerankModel for LlamaRerank {
         let ctx_params = LlamaContextParams::default().with_n_ctx(std::num::NonZeroU32::new(n_ctx));
         let mut ctx = self
             .model
-            .new_context(&self.backend, ctx_params)
+            .new_context(llama_backend()?, ctx_params)
             .map_err(|e| anyhow::anyhow!("creating reranker context: {e}"))?;
 
         // Create batch with all tokens; mark last as logit-producing.

From 8b8597697a9c0f4214bed9b30f72ee6ce8f0d0c2 Mon Sep 17 00:00:00 2001
From: Oleksandr Ostrovskyi <devwhodevs@gmail.com>
Date: Thu, 26 Mar 2026 00:38:20 +0200
Subject: [PATCH 16/17] fix(llm): global backend singleton, built-in
 tokenizers, wire CLI intelligence

- LlamaBackend shared via OnceLock (was re-initialized per model, crashed)
- Orchestrator/reranker use llama.cpp built-in tokenizer (GGUF-embedded)
- CLI search loads intelligence models when enabled
- Debug log for orchestration results
---
 src/search.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/search.rs b/src/search.rs
index 006c68d..eb41be5 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -105,6 +105,11 @@ pub fn search_with_intelligence(
         }
         None => llm::heuristic_orchestrate(query),
     };
+    tracing::debug!(
+        intent = ?orchestration.intent,
+        expansions = orchestration.expansions.len(),
+        "orchestration complete"
+    );
     let weights = llm::LaneWeights::from_intent(&orchestration.intent);
 
     // --- Step 2: Run 3-lane retrieval for EACH expanded query ---

From e8c159f79b4ae11a571316aed232ec34a71855e2 Mon Sep 17 00:00:00 2001
From: Oleksandr Ostrovskyi <devwhodevs@gmail.com>
Date: Thu, 26 Mar 2026 00:42:52 +0200
Subject: [PATCH 17/17] docs: update README, CHANGELOG, CLAUDE.md for llama.cpp
 backend

- README: llama.cpp references, Metal GPU, 270 tests, CMake requirement
- CHANGELOG: v1.0.1 entry with all fixes and backend switch
- CLAUDE.md: llama-cpp-2 deps, LlamaEmbed/LlamaOrchestrator/LlamaRerank
- Release workflow: CMake on Ubuntu, cmake dep in Homebrew formula
- Vault spec: updated with hotfix PR reference
---
 .github/workflows/release.yml |  4 ++++
 CHANGELOG.md                  | 24 ++++++++++++++++++++++++
 CLAUDE.md                     | 14 +++++++-------
 README.md                     | 10 +++++-----
 4 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index d10be69..d63db36 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -19,6 +19,9 @@ jobs:
       contents: write
     steps:
       - uses: actions/checkout@v4
+      - name: Install CMake (Ubuntu)
+        if: runner.os == 'Linux'
+        run: sudo apt-get update && sudo apt-get install -y cmake
       - uses: dtolnay/rust-toolchain@stable
       - run: cargo build --release
       - name: Archive binary
@@ -60,6 +63,7 @@ jobs:
             sha256 "SHA256"
             license "MIT"
 
+            depends_on "cmake" => :build
             depends_on "rust" => :build
 
             def install
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5e71f03..703e2b7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,29 @@
 # Changelog
 
+## [1.0.1] - 2026-03-26
+
+### Changed
+- **Inference backend switched from candle to llama.cpp** — via `llama-cpp-2` Rust bindings. Gets full Metal GPU acceleration on macOS (88 files indexed in 70s vs 37+ minutes on CPU with candle). Same backend as [qmd](https://github.com/tobi/qmd).
+- Default embedding model produces 256-dim vectors via embeddinggemma-300M (Matryoshka truncation)
+- BERT GGUF architecture support added alongside Gemma (future model flexibility)
+- Progress bar during indexing via indicatif (was silent for minutes)
+- CI workflow installs CMake on Ubuntu (required for llama.cpp build)
+
+### Fixed
+- **Prompt format applied during embedding** — `embed_one` uses search_query prefix, `embed_batch` uses search_document prefix. Without this, embeddinggemma operated in wrong symmetric mode.
+- **GGUF tokenizer fallback** — added `shimmytok` crate to extract tokenizer from GGUF metadata when tokenizer.json is unavailable (Google Gemma repos are gated)
+- **LlamaBackend singleton** — global `OnceLock` prevents double-initialization crash when loading multiple models
+- **Orchestrator/reranker use built-in tokenizer** — llama.cpp reads tokenizer from GGUF metadata, no external tokenizer.json needed
+- **Dimension migration clears FTS** — `reset_for_reindex` now also clears `chunks_fts` to prevent duplicate entries
+- **LLM cache wired into search** — `search_with_intelligence` checks/populates `llm_cache` table
+- **MCP server wires intelligence** — search handler passes orchestrator + reranker via `SearchConfig`
+- **CLI search wires intelligence** — `run_search` loads models when intelligence enabled
+- **Qwen3 GGUF filename** — fixed case sensitivity (was 404)
+- **Embedding batch params** — set `n_ubatch >= n_tokens` (required by the encoder), use `encode()` not `decode()`, and `AddBos::Never` (PromptFormat adds `<bos>`)
+
+### Removed
+- `candle-core`, `candle-nn`, `candle-transformers` dependencies (replaced by `llama-cpp-2`)
+
 ## [1.0.0] - 2026-03-25
 
 ### Added
diff --git a/CLAUDE.md b/CLAUDE.md
index 86ae2e3..f5739b8 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -9,7 +9,7 @@ Single binary with 19 modules behind a lib crate:
 - `config.rs` — loads `~/.engraph/config.toml` and `vault.toml`, merges CLI args, provides `data_dir()`. Includes `intelligence: Option<bool>` and `[models]` section for model overrides. `Config::save()` writes back to disk.
 - `chunker.rs` — smart chunking with break-point scoring algorithm. Finds optimal split points considering headings, code fences, blank lines, and thematic breaks. `split_oversized_chunks()` handles token-aware secondary splitting with overlap
 - `docid.rs` — deterministic 6-char hex IDs for files (SHA-256 of path, truncated). Shown in search results for quick reference
-- `llm.rs` — candle model management. Three traits: `EmbedModel` (embeddings), `RerankModel` (cross-encoder scoring), `OrchestratorModel` (query intent + expansion). Three candle implementations: `CandleEmbed` (custom bidirectional transformer from GGUF for embeddinggemma), `CandleOrchestrator` (quantized_qwen3 for query analysis), `CandleRerank` (quantized_qwen3 for relevance scoring). Also: `MockLlm` for testing, `HfModelUri` for model download, `PromptFormat` for model-family prompt templates, `heuristic_orchestrate()` fast path, `LaneWeights` per query intent
+- `llm.rs` — ML inference via llama.cpp (Rust bindings: `llama-cpp-2`). Three traits: `EmbedModel` (embeddings), `RerankModel` (cross-encoder scoring), `OrchestratorModel` (query intent + expansion). Three llama.cpp implementations: `LlamaEmbed` (embeddinggemma-300M GGUF on Metal GPU), `LlamaOrchestrator` (Qwen3-0.6B for query analysis + expansion), `LlamaRerank` (Qwen3-Reranker-0.6B for relevance scoring). Global `LlamaBackend` via `OnceLock`. Also: `MockLlm` for testing, `HfModelUri` for model download, `FlexTokenizer` (HuggingFace tokenizers + shimmytok GGUF fallback), `PromptFormat` for model-family prompt templates, `heuristic_orchestrate()` fast path, `LaneWeights` per query intent
 - `fts.rs` — FTS5 full-text search support. Re-exports `FtsResult` from store. BM25-ranked keyword search
 - `fusion.rs` — Reciprocal Rank Fusion (RRF) engine. Merges semantic + FTS5 + graph + reranker results. Supports per-lane weighting, `--explain` output with intent + per-lane detail
 - `context.rs` — context engine. Six functions: `read` (full note content + metadata), `list` (filtered note listing with `created_by` filter), `vault_map` (structure overview), `who` (person context bundle), `project` (project context bundle), `context_topic` (rich topic context with budget trimming). Pure functions taking `ContextParams` — no model loading except `context_topic` which reuses `search_internal`
@@ -52,14 +52,13 @@ Single vault only. Re-indexing a different vault path triggers a confirmation pr
 
 ## Dependencies to be aware of
 
-- `candle-core` (0.9) — HuggingFace pure Rust ML framework. GGUF model loading, tensor ops. `metal` feature for macOS GPU acceleration
-- `candle-nn` (0.9) — neural network building blocks (RmsNorm, rotary embeddings, etc.)
-- `candle-transformers` (0.9) — pre-built transformer model architectures. Used: `quantized_qwen3` for orchestrator + reranker
+- `llama-cpp-2` (0.1) — Rust bindings to llama.cpp. GGUF model loading + inference. Metal GPU on macOS, CUDA on Linux. Compiles llama.cpp C++ via build script (requires CMake)
+- `shimmytok` (0.7) — pure Rust tokenizer that reads from GGUF metadata. Fallback when tokenizer.json is unavailable (gated HuggingFace repos)
+- `tokenizers` (0.22) — HuggingFace tokenizer. Kept for FlexTokenizer HuggingFace backend
 - `sqlite-vec` (0.1.8-alpha.1) — SQLite extension for vector search. Provides vec0 virtual tables with KNN via `vec_distance_cosine()`
 - `zerocopy` (0.7) — zero-copy serialization for vector data passed to sqlite-vec
 - `strsim` (0.11) — string similarity for fuzzy tag matching and fuzzy link matching
 - `time` (0.3) — date/time handling for frontmatter timestamps
-- `tokenizers` (0.22) — HuggingFace tokenizer. Needs `fancy-regex` feature. Used for all three GGUF models
 - `ignore` (0.4) — vault walking with `.gitignore` support
 - `rusqlite` (0.32) — bundled SQLite with FTS5 support
 - `rmcp` (1.2) — MCP server SDK for stdio transport
@@ -68,12 +67,13 @@ Single vault only. Re-indexing a different vault path triggers a confirmation pr
 
 ## Testing
 
-- Unit tests in each module (`cargo test --lib`) — 271 tests, no network required
+- Unit tests in each module (`cargo test --lib`) — 270 tests, no network required
 - Integration tests (`cargo test --test integration -- --ignored`) — require GGUF model download
+- Build requires CMake (for llama.cpp C++ compilation)
 
 ## CI/CD
 
-- CI: `cargo fmt --check` + `cargo clippy -- -D warnings` + `cargo test --lib` on macOS + Ubuntu
+- CI: `cargo fmt --check` + `cargo clippy -- -D warnings` + `cargo test --lib` on macOS + Ubuntu. Ubuntu step installs CMake.
 - Release: native builds on macOS arm64 (macos-14) + Linux x86_64 (ubuntu-latest). Triggered by `v*` tags
 - Homebrew: `devwhodevs/homebrew-tap` — formula builds from source tarball
 
diff --git a/README.md b/README.md
index ce82aba..69da14d 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Plain vector search treats your notes as isolated documents. But knowledge isn't
 - **MCP server for AI agents** — `engraph serve` exposes 13 tools (search, read, context bundles, note creation) that Claude, Cursor, or any MCP client can call directly.
 - **Real-time sync** — file watcher keeps the index fresh as you edit in Obsidian. No manual re-indexing needed.
 - **Smart write pipeline** — AI agents can create notes with automatic tag resolution, wikilink discovery, and folder placement based on semantic similarity.
-- **Fully local** — pure Rust ML via [candle](https://github.com/huggingface/candle) with GGUF models (~300MB mandatory, ~1.3GB optional for intelligence). Metal-accelerated on macOS. No API keys, no cloud.
+- **Fully local** — [llama.cpp](https://github.com/ggml-org/llama.cpp) inference with GGUF models (~300MB mandatory, ~1.3GB optional for intelligence). Metal GPU-accelerated on macOS (88 files indexed in 70s). No API keys, no cloud.
 
 ## What problem it solves
 
@@ -57,7 +57,7 @@ Your vault (markdown files)
   Claude / Cursor / any MCP client
 ```
 
-1. **Index** — walks your vault, chunks markdown by headings, embeds with a local GGUF model (candle), stores everything in SQLite with FTS5 + sqlite-vec + a wikilink graph
+1. **Index** — walks your vault, chunks markdown by headings, embeds with a local GGUF model via llama.cpp (Metal GPU on macOS), stores everything in SQLite with FTS5 + sqlite-vec + a wikilink graph
 2. **Search** — an orchestrator classifies the query and sets lane weights, then runs up to four lanes (semantic KNN, BM25 keyword, graph expansion, cross-encoder reranking), fused via RRF
 3. **Serve** — starts an MCP server that AI agents connect to, with a file watcher that re-indexes changes in real time
 
@@ -190,7 +190,7 @@ engraph resolves tags against the registry (fuzzy matching), discovers potential
 | AI agent access | MCP server (13 tools) | Custom API needed | No |
 | Write capability | Create/append/move with smart filing | No | Manual |
 | Real-time sync | File watcher, 2s debounce | Manual re-index | N/A |
-| Runs locally | Yes, pure Rust + Metal acceleration | Depends | Yes |
+| Runs locally | Yes, llama.cpp + Metal GPU | Depends | Yes |
 | Setup | One binary, one command | Framework + code | Built-in |
 
 engraph is not a replacement for Obsidian — it's the intelligence layer that sits between your vault and your AI tools.
@@ -199,7 +199,7 @@ engraph is not a replacement for Obsidian — it's the intelligence layer that s
 
 - 4-lane hybrid search (semantic + FTS5 + graph + cross-encoder reranker) with two-pass RRF fusion
 - LLM research orchestrator: query intent classification + query expansion + adaptive lane weights
-- Pure Rust ML via candle (GGUF models, Metal acceleration on macOS)
+- llama.cpp inference via Rust bindings (GGUF models, Metal GPU on macOS, CUDA on Linux)
 - Intelligence opt-in: heuristic fallback when disabled, LLM-powered when enabled
 - MCP server with 13 tools (7 read, 6 write) via stdio
 - Real-time file watching with 2s debounce and startup reconciliation
@@ -242,7 +242,7 @@ All data stored in `~/.engraph/` — single SQLite database (~10MB typical), GGU
 ## Development
 
 ```bash
-cargo test --lib          # 271 unit tests, no network
+cargo test --lib          # 270 unit tests, no network (requires CMake for llama.cpp)
 cargo clippy -- -D warnings
 cargo fmt --check