diff --git a/Cargo.lock b/Cargo.lock index 419059f..fff444f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,6 +146,12 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.11.0" @@ -569,6 +575,8 @@ dependencies = [ "ignore", "indicatif", "ndarray", + "notify", + "notify-debouncer-full", "ort", "rayon", "rmcp", @@ -640,6 +648,26 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "file-id" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1fc6a637b6dc58414714eddd9170ff187ecb0933d4c7024d1abbd23a3cc26e9" +dependencies = [ + "windows-sys 0.60.2", +] + +[[package]] +name = "filetime" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" +dependencies = [ + "cfg-if", + "libc", + "libredox", +] + [[package]] name = "find-msvc-tools" version = "0.1.9" @@ -692,6 +720,15 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fsevent-sys" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76ee7a02da4d231650c7cea31349b889be2f45ddb3ef3032d2ec8185f6313fd2" +dependencies = [ + "libc", +] + [[package]] name = "futures" version = "0.3.32" @@ -1079,6 +1116,35 @@ dependencies = [ "web-time", ] +[[package]] +name = "inotify" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd168d97690d0b8c412d6b6c10360277f4d7ee495c5d0d5d5fe0854923255cc" +dependencies = [ + "bitflags 1.3.2", + "inotify-sys", + "libc", +] + +[[package]] +name = "inotify-sys" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" +dependencies = [ + "libc", +] + +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.2" @@ -1110,6 +1176,26 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "kqueue" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac30106d7dce88daf4a3fcb4879ea939476d5074a9b7ddd0fb97fa4bed5596a" +dependencies = [ + "kqueue-sys", + "libc", +] + +[[package]] +name = "kqueue-sys" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed9625ffda8729b85e45cf04090035ac368927b8cebc34898e7c120f52e4838b" +dependencies = [ + "bitflags 1.3.2", + "libc", +] + [[package]] name = "lazy_static" version = "1.5.0" @@ -1134,7 +1220,10 @@ version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a" dependencies = [ + "bitflags 2.11.0", "libc", + "plain", + "redox_syscall", ] [[package]] @@ -1229,6 +1318,18 @@ dependencies = [ "simd-adler32", ] +[[package]] +name = "mio" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" +dependencies = [ + "libc", + "log", + "wasi", + "windows-sys 0.61.2", +] + [[package]] name = "monostate" version = "0.1.18" @@ -1293,6 +1394,47 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "notify" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c533b4c39709f9ba5005d8002048266593c1cfaf3c5f0739d5b8ab0c6c504009" +dependencies = [ + "bitflags 2.11.0", + "filetime", + "fsevent-sys", + "inotify", + "kqueue", + "libc", + "log", + "mio", + "notify-types", + "walkdir", + "windows-sys 0.52.0", +] + +[[package]] +name = "notify-debouncer-full" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dcf855483228259b2353f89e99df35fc639b2b2510d1166e4858e3f67ec1afb" +dependencies = [ + "file-id", + "log", + "notify", + "notify-types", + "walkdir", +] + +[[package]] +name = "notify-types" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "585d3cb5e12e01aed9e8a1f70d5c6b5e86fe2a6e48fc8cd0b3e0b8df6f6eb174" +dependencies = [ + "instant", +] + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -1359,7 +1501,7 @@ version = "0.10.76" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "951c002c75e16ea2c65b8c7e4d3d51d5530d8dfa7d060b4776828c88cfb18ecf" dependencies = [ - "bitflags", + "bitflags 2.11.0", "cfg-if", "foreign-types", "libc", @@ -1466,6 +1608,12 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "plain" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" + [[package]] name = "portable-atomic" version = "1.13.1" @@ -1611,6 +1759,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "redox_syscall" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16" +dependencies = [ + "bitflags 2.11.0", +] + [[package]] name = "redox_users" version = "0.4.6" @@ -1726,7 +1883,7 @@ version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7753b721174eb8ff87a9a0e799e2d7bc3749323e773db92e0984debb00019d6e" dependencies = [ - "bitflags", + "bitflags 2.11.0", "fallible-iterator", "fallible-streaming-iterator", "hashlink", @@ -1740,7 +1897,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags", + "bitflags 2.11.0", "errno", "libc", "linux-raw-sys", @@ -1844,7 +2001,7 @@ version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ - "bitflags", + "bitflags 2.11.0", "core-foundation", "core-foundation-sys", "libc", @@ -2565,7 +2722,7 @@ version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ - "bitflags", + "bitflags 2.11.0", "hashbrown 0.15.5", "indexmap", "semver", @@ -2725,6 +2882,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", +] + [[package]] name = "windows-sys" version = "0.61.2" @@ -2758,13 +2924,30 @@ dependencies = [ "windows_aarch64_gnullvm 0.52.6", "windows_aarch64_msvc 0.52.6", "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm", + "windows_i686_gnullvm 0.52.6", "windows_i686_msvc 0.52.6", "windows_x86_64_gnu 0.52.6", "windows_x86_64_gnullvm 0.52.6", "windows_x86_64_msvc 0.52.6", ] +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.48.5" @@ -2777,6 +2960,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" @@ -2789,6 +2978,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + [[package]] name = "windows_i686_gnu" version = "0.48.5" @@ -2801,12 +2996,24 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + [[package]] name = "windows_i686_msvc" version = "0.48.5" @@ -2819,6 +3026,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" @@ -2831,6 +3044,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" @@ -2843,6 +3062,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" @@ -2855,6 +3080,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + [[package]] name = "winnow" version = "0.7.15" @@ -2922,7 +3153,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", - "bitflags", + "bitflags 2.11.0", "indexmap", "log", "serde", diff --git a/Cargo.toml b/Cargo.toml index 19b9f97..5f768de 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,8 @@ strsim = "0.11" ignore = "0.4" rmcp = { version = "1.2", features = ["transport-io"] } tokio = { version = "1", features = ["macros", "rt-multi-thread"] } +notify = "7.0" +notify-debouncer-full = "0.4" [dev-dependencies] tempfile = "3" diff --git a/src/context.rs b/src/context.rs index 076caa1..1dfcda8 100644 --- a/src/context.rs +++ b/src/context.rs @@ -201,9 +201,10 @@ pub fn context_list( params: &ContextParams, folder: Option<&str>, tags: &[String], + created_by: Option<&str>, limit: usize, ) -> Result> { - let files = params.store.list_files(folder, tags, limit)?; + let files = params.store.list_files(folder, tags, created_by, limit)?; let file_ids: Vec = files.iter().map(|f| f.id).collect(); let edge_counts = params .store @@ -364,7 +365,7 @@ pub fn context_project(params: &ContextParams, name: &str) -> Result Result { + let max_tokens = 512; + let overlap_tokens = 50; + + // 1. Parse frontmatter for tags and created_by + let parsed = chunk_markdown(content); + let tags = parsed.tags; + let chunks = { + let tc = |s: &str| embedder.token_count(s); + split_oversized_chunks(parsed.chunks, &tc, max_tokens, overlap_tokens) + }; + + // Extract created_by from frontmatter + let (frontmatter, _body) = crate::writer::split_frontmatter(content); + let created_by: Option = frontmatter.lines().find_map(|line| { + let trimmed = line.trim(); + if let Some(val) = trimmed.strip_prefix("created_by:") { + let val = val.trim().trim_matches('"').trim_matches('\''); + if !val.is_empty() { + return Some(val.to_string()); + } + } + None + }); + + // 2. Embed all chunks + let token_counts: Vec = chunks + .iter() + .map(|c| embedder.token_count(&c.text)) + .collect(); + let texts: Vec<&str> = chunks.iter().map(|c| c.text.as_str()).collect(); + let mut all_vectors = Vec::with_capacity(texts.len()); + for batch in texts.chunks(config.batch_size) { + let vectors = embedder.embed_batch(batch)?; + all_vectors.extend(vectors); + } + + // 3. Compute mtime + let mtime = std::fs::metadata(vault_path.join(rel_path)) + .and_then(|m| m.modified()) + .ok() + .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok()) + .map(|d| d.as_secs() as i64) + .unwrap_or(0); + + let docid = generate_docid(rel_path); + + // 4. Begin transaction (skip if caller already opened one) + let owns_transaction = store.conn().is_autocommit(); + if owns_transaction { + store.conn().execute_batch("BEGIN DEFERRED")?; + } + + // 5. If file already exists, clean up old entries + if let Some(record) = store.get_file(rel_path)? { + let vector_ids = store.get_vector_ids_for_file(record.id)?; + for &vid in &vector_ids { + store.delete_vec(vid)?; + } + store.delete_fts_chunks_for_file(record.id)?; + store.delete_file(record.id)?; + } + + // 6. Insert new file and chunks + let file_id = store.insert_file( + rel_path, + content_hash, + mtime, + &tags, + &docid, + created_by.as_deref(), + )?; + + let mut next_vector_id: u64 = store.next_vector_id()?; + let total_chunks = chunks.len(); + + for (chunk_seq, chunk) in chunks.iter().enumerate() { + let heading = chunk.heading.clone().unwrap_or_default(); + let snippet = &chunk.snippet; + let vector = &all_vectors[chunk_seq]; + let vector_id = next_vector_id; + next_vector_id += 1; + + store.insert_chunk_with_vector( + file_id, + &heading, + snippet, + vector_id, + token_counts[chunk_seq] as i64, + vector, + )?; + store.insert_vec(vector_id, vector)?; + store.insert_fts_chunk(file_id, chunk_seq as i64, snippet)?; + } + + // 7. Register tags + for tag in &tags { + store.register_tag(tag, "indexer")?; + } + + // 8. Commit (only if we own the transaction) + if owns_transaction { + store.commit()?; + } + + Ok(IndexFileResult { + file_id, + total_chunks, + docid, + }) +} + +/// Remove a file from the store, cleaning up vec, FTS, and cascading chunks/edges. +/// +/// sqlite-vec virtual tables don't participate in CASCADE deletes, so we must +/// manually delete vector entries before removing the file record. +pub fn remove_file(rel_path: &str, store: &Store) -> Result<()> { + let file = store + .get_file(rel_path)? + .ok_or_else(|| anyhow!("File not found: '{}'", rel_path))?; + + let owns_transaction = store.conn().is_autocommit(); + if owns_transaction { + store.conn().execute_batch("BEGIN DEFERRED")?; + } + + let vector_ids = store.get_vector_ids_for_file(file.id)?; + for &vid in &vector_ids { + store.delete_vec(vid)?; + } + store.delete_fts_chunks_for_file(file.id)?; + store.delete_file(file.id)?; + + if owns_transaction { + store.commit()?; + } + Ok(()) +} + +/// Rename a file in the store, preserving its file_id and all edge integrity. +/// +/// Recomputes the docid from the new path and delegates to `Store::update_file_path` +/// which performs a collision check and updates the path in place. +pub fn rename_file(old_rel: &str, new_rel: &str, store: &Store) -> Result<()> { + let new_docid = generate_docid(new_rel); + store.update_file_path(old_rel, new_rel, &new_docid)?; + Ok(()) +} + /// Main indexing orchestrator. /// /// Walks the vault, diffs against the store, processes new/changed/deleted files, /// embeds chunks in parallel, and writes everything to the store. pub fn run_index(vault_path: &Path, config: &Config, rebuild: bool) -> Result { - let start = Instant::now(); let data_dir = Config::data_dir()?; std::fs::create_dir_all(&data_dir)?; + let db_path = data_dir.join("engraph.db"); + let store = Store::open(&db_path)?; + + let models_dir = data_dir.join("models"); + let mut embedder = Embedder::new(&models_dir)?; + + run_index_inner(vault_path, config, &store, &mut embedder, rebuild) +} + +/// Like [`run_index`], but accepts shared `Store` and `Embedder` references. +/// +/// Useful when the caller already owns these resources (e.g. a file watcher +/// performing a full rescan without re-opening the database or reloading the model). +pub fn run_index_shared( + vault_path: &Path, + config: &Config, + store: &Store, + embedder: &mut Embedder, + rebuild: bool, +) -> Result { + run_index_inner(vault_path, config, store, embedder, rebuild) +} + +/// Shared implementation for [`run_index`] and [`run_index_shared`]. +fn run_index_inner( + vault_path: &Path, + config: &Config, + store: &Store, + embedder: &mut Embedder, + rebuild: bool, +) -> Result { + let start = Instant::now(); + let cleaned = crate::writer::cleanup_temp_files(vault_path)?; if cleaned > 0 { info!(cleaned, "cleaned up incomplete writes from previous run"); } - let db_path = data_dir.join("engraph.db"); - let store = Store::open(&db_path)?; - - let orphans = crate::writer::verify_index_integrity(&store, vault_path)?; + let orphans = crate::writer::verify_index_integrity(store, vault_path)?; if orphans > 0 { info!(orphans, "cleaned up orphan DB entries for missing files"); } @@ -302,7 +498,7 @@ pub fn run_index(vault_path: &Path, config: &Config, rebuild: bool) -> Result Result = new_files.clone(); for file_path in &changed_files { let rel = file_path.strip_prefix(vault_path).unwrap_or(file_path); let rel_str = rel.to_string_lossy().to_string(); - if let Some(record) = store.get_file(&rel_str)? { - let vector_ids = store.get_vector_ids_for_file(record.id)?; - for &vid in &vector_ids { - store.delete_vec(vid)?; - } - store.delete_fts_chunks_for_file(record.id)?; - store.delete_file(record.id)?; + if store.get_file(&rel_str)?.is_some() { + remove_file(&rel_str, store)?; } files_to_index.push(file_path.clone()); } - // Step 6: Read content, chunk, and embed in parallel. - let models_dir = data_dir.join("models"); - let mut embedder = Embedder::new(&models_dir)?; - - // Determine max tokens from embedder (use 512 as default for all-MiniLM-L6-v2). - let max_tokens = 512; - let overlap_tokens = 50; - - // Read all file contents sequentially, then process in parallel. - let file_contents: Vec<(PathBuf, String)> = files_to_index + // Step 6: Read content, index each file via index_file. + // Read all file contents and compute hashes. + let file_contents: Vec<(String, String, String)> = files_to_index .iter() .filter_map(|p| { - std::fs::read_to_string(p) - .ok() - .map(|content| (p.clone(), content)) + let hash = compute_file_hash(p).ok()?; + let content = std::fs::read_to_string(p).ok()?; + let rel = p.strip_prefix(vault_path).unwrap_or(p); + let rel_str = rel.to_string_lossy().to_string(); + Some((rel_str, content, hash)) }) .collect(); // Preserve raw content for edge building (wikilink extraction needs full text). let content_by_path: HashMap = file_contents .iter() - .map(|(path, content)| { - let rel = path.strip_prefix(vault_path).unwrap_or(path); - (rel.to_string_lossy().to_string(), content.clone()) - }) - .collect(); - - // Parallel chunking (embedding is serial since Embedder is not Send+Sync). - let chunked_files: Vec<_> = file_contents - .par_iter() - .map(|(path, content)| { - let parsed = chunk_markdown(content); - // We can't call embedder.token_count across threads, so we defer - // oversized splitting to serial phase. - let rel = path.strip_prefix(vault_path).unwrap_or(path); - let rel_str = rel.to_string_lossy().to_string(); - let hash = { - let mut hasher = Sha256::new(); - hasher.update(content.as_bytes()); - format!("{:x}", hasher.finalize()) - }; - (path.clone(), rel_str, hash, parsed.tags, parsed.chunks) - }) + .map(|(rel_str, content, _hash)| (rel_str.clone(), content.clone())) .collect(); - // Serial: split oversized chunks, embed, and collect results. - struct FileResult { - rel_path: String, - hash: String, - tags: Vec, - mtime: i64, - chunks: Vec<(String, String, Vec, usize)>, // (heading, snippet, vector, token_count) - } - - let mut results: Vec = Vec::new(); + // Serial: chunk, embed, and write each file via index_file. + // Wrap in a single transaction so we get one fsync instead of N. let mut total_chunks = 0usize; + let mut indexed_rel_paths: Vec = Vec::new(); - for (path, rel_str, hash, tags, chunks) in chunked_files { - // Use a closure that borrows embedder for token counting in split phase. - let chunks = { - let tc = |s: &str| embedder.token_count(s); - split_oversized_chunks(chunks, &tc, max_tokens, overlap_tokens) - }; - - let mtime = std::fs::metadata(&path) - .and_then(|m| m.modified()) - .ok() - .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok()) - .map(|d| d.as_secs() as i64) - .unwrap_or(0); - - // Count tokens before embedding (while embedder is borrowed immutably). - let token_counts: Vec = chunks - .iter() - .map(|c| embedder.token_count(&c.text)) - .collect(); - - // Embed in batches (borrows embedder mutably). - let texts: Vec<&str> = chunks.iter().map(|c| c.text.as_str()).collect(); - let mut all_vectors = Vec::with_capacity(texts.len()); - - for batch in texts.chunks(config.batch_size) { - let vectors = embedder.embed_batch(batch)?; - all_vectors.extend(vectors); - } - - let mut chunk_results = Vec::new(); - for (i, chunk) in chunks.iter().enumerate() { - let heading = chunk.heading.clone().unwrap_or_default(); - let snippet = chunk.snippet.clone(); - chunk_results.push((heading, snippet, all_vectors[i].clone(), token_counts[i])); - } - - total_chunks += chunk_results.len(); - results.push(FileResult { - rel_path: rel_str, - hash, - tags, - mtime, - chunks: chunk_results, - }); - } - - // Step 8: Serial write — insert files + chunks into store with vectors. - let mut next_vector_id: u64 = store.next_vector_id()?; - - for result in &results { - let docid = generate_docid(&result.rel_path); - let file_id = store.insert_file( - &result.rel_path, - &result.hash, - result.mtime, - &result.tags, - &docid, - )?; - - for tag in &result.tags { - store.register_tag(tag, "indexer")?; - } - - for (chunk_seq, (heading, snippet, vector, token_count)) in result.chunks.iter().enumerate() - { - let vector_id = next_vector_id; - next_vector_id += 1; - store.insert_chunk_with_vector( - file_id, - heading, - snippet, - vector_id, - *token_count as i64, - vector, - )?; - store.insert_vec(vector_id, vector)?; - store.insert_fts_chunk(file_id, chunk_seq as i64, snippet)?; - } + store.conn().execute_batch("BEGIN DEFERRED")?; + for (rel_str, content, hash) in &file_contents { + let result = index_file(rel_str, content, hash, store, embedder, vault_path, config)?; + total_chunks += result.total_chunks; + indexed_rel_paths.push(rel_str.clone()); } + store.commit()?; // Step 9: Build vault graph edges. info!("building vault graph edges"); @@ -482,11 +563,11 @@ pub fn run_index(vault_path: &Path, config: &Config, rebuild: bool) -> Result Result Result>> = HashMap::new(); - for result in &results { - let folder = result - .rel_path - .split('/') - .next() - .unwrap_or("(root)") - .to_string(); - for (_heading, _snippet, vector, _token_count) in &result.chunks { - folder_vecs.entry(folder.clone()).or_default().push(vector); + let mut folder_vecs: HashMap>> = HashMap::new(); + for rel_path in &indexed_rel_paths { + let folder = rel_path.split('/').next().unwrap_or("(root)").to_string(); + if let Some(file_record) = store.get_file(rel_path)? { + let chunk_vectors = store.get_chunk_vectors_for_file(file_record.id)?; + for vector in chunk_vectors { + folder_vecs.entry(folder.clone()).or_default().push(vector); + } } } @@ -676,6 +756,7 @@ mod tests { 100, &[], &generate_docid("note.md"), + None, ) .unwrap(); @@ -706,6 +787,7 @@ mod tests { 100, &[], &generate_docid("surviving.md"), + None, ) .unwrap(); store @@ -715,6 +797,7 @@ mod tests { 100, &[], &generate_docid("deleted.md"), + None, ) .unwrap(); @@ -757,9 +840,15 @@ mod tests { write_file(root, "c.md", "# C\nNo links here."); let store = Store::open_memory().unwrap(); - let f_a = store.insert_file("a.md", "h1", 100, &[], "aaa111").unwrap(); - let f_b = store.insert_file("b.md", "h2", 100, &[], "bbb222").unwrap(); - let _f_c = store.insert_file("c.md", "h3", 100, &[], "ccc333").unwrap(); + let f_a = store + .insert_file("a.md", "h1", 100, &[], "aaa111", None) + .unwrap(); + let f_b = store + .insert_file("b.md", "h2", 100, &[], "bbb222", None) + .unwrap(); + let _f_c = store + .insert_file("c.md", "h3", 100, &[], "ccc333", None) + .unwrap(); let content_a = std::fs::read_to_string(root.join("a.md")).unwrap(); let content_b = std::fs::read_to_string(root.join("b.md")).unwrap(); @@ -799,10 +888,10 @@ mod tests { fn test_people_mention_detection() { let store = Store::open_memory().unwrap(); let person = store - .insert_file("People/John Nelson.md", "h1", 100, &[], "aaa111") + .insert_file("People/John Nelson.md", "h1", 100, &[], "aaa111", None) .unwrap(); let note = store - .insert_file("daily.md", "h2", 100, &[], "bbb222") + .insert_file("daily.md", "h2", 100, &[], "bbb222", None) .unwrap(); let people = vec![(person, vec!["John Nelson".to_string()])]; diff --git a/src/lib.rs b/src/lib.rs index 611dd05..bb22f42 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,4 +16,5 @@ pub mod serve; pub mod store; pub mod tags; pub mod vecstore; +pub mod watcher; pub mod writer; diff --git a/src/links.rs b/src/links.rs index fcc0683..7ed4643 100644 --- a/src/links.rs +++ b/src/links.rs @@ -3,6 +3,7 @@ use std::path::Path; use crate::indexer::extract_aliases_from_frontmatter; use crate::store::Store; +use strsim::normalized_levenshtein; /// A potential wikilink discovered in note content. #[derive(Debug, Clone, PartialEq)] @@ -20,15 +21,31 @@ pub enum LinkMatchType { ExactName, /// Matched an alias from frontmatter Alias, + /// Fuzzy match on note name (0-1000 basis points confidence) + FuzzyName { confidence_bp: u16 }, + /// First-name match for people notes (suggestion-only, 0-1000 basis points) + FirstName { confidence_bp: u16 }, +} + +impl LinkMatchType { + /// Priority for overlap resolution (lower = higher priority). + pub fn priority(&self) -> u8 { + match self { + Self::ExactName => 0, + Self::Alias => 1, + Self::FuzzyName { .. } => 2, + Self::FirstName { .. } => 3, + } + } } /// An entry in the name-to-path lookup table. #[derive(Debug, Clone)] pub(crate) struct NameEntry { - name: String, - name_lower: String, - path: String, - match_type: LinkMatchType, + pub(crate) name: String, + pub(crate) name_lower: String, + pub(crate) path: String, + pub(crate) match_type: LinkMatchType, } /// Build a lookup table of (name, path, match_type) from all indexed files. @@ -145,15 +162,185 @@ fn is_word_boundary_after(content: &[u8], end: usize) -> bool { !ch.is_ascii_alphanumeric() && ch != b'_' } +/// Returns (start_byte, end_byte, cleaned_word) for each word in the text. +/// +/// Splits on non-alphanumeric boundaries (except apostrophes within words). +/// Strips trailing possessives (`'s`) from the cleaned word for matching. +pub(crate) fn word_spans(text: &str) -> Vec<(usize, usize, String)> { + let bytes = text.as_bytes(); + let mut spans = Vec::new(); + let mut i = 0; + + while i < bytes.len() { + // Skip non-word characters + if !bytes[i].is_ascii_alphanumeric() { + i += 1; + continue; + } + + let start = i; + // Consume word characters (alphanumeric + apostrophes within words) + while i < bytes.len() + && (bytes[i].is_ascii_alphanumeric() + || (bytes[i] == b'\'' + && i + 1 < bytes.len() + && bytes[i + 1].is_ascii_alphanumeric())) + { + i += 1; + } + let end = i; + + let mut word = text[start..end].to_string(); + // Strip trailing possessive 's + if word.ends_with("'s") || word.ends_with("'S") { + word.truncate(word.len() - 2); + } + + if !word.is_empty() { + spans.push((start, end, word)); + } + } + + spans +} + +/// Find fuzzy matches for eligible names in content using a sliding window. +/// +/// Uses normalized Levenshtein distance with a 0.92 (920bp) threshold. +/// Skips single-word names unless they come from the People folder. +/// Skips windows overlapping with `existing_regions` (from exact matches). +/// Only matches once per name per content. +pub(crate) fn find_fuzzy_matches( + content: &str, + eligible_names: &[NameEntry], + existing_regions: &[(usize, usize)], + people_folder: Option<&str>, +) -> Vec { + let spans = word_spans(content); + let mut results = Vec::new(); + + for entry in eligible_names { + let word_count = entry.name.split_whitespace().count(); + + // Skip single-word names not from People folder + if word_count <= 1 { + let is_people = people_folder + .map(|pf| entry.path.starts_with(pf)) + .unwrap_or(false); + if !is_people { + continue; + } + // People single-word names must be >= 3 chars (already enforced by build_name_index) + } + + if spans.len() < word_count { + continue; + } + + let mut matched = false; + for win_start in 0..=(spans.len() - word_count) { + if matched { + break; + } + + let win_end_idx = win_start + word_count - 1; + let byte_start = spans[win_start].0; + let byte_end = spans[win_end_idx].1; + + // Skip if overlapping with existing regions + if overlaps_claimed(byte_start, byte_end, existing_regions) { + continue; + } + + // Join cleaned words for comparison + let window_text: String = spans[win_start..=win_end_idx] + .iter() + .map(|(_, _, w)| w.to_lowercase()) + .collect::>() + .join(" "); + + let sim = normalized_levenshtein(&window_text, &entry.name_lower); + let confidence_bp = (sim * 1000.0) as u16; + + if confidence_bp >= 920 { + // Use actual content bytes for matched_text + let matched_text = content[byte_start..byte_end].to_string(); + + results.push(DiscoveredLink { + matched_text, + target_path: entry.path.clone(), + display: Some(content[byte_start..byte_end].to_string()), + match_type: LinkMatchType::FuzzyName { confidence_bp }, + }); + matched = true; + } + } + } + + results +} + +const FIRST_NAME_CONFIDENCE_BP: u16 = 650; + +/// Find first-name matches for People notes in content. +/// +/// For each word in content, checks if it uniquely matches the first name of exactly +/// one People note. Ambiguous matches (0 or 2+ people sharing the same first name) +/// are skipped. Matches overlapping `existing_regions` are also skipped. +pub(crate) fn find_first_name_matches( + content: &str, + people_names: &[NameEntry], + existing_regions: &[(usize, usize)], +) -> Vec { + let spans = word_spans(content); + let mut results = Vec::new(); + + for &(start, end, ref word) in &spans { + // Skip if overlapping with existing regions + if overlaps_claimed(start, end, existing_regions) { + continue; + } + + let word_lower = word.to_lowercase(); + + // Find all people whose name_lower starts with this word followed by a space + let matching: Vec<&NameEntry> = people_names + .iter() + .filter(|e| { + e.name_lower.starts_with(&word_lower) + && e.name_lower.len() > word_lower.len() + && e.name_lower.as_bytes()[word_lower.len()] == b' ' + }) + .collect(); + + // Exactly one match → emit + if matching.len() == 1 { + let entry = matching[0]; + results.push(DiscoveredLink { + matched_text: content[start..end].to_string(), + target_path: entry.path.clone(), + display: Some(content[start..end].to_string()), + match_type: LinkMatchType::FirstName { + confidence_bp: FIRST_NAME_CONFIDENCE_BP, + }, + }); + } + } + + results +} + /// Discover potential wikilink targets in content by matching note names and aliases. /// /// Builds a name index from the store, then scans content for case-insensitive /// matches that aren't inside existing wikilinks and don't overlap with longer -/// already-matched names. +/// already-matched names. After exact matching, runs fuzzy matching for eligible +/// names that weren't already matched. pub fn discover_links( store: &Store, content: &str, vault_path: &Path, + people_folder: Option<&str>, ) -> Result> { let name_index = build_name_index(store, vault_path)?; let wikilink_regions = find_wikilink_regions(content); @@ -162,6 +349,8 @@ pub fn discover_links( let mut links = Vec::new(); let mut claimed: Vec<(usize, usize)> = Vec::new(); + let mut exact_matched_paths: std::collections::HashSet = + std::collections::HashSet::new(); for entry in &name_index { let needle = &entry.name_lower; @@ -191,7 +380,9 @@ pub fn discover_links( let matched_text = content[pos..end].to_string(); let display = match entry.match_type { - LinkMatchType::Alias => Some(matched_text.clone()), + LinkMatchType::Alias + | LinkMatchType::FuzzyName { .. } + | LinkMatchType::FirstName { .. } => Some(matched_text.clone()), LinkMatchType::ExactName => None, }; @@ -203,9 +394,83 @@ pub fn discover_links( }); claimed.push((pos, end)); + exact_matched_paths.insert(entry.path.clone()); } } + // --- Fuzzy matching phase --- + // Build eligible names: multi-word names, or People folder notes >= 3 chars + let eligible: Vec = name_index + .iter() + .filter(|e| !exact_matched_paths.contains(&e.path)) + .filter(|e| { + let word_count = e.name.split_whitespace().count(); + if word_count >= 2 { + return true; + } + // Single-word: only if from People folder and >= 3 chars + if let Some(pf) = people_folder { + e.path.starts_with(pf) && e.name.len() >= 3 + } else { + false + } + }) + .cloned() + .collect(); + + // Combine exact match regions and wikilink regions for fuzzy exclusion + let mut fuzzy_excluded = claimed.clone(); + fuzzy_excluded.extend_from_slice(&wikilink_regions); + let fuzzy_matches = find_fuzzy_matches(content, &eligible, &fuzzy_excluded, people_folder); + + // Track fuzzy match regions for first-name exclusion + let mut first_name_excluded = fuzzy_excluded.clone(); + for fm in &fuzzy_matches { + // Find position of the fuzzy match in content to exclude from first-name matching + let needle = fm.matched_text.to_lowercase(); + let content_lower_bytes = content.to_lowercase(); + if let Some(pos) = content_lower_bytes.find(&needle) { + first_name_excluded.push((pos, pos + needle.len())); + } + } + links.extend(fuzzy_matches); + + // --- First-name matching phase --- + // Filter people names from the name index + if let Some(pf) = people_folder { + let people_names: Vec = name_index + .iter() + .filter(|e| e.path.starts_with(pf) && matches!(e.match_type, LinkMatchType::ExactName)) + .filter(|e| !exact_matched_paths.contains(&e.path)) + .cloned() + .collect(); + + let first_name_matches = + find_first_name_matches(content, &people_names, &first_name_excluded); + links.extend(first_name_matches); + } + + // Sort: exact matches first (by priority), then by confidence descending + links.sort_by(|a, b| { + let pa = a.match_type.priority(); + let pb = b.match_type.priority(); + if pa != pb { + return pa.cmp(&pb); + } + // For same priority, sort by confidence descending + let ca = match &a.match_type { + LinkMatchType::FuzzyName { confidence_bp } => *confidence_bp, + LinkMatchType::FirstName { confidence_bp } => *confidence_bp, + _ => 1000, + }; + let cb = match &b.match_type { + LinkMatchType::FuzzyName { confidence_bp } => *confidence_bp, + LinkMatchType::FirstName { confidence_bp } => *confidence_bp, + _ => 1000, + }; + cb.cmp(&ca) + }); + Ok(links) } @@ -294,6 +559,7 @@ mod tests { 0, &[], "aaa111", + None, ) .unwrap(); store @@ -303,6 +569,7 @@ mod tests { 0, &[], "bbb222", + None, ) .unwrap(); @@ -326,7 +593,7 @@ mod tests { fn test_exact_name_match() { let (store, vault_dir) = setup_store_and_vault(); let content = "Talked to Steve Barbera"; - let links = discover_links(&store, content, vault_dir.path()).unwrap(); + let links = discover_links(&store, content, vault_dir.path(), None).unwrap(); assert_eq!(links.len(), 1); assert_eq!(links[0].matched_text, "Steve Barbera"); assert_eq!(links[0].match_type, LinkMatchType::ExactName); @@ -336,7 +603,7 @@ mod tests { fn test_skip_existing_wikilinks() { let (store, vault_dir) = setup_store_and_vault(); let content = "Talked to [[Steve Barbera]]"; - let links = discover_links(&store, content, vault_dir.path()).unwrap(); + let links = discover_links(&store, content, vault_dir.path(), None).unwrap(); assert_eq!(links.len(), 0); } @@ -344,7 +611,7 @@ mod tests { fn test_multiple_matches() { let (store, vault_dir) = setup_store_and_vault(); let content = "Steve Barbera explained Reciprocal Rank Fusion"; - let links = discover_links(&store, content, vault_dir.path()).unwrap(); + let links = discover_links(&store, content, vault_dir.path(), None).unwrap(); assert_eq!(links.len(), 2); let names: Vec<&str> = links.iter().map(|l| l.matched_text.as_str()).collect(); @@ -356,7 +623,7 @@ mod tests { fn test_alias_match() { let (store, vault_dir) = setup_store_and_vault(); let content = "We use RRF for search"; - let links = discover_links(&store, content, vault_dir.path()).unwrap(); + let links = discover_links(&store, content, vault_dir.path(), None).unwrap(); assert_eq!(links.len(), 1); assert_eq!(links[0].match_type, LinkMatchType::Alias); assert_eq!( @@ -370,7 +637,7 @@ mod tests { fn test_apply_links() { let (store, vault_dir) = setup_store_and_vault(); let content = "Steve Barbera explained RRF to me"; - let links = discover_links(&store, content, vault_dir.path()).unwrap(); + let links = discover_links(&store, content, vault_dir.path(), None).unwrap(); let result = apply_links(content, &links); assert!(result.contains("[[Steve Barbera]]")); @@ -390,7 +657,7 @@ mod tests { fn test_case_insensitive_match() { let (store, vault_dir) = setup_store_and_vault(); let content = "talked to steve barbera today"; - let links = discover_links(&store, content, vault_dir.path()).unwrap(); + let links = discover_links(&store, content, vault_dir.path(), None).unwrap(); assert_eq!(links.len(), 1); assert_eq!(links[0].matched_text, "steve barbera"); } @@ -400,7 +667,158 @@ mod tests { let (store, vault_dir) = setup_store_and_vault(); // "RRF" embedded inside a word should not match let content = "The xRRFy algorithm"; - let links = discover_links(&store, content, vault_dir.path()).unwrap(); + let links = discover_links(&store, content, vault_dir.path(), None).unwrap(); assert_eq!(links.len(), 0); } + + // --- word_spans tests --- + + #[test] + fn test_word_spans_basic() { + let spans = word_spans("hello world"); + assert_eq!(spans.len(), 2); + assert_eq!(spans[0], (0, 5, "hello".to_string())); + assert_eq!(spans[1], (6, 11, "world".to_string())); + } + + #[test] + fn test_word_spans_possessive() { + let spans = word_spans("Steve's book"); + assert_eq!(spans.len(), 2); + assert_eq!(spans[0].2, "Steve"); + assert_eq!(spans[1].2, "book"); + } + + #[test] + fn test_word_spans_preserves_byte_positions() { + let text = "I met Steeve Barbera yesterday"; + let spans = word_spans(text); + assert_eq!(spans.len(), 5); + // spans: I(0,1), met(2,5), Steeve(6,12), Barbera(13,20), yesterday(21,30) + assert_eq!(spans[2].0, 6); + assert_eq!(spans[2].1, 12); + assert_eq!(&text[spans[2].0..spans[2].1], "Steeve"); + } + + // --- find_fuzzy_matches tests --- + + #[test] + fn test_fuzzy_match_typo() { + let _spans = word_spans("I met Steeve Barbera yesterday"); + let entries = vec![NameEntry { + name: "Steve Barbera".into(), + name_lower: "steve barbera".into(), + path: "People/Steve Barbera.md".into(), + match_type: LinkMatchType::ExactName, + }]; + let matches = find_fuzzy_matches("I met Steeve Barbera yesterday", &entries, &[], None); + assert_eq!(matches.len(), 1); + assert_eq!(matches[0].target_path, "People/Steve Barbera.md"); + assert!( + matches!(matches[0].match_type, LinkMatchType::FuzzyName { confidence_bp } if confidence_bp >= 920) + ); + } + + #[test] + fn test_fuzzy_no_match_below_threshold() { + let entries = vec![NameEntry { + name: "Steve Barbera".into(), + name_lower: "steve barbera".into(), + path: "People/Steve Barbera.md".into(), + match_type: LinkMatchType::ExactName, + }]; + let matches = find_fuzzy_matches("Steven Rogers was there", &entries, &[], None); + assert_eq!(matches.len(), 0); + } + + #[test] + fn test_fuzzy_skips_single_word_non_people() { + let entries = vec![NameEntry { + name: "Rust".into(), + name_lower: "rust".into(), + path: "Resources/Rust.md".into(), + match_type: LinkMatchType::ExactName, + }]; + let matches = find_fuzzy_matches("I love Rust programming", &entries, &[], None); + assert_eq!(matches.len(), 0); + } + + #[test] + fn test_fuzzy_skips_claimed_regions() { + let entries = vec![NameEntry { + name: "Steve Barbera".into(), + name_lower: "steve barbera".into(), + path: "People/Steve Barbera.md".into(), + match_type: LinkMatchType::ExactName, + }]; + // Claim the region where "Steve Barbera" would match + let matches = + find_fuzzy_matches("I met Steve Barbera yesterday", &entries, &[(6, 20)], None); + assert_eq!(matches.len(), 0); + } + + // --- find_first_name_matches tests --- + + #[test] + fn test_first_name_unique_match() { + let people = vec![NameEntry { + name: "Steve Barbera".into(), + name_lower: "steve barbera".into(), + path: "People/Steve Barbera.md".into(), + match_type: LinkMatchType::ExactName, + }]; + let matches = find_first_name_matches("I talked to Steve about it.", &people, &[]); + assert_eq!(matches.len(), 1); + assert!(matches!( + matches[0].match_type, + LinkMatchType::FirstName { confidence_bp: 650 } + )); + assert_eq!(matches[0].display, Some("Steve".to_string())); + } + + #[test] + fn test_first_name_ambiguous() { + let people = vec![ + NameEntry { + name: "Steve Barbera".into(), + name_lower: "steve barbera".into(), + path: "People/Steve Barbera.md".into(), + match_type: LinkMatchType::ExactName, + }, + NameEntry { + name: "Steve Rogers".into(), + name_lower: "steve rogers".into(), + path: "People/Steve Rogers.md".into(), + match_type: LinkMatchType::ExactName, + }, + ]; + let matches = find_first_name_matches("I talked to Steve about it.", &people, &[]); + assert_eq!(matches.len(), 0); // ambiguous — no match + } + + #[test] + fn test_first_name_skips_existing_regions() { + let people = vec![NameEntry { + name: "Steve Barbera".into(), + name_lower: "steve barbera".into(), + path: "People/Steve Barbera.md".into(), + match_type: LinkMatchType::ExactName, + }]; + // "Steve" is at position 12..17 + let matches = find_first_name_matches("I talked to Steve about it.", &people, &[(12, 17)]); + assert_eq!(matches.len(), 0); + } + + #[test] + fn test_first_name_no_match_exact_name_only() { + // A person with only a first name (no space) should not match + let people = vec![NameEntry { + name: "Steve".into(), + name_lower: "steve".into(), + path: "People/Steve.md".into(), + match_type: LinkMatchType::ExactName, + }]; + let matches = find_first_name_matches("I talked to Steve about it.", &people, &[]); + assert_eq!(matches.len(), 0); // "steve" doesn't start with "steve " (no space after) + } } diff --git a/src/main.rs b/src/main.rs index 1faffe1..2df5985 100644 --- a/src/main.rs +++ b/src/main.rs @@ -130,6 +130,9 @@ enum ContextAction { /// Filter to notes with all listed tags (comma-separated). #[arg(long, value_delimiter = ',')] tags: Vec, + /// Filter to notes created by a specific agent. + #[arg(long)] + created_by: Option, /// Maximum results. #[arg(long, default_value = "20")] limit: usize, @@ -577,10 +580,16 @@ async fn main() -> Result<()> { ContextAction::List { folder, tags, + created_by, limit, } => { - let items = - engraph::context::context_list(¶ms, folder.as_deref(), &tags, limit)?; + let items = engraph::context::context_list( + ¶ms, + folder.as_deref(), + &tags, + created_by.as_deref(), + limit, + )?; if cli.json { println!("{}", serde_json::to_string_pretty(&items)?); } else { @@ -804,6 +813,9 @@ async fn main() -> Result<()> { if !result.links_added.is_empty() { println!("Links: {}", result.links_added.join(", ")); } + if !result.links_suggested.is_empty() { + println!("Suggested: {}", result.links_suggested.join(", ")); + } } } WriteAction::Append { file, content } => { diff --git a/src/placement.rs b/src/placement.rs index 97b18b4..6bc6e89 100644 --- a/src/placement.rs +++ b/src/placement.rs @@ -3,6 +3,15 @@ use anyhow::Result; use crate::embedder::Embedder; use crate::profile::VaultProfile; use crate::store::Store; +use crate::writer::split_frontmatter; + +#[derive(Debug, Clone, PartialEq)] +pub struct CorrectionInfo { + pub suggested_folder: String, + pub actual_folder: String, +} + +const ENGRAPH_AGENTS: &[&str] = &["claude-code", "cli", "mcp-server"]; #[derive(Debug, Clone)] pub struct PlacementResult { @@ -284,6 +293,93 @@ fn cosine_similarity(a: &[f32], b: &[f32]) -> f64 { } } +/// Detect whether a note was placed by engraph and then moved by the user to a different folder. +/// +/// Returns `Some(CorrectionInfo)` if the frontmatter contains `suggested_folder` and `created_by` +/// matching a known engraph agent, and the actual folder differs from the suggested one. +/// Returns `None` if there's no correction (confirmed placement, missing fields, or external tool). +pub fn detect_correction_from_frontmatter( + content: &str, + actual_folder: &str, +) -> Option { + let (fm, _body) = split_frontmatter(content); + if fm.is_empty() { + return None; + } + + let mut suggested_folder: Option = None; + let mut created_by: Option = None; + + for line in fm.lines() { + if let Some(val) = line.strip_prefix("suggested_folder:") { + suggested_folder = Some(val.trim().trim_matches('"').trim_matches('\'').to_string()); + } else if let Some(val) = line.strip_prefix("created_by:") { + created_by = Some(val.trim().trim_matches('"').trim_matches('\'').to_string()); + } + } + + // Guard: created_by must exist and match a known engraph agent + let agent = created_by?; + if !ENGRAPH_AGENTS.contains(&agent.as_str()) { + return None; + } + + let suggested = suggested_folder?; + + // Compare trimming trailing slashes + let suggested_trimmed = suggested.trim_end_matches('/'); + let actual_trimmed = actual_folder.trim_end_matches('/'); + + if suggested_trimmed == actual_trimmed { + return None; // Confirmation, not correction + } + + Some(CorrectionInfo { + suggested_folder: suggested, + actual_folder: actual_folder.to_string(), + }) +} + +/// Strip `suggested_folder:` and `confidence:` lines from frontmatter. +/// +/// Guards against block scalars (lines ending with `|` or `>`), returning content unchanged +/// with a warning if detected. +pub fn strip_placement_frontmatter(content: &str) -> String { + let (fm, body) = split_frontmatter(content); + if fm.is_empty() { + return content.to_string(); + } + + // The frontmatter includes --- delimiters. We need to filter lines between them. + let mut filtered_lines: Vec<&str> = Vec::new(); + let mut any_stripped = false; + + for line in fm.lines() { + if line.starts_with("suggested_folder:") || line.starts_with("confidence:") { + // Guard: block scalar indicator + let trimmed = line.trim_end(); + if trimmed.ends_with('|') || trimmed.ends_with('>') { + tracing::warn!( + "Frontmatter line appears to use block scalar, skipping strip: {}", + line + ); + return content.to_string(); + } + any_stripped = true; + continue; + } + filtered_lines.push(line); + } + + if !any_stripped { + return content.to_string(); + } + + // Reconstruct: filtered frontmatter + body + let new_fm = filtered_lines.join("\n"); + format!("{}\n{}", new_fm, body) +} + #[cfg(test)] mod tests { use super::*; @@ -509,4 +605,46 @@ mod tests { let b = vec![1.0, 2.0, 3.0]; assert_eq!(cosine_similarity(&a, &b), 0.0); } + + // ── Correction detection tests ────────────────────────────── + + #[test] + fn test_detect_correction() { + let content = "---\ntags:\n - test\ncreated_by: claude-code\nsuggested_folder: 00-Inbox/\nconfidence: 0.45\n---\n\n# Note\n"; + let result = detect_correction_from_frontmatter(content, "01-Projects/"); + assert!(result.is_some()); + let info = result.unwrap(); + assert_eq!(info.suggested_folder, "00-Inbox/"); + assert_eq!(info.actual_folder, "01-Projects/"); + } + + #[test] + fn test_no_correction_when_confirmed() { + let content = "---\nsuggested_folder: 01-Projects/\ncreated_by: cli\n---\n\n# Note\n"; + assert!(detect_correction_from_frontmatter(content, "01-Projects/").is_none()); + } + + #[test] + fn test_no_correction_without_created_by() { + let content = "---\nsuggested_folder: 00-Inbox/\n---\n\n# Note\n"; + assert!(detect_correction_from_frontmatter(content, "01-Projects/").is_none()); + } + + #[test] + fn test_no_correction_external_tool() { + let content = + "---\nsuggested_folder: 00-Inbox/\ncreated_by: some-other-tool\n---\n\n# Note\n"; + assert!(detect_correction_from_frontmatter(content, "01-Projects/").is_none()); + } + + #[test] + fn test_strip_placement_frontmatter() { + let content = "---\ntags:\n - test\nsuggested_folder: 00-Inbox/\nconfidence: 0.45\ncreated_by: cli\n---\n\n# Note\nContent."; + let result = strip_placement_frontmatter(content); + assert!(!result.contains("suggested_folder")); + assert!(!result.contains("confidence")); + assert!(result.contains("tags:")); + assert!(result.contains("created_by: cli")); + assert!(result.contains("# Note")); + } } diff --git a/src/serve.rs b/src/serve.rs index 65ee0ab..81c9cd0 100644 --- a/src/serve.rs +++ b/src/serve.rs @@ -42,6 +42,8 @@ pub struct ListParams { pub folder: Option, /// Filter to notes with all listed tags. pub tags: Option>, + /// Filter to notes created by a specific agent. + pub created_by: Option, /// Maximum results (default 20). pub limit: Option, } @@ -193,8 +195,14 @@ impl EngraphServer { }; let tags = params.0.tags.unwrap_or_default(); let limit = params.0.limit.unwrap_or(20); - let items = context::context_list(&ctx, params.0.folder.as_deref(), &tags, limit) - .map_err(|e| mcp_err(&e))?; + let items = context::context_list( + &ctx, + params.0.folder.as_deref(), + &tags, + params.0.created_by.as_deref(), + limit, + ) + .map_err(|e| mcp_err(&e))?; to_json_result(&items) } @@ -423,11 +431,36 @@ pub async fn run_serve(data_dir: &Path) -> Result<()> { let profile = Config::load_vault_profile().ok().flatten(); + let store_arc = Arc::new(Mutex::new(store)); + let embedder_arc = Arc::new(Mutex::new(embedder)); + let vault_path_arc = Arc::new(vault_path); + let profile_arc = Arc::new(profile); + + // Start file watcher for real-time index updates + let config = Config::load()?; + let mut exclude = config.exclude.clone(); + if let Some(ref prof) = *profile_arc + && let Some(ref archive) = prof.structure.folders.archive + { + let pattern = format!("{}/", archive); + if !exclude.contains(&pattern) { + exclude.push(pattern); + } + } + let (watcher_handle, watcher_shutdown) = crate::watcher::start_watcher( + store_arc.clone(), + embedder_arc.clone(), + vault_path_arc.clone(), + profile_arc.clone(), + config, + exclude, + )?; + let server = EngraphServer { - store: Arc::new(Mutex::new(store)), - embedder: Arc::new(Mutex::new(embedder)), - vault_path: Arc::new(vault_path), - profile: Arc::new(profile), + store: store_arc, + embedder: embedder_arc, + vault_path: vault_path_arc, + profile: profile_arc, tool_router: EngraphServer::tool_router(), }; @@ -436,5 +469,12 @@ pub async fn run_serve(data_dir: &Path) -> Result<()> { let transport = rmcp::transport::io::stdio(); let server_handle = server.serve(transport).await?; server_handle.waiting().await?; + + // Shut down watcher cleanly after MCP transport exits + let _ = watcher_shutdown.send(()); + if let Err(e) = watcher_handle.join() { + tracing::warn!("Watcher thread panicked: {:?}", e); + } + Ok(()) } diff --git a/src/store.rs b/src/store.rs index 7fa405c..9425f66 100644 --- a/src/store.rs +++ b/src/store.rs @@ -13,6 +13,7 @@ pub struct FileRecord { pub tags: Vec, pub indexed_at: String, pub docid: Option, + pub created_by: Option, } /// A record representing a chunk of a file. @@ -45,6 +46,16 @@ pub struct EdgeStats { pub isolated_file_count: usize, } +/// A record of a placement correction (user moved a note from suggested folder). +#[derive(Debug, Clone)] +pub struct PlacementCorrection { + pub id: i64, + pub file_path: String, + pub suggested_folder: String, + pub actual_folder: String, + pub corrected_at: String, +} + /// Summary statistics for the store. #[derive(Debug)] pub struct StoreStats { @@ -189,6 +200,11 @@ impl Store { self.conn .execute_batch("CREATE INDEX IF NOT EXISTS idx_files_docid ON files(docid);")?; + // Add created_by column (idempotent — ignores error if column already exists). + let _ = self + .conn + .execute_batch("ALTER TABLE files ADD COLUMN created_by TEXT;"); + // Check if edges table exists. let has_edges: bool = { let mut stmt = self @@ -232,6 +248,27 @@ impl Store { );", )?; + // Placement corrections table + self.conn.execute_batch( + "CREATE TABLE IF NOT EXISTS placement_corrections ( + id INTEGER PRIMARY KEY, + file_path TEXT NOT NULL, + suggested_folder TEXT NOT NULL, + actual_folder TEXT NOT NULL, + corrected_at TEXT NOT NULL + );", + )?; + + // Link skiplist table (reserved for future use) + self.conn.execute_batch( + "CREATE TABLE IF NOT EXISTS link_skiplist ( + id INTEGER PRIMARY KEY, + pattern TEXT NOT NULL, + reason TEXT, + created_at TEXT NOT NULL + );", + )?; + Ok(()) } @@ -264,19 +301,21 @@ impl Store { mtime: i64, tags: &[String], docid: &str, + created_by: Option<&str>, ) -> Result { let tags_json = serde_json::to_string(tags).unwrap_or_else(|_| "[]".into()); let now = chrono_now(); self.conn.execute( - "INSERT INTO files (path, content_hash, mtime, tags, indexed_at, docid) - VALUES (?1, ?2, ?3, ?4, ?5, ?6) + "INSERT INTO files (path, content_hash, mtime, tags, indexed_at, docid, created_by) + VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7) ON CONFLICT(path) DO UPDATE SET content_hash = excluded.content_hash, mtime = excluded.mtime, tags = excluded.tags, indexed_at = excluded.indexed_at, - docid = excluded.docid", - params![path, hash, mtime, tags_json, now, docid], + docid = excluded.docid, + created_by = excluded.created_by", + params![path, hash, mtime, tags_json, now, docid, created_by], )?; let file_id: i64 = self.conn.query_row( "SELECT id FROM files WHERE path = ?1", @@ -288,7 +327,7 @@ impl Store { pub fn get_file(&self, path: &str) -> Result> { let mut stmt = self.conn.prepare( - "SELECT id, path, content_hash, mtime, tags, indexed_at, docid FROM files WHERE path = ?1", + "SELECT id, path, content_hash, mtime, tags, indexed_at, docid, created_by FROM files WHERE path = ?1", )?; let mut rows = stmt.query_map(params![path], |row| { Ok(FileRecord { @@ -299,6 +338,7 @@ impl Store { tags: parse_tags(&row.get::<_, String>(4)?), indexed_at: row.get(5)?, docid: row.get(6)?, + created_by: row.get(7)?, }) })?; match rows.next() { @@ -308,9 +348,9 @@ impl Store { } pub fn get_all_files(&self) -> Result> { - let mut stmt = self - .conn - .prepare("SELECT id, path, content_hash, mtime, tags, indexed_at, docid FROM files")?; + let mut stmt = self.conn.prepare( + "SELECT id, path, content_hash, mtime, tags, indexed_at, docid, created_by FROM files", + )?; let rows = stmt.query_map([], |row| { Ok(FileRecord { id: row.get(0)?, @@ -320,6 +360,7 @@ impl Store { tags: parse_tags(&row.get::<_, String>(4)?), indexed_at: row.get(5)?, docid: row.get(6)?, + created_by: row.get(7)?, }) })?; let mut files = Vec::new(); @@ -618,7 +659,7 @@ impl Store { /// Look up a file record by its row ID. pub fn get_file_by_id(&self, file_id: i64) -> Result> { let mut stmt = self.conn.prepare( - "SELECT id, path, content_hash, mtime, tags, indexed_at, docid FROM files WHERE id = ?1", + "SELECT id, path, content_hash, mtime, tags, indexed_at, docid, created_by FROM files WHERE id = ?1", )?; let mut rows = stmt.query_map(params![file_id], |row| { Ok(FileRecord { @@ -629,6 +670,7 @@ impl Store { tags: parse_tags(&row.get::<_, String>(4)?), indexed_at: row.get(5)?, docid: row.get(6)?, + created_by: row.get(7)?, }) })?; match rows.next() { @@ -640,7 +682,7 @@ impl Store { /// Look up a file by its 6-character docid. pub fn get_file_by_docid(&self, docid: &str) -> Result> { let mut stmt = self.conn.prepare( - "SELECT id, path, content_hash, mtime, tags, indexed_at, docid FROM files WHERE docid = ?1", + "SELECT id, path, content_hash, mtime, tags, indexed_at, docid, created_by FROM files WHERE docid = ?1", )?; let mut rows = stmt.query_map(params![docid], |row| { Ok(FileRecord { @@ -651,6 +693,7 @@ impl Store { tags: parse_tags(&row.get::<_, String>(4)?), indexed_at: row.get(5)?, docid: row.get(6)?, + created_by: row.get(7)?, }) })?; match rows.next() { @@ -859,10 +902,11 @@ impl Store { &self, folder: Option<&str>, tags: &[String], + created_by: Option<&str>, limit: usize, ) -> Result> { let mut sql = String::from( - "SELECT id, path, content_hash, mtime, tags, indexed_at, docid FROM files WHERE 1=1", + "SELECT id, path, content_hash, mtime, tags, indexed_at, docid, created_by FROM files WHERE 1=1", ); let mut param_values: Vec> = Vec::new(); if let Some(f) = folder { @@ -873,6 +917,10 @@ impl Store { sql.push_str(" AND EXISTS (SELECT 1 FROM json_each(tags) WHERE value = ?)"); param_values.push(Box::new(tag.clone())); } + if let Some(cb) = created_by { + sql.push_str(" AND created_by = ?"); + param_values.push(Box::new(cb.to_string())); + } sql.push_str(" ORDER BY indexed_at DESC LIMIT ?"); param_values.push(Box::new(limit as i64)); @@ -886,6 +934,7 @@ impl Store { tags: parse_tags(&row.get::<_, String>(4)?), indexed_at: row.get(5)?, docid: row.get(6)?, + created_by: row.get(7)?, }) })?; let mut results = Vec::new(); @@ -935,7 +984,7 @@ impl Store { /// Most recently indexed files. pub fn recent_files(&self, limit: usize) -> Result> { let mut stmt = self.conn.prepare( - "SELECT id, path, content_hash, mtime, tags, indexed_at, docid + "SELECT id, path, content_hash, mtime, tags, indexed_at, docid, created_by FROM files ORDER BY indexed_at DESC LIMIT ?", )?; let rows = stmt.query_map(params![limit as i64], |row| { @@ -947,6 +996,7 @@ impl Store { tags: parse_tags(&row.get::<_, String>(4)?), indexed_at: row.get(5)?, docid: row.get(6)?, + created_by: row.get(7)?, }) })?; let mut results = Vec::new(); @@ -1014,7 +1064,7 @@ impl Store { } // Basename match via SQL let mut stmt = self.conn.prepare( - "SELECT id, path, content_hash, mtime, tags, indexed_at, docid FROM files + "SELECT id, path, content_hash, mtime, tags, indexed_at, docid, created_by FROM files WHERE lower(path) LIKE '%/' || lower(?1) OR lower(path) = lower(?1) ORDER BY length(path) ASC LIMIT 1", )?; @@ -1027,6 +1077,7 @@ impl Store { tags: parse_tags(&row.get::<_, String>(4)?), indexed_at: row.get(5)?, docid: row.get(6)?, + created_by: row.get(7)?, }) })?; match rows.next() { @@ -1035,6 +1086,21 @@ impl Store { } } + /// Rename a file's path in the store, preserving its row ID (and thus edge integrity). + pub fn update_file_path(&self, old_path: &str, new_path: &str, new_docid: &str) -> Result<()> { + if self.get_file(new_path)?.is_some() { + anyhow::bail!("target path already exists: {}", new_path); + } + let rows_affected = self.conn.execute( + "UPDATE files SET path = ?1, docid = ?2 WHERE path = ?3", + params![new_path, new_docid, old_path], + )?; + if rows_affected == 0 { + anyhow::bail!("file not found: {}", old_path); + } + Ok(()) + } + // ── Vec (sqlite-vec) ──────────────────────────────────────── pub fn insert_vec(&self, vector_id: u64, embedding: &[f32]) -> Result<()> { @@ -1113,6 +1179,147 @@ impl Store { Ok(results) } + /// Get a single folder's centroid and file count. + pub fn get_folder_centroid(&self, folder: &str) -> Result, usize)>> { + let mut stmt = self + .conn + .prepare("SELECT centroid, file_count FROM folder_centroids WHERE folder = ?1")?; + let mut rows = stmt.query_map(params![folder], |row| { + let blob: Vec = row.get(0)?; + let count: i64 = row.get(1)?; + let centroid: Vec = blob + .chunks_exact(4) + .map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + Ok((centroid, count as usize)) + })?; + match rows.next() { + Some(row) => Ok(Some(row?)), + None => Ok(None), + } + } + + /// Incrementally adjust a folder centroid using online mean math. + /// If `increment` is true, adds a file vector; if false, removes one. + pub fn adjust_folder_centroid( + &self, + folder: &str, + file_vec: &[f32], + increment: bool, + ) -> Result<()> { + let existing = self.get_folder_centroid(folder)?; + match (existing, increment) { + (None, true) => { + // New folder — centroid is just this vector + self.upsert_folder_centroid(folder, file_vec, 1)?; + } + (None, false) => { + // Nothing to remove from — no-op + } + (Some((old, n)), true) => { + // online mean addition: new = (old * n + vec) / (n + 1) + let nf = n as f32; + let new_n = n + 1; + let updated: Vec = old + .iter() + .zip(file_vec.iter()) + .map(|(o, v)| (o * nf + v) / new_n as f32) + .collect(); + self.upsert_folder_centroid(folder, &updated, new_n)?; + } + (Some((_old, n)), false) if n <= 1 => { + // Last file — delete centroid row + self.conn.execute( + "DELETE FROM folder_centroids WHERE folder = ?1", + params![folder], + )?; + } + (Some((old, n)), false) => { + // online mean subtraction: new = (old * n - vec) / (n - 1) + let nf = n as f32; + let new_n = n - 1; + let updated: Vec = old + .iter() + .zip(file_vec.iter()) + .map(|(o, v)| (o * nf - v) / new_n as f32) + .collect(); + self.upsert_folder_centroid(folder, &updated, new_n)?; + } + } + Ok(()) + } + + // ── Chunk vectors ────────────────────────────────────────── + + /// Retrieve all chunk vectors for a given file, ordered by chunk id. + pub fn get_chunk_vectors_for_file(&self, file_id: i64) -> Result>> { + let mut stmt = self.conn.prepare( + "SELECT vector FROM chunks WHERE file_id = ?1 AND vector IS NOT NULL ORDER BY id", + )?; + let rows = stmt.query_map(params![file_id], |row| { + let blob: Vec = row.get(0)?; + let vector: Vec = blob + .chunks_exact(4) + .map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + Ok(vector) + })?; + let mut results = Vec::new(); + for row in rows { + results.push(row?); + } + Ok(results) + } + + // ── Placement corrections ──────────────────────────────────── + + /// Record a placement correction (user moved a note from suggested folder). + pub fn insert_placement_correction( + &self, + file_path: &str, + suggested_folder: &str, + actual_folder: &str, + ) -> Result<()> { + let dt = time::OffsetDateTime::now_utc(); + let now = format!( + "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}Z", + dt.year(), + dt.month() as u8, + dt.day(), + dt.hour(), + dt.minute(), + dt.second(), + ); + self.conn.execute( + "INSERT INTO placement_corrections (file_path, suggested_folder, actual_folder, corrected_at) + VALUES (?1, ?2, ?3, ?4)", + params![file_path, suggested_folder, actual_folder, now], + )?; + Ok(()) + } + + /// Get recent placement corrections, latest first. + pub fn get_placement_corrections(&self, limit: usize) -> Result> { + let mut stmt = self.conn.prepare( + "SELECT id, file_path, suggested_folder, actual_folder, corrected_at + FROM placement_corrections ORDER BY id DESC LIMIT ?1", + )?; + let rows = stmt.query_map(params![limit as i64], |row| { + Ok(PlacementCorrection { + id: row.get(0)?, + file_path: row.get(1)?, + suggested_folder: row.get(2)?, + actual_folder: row.get(3)?, + corrected_at: row.get(4)?, + }) + })?; + let mut results = Vec::new(); + for row in rows { + results.push(row?); + } + Ok(results) + } + // ── Helpers ───────────────────────────────────────────────── pub fn next_vector_id(&self) -> Result { @@ -1229,7 +1436,7 @@ mod tests { let tags = vec!["rust".to_string(), "programming".to_string()]; let docid = generate_docid("notes/test.md"); let file_id = store - .insert_file("notes/test.md", "abc123", 1700000000, &tags, &docid) + .insert_file("notes/test.md", "abc123", 1700000000, &tags, &docid, None) .unwrap(); assert!(file_id > 0); @@ -1251,6 +1458,7 @@ mod tests { 100, &[], &generate_docid("notes/chunk_test.md"), + None, ) .unwrap(); @@ -1282,6 +1490,7 @@ mod tests { 100, &[], &generate_docid("notes/del.md"), + None, ) .unwrap(); store.insert_chunk(file_id, "H", "snippet", 10, 5).unwrap(); @@ -1331,6 +1540,7 @@ mod tests { 100, &["tag1".to_string()], &docid, + None, ) .unwrap(); store.insert_chunk(file_id, "H", "text", 50, 10).unwrap(); @@ -1353,6 +1563,7 @@ mod tests { 200, &["tag1".to_string()], &docid, + None, ) .unwrap(); store @@ -1401,7 +1612,7 @@ mod tests { let store = Store::open_memory().unwrap(); let docid = generate_docid("notes/findme.md"); store - .insert_file("notes/findme.md", "hash", 100, &[], &docid) + .insert_file("notes/findme.md", "hash", 100, &[], &docid, None) .unwrap(); let rec = store.get_file_by_docid(&docid).unwrap().unwrap(); @@ -1417,10 +1628,24 @@ mod tests { /// Helper: create two files and return their IDs. fn setup_two_files(store: &Store) -> (i64, i64) { let a = store - .insert_file("notes/a.md", "ha", 100, &[], &generate_docid("notes/a.md")) + .insert_file( + "notes/a.md", + "ha", + 100, + &[], + &generate_docid("notes/a.md"), + None, + ) .unwrap(); let b = store - .insert_file("notes/b.md", "hb", 100, &[], &generate_docid("notes/b.md")) + .insert_file( + "notes/b.md", + "hb", + 100, + &[], + &generate_docid("notes/b.md"), + None, + ) .unwrap(); (a, b) } @@ -1450,7 +1675,14 @@ mod tests { let store = Store::open_memory().unwrap(); let (a, b) = setup_two_files(&store); let c = store - .insert_file("notes/c.md", "hc", 100, &[], &generate_docid("notes/c.md")) + .insert_file( + "notes/c.md", + "hc", + 100, + &[], + &generate_docid("notes/c.md"), + None, + ) .unwrap(); // a -> b, c -> a @@ -1471,7 +1703,14 @@ mod tests { let store = Store::open_memory().unwrap(); let (a, b) = setup_two_files(&store); let c = store - .insert_file("notes/c.md", "hc", 100, &[], &generate_docid("notes/c.md")) + .insert_file( + "notes/c.md", + "hc", + 100, + &[], + &generate_docid("notes/c.md"), + None, + ) .unwrap(); // a -> b, b -> c @@ -1507,7 +1746,14 @@ mod tests { let store = Store::open_memory().unwrap(); let (a, b) = setup_two_files(&store); let c = store - .insert_file("notes/c.md", "hc", 100, &[], &generate_docid("notes/c.md")) + .insert_file( + "notes/c.md", + "hc", + 100, + &[], + &generate_docid("notes/c.md"), + None, + ) .unwrap(); store.insert_edge(a, b, "wikilink").unwrap(); @@ -1536,13 +1782,13 @@ mod tests { fn test_get_neighbors_depth_1() { let store = Store::open_memory().unwrap(); let f1 = store - .insert_file("n/f1.md", "h1", 100, &[], &generate_docid("n/f1.md")) + .insert_file("n/f1.md", "h1", 100, &[], &generate_docid("n/f1.md"), None) .unwrap(); let f2 = store - .insert_file("n/f2.md", "h2", 100, &[], &generate_docid("n/f2.md")) + .insert_file("n/f2.md", "h2", 100, &[], &generate_docid("n/f2.md"), None) .unwrap(); let f3 = store - .insert_file("n/f3.md", "h3", 100, &[], &generate_docid("n/f3.md")) + .insert_file("n/f3.md", "h3", 100, &[], &generate_docid("n/f3.md"), None) .unwrap(); store.insert_edge(f1, f2, "wikilink").unwrap(); @@ -1565,16 +1811,16 @@ mod tests { fn test_get_neighbors_depth_2() { let store = Store::open_memory().unwrap(); let f1 = store - .insert_file("n/f1.md", "h1", 100, &[], &generate_docid("n/f1.md")) + .insert_file("n/f1.md", "h1", 100, &[], &generate_docid("n/f1.md"), None) .unwrap(); let f2 = store - .insert_file("n/f2.md", "h2", 100, &[], &generate_docid("n/f2.md")) + .insert_file("n/f2.md", "h2", 100, &[], &generate_docid("n/f2.md"), None) .unwrap(); let f3 = store - .insert_file("n/f3.md", "h3", 100, &[], &generate_docid("n/f3.md")) + .insert_file("n/f3.md", "h3", 100, &[], &generate_docid("n/f3.md"), None) .unwrap(); let f4 = store - .insert_file("n/f4.md", "h4", 100, &[], &generate_docid("n/f4.md")) + .insert_file("n/f4.md", "h4", 100, &[], &generate_docid("n/f4.md"), None) .unwrap(); // f1 -> f2 -> f3 -> f4 @@ -1602,6 +1848,7 @@ mod tests { 100, &["rust".to_string(), "cli".to_string()], &generate_docid("n/f1.md"), + None, ) .unwrap(); let f2 = store @@ -1611,6 +1858,7 @@ mod tests { 100, &["rust".to_string(), "web".to_string()], &generate_docid("n/f2.md"), + None, ) .unwrap(); let _f3 = store @@ -1620,6 +1868,7 @@ mod tests { 100, &["python".to_string()], &generate_docid("n/f3.md"), + None, ) .unwrap(); @@ -1632,7 +1881,14 @@ mod tests { fn test_file_contains_term() { let store = Store::open_memory().unwrap(); let f1 = store - .insert_file("n/fts.md", "h1", 100, &[], &generate_docid("n/fts.md")) + .insert_file( + "n/fts.md", + "h1", + 100, + &[], + &generate_docid("n/fts.md"), + None, + ) .unwrap(); store @@ -1648,7 +1904,14 @@ mod tests { fn test_get_best_chunk_for_file() { let store = Store::open_memory().unwrap(); let f1 = store - .insert_file("n/best.md", "h1", 100, &[], &generate_docid("n/best.md")) + .insert_file( + "n/best.md", + "h1", + 100, + &[], + &generate_docid("n/best.md"), + None, + ) .unwrap(); store @@ -1667,17 +1930,17 @@ mod tests { fn test_get_edge_stats() { let store = Store::open_memory().unwrap(); let a = store - .insert_file("n/a.md", "ha", 100, &[], &generate_docid("n/a.md")) + .insert_file("n/a.md", "ha", 100, &[], &generate_docid("n/a.md"), None) .unwrap(); let b = store - .insert_file("n/b.md", "hb", 100, &[], &generate_docid("n/b.md")) + .insert_file("n/b.md", "hb", 100, &[], &generate_docid("n/b.md"), None) .unwrap(); let c = store - .insert_file("n/c.md", "hc", 100, &[], &generate_docid("n/c.md")) + .insert_file("n/c.md", "hc", 100, &[], &generate_docid("n/c.md"), None) .unwrap(); // d is isolated (no edges). let _d = store - .insert_file("n/d.md", "hd", 100, &[], &generate_docid("n/d.md")) + .insert_file("n/d.md", "hd", 100, &[], &generate_docid("n/d.md"), None) .unwrap(); store.insert_edge(a, b, "wikilink").unwrap(); @@ -1696,10 +1959,24 @@ mod tests { fn test_list_files_no_filter() { let store = Store::open_memory().unwrap(); store - .insert_file("01-Projects/a.md", "h1", 100, &["rust".into()], "aaa111") + .insert_file( + "01-Projects/a.md", + "h1", + 100, + &["rust".into()], + "aaa111", + None, + ) .unwrap(); store - .insert_file("02-Areas/b.md", "h2", 200, &["health".into()], "bbb222") + .insert_file( + "02-Areas/b.md", + "h2", + 200, + &["health".into()], + "bbb222", + None, + ) .unwrap(); store .insert_file( @@ -1708,9 +1985,10 @@ mod tests { 300, &["rust".into(), "cli".into()], "ccc333", + None, ) .unwrap(); - let files = store.list_files(None, &[], 20).unwrap(); + let files = store.list_files(None, &[], None, 20).unwrap(); assert_eq!(files.len(), 3); } @@ -1718,12 +1996,14 @@ mod tests { fn test_list_files_folder_filter() { let store = Store::open_memory().unwrap(); store - .insert_file("01-Projects/a.md", "h1", 100, &[], "aaa111") + .insert_file("01-Projects/a.md", "h1", 100, &[], "aaa111", None) .unwrap(); store - .insert_file("02-Areas/b.md", "h2", 200, &[], "bbb222") + .insert_file("02-Areas/b.md", "h2", 200, &[], "bbb222", None) + .unwrap(); + let files = store + .list_files(Some("01-Projects"), &[], None, 20) .unwrap(); - let files = store.list_files(Some("01-Projects"), &[], 20).unwrap(); assert_eq!(files.len(), 1); assert_eq!(files[0].path, "01-Projects/a.md"); } @@ -1732,36 +2012,74 @@ mod tests { fn test_list_files_tag_filter() { let store = Store::open_memory().unwrap(); store - .insert_file("a.md", "h1", 100, &["rust".into(), "cli".into()], "aaa111") + .insert_file( + "a.md", + "h1", + 100, + &["rust".into(), "cli".into()], + "aaa111", + None, + ) .unwrap(); store - .insert_file("b.md", "h2", 200, &["rust".into()], "bbb222") + .insert_file("b.md", "h2", 200, &["rust".into()], "bbb222", None) .unwrap(); store - .insert_file("c.md", "h3", 300, &["python".into()], "ccc333") + .insert_file("c.md", "h3", 300, &["python".into()], "ccc333", None) .unwrap(); - let files = store.list_files(None, &["rust".into()], 20).unwrap(); + let files = store.list_files(None, &["rust".into()], None, 20).unwrap(); assert_eq!(files.len(), 2); let files = store - .list_files(None, &["rust".into(), "cli".into()], 20) + .list_files(None, &["rust".into(), "cli".into()], None, 20) .unwrap(); assert_eq!(files.len(), 1); assert_eq!(files[0].path, "a.md"); } + #[test] + fn test_list_files_created_by_filter() { + let store = Store::open_memory().unwrap(); + store + .insert_file("a.md", "h1", 100, &[], "aaa111", Some("cli")) + .unwrap(); + store + .insert_file("b.md", "h2", 200, &[], "bbb222", Some("mcp")) + .unwrap(); + store + .insert_file("c.md", "h3", 300, &[], "ccc333", None) + .unwrap(); + + // Filter by "cli" → only the cli-created file + let files = store.list_files(None, &[], Some("cli"), 20).unwrap(); + assert_eq!(files.len(), 1); + assert_eq!(files[0].path, "a.md"); + assert_eq!(files[0].created_by, Some("cli".to_string())); + + // Filter by "mcp" → only the mcp-created file + let files = store.list_files(None, &[], Some("mcp"), 20).unwrap(); + assert_eq!(files.len(), 1); + assert_eq!(files[0].path, "b.md"); + + // Filter by None → all 3 + let files = store.list_files(None, &[], None, 20).unwrap(); + assert_eq!(files.len(), 3); + } + #[test] fn test_folder_note_counts() { let store = Store::open_memory().unwrap(); store - .insert_file("01-Projects/a.md", "h1", 100, &[], "a1") + .insert_file("01-Projects/a.md", "h1", 100, &[], "a1", None) .unwrap(); store - .insert_file("01-Projects/b.md", "h2", 100, &[], "b2") + .insert_file("01-Projects/b.md", "h2", 100, &[], "b2", None) .unwrap(); store - .insert_file("02-Areas/c.md", "h3", 100, &[], "c3") + .insert_file("02-Areas/c.md", "h3", 100, &[], "c3", None) + .unwrap(); + store + .insert_file("root.md", "h4", 100, &[], "d4", None) .unwrap(); - store.insert_file("root.md", "h4", 100, &[], "d4").unwrap(); let counts = store.folder_note_counts().unwrap(); assert!(counts.iter().any(|(f, c)| f == "01-Projects" && *c == 2)); assert!(counts.iter().any(|(f, c)| f == "02-Areas" && *c == 1)); @@ -1772,13 +2090,27 @@ mod tests { fn test_top_tags() { let store = Store::open_memory().unwrap(); store - .insert_file("a.md", "h1", 100, &["rust".into(), "cli".into()], "a1") + .insert_file( + "a.md", + "h1", + 100, + &["rust".into(), "cli".into()], + "a1", + None, + ) .unwrap(); store - .insert_file("b.md", "h2", 100, &["rust".into(), "web".into()], "b2") + .insert_file( + "b.md", + "h2", + 100, + &["rust".into(), "web".into()], + "b2", + None, + ) .unwrap(); store - .insert_file("c.md", "h3", 100, &["rust".into()], "c3") + .insert_file("c.md", "h3", 100, &["rust".into()], "c3", None) .unwrap(); let tags = store.top_tags(10).unwrap(); assert_eq!(tags[0].0, "rust"); @@ -1788,8 +2120,12 @@ mod tests { #[test] fn test_recent_files() { let store = Store::open_memory().unwrap(); - store.insert_file("old.md", "h1", 100, &[], "a1").unwrap(); - store.insert_file("new.md", "h2", 200, &[], "b2").unwrap(); + store + .insert_file("old.md", "h1", 100, &[], "a1", None) + .unwrap(); + store + .insert_file("new.md", "h2", 200, &[], "b2", None) + .unwrap(); let recent = store.recent_files(1).unwrap(); assert_eq!(recent.len(), 1); } @@ -1797,8 +2133,12 @@ mod tests { #[test] fn test_edge_count_for_file() { let store = Store::open_memory().unwrap(); - let f1 = store.insert_file("a.md", "h1", 100, &[], "a1").unwrap(); - let f2 = store.insert_file("b.md", "h2", 100, &[], "b2").unwrap(); + let f1 = store + .insert_file("a.md", "h1", 100, &[], "a1", None) + .unwrap(); + let f2 = store + .insert_file("b.md", "h2", 100, &[], "b2", None) + .unwrap(); store.insert_edge(f1, f2, "wikilink").unwrap(); store.insert_edge(f2, f1, "wikilink").unwrap(); assert_eq!(store.edge_count_for_file(f1).unwrap(), 2); @@ -1809,10 +2149,10 @@ mod tests { fn test_find_file_by_basename() { let store = Store::open_memory().unwrap(); store - .insert_file("01-Projects/Work/note.md", "h1", 100, &[], "aaa111") + .insert_file("01-Projects/Work/note.md", "h1", 100, &[], "aaa111", None) .unwrap(); store - .insert_file("root.md", "h2", 100, &[], "bbb222") + .insert_file("root.md", "h2", 100, &[], "bbb222", None) .unwrap(); let found = store.find_file_by_basename("note").unwrap(); @@ -1829,9 +2169,15 @@ mod tests { #[test] fn test_edge_counts_for_files() { let store = Store::open_memory().unwrap(); - let f1 = store.insert_file("a.md", "h1", 100, &[], "a1").unwrap(); - let f2 = store.insert_file("b.md", "h2", 100, &[], "b2").unwrap(); - let f3 = store.insert_file("c.md", "h3", 100, &[], "c3").unwrap(); + let f1 = store + .insert_file("a.md", "h1", 100, &[], "a1", None) + .unwrap(); + let f2 = store + .insert_file("b.md", "h2", 100, &[], "b2", None) + .unwrap(); + let f3 = store + .insert_file("c.md", "h3", 100, &[], "c3", None) + .unwrap(); store.insert_edge(f1, f2, "wikilink").unwrap(); store.insert_edge(f2, f1, "wikilink").unwrap(); store.insert_edge(f1, f3, "wikilink").unwrap(); @@ -1879,7 +2225,7 @@ mod tests { let store = Store::open_memory().unwrap(); // Insert a file + chunk with a vector BLOB. let file_id = store - .insert_file("test.md", "hash123", 0, &[], "abc123") + .insert_file("test.md", "hash123", 0, &[], "abc123", None) .unwrap(); let vector: Vec = (0..384).map(|i| (i as f32) / 384.0).collect(); store @@ -1951,4 +2297,231 @@ mod tests { let stale = store.stale_tags(1).unwrap(); assert!(stale.is_empty()); } + + #[test] + fn test_adjust_folder_centroid_increment() { + let store = Store::open_memory().unwrap(); + // Seed centroid [1.0, 0.0, 0.0] with n=2 + store + .upsert_folder_centroid("01-Projects", &[1.0, 0.0, 0.0], 2) + .unwrap(); + // Add [0.0, 1.0, 0.0] → new = (old*2 + new) / 3 = [2/3, 1/3, 0] + store + .adjust_folder_centroid("01-Projects", &[0.0, 1.0, 0.0], true) + .unwrap(); + let (centroid, count) = store + .get_folder_centroid("01-Projects") + .unwrap() + .expect("centroid should exist"); + assert_eq!(count, 3); + assert!((centroid[0] - 0.6667).abs() < 0.01); + assert!((centroid[1] - 0.3333).abs() < 0.01); + assert!((centroid[2] - 0.0).abs() < 0.01); + } + + #[test] + fn test_adjust_folder_centroid_decrement() { + let store = Store::open_memory().unwrap(); + // Seed centroid [0.667, 0.333, 0.0] with n=3 + store + .upsert_folder_centroid("01-Projects", &[0.667, 0.333, 0.0], 3) + .unwrap(); + // Remove [0.0, 1.0, 0.0] → new = (old*3 - vec) / 2 = [1.0005, ~0.0, 0.0] + store + .adjust_folder_centroid("01-Projects", &[0.0, 1.0, 0.0], false) + .unwrap(); + let (centroid, count) = store + .get_folder_centroid("01-Projects") + .unwrap() + .expect("centroid should exist"); + assert_eq!(count, 2); + assert!((centroid[0] - 1.0).abs() < 0.01); + assert!((centroid[1] - 0.0).abs() < 0.02); // (0.333*3 - 1.0)/2 = ~0.0 + assert!((centroid[2] - 0.0).abs() < 0.01); + } + + #[test] + fn test_adjust_folder_centroid_decrement_last_file() { + let store = Store::open_memory().unwrap(); + // Seed with n=1 + store + .upsert_folder_centroid("01-Projects", &[1.0, 0.0, 0.0], 1) + .unwrap(); + // Remove last file → centroid deleted + store + .adjust_folder_centroid("01-Projects", &[1.0, 0.0, 0.0], false) + .unwrap(); + let result = store.get_folder_centroid("01-Projects").unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_adjust_folder_centroid_new_folder() { + let store = Store::open_memory().unwrap(); + // No existing centroid, increment → creates centroid + store + .adjust_folder_centroid("02-Areas", &[0.5, 0.5, 0.0], true) + .unwrap(); + let (centroid, count) = store + .get_folder_centroid("02-Areas") + .unwrap() + .expect("centroid should exist"); + assert_eq!(count, 1); + assert!((centroid[0] - 0.5).abs() < 0.01); + assert!((centroid[1] - 0.5).abs() < 0.01); + assert!((centroid[2] - 0.0).abs() < 0.01); + } + + #[test] + fn test_insert_file_with_created_by() { + let store = Store::open_memory().unwrap(); + let docid = generate_docid("notes/test.md"); + store + .insert_file("notes/test.md", "hash1", 100, &[], &docid, Some("cli")) + .unwrap(); + let rec = store.get_file("notes/test.md").unwrap().unwrap(); + assert_eq!(rec.created_by, Some("cli".to_string())); + } + + #[test] + fn test_insert_file_without_created_by() { + let store = Store::open_memory().unwrap(); + let docid = generate_docid("notes/test.md"); + store + .insert_file("notes/test.md", "hash1", 100, &[], &docid, None) + .unwrap(); + let rec = store.get_file("notes/test.md").unwrap().unwrap(); + assert_eq!(rec.created_by, None); + } + + #[test] + fn test_update_file_path() { + let store = Store::open_memory().unwrap(); + let old_docid = generate_docid("notes/old.md"); + let file_id = store + .insert_file("notes/old.md", "hash1", 100, &[], &old_docid, None) + .unwrap(); + + let new_docid = generate_docid("notes/new.md"); + store + .update_file_path("notes/old.md", "notes/new.md", &new_docid) + .unwrap(); + + // Old path should be gone + assert!(store.get_file("notes/old.md").unwrap().is_none()); + // New path should exist with same file_id + let rec = store.get_file("notes/new.md").unwrap().unwrap(); + assert_eq!(rec.id, file_id); + assert_eq!(rec.docid.unwrap(), new_docid); + } + + #[test] + fn test_update_file_path_collision() { + let store = Store::open_memory().unwrap(); + store + .insert_file( + "notes/a.md", + "h1", + 100, + &[], + &generate_docid("notes/a.md"), + None, + ) + .unwrap(); + store + .insert_file( + "notes/b.md", + "h2", + 100, + &[], + &generate_docid("notes/b.md"), + None, + ) + .unwrap(); + + // Renaming a→b should fail because b already exists + let result = + store.update_file_path("notes/a.md", "notes/b.md", &generate_docid("notes/b.md")); + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("already exists")); + } + + #[test] + fn test_get_chunk_vectors_for_file() { + let store = Store::open_memory().unwrap(); + let file_id = store + .insert_file( + "notes/vec.md", + "h1", + 100, + &[], + &generate_docid("notes/vec.md"), + None, + ) + .unwrap(); + + let v1: Vec = vec![1.0, 2.0, 3.0]; + let v2: Vec = vec![4.0, 5.0, 6.0]; + store + .insert_chunk_with_vector(file_id, "H1", "text1", 100, 10, &v1) + .unwrap(); + store + .insert_chunk_with_vector(file_id, "H2", "text2", 101, 10, &v2) + .unwrap(); + + let vectors = store.get_chunk_vectors_for_file(file_id).unwrap(); + assert_eq!(vectors.len(), 2); + assert_eq!(vectors[0], v1); + assert_eq!(vectors[1], v2); + } + + #[test] + fn test_get_chunk_vectors_empty() { + let store = Store::open_memory().unwrap(); + let file_id = store + .insert_file( + "notes/empty.md", + "h1", + 100, + &[], + &generate_docid("notes/empty.md"), + None, + ) + .unwrap(); + + let vectors = store.get_chunk_vectors_for_file(file_id).unwrap(); + assert!(vectors.is_empty()); + } + + #[test] + fn test_insert_placement_correction() { + let store = Store::open_memory().unwrap(); + store + .insert_placement_correction("notes/test.md", "00-Inbox", "01-Projects/Work") + .unwrap(); + + let corrections = store.get_placement_corrections(10).unwrap(); + assert_eq!(corrections.len(), 1); + assert_eq!(corrections[0].file_path, "notes/test.md"); + assert_eq!(corrections[0].suggested_folder, "00-Inbox"); + assert_eq!(corrections[0].actual_folder, "01-Projects/Work"); + assert!(!corrections[0].corrected_at.is_empty()); + } + + #[test] + fn test_get_placement_corrections_ordering() { + let store = Store::open_memory().unwrap(); + store + .insert_placement_correction("notes/first.md", "00-Inbox", "01-Projects") + .unwrap(); + store + .insert_placement_correction("notes/second.md", "02-Areas", "03-Resources") + .unwrap(); + + let corrections = store.get_placement_corrections(10).unwrap(); + assert_eq!(corrections.len(), 2); + // Latest first (ORDER BY id DESC) + assert_eq!(corrections[0].file_path, "notes/second.md"); + assert_eq!(corrections[1].file_path, "notes/first.md"); + } } diff --git a/src/watcher.rs b/src/watcher.rs new file mode 100644 index 0000000..915f3e1 --- /dev/null +++ b/src/watcher.rs @@ -0,0 +1,656 @@ +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::Duration; + +use notify::RecursiveMode; +use notify_debouncer_full::{DebouncedEvent, new_debouncer}; +use tokio::sync::Mutex; +use tokio::sync::mpsc; +use tokio::sync::oneshot; + +use crate::config::Config; +use crate::embedder::Embedder; +use crate::indexer; +use crate::placement; +use crate::profile::VaultProfile; +use crate::store::Store; + +/// Start the file watcher and consumer. Returns a thread handle for the producer +/// and a shutdown sender. On startup, runs a reconciliation index to catch any +/// changes that occurred while the server was down, then begins watching for +/// real-time file changes. +pub fn start_watcher( + store: Arc>, + embedder: Arc>, + vault_path: Arc, + profile: Arc>, + config: Config, + exclude: Vec, +) -> anyhow::Result<(std::thread::JoinHandle<()>, oneshot::Sender<()>)> { + let (tx, rx) = mpsc::channel::>(64); + let (shutdown_tx, shutdown_rx) = oneshot::channel::<()>(); + + // Start producer (begins buffering events immediately) + let producer_handle = start_producer(vault_path.as_ref().clone(), exclude, tx, shutdown_rx); + + // Spawn consumer task + let store_clone = store.clone(); + let embedder_clone = embedder.clone(); + let vault_clone = vault_path.clone(); + let profile_clone = profile.clone(); + let config_clone = config.clone(); + tokio::spawn(async move { + // Startup reconciliation: run index to catch changes since last shutdown + { + let store_lock = store_clone.lock().await; + let mut embedder_lock = embedder_clone.lock().await; + if let Err(e) = crate::indexer::run_index_shared( + &vault_clone, + &config_clone, + &store_lock, + &mut embedder_lock, + false, + ) { + tracing::warn!("Startup reconciliation failed: {:#}", e); + } + } + + // Then consume events + run_consumer( + rx, + store_clone, + embedder_clone, + vault_clone, + profile_clone, + config_clone, + ) + .await; + }); + + Ok((producer_handle, shutdown_tx)) +} + +/// Events sent from the watcher producer to the consumer. +#[derive(Debug, Clone)] +pub enum WatchEvent { + /// File content was modified or a new file was created. + Changed(PathBuf), + /// File was deleted. + Deleted(PathBuf), + /// File was moved/renamed (detected via content hash or inode tracking). + Moved { from: PathBuf, to: PathBuf }, + /// macOS FSEvents buffer overflow — full rescan needed. + FullRescan, +} + +/// Start the producer thread. Returns thread handle. +/// The producer watches the vault, debounces events, and sends batches to tx. +pub fn start_producer( + vault_path: PathBuf, + exclude: Vec, + tx: mpsc::Sender>, + mut shutdown_rx: oneshot::Receiver<()>, +) -> std::thread::JoinHandle<()> { + std::thread::spawn(move || { + // Create std channel for debouncer events + let (debouncer_tx, debouncer_rx) = std::sync::mpsc::channel(); + + let mut debouncer = match new_debouncer(Duration::from_secs(2), None, debouncer_tx) { + Ok(d) => d, + Err(e) => { + tracing::error!("Failed to create file watcher: {}", e); + return; + } + }; + + if let Err(e) = debouncer.watch(&vault_path, RecursiveMode::Recursive) { + tracing::error!("Failed to watch {:?}: {}", vault_path, e); + return; + } + + tracing::info!("File watcher started for {:?}", vault_path); + + loop { + // Check shutdown (non-blocking) + if shutdown_rx.try_recv().is_ok() { + tracing::info!("Watcher shutting down"); + break; + } + + match debouncer_rx.recv_timeout(Duration::from_millis(500)) { + Ok(Ok(events)) => { + let watch_events = process_debounced_events(&events, &vault_path, &exclude); + if !watch_events.is_empty() && tx.blocking_send(watch_events).is_err() { + tracing::info!("Consumer gone, watcher exiting"); + break; + } + } + Ok(Err(errors)) => { + for e in errors { + tracing::warn!("Watcher error: {:?}", e); + } + } + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => continue, + Err(std::sync::mpsc::RecvTimeoutError::Disconnected) => break, + } + } + }) +} + +/// Convert `DebouncedEvent`s to `WatchEvent`s, filtering to `.md` files. +fn process_debounced_events( + events: &[DebouncedEvent], + vault_path: &Path, + exclude: &[String], +) -> Vec { + let mut result = Vec::new(); + + for debounced in events { + let event = &debounced.event; // Access the inner notify::Event + + let paths: Vec<&PathBuf> = event + .paths + .iter() + .filter(|p| p.extension().map(|e| e == "md").unwrap_or(false)) + .filter(|p| !is_excluded(p, vault_path, exclude)) + .collect(); + + if paths.is_empty() { + continue; + } + + use notify::EventKind; + match &event.kind { + EventKind::Create(_) | EventKind::Modify(_) => { + for path in paths { + result.push(WatchEvent::Changed(path.clone())); + } + } + EventKind::Remove(_) => { + for path in paths { + result.push(WatchEvent::Deleted(path.clone())); + } + } + EventKind::Other => { + result.push(WatchEvent::FullRescan); + } + _ => {} + } + } + + result +} + +/// Check if a path should be excluded (mirrors `walk_vault` logic in `indexer.rs`). +fn is_excluded(path: &Path, vault_path: &Path, exclude: &[String]) -> bool { + let rel = path.strip_prefix(vault_path).unwrap_or(path); + let rel_str = rel.to_string_lossy(); + exclude.iter().any(|pattern| { + if pattern.ends_with('/') { + let dir_name = pattern.trim_end_matches('/'); + rel_str.split('/').any(|component| component == dir_name) + } else { + rel_str.contains(pattern.as_str()) + } + }) +} + +/// Detect file moves by matching `Deleted` + `Changed` pairs via content hash. +/// +/// When a file is moved, the OS reports a delete at the old path and a create at +/// the new path. We match these by comparing the stored content hash (for the +/// deleted file) against the on-disk content hash (for the new file). Matched +/// pairs are replaced with `Moved { from, to }` events. +fn detect_moves(events: &mut Vec, store: &Store, vault_path: &Path) { + // Collect deletion paths and their stored content hashes. + let mut deletion_hashes: HashMap = HashMap::new(); + for event in events.iter() { + if let WatchEvent::Deleted(path) = event { + let rel = path + .strip_prefix(vault_path) + .unwrap_or(path) + .to_string_lossy() + .to_string(); + if let Ok(Some(record)) = store.get_file(&rel) { + deletion_hashes.insert(record.content_hash.clone(), path.clone()); + } + } + } + + if deletion_hashes.is_empty() { + return; + } + + // Collect creation paths (Changed events for files NOT already in store = new files). + let mut creation_hashes: HashMap = HashMap::new(); + for event in events.iter() { + if let WatchEvent::Changed(path) = event { + let rel = path + .strip_prefix(vault_path) + .unwrap_or(path) + .to_string_lossy() + .to_string(); + // Only consider files not already in the store (truly new files). + if store.get_file(&rel).ok().flatten().is_none() + && let Ok(hash) = indexer::compute_file_hash(path) + { + creation_hashes.insert(hash, path.clone()); + } + } + } + + // Match deletions to creations by content hash. + let mut moves: Vec<(PathBuf, PathBuf)> = Vec::new(); + for (hash, del_path) in &deletion_hashes { + if let Some(create_path) = creation_hashes.get(hash) { + moves.push((del_path.clone(), create_path.clone())); + } + } + + if moves.is_empty() { + return; + } + + // Replace matched pairs with Moved events. + let move_from_set: std::collections::HashSet = + moves.iter().map(|(from, _)| from.clone()).collect(); + let move_to_set: std::collections::HashSet = + moves.iter().map(|(_, to)| to.clone()).collect(); + + events.retain(|event| match event { + WatchEvent::Deleted(p) => !move_from_set.contains(p), + WatchEvent::Changed(p) => !move_to_set.contains(p), + _ => true, + }); + + for (from, to) in moves { + tracing::info!(from = %from.display(), to = %to.display(), "detected file move"); + events.push(WatchEvent::Moved { from, to }); + } +} + +/// Consumer async task that processes batches of watch events. +/// +/// Two-pass processing: +/// - Pass 1: Apply mutations (index/remove/rename files) +/// - Pass 2: Rebuild edges for affected files +pub async fn run_consumer( + mut rx: mpsc::Receiver>, + store: Arc>, + embedder: Arc>, + vault_path: Arc, + _profile: Arc>, + config: Config, +) { + tracing::info!("Watcher consumer started"); + + while let Some(mut events) = rx.recv().await { + tracing::info!(count = events.len(), "processing event batch"); + + // Move detection (needs store lock briefly) + { + let store_guard = store.lock().await; + detect_moves(&mut events, &store_guard, &vault_path); + } + + let mut affected_file_ids: Vec = Vec::new(); + let mut had_full_rescan = false; + + // Pass 1: mutations (one event at a time) + for event in &events { + match event { + WatchEvent::Changed(path) => { + let rel = path + .strip_prefix(vault_path.as_ref()) + .unwrap_or(path) + .to_string_lossy() + .to_string(); + + let content = match std::fs::read_to_string(path) { + Ok(c) => c, + Err(e) => { + tracing::warn!(path = %path.display(), error = %e, "failed to read changed file, skipping"); + continue; + } + }; + + let content_hash = match indexer::compute_file_hash(path) { + Ok(h) => h, + Err(e) => { + tracing::warn!(path = %path.display(), error = %e, "failed to hash changed file, skipping"); + continue; + } + }; + + let store_guard = store.lock().await; + // Check if file is new (not yet in store) before indexing + let is_new_file = store_guard.get_file(&rel).ok().flatten().is_none(); + + let mut embedder_guard = embedder.lock().await; + match indexer::index_file( + &rel, + &content, + &content_hash, + &store_guard, + &mut embedder_guard, + &vault_path, + &config, + ) { + Ok(result) => { + tracing::info!( + path = %rel, + file_id = result.file_id, + chunks = result.total_chunks, + "indexed changed file" + ); + affected_file_ids.push(result.file_id); + + // Adjust folder centroid for newly added files + if is_new_file + && let Ok(vectors) = + store_guard.get_chunk_vectors_for_file(result.file_id) + && !vectors.is_empty() + { + let dim = vectors[0].len(); + let mut mean = vec![0.0f32; dim]; + for v in &vectors { + for (i, val) in v.iter().enumerate() { + mean[i] += val; + } + } + let n = vectors.len() as f32; + for val in &mut mean { + *val /= n; + } + + let folder = std::path::Path::new(&rel) + .parent() + .map(|p| p.to_string_lossy().to_string()) + .unwrap_or_default(); + if let Err(e) = + store_guard.adjust_folder_centroid(&folder, &mean, true) + { + tracing::warn!( + path = %rel, + error = %e, + "failed to adjust centroid for new file" + ); + } + } + } + Err(e) => { + tracing::warn!(path = %rel, error = %e, "failed to index changed file"); + } + } + drop(embedder_guard); + drop(store_guard); + } + + WatchEvent::Deleted(path) => { + let rel = path + .strip_prefix(vault_path.as_ref()) + .unwrap_or(path) + .to_string_lossy() + .to_string(); + + let store_guard = store.lock().await; + + // Capture mean vector BEFORE removal for centroid adjustment + let mean_vec_and_folder = + store_guard.get_file(&rel).ok().flatten().and_then(|file| { + let vectors = store_guard.get_chunk_vectors_for_file(file.id).ok()?; + if vectors.is_empty() { + return None; + } + let dim = vectors[0].len(); + let mut mean = vec![0.0f32; dim]; + for v in &vectors { + for (i, val) in v.iter().enumerate() { + mean[i] += val; + } + } + let n = vectors.len() as f32; + for val in &mut mean { + *val /= n; + } + let folder = std::path::Path::new(&rel) + .parent() + .map(|p| p.to_string_lossy().to_string()) + .unwrap_or_default(); + Some((mean, folder)) + }); + + match indexer::remove_file(&rel, &store_guard) { + Ok(()) => { + tracing::info!(path = %rel, "removed deleted file from index"); + + // Adjust folder centroid after successful removal + if let Some((mean, folder)) = mean_vec_and_folder + && let Err(e) = + store_guard.adjust_folder_centroid(&folder, &mean, false) + { + tracing::warn!( + path = %rel, + error = %e, + "failed to adjust centroid for deleted file" + ); + } + } + Err(e) => { + tracing::warn!(path = %rel, error = %e, "failed to remove deleted file"); + } + } + drop(store_guard); + } + + WatchEvent::Moved { from, to } => { + let old_rel = from + .strip_prefix(vault_path.as_ref()) + .unwrap_or(from) + .to_string_lossy() + .to_string(); + let new_rel = to + .strip_prefix(vault_path.as_ref()) + .unwrap_or(to) + .to_string_lossy() + .to_string(); + + // Phase 1: Store operations under lock + let needs_frontmatter_strip = { + let store_guard = store.lock().await; + match indexer::rename_file(&old_rel, &new_rel, &store_guard) { + Ok(()) => { + tracing::info!(from = %old_rel, to = %new_rel, "renamed file in index"); + // Track the file_id for edge rebuild + if let Ok(Some(record)) = store_guard.get_file(&new_rel) { + affected_file_ids.push(record.id); + } + + // Placement correction detection + if let Ok(content) = std::fs::read_to_string(to) { + let actual_folder = std::path::Path::new(&new_rel) + .parent() + .map(|p| p.to_string_lossy().to_string()) + .unwrap_or_default(); + + match placement::detect_correction_from_frontmatter( + &content, + &actual_folder, + ) { + Some(correction) => { + tracing::info!( + file = %new_rel, + suggested = %correction.suggested_folder, + actual = %correction.actual_folder, + "placement correction detected" + ); + + // Compute mean vector from file chunks + if let Ok(Some(file)) = store_guard.get_file(&new_rel) + && let Ok(vectors) = + store_guard.get_chunk_vectors_for_file(file.id) + && !vectors.is_empty() + { + let dim = vectors[0].len(); + let mut mean = vec![0.0f32; dim]; + for v in &vectors { + for (i, val) in v.iter().enumerate() { + mean[i] += val; + } + } + let n = vectors.len() as f32; + for val in &mut mean { + *val /= n; + } + + // Adjust centroids: boost actual, decay suggested + if let Err(e) = store_guard.adjust_folder_centroid( + &correction.actual_folder, + &mean, + true, + ) { + tracing::warn!(error = %e, "failed to adjust actual folder centroid"); + } + if let Err(e) = store_guard.adjust_folder_centroid( + &correction.suggested_folder, + &mean, + false, + ) { + tracing::warn!(error = %e, "failed to adjust suggested folder centroid"); + } + } + + // Log the correction + if let Err(e) = store_guard.insert_placement_correction( + &new_rel, + &correction.suggested_folder, + &correction.actual_folder, + ) { + tracing::warn!(error = %e, "failed to log placement correction"); + } + + // Signal that frontmatter strip is needed (done outside lock) + let stripped = + placement::strip_placement_frontmatter(&content); + if stripped != content { + Some(stripped) + } else { + None + } + } + None => { + // Check if it's a confirmation (suggested == actual) — just strip + let has_suggested = + content.contains("suggested_folder:"); + if has_suggested { + let stripped = + placement::strip_placement_frontmatter( + &content, + ); + if stripped != content { + Some(stripped) + } else { + None + } + } else { + None + } + } + } + } else { + None + } + } + Err(e) => { + tracing::warn!(from = %old_rel, to = %new_rel, error = %e, "failed to rename file"); + None + } + } + }; // store_guard dropped here + + // Phase 2: Frontmatter file I/O without store lock. + // The write triggers a Changed event that gets re-indexed anyway. + if let Some(stripped) = needs_frontmatter_strip { + let tmp = to.with_extension("md.tmp"); + if let Err(e) = + std::fs::write(&tmp, &stripped).and_then(|_| std::fs::rename(&tmp, to)) + { + tracing::warn!(error = %e, "failed to strip placement frontmatter"); + let _ = std::fs::remove_file(&tmp); + } + } + } + + WatchEvent::FullRescan => { + // FullRescan: holds both locks for the entire rescan duration. + // This blocks MCP tool calls but is acceptable since FullRescan + // is rare (macOS FSEvents buffer overflow). Future optimization: + // process files one-at-a-time with per-file lock release. + tracing::info!("performing full rescan"); + let store_guard = store.lock().await; + let mut embedder_guard = embedder.lock().await; + match indexer::run_index_shared( + &vault_path, + &config, + &store_guard, + &mut embedder_guard, + false, + ) { + Ok(result) => { + tracing::info!( + new = result.new_files, + updated = result.updated_files, + deleted = result.deleted_files, + chunks = result.total_chunks, + duration_secs = result.duration.as_secs_f64(), + "full rescan complete" + ); + had_full_rescan = true; + } + Err(e) => { + tracing::warn!(error = %e, "full rescan failed"); + } + } + drop(embedder_guard); + drop(store_guard); + } + } + } + + // Pass 2: edge rebuild for affected files (skip if full rescan already rebuilt everything) + if !had_full_rescan && !affected_file_ids.is_empty() { + tracing::info!( + count = affected_file_ids.len(), + "rebuilding edges for affected files" + ); + let store_guard = store.lock().await; + for file_id in &affected_file_ids { + // Delete old edges first + if let Err(e) = store_guard.delete_edges_for_file(*file_id) { + tracing::warn!(file_id, error = %e, "failed to delete old edges"); + continue; + } + + if let Ok(Some(file)) = store_guard.get_file_by_id(*file_id) { + let content = + std::fs::read_to_string(vault_path.join(&file.path)).unwrap_or_default(); + if let Err(e) = indexer::build_edges_for_file(&store_guard, *file_id, &content) + { + tracing::warn!( + file_id, + path = %file.path, + error = %e, + "failed to rebuild edges" + ); + } + } + } + drop(store_guard); + } + + tracing::info!("batch processing complete"); + } + + tracing::info!("Watcher consumer shutting down (channel closed)"); +} diff --git a/src/writer.rs b/src/writer.rs index 529bd4f..f431cc2 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -47,6 +47,8 @@ pub struct WriteResult { pub docid: String, pub tags: Vec, pub links_added: Vec, + #[serde(skip_serializing_if = "Vec::is_empty")] + pub links_suggested: Vec, pub folder: String, pub confidence: f64, pub strategy: String, @@ -277,15 +279,41 @@ pub fn create_note( let resolved_tags = store.resolve_tags(&input.tags)?; // Step 3: Discover links and apply them - let discovered = links::discover_links(store, &input.content, vault_path)?; - let links_added: Vec = discovered.iter().map(|l| l.target_path.clone()).collect(); + let people_folder = profile.and_then(|p| p.structure.folders.people.as_deref()); + let discovered = links::discover_links(store, &input.content, vault_path, people_folder)?; + + // Split discovered links into auto-apply and suggestion-only + let (auto_apply, suggestions): (Vec<_>, Vec<_>) = + discovered.into_iter().partition(|l| match &l.match_type { + links::LinkMatchType::ExactName | links::LinkMatchType::Alias => true, + links::LinkMatchType::FuzzyName { confidence_bp } => *confidence_bp >= 920, + links::LinkMatchType::FirstName { .. } => false, + }); + + let links_added: Vec = auto_apply.iter().map(|l| l.target_path.clone()).collect(); + let links_suggested: Vec = suggestions + .iter() + .map(|l| { + let target_name = l + .target_path + .rsplit('/') + .next() + .unwrap_or(&l.target_path) + .trim_end_matches(".md"); + if let Some(ref display) = l.display { + format!("[[{}|{}]]", target_name, display) + } else { + format!("[[{}]]", target_name) + } + }) + .collect(); - // Apply discovered links to content — wrap matched text in [[wikilinks]] + // Apply auto-apply links to content — wrap matched text in [[wikilinks]] let mut content_with_links = input.content.clone(); // Apply in reverse order of position to preserve offsets let mut replacements: Vec<(usize, usize, String)> = Vec::new(); let content_lower = content_with_links.to_lowercase(); - for link in &discovered { + for link in &auto_apply { let search_lower = link.matched_text.to_lowercase(); if let Some(pos) = content_lower.find(&search_lower) { let end = pos + link.matched_text.len(); @@ -375,7 +403,14 @@ pub fn create_note( store.begin_transaction()?; let result = (|| -> Result { let mtime = file_mtime(&temp_path).unwrap_or(0); - let file_id = store.insert_file(&rel_path, &content_hash, mtime, &resolved_tags, &docid)?; + let file_id = store.insert_file( + &rel_path, + &content_hash, + mtime, + &resolved_tags, + &docid, + Some(&input.created_by), + )?; let mut next_vid = store.next_vector_id()?; for (chunk_seq, (heading, snippet, vector, token_count)) in chunk_data.iter().enumerate() { @@ -411,52 +446,27 @@ pub fn create_note( actual_mtime, &resolved_tags, &docid, + Some(&input.created_by), )?; - // Incrementally update folder centroid with new note's vectors - if let Ok(centroids) = store.get_folder_centroids() { + // Incrementally update folder centroid with new note's mean vector + { let folder = &placement_result.folder; let new_vecs: Vec<&[f32]> = chunk_data.iter().map(|(_, _, v, _)| v.as_slice()).collect(); if !new_vecs.is_empty() { - let existing = centroids.iter().find(|(f, _)| f == folder); - let dim = 384; - let updated_centroid = if let Some((_, old_centroid)) = existing { - // Weighted merge: old centroid already represents N vectors, - // new vectors are added. Approximate by averaging old centroid with new mean. - let mut new_mean = vec![0.0f32; dim]; - for v in &new_vecs { - for (i, val) in v.iter().enumerate() { - new_mean[i] += val; - } + let dim = new_vecs[0].len(); + let mut mean_vec = vec![0.0f32; dim]; + for v in &new_vecs { + for (i, val) in v.iter().enumerate() { + mean_vec[i] += val; } - let n = new_vecs.len() as f32; - for val in &mut new_mean { - *val /= n; - } - // Weighted average: existing has more weight - let old_weight = 0.9f32; - let new_weight = 0.1f32; - old_centroid - .iter() - .zip(new_mean.iter()) - .map(|(o, n)| o * old_weight + n * new_weight) - .collect::>() - } else { - // First note in this folder — centroid IS the mean of new vectors - let mut mean = vec![0.0f32; dim]; - for v in &new_vecs { - for (i, val) in v.iter().enumerate() { - mean[i] += val; - } - } - let n = new_vecs.len() as f32; - for val in &mut mean { - *val /= n; - } - mean - }; - let _ = store.upsert_folder_centroid(folder, &updated_centroid, new_vecs.len()); + } + let n = new_vecs.len() as f32; + for val in &mut mean_vec { + *val /= n; + } + let _ = store.adjust_folder_centroid(folder, &mean_vec, true); } } } @@ -473,6 +483,7 @@ pub fn create_note( docid, tags: resolved_tags, links_added, + links_suggested, folder: placement_result.folder, confidence: placement_result.confidence, strategy: strategy_name, @@ -544,6 +555,7 @@ pub fn append_to_note( mtime, &file_record.tags, &docid, + file_record.created_by.as_deref(), )?; let mut next_vid = store.next_vector_id()?; @@ -572,6 +584,7 @@ pub fn append_to_note( actual_mtime, &file_record.tags, &docid, + file_record.created_by.as_deref(), )?; } Err(e) => { @@ -592,6 +605,7 @@ pub fn append_to_note( docid, tags: file_record.tags, links_added: vec![], + links_suggested: vec![], folder, confidence: 1.0, strategy: "Append".to_string(), @@ -648,7 +662,14 @@ pub fn update_metadata( // Step 5: Update store record (metadata-only, no re-chunking) let mtime = file_mtime(&full_path)?; - store.insert_file(&file_record.path, &content_hash, mtime, &tags, &docid)?; + store.insert_file( + &file_record.path, + &content_hash, + mtime, + &tags, + &docid, + file_record.created_by.as_deref(), + )?; // Register tags for tag in &tags { @@ -666,6 +687,7 @@ pub fn update_metadata( docid, tags, links_added: vec![], + links_suggested: vec![], folder, confidence: 1.0, strategy: "UpdateMetadata".to_string(), @@ -728,6 +750,7 @@ pub fn move_note( mtime, &file_record.tags, &new_docid, + file_record.created_by.as_deref(), )?; Ok(()) @@ -750,6 +773,7 @@ pub fn move_note( docid: new_docid, tags: file_record.tags, links_added: vec![], + links_suggested: vec![], folder: new_folder.to_string(), confidence: 1.0, strategy: "Move".to_string(), @@ -835,6 +859,7 @@ pub fn archive_note( docid, tags, links_added: vec![], + links_suggested: vec![], folder: archive_folder.to_string(), confidence: 1.0, strategy: "Archive".to_string(), @@ -920,7 +945,14 @@ pub fn unarchive_note( store.begin_transaction()?; let result = (|| -> Result<()> { - let file_id = store.insert_file(&original_path, &content_hash, mtime, &tags, &docid)?; + let file_id = store.insert_file( + &original_path, + &content_hash, + mtime, + &tags, + &docid, + Some("unarchive"), + )?; let mut next_vid = store.next_vector_id()?; for (seq, (heading, snippet, vector, token_count)) in chunk_data.iter().enumerate() { @@ -962,6 +994,7 @@ pub fn unarchive_note( docid, tags, links_added: vec![], + links_suggested: vec![], folder, confidence: 1.0, strategy: "Unarchive".to_string(), @@ -1122,6 +1155,7 @@ mod tests { 100, &[], &crate::docid::generate_docid("notes/existing.md"), + None, ) .unwrap(); store @@ -1131,6 +1165,7 @@ mod tests { 100, &[], &crate::docid::generate_docid("notes/gone.md"), + None, ) .unwrap();