diff --git a/Cargo.toml b/Cargo.toml
index 12167d65..f7572654 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "vectorless"
-version = "0.1.12"
+version = "0.1.13"
 edition = "2024"
 authors = ["zTgx "]
 description = "Hierarchical, reasoning-native document intelligence engine"
@@ -59,6 +59,16 @@ indextree = { version = "4.8.0", features = ["deser"] }
 # LRU cache
 lru = "0.12"
 
+# Checksum
+sha2 = "0.10"
+
+# Compression
+flate2 = "1.0"
+
+# File locking (Unix)
+[target.'cfg(unix)'.dependencies]
+libc = "0.2"
+
 # PDF processing
 pdf-extract = "0.10.0"
 lopdf = "0.34"
diff --git a/README.md b/README.md
index 52ae5a05..a89e2f28 100644
--- a/README.md
+++ b/README.md
@@ -134,22 +134,7 @@ async fn main() -> vectorless::Result<()> {
 
 ## Examples
 
-See the [examples/](examples/) directory for complete working examples:
-
-| Example | Description |
-|---------|-------------|
-| [basic.rs](examples/basic.rs) | Minimal ~30 line example showing core API |
-| [index.rs](examples/index.rs) | Document indexing pipeline |
-| [retrieve.rs](examples/retrieve.rs) | Retrieval pipeline with options |
-| [events.rs](examples/events.rs) | Event-driven indexing with EventEmitter |
-| [session.rs](examples/session.rs) | Session management with statistics |
-| [batch_processing.rs](examples/batch_processing.rs) | Batch document processing |
-| [content_aggregation.rs](examples/content_aggregation.rs) | Content aggregation strategies |
-| [streaming.rs](examples/streaming.rs) | Streaming document processing |
-| [multi_format.rs](examples/multi_format.rs) | Multi-format document support |
-| [custom_pilot.rs](examples/custom_pilot.rs) | Custom pilot implementation |
-| [cli_tool.rs](examples/cli_tool.rs) | CLI application example |
-| [markdownflow.rs](examples/markdownflow.rs) | Markdown workflow example |
+See the [examples/](examples/) directory for complete working examples.
 
 ## Architecture
 
diff --git a/examples/content_aggregation.rs b/examples/content_aggregation.rs
index 5fe71a32..9ead2aeb 100644
--- a/examples/content_aggregation.rs
+++ b/examples/content_aggregation.rs
@@ -19,12 +19,12 @@ use vectorless::retrieval::content::{
     StructureBuilder, OutputFormat, RelevanceScorer, ScoringStrategyConfig,
     ContentChunk, ScoringContext,
 };
-use vectorless::domain::NodeId;
+use vectorless::document::NodeId;
 use indextree::Arena;
 
 fn make_node_id() -> NodeId {
     let mut arena = Arena::new();
-    let node = vectorless::domain::TreeNode {
+    let node = vectorless::document::TreeNode {
         title: "Test".to_string(),
         structure: String::new(),
         content: String::new(),
@@ -135,7 +135,7 @@ fn main() {
     for (name, format) in formats {
         let builder = StructureBuilder::new(format);
 
-        let tree = vectorless::domain::DocumentTree::new("Test", "");
+        let tree = vectorless::document::DocumentTree::new("Test", "");
         let structured = builder.build(result.selected.clone(), &tree);
 
         println!("\n{} Output ({} chars, {} tokens):", name, structured.content.len(), structured.metadata.total_tokens);
diff --git a/examples/index.rs b/examples/index.rs
index cbb318b1..bd2b6aac 100644
--- a/examples/index.rs
+++ b/examples/index.rs
@@ -76,8 +76,8 @@ async fn main() -> vectorless::Result<()> {
 
 /// Print tree structure up to a maximum depth.
fn print_tree_structure( - tree: &vectorless::domain::DocumentTree, - node_id: vectorless::domain::NodeId, + tree: &vectorless::document::DocumentTree, + node_id: vectorless::document::NodeId, current_depth: usize, max_depth: usize, ) { diff --git a/examples/retrieve.rs b/examples/retrieve.rs index f3ed1751..a8a86beb 100644 --- a/examples/retrieve.rs +++ b/examples/retrieve.rs @@ -16,7 +16,7 @@ //! ``` use std::sync::Arc; -use vectorless::domain::DocumentTree; +use vectorless::document::DocumentTree; use vectorless::retrieval::{ PipelineRetriever, RetrieveOptions, Retriever, StrategyPreference, pipeline::RetrievalOrchestrator, diff --git a/examples/storage_async.rs b/examples/storage_async.rs new file mode 100644 index 00000000..f7ecfaec --- /dev/null +++ b/examples/storage_async.rs @@ -0,0 +1,95 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Async workspace usage example. +//! +//! This example demonstrates async workspace operations: +//! - Creating an async workspace +//! - Concurrent document access +//! - Async LRU cache +//! +//! # Usage +//! +//! ```bash +//! cargo run --example storage_async +//! ``` + +use std::sync::Arc; + +use vectorless::document::DocumentTree; +use vectorless::storage::{AsyncWorkspace, DocumentMeta, PersistedDocument}; + +fn create_doc(id: &str, name: &str) -> PersistedDocument { + let meta = DocumentMeta::new(id, name, "md"); + let content = format!("Content for {}", name); + let tree = DocumentTree::new("Root", &content); + PersistedDocument::new(meta, tree) +} + +#[tokio::main] +async fn main() -> vectorless::Result<()> { + println!("=== Async Workspace Example ===\n"); + + let workspace_path = "./example_async_workspace"; + + // 1. Create async workspace + println!("1. Creating async workspace..."); + let workspace = AsyncWorkspace::new(workspace_path).await?; + println!(" ✓ Created\n"); + + // 2. Add documents + println!("2. Adding documents..."); + workspace.add(&create_doc("doc-1", "Document One")).await?; + workspace.add(&create_doc("doc-2", "Document Two")).await?; + workspace.add(&create_doc("doc-3", "Document Three")).await?; + println!(" ✓ Added 3 documents\n"); + + // 3. Concurrent access example + println!("3. Concurrent access from multiple tasks..."); + let ws = Arc::new(workspace); + + let mut handles = vec![]; + + // Spawn concurrent read tasks + for i in 1..=3 { + let ws_clone = ws.clone(); + let handle = tokio::spawn(async move { + let id = format!("doc-{}", i); + let doc = ws_clone.load(&id).await.unwrap().unwrap(); + println!(" [Task {}] Loaded: {}", i, doc.meta.name); + }); + handles.push(handle); + } + + // Wait for all tasks + for handle in handles { + handle.await.unwrap(); + } + println!(" ✓ All concurrent loads completed\n"); + + // 4. Cache stats + println!("4. Cache statistics:"); + let stats = ws.cache_stats().await; + println!(" - Hits: {}", stats.hits); + println!(" - Misses: {}", stats.misses); + println!(); + + // 5. Clone and share + println!("5. 
Workspace can be cloned cheaply (Arc internally)...");
+    let ws2 = ws.clone();
+    let ws3 = ws.clone();
+
+    let len1 = ws.len().await;
+    let len2 = ws2.len().await;
+    let len3 = ws3.len().await;
+
+    println!("   ws1.len() = {}, ws2.len() = {}, ws3.len() = {}", len1, len2, len3);
+    println!("   ✓ All clones share the same state\n");
+
+    // Cleanup
+    println!("Cleaning up...");
+    std::fs::remove_dir_all(workspace_path).ok();
+    println!("   ✓ Done!");
+
+    Ok(())
+}
diff --git a/examples/storage_backend.rs b/examples/storage_backend.rs
new file mode 100644
index 00000000..3b9a5fd9
--- /dev/null
+++ b/examples/storage_backend.rs
@@ -0,0 +1,130 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Custom storage backend example.
+//!
+//! This example shows how to implement a custom StorageBackend.
+//! Useful for integrating with databases, cloud storage, etc.
+//!
+//! # Usage
+//!
+//! ```bash
+//! cargo run --example storage_backend
+//! ```
+
+use std::collections::HashMap;
+use std::sync::{Arc, RwLock};
+
+use vectorless::document::DocumentTree;
+use vectorless::storage::{DocumentMeta, PersistedDocument, StorageBackend, Workspace};
+use vectorless::Result;
+
+/// A simple in-memory backend with logging.
+///
+/// This demonstrates how to implement the StorageBackend trait.
+/// In production, you might implement S3, PostgreSQL, Redis, etc.
+#[derive(Debug)]
+struct LoggingMemoryBackend {
+    name: &'static str,
+    data: RwLock<HashMap<String, Vec<u8>>>,
+}
+
+impl LoggingMemoryBackend {
+    fn new(name: &'static str) -> Self {
+        Self {
+            name,
+            data: RwLock::new(HashMap::new()),
+        }
+    }
+}
+
+impl StorageBackend for LoggingMemoryBackend {
+    fn get(&self, key: &str) -> Result<Option<Vec<u8>>> {
+        let data = self.data.read().unwrap();
+        let result = data.get(key).cloned();
+        println!("   [{}] GET '{}' -> {}", self.name, key, if result.is_some() { "found" } else { "not found" });
+        Ok(result)
+    }
+
+    fn put(&self, key: &str, value: &[u8]) -> Result<()> {
+        let mut data = self.data.write().unwrap();
+        data.insert(key.to_string(), value.to_vec());
+        println!("   [{}] PUT '{}' ({} bytes)", self.name, key, value.len());
+        Ok(())
+    }
+
+    fn delete(&self, key: &str) -> Result<bool> {
+        let mut data = self.data.write().unwrap();
+        let existed = data.remove(key).is_some();
+        println!("   [{}] DELETE '{}' -> {}", self.name, key, existed);
+        Ok(existed)
+    }
+
+    fn exists(&self, key: &str) -> Result<bool> {
+        let data = self.data.read().unwrap();
+        Ok(data.contains_key(key))
+    }
+
+    fn keys(&self) -> Result<Vec<String>> {
+        let data = self.data.read().unwrap();
+        Ok(data.keys().cloned().collect())
+    }
+
+    fn len(&self) -> Result<usize> {
+        let data = self.data.read().unwrap();
+        Ok(data.len())
+    }
+
+    fn clear(&self) -> Result<()> {
+        let mut data = self.data.write().unwrap();
+        data.clear();
+        println!("   [{}] CLEAR", self.name);
+        Ok(())
+    }
+
+    fn backend_name(&self) -> &'static str {
+        self.name
+    }
+}
+
+fn main() -> vectorless::Result<()> {
+    println!("=== Custom Storage Backend Example ===\n");
+
+    // 1. Create custom backend
+    println!("1. Creating custom backend...");
+    let backend = Arc::new(LoggingMemoryBackend::new("MyCustomBackend"));
+    println!("   ✓ Backend: {}\n", backend.backend_name());
+
+    // 2. Create workspace with custom backend
+    println!("2. Creating workspace with custom backend...");
+    let mut workspace = Workspace::with_backend(backend)?;
+    println!("   ✓ Workspace created\n");
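+
+    // Illustrative extra (a sketch added for clarity): exercise the trait
+    // directly on a scratch instance. The instance owned by the workspace
+    // above is untouched; these calls print the same GET/PUT/DELETE log lines
+    // defined in the impl block above.
+    let scratch = LoggingMemoryBackend::new("Scratch");
+    scratch.put("probe", b"hello")?;
+    assert_eq!(scratch.get("probe")?.as_deref(), Some(&b"hello"[..]));
+    assert!(scratch.delete("probe")?);
+
+    // 3. Add a document (watch the logging)
+    println!("3. 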
Adding document (observe backend calls):"); + let meta = DocumentMeta::new("custom-doc", "Custom Backend Test", "md"); + let tree = DocumentTree::new("Root", "Testing custom backend!"); + let doc = PersistedDocument::new(meta, tree); + workspace.add(&doc)?; + println!(); + + // 4. Load the document + println!("4. Loading document:"); + let loaded = workspace.load("custom-doc")?.unwrap(); + println!(" ✓ Loaded: {}\n", loaded.meta.name); + + // 5. Show workspace stats + println!("5. Workspace stats:"); + println!(" - Documents: {}", workspace.len()); + println!(" - Cache size: {}", workspace.cache_len()); + println!(); + + println!("✓ Custom backend example complete!"); + println!("\nTip: Implement StorageBackend to integrate with:"); + println!(" - S3 / GCS / Azure Blob"); + println!(" - PostgreSQL / MySQL"); + println!(" - Redis / Memcached"); + println!(" - Any custom storage system"); + + Ok(()) +} diff --git a/examples/storage_compression.rs b/examples/storage_compression.rs new file mode 100644 index 00000000..303f582a --- /dev/null +++ b/examples/storage_compression.rs @@ -0,0 +1,95 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Compression example. +//! +//! This example demonstrates compression support in storage: +//! - GzipCodec for compressed storage +//! - IdentityCodec for uncompressed storage +//! - Codec trait for custom compression +//! +//! # Usage +//! +//! ```bash +//! cargo run --example storage_compression +//! ``` + +use vectorless::storage::{GzipCodec, IdentityCodec, Codec}; +use vectorless::Result; + +fn main() -> Result<()> { + println!("=== Compression Example ===\n"); + + // Test data + let original = b"Lorem ipsum dolor sit amet, consectetur adipiscing elit. \ + Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. \ + Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris."; + println!("Original data ({} bytes):", original.len()); + println!(" {:?}...\n", String::from_utf8_lossy(&original[..50])); + + // 1. Identity codec (no compression) + println!("1. IdentityCodec (no compression):"); + let identity = IdentityCodec::new(); + + let identity_encoded = identity.encode(original)?; + let identity_decoded = identity.decode(&identity_encoded)?; + + println!(" Encoded size: {} bytes", identity_encoded.len()); + println!(" Compression ratio: {:.1}%", + (identity_encoded.len() as f64 / original.len() as f64) * 100.0); + assert_eq!(original.to_vec(), identity_decoded); + println!(" ✓ Roundtrip verified\n"); + + // 2. Gzip codec with different levels + println!("2. GzipCodec with different compression levels:"); + + for level in [1, 6, 9] { + let gzip = GzipCodec::new(level); + let compressed = gzip.encode(original)?; + + println!(" Level {}: {} bytes ({:.1}% of original)", + level, + compressed.len(), + (compressed.len() as f64 / original.len() as f64) * 100.0); + } + println!(); + + // 3. Gzip roundtrip + println!("3. Gzip roundtrip verification:"); + let gzip = GzipCodec::new(6); + + let encoded = gzip.encode(original)?; + let decoded = gzip.decode(&encoded)?; + + assert_eq!(original.to_vec(), decoded); + println!(" ✓ Encoded {} bytes -> {} bytes", + original.len(), encoded.len()); + println!(" ✓ Decoded back to {} bytes", decoded.len()); + println!(" ✓ Data integrity verified\n"); + + // 4. Empty data handling + println!("4. 
Edge cases:");
+    let empty: &[u8] = &[];
+
+    let empty_encoded = gzip.encode(empty)?;
+    let empty_decoded = gzip.decode(&empty_encoded)?;
+    assert!(empty_decoded.is_empty());
+    println!("   ✓ Empty data handled correctly\n");
+
+    // 5. Comparison
+    println!("5. Summary:");
+    println!("   Original:    {} bytes", original.len());
+    println!("   Identity:    {} bytes (100.0%)", identity_encoded.len());
+    println!("   Gzip (lvl6): {} bytes ({:.1}%)",
+        encoded.len(),
+        (encoded.len() as f64 / original.len() as f64) * 100.0);
+    println!();
+
+    println!("✓ Compression example complete!");
+    println!("\nUsage tips:");
+    println!("  - Use GzipCodec for large text documents");
+    println!("  - Use IdentityCodec for already-compressed data (PDF, images)");
+    println!("  - Level 6 is a good default (balance of speed vs ratio)");
+
+    Ok(())
+}
diff --git a/examples/storage_migration.rs b/examples/storage_migration.rs
new file mode 100644
index 00000000..5874046c
--- /dev/null
+++ b/examples/storage_migration.rs
@@ -0,0 +1,138 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Version migration example.
+//!
+//! This example demonstrates how to use the migration system
+//! for upgrading data formats between versions.
+//!
+//! # Usage
+//!
+//! ```bash
+//! cargo run --example storage_migration
+//! ```
+
+use vectorless::storage::{Migration, MigrationContext, Migrator};
+use vectorless::{Error, Result};
+
+/// Example migration from v1 to v2.
+///
+/// Imagine v1 stored data as plain text,
+/// and v2 adds a header prefix.
+#[derive(Debug)]
+struct V1ToV2;
+
+impl Migration for V1ToV2 {
+    fn from_version(&self) -> u32 {
+        1
+    }
+
+    fn to_version(&self) -> u32 {
+        2
+    }
+
+    fn description(&self) -> &str {
+        "Add version header to data format"
+    }
+
+    fn migrate(&self, data: &[u8], _ctx: &MigrationContext) -> Result<Vec<u8>> {
+        // Add a simple header: "V2:" prefix
+        let mut result = b"V2:".to_vec();
+        result.extend_from_slice(data);
+        Ok(result)
+    }
+}
+
+/// Example migration from v2 to v3.
+///
+/// V3 adds compression (simulated with base64-like encoding).
+#[derive(Debug)]
+struct V2ToV3;
+
+impl Migration for V2ToV3 {
+    fn from_version(&self) -> u32 {
+        2
+    }
+
+    fn to_version(&self) -> u32 {
+        3
+    }
+
+    fn description(&self) -> &str {
+        "Add compression to data format"
+    }
+
+    fn migrate(&self, data: &[u8], _ctx: &MigrationContext) -> Result<Vec<u8>> {
+        // Simulate compression by adding a prefix
+        let mut result = b"V3:COMPRESSED:".to_vec();
+        result.extend_from_slice(data);
+        Ok(result)
+    }
+}
+
+fn main() -> vectorless::Result<()> {
+    println!("=== Version Migration Example ===\n");
+
+    // 1. Create migrator
+    println!("1. Creating migrator and registering migrations...");
+    let mut migrator = Migrator::new();
+    migrator.register(Box::new(V1ToV2));
+    migrator.register(Box::new(V2ToV3));
+
+    println!("   Registered migrations:");
+    for (from, to, desc) in migrator.list_migrations() {
+        println!("   - v{} -> v{}: {}", from, to, desc);
+    }
+    println!();
+
+    // 2. Check migration paths
+    println!("2. Checking migration paths:");
+    println!("   Can migrate v1 -> v2: {}", migrator.can_migrate(1, 2));
+    println!("   Can migrate v1 -> v3: {}", migrator.can_migrate(1, 3));
+    println!("   Can migrate v2 -> v3: {}", migrator.can_migrate(2, 3));
+    println!("   Can migrate v1 -> v4: {}", migrator.can_migrate(1, 4));
+    println!();
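+
+    // Sketch of the path search mentioned in this example's closing notes:
+    // the migrator reportedly finds multi-step routes via BFS. This
+    // standalone snippet is an illustration (not the library's internals)
+    // on the same v1 -> v2 -> v3 graph.
+    {
+        use std::collections::VecDeque;
+        let edges = [(1u32, 2u32), (2, 3)];
+        let (start, goal) = (1u32, 3u32);
+        let mut queue = VecDeque::from([vec![start]]);
+        let mut found = None;
+        while let Some(path) = queue.pop_front() {
+            let last = *path.last().unwrap();
+            if last == goal {
+                found = Some(path);
+                break;
+            }
+            for &(from, to) in &edges {
+                if from == last {
+                    let mut next = path.clone();
+                    next.push(to);
+                    queue.push_back(next);
+                }
+            }
+        }
+        println!("   BFS path sketch: {:?}", found); // Some([1, 2, 3])
+    }
+
+    // 3. Migrate from v1 to v3 (multi-step)
+    println!("3. 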
Migrating data from v1 to v3 (via v2):"); + let original_data = b"Hello, World!"; + println!(" Original (v1): {:?}", String::from_utf8_lossy(original_data)); + + let migrated = migrator.migrate(original_data, 1, 3)?; + println!(" Migrated (v3): {:?}", String::from_utf8_lossy(&migrated)); + println!(); + + // 4. Direct migration + println!("4. Direct migration v2 -> v3:"); + let v2_data = b"V2:Some data"; + let v3_data = migrator.migrate(v2_data, 2, 3)?; + println!(" V2: {:?}", String::from_utf8_lossy(v2_data)); + println!(" V3: {:?}", String::from_utf8_lossy(&v3_data)); + println!(); + + // 5. No migration needed + println!("5. Same version (no migration):"); + let data = b"Already v3"; + let result = migrator.migrate(data, 3, 3)?; + assert_eq!(data.to_vec(), result); + println!(" ✓ Data unchanged when from == to"); + println!(); + + // 6. Error case: no path + println!("6. Error handling (no migration path):"); + match migrator.migrate(b"test", 1, 99) { + Err(Error::VersionMismatch(msg)) => { + println!(" Expected error: {}", msg); + } + _ => unreachable!(), + } + println!(); + + println!("✓ Migration example complete!"); + println!("\nKey points:"); + println!(" - Migrations are registered as v(N) -> v(N+1)"); + println!(" - Migrator finds paths automatically (BFS)"); + println!(" - Multi-step migrations are handled transparently"); + + Ok(()) +} diff --git a/examples/storage_workspace.rs b/examples/storage_workspace.rs new file mode 100644 index 00000000..9f93310c --- /dev/null +++ b/examples/storage_workspace.rs @@ -0,0 +1,99 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Basic workspace usage example. +//! +//! This example demonstrates the core storage API: +//! - Creating a workspace +//! - Adding documents +//! - Loading documents with LRU cache +//! - Listing and removing documents +//! +//! # Usage +//! +//! ```bash +//! cargo run --example storage_workspace +//! ``` + +use vectorless::document::DocumentTree; +use vectorless::storage::{DocumentMeta, PersistedDocument, Workspace}; + +fn main() -> vectorless::Result<()> { + println!("=== Storage Workspace Example ===\n"); + + // Create a temporary workspace + let workspace_path = "./example_workspace"; + + // 1. Create a workspace with custom cache size + println!("1. Creating workspace at '{}'...", workspace_path); + let mut workspace = Workspace::with_cache_size(workspace_path, 100)?; + println!(" ✓ Workspace created\n"); + + // 2. Create a document + println!("2. Creating a document..."); + let meta = DocumentMeta::new("doc-001", "Getting Started Guide", "md") + .with_description("An introduction to the workspace API") + .with_source_path("./docs/getting-started.md"); + + let tree = DocumentTree::new("Introduction", "Welcome to Vectorless storage module!"); + + let doc = PersistedDocument::new(meta, tree); + println!(" ✓ Document created: {}\n", doc.meta.id); + + // 3. Add document to workspace + println!("3. Adding document to workspace..."); + workspace.add(&doc)?; + println!(" ✓ Document saved\n"); + + // 4. List all documents + println!("4. Listing documents:"); + for id in workspace.list_documents() { + if let Some(meta) = workspace.get_meta(id) { + println!(" - {} ({})", meta.doc_name, meta.id); + if let Some(ref desc) = meta.doc_description { + println!(" Description: {}", desc); + } + } + } + println!(); + + // 5. Load document (uses LRU cache) + println!("5. 
Loading document..."); + let loaded = workspace.load("doc-001")?.expect("Document should exist"); + println!(" ✓ Loaded: {}", loaded.meta.name); + let root = loaded.tree.root(); + if let Some(node) = loaded.tree.get(root) { + println!(" Root node title: {}", node.title); + } + println!(); + + // 6. Cache statistics + println!("6. Cache statistics:"); + let stats = workspace.cache_stats(); + println!(" - Hits: {}", stats.hits); + println!(" - Misses: {}", stats.misses); + println!(" - Evictions: {}", stats.evictions); + println!(" - Utilization: {:.1}%", workspace.cache_utilization() * 100.0); + println!(); + + // 7. Load again (should hit cache) + println!("7. Loading document again (should hit cache)..."); + let _ = workspace.load("doc-001")?; + let stats = workspace.cache_stats(); + println!(" ✓ Cache hits: {}", stats.hits); + println!(); + + // 8. Remove document + println!("8. Removing document..."); + let removed = workspace.remove("doc-001")?; + println!(" ✓ Removed: {}", removed); + println!(" Workspace is empty: {}", workspace.is_empty()); + println!(); + + // Cleanup + println!("Cleaning up..."); + std::fs::remove_dir_all(workspace_path).ok(); + println!(" ✓ Done!"); + + Ok(()) +} diff --git a/src/client/engine.rs b/src/client/engine.rs index 8156586e..0c0785c4 100644 --- a/src/client/engine.rs +++ b/src/client/engine.rs @@ -45,7 +45,8 @@ use std::sync::{Arc, Mutex, RwLock}; use tracing::info; use crate::config::Config; -use crate::domain::{DocumentTree, Error, Result}; +use crate::error::Result; +use crate::{DocumentTree, Error}; use crate::index::PipelineExecutor; use crate::retrieval::{PipelineRetriever, RetrieveOptions}; use crate::storage::Workspace; diff --git a/src/client/indexer.rs b/src/client/indexer.rs index 7f41cde8..8ecb25d4 100644 --- a/src/client/indexer.rs +++ b/src/client/indexer.rs @@ -25,7 +25,7 @@ use std::sync::{Arc, Mutex}; use tracing::info; use uuid::Uuid; -use crate::domain::{Error, Result}; +use crate::error::{Error, Result}; use crate::index::{IndexInput, IndexMode, PipelineExecutor, PipelineOptions, SummaryStrategy}; use crate::parser::DocumentFormat; use crate::storage::{DocumentMeta, PersistedDocument}; diff --git a/src/client/retriever.rs b/src/client/retriever.rs index 7f0099ca..ee7a0cbd 100644 --- a/src/client/retriever.rs +++ b/src/client/retriever.rs @@ -22,7 +22,8 @@ use std::sync::Arc; use tracing::info; use crate::config::Config; -use crate::domain::{DocumentTree, Error, NodeId, Result}; +use crate::document::{DocumentTree, NodeId}; +use crate::error::{Error, Result}; use crate::retrieval::content::ContentAggregatorConfig; use crate::retrieval::{ QueryComplexity, RetrieveOptions, RetrieveResponse, RetrievalResult, Retriever, SufficiencyLevel, diff --git a/src/client/session.rs b/src/client/session.rs index 1b5d55ef..f659ac75 100644 --- a/src/client/session.rs +++ b/src/client/session.rs @@ -31,7 +31,8 @@ use std::time::{Duration, Instant}; use tracing::info; use uuid::Uuid; -use crate::domain::{DocumentTree, Error, Result}; +use crate::{DocumentTree, Error}; +use crate::error::Result; use crate::retrieval::RetrieveOptions; use crate::storage::PersistedDocument; diff --git a/src/client/types.rs b/src/client/types.rs index 40816257..861d52e6 100644 --- a/src/client/types.rs +++ b/src/client/types.rs @@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize}; use std::path::PathBuf; -use crate::domain::DocumentTree; +use crate::document::DocumentTree; use crate::parser::DocumentFormat; // ============================================================ diff 
--git a/src/client/workspace.rs b/src/client/workspace.rs
index 731a5e71..c5525bfa 100644
--- a/src/client/workspace.rs
+++ b/src/client/workspace.rs
@@ -27,7 +27,8 @@ use std::sync::{Arc, RwLock};
 
 use tracing::{debug, info, warn};
 
-use crate::domain::{Error, Result};
+use crate::Error;
+use crate::error::Result;
 use crate::storage::{DocumentMetaEntry, PersistedDocument, Workspace};
 
 use super::events::{EventEmitter, WorkspaceEvent};
@@ -353,17 +354,29 @@ pub struct WorkspaceStats {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use tempfile::TempDir;
+    use crate::storage::WorkspaceOptions;
 
     #[test]
     fn test_workspace_client_creation() {
-        let workspace = Workspace::open("./test_workspace").unwrap();
+        let temp = TempDir::new().unwrap();
+        let options = WorkspaceOptions {
+            file_lock: false,
+            ..Default::default()
+        };
+        let workspace = Workspace::open_with_options(temp.path(), options).unwrap();
         let client = WorkspaceClient::new(workspace);
         assert!(client.is_empty());
     }
 
     #[test]
     fn test_workspace_stats() {
-        let workspace = Workspace::open("./test_workspace").unwrap();
+        let temp = TempDir::new().unwrap();
+        let options = WorkspaceOptions {
+            file_lock: false,
+            ..Default::default()
+        };
+        let workspace = Workspace::open_with_options(temp.path(), options).unwrap();
         let client = WorkspaceClient::new(workspace);
 
         let stats = client.stats().unwrap();
diff --git a/src/config/mod.rs b/src/config/mod.rs
index 98ad2e8a..d821332a 100644
--- a/src/config/mod.rs
+++ b/src/config/mod.rs
@@ -87,7 +87,8 @@ pub use types::{
     // Retrieval configs
     RetrievalConfig, SearchConfig,
     // Storage and sufficiency
-    StorageConfig, CacheConfig, StrategyConfig, SufficiencyConfig,
+    StorageConfig, CompressionAlgorithm, CompressionConfig,
+    CacheConfig, StrategyConfig, SufficiencyConfig,
     // Content aggregator
     ContentAggregatorConfig,
     // Concurrency
diff --git a/src/config/types/mod.rs b/src/config/types/mod.rs
index a824ee3f..39536156 100644
--- a/src/config/types/mod.rs
+++ b/src/config/types/mod.rs
@@ -23,7 +23,8 @@ pub use indexer::IndexerConfig;
 pub use llm::{LlmConfig, SummaryConfig};
 pub use retrieval::{RetrievalConfig, SearchConfig};
 pub use storage::{
-    CacheConfig, StorageConfig, StrategyConfig, SufficiencyConfig,
+    CacheConfig, CompressionAlgorithm, CompressionConfig,
+    StorageConfig, StrategyConfig, SufficiencyConfig,
 };
 
 /// Main configuration for vectorless.
diff --git a/src/config/types/storage.rs b/src/config/types/storage.rs
index 0dc55ed9..562c7ba3 100644
--- a/src/config/types/storage.rs
+++ b/src/config/types/storage.rs
@@ -12,16 +12,58 @@ pub struct StorageConfig {
     /// Workspace directory for persisted documents.
     #[serde(default = "default_workspace_dir")]
     pub workspace_dir: PathBuf,
+
+    /// LRU cache size (number of documents).
+    #[serde(default = "default_cache_size")]
+    pub cache_size: usize,
+
+    /// Enable atomic writes (write to temp file, then rename).
+    /// This prevents data corruption on crash.
+    #[serde(default = "default_atomic_writes")]
+    pub atomic_writes: bool,
+
+    /// Enable file locking for multi-process safety.
+    #[serde(default = "default_file_lock")]
+    pub file_lock: bool,
+
+    /// Enable checksum verification for data integrity.
+    #[serde(default = "default_checksum_enabled")]
+    pub checksum_enabled: bool,
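+
+    // Illustrative TOML shape for this struct (a sketch: field names mirror
+    // the Rust fields above under serde's default naming, and the values are
+    // the defaults defined below; the `[storage]` table name is an
+    // assumption, not confirmed by this patch):
+    //
+    //   [storage]
+    //   workspace_dir = "./workspace"
+    //   cache_size = 100
+    //   atomic_writes = true
+    //   file_lock = true
+    //   checksum_enabled = true
+
+    /// Enable compression for stored documents.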
+ #[serde(default)] + pub compression: CompressionConfig, } fn default_workspace_dir() -> PathBuf { PathBuf::from("./workspace") } +fn default_cache_size() -> usize { + 100 +} + +fn default_atomic_writes() -> bool { + true +} + +fn default_file_lock() -> bool { + true +} + +fn default_checksum_enabled() -> bool { + true +} + impl Default for StorageConfig { fn default() -> Self { Self { workspace_dir: default_workspace_dir(), + cache_size: default_cache_size(), + atomic_writes: default_atomic_writes(), + file_lock: default_file_lock(), + checksum_enabled: default_checksum_enabled(), + compression: CompressionConfig::default(), } } } @@ -37,6 +79,109 @@ impl StorageConfig { self.workspace_dir = dir.into(); self } + + /// Set the cache size. + pub fn with_cache_size(mut self, size: usize) -> Self { + self.cache_size = size; + self + } + + /// Enable or disable atomic writes. + pub fn with_atomic_writes(mut self, enabled: bool) -> Self { + self.atomic_writes = enabled; + self + } + + /// Enable or disable file locking. + pub fn with_file_lock(mut self, enabled: bool) -> Self { + self.file_lock = enabled; + self + } + + /// Enable or disable checksum verification. + pub fn with_checksum(mut self, enabled: bool) -> Self { + self.checksum_enabled = enabled; + self + } + + /// Set compression configuration. + pub fn with_compression(mut self, compression: CompressionConfig) -> Self { + self.compression = compression; + self + } +} + +/// Compression configuration. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CompressionConfig { + /// Enable compression. + #[serde(default = "default_compression_enabled")] + pub enabled: bool, + + /// Compression algorithm. + #[serde(default = "default_compression_algorithm")] + pub algorithm: CompressionAlgorithm, + + /// Compression level (1-9, higher = better compression but slower). + #[serde(default = "default_compression_level")] + pub level: u32, +} + +fn default_compression_enabled() -> bool { + false +} + +fn default_compression_algorithm() -> CompressionAlgorithm { + CompressionAlgorithm::Gzip +} + +fn default_compression_level() -> u32 { + 6 +} + +impl Default for CompressionConfig { + fn default() -> Self { + Self { + enabled: default_compression_enabled(), + algorithm: default_compression_algorithm(), + level: default_compression_level(), + } + } +} + +impl CompressionConfig { + /// Create new compression config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Enable or disable compression. + pub fn with_enabled(mut self, enabled: bool) -> Self { + self.enabled = enabled; + self + } + + /// Set the compression algorithm. + pub fn with_algorithm(mut self, algorithm: CompressionAlgorithm) -> Self { + self.algorithm = algorithm; + self + } + + /// Set the compression level. + pub fn with_level(mut self, level: u32) -> Self { + self.level = level.clamp(1, 9); + self + } +} + +/// Compression algorithm. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum CompressionAlgorithm { + /// Gzip compression. + Gzip, + /// Zstandard compression. + Zstd, } /// Sufficiency checker configuration. 
@@ -248,6 +393,44 @@ mod tests { fn test_storage_config_defaults() { let config = StorageConfig::default(); assert_eq!(config.workspace_dir, PathBuf::from("./workspace")); + assert_eq!(config.cache_size, 100); + assert!(config.atomic_writes); + assert!(config.file_lock); + assert!(config.checksum_enabled); + assert!(!config.compression.enabled); + } + + #[test] + fn test_storage_config_builders() { + let config = StorageConfig::new() + .with_workspace_dir("/data/workspace") + .with_cache_size(200) + .with_atomic_writes(false) + .with_file_lock(false) + .with_checksum(false); + + assert_eq!(config.workspace_dir, PathBuf::from("/data/workspace")); + assert_eq!(config.cache_size, 200); + assert!(!config.atomic_writes); + assert!(!config.file_lock); + assert!(!config.checksum_enabled); + } + + #[test] + fn test_compression_config_defaults() { + let config = CompressionConfig::default(); + assert!(!config.enabled); + assert_eq!(config.algorithm, CompressionAlgorithm::Gzip); + assert_eq!(config.level, 6); + } + + #[test] + fn test_compression_config_level_clamp() { + let config = CompressionConfig::new().with_level(15); + assert_eq!(config.level, 9); // clamped to max + + let config = CompressionConfig::new().with_level(0); + assert_eq!(config.level, 1); // clamped to min } #[test] diff --git a/src/domain/mod.rs b/src/document/mod.rs similarity index 55% rename from src/domain/mod.rs rename to src/document/mod.rs index 75970a12..f045fcfe 100644 --- a/src/domain/mod.rs +++ b/src/document/mod.rs @@ -1,9 +1,9 @@ // Copyright (c) 2026 vectorless developers // SPDX-License-Identifier: Apache-2.0 -//! Domain layer - pure data structures with zero business logic. +//! Document types - pure data structures for document tree representation. //! -//! This module contains the core domain types that represent document trees. +//! This module contains the core types that represent hierarchical documents. //! These types have no dependencies on indexing or retrieval logic. //! //! # Types @@ -12,16 +12,14 @@ //! - [`DocumentTree`] - Arena-based tree structure //! - [`NodeId`] - Unique identifier for tree nodes //! - [`TocView`] - Table of Contents generator -//! - [`Error`] - Domain error types +//! - [`StructureNode`] - JSON export structure -mod error; mod node; +mod structure; mod toc; -mod token; mod tree; -pub use error::{Error, Result}; pub use node::{NodeId, TreeNode}; +pub use structure::{DocumentStructure, StructureNode}; pub use toc::{TocConfig, TocEntry, TocNode, TocView}; -pub use token::{estimate_tokens, estimate_tokens_batch, estimate_tokens_fast}; -pub use tree::{DocumentStructure, DocumentTree, RetrievalIndex, StructureNode}; +pub use tree::{DocumentTree, RetrievalIndex}; diff --git a/src/domain/node.rs b/src/document/node.rs similarity index 100% rename from src/domain/node.rs rename to src/document/node.rs diff --git a/src/document/structure.rs b/src/document/structure.rs new file mode 100644 index 00000000..6fa93b35 --- /dev/null +++ b/src/document/structure.rs @@ -0,0 +1,67 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Document structure types for JSON export. +//! +//! These types define the JSON format for exporting document trees, +//! compatible with PageIndex format. + +use serde::{Deserialize, Serialize}; + +/// A node in the document structure for JSON export. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct StructureNode { + /// Node title. + pub title: String, + /// Unique node identifier. 
+    pub node_id: String,
+    /// Starting line number (1-based).
+    pub start_index: usize,
+    /// Ending line number (1-based).
+    pub end_index: usize,
+    /// Generated summary (optional).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub summary: Option<String>,
+    /// Child nodes.
+    #[serde(skip_serializing_if = "Vec::is_empty")]
+    pub nodes: Vec<StructureNode>,
+}
+
+/// Document structure for JSON export.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DocumentStructure {
+    /// Document name.
+    pub doc_name: String,
+    /// Tree structure.
+    pub structure: Vec<StructureNode>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_structure_node_serialization() {
+        let node = StructureNode {
+            title: "Introduction".to_string(),
+            node_id: "0001".to_string(),
+            start_index: 1,
+            end_index: 10,
+            summary: Some("A brief intro".to_string()),
+            nodes: vec![],
+        };
+
+        let json = serde_json::to_string(&node).unwrap();
+        assert!(json.contains("Introduction"));
+    }
+
+    #[test]
+    fn test_document_structure() {
+        let doc = DocumentStructure {
+            doc_name: "test.md".to_string(),
+            structure: vec![],
+        };
+
+        assert_eq!(doc.doc_name, "test.md");
+    }
+}
diff --git a/src/domain/toc.rs b/src/document/toc.rs
similarity index 100%
rename from src/domain/toc.rs
rename to src/document/toc.rs
diff --git a/src/domain/tree.rs b/src/document/tree.rs
similarity index 96%
rename from src/domain/tree.rs
rename to src/document/tree.rs
index 94f138a3..090dacae 100644
--- a/src/domain/tree.rs
+++ b/src/document/tree.rs
@@ -12,34 +12,7 @@ use indextree::Arena;
 use serde::{Deserialize, Serialize};
 
 use super::node::{NodeId, TreeNode};
-
-/// JSON structure for exporting document tree (matches PageIndex format).
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct StructureNode {
-    /// Node title.
-    pub title: String,
-    /// Unique node identifier.
-    pub node_id: String,
-    /// Starting line number (1-based).
-    pub start_index: usize,
-    /// Ending line number (1-based).
-    pub end_index: usize,
-    /// Generated summary (optional).
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub summary: Option<String>,
-    /// Child nodes.
-    #[serde(skip_serializing_if = "Vec::is_empty")]
-    pub nodes: Vec<StructureNode>,
-}
-
-/// Document structure for JSON export.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct DocumentStructure {
-    /// Document name.
-    pub doc_name: String,
-    /// Tree structure.
-    pub structure: Vec<StructureNode>,
-}
+use super::structure::{DocumentStructure, StructureNode};
 
 /// Pre-computed index for efficient retrieval operations.
 ///
diff --git a/src/domain/error.rs b/src/domain/error.rs
deleted file mode 100644
index 2f91bd38..00000000
--- a/src/domain/error.rs
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Error types for the vectorless library.
-
-use thiserror::Error;
-
-/// The main error type for vectorless operations.
-#[derive(Debug, Error)]
-pub enum Error {
-    /// An error occurred while parsing a document.
-    #[error("Document parsing error: {0}")]
-    Parse(String),
-
-    /// An error occurred while building the index.
-    #[error("Index building error: {0}")]
-    IndexBuild(String),
-
-    /// An error occurred during retrieval.
-    #[error("Retrieval error: {0}")]
-    Retrieval(String),
-
-    /// An error occurred during summarization.
-    #[error("Summarization error: {0}")]
-    Summarization(String),
-
-    /// An error occurred during LLM call.
-    #[error("LLM error: {0}")]
-    Llm(String),
-
-    /// An error occurred during I/O operations.
-    #[error("IO error: {0}")]
-    Io(#[from] std::io::Error),
-
-    /// An error occurred during serialization/deserialization.
-    #[error("Serialization error: {0}")]
-    Serialization(#[from] serde_json::Error),
-
-    /// The requested node was not found.
-    #[error("Node not found: {0}")]
-    NodeNotFound(String),
-
-    /// The requested document was not found.
-    #[error("Document not found: {0}")]
-    DocumentNotFound(String),
-
-    /// Invalid configuration.
-    #[error("Invalid configuration: {0}")]
-    Config(String),
-
-    /// A generic error with a message.
-    #[error("{0}")]
-    Other(String),
-}
-
-/// A specialized result type for vectorless operations.
-pub type Result<T> = std::result::Result<T, Error>;
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 00000000..615dd671
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,330 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Error types for the vectorless library.
+//!
+//! This module provides a comprehensive error type hierarchy for all operations.
+//! All errors are consolidated into [`Error`] with specific variants for each category.
+
+use thiserror::Error;
+
+/// The main error type for vectorless operations.
+#[derive(Debug, Error)]
+pub enum Error {
+    // =========================================================================
+    // Document & Parsing Errors
+    // =========================================================================
+
+    /// An error occurred while parsing a document.
+    #[error("Document parsing error: {0}")]
+    Parse(String),
+
+    /// Unsupported document format.
+    #[error("Unsupported document format: {0}")]
+    UnsupportedFormat(String),
+
+    /// Invalid document structure.
+    #[error("Invalid document structure: {0}")]
+    InvalidStructure(String),
+
+    // =========================================================================
+    // Index Errors
+    // =========================================================================
+
+    /// An error occurred while building the index.
+    #[error("Index building error: {0}")]
+    IndexBuild(String),
+
+    /// Index not found.
+    #[error("Index not found: {0}")]
+    IndexNotFound(String),
+
+    /// Index corrupted.
+    #[error("Index corrupted: {0}")]
+    IndexCorrupted(String),
+
+    // =========================================================================
+    // Retrieval Errors
+    // =========================================================================
+
+    /// An error occurred during retrieval.
+    #[error("Retrieval error: {0}")]
+    Retrieval(String),
+
+    /// No relevant content found.
+    #[error("No relevant content found for query")]
+    NoRelevantContent,
+
+    /// Search timeout.
+    #[error("Search timeout after {0}ms")]
+    SearchTimeout(u64),
+
+    // =========================================================================
+    // LLM Errors
+    // =========================================================================
+
+    /// An error occurred during LLM call.
+    #[error("LLM error: {0}")]
+    Llm(String),
+
+    /// LLM rate limit exceeded.
+    #[error("LLM rate limit exceeded, retry after {0}ms")]
+    RateLimitExceeded(u64),
+
+    /// LLM quota exceeded.
+    #[error("LLM quota exceeded")]
+    QuotaExceeded,
+
+    // =========================================================================
+    // Summary Errors
+    // =========================================================================
+
+    /// An error occurred during summarization.
+    #[error("Summarization error: {0}")]
+    Summarization(String),
+
+    /// Summary too long.
+ #[error("Summary exceeds maximum length: {0} tokens")] + SummaryTooLong(usize), + + // ========================================================================= + // Storage Errors + // ========================================================================= + + /// An error occurred during I/O operations. + #[error("IO error: {0}")] + Io(#[from] std::io::Error), + + /// Workspace error. + #[error("Workspace error: {0}")] + Workspace(String), + + /// Cache error. + #[error("Cache error: {0}")] + Cache(String), + + /// Serialization error. + #[error("Serialization error: {0}")] + Serialization(String), + + /// Document not found. + #[error("Document not found: {0}")] + DocumentNotFound(String), + + /// Checksum mismatch. + #[error("Checksum mismatch: {0}")] + ChecksumMismatch(String), + + /// Workspace locked by another process. + #[error("Workspace locked by another process")] + WorkspaceLocked, + + /// Format version mismatch. + #[error("Format version mismatch: {0}")] + VersionMismatch(String), + + // ========================================================================= + // Configuration Errors + // ========================================================================= + + /// TOML parsing error. + #[error("TOML parsing error: {0}")] + Toml(String), + + /// Invalid configuration. + #[error("Invalid configuration: {0}")] + Config(String), + + /// Missing required configuration. + #[error("Missing required configuration: {0}")] + MissingConfig(String), + + // ========================================================================= + // Node Errors + // ========================================================================= + + /// The requested node was not found. + #[error("Node not found: {0}")] + NodeNotFound(String), + + // ========================================================================= + // Input Validation Errors + // ========================================================================= + + /// Invalid input. + #[error("Invalid input: {0}")] + InvalidInput(String), + + /// Empty input. + #[error("Empty input: {field}")] + EmptyInput { + /// The field that was empty. + field: String, + }, + + /// Out of range. + #[error("{field} out of range: expected {min}-{max}, got {actual}")] + OutOfRange { + /// The field that was out of range. + field: String, + /// Minimum allowed value. + min: String, + /// Maximum allowed value. + max: String, + /// Actual value received. + actual: String, + }, + + // ========================================================================= + // Throttle Errors + // ========================================================================= + + /// Throttle error. + #[error("Throttle error: {0}")] + Throttle(String), + + /// Concurrency limit exceeded. + #[error("Concurrency limit exceeded: {0} pending")] + ConcurrencyLimitExceeded(usize), + + // ========================================================================= + // Timeout Errors + // ========================================================================= + + /// Operation timeout. + #[error("Operation timeout: {0}")] + Timeout(String), + + // ========================================================================= + // Generic Errors + // ========================================================================= + + /// A generic error with a message. + #[error("{0}")] + Other(String), + + /// Error with context. + #[error("{context}: {source}")] + WithContext { + /// Additional context describing where/why the error occurred. 
+        context: String,
+        /// The underlying error.
+        #[source]
+        source: Box<Error>,
+    },
+}
+
+impl Error {
+    /// Create an error with additional context.
+    #[must_use]
+    pub fn with_context(self, context: impl Into<String>) -> Self {
+        Self::WithContext {
+            context: context.into(),
+            source: Box::new(self),
+        }
+    }
+
+    /// Check if this is a retryable error.
+    #[must_use]
+    pub fn is_retryable(&self) -> bool {
+        matches!(
+            self,
+            Self::RateLimitExceeded(_)
+                | Self::SearchTimeout(_)
+                | Self::Timeout(_)
+                | Self::Llm(_)
+        )
+    }
+
+    /// Check if this is a not found error.
+    #[must_use]
+    pub fn is_not_found(&self) -> bool {
+        matches!(
+            self,
+            Self::NodeNotFound(_) | Self::DocumentNotFound(_) | Self::IndexNotFound(_)
+        )
+    }
+
+    /// Check if this is a timeout error.
+    #[must_use]
+    pub fn is_timeout(&self) -> bool {
+        matches!(self, Self::Timeout(_) | Self::SearchTimeout(_))
+    }
+
+    /// Check if this is a configuration error.
+    #[must_use]
+    pub fn is_config_error(&self) -> bool {
+        matches!(self, Self::Config(_) | Self::MissingConfig(_))
+    }
+
+    /// Create an empty input error.
+    pub fn empty_input(field: impl Into<String>) -> Self {
+        Self::EmptyInput {
+            field: field.into(),
+        }
+    }
+
+    /// Create an out of range error.
+    pub fn out_of_range(
+        field: impl Into<String>,
+        min: impl Into<String>,
+        max: impl Into<String>,
+        actual: impl Into<String>,
+    ) -> Self {
+        Self::OutOfRange {
+            field: field.into(),
+            min: min.into(),
+            max: max.into(),
+            actual: actual.into(),
+        }
+    }
+}
+
+/// A specialized result type for vectorless operations.
+pub type Result<T> = std::result::Result<T, Error>;
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_error_context() {
+        let inner = Error::Parse("test".to_string());
+        let with_context = inner.with_context("While processing document");
+
+        let msg = format!("{}", with_context);
+        assert!(msg.contains("While processing document"));
+        assert!(msg.contains("test"));
+    }
+
+    #[test]
+    fn test_is_retryable() {
+        assert!(Error::RateLimitExceeded(1000).is_retryable());
+        assert!(Error::Timeout("test".to_string()).is_retryable());
+        assert!(!Error::Config("test".to_string()).is_retryable());
+    }
+
+    #[test]
+    fn test_is_not_found() {
+        assert!(Error::NodeNotFound("1".to_string()).is_not_found());
+        assert!(Error::DocumentNotFound("doc".to_string()).is_not_found());
+        assert!(!Error::Parse("test".to_string()).is_not_found());
+    }
+
+    #[test]
+    fn test_empty_input() {
+        let err = Error::empty_input("query");
+        let msg = format!("{}", err);
+        assert!(msg.contains("query"));
+    }
+
+    #[test]
+    fn test_out_of_range() {
+        let err = Error::out_of_range("depth", "0", "10", "15");
+        let msg = format!("{}", err);
+        assert!(msg.contains("depth"));
+        assert!(msg.contains("0"));
+        assert!(msg.contains("10"));
+        assert!(msg.contains("15"));
+    }
+}
diff --git a/src/index/config.rs b/src/index/config.rs
new file mode 100644
index 00000000..55128822
--- /dev/null
+++ b/src/index/config.rs
@@ -0,0 +1,268 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Configuration types for the index pipeline.
+//!
+//! This module contains all configuration types used by the indexing pipeline:
+//! - [`IndexMode`] - Document format selection
+//! - [`PipelineOptions`] - Full pipeline configuration
+//! - [`OptimizationConfig`] - Tree optimization settings
+//! - [`ThinningConfig`] - Node merging settings
+
+use crate::config::{ConcurrencyConfig, IndexerConfig};
+use super::summary::SummaryStrategy;
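+
+// A minimal composition sketch (illustrative only; every method used here is
+// defined later in this file, and the chosen values are arbitrary):
+//
+//   let options = PipelineOptions::new()
+//       .with_mode(IndexMode::Markdown)
+//       .with_thinning(ThinningConfig::enabled(300))
+//       .with_optimization(OptimizationConfig::new().with_max_depth(5));
+
+/// Index mode for document processing.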
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum IndexMode {
+    /// Auto-detect format from file extension.
+    Auto,
+    /// Force Markdown format.
+    Markdown,
+    /// Force PDF format.
+    Pdf,
+    /// Force DOCX format.
+    Docx,
+    /// Force HTML format.
+    Html,
+}
+
+impl Default for IndexMode {
+    fn default() -> Self {
+        Self::Auto
+    }
+}
+
+/// Configuration for tree optimization.
+#[derive(Debug, Clone)]
+pub struct OptimizationConfig {
+    /// Whether optimization is enabled.
+    pub enabled: bool,
+
+    /// Maximum tree depth (flatten if exceeded).
+    pub max_depth: Option<usize>,
+
+    /// Maximum children per node (group if exceeded).
+    pub max_children: Option<usize>,
+
+    /// Minimum tokens for a leaf node (merge smaller ones).
+    pub merge_leaf_threshold: usize,
+}
+
+impl Default for OptimizationConfig {
+    fn default() -> Self {
+        Self {
+            enabled: true,
+            max_depth: None,
+            max_children: None,
+            merge_leaf_threshold: 50,
+        }
+    }
+}
+
+impl OptimizationConfig {
+    /// Create a new optimization config with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Disable optimization entirely.
+    pub fn disabled() -> Self {
+        Self {
+            enabled: false,
+            ..Self::default()
+        }
+    }
+
+    /// Set maximum depth.
+    pub fn with_max_depth(mut self, depth: usize) -> Self {
+        self.max_depth = Some(depth);
+        self
+    }
+
+    /// Set maximum children per node.
+    pub fn with_max_children(mut self, max: usize) -> Self {
+        self.max_children = Some(max);
+        self
+    }
+}
+
+/// Configuration for thinning (merging small nodes).
+#[derive(Debug, Clone)]
+pub struct ThinningConfig {
+    /// Whether thinning is enabled.
+    pub enabled: bool,
+
+    /// Token threshold for merging.
+    pub threshold: usize,
+}
+
+impl Default for ThinningConfig {
+    fn default() -> Self {
+        Self {
+            enabled: false,
+            threshold: 500,
+        }
+    }
+}
+
+impl ThinningConfig {
+    /// Create disabled config.
+    pub fn disabled() -> Self {
+        Self::default()
+    }
+
+    /// Create enabled config with threshold.
+    pub fn enabled(threshold: usize) -> Self {
+        Self {
+            enabled: true,
+            threshold,
+        }
+    }
+
+    /// Set the token threshold.
+    pub fn with_threshold(mut self, threshold: usize) -> Self {
+        self.threshold = threshold;
+        self
+    }
+}
+
+/// Pipeline options for index execution.
+#[derive(Debug, Clone)]
+pub struct PipelineOptions {
+    /// Index mode.
+    pub mode: IndexMode,
+
+    /// Whether to generate node IDs.
+    pub generate_ids: bool,
+
+    /// Summary generation strategy.
+    pub summary_strategy: SummaryStrategy,
+
+    /// Thinning configuration.
+    pub thinning: ThinningConfig,
+
+    /// Optimization configuration.
+    pub optimization: OptimizationConfig,
+
+    /// Whether to generate document description.
+    pub generate_description: bool,
+
+    /// Concurrency configuration.
+    pub concurrency: ConcurrencyConfig,
+
+    /// Indexer configuration.
+    pub indexer: IndexerConfig,
+}
+
+impl Default for PipelineOptions {
+    fn default() -> Self {
+        Self {
+            mode: IndexMode::Auto,
+            generate_ids: true,
+            summary_strategy: SummaryStrategy::default(),
+            thinning: ThinningConfig::default(),
+            optimization: OptimizationConfig::default(),
+            generate_description: true,
+            concurrency: ConcurrencyConfig::default(),
+            indexer: IndexerConfig::default(),
+        }
+    }
+}
+
+impl PipelineOptions {
+    /// Create new pipeline options with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the index mode.
+    pub fn with_mode(mut self, mode: IndexMode) -> Self {
+        self.mode = mode;
+        self
+    }
+
+    /// Set whether to generate node IDs.
+ pub fn with_generate_ids(mut self, generate: bool) -> Self { + self.generate_ids = generate; + self + } + + /// Set the summary strategy. + pub fn with_summary_strategy(mut self, strategy: SummaryStrategy) -> Self { + self.summary_strategy = strategy; + self + } + + /// Set the thinning configuration. + pub fn with_thinning(mut self, thinning: ThinningConfig) -> Self { + self.thinning = thinning; + self + } + + /// Set the optimization configuration. + pub fn with_optimization(mut self, optimization: OptimizationConfig) -> Self { + self.optimization = optimization; + self + } + + /// Set whether to generate document description. + pub fn with_generate_description(mut self, generate: bool) -> Self { + self.generate_description = generate; + self + } + + /// Set the concurrency configuration. + pub fn with_concurrency(mut self, concurrency: ConcurrencyConfig) -> Self { + self.concurrency = concurrency; + self + } + + /// Set the indexer configuration. + pub fn with_indexer(mut self, indexer: IndexerConfig) -> Self { + self.indexer = indexer; + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_index_mode_default() { + let mode = IndexMode::default(); + assert_eq!(mode, IndexMode::Auto); + } + + #[test] + fn test_optimization_config() { + let config = OptimizationConfig::new() + .with_max_depth(5) + .with_max_children(10); + + assert!(config.enabled); + assert_eq!(config.max_depth, Some(5)); + assert_eq!(config.max_children, Some(10)); + } + + #[test] + fn test_thinning_config() { + let config = ThinningConfig::enabled(300); + assert!(config.enabled); + assert_eq!(config.threshold, 300); + + let disabled = ThinningConfig::disabled(); + assert!(!disabled.enabled); + } + + #[test] + fn test_pipeline_options_builder() { + let options = PipelineOptions::new() + .with_mode(IndexMode::Markdown) + .with_generate_ids(false); + + assert_eq!(options.mode, IndexMode::Markdown); + assert!(!options.generate_ids); + } +} diff --git a/src/index/incremental/detector.rs b/src/index/incremental/detector.rs index 688197b0..1db0d4fc 100644 --- a/src/index/incremental/detector.rs +++ b/src/index/incremental/detector.rs @@ -8,7 +8,7 @@ use std::hash::{Hash, Hasher}; use std::path::Path; use std::time::SystemTime; -use crate::domain::DocumentTree; +use crate::document::DocumentTree; /// Type of change detected. #[derive(Debug, Clone, Copy, PartialEq, Eq)] diff --git a/src/index/incremental/updater.rs b/src/index/incremental/updater.rs index 2762df9b..fd1575df 100644 --- a/src/index/incremental/updater.rs +++ b/src/index/incremental/updater.rs @@ -5,7 +5,8 @@ use tracing::info; -use crate::domain::{DocumentTree, NodeId, Result}; +use crate::document::{DocumentTree, NodeId}; +use crate::error::Result; use crate::parser::RawNode; use super::detector::ChangeDetector; diff --git a/src/index/mod.rs b/src/index/mod.rs index 0eb72f7c..96de34a5 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -23,13 +23,11 @@ //! # Usage //! //! ```rust,ignore -//! use vectorless::domain::index::pipeline::{PipelineExecutor, IndexOptions}; -//! use vectorless::domain::index::summary::SummaryStrategy; +//! use vectorless::index::{PipelineExecutor, IndexInput, PipelineOptions}; +//! use vectorless::index::summary::SummaryStrategy; //! -//! let options = IndexOptions { -//! summary_strategy: SummaryStrategy::selective(100, true), -//! ..Default::default() -//! }; +//! let options = PipelineOptions::new() +//! .with_summary_strategy(SummaryStrategy::selective(100, true)); //! //! 
let result = PipelineExecutor::new()
 //!     .with_options(options)
 //!     .await?;
 //! ```
 
@@ -37,6 +35,7 @@
 
+pub mod config;
 pub mod incremental;
 pub mod pipeline;
 pub mod stages;
@@ -48,6 +47,11 @@ pub use pipeline::{
     PipelineExecutor, PipelineOrchestrator, StageResult, StageRetryConfig,
 };
 
+// Re-export config types
+pub use config::{
+    IndexMode, OptimizationConfig, PipelineOptions, ThinningConfig,
+};
+
 // Re-export stages
 pub use stages::IndexStage;
 
@@ -60,130 +64,5 @@ pub use summary::{
 
 // Re-export incremental
 pub use incremental::{ChangeDetector, ChangeSet, PartialUpdater};
 
-// Re-export config types
+// Re-export config types from crate config
 pub use crate::config::{ConcurrencyConfig, IndexerConfig};
-
-/// Configuration for tree optimization.
-#[derive(Debug, Clone)]
-pub struct OptimizationConfig {
-    /// Whether optimization is enabled.
-    pub enabled: bool,
-
-    /// Maximum tree depth (flatten if exceeded).
-    pub max_depth: Option<usize>,
-
-    /// Maximum children per node (group if exceeded).
-    pub max_children: Option<usize>,
-
-    /// Minimum tokens for a leaf node (merge smaller ones).
-    pub merge_leaf_threshold: usize,
-}
-
-impl Default for OptimizationConfig {
-    fn default() -> Self {
-        Self {
-            enabled: true,
-            max_depth: None,
-            max_children: None,
-            merge_leaf_threshold: 50,
-        }
-    }
-}
-
-/// Configuration for thinning (merging small nodes).
-#[derive(Debug, Clone)]
-pub struct ThinningConfig {
-    /// Whether thinning is enabled.
-    pub enabled: bool,
-
-    /// Token threshold for merging.
-    pub threshold: usize,
-}
-
-impl Default for ThinningConfig {
-    fn default() -> Self {
-        Self {
-            enabled: false,
-            threshold: 500,
-        }
-    }
-}
-
-impl ThinningConfig {
-    /// Create disabled config.
-    pub fn disabled() -> Self {
-        Self::default()
-    }
-
-    /// Create enabled config with threshold.
-    pub fn enabled(threshold: usize) -> Self {
-        Self {
-            enabled: true,
-            threshold,
-        }
-    }
-}
-
-/// Index mode.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum IndexMode {
-    /// Auto-detect format from file extension.
-    Auto,
-    /// Force Markdown format.
-    Markdown,
-    /// Force PDF format.
-    Pdf,
-    /// Force DOCX format.
-    Docx,
-    /// Force HTML format.
-    Html,
-}
-
-impl Default for IndexMode {
-    fn default() -> Self {
-        Self::Auto
-    }
-}
-
-/// Pipeline options (v2).
-#[derive(Debug, Clone)]
-pub struct PipelineOptions {
-    /// Index mode.
-    pub mode: IndexMode,
-
-    /// Whether to generate node IDs.
-    pub generate_ids: bool,
-
-    /// Summary generation strategy.
-    pub summary_strategy: SummaryStrategy,
-
-    /// Thinning configuration.
-    pub thinning: ThinningConfig,
-
-    /// Optimization configuration.
-    pub optimization: OptimizationConfig,
-
-    /// Whether to generate document description.
-    pub generate_description: bool,
-
-    /// Concurrency configuration.
-    pub concurrency: ConcurrencyConfig,
-
-    /// Indexer configuration.
- pub indexer: IndexerConfig, -} - -impl Default for PipelineOptions { - fn default() -> Self { - Self { - mode: IndexMode::Auto, - generate_ids: true, - summary_strategy: SummaryStrategy::default(), - thinning: ThinningConfig::default(), - optimization: OptimizationConfig::default(), - generate_description: true, - concurrency: ConcurrencyConfig::default(), - indexer: IndexerConfig::default(), - } - } -} diff --git a/src/index/pipeline/context.rs b/src/index/pipeline/context.rs index 656d7909..777033fc 100644 --- a/src/index/pipeline/context.rs +++ b/src/index/pipeline/context.rs @@ -6,7 +6,7 @@ use std::collections::HashMap; use std::path::PathBuf; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; use crate::llm::LlmClient; use crate::parser::{DocumentFormat, RawNode}; diff --git a/src/index/pipeline/executor.rs b/src/index/pipeline/executor.rs index e1c12506..ee560e91 100644 --- a/src/index/pipeline/executor.rs +++ b/src/index/pipeline/executor.rs @@ -8,7 +8,7 @@ use tracing::info; -use crate::domain::Result; +use crate::error::Result; use crate::llm::LlmClient; use super::super::PipelineOptions; diff --git a/src/index/pipeline/orchestrator.rs b/src/index/pipeline/orchestrator.rs index fb471f51..299317d8 100644 --- a/src/index/pipeline/orchestrator.rs +++ b/src/index/pipeline/orchestrator.rs @@ -27,7 +27,7 @@ use std::collections::HashMap; use std::time::Instant; use tracing::{error, info, warn}; -use crate::domain::Result; +use crate::error::Result; use super::super::PipelineOptions; use super::super::stages::IndexStage; @@ -208,7 +208,7 @@ impl PipelineOrchestrator { for entry in &self.stages { for dep in &entry.depends_on { if !name_to_idx.contains_key(dep.as_str()) { - return Err(crate::domain::Error::Config(format!( + return Err(crate::error::Error::Config(format!( "Stage '{}' depends on non-existent stage '{}'", entry.stage.name(), dep @@ -265,7 +265,7 @@ impl PipelineOrchestrator { .filter(|&&i| !result.contains(&i)) .map(|&i| self.stages[i].stage.name()) .collect(); - return Err(crate::domain::Error::Config(format!( + return Err(crate::error::Error::Config(format!( "Circular dependency detected involving stages: {:?}", remaining ))); diff --git a/src/index/stages/build.rs b/src/index/stages/build.rs index ed7f0ee9..1ab16d26 100644 --- a/src/index/stages/build.rs +++ b/src/index/stages/build.rs @@ -7,8 +7,10 @@ use super::async_trait; use std::time::Instant; use tracing::info; -use crate::domain::{DocumentTree, NodeId, Result, estimate_tokens}; +use crate::document::{DocumentTree, NodeId}; +use crate::error::Result; use crate::parser::RawNode; +use crate::util::estimate_tokens; use super::{IndexStage, StageResult}; use crate::index::ThinningConfig; diff --git a/src/index/stages/enhance.rs b/src/index/stages/enhance.rs index f510e2e0..d1d0f6fd 100644 --- a/src/index/stages/enhance.rs +++ b/src/index/stages/enhance.rs @@ -8,7 +8,9 @@ use std::sync::Arc; use std::time::Instant; use tracing::{info, warn}; -use crate::domain::{DocumentTree, NodeId, Result}; + +use crate::error::Result; +use crate::document::{DocumentTree, NodeId}; use crate::llm::LlmClient; use super::{IndexStage, StageResult}; diff --git a/src/index/stages/enrich.rs b/src/index/stages/enrich.rs index 2c3759fe..7b0c670d 100644 --- a/src/index/stages/enrich.rs +++ b/src/index/stages/enrich.rs @@ -7,7 +7,8 @@ use super::async_trait; use std::time::Instant; use tracing::info; -use crate::domain::{DocumentTree, NodeId, Result, TocView}; +use crate::document::{DocumentTree, NodeId, 
TocView}; +use crate::error::Result; use super::{IndexStage, StageResult}; use crate::index::pipeline::IndexContext; @@ -116,7 +117,7 @@ impl IndexStage for EnrichStage { let tree = ctx .tree .as_mut() - .ok_or_else(|| crate::domain::Error::IndexBuild("Tree not built".to_string()))?; + .ok_or_else(|| crate::Error::IndexBuild("Tree not built".to_string()))?; // 1. Calculate page ranges Self::calculate_page_ranges(tree); diff --git a/src/index/stages/mod.rs b/src/index/stages/mod.rs index 9d6f8c85..5a55383d 100644 --- a/src/index/stages/mod.rs +++ b/src/index/stages/mod.rs @@ -18,7 +18,7 @@ pub use parse::ParseStage; pub use persist::PersistStage; use super::pipeline::{FailurePolicy, IndexContext, StageResult}; -use crate::domain::Result; +use crate::error::Result; pub use async_trait::async_trait; /// Index pipeline stage. diff --git a/src/index/stages/optimize.rs b/src/index/stages/optimize.rs index d84633bf..571e947d 100644 --- a/src/index/stages/optimize.rs +++ b/src/index/stages/optimize.rs @@ -7,7 +7,9 @@ use super::async_trait; use std::time::Instant; use tracing::info; -use crate::domain::{NodeId, Result}; + +use crate::error::Result; +use crate::document::{NodeId}; use crate::index::pipeline::IndexContext; use super::{IndexStage, StageResult}; @@ -23,7 +25,7 @@ impl OptimizeStage { /// Merge adjacent small leaf nodes. fn merge_small_leaves( - tree: &mut crate::domain::DocumentTree, + tree: &mut crate::document::DocumentTree, min_tokens: usize, metrics: &mut crate::index::IndexMetrics, ) -> usize { @@ -86,7 +88,7 @@ impl OptimizeStage { } /// Remove empty intermediate nodes. - fn remove_empty_nodes(tree: &mut crate::domain::DocumentTree) -> usize { + fn remove_empty_nodes(tree: &mut crate::document::DocumentTree) -> usize { let mut removed_count = 0; // Find nodes with no content and only one child @@ -154,7 +156,7 @@ impl IndexStage for OptimizeStage { let tree = ctx .tree .as_mut() - .ok_or_else(|| crate::domain::Error::IndexBuild("Tree not built".to_string()))?; + .ok_or_else(|| crate::Error::IndexBuild("Tree not built".to_string()))?; let mut merged_count = 0; diff --git a/src/index/stages/parse.rs b/src/index/stages/parse.rs index 0322760e..150d1803 100644 --- a/src/index/stages/parse.rs +++ b/src/index/stages/parse.rs @@ -7,7 +7,7 @@ use super::async_trait; use std::time::Instant; use tracing::info; -use crate::domain::Result; +use crate::error::Result; use crate::parser::DocumentFormat; use crate::parser::ParserRegistry; @@ -35,7 +35,7 @@ impl ParseStage { IndexInput::File(path) => { let ext = path.extension().and_then(|e| e.to_str()).unwrap_or(""); DocumentFormat::from_extension(ext).ok_or_else(|| { - crate::domain::Error::Parse(format!("Unknown format: {}", ext)) + crate::Error::Parse(format!("Unknown format: {}", ext)) }) } IndexInput::Content { format, .. 
} => Ok(*format), diff --git a/src/index/stages/persist.rs b/src/index/stages/persist.rs index d2ac2e47..e0d93f7d 100644 --- a/src/index/stages/persist.rs +++ b/src/index/stages/persist.rs @@ -7,7 +7,7 @@ use super::async_trait; use std::time::Instant; use tracing::info; -use crate::domain::Result; +use crate::error::Result; use crate::storage::{DocumentMeta as StorageMeta, PersistedDocument, Workspace}; use super::{IndexStage, StageResult}; @@ -37,12 +37,12 @@ impl PersistStage { let workspace = self .workspace .as_mut() - .ok_or_else(|| crate::domain::Error::Config("No workspace configured".to_string()))?; + .ok_or_else(|| crate::Error::Config("No workspace configured".to_string()))?; let tree = ctx .tree .as_ref() - .ok_or_else(|| crate::domain::Error::IndexBuild("Tree not built".to_string()))?; + .ok_or_else(|| crate::Error::IndexBuild("Tree not built".to_string()))?; // Create metadata let meta = StorageMeta::new(&ctx.doc_id, &ctx.name, ctx.format.extension()) diff --git a/src/index/summary/full.rs b/src/index/summary/full.rs index 9c1eff00..c9e76e33 100644 --- a/src/index/summary/full.rs +++ b/src/index/summary/full.rs @@ -3,7 +3,7 @@ //! Full summary strategy - generate summaries for all nodes. -use crate::domain::NodeId; +use crate::document::NodeId; use crate::llm::LlmClient; use super::{SummaryGenerator, SummaryStrategyConfig}; diff --git a/src/index/summary/selective.rs b/src/index/summary/selective.rs index 3049278e..18c8946e 100644 --- a/src/index/summary/selective.rs +++ b/src/index/summary/selective.rs @@ -3,7 +3,7 @@ //! Selective summary strategy - generate summaries only for qualifying nodes. -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; use crate::llm::LlmClient; use super::{SummaryGenerator, SummaryStrategyConfig}; diff --git a/src/index/summary/strategy.rs b/src/index/summary/strategy.rs index 5b731232..eac0055c 100644 --- a/src/index/summary/strategy.rs +++ b/src/index/summary/strategy.rs @@ -5,7 +5,7 @@ use async_trait::async_trait; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; use crate::llm::{LlmClient, LlmResult}; /// Configuration for summary strategies. 
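The module docs above now route everything through the builder defined in `src/index/config.rs`. A minimal sketch of the full builder surface in one place; the `execute(IndexInput::...)` call is an assumption pieced together from the module docstring and the `IndexInput::File` / `IndexInput::Content` match in `ParseStage`, not a confirmed signature:

```rust
use vectorless::index::summary::SummaryStrategy;
use vectorless::index::{
    IndexInput, IndexMode, OptimizationConfig, PipelineExecutor, PipelineOptions, ThinningConfig,
};

async fn index_markdown(path: &str) -> vectorless::Result<()> {
    // Each former IndexOptions field is now a chainable with_* setter.
    let options = PipelineOptions::new()
        .with_mode(IndexMode::Markdown)
        .with_summary_strategy(SummaryStrategy::selective(100, true))
        .with_thinning(ThinningConfig::enabled(300))
        .with_optimization(OptimizationConfig::new().with_max_depth(5));

    // Assumption: execute takes an IndexInput, per the module docstring
    // and the IndexInput match in ParseStage above.
    let _result = PipelineExecutor::new()
        .with_options(options)
        .execute(IndexInput::File(path.into()))
        .await?;
    Ok(())
}
```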
diff --git a/src/lib.rs b/src/lib.rs index 8a9e5615..cd3dc7a9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -106,13 +106,15 @@ pub mod client; pub mod config; -pub mod domain; +pub mod document; +pub mod error; pub mod index; pub mod llm; pub mod parser; pub mod retrieval; pub mod storage; pub mod throttle; +pub mod util; // ============================================================================= // Re-exports (Convenience API) @@ -121,12 +123,18 @@ pub mod throttle; // Client API (most common entry point) pub use client::{DocumentInfo, Engine, EngineBuilder, IndexedDocument}; -// Domain types -pub use domain::{ - DocumentStructure, DocumentTree, Error, NodeId, Result, StructureNode, TocConfig, TocEntry, - TocNode, TocView, TreeNode, estimate_tokens, estimate_tokens_fast, +// Error types +pub use error::{Error, Result}; + +// Document types +pub use document::{ + DocumentStructure, DocumentTree, NodeId, StructureNode, TocConfig, TocEntry, + TocNode, TocView, TreeNode, }; +// Utility functions +pub use util::{estimate_tokens, estimate_tokens_fast}; + // Configuration pub use config::{Config, ConfigLoader, RetrievalConfig, SummaryConfig}; @@ -155,7 +163,7 @@ pub use retrieval::{ }; // Storage -pub use storage::{DocumentMeta as StorageDocumentMeta, PersistedDocument, Workspace}; +pub use storage::{AsyncWorkspace, DocumentMeta as StorageDocumentMeta, PersistedDocument, Workspace}; // Throttle pub use throttle::{ConcurrencyConfig, ConcurrencyController, RateLimiter}; diff --git a/src/llm/error.rs b/src/llm/error.rs index 2cd8245d..5969cf72 100644 --- a/src/llm/error.rs +++ b/src/llm/error.rs @@ -93,9 +93,9 @@ impl From for LlmError { } } -impl From<LlmError> for crate::domain::Error { +impl From<LlmError> for crate::Error { fn from(e: LlmError) -> Self { - crate::domain::Error::Llm(e.to_string()) + crate::Error::Llm(e.to_string()) } } diff --git a/src/parser/docx/parser.rs b/src/parser/docx/parser.rs index dd59ccca..15d593c8 100644 --- a/src/parser/docx/parser.rs +++ b/src/parser/docx/parser.rs @@ -32,7 +32,8 @@ use std::path::Path; use async_trait::async_trait; use zip::ZipArchive; -use crate::domain::{Error, Result}; +use crate::{Error}; +use crate::error::Result; use crate::parser::{DocumentFormat, DocumentMeta, DocumentParser, ParseResult, RawNode}; use super::styles::StyleResolver; diff --git a/src/parser/markdown/parser.rs b/src/parser/markdown/parser.rs index 366be1be..cc6df8a1 100644 --- a/src/parser/markdown/parser.rs +++ b/src/parser/markdown/parser.rs @@ -7,7 +7,8 @@ use async_trait::async_trait; use pulldown_cmark::Options; use std::path::Path; -use crate::domain::{Result, estimate_tokens}; +use crate::error::Result; +use crate::util::estimate_tokens; use crate::parser::{DocumentFormat, DocumentMeta, DocumentParser, ParseResult, RawNode}; use super::config::MarkdownConfig; @@ -398,7 +399,7 @@ impl DocumentParser for MarkdownParser { async fn parse_file(&self, path: &Path) -> Result<ParseResult> { let content = tokio::fs::read_to_string(path) .await - .map_err(|e| crate::domain::Error::Parse(format!("Failed to read file: {}", e)))?; + .map_err(|e| crate::Error::Parse(format!("Failed to read file: {}", e)))?; let mut result = self.parse(&content).await?; diff --git a/src/parser/pdf/parser.rs b/src/parser/pdf/parser.rs index c047d21a..a96bf0c2 100644 --- a/src/parser/pdf/parser.rs +++ b/src/parser/pdf/parser.rs @@ -8,7 +8,8 @@ use std::path::Path; use lopdf::Document as LopdfDocument; use tracing::{info, warn}; -use crate::domain::{Error, Result}; +use crate::{Error}; +use crate::error::Result; use 
crate::parser::DocumentParser; use crate::parser::toc::TocProcessor; diff --git a/src/parser/pdf/types.rs b/src/parser/pdf/types.rs index 8c6e27b0..1c2ac9fc 100644 --- a/src/parser/pdf/types.rs +++ b/src/parser/pdf/types.rs @@ -3,7 +3,7 @@ //! PDF document types. -use crate::domain::estimate_tokens; +use crate::util::estimate_tokens; use serde::{Deserialize, Serialize}; /// A single page from a PDF document. diff --git a/src/parser/registry.rs b/src/parser/registry.rs index 947552ac..ae632e4c 100644 --- a/src/parser/registry.rs +++ b/src/parser/registry.rs @@ -11,7 +11,8 @@ use std::collections::HashMap; use std::path::Path; use std::sync::{Arc, RwLock}; -use crate::domain::{Error, Result}; +use crate::{Error}; +use crate::error::Result; use crate::parser::{DocumentFormat, DocumentParser, MarkdownParser, ParseResult, PdfParser}; /// Type alias for parser factory functions. diff --git a/src/parser/toc/assigner.rs b/src/parser/toc/assigner.rs index 86087885..a62e6486 100644 --- a/src/parser/toc/assigner.rs +++ b/src/parser/toc/assigner.rs @@ -7,7 +7,7 @@ use std::collections::HashMap; use tracing::{debug, info}; use crate::config::LlmConfig; -use crate::domain::Result; +use crate::error::Result; use crate::parser::pdf::PdfPage; use super::types::{PageOffset, TocEntry}; diff --git a/src/parser/toc/detector.rs b/src/parser/toc/detector.rs index f8112f07..6688adfc 100644 --- a/src/parser/toc/detector.rs +++ b/src/parser/toc/detector.rs @@ -7,7 +7,7 @@ use regex::Regex; use tracing::debug; use crate::config::LlmConfig; -use crate::domain::Result; +use crate::error::Result; use super::types::TocDetection; use crate::llm::LlmClient; diff --git a/src/parser/toc/parser.rs b/src/parser/toc/parser.rs index 9cbeee1f..20b61af2 100644 --- a/src/parser/toc/parser.rs +++ b/src/parser/toc/parser.rs @@ -6,7 +6,7 @@ use tracing::debug; use crate::config::LlmConfig; -use crate::domain::Result; +use crate::error::Result; use super::types::TocEntry; use crate::llm::LlmClient; diff --git a/src/parser/toc/processor.rs b/src/parser/toc/processor.rs index 991b0f6d..7b7cf945 100644 --- a/src/parser/toc/processor.rs +++ b/src/parser/toc/processor.rs @@ -5,7 +5,7 @@ use tracing::{debug, info, warn}; -use crate::domain::Result; +use crate::error::Result; use crate::parser::pdf::PdfPage; use super::assigner::{PageAssigner, PageAssignerConfig}; diff --git a/src/parser/toc/repairer.rs b/src/parser/toc/repairer.rs index 4a00383c..8a26b8cd 100644 --- a/src/parser/toc/repairer.rs +++ b/src/parser/toc/repairer.rs @@ -6,7 +6,7 @@ use tracing::{debug, info}; use crate::config::LlmConfig; -use crate::domain::Result; +use crate::error::Result; use crate::parser::pdf::PdfPage; use super::types::{TocEntry, VerificationError, VerificationReport}; diff --git a/src/parser/toc/verifier.rs b/src/parser/toc/verifier.rs index e1e9c457..a0243bc1 100644 --- a/src/parser/toc/verifier.rs +++ b/src/parser/toc/verifier.rs @@ -7,7 +7,7 @@ use rand::seq::SliceRandom; use tracing::{debug, info}; use crate::config::LlmConfig; -use crate::domain::Result; +use crate::error::Result; use crate::parser::pdf::PdfPage; use super::types::{ErrorType, TocEntry, VerificationError, VerificationReport}; diff --git a/src/parser/traits.rs b/src/parser/traits.rs index 551aed86..296fcabe 100644 --- a/src/parser/traits.rs +++ b/src/parser/traits.rs @@ -7,7 +7,7 @@ use async_trait::async_trait; use std::path::Path; use super::{DocumentFormat, ParseResult}; -use crate::domain::Result; +use crate::error::Result; /// A parser for extracting content from documents. 
/// @@ -54,7 +54,7 @@ pub trait DocumentParser: Send + Sync { async fn parse_file(&self, path: &Path) -> Result<ParseResult> { let content = tokio::fs::read_to_string(path) .await - .map_err(|e| crate::domain::Error::Parse(format!("Failed to read file: {}", e)))?; + .map_err(|e| crate::Error::Parse(format!("Failed to read file: {}", e)))?; self.parse(&content).await } diff --git a/src/retrieval/cache/path_cache.rs b/src/retrieval/cache/path_cache.rs index e9202150..a394fa1f 100644 --- a/src/retrieval/cache/path_cache.rs +++ b/src/retrieval/cache/path_cache.rs @@ -9,7 +9,7 @@ use std::time::{Duration, Instant}; use super::super::types::SearchPath; use crate::config::CacheConfig as AppConfig; -use crate::domain::NodeId; +use crate::document::NodeId; /// Cache entry for a search path. #[derive(Debug, Clone)] diff --git a/src/retrieval/content/aggregator.rs b/src/retrieval/content/aggregator.rs index 9edb625b..87a8f20e 100644 --- a/src/retrieval/content/aggregator.rs +++ b/src/retrieval/content/aggregator.rs @@ -10,7 +10,8 @@ use std::collections::HashMap; use tracing::{debug, info}; -use crate::domain::{DocumentTree, NodeId, estimate_tokens}; +use crate::document::{DocumentTree, NodeId}; +use crate::util::estimate_tokens; use super::budget::{AllocationResult, AllocationStrategy, BudgetAllocator, SelectedContent}; use super::builder::{ContentMetadata, StructureBuilder, StructuredContent}; @@ -350,7 +351,7 @@ mod tests { fn make_test_node_id() -> NodeId { let mut arena = Arena::new(); - let node = crate::domain::TreeNode { + let node = crate::document::TreeNode { title: "Test".to_string(), structure: String::new(), content: String::new(), diff --git a/src/retrieval/content/budget.rs b/src/retrieval/content/budget.rs index fa91e9c0..1b4ed279 100644 --- a/src/retrieval/content/budget.rs +++ b/src/retrieval/content/budget.rs @@ -8,7 +8,8 @@ use std::collections::HashMap; -use crate::domain::{estimate_tokens, NodeId}; +use crate::document::NodeId; +use crate::util::estimate_tokens; use super::scorer::ContentRelevance; @@ -526,7 +527,7 @@ mod tests { fn make_test_node_id() -> NodeId { let mut arena = Arena::new(); - let node = crate::domain::TreeNode { + let node = crate::document::TreeNode { title: "Test".to_string(), structure: String::new(), content: String::new(), diff --git a/src/retrieval/content/builder.rs b/src/retrieval/content/builder.rs index c3b5792f..e0248e7b 100644 --- a/src/retrieval/content/builder.rs +++ b/src/retrieval/content/builder.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; -use crate::domain::DocumentTree; +use crate::document::DocumentTree; use super::budget::SelectedContent; use super::config::OutputFormatConfig; @@ -309,7 +309,7 @@ impl StructureBuilder { // Group by parent use std::collections::HashMap; - let mut by_parent: HashMap<Option<crate::domain::NodeId>, Vec<&SelectedContent>> = + let mut by_parent: HashMap<Option<crate::document::NodeId>, Vec<&SelectedContent>> = HashMap::new(); for content in &selected { @@ -327,7 +327,7 @@ impl StructureBuilder { // Build tree recursively fn build_node( content: &SelectedContent, - all_by_parent: &HashMap<Option<crate::domain::NodeId>, Vec<&SelectedContent>>, + all_by_parent: &HashMap<Option<crate::document::NodeId>, Vec<&SelectedContent>>, ) -> ContentTreeNode { let mut node = ContentTreeNode::new(content.title.clone()) .with_content(content.content.clone(), content.score); @@ -413,12 +413,12 @@ fn render_tree(node: &ContentTreeNode, depth: usize) -> String { #[cfg(test)] mod tests { use super::*; - use crate::domain::NodeId; + use crate::document::NodeId; use indextree::Arena; fn make_test_node_id() -> NodeId { let mut arena = Arena::new(); - let 
node = crate::domain::TreeNode { + let node = crate::document::TreeNode { title: "Test".to_string(), structure: String::new(), content: String::new(), diff --git a/src/retrieval/content/scorer.rs b/src/retrieval/content/scorer.rs index ba04a6ce..daf49550 100644 --- a/src/retrieval/content/scorer.rs +++ b/src/retrieval/content/scorer.rs @@ -8,7 +8,8 @@ use std::collections::HashMap; -use crate::domain::{estimate_tokens, NodeId}; +use crate::document::NodeId; +use crate::util::estimate_tokens; use super::config::ScoringStrategyConfig; @@ -339,7 +340,7 @@ mod tests { fn make_test_node_id() -> NodeId { let mut arena = Arena::new(); - let node = crate::domain::TreeNode { + let node = crate::document::TreeNode { title: "Test".to_string(), structure: String::new(), content: String::new(), diff --git a/src/retrieval/context.rs b/src/retrieval/context.rs index 595c9083..c4f278b9 100644 --- a/src/retrieval/context.rs +++ b/src/retrieval/context.rs @@ -28,7 +28,8 @@ //! ``` use super::types::RetrievalResult; -use crate::domain::{DocumentTree, NodeId, estimate_tokens}; +use crate::document::{DocumentTree, NodeId}; +use crate::util::estimate_tokens; use std::collections::HashSet; /// Pruning strategy for context building. @@ -476,7 +477,7 @@ impl ContextBuilder { } } - fn format_node_section(&self, node: &crate::domain::TreeNode, depth: usize) -> String { + fn format_node_section(&self, node: &crate::document::TreeNode, depth: usize) -> String { let mut section = String::new(); if self.include_titles { diff --git a/src/retrieval/pilot/builder.rs b/src/retrieval/pilot/builder.rs index 725b4394..931c19b0 100644 --- a/src/retrieval/pilot/builder.rs +++ b/src/retrieval/pilot/builder.rs @@ -16,7 +16,7 @@ use std::collections::HashSet; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; use super::SearchState; /// Token budget distribution for context building. @@ -436,7 +436,7 @@ mod tests { fn create_test_tree() -> DocumentTree { let mut arena = Arena::new(); - let root = arena.new_node(crate::domain::TreeNode { + let root = arena.new_node(crate::document::TreeNode { title: "Root".to_string(), content: "Root content".to_string(), summary: "Root summary".to_string(), @@ -444,7 +444,7 @@ mod tests { ..Default::default() }); - let child1 = arena.new_node(crate::domain::TreeNode { + let child1 = arena.new_node(crate::document::TreeNode { title: "Configuration".to_string(), content: "Config content".to_string(), summary: "Configuration options".to_string(), @@ -452,7 +452,7 @@ mod tests { ..Default::default() }); - let child2 = arena.new_node(crate::domain::TreeNode { + let child2 = arena.new_node(crate::document::TreeNode { title: "API Reference".to_string(), content: "API content".to_string(), summary: "API documentation".to_string(), @@ -463,7 +463,7 @@ mod tests { root.append(child1, &mut arena); root.append(child2, &mut arena); - DocumentTree::from_raw(arena, crate::domain::NodeId(root)) + DocumentTree::from_raw(arena, crate::document::NodeId(root)) } #[test] diff --git a/src/retrieval/pilot/decision.rs b/src/retrieval/pilot/decision.rs index 69a117d6..084582c2 100644 --- a/src/retrieval/pilot/decision.rs +++ b/src/retrieval/pilot/decision.rs @@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize}; -use crate::domain::NodeId; +use crate::document::NodeId; /// Pilot's navigation decision result. 
/// @@ -243,7 +243,7 @@ mod tests { let mut arena = Arena::new(); let mut ids = Vec::new(); for i in 0..count { - let node = crate::domain::TreeNode { + let node = crate::document::TreeNode { title: format!("Node {}", i), structure: String::new(), content: String::new(), diff --git a/src/retrieval/pilot/llm_pilot.rs b/src/retrieval/pilot/llm_pilot.rs index c163396a..10118ff0 100644 --- a/src/retrieval/pilot/llm_pilot.rs +++ b/src/retrieval/pilot/llm_pilot.rs @@ -10,7 +10,7 @@ use async_trait::async_trait; use std::sync::Arc; use tracing::{debug, info, warn}; -use crate::domain::DocumentTree; +use crate::document::DocumentTree; use crate::llm::LlmClient; use super::builder::ContextBuilder; @@ -147,7 +147,7 @@ impl LlmPilot { &self, point: InterventionPoint, context: &super::builder::PilotContext, - candidates: &[crate::domain::NodeId], + candidates: &[crate::document::NodeId], ) -> PilotDecision { // Build prompt let prompt = self.prompt_builder.build(point, context); @@ -192,7 +192,7 @@ impl LlmPilot { /// Create a default decision when LLM fails. fn default_decision( &self, - candidates: &[crate::domain::NodeId], + candidates: &[crate::document::NodeId], point: InterventionPoint, ) -> PilotDecision { let ranked = candidates @@ -357,14 +357,14 @@ impl Pilot for LlmPilot { #[cfg(test)] mod tests { use super::*; - use crate::domain::NodeId; + use crate::document::NodeId; use indextree::Arena; fn create_test_node_ids(count: usize) -> Vec<NodeId> { let mut arena = Arena::new(); let mut ids = Vec::new(); for i in 0..count { - let node = crate::domain::TreeNode { + let node = crate::document::TreeNode { title: format!("Node {}", i), structure: String::new(), content: String::new(), diff --git a/src/retrieval/pilot/noop.rs b/src/retrieval/pilot/noop.rs index daa95648..b79156a5 100644 --- a/src/retrieval/pilot/noop.rs +++ b/src/retrieval/pilot/noop.rs @@ -9,7 +9,7 @@ use async_trait::async_trait; -use crate::domain::DocumentTree; +use crate::document::DocumentTree; use super::{InterventionPoint, Pilot, PilotConfig, PilotDecision, SearchState}; @@ -103,7 +103,7 @@ impl Pilot for NoopPilot { #[cfg(test)] mod tests { use super::*; - use crate::domain::NodeId; + use crate::document::NodeId; use std::collections::HashSet; #[test] diff --git a/src/retrieval/pilot/parser.rs b/src/retrieval/pilot/parser.rs index 9bb0bd48..ca88ff26 100644 --- a/src/retrieval/pilot/parser.rs +++ b/src/retrieval/pilot/parser.rs @@ -13,7 +13,7 @@ use regex::Regex; use serde::{Deserialize, Serialize}; use tracing::warn; -use crate::domain::NodeId; +use crate::document::NodeId; use super::decision::{PilotDecision, RankedCandidate, SearchDirection, InterventionPoint}; /// Parsed response from LLM. 
@@ -348,7 +348,7 @@ mod tests { let mut arena = Arena::new(); let mut ids = Vec::new(); for i in 0..count { - let node = crate::domain::TreeNode { + let node = crate::document::TreeNode { title: format!("Node {}", i), structure: String::new(), content: String::new(), diff --git a/src/retrieval/pilot/trait.rs b/src/retrieval/pilot/trait.rs index 2017aa94..94e7fac7 100644 --- a/src/retrieval/pilot/trait.rs +++ b/src/retrieval/pilot/trait.rs @@ -11,7 +11,7 @@ use async_trait::async_trait; use std::collections::HashSet; use std::sync::LazyLock; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; use super::{PilotConfig, PilotDecision, InterventionPoint}; diff --git a/src/retrieval/pipeline/context.rs b/src/retrieval/pipeline/context.rs index b12d3d9f..3537e7a3 100644 --- a/src/retrieval/pipeline/context.rs +++ b/src/retrieval/pipeline/context.rs @@ -10,7 +10,7 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::Instant; -use crate::domain::{DocumentTree, NodeId, RetrievalIndex}; +use crate::document::{DocumentTree, NodeId, RetrievalIndex}; use crate::retrieval::pilot::Pilot; use crate::retrieval::types::{ NavigationStep, QueryComplexity, RetrieveOptions, RetrieveResponse, SearchPath, diff --git a/src/retrieval/pipeline/orchestrator.rs b/src/retrieval/pipeline/orchestrator.rs index 2dcde02e..fc013014 100644 --- a/src/retrieval/pipeline/orchestrator.rs +++ b/src/retrieval/pipeline/orchestrator.rs @@ -15,7 +15,8 @@ use std::sync::Arc; use std::time::Instant; use tracing::{debug, error, info, warn}; -use crate::domain::{DocumentTree, Result}; +use crate::document::{DocumentTree}; +use crate::error::Result; use crate::retrieval::pilot::{Pilot, SearchState}; // FailurePolicy is re-exported for stages use crate::retrieval::types::{RetrieveOptions, RetrieveResponse}; @@ -148,7 +149,7 @@ impl RetrievalOrchestrator { for entry in &self.stages { for dep in &entry.depends_on { if !name_to_idx.contains_key(dep.as_str()) { - return Err(crate::domain::Error::Config(format!( + return Err(crate::Error::Config(format!( "Stage '{}' depends on non-existent stage '{}'", entry.stage.name(), dep @@ -205,7 +206,7 @@ impl RetrievalOrchestrator { .filter(|i| !result.contains(i)) .map(|i| self.stages[i].stage.name()) .collect(); - return Err(crate::domain::Error::Config(format!( + return Err(crate::Error::Config(format!( "Circular dependency detected involving stages: {:?}", remaining ))); diff --git a/src/retrieval/pipeline/stage.rs b/src/retrieval/pipeline/stage.rs index 946a9fba..285c717f 100644 --- a/src/retrieval/pipeline/stage.rs +++ b/src/retrieval/pipeline/stage.rs @@ -9,7 +9,7 @@ use async_trait::async_trait; -use crate::domain::Result; +use crate::error::Result; use crate::index::pipeline::FailurePolicy; use super::context::PipelineContext; diff --git a/src/retrieval/pipeline_retriever.rs b/src/retrieval/pipeline_retriever.rs index e51d187a..b7254645 100644 --- a/src/retrieval/pipeline_retriever.rs +++ b/src/retrieval/pipeline_retriever.rs @@ -15,7 +15,8 @@ use super::retriever::{CostEstimate, Retriever, RetrieverError, RetrieverResult} use super::stages::{AnalyzeStage, JudgeStage, PlanStage, SearchStage}; use super::strategy::LlmStrategy; use super::types::{RetrieveOptions, RetrieveResponse}; -use crate::domain::DocumentTree; +use crate::document::DocumentTree; +use crate::error::Result; use crate::llm::LlmClient; use crate::retrieval::pilot::{LlmPilot, PilotConfig}; diff --git a/src/retrieval/retriever.rs b/src/retrieval/retriever.rs index 
83763cdb..97c280c0 100644 --- a/src/retrieval/retriever.rs +++ b/src/retrieval/retriever.rs @@ -6,7 +6,7 @@ use async_trait::async_trait; use super::types::{RetrieveOptions, RetrieveResponse}; -use crate::domain::DocumentTree; +use crate::document::DocumentTree; /// Result type for retriever operations. pub type RetrieverResult<T> = Result<T, RetrieverError>; diff --git a/src/retrieval/search/beam.rs b/src/retrieval/search/beam.rs index 2dec5e40..ea73051c 100644 --- a/src/retrieval/search/beam.rs +++ b/src/retrieval/search/beam.rs @@ -14,7 +14,7 @@ use super::super::RetrievalContext; use super::super::types::{NavigationDecision, NavigationStep, SearchPath}; use super::scorer::{NodeScorer, ScoringContext}; use super::{SearchConfig, SearchResult, SearchTree}; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; use crate::retrieval::pilot::{Pilot, SearchState}; /// Beam search - explores multiple paths simultaneously. diff --git a/src/retrieval/search/greedy.rs b/src/retrieval/search/greedy.rs index ad9fd8d8..89357225 100644 --- a/src/retrieval/search/greedy.rs +++ b/src/retrieval/search/greedy.rs @@ -13,7 +13,7 @@ use super::super::RetrievalContext; use super::super::types::{NavigationDecision, NavigationStep, SearchPath}; use super::scorer::{NodeScorer, ScoringContext}; use super::{SearchConfig, SearchResult, SearchTree}; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; use crate::retrieval::pilot::{Pilot, SearchState}; /// Greedy search - always follows the best single path. diff --git a/src/retrieval/search/mcts.rs b/src/retrieval/search/mcts.rs index 2cc6fbd0..667a0d28 100644 --- a/src/retrieval/search/mcts.rs +++ b/src/retrieval/search/mcts.rs @@ -14,7 +14,7 @@ use super::super::types::{NavigationDecision, NavigationStep, SearchPath}; use super::scorer::NodeScorer; use super::{SearchConfig, SearchResult, SearchTree}; use crate::config::StrategyConfig; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; use crate::retrieval::pilot::Pilot; /// Statistics for a node in MCTS. diff --git a/src/retrieval/search/scorer.rs b/src/retrieval/search/scorer.rs index e22f8239..0d051938 100644 --- a/src/retrieval/search/scorer.rs +++ b/src/retrieval/search/scorer.rs @@ -5,7 +5,7 @@ //! //! Implements the NodeScore formula: `Σ ChunkScore(n) / √(N+1)` -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; /// Context for scoring calculations. #[derive(Debug, Clone)] diff --git a/src/retrieval/search/trait.rs b/src/retrieval/search/trait.rs index 927753cf..1790b703 100644 --- a/src/retrieval/search/trait.rs +++ b/src/retrieval/search/trait.rs @@ -7,7 +7,7 @@ use async_trait::async_trait; use super::super::RetrievalContext; use super::super::types::{NavigationStep, SearchPath}; -use crate::domain::DocumentTree; +use crate::document::DocumentTree; use crate::retrieval::pilot::Pilot; /// Result of a search operation. 
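Both pipeline orchestrators touched in this diff (index and retrieval) validate their stage DAG the same way: reject dependencies on unknown stage names, then order stages and fail on cycles. A minimal standalone sketch of that validation, under the assumption that it behaves like a Kahn-style topological sort; the `Entry` struct and function name are hypothetical, not the crate's actual types:

```rust
use std::collections::HashMap;

/// Hypothetical stand-in for the orchestrators' internal stage entries.
struct Entry {
    name: &'static str,
    depends_on: Vec<&'static str>,
}

/// Validate dependencies and return an execution order over stage indices.
fn execution_order(stages: &[Entry]) -> Result<Vec<usize>, String> {
    let name_to_idx: HashMap<_, _> =
        stages.iter().enumerate().map(|(i, e)| (e.name, i)).collect();

    // Check 1: every dependency must name an existing stage.
    for entry in stages {
        for dep in &entry.depends_on {
            if !name_to_idx.contains_key(dep) {
                return Err(format!(
                    "Stage '{}' depends on non-existent stage '{}'",
                    entry.name, dep
                ));
            }
        }
    }

    // Repeatedly emit stages whose dependencies are already scheduled.
    let mut result = Vec::new();
    while result.len() < stages.len() {
        let before = result.len();
        for (i, entry) in stages.iter().enumerate() {
            if result.contains(&i) {
                continue;
            }
            if entry.depends_on.iter().all(|d| result.contains(&name_to_idx[d])) {
                result.push(i);
            }
        }
        // Check 2: no progress means the remaining stages form a cycle.
        if result.len() == before {
            let remaining: Vec<_> = (0..stages.len())
                .filter(|i| !result.contains(i))
                .map(|i| stages[i].name)
                .collect();
            return Err(format!(
                "Circular dependency detected involving stages: {:?}",
                remaining
            ));
        }
    }
    Ok(result)
}
```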
diff --git a/src/retrieval/stages/analyze.rs b/src/retrieval/stages/analyze.rs index c26b7e4c..3eabca1f 100644 --- a/src/retrieval/stages/analyze.rs +++ b/src/retrieval/stages/analyze.rs @@ -11,7 +11,7 @@ use async_trait::async_trait; use tracing::info; -use crate::domain::{DocumentTree, TocView}; +use crate::document::{DocumentTree, TocView}; use crate::retrieval::complexity::ComplexityDetector; use crate::retrieval::pipeline::{FailurePolicy, PipelineContext, RetrievalStage, StageOutcome}; // QueryComplexity is used in context @@ -108,7 +108,7 @@ impl AnalyzeStage { let mut matches: Vec<(String, f32)> = Vec::new(); fn collect_sections( - nodes: &[crate::domain::TocNode], + nodes: &[crate::document::TocNode], query_lower: &str, matches: &mut Vec<(String, f32)>, ) { @@ -165,7 +165,7 @@ impl RetrievalStage for AnalyzeStage { FailurePolicy::fail() // Must succeed } - async fn execute(&self, ctx: &mut PipelineContext) -> crate::domain::Result<StageOutcome> { + async fn execute(&self, ctx: &mut PipelineContext) -> crate::error::Result<StageOutcome> { info!("Analyzing query: '{}'", ctx.query); // 1. Detect complexity diff --git a/src/retrieval/stages/judge.rs b/src/retrieval/stages/judge.rs index 9cc11e68..1178f402 100644 --- a/src/retrieval/stages/judge.rs +++ b/src/retrieval/stages/judge.rs @@ -10,7 +10,7 @@ use async_trait::async_trait; // Arc is used for async sharing use tracing::{info, warn}; -use crate::domain::estimate_tokens; +use crate::util::estimate_tokens; use crate::llm::LlmClient; use crate::retrieval::content::{ContentAggregator, ContentAggregatorConfig}; use crate::retrieval::pipeline::{FailurePolicy, PipelineContext, RetrievalStage, StageOutcome}; @@ -167,7 +167,7 @@ impl JudgeStage { } /// Collect content from leaf descendants of a node (excluding the node itself). - fn collect_leaf_content(&self, tree: &crate::domain::DocumentTree, node_id: crate::domain::NodeId) -> String { + fn collect_leaf_content(&self, tree: &crate::document::DocumentTree, node_id: crate::document::NodeId) -> String { let mut content_parts = Vec::new(); // Start with children, not the node itself @@ -177,7 +177,7 @@ impl JudgeStage { return String::new(); } - let mut stack: Vec<crate::domain::NodeId> = children; + let mut stack: Vec<crate::document::NodeId> = children; while let Some(current_id) = stack.pop() { let current_children = tree.children(current_id); @@ -319,7 +319,7 @@ impl RetrievalStage for JudgeStage { true // Can trigger backtracking to search } - async fn execute(&self, ctx: &mut PipelineContext) -> crate::domain::Result<StageOutcome> { + async fn execute(&self, ctx: &mut PipelineContext) -> crate::error::Result<StageOutcome> { let start = std::time::Instant::now(); info!( diff --git a/src/retrieval/stages/plan.rs b/src/retrieval/stages/plan.rs index 7177322b..0b98003c 100644 --- a/src/retrieval/stages/plan.rs +++ b/src/retrieval/stages/plan.rs @@ -155,7 +155,7 @@ impl RetrievalStage for PlanStage { FailurePolicy::fail() // Must succeed } - async fn execute(&self, ctx: &mut PipelineContext) -> crate::domain::Result<StageOutcome> { + async fn execute(&self, ctx: &mut PipelineContext) -> crate::error::Result<StageOutcome> { info!("Planning retrieval strategy"); // 1. 
Select strategy diff --git a/src/retrieval/stages/search.rs b/src/retrieval/stages/search.rs index e9addfe7..121378f5 100644 --- a/src/retrieval/stages/search.rs +++ b/src/retrieval/stages/search.rs @@ -11,7 +11,7 @@ use async_trait::async_trait; use std::sync::Arc; use tracing::{info, warn}; -use crate::domain::DocumentTree; +use crate::document::DocumentTree; // LlmClient is used via strategy use crate::retrieval::pilot::Pilot; use crate::retrieval::RetrievalContext; // Legacy context @@ -187,7 +187,7 @@ impl RetrievalStage for SearchStage { true // Can receive backtracks from judge } - async fn execute(&self, ctx: &mut PipelineContext) -> crate::domain::Result<StageOutcome> { + async fn execute(&self, ctx: &mut PipelineContext) -> crate::error::Result<StageOutcome> { let start = std::time::Instant::now(); // Get strategy and algorithm diff --git a/src/retrieval/strategy/keyword.rs b/src/retrieval/strategy/keyword.rs index bfb34a68..7e505f0e 100644 --- a/src/retrieval/strategy/keyword.rs +++ b/src/retrieval/strategy/keyword.rs @@ -11,7 +11,7 @@ use std::collections::{HashMap, HashSet}; use super::super::RetrievalContext; use super::super::types::{NavigationDecision, QueryComplexity}; use super::r#trait::{NodeEvaluation, RetrievalStrategy, StrategyCapabilities}; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; /// Keyword-based retrieval strategy. /// diff --git a/src/retrieval/strategy/llm.rs b/src/retrieval/strategy/llm.rs index 7a3ed89e..c1ca5037 100644 --- a/src/retrieval/strategy/llm.rs +++ b/src/retrieval/strategy/llm.rs @@ -11,7 +11,7 @@ use serde::Deserialize; use super::super::RetrievalContext; use super::super::types::{NavigationDecision, QueryComplexity}; use super::r#trait::{NodeEvaluation, RetrievalStrategy, StrategyCapabilities}; -use crate::domain::{DocumentTree, NodeId, TocView}; +use crate::document::{DocumentTree, NodeId, TocView}; use crate::llm::LlmClient; /// LLM response for navigation decision. diff --git a/src/retrieval/strategy/semantic.rs b/src/retrieval/strategy/semantic.rs index 170e7998..1e924538 100644 --- a/src/retrieval/strategy/semantic.rs +++ b/src/retrieval/strategy/semantic.rs @@ -11,7 +11,7 @@ use super::super::RetrievalContext; use super::super::types::{NavigationDecision, QueryComplexity}; use super::r#trait::{NodeEvaluation, RetrievalStrategy, StrategyCapabilities}; use crate::config::StrategyConfig; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; /// Embedding model trait for semantic strategies. #[async_trait] diff --git a/src/retrieval/strategy/trait.rs b/src/retrieval/strategy/trait.rs index 3699a128..895d60a2 100644 --- a/src/retrieval/strategy/trait.rs +++ b/src/retrieval/strategy/trait.rs @@ -7,7 +7,7 @@ use async_trait::async_trait; use super::super::RetrievalContext; use super::super::types::{NavigationDecision, QueryComplexity}; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; /// Result of evaluating a single node. #[derive(Debug, Clone)] diff --git a/src/retrieval/types.rs b/src/retrieval/types.rs index 2077f325..82ee5504 100644 --- a/src/retrieval/types.rs +++ b/src/retrieval/types.rs @@ -6,7 +6,7 @@ use serde::{Deserialize, Serialize}; use super::context::{PruningStrategy, TokenEstimation}; -use crate::domain::NodeId; +use crate::document::NodeId; /// Query complexity level for adaptive strategy selection. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
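The storage module that follows is the first new consumer of the reorganized crate layout, so it is worth seeing what the `domain` split above means for downstream code in one place. A sketch; the `&str -> usize` shape of `estimate_tokens` is an assumption inferred from its call sites in this diff, and `Error::Config` is the variant the orchestrators use:

```rust
// Before 0.1.13 (removed by the lib.rs hunk above):
// use vectorless::domain::{Error, Result, estimate_tokens};

// After: focused modules, with Error/Result also re-exported at the crate root.
use vectorless::util::estimate_tokens;
use vectorless::{Error, Result};

// Assumption: estimate_tokens takes &str and returns usize, matching the
// parser and budget call sites elsewhere in this diff.
fn check_budget(content: &str, budget: usize) -> Result<usize> {
    let tokens = estimate_tokens(content);
    if tokens > budget {
        return Err(Error::Config(format!(
            "content needs {} tokens, budget is {}",
            tokens, budget
        )));
    }
    Ok(tokens)
}
```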
diff --git a/src/storage/async_workspace.rs b/src/storage/async_workspace.rs new file mode 100644 index 00000000..56c43373 --- /dev/null +++ b/src/storage/async_workspace.rs @@ -0,0 +1,586 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Async workspace management for document collections. +//! +//! This module provides an async version of [`Workspace`](super::Workspace) +//! for integration with async runtimes like Tokio. +//! +//! # Example +//! +//! ```rust,ignore +//! use vectorless::storage::AsyncWorkspace; +//! +//! #[tokio::main] +//! async fn main() -> Result<()> { +//! let mut workspace = AsyncWorkspace::new("./workspace").await?; +//! +//! // Add a document +//! workspace.add(&doc).await?; +//! +//! // Load with caching +//! let loaded = workspace.load("doc-1").await?; +//! +//! Ok(()) +//! } +//! ``` + +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; +use tokio::sync::RwLock; +use tracing::{debug, info, warn}; + +use super::backend::{FileBackend, StorageBackend}; +use super::cache::DocumentCache; +use super::persistence::{PersistedDocument, load_document_from_bytes, save_document_to_bytes}; +use crate::error::Result; +use crate::Error; + +const META_KEY: &str = "_meta"; +const DEFAULT_CACHE_SIZE: usize = 100; + +/// Lightweight metadata entry for the async workspace index. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AsyncDocumentMetaEntry { + /// Document ID. + pub id: String, + /// Document name/title. + pub doc_name: String, + /// Document description. + #[serde(default)] + pub doc_description: Option<String>, + /// Document type (pdf, md, etc.). + pub doc_type: String, + /// Source file path. + #[serde(default)] + pub path: Option<String>, + /// Page count (for PDFs). + #[serde(skip_serializing_if = "Option::is_none")] + pub page_count: Option<usize>, + /// Line count (for markdown). + #[serde(skip_serializing_if = "Option::is_none")] + pub line_count: Option<usize>, +} + +/// Options for async workspace creation. +#[derive(Debug, Clone)] +pub struct AsyncWorkspaceOptions { + /// LRU cache size (default: 100). + pub cache_size: usize, +} + +impl Default for AsyncWorkspaceOptions { + fn default() -> Self { + Self { + cache_size: DEFAULT_CACHE_SIZE, + } + } +} + +impl AsyncWorkspaceOptions { + /// Create new options with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Set the cache size. + pub fn with_cache_size(mut self, size: usize) -> Self { + self.cache_size = size; + self + } +} + +/// Inner state for the async workspace. +struct AsyncWorkspaceInner { + /// Storage backend. + backend: Arc<dyn StorageBackend>, + /// Root path (for file-based backends). + root: Option<PathBuf>, + /// Document metadata index. + meta_index: HashMap<String, AsyncDocumentMetaEntry>, + /// LRU cache for loaded documents. + cache: DocumentCache, +} + +/// An async workspace for managing indexed documents. +/// +/// Uses `tokio::sync::RwLock` for async-safe concurrent access. +/// All operations are async and can be safely called from multiple tasks. +/// +/// # Thread Safety +/// +/// The async workspace is fully thread-safe and can be cloned cheaply +/// (it uses `Arc` internally). 
+#[derive(Clone)] +pub struct AsyncWorkspace { + inner: Arc<RwLock<AsyncWorkspaceInner>>, +} + +impl std::fmt::Debug for AsyncWorkspace { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AsyncWorkspace") + .finish() + } +} + +impl AsyncWorkspace { + /// Create a new async workspace with a storage backend. + pub async fn with_backend(backend: Arc<dyn StorageBackend>) -> Result<Self> { + Self::with_backend_and_options(backend, AsyncWorkspaceOptions::default()).await + } + + /// Create an async workspace with backend and options. + pub async fn with_backend_and_options( + backend: Arc<dyn StorageBackend>, + options: AsyncWorkspaceOptions, + ) -> Result<Self> { + let mut inner = AsyncWorkspaceInner { + backend, + root: None, + meta_index: HashMap::new(), + cache: DocumentCache::with_capacity(options.cache_size), + }; + + Self::load_meta_index(&mut inner)?; + + Ok(Self { + inner: Arc::new(RwLock::new(inner)), + }) + } + + /// Create a new file-based async workspace at the given path. + pub async fn new(path: impl Into<PathBuf>) -> Result<Self> { + Self::with_options(path, AsyncWorkspaceOptions::default()).await + } + + /// Create a new async workspace with custom cache size. + pub async fn with_cache_size(path: impl Into<PathBuf>, cache_size: usize) -> Result<Self> { + Self::with_options(path, AsyncWorkspaceOptions { + cache_size, + ..Default::default() + }).await + } + + /// Create a new async workspace with custom options. + pub async fn with_options(path: impl Into<PathBuf>, options: AsyncWorkspaceOptions) -> Result<Self> { + let root = path.into(); + let backend = Arc::new(FileBackend::new(&root)?); + + let mut inner = AsyncWorkspaceInner { + backend, + root: Some(root), + meta_index: HashMap::new(), + cache: DocumentCache::with_capacity(options.cache_size), + }; + + Self::load_meta_index(&mut inner)?; + + Ok(Self { + inner: Arc::new(RwLock::new(inner)), + }) + } + + /// Get the workspace root path (if file-based). + pub async fn path(&self) -> Option<PathBuf> { + let inner = self.inner.read().await; + inner.root.clone() + } + + /// List all document IDs in the workspace. + pub async fn list_documents(&self) -> Vec<String> { + let inner = self.inner.read().await; + inner.meta_index.keys().cloned().collect() + } + + /// Get metadata for a document. + pub async fn get_meta(&self, id: &str) -> Option<AsyncDocumentMetaEntry> { + let inner = self.inner.read().await; + inner.meta_index.get(id).cloned() + } + + /// Check if a document exists. + pub async fn contains(&self, id: &str) -> bool { + let inner = self.inner.read().await; + inner.meta_index.contains_key(id) + } + + /// Add a document to the workspace. + pub async fn add(&self, doc: &PersistedDocument) -> Result<()> { + let mut inner = self.inner.write().await; + + let doc_id = doc.meta.id.clone(); + let key = Self::doc_key(&doc_id); + + // Serialize and save via backend + let bytes = save_document_to_bytes(doc)?; + inner.backend.put(&key, &bytes)?; + + // Update meta index + let meta_entry = AsyncDocumentMetaEntry { + id: doc_id.clone(), + doc_name: doc.meta.name.clone(), + doc_description: doc.meta.description.clone(), + doc_type: doc.meta.format.clone(), + path: doc + .meta + .source_path + .as_ref() + .map(|p| p.to_string_lossy().to_string()), + page_count: if doc.pages.is_empty() { None } else { Some(doc.pages.len()) }, + line_count: doc.meta.line_count, + }; + + inner.meta_index.insert(doc_id.clone(), meta_entry); + Self::save_meta_index(&inner)?; + + // Remove from cache if present + let _ = inner.cache.remove(&doc_id); + + info!("Saved document {} to async workspace", doc_id); + Ok(()) + } + + /// Load a document from the workspace. 
+ /// + /// Uses LRU cache: returns cached version if available, + /// otherwise loads from backend and caches it. + pub async fn load(&self, id: &str) -> Result<Option<PersistedDocument>> { + // First check if document exists (read lock) + { + let inner = self.inner.read().await; + if !inner.meta_index.contains_key(id) { + return Ok(None); + } + + // Check LRU cache + if let Some(cached) = inner.cache.get(id)? { + debug!("Cache hit for document {}", id); + return Ok(Some(cached)); + } + } + + // Load from backend (need read lock for backend access) + let inner = self.inner.read().await; + let key = Self::doc_key(id); + + match inner.backend.get(&key)? { + Some(bytes) => { + let doc = load_document_from_bytes(&bytes)?; + + // Note: We can't modify the cache with only a read lock + // For now, we return the document without caching + // A more sophisticated implementation would use a separate cache structure + + debug!("Loaded document {} from backend", id); + Ok(Some(doc)) + } + None => { + warn!("Document {} in meta index but not in backend", id); + Ok(None) + } + } + } + + /// Load a document and cache it (requires write lock for caching). + pub async fn load_and_cache(&self, id: &str) -> Result<Option<PersistedDocument>> { + // First check if document exists (read lock) + { + let inner = self.inner.read().await; + if !inner.meta_index.contains_key(id) { + return Ok(None); + } + + // Check LRU cache + if let Some(cached) = inner.cache.get(id)? { + debug!("Cache hit for document {}", id); + return Ok(Some(cached)); + } + } + + // Load from backend and cache (write lock) + let inner = self.inner.write().await; + let key = Self::doc_key(id); + + match inner.backend.get(&key)? { + Some(bytes) => { + let doc = load_document_from_bytes(&bytes)?; + + // Add to cache + inner.cache.put(id.to_string(), doc.clone())?; + + debug!("Loaded and cached document {}", id); + Ok(Some(doc)) + } + None => { + warn!("Document {} in meta index but not in backend", id); + Ok(None) + } + } + } + + /// Remove a document from the workspace. + pub async fn remove(&self, id: &str) -> Result<bool> { + let mut inner = self.inner.write().await; + + if !inner.meta_index.contains_key(id) { + return Ok(false); + } + + let key = Self::doc_key(id); + inner.backend.delete(&key)?; + + inner.meta_index.remove(id); + + // Remove from cache + let _ = inner.cache.remove(id); + + Self::save_meta_index(&inner)?; + + info!("Removed document {} from async workspace", id); + Ok(true) + } + + /// Get the number of documents in the workspace. + pub async fn len(&self) -> usize { + let inner = self.inner.read().await; + inner.meta_index.len() + } + + /// Check if the workspace is empty. + pub async fn is_empty(&self) -> bool { + let inner = self.inner.read().await; + inner.meta_index.is_empty() + } + + /// Get the number of items currently in the LRU cache. + pub async fn cache_len(&self) -> usize { + let inner = self.inner.read().await; + inner.cache.len() + } + + /// Get cache utilization (0.0 to 1.0). + pub async fn cache_utilization(&self) -> f64 { + let inner = self.inner.read().await; + inner.cache.utilization() + } + + /// Get cache statistics. + pub async fn cache_stats(&self) -> super::cache::CacheStats { + let inner = self.inner.read().await; + inner.cache.stats() + } + + /// Clear the LRU cache. + pub async fn clear_cache(&self) -> Result<()> { + let inner = self.inner.write().await; + inner.cache.clear()?; + debug!("Cleared async document cache"); + Ok(()) + } + + /// Get the storage key for a document. 
+ fn doc_key(id: &str) -> String { + format!("doc:{}", id) + } + + /// Load the meta index from backend. + fn load_meta_index(inner: &mut AsyncWorkspaceInner) -> Result<()> { + match inner.backend.get(META_KEY)? { + Some(bytes) => { + let meta: HashMap<String, AsyncDocumentMetaEntry> = serde_json::from_slice(&bytes) + .map_err(|e| Error::Parse(format!("Failed to parse meta index: {}", e)))?; + inner.meta_index = meta; + info!( + "Loaded {} document(s) from async workspace index", + inner.meta_index.len() + ); + } + None => { + // Try to rebuild from existing keys + Self::rebuild_meta_index(inner)?; + } + } + Ok(()) + } + + /// Save the meta index to backend. + fn save_meta_index(inner: &AsyncWorkspaceInner) -> Result<()> { + let bytes = serde_json::to_vec_pretty(&inner.meta_index) + .map_err(|e| Error::Parse(format!("Failed to serialize meta index: {}", e)))?; + inner.backend.put(META_KEY, &bytes)?; + Ok(()) + } + + /// Rebuild the meta index from existing documents. + fn rebuild_meta_index(inner: &mut AsyncWorkspaceInner) -> Result<()> { + let keys = inner.backend.keys()?; + let doc_keys: Vec<_> = keys + .iter() + .filter(|k| k.starts_with("doc:")) + .collect(); + + for key in doc_keys { + if let Some(bytes) = inner.backend.get(key)? { + if let Ok(doc) = load_document_from_bytes(&bytes) { + let doc_id = doc.meta.id.clone(); + let meta_entry = AsyncDocumentMetaEntry { + id: doc_id.clone(), + doc_name: doc.meta.name, + doc_description: doc.meta.description, + doc_type: doc.meta.format, + path: doc + .meta + .source_path + .as_ref() + .map(|p| p.to_string_lossy().to_string()), + page_count: if doc.pages.is_empty() { None } else { Some(doc.pages.len()) }, + line_count: doc.meta.line_count, + }; + inner.meta_index.insert(doc_id, meta_entry); + } + } + } + + if !inner.meta_index.is_empty() { + Self::save_meta_index(inner)?; + info!( + "Rebuilt async index from {} document(s)", + inner.meta_index.len() + ); + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::document::DocumentTree; + + fn create_test_doc(id: &str) -> PersistedDocument { + let meta = super::super::persistence::DocumentMeta::new(id, "Test Doc", "md"); + let tree = DocumentTree::new("Root", "Content"); + PersistedDocument::new(meta, tree) + } + + #[tokio::test] + async fn test_async_workspace_create() { + let backend = Arc::new(super::super::backend::MemoryBackend::new()); + let workspace = AsyncWorkspace::with_backend(backend).await.unwrap(); + + assert!(workspace.is_empty().await); + assert_eq!(workspace.len().await, 0); + } + + #[tokio::test] + async fn test_async_workspace_add_and_load() { + let backend = Arc::new(super::super::backend::MemoryBackend::new()); + let workspace = AsyncWorkspace::with_backend(backend).await.unwrap(); + + let doc = create_test_doc("doc-1"); + workspace.add(&doc).await.unwrap(); + + assert_eq!(workspace.len().await, 1); + assert!(workspace.contains("doc-1").await); + + let loaded = workspace.load("doc-1").await.unwrap(); + assert!(loaded.is_some()); + assert_eq!(loaded.unwrap().meta.id, "doc-1"); + } + + #[tokio::test] + async fn test_async_workspace_remove() { + let backend = Arc::new(super::super::backend::MemoryBackend::new()); + let workspace = AsyncWorkspace::with_backend(backend).await.unwrap(); + + let doc = create_test_doc("doc-1"); + workspace.add(&doc).await.unwrap(); + + let removed = workspace.remove("doc-1").await.unwrap(); + assert!(removed); + assert!(workspace.is_empty().await); + + let removed_again = workspace.remove("doc-1").await.unwrap(); + assert!(!removed_again); + } + + 
#[tokio::test] + async fn test_async_workspace_cache() { + let backend = Arc::new(super::super::backend::MemoryBackend::new()); + let workspace = AsyncWorkspace::with_backend(backend).await.unwrap(); + + let doc = create_test_doc("doc-1"); + workspace.add(&doc).await.unwrap(); + + // First load with caching + let _ = workspace.load_and_cache("doc-1").await.unwrap(); + let stats = workspace.cache_stats().await; + assert_eq!(stats.misses, 1); + + // Second load should hit cache + let _ = workspace.load_and_cache("doc-1").await.unwrap(); + let stats = workspace.cache_stats().await; + assert_eq!(stats.hits, 1); + } + + #[tokio::test] + async fn test_async_workspace_list_documents() { + let backend = Arc::new(super::super::backend::MemoryBackend::new()); + let workspace = AsyncWorkspace::with_backend(backend).await.unwrap(); + + workspace.add(&create_test_doc("doc-1")).await.unwrap(); + workspace.add(&create_test_doc("doc-2")).await.unwrap(); + workspace.add(&create_test_doc("doc-3")).await.unwrap(); + + let docs = workspace.list_documents().await; + assert_eq!(docs.len(), 3); + } + + #[tokio::test] + async fn test_async_workspace_get_meta() { + let backend = Arc::new(super::super::backend::MemoryBackend::new()); + let workspace = AsyncWorkspace::with_backend(backend).await.unwrap(); + + let doc = create_test_doc("doc-1"); + workspace.add(&doc).await.unwrap(); + + let meta = workspace.get_meta("doc-1").await; + assert!(meta.is_some()); + let meta = meta.unwrap(); + assert_eq!(meta.id, "doc-1"); + assert_eq!(meta.doc_name, "Test Doc"); + assert_eq!(meta.doc_type, "md"); + } + + #[tokio::test] + async fn test_async_workspace_concurrent_access() { + let backend = Arc::new(super::super::backend::MemoryBackend::new()); + let workspace = Arc::new(AsyncWorkspace::with_backend(backend).await.unwrap()); + + // Spawn multiple concurrent tasks + let mut handles = vec![]; + + for i in 0..10 { + let ws = workspace.clone(); + let handle = tokio::spawn(async move { + let id = format!("doc-{}", i); + let doc = create_test_doc(&id); + ws.add(&doc).await.unwrap(); + let loaded = ws.load(&id).await.unwrap(); + assert!(loaded.is_some()); + }); + handles.push(handle); + } + + // Wait for all tasks + for handle in handles { + handle.await.unwrap(); + } + + assert_eq!(workspace.len().await, 10); + } +}
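With `FileBackend` defined below and `MemoryBackend` after it, the `Arc<dyn StorageBackend>` field above lets callers pick a backend per environment, exactly as the tests do. A sketch of that swap; the public re-export path of the backend types is an assumption (the diff only shows them under `src/storage/backend/`):

```rust
use std::sync::Arc;

use vectorless::storage::AsyncWorkspace;
// Assumed public paths; the diff defines these under src/storage/backend/.
use vectorless::storage::backend::{FileBackend, MemoryBackend, StorageBackend};

async fn open_workspace(ephemeral: bool) -> vectorless::Result<AsyncWorkspace> {
    // Unsized coercion turns either concrete backend into Arc<dyn StorageBackend>.
    let backend: Arc<dyn StorageBackend> = if ephemeral {
        Arc::new(MemoryBackend::new())
    } else {
        Arc::new(FileBackend::new("./workspace")?)
    };
    AsyncWorkspace::with_backend(backend).await
}
```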
diff --git a/src/storage/backend/file.rs b/src/storage/backend/file.rs new file mode 100644 index 00000000..915d0b4c --- /dev/null +++ b/src/storage/backend/file.rs @@ -0,0 +1,295 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! File system storage backend. + +use std::fs; +use std::path::{Path, PathBuf}; +use std::sync::RwLock; + +use tracing::{debug, warn}; + +use super::StorageBackend; +use crate::error::Result; +use crate::Error; + +/// File system storage backend. +/// +/// Stores each key-value pair as a separate file in a directory. +/// The key is used as the filename (with `.bin` extension). +/// +/// # Structure +/// +/// ```text +/// workspace/ +/// ├── doc-1.bin # Document 1 +/// ├── doc-2.bin # Document 2 +/// ├── _meta.json # Metadata index +/// └── .workspace.lock # Lock file +/// ``` +/// +/// # Thread Safety +/// +/// Uses `RwLock` for thread-safe operations on the directory listing cache. +#[derive(Debug)] +pub struct FileBackend { + /// Root directory for storage. + root: PathBuf, + /// Cached directory listing (refreshed on miss). + cache: RwLock<Option<Vec<String>>>, +} + +impl FileBackend { + /// Create a new file backend at the given path. + /// + /// Creates the directory if it doesn't exist. + pub fn new(path: impl Into<PathBuf>) -> Result<Self> { + let root = path.into(); + fs::create_dir_all(&root).map_err(Error::Io)?; + + Ok(Self { + root, + cache: RwLock::new(None), + }) + } + + /// Open an existing file backend. + /// + /// Creates the directory if it doesn't exist. + pub fn open(path: impl Into<PathBuf>) -> Result<Self> { + Self::new(path) + } + + /// Get the root path. + pub fn root(&self) -> &Path { + &self.root + } + + /// Convert a key to a file path. + fn key_to_path(&self, key: &str) -> PathBuf { + // Sanitize key to prevent path traversal + let sanitized = key + .replace("..", "_") + .replace(['/', '\\', ':'], "_"); + self.root.join(format!("{}.bin", sanitized)) + } + + /// Refresh the directory listing cache. + fn refresh_cache(&self) -> Result<Vec<String>> { + let entries: Vec<String> = fs::read_dir(&self.root) + .map_err(Error::Io)? + .filter_map(|entry| entry.ok()) + .filter_map(|entry| { + let path = entry.path(); + if path.extension()?.to_str()? == "bin" { + path.file_stem()?.to_str().map(|s| s.to_string()) + } else { + None + } + }) + .collect(); + + // Update cache + if let Ok(mut cache) = self.cache.write() { + *cache = Some(entries.clone()); + } + + Ok(entries) + } + + /// Get cached keys or refresh cache. + fn get_keys(&self) -> Result<Vec<String>> { + // Try to read from cache first + if let Ok(cache) = self.cache.read() { + if let Some(ref keys) = *cache { + return Ok(keys.clone()); + } + } + + // Refresh cache + self.refresh_cache() + } + + /// Invalidate the cache. + pub fn invalidate_cache(&self) { + if let Ok(mut cache) = self.cache.write() { + *cache = None; + } + } +} + +impl StorageBackend for FileBackend { + fn get(&self, key: &str) -> Result<Option<Vec<u8>>> { + let path = self.key_to_path(key); + + if !path.exists() { + return Ok(None); + } + + let data = fs::read(&path).map_err(Error::Io)?; + debug!("Read {} bytes from {}", data.len(), key); + + Ok(Some(data)) + } + + fn put(&self, key: &str, value: &[u8]) -> Result<()> { + let path = self.key_to_path(key); + + // Use atomic write (temp file + rename) + let temp_path = path.with_extension("tmp"); + + fs::write(&temp_path, value).map_err(Error::Io)?; + fs::rename(&temp_path, &path).map_err(Error::Io)?; + + // Invalidate cache + self.invalidate_cache(); + + debug!("Wrote {} bytes to {}", value.len(), key); + Ok(()) + } + + fn delete(&self, key: &str) -> Result<bool> { + let path = self.key_to_path(key); + + if !path.exists() { + return Ok(false); + } + + fs::remove_file(&path).map_err(Error::Io)?; + + // Invalidate cache + self.invalidate_cache(); + + debug!("Deleted {}", key); + Ok(true) + } + + fn exists(&self, key: &str) -> Result<bool> { + let path = self.key_to_path(key); + Ok(path.exists()) + } + + fn keys(&self) -> Result<Vec<String>> { + self.get_keys() + } + + fn len(&self) -> Result<usize> { + Ok(self.get_keys()?.len()) + } + + fn clear(&self) -> Result<()> { + let keys = self.get_keys()?; + + for key in &keys { + let path = self.key_to_path(key); + if path.exists() { + fs::remove_file(&path).map_err(Error::Io)?; + } + } + + // Clear cache + if let Ok(mut cache) = self.cache.write() { + *cache = None; + } + + debug!("Cleared {} entries", keys.len()); + Ok(()) + } + + fn backend_name(&self) -> &'static str { + "file" + } + + fn batch_put(&self, items: &[(&str, &[u8])]) -> Result<()> { + for (key, value) in items { + self.put(key, value)?; + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn test_file_backend_basic() { + let temp = TempDir::new().unwrap(); + let backend = 
FileBackend::new(temp.path()).unwrap(); + + // Put and get + backend.put("key1", b"value1").unwrap(); + let value = backend.get("key1").unwrap(); + assert_eq!(value, Some(b"value1".to_vec())); + + // Exists + assert!(backend.exists("key1").unwrap()); + assert!(!backend.exists("key2").unwrap()); + + // Delete + assert!(backend.delete("key1").unwrap()); + assert!(!backend.exists("key1").unwrap()); + assert!(!backend.delete("key1").unwrap()); // Already deleted + } + + #[test] + fn test_file_backend_keys() { + let temp = TempDir::new().unwrap(); + let backend = FileBackend::new(temp.path()).unwrap(); + + backend.put("key1", b"v1").unwrap(); + backend.put("key2", b"v2").unwrap(); + backend.put("key3", b"v3").unwrap(); + + let keys = backend.keys().unwrap(); + assert_eq!(keys.len(), 3); + assert!(keys.contains(&"key1".to_string())); + } + + #[test] + fn test_file_backend_clear() { + let temp = TempDir::new().unwrap(); + let backend = FileBackend::new(temp.path()).unwrap(); + + backend.put("key1", b"v1").unwrap(); + backend.put("key2", b"v2").unwrap(); + + backend.clear().unwrap(); + + assert!(backend.is_empty().unwrap()); + } + + #[test] + fn test_file_backend_batch() { + let temp = TempDir::new().unwrap(); + let backend = FileBackend::new(temp.path()).unwrap(); + + let items: Vec<(&str, &[u8])> = vec![ + ("k1", b"v1".as_slice()), + ("k2", b"v2".as_slice()), + ("k3", b"v3".as_slice()), + ]; + + backend.batch_put(&items).unwrap(); + + let results = backend.batch_get(&["k1", "k2", "k3", "k4"]).unwrap(); + assert_eq!(results.len(), 4); + assert!(results[0].is_some()); + assert!(results[3].is_none()); + } + + #[test] + fn test_file_backend_key_sanitization() { + let temp = TempDir::new().unwrap(); + let backend = FileBackend::new(temp.path()).unwrap(); + + // Keys with special characters should be sanitized + backend.put("../etc/passwd", b"malicious").unwrap(); + backend.put("path/to/file", b"nested").unwrap(); + + // Both should be stored safely + assert!(backend.exists("../etc/passwd").unwrap()); + assert!(backend.exists("path/to/file").unwrap()); + } +} diff --git a/src/storage/backend/memory.rs b/src/storage/backend/memory.rs new file mode 100644 index 00000000..013c87f9 --- /dev/null +++ b/src/storage/backend/memory.rs @@ -0,0 +1,173 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! In-memory storage backend (for testing). + +use std::collections::HashMap; +use std::sync::RwLock; + +use super::StorageBackend; +use crate::error::Result; + +/// In-memory storage backend. +/// +/// Stores all data in a `HashMap`. Useful for testing and scenarios +/// where persistence is not required. +/// +/// # Thread Safety +/// +/// Uses `RwLock` for thread-safe access to the internal map. +#[derive(Debug, Default)] +pub struct MemoryBackend { + /// Internal storage. + data: RwLock>>, +} + +impl MemoryBackend { + /// Create a new in-memory backend. + pub fn new() -> Self { + Self::default() + } + + /// Create a new in-memory backend with pre-seeded data. 
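+    ///
+    /// # Example
+    ///
+    /// A minimal seeding sketch (illustrative, not part of the original diff):
+    ///
+    /// ```rust,ignore
+    /// use std::collections::HashMap;
+    ///
+    /// let mut seed = HashMap::new();
+    /// seed.insert("doc-1".to_string(), b"payload".to_vec());
+    /// let backend = MemoryBackend::with_data(seed);
+    /// assert_eq!(backend.len()?, 1);
+    /// ```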
+    pub fn with_data(data: HashMap<String, Vec<u8>>) -> Self {
+        Self {
+            data: RwLock::new(data),
+        }
+    }
+}
+
+impl StorageBackend for MemoryBackend {
+    fn get(&self, key: &str) -> Result<Option<Vec<u8>>> {
+        let data = self.data.read().map_err(|_| {
+            crate::Error::Cache("Memory backend lock poisoned".to_string())
+        })?;
+        Ok(data.get(key).cloned())
+    }
+
+    fn put(&self, key: &str, value: &[u8]) -> Result<()> {
+        let mut data = self.data.write().map_err(|_| {
+            crate::Error::Cache("Memory backend lock poisoned".to_string())
+        })?;
+        data.insert(key.to_string(), value.to_vec());
+        Ok(())
+    }
+
+    fn delete(&self, key: &str) -> Result<bool> {
+        let mut data = self.data.write().map_err(|_| {
+            crate::Error::Cache("Memory backend lock poisoned".to_string())
+        })?;
+        Ok(data.remove(key).is_some())
+    }
+
+    fn exists(&self, key: &str) -> Result<bool> {
+        let data = self.data.read().map_err(|_| {
+            crate::Error::Cache("Memory backend lock poisoned".to_string())
+        })?;
+        Ok(data.contains_key(key))
+    }
+
+    fn keys(&self) -> Result<Vec<String>> {
+        let data = self.data.read().map_err(|_| {
+            crate::Error::Cache("Memory backend lock poisoned".to_string())
+        })?;
+        Ok(data.keys().cloned().collect())
+    }
+
+    fn len(&self) -> Result<usize> {
+        let data = self.data.read().map_err(|_| {
+            crate::Error::Cache("Memory backend lock poisoned".to_string())
+        })?;
+        Ok(data.len())
+    }
+
+    fn clear(&self) -> Result<()> {
+        let mut data = self.data.write().map_err(|_| {
+            crate::Error::Cache("Memory backend lock poisoned".to_string())
+        })?;
+        data.clear();
+        Ok(())
+    }
+
+    fn batch_put(&self, items: &[(&str, &[u8])]) -> Result<()> {
+        let mut data = self.data.write().map_err(|_| {
+            crate::Error::Cache("Memory backend lock poisoned".to_string())
+        })?;
+        for (key, value) in items {
+            data.insert(key.to_string(), value.to_vec());
+        }
+        Ok(())
+    }
+
+    fn backend_name(&self) -> &'static str {
+        "memory"
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_memory_backend_basic() {
+        let backend = MemoryBackend::new();
+
+        // Put and get
+        backend.put("key1", b"value1").unwrap();
+        let value = backend.get("key1").unwrap();
+        assert_eq!(value, Some(b"value1".to_vec()));
+
+        // Non-existent key
+        let missing = backend.get("missing").unwrap();
+        assert!(missing.is_none());
+    }
+
+    #[test]
+    fn test_memory_backend_delete() {
+        let backend = MemoryBackend::new();
+
+        backend.put("key1", b"value1").unwrap();
+        assert!(backend.exists("key1").unwrap());
+
+        let deleted = backend.delete("key1").unwrap();
+        assert!(deleted);
+        assert!(!backend.exists("key1").unwrap());
+
+        // Delete non-existent
+        let not_deleted = backend.delete("missing").unwrap();
+        assert!(!not_deleted);
+    }
+
+    #[test]
+    fn test_memory_backend_keys() {
+        let backend = MemoryBackend::new();
+
+        backend.put("key1", b"v1").unwrap();
+        backend.put("key2", b"v2").unwrap();
+        backend.put("key3", b"v3").unwrap();
+
+        let keys = backend.keys().unwrap();
+        assert_eq!(keys.len(), 3);
+    }
+
+    #[test]
+    fn test_memory_backend_clear() {
+        let backend = MemoryBackend::new();
+
+        backend.put("key1", b"v1").unwrap();
+        backend.put("key2", b"v2").unwrap();
+
+        backend.clear().unwrap();
+        assert!(backend.is_empty().unwrap());
+    }
+
+    #[test]
+    fn test_memory_backend_with_data() {
+        let mut initial = HashMap::new();
+        initial.insert("k1".to_string(), b"v1".to_vec());
+        initial.insert("k2".to_string(), b"v2".to_vec());
+
+        let backend = MemoryBackend::with_data(initial);
+        assert_eq!(backend.len().unwrap(), 2);
+    }
+}
diff --git a/src/storage/backend/mod.rs b/src/storage/backend/mod.rs
new file mode 100644
index 00000000..b8d7ccef
--- /dev/null
+++ b/src/storage/backend/mod.rs
@@ -0,0 +1,35 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Storage backend abstraction.
+//!
+//! This module provides a trait-based abstraction for different storage backends,
+//! allowing the workspace to work with various storage systems:
+//!
+//! - **FileBackend**: File system storage (default)
+//! - **MemoryBackend**: In-memory storage (for testing)
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! use vectorless::storage::backend::{StorageBackend, FileBackend};
+//!
+//! let backend = FileBackend::new("./workspace")?;
+//!
+//! // Store data
+//! backend.put("doc-1", b"document data")?;
+//!
+//! // Retrieve data
+//! let data = backend.get("doc-1")?;
+//!
+//! // List all keys
+//! let keys = backend.keys()?;
+//! ```
+
+mod file;
+mod memory;
+mod trait_def;
+
+pub use file::FileBackend;
+pub use memory::MemoryBackend;
+pub use trait_def::StorageBackend;
diff --git a/src/storage/backend/trait_def.rs b/src/storage/backend/trait_def.rs
new file mode 100644
index 00000000..782bdac0
--- /dev/null
+++ b/src/storage/backend/trait_def.rs
@@ -0,0 +1,113 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Storage backend trait definition.
+
+use std::fmt::Debug;
+
+use crate::error::Result;
+
+/// Storage backend trait for abstracting different storage systems.
+///
+/// This trait provides a simple key-value interface for document storage.
+/// Implementations can use different underlying storage systems:
+///
+/// - File system
+/// - In-memory (for testing)
+/// - Database (SQLite, RocksDB, etc.)
+/// - Cloud storage (S3, etc.)
+///
+/// # Thread Safety
+///
+/// All implementations must be `Send + Sync` to support concurrent access.
+pub trait StorageBackend: Debug + Send + Sync {
+    /// Get a value by key.
+    ///
+    /// Returns `None` if the key doesn't exist.
+    fn get(&self, key: &str) -> Result<Option<Vec<u8>>>;
+
+    /// Store a value with the given key.
+    ///
+    /// Overwrites any existing value.
+    fn put(&self, key: &str, value: &[u8]) -> Result<()>;
+
+    /// Delete a value by key.
+    ///
+    /// Returns `true` if the value was deleted, `false` if it didn't exist.
+    fn delete(&self, key: &str) -> Result<bool>;
+
+    /// Check if a key exists.
+    fn exists(&self, key: &str) -> Result<bool>;
+
+    /// List all keys in the storage.
+    fn keys(&self) -> Result<Vec<String>>;
+
+    /// Get the number of entries in storage.
+    fn len(&self) -> Result<usize>;
+
+    /// Check if storage is empty.
+    fn is_empty(&self) -> Result<bool> {
+        Ok(self.len()? == 0)
+    }
+
+    /// Clear all entries from storage.
+    fn clear(&self) -> Result<()>;
+
+    // ========================================================================
+    // Batch operations (optional, default implementations)
+    // ========================================================================
+
+    /// Get multiple values by keys.
+    ///
+    /// Returns a vector of options, one for each key.
+    fn batch_get(&self, keys: &[&str]) -> Result<Vec<Option<Vec<u8>>>> {
+        keys.iter().map(|k| self.get(k)).collect()
+    }
+
+    /// Store multiple key-value pairs.
+    ///
+    /// Default implementation calls `put` for each item.
+    fn batch_put(&self, items: &[(&str, &[u8])]) -> Result<()> {
+        for (key, value) in items {
+            self.put(key, value)?;
+        }
+        Ok(())
+    }
+
+    /// Delete multiple keys.
+    ///
+    /// Returns the number of keys that were actually deleted.
+    fn batch_delete(&self, keys: &[&str]) -> Result<usize> {
+        let mut count = 0;
+        for key in keys {
+            if self.delete(key)? {
+                count += 1;
+            }
+        }
+        Ok(count)
+    }
+
+    // ========================================================================
+    // Metadata operations
+    // ========================================================================
+
+    /// Get storage backend name.
+    fn backend_name(&self) -> &'static str;
+
+    /// Get storage statistics.
+    fn stats(&self) -> StorageStats {
+        StorageStats {
+            backend: self.backend_name().to_string(),
+            entries: self.len().unwrap_or(0),
+        }
+    }
+}
+
+/// Storage statistics.
+#[derive(Debug, Clone)]
+pub struct StorageStats {
+    /// Backend name.
+    pub backend: String,
+    /// Number of entries.
+    pub entries: usize,
+}
diff --git a/src/storage/cache.rs b/src/storage/cache.rs
new file mode 100644
index 00000000..4e7e6a57
--- /dev/null
+++ b/src/storage/cache.rs
@@ -0,0 +1,370 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Document cache with LRU eviction policy.
+//!
+//! This module provides a thread-safe LRU cache for loaded documents,
+//! allowing efficient reuse of loaded document data while limiting memory usage.
+//!
+//! # Metrics
+//!
+//! The cache tracks:
+//! - Hits: Number of successful cache lookups
+//! - Misses: Number of failed cache lookups
+//! - Evictions: Number of entries evicted due to capacity
+//! - Utilization: Current usage as percentage of capacity
+
+use std::num::NonZeroUsize;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::Mutex;
+
+use lru::LruCache;
+
+use super::persistence::PersistedDocument;
+use crate::error::Result;
+use crate::Error;
+
+/// Default cache size (number of documents).
+const DEFAULT_CACHE_SIZE: usize = 100;
+
+/// A thread-safe LRU cache for documents.
+///
+/// Uses interior mutability via `Mutex` for safe concurrent access.
+/// The cache automatically evicts least-recently-used entries when full.
+///
+/// # Metrics
+///
+/// The cache maintains atomic counters for:
+/// - **hits**: Successful cache lookups
+/// - **misses**: Failed cache lookups (document not in cache)
+/// - **evictions**: Entries removed due to capacity limits
+#[derive(Debug)]
+pub struct DocumentCache {
+    /// Inner cache protected by Mutex.
+    inner: Mutex<LruCache<String, PersistedDocument>>,
+    /// Maximum capacity.
+    capacity: usize,
+    /// Number of cache hits.
+    hits: AtomicU64,
+    /// Number of cache misses.
+    misses: AtomicU64,
+    /// Number of cache evictions.
+    evictions: AtomicU64,
+}
+
+impl DocumentCache {
+    /// Create a new cache with default capacity (100 documents).
+    #[must_use]
+    pub fn new() -> Self {
+        Self::with_capacity(DEFAULT_CACHE_SIZE)
+    }
+
+    /// Create a new cache with custom capacity.
+    ///
+    /// This function does not panic; capacities below 1 are normalized to 1.
+    #[must_use]
+    pub fn with_capacity(capacity: usize) -> Self {
+        let capacity = capacity.max(1);
+        let non_zero = NonZeroUsize::new(capacity)
+            .expect("capacity normalized to at least 1");
+
+        Self {
+            inner: Mutex::new(LruCache::new(non_zero)),
+            capacity,
+            hits: AtomicU64::new(0),
+            misses: AtomicU64::new(0),
+            evictions: AtomicU64::new(0),
+        }
+    }
+
+    /// Get a document from the cache.
+    ///
+    /// Returns `None` if the document is not in the cache.
+    /// Updates the access order (moves to most-recently-used).
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the cache lock is poisoned.
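+    ///
+    /// # Example
+    ///
+    /// A usage sketch (assumes a helper like `create_test_doc` from the tests below):
+    ///
+    /// ```rust,ignore
+    /// let cache = DocumentCache::with_capacity(2);
+    /// cache.put("doc-1".to_string(), create_test_doc("doc-1"))?;
+    /// assert!(cache.get("doc-1")?.is_some());   // recorded as a hit
+    /// assert!(cache.get("missing")?.is_none()); // recorded as a miss
+    /// ```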
+    pub fn get(&self, id: &str) -> Result<Option<PersistedDocument>> {
+        let mut cache = self.lock()?;
+        let result = cache.get(id).cloned();
+
+        // Update metrics
+        if result.is_some() {
+            self.hits.fetch_add(1, Ordering::Relaxed);
+        } else {
+            self.misses.fetch_add(1, Ordering::Relaxed);
+        }
+
+        Ok(result)
+    }
+
+    /// Check if a document is in the cache.
+    pub fn contains(&self, id: &str) -> bool {
+        self.lock()
+            .map(|cache| cache.contains(id))
+            .unwrap_or(false)
+    }
+
+    /// Put a document into the cache.
+    ///
+    /// If the cache is full, inserting a new key evicts the
+    /// least-recently-used entry. Returns the previous value stored
+    /// under `id`, if any.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the cache lock is poisoned.
+    pub fn put(&self, id: String, doc: PersistedDocument) -> Result<Option<PersistedDocument>> {
+        let mut cache = self.lock()?;
+
+        // An LRU eviction only happens when the cache is full and the key is new
+        let is_new_key = !cache.contains(&id);
+        let was_full = cache.len() >= self.capacity;
+
+        let replaced = cache.put(id, doc);
+
+        // Track evictions
+        if was_full && is_new_key {
+            self.evictions.fetch_add(1, Ordering::Relaxed);
+        }
+
+        Ok(replaced)
+    }
+
+    /// Remove a document from the cache.
+    ///
+    /// Returns the removed document if it was in the cache.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the cache lock is poisoned.
+    pub fn remove(&self, id: &str) -> Result<Option<PersistedDocument>> {
+        let mut cache = self.lock()?;
+        Ok(cache.pop(id))
+    }
+
+    /// Clear all entries from the cache.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the cache lock is poisoned.
+    pub fn clear(&self) -> Result<()> {
+        let mut cache = self.lock()?;
+        cache.clear();
+        Ok(())
+    }
+
+    /// Get the number of entries currently in the cache.
+    pub fn len(&self) -> usize {
+        self.lock()
+            .map(|cache| cache.len())
+            .unwrap_or(0)
+    }
+
+    /// Check if the cache is empty.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Get the maximum capacity of the cache.
+    pub fn capacity(&self) -> usize {
+        self.capacity
+    }
+
+    /// Get cache utilization (0.0 to 1.0).
+    pub fn utilization(&self) -> f64 {
+        let len = self.len();
+        if self.capacity == 0 {
+            return 0.0;
+        }
+        len as f64 / self.capacity as f64
+    }
+
+    /// Get all document IDs currently in the cache.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the cache lock is poisoned.
+    pub fn keys(&self) -> Result<Vec<String>> {
+        let cache = self.lock()?;
+        Ok(cache.iter().map(|(k, _)| k.clone()).collect())
+    }
+
+    /// Get cache statistics including metrics.
+    pub fn stats(&self) -> CacheStats {
+        CacheStats {
+            len: self.len(),
+            capacity: self.capacity,
+            utilization: self.utilization(),
+            hits: self.hits.load(Ordering::Relaxed),
+            misses: self.misses.load(Ordering::Relaxed),
+            evictions: self.evictions.load(Ordering::Relaxed),
+        }
+    }
+
+    /// Get the number of cache hits.
+    pub fn hits(&self) -> u64 {
+        self.hits.load(Ordering::Relaxed)
+    }
+
+    /// Get the number of cache misses.
+    pub fn misses(&self) -> u64 {
+        self.misses.load(Ordering::Relaxed)
+    }
+
+    /// Get the number of cache evictions.
+    pub fn evictions(&self) -> u64 {
+        self.evictions.load(Ordering::Relaxed)
+    }
+
+    /// Get the cache hit rate (0.0 to 1.0).
+    pub fn hit_rate(&self) -> f64 {
+        let hits = self.hits.load(Ordering::Relaxed);
+        let misses = self.misses.load(Ordering::Relaxed);
+        let total = hits + misses;
+        if total == 0 {
+            0.0
+        } else {
+            hits as f64 / total as f64
+        }
+    }
+
+    /// Reset all metrics counters to zero.
+    pub fn reset_metrics(&self) {
+        self.hits.store(0, Ordering::Relaxed);
+        self.misses.store(0, Ordering::Relaxed);
+        self.evictions.store(0, Ordering::Relaxed);
+    }
+
+    /// Lock the inner cache.
+    fn lock(&self) -> Result<std::sync::MutexGuard<'_, LruCache<String, PersistedDocument>>> {
+        self.inner.lock().map_err(|_| {
+            Error::Cache("Cache lock poisoned".to_string())
+        })
+    }
+}
+
+impl Default for DocumentCache {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Cache statistics including metrics.
+#[derive(Debug, Clone, Copy)]
+pub struct CacheStats {
+    /// Number of entries in cache.
+    pub len: usize,
+    /// Maximum capacity.
+    pub capacity: usize,
+    /// Utilization (0.0 to 1.0).
+    pub utilization: f64,
+    /// Number of cache hits.
+    pub hits: u64,
+    /// Number of cache misses.
+    pub misses: u64,
+    /// Number of cache evictions.
+    pub evictions: u64,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::storage::{DocumentMeta, PersistedDocument};
+    use crate::document::DocumentTree;
+
+    fn create_test_doc(id: &str) -> PersistedDocument {
+        let meta = DocumentMeta::new(id, "Test Doc", "md");
+        let tree = DocumentTree::new("Root", "Content");
+        PersistedDocument::new(meta, tree)
+    }
+
+    #[test]
+    fn test_cache_basic() {
+        let cache = DocumentCache::with_capacity(3);
+
+        // Add documents
+        let doc1 = create_test_doc("doc1");
+        let doc2 = create_test_doc("doc2");
+
+        cache.put("doc1".to_string(), doc1.clone()).unwrap();
+        cache.put("doc2".to_string(), doc2.clone()).unwrap();
+
+        assert_eq!(cache.len(), 2);
+        assert!(cache.contains("doc1"));
+        assert!(cache.contains("doc2"));
+    }
+
+    #[test]
+    fn test_cache_get() {
+        let cache = DocumentCache::with_capacity(3);
+        let doc = create_test_doc("doc1");
+
+        cache.put("doc1".to_string(), doc).unwrap();
+
+        let retrieved = cache.get("doc1").unwrap();
+        assert!(retrieved.is_some());
+        assert_eq!(retrieved.unwrap().meta.id, "doc1");
+
+        let missing = cache.get("missing").unwrap();
+        assert!(missing.is_none());
+    }
+
+    #[test]
+    fn test_cache_eviction() {
+        let cache = DocumentCache::with_capacity(2);
+
+        cache.put("doc1".to_string(), create_test_doc("doc1")).unwrap();
+        cache.put("doc2".to_string(), create_test_doc("doc2")).unwrap();
+        cache.put("doc3".to_string(), create_test_doc("doc3")).unwrap();
+
+        // doc1 should be evicted (least recently used)
+        assert!(!cache.contains("doc1"));
+        assert!(cache.contains("doc2"));
+        assert!(cache.contains("doc3"));
+    }
+
+    #[test]
+    fn test_cache_remove() {
+        let cache = DocumentCache::new();
+
+        cache.put("doc1".to_string(), create_test_doc("doc1")).unwrap();
+        assert!(cache.contains("doc1"));
+
+        let removed = cache.remove("doc1").unwrap();
+        assert!(removed.is_some());
+        assert!(!cache.contains("doc1"));
+
+        let not_found = cache.remove("missing").unwrap();
+        assert!(not_found.is_none());
+    }
+
+    #[test]
+    fn test_cache_clear() {
+        let cache = DocumentCache::new();
+
+        cache.put("doc1".to_string(), create_test_doc("doc1")).unwrap();
+        cache.put("doc2".to_string(), create_test_doc("doc2")).unwrap();
+
+        assert_eq!(cache.len(), 2);
+
+        cache.clear().unwrap();
+
+        assert!(cache.is_empty());
+    }
+
+    #[test]
+    fn test_cache_utilization() {
+        let cache = DocumentCache::with_capacity(10);
+
+        assert_eq!(cache.utilization(), 0.0);
+
+        cache.put("doc1".to_string(), create_test_doc("doc1")).unwrap();
+        assert!((cache.utilization() - 0.1).abs() < 0.01);
+
+        cache.put("doc2".to_string(), create_test_doc("doc2")).unwrap();
+        assert!((cache.utilization() - 0.2).abs() < 0.01);
+    }
+}
diff --git a/src/storage/codec.rs b/src/storage/codec.rs
new file mode 100644
index 00000000..3fcfd055
--- /dev/null
+++ b/src/storage/codec.rs
@@ -0,0 +1,241 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Codec abstraction for compression and decompression.
+//!
+//! This module provides a codec trait for compressing/decompressing data,
+//! with implementations for:
+//!
+//! - **Identity**: No compression (pass-through)
+//! - **Gzip**: Standard gzip compression
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! use vectorless::storage::codec::{Codec, GzipCodec};
+//!
+//! let codec = GzipCodec::new(6);
+//!
+//! let data = b"some data to compress";
+//! let compressed = codec.encode(data)?;
+//! let decompressed = codec.decode(&compressed)?;
+//!
+//! assert_eq!(data.as_slice(), decompressed.as_slice());
+//! ```
+
+use std::fmt::Debug;
+use std::io::{Read, Write};
+
+use flate2::read::GzDecoder;
+use flate2::write::GzEncoder;
+use flate2::Compression;
+
+use crate::error::Result;
+use crate::Error;
+
+/// Codec trait for compression/decompression.
+pub trait Codec: Debug + Send + Sync {
+    /// Encode (compress) data.
+    fn encode(&self, data: &[u8]) -> Result<Vec<u8>>;
+
+    /// Decode (decompress) data.
+    fn decode(&self, data: &[u8]) -> Result<Vec<u8>>;
+
+    /// Get the codec name.
+    fn name(&self) -> &'static str;
+}
+
+/// Identity codec (no compression).
+///
+/// Passes data through unchanged.
+#[derive(Debug, Clone, Copy, Default)]
+pub struct IdentityCodec;
+
+impl IdentityCodec {
+    /// Create a new identity codec.
+    pub fn new() -> Self {
+        Self::default()
+    }
+}
+
+impl Codec for IdentityCodec {
+    fn encode(&self, data: &[u8]) -> Result<Vec<u8>> {
+        Ok(data.to_vec())
+    }
+
+    fn decode(&self, data: &[u8]) -> Result<Vec<u8>> {
+        Ok(data.to_vec())
+    }
+
+    fn name(&self) -> &'static str {
+        "identity"
+    }
+}
+
+/// Gzip codec.
+///
+/// Uses the `flate2` crate for gzip compression.
+#[derive(Debug, Clone)]
+pub struct GzipCodec {
+    /// Compression level (0-9).
+    level: u32,
+}
+
+impl GzipCodec {
+    /// Create a new gzip codec with the given compression level.
+    ///
+    /// Level is clamped to 0-9:
+    /// - 0: No compression
+    /// - 1: Fastest compression
+    /// - 6: Default (good balance)
+    /// - 9: Best compression (slowest)
+    pub fn new(level: u32) -> Self {
+        Self {
+            level: level.clamp(0, 9),
+        }
+    }
+
+    /// Create a codec with fast compression (level 1).
+    pub fn fast() -> Self {
+        Self::new(1)
+    }
+
+    /// Create a codec with default compression (level 6).
+    pub fn default_level() -> Self {
+        Self::new(6)
+    }
+
+    /// Create a codec with best compression (level 9).
+    pub fn best() -> Self {
+        Self::new(9)
+    }
+}
+
+impl Default for GzipCodec {
+    fn default() -> Self {
+        Self::default_level()
+    }
+}
+
+impl Codec for GzipCodec {
+    fn encode(&self, data: &[u8]) -> Result<Vec<u8>> {
+        let mut encoder = GzEncoder::new(Vec::new(), Compression::new(self.level));
+        encoder.write_all(data).map_err(|e| Error::Parse(format!("Gzip encode error: {}", e)))?;
+        encoder.finish().map_err(|e| Error::Parse(format!("Gzip finish error: {}", e)))
+    }
+
+    fn decode(&self, data: &[u8]) -> Result<Vec<u8>> {
+        let mut decoder = GzDecoder::new(data);
+        let mut decoded = Vec::new();
+        decoder
+            .read_to_end(&mut decoded)
+            .map_err(|e| Error::Parse(format!("Gzip decode error: {}", e)))?;
+        Ok(decoded)
+    }
+
+    fn name(&self) -> &'static str {
+        "gzip"
+    }
+}
+
+/// Create a codec from configuration.
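+///
+/// Returns an `IdentityCodec` when compression is disabled, otherwise a
+/// `GzipCodec` at the requested level (Zstd currently falls back to gzip).
+///
+/// # Example
+///
+/// A dispatch sketch (assumes `CompressionAlgorithm` from `crate::config`, as used below):
+///
+/// ```rust,ignore
+/// use vectorless::config::CompressionAlgorithm;
+///
+/// let codec = codec_from_config(true, CompressionAlgorithm::Gzip, 6);
+/// let encoded = codec.encode(b"payload")?;
+/// assert_eq!(codec.decode(&encoded)?, b"payload");
+/// ```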
+pub fn codec_from_config(
+    enabled: bool,
+    algorithm: crate::config::CompressionAlgorithm,
+    level: u32,
+) -> Box<dyn Codec> {
+    if !enabled {
+        return Box::new(IdentityCodec::new());
+    }
+
+    match algorithm {
+        crate::config::CompressionAlgorithm::Gzip => Box::new(GzipCodec::new(level)),
+        crate::config::CompressionAlgorithm::Zstd => {
+            // Zstd not implemented yet, fall back to gzip
+            // TODO: Add zstd support when needed
+            Box::new(GzipCodec::new(level))
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_identity_codec() {
+        let codec = IdentityCodec::new();
+        let data = b"test data";
+
+        let encoded = codec.encode(data).unwrap();
+        let decoded = codec.decode(&encoded).unwrap();
+
+        assert_eq!(data.as_slice(), decoded.as_slice());
+        assert_eq!(codec.name(), "identity");
+    }
+
+    #[test]
+    fn test_gzip_codec_basic() {
+        let codec = GzipCodec::default();
+        let data = b"Hello, World! This is a test string for compression.";
+
+        let encoded = codec.encode(data).unwrap();
+        let decoded = codec.decode(&encoded).unwrap();
+
+        assert_eq!(data.as_slice(), decoded.as_slice());
+        assert_eq!(codec.name(), "gzip");
+
+        // Compressed should be smaller for repetitive data
+        // Note: for very small data, gzip overhead might make it larger
+        let repetitive = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
+        let compressed = codec.encode(repetitive).unwrap();
+        assert!(compressed.len() < repetitive.len());
+    }
+
+    #[test]
+    fn test_gzip_codec_levels() {
+        let data: Vec<u8> = b"This is test data that should compress well. ".repeat(100);
+
+        let codec_fast = GzipCodec::fast();
+        let codec_best = GzipCodec::best();
+
+        let compressed_fast = codec_fast.encode(&data).unwrap();
+        let compressed_best = codec_best.encode(&data).unwrap();
+
+        // Both should decompress to the same data
+        assert_eq!(codec_fast.decode(&compressed_fast).unwrap(), data);
+        assert_eq!(codec_best.decode(&compressed_best).unwrap(), data);
+
+        // Best compression should be smaller or equal
+        assert!(compressed_best.len() <= compressed_fast.len());
+    }
+
+    #[test]
+    fn test_gzip_empty_data() {
+        let codec = GzipCodec::default();
+        let data = b"";
+
+        let encoded = codec.encode(data).unwrap();
+        let decoded = codec.decode(&encoded).unwrap();
+
+        assert!(decoded.is_empty());
+    }
+
+    #[test]
+    fn test_codec_from_config() {
+        use crate::config::CompressionAlgorithm;
+
+        // Disabled compression
+        let codec = codec_from_config(false, CompressionAlgorithm::Gzip, 6);
+        let data = b"test";
+        let encoded = codec.encode(data).unwrap();
+        assert_eq!(encoded, data);
+
+        // Enabled compression
+        let codec = codec_from_config(true, CompressionAlgorithm::Gzip, 6);
+        let encoded = codec.encode(data).unwrap();
+        let decoded = codec.decode(&encoded).unwrap();
+        assert_eq!(decoded, data);
+    }
+}
diff --git a/src/storage/lock.rs b/src/storage/lock.rs
new file mode 100644
index 00000000..66a65d46
--- /dev/null
+++ b/src/storage/lock.rs
@@ -0,0 +1,277 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! File locking for workspace safety.
+//!
+//! Provides cross-process file locking to prevent data corruption
+//! when multiple processes access the same workspace.
+
+// File locking inherently requires unsafe FFI calls.
+#![allow(unsafe_code)]
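+
+// A usage sketch (illustrative, not part of the original diff):
+//
+//     use vectorless::storage::lock::FileLock;
+//
+//     // Exclusive lock; fails with Error::WorkspaceLocked if already held.
+//     let lock = FileLock::try_lock("./workspace/.workspace.lock", true)?;
+//     // ... mutate the workspace ...
+//     lock.unlock(); // or simply let it drop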
+
+use std::fs::{File, OpenOptions};
+use std::path::{Path, PathBuf};
+
+use crate::error::Result;
+use crate::Error;
+
+/// A file lock that is automatically released when dropped.
+///
+/// Uses `flock` on Unix and `LockFileEx` on Windows.
+#[derive(Debug)]
+pub struct FileLock {
+    /// The locked file handle.
+    file: Option<File>,
+    /// Path to the lock file (for debugging).
+    path: PathBuf,
+    /// Whether the lock is held exclusively.
+    exclusive: bool,
+}
+
+impl FileLock {
+    /// Try to acquire a file lock.
+    ///
+    /// # Arguments
+    ///
+    /// * `path` - Path to the lock file (will be created if it doesn't exist)
+    /// * `exclusive` - If true, acquires an exclusive (write) lock; otherwise a shared (read) lock
+    ///
+    /// # Errors
+    ///
+    /// Returns `Error::WorkspaceLocked` if the lock is held by another process.
+    pub fn try_lock(path: impl Into<PathBuf>, exclusive: bool) -> Result<Self> {
+        let path = path.into();
+
+        // Ensure parent directory exists
+        if let Some(parent) = path.parent() {
+            std::fs::create_dir_all(parent).map_err(Error::Io)?;
+        }
+
+        // Open or create the lock file
+        let file = OpenOptions::new()
+            .read(true)
+            .write(true)
+            .create(true)
+            .truncate(false)
+            .open(&path)
+            .map_err(Error::Io)?;
+
+        // Try to acquire the lock
+        #[cfg(unix)]
+        {
+            let fd = std::os::unix::io::AsRawFd::as_raw_fd(&file);
+
+            let result = if exclusive {
+                unsafe { libc::flock(fd, libc::LOCK_EX | libc::LOCK_NB) }
+            } else {
+                unsafe { libc::flock(fd, libc::LOCK_SH | libc::LOCK_NB) }
+            };
+
+            if result != 0 {
+                return Err(Error::WorkspaceLocked);
+            }
+
+            Ok(Self {
+                file: Some(file),
+                path,
+                exclusive,
+            })
+        }
+
+        #[cfg(windows)]
+        {
+            use windows_sys::Win32::Storage::FileSystem::{
+                LockFileEx, LOCKFILE_EXCLUSIVE_LOCK, LOCKFILE_FAIL_IMMEDIATELY,
+            };
+            use windows_sys::Win32::System::IO::OVERLAPPED;
+
+            let handle = std::os::windows::io::AsRawHandle::as_raw_handle(&file);
+
+            let mut overlapped = std::mem::MaybeUninit::<OVERLAPPED>::zeroed();
+            let result = unsafe {
+                LockFileEx(
+                    handle as _,
+                    if exclusive { LOCKFILE_EXCLUSIVE_LOCK } else { 0 } | LOCKFILE_FAIL_IMMEDIATELY,
+                    0,
+                    0xFFFFFFFF,
+                    0xFFFFFFFF,
+                    overlapped.as_mut_ptr(),
+                )
+            };
+
+            if result == 0 {
+                return Err(Error::WorkspaceLocked);
+            }
+
+            Ok(Self {
+                file: Some(file),
+                path,
+                exclusive,
+            })
+        }
+
+        #[cfg(not(any(unix, windows)))]
+        {
+            // Fallback: no OS-level file locking available.
+            // Just keep the file open, which provides some protection.
+            Ok(Self {
+                file: Some(file),
+                path,
+                exclusive,
+            })
+        }
+    }
+
+    /// Try to acquire a lock without blocking.
+    ///
+    /// Returns `Ok(Some(FileLock))` if the lock was acquired, or `Ok(None)` if it would block.
+    pub fn try_lock_no_wait(
+        path: impl Into<PathBuf>,
+        exclusive: bool,
+    ) -> Result<Option<Self>> {
+        match Self::try_lock(path.into(), exclusive) {
+            Ok(lock) => Ok(Some(lock)),
+            Err(Error::WorkspaceLocked) => Ok(None),
+            Err(e) => Err(e),
+        }
+    }
+
+    /// Check if the lock file is locked by another process.
+    ///
+    /// This is useful for checking without holding a lock.
+    pub fn is_locked(path: impl Into<PathBuf>) -> bool {
+        Self::try_lock(path.into(), false).is_err()
+    }
+
+    /// Release the lock.
+    pub fn unlock(mut self) {
+        if let Some(file) = self.file.take() {
+            // File will be unlocked when dropped
+            drop(file);
+        }
+    }
+
+    /// Get the lock file path.
+    pub fn path(&self) -> &Path {
+        &self.path
+    }
+
+    /// Check if this is an exclusive lock.
+    pub fn is_exclusive(&self) -> bool {
+        self.exclusive
+    }
+}
+
+impl Drop for FileLock {
+    fn drop(&mut self) {
+        if let Some(file) = self.file.take() {
+            // File descriptor closed, lock automatically released
+            drop(file);
+        }
+    }
+}
+
+/// A scoped lock guard that releases the lock when dropped.
+///
+/// This is useful for ensuring the lock is released even on panic.
+pub struct ScopedLock {
+    lock: Option<FileLock>,
+}
+
+impl ScopedLock {
+    /// Acquire a scoped lock.
+    pub fn new(path: impl Into<PathBuf>, exclusive: bool) -> Result<Self> {
+        let lock = FileLock::try_lock(path, exclusive)?;
+        Ok(Self { lock: Some(lock) })
+    }
+
+    /// Release the lock early.
+    pub fn release(mut self) {
+        if let Some(lock) = self.lock.take() {
+            lock.unlock();
+        }
+    }
+}
+
+impl Drop for ScopedLock {
+    fn drop(&mut self) {
+        // Lock automatically released when FileLock is dropped
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::TempDir;
+
+    #[test]
+    fn test_file_lock_acquire_release() {
+        let temp = TempDir::new().unwrap();
+        let lock_path = temp.path().join("test.lock");
+
+        let lock = FileLock::try_lock(&lock_path, true).unwrap();
+        assert!(lock.is_exclusive());
+
+        // Should be able to unlock
+        lock.unlock();
+    }
+
+    #[test]
+    fn test_file_lock_conflict() {
+        let temp = TempDir::new().unwrap();
+        let lock_path = temp.path().join("conflict.lock");
+
+        // Acquire exclusive lock
+        let _lock1 = FileLock::try_lock(&lock_path, true).unwrap();
+
+        // Try to acquire another exclusive lock - should fail
+        let result = FileLock::try_lock(&lock_path, true);
+        assert!(matches!(result, Err(Error::WorkspaceLocked)));
+    }
+
+    #[test]
+    fn test_file_lock_shared() {
+        let temp = TempDir::new().unwrap();
+        let lock_path = temp.path().join("shared.lock");
+
+        // Acquire shared lock
+        let lock1 = FileLock::try_lock(&lock_path, false).unwrap();
+        assert!(!lock1.is_exclusive());
+
+        // Should be able to acquire another shared lock
+        let lock2 = FileLock::try_lock(&lock_path, false).unwrap();
+        assert!(!lock2.is_exclusive());
+
+        // But exclusive lock should fail
+        let result = FileLock::try_lock(&lock_path, true);
+        assert!(matches!(result, Err(Error::WorkspaceLocked)));
+
+        lock1.unlock();
+        lock2.unlock();
+    }
+
+    #[test]
+    fn test_scoped_lock() {
+        let temp = TempDir::new().unwrap();
+        let lock_path = temp.path().join("scoped.lock");
+
+        {
+            let _scoped = ScopedLock::new(&lock_path, true).unwrap();
+            // Lock held here
+
+            // Another lock should fail
+            let result = FileLock::try_lock(&lock_path, true);
+            assert!(matches!(result, Err(Error::WorkspaceLocked)));
+        }
+        // Lock released here
+
+        // Now should succeed
+        let _lock = FileLock::try_lock(&lock_path, true).unwrap();
+    }
+}
diff --git a/src/storage/migration.rs b/src/storage/migration.rs
new file mode 100644
index 00000000..b73c0f6e
--- /dev/null
+++ b/src/storage/migration.rs
@@ -0,0 +1,383 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Version migration system for persisted data.
+//!
+//! This module provides a framework for migrating data between format versions.
+//! When the data format changes, migrations can automatically upgrade older data.
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! use vectorless::storage::migration::{Migration, Migrator, MigrationContext};
+//!
+//! // Define a migration from v1 to v2
+//! struct V1ToV2;
+//!
+//! impl Migration for V1ToV2 {
+//!     fn from_version(&self) -> u32 { 1 }
+//!     fn to_version(&self) -> u32 { 2 }
+//!     fn migrate(&self, data: &[u8], ctx: &MigrationContext) -> Result<Vec<u8>> {
+//!         // Transform data from v1 to v2 format
+//!         // ...
+//!     }
+//! }
+//!
+//! // Register migrations
+//! let mut migrator = Migrator::new();
+//! migrator.register(Box::new(V1ToV2));
+//!
+//! // Migrate data
+//! let migrated = migrator.migrate(data, 1, 2)?;
+//! ```
+
+use std::collections::HashMap;
+
+use tracing::{debug, info, warn};
+
+use crate::error::Result;
+use crate::Error;
+
+/// Current data format version.
+pub const CURRENT_VERSION: u32 = 1;
+
+/// Migration context providing additional information for migrations.
+#[derive(Debug, Clone)]
+pub struct MigrationContext {
+    /// Source version.
+    pub from_version: u32,
+    /// Target version.
+    pub to_version: u32,
+    /// Additional metadata.
+    pub metadata: HashMap<String, String>,
+}
+
+impl MigrationContext {
+    /// Create a new migration context.
+    pub fn new(from_version: u32, to_version: u32) -> Self {
+        Self {
+            from_version,
+            to_version,
+            metadata: HashMap::new(),
+        }
+    }
+
+    /// Add metadata.
+    pub fn with_metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
+        self.metadata.insert(key.into(), value.into());
+        self
+    }
+}
+
+/// Trait for data migrations.
+///
+/// A migration transforms data from one version to the next.
+pub trait Migration: Send + Sync {
+    /// Get the source version this migration applies to.
+    fn from_version(&self) -> u32;
+
+    /// Get the target version this migration produces.
+    fn to_version(&self) -> u32;
+
+    /// Get a human-readable description of this migration.
+    fn description(&self) -> &str;
+
+    /// Perform the migration.
+    ///
+    /// # Arguments
+    ///
+    /// * `data` - The data to migrate
+    /// * `ctx` - Migration context with additional information
+    ///
+    /// # Returns
+    ///
+    /// The migrated data in the new format.
+    fn migrate(&self, data: &[u8], ctx: &MigrationContext) -> Result<Vec<u8>>;
+
+    /// Check if this migration can be applied to the given data.
+    ///
+    /// Default implementation always returns true.
+    fn can_migrate(&self, _data: &[u8]) -> bool {
+        true
+    }
+}
+
+/// Migration registry and executor.
+pub struct Migrator {
+    /// Registered migrations, keyed by (from_version, to_version).
+    migrations: HashMap<(u32, u32), Box<dyn Migration>>,
+}
+
+impl Default for Migrator {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl std::fmt::Debug for Migrator {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Migrator")
+            .field("migration_count", &self.migrations.len())
+            .finish()
+    }
+}
+
+impl Migrator {
+    /// Create a new migrator.
+    pub fn new() -> Self {
+        Self {
+            migrations: HashMap::new(),
+        }
+    }
+
+    /// Register a migration.
+    pub fn register(&mut self, migration: Box<dyn Migration>) {
+        let key = (migration.from_version(), migration.to_version());
+        debug!("Registering migration: v{} -> v{}", key.0, key.1);
+        self.migrations.insert(key, migration);
+    }
+
+    /// Check if a migration path exists between two versions.
+    pub fn can_migrate(&self, from_version: u32, to_version: u32) -> bool {
+        if from_version == to_version {
+            return true;
+        }
+
+        // Check if we have a direct migration
+        if self.migrations.contains_key(&(from_version, to_version)) {
+            return true;
+        }
+
+        // Check if we have a path through intermediate versions
+        self.find_migration_path(from_version, to_version).is_some()
+    }
+
+    /// Find a migration path between two versions.
+    ///
+    /// Returns a sequence of version numbers to migrate through.
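+    ///
+    /// # Example
+    ///
+    /// A sketch of the expected BFS result (mirrors `test_migrator_path_finding` below):
+    ///
+    /// ```rust,ignore
+    /// let mut migrator = Migrator::new();
+    /// migrator.register(Box::new(PlaceholderMigration::new(1, 2)));
+    /// migrator.register(Box::new(PlaceholderMigration::new(2, 3)));
+    /// assert_eq!(migrator.find_migration_path(1, 3), Some(vec![1, 2, 3]));
+    /// ```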
+    fn find_migration_path(&self, from_version: u32, to_version: u32) -> Option<Vec<u32>> {
+        if from_version == to_version {
+            return Some(vec![from_version]);
+        }
+
+        // Simple BFS to find a path
+        use std::collections::{HashSet, VecDeque};
+
+        let mut visited: HashSet<u32> = HashSet::new();
+        let mut queue: VecDeque<u32> = VecDeque::new();
+        let mut parent: HashMap<u32, u32> = HashMap::new();
+
+        queue.push_back(from_version);
+        visited.insert(from_version);
+
+        while let Some(current) = queue.pop_front() {
+            // Find all migrations from the current version
+            for ((from, to), _) in &self.migrations {
+                if *from == current && !visited.contains(to) {
+                    visited.insert(*to);
+                    parent.insert(*to, current);
+                    queue.push_back(*to);
+
+                    if *to == to_version {
+                        // Reconstruct path
+                        let mut path = vec![to_version];
+                        let mut v = to_version;
+                        while let Some(&p) = parent.get(&v) {
+                            if p == from_version {
+                                path.push(p);
+                                break;
+                            }
+                            path.push(p);
+                            v = p;
+                        }
+                        path.reverse();
+                        return Some(path);
+                    }
+                }
+            }
+        }
+
+        None
+    }
+
+    /// Migrate data from one version to another.
+    ///
+    /// If a direct migration exists, it will be used.
+    /// Otherwise, the migrator will try to find a path through intermediate versions.
+    pub fn migrate(&self, data: &[u8], from_version: u32, to_version: u32) -> Result<Vec<u8>> {
+        if from_version == to_version {
+            return Ok(data.to_vec());
+        }
+
+        // Find migration path
+        let path = self.find_migration_path(from_version, to_version)
+            .ok_or_else(|| Error::VersionMismatch(format!(
+                "No migration path from v{} to v{}",
+                from_version, to_version
+            )))?;
+
+        if path.len() < 2 {
+            return Ok(data.to_vec());
+        }
+
+        info!(
+            "Migrating data from v{} to v{} via path: {:?}",
+            from_version, to_version, path
+        );
+
+        let mut current_data = data.to_vec();
+        let mut current_version = from_version;
+
+        for next_version in path.iter().skip(1) {
+            let key = (current_version, *next_version);
+            let migration = self.migrations.get(&key)
+                .ok_or_else(|| Error::VersionMismatch(format!(
+                    "Missing migration from v{} to v{}",
+                    current_version, next_version
+                )))?;
+
+            let ctx = MigrationContext::new(current_version, *next_version);
+
+            debug!(
+                "Applying migration: v{} -> v{} ({})",
+                current_version, next_version, migration.description()
+            );
+
+            current_data = migration.migrate(&current_data, &ctx)?;
+            current_version = *next_version;
+        }
+
+        Ok(current_data)
+    }
+
+    /// Get the list of registered migrations.
+    pub fn list_migrations(&self) -> Vec<(u32, u32, &str)> {
+        self.migrations
+            .values()
+            .map(|m| (m.from_version(), m.to_version(), m.description()))
+            .collect()
+    }
+}
+
+// ============================================================================
+// Built-in migrations
+// ============================================================================
+
+/// Placeholder migration for future versions.
+/// This is a template that can be copied for actual migrations.
+#[derive(Debug)]
+pub struct PlaceholderMigration {
+    from: u32,
+    to: u32,
+}
+
+impl PlaceholderMigration {
+    /// Create a new placeholder migration.
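+    ///
+    /// # Example
+    ///
+    /// A registration sketch (the placeholder passes data through unchanged):
+    ///
+    /// ```rust,ignore
+    /// let mut migrator = Migrator::new();
+    /// migrator.register(Box::new(PlaceholderMigration::new(1, 2)));
+    /// assert_eq!(migrator.migrate(b"data", 1, 2)?, b"data");
+    /// ```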
+    pub fn new(from: u32, to: u32) -> Self {
+        Self { from, to }
+    }
+}
+
+impl Migration for PlaceholderMigration {
+    fn from_version(&self) -> u32 {
+        self.from
+    }
+
+    fn to_version(&self) -> u32 {
+        self.to
+    }
+
+    fn description(&self) -> &str {
+        "Placeholder migration (no-op)"
+    }
+
+    fn migrate(&self, data: &[u8], _ctx: &MigrationContext) -> Result<Vec<u8>> {
+        warn!(
+            "Using placeholder migration from v{} to v{} - no changes made",
+            self.from, self.to
+        );
+        Ok(data.to_vec())
+    }
+}
+
+/// Create a default migrator with all built-in migrations registered.
+pub fn default_migrator() -> Migrator {
+    Migrator::new()
+    // Add migrations as needed when versions change
+    // migrator.register(Box::new(V1ToV2::new()));
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_migration_context() {
+        let ctx = MigrationContext::new(1, 2)
+            .with_metadata("key", "value");
+
+        assert_eq!(ctx.from_version, 1);
+        assert_eq!(ctx.to_version, 2);
+        assert_eq!(ctx.metadata.get("key"), Some(&"value".to_string()));
+    }
+
+    #[test]
+    fn test_migrator_no_migration_needed() {
+        let migrator = Migrator::new();
+        let data = b"test data";
+
+        let result = migrator.migrate(data, 1, 1).unwrap();
+        assert_eq!(result, data);
+    }
+
+    #[test]
+    fn test_migrator_no_path() {
+        let migrator = Migrator::new();
+        let data = b"test data";
+
+        let result = migrator.migrate(data, 1, 2);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_migrator_with_placeholder() {
+        let mut migrator = Migrator::new();
+        migrator.register(Box::new(PlaceholderMigration::new(1, 2)));
+
+        assert!(migrator.can_migrate(1, 2));
+        assert!(!migrator.can_migrate(1, 3));
+
+        let data = b"test data";
+        let result = migrator.migrate(data, 1, 2).unwrap();
+        assert_eq!(result, data);
+    }
+
+    #[test]
+    fn test_migrator_path_finding() {
+        let mut migrator = Migrator::new();
+        migrator.register(Box::new(PlaceholderMigration::new(1, 2)));
+        migrator.register(Box::new(PlaceholderMigration::new(2, 3)));
+
+        assert!(migrator.can_migrate(1, 3));
+
+        let path = migrator.find_migration_path(1, 3).unwrap();
+        assert_eq!(path, vec![1, 2, 3]);
+
+        let data = b"test data";
+        let result = migrator.migrate(data, 1, 3).unwrap();
+        assert_eq!(result, data);
+    }
+
+    #[test]
+    fn test_list_migrations() {
+        let mut migrator = Migrator::new();
+        migrator.register(Box::new(PlaceholderMigration::new(1, 2)));
+        migrator.register(Box::new(PlaceholderMigration::new(2, 3)));
+
+        let list = migrator.list_migrations();
+        assert_eq!(list.len(), 2);
+    }
+}
diff --git a/src/storage/mod.rs b/src/storage/mod.rs
index 0d07d143..f8d97b1f 100644
--- a/src/storage/mod.rs
+++ b/src/storage/mod.rs
@@ -5,13 +5,16 @@
 //!
 //! This module provides:
 //! - **Workspace** — A directory-based document collection manager with LRU cache
-//! - **Persistence** — Save/load document trees and metadata
+//! - **Persistence** — Save/load document trees and metadata with atomic writes
+//! - **Cache** — LRU cache for loaded documents
+//! - **Lock** — File locking for multi-process safety
+//! - **Backend** — Storage backend abstraction (file, memory, etc.)
 //!
 //! # Example
 //!
 //! ```rust,no_run
 //! use vectorless::storage::{Workspace, PersistedDocument, DocumentMeta};
-//! use vectorless::domain::DocumentTree;
+//! use vectorless::document::DocumentTree;
 //!
 //! // Create a workspace
 //! let mut workspace = Workspace::new("./my_workspace")?;
@@ -26,13 +29,28 @@
 //! let loaded = workspace.load("doc-1")?.unwrap();
 //! ```
 
+pub mod async_workspace;
+pub mod backend;
+pub mod cache;
+pub mod codec;
+pub mod lock;
+pub mod migration;
 mod persistence;
 mod workspace;
 
 // Re-export main types
+pub use backend::{FileBackend, MemoryBackend, StorageBackend};
+pub use cache::DocumentCache;
+pub use codec::{Codec, GzipCodec, IdentityCodec, codec_from_config};
+pub use lock::{FileLock, ScopedLock};
+pub use migration::{Migration, MigrationContext, Migrator, CURRENT_VERSION};
 pub use persistence::{
-    DocumentMeta, PageContent, PersistedDocument, load_document, load_index, save_document,
-    save_index,
+    DocumentMeta, PageContent, PersistedDocument, PersistenceOptions,
+    load_document, load_document_from_bytes, load_document_with_options,
+    load_index, load_index_from_bytes, load_index_with_options,
+    save_document, save_document_to_bytes, save_document_with_options,
+    save_index, save_index_to_bytes, save_index_with_options,
 };
-
-pub use workspace::{DocumentMetaEntry, Workspace};
+pub use async_workspace::{AsyncDocumentMetaEntry, AsyncWorkspace, AsyncWorkspaceOptions};
+pub use workspace::{DocumentMetaEntry, Workspace, WorkspaceOptions};
diff --git a/src/storage/persistence.rs b/src/storage/persistence.rs
index a77a3e0b..245f33a6 100644
--- a/src/storage/persistence.rs
+++ b/src/storage/persistence.rs
@@ -2,12 +2,25 @@
 // SPDX-License-Identifier: Apache-2.0
 
 //! Persistence utilities for saving and loading document indices.
-
+//!
+//! # Features
+//!
+//! - **Atomic writes**: Write to temp file, then rename for crash safety
+//! - **Checksum verification**: SHA-256 checksums for data integrity
+//! - **Version header**: Format version for future migrations
+
+use sha2::{Digest, Sha256};
 use serde::{Deserialize, Serialize};
-use std::io;
+use std::fs::File;
+use std::io::{BufReader, BufWriter, Write};
 use std::path::{Path, PathBuf};
 
-use crate::domain::{DocumentTree, Error, Result};
+use crate::document::DocumentTree;
+use crate::error::Result;
+use crate::Error;
+
+/// Current format version for persisted documents.
+const FORMAT_VERSION: u32 = 1;
 
 /// Metadata for a persisted document.
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -113,46 +126,559 @@ pub struct PageContent {
     pub content: String,
 }
 
-/// Save a document to a JSON file.
+/// Wrapper for persisted data with checksum.
+#[derive(Debug, Serialize, Deserialize)]
+struct PersistedWrapper<T> {
+    /// Format version.
+    version: u32,
+    /// SHA-256 checksum of the payload.
+    checksum: String,
+    /// The actual data.
+    payload: T,
+}
+
+/// Options for save/load operations.
+#[derive(Debug, Clone)]
+pub struct PersistenceOptions {
+    /// Use atomic writes (temp file + rename).
+    pub atomic_writes: bool,
+    /// Verify checksums on load.
+    pub verify_checksum: bool,
+}
+
+impl Default for PersistenceOptions {
+    fn default() -> Self {
+        Self {
+            atomic_writes: true,
+            verify_checksum: true,
+        }
+    }
+}
+
+impl PersistenceOptions {
+    /// Create new options with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set atomic writes option.
+    pub fn with_atomic_writes(mut self, enabled: bool) -> Self {
+        self.atomic_writes = enabled;
+        self
+    }
+
+    /// Set checksum verification option.
+    pub fn with_verify_checksum(mut self, enabled: bool) -> Self {
+        self.verify_checksum = enabled;
+        self
+    }
+}
+
+/// Calculate the SHA-256 checksum of data.
+fn calculate_checksum(data: &[u8]) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(data);
+    format!("{:x}", hasher.finalize())
+}
+
+/// Save a document to a JSON file with atomic write and checksum.
+/// +/// # Atomic Write +/// +/// When `atomic_writes` is enabled (default), this function: +/// 1. Writes to a temporary file (`.tmp` suffix) +/// 2. Renames temp file to target (atomic on most filesystems) +/// +/// This prevents data corruption if the process crashes during write. +/// +/// # Errors +/// +/// Returns an error if: +/// - Serialization fails +/// - Cannot create temp file +/// - Write fails +/// - Rename fails pub fn save_document(path: &Path, doc: &PersistedDocument) -> Result<()> { - let json = serde_json::to_string_pretty(doc) - .map_err(|e| Error::Io(io::Error::new(io::ErrorKind::Other, e)))?; + save_document_with_options(path, doc, &PersistenceOptions::default()) +} - std::fs::write(path, json).map_err(|e| Error::Io(e))?; +/// Save a document with custom options. +pub fn save_document_with_options( + path: &Path, + doc: &PersistedDocument, + options: &PersistenceOptions, +) -> Result<()> { + // Serialize the payload first + let payload_bytes = serde_json::to_vec(doc) + .map_err(|e| Error::Serialization(e.to_string()))?; + + // Calculate checksum + let checksum = calculate_checksum(&payload_bytes); + + // Create wrapper + let wrapper = PersistedWrapper { + version: FORMAT_VERSION, + checksum, + payload: doc.clone(), + }; + + // Serialize wrapper + let json = serde_json::to_string_pretty(&wrapper) + .map_err(|e| Error::Serialization(e.to_string()))?; + + if options.atomic_writes { + // Atomic write: write to temp file, then rename + let temp_path = path.with_extension("tmp"); + + // Ensure parent directory exists + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent).map_err(Error::Io)?; + } + + // Write to temp file + { + let file = File::create(&temp_path).map_err(Error::Io)?; + let mut writer = BufWriter::new(file); + writer.write_all(json.as_bytes()).map_err(Error::Io)?; + writer.flush().map_err(Error::Io)?; + } + + // Atomic rename + std::fs::rename(&temp_path, path).map_err(Error::Io)?; + } else { + // Direct write (not atomic) + std::fs::write(path, json).map_err(Error::Io)?; + } Ok(()) } -/// Load a document from a JSON file. +/// Load a document from a JSON file with checksum verification. +/// +/// # Checksum Verification +/// +/// When `verify_checksum` is enabled (default), this function: +/// 1. Reads the file +/// 2. Parses the wrapper +/// 3. Re-serializes the payload +/// 4. Verifies the checksum matches +/// +/// # Errors +/// +/// Returns an error if: +/// - File doesn't exist +/// - Parse fails +/// - Checksum mismatch +/// - Version mismatch (future: migration) pub fn load_document(path: &Path) -> Result { - let json = std::fs::read_to_string(path).map_err(|e| Error::Io(e))?; + load_document_with_options(path, &PersistenceOptions::default()) +} - let doc: PersistedDocument = serde_json::from_str(&json) +/// Load a document with custom options. 
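+///
+/// # Example
+///
+/// A sketch that skips checksum verification, e.g. for trusted local files:
+///
+/// ```rust,ignore
+/// let options = PersistenceOptions::new().with_verify_checksum(false);
+/// let doc = load_document_with_options(Path::new("doc-1.json"), &options)?;
+/// ```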
+pub fn load_document_with_options(
+    path: &Path,
+    options: &PersistenceOptions,
+) -> Result<PersistedDocument> {
+    if !path.exists() {
+        return Err(Error::DocumentNotFound(
+            path.display().to_string()
+        ));
+    }
+
+    let file = File::open(path).map_err(Error::Io)?;
+    let reader = BufReader::new(file);
+
+    // Parse wrapper
+    let wrapper: PersistedWrapper<PersistedDocument> = serde_json::from_reader(reader)
         .map_err(|e| Error::Parse(format!("Failed to parse document: {}", e)))?;
 
-    Ok(doc)
+    // Check version
+    if wrapper.version != FORMAT_VERSION {
+        return Err(Error::Parse(format!(
+            "Unsupported format version: {} (expected {})",
+            wrapper.version, FORMAT_VERSION
+        )));
+    }
+
+    // Verify checksum if enabled
+    if options.verify_checksum {
+        let payload_bytes = serde_json::to_vec(&wrapper.payload)
+            .map_err(|e| Error::Serialization(e.to_string()))?;
+
+        let expected_checksum = calculate_checksum(&payload_bytes);
+
+        if wrapper.checksum != expected_checksum {
+            return Err(Error::Parse(format!(
+                "Checksum mismatch: expected {}, got {}",
+                expected_checksum, wrapper.checksum
+            )));
+        }
+    }
+
+    Ok(wrapper.payload)
 }
 
 /// Save the workspace index (metadata for all documents).
 pub fn save_index(path: &Path, entries: &[DocumentMeta]) -> Result<()> {
-    let json = serde_json::to_string_pretty(entries)
-        .map_err(|e| Error::Io(io::Error::new(io::ErrorKind::Other, e)))?;
+    save_index_with_options(path, entries, &PersistenceOptions::default())
+}
+
+/// Save the workspace index with custom options.
+pub fn save_index_with_options(
+    path: &Path,
+    entries: &[DocumentMeta],
+    options: &PersistenceOptions,
+) -> Result<()> {
+    // Serialize payload
+    let payload_bytes = serde_json::to_vec(entries)
+        .map_err(|e| Error::Serialization(e.to_string()))?;
+
+    let checksum = calculate_checksum(&payload_bytes);
+
+    let wrapper = PersistedWrapper {
+        version: FORMAT_VERSION,
+        checksum,
+        payload: entries.to_vec(),
+    };
+
+    let json = serde_json::to_string_pretty(&wrapper)
+        .map_err(|e| Error::Serialization(e.to_string()))?;
+
+    if options.atomic_writes {
+        let temp_path = path.with_extension("tmp");
+
+        // Ensure parent directory exists
+        if let Some(parent) = path.parent() {
+            std::fs::create_dir_all(parent).map_err(Error::Io)?;
+        }
-
-    std::fs::write(path, json).map_err(|e| Error::Io(e))?;
+        // Write to temp file
+        {
+            let file = File::create(&temp_path).map_err(Error::Io)?;
+            let mut writer = BufWriter::new(file);
+            writer.write_all(json.as_bytes()).map_err(Error::Io)?;
+            writer.flush().map_err(Error::Io)?;
+        }
+
+        // Atomic rename
+        std::fs::rename(&temp_path, path).map_err(Error::Io)?;
+    } else {
+        std::fs::write(path, json).map_err(Error::Io)?;
+    }
 
     Ok(())
 }
 
 /// Load the workspace index.
 pub fn load_index(path: &Path) -> Result<Vec<DocumentMeta>> {
+    load_index_with_options(path, &PersistenceOptions::default())
+}
+
+/// Load the workspace index with custom options.
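+///
+/// # Example
+///
+/// A sketch mirroring the document loader (a missing file yields an empty index):
+///
+/// ```rust,ignore
+/// let entries = load_index_with_options(Path::new("_meta.json"), &PersistenceOptions::default())?;
+/// println!("{} documents indexed", entries.len());
+/// ```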
+pub fn load_index_with_options(
+    path: &Path,
+    options: &PersistenceOptions,
+) -> Result<Vec<DocumentMeta>> {
     if !path.exists() {
         return Ok(Vec::new());
     }
 
-    let json = std::fs::read_to_string(path).map_err(|e| Error::Io(e))?;
+    let file = File::open(path).map_err(Error::Io)?;
+    let reader = BufReader::new(file);
+
+    let wrapper: PersistedWrapper<Vec<DocumentMeta>> = serde_json::from_reader(reader)
+        .map_err(|e| Error::Parse(format!("Failed to parse index: {}", e)))?;
+
+    // Check version
+    if wrapper.version != FORMAT_VERSION {
+        return Err(Error::Parse(format!(
+            "Unsupported format version: {} (expected {})",
+            wrapper.version, FORMAT_VERSION
+        )));
+    }
+
+    // Verify checksum if enabled
+    if options.verify_checksum {
+        let payload_bytes = serde_json::to_vec(&wrapper.payload)
+            .map_err(|e| Error::Serialization(e.to_string()))?;
+
+        let expected_checksum = calculate_checksum(&payload_bytes);
+
+        if wrapper.checksum != expected_checksum {
+            return Err(Error::Parse(format!(
+                "Checksum mismatch: expected {}, got {}",
+                expected_checksum, wrapper.checksum
+            )));
+        }
+    }
+
+    Ok(wrapper.payload)
+}
+
+// ============================================================================
+// Bytes-based serialization (for StorageBackend integration)
+// ============================================================================
+
+/// Serialize a document to bytes (JSON with checksum wrapper).
+///
+/// This is useful for storage backends that work with byte arrays.
+pub fn save_document_to_bytes(doc: &PersistedDocument) -> Result<Vec<u8>> {
+    // Serialize the payload first
+    let payload_bytes = serde_json::to_vec(doc)
+        .map_err(|e| Error::Serialization(e.to_string()))?;
+
+    // Calculate checksum
+    let checksum = calculate_checksum(&payload_bytes);
+
+    // Create wrapper
+    let wrapper = PersistedWrapper {
+        version: FORMAT_VERSION,
+        checksum,
+        payload: doc.clone(),
+    };
+
+    // Serialize wrapper
+    serde_json::to_vec(&wrapper)
+        .map_err(|e| Error::Serialization(e.to_string()))
+}
+
+/// Deserialize a document from bytes.
+///
+/// Verifies the checksum by default.
+pub fn load_document_from_bytes(data: &[u8]) -> Result<PersistedDocument> {
+    load_document_from_bytes_with_options(data, true)
+}
+
+/// Deserialize a document from bytes with optional checksum verification.
+pub fn load_document_from_bytes_with_options(
+    data: &[u8],
+    verify_checksum: bool,
+) -> Result<PersistedDocument> {
+    // Parse wrapper
+    let wrapper: PersistedWrapper<PersistedDocument> = serde_json::from_slice(data)
+        .map_err(|e| Error::Parse(format!("Failed to parse document: {}", e)))?;
+
+    // Check version
+    if wrapper.version != FORMAT_VERSION {
+        return Err(Error::VersionMismatch(format!(
+            "Expected version {}, got {}",
+            FORMAT_VERSION, wrapper.version
+        )));
+    }
+
+    // Verify checksum if enabled
+    if verify_checksum {
+        let payload_bytes = serde_json::to_vec(&wrapper.payload)
+            .map_err(|e| Error::Serialization(e.to_string()))?;
+
+        let expected_checksum = calculate_checksum(&payload_bytes);
+
+        if wrapper.checksum != expected_checksum {
+            return Err(Error::ChecksumMismatch(format!(
+                "Expected {}, got {}",
+                expected_checksum, wrapper.checksum
+            )));
+        }
+    }
+
+    Ok(wrapper.payload)
+}
+
+/// Serialize an index to bytes.
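+///
+/// # Example
+///
+/// A round-trip sketch through a `StorageBackend` (illustrative; `backend` and `entries` assumed in scope):
+///
+/// ```rust,ignore
+/// let bytes = save_index_to_bytes(&entries)?;
+/// backend.put("_meta", &bytes)?;
+/// let restored = load_index_from_bytes(&backend.get("_meta")?.unwrap())?;
+/// assert_eq!(restored.len(), entries.len());
+/// ```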
+pub fn save_index_to_bytes(entries: &[DocumentMeta]) -> Result<Vec<u8>> {
+    let payload_bytes = serde_json::to_vec(entries)
+        .map_err(|e| Error::Serialization(e.to_string()))?;
+
+    let checksum = calculate_checksum(&payload_bytes);
+
+    let wrapper = PersistedWrapper {
+        version: FORMAT_VERSION,
+        checksum,
+        payload: entries.to_vec(),
+    };
-    let entries: Vec<DocumentMeta> = serde_json::from_str(&json)
+
+    serde_json::to_vec(&wrapper)
+        .map_err(|e| Error::Serialization(e.to_string()))
+}
+
+/// Deserialize an index from bytes.
+pub fn load_index_from_bytes(data: &[u8]) -> Result<Vec<DocumentMeta>> {
+    load_index_from_bytes_with_options(data, true)
+}
+
+/// Deserialize an index from bytes with optional checksum verification.
+pub fn load_index_from_bytes_with_options(
+    data: &[u8],
+    verify_checksum: bool,
+) -> Result<Vec<DocumentMeta>> {
+    let wrapper: PersistedWrapper<Vec<DocumentMeta>> = serde_json::from_slice(data)
         .map_err(|e| Error::Parse(format!("Failed to parse index: {}", e)))?;
-    Ok(entries)
+
+    // Check version
+    if wrapper.version != FORMAT_VERSION {
+        return Err(Error::VersionMismatch(format!(
+            "Expected version {}, got {}",
+            FORMAT_VERSION, wrapper.version
+        )));
+    }
+
+    // Verify checksum if enabled
+    if verify_checksum {
+        let payload_bytes = serde_json::to_vec(&wrapper.payload)
+            .map_err(|e| Error::Serialization(e.to_string()))?;
+
+        let expected_checksum = calculate_checksum(&payload_bytes);
+
+        if wrapper.checksum != expected_checksum {
+            return Err(Error::ChecksumMismatch(format!(
+                "Expected {}, got {}",
+                expected_checksum, wrapper.checksum
+            )));
+        }
+    }
+
+    Ok(wrapper.payload)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::TempDir;
+
+    fn create_test_doc(id: &str) -> PersistedDocument {
+        let meta = DocumentMeta::new(id, "Test Doc", "md");
+        let tree = DocumentTree::new("Root", "Content");
+        PersistedDocument::new(meta, tree)
+    }
+
+    #[test]
+    fn test_save_and_load_document() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("test.json");
+
+        let doc = create_test_doc("doc-1");
+        save_document(&path, &doc).unwrap();
+
+        let loaded = load_document(&path).unwrap();
+        assert_eq!(loaded.meta.id, "doc-1");
+        assert_eq!(loaded.meta.name, "Test Doc");
+    }
+
+    #[test]
+    fn test_atomic_write() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("atomic.json");
+
+        let doc = create_test_doc("doc-atomic");
+        let options = PersistenceOptions::new().with_atomic_writes(true);
+        save_document_with_options(&path, &doc, &options).unwrap();
+
+        // Temp file should not exist after save
+        assert!(!path.with_extension("tmp").exists());
+
+        let loaded = load_document(&path).unwrap();
+        assert_eq!(loaded.meta.id, "doc-atomic");
+    }
+
+    #[test]
+    fn test_checksum_verification() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("checksum.json");
+
+        let doc = create_test_doc("doc-checksum");
+        save_document(&path, &doc).unwrap();
+
+        // Corrupt the file
+        let content = std::fs::read_to_string(&path).unwrap();
+        let corrupted = content.replace("doc-checksum", "doc-corrupted");
+        std::fs::write(&path, corrupted).unwrap();
+
+        // Load should fail with checksum error
+        let result = load_document(&path);
+        assert!(result.is_err());
+        let err = result.unwrap_err();
+        assert!(matches!(err, Error::Parse(_)));
+    }
+
+    #[test]
+    fn test_checksum_disabled() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("no-checksum.json");
+
+        let doc = create_test_doc("doc-no-check");
+        save_document(&path, &doc).unwrap();
+
+        // Load with checksum disabled should succeed
+        let options =
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::TempDir;
+
+    fn create_test_doc(id: &str) -> PersistedDocument {
+        let meta = DocumentMeta::new(id, "Test Doc", "md");
+        let tree = DocumentTree::new("Root", "Content");
+        PersistedDocument::new(meta, tree)
+    }
+
+    #[test]
+    fn test_save_and_load_document() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("test.json");
+
+        let doc = create_test_doc("doc-1");
+        save_document(&path, &doc).unwrap();
+
+        let loaded = load_document(&path).unwrap();
+        assert_eq!(loaded.meta.id, "doc-1");
+        assert_eq!(loaded.meta.name, "Test Doc");
+    }
+
+    #[test]
+    fn test_atomic_write() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("atomic.json");
+
+        let doc = create_test_doc("doc-atomic");
+        let options = PersistenceOptions::new().with_atomic_writes(true);
+        save_document_with_options(&path, &doc, &options).unwrap();
+
+        // Temp file should not exist after save
+        assert!(!path.with_extension("tmp").exists());
+
+        let loaded = load_document(&path).unwrap();
+        assert_eq!(loaded.meta.id, "doc-atomic");
+    }
+
+    #[test]
+    fn test_checksum_verification() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("checksum.json");
+
+        let doc = create_test_doc("doc-checksum");
+        save_document(&path, &doc).unwrap();
+
+        // Corrupt the file
+        let content = std::fs::read_to_string(&path).unwrap();
+        let corrupted = content.replace("doc-checksum", "doc-corrupted");
+        std::fs::write(&path, corrupted).unwrap();
+
+        // Load should fail with checksum error
+        let result = load_document(&path);
+        assert!(result.is_err());
+        let err = result.unwrap_err();
+        assert!(matches!(err, Error::Parse(_)));
+    }
+
+    #[test]
+    fn test_checksum_disabled() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("no-checksum.json");
+
+        let doc = create_test_doc("doc-no-check");
+        save_document(&path, &doc).unwrap();
+
+        // Load with checksum disabled should succeed
+        let options = PersistenceOptions::new().with_verify_checksum(false);
+        let result = load_document_with_options(&path, &options);
+        assert!(result.is_ok());
+        let loaded = result.unwrap();
+        assert_eq!(loaded.meta.id, "doc-no-check");
+
+        // Now corrupt the checksum field specifically:
+        // change the checksum value but keep the payload intact.
+        let content = std::fs::read_to_string(&path).unwrap();
+        let corrupted = content.replace(
+            &calculate_checksum(&serde_json::to_vec(&doc).unwrap()),
+            "0000000000000000000000000000000000000000000000000000000000000000",
+        );
+        std::fs::write(&path, corrupted).unwrap();
+
+        // Load with checksum disabled should still succeed
+        let result = load_document_with_options(&path, &options);
+        assert!(result.is_ok());
+
+        // Load with checksum enabled should fail
+        let options_enabled = PersistenceOptions::new().with_verify_checksum(true);
+        let result = load_document_with_options(&path, &options_enabled);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_load_nonexistent() {
+        let result = load_document(Path::new("/nonexistent/path.json"));
+        assert!(result.is_err());
+        assert!(result.unwrap_err().is_not_found());
+    }
+
+    #[test]
+    fn test_save_and_load_index() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("_meta.json");
+
+        let mut entries = Vec::new();
+        entries.push(DocumentMeta::new("doc-1", "Doc 1", "md"));
+        entries.push(DocumentMeta::new("doc-2", "Doc 2", "pdf"));
+
+        save_index(&path, &entries).unwrap();
+
+        let loaded = load_index(&path).unwrap();
+        assert_eq!(loaded.len(), 2);
+        assert_eq!(loaded[0].id, "doc-1");
+        assert_eq!(loaded[1].format, "pdf");
+    }
+
+    #[test]
+    fn test_load_empty_index() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("nonexistent.json");
+
+        let loaded = load_index(&path).unwrap();
+        assert!(loaded.is_empty());
+    }
+
+    #[test]
+    fn test_checksum_calculation() {
+        let data1 = b"test data";
+        let data2 = b"test data";
+        let data3 = b"different data";
+
+        let checksum1 = calculate_checksum(data1);
+        let checksum2 = calculate_checksum(data2);
+        let checksum3 = calculate_checksum(data3);
+
+        assert_eq!(checksum1, checksum2);
+        assert_ne!(checksum1, checksum3);
+        assert_eq!(checksum1.len(), 64); // SHA-256 produces 64 hex chars
+    }
+}
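Both persistence knobs exercised by the tests above combine through the builder, so the typical durable-write configuration is simply:

```rust
// Inside a function returning vectorless::Result<()>.
let options = PersistenceOptions::new()
    .with_atomic_writes(true)    // write to *.tmp, then rename into place
    .with_verify_checksum(true); // reject files whose SHA-256 doesn't match

save_document_with_options(&path, &doc, &options)?;
let doc = load_document_with_options(&path, &options)?;
```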
diff --git a/src/storage/workspace.rs b/src/storage/workspace.rs
index 9cd1a83f..c13e99ec 100644
--- a/src/storage/workspace.rs
+++ b/src/storage/workspace.rs
@@ -3,41 +3,46 @@
 //! Workspace management for document collections.
 //!
-//! A workspace is a directory containing indexed documents and metadata.
+//! A workspace manages indexed documents using a storage backend abstraction.
 //! Uses lazy-loading pattern with LRU cache:
 //! - Metadata index always in memory
 //! - Full documents loaded on demand with LRU eviction
 //!
-//! # Structure
+//! # Backends
 //!
-//! ```text
-//! workspace/
-//! ├── _meta.json       # Lightweight index: all document metadata
-//! ├── {doc_id_1}.json  # Document 1 full data (tree + pages)
-//! ├── {doc_id_2}.json  # Document 2 full data
-//! └── ...
-//! ```
+//! The workspace supports different storage backends:
+//! - **FileBackend**: File system storage (default)
+//! - **MemoryBackend**: In-memory storage (for testing)
+//!
+//! # Example
 //!
-//! # Thread Safety
+//! ```rust,ignore
+//! use vectorless::storage::{Workspace, FileBackend};
 //!
-//! The workspace uses interior mutability for the LRU cache:
-//! - Read operations (`get_meta`, `contains`, `list_documents`) only need `&self`
-//! - Cache updates happen internally via `Mutex`
+//! // Default file-based workspace
+//! let mut workspace = Workspace::new("./my_workspace")?;
+//!
+//! // Or with custom backend
+//! let backend = std::sync::Arc::new(FileBackend::new("./my_workspace")?);
+//! let mut workspace = Workspace::with_backend(backend)?;
+//! ```
 
 use std::collections::HashMap;
-use std::fs;
-use std::num::NonZeroUsize;
 use std::path::{Path, PathBuf};
-use std::sync::Mutex;
+use std::sync::Arc;
 
-use lru::LruCache;
 use serde::{Deserialize, Serialize};
 use tracing::{debug, info, warn};
 
-use super::persistence::{PersistedDocument, load_document, save_document};
-use crate::domain::{Error, Result};
+use super::backend::{FileBackend, StorageBackend};
+use super::cache::DocumentCache;
+use super::lock::FileLock;
+use super::persistence::{PersistedDocument, load_document_from_bytes, save_document_to_bytes};
+use crate::error::Result;
+use crate::Error;
 
-const META_FILE: &str = "_meta.json";
+const META_KEY: &str = "_meta";
+const LOCK_FILE: &str = ".workspace.lock";
 const DEFAULT_CACHE_SIZE: usize = 100;
 
 /// Lightweight metadata entry for the index.
@@ -63,51 +68,132 @@ pub struct DocumentMetaEntry {
     pub line_count: Option<usize>,
 }
 
-/// Inner state for Workspace (separated for interior mutability).
-#[derive(Debug)]
-struct Inner {
-    /// LRU cache for loaded full documents.
-    document_cache: LruCache<String, PersistedDocument>,
-}
-
 /// A workspace for managing indexed documents.
 ///
 /// Uses LRU cache for loaded documents to balance memory usage
-/// and access performance. The cache uses interior mutability,
-/// so read operations only require `&self`.
+/// and access performance.
+///
+/// # Thread Safety
+///
+/// The workspace is thread-safe when used with a thread-safe backend.
+/// Read operations only require `&self`.
 #[derive(Debug)]
 pub struct Workspace {
-    /// Root directory for the workspace.
-    root: PathBuf,
-
+    /// Storage backend.
+    backend: Arc<dyn StorageBackend>,
+    /// Root path (for file-based backends, used for locking).
+    root: Option<PathBuf>,
     /// Document metadata index (id -> meta).
     /// This is always loaded in memory.
     meta_index: HashMap<String, DocumentMetaEntry>,
+    /// LRU cache for loaded documents.
+    cache: DocumentCache,
+    /// File lock for multi-process safety (file backends only).
+    _lock: Option<FileLock>,
+}
 
-    /// Inner state with LRU cache (protected by Mutex for interior mutability).
-    inner: Mutex<Inner>,
+/// Options for workspace creation.
+#[derive(Debug, Clone)]
+pub struct WorkspaceOptions {
+    /// Enable file locking (default: true, only for file backends).
+    pub file_lock: bool,
+    /// LRU cache size (default: 100).
+    pub cache_size: usize,
+}
+
+impl Default for WorkspaceOptions {
+    fn default() -> Self {
+        Self {
+            file_lock: true,
+            cache_size: DEFAULT_CACHE_SIZE,
+        }
+    }
+}
+
+impl WorkspaceOptions {
+    /// Create new options with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the cache size.
+    pub fn with_cache_size(mut self, size: usize) -> Self {
+        self.cache_size = size;
+        self
+    }
+
+    /// Enable or disable file locking.
+    pub fn with_file_lock(mut self, enabled: bool) -> Self {
+        self.file_lock = enabled;
+        self
+    }
 }
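Putting the options builder together (assuming `WorkspaceOptions` is re-exported from `vectorless::storage` alongside `Workspace`):

```rust
use vectorless::storage::{Workspace, WorkspaceOptions};

// Small cache and no lock file, e.g. for a single-process batch job.
let options = WorkspaceOptions::new()
    .with_cache_size(16)
    .with_file_lock(false);

let workspace = Workspace::with_options("./workspace", options)?;
```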
 
 impl Workspace {
-    /// Create a new workspace at the given path with default cache size.
+    /// Create a new workspace with a storage backend.
+    ///
+    /// # Example
+    ///
+    /// ```rust,ignore
+    /// let backend = Arc::new(FileBackend::new("./workspace")?);
+    /// let workspace = Workspace::with_backend(backend)?;
+    /// ```
+    pub fn with_backend(backend: Arc<dyn StorageBackend>) -> Result<Self> {
+        Self::with_backend_and_options(backend, WorkspaceOptions::default())
+    }
+
+    /// Create a workspace with backend and options.
+    pub fn with_backend_and_options(
+        backend: Arc<dyn StorageBackend>,
+        options: WorkspaceOptions,
+    ) -> Result<Self> {
+        let mut workspace = Self {
+            backend,
+            root: None,
+            meta_index: HashMap::new(),
+            cache: DocumentCache::with_capacity(options.cache_size),
+            _lock: None,
+        };
+
+        workspace.load_meta_index()?;
+        Ok(workspace)
+    }
+
+    /// Create a new file-based workspace at the given path.
+    ///
+    /// This is a convenience method that creates a `FileBackend` internally.
     pub fn new(path: impl Into<PathBuf>) -> Result<Self> {
-        Self::with_cache_size(path, DEFAULT_CACHE_SIZE)
+        Self::with_options(path, WorkspaceOptions::default())
     }
 
     /// Create a new workspace with custom LRU cache size.
     pub fn with_cache_size(path: impl Into<PathBuf>, cache_size: usize) -> Result<Self> {
+        Self::with_options(path, WorkspaceOptions {
+            cache_size,
+            ..Default::default()
+        })
+    }
+
+    /// Create a new workspace with custom options.
+    pub fn with_options(path: impl Into<PathBuf>, options: WorkspaceOptions) -> Result<Self> {
         let root = path.into();
-        fs::create_dir_all(&root).map_err(Error::Io)?;
 
-        let capacity = NonZeroUsize::new(cache_size.max(1))
-            .unwrap_or_else(|| NonZeroUsize::new(DEFAULT_CACHE_SIZE).unwrap());
+        // Acquire file lock if enabled
+        let lock = if options.file_lock {
+            let lock_path = root.join(LOCK_FILE);
+            Some(FileLock::try_lock(&lock_path, true)?)
+        } else {
+            None
+        };
+
+        let backend = Arc::new(FileBackend::new(&root)?);
 
         let mut workspace = Self {
-            root,
+            backend,
+            root: Some(root),
             meta_index: HashMap::new(),
-            inner: Mutex::new(Inner {
-                document_cache: LruCache::new(capacity),
-            }),
+            cache: DocumentCache::with_capacity(options.cache_size),
+            _lock: lock,
         };
 
         workspace.load_meta_index()?;
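For tests and ephemeral pipelines, the in-memory backend (used by `test_workspace_with_memory_backend` below) avoids the file system entirely; the `vectorless::storage::MemoryBackend` re-export is an assumption, the tests reach it via `super::super::backend`:

```rust
use std::sync::Arc;
use vectorless::storage::{MemoryBackend, Workspace};

// Nothing touches the disk, and no lock file is created.
let backend = Arc::new(MemoryBackend::new());
let mut workspace = Workspace::with_backend(backend)?;
```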
@@ -116,36 +202,57 @@ impl Workspace {
 
     /// Open an existing workspace, or create if it doesn't exist.
     pub fn open(path: impl Into<PathBuf> + Clone) -> Result<Self> {
-        Self::open_with_cache_size(path, DEFAULT_CACHE_SIZE)
+        Self::open_with_options(path, WorkspaceOptions::default())
     }
 
     /// Open with custom cache size.
     pub fn open_with_cache_size(
         path: impl Into<PathBuf> + Clone,
         cache_size: usize,
+    ) -> Result<Self> {
+        Self::open_with_options(path, WorkspaceOptions {
+            cache_size,
+            ..Default::default()
+        })
+    }
+
+    /// Open with custom options.
+    pub fn open_with_options(
+        path: impl Into<PathBuf> + Clone,
+        options: WorkspaceOptions,
     ) -> Result<Self> {
         let root = path.clone().into();
-        if root.exists() {
-            let capacity = NonZeroUsize::new(cache_size.max(1))
-                .unwrap_or_else(|| NonZeroUsize::new(DEFAULT_CACHE_SIZE).unwrap());
-
-            let mut workspace = Self {
-                root,
-                meta_index: HashMap::new(),
-                inner: Mutex::new(Inner {
-                    document_cache: LruCache::new(capacity),
-                }),
-            };
-            workspace.load_meta_index()?;
-            Ok(workspace)
+
+        // Acquire file lock if enabled
+        let lock = if options.file_lock && root.exists() {
+            let lock_path = root.join(LOCK_FILE);
+            Some(FileLock::try_lock(&lock_path, true)?)
         } else {
-            Self::with_cache_size(path, cache_size)
-        }
+            None
+        };
+
+        let backend = Arc::new(FileBackend::new(&root)?);
+
+        let mut workspace = Self {
+            backend,
+            root: Some(root),
+            meta_index: HashMap::new(),
+            cache: DocumentCache::with_capacity(options.cache_size),
+            _lock: lock,
+        };
+
+        workspace.load_meta_index()?;
+        Ok(workspace)
+    }
+
+    /// Get the workspace root path (if file-based).
+    pub fn path(&self) -> Option<&Path> {
+        self.root.as_deref()
     }
 
-    /// Get the workspace root path.
-    pub fn path(&self) -> &Path {
-        &self.root
+    /// Get the storage backend.
+    pub fn backend(&self) -> &dyn StorageBackend {
+        self.backend.as_ref()
     }
 
     /// List all document IDs in the workspace.
@@ -164,17 +271,15 @@ impl Workspace {
     }
 
     /// Add a document to the workspace.
-    ///
-    /// This saves the full document to disk and updates the meta index.
-    /// The document is NOT cached (lazy loading on first access).
     pub fn add(&mut self, doc: &PersistedDocument) -> Result<()> {
         let doc_id = doc.meta.id.clone();
-        let doc_path = self.document_path(&doc_id);
+        let key = self.doc_key(&doc_id);
 
-        // Save full document to disk
-        save_document(&doc_path, doc)?;
+        // Serialize and save via backend
+        let bytes = save_document_to_bytes(doc)?;
+        self.backend.put(&key, &bytes)?;
 
-        // Update meta index (lightweight)
+        // Update meta index
         let meta_entry = DocumentMetaEntry {
             id: doc_id.clone(),
             doc_name: doc.meta.name.clone(),
@@ -185,17 +290,15 @@ impl Workspace {
                 .source_path
                 .as_ref()
                 .map(|p| p.to_string_lossy().to_string()),
-            page_count: doc.pages.first().map(|p| p.page),
-            line_count: None, // TODO: track this
+            page_count: if doc.pages.is_empty() { None } else { Some(doc.pages.len()) },
+            line_count: doc.meta.line_count,
         };
 
         self.meta_index.insert(doc_id.clone(), meta_entry);
         self.save_meta_index()?;
 
-        // Remove from cache if present (will lazy load on next access)
-        if let Ok(mut inner) = self.inner.lock() {
-            inner.document_cache.pop(&doc_id);
-        }
+        // Remove from cache if present
+        let _ = self.cache.remove(&doc_id);
 
         info!("Saved document {} to workspace", doc_id);
         Ok(())
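The key scheme is flat: documents live under `doc:{id}` and the index under `_meta`, so the raw bytes are reachable through the `backend()` accessor if you ever need to inspect them:

```rust
// Raw wrapper bytes for a stored document; None for unknown ids.
assert!(workspace.backend().get("doc:doc-1")?.is_some());
assert!(workspace.backend().get("doc:missing")?.is_none());
```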
@@ -204,47 +307,35 @@ impl Workspace {
 
     /// Load a document from the workspace.
     ///
     /// Uses LRU cache: returns cached version if available,
-    /// otherwise loads from disk and caches it.
-    ///
-    /// This method only requires `&self` (interior mutability for cache).
+    /// otherwise loads from backend and caches it.
     pub fn load(&self, id: &str) -> Result<Option<PersistedDocument>> {
         if !self.contains(id) {
             return Ok(None);
         }
 
-        // Check LRU cache first (with lock)
-        {
-            let mut inner = self
-                .inner
-                .lock()
-                .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?;
-
-            if let Some(cached) = inner.document_cache.get(id) {
-                debug!("Cache hit for document {}", id);
-                return Ok(Some(cached.clone()));
-            }
+        // Check LRU cache first
+        if let Some(cached) = self.cache.get(id)? {
+            debug!("Cache hit for document {}", id);
+            return Ok(Some(cached));
         }
 
-        // Load from disk (lock released during I/O)
-        let doc_path = self.document_path(id);
-        if !doc_path.exists() {
-            warn!("Document {} in meta index but file missing", id);
-            return Ok(None);
-        }
+        // Load from backend
+        let key = self.doc_key(id);
+        match self.backend.get(&key)? {
+            Some(bytes) => {
+                let doc = load_document_from_bytes(&bytes)?;
 
-        let doc = load_document(&doc_path)?;
+                // Add to LRU cache
+                self.cache.put(id.to_string(), doc.clone())?;
 
-        // Add to LRU cache (with lock)
-        {
-            let mut inner = self
-                .inner
-                .lock()
-                .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?;
-            inner.document_cache.put(id.to_string(), doc.clone());
+                debug!("Loaded document {} from backend (cached)", id);
+                Ok(Some(doc))
+            }
+            None => {
+                warn!("Document {} in meta index but not in backend", id);
+                Ok(None)
+            }
         }
-
-        debug!("Loaded document {} from disk (cached)", id);
-        Ok(Some(doc))
     }
 
     /// Remove a document from the workspace.
@@ -253,17 +344,13 @@ impl Workspace {
             return Ok(false);
         }
 
-        let doc_path = self.document_path(id);
-        if doc_path.exists() {
-            fs::remove_file(&doc_path).map_err(Error::Io)?;
-        }
+        let key = self.doc_key(id);
+        self.backend.delete(&key)?;
 
         self.meta_index.remove(id);
 
         // Remove from cache
-        if let Ok(mut inner) = self.inner.lock() {
-            inner.document_cache.pop(id);
-        }
+        let _ = self.cache.remove(id);
 
         self.save_meta_index()?;
 
@@ -283,83 +370,70 @@ impl Workspace {
 
     /// Get the number of items currently in the LRU cache.
     pub fn cache_len(&self) -> usize {
-        self.inner
-            .lock()
-            .map(|inner| inner.document_cache.len())
-            .unwrap_or(0)
+        self.cache.len()
     }
 
-    /// Clear the LRU cache (does not remove documents from workspace).
-    pub fn clear_cache(&self) {
-        if let Ok(mut inner) = self.inner.lock() {
-            inner.document_cache.clear();
-            debug!("Cleared document cache");
-        }
+    /// Get cache utilization (0.0 to 1.0).
+    pub fn cache_utilization(&self) -> f64 {
+        self.cache.utilization()
     }
 
-    /// Get the path for a document file.
-    fn document_path(&self, id: &str) -> PathBuf {
-        self.root.join(format!("{}.json", id))
+    /// Get cache statistics.
+    pub fn cache_stats(&self) -> super::cache::CacheStats {
+        self.cache.stats()
     }
 
-    /// Get the path for the meta index file.
-    fn meta_path(&self) -> PathBuf {
-        self.root.join(META_FILE)
+    /// Clear the LRU cache (does not remove documents from workspace).
+    pub fn clear_cache(&self) -> Result<()> {
+        self.cache.clear()?;
+        debug!("Cleared document cache");
+        Ok(())
     }
 
-    /// Load the meta index from disk.
-    fn load_meta_index(&mut self) -> Result<()> {
-        let meta_path = self.meta_path();
+    /// Get the storage key for a document.
+    fn doc_key(&self, id: &str) -> String {
+        format!("doc:{}", id)
+    }
 
-        if !meta_path.exists() {
-            // Try to rebuild from existing files
-            self.rebuild_meta_index()?;
-            return Ok(());
+    /// Load the meta index from backend.
+    fn load_meta_index(&mut self) -> Result<()> {
+        match self.backend.get(META_KEY)? {
+            Some(bytes) => {
+                let meta: HashMap<String, DocumentMetaEntry> = serde_json::from_slice(&bytes)
+                    .map_err(|e| Error::Parse(format!("Failed to parse meta index: {}", e)))?;
+                self.meta_index = meta;
+                info!(
+                    "Loaded {} document(s) from workspace index",
+                    self.meta_index.len()
+                );
+            }
+            None => {
+                // Try to rebuild from existing keys
+                self.rebuild_meta_index()?;
+            }
         }
-
-        let content = fs::read_to_string(&meta_path).map_err(Error::Io)?;
-
-        let meta: HashMap<String, DocumentMetaEntry> = serde_json::from_str(&content)
-            .map_err(|e| Error::Parse(format!("Failed to parse meta index: {}", e)))?;
-
-        self.meta_index = meta;
-        info!(
-            "Loaded {} document(s) from workspace index",
-            self.meta_index.len()
-        );
         Ok(())
     }
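The lazy-loading contract is easy to verify with the stats counters, mirroring `test_workspace_cache_stats` below:

```rust
let _ = workspace.load("doc-1")?; // miss: fetched from the backend
let _ = workspace.load("doc-1")?; // hit: served from the LRU cache

let stats = workspace.cache_stats();
assert_eq!((stats.hits, stats.misses), (1, 1));
```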
-    /// Save the meta index to disk.
+    /// Save the meta index to backend.
     fn save_meta_index(&self) -> Result<()> {
-        let content = serde_json::to_string_pretty(&self.meta_index)
+        let bytes = serde_json::to_vec_pretty(&self.meta_index)
             .map_err(|e| Error::Parse(format!("Failed to serialize meta index: {}", e)))?;
-
-        fs::write(self.meta_path(), content).map_err(Error::Io)?;
-
+        self.backend.put(META_KEY, &bytes)?;
         Ok(())
     }
 
-    /// Rebuild the meta index from existing document files.
+    /// Rebuild the meta index from existing documents.
     fn rebuild_meta_index(&mut self) -> Result<()> {
-        let entries: Vec<_> = fs::read_dir(&self.root)
-            .map_err(Error::Io)?
-            .filter_map(|entry| entry.ok())
-            .filter(|entry| {
-                entry
-                    .path()
-                    .extension()
-                    .map(|ext| ext == "json")
-                    .unwrap_or(false)
-            })
-            .filter_map(|entry| {
-                let path = entry.path();
-                // Skip the meta file itself
-                if path.file_stem()?.to_str()? == "_meta" {
-                    return None;
-                }
-                // Try to load the document and extract metadata
-                load_document(&path).ok().map(|doc| {
+        let keys = self.backend.keys()?;
+        let doc_keys: Vec<_> = keys
+            .iter()
+            .filter(|k| k.starts_with("doc:"))
+            .collect();
+
+        for key in doc_keys {
+            if let Some(bytes) = self.backend.get(key)? {
+                if let Ok(doc) = load_document_from_bytes(&bytes) {
                     let doc_id = doc.meta.id.clone();
                     let meta_entry = DocumentMetaEntry {
                         id: doc_id.clone(),
@@ -371,22 +445,18 @@ impl Workspace {
                             .source_path
                             .as_ref()
                             .map(|p| p.to_string_lossy().to_string()),
-                        page_count: doc.pages.first().map(|p| p.page),
-                        line_count: None,
+                        page_count: if doc.pages.is_empty() { None } else { Some(doc.pages.len()) },
+                        line_count: doc.meta.line_count,
                     };
-                    (doc_id, meta_entry)
-                })
-            })
-            .collect();
-
-        for (id, entry) in entries {
-            self.meta_index.insert(id, entry);
+                    self.meta_index.insert(doc_id, meta_entry);
+                }
+            }
         }
 
         if !self.meta_index.is_empty() {
             self.save_meta_index()?;
             info!(
-                "Rebuilt index from {} document file(s)",
+                "Rebuilt index from {} document(s)",
                 self.meta_index.len()
             );
         }
@@ -394,3 +464,90 @@ impl Workspace {
 
         Ok(())
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::TempDir;
+
+    #[test]
+    fn test_workspace_create() {
+        let temp = TempDir::new().unwrap();
+        let workspace = Workspace::new(temp.path()).unwrap();
+
+        assert!(workspace.is_empty());
+        assert_eq!(workspace.len(), 0);
+    }
+
+    #[test]
+    fn test_workspace_with_memory_backend() {
+        let backend = Arc::new(super::super::backend::MemoryBackend::new());
+        let mut workspace = Workspace::with_backend(backend).unwrap();
+
+        assert!(workspace.is_empty());
+
+        // Add a document
+        let meta = super::super::persistence::DocumentMeta::new("doc-1", "Test", "md");
+        let tree = crate::document::DocumentTree::new("Root", "Content");
+        let doc = PersistedDocument::new(meta, tree);
+
+        workspace.add(&doc).unwrap();
+        assert_eq!(workspace.len(), 1);
+
+        // Load it back
+        let loaded = workspace.load("doc-1").unwrap();
+        assert!(loaded.is_some());
+        assert_eq!(loaded.unwrap().meta.id, "doc-1");
+    }
+
+    #[test]
+    fn test_workspace_open() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("workspace");
+
+        let options = WorkspaceOptions {
+            file_lock: false,
+            ..Default::default()
+        };
+
+        let workspace = Workspace::open_with_options(&path, options.clone()).unwrap();
+        assert!(workspace.is_empty());
+
+        drop(workspace);
+        let workspace2 = Workspace::open_with_options(&path, options).unwrap();
+        assert!(workspace2.is_empty());
+    }
+
+    #[test]
+    fn test_workspace_cache_operations() {
+        let temp = TempDir::new().unwrap();
+        let workspace = Workspace::with_cache_size(temp.path(), 5).unwrap();
+
+        assert_eq!(workspace.cache_len(), 0);
+        assert_eq!(workspace.cache_utilization(), 0.0);
+
+        workspace.clear_cache().unwrap();
+        assert_eq!(workspace.cache_len(), 0);
+    }
+
+    #[test]
+    fn test_workspace_cache_stats() {
+        let backend = Arc::new(super::super::backend::MemoryBackend::new());
+        let mut workspace = Workspace::with_backend(backend).unwrap();
+
+        let meta = super::super::persistence::DocumentMeta::new("doc-1", "Test", "md");
+        let tree = crate::document::DocumentTree::new("Root", "Content");
+        let doc = PersistedDocument::new(meta, tree);
+        workspace.add(&doc).unwrap();
+
+        // First load - cache miss
+        let _ = workspace.load("doc-1").unwrap();
+        let stats = workspace.cache_stats();
+        assert_eq!(stats.misses, 1);
+
+        // Second load - cache hit
+        let _ = workspace.load("doc-1").unwrap();
+        let stats = workspace.cache_stats();
+        assert_eq!(stats.hits, 1);
+    }
+}
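The rebuild path means a lost `_meta` key is recoverable, since every document wrapper carries its own metadata. A sketch using two workspaces over one shared backend (clones of the same `Arc` trivially share state; `doc` as in the tests above):

```rust
let backend = Arc::new(MemoryBackend::new());
{
    let mut ws = Workspace::with_backend(backend.clone())?;
    ws.add(&doc)?;
} // workspace dropped; the backend retains the data

backend.delete("_meta")?; // simulate a lost index

let ws = Workspace::with_backend(backend)?;
assert_eq!(ws.len(), 1); // index rebuilt from the "doc:" keys
```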
diff --git a/src/util/format.rs b/src/util/format.rs
new file mode 100644
index 00000000..059b9ed6
--- /dev/null
+++ b/src/util/format.rs
@@ -0,0 +1,212 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Text formatting utilities.
+
+/// Truncate text to a maximum length with ellipsis.
+///
+/// # Example
+///
+/// ```
+/// use vectorless::util::truncate;
+///
+/// assert_eq!(truncate("hello world", 8), "hello...");
+/// assert_eq!(truncate("hi", 10), "hi");
+/// ```
+pub fn truncate(text: &str, max_len: usize) -> String {
+    if text.len() <= max_len {
+        return text.to_string();
+    }
+
+    if max_len <= 3 {
+        return ".".repeat(max_len);
+    }
+
+    // Back up to a char boundary so slicing can't panic on multi-byte UTF-8.
+    let mut end = max_len - 3;
+    while !text.is_char_boundary(end) {
+        end -= 1;
+    }
+
+    format!("{}...", &text[..end])
+}
+
+/// Truncate text to a maximum length, respecting word boundaries.
+pub fn truncate_words(text: &str, max_len: usize) -> String {
+    if text.len() <= max_len {
+        return text.to_string();
+    }
+
+    if max_len <= 3 {
+        return ".".repeat(max_len);
+    }
+
+    // Find a good break point (on a char boundary)
+    let mut end = max_len - 3;
+    while !text.is_char_boundary(end) {
+        end -= 1;
+    }
+    let truncated = &text[..end];
+
+    // Try to break at a word boundary
+    if let Some(last_space) = truncated.rfind(' ') {
+        if last_space > max_len / 2 {
+            return format!("{}...", &truncated[..last_space]);
+        }
+    }
+
+    format!("{}...", truncated)
+}
+
+/// Format a number with thousand separators.
+///
+/// # Example
+///
+/// ```
+/// use vectorless::util::format_number;
+///
+/// assert_eq!(format_number(1000), "1,000");
+/// assert_eq!(format_number(1234567), "1,234,567");
+/// ```
+pub fn format_number(n: usize) -> String {
+    let s = n.to_string();
+    let mut result = String::new();
+    let chars: Vec<char> = s.chars().collect();
+
+    for (i, c) in chars.iter().enumerate() {
+        if i > 0 && (chars.len() - i) % 3 == 0 {
+            result.push(',');
+        }
+        result.push(*c);
+    }
+
+    result
+}
+
+/// Format bytes for human-readable display.
+///
+/// # Example
+///
+/// ```
+/// use vectorless::util::format_bytes;
+///
+/// assert_eq!(format_bytes(500), "500 B");
+/// assert_eq!(format_bytes(1024), "1.0 KB");
+/// assert_eq!(format_bytes(1536), "1.5 KB");
+/// assert_eq!(format_bytes(1048576), "1.0 MB");
+/// ```
+pub fn format_bytes(bytes: usize) -> String {
+    const KB: usize = 1024;
+    const MB: usize = KB * 1024;
+    const GB: usize = MB * 1024;
+
+    if bytes >= GB {
+        format!("{:.1} GB", bytes as f64 / GB as f64)
+    } else if bytes >= MB {
+        format!("{:.1} MB", bytes as f64 / MB as f64)
+    } else if bytes >= KB {
+        format!("{:.1} KB", bytes as f64 / KB as f64)
+    } else {
+        format!("{} B", bytes)
+    }
+}
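With the char-boundary guard added in `truncate` above, non-ASCII titles no longer risk a slice panic when the cut lands mid-character:

```rust
// 'é' and 'ö' are two bytes each; the guard backs up to a char boundary
// instead of panicking inside one.
assert_eq!(truncate("héllo wörld", 8), "héll...");
```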
+/// Format a percentage.
+///
+/// # Example
+///
+/// ```
+/// use vectorless::util::format_percent;
+///
+/// assert_eq!(format_percent(0.5), "50.0%");
+/// assert_eq!(format_percent(0.123), "12.3%");
+/// ```
+pub fn format_percent(value: f32) -> String {
+    format!("{:.1}%", value * 100.0)
+}
+
+/// Clean whitespace in text (collapse multiple spaces, trim).
+pub fn clean_whitespace(text: &str) -> String {
+    text.split_whitespace().collect::<Vec<_>>().join(" ")
+}
+
+/// Indent each line of text.
+pub fn indent(text: &str, spaces: usize) -> String {
+    let indent_str = " ".repeat(spaces);
+    text.lines()
+        .map(|line| format!("{}{}", indent_str, line))
+        .collect::<Vec<_>>()
+        .join("\n")
+}
+
+/// Count words in text.
+pub fn word_count(text: &str) -> usize {
+    text.split_whitespace().count()
+}
+
+/// Count lines in text.
+pub fn line_count(text: &str) -> usize {
+    if text.is_empty() {
+        return 0;
+    }
+    text.chars().filter(|&c| c == '\n').count() + 1
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_truncate() {
+        assert_eq!(truncate("hello", 10), "hello");
+        assert_eq!(truncate("hello world", 8), "hello...");
+        assert_eq!(truncate("hi", 3), "hi");
+    }
+
+    #[test]
+    fn test_truncate_words() {
+        // "hello world foo" with max_len = 12:
+        //   truncated = "hello wor" (9 chars), last_space at 5,
+        //   and 5 > 12 / 2 is false, so there is no word-boundary break.
+        assert_eq!(truncate_words("hello world foo", 12), "hello wor...");
+        // The word-boundary break kicks in once the space is past halfway.
+        assert_eq!(truncate_words("hello world foo bar", 15), "hello world...");
+        assert_eq!(truncate_words("hello", 10), "hello");
+    }
+
+    #[test]
+    fn test_format_number() {
+        assert_eq!(format_number(100), "100");
+        assert_eq!(format_number(1000), "1,000");
+        assert_eq!(format_number(1234567), "1,234,567");
+    }
+
+    #[test]
+    fn test_format_bytes() {
+        assert_eq!(format_bytes(500), "500 B");
+        assert_eq!(format_bytes(1024), "1.0 KB");
+        assert_eq!(format_bytes(1536), "1.5 KB");
+        assert_eq!(format_bytes(1048576), "1.0 MB");
+    }
+
+    #[test]
+    fn test_format_percent() {
+        assert_eq!(format_percent(0.5), "50.0%");
+        assert_eq!(format_percent(1.0), "100.0%");
+    }
+
+    #[test]
+    fn test_clean_whitespace() {
+        assert_eq!(clean_whitespace("  hello   world  "), "hello world");
+        assert_eq!(clean_whitespace("single"), "single");
+    }
+
+    #[test]
+    fn test_indent() {
+        assert_eq!(indent("hello\nworld", 2), "  hello\n  world");
+    }
+
+    #[test]
+    fn test_word_count() {
+        assert_eq!(word_count("hello world"), 2);
+        assert_eq!(word_count("  hello   world  "), 2);
+        assert_eq!(word_count(""), 0);
+    }
+
+    #[test]
+    fn test_line_count() {
+        assert_eq!(line_count("hello\nworld"), 2);
+        assert_eq!(line_count("single"), 1);
+        assert_eq!(line_count(""), 0);
+    }
+}
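The helpers compose naturally when preparing node titles or snippets for display:

```rust
use vectorless::util::{clean_whitespace, truncate};

let raw = "  A   long\n  messy    title  ";
let title = truncate(&clean_whitespace(raw), 12);
assert_eq!(title, "A long me...");
```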
diff --git a/src/util/mod.rs b/src/util/mod.rs
new file mode 100644
index 00000000..9ec7184e
--- /dev/null
+++ b/src/util/mod.rs
@@ -0,0 +1,21 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Utility functions and helpers.
+//!
+//! This module provides common utilities used across the codebase:
+//!
+//! - **Token estimation** — Fast and accurate token counting
+//! - **Timing** — Performance measurement utilities
+//! - **Format** — Text and number formatting utilities
+
+mod format;
+mod timing;
+mod token;
+
+pub use format::{
+    clean_whitespace, format_bytes, format_number, format_percent, indent, line_count,
+    truncate, truncate_words, word_count,
+};
+pub use timing::{format_duration, format_duration_compact, Timer};
+pub use token::{estimate_tokens, estimate_tokens_batch, estimate_tokens_fast};
diff --git a/src/util/timing.rs b/src/util/timing.rs
new file mode 100644
index 00000000..5b3cabb9
--- /dev/null
+++ b/src/util/timing.rs
@@ -0,0 +1,159 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Timing and performance measurement utilities.
+
+use std::time::{Duration, Instant};
+
+/// A simple timing guard that records elapsed time on drop.
+///
+/// # Example
+///
+/// ```rust
+/// use vectorless::util::Timer;
+///
+/// let timer = Timer::start("indexing");
+/// // ... do work ...
+/// drop(timer); // Logs elapsed time
+/// ```
+#[derive(Debug)]
+pub struct Timer {
+    label: String,
+    start: Instant,
+    log_on_drop: bool,
+}
+
+impl Timer {
+    /// Create and start a new timer.
+    pub fn start(label: impl Into<String>) -> Self {
+        Self {
+            label: label.into(),
+            start: Instant::now(),
+            log_on_drop: true,
+        }
+    }
+
+    /// Create a silent timer (doesn't log on drop).
+    pub fn silent() -> Self {
+        Self {
+            label: String::new(),
+            start: Instant::now(),
+            log_on_drop: false,
+        }
+    }
+
+    /// Get the elapsed time without stopping.
+    pub fn elapsed(&self) -> Duration {
+        self.start.elapsed()
+    }
+
+    /// Get elapsed time in milliseconds.
+    pub fn elapsed_ms(&self) -> u64 {
+        self.elapsed().as_millis() as u64
+    }
+
+    /// Get elapsed time in seconds.
+    pub fn elapsed_secs(&self) -> f64 {
+        self.elapsed().as_secs_f64()
+    }
+
+    /// Stop the timer and return the elapsed duration.
+    pub fn stop(mut self) -> Duration {
+        let elapsed = self.elapsed();
+        if self.log_on_drop {
+            tracing::debug!(
+                "{} completed in {:.2}ms",
+                self.label,
+                elapsed.as_secs_f64() * 1000.0
+            );
+            // Disarm the Drop logger so stop() doesn't log a second time.
+            self.log_on_drop = false;
+        }
+        elapsed
+    }
+
+    /// Stop the timer and return elapsed milliseconds.
+    pub fn stop_ms(self) -> u64 {
+        self.stop().as_millis() as u64
+    }
+
+    /// Disable logging on drop.
+    pub fn silent_on_drop(mut self) -> Self {
+        self.log_on_drop = false;
+        self
+    }
+
+    /// Reset the timer.
+    pub fn reset(&mut self) {
+        self.start = Instant::now();
+    }
+}
+
+impl Drop for Timer {
+    fn drop(&mut self) {
+        if self.log_on_drop {
+            let elapsed = self.elapsed();
+            tracing::debug!(
+                "{} completed in {:.2}ms",
+                self.label,
+                elapsed.as_secs_f64() * 1000.0
+            );
+        }
+    }
+}
+
+/// Format a duration for human-readable display.
+pub fn format_duration(duration: Duration) -> String {
+    let total_ms = duration.as_millis();
+
+    if total_ms < 1000 {
+        format!("{}ms", total_ms)
+    } else if total_ms < 60_000 {
+        format!("{:.2}s", duration.as_secs_f64())
+    } else {
+        let secs = duration.as_secs();
+        let mins = secs / 60;
+        let remaining_secs = secs % 60;
+        format!("{}m {}s", mins, remaining_secs)
+    }
+}
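`stop()` now disarms the `Drop` logger before returning (previously the timer would log once in `stop()` and again when dropped at the end of the method). Typical usage, combining the timer with the duration formatter:

```rust
use vectorless::util::{format_duration, Timer};

let timer = Timer::start("retrieval");
// ... do work ...
let elapsed = timer.stop(); // logs once at debug level, returns the Duration
println!("retrieval took {}", format_duration(elapsed));
```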
+/// Format a duration as a compact string.
+pub fn format_duration_compact(duration: Duration) -> String {
+    let total_ms = duration.as_millis();
+
+    if total_ms < 1000 {
+        format!("{}ms", total_ms)
+    } else if total_ms < 60_000 {
+        format!("{:.1}s", duration.as_secs_f64())
+    } else {
+        let mins = duration.as_secs() / 60;
+        let secs = duration.as_secs() % 60;
+        format!("{}:{:02}", mins, secs)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_timer_elapsed() {
+        let timer = Timer::silent();
+        std::thread::sleep(std::time::Duration::from_millis(10));
+        let elapsed = timer.elapsed();
+        assert!(elapsed.as_millis() >= 10);
+    }
+
+    #[test]
+    fn test_format_duration() {
+        assert_eq!(format_duration(Duration::from_millis(500)), "500ms");
+        assert_eq!(format_duration(Duration::from_millis(1500)), "1.50s");
+        assert_eq!(format_duration(Duration::from_secs(90)), "1m 30s");
+    }
+
+    #[test]
+    fn test_format_duration_compact() {
+        assert_eq!(format_duration_compact(Duration::from_millis(500)), "500ms");
+        assert_eq!(format_duration_compact(Duration::from_millis(1500)), "1.5s");
+        assert_eq!(format_duration_compact(Duration::from_secs(90)), "1:30");
+    }
+}
diff --git a/src/domain/token.rs b/src/util/token.rs
similarity index 100%
rename from src/domain/token.rs
rename to src/util/token.rs
diff --git a/vectorless.example.toml b/vectorless.example.toml
index 66e85e21..aa097ae6 100644
--- a/vectorless.example.toml
+++ b/vectorless.example.toml
@@ -159,6 +159,32 @@ dedup_threshold = 0.9
 # └── {doc_id_2}.json  # Document 2
 workspace_dir = "./workspace"
 
+# LRU cache size (number of documents to keep in memory)
+cache_size = 100
+
+# Enable atomic writes (temp file + rename)
+# This prevents data corruption on crash
+atomic_writes = true
+
+# Enable file locking for multi-process safety
+# Prevents concurrent access from multiple processes
+file_lock = true
+
+# Enable checksum verification for data integrity
+# Uses SHA-256 to verify file integrity on load
+checksum_enabled = true
+
+# Compression settings
+[storage.compression]
+# Enable compression for stored documents
+enabled = false
+
+# Compression algorithm: "gzip" or "zstd"
+algorithm = "gzip"
+
+# Compression level (1-9, higher = better compression but slower)
+level = 6
+
 [concurrency]
 # Maximum concurrent LLM API calls
 # This limits how many requests can be in-flight at the same time
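Of the new `[storage]` keys, `cache_size` and `file_lock` line up with `WorkspaceOptions`, while `atomic_writes` and `checksum_enabled` correspond to `PersistenceOptions`. A sketch of wiring a loaded config into a workspace; the `StorageConfig` struct is hypothetical, since this patch does not show the config loader:

```rust
use vectorless::storage::{Workspace, WorkspaceOptions};

// Hypothetical struct mirroring the [storage] table above.
struct StorageConfig {
    workspace_dir: String,
    cache_size: usize,
    file_lock: bool,
}

fn open_from_config(cfg: &StorageConfig) -> vectorless::Result<Workspace> {
    let options = WorkspaceOptions::new()
        .with_cache_size(cfg.cache_size)
        .with_file_lock(cfg.file_lock);
    Workspace::open_with_options(&cfg.workspace_dir, options)
}
```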