diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4f59786a..db3f1dba 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,6 +8,7 @@ on: env: CARGO_TERM_COLOR: always + RUST_BACKTRACE: 1 jobs: check: @@ -17,7 +18,7 @@ jobs: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - uses: Swatinem/rust-cache@v2 - - run: cargo check + - run: cargo check --all-features fmt: name: Format @@ -27,7 +28,7 @@ jobs: - uses: dtolnay/rust-toolchain@stable with: components: rustfmt - - run: cargo fmt -- --check + - run: cargo fmt --all -- --check clippy: name: Clippy @@ -38,7 +39,7 @@ jobs: with: components: clippy - uses: Swatinem/rust-cache@v2 - - run: cargo clippy -- -W clippy::all + - run: cargo clippy --all-targets --all-features -- -D warnings test: name: Test @@ -47,4 +48,42 @@ jobs: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - uses: Swatinem/rust-cache@v2 - - run: cargo test --lib + - run: cargo test --lib --all-features + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + + docs: + name: Documentation + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + - name: Check documentation build + run: cargo doc --no-deps --all-features + env: + RUSTDOCFLAGS: -D warnings + + doctest: + name: Doc Tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + - name: Run documentation tests + run: cargo test --doc --all-features + continue-on-error: true # Allow doctest failures initially + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + + # Release build check + build: + name: Build + runs-on: ubuntu-latest + needs: [check, fmt, clippy, test, docs] + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + - run: cargo build --release diff --git a/examples/basic.rs b/examples/basic.rs index 66b10fe0..7064d889 100644 --- a/examples/basic.rs +++ b/examples/basic.rs @@ -11,7 +11,7 @@ //! cargo run --example basic //! ``` -use vectorless::Engine; +use vectorless::{Engine, IndexContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { @@ -21,17 +21,18 @@ async fn main() -> vectorless::Result<()> { let client = Engine::builder() .with_workspace("./workspace") .build() - .map_err(|e| vectorless::Error::Config(e.to_string()))?; + .await + .map_err(|e: vectorless::BuildError| vectorless::Error::Config(e.to_string()))?; println!("✓ Client created\n"); // 2. Index a document - let doc_id = client.index("./README.md").await?; + let doc_id = client.index(IndexContext::from_path("./README.md")).await?; println!("✓ Indexed: {}\n", doc_id); // 3. List documents println!("Documents:"); - for doc in client.list_documents() { + for doc in client.list_documents().await? { println!(" - {} ({})", doc.name, doc.id); } println!(); @@ -55,7 +56,7 @@ async fn main() -> vectorless::Result<()> { println!("✓ Client cloned for concurrent use\n"); // 6. Cleanup - client.remove(&doc_id)?; + client.remove(&doc_id).await?; println!("✓ Removed: {}", doc_id); println!("\n=== Done ==="); diff --git a/examples/batch_processing.rs b/examples/batch_processing.rs index 16a29896..1e0d11ee 100644 --- a/examples/batch_processing.rs +++ b/examples/batch_processing.rs @@ -12,7 +12,7 @@ //! cargo run --example batch_processing //! ``` -use vectorless::client::EngineBuilder; +use vectorless::client::{EngineBuilder, IndexContext}; #[tokio::main] async fn main() -> Result<(), Box> { @@ -23,9 +23,10 @@ async fn main() -> Result<(), Box> { let engine = EngineBuilder::new() .with_workspace("./workspace_batch_example") .build() - .map_err(|e| vectorless::Error::Config(e.to_string()))?; + .await + .map_err(|e: vectorless::BuildError| vectorless::Error::Config(e.to_string()))?; - let session = engine.session(); + let session = engine.session().await; println!(" ✓ Session created: {}\n", session.id()); // 2. Create sample documents @@ -1058,7 +1059,7 @@ Implement authentication for production. for (name, _) in &documents { let path = temp_dir.path().join(name); - match session.index(&path).await { + match session.index(IndexContext::from_path(&path)).await { Ok(doc_id) => { doc_ids.push(doc_id); } @@ -1146,7 +1147,7 @@ Implement authentication for production. // 7. Cleanup println!("Step 7: Cleanup..."); for doc_id in &doc_ids { - engine.remove(doc_id)?; + engine.remove(doc_id).await?; } println!(" ✓ Removed {} documents\n", doc_ids.len()); diff --git a/examples/events.rs b/examples/events.rs index 8f736d88..706454fc 100644 --- a/examples/events.rs +++ b/examples/events.rs @@ -17,7 +17,7 @@ use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; -use vectorless::client::{EngineBuilder, EventEmitter, IndexEvent, QueryEvent}; +use vectorless::client::{EngineBuilder, EventEmitter, IndexContext, IndexEvent, QueryEvent}; #[tokio::main] async fn main() -> Result<(), Box> { @@ -95,7 +95,8 @@ async fn main() -> Result<(), Box> { .with_workspace("./workspace_events_example") .with_events(events) .build() - .map_err(|e| vectorless::Error::Config(e.to_string()))?; + .await + .map_err(|e: vectorless::BuildError| vectorless::Error::Config(e.to_string()))?; println!(" ✓ Engine created\n"); // 3. Index a document (events will fire) @@ -122,7 +123,7 @@ The event system uses handlers that can be attached to the engine builder. let doc_path = temp_dir.path().join("example.md"); tokio::fs::write(&doc_path, doc_content).await?; - let doc_id = engine.index(&doc_path).await?; + let doc_id = engine.index(IndexContext::from_path(&doc_path)).await?; println!(); // 4. Query the document (events will fire) @@ -161,7 +162,7 @@ The event system uses handlers that can be attached to the engine builder. // 7. Cleanup println!("Step 7: Cleanup..."); - engine.remove(&doc_id)?; + engine.remove(&doc_id).await?; println!(" ✓ Document removed\n"); println!("=== Example Complete ==="); diff --git a/examples/markdownflow.rs b/examples/markdownflow.rs index ba566aa6..60e96f54 100644 --- a/examples/markdownflow.rs +++ b/examples/markdownflow.rs @@ -20,7 +20,7 @@ //! ``` use vectorless::Engine; -use vectorless::client::IndexOptions; +use vectorless::client::{IndexContext, IndexOptions}; /// Sample markdown content for demonstration. const SAMPLE_MARKDOWN: &str = r#" @@ -46,7 +46,8 @@ async fn main() -> Result<(), Box> { let client = Engine::builder() .with_workspace("./workspace") .build() - .map_err(|e| vectorless::Error::Config(e.to_string()))?; + .await + .map_err(|e: vectorless::BuildError| vectorless::Error::Config(e.to_string()))?; println!(" - Client created successfully"); println!(); @@ -61,8 +62,9 @@ async fn main() -> Result<(), Box> { // Check if we should generate summaries (requires API key) println!(" - API key detected, generating summaries..."); - let options = IndexOptions::new().with_summaries(); - let doc_id = client.index_with_options(&md_path, options).await?; + let doc_id = client + .index(IndexContext::from_path(&md_path).with_options(IndexOptions::new().with_summaries())) + .await?; println!(" - Document indexed successfully"); println!(" - Document ID: {}", doc_id); @@ -72,7 +74,7 @@ async fn main() -> Result<(), Box> { println!("Step 3: Document structure (JSON):"); println!(); - match client.get_structure(&doc_id) { + match client.get_structure(&doc_id).await { Ok(tree) => { // Export to JSON format (PageIndex compatible) let structure = tree.to_structure_json("sample.md"); @@ -121,7 +123,7 @@ async fn main() -> Result<(), Box> { // Step 5: Cleanup println!("Step 5: Cleanup..."); - client.remove(&doc_id)?; + client.remove(&doc_id).await?; println!(" - Document removed"); println!("\n=== Example Complete ==="); diff --git a/examples/session.rs b/examples/session.rs index 30a2f97a..d5cfd68d 100644 --- a/examples/session.rs +++ b/examples/session.rs @@ -15,7 +15,7 @@ //! cargo run --example session //! ``` -use vectorless::client::EngineBuilder; +use vectorless::client::{EngineBuilder, IndexContext}; #[tokio::main] async fn main() -> Result<(), Box> { @@ -26,12 +26,13 @@ async fn main() -> Result<(), Box> { let engine = EngineBuilder::new() .with_workspace("./workspace_session_example") .build() - .map_err(|e| vectorless::Error::Config(e.to_string()))?; + .await + .map_err(|e: vectorless::BuildError| vectorless::Error::Config(e.to_string()))?; println!(" ✓ Engine created\n"); // 2. Create a session for multi-document operations println!("Step 2: Creating session..."); - let session = engine.session(); + let session = engine.session().await; println!(" ✓ Session ID: {}\n", session.id()); // 3. Index multiple documents into the session @@ -118,13 +119,13 @@ token_budget = 4000 tokio::fs::write(&doc3_path, doc3_content).await?; // Index into session - let doc1_id = session.index(&doc1_path).await?; + let doc1_id = session.index(IndexContext::from_path(&doc1_path)).await?; println!(" ✓ Indexed: architecture.md -> {}", &doc1_id[..8]); - let doc2_id = session.index(&doc2_path).await?; + let doc2_id = session.index(IndexContext::from_path(&doc2_path)).await?; println!(" ✓ Indexed: api.md -> {}", &doc2_id[..8]); - let doc3_id = session.index(&doc3_path).await?; + let doc3_id = session.index(IndexContext::from_path(&doc3_path)).await?; println!(" ✓ Indexed: config.md -> {}", &doc3_id[..8]); println!(); @@ -194,9 +195,9 @@ token_budget = 4000 // 9. Cleanup println!("Step 9: Cleanup..."); - engine.remove(&doc1_id)?; - engine.remove(&doc2_id)?; - engine.remove(&doc3_id)?; + engine.remove(&doc1_id).await?; + engine.remove(&doc2_id).await?; + engine.remove(&doc3_id).await?; println!(" ✓ Documents removed\n"); println!("=== Example Complete ==="); diff --git a/examples/storage_async.rs b/examples/storage_async.rs deleted file mode 100644 index 606755b4..00000000 --- a/examples/storage_async.rs +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Async workspace usage example. -//! -//! This example demonstrates async workspace operations: -//! - Creating an async workspace -//! - Concurrent document access -//! - Async LRU cache -//! -//! # Usage -//! -//! ```bash -//! cargo run --example storage_async -//! ``` - -use std::sync::Arc; - -use vectorless::document::DocumentTree; -use vectorless::storage::{AsyncWorkspace, DocumentMeta, PersistedDocument}; - -fn create_doc(id: &str, name: &str) -> PersistedDocument { - let meta = DocumentMeta::new(id, name, "md"); - let content = format!("Content for {}", name); - let tree = DocumentTree::new("Root", &content); - PersistedDocument::new(meta, tree) -} - -#[tokio::main] -async fn main() -> vectorless::Result<()> { - println!("=== Async Workspace Example ===\n"); - - let workspace_path = "./example_async_workspace"; - - // 1. Create async workspace - println!("1. Creating async workspace..."); - let workspace = AsyncWorkspace::new(workspace_path).await?; - println!(" ✓ Created\n"); - - // 2. Add documents - println!("2. Adding documents..."); - workspace.add(&create_doc("doc-1", "Document One")).await?; - workspace.add(&create_doc("doc-2", "Document Two")).await?; - workspace - .add(&create_doc("doc-3", "Document Three")) - .await?; - println!(" ✓ Added 3 documents\n"); - - // 3. Concurrent access example - println!("3. Concurrent access from multiple tasks..."); - let ws = Arc::new(workspace); - - let mut handles = vec![]; - - // Spawn concurrent read tasks - for i in 1..=3 { - let ws_clone = ws.clone(); - let handle = tokio::spawn(async move { - let id = format!("doc-{}", i); - let doc = ws_clone.load(&id).await.unwrap().unwrap(); - println!(" [Task {}] Loaded: {}", i, doc.meta.name); - }); - handles.push(handle); - } - - // Wait for all tasks - for handle in handles { - handle.await.unwrap(); - } - println!(" ✓ All concurrent loads completed\n"); - - // 4. Cache stats - println!("4. Cache statistics:"); - let stats = ws.cache_stats().await; - println!(" - Hits: {}", stats.hits); - println!(" - Misses: {}", stats.misses); - println!(); - - // 5. Clone and share - println!("5. Workspace can be cloned cheaply (Arc internally)..."); - let ws2 = ws.clone(); - let ws3 = ws.clone(); - - let len1 = ws.len().await; - let len2 = ws2.len().await; - let len3 = ws3.len().await; - - println!( - " ws1.len() = {}, ws2.len() = {}, ws3.len() = {}", - len1, len2, len3 - ); - println!(" ✓ All clones share the same state\n"); - - // Cleanup - println!("Cleaning up..."); - std::fs::remove_dir_all(workspace_path).ok(); - println!(" ✓ Done!"); - - Ok(()) -} diff --git a/examples/storage_backend.rs b/examples/storage_backend.rs index 7ad6f6f5..a239013c 100644 --- a/examples/storage_backend.rs +++ b/examples/storage_backend.rs @@ -96,7 +96,8 @@ impl StorageBackend for LoggingMemoryBackend { } } -fn main() -> vectorless::Result<()> { +#[tokio::main] +async fn main() -> vectorless::Result<()> { println!("=== Custom Storage Backend Example ===\n"); // 1. Create custom backend @@ -106,7 +107,7 @@ fn main() -> vectorless::Result<()> { // 2. Create workspace with custom backend println!("2. Creating workspace with custom backend..."); - let mut workspace = Workspace::with_backend(backend)?; + let workspace = Workspace::with_backend(backend).await?; println!(" ✓ Workspace created\n"); // 3. Add a document (watch the logging) @@ -114,18 +115,18 @@ fn main() -> vectorless::Result<()> { let meta = DocumentMeta::new("custom-doc", "Custom Backend Test", "md"); let tree = DocumentTree::new("Root", "Testing custom backend!"); let doc = PersistedDocument::new(meta, tree); - workspace.add(&doc)?; + workspace.add(&doc).await?; println!(); // 4. Load the document println!("4. Loading document:"); - let loaded = workspace.load("custom-doc")?.unwrap(); + let loaded = workspace.load_and_cache("custom-doc").await?.unwrap(); println!(" ✓ Loaded: {}\n", loaded.meta.name); // 5. Show workspace stats println!("5. Workspace stats:"); - println!(" - Documents: {}", workspace.len()); - println!(" - Cache size: {}", workspace.cache_len()); + println!(" - Documents: {}", workspace.len().await); + println!(" - Cache size: {}", workspace.cache_len().await); println!(); println!("✓ Custom backend example complete!"); diff --git a/examples/storage_workspace.rs b/examples/storage_workspace.rs index c0067653..794a7145 100644 --- a/examples/storage_workspace.rs +++ b/examples/storage_workspace.rs @@ -4,7 +4,7 @@ //! Basic workspace usage example. //! //! This example demonstrates the core storage API: -//! - Creating a workspace +//! - Creating an async workspace //! - Adding documents //! - Loading documents with LRU cache //! - Listing and removing documents @@ -18,7 +18,8 @@ use vectorless::document::DocumentTree; use vectorless::storage::{DocumentMeta, PersistedDocument, Workspace}; -fn main() -> vectorless::Result<()> { +#[tokio::main] +async fn main() -> vectorless::Result<()> { println!("=== Storage Workspace Example ===\n"); // Create a temporary workspace @@ -26,7 +27,9 @@ fn main() -> vectorless::Result<()> { // 1. Create a workspace with custom cache size println!("1. Creating workspace at '{}'...", workspace_path); - let mut workspace = Workspace::with_cache_size(workspace_path, 100)?; + let workspace = Workspace::with_cache_size(workspace_path, 100) + .await + .map_err(|e| vectorless::Error::Workspace(e.to_string()))?; println!(" ✓ Workspace created\n"); // 2. Create a document @@ -42,13 +45,16 @@ fn main() -> vectorless::Result<()> { // 3. Add document to workspace println!("3. Adding document to workspace..."); - workspace.add(&doc)?; + workspace + .add(&doc) + .await + .map_err(|e| vectorless::Error::Workspace(e.to_string()))?; println!(" ✓ Document saved\n"); // 4. List all documents println!("4. Listing documents:"); - for id in workspace.list_documents() { - if let Some(meta) = workspace.get_meta(id) { + for id in workspace.list_documents().await { + if let Some(meta) = workspace.get_meta(&id).await { println!(" - {} ({})", meta.doc_name, meta.id); if let Some(ref desc) = meta.doc_description { println!(" Description: {}", desc); @@ -59,7 +65,11 @@ fn main() -> vectorless::Result<()> { // 5. Load document (uses LRU cache) println!("5. Loading document..."); - let loaded = workspace.load("doc-001")?.expect("Document should exist"); + let loaded = workspace + .load_and_cache("doc-001") + .await + .map_err(|e| vectorless::Error::Workspace(e.to_string()))? + .expect("Document should exist"); println!(" ✓ Loaded: {}", loaded.meta.name); let root = loaded.tree.root(); if let Some(node) = loaded.tree.get(root) { @@ -69,28 +79,34 @@ fn main() -> vectorless::Result<()> { // 6. Cache statistics println!("6. Cache statistics:"); - let stats = workspace.cache_stats(); + let stats = workspace.cache_stats().await; println!(" - Hits: {}", stats.hits); println!(" - Misses: {}", stats.misses); println!(" - Evictions: {}", stats.evictions); println!( " - Utilization: {:.1}%", - workspace.cache_utilization() * 100.0 + workspace.cache_utilization().await * 100.0 ); println!(); // 7. Load again (should hit cache) println!("7. Loading document again (should hit cache)..."); - let _ = workspace.load("doc-001")?; - let stats = workspace.cache_stats(); + let _ = workspace + .load_and_cache("doc-001") + .await + .map_err(|e| vectorless::Error::Workspace(e.to_string()))?; + let stats = workspace.cache_stats().await; println!(" ✓ Cache hits: {}", stats.hits); println!(); // 8. Remove document println!("8. Removing document..."); - let removed = workspace.remove("doc-001")?; + let removed = workspace + .remove("doc-001") + .await + .map_err(|e| vectorless::Error::Workspace(e.to_string()))?; println!(" ✓ Removed: {}", removed); - println!(" Workspace is empty: {}", workspace.is_empty()); + println!(" Workspace is empty: {}", workspace.is_empty().await); println!(); // Cleanup diff --git a/src/client/builder.rs b/src/client/builder.rs index 3243bcfa..12db36bb 100644 --- a/src/client/builder.rs +++ b/src/client/builder.rs @@ -2,6 +2,36 @@ // SPDX-License-Identifier: Apache-2.0 //! Builder pattern for creating Engine clients. +//! +//! This module provides [`EngineBuilder`] for configuring and building +//! [`Engine`] instances with sensible defaults. +//! +//! # Example +//! +//! ```rust,no_run +//! use vectorless::client::EngineBuilder; +//! +//! # #[tokio::main] +//! # async fn main() -> Result<(), vectorless::BuildError> { +//! // Simple setup with workspace +//! let engine = EngineBuilder::new() +//! .with_workspace("./my_workspace") +//! .with_openai(std::env::var("OPENAI_API_KEY").unwrap()) +//! .build() +//! .await?; +//! +//! // Advanced configuration +//! let engine = EngineBuilder::new() +//! .with_workspace("./data") +//! .with_model("gpt-4o", None) +//! .with_endpoint("https://api.openai.com/v1") +//! .with_top_k(10) +//! .precise() +//! .build() +//! .await?; +//! # Ok(()) +//! # } +//! ``` use std::path::PathBuf; @@ -20,15 +50,29 @@ const CONFIG_FILE_NAMES: &[&str] = &["vectorless.toml", "config.toml", ".vectorl /// The builder uses sensible defaults and automatically loads /// LLM configuration from environment variables or config files. /// +/// # Configuration Priority +/// +/// Configuration is loaded in this order (later overrides earlier): +/// 1. Default configuration +/// 2. Auto-detected config file +/// 3. Explicit config file (`with_config_path`) +/// 4. Custom config object (`with_config`) +/// 5. Individual builder methods +/// /// # Example /// /// ```rust,no_run /// use vectorless::client::EngineBuilder; /// +/// # #[tokio::main] +/// # async fn main() -> Result<(), vectorless::BuildError> { /// let client = EngineBuilder::new() /// .with_workspace("./my_workspace") -/// .build()?; -/// # Ok::<(), vectorless::domain::Error>(()) +/// .with_openai(std::env::var("OPENAI_API_KEY").unwrap()) +/// .build() +/// .await?; +/// # Ok(()) +/// # } /// ``` #[derive(Debug)] pub struct EngineBuilder { @@ -46,6 +90,24 @@ pub struct EngineBuilder { /// Event emitter. events: Option, + + /// LLM API key (override). + api_key: Option, + + /// LLM model name (override). + model: Option, + + /// LLM endpoint URL (override). + endpoint: Option, + + /// Top-K for retrieval (override). + top_k: Option, + + /// Fast mode flag. + fast_mode: bool, + + /// Precise mode flag. + precise_mode: bool, } impl EngineBuilder { @@ -58,10 +120,38 @@ impl EngineBuilder { config: None, retrieval_config: None, events: None, + api_key: None, + model: None, + endpoint: None, + top_k: None, + fast_mode: false, + precise_mode: false, } } + // ============================================================ + // Basic Configuration + // ============================================================ + /// Set the workspace path for document persistence. + /// + /// The workspace stores indexed documents and metadata. + /// If not set, defaults to `./workspace` or the value in config. + /// + /// # Example + /// + /// ```rust,no_run + /// use vectorless::client::EngineBuilder; + /// + /// # #[tokio::main] + /// # async fn main() -> Result<(), vectorless::BuildError> { + /// let engine = EngineBuilder::new() + /// .with_workspace("./data") + /// .build() + /// .await?; + /// # Ok(()) + /// # } + /// ``` #[must_use] pub fn with_workspace(mut self, path: impl Into) -> Self { self.workspace = Some(path.into()); @@ -69,13 +159,19 @@ impl EngineBuilder { } /// Set the configuration file path. + /// + /// If not set, the builder searches for `vectorless.toml`, + /// `config.toml`, or `.vectorless.toml` in the current directory + /// and parent directories. #[must_use] pub fn with_config_path(mut self, path: impl Into) -> Self { self.config_path = Some(path.into()); self } - /// Set a custom configuration. + /// Set a custom configuration object. + /// + /// This overrides any config file settings. #[must_use] pub fn with_config(mut self, config: Config) -> Self { self.config = Some(config); @@ -96,6 +192,133 @@ impl EngineBuilder { self } + // ============================================================ + // LLM Configuration + // ============================================================ + + /// Configure for OpenAI API. + /// + /// Uses `gpt-4o` model by default. Use [`with_model`](EngineBuilder::with_model) + /// to specify a different model. + /// + /// # Example + /// + /// ```rust,no_run + /// use vectorless::client::EngineBuilder; + /// + /// # #[tokio::main] + /// # async fn main() -> Result<(), vectorless::BuildError> { + /// let engine = EngineBuilder::new() + /// .with_workspace("./data") + /// .with_openai(std::env::var("OPENAI_API_KEY").unwrap()) + /// .build() + /// .await?; + /// # Ok(()) + /// # } + /// ``` + #[must_use] + pub fn with_openai(self, api_key: impl Into) -> Self { + self.with_model("gpt-4o", Some(api_key.into())) + } + + /// Set the LLM model and optional API key. + /// + /// # Arguments + /// + /// * `model` - Model name (e.g., "gpt-4o", "gpt-4o-mini", "claude-3-5-sonnet") + /// * `api_key` - Optional API key (uses environment variable if not provided) + /// + /// # Example + /// + /// ```rust,no_run + /// use vectorless::client::EngineBuilder; + /// + /// # #[tokio::main] + /// # async fn main() -> Result<(), vectorless::BuildError> { + /// let engine = EngineBuilder::new() + /// .with_workspace("./data") + /// .with_model("gpt-4o-mini", Some("sk-...".to_string())) + /// .build() + /// .await?; + /// # Ok(()) + /// # } + /// ``` + #[must_use] + pub fn with_model(mut self, model: impl Into, api_key: Option) -> Self { + self.model = Some(model.into()); + self.api_key = api_key; + self + } + + /// Set a custom LLM endpoint URL. + /// + /// Use this for OpenAI-compatible APIs (e.g., Azure OpenAI, local models). + /// + /// # Example + /// + /// ```rust,no_run + /// use vectorless::client::EngineBuilder; + /// + /// # #[tokio::main] + /// # async fn main() -> Result<(), vectorless::BuildError> { + /// let engine = EngineBuilder::new() + /// .with_workspace("./data") + /// .with_model("deepseek-chat", Some("sk-...".to_string())) + /// .with_endpoint("https://api.deepseek.com/v1") + /// .build() + /// .await?; + /// # Ok(()) + /// # } + /// ``` + #[must_use] + pub fn with_endpoint(mut self, url: impl Into) -> Self { + self.endpoint = Some(url.into()); + self + } + + // ============================================================ + // Retrieval Configuration + // ============================================================ + + /// Set the number of results to return from queries. + /// + /// Default is 5. Higher values return more context but cost more tokens. + #[must_use] + pub fn with_top_k(mut self, k: usize) -> Self { + self.top_k = Some(k); + self + } + + // ============================================================ + // Preset Configurations + // ============================================================ + + /// Enable fast mode for quicker but less thorough retrieval. + /// + /// Fast mode uses: + /// - Keyword-based retrieval (no LLM calls) + /// - Lower beam width / MCTS simulations + /// - Lazy summary generation + #[must_use] + pub fn fast(mut self) -> Self { + self.fast_mode = true; + self.precise_mode = false; + self + } + + /// Enable precise mode for higher quality retrieval. + /// + /// Precise mode uses: + /// - MCTS-based retrieval + /// - Higher simulation count + /// - Full summary generation + #[must_use] + pub fn precise(mut self) -> Self { + self.precise_mode = true; + self.fast_mode = false; + self + } + /// Search for config file in current directory and parent directories. fn find_config_file() -> Option { let current_dir = std::env::current_dir().ok()?; @@ -129,18 +352,29 @@ impl EngineBuilder { /// Build the Engine client. /// - /// Configuration is loaded from: - /// 1. Explicitly provided config (via `with_config`) - /// 2. Configuration file (auto-detected or specified via `with_config_path`) - /// 3. Default configuration (if no config file found) - /// /// # Errors /// /// Returns a [`BuildError`] if: /// - Configuration loading fails /// - Workspace creation fails /// - Required API key is missing - pub fn build(self) -> Result { + /// + /// # Example + /// + /// ```rust,no_run + /// use vectorless::client::EngineBuilder; + /// + /// # #[tokio::main] + /// # async fn main() -> Result<(), vectorless::BuildError> { + /// let engine = EngineBuilder::new() + /// .with_workspace("./data") + /// .with_openai(std::env::var("OPENAI_API_KEY").unwrap()) + /// .build() + /// .await?; + /// # Ok(()) + /// # } + /// ``` + pub async fn build(self) -> Result { // Load or create configuration let mut config = if let Some(config) = self.config { config @@ -157,20 +391,42 @@ impl EngineBuilder { Config::default() }; - // Override retrieval config if provided + // Apply builder overrides to retrieval config if let Some(retrieval_config) = self.retrieval_config { config.retrieval = retrieval_config; } + // Apply individual overrides + if let Some(api_key) = self.api_key { + config.retrieval.api_key = Some(api_key); + } + if let Some(model) = self.model { + config.retrieval.model = model; + } + if let Some(endpoint) = self.endpoint { + config.retrieval.endpoint = endpoint; + } + if let Some(top_k) = self.top_k { + config.retrieval.top_k = top_k; + } + + // Apply preset modes + if self.fast_mode { + config.retrieval.search.max_iterations = 5; + } + if self.precise_mode { + config.retrieval.search.max_iterations = 100; + } + // Open workspace: prefer explicit path, fallback to config - let workspace = if let Some(path) = &self.workspace { - Some(Workspace::open(path).map_err(|e| BuildError::Workspace(e.to_string()))?) - } else { - Some( - Workspace::open(&config.storage.workspace_dir) - .map_err(|e| BuildError::Workspace(e.to_string()))?, - ) - }; + let workspace_path = self + .workspace + .as_ref() + .unwrap_or(&config.storage.workspace_dir); + + let workspace = Workspace::new(workspace_path) + .await + .map_err(|e| BuildError::Workspace(e.to_string()))?; // Create pipeline executor with LLM client if API key is available let executor = if let Some(api_key) = config.summary.api_key.clone() { @@ -215,6 +471,7 @@ impl EngineBuilder { // Build engine Engine::with_components(config, workspace, retriever, executor) + .await .map_err(|e| BuildError::Other(e.to_string())) } } @@ -255,6 +512,8 @@ mod tests { fn test_builder_defaults() { let builder = EngineBuilder::new(); assert!(builder.workspace.is_none()); + assert!(!builder.fast_mode); + assert!(!builder.precise_mode); } #[test] @@ -263,4 +522,42 @@ mod tests { assert_eq!(builder.workspace, Some(PathBuf::from("./test_workspace"))); } + + #[test] + fn test_builder_with_openai() { + let builder = EngineBuilder::new().with_openai("sk-test-key"); + + assert_eq!(builder.model, Some("gpt-4o".to_string())); + assert_eq!(builder.api_key, Some("sk-test-key".to_string())); + } + + #[test] + fn test_builder_with_model() { + let builder = EngineBuilder::new().with_model("gpt-4o-mini", Some("sk-test".to_string())); + + assert_eq!(builder.model, Some("gpt-4o-mini".to_string())); + } + + #[test] + fn test_builder_fast_mode() { + let builder = EngineBuilder::new().fast(); + + assert!(builder.fast_mode); + assert!(!builder.precise_mode); + } + + #[test] + fn test_builder_precise_mode() { + let builder = EngineBuilder::new().precise(); + + assert!(builder.precise_mode); + assert!(!builder.fast_mode); + } + + #[test] + fn test_builder_top_k() { + let builder = EngineBuilder::new().with_top_k(10); + + assert_eq!(builder.top_k, Some(10)); + } } diff --git a/src/client/engine.rs b/src/client/engine.rs index 94745222..caaef8b5 100644 --- a/src/client/engine.rs +++ b/src/client/engine.rs @@ -19,17 +19,24 @@ //! # Example //! //! ```rust,no_run -//! use vectorless::client::{Engine, EngineBuilder}; +//! use vectorless::client::{Engine, EngineBuilder, IndexContext}; //! //! # #[tokio::main] -//! # async fn main() -> vectorless::domain::Result<()> { +//! # async fn main() -> vectorless::Result<()> { //! // Create a client //! let client = EngineBuilder::new() //! .with_workspace("./my_workspace") //! .build()?; //! -//! // Index a document -//! let doc_id = client.index("./document.md").await?; +//! // Index a document from file +//! let doc_id = client.index(IndexContext::from_path("./document.md")).await?; +//! +//! // Index HTML content +//! let html = "

Title

Content

"; +//! let doc_id2 = client.index( +//! IndexContext::from_content(html, vectorless::parser::DocumentFormat::Html) +//! .with_name("webpage") +//! ).await?; //! //! // Query the document //! let result = client.query(&doc_id, "What is this?").await?; @@ -39,8 +46,7 @@ //! # } //! ``` -use std::path::Path; -use std::sync::{Arc, Mutex, RwLock}; +use std::sync::Arc; use tracing::info; @@ -53,10 +59,11 @@ use crate::{DocumentTree, Error}; use super::context::ClientContext; use super::events::EventEmitter; +use super::index_context::IndexContext; use super::indexer::IndexerClient; use super::retriever::RetrieverClient; use super::session::Session; -use super::types::{DocumentInfo, IndexOptions, QueryResult}; +use super::types::{DocumentInfo, QueryResult}; use super::workspace::WorkspaceClient; /// The main Engine client. @@ -99,14 +106,18 @@ impl Engine { /// Create a new client with default configuration. /// /// Note: Prefer using [`Engine::builder()`] for more control. - fn new() -> Result { + async fn new() -> Result { let config = Config::default(); + let workspace = Workspace::new("./workspace") + .await + .map_err(|e| Error::Workspace(e.to_string()))?; Self::with_components( config, - None, + workspace, PipelineRetriever::new(), PipelineExecutor::new(), ) + .await } // ============================================================ @@ -114,9 +125,9 @@ impl Engine { // ============================================================ /// Create a new client with the given components. - pub(crate) fn with_components( + pub(crate) async fn with_components( config: Config, - workspace: Option, + workspace: Workspace, retriever: PipelineRetriever, executor: PipelineExecutor, ) -> Result { @@ -130,15 +141,16 @@ impl Engine { let retriever = RetrieverClient::new(retriever, Arc::clone(&config)).with_events(events.clone()); - // Create workspace client (if workspace provided) - let workspace_client = - workspace.map(|ws| WorkspaceClient::new(ws).with_events(events.clone())); + // Create workspace client + let workspace_client = WorkspaceClient::new(workspace) + .await + .with_events(events.clone()); Ok(Self { config, indexer, retriever, - workspace: workspace_client, + workspace: Some(workspace_client), events, }) } @@ -147,36 +159,65 @@ impl Engine { // Document Indexing // ============================================================ - /// Index a document from a file path. + /// Index a document. + /// + /// This is the main entry point for indexing documents. The [`IndexContext`] + /// parameter specifies the source (file path, content string, or bytes) + /// and indexing options. + /// + /// # Arguments + /// + /// * `ctx` - The index context containing source and options + /// + /// # Returns /// - /// Returns a unique document ID. + /// A unique document ID string. /// /// # Errors /// /// Returns an error if: - /// - The file does not exist + /// - The file does not exist (for path sources) /// - The file format is not supported /// - The pipeline execution fails - pub async fn index(&self, path: impl AsRef) -> Result { - self.index_with_options(path, IndexOptions::default()).await - } - - /// Index a document with custom options. /// - /// # Errors + /// # Example /// - /// See [`Engine::index`]. - pub async fn index_with_options( - &self, - path: impl AsRef, - options: IndexOptions, - ) -> Result { - let doc = self.indexer.index_with_options(path, options).await?; + /// ```rust,no_run + /// use vectorless::client::{Engine, EngineBuilder, IndexContext, IndexMode}; + /// use vectorless::parser::DocumentFormat; + /// + /// # #[tokio::main] + /// # async fn main() -> vectorless::Result<()> { + /// let engine = EngineBuilder::new() + /// .with_workspace("./data") + /// .build()?; + /// + /// // From file + /// let id1 = engine.index(IndexContext::from_path("./doc.md")).await?; + /// + /// // From content + /// let html = "

Title

"; + /// let id2 = engine.index( + /// IndexContext::from_content(html, DocumentFormat::Html) + /// .with_name("webpage") + /// ).await?; + /// + /// // From bytes with force mode + /// let pdf_bytes = std::fs::read("./doc.pdf")?; + /// let id3 = engine.index( + /// IndexContext::from_bytes(pdf_bytes, DocumentFormat::Pdf) + /// .with_mode(IndexMode::Force) + /// ).await?; + /// # Ok(()) + /// # } + /// ``` + pub async fn index(&self, ctx: IndexContext) -> Result { + let doc = self.indexer.index(ctx).await?; let persisted = self.indexer.to_persisted(doc); // Save to workspace if configured if let Some(ref workspace) = self.workspace { - workspace.save(&persisted)?; + workspace.save(&persisted).await?; } let doc_id = persisted.meta.id.clone(); @@ -199,7 +240,7 @@ impl Engine { /// - The document is not found /// - The retrieval fails pub async fn query(&self, doc_id: &str, question: &str) -> Result { - let tree = self.get_structure(doc_id)?; + let tree = self.get_structure(doc_id).await?; let options = RetrieveOptions::new() .with_top_k(self.config.retrieval.top_k) @@ -221,7 +262,7 @@ impl Engine { question: &str, ctx: &ClientContext, ) -> Result { - let tree = self.get_structure(doc_id)?; + let tree = self.get_structure(doc_id).await?; let mut options = RetrieveOptions::new() .with_top_k(self.config.retrieval.top_k) @@ -255,13 +296,17 @@ impl Engine { /// - Automatic caching of document trees /// - Cross-document queries /// - Session statistics - pub fn session(&self) -> Session { - let workspace = self.workspace.clone().unwrap_or_else(|| { - WorkspaceClient::from_arc( - Arc::new(RwLock::new(Workspace::open("./temp_workspace").unwrap())), - self.events.clone(), - ) - }); + pub async fn session(&self) -> Session { + let workspace = match &self.workspace { + Some(ws) => ws.clone(), + None => { + // Create a temporary workspace if none configured + let async_ws = Workspace::new("./temp_workspace") + .await + .expect("Failed to create temp workspace"); + WorkspaceClient::new(async_ws).await + } + }; Session::new( self.indexer.clone(), @@ -276,12 +321,17 @@ impl Engine { // ============================================================ /// Get a list of all indexed documents. - #[must_use] - pub fn list_documents(&self) -> Vec { - match &self.workspace { - Some(workspace) => workspace.list().unwrap_or_default(), - None => Vec::new(), - } + /// + /// # Errors + /// + /// Returns an error if the workspace operation fails. + pub async fn list_documents(&self) -> Result> { + let workspace = self + .workspace + .as_ref() + .ok_or_else(|| Error::Config("No workspace configured".to_string()))?; + + workspace.list().await } /// Get document structure (tree). @@ -291,14 +341,15 @@ impl Engine { /// Returns an error if: /// - No workspace is configured /// - The document is not found - pub fn get_structure(&self, doc_id: &str) -> Result { + pub async fn get_structure(&self, doc_id: &str) -> Result { let workspace = self .workspace .as_ref() .ok_or_else(|| Error::Config("No workspace configured".to_string()))?; let doc = workspace - .load(doc_id)? + .load(doc_id) + .await? .ok_or_else(|| Error::DocumentNotFound(format!("Document not found: {}", doc_id)))?; Ok(doc.tree) @@ -312,14 +363,15 @@ impl Engine { /// - No workspace is configured /// - The document is not found /// - No page content is available - pub fn get_page_content(&self, doc_id: &str, pages: &str) -> Result { + pub async fn get_page_content(&self, doc_id: &str, pages: &str) -> Result { let workspace = self .workspace .as_ref() .ok_or_else(|| Error::Config("No workspace configured".to_string()))?; let doc = workspace - .load(doc_id)? + .load(doc_id) + .await? .ok_or_else(|| Error::DocumentNotFound(format!("Document not found: {}", doc_id)))?; if doc.pages.is_empty() { @@ -381,17 +433,17 @@ impl Engine { /// # Errors /// /// Returns an error if no workspace is configured. - pub fn load(&self, doc_id: &str) -> Result { + pub async fn load(&self, doc_id: &str) -> Result { let workspace = self .workspace .as_ref() .ok_or_else(|| Error::Config("No workspace configured".to_string()))?; - if !workspace.exists(doc_id)? { + if !workspace.exists(doc_id).await? { return Ok(false); } - let _ = workspace.load(doc_id)?; + let _ = workspace.load(doc_id).await?; Ok(true) } @@ -400,13 +452,13 @@ impl Engine { /// # Errors /// /// Returns an error if no workspace is configured. - pub fn remove(&self, doc_id: &str) -> Result { + pub async fn remove(&self, doc_id: &str) -> Result { let workspace = self .workspace .as_ref() .ok_or_else(|| Error::Config("No workspace configured".to_string()))?; - workspace.remove(doc_id) + workspace.remove(doc_id).await } /// Check if a document exists in the workspace. @@ -414,13 +466,13 @@ impl Engine { /// # Errors /// /// Returns an error if no workspace is configured. - pub fn exists(&self, doc_id: &str) -> Result { + pub async fn exists(&self, doc_id: &str) -> Result { let workspace = self .workspace .as_ref() .ok_or_else(|| Error::Config("No workspace configured".to_string()))?; - workspace.exists(doc_id) + workspace.exists(doc_id).await } /// Get metadata for a document. @@ -428,13 +480,13 @@ impl Engine { /// # Errors /// /// Returns an error if no workspace is configured. - pub fn get_metadata(&self, doc_id: &str) -> Result> { + pub async fn get_metadata(&self, doc_id: &str) -> Result> { let workspace = self .workspace .as_ref() .ok_or_else(|| Error::Config("No workspace configured".to_string()))?; - workspace.get_document_info(doc_id) + workspace.get_document_info(doc_id).await } /// Remove multiple documents from the workspace. @@ -444,13 +496,13 @@ impl Engine { /// # Errors /// /// Returns an error if no workspace is configured. - pub fn batch_remove(&self, doc_ids: &[&str]) -> Result { + pub async fn batch_remove(&self, doc_ids: &[&str]) -> Result { let workspace = self .workspace .as_ref() .ok_or_else(|| Error::Config("No workspace configured".to_string()))?; - workspace.batch_remove(doc_ids) + workspace.batch_remove(doc_ids).await } /// Remove all documents from the workspace. @@ -460,25 +512,36 @@ impl Engine { /// # Errors /// /// Returns an error if no workspace is configured. - pub fn clear(&self) -> Result { + pub async fn clear(&self) -> Result { let workspace = self .workspace .as_ref() .ok_or_else(|| Error::Config("No workspace configured".to_string()))?; - workspace.clear() + workspace.clear().await } /// Get the number of indexed documents. - #[must_use] - pub fn len(&self) -> usize { - self.workspace.as_ref().map(|w| w.len()).unwrap_or(0) + /// + /// # Errors + /// + /// Returns an error if the workspace operation fails. + pub async fn len(&self) -> Result { + let workspace = self + .workspace + .as_ref() + .ok_or_else(|| Error::Config("No workspace configured".to_string()))?; + + Ok(workspace.len().await) } /// Check if there are no documents. - #[must_use] - pub fn is_empty(&self) -> bool { - self.len() == 0 + /// + /// # Errors + /// + /// Returns an error if the workspace operation fails. + pub async fn is_empty(&self) -> Result { + Ok(self.len().await? == 0) } // ============================================================ @@ -518,17 +581,10 @@ impl Clone for Engine { } } -impl Default for Engine { - fn default() -> Self { - Self::new().expect("Failed to create default Engine client") - } -} - impl std::fmt::Debug for Engine { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("Engine") .field("has_workspace", &self.workspace.is_some()) - .field("doc_count", &self.len()) .finish_non_exhaustive() } } diff --git a/src/client/index_context.rs b/src/client/index_context.rs new file mode 100644 index 00000000..88ad7fd0 --- /dev/null +++ b/src/client/index_context.rs @@ -0,0 +1,440 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Index context for document indexing operations. +//! +//! This module provides [`IndexContext`], a unified type for specifying +//! document input sources for the [`Engine::index`](super::Engine::index) method. +//! +//! # Overview +//! +//! `IndexContext` supports three input types: +//! - **File path** - Load and parse a file from disk +//! - **Content string** - Parse content directly (for HTML, Markdown, text) +//! - **Byte data** - Parse binary data (for PDF, DOCX) +//! +//! # Examples +//! +//! ## From file path +//! +//! ```rust,no_run +//! use vectorless::client::IndexContext; +//! +//! let ctx = IndexContext::from_path("./document.md"); +//! ``` +//! +//! ## From content string +//! +//! ```rust +//! use vectorless::client::IndexContext; +//! use vectorless::parser::DocumentFormat; +//! +//! let html = "

Title

Content

"; +//! let ctx = IndexContext::from_content(html, DocumentFormat::Html) +//! .with_name("webpage"); +//! ``` +//! +//! ## From bytes +//! +//! ```rust +//! use vectorless::client::IndexContext; +//! use vectorless::parser::DocumentFormat; +//! +//! let pdf_bytes = vec![/* PDF binary data */]; +//! let ctx = IndexContext::from_bytes(pdf_bytes, DocumentFormat::Pdf); +//! ``` +//! +//! ## With options +//! +//! ```rust,no_run +//! use vectorless::client::{IndexContext, IndexMode}; +//! +//! let ctx = IndexContext::from_path("./document.pdf") +//! .with_mode(IndexMode::Force); +//! ``` + +use std::path::PathBuf; + +use crate::parser::DocumentFormat; + +use super::types::{IndexMode, IndexOptions}; + +// ============================================================ +// Index Source +// ============================================================ + +/// The source of document content for indexing. +/// +/// This enum represents the different ways a document can be provided +/// to the indexing pipeline. +#[derive(Debug, Clone)] +pub enum IndexSource { + /// Load document from a file path. + /// + /// The format is detected from the file extension. + Path(PathBuf), + + /// Parse document from a string. + /// + /// Used for text-based formats like HTML and Markdown. + /// The format must be explicitly specified. + Content { + /// The document content as a UTF-8 string. + data: String, + /// The document format. + format: DocumentFormat, + }, + + /// Parse document from binary data. + /// + /// Used for binary formats like PDF and DOCX. + /// The format must be explicitly specified. + Bytes { + /// The document content as raw bytes. + data: Vec, + /// The document format. + format: DocumentFormat, + }, +} + +impl IndexSource { + /// Get the format of this source, if known. + /// + /// Returns `None` for `Path` sources (format detected from extension). + pub fn format(&self) -> Option { + match self { + IndexSource::Path(_) => None, + IndexSource::Content { format, .. } => Some(*format), + IndexSource::Bytes { format, .. } => Some(*format), + } + } + + /// Check if this is a path source. + pub fn is_path(&self) -> bool { + matches!(self, IndexSource::Path(_)) + } + + /// Check if this is a content source. + pub fn is_content(&self) -> bool { + matches!(self, IndexSource::Content { .. }) + } + + /// Check if this is a bytes source. + pub fn is_bytes(&self) -> bool { + matches!(self, IndexSource::Bytes { .. }) + } +} + +// ============================================================ +// Index Context +// ============================================================ + +/// Context for document indexing operations. +/// +/// `IndexContext` provides a unified interface for specifying document +/// input sources. It supports files, content strings, and binary data. +/// +/// # Type Parameters +/// +/// The context is constructed using one of: +/// - [`IndexContext::from_path`] - Load from file +/// - [`IndexContext::from_content`] - Parse string content +/// - [`IndexContext::from_bytes`] - Parse binary data +/// +/// Additional configuration can be chained: +/// - [`with_name`](IndexContext::with_name) - Set document name +/// - [`with_options`](IndexContext::with_options) - Set indexing options +/// - [`with_mode`](IndexContext::with_mode) - Set indexing mode +/// +/// # Examples +/// +/// ```rust,no_run +/// use vectorless::client::{Engine, EngineBuilder, IndexContext, IndexMode}; +/// use vectorless::parser::DocumentFormat; +/// +/// # #[tokio::main] +/// # async fn main() -> vectorless::Result<()> { +/// let engine = EngineBuilder::new() +/// .with_workspace("./data") +/// .build()?; +/// +/// // Index from file +/// let id1 = engine.index(IndexContext::from_path("./doc.md")).await?; +/// +/// // Index HTML content +/// let html = "

Title

Content

"; +/// let id2 = engine.index( +/// IndexContext::from_content(html, DocumentFormat::Html) +/// .with_name("webpage") +/// ).await?; +/// +/// // Index with force mode +/// let id3 = engine.index( +/// IndexContext::from_path("./doc.pdf") +/// .with_mode(IndexMode::Force) +/// ).await?; +/// +/// # Ok(()) +/// # } +/// ``` +#[derive(Debug, Clone)] +pub struct IndexContext { + /// The document source. + pub(crate) source: IndexSource, + + /// Optional document name for metadata. + /// + /// If not set, the name is derived from: + /// - File name (for path sources) + /// - "untitled" (for content/bytes sources) + pub(crate) name: Option, + + /// Indexing options. + pub(crate) options: IndexOptions, +} + +impl IndexContext { + /// Create an index context from a file path. + /// + /// The document format is automatically detected from the file extension. + /// + /// # Supported Extensions + /// + /// - `.md`, `.markdown` → Markdown + /// - `.pdf` → PDF + /// - `.docx` → DOCX + /// - `.html`, `.htm` → HTML + /// - `.txt` → Plain text + /// + /// # Example + /// + /// ```rust + /// use vectorless::client::IndexContext; + /// + /// let ctx = IndexContext::from_path("./documents/report.pdf"); + /// ``` + pub fn from_path(path: impl Into) -> Self { + Self { + source: IndexSource::Path(path.into()), + name: None, + options: IndexOptions::default(), + } + } + + /// Create an index context from a content string. + /// + /// Use this for text-based formats where you have the content + /// as a string. The format must be explicitly specified. + /// + /// # Example + /// + /// ```rust + /// use vectorless::client::IndexContext; + /// use vectorless::parser::DocumentFormat; + /// + /// let markdown = "# Title\n\nContent here."; + /// let ctx = IndexContext::from_content(markdown, DocumentFormat::Markdown); + /// ``` + pub fn from_content(content: impl Into, format: DocumentFormat) -> Self { + Self { + source: IndexSource::Content { + data: content.into(), + format, + }, + name: None, + options: IndexOptions::default(), + } + } + + /// Create an index context from binary data. + /// + /// Use this for binary formats like PDF and DOCX where you + /// have the raw bytes. The format must be explicitly specified. + /// + /// # Example + /// + /// ```rust + /// use vectorless::client::IndexContext; + /// use vectorless::parser::DocumentFormat; + /// + /// let pdf_bytes: Vec = vec![/* PDF binary data */]; + /// let ctx = IndexContext::from_bytes(pdf_bytes, DocumentFormat::Pdf); + /// ``` + pub fn from_bytes(bytes: Vec, format: DocumentFormat) -> Self { + Self { + source: IndexSource::Bytes { + data: bytes, + format, + }, + name: None, + options: IndexOptions::default(), + } + } + + /// Set the document name. + /// + /// The name is used in document metadata and listings. + /// If not set, it's derived from the source. + /// + /// # Example + /// + /// ```rust + /// use vectorless::client::IndexContext; + /// use vectorless::parser::DocumentFormat; + /// + /// let ctx = IndexContext::from_content("...", DocumentFormat::Html) + /// .with_name("homepage"); + /// ``` + pub fn with_name(mut self, name: impl Into) -> Self { + self.name = Some(name.into()); + self + } + + /// Set the indexing options. + /// + /// # Example + /// + /// ```rust + /// use vectorless::client::{IndexContext, IndexOptions, IndexMode}; + /// + /// let options = IndexOptions { + /// mode: IndexMode::Force, + /// ..Default::default() + /// }; + /// + /// let ctx = IndexContext::from_path("./doc.md") + /// .with_options(options); + /// ``` + pub fn with_options(mut self, options: IndexOptions) -> Self { + self.options = options; + self + } + + /// Set the indexing mode. + /// + /// This is a convenience method for setting just the mode. + /// + /// # Modes + /// + /// - [`IndexMode::Default`] - Skip if already indexed (default) + /// - [`IndexMode::Force`] - Always re-index + /// - [`IndexMode::Incremental`] - Only re-index changed files + /// + /// # Example + /// + /// ```rust + /// use vectorless::client::{IndexContext, IndexMode}; + /// + /// let ctx = IndexContext::from_path("./doc.md") + /// .with_mode(IndexMode::Force); + /// ``` + pub fn with_mode(mut self, mode: IndexMode) -> Self { + self.options.mode = mode; + self + } + + /// Get the source of this context. + pub fn source(&self) -> &IndexSource { + &self.source + } + + /// Get the document name, if set. + pub fn name(&self) -> Option<&str> { + self.name.as_deref() + } + + /// Get the indexing options. + pub fn options(&self) -> &IndexOptions { + &self.options + } +} + +impl From for IndexContext { + fn from(path: PathBuf) -> Self { + Self::from_path(path) + } +} + +impl From<&std::path::Path> for IndexContext { + fn from(path: &std::path::Path) -> Self { + Self::from_path(path.to_path_buf()) + } +} + +impl std::fmt::Display for IndexSource { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + IndexSource::Path(p) => write!(f, "path:{}", p.display()), + IndexSource::Content { format, .. } => write!(f, "content:{}", format.extension()), + IndexSource::Bytes { format, .. } => write!(f, "bytes:{}", format.extension()), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_from_path() { + let ctx = IndexContext::from_path("./test.md"); + assert!(ctx.source.is_path()); + assert!(ctx.name.is_none()); + } + + #[test] + fn test_from_content() { + let ctx = IndexContext::from_content("# Title", DocumentFormat::Markdown); + assert!(ctx.source.is_content()); + assert!(ctx.name.is_none()); + } + + #[test] + fn test_from_bytes() { + let ctx = IndexContext::from_bytes(vec![1, 2, 3], DocumentFormat::Pdf); + assert!(ctx.source.is_bytes()); + } + + #[test] + fn test_with_name() { + let ctx = IndexContext::from_path("./test.md").with_name("My Document"); + + assert_eq!(ctx.name(), Some("My Document")); + } + + #[test] + fn test_with_mode() { + let ctx = IndexContext::from_path("./test.md").with_mode(IndexMode::Force); + + assert_eq!(ctx.options.mode, IndexMode::Force); + } + + #[test] + fn test_chaining() { + let ctx = IndexContext::from_content("", DocumentFormat::Html) + .with_name("page") + .with_mode(IndexMode::Force); + + assert!(ctx.source.is_content()); + assert_eq!(ctx.name(), Some("page")); + assert_eq!(ctx.options.mode, IndexMode::Force); + } + + #[test] + fn test_from_path_trait() { + let ctx = IndexContext::from(PathBuf::from("./test.md")); + assert!(ctx.source.is_path()); + } + + #[test] + fn test_source_format() { + let content_source = IndexSource::Content { + data: "test".to_string(), + format: DocumentFormat::Html, + }; + assert_eq!(content_source.format(), Some(DocumentFormat::Html)); + + let path_source = IndexSource::Path(PathBuf::from("./test.md")); + assert_eq!(path_source.format(), None); + } +} diff --git a/src/client/indexer.rs b/src/client/indexer.rs index 4462992b..679dbea0 100644 --- a/src/client/indexer.rs +++ b/src/client/indexer.rs @@ -9,14 +9,15 @@ //! # Example //! //! ```rust,ignore +//! use vectorless::client::{IndexerClient, IndexContext}; +//! //! let indexer = IndexerClient::new(executor); //! //! let result = indexer -//! .index("./document.md") -//! .with_summaries() +//! .index(IndexContext::from_path("./document.md")) //! .await?; //! -//! println!("Indexed: {} ({} nodes)", result.doc_id, result.node_count); +//! println!("Indexed: {} ({} nodes)", result.id, result.tree.as_ref().map(|t| t.node_count()).unwrap_or(0)); //! ``` use std::path::{Path, PathBuf}; @@ -32,7 +33,8 @@ use crate::storage::{DocumentMeta, PersistedDocument}; use super::context::ClientContext; use super::events::{EventEmitter, IndexEvent}; -use super::types::{IndexMode as ClientIndexMode, IndexOptions, IndexedDocument}; +use super::index_context::{IndexContext, IndexSource}; +use super::types::{IndexOptions, IndexedDocument}; /// Document indexing client. /// @@ -106,29 +108,46 @@ impl IndexerClient { } } - /// Index a document from a file path. + /// Index a document from an index context. + /// + /// This is the main entry point for indexing documents. The context + /// specifies the source (path, content, or bytes) and options. /// /// # Errors /// /// Returns an error if: - /// - The file does not exist + /// - The file does not exist (for path sources) /// - The file format is not supported /// - The pipeline execution fails - pub async fn index(&self, path: impl AsRef) -> Result { - self.index_with_options(path, IndexOptions::default()).await - } - - /// Index a document with custom options. /// - /// # Errors + /// # Example /// - /// See [`IndexerClient::index`]. - pub async fn index_with_options( - &self, - path: impl AsRef, - options: IndexOptions, - ) -> Result { - let path = path.as_ref(); + /// ```rust,ignore + /// use vectorless::client::{IndexerClient, IndexContext}; + /// use vectorless::parser::DocumentFormat; + /// + /// // From file path + /// let doc = indexer.index(IndexContext::from_path("./doc.md")).await?; + /// + /// // From HTML content + /// let html = "

Title

"; + /// let doc = indexer.index( + /// IndexContext::from_content(html, DocumentFormat::Html) + /// .with_name("webpage") + /// ).await?; + /// ``` + pub async fn index(&self, ctx: IndexContext) -> Result { + match &ctx.source { + IndexSource::Path(path) => self.index_from_path(path, &ctx).await, + IndexSource::Content { data, format } => { + self.index_from_content(data, *format, &ctx).await + } + IndexSource::Bytes { data, format } => self.index_from_bytes(data, *format, &ctx).await, + } + } + + /// Index from a file path. + async fn index_from_path(&self, path: &Path, ctx: &IndexContext) -> Result { let path = path.canonicalize().unwrap_or_else(|_| path.to_path_buf()); if !path.exists() { @@ -143,31 +162,15 @@ impl IndexerClient { // Generate document ID let doc_id = Uuid::new_v4().to_string(); - // Detect format - let format = self.detect_format(&path, &options)?; + // Detect format from extension + let format = self.detect_format_from_path(&path)?; self.events .emit_index(IndexEvent::FormatDetected { format }); info!("Indexing {:?} document: {}", format, path.display()); - // Convert client options to pipeline options - let pipeline_options = PipelineOptions { - mode: match options.mode { - ClientIndexMode::Auto => IndexMode::Auto, - ClientIndexMode::Pdf => IndexMode::Pdf, - ClientIndexMode::Markdown => IndexMode::Markdown, - ClientIndexMode::Html => IndexMode::Html, - ClientIndexMode::Docx => IndexMode::Docx, - }, - generate_ids: options.generate_ids, - summary_strategy: if options.generate_summaries { - SummaryStrategy::selective(self.config.min_summary_tokens, false) - } else { - SummaryStrategy::none() - }, - generate_description: options.generate_description, - ..Default::default() - }; + // Build pipeline options + let pipeline_options = self.build_pipeline_options(&ctx.options, format); // Create pipeline input and execute let input = IndexInput::file(&path); @@ -179,7 +182,111 @@ impl IndexerClient { executor.execute(input, pipeline_options).await? }; - // Build indexed document + self.build_indexed_document(doc_id, result, format, ctx.name.as_deref(), Some(&path)) + } + + /// Index from content string. + async fn index_from_content( + &self, + content: &str, + format: DocumentFormat, + ctx: &IndexContext, + ) -> Result { + // Emit start event + self.events.emit_index(IndexEvent::Started { + path: ctx.name.clone().unwrap_or_else(|| "content".to_string()), + }); + + let doc_id = Uuid::new_v4().to_string(); + self.events + .emit_index(IndexEvent::FormatDetected { format }); + + info!("Indexing {:?} document from content", format); + + let pipeline_options = self.build_pipeline_options(&ctx.options, format); + + let input = IndexInput::content(content); + let result = { + let mut executor = self + .executor + .lock() + .map_err(|_| Error::Other("Pipeline executor lock poisoned".to_string()))?; + executor.execute(input, pipeline_options).await? + }; + + self.build_indexed_document(doc_id, result, format, ctx.name.as_deref(), None) + } + + /// Index from binary data. + async fn index_from_bytes( + &self, + bytes: &[u8], + format: DocumentFormat, + ctx: &IndexContext, + ) -> Result { + // Emit start event + self.events.emit_index(IndexEvent::Started { + path: ctx.name.clone().unwrap_or_else(|| "bytes".to_string()), + }); + + let doc_id = Uuid::new_v4().to_string(); + self.events + .emit_index(IndexEvent::FormatDetected { format }); + + info!( + "Indexing {:?} document from bytes ({} bytes)", + format, + bytes.len() + ); + + let pipeline_options = self.build_pipeline_options(&ctx.options, format); + + let input = IndexInput::bytes(bytes); + let result = { + let mut executor = self + .executor + .lock() + .map_err(|_| Error::Other("Pipeline executor lock poisoned".to_string()))?; + executor.execute(input, pipeline_options).await? + }; + + self.build_indexed_document(doc_id, result, format, ctx.name.as_deref(), None) + } + + /// Build pipeline options from client options. + fn build_pipeline_options( + &self, + options: &IndexOptions, + format: DocumentFormat, + ) -> PipelineOptions { + PipelineOptions { + mode: match format { + DocumentFormat::Markdown => IndexMode::Markdown, + DocumentFormat::Pdf => IndexMode::Pdf, + DocumentFormat::Html => IndexMode::Html, + DocumentFormat::Docx => IndexMode::Docx, + DocumentFormat::Text => IndexMode::Auto, + }, + generate_ids: options.generate_ids, + summary_strategy: if options.generate_summaries { + SummaryStrategy::selective(self.config.min_summary_tokens, false) + } else { + SummaryStrategy::none() + }, + generate_description: options.generate_description, + ..Default::default() + } + } + + /// Build indexed document from pipeline result. + fn build_indexed_document( + &self, + doc_id: String, + result: crate::index::IndexResult, + format: DocumentFormat, + name: Option<&str>, + path: Option<&Path>, + ) -> Result { let tree = result .tree .ok_or_else(|| Error::Parse("Document tree not generated".to_string()))?; @@ -187,11 +294,22 @@ impl IndexerClient { let node_count = tree.node_count(); self.events.emit_index(IndexEvent::TreeBuilt { node_count }); + let doc_name = name + .map(str::to_string) + .or_else(|| { + path.and_then(|p| p.file_stem()) + .map(|s| s.to_string_lossy().to_string()) + }) + .unwrap_or_else(|| result.name.clone()); + let mut doc = IndexedDocument::new(&doc_id, format) - .with_name(&result.name) - .with_source_path(&path) + .with_name(&doc_name) .with_tree(tree); + if let Some(p) = path { + doc = doc.with_source_path(p); + } + if let Some(desc) = &result.description { doc = doc.with_description(desc); } @@ -206,19 +324,11 @@ impl IndexerClient { Ok(doc) } - /// Detect document format from path and options. - pub fn detect_format(&self, path: &Path, options: &IndexOptions) -> Result { - match options.mode { - ClientIndexMode::Auto => { - let ext = path.extension().and_then(|e| e.to_str()).unwrap_or(""); - DocumentFormat::from_extension(ext) - .ok_or_else(|| Error::Parse(format!("Unknown format: {}", ext))) - } - ClientIndexMode::Pdf => Ok(DocumentFormat::Pdf), - ClientIndexMode::Markdown => Ok(DocumentFormat::Markdown), - ClientIndexMode::Html => Ok(DocumentFormat::Html), - ClientIndexMode::Docx => Ok(DocumentFormat::Docx), - } + /// Detect document format from file extension. + fn detect_format_from_path(&self, path: &Path) -> Result { + let ext = path.extension().and_then(|e| e.to_str()).unwrap_or(""); + DocumentFormat::from_extension(ext) + .ok_or_else(|| Error::Parse(format!("Unsupported format: {}", ext))) } /// Validate a document before indexing. @@ -257,7 +367,7 @@ impl IndexerClient { if format.is_none() { return Ok(ValidationResult { valid: false, - errors: vec![format!("Unknown format: {}", ext)], + errors: vec![format!("Unsupported format: {}", ext)], warnings, format: None, estimated_size, diff --git a/src/client/mod.rs b/src/client/mod.rs index 33093d8c..abc2b8e1 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -6,6 +6,7 @@ //! This module provides the main entry point for using vectorless: //! - [`Engine`] — The main client for indexing and querying documents //! - [`EngineBuilder`] — Builder pattern for client configuration +//! - [`IndexContext`] — Unified input for document indexing //! - [`Session`] — Multi-document session management //! //! # Architecture @@ -14,42 +15,47 @@ //! //! ```text //! client/ -//! ├── mod.rs → Re-exports and documentation -//! ├── engine.rs → Main orchestrator -//! ├── builder.rs → Builder pattern -//! ├── types.rs → Public API types -//! ├── context.rs → Request context and configuration -//! ├── session.rs → Session management -//! ├── indexer.rs → Document indexing operations -//! ├── retriever.rs → Query and retrieval operations -//! ├── workspace.rs → Workspace CRUD operations -//! └── events.rs → Event system and callbacks +//! ├── mod.rs → Re-exports and documentation +//! ├── engine.rs → Main orchestrator +//! ├── builder.rs → Builder pattern +//! ├── index_context.rs → Index input types +//! ├── types.rs → Public API types +//! ├── context.rs → Request context and configuration +//! ├── session.rs → Session management +//! ├── indexer.rs → Document indexing operations +//! ├── retriever.rs → Query and retrieval operations +//! ├── workspace.rs → Workspace CRUD operations +//! └── events.rs → Event system and callbacks //! ``` //! //! # Quick Start //! //! ```rust,no_run -//! use vectorless::client::{Engine, EngineBuilder}; +//! use vectorless::client::{Engine, EngineBuilder, IndexContext}; //! //! # #[tokio::main] -//! # async fn main() -> vectorless::domain::Result<()> { +//! # async fn main() -> vectorless::Result<()> { //! // Create a client with default settings //! let client = EngineBuilder::new() //! .with_workspace("./my_workspace") //! .build()?; //! -//! // Index a document -//! let doc_id = client.index("./document.md").await?; +//! // Index a document from file +//! let doc_id = client.index(IndexContext::from_path("./document.md")).await?; //! -//! // Get document structure -//! let structure = client.get_structure(&doc_id)?; +//! // Index HTML content directly +//! let html = "

Title

Content

"; +//! let doc_id2 = client.index( +//! IndexContext::from_content(html, vectorless::parser::DocumentFormat::Html) +//! .with_name("webpage") +//! ).await?; //! //! // Query the document //! let result = client.query(&doc_id, "What is this?").await?; //! println!("{}", result.content); //! //! // List all documents -//! for doc in client.list_documents() { +//! for doc in client.list_documents().await? { //! println!("{}: {}", doc.id, doc.name); //! } //! # Ok(()) @@ -61,9 +67,9 @@ //! For multi-document operations, use sessions: //! //! ```rust,no_run -//! # use vectorless::client::{Engine, EngineBuilder}; +//! # use vectorless::client::{Engine, EngineBuilder, IndexContext}; //! # #[tokio::main] -//! # async fn main() -> vectorless::domain::Result<()> { +//! # async fn main() -> vectorless::Result<()> { //! let client = EngineBuilder::new() //! .with_workspace("./workspace") //! .build()?; @@ -71,8 +77,8 @@ //! let session = client.session(); //! //! // Index multiple documents -//! let doc1 = session.index("./doc1.md").await?; -//! let doc2 = session.index("./doc2.md").await?; +//! let doc1 = session.index(IndexContext::from_path("./doc1.md")).await?; +//! let doc2 = session.index(IndexContext::from_path("./doc2.md")).await?; //! //! // Query across all documents //! let results = session.query_all("What is the architecture?").await?; @@ -87,7 +93,7 @@ //! ```rust,no_run //! # use vectorless::client::{Engine, EngineBuilder, EventEmitter, events::IndexEvent}; //! # #[tokio::main] -//! # async fn main() -> vectorless::domain::Result<()> { +//! # async fn main() -> vectorless::Result<()> { //! let events = EventEmitter::new() //! .on_index(|e| match e { //! IndexEvent::Complete { doc_id } => println!("Indexed: {}", doc_id), @@ -113,6 +119,7 @@ mod builder; mod context; mod engine; pub mod events; +mod index_context; mod indexer; mod retriever; mod session; @@ -126,6 +133,12 @@ mod workspace; pub use builder::{BuildError, EngineBuilder}; pub use engine::Engine; +// ============================================================ +// Index Context +// ============================================================ + +pub use index_context::{IndexContext, IndexSource}; + // ============================================================ // Sub-Clients // ============================================================ diff --git a/src/client/session.rs b/src/client/session.rs index 0868caaf..b698bd35 100644 --- a/src/client/session.rs +++ b/src/client/session.rs @@ -9,11 +9,13 @@ //! # Example //! //! ```rust,ignore +//! use vectorless::client::IndexContext; +//! //! let session = client.session(); //! //! // Index multiple documents -//! let doc1 = session.index("./doc1.md").await?; -//! let doc2 = session.index("./doc2.md").await?; +//! let doc1 = session.index(IndexContext::from_path("./doc1.md")).await?; +//! let doc2 = session.index(IndexContext::from_path("./doc2.md")).await?; //! //! // Query across all documents //! let results = session.query_all("What is X?").await?; @@ -24,7 +26,6 @@ use std::cell::Cell; use std::collections::HashMap; -use std::path::Path; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -40,7 +41,7 @@ use super::context::ClientContext; use super::events::EventEmitter; use super::indexer::IndexerClient; use super::retriever::RetrieverClient; -use super::types::{DocumentInfo, IndexOptions, QueryResult}; +use super::types::{DocumentInfo, QueryResult}; use super::workspace::WorkspaceClient; /// Session for managing multiple documents. @@ -260,22 +261,33 @@ impl Session { /// Index a document into this session. /// /// The document is indexed, saved to workspace, and cached in this session. - pub async fn index(&self, path: impl AsRef) -> Result { - self.index_with_options(path, IndexOptions::default()).await - } - - /// Index a document with options. - pub async fn index_with_options( - &self, - path: impl AsRef, - options: IndexOptions, - ) -> Result { + /// + /// # Arguments + /// + /// * `ctx` - The index context containing source and options + /// + /// # Example + /// + /// ```rust,ignore + /// use vectorless::client::IndexContext; + /// use vectorless::parser::DocumentFormat; + /// + /// // From file + /// let id1 = session.index(IndexContext::from_path("./doc.md")).await?; + /// + /// // From content + /// let html = "Content"; + /// let id2 = session.index( + /// IndexContext::from_content(html, DocumentFormat::Html) + /// ).await?; + /// ``` + pub async fn index(&self, ctx: super::IndexContext) -> Result { // Index the document - let doc = self.indexer.index_with_options(path, options).await?; + let doc = self.indexer.index(ctx).await?; // Save to workspace let persisted = self.indexer.to_persisted(doc); - self.workspace.save(&persisted)?; + self.workspace.save(&persisted).await?; // Cache in session let doc_id = persisted.meta.id.clone(); @@ -391,10 +403,10 @@ impl Session { self.stats.increment_cache_misses(); // Load from workspace - let doc = self - .workspace - .load(doc_id)? - .ok_or_else(|| Error::DocumentNotFound(format!("Document not found: {}", doc_id)))?; + let doc = + self.workspace.load(doc_id).await?.ok_or_else(|| { + Error::DocumentNotFound(format!("Document not found: {}", doc_id)) + })?; let tree = doc.tree; diff --git a/src/client/types.rs b/src/client/types.rs index 4e3087b9..31438a62 100644 --- a/src/client/types.rs +++ b/src/client/types.rs @@ -126,29 +126,29 @@ pub struct PageContent { // Index Types // ============================================================ -/// Document indexing mode. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +/// Document indexing behavior mode. +/// +/// Controls how the indexer handles existing documents and re-indexing. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] pub enum IndexMode { - /// Automatically detect format from file extension. - Auto, - - /// Force PDF parsing. - Pdf, - - /// Force Markdown parsing. - Markdown, - - /// Force HTML parsing. - Html, - - /// Force DOCX parsing. - Docx, -} - -impl Default for IndexMode { - fn default() -> Self { - Self::Auto - } + /// Default mode - skip if already indexed. + /// + /// If a document with the same source has already been indexed, + /// the operation is skipped and the existing document ID is returned. + #[default] + Default, + + /// Force re-indexing. + /// + /// Always re-index the document, even if it has been indexed before. + /// A new document ID is generated. + Force, + + /// Incremental mode - only re-index changed files. + /// + /// Re-index only if the file has been modified since the last index. + /// For content/bytes sources, this behaves like [`IndexMode::Default`]. + Incremental, } /// Options for indexing a document. @@ -173,7 +173,7 @@ pub struct IndexOptions { impl Default for IndexOptions { fn default() -> Self { Self { - mode: IndexMode::Auto, + mode: IndexMode::Default, generate_summaries: false, include_text: true, generate_ids: true, @@ -201,6 +201,12 @@ impl IndexOptions { } /// Set the indexing mode. + /// + /// # Modes + /// + /// - [`IndexMode::Default`] - Skip if already indexed + /// - [`IndexMode::Force`] - Always re-index + /// - [`IndexMode::Incremental`] - Only re-index changed files pub fn with_mode(mut self, mode: IndexMode) -> Self { self.mode = mode; self @@ -338,10 +344,10 @@ mod tests { fn test_index_options() { let options = IndexOptions::new() .with_summaries() - .with_mode(IndexMode::Pdf); + .with_mode(IndexMode::Force); assert!(options.generate_summaries); - assert_eq!(options.mode, IndexMode::Pdf); + assert_eq!(options.mode, IndexMode::Force); } #[test] diff --git a/src/client/workspace.rs b/src/client/workspace.rs index 1d9575d2..0b880ba6 100644 --- a/src/client/workspace.rs +++ b/src/client/workspace.rs @@ -3,43 +3,49 @@ //! Workspace management client. //! -//! This module provides CRUD operations for document persistence +//! This module provides async CRUD operations for document persistence //! through the workspace abstraction. //! //! # Example //! //! ```rust,ignore -//! let workspace = WorkspaceClient::new(workspace_storage); +//! let workspace = WorkspaceClient::new(workspace_storage).await; //! //! // Save a document -//! workspace.save(&doc)?; +//! workspace.save(&doc).await?; //! //! // Load a document -//! let doc = workspace.load("doc-id")?; +//! let doc = workspace.load("doc-id").await?; //! //! // List all documents -//! for doc in workspace.list()? { +//! for doc in workspace.list().await? { //! println!("{}: {}", doc.id, doc.name); //! } //! ``` -use std::sync::{Arc, RwLock}; +use std::sync::Arc; use tracing::{debug, info, warn}; -use crate::Error; use crate::error::Result; -use crate::storage::{DocumentMetaEntry, PersistedDocument, Workspace}; +use crate::storage::{PersistedDocument, Workspace}; use super::events::{EventEmitter, WorkspaceEvent}; use super::types::DocumentInfo; /// Workspace management client. /// -/// Provides thread-safe CRUD operations for document persistence. +/// Provides async thread-safe CRUD operations for document persistence. +/// All operations are async and can be safely called from multiple tasks. +/// +/// # Thread Safety +/// +/// The client is fully thread-safe and can be cloned cheaply +/// (it uses `Arc` internally). +#[derive(Clone)] pub struct WorkspaceClient { /// Workspace storage. - workspace: Arc>, + workspace: Arc, /// Event emitter. events: EventEmitter, @@ -69,9 +75,9 @@ impl Default for WorkspaceClientConfig { impl WorkspaceClient { /// Create a new workspace client. - pub fn new(workspace: Workspace) -> Self { + pub async fn new(workspace: Workspace) -> Self { Self { - workspace: Arc::new(RwLock::new(workspace)), + workspace: Arc::new(workspace), events: EventEmitter::new(), config: WorkspaceClientConfig::default(), } @@ -90,7 +96,7 @@ impl WorkspaceClient { } /// Create from an existing workspace Arc. - pub(crate) fn from_arc(workspace: Arc>, events: EventEmitter) -> Self { + pub(crate) fn from_arc(workspace: Arc, events: EventEmitter) -> Self { Self { workspace, events, @@ -103,16 +109,10 @@ impl WorkspaceClient { /// # Errors /// /// Returns an error if the workspace write fails. - pub fn save(&self, doc: &PersistedDocument) -> Result<()> { + pub async fn save(&self, doc: &PersistedDocument) -> Result<()> { let doc_id = doc.meta.id.clone(); - { - let mut ws = self - .workspace - .write() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - ws.add(doc)?; - } + self.workspace.add(doc).await?; info!("Saved document: {}", doc_id); self.events.emit_workspace(WorkspaceEvent::Saved { doc_id }); @@ -127,17 +127,12 @@ impl WorkspaceClient { /// # Errors /// /// Returns an error if the workspace read fails. - pub fn load(&self, doc_id: &str) -> Result> { - let ws = self - .workspace - .read() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - - if !ws.contains(doc_id) { + pub async fn load(&self, doc_id: &str) -> Result> { + if !self.workspace.contains(doc_id).await { return Ok(None); } - let doc = ws.load(doc_id)?; + let doc = self.workspace.load_and_cache(doc_id).await?; let cache_hit = doc.is_some(); if let Some(ref doc) = doc { @@ -159,14 +154,8 @@ impl WorkspaceClient { /// # Errors /// /// Returns an error if the workspace write fails. - pub fn remove(&self, doc_id: &str) -> Result { - let removed = { - let mut ws = self - .workspace - .write() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - ws.remove(doc_id)? - }; + pub async fn remove(&self, doc_id: &str) -> Result { + let removed = self.workspace.remove(doc_id).await?; if removed { info!("Removed document: {}", doc_id); @@ -183,12 +172,8 @@ impl WorkspaceClient { /// # Errors /// /// Returns an error if the workspace read fails. - pub fn exists(&self, doc_id: &str) -> Result { - let ws = self - .workspace - .read() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - Ok(ws.contains(doc_id)) + pub async fn exists(&self, doc_id: &str) -> Result { + Ok(self.workspace.contains(doc_id).await) } /// List all documents in the workspace. @@ -196,38 +181,24 @@ impl WorkspaceClient { /// # Errors /// /// Returns an error if the workspace read fails. - pub fn list(&self) -> Result> { - let ws = self - .workspace - .read() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - - Ok(ws - .list_documents() - .iter() - .filter_map(|id| ws.get_meta(id)) - .map(|meta| DocumentInfo { - id: meta.id.clone(), - name: meta.doc_name.clone(), - format: meta.doc_type.clone(), - description: meta.doc_description.clone(), - page_count: meta.page_count, - line_count: meta.line_count, - }) - .collect()) - } + pub async fn list(&self) -> Result> { + let doc_ids = self.workspace.list_documents().await; + let mut result = Vec::with_capacity(doc_ids.len()); + + for id in &doc_ids { + if let Some(meta) = self.workspace.get_meta(id).await { + result.push(DocumentInfo { + id: meta.id, + name: meta.doc_name, + format: meta.doc_type, + description: meta.doc_description, + page_count: meta.page_count, + line_count: meta.line_count, + }); + } + } - /// Get document metadata without loading the full document. - /// - /// # Errors - /// - /// Returns an error if the workspace read fails. - pub fn get_meta(&self, doc_id: &str) -> Result> { - let ws = self - .workspace - .read() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - Ok(ws.get_meta(doc_id).cloned()) + Ok(result) } /// Get document info by ID. @@ -235,20 +206,19 @@ impl WorkspaceClient { /// # Errors /// /// Returns an error if the workspace read fails. - pub fn get_document_info(&self, doc_id: &str) -> Result> { - let ws = self + pub async fn get_document_info(&self, doc_id: &str) -> Result> { + Ok(self .workspace - .read() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - - Ok(ws.get_meta(doc_id).map(|meta| DocumentInfo { - id: meta.id.clone(), - name: meta.doc_name.clone(), - format: meta.doc_type.clone(), - description: meta.doc_description.clone(), - page_count: meta.page_count, - line_count: meta.line_count, - })) + .get_meta(doc_id) + .await + .map(|meta| DocumentInfo { + id: meta.id, + name: meta.doc_name, + format: meta.doc_type, + description: meta.doc_description, + page_count: meta.page_count, + line_count: meta.line_count, + })) } /// Remove multiple documents from the workspace. @@ -258,22 +228,15 @@ impl WorkspaceClient { /// # Errors /// /// Returns an error if the workspace write fails. - pub fn batch_remove(&self, doc_ids: &[&str]) -> Result { + pub async fn batch_remove(&self, doc_ids: &[&str]) -> Result { let mut removed = 0; - { - let mut ws = self - .workspace - .write() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - - for doc_id in doc_ids { - if ws.remove(doc_id)? { - removed += 1; - self.events.emit_workspace(WorkspaceEvent::Removed { - doc_id: doc_id.to_string(), - }); - } + for doc_id in doc_ids { + if self.workspace.remove(doc_id).await? { + removed += 1; + self.events.emit_workspace(WorkspaceEvent::Removed { + doc_id: doc_id.to_string(), + }); } } @@ -291,28 +254,12 @@ impl WorkspaceClient { /// # Errors /// /// Returns an error if the workspace write fails. - pub fn clear(&self) -> Result { - let doc_ids: Vec; - - { - let ws = self - .workspace - .read() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - doc_ids = ws.list_documents().iter().map(|s| s.to_string()).collect(); - } - + pub async fn clear(&self) -> Result { + let doc_ids = self.workspace.list_documents().await; let count = doc_ids.len(); - { - let mut ws = self - .workspace - .write() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - - for doc_id in &doc_ids { - let _ = ws.remove(doc_id); - } + for doc_id in &doc_ids { + let _ = self.workspace.remove(doc_id).await; } if count > 0 { @@ -325,47 +272,28 @@ impl WorkspaceClient { } /// Get workspace statistics. - /// - /// # Errors - /// - /// Returns an error if the workspace read fails. - pub fn stats(&self) -> Result { - let ws = self - .workspace - .read() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - + pub async fn stats(&self) -> Result { Ok(WorkspaceStats { - document_count: ws.len(), + document_count: self.workspace.len().await, }) } /// Get the number of documents in the workspace. - pub fn len(&self) -> usize { - self.workspace.read().map(|ws| ws.len()).unwrap_or(0) + pub async fn len(&self) -> usize { + self.workspace.len().await } /// Check if the workspace is empty. - pub fn is_empty(&self) -> bool { - self.len() == 0 + pub async fn is_empty(&self) -> bool { + self.workspace.is_empty().await } /// Get the underlying workspace Arc (for advanced use). - pub(crate) fn inner(&self) -> Arc> { + pub(crate) fn inner(&self) -> Arc { Arc::clone(&self.workspace) } } -impl Clone for WorkspaceClient { - fn clone(&self) -> Self { - Self { - workspace: Arc::clone(&self.workspace), - events: self.events.clone(), - config: self.config.clone(), - } - } -} - /// Workspace statistics. #[derive(Debug, Clone)] pub struct WorkspaceStats { @@ -376,32 +304,24 @@ pub struct WorkspaceStats { #[cfg(test)] mod tests { use super::*; - use crate::storage::WorkspaceOptions; - use tempfile::TempDir; - - #[test] - fn test_workspace_client_creation() { - let temp = TempDir::new().unwrap(); - let options = WorkspaceOptions { - file_lock: false, - ..Default::default() - }; - let workspace = Workspace::open_with_options(temp.path(), options).unwrap(); - let client = WorkspaceClient::new(workspace); - assert!(client.is_empty()); + use crate::storage::backend::MemoryBackend; + use std::sync::Arc as StdArc; + + #[tokio::test] + async fn test_workspace_client_creation() { + let backend = StdArc::new(MemoryBackend::new()); + let workspace = Workspace::with_backend(backend).await.unwrap(); + let client = WorkspaceClient::new(workspace).await; + assert!(client.is_empty().await); } - #[test] - fn test_workspace_stats() { - let temp = TempDir::new().unwrap(); - let options = WorkspaceOptions { - file_lock: false, - ..Default::default() - }; - let workspace = Workspace::open_with_options(temp.path(), options).unwrap(); - let client = WorkspaceClient::new(workspace); - - let stats = client.stats().unwrap(); + #[tokio::test] + async fn test_workspace_stats() { + let backend = StdArc::new(MemoryBackend::new()); + let workspace = Workspace::with_backend(backend).await.unwrap(); + let client = WorkspaceClient::new(workspace).await; + + let stats = client.stats().await.unwrap(); assert_eq!(stats.document_count, 0); } } diff --git a/src/index/pipeline/context.rs b/src/index/pipeline/context.rs index 777033fc..ab9a462d 100644 --- a/src/index/pipeline/context.rs +++ b/src/index/pipeline/context.rs @@ -19,7 +19,7 @@ pub enum IndexInput { /// Index from file path. File(PathBuf), - /// Index from raw content. + /// Index from raw content string. Content { /// Content string. content: String, @@ -28,6 +28,16 @@ pub enum IndexInput { /// Document format. format: DocumentFormat, }, + + /// Index from binary data. + Bytes { + /// Binary data. + data: Vec, + /// Document name. + name: String, + /// Document format. + format: DocumentFormat, + }, } impl IndexInput { @@ -36,8 +46,17 @@ impl IndexInput { Self::File(path.into()) } - /// Create input from content. - pub fn content( + /// Create input from content string. + pub fn content(content: impl Into) -> Self { + Self::Content { + content: content.into(), + name: String::new(), + format: DocumentFormat::Text, + } + } + + /// Create input from content with name and format. + pub fn content_with( content: impl Into, name: impl Into, format: DocumentFormat, @@ -48,6 +67,52 @@ impl IndexInput { format, } } + + /// Create input from binary data. + pub fn bytes(data: impl Into>) -> Self { + Self::Bytes { + data: data.into(), + name: String::new(), + format: DocumentFormat::Pdf, + } + } + + /// Create input from binary data with name and format. + pub fn bytes_with( + data: impl Into>, + name: impl Into, + format: DocumentFormat, + ) -> Self { + Self::Bytes { + data: data.into(), + name: name.into(), + format, + } + } + + /// Check if this is a file input. + pub fn is_file(&self) -> bool { + matches!(self, Self::File(_)) + } + + /// Check if this is a content input. + pub fn is_content(&self) -> bool { + matches!(self, Self::Content { .. }) + } + + /// Check if this is a bytes input. + pub fn is_bytes(&self) -> bool { + matches!(self, Self::Bytes { .. }) + } + + /// Get the format if available. + pub fn format(&self) -> Option { + match self { + Self::File(_) => None, + Self::Content { format, .. } => Some(*format), + Self::Bytes { format, .. } => Some(*format), + } + } } /// Result from a single stage execution. diff --git a/src/index/pipeline/executor.rs b/src/index/pipeline/executor.rs index ee560e91..83649271 100644 --- a/src/index/pipeline/executor.rs +++ b/src/index/pipeline/executor.rs @@ -135,7 +135,7 @@ impl PipelineExecutor { self } - /// Add persistence stage with workspace. + /// Add persistence stage with async workspace. pub fn with_persistence(mut self, workspace: crate::storage::Workspace) -> Self { self.orchestrator = self .orchestrator diff --git a/src/index/stages/parse.rs b/src/index/stages/parse.rs index 72657b9b..d5f0ba56 100644 --- a/src/index/stages/parse.rs +++ b/src/index/stages/parse.rs @@ -38,6 +38,7 @@ impl ParseStage { .ok_or_else(|| crate::Error::Parse(format!("Unknown format: {}", ext))) } IndexInput::Content { format, .. } => Ok(*format), + IndexInput::Bytes { format, .. } => Ok(*format), }, IndexMode::Markdown => Ok(DocumentFormat::Markdown), IndexMode::Pdf => Ok(DocumentFormat::Pdf), @@ -96,6 +97,13 @@ impl IndexStage for ParseStage { // Parse content directly self.parser_registry.parse(content, *format).await? } + IndexInput::Bytes { data, name, format } => { + // Set name + ctx.name = name.clone(); + + // Parse bytes + self.parser_registry.parse_bytes(data, *format).await? + } }; // Store results diff --git a/src/index/stages/persist.rs b/src/index/stages/persist.rs index e0d93f7d..26d3aad4 100644 --- a/src/index/stages/persist.rs +++ b/src/index/stages/persist.rs @@ -33,10 +33,10 @@ impl PersistStage { } /// Save document to workspace. - fn save_to_workspace(&mut self, ctx: &IndexContext) -> Result<()> { + async fn save_to_workspace(&self, ctx: &IndexContext) -> Result<()> { let workspace = self .workspace - .as_mut() + .as_ref() .ok_or_else(|| crate::Error::Config("No workspace configured".to_string()))?; let tree = ctx @@ -54,7 +54,7 @@ impl PersistStage { // Add pages if available (for PDFs) // Note: pages would need to be stored in context during parse stage - workspace.add(&doc)?; + workspace.add(&doc).await?; info!("Saved document {} to workspace", ctx.doc_id); Ok(()) @@ -82,7 +82,7 @@ impl IndexStage for PersistStage { // Only persist if workspace is configured if self.workspace.is_some() { - self.save_to_workspace(ctx)?; + self.save_to_workspace(ctx).await?; } else { info!("No workspace configured, skipping persistence"); } diff --git a/src/lib.rs b/src/lib.rs index 00818c67..3065f4d0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -69,7 +69,7 @@ //! use vectorless::{EngineBuilder, Engine}; //! //! #[tokio::main] -//! async fn main() -> vectorless::domain::Result<()> { +//! async fn main() -> vectorless::Result<()> { //! // Create client //! let mut client = EngineBuilder::new() //! .with_workspace("./workspace") @@ -121,7 +121,10 @@ pub mod util; // ============================================================================= // Client API (most common entry point) -pub use client::{DocumentInfo, Engine, EngineBuilder, IndexedDocument}; +pub use client::{ + BuildError, DocumentInfo, Engine, EngineBuilder, IndexContext, IndexMode, IndexOptions, + IndexSource, IndexedDocument, +}; // Error types pub use error::{Error, Result}; @@ -149,8 +152,9 @@ pub use parser::{ // Indexing pub use index::pipeline::{CustomStageBuilder, PipelineOrchestrator}; pub use index::{ - ChangeDetector, ChangeSet, IndexContext, IndexInput, IndexMetrics, IndexMode, IndexResult, - IndexStage, PartialUpdater, PipelineExecutor, PipelineOptions, SummaryStrategy, + ChangeDetector, ChangeSet, IndexContext as PipelineIndexContext, IndexInput, IndexMetrics, + IndexMode as PipelineIndexMode, IndexResult, IndexStage, PartialUpdater, PipelineExecutor, + PipelineOptions, SummaryStrategy, }; // Retrieval @@ -163,9 +167,7 @@ pub use retrieval::{ }; // Storage -pub use storage::{ - AsyncWorkspace, DocumentMeta as StorageDocumentMeta, PersistedDocument, Workspace, -}; +pub use storage::{DocumentMeta as StorageDocumentMeta, PersistedDocument, Workspace}; // Throttle pub use throttle::{ConcurrencyConfig, ConcurrencyController, RateLimiter}; diff --git a/src/parser/docx/mod.rs b/src/parser/docx/mod.rs index 385c03e4..b5bf602e 100644 --- a/src/parser/docx/mod.rs +++ b/src/parser/docx/mod.rs @@ -22,11 +22,11 @@ //! //! ```rust,no_run //! use vectorless::parser::docx::DocxParser; -//! use vectorless::domain::DocumentParser; +//! use vectorless::DocumentParser; //! use std::path::Path; //! //! # #[tokio::main] -//! # async fn main() -> vectorless::domain::Result<()> { +//! # async fn main() -> vectorless::Result<()> { //! let parser = DocxParser::new(); //! let result = parser.parse_file(Path::new("document.docx")).await?; //! diff --git a/src/parser/docx/parser.rs b/src/parser/docx/parser.rs index aa3885f6..c32a047f 100644 --- a/src/parser/docx/parser.rs +++ b/src/parser/docx/parser.rs @@ -10,11 +10,11 @@ //! //! ```rust,no_run //! use vectorless::parser::docx::DocxParser; -//! use vectorless::domain::DocumentParser; +//! use vectorless::DocumentParser; //! use std::path::Path; //! //! # #[tokio::main] -//! # async fn main() -> vectorless::domain::Result<()> { +//! # async fn main() -> vectorless::Result<()> { //! let parser = DocxParser::new(); //! let result = parser.parse_file(Path::new("document.docx")).await?; //! diff --git a/src/parser/markdown/parser.rs b/src/parser/markdown/parser.rs index 09fc8888..7d0041a0 100644 --- a/src/parser/markdown/parser.rs +++ b/src/parser/markdown/parser.rs @@ -32,7 +32,7 @@ use super::frontmatter; /// use vectorless::parser::DocumentParser; /// /// # #[tokio::main] -/// # async fn main() -> vectorless::domain::Result<()> { +/// # async fn main() -> vectorless::Result<()> { /// let parser = MarkdownParser::new(); /// let result = parser.parse("# Title\n\nContent").await?; /// diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 10d69738..4442f25c 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -20,7 +20,7 @@ //! use vectorless::parser::{DocumentParser, MarkdownParser, DocumentFormat}; //! //! # #[tokio::main] -//! # async fn main() -> vectorless::domain::Result<()> { +//! # async fn main() -> vectorless::Result<()> { //! // Create a parser //! let parser = MarkdownParser::new(); //! diff --git a/src/parser/pdf/mod.rs b/src/parser/pdf/mod.rs index 5df9bff6..880e8025 100644 --- a/src/parser/pdf/mod.rs +++ b/src/parser/pdf/mod.rs @@ -13,7 +13,7 @@ //! use vectorless::parser::pdf::{PdfParser, PdfPage}; //! use std::path::Path; //! -//! # fn main() -> vectorless::domain::Result<()> { +//! # fn main() -> vectorless::Result<()> { //! let parser = PdfParser::new(); //! let result = parser.parse_file(Path::new("document.pdf"))?; //! diff --git a/src/parser/registry.rs b/src/parser/registry.rs index 9667c176..e59863a3 100644 --- a/src/parser/registry.rs +++ b/src/parser/registry.rs @@ -130,6 +130,39 @@ impl ParserRegistry { .ok_or_else(|| Error::Parse(format!("Unsupported format: {:?}", format)))?; parser.parse_file(path).await } + + /// Parse binary data using the appropriate parser. + /// + /// For text-based formats, the bytes are converted to UTF-8 string first. + /// For binary formats (PDF, DOCX), the parser handles the bytes directly. + pub async fn parse_bytes(&self, bytes: &[u8], format: DocumentFormat) -> Result { + match format { + DocumentFormat::Markdown | DocumentFormat::Html | DocumentFormat::Text => { + // Text formats - convert to string first + let content = std::str::from_utf8(bytes) + .map_err(|e| Error::Parse(format!("Invalid UTF-8 content: {}", e)))?; + self.parse(content, format).await + } + DocumentFormat::Pdf | DocumentFormat::Docx => { + // Binary formats - write to temp file and parse + // This is a temporary solution until parsers support bytes directly + let temp_dir = std::env::temp_dir(); + let ext = format.extension(); + let temp_file = + temp_dir.join(format!("vectorless_temp_{}.{}", uuid::Uuid::new_v4(), ext)); + + std::fs::write(&temp_file, bytes) + .map_err(|e| Error::Parse(format!("Failed to write temp file: {}", e)))?; + + let result = self.parse_file_as(&temp_file, format).await; + + // Clean up temp file + let _ = std::fs::remove_file(&temp_file); + + result + } + } + } } impl Default for ParserRegistry { diff --git a/src/parser/toc/mod.rs b/src/parser/toc/mod.rs index 83341cae..99e4861d 100644 --- a/src/parser/toc/mod.rs +++ b/src/parser/toc/mod.rs @@ -47,7 +47,7 @@ //! use vectorless::parser::pdf::{PdfParser, PdfPage}; //! //! # #[tokio::main] -//! # async fn main() -> vectorless::domain::Result<()> { +//! # async fn main() -> vectorless::Result<()> { //! // Parse PDF //! let pdf_parser = PdfParser::new(); //! let result = pdf_parser.parse_file("document.pdf".as_ref())?; diff --git a/src/parser/toc/processor.rs b/src/parser/toc/processor.rs index 7b7cf945..1d26f9a6 100644 --- a/src/parser/toc/processor.rs +++ b/src/parser/toc/processor.rs @@ -72,7 +72,7 @@ impl Default for TocProcessorConfig { /// use vectorless::parser::pdf::PdfParser; /// /// # #[tokio::main] -/// # async fn main() -> vectorless::domain::Result<()> { +/// # async fn main() -> vectorless::Result<()> { /// // Parse PDF /// let pdf_parser = PdfParser::new(); /// let result = pdf_parser.parse_file("document.pdf".as_ref())?; diff --git a/src/parser/traits.rs b/src/parser/traits.rs index 296fcabe..e93cef70 100644 --- a/src/parser/traits.rs +++ b/src/parser/traits.rs @@ -20,7 +20,7 @@ use crate::error::Result; /// use vectorless::parser::{DocumentParser, MarkdownParser}; /// /// # #[tokio::main] -/// # async fn main() -> vectorless::domain::Result<()> { +/// # async fn main() -> vectorless::Result<()> { /// let parser = MarkdownParser::new(); /// let content = "# Title\n\nContent here."; /// let result = parser.parse(content).await?; diff --git a/src/retrieval/mod.rs b/src/retrieval/mod.rs index 4124e9a6..8e7d3a2a 100644 --- a/src/retrieval/mod.rs +++ b/src/retrieval/mod.rs @@ -70,6 +70,9 @@ pub use pipeline_retriever::PipelineRetriever; pub use retriever::{RetrievalContext, Retriever, RetrieverError, RetrieverResult}; pub use types::*; +// Re-export StrategyPreference as Strategy for convenience +pub use types::StrategyPreference as Strategy; + // Pipeline exports pub use pipeline::{ CandidateNode, ExecutionGroup, FailurePolicy, PipelineContext, RetrievalMetrics, diff --git a/src/retrieval/strategy/llm.rs b/src/retrieval/strategy/llm.rs index c1ca5037..befc3eb5 100644 --- a/src/retrieval/strategy/llm.rs +++ b/src/retrieval/strategy/llm.rs @@ -34,7 +34,7 @@ struct NavigationResponse { /// # Example /// /// ```rust,no_run -/// use vectorless::domain::retriever::strategy::LlmStrategy; +/// use vectorless::retriever::strategy::LlmStrategy; /// use vectorless::llm::LlmClient; /// /// let client = LlmClient::with_defaults(); diff --git a/src/storage/async_workspace.rs b/src/storage/async_workspace.rs deleted file mode 100644 index 0f72ddc3..00000000 --- a/src/storage/async_workspace.rs +++ /dev/null @@ -1,597 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Async workspace management for document collections. -//! -//! This module provides an async version of [`Workspace`](super::Workspace) -//! for integration with async runtimes like Tokio. -//! -//! # Example -//! -//! ```rust,ignore -//! use vectorless::storage::AsyncWorkspace; -//! -//! #[tokio::main] -//! async fn main() -> Result<()> { -//! let mut workspace = AsyncWorkspace::new("./workspace").await?; -//! -//! // Add a document -//! workspace.add(&doc).await?; -//! -//! // Load with caching -//! let loaded = workspace.load("doc-1").await?; -//! -//! Ok(()) -//! } -//! ``` - -use std::collections::HashMap; -use std::path::{Path, PathBuf}; -use std::sync::Arc; - -use serde::{Deserialize, Serialize}; -use tokio::sync::RwLock; -use tracing::{debug, info, warn}; - -use super::backend::{FileBackend, StorageBackend}; -use super::cache::DocumentCache; -use super::persistence::{PersistedDocument, load_document_from_bytes, save_document_to_bytes}; -use crate::Error; -use crate::error::Result; - -const META_KEY: &str = "_meta"; -const DEFAULT_CACHE_SIZE: usize = 100; - -/// Lightweight metadata entry for the async workspace index. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct AsyncDocumentMetaEntry { - /// Document ID. - pub id: String, - /// Document name/title. - pub doc_name: String, - /// Document description. - #[serde(default)] - pub doc_description: Option, - /// Document type (pdf, md, etc.). - pub doc_type: String, - /// Source file path. - #[serde(default)] - pub path: Option, - /// Page count (for PDFs). - #[serde(skip_serializing_if = "Option::is_none")] - pub page_count: Option, - /// Line count (for markdown). - #[serde(skip_serializing_if = "Option::is_none")] - pub line_count: Option, -} - -/// Options for async workspace creation. -#[derive(Debug, Clone)] -pub struct AsyncWorkspaceOptions { - /// LRU cache size (default: 100). - pub cache_size: usize, -} - -impl Default for AsyncWorkspaceOptions { - fn default() -> Self { - Self { - cache_size: DEFAULT_CACHE_SIZE, - } - } -} - -impl AsyncWorkspaceOptions { - /// Create new options with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Set the cache size. - pub fn with_cache_size(mut self, size: usize) -> Self { - self.cache_size = size; - self - } -} - -/// Inner state for the async workspace. -struct AsyncWorkspaceInner { - /// Storage backend. - backend: Arc, - /// Root path (for file-based backends). - root: Option, - /// Document metadata index. - meta_index: HashMap, - /// LRU cache for loaded documents. - cache: DocumentCache, -} - -/// An async workspace for managing indexed documents. -/// -/// Uses `tokio::sync::RwLock` for async-safe concurrent access. -/// All operations are async and can be safely called from multiple tasks. -/// -/// # Thread Safety -/// -/// The async workspace is fully thread-safe and can be cloned cheaply -/// (it uses `Arc` internally). -#[derive(Clone)] -pub struct AsyncWorkspace { - inner: Arc>, -} - -impl std::fmt::Debug for AsyncWorkspace { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("AsyncWorkspace").finish() - } -} - -impl AsyncWorkspace { - /// Create a new async workspace with a storage backend. - pub async fn with_backend(backend: Arc) -> Result { - Self::with_backend_and_options(backend, AsyncWorkspaceOptions::default()).await - } - - /// Create an async workspace with backend and options. - pub async fn with_backend_and_options( - backend: Arc, - options: AsyncWorkspaceOptions, - ) -> Result { - let mut inner = AsyncWorkspaceInner { - backend, - root: None, - meta_index: HashMap::new(), - cache: DocumentCache::with_capacity(options.cache_size), - }; - - Self::load_meta_index(&mut inner)?; - - Ok(Self { - inner: Arc::new(RwLock::new(inner)), - }) - } - - /// Create a new file-based async workspace at the given path. - pub async fn new(path: impl Into) -> Result { - Self::with_options(path, AsyncWorkspaceOptions::default()).await - } - - /// Create a new async workspace with custom cache size. - pub async fn with_cache_size(path: impl Into, cache_size: usize) -> Result { - Self::with_options( - path, - AsyncWorkspaceOptions { - cache_size, - ..Default::default() - }, - ) - .await - } - - /// Create a new async workspace with custom options. - pub async fn with_options( - path: impl Into, - options: AsyncWorkspaceOptions, - ) -> Result { - let root = path.into(); - let backend = Arc::new(FileBackend::new(&root)?); - - let mut inner = AsyncWorkspaceInner { - backend, - root: Some(root), - meta_index: HashMap::new(), - cache: DocumentCache::with_capacity(options.cache_size), - }; - - Self::load_meta_index(&mut inner)?; - - Ok(Self { - inner: Arc::new(RwLock::new(inner)), - }) - } - - /// Get the workspace root path (if file-based). - pub async fn path(&self) -> Option { - let inner = self.inner.read().await; - inner.root.clone() - } - - /// List all document IDs in the workspace. - pub async fn list_documents(&self) -> Vec { - let inner = self.inner.read().await; - inner.meta_index.keys().cloned().collect() - } - - /// Get metadata for a document. - pub async fn get_meta(&self, id: &str) -> Option { - let inner = self.inner.read().await; - inner.meta_index.get(id).cloned() - } - - /// Check if a document exists. - pub async fn contains(&self, id: &str) -> bool { - let inner = self.inner.read().await; - inner.meta_index.contains_key(id) - } - - /// Add a document to the workspace. - pub async fn add(&self, doc: &PersistedDocument) -> Result<()> { - let mut inner = self.inner.write().await; - - let doc_id = doc.meta.id.clone(); - let key = Self::doc_key(&doc_id); - - // Serialize and save via backend - let bytes = save_document_to_bytes(doc)?; - inner.backend.put(&key, &bytes)?; - - // Update meta index - let meta_entry = AsyncDocumentMetaEntry { - id: doc_id.clone(), - doc_name: doc.meta.name.clone(), - doc_description: doc.meta.description.clone(), - doc_type: doc.meta.format.clone(), - path: doc - .meta - .source_path - .as_ref() - .map(|p| p.to_string_lossy().to_string()), - page_count: if doc.pages.is_empty() { - None - } else { - Some(doc.pages.len()) - }, - line_count: doc.meta.line_count, - }; - - inner.meta_index.insert(doc_id.clone(), meta_entry); - Self::save_meta_index(&inner)?; - - // Remove from cache if present - let _ = inner.cache.remove(&doc_id); - - info!("Saved document {} to async workspace", doc_id); - Ok(()) - } - - /// Load a document from the workspace. - /// - /// Uses LRU cache: returns cached version if available, - /// otherwise loads from backend and caches it. - pub async fn load(&self, id: &str) -> Result> { - // First check if document exists (read lock) - { - let inner = self.inner.read().await; - if !inner.meta_index.contains_key(id) { - return Ok(None); - } - - // Check LRU cache - if let Some(cached) = inner.cache.get(id)? { - debug!("Cache hit for document {}", id); - return Ok(Some(cached)); - } - } - - // Load from backend (need read lock for backend access) - let inner = self.inner.read().await; - let key = Self::doc_key(id); - - match inner.backend.get(&key)? { - Some(bytes) => { - let doc = load_document_from_bytes(&bytes)?; - - // Note: We can't modify the cache with only a read lock - // For now, we return the document without caching - // A more sophisticated implementation would use a separate cache structure - - debug!("Loaded document {} from backend", id); - Ok(Some(doc)) - } - None => { - warn!("Document {} in meta index but not in backend", id); - Ok(None) - } - } - } - - /// Load a document and cache it (requires write lock for caching). - pub async fn load_and_cache(&self, id: &str) -> Result> { - // First check if document exists (read lock) - { - let inner = self.inner.read().await; - if !inner.meta_index.contains_key(id) { - return Ok(None); - } - - // Check LRU cache - if let Some(cached) = inner.cache.get(id)? { - debug!("Cache hit for document {}", id); - return Ok(Some(cached)); - } - } - - // Load from backend and cache (write lock) - let inner = self.inner.write().await; - let key = Self::doc_key(id); - - match inner.backend.get(&key)? { - Some(bytes) => { - let doc = load_document_from_bytes(&bytes)?; - - // Add to cache - inner.cache.put(id.to_string(), doc.clone())?; - - debug!("Loaded and cached document {}", id); - Ok(Some(doc)) - } - None => { - warn!("Document {} in meta index but not in backend", id); - Ok(None) - } - } - } - - /// Remove a document from the workspace. - pub async fn remove(&self, id: &str) -> Result { - let mut inner = self.inner.write().await; - - if !inner.meta_index.contains_key(id) { - return Ok(false); - } - - let key = Self::doc_key(id); - inner.backend.delete(&key)?; - - inner.meta_index.remove(id); - - // Remove from cache - let _ = inner.cache.remove(id); - - Self::save_meta_index(&inner)?; - - info!("Removed document {} from async workspace", id); - Ok(true) - } - - /// Get the number of documents in the workspace. - pub async fn len(&self) -> usize { - let inner = self.inner.read().await; - inner.meta_index.len() - } - - /// Check if the workspace is empty. - pub async fn is_empty(&self) -> bool { - let inner = self.inner.read().await; - inner.meta_index.is_empty() - } - - /// Get the number of items currently in the LRU cache. - pub async fn cache_len(&self) -> usize { - let inner = self.inner.read().await; - inner.cache.len() - } - - /// Get cache utilization (0.0 to 1.0). - pub async fn cache_utilization(&self) -> f64 { - let inner = self.inner.read().await; - inner.cache.utilization() - } - - /// Get cache statistics. - pub async fn cache_stats(&self) -> super::cache::CacheStats { - let inner = self.inner.read().await; - inner.cache.stats() - } - - /// Clear the LRU cache. - pub async fn clear_cache(&self) -> Result<()> { - let inner = self.inner.write().await; - inner.cache.clear()?; - debug!("Cleared async document cache"); - Ok(()) - } - - /// Get the storage key for a document. - fn doc_key(id: &str) -> String { - format!("doc:{}", id) - } - - /// Load the meta index from backend. - fn load_meta_index(inner: &mut AsyncWorkspaceInner) -> Result<()> { - match inner.backend.get(META_KEY)? { - Some(bytes) => { - let meta: HashMap = serde_json::from_slice(&bytes) - .map_err(|e| Error::Parse(format!("Failed to parse meta index: {}", e)))?; - inner.meta_index = meta; - info!( - "Loaded {} document(s) from async workspace index", - inner.meta_index.len() - ); - } - None => { - // Try to rebuild from existing keys - Self::rebuild_meta_index(inner)?; - } - } - Ok(()) - } - - /// Save the meta index to backend. - fn save_meta_index(inner: &AsyncWorkspaceInner) -> Result<()> { - let bytes = serde_json::to_vec_pretty(&inner.meta_index) - .map_err(|e| Error::Parse(format!("Failed to serialize meta index: {}", e)))?; - inner.backend.put(META_KEY, &bytes)?; - Ok(()) - } - - /// Rebuild the meta index from existing documents. - fn rebuild_meta_index(inner: &mut AsyncWorkspaceInner) -> Result<()> { - let keys = inner.backend.keys()?; - let doc_keys: Vec<_> = keys.iter().filter(|k| k.starts_with("doc:")).collect(); - - for key in doc_keys { - if let Some(bytes) = inner.backend.get(key)? { - if let Ok(doc) = load_document_from_bytes(&bytes) { - let doc_id = doc.meta.id.clone(); - let meta_entry = AsyncDocumentMetaEntry { - id: doc_id.clone(), - doc_name: doc.meta.name, - doc_description: doc.meta.description, - doc_type: doc.meta.format, - path: doc - .meta - .source_path - .as_ref() - .map(|p| p.to_string_lossy().to_string()), - page_count: if doc.pages.is_empty() { - None - } else { - Some(doc.pages.len()) - }, - line_count: doc.meta.line_count, - }; - inner.meta_index.insert(doc_id, meta_entry); - } - } - } - - if !inner.meta_index.is_empty() { - Self::save_meta_index(inner)?; - info!( - "Rebuilt async index from {} document(s)", - inner.meta_index.len() - ); - } - - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::document::DocumentTree; - - fn create_test_doc(id: &str) -> PersistedDocument { - let meta = super::super::persistence::DocumentMeta::new(id, "Test Doc", "md"); - let tree = DocumentTree::new("Root", "Content"); - PersistedDocument::new(meta, tree) - } - - #[tokio::test] - async fn test_async_workspace_create() { - let backend = Arc::new(super::super::backend::MemoryBackend::new()); - let workspace = AsyncWorkspace::with_backend(backend).await.unwrap(); - - assert!(workspace.is_empty().await); - assert_eq!(workspace.len().await, 0); - } - - #[tokio::test] - async fn test_async_workspace_add_and_load() { - let backend = Arc::new(super::super::backend::MemoryBackend::new()); - let workspace = AsyncWorkspace::with_backend(backend).await.unwrap(); - - let doc = create_test_doc("doc-1"); - workspace.add(&doc).await.unwrap(); - - assert_eq!(workspace.len().await, 1); - assert!(workspace.contains("doc-1").await); - - let loaded = workspace.load("doc-1").await.unwrap(); - assert!(loaded.is_some()); - assert_eq!(loaded.unwrap().meta.id, "doc-1"); - } - - #[tokio::test] - async fn test_async_workspace_remove() { - let backend = Arc::new(super::super::backend::MemoryBackend::new()); - let workspace = AsyncWorkspace::with_backend(backend).await.unwrap(); - - let doc = create_test_doc("doc-1"); - workspace.add(&doc).await.unwrap(); - - let removed = workspace.remove("doc-1").await.unwrap(); - assert!(removed); - assert!(workspace.is_empty().await); - - let removed_again = workspace.remove("doc-1").await.unwrap(); - assert!(!removed_again); - } - - #[tokio::test] - async fn test_async_workspace_cache() { - let backend = Arc::new(super::super::backend::MemoryBackend::new()); - let workspace = AsyncWorkspace::with_backend(backend).await.unwrap(); - - let doc = create_test_doc("doc-1"); - workspace.add(&doc).await.unwrap(); - - // First load with caching - let _ = workspace.load_and_cache("doc-1").await.unwrap(); - let stats = workspace.cache_stats().await; - assert_eq!(stats.misses, 1); - - // Second load should hit cache - let _ = workspace.load_and_cache("doc-1").await.unwrap(); - let stats = workspace.cache_stats().await; - assert_eq!(stats.hits, 1); - } - - #[tokio::test] - async fn test_async_workspace_list_documents() { - let backend = Arc::new(super::super::backend::MemoryBackend::new()); - let workspace = AsyncWorkspace::with_backend(backend).await.unwrap(); - - workspace.add(&create_test_doc("doc-1")).await.unwrap(); - workspace.add(&create_test_doc("doc-2")).await.unwrap(); - workspace.add(&create_test_doc("doc-3")).await.unwrap(); - - let docs = workspace.list_documents().await; - assert_eq!(docs.len(), 3); - } - - #[tokio::test] - async fn test_async_workspace_get_meta() { - let backend = Arc::new(super::super::backend::MemoryBackend::new()); - let workspace = AsyncWorkspace::with_backend(backend).await.unwrap(); - - let doc = create_test_doc("doc-1"); - workspace.add(&doc).await.unwrap(); - - let meta = workspace.get_meta("doc-1").await; - assert!(meta.is_some()); - let meta = meta.unwrap(); - assert_eq!(meta.id, "doc-1"); - assert_eq!(meta.doc_name, "Test Doc"); - assert_eq!(meta.doc_type, "md"); - } - - #[tokio::test] - async fn test_async_workspace_concurrent_access() { - let backend = Arc::new(super::super::backend::MemoryBackend::new()); - let workspace = Arc::new(AsyncWorkspace::with_backend(backend).await.unwrap()); - - // Spawn multiple concurrent tasks - let mut handles = vec![]; - - for i in 0..10 { - let ws = workspace.clone(); - let handle = tokio::spawn(async move { - let id = format!("doc-{}", i); - let doc = create_test_doc(&id); - ws.add(&doc).await.unwrap(); - let loaded = ws.load(&id).await.unwrap(); - assert!(loaded.is_some()); - }); - handles.push(handle); - } - - // Wait for all tasks - for handle in handles { - handle.await.unwrap(); - } - - assert_eq!(workspace.len().await, 10); - } -} diff --git a/src/storage/mod.rs b/src/storage/mod.rs index b2aee4a9..bf50d96e 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -4,7 +4,7 @@ //! Storage module for persisting document indices. //! //! This module provides: -//! - **Workspace** — A directory-based document collection manager with LRU cache +//! - **Workspace** — An async directory-based document collection manager with LRU cache //! - **Persistence** — Save/load document trees and metadata with atomic writes //! - **Cache** — LRU cache for loaded documents //! - **Lock** — File locking for multi-process safety @@ -16,30 +16,32 @@ //! use vectorless::storage::{Workspace, PersistedDocument, DocumentMeta}; //! use vectorless::document::DocumentTree; //! +//! # #[tokio::main] +//! # async fn main() -> vectorless::error::Result<()> { //! // Create a workspace -//! let mut workspace = Workspace::new("./my_workspace")?; +//! let workspace = Workspace::new("./my_workspace").await?; //! //! // Add a document //! let meta = DocumentMeta::new("doc-1", "My Document", "md"); //! let tree = DocumentTree::new("Root", "Content"); //! let doc = PersistedDocument::new(meta, tree); -//! workspace.add(&doc)?; +//! workspace.add(&doc).await?; //! //! // Load it back (uses LRU cache) -//! let loaded = workspace.load("doc-1")?.unwrap(); +//! let loaded = workspace.load_and_cache("doc-1").await?.unwrap(); +//! # Ok(()) +//! # } //! ``` -pub mod async_workspace; pub mod backend; pub mod cache; pub mod codec; pub mod lock; pub mod migration; mod persistence; -mod workspace; +pub mod workspace; // Re-export main types -pub use async_workspace::{AsyncDocumentMetaEntry, AsyncWorkspace, AsyncWorkspaceOptions}; pub use backend::{FileBackend, MemoryBackend, StorageBackend}; pub use cache::DocumentCache; pub use codec::{Codec, GzipCodec, IdentityCodec, codec_from_config}; diff --git a/src/storage/workspace.rs b/src/storage/workspace.rs index 610592e1..a0ee5cb9 100644 --- a/src/storage/workspace.rs +++ b/src/storage/workspace.rs @@ -1,30 +1,35 @@ // Copyright (c) 2026 vectorless developers // SPDX-License-Identifier: Apache-2.0 -//! Workspace management for document collections. +//! Async workspace management for document collections. //! -//! A workspace manages indexed documents using a storage backend abstraction. -//! Uses lazy-loading pattern with LRU cache: -//! - Metadata index always in memory -//! - Full documents loaded on demand with LRU eviction +//! This module provides the primary workspace implementation for document +//! persistence, using async I/O for integration with runtimes like Tokio. //! -//! # Backends +//! # Features //! -//! The workspace supports different storage backends: -//! - **FileBackend**: File system storage (default) -//! - **MemoryBackend**: In-memory storage (for testing) +//! - **Async I/O** - All operations are async for non-blocking performance +//! - **LRU Cache** - Automatic caching with configurable size +//! - **Thread-Safe** - Fully thread-safe with `Arc` +//! - **Pluggable Backend** - Use file storage, in-memory, or custom backends //! //! # Example //! //! ```rust,ignore -//! use vectorless::storage::{Workspace, FileBackend}; +//! use vectorless::storage::Workspace; //! -//! // Default file-based workspace -//! let mut workspace = Workspace::new("./my_workspace")?; +//! #[tokio::main] +//! async fn main() -> Result<()> { +//! let workspace = Workspace::new("./workspace").await?; //! -//! // Or with custom backend -//! let backend = std::sync::Arc::new(FileBackend::new("./my_workspace")?); -//! let mut workspace = Workspace::with_backend(backend)?; +//! // Add a document +//! workspace.add(&doc).await?; +//! +//! // Load with caching +//! let loaded = workspace.load_and_cache("doc-1").await?; +//! +//! Ok(()) +//! } //! ``` use std::collections::HashMap; @@ -32,20 +37,19 @@ use std::path::{Path, PathBuf}; use std::sync::Arc; use serde::{Deserialize, Serialize}; +use tokio::sync::RwLock; use tracing::{debug, info, warn}; use super::backend::{FileBackend, StorageBackend}; use super::cache::DocumentCache; -use super::lock::FileLock; use super::persistence::{PersistedDocument, load_document_from_bytes, save_document_to_bytes}; use crate::Error; use crate::error::Result; const META_KEY: &str = "_meta"; -const LOCK_FILE: &str = ".workspace.lock"; const DEFAULT_CACHE_SIZE: usize = 100; -/// Lightweight metadata entry for the index. +/// Lightweight metadata entry for the async workspace index. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct DocumentMetaEntry { /// Document ID. @@ -68,35 +72,9 @@ pub struct DocumentMetaEntry { pub line_count: Option, } -/// A workspace for managing indexed documents. -/// -/// Uses LRU cache for loaded documents to balance memory usage -/// and access performance. -/// -/// # Thread Safety -/// -/// The workspace is thread-safe when used with a thread-safe backend. -/// Read operations only require `&self`. -#[derive(Debug)] -pub struct Workspace { - /// Storage backend. - backend: Arc, - /// Root path (for file-based backends, used for locking). - root: Option, - /// Document metadata index (id -> meta). - /// This is always loaded in memory. - meta_index: HashMap, - /// LRU cache for loaded documents. - cache: DocumentCache, - /// File lock for multi-process safety (file backends only). - _lock: Option, -} - -/// Options for workspace creation. +/// Options for async workspace creation. #[derive(Debug, Clone)] pub struct WorkspaceOptions { - /// Enable file locking (default: true, only for file backends). - pub file_lock: bool, /// LRU cache size (default: 100). pub cache_size: usize, } @@ -104,7 +82,6 @@ pub struct WorkspaceOptions { impl Default for WorkspaceOptions { fn default() -> Self { Self { - file_lock: true, cache_size: DEFAULT_CACHE_SIZE, } } @@ -121,53 +98,72 @@ impl WorkspaceOptions { self.cache_size = size; self } +} - /// Enable or disable file locking. - pub fn with_file_lock(mut self, enabled: bool) -> Self { - self.file_lock = enabled; - self +/// Inner state for the async workspace. +struct WorkspaceInner { + /// Storage backend. + backend: Arc, + /// Root path (for file-based backends). + root: Option, + /// Document metadata index. + meta_index: HashMap, + /// LRU cache for loaded documents. + cache: DocumentCache, +} + +/// An async workspace for managing indexed documents. +/// +/// Uses `tokio::sync::RwLock` for async-safe concurrent access. +/// All operations are async and can be safely called from multiple tasks. +/// +/// # Thread Safety +/// +/// The async workspace is fully thread-safe and can be cloned cheaply +/// (it uses `Arc` internally). +#[derive(Clone)] +pub struct Workspace { + inner: Arc>, +} + +impl std::fmt::Debug for Workspace { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Workspace").finish() } } impl Workspace { - /// Create a new workspace with a storage backend. - /// - /// # Example - /// - /// ```rust,ignore - /// let backend = Arc::new(FileBackend::new("./workspace")?); - /// let workspace = Workspace::with_backend(backend)?; - /// ``` - pub fn with_backend(backend: Arc) -> Result { - Self::with_backend_and_options(backend, WorkspaceOptions::default()) + /// Create a new async workspace with a storage backend. + pub async fn with_backend(backend: Arc) -> Result { + Self::with_backend_and_options(backend, WorkspaceOptions::default()).await } - /// Create a workspace with backend and options. - pub fn with_backend_and_options( + /// Create an async workspace with backend and options. + pub async fn with_backend_and_options( backend: Arc, options: WorkspaceOptions, ) -> Result { - let mut workspace = Self { + let mut inner = WorkspaceInner { backend, root: None, meta_index: HashMap::new(), cache: DocumentCache::with_capacity(options.cache_size), - _lock: None, }; - workspace.load_meta_index()?; - Ok(workspace) + Self::load_meta_index(&mut inner)?; + + Ok(Self { + inner: Arc::new(RwLock::new(inner)), + }) } - /// Create a new file-based workspace at the given path. - /// - /// This is a convenience method that creates a `FileBackend` internally. - pub fn new(path: impl Into) -> Result { - Self::with_options(path, WorkspaceOptions::default()) + /// Create a new file-based async workspace at the given path. + pub async fn new(path: impl Into) -> Result { + Self::with_options(path, WorkspaceOptions::default()).await } - /// Create a new workspace with custom LRU cache size. - pub fn with_cache_size(path: impl Into, cache_size: usize) -> Result { + /// Create a new async workspace with custom cache size. + pub async fn with_cache_size(path: impl Into, cache_size: usize) -> Result { Self::with_options( path, WorkspaceOptions { @@ -175,115 +171,62 @@ impl Workspace { ..Default::default() }, ) + .await } - /// Create a new workspace with custom options. - pub fn with_options(path: impl Into, options: WorkspaceOptions) -> Result { + /// Create a new async workspace with custom options. + pub async fn with_options(path: impl Into, options: WorkspaceOptions) -> Result { let root = path.into(); - - // Acquire file lock if enabled - let lock = if options.file_lock { - let lock_path = root.join(LOCK_FILE); - Some(FileLock::try_lock(&lock_path, true)?) - } else { - None - }; - let backend = Arc::new(FileBackend::new(&root)?); - let mut workspace = Self { + let mut inner = WorkspaceInner { backend, root: Some(root), meta_index: HashMap::new(), cache: DocumentCache::with_capacity(options.cache_size), - _lock: lock, }; - workspace.load_meta_index()?; - Ok(workspace) - } - - /// Open an existing workspace, or create if it doesn't exist. - pub fn open(path: impl Into + Clone) -> Result { - Self::open_with_options(path, WorkspaceOptions::default()) - } + Self::load_meta_index(&mut inner)?; - /// Open with custom cache size. - pub fn open_with_cache_size( - path: impl Into + Clone, - cache_size: usize, - ) -> Result { - Self::open_with_options( - path, - WorkspaceOptions { - cache_size, - ..Default::default() - }, - ) - } - - /// Open with custom options. - pub fn open_with_options( - path: impl Into + Clone, - options: WorkspaceOptions, - ) -> Result { - let root = path.clone().into(); - - // Acquire file lock if enabled - let lock = if options.file_lock && root.exists() { - let lock_path = root.join(LOCK_FILE); - Some(FileLock::try_lock(&lock_path, true)?) - } else { - None - }; - - let backend = Arc::new(FileBackend::new(&root)?); - - let mut workspace = Self { - backend, - root: Some(root), - meta_index: HashMap::new(), - cache: DocumentCache::with_capacity(options.cache_size), - _lock: lock, - }; - - workspace.load_meta_index()?; - Ok(workspace) + Ok(Self { + inner: Arc::new(RwLock::new(inner)), + }) } /// Get the workspace root path (if file-based). - pub fn path(&self) -> Option<&Path> { - self.root.as_deref() - } - - /// Get the storage backend. - pub fn backend(&self) -> &dyn StorageBackend { - self.backend.as_ref() + pub async fn path(&self) -> Option { + let inner = self.inner.read().await; + inner.root.clone() } /// List all document IDs in the workspace. - pub fn list_documents(&self) -> Vec<&str> { - self.meta_index.keys().map(|s| s.as_str()).collect() + pub async fn list_documents(&self) -> Vec { + let inner = self.inner.read().await; + inner.meta_index.keys().cloned().collect() } /// Get metadata for a document. - pub fn get_meta(&self, id: &str) -> Option<&DocumentMetaEntry> { - self.meta_index.get(id) + pub async fn get_meta(&self, id: &str) -> Option { + let inner = self.inner.read().await; + inner.meta_index.get(id).cloned() } /// Check if a document exists. - pub fn contains(&self, id: &str) -> bool { - self.meta_index.contains_key(id) + pub async fn contains(&self, id: &str) -> bool { + let inner = self.inner.read().await; + inner.meta_index.contains_key(id) } /// Add a document to the workspace. - pub fn add(&mut self, doc: &PersistedDocument) -> Result<()> { + pub async fn add(&self, doc: &PersistedDocument) -> Result<()> { + let mut inner = self.inner.write().await; + let doc_id = doc.meta.id.clone(); - let key = self.doc_key(&doc_id); + let key = Self::doc_key(&doc_id); // Serialize and save via backend let bytes = save_document_to_bytes(doc)?; - self.backend.put(&key, &bytes)?; + inner.backend.put(&key, &bytes)?; // Update meta index let meta_entry = DocumentMetaEntry { @@ -304,13 +247,13 @@ impl Workspace { line_count: doc.meta.line_count, }; - self.meta_index.insert(doc_id.clone(), meta_entry); - self.save_meta_index()?; + inner.meta_index.insert(doc_id.clone(), meta_entry); + Self::save_meta_index(&inner)?; // Remove from cache if present - let _ = self.cache.remove(&doc_id); + let _ = inner.cache.remove(&doc_id); - info!("Saved document {} to workspace", doc_id); + info!("Saved document {} to async workspace", doc_id); Ok(()) } @@ -318,27 +261,71 @@ impl Workspace { /// /// Uses LRU cache: returns cached version if available, /// otherwise loads from backend and caches it. - pub fn load(&self, id: &str) -> Result> { - if !self.contains(id) { - return Ok(None); + pub async fn load(&self, id: &str) -> Result> { + // First check if document exists (read lock) + { + let inner = self.inner.read().await; + if !inner.meta_index.contains_key(id) { + return Ok(None); + } + + // Check LRU cache + if let Some(cached) = inner.cache.get(id)? { + debug!("Cache hit for document {}", id); + return Ok(Some(cached)); + } } - // Check LRU cache first - if let Some(cached) = self.cache.get(id)? { - debug!("Cache hit for document {}", id); - return Ok(Some(cached)); + // Load from backend (need read lock for backend access) + let inner = self.inner.read().await; + let key = Self::doc_key(id); + + match inner.backend.get(&key)? { + Some(bytes) => { + let doc = load_document_from_bytes(&bytes)?; + + // Note: We can't modify the cache with only a read lock + // For now, we return the document without caching + // A more sophisticated implementation would use a separate cache structure + + debug!("Loaded document {} from backend", id); + Ok(Some(doc)) + } + None => { + warn!("Document {} in meta index but not in backend", id); + Ok(None) + } } + } - // Load from backend - let key = self.doc_key(id); - match self.backend.get(&key)? { + /// Load a document and cache it (requires write lock for caching). + pub async fn load_and_cache(&self, id: &str) -> Result> { + // First check if document exists (read lock) + { + let inner = self.inner.read().await; + if !inner.meta_index.contains_key(id) { + return Ok(None); + } + + // Check LRU cache + if let Some(cached) = inner.cache.get(id)? { + debug!("Cache hit for document {}", id); + return Ok(Some(cached)); + } + } + + // Load from backend and cache (write lock) + let inner = self.inner.write().await; + let key = Self::doc_key(id); + + match inner.backend.get(&key)? { Some(bytes) => { let doc = load_document_from_bytes(&bytes)?; - // Add to LRU cache - self.cache.put(id.to_string(), doc.clone())?; + // Add to cache + inner.cache.put(id.to_string(), doc.clone())?; - debug!("Loaded document {} from backend (cached)", id); + debug!("Loaded and cached document {}", id); Ok(Some(doc)) } None => { @@ -349,97 +336,105 @@ impl Workspace { } /// Remove a document from the workspace. - pub fn remove(&mut self, id: &str) -> Result { - if !self.contains(id) { + pub async fn remove(&self, id: &str) -> Result { + let mut inner = self.inner.write().await; + + if !inner.meta_index.contains_key(id) { return Ok(false); } - let key = self.doc_key(id); - self.backend.delete(&key)?; + let key = Self::doc_key(id); + inner.backend.delete(&key)?; - self.meta_index.remove(id); + inner.meta_index.remove(id); // Remove from cache - let _ = self.cache.remove(id); + let _ = inner.cache.remove(id); - self.save_meta_index()?; + Self::save_meta_index(&inner)?; - info!("Removed document {} from workspace", id); + info!("Removed document {} from async workspace", id); Ok(true) } /// Get the number of documents in the workspace. - pub fn len(&self) -> usize { - self.meta_index.len() + pub async fn len(&self) -> usize { + let inner = self.inner.read().await; + inner.meta_index.len() } /// Check if the workspace is empty. - pub fn is_empty(&self) -> bool { - self.meta_index.is_empty() + pub async fn is_empty(&self) -> bool { + let inner = self.inner.read().await; + inner.meta_index.is_empty() } /// Get the number of items currently in the LRU cache. - pub fn cache_len(&self) -> usize { - self.cache.len() + pub async fn cache_len(&self) -> usize { + let inner = self.inner.read().await; + inner.cache.len() } /// Get cache utilization (0.0 to 1.0). - pub fn cache_utilization(&self) -> f64 { - self.cache.utilization() + pub async fn cache_utilization(&self) -> f64 { + let inner = self.inner.read().await; + inner.cache.utilization() } /// Get cache statistics. - pub fn cache_stats(&self) -> super::cache::CacheStats { - self.cache.stats() + pub async fn cache_stats(&self) -> super::cache::CacheStats { + let inner = self.inner.read().await; + inner.cache.stats() } - /// Clear the LRU cache (does not remove documents from workspace). - pub fn clear_cache(&self) -> Result<()> { - self.cache.clear()?; - debug!("Cleared document cache"); + /// Clear the LRU cache. + pub async fn clear_cache(&self) -> Result<()> { + let inner = self.inner.write().await; + inner.cache.clear()?; + debug!("Cleared async document cache"); Ok(()) } /// Get the storage key for a document. - fn doc_key(&self, id: &str) -> String { + fn doc_key(id: &str) -> String { format!("doc:{}", id) } /// Load the meta index from backend. - fn load_meta_index(&mut self) -> Result<()> { - match self.backend.get(META_KEY)? { + fn load_meta_index(inner: &mut WorkspaceInner) -> Result<()> { + match inner.backend.get(META_KEY)? { Some(bytes) => { let meta: HashMap = serde_json::from_slice(&bytes) .map_err(|e| Error::Parse(format!("Failed to parse meta index: {}", e)))?; - self.meta_index = meta; + inner.meta_index = meta; info!( - "Loaded {} document(s) from workspace index", - self.meta_index.len() + "Loaded {} document(s) from async workspace index", + inner.meta_index.len() ); } None => { // Try to rebuild from existing keys - self.rebuild_meta_index()?; + Self::rebuild_meta_index(inner)?; } } Ok(()) } /// Save the meta index to backend. - fn save_meta_index(&self) -> Result<()> { - let bytes = serde_json::to_vec_pretty(&self.meta_index) + fn save_meta_index(inner: &WorkspaceInner) -> Result<()> { + let bytes = serde_json::to_vec_pretty(&inner.meta_index) .map_err(|e| Error::Parse(format!("Failed to serialize meta index: {}", e)))?; - self.backend.put(META_KEY, &bytes)?; + inner.backend.put(META_KEY, &bytes)?; Ok(()) } /// Rebuild the meta index from existing documents. - fn rebuild_meta_index(&mut self) -> Result<()> { - let keys = self.backend.keys()?; + fn rebuild_meta_index(inner: &mut WorkspaceInner) -> Result<()> { + let keys = inner.backend.keys()?; let doc_keys: Vec<_> = keys.iter().filter(|k| k.starts_with("doc:")).collect(); for key in doc_keys { - if let Some(bytes) = self.backend.get(key)? { + if let Some(bytes) = inner.backend.get(key)? { if let Ok(doc) = load_document_from_bytes(&bytes) { let doc_id = doc.meta.id.clone(); let meta_entry = DocumentMetaEntry { @@ -459,14 +454,17 @@ impl Workspace { }, line_count: doc.meta.line_count, }; - self.meta_index.insert(doc_id, meta_entry); + inner.meta_index.insert(doc_id, meta_entry); } } } - if !self.meta_index.is_empty() { - self.save_meta_index()?; - info!("Rebuilt index from {} document(s)", self.meta_index.len()); + if !inner.meta_index.is_empty() { + Self::save_meta_index(inner)?; + info!( + "Rebuilt async index from {} document(s)", + inner.meta_index.len() + ); } Ok(()) @@ -476,86 +474,128 @@ impl Workspace { #[cfg(test)] mod tests { use super::*; - use tempfile::TempDir; + use crate::document::DocumentTree; - #[test] - fn test_workspace_create() { - let temp = TempDir::new().unwrap(); - let workspace = Workspace::new(temp.path()).unwrap(); - - assert!(workspace.is_empty()); - assert_eq!(workspace.len(), 0); + fn create_test_doc(id: &str) -> PersistedDocument { + let meta = super::super::persistence::DocumentMeta::new(id, "Test Doc", "md"); + let tree = DocumentTree::new("Root", "Content"); + PersistedDocument::new(meta, tree) } - #[test] - fn test_workspace_with_memory_backend() { + #[tokio::test] + async fn test_async_workspace_create() { let backend = Arc::new(super::super::backend::MemoryBackend::new()); - let mut workspace = Workspace::with_backend(backend).unwrap(); + let workspace = Workspace::with_backend(backend).await.unwrap(); + + assert!(workspace.is_empty().await); + assert_eq!(workspace.len().await, 0); + } - assert!(workspace.is_empty()); + #[tokio::test] + async fn test_async_workspace_add_and_load() { + let backend = Arc::new(super::super::backend::MemoryBackend::new()); + let workspace = Workspace::with_backend(backend).await.unwrap(); - // Add a document - let meta = super::super::persistence::DocumentMeta::new("doc-1", "Test", "md"); - let tree = crate::document::DocumentTree::new("Root", "Content"); - let doc = PersistedDocument::new(meta, tree); + let doc = create_test_doc("doc-1"); + workspace.add(&doc).await.unwrap(); - workspace.add(&doc).unwrap(); - assert_eq!(workspace.len(), 1); + assert_eq!(workspace.len().await, 1); + assert!(workspace.contains("doc-1").await); - // Load it back - let loaded = workspace.load("doc-1").unwrap(); + let loaded = workspace.load("doc-1").await.unwrap(); assert!(loaded.is_some()); assert_eq!(loaded.unwrap().meta.id, "doc-1"); } - #[test] - fn test_workspace_open() { - let temp = TempDir::new().unwrap(); - let path = temp.path().join("workspace"); + #[tokio::test] + async fn test_async_workspace_remove() { + let backend = Arc::new(super::super::backend::MemoryBackend::new()); + let workspace = Workspace::with_backend(backend).await.unwrap(); - let options = WorkspaceOptions { - file_lock: false, - ..Default::default() - }; + let doc = create_test_doc("doc-1"); + workspace.add(&doc).await.unwrap(); - let workspace = Workspace::open_with_options(&path, options.clone()).unwrap(); - assert!(workspace.is_empty()); + let removed = workspace.remove("doc-1").await.unwrap(); + assert!(removed); + assert!(workspace.is_empty().await); - drop(workspace); - let workspace2 = Workspace::open_with_options(&path, options).unwrap(); - assert!(workspace2.is_empty()); + let removed_again = workspace.remove("doc-1").await.unwrap(); + assert!(!removed_again); } - #[test] - fn test_workspace_cache_operations() { - let temp = TempDir::new().unwrap(); - let workspace = Workspace::with_cache_size(temp.path(), 5).unwrap(); + #[tokio::test] + async fn test_async_workspace_cache() { + let backend = Arc::new(super::super::backend::MemoryBackend::new()); + let workspace = Workspace::with_backend(backend).await.unwrap(); - assert_eq!(workspace.cache_len(), 0); - assert_eq!(workspace.cache.utilization(), 0.0); + let doc = create_test_doc("doc-1"); + workspace.add(&doc).await.unwrap(); - workspace.clear_cache().unwrap(); - assert_eq!(workspace.cache_len(), 0); + // First load with caching + let _ = workspace.load_and_cache("doc-1").await.unwrap(); + let stats = workspace.cache_stats().await; + assert_eq!(stats.misses, 1); + + // Second load should hit cache + let _ = workspace.load_and_cache("doc-1").await.unwrap(); + let stats = workspace.cache_stats().await; + assert_eq!(stats.hits, 1); } - #[test] - fn test_workspace_cache_stats() { + #[tokio::test] + async fn test_async_workspace_list_documents() { let backend = Arc::new(super::super::backend::MemoryBackend::new()); - let mut workspace = Workspace::with_backend(backend).unwrap(); + let workspace = Workspace::with_backend(backend).await.unwrap(); - let meta = super::super::persistence::DocumentMeta::new("doc-1", "Test", "md"); - let tree = crate::document::DocumentTree::new("Root", "Content"); - let doc = PersistedDocument::new(meta, tree); - workspace.add(&doc).unwrap(); + workspace.add(&create_test_doc("doc-1")).await.unwrap(); + workspace.add(&create_test_doc("doc-2")).await.unwrap(); + workspace.add(&create_test_doc("doc-3")).await.unwrap(); - // First load - cache miss - let _ = workspace.load("doc-1").unwrap(); - let stats = workspace.cache_stats(); - assert_eq!(stats.misses, 1); + let docs = workspace.list_documents().await; + assert_eq!(docs.len(), 3); + } - // Second load - cache hit - let _ = workspace.load("doc-1").unwrap(); - let stats = workspace.cache_stats(); - assert_eq!(stats.hits, 1); + #[tokio::test] + async fn test_async_workspace_get_meta() { + let backend = Arc::new(super::super::backend::MemoryBackend::new()); + let workspace = Workspace::with_backend(backend).await.unwrap(); + + let doc = create_test_doc("doc-1"); + workspace.add(&doc).await.unwrap(); + + let meta = workspace.get_meta("doc-1").await; + assert!(meta.is_some()); + let meta = meta.unwrap(); + assert_eq!(meta.id, "doc-1"); + assert_eq!(meta.doc_name, "Test Doc"); + assert_eq!(meta.doc_type, "md"); + } + + #[tokio::test] + async fn test_async_workspace_concurrent_access() { + let backend = Arc::new(super::super::backend::MemoryBackend::new()); + let workspace = Arc::new(Workspace::with_backend(backend).await.unwrap()); + + // Spawn multiple concurrent tasks + let mut handles = vec![]; + + for i in 0..10 { + let ws = workspace.clone(); + let handle = tokio::spawn(async move { + let id = format!("doc-{}", i); + let doc = create_test_doc(&id); + ws.add(&doc).await.unwrap(); + let loaded = ws.load(&id).await.unwrap(); + assert!(loaded.is_some()); + }); + handles.push(handle); + } + + // Wait for all tasks + for handle in handles { + handle.await.unwrap(); + } + + assert_eq!(workspace.len().await, 10); } } diff --git a/src/util/token.rs b/src/util/token.rs index 129032c2..390f20cf 100644 --- a/src/util/token.rs +++ b/src/util/token.rs @@ -32,7 +32,7 @@ fn get_bpe() -> &'static CoreBPE { /// # Example /// /// ``` -/// use vectorless::domain::estimate_tokens; +/// use vectorless::estimate_tokens; /// /// assert_eq!(estimate_tokens(""), 0); /// assert!(estimate_tokens("hello world") > 0); @@ -54,7 +54,7 @@ pub fn estimate_tokens(text: &str) -> usize { /// # Example /// /// ``` -/// use vectorless::domain::estimate_tokens_fast; +/// use vectorless::estimate_tokens_fast; /// /// assert_eq!(estimate_tokens_fast(""), 0); /// assert_eq!(estimate_tokens_fast("hi"), 1); // 2 chars -> 1 token min