From 8d05972f4319da4a9a15b033f5a12628a92a3aa4 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Sun, 5 Apr 2026 11:21:16 +0800 Subject: [PATCH 1/8] refactor: move domain types to document module and update error handling BREAKING CHANGE: The `domain` module has been renamed to `document` and error types are now in their own `error` module. All imports using `vectorless::domain::*` should be updated to `vectorless::document::*`. The `Error` and `Result` types are now accessible via `vectorless::error::*`. - Rename `src/domain/mod.rs` to `src/document/mod.rs` - Move `StructureNode` and `DocumentStructure` types to new `src/document/structure.rs` module - Remove error types from document module and create separate error module - Update all imports across the codebase to reflect new module structure - Move token estimation utilities to `util` module - Update example files to use new import paths --- examples/content_aggregation.rs | 6 +-- examples/index.rs | 4 +- examples/retrieve.rs | 2 +- src/client/engine.rs | 3 +- src/client/indexer.rs | 2 +- src/client/retriever.rs | 3 +- src/client/session.rs | 3 +- src/client/types.rs | 2 +- src/client/workspace.rs | 3 +- src/{domain => document}/mod.rs | 14 +++--- src/{domain => document}/node.rs | 0 src/document/structure.rs | 67 ++++++++++++++++++++++++++ src/{domain => document}/toc.rs | 0 src/{domain => document}/tree.rs | 29 +---------- src/{domain => }/error.rs | 0 src/index/incremental/detector.rs | 2 +- src/index/incremental/updater.rs | 3 +- src/index/pipeline/context.rs | 2 +- src/index/pipeline/executor.rs | 2 +- src/index/pipeline/orchestrator.rs | 6 +-- src/index/stages/build.rs | 4 +- src/index/stages/enhance.rs | 4 +- src/index/stages/enrich.rs | 5 +- src/index/stages/mod.rs | 2 +- src/index/stages/optimize.rs | 10 ++-- src/index/stages/parse.rs | 4 +- src/index/stages/persist.rs | 6 +-- src/index/summary/full.rs | 2 +- src/index/summary/selective.rs | 2 +- src/index/summary/strategy.rs | 2 +- src/lib.rs | 18 +++++-- src/llm/error.rs | 4 +- src/parser/docx/parser.rs | 3 +- src/parser/markdown/parser.rs | 5 +- src/parser/pdf/parser.rs | 3 +- src/parser/pdf/types.rs | 2 +- src/parser/registry.rs | 3 +- src/parser/toc/assigner.rs | 2 +- src/parser/toc/detector.rs | 2 +- src/parser/toc/parser.rs | 2 +- src/parser/toc/processor.rs | 2 +- src/parser/toc/repairer.rs | 2 +- src/parser/toc/verifier.rs | 2 +- src/parser/traits.rs | 4 +- src/retrieval/cache/path_cache.rs | 2 +- src/retrieval/content/aggregator.rs | 5 +- src/retrieval/content/budget.rs | 5 +- src/retrieval/content/builder.rs | 10 ++-- src/retrieval/content/scorer.rs | 5 +- src/retrieval/context.rs | 5 +- src/retrieval/pilot/builder.rs | 10 ++-- src/retrieval/pilot/decision.rs | 4 +- src/retrieval/pilot/llm_pilot.rs | 10 ++-- src/retrieval/pilot/noop.rs | 4 +- src/retrieval/pilot/parser.rs | 4 +- src/retrieval/pilot/trait.rs | 2 +- src/retrieval/pipeline/context.rs | 2 +- src/retrieval/pipeline/orchestrator.rs | 7 +-- src/retrieval/pipeline/stage.rs | 2 +- src/retrieval/pipeline_retriever.rs | 3 +- src/retrieval/retriever.rs | 2 +- src/retrieval/search/beam.rs | 2 +- src/retrieval/search/greedy.rs | 2 +- src/retrieval/search/mcts.rs | 2 +- src/retrieval/search/scorer.rs | 2 +- src/retrieval/search/trait.rs | 2 +- src/retrieval/stages/analyze.rs | 6 +-- src/retrieval/stages/judge.rs | 8 +-- src/retrieval/stages/plan.rs | 2 +- src/retrieval/stages/search.rs | 4 +- src/retrieval/strategy/keyword.rs | 2 +- src/retrieval/strategy/llm.rs | 2 +- src/retrieval/strategy/semantic.rs 
| 2 +- src/retrieval/strategy/trait.rs | 2 +- src/retrieval/types.rs | 2 +- src/storage/persistence.rs | 3 +- src/storage/workspace.rs | 3 +- src/util/mod.rs | 8 +++ src/{domain => util}/token.rs | 0 79 files changed, 231 insertions(+), 153 deletions(-) rename src/{domain => document}/mod.rs (55%) rename src/{domain => document}/node.rs (100%) create mode 100644 src/document/structure.rs rename src/{domain => document}/toc.rs (100%) rename src/{domain => document}/tree.rs (96%) rename src/{domain => }/error.rs (100%) create mode 100644 src/util/mod.rs rename src/{domain => util}/token.rs (100%) diff --git a/examples/content_aggregation.rs b/examples/content_aggregation.rs index 5fe71a32..9ead2aeb 100644 --- a/examples/content_aggregation.rs +++ b/examples/content_aggregation.rs @@ -19,12 +19,12 @@ use vectorless::retrieval::content::{ StructureBuilder, OutputFormat, RelevanceScorer, ScoringStrategyConfig, ContentChunk, ScoringContext, }; -use vectorless::domain::NodeId; +use vectorless::document::NodeId; use indextree::Arena; fn make_node_id() -> NodeId { let mut arena = Arena::new(); - let node = vectorless::domain::TreeNode { + let node = vectorless::document::TreeNode { title: "Test".to_string(), structure: String::new(), content: String::new(), @@ -135,7 +135,7 @@ fn main() { for (name, format) in formats { let builder = StructureBuilder::new(format); - let tree = vectorless::domain::DocumentTree::new("Test", ""); + let tree = vectorless::document::DocumentTree::new("Test", ""); let structured = builder.build(result.selected.clone(), &tree); println!("\n{} Output ({} chars, {} tokens):", name, structured.content.len(), structured.metadata.total_tokens); diff --git a/examples/index.rs b/examples/index.rs index cbb318b1..bd2b6aac 100644 --- a/examples/index.rs +++ b/examples/index.rs @@ -76,8 +76,8 @@ async fn main() -> vectorless::Result<()> { /// Print tree structure up to a maximum depth. fn print_tree_structure( - tree: &vectorless::domain::DocumentTree, - node_id: vectorless::domain::NodeId, + tree: &vectorless::document::DocumentTree, + node_id: vectorless::document::NodeId, current_depth: usize, max_depth: usize, ) { diff --git a/examples/retrieve.rs b/examples/retrieve.rs index f3ed1751..a8a86beb 100644 --- a/examples/retrieve.rs +++ b/examples/retrieve.rs @@ -16,7 +16,7 @@ //! 
``` use std::sync::Arc; -use vectorless::domain::DocumentTree; +use vectorless::document::DocumentTree; use vectorless::retrieval::{ PipelineRetriever, RetrieveOptions, Retriever, StrategyPreference, pipeline::RetrievalOrchestrator, diff --git a/src/client/engine.rs b/src/client/engine.rs index 8156586e..0c0785c4 100644 --- a/src/client/engine.rs +++ b/src/client/engine.rs @@ -45,7 +45,8 @@ use std::sync::{Arc, Mutex, RwLock}; use tracing::info; use crate::config::Config; -use crate::domain::{DocumentTree, Error, Result}; +use crate::error::Result; +use crate::{DocumentTree, Error}; use crate::index::PipelineExecutor; use crate::retrieval::{PipelineRetriever, RetrieveOptions}; use crate::storage::Workspace; diff --git a/src/client/indexer.rs b/src/client/indexer.rs index 7f41cde8..8ecb25d4 100644 --- a/src/client/indexer.rs +++ b/src/client/indexer.rs @@ -25,7 +25,7 @@ use std::sync::{Arc, Mutex}; use tracing::info; use uuid::Uuid; -use crate::domain::{Error, Result}; +use crate::error::{Error, Result}; use crate::index::{IndexInput, IndexMode, PipelineExecutor, PipelineOptions, SummaryStrategy}; use crate::parser::DocumentFormat; use crate::storage::{DocumentMeta, PersistedDocument}; diff --git a/src/client/retriever.rs b/src/client/retriever.rs index 7f0099ca..ee7a0cbd 100644 --- a/src/client/retriever.rs +++ b/src/client/retriever.rs @@ -22,7 +22,8 @@ use std::sync::Arc; use tracing::info; use crate::config::Config; -use crate::domain::{DocumentTree, Error, NodeId, Result}; +use crate::document::{DocumentTree, NodeId}; +use crate::error::{Error, Result}; use crate::retrieval::content::ContentAggregatorConfig; use crate::retrieval::{ QueryComplexity, RetrieveOptions, RetrieveResponse, RetrievalResult, Retriever, SufficiencyLevel, diff --git a/src/client/session.rs b/src/client/session.rs index 1b5d55ef..f659ac75 100644 --- a/src/client/session.rs +++ b/src/client/session.rs @@ -31,7 +31,8 @@ use std::time::{Duration, Instant}; use tracing::info; use uuid::Uuid; -use crate::domain::{DocumentTree, Error, Result}; +use crate::{DocumentTree, Error}; +use crate::error::Result; use crate::retrieval::RetrieveOptions; use crate::storage::PersistedDocument; diff --git a/src/client/types.rs b/src/client/types.rs index 40816257..861d52e6 100644 --- a/src/client/types.rs +++ b/src/client/types.rs @@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize}; use std::path::PathBuf; -use crate::domain::DocumentTree; +use crate::document::DocumentTree; use crate::parser::DocumentFormat; // ============================================================ diff --git a/src/client/workspace.rs b/src/client/workspace.rs index 731a5e71..feb4116d 100644 --- a/src/client/workspace.rs +++ b/src/client/workspace.rs @@ -27,7 +27,8 @@ use std::sync::{Arc, RwLock}; use tracing::{debug, info, warn}; -use crate::domain::{Error, Result}; +use crate::{Error}; +use crate::error::Result; use crate::storage::{DocumentMetaEntry, PersistedDocument, Workspace}; use super::events::{EventEmitter, WorkspaceEvent}; diff --git a/src/domain/mod.rs b/src/document/mod.rs similarity index 55% rename from src/domain/mod.rs rename to src/document/mod.rs index 75970a12..f045fcfe 100644 --- a/src/domain/mod.rs +++ b/src/document/mod.rs @@ -1,9 +1,9 @@ // Copyright (c) 2026 vectorless developers // SPDX-License-Identifier: Apache-2.0 -//! Domain layer - pure data structures with zero business logic. +//! Document types - pure data structures for document tree representation. //! -//! 
This module contains the core domain types that represent document trees. +//! This module contains the core types that represent hierarchical documents. //! These types have no dependencies on indexing or retrieval logic. //! //! # Types //! @@ -12,16 +12,14 @@ //! - [`DocumentTree`] - Arena-based tree structure //! - [`NodeId`] - Unique identifier for tree nodes //! - [`TocView`] - Table of Contents generator -//! - [`Error`] - Domain error types +//! - [`StructureNode`] - JSON export structure -mod error; mod node; +mod structure; mod toc; -mod token; mod tree; -pub use error::{Error, Result}; pub use node::{NodeId, TreeNode}; +pub use structure::{DocumentStructure, StructureNode}; pub use toc::{TocConfig, TocEntry, TocNode, TocView}; -pub use token::{estimate_tokens, estimate_tokens_batch, estimate_tokens_fast}; -pub use tree::{DocumentStructure, DocumentTree, RetrievalIndex, StructureNode}; +pub use tree::{DocumentTree, RetrievalIndex}; diff --git a/src/domain/node.rs b/src/document/node.rs similarity index 100% rename from src/domain/node.rs rename to src/document/node.rs diff --git a/src/document/structure.rs b/src/document/structure.rs new file mode 100644 index 00000000..6fa93b35 --- /dev/null +++ b/src/document/structure.rs @@ -0,0 +1,67 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Document structure types for JSON export. +//! +//! These types define the JSON format for exporting document trees, +//! compatible with PageIndex format. + +use serde::{Deserialize, Serialize}; + +/// A node in the document structure for JSON export. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct StructureNode { + /// Node title. + pub title: String, + /// Unique node identifier. + pub node_id: String, + /// Starting line number (1-based). + pub start_index: usize, + /// Ending line number (1-based). + pub end_index: usize, + /// Generated summary (optional). + #[serde(skip_serializing_if = "Option::is_none")] + pub summary: Option<String>, + /// Child nodes. + #[serde(skip_serializing_if = "Vec::is_empty")] + pub nodes: Vec<StructureNode>, +} + +/// Document structure for JSON export. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DocumentStructure { + /// Document name. + pub doc_name: String, + /// Tree structure. + pub structure: Vec<StructureNode>, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_structure_node_serialization() { + let node = StructureNode { + title: "Introduction".to_string(), + node_id: "0001".to_string(), + start_index: 1, + end_index: 10, + summary: Some("A brief intro".to_string()), + nodes: vec![], + }; + + let json = serde_json::to_string(&node).unwrap(); + assert!(json.contains("Introduction")); + } + + #[test] + fn test_document_structure() { + let doc = DocumentStructure { + doc_name: "test.md".to_string(), + structure: vec![], + }; + + assert_eq!(doc.doc_name, "test.md"); + } +} diff --git a/src/domain/toc.rs b/src/document/toc.rs similarity index 100% rename from src/domain/toc.rs rename to src/document/toc.rs diff --git a/src/domain/tree.rs b/src/document/tree.rs similarity index 96% rename from src/domain/tree.rs rename to src/document/tree.rs index 94f138a3..090dacae 100644 --- a/src/domain/tree.rs +++ b/src/document/tree.rs @@ -12,34 +12,7 @@ use indextree::Arena; use serde::{Deserialize, Serialize}; use super::node::{NodeId, TreeNode}; - -/// JSON structure for exporting document tree (matches PageIndex format). -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct StructureNode { - /// Node title.
- pub title: String, - /// Unique node identifier. - pub node_id: String, - /// Starting line number (1-based). - pub start_index: usize, - /// Ending line number (1-based). - pub end_index: usize, - /// Generated summary (optional). - #[serde(skip_serializing_if = "Option::is_none")] - pub summary: Option<String>, - /// Child nodes. - #[serde(skip_serializing_if = "Vec::is_empty")] - pub nodes: Vec<StructureNode>, -} - -/// Document structure for JSON export. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct DocumentStructure { - /// Document name. - pub doc_name: String, - /// Tree structure. - pub structure: Vec<StructureNode>, -} +use super::structure::{DocumentStructure, StructureNode}; /// Pre-computed index for efficient retrieval operations. /// diff --git a/src/domain/error.rs b/src/error.rs similarity index 100% rename from src/domain/error.rs rename to src/error.rs diff --git a/src/index/incremental/detector.rs b/src/index/incremental/detector.rs index 688197b0..1db0d4fc 100644 --- a/src/index/incremental/detector.rs +++ b/src/index/incremental/detector.rs @@ -8,7 +8,7 @@ use std::hash::{Hash, Hasher}; use std::path::Path; use std::time::SystemTime; -use crate::domain::DocumentTree; +use crate::document::DocumentTree; /// Type of change detected. #[derive(Debug, Clone, Copy, PartialEq, Eq)] diff --git a/src/index/incremental/updater.rs b/src/index/incremental/updater.rs index 2762df9b..fd1575df 100644 --- a/src/index/incremental/updater.rs +++ b/src/index/incremental/updater.rs @@ -5,7 +5,8 @@ use tracing::info; -use crate::domain::{DocumentTree, NodeId, Result}; +use crate::document::{DocumentTree, NodeId}; +use crate::error::Result; use crate::parser::RawNode; use super::detector::ChangeDetector; diff --git a/src/index/pipeline/context.rs b/src/index/pipeline/context.rs index 656d7909..777033fc 100644 --- a/src/index/pipeline/context.rs +++ b/src/index/pipeline/context.rs @@ -6,7 +6,7 @@ use std::collections::HashMap; use std::path::PathBuf; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; use crate::llm::LlmClient; use crate::parser::{DocumentFormat, RawNode}; diff --git a/src/index/pipeline/executor.rs b/src/index/pipeline/executor.rs index e1c12506..ee560e91 100644 --- a/src/index/pipeline/executor.rs +++ b/src/index/pipeline/executor.rs @@ -8,7 +8,7 @@ use tracing::info; -use crate::domain::Result; +use crate::error::Result; use crate::llm::LlmClient; use super::super::PipelineOptions; diff --git a/src/index/pipeline/orchestrator.rs b/src/index/pipeline/orchestrator.rs index fb471f51..299317d8 100644 --- a/src/index/pipeline/orchestrator.rs +++ b/src/index/pipeline/orchestrator.rs @@ -27,7 +27,7 @@ use std::collections::HashMap; use std::time::Instant; use tracing::{error, info, warn}; -use crate::domain::Result; +use crate::error::Result; use super::super::PipelineOptions; use super::super::stages::IndexStage; @@ -208,7 +208,7 @@ impl PipelineOrchestrator { for entry in &self.stages { for dep in &entry.depends_on { if !name_to_idx.contains_key(dep.as_str()) { - return Err(crate::domain::Error::Config(format!( + return Err(crate::error::Error::Config(format!( "Stage '{}' depends on non-existent stage '{}'", entry.stage.name(), dep @@ -265,7 +265,7 @@ impl PipelineOrchestrator { .filter(|&&i| !result.contains(&i)) .map(|&i| self.stages[i].stage.name()) .collect(); - return Err(crate::domain::Error::Config(format!( + return Err(crate::error::Error::Config(format!( "Circular dependency detected involving stages: {:?}", remaining ))); diff --git
a/src/index/stages/build.rs b/src/index/stages/build.rs index ed7f0ee9..1ab16d26 100644 --- a/src/index/stages/build.rs +++ b/src/index/stages/build.rs @@ -7,8 +7,10 @@ use super::async_trait; use std::time::Instant; use tracing::info; -use crate::domain::{DocumentTree, NodeId, Result, estimate_tokens}; +use crate::document::{DocumentTree, NodeId}; +use crate::error::Result; use crate::parser::RawNode; +use crate::util::estimate_tokens; use super::{IndexStage, StageResult}; use crate::index::ThinningConfig; diff --git a/src/index/stages/enhance.rs b/src/index/stages/enhance.rs index f510e2e0..d1d0f6fd 100644 --- a/src/index/stages/enhance.rs +++ b/src/index/stages/enhance.rs @@ -8,7 +8,9 @@ use std::sync::Arc; use std::time::Instant; use tracing::{info, warn}; -use crate::domain::{DocumentTree, NodeId, Result}; + +use crate::error::Result; +use crate::document::{DocumentTree, NodeId}; use crate::llm::LlmClient; use super::{IndexStage, StageResult}; diff --git a/src/index/stages/enrich.rs b/src/index/stages/enrich.rs index 2c3759fe..7b0c670d 100644 --- a/src/index/stages/enrich.rs +++ b/src/index/stages/enrich.rs @@ -7,7 +7,8 @@ use super::async_trait; use std::time::Instant; use tracing::info; -use crate::domain::{DocumentTree, NodeId, Result, TocView}; +use crate::document::{DocumentTree, NodeId, TocView}; +use crate::error::Result; use super::{IndexStage, StageResult}; use crate::index::pipeline::IndexContext; @@ -116,7 +117,7 @@ impl IndexStage for EnrichStage { let tree = ctx .tree .as_mut() - .ok_or_else(|| crate::domain::Error::IndexBuild("Tree not built".to_string()))?; + .ok_or_else(|| crate::Error::IndexBuild("Tree not built".to_string()))?; // 1. Calculate page ranges Self::calculate_page_ranges(tree); diff --git a/src/index/stages/mod.rs b/src/index/stages/mod.rs index 9d6f8c85..5a55383d 100644 --- a/src/index/stages/mod.rs +++ b/src/index/stages/mod.rs @@ -18,7 +18,7 @@ pub use parse::ParseStage; pub use persist::PersistStage; use super::pipeline::{FailurePolicy, IndexContext, StageResult}; -use crate::domain::Result; +use crate::error::Result; pub use async_trait::async_trait; /// Index pipeline stage. diff --git a/src/index/stages/optimize.rs b/src/index/stages/optimize.rs index d84633bf..571e947d 100644 --- a/src/index/stages/optimize.rs +++ b/src/index/stages/optimize.rs @@ -7,7 +7,9 @@ use super::async_trait; use std::time::Instant; use tracing::info; -use crate::domain::{NodeId, Result}; + +use crate::error::Result; +use crate::document::{NodeId}; use crate::index::pipeline::IndexContext; use super::{IndexStage, StageResult}; @@ -23,7 +25,7 @@ impl OptimizeStage { /// Merge adjacent small leaf nodes. fn merge_small_leaves( - tree: &mut crate::domain::DocumentTree, + tree: &mut crate::document::DocumentTree, min_tokens: usize, metrics: &mut crate::index::IndexMetrics, ) -> usize { @@ -86,7 +88,7 @@ impl OptimizeStage { } /// Remove empty intermediate nodes. 
- fn remove_empty_nodes(tree: &mut crate::domain::DocumentTree) -> usize { + fn remove_empty_nodes(tree: &mut crate::document::DocumentTree) -> usize { let mut removed_count = 0; // Find nodes with no content and only one child @@ -154,7 +156,7 @@ impl IndexStage for OptimizeStage { let tree = ctx .tree .as_mut() - .ok_or_else(|| crate::domain::Error::IndexBuild("Tree not built".to_string()))?; + .ok_or_else(|| crate::Error::IndexBuild("Tree not built".to_string()))?; let mut merged_count = 0; diff --git a/src/index/stages/parse.rs b/src/index/stages/parse.rs index 0322760e..150d1803 100644 --- a/src/index/stages/parse.rs +++ b/src/index/stages/parse.rs @@ -7,7 +7,7 @@ use super::async_trait; use std::time::Instant; use tracing::info; -use crate::domain::Result; +use crate::error::Result; use crate::parser::DocumentFormat; use crate::parser::ParserRegistry; @@ -35,7 +35,7 @@ impl ParseStage { IndexInput::File(path) => { let ext = path.extension().and_then(|e| e.to_str()).unwrap_or(""); DocumentFormat::from_extension(ext).ok_or_else(|| { - crate::domain::Error::Parse(format!("Unknown format: {}", ext)) + crate::Error::Parse(format!("Unknown format: {}", ext)) }) } IndexInput::Content { format, .. } => Ok(*format), diff --git a/src/index/stages/persist.rs b/src/index/stages/persist.rs index d2ac2e47..e0d93f7d 100644 --- a/src/index/stages/persist.rs +++ b/src/index/stages/persist.rs @@ -7,7 +7,7 @@ use super::async_trait; use std::time::Instant; use tracing::info; -use crate::domain::Result; +use crate::error::Result; use crate::storage::{DocumentMeta as StorageMeta, PersistedDocument, Workspace}; use super::{IndexStage, StageResult}; @@ -37,12 +37,12 @@ impl PersistStage { let workspace = self .workspace .as_mut() - .ok_or_else(|| crate::domain::Error::Config("No workspace configured".to_string()))?; + .ok_or_else(|| crate::Error::Config("No workspace configured".to_string()))?; let tree = ctx .tree .as_ref() - .ok_or_else(|| crate::domain::Error::IndexBuild("Tree not built".to_string()))?; + .ok_or_else(|| crate::Error::IndexBuild("Tree not built".to_string()))?; // Create metadata let meta = StorageMeta::new(&ctx.doc_id, &ctx.name, ctx.format.extension()) diff --git a/src/index/summary/full.rs b/src/index/summary/full.rs index 9c1eff00..c9e76e33 100644 --- a/src/index/summary/full.rs +++ b/src/index/summary/full.rs @@ -3,7 +3,7 @@ //! Full summary strategy - generate summaries for all nodes. -use crate::domain::NodeId; +use crate::document::NodeId; use crate::llm::LlmClient; use super::{SummaryGenerator, SummaryStrategyConfig}; diff --git a/src/index/summary/selective.rs b/src/index/summary/selective.rs index 3049278e..18c8946e 100644 --- a/src/index/summary/selective.rs +++ b/src/index/summary/selective.rs @@ -3,7 +3,7 @@ //! Selective summary strategy - generate summaries only for qualifying nodes. -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; use crate::llm::LlmClient; use super::{SummaryGenerator, SummaryStrategyConfig}; diff --git a/src/index/summary/strategy.rs b/src/index/summary/strategy.rs index 5b731232..eac0055c 100644 --- a/src/index/summary/strategy.rs +++ b/src/index/summary/strategy.rs @@ -5,7 +5,7 @@ use async_trait::async_trait; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; use crate::llm::{LlmClient, LlmResult}; /// Configuration for summary strategies. 
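The src/lib.rs hunk below rewires the crate-root re-exports. Based only on those re-exports, the downstream migration implied by the BREAKING CHANGE note looks roughly like this (a sketch; note that the crate-root shortcuts themselves are unchanged): // Before this patch: use vectorless::domain::{DocumentTree, Error, Result, estimate_tokens}; // After this patch, the explicit paths become: use vectorless::document::DocumentTree; use vectorless::error::{Error, Result}; use vectorless::util::estimate_tokens; // Code that relied on the crate-root re-exports keeps compiling as-is: use vectorless::{DocumentTree, Error, Result, estimate_tokens};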
diff --git a/src/lib.rs b/src/lib.rs index 8a9e5615..3a283c32 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -106,13 +106,15 @@ pub mod client; pub mod config; -pub mod domain; +pub mod document; +pub mod error; pub mod index; pub mod llm; pub mod parser; pub mod retrieval; pub mod storage; pub mod throttle; +pub mod util; // ============================================================================= // Re-exports (Convenience API) @@ -121,12 +123,18 @@ pub mod throttle; // Client API (most common entry point) pub use client::{DocumentInfo, Engine, EngineBuilder, IndexedDocument}; -// Domain types -pub use domain::{ - DocumentStructure, DocumentTree, Error, NodeId, Result, StructureNode, TocConfig, TocEntry, - TocNode, TocView, TreeNode, estimate_tokens, estimate_tokens_fast, +// Error types +pub use error::{Error, Result}; + +// Document types +pub use document::{ + DocumentStructure, DocumentTree, NodeId, StructureNode, TocConfig, TocEntry, + TocNode, TocView, TreeNode, }; +// Utility functions +pub use util::{estimate_tokens, estimate_tokens_fast}; + // Configuration pub use config::{Config, ConfigLoader, RetrievalConfig, SummaryConfig}; diff --git a/src/llm/error.rs b/src/llm/error.rs index 2cd8245d..5969cf72 100644 --- a/src/llm/error.rs +++ b/src/llm/error.rs @@ -93,9 +93,9 @@ impl From for LlmError { } } -impl From<LlmError> for crate::domain::Error { +impl From<LlmError> for crate::Error { fn from(e: LlmError) -> Self { - crate::domain::Error::Llm(e.to_string()) + crate::Error::Llm(e.to_string()) } } diff --git a/src/parser/docx/parser.rs b/src/parser/docx/parser.rs index dd59ccca..15d593c8 100644 --- a/src/parser/docx/parser.rs +++ b/src/parser/docx/parser.rs @@ -32,7 +32,8 @@ use std::path::Path; use async_trait::async_trait; use zip::ZipArchive; -use crate::domain::{Error, Result}; +use crate::{Error}; +use crate::error::Result; use crate::parser::{DocumentFormat, DocumentMeta, DocumentParser, ParseResult, RawNode}; use super::styles::StyleResolver; diff --git a/src/parser/markdown/parser.rs b/src/parser/markdown/parser.rs index 366be1be..cc6df8a1 100644 --- a/src/parser/markdown/parser.rs +++ b/src/parser/markdown/parser.rs @@ -7,7 +7,8 @@ use async_trait::async_trait; use pulldown_cmark::Options; use std::path::Path; -use crate::domain::{Result, estimate_tokens}; +use crate::error::Result; +use crate::util::estimate_tokens; use crate::parser::{DocumentFormat, DocumentMeta, DocumentParser, ParseResult, RawNode}; use super::config::MarkdownConfig; @@ -398,7 +399,7 @@ impl DocumentParser for MarkdownParser { async fn parse_file(&self, path: &Path) -> Result<ParseResult> { let content = tokio::fs::read_to_string(path) .await - .map_err(|e| crate::domain::Error::Parse(format!("Failed to read file: {}", e)))?; + .map_err(|e| crate::Error::Parse(format!("Failed to read file: {}", e)))?; let mut result = self.parse(&content).await?; diff --git a/src/parser/pdf/parser.rs b/src/parser/pdf/parser.rs index c047d21a..a96bf0c2 100644 --- a/src/parser/pdf/parser.rs +++ b/src/parser/pdf/parser.rs @@ -8,7 +8,8 @@ use std::path::Path; use lopdf::Document as LopdfDocument; use tracing::{info, warn}; -use crate::domain::{Error, Result}; +use crate::{Error}; +use crate::error::Result; use crate::parser::DocumentParser; use crate::parser::toc::TocProcessor; diff --git a/src/parser/pdf/types.rs b/src/parser/pdf/types.rs index 8c6e27b0..1c2ac9fc 100644 --- a/src/parser/pdf/types.rs +++ b/src/parser/pdf/types.rs @@ -3,7 +3,7 @@ //! PDF document types.
-use crate::domain::estimate_tokens; +use crate::util::estimate_tokens; use serde::{Deserialize, Serialize}; /// A single page from a PDF document. diff --git a/src/parser/registry.rs b/src/parser/registry.rs index 947552ac..ae632e4c 100644 --- a/src/parser/registry.rs +++ b/src/parser/registry.rs @@ -11,7 +11,8 @@ use std::collections::HashMap; use std::path::Path; use std::sync::{Arc, RwLock}; -use crate::domain::{Error, Result}; +use crate::{Error}; +use crate::error::Result; use crate::parser::{DocumentFormat, DocumentParser, MarkdownParser, ParseResult, PdfParser}; /// Type alias for parser factory functions. diff --git a/src/parser/toc/assigner.rs b/src/parser/toc/assigner.rs index 86087885..a62e6486 100644 --- a/src/parser/toc/assigner.rs +++ b/src/parser/toc/assigner.rs @@ -7,7 +7,7 @@ use std::collections::HashMap; use tracing::{debug, info}; use crate::config::LlmConfig; -use crate::domain::Result; +use crate::error::Result; use crate::parser::pdf::PdfPage; use super::types::{PageOffset, TocEntry}; diff --git a/src/parser/toc/detector.rs b/src/parser/toc/detector.rs index f8112f07..6688adfc 100644 --- a/src/parser/toc/detector.rs +++ b/src/parser/toc/detector.rs @@ -7,7 +7,7 @@ use regex::Regex; use tracing::debug; use crate::config::LlmConfig; -use crate::domain::Result; +use crate::error::Result; use super::types::TocDetection; use crate::llm::LlmClient; diff --git a/src/parser/toc/parser.rs b/src/parser/toc/parser.rs index 9cbeee1f..20b61af2 100644 --- a/src/parser/toc/parser.rs +++ b/src/parser/toc/parser.rs @@ -6,7 +6,7 @@ use tracing::debug; use crate::config::LlmConfig; -use crate::domain::Result; +use crate::error::Result; use super::types::TocEntry; use crate::llm::LlmClient; diff --git a/src/parser/toc/processor.rs b/src/parser/toc/processor.rs index 991b0f6d..7b7cf945 100644 --- a/src/parser/toc/processor.rs +++ b/src/parser/toc/processor.rs @@ -5,7 +5,7 @@ use tracing::{debug, info, warn}; -use crate::domain::Result; +use crate::error::Result; use crate::parser::pdf::PdfPage; use super::assigner::{PageAssigner, PageAssignerConfig}; diff --git a/src/parser/toc/repairer.rs b/src/parser/toc/repairer.rs index 4a00383c..8a26b8cd 100644 --- a/src/parser/toc/repairer.rs +++ b/src/parser/toc/repairer.rs @@ -6,7 +6,7 @@ use tracing::{debug, info}; use crate::config::LlmConfig; -use crate::domain::Result; +use crate::error::Result; use crate::parser::pdf::PdfPage; use super::types::{TocEntry, VerificationError, VerificationReport}; diff --git a/src/parser/toc/verifier.rs b/src/parser/toc/verifier.rs index e1e9c457..a0243bc1 100644 --- a/src/parser/toc/verifier.rs +++ b/src/parser/toc/verifier.rs @@ -7,7 +7,7 @@ use rand::seq::SliceRandom; use tracing::{debug, info}; use crate::config::LlmConfig; -use crate::domain::Result; +use crate::error::Result; use crate::parser::pdf::PdfPage; use super::types::{ErrorType, TocEntry, VerificationError, VerificationReport}; diff --git a/src/parser/traits.rs b/src/parser/traits.rs index 551aed86..296fcabe 100644 --- a/src/parser/traits.rs +++ b/src/parser/traits.rs @@ -7,7 +7,7 @@ use async_trait::async_trait; use std::path::Path; use super::{DocumentFormat, ParseResult}; -use crate::domain::Result; +use crate::error::Result; /// A parser for extracting content from documents. 
/// @@ -54,7 +54,7 @@ pub trait DocumentParser: Send + Sync { async fn parse_file(&self, path: &Path) -> Result<ParseResult> { let content = tokio::fs::read_to_string(path) .await - .map_err(|e| crate::domain::Error::Parse(format!("Failed to read file: {}", e)))?; + .map_err(|e| crate::Error::Parse(format!("Failed to read file: {}", e)))?; self.parse(&content).await } diff --git a/src/retrieval/cache/path_cache.rs b/src/retrieval/cache/path_cache.rs index e9202150..a394fa1f 100644 --- a/src/retrieval/cache/path_cache.rs +++ b/src/retrieval/cache/path_cache.rs @@ -9,7 +9,7 @@ use std::time::{Duration, Instant}; use super::super::types::SearchPath; use crate::config::CacheConfig as AppConfig; -use crate::domain::NodeId; +use crate::document::NodeId; /// Cache entry for a search path. #[derive(Debug, Clone)] diff --git a/src/retrieval/content/aggregator.rs b/src/retrieval/content/aggregator.rs index 9edb625b..87a8f20e 100644 --- a/src/retrieval/content/aggregator.rs +++ b/src/retrieval/content/aggregator.rs @@ -10,7 +10,8 @@ use std::collections::HashMap; use tracing::{debug, info}; -use crate::domain::{DocumentTree, NodeId, estimate_tokens}; +use crate::document::{DocumentTree, NodeId}; +use crate::util::estimate_tokens; use super::budget::{AllocationResult, AllocationStrategy, BudgetAllocator, SelectedContent}; use super::builder::{ContentMetadata, StructureBuilder, StructuredContent}; @@ -350,7 +351,7 @@ mod tests { fn make_test_node_id() -> NodeId { let mut arena = Arena::new(); - let node = crate::domain::TreeNode { + let node = crate::document::TreeNode { title: "Test".to_string(), structure: String::new(), content: String::new(), diff --git a/src/retrieval/content/budget.rs b/src/retrieval/content/budget.rs index fa91e9c0..1b4ed279 100644 --- a/src/retrieval/content/budget.rs +++ b/src/retrieval/content/budget.rs @@ -8,7 +8,8 @@ use std::collections::HashMap; -use crate::domain::{estimate_tokens, NodeId}; +use crate::document::NodeId; +use crate::util::estimate_tokens; use super::scorer::ContentRelevance; @@ -526,7 +527,7 @@ mod tests { fn make_test_node_id() -> NodeId { let mut arena = Arena::new(); - let node = crate::domain::TreeNode { + let node = crate::document::TreeNode { title: "Test".to_string(), structure: String::new(), content: String::new(), diff --git a/src/retrieval/content/builder.rs b/src/retrieval/content/builder.rs index c3b5792f..e0248e7b 100644 --- a/src/retrieval/content/builder.rs +++ b/src/retrieval/content/builder.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; -use crate::domain::DocumentTree; +use crate::document::DocumentTree; use super::budget::SelectedContent; use super::config::OutputFormatConfig; @@ -309,7 +309,7 @@ impl StructureBuilder { // Group by parent use std::collections::HashMap; - let mut by_parent: HashMap<Option<crate::domain::NodeId>, Vec<&SelectedContent>> = + let mut by_parent: HashMap<Option<crate::document::NodeId>, Vec<&SelectedContent>> = HashMap::new(); for content in &selected { @@ -327,7 +327,7 @@ impl StructureBuilder { // Build tree recursively fn build_node( content: &SelectedContent, - all_by_parent: &HashMap<Option<crate::domain::NodeId>, Vec<&SelectedContent>>, + all_by_parent: &HashMap<Option<crate::document::NodeId>, Vec<&SelectedContent>>, ) -> ContentTreeNode { let mut node = ContentTreeNode::new(content.title.clone()) .with_content(content.content.clone(), content.score); @@ -413,12 +413,12 @@ fn render_tree(node: &ContentTreeNode, depth: usize) -> String { #[cfg(test)] mod tests { use super::*; - use crate::domain::NodeId; + use crate::document::NodeId; use indextree::Arena; fn make_test_node_id() -> NodeId { let mut arena = Arena::new(); - let
node = crate::domain::TreeNode { + let node = crate::document::TreeNode { title: "Test".to_string(), structure: String::new(), content: String::new(), diff --git a/src/retrieval/content/scorer.rs b/src/retrieval/content/scorer.rs index ba04a6ce..daf49550 100644 --- a/src/retrieval/content/scorer.rs +++ b/src/retrieval/content/scorer.rs @@ -8,7 +8,8 @@ use std::collections::HashMap; -use crate::domain::{estimate_tokens, NodeId}; +use crate::document::NodeId; +use crate::util::estimate_tokens; use super::config::ScoringStrategyConfig; @@ -339,7 +340,7 @@ mod tests { fn make_test_node_id() -> NodeId { let mut arena = Arena::new(); - let node = crate::domain::TreeNode { + let node = crate::document::TreeNode { title: "Test".to_string(), structure: String::new(), content: String::new(), diff --git a/src/retrieval/context.rs b/src/retrieval/context.rs index 595c9083..c4f278b9 100644 --- a/src/retrieval/context.rs +++ b/src/retrieval/context.rs @@ -28,7 +28,8 @@ //! ``` use super::types::RetrievalResult; -use crate::domain::{DocumentTree, NodeId, estimate_tokens}; +use crate::document::{DocumentTree, NodeId}; +use crate::util::estimate_tokens; use std::collections::HashSet; /// Pruning strategy for context building. @@ -476,7 +477,7 @@ impl ContextBuilder { } } - fn format_node_section(&self, node: &crate::domain::TreeNode, depth: usize) -> String { + fn format_node_section(&self, node: &crate::document::TreeNode, depth: usize) -> String { let mut section = String::new(); if self.include_titles { diff --git a/src/retrieval/pilot/builder.rs b/src/retrieval/pilot/builder.rs index 725b4394..931c19b0 100644 --- a/src/retrieval/pilot/builder.rs +++ b/src/retrieval/pilot/builder.rs @@ -16,7 +16,7 @@ use std::collections::HashSet; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; use super::SearchState; /// Token budget distribution for context building. @@ -436,7 +436,7 @@ mod tests { fn create_test_tree() -> DocumentTree { let mut arena = Arena::new(); - let root = arena.new_node(crate::domain::TreeNode { + let root = arena.new_node(crate::document::TreeNode { title: "Root".to_string(), content: "Root content".to_string(), summary: "Root summary".to_string(), @@ -444,7 +444,7 @@ mod tests { ..Default::default() }); - let child1 = arena.new_node(crate::domain::TreeNode { + let child1 = arena.new_node(crate::document::TreeNode { title: "Configuration".to_string(), content: "Config content".to_string(), summary: "Configuration options".to_string(), @@ -452,7 +452,7 @@ mod tests { ..Default::default() }); - let child2 = arena.new_node(crate::domain::TreeNode { + let child2 = arena.new_node(crate::document::TreeNode { title: "API Reference".to_string(), content: "API content".to_string(), summary: "API documentation".to_string(), @@ -463,7 +463,7 @@ mod tests { root.append(child1, &mut arena); root.append(child2, &mut arena); - DocumentTree::from_raw(arena, crate::domain::NodeId(root)) + DocumentTree::from_raw(arena, crate::document::NodeId(root)) } #[test] diff --git a/src/retrieval/pilot/decision.rs b/src/retrieval/pilot/decision.rs index 69a117d6..084582c2 100644 --- a/src/retrieval/pilot/decision.rs +++ b/src/retrieval/pilot/decision.rs @@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize}; -use crate::domain::NodeId; +use crate::document::NodeId; /// Pilot's navigation decision result. 
/// @@ -243,7 +243,7 @@ mod tests { let mut arena = Arena::new(); let mut ids = Vec::new(); for i in 0..count { - let node = crate::domain::TreeNode { + let node = crate::document::TreeNode { title: format!("Node {}", i), structure: String::new(), content: String::new(), diff --git a/src/retrieval/pilot/llm_pilot.rs b/src/retrieval/pilot/llm_pilot.rs index c163396a..10118ff0 100644 --- a/src/retrieval/pilot/llm_pilot.rs +++ b/src/retrieval/pilot/llm_pilot.rs @@ -10,7 +10,7 @@ use async_trait::async_trait; use std::sync::Arc; use tracing::{debug, info, warn}; -use crate::domain::DocumentTree; +use crate::document::DocumentTree; use crate::llm::LlmClient; use super::builder::ContextBuilder; @@ -147,7 +147,7 @@ impl LlmPilot { &self, point: InterventionPoint, context: &super::builder::PilotContext, - candidates: &[crate::domain::NodeId], + candidates: &[crate::document::NodeId], ) -> PilotDecision { // Build prompt let prompt = self.prompt_builder.build(point, context); @@ -192,7 +192,7 @@ impl LlmPilot { /// Create a default decision when LLM fails. fn default_decision( &self, - candidates: &[crate::domain::NodeId], + candidates: &[crate::document::NodeId], point: InterventionPoint, ) -> PilotDecision { let ranked = candidates @@ -357,14 +357,14 @@ impl Pilot for LlmPilot { #[cfg(test)] mod tests { use super::*; - use crate::domain::NodeId; + use crate::document::NodeId; use indextree::Arena; fn create_test_node_ids(count: usize) -> Vec<NodeId> { let mut arena = Arena::new(); let mut ids = Vec::new(); for i in 0..count { - let node = crate::domain::TreeNode { + let node = crate::document::TreeNode { title: format!("Node {}", i), structure: String::new(), content: String::new(), diff --git a/src/retrieval/pilot/noop.rs b/src/retrieval/pilot/noop.rs index daa95648..b79156a5 100644 --- a/src/retrieval/pilot/noop.rs +++ b/src/retrieval/pilot/noop.rs @@ -9,7 +9,7 @@ use async_trait::async_trait; -use crate::domain::DocumentTree; +use crate::document::DocumentTree; use super::{InterventionPoint, Pilot, PilotConfig, PilotDecision, SearchState}; @@ -103,7 +103,7 @@ impl Pilot for NoopPilot { #[cfg(test)] mod tests { use super::*; - use crate::domain::NodeId; + use crate::document::NodeId; use std::collections::HashSet; #[test] diff --git a/src/retrieval/pilot/parser.rs b/src/retrieval/pilot/parser.rs index 9bb0bd48..ca88ff26 100644 --- a/src/retrieval/pilot/parser.rs +++ b/src/retrieval/pilot/parser.rs @@ -13,7 +13,7 @@ use regex::Regex; use serde::{Deserialize, Serialize}; use tracing::warn; -use crate::domain::NodeId; +use crate::document::NodeId; use super::decision::{PilotDecision, RankedCandidate, SearchDirection, InterventionPoint}; /// Parsed response from LLM.
@@ -348,7 +348,7 @@ mod tests { let mut arena = Arena::new(); let mut ids = Vec::new(); for i in 0..count { - let node = crate::domain::TreeNode { + let node = crate::document::TreeNode { title: format!("Node {}", i), structure: String::new(), content: String::new(), diff --git a/src/retrieval/pilot/trait.rs b/src/retrieval/pilot/trait.rs index 2017aa94..94e7fac7 100644 --- a/src/retrieval/pilot/trait.rs +++ b/src/retrieval/pilot/trait.rs @@ -11,7 +11,7 @@ use async_trait::async_trait; use std::collections::HashSet; use std::sync::LazyLock; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; use super::{PilotConfig, PilotDecision, InterventionPoint}; diff --git a/src/retrieval/pipeline/context.rs b/src/retrieval/pipeline/context.rs index b12d3d9f..3537e7a3 100644 --- a/src/retrieval/pipeline/context.rs +++ b/src/retrieval/pipeline/context.rs @@ -10,7 +10,7 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::Instant; -use crate::domain::{DocumentTree, NodeId, RetrievalIndex}; +use crate::document::{DocumentTree, NodeId, RetrievalIndex}; use crate::retrieval::pilot::Pilot; use crate::retrieval::types::{ NavigationStep, QueryComplexity, RetrieveOptions, RetrieveResponse, SearchPath, diff --git a/src/retrieval/pipeline/orchestrator.rs b/src/retrieval/pipeline/orchestrator.rs index 2dcde02e..fc013014 100644 --- a/src/retrieval/pipeline/orchestrator.rs +++ b/src/retrieval/pipeline/orchestrator.rs @@ -15,7 +15,8 @@ use std::sync::Arc; use std::time::Instant; use tracing::{debug, error, info, warn}; -use crate::domain::{DocumentTree, Result}; +use crate::document::{DocumentTree}; +use crate::error::Result; use crate::retrieval::pilot::{Pilot, SearchState}; // FailurePolicy is re-exported for stages use crate::retrieval::types::{RetrieveOptions, RetrieveResponse}; @@ -148,7 +149,7 @@ impl RetrievalOrchestrator { for entry in &self.stages { for dep in &entry.depends_on { if !name_to_idx.contains_key(dep.as_str()) { - return Err(crate::domain::Error::Config(format!( + return Err(crate::Error::Config(format!( "Stage '{}' depends on non-existent stage '{}'", entry.stage.name(), dep @@ -205,7 +206,7 @@ impl RetrievalOrchestrator { .filter(|i| !result.contains(i)) .map(|i| self.stages[i].stage.name()) .collect(); - return Err(crate::domain::Error::Config(format!( + return Err(crate::Error::Config(format!( "Circular dependency detected involving stages: {:?}", remaining ))); diff --git a/src/retrieval/pipeline/stage.rs b/src/retrieval/pipeline/stage.rs index 946a9fba..285c717f 100644 --- a/src/retrieval/pipeline/stage.rs +++ b/src/retrieval/pipeline/stage.rs @@ -9,7 +9,7 @@ use async_trait::async_trait; -use crate::domain::Result; +use crate::error::Result; use crate::index::pipeline::FailurePolicy; use super::context::PipelineContext; diff --git a/src/retrieval/pipeline_retriever.rs b/src/retrieval/pipeline_retriever.rs index e51d187a..b7254645 100644 --- a/src/retrieval/pipeline_retriever.rs +++ b/src/retrieval/pipeline_retriever.rs @@ -15,7 +15,8 @@ use super::retriever::{CostEstimate, Retriever, RetrieverError, RetrieverResult} use super::stages::{AnalyzeStage, JudgeStage, PlanStage, SearchStage}; use super::strategy::LlmStrategy; use super::types::{RetrieveOptions, RetrieveResponse}; -use crate::domain::DocumentTree; +use crate::document::DocumentTree; +use crate::error::Result; use crate::llm::LlmClient; use crate::retrieval::pilot::{LlmPilot, PilotConfig}; diff --git a/src/retrieval/retriever.rs b/src/retrieval/retriever.rs index 
83763cdb..97c280c0 100644 --- a/src/retrieval/retriever.rs +++ b/src/retrieval/retriever.rs @@ -6,7 +6,7 @@ use async_trait::async_trait; use super::types::{RetrieveOptions, RetrieveResponse}; -use crate::domain::DocumentTree; +use crate::document::DocumentTree; /// Result type for retriever operations. pub type RetrieverResult<T> = Result<T, RetrieverError>; diff --git a/src/retrieval/search/beam.rs b/src/retrieval/search/beam.rs index 2dec5e40..ea73051c 100644 --- a/src/retrieval/search/beam.rs +++ b/src/retrieval/search/beam.rs @@ -14,7 +14,7 @@ use super::super::RetrievalContext; use super::super::types::{NavigationDecision, NavigationStep, SearchPath}; use super::scorer::{NodeScorer, ScoringContext}; use super::{SearchConfig, SearchResult, SearchTree}; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; use crate::retrieval::pilot::{Pilot, SearchState}; /// Beam search - explores multiple paths simultaneously. diff --git a/src/retrieval/search/greedy.rs b/src/retrieval/search/greedy.rs index ad9fd8d8..89357225 100644 --- a/src/retrieval/search/greedy.rs +++ b/src/retrieval/search/greedy.rs @@ -13,7 +13,7 @@ use super::super::RetrievalContext; use super::super::types::{NavigationDecision, NavigationStep, SearchPath}; use super::scorer::{NodeScorer, ScoringContext}; use super::{SearchConfig, SearchResult, SearchTree}; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; use crate::retrieval::pilot::{Pilot, SearchState}; /// Greedy search - always follows the best single path. diff --git a/src/retrieval/search/mcts.rs b/src/retrieval/search/mcts.rs index 2cc6fbd0..667a0d28 100644 --- a/src/retrieval/search/mcts.rs +++ b/src/retrieval/search/mcts.rs @@ -14,7 +14,7 @@ use super::super::types::{NavigationDecision, NavigationStep, SearchPath}; use super::scorer::NodeScorer; use super::{SearchConfig, SearchResult, SearchTree}; use crate::config::StrategyConfig; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; use crate::retrieval::pilot::Pilot; /// Statistics for a node in MCTS. diff --git a/src/retrieval/search/scorer.rs b/src/retrieval/search/scorer.rs index e22f8239..0d051938 100644 --- a/src/retrieval/search/scorer.rs +++ b/src/retrieval/search/scorer.rs @@ -5,7 +5,7 @@ //! //! Implements the NodeScore formula: `Σ ChunkScore(n) / √(N+1)` -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; /// Context for scoring calculations. #[derive(Debug, Clone)] diff --git a/src/retrieval/search/trait.rs b/src/retrieval/search/trait.rs index 927753cf..1790b703 100644 --- a/src/retrieval/search/trait.rs +++ b/src/retrieval/search/trait.rs @@ -7,7 +7,7 @@ use async_trait::async_trait; use super::super::RetrievalContext; use super::super::types::{NavigationStep, SearchPath}; -use crate::domain::DocumentTree; +use crate::document::DocumentTree; use crate::retrieval::pilot::Pilot; /// Result of a search operation.
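The scorer.rs hunk above cites the NodeScore formula `Σ ChunkScore(n) / √(N+1)`. As a reading aid, a minimal sketch of that formula (assuming N is the number of scored chunks under the node; the actual NodeScorer body is not part of this diff): fn node_score(chunk_scores: &[f32]) -> f32 { // Sum the per-chunk scores, then damp by sqrt(N + 1) so that nodes with // many weakly matching chunks do not outrank a few strong matches. let n = chunk_scores.len() as f32; chunk_scores.iter().sum::<f32>() / (n + 1.0).sqrt() }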
diff --git a/src/retrieval/stages/analyze.rs b/src/retrieval/stages/analyze.rs index c26b7e4c..3eabca1f 100644 --- a/src/retrieval/stages/analyze.rs +++ b/src/retrieval/stages/analyze.rs @@ -11,7 +11,7 @@ use async_trait::async_trait; use tracing::info; -use crate::domain::{DocumentTree, TocView}; +use crate::document::{DocumentTree, TocView}; use crate::retrieval::complexity::ComplexityDetector; use crate::retrieval::pipeline::{FailurePolicy, PipelineContext, RetrievalStage, StageOutcome}; // QueryComplexity is used in context @@ -108,7 +108,7 @@ impl AnalyzeStage { let mut matches: Vec<(String, f32)> = Vec::new(); fn collect_sections( - nodes: &[crate::domain::TocNode], + nodes: &[crate::document::TocNode], query_lower: &str, matches: &mut Vec<(String, f32)>, ) { @@ -165,7 +165,7 @@ impl RetrievalStage for AnalyzeStage { FailurePolicy::fail() // Must succeed } - async fn execute(&self, ctx: &mut PipelineContext) -> crate::domain::Result<StageOutcome> { + async fn execute(&self, ctx: &mut PipelineContext) -> crate::error::Result<StageOutcome> { info!("Analyzing query: '{}'", ctx.query); // 1. Detect complexity diff --git a/src/retrieval/stages/judge.rs b/src/retrieval/stages/judge.rs index 9cc11e68..1178f402 100644 --- a/src/retrieval/stages/judge.rs +++ b/src/retrieval/stages/judge.rs @@ -10,7 +10,7 @@ use async_trait::async_trait; // Arc is used for async sharing use tracing::{info, warn}; -use crate::domain::estimate_tokens; +use crate::util::estimate_tokens; use crate::llm::LlmClient; use crate::retrieval::content::{ContentAggregator, ContentAggregatorConfig}; use crate::retrieval::pipeline::{FailurePolicy, PipelineContext, RetrievalStage, StageOutcome}; @@ -167,7 +167,7 @@ impl JudgeStage { } /// Collect content from leaf descendants of a node (excluding the node itself). - fn collect_leaf_content(&self, tree: &crate::domain::DocumentTree, node_id: crate::domain::NodeId) -> String { + fn collect_leaf_content(&self, tree: &crate::document::DocumentTree, node_id: crate::document::NodeId) -> String { let mut content_parts = Vec::new(); // Start with children, not the node itself @@ -177,7 +177,7 @@ impl JudgeStage { return String::new(); } - let mut stack: Vec<crate::domain::NodeId> = children; + let mut stack: Vec<crate::document::NodeId> = children; while let Some(current_id) = stack.pop() { let current_children = tree.children(current_id); @@ -319,7 +319,7 @@ impl RetrievalStage for JudgeStage { true // Can trigger backtracking to search } - async fn execute(&self, ctx: &mut PipelineContext) -> crate::domain::Result<StageOutcome> { + async fn execute(&self, ctx: &mut PipelineContext) -> crate::error::Result<StageOutcome> { let start = std::time::Instant::now(); info!( diff --git a/src/retrieval/stages/plan.rs b/src/retrieval/stages/plan.rs index 7177322b..0b98003c 100644 --- a/src/retrieval/stages/plan.rs +++ b/src/retrieval/stages/plan.rs @@ -155,7 +155,7 @@ impl RetrievalStage for PlanStage { FailurePolicy::fail() // Must succeed } - async fn execute(&self, ctx: &mut PipelineContext) -> crate::domain::Result<StageOutcome> { + async fn execute(&self, ctx: &mut PipelineContext) -> crate::error::Result<StageOutcome> { info!("Planning retrieval strategy"); // 1.
Select strategy diff --git a/src/retrieval/stages/search.rs b/src/retrieval/stages/search.rs index e9addfe7..121378f5 100644 --- a/src/retrieval/stages/search.rs +++ b/src/retrieval/stages/search.rs @@ -11,7 +11,7 @@ use async_trait::async_trait; use std::sync::Arc; use tracing::{info, warn}; -use crate::domain::DocumentTree; +use crate::document::DocumentTree; // LlmClient is used via strategy use crate::retrieval::pilot::Pilot; use crate::retrieval::RetrievalContext; // Legacy context @@ -187,7 +187,7 @@ impl RetrievalStage for SearchStage { true // Can receive backtracks from judge } - async fn execute(&self, ctx: &mut PipelineContext) -> crate::domain::Result<StageOutcome> { + async fn execute(&self, ctx: &mut PipelineContext) -> crate::error::Result<StageOutcome> { let start = std::time::Instant::now(); // Get strategy and algorithm diff --git a/src/retrieval/strategy/keyword.rs b/src/retrieval/strategy/keyword.rs index bfb34a68..7e505f0e 100644 --- a/src/retrieval/strategy/keyword.rs +++ b/src/retrieval/strategy/keyword.rs @@ -11,7 +11,7 @@ use std::collections::{HashMap, HashSet}; use super::super::RetrievalContext; use super::super::types::{NavigationDecision, QueryComplexity}; use super::r#trait::{NodeEvaluation, RetrievalStrategy, StrategyCapabilities}; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; /// Keyword-based retrieval strategy. /// diff --git a/src/retrieval/strategy/llm.rs b/src/retrieval/strategy/llm.rs index 7a3ed89e..c1ca5037 100644 --- a/src/retrieval/strategy/llm.rs +++ b/src/retrieval/strategy/llm.rs @@ -11,7 +11,7 @@ use serde::Deserialize; use super::super::RetrievalContext; use super::super::types::{NavigationDecision, QueryComplexity}; use super::r#trait::{NodeEvaluation, RetrievalStrategy, StrategyCapabilities}; -use crate::domain::{DocumentTree, NodeId, TocView}; +use crate::document::{DocumentTree, NodeId, TocView}; use crate::llm::LlmClient; /// LLM response for navigation decision. diff --git a/src/retrieval/strategy/semantic.rs b/src/retrieval/strategy/semantic.rs index 170e7998..1e924538 100644 --- a/src/retrieval/strategy/semantic.rs +++ b/src/retrieval/strategy/semantic.rs @@ -11,7 +11,7 @@ use super::super::RetrievalContext; use super::super::types::{NavigationDecision, QueryComplexity}; use super::r#trait::{NodeEvaluation, RetrievalStrategy, StrategyCapabilities}; use crate::config::StrategyConfig; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; /// Embedding model trait for semantic strategies. #[async_trait] diff --git a/src/retrieval/strategy/trait.rs b/src/retrieval/strategy/trait.rs index 3699a128..895d60a2 100644 --- a/src/retrieval/strategy/trait.rs +++ b/src/retrieval/strategy/trait.rs @@ -7,7 +7,7 @@ use async_trait::async_trait; use super::super::RetrievalContext; use super::super::types::{NavigationDecision, QueryComplexity}; -use crate::domain::{DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; /// Result of evaluating a single node. #[derive(Debug, Clone)] diff --git a/src/retrieval/types.rs b/src/retrieval/types.rs index 2077f325..82ee5504 100644 --- a/src/retrieval/types.rs +++ b/src/retrieval/types.rs @@ -6,7 +6,7 @@ use serde::{Deserialize, Serialize}; use super::context::{PruningStrategy, TokenEstimation}; -use crate::domain::NodeId; +use crate::document::NodeId; /// Query complexity level for adaptive strategy selection.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] diff --git a/src/storage/persistence.rs b/src/storage/persistence.rs index a77a3e0b..d2870f7c 100644 --- a/src/storage/persistence.rs +++ b/src/storage/persistence.rs @@ -7,7 +7,8 @@ use serde::{Deserialize, Serialize}; use std::io; use std::path::{Path, PathBuf}; -use crate::domain::{DocumentTree, Error, Result}; +use crate::{DocumentTree, Error}; +use crate::error::Result; /// Metadata for a persisted document. #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/src/storage/workspace.rs b/src/storage/workspace.rs index 9cd1a83f..530ad272 100644 --- a/src/storage/workspace.rs +++ b/src/storage/workspace.rs @@ -35,7 +35,8 @@ use serde::{Deserialize, Serialize}; use tracing::{debug, info, warn}; use super::persistence::{PersistedDocument, load_document, save_document}; -use crate::domain::{Error, Result}; +use crate::{Error}; +use crate::error::Result; const META_FILE: &str = "_meta.json"; const DEFAULT_CACHE_SIZE: usize = 100; diff --git a/src/util/mod.rs b/src/util/mod.rs new file mode 100644 index 00000000..9d22d295 --- /dev/null +++ b/src/util/mod.rs @@ -0,0 +1,8 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Utility functions and helpers. + +mod token; + +pub use token::{estimate_tokens, estimate_tokens_batch, estimate_tokens_fast}; diff --git a/src/domain/token.rs b/src/util/token.rs similarity index 100% rename from src/domain/token.rs rename to src/util/token.rs From f6f781910575cc35999c4b6343384b838450ad86 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Sun, 5 Apr 2026 11:41:47 +0800 Subject: [PATCH 2/8] feat(error): enhance error handling with comprehensive error type hierarchy Add extensive error variants categorized by domain including document & parsing, index, retrieval, LLM, storage, serialization, configuration, and validation errors. Include helper methods for error categorization and context addition with retryable, not-found, timeout, and configuration error checks. Add comprehensive tests for error functionality. refactor(index): move configuration types to dedicated module Relocate IndexMode, OptimizationConfig, ThinningConfig, and PipelineOptions to src/index/config.rs to improve module organization and separation of concerns. feat(storage): add document cache with LRU eviction policy Implement thread-safe LRU cache for documents using interior mutability via Mutex. Provide cache statistics, utilization tracking, and proper error handling for concurrent access scenarios. --- src/error.rs | 271 ++++++++++++++++++++++++++++++++++++- src/index/config.rs | 268 +++++++++++++++++++++++++++++++++++++ src/index/mod.rs | 143 ++------------------ src/storage/cache.rs | 282 +++++++++++++++++++++++++++++++++++++++ src/storage/mod.rs | 6 +- src/storage/workspace.rs | 134 ++++++++++--------- src/util/format.rs | 212 +++++++++++++++++++++++++++++ src/util/mod.rs | 13 ++ src/util/timing.rs | 159 ++++++++++++++++++++++ 9 files changed, 1287 insertions(+), 201 deletions(-) create mode 100644 src/index/config.rs create mode 100644 src/storage/cache.rs create mode 100644 src/util/format.rs create mode 100644 src/util/timing.rs diff --git a/src/error.rs b/src/error.rs index 2f91bd38..d9b8e7d5 100644 --- a/src/error.rs +++ b/src/error.rs @@ -2,40 +2,123 @@ // SPDX-License-Identifier: Apache-2.0 //! Error types for the vectorless library. +//! +//! This module provides a comprehensive error type hierarchy for all operations. +//! 
All errors are consolidated into [`Error`] with specific variants for each category. use thiserror::Error; /// The main error type for vectorless operations. #[derive(Debug, Error)] pub enum Error { + // ========================================================================= + // Document & Parsing Errors + // ========================================================================= + /// An error occurred while parsing a document. #[error("Document parsing error: {0}")] Parse(String), + /// Unsupported document format. + #[error("Unsupported document format: {0}")] + UnsupportedFormat(String), + + /// Invalid document structure. + #[error("Invalid document structure: {0}")] + InvalidStructure(String), + + // ========================================================================= + // Index Errors + // ========================================================================= + /// An error occurred while building the index. #[error("Index building error: {0}")] IndexBuild(String), + /// Index not found. + #[error("Index not found: {0}")] + IndexNotFound(String), + + /// Index corrupted. + #[error("Index corrupted: {0}")] + IndexCorrupted(String), + + // ========================================================================= + // Retrieval Errors + // ========================================================================= + /// An error occurred during retrieval. #[error("Retrieval error: {0}")] Retrieval(String), - /// An error occurred during summarization. - #[error("Summarization error: {0}")] - Summarization(String), + /// No relevant content found. + #[error("No relevant content found for query")] + NoRelevantContent, + + /// Search timeout. + #[error("Search timeout after {0}ms")] + SearchTimeout(u64), + + // ========================================================================= + // LLM Errors + // ========================================================================= /// An error occurred during LLM call. #[error("LLM error: {0}")] Llm(String), + /// LLM rate limit exceeded. + #[error("LLM rate limit exceeded, retry after {0}ms")] + RateLimitExceeded(u64), + + /// LLM quota exceeded. + #[error("LLM quota exceeded")] + QuotaExceeded, + + // ========================================================================= + // Summary Errors + // ========================================================================= + + /// An error occurred during summarization. + #[error("Summarization error: {0}")] + Summarization(String), + + /// Summary too long. + #[error("Summary exceeds maximum length: {0} tokens")] + SummaryTooLong(usize), + + // ========================================================================= + // Storage Errors + // ========================================================================= + /// An error occurred during I/O operations. #[error("IO error: {0}")] Io(#[from] std::io::Error), + /// Workspace error. + #[error("Workspace error: {0}")] + Workspace(String), + + /// Cache error. + #[error("Cache error: {0}")] + Cache(String), + + // ========================================================================= + // Serialization Errors + // ========================================================================= + /// An error occurred during serialization/deserialization. #[error("Serialization error: {0}")] Serialization(#[from] serde_json::Error), + /// TOML parsing error. 
+ #[error("TOML parsing error: {0}")] + Toml(String), + + // ========================================================================= + // Node & Document Errors + // ========================================================================= + /// The requested node was not found. #[error("Node not found: {0}")] NodeNotFound(String), @@ -44,14 +127,196 @@ pub enum Error { #[error("Document not found: {0}")] DocumentNotFound(String), + // ========================================================================= + // Configuration Errors + // ========================================================================= + /// Invalid configuration. #[error("Invalid configuration: {0}")] Config(String), + /// Missing required configuration. + #[error("Missing required configuration: {0}")] + MissingConfig(String), + + // ========================================================================= + // Input Validation Errors + // ========================================================================= + + /// Invalid input. + #[error("Invalid input: {0}")] + InvalidInput(String), + + /// Empty input. + #[error("Empty input: {field}")] + EmptyInput { + /// The field that was empty. + field: String, + }, + + /// Out of range. + #[error("{field} out of range: expected {min}-{max}, got {actual}")] + OutOfRange { + /// The field that was out of range. + field: String, + /// Minimum allowed value. + min: String, + /// Maximum allowed value. + max: String, + /// Actual value received. + actual: String, + }, + + // ========================================================================= + // Throttle Errors + // ========================================================================= + + /// Throttle error. + #[error("Throttle error: {0}")] + Throttle(String), + + /// Concurrency limit exceeded. + #[error("Concurrency limit exceeded: {0} pending")] + ConcurrencyLimitExceeded(usize), + + // ========================================================================= + // Timeout Errors + // ========================================================================= + + /// Operation timeout. + #[error("Operation timeout: {0}")] + Timeout(String), + + // ========================================================================= + // Generic Errors + // ========================================================================= + /// A generic error with a message. #[error("{0}")] Other(String), + + /// Error with context. + #[error("{context}: {source}")] + WithContext { + /// Additional context describing where/why the error occurred. + context: String, + /// The underlying error. + #[source] + source: Box, + }, +} + +impl Error { + /// Create an error with additional context. + #[must_use] + pub fn with_context(self, context: impl Into) -> Self { + Self::WithContext { + context: context.into(), + source: Box::new(self), + } + } + + /// Check if this is a retryable error. + #[must_use] + pub fn is_retryable(&self) -> bool { + matches!( + self, + Self::RateLimitExceeded(_) + | Self::SearchTimeout(_) + | Self::Timeout(_) + | Self::Llm(_) + ) + } + + /// Check if this is a not found error. + #[must_use] + pub fn is_not_found(&self) -> bool { + matches!( + self, + Self::NodeNotFound(_) | Self::DocumentNotFound(_) | Self::IndexNotFound(_) + ) + } + + /// Check if this is a timeout error. + #[must_use] + pub fn is_timeout(&self) -> bool { + matches!(self, Self::Timeout(_) | Self::SearchTimeout(_)) + } + + /// Check if this is a configuration error. 
+ #[must_use] + pub fn is_config_error(&self) -> bool { + matches!(self, Self::Config(_) | Self::MissingConfig(_)) + } + + /// Create an empty input error. + pub fn empty_input(field: impl Into) -> Self { + Self::EmptyInput { + field: field.into(), + } + } + + /// Create an out of range error. + pub fn out_of_range( + field: impl Into, + min: impl Into, + max: impl Into, + actual: impl Into, + ) -> Self { + Self::OutOfRange { + field: field.into(), + min: min.into(), + max: max.into(), + actual: actual.into(), + } + } } /// A specialized result type for vectorless operations. pub type Result = std::result::Result; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_error_context() { + let inner = Error::Parse("test".to_string()); + let with_context = inner.with_context("While processing document"); + + let msg = format!("{}", with_context); + assert!(msg.contains("While processing document")); + assert!(msg.contains("test")); + } + + #[test] + fn test_is_retryable() { + assert!(Error::RateLimitExceeded(1000).is_retryable()); + assert!(Error::Timeout("test".to_string()).is_retryable()); + assert!(!Error::Config("test".to_string()).is_retryable()); + } + + #[test] + fn test_is_not_found() { + assert!(Error::NodeNotFound("1".to_string()).is_not_found()); + assert!(Error::DocumentNotFound("doc".to_string()).is_not_found()); + assert!(!Error::Parse("test".to_string()).is_not_found()); + } + + #[test] + fn test_empty_input() { + let err = Error::empty_input("query"); + let msg = format!("{}", err); + assert!(msg.contains("query")); + } + + #[test] + fn test_out_of_range() { + let err = Error::out_of_range("depth", "0", "10", "15"); + let msg = format!("{}", err); + assert!(msg.contains("depth")); + assert!(msg.contains("0")); + assert!(msg.contains("10")); + assert!(msg.contains("15")); + } +} diff --git a/src/index/config.rs b/src/index/config.rs new file mode 100644 index 00000000..55128822 --- /dev/null +++ b/src/index/config.rs @@ -0,0 +1,268 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Configuration types for the index pipeline. +//! +//! This module contains all configuration types used by the indexing pipeline: +//! - [`IndexMode`] - Document format selection +//! - [`PipelineOptions`] - Full pipeline configuration +//! - [`OptimizationConfig`] - Tree optimization settings +//! - [`ThinningConfig`] - Node merging settings + +use crate::config::{ConcurrencyConfig, IndexerConfig}; +use super::summary::SummaryStrategy; + +/// Index mode for document processing. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum IndexMode { + /// Auto-detect format from file extension. + Auto, + /// Force Markdown format. + Markdown, + /// Force PDF format. + Pdf, + /// Force DOCX format. + Docx, + /// Force HTML format. + Html, +} + +impl Default for IndexMode { + fn default() -> Self { + Self::Auto + } +} + +/// Configuration for tree optimization. +#[derive(Debug, Clone)] +pub struct OptimizationConfig { + /// Whether optimization is enabled. + pub enabled: bool, + + /// Maximum tree depth (flatten if exceeded). + pub max_depth: Option, + + /// Maximum children per node (group if exceeded). + pub max_children: Option, + + /// Minimum tokens for a leaf node (merge smaller ones). 
+ pub merge_leaf_threshold: usize, +} + +impl Default for OptimizationConfig { + fn default() -> Self { + Self { + enabled: true, + max_depth: None, + max_children: None, + merge_leaf_threshold: 50, + } + } +} + +impl OptimizationConfig { + /// Create a new optimization config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Disable optimization entirely. + pub fn disabled() -> Self { + Self { + enabled: false, + ..Self::default() + } + } + + /// Set maximum depth. + pub fn with_max_depth(mut self, depth: usize) -> Self { + self.max_depth = Some(depth); + self + } + + /// Set maximum children per node. + pub fn with_max_children(mut self, max: usize) -> Self { + self.max_children = Some(max); + self + } +} + +/// Configuration for thinning (merging small nodes). +#[derive(Debug, Clone)] +pub struct ThinningConfig { + /// Whether thinning is enabled. + pub enabled: bool, + + /// Token threshold for merging. + pub threshold: usize, +} + +impl Default for ThinningConfig { + fn default() -> Self { + Self { + enabled: false, + threshold: 500, + } + } +} + +impl ThinningConfig { + /// Create disabled config. + pub fn disabled() -> Self { + Self::default() + } + + /// Create enabled config with threshold. + pub fn enabled(threshold: usize) -> Self { + Self { + enabled: true, + threshold, + } + } + + /// Set the token threshold. + pub fn with_threshold(mut self, threshold: usize) -> Self { + self.threshold = threshold; + self + } +} + +/// Pipeline options for index execution. +#[derive(Debug, Clone)] +pub struct PipelineOptions { + /// Index mode. + pub mode: IndexMode, + + /// Whether to generate node IDs. + pub generate_ids: bool, + + /// Summary generation strategy. + pub summary_strategy: SummaryStrategy, + + /// Thinning configuration. + pub thinning: ThinningConfig, + + /// Optimization configuration. + pub optimization: OptimizationConfig, + + /// Whether to generate document description. + pub generate_description: bool, + + /// Concurrency configuration. + pub concurrency: ConcurrencyConfig, + + /// Indexer configuration. + pub indexer: IndexerConfig, +} + +impl Default for PipelineOptions { + fn default() -> Self { + Self { + mode: IndexMode::Auto, + generate_ids: true, + summary_strategy: SummaryStrategy::default(), + thinning: ThinningConfig::default(), + optimization: OptimizationConfig::default(), + generate_description: true, + concurrency: ConcurrencyConfig::default(), + indexer: IndexerConfig::default(), + } + } +} + +impl PipelineOptions { + /// Create new pipeline options with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Set the index mode. + pub fn with_mode(mut self, mode: IndexMode) -> Self { + self.mode = mode; + self + } + + /// Set whether to generate node IDs. + pub fn with_generate_ids(mut self, generate: bool) -> Self { + self.generate_ids = generate; + self + } + + /// Set the summary strategy. + pub fn with_summary_strategy(mut self, strategy: SummaryStrategy) -> Self { + self.summary_strategy = strategy; + self + } + + /// Set the thinning configuration. + pub fn with_thinning(mut self, thinning: ThinningConfig) -> Self { + self.thinning = thinning; + self + } + + /// Set the optimization configuration. + pub fn with_optimization(mut self, optimization: OptimizationConfig) -> Self { + self.optimization = optimization; + self + } + + /// Set whether to generate document description. 
+ pub fn with_generate_description(mut self, generate: bool) -> Self { + self.generate_description = generate; + self + } + + /// Set the concurrency configuration. + pub fn with_concurrency(mut self, concurrency: ConcurrencyConfig) -> Self { + self.concurrency = concurrency; + self + } + + /// Set the indexer configuration. + pub fn with_indexer(mut self, indexer: IndexerConfig) -> Self { + self.indexer = indexer; + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_index_mode_default() { + let mode = IndexMode::default(); + assert_eq!(mode, IndexMode::Auto); + } + + #[test] + fn test_optimization_config() { + let config = OptimizationConfig::new() + .with_max_depth(5) + .with_max_children(10); + + assert!(config.enabled); + assert_eq!(config.max_depth, Some(5)); + assert_eq!(config.max_children, Some(10)); + } + + #[test] + fn test_thinning_config() { + let config = ThinningConfig::enabled(300); + assert!(config.enabled); + assert_eq!(config.threshold, 300); + + let disabled = ThinningConfig::disabled(); + assert!(!disabled.enabled); + } + + #[test] + fn test_pipeline_options_builder() { + let options = PipelineOptions::new() + .with_mode(IndexMode::Markdown) + .with_generate_ids(false); + + assert_eq!(options.mode, IndexMode::Markdown); + assert!(!options.generate_ids); + } +} diff --git a/src/index/mod.rs b/src/index/mod.rs index 0eb72f7c..96de34a5 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -23,13 +23,11 @@ //! # Usage //! //! ```rust,ignore -//! use vectorless::domain::index::pipeline::{PipelineExecutor, IndexOptions}; -//! use vectorless::domain::index::summary::SummaryStrategy; +//! use vectorless::index::{PipelineExecutor, IndexInput, PipelineOptions}; +//! use vectorless::index::summary::SummaryStrategy; //! -//! let options = IndexOptions { -//! summary_strategy: SummaryStrategy::selective(100, true), -//! ..Default::default() -//! }; +//! let options = PipelineOptions::new() +//! .with_summary_strategy(SummaryStrategy::selective(100, true)); //! //! let result = PipelineExecutor::new() //! .with_options(options) @@ -37,6 +35,7 @@ //! ``` +pub mod config; pub mod incremental; pub mod pipeline; pub mod stages; @@ -48,6 +47,11 @@ pub use pipeline::{ PipelineExecutor, PipelineOrchestrator, StageResult, StageRetryConfig, }; +// Re-export config types +pub use config::{ + IndexMode, OptimizationConfig, PipelineOptions, ThinningConfig, +}; + // Re-export stages pub use stages::IndexStage; @@ -60,130 +64,5 @@ pub use summary::{ // Re-export incremental pub use incremental::{ChangeDetector, ChangeSet, PartialUpdater}; -// Re-export config types +// Re-export config types from crate config pub use crate::config::{ConcurrencyConfig, IndexerConfig}; - -/// Configuration for tree optimization. -#[derive(Debug, Clone)] -pub struct OptimizationConfig { - /// Whether optimization is enabled. - pub enabled: bool, - - /// Maximum tree depth (flatten if exceeded). - pub max_depth: Option<usize>, - - /// Maximum children per node (group if exceeded). - pub max_children: Option<usize>, - - /// Minimum tokens for a leaf node (merge smaller ones). - pub merge_leaf_threshold: usize, -} - -impl Default for OptimizationConfig { - fn default() -> Self { - Self { - enabled: true, - max_depth: None, - max_children: None, - merge_leaf_threshold: 50, - } - } -} - -/// Configuration for thinning (merging small nodes). -#[derive(Debug, Clone)] -pub struct ThinningConfig { - /// Whether thinning is enabled. - pub enabled: bool, - - /// Token threshold for merging. - pub threshold: usize, -} - -impl Default for ThinningConfig { - fn default() -> Self { - Self { - enabled: false, - threshold: 500, - } - } -} - -impl ThinningConfig { - /// Create disabled config. - pub fn disabled() -> Self { - Self::default() - } - - /// Create enabled config with threshold. - pub fn enabled(threshold: usize) -> Self { - Self { - enabled: true, - threshold, - } - } -} - -/// Index mode. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum IndexMode { - /// Auto-detect format from file extension. - Auto, - /// Force Markdown format. - Markdown, - /// Force PDF format. - Pdf, - /// Force DOCX format. - Docx, - /// Force HTML format. - Html, -} - -impl Default for IndexMode { - fn default() -> Self { - Self::Auto - } -} - -/// Pipeline options (v2). -#[derive(Debug, Clone)] -pub struct PipelineOptions { - /// Index mode. - pub mode: IndexMode, - - /// Whether to generate node IDs. - pub generate_ids: bool, - - /// Summary generation strategy. - pub summary_strategy: SummaryStrategy, - - /// Thinning configuration. - pub thinning: ThinningConfig, - - /// Optimization configuration. - pub optimization: OptimizationConfig, - - /// Whether to generate document description. - pub generate_description: bool, - - /// Concurrency configuration. - pub concurrency: ConcurrencyConfig, - - /// Indexer configuration. - pub indexer: IndexerConfig, -} - -impl Default for PipelineOptions { - fn default() -> Self { - Self { - mode: IndexMode::Auto, - generate_ids: true, - summary_strategy: SummaryStrategy::default(), - thinning: ThinningConfig::default(), - optimization: OptimizationConfig::default(), - generate_description: true, - concurrency: ConcurrencyConfig::default(), - indexer: IndexerConfig::default(), - } - } -} diff --git a/src/storage/cache.rs b/src/storage/cache.rs new file mode 100644 index 00000000..2f8a0067 --- /dev/null +++ b/src/storage/cache.rs @@ -0,0 +1,282 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Document cache with LRU eviction policy. +//! +//! This module provides a thread-safe LRU cache for loaded documents, +//! allowing efficient reuse of loaded document data while limiting memory usage. + +use std::num::NonZeroUsize; +use std::sync::Mutex; + +use lru::LruCache; + +use super::persistence::PersistedDocument; +use crate::error::Result; +use crate::Error; + +/// Default cache size (number of documents). +const DEFAULT_CACHE_SIZE: usize = 100; + +/// A thread-safe LRU cache for documents. +/// +/// Uses interior mutability via `Mutex` for safe concurrent access. +/// The cache automatically evicts least-recently-used entries when full. +#[derive(Debug)] +pub struct DocumentCache { + /// Inner cache protected by Mutex. + inner: Mutex<LruCache<String, PersistedDocument>>, + /// Maximum capacity. + capacity: usize, +} + +impl DocumentCache { + /// Create a new cache with default capacity (100 documents). + #[must_use] + pub fn new() -> Self { + Self::with_capacity(DEFAULT_CACHE_SIZE) + } + + /// Create a new cache with custom capacity. + /// + /// Capacities below 1 are normalized to 1, so this function never panics. + #[must_use] + pub fn with_capacity(capacity: usize) -> Self { + let capacity = capacity.max(1); + let non_zero = NonZeroUsize::new(capacity).unwrap_or_else(|| { + NonZeroUsize::new(DEFAULT_CACHE_SIZE).expect("default is non-zero") + }); + + Self { + inner: Mutex::new(LruCache::new(non_zero)), + capacity, + } + } + + /// Get a document from the cache. + /// + /// Returns `None` if the document is not in the cache. + /// Updates the access order (moves to most-recently-used). + /// + /// # Errors + /// + /// Returns an error if the cache lock is poisoned. + pub fn get(&self, id: &str) -> Result<Option<PersistedDocument>> { + let mut cache = self.lock()?; + Ok(cache.get(id).cloned()) + } + + /// Check if a document is in the cache. + pub fn contains(&self, id: &str) -> bool { + self.lock() + .map(|cache| cache.contains(id)) + .unwrap_or(false) + } + + /// Put a document into the cache. + /// + /// If the cache is full and the key is new, the least-recently-used entry + /// is evicted. Returns the previous value if the key was already present. + /// + /// # Errors + /// + /// Returns an error if the cache lock is poisoned. + pub fn put(&self, id: String, doc: PersistedDocument) -> Result<Option<PersistedDocument>> { + let mut cache = self.lock()?; + Ok(cache.put(id, doc)) + } + + /// Remove a document from the cache. + /// + /// Returns the removed document if it was in the cache. + /// + /// # Errors + /// + /// Returns an error if the cache lock is poisoned. + pub fn remove(&self, id: &str) -> Result<Option<PersistedDocument>> { + let mut cache = self.lock()?; + Ok(cache.pop(id)) + } + + /// Clear all entries from the cache. + /// + /// # Errors + /// + /// Returns an error if the cache lock is poisoned. + pub fn clear(&self) -> Result<()> { + let mut cache = self.lock()?; + cache.clear(); + Ok(()) + } + + /// Get the number of entries currently in the cache. + pub fn len(&self) -> usize { + self.lock() + .map(|cache| cache.len()) + .unwrap_or(0) + } + + /// Check if the cache is empty. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Get the maximum capacity of the cache. + pub fn capacity(&self) -> usize { + self.capacity + } + + /// Get cache utilization (0.0 to 1.0). + pub fn utilization(&self) -> f64 { + let len = self.len(); + if self.capacity == 0 { + return 0.0; + } + len as f64 / self.capacity as f64 + } + + /// Get all document IDs currently in the cache. + /// + /// # Errors + /// + /// Returns an error if the cache lock is poisoned. + pub fn keys(&self) -> Result<Vec<String>> { + let cache = self.lock()?; + Ok(cache.iter().map(|(k, _)| k.clone()).collect()) + } + + /// Get cache statistics. + pub fn stats(&self) -> CacheStats { + CacheStats { + len: self.len(), + capacity: self.capacity, + utilization: self.utilization(), + } + } + + /// Lock the inner cache. + fn lock(&self) -> Result<std::sync::MutexGuard<'_, LruCache<String, PersistedDocument>>> { + self.inner.lock().map_err(|_| { + Error::Cache("Cache lock poisoned".to_string()) + }) + } +} + +impl Default for DocumentCache { + fn default() -> Self { + Self::new() + } +} + +/// Cache statistics. +#[derive(Debug, Clone, Copy)] +pub struct CacheStats { + /// Number of entries in cache. + pub len: usize, + /// Maximum capacity. + pub capacity: usize, + /// Utilization (0.0 to 1.0).
+ pub utilization: f64, +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::{DocumentMeta, PersistedDocument}; + use crate::document::DocumentTree; + + fn create_test_doc(id: &str) -> PersistedDocument { + let meta = DocumentMeta::new(id, "Test Doc", "md"); + let tree = DocumentTree::new("Root", "Content"); + PersistedDocument::new(meta, tree) + } + + #[test] + fn test_cache_basic() { + let cache = DocumentCache::with_capacity(3); + + // Add documents + let doc1 = create_test_doc("doc1"); + let doc2 = create_test_doc("doc2"); + + cache.put("doc1".to_string(), doc1.clone()).unwrap(); + cache.put("doc2".to_string(), doc2.clone()).unwrap(); + + assert_eq!(cache.len(), 2); + assert!(cache.contains("doc1")); + assert!(cache.contains("doc2")); + } + + #[test] + fn test_cache_get() { + let cache = DocumentCache::with_capacity(3); + let doc = create_test_doc("doc1"); + + cache.put("doc1".to_string(), doc).unwrap(); + + let retrieved = cache.get("doc1").unwrap(); + assert!(retrieved.is_some()); + assert_eq!(retrieved.unwrap().meta.id, "doc1"); + + let missing = cache.get("missing").unwrap(); + assert!(missing.is_none()); + } + + #[test] + fn test_cache_eviction() { + let cache = DocumentCache::with_capacity(2); + + cache.put("doc1".to_string(), create_test_doc("doc1")).unwrap(); + cache.put("doc2".to_string(), create_test_doc("doc2")).unwrap(); + cache.put("doc3".to_string(), create_test_doc("doc3")).unwrap(); + + // doc1 should be evicted (least recently used) + assert!(!cache.contains("doc1")); + assert!(cache.contains("doc2")); + assert!(cache.contains("doc3")); + } + + #[test] + fn test_cache_remove() { + let cache = DocumentCache::new(); + + cache.put("doc1".to_string(), create_test_doc("doc1")).unwrap(); + assert!(cache.contains("doc1")); + + let removed = cache.remove("doc1").unwrap(); + assert!(removed.is_some()); + assert!(!cache.contains("doc1")); + + let not_found = cache.remove("missing").unwrap(); + assert!(not_found.is_none()); + } + + #[test] + fn test_cache_clear() { + let cache = DocumentCache::new(); + + cache.put("doc1".to_string(), create_test_doc("doc1")).unwrap(); + cache.put("doc2".to_string(), create_test_doc("doc2")).unwrap(); + + assert_eq!(cache.len(), 2); + + cache.clear().unwrap(); + + assert!(cache.is_empty()); + } + + #[test] + fn test_cache_utilization() { + let cache = DocumentCache::with_capacity(10); + + assert_eq!(cache.utilization(), 0.0); + + cache.put("doc1".to_string(), create_test_doc("doc1")).unwrap(); + assert!((cache.utilization() - 0.1).abs() < 0.01); + + cache.put("doc2".to_string(), create_test_doc("doc2")).unwrap(); + assert!((cache.utilization() - 0.2).abs() < 0.01); + } +} diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 0d07d143..d5adc212 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -6,12 +6,13 @@ //! This module provides: //! - **Workspace** — A directory-based document collection manager with LRU cache //! - **Persistence** — Save/load document trees and metadata +//! - **Cache** — LRU cache for loaded documents //! //! # Example //! //! ```rust,no_run //! use vectorless::storage::{Workspace, PersistedDocument, DocumentMeta}; -//! use vectorless::domain::DocumentTree; +//! use vectorless::document::DocumentTree; //! //! // Create a workspace //! let mut workspace = Workspace::new("./my_workspace")?; @@ -26,13 +27,14 @@ //! let loaded = workspace.load("doc-1")?.unwrap(); //! 
``` +pub mod cache; mod persistence; mod workspace; // Re-export main types +pub use cache::DocumentCache; pub use persistence::{ DocumentMeta, PageContent, PersistedDocument, load_document, load_index, save_document, save_index, }; - pub use workspace::{DocumentMetaEntry, Workspace}; diff --git a/src/storage/workspace.rs b/src/storage/workspace.rs index 530ad272..1ad67d98 100644 --- a/src/storage/workspace.rs +++ b/src/storage/workspace.rs @@ -26,17 +26,15 @@ use std::collections::HashMap; use std::fs; -use std::num::NonZeroUsize; use std::path::{Path, PathBuf}; -use std::sync::Mutex; -use lru::LruCache; use serde::{Deserialize, Serialize}; use tracing::{debug, info, warn}; +use super::cache::DocumentCache; use super::persistence::{PersistedDocument, load_document, save_document}; -use crate::{Error}; use crate::error::Result; +use crate::Error; const META_FILE: &str = "_meta.json"; const DEFAULT_CACHE_SIZE: usize = 100; @@ -64,13 +62,6 @@ pub struct DocumentMetaEntry { pub line_count: Option<usize>, } -/// Inner state for Workspace (separated for interior mutability). -#[derive(Debug)] -struct Inner { - /// LRU cache for loaded full documents. - document_cache: LruCache<String, PersistedDocument>, -} - /// A workspace for managing indexed documents. /// /// Uses LRU cache for loaded documents to balance memory usage @@ -85,8 +76,8 @@ pub struct Workspace { /// This is always loaded in memory. meta_index: HashMap<String, DocumentMetaEntry>, - /// Inner state with LRU cache (protected by Mutex for interior mutability). - inner: Mutex<Inner>, + /// LRU cache for loaded documents. + cache: DocumentCache, } impl Workspace { @@ -100,15 +91,10 @@ let root = path.into(); fs::create_dir_all(&root).map_err(Error::Io)?; - let capacity = NonZeroUsize::new(cache_size.max(1)) - .unwrap_or_else(|| NonZeroUsize::new(DEFAULT_CACHE_SIZE).unwrap()); - let mut workspace = Self { root, meta_index: HashMap::new(), - inner: Mutex::new(Inner { - document_cache: LruCache::new(capacity), - }), + cache: DocumentCache::with_capacity(cache_size), }; workspace.load_meta_index()?; @@ -127,15 +113,10 @@ ) -> Result<Self> { let root = path.clone().into(); if root.exists() { - let capacity = NonZeroUsize::new(cache_size.max(1)) - .unwrap_or_else(|| NonZeroUsize::new(DEFAULT_CACHE_SIZE).unwrap()); - let mut workspace = Self { root, meta_index: HashMap::new(), - inner: Mutex::new(Inner { - document_cache: LruCache::new(capacity), - }), + cache: DocumentCache::with_capacity(cache_size), }; workspace.load_meta_index()?; Ok(workspace) @@ -186,17 +167,15 @@ .source_path .as_ref() .map(|p| p.to_string_lossy().to_string()), - page_count: doc.pages.first().map(|p| p.page), - line_count: None, // TODO: track this + page_count: if doc.pages.is_empty() { None } else { Some(doc.pages.len()) }, + line_count: doc.meta.line_count, }; self.meta_index.insert(doc_id.clone(), meta_entry); self.save_meta_index()?; // Remove from cache if present (will lazy load on next access) - if let Ok(mut inner) = self.inner.lock() { - inner.document_cache.pop(&doc_id); - } + let _ = self.cache.remove(&doc_id); info!("Saved document {} to workspace", doc_id); Ok(()) @@ -213,20 +192,13 @@ return Ok(None); } - // Check LRU cache first (with lock) - { - let mut inner = self - .inner - .lock() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - - if let Some(cached) = inner.document_cache.get(id) { - debug!("Cache hit for document {}", id); - return Ok(Some(cached.clone())); - } + // Check LRU cache first + if let Some(cached) =
self.cache.get(id)? { + debug!("Cache hit for document {}", id); + return Ok(Some(cached)); } - // Load from disk (lock released during I/O) + // Load from disk let doc_path = self.document_path(id); if !doc_path.exists() { warn!("Document {} in meta index but file missing", id); @@ -235,14 +207,8 @@ impl Workspace { let doc = load_document(&doc_path)?; - // Add to LRU cache (with lock) - { - let mut inner = self - .inner - .lock() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - inner.document_cache.put(id.to_string(), doc.clone()); - } + // Add to LRU cache + self.cache.put(id.to_string(), doc.clone())?; debug!("Loaded document {} from disk (cached)", id); Ok(Some(doc)) @@ -262,9 +228,7 @@ impl Workspace { self.meta_index.remove(id); // Remove from cache - if let Ok(mut inner) = self.inner.lock() { - inner.document_cache.pop(id); - } + let _ = self.cache.remove(id); self.save_meta_index()?; @@ -284,18 +248,19 @@ impl Workspace { /// Get the number of items currently in the LRU cache. pub fn cache_len(&self) -> usize { - self.inner - .lock() - .map(|inner| inner.document_cache.len()) - .unwrap_or(0) + self.cache.len() + } + + /// Get cache utilization (0.0 to 1.0). + pub fn cache_utilization(&self) -> f64 { + self.cache.utilization() } /// Clear the LRU cache (does not remove documents from workspace). - pub fn clear_cache(&self) { - if let Ok(mut inner) = self.inner.lock() { - inner.document_cache.clear(); - debug!("Cleared document cache"); - } + pub fn clear_cache(&self) -> Result<()> { + self.cache.clear()?; + debug!("Cleared document cache"); + Ok(()) } /// Get the path for a document file. @@ -372,8 +337,8 @@ impl Workspace { .source_path .as_ref() .map(|p| p.to_string_lossy().to_string()), - page_count: doc.pages.first().map(|p| p.page), - line_count: None, + page_count: if doc.pages.is_empty() { None } else { Some(doc.pages.len()) }, + line_count: doc.meta.line_count, }; (doc_id, meta_entry) }) @@ -395,3 +360,44 @@ impl Workspace { Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn test_workspace_create() { + let temp = TempDir::new().unwrap(); + let workspace = Workspace::new(temp.path()).unwrap(); + + assert!(workspace.is_empty()); + assert_eq!(workspace.len(), 0); + } + + #[test] + fn test_workspace_open() { + let temp = TempDir::new().unwrap(); + let path = temp.path().join("workspace"); + + // Create new + let workspace = Workspace::open(&path).unwrap(); + assert!(workspace.is_empty()); + + // Reopen existing + let workspace2 = Workspace::open(&path).unwrap(); + assert!(workspace2.is_empty()); + } + + #[test] + fn test_workspace_cache_operations() { + let temp = TempDir::new().unwrap(); + let workspace = Workspace::with_cache_size(temp.path(), 5).unwrap(); + + assert_eq!(workspace.cache_len(), 0); + assert_eq!(workspace.cache.utilization(), 0.0); + + workspace.clear_cache().unwrap(); + assert_eq!(workspace.cache_len(), 0); + } +} diff --git a/src/util/format.rs b/src/util/format.rs new file mode 100644 index 00000000..059b9ed6 --- /dev/null +++ b/src/util/format.rs @@ -0,0 +1,212 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Text formatting utilities. + +/// Truncate text to a maximum length with ellipsis. 
+/// +/// # Example +/// +/// ``` +/// use vectorless::util::truncate; +/// +/// assert_eq!(truncate("hello world", 8), "hello..."); +/// assert_eq!(truncate("hi", 10), "hi"); +/// ``` +pub fn truncate(text: &str, max_len: usize) -> String { + if text.len() <= max_len { + return text.to_string(); + } + + if max_len <= 3 { + return ".".repeat(max_len); + } + + // Back off to a char boundary so slicing never panics on multi-byte text. + let mut cut = max_len - 3; + while !text.is_char_boundary(cut) { + cut -= 1; + } + format!("{}...", &text[..cut]) +} + +/// Truncate text to a maximum length, respecting word boundaries. +pub fn truncate_words(text: &str, max_len: usize) -> String { + if text.len() <= max_len { + return text.to_string(); + } + + if max_len <= 3 { + return ".".repeat(max_len); + } + + // Find a good break point (on a char boundary, so slicing never panics) + let mut cut = max_len - 3; + while !text.is_char_boundary(cut) { + cut -= 1; + } + let truncated = &text[..cut]; + + // Try to break at a word boundary + if let Some(last_space) = truncated.rfind(' ') { + if last_space > max_len / 2 { + return format!("{}...", &truncated[..last_space]); + } + } + + format!("{}...", truncated) +} + +/// Format a number with thousand separators. +/// +/// # Example +/// +/// ``` +/// use vectorless::util::format_number; +/// +/// assert_eq!(format_number(1000), "1,000"); +/// assert_eq!(format_number(1234567), "1,234,567"); +/// ``` +pub fn format_number(n: usize) -> String { + let s = n.to_string(); + let mut result = String::new(); + let chars: Vec<char> = s.chars().collect(); + + for (i, c) in chars.iter().enumerate() { + if i > 0 && (chars.len() - i) % 3 == 0 { + result.push(','); + } + result.push(*c); + } + + result +} + +/// Format bytes for human-readable display. +/// +/// # Example +/// +/// ``` +/// use vectorless::util::format_bytes; +/// +/// assert_eq!(format_bytes(500), "500 B"); +/// assert_eq!(format_bytes(1024), "1.0 KB"); +/// assert_eq!(format_bytes(1536), "1.5 KB"); +/// assert_eq!(format_bytes(1048576), "1.0 MB"); +/// ``` +pub fn format_bytes(bytes: usize) -> String { + const KB: usize = 1024; + const MB: usize = KB * 1024; + const GB: usize = MB * 1024; + + if bytes >= GB { + format!("{:.1} GB", bytes as f64 / GB as f64) + } else if bytes >= MB { + format!("{:.1} MB", bytes as f64 / MB as f64) + } else if bytes >= KB { + format!("{:.1} KB", bytes as f64 / KB as f64) + } else { + format!("{} B", bytes) + } +} + +/// Format a percentage. +/// +/// # Example +/// +/// ``` +/// use vectorless::util::format_percent; +/// +/// assert_eq!(format_percent(0.5), "50.0%"); +/// assert_eq!(format_percent(0.123), "12.3%"); +/// ``` +pub fn format_percent(value: f32) -> String { + format!("{:.1}%", value * 100.0) +} + +/// Clean whitespace in text (collapse multiple spaces, trim). +pub fn clean_whitespace(text: &str) -> String { + text.split_whitespace().collect::<Vec<_>>().join(" ") +} + +/// Indent each line of text. +pub fn indent(text: &str, spaces: usize) -> String { + let indent_str = " ".repeat(spaces); + text.lines() + .map(|line| format!("{}{}", indent_str, line)) + .collect::<Vec<_>>() + .join("\n") +} + +/// Count words in text. +pub fn word_count(text: &str) -> usize { + text.split_whitespace().count() +} + +/// Count lines in text.
+pub fn line_count(text: &str) -> usize { + if text.is_empty() { + return 0; + } + text.chars().filter(|&c| c == '\n').count() + 1 +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_truncate() { + assert_eq!(truncate("hello", 10), "hello"); + assert_eq!(truncate("hello world", 8), "hello..."); + assert_eq!(truncate("hi", 3), "hi"); + } + + #[test] + fn test_truncate_words() { + // "hello world foo" with max_len=12: + // truncated = "hello wor" (9 chars), last_space at 5 + // 5 > 12/2 is false, so no word boundary break + assert_eq!(truncate_words("hello world foo", 12), "hello wor..."); + // Word boundary break happens when space is past halfway + assert_eq!(truncate_words("hello world foo bar", 15), "hello world..."); + assert_eq!(truncate_words("hello", 10), "hello"); + } + + #[test] + fn test_format_number() { + assert_eq!(format_number(100), "100"); + assert_eq!(format_number(1000), "1,000"); + assert_eq!(format_number(1234567), "1,234,567"); + } + + #[test] + fn test_format_bytes() { + assert_eq!(format_bytes(500), "500 B"); + assert_eq!(format_bytes(1024), "1.0 KB"); + assert_eq!(format_bytes(1536), "1.5 KB"); + assert_eq!(format_bytes(1048576), "1.0 MB"); + } + + #[test] + fn test_format_percent() { + assert_eq!(format_percent(0.5), "50.0%"); + assert_eq!(format_percent(1.0), "100.0%"); + } + + #[test] + fn test_clean_whitespace() { + assert_eq!(clean_whitespace("  hello   world  "), "hello world"); + assert_eq!(clean_whitespace("single"), "single"); + } + + #[test] + fn test_indent() { + assert_eq!(indent("hello\nworld", 2), "  hello\n  world"); + } + + #[test] + fn test_word_count() { + assert_eq!(word_count("hello world"), 2); + assert_eq!(word_count("  hello   world  "), 2); + assert_eq!(word_count(""), 0); + } + + #[test] + fn test_line_count() { + assert_eq!(line_count("hello\nworld"), 2); + assert_eq!(line_count("single"), 1); + assert_eq!(line_count(""), 0); + } +} diff --git a/src/util/mod.rs b/src/util/mod.rs index 9d22d295..9ec7184e 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -2,7 +2,20 @@ // SPDX-License-Identifier: Apache-2.0 //! Utility functions and helpers. +//! +//! This module provides common utilities used across the codebase: +//! +//! - **Token estimation** — Fast and accurate token counting +//! - **Timing** — Performance measurement utilities +//! - **Format** — Text and number formatting utilities +mod format; +mod timing; mod token; +pub use format::{ + clean_whitespace, format_bytes, format_number, format_percent, indent, line_count, + truncate, truncate_words, word_count, +}; +pub use timing::{format_duration, format_duration_compact, Timer}; pub use token::{estimate_tokens, estimate_tokens_batch, estimate_tokens_fast}; diff --git a/src/util/timing.rs b/src/util/timing.rs new file mode 100644 index 00000000..5b3cabb9 --- /dev/null +++ b/src/util/timing.rs @@ -0,0 +1,159 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Timing and performance measurement utilities. + +use std::time::{Duration, Instant}; + +/// A simple timing guard that records elapsed time on drop. +/// +/// # Example +/// +/// ```rust +/// use vectorless::util::Timer; +/// +/// let timer = Timer::start("indexing"); +/// // ... do work ... +/// drop(timer); // Logs elapsed time +/// ``` +#[derive(Debug)] +pub struct Timer { + label: String, + start: Instant, + log_on_drop: bool, +} + +impl Timer { + /// Create and start a new timer. + pub fn start(label: impl Into<String>) -> Self { + Self { + label: label.into(), + start: Instant::now(), + log_on_drop: true, + } + } + + /// Create a silent timer (doesn't log on drop). + pub fn silent() -> Self { + Self { + label: String::new(), + start: Instant::now(), + log_on_drop: false, + } + } + + /// Get the elapsed time without stopping. + pub fn elapsed(&self) -> Duration { + self.start.elapsed() + } + + /// Get elapsed time in milliseconds. + pub fn elapsed_ms(&self) -> u64 { + self.elapsed().as_millis() as u64 + } + + /// Get elapsed time in seconds. + pub fn elapsed_secs(&self) -> f64 { + self.elapsed().as_secs_f64() + } + + /// Stop the timer and return the elapsed duration. + pub fn stop(mut self) -> Duration { + let elapsed = self.elapsed(); + if self.log_on_drop { + // Clear the flag so the Drop impl does not log a second time. + self.log_on_drop = false; + tracing::debug!( + "{} completed in {:.2}ms", + self.label, + elapsed.as_secs_f64() * 1000.0 + ); + } + elapsed + } + + /// Stop the timer and return elapsed milliseconds. + pub fn stop_ms(self) -> u64 { + self.stop().as_millis() as u64 + } + + /// Disable logging on drop. + pub fn silent_on_drop(mut self) -> Self { + self.log_on_drop = false; + self + } + + /// Reset the timer. + pub fn reset(&mut self) { + self.start = Instant::now(); + } +} + +impl Drop for Timer { + fn drop(&mut self) { + if self.log_on_drop { + let elapsed = self.elapsed(); + tracing::debug!( + "{} completed in {:.2}ms", + self.label, + elapsed.as_secs_f64() * 1000.0 + ); + } + } +} + +/// Format a duration for human-readable display. +pub fn format_duration(duration: Duration) -> String { + let total_ms = duration.as_millis(); + + if total_ms < 1000 { + format!("{}ms", total_ms) + } else if total_ms < 60_000 { + format!("{:.2}s", duration.as_secs_f64()) + } else { + let secs = duration.as_secs(); + let mins = secs / 60; + let remaining_secs = secs % 60; + format!("{}m {}s", mins, remaining_secs) + } +} + +/// Format a duration as a compact string.
+pub fn format_duration_compact(duration: Duration) -> String { + let total_ms = duration.as_millis(); + + if total_ms < 1000 { + format!("{}ms", total_ms) + } else if total_ms < 60_000 { + format!("{:.1}s", duration.as_secs_f64()) + } else { + let mins = duration.as_secs() / 60; + let secs = duration.as_secs() % 60; + format!("{}:{:02}", mins, secs) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_timer_elapsed() { + let timer = Timer::silent(); + std::thread::sleep(std::time::Duration::from_millis(10)); + let elapsed = timer.elapsed(); + assert!(elapsed.as_millis() >= 10); + } + + #[test] + fn test_format_duration() { + assert_eq!(format_duration(Duration::from_millis(500)), "500ms"); + assert_eq!(format_duration(Duration::from_millis(1500)), "1.50s"); + assert_eq!(format_duration(Duration::from_secs(90)), "1m 30s"); + } + + #[test] + fn test_format_duration_compact() { + assert_eq!(format_duration_compact(Duration::from_millis(500)), "500ms"); + assert_eq!(format_duration_compact(Duration::from_millis(1500)), "1.5s"); + assert_eq!(format_duration_compact(Duration::from_secs(90)), "1:30"); + } +} From d035d6b79e386e98e1c801f9f2fd98abc298ca09 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Sun, 5 Apr 2026 13:02:17 +0800 Subject: [PATCH 3/8] feat(storage): add file locking, checksum verification, and atomic writes Add comprehensive storage enhancements including: - File locking mechanism for multi-process safety using platform-specific implementations (flock on Unix, LockFileEx on Windows) - SHA-256 checksum verification for data integrity - Atomic writes using temp file + rename pattern to prevent corruption - New CompressionConfig and CompressionAlgorithm for optional compression - Enhanced StorageConfig with cache_size, atomic_writes, file_lock, checksum_enabled, and compression settings - Persistence options for configurable save/load behavior - Updated error types including ChecksumMismatch, WorkspaceLocked, and VersionMismatch - Test improvements using temporary directories and WorkspaceOptions New dependencies added: sha2 for checksums, libc for Unix file locking. BREAKING CHANGE: Storage format now includes checksums and version headers. 
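A rough usage sketch of the new options (illustrative only: `WorkspaceOptions` and `open_with_options` are taken from the updated tests below, and only the `file_lock` field is exercised there):

```rust
use vectorless::storage::{Workspace, WorkspaceOptions};

fn open_test_workspace(path: &std::path::Path) -> vectorless::error::Result<Workspace> {
    // Single-process test runs can opt out of cross-process locking;
    // atomic writes and checksum verification keep their defaults.
    let options = WorkspaceOptions {
        file_lock: false,
        ..Default::default()
    };
    Workspace::open_with_options(path, options)
}
```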
--- Cargo.toml | 7 + src/client/workspace.rs | 16 +- src/config/mod.rs | 3 +- src/config/types/mod.rs | 3 +- src/config/types/storage.rs | 183 +++++++++++++++ src/error.rs | 44 ++-- src/storage/cache.rs | 96 +++++++- src/storage/lock.rs | 271 ++++++++++++++++++++++ src/storage/mod.rs | 13 +- src/storage/persistence.rs | 434 ++++++++++++++++++++++++++++++++++-- src/storage/workspace.rs | 88 +++++++- vectorless.example.toml | 26 +++ 12 files changed, 1129 insertions(+), 55 deletions(-) create mode 100644 src/storage/lock.rs diff --git a/Cargo.toml b/Cargo.toml index 12167d65..49505200 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -59,6 +59,13 @@ indextree = { version = "4.8.0", features = ["deser"] } # LRU cache lru = "0.12" +# Checksum +sha2 = "0.10" + +# File locking (Unix) +[target.'cfg(unix)'.dependencies] +libc = "0.2" + # PDF processing pdf-extract = "0.10.0" lopdf = "0.34" diff --git a/src/client/workspace.rs b/src/client/workspace.rs index feb4116d..c5525bfa 100644 --- a/src/client/workspace.rs +++ b/src/client/workspace.rs @@ -354,17 +354,29 @@ pub struct WorkspaceStats { #[cfg(test)] mod tests { use super::*; + use tempfile::TempDir; + use crate::storage::WorkspaceOptions; #[test] fn test_workspace_client_creation() { - let workspace = Workspace::open("./test_workspace").unwrap(); + let temp = TempDir::new().unwrap(); + let options = WorkspaceOptions { + file_lock: false, + ..Default::default() + }; + let workspace = Workspace::open_with_options(temp.path(), options).unwrap(); let client = WorkspaceClient::new(workspace); assert!(client.is_empty()); } #[test] fn test_workspace_stats() { - let workspace = Workspace::open("./test_workspace").unwrap(); + let temp = TempDir::new().unwrap(); + let options = WorkspaceOptions { + file_lock: false, + ..Default::default() + }; + let workspace = Workspace::open_with_options(temp.path(), options).unwrap(); let client = WorkspaceClient::new(workspace); let stats = client.stats().unwrap(); diff --git a/src/config/mod.rs b/src/config/mod.rs index 98ad2e8a..d821332a 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -87,7 +87,8 @@ pub use types::{ // Retrieval configs RetrievalConfig, SearchConfig, // Storage and sufficiency - StorageConfig, CacheConfig, StrategyConfig, SufficiencyConfig, + StorageConfig, CompressionAlgorithm, CompressionConfig, + CacheConfig, StrategyConfig, SufficiencyConfig, // Content aggregator ContentAggregatorConfig, // Concurrency diff --git a/src/config/types/mod.rs b/src/config/types/mod.rs index a824ee3f..39536156 100644 --- a/src/config/types/mod.rs +++ b/src/config/types/mod.rs @@ -23,7 +23,8 @@ pub use indexer::IndexerConfig; pub use llm::{LlmConfig, SummaryConfig}; pub use retrieval::{RetrievalConfig, SearchConfig}; pub use storage::{ - CacheConfig, StorageConfig, StrategyConfig, SufficiencyConfig, + CacheConfig, CompressionAlgorithm, CompressionConfig, + StorageConfig, StrategyConfig, SufficiencyConfig, }; /// Main configuration for vectorless. diff --git a/src/config/types/storage.rs b/src/config/types/storage.rs index 0dc55ed9..562c7ba3 100644 --- a/src/config/types/storage.rs +++ b/src/config/types/storage.rs @@ -12,16 +12,58 @@ pub struct StorageConfig { /// Workspace directory for persisted documents. #[serde(default = "default_workspace_dir")] pub workspace_dir: PathBuf, + + /// LRU cache size (number of documents). + #[serde(default = "default_cache_size")] + pub cache_size: usize, + + /// Enable atomic writes (write to temp file, then rename). + /// This prevents data corruption on crash. 
+ #[serde(default = "default_atomic_writes")] + pub atomic_writes: bool, + + /// Enable file locking for multi-process safety. + #[serde(default = "default_file_lock")] + pub file_lock: bool, + + /// Enable checksum verification for data integrity. + #[serde(default = "default_checksum_enabled")] + pub checksum_enabled: bool, + + /// Enable compression for stored documents. + #[serde(default)] + pub compression: CompressionConfig, } fn default_workspace_dir() -> PathBuf { PathBuf::from("./workspace") } +fn default_cache_size() -> usize { + 100 +} + +fn default_atomic_writes() -> bool { + true +} + +fn default_file_lock() -> bool { + true +} + +fn default_checksum_enabled() -> bool { + true +} + impl Default for StorageConfig { fn default() -> Self { Self { workspace_dir: default_workspace_dir(), + cache_size: default_cache_size(), + atomic_writes: default_atomic_writes(), + file_lock: default_file_lock(), + checksum_enabled: default_checksum_enabled(), + compression: CompressionConfig::default(), } } } @@ -37,6 +79,109 @@ impl StorageConfig { self.workspace_dir = dir.into(); self } + + /// Set the cache size. + pub fn with_cache_size(mut self, size: usize) -> Self { + self.cache_size = size; + self + } + + /// Enable or disable atomic writes. + pub fn with_atomic_writes(mut self, enabled: bool) -> Self { + self.atomic_writes = enabled; + self + } + + /// Enable or disable file locking. + pub fn with_file_lock(mut self, enabled: bool) -> Self { + self.file_lock = enabled; + self + } + + /// Enable or disable checksum verification. + pub fn with_checksum(mut self, enabled: bool) -> Self { + self.checksum_enabled = enabled; + self + } + + /// Set compression configuration. + pub fn with_compression(mut self, compression: CompressionConfig) -> Self { + self.compression = compression; + self + } +} + +/// Compression configuration. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CompressionConfig { + /// Enable compression. + #[serde(default = "default_compression_enabled")] + pub enabled: bool, + + /// Compression algorithm. + #[serde(default = "default_compression_algorithm")] + pub algorithm: CompressionAlgorithm, + + /// Compression level (1-9, higher = better compression but slower). + #[serde(default = "default_compression_level")] + pub level: u32, +} + +fn default_compression_enabled() -> bool { + false +} + +fn default_compression_algorithm() -> CompressionAlgorithm { + CompressionAlgorithm::Gzip +} + +fn default_compression_level() -> u32 { + 6 +} + +impl Default for CompressionConfig { + fn default() -> Self { + Self { + enabled: default_compression_enabled(), + algorithm: default_compression_algorithm(), + level: default_compression_level(), + } + } +} + +impl CompressionConfig { + /// Create new compression config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Enable or disable compression. + pub fn with_enabled(mut self, enabled: bool) -> Self { + self.enabled = enabled; + self + } + + /// Set the compression algorithm. + pub fn with_algorithm(mut self, algorithm: CompressionAlgorithm) -> Self { + self.algorithm = algorithm; + self + } + + /// Set the compression level. + pub fn with_level(mut self, level: u32) -> Self { + self.level = level.clamp(1, 9); + self + } +} + +/// Compression algorithm. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum CompressionAlgorithm { + /// Gzip compression. + Gzip, + /// Zstandard compression. + Zstd, } /// Sufficiency checker configuration. 
@@ -248,6 +393,44 @@ mod tests { fn test_storage_config_defaults() { let config = StorageConfig::default(); assert_eq!(config.workspace_dir, PathBuf::from("./workspace")); + assert_eq!(config.cache_size, 100); + assert!(config.atomic_writes); + assert!(config.file_lock); + assert!(config.checksum_enabled); + assert!(!config.compression.enabled); + } + + #[test] + fn test_storage_config_builders() { + let config = StorageConfig::new() + .with_workspace_dir("/data/workspace") + .with_cache_size(200) + .with_atomic_writes(false) + .with_file_lock(false) + .with_checksum(false); + + assert_eq!(config.workspace_dir, PathBuf::from("/data/workspace")); + assert_eq!(config.cache_size, 200); + assert!(!config.atomic_writes); + assert!(!config.file_lock); + assert!(!config.checksum_enabled); + } + + #[test] + fn test_compression_config_defaults() { + let config = CompressionConfig::default(); + assert!(!config.enabled); + assert_eq!(config.algorithm, CompressionAlgorithm::Gzip); + assert_eq!(config.level, 6); + } + + #[test] + fn test_compression_config_level_clamp() { + let config = CompressionConfig::new().with_level(15); + assert_eq!(config.level, 9); // clamped to max + + let config = CompressionConfig::new().with_level(0); + assert_eq!(config.level, 1); // clamped to min } #[test] diff --git a/src/error.rs b/src/error.rs index d9b8e7d5..615dd671 100644 --- a/src/error.rs +++ b/src/error.rs @@ -103,34 +103,34 @@ pub enum Error { #[error("Cache error: {0}")] Cache(String), - // ========================================================================= - // Serialization Errors - // ========================================================================= - - /// An error occurred during serialization/deserialization. + /// Serialization error. #[error("Serialization error: {0}")] - Serialization(#[from] serde_json::Error), + Serialization(String), - /// TOML parsing error. - #[error("TOML parsing error: {0}")] - Toml(String), + /// Document not found. + #[error("Document not found: {0}")] + DocumentNotFound(String), - // ========================================================================= - // Node & Document Errors - // ========================================================================= + /// Checksum mismatch. + #[error("Checksum mismatch: {0}")] + ChecksumMismatch(String), - /// The requested node was not found. - #[error("Node not found: {0}")] - NodeNotFound(String), + /// Workspace locked by another process. + #[error("Workspace locked by another process")] + WorkspaceLocked, - /// The requested document was not found. - #[error("Document not found: {0}")] - DocumentNotFound(String), + /// Format version mismatch. + #[error("Format version mismatch: {0}")] + VersionMismatch(String), // ========================================================================= // Configuration Errors // ========================================================================= + /// TOML parsing error. + #[error("TOML parsing error: {0}")] + Toml(String), + /// Invalid configuration. #[error("Invalid configuration: {0}")] Config(String), @@ -139,6 +139,14 @@ pub enum Error { #[error("Missing required configuration: {0}")] MissingConfig(String), + // ========================================================================= + // Node Errors + // ========================================================================= + + /// The requested node was not found. 
+ #[error("Node not found: {0}")] + NodeNotFound(String), + // ========================================================================= // Input Validation Errors // ========================================================================= diff --git a/src/storage/cache.rs b/src/storage/cache.rs index 2f8a0067..4e7e6a57 100644 --- a/src/storage/cache.rs +++ b/src/storage/cache.rs @@ -5,8 +5,17 @@ //! //! This module provides a thread-safe LRU cache for loaded documents, //! allowing efficient reuse of loaded document data while limiting memory usage. +//! +//! # Metrics +//! +//! The cache tracks: +//! - Hits: Number of successful cache lookups +//! - Misses: Number of failed cache lookups +//! - Evictions: Number of entries evicted due to capacity +//! - Utilization: Current usage as percentage of capacity use std::num::NonZeroUsize; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Mutex; use lru::LruCache; @@ -22,12 +31,25 @@ const DEFAULT_CACHE_SIZE: usize = 100; /// /// Uses interior mutability via `Mutex` for safe concurrent access. /// The cache automatically evicts least-recently-used entries when full. +/// +/// # Metrics +/// +/// The cache maintains atomic counters for: +/// - **hits**: Successful cache lookups +/// - **misses**: Failed cache lookups (document not in cache) +/// - **evictions**: Entries removed due to capacity limits #[derive(Debug)] pub struct DocumentCache { /// Inner cache protected by Mutex. inner: Mutex>, /// Maximum capacity. capacity: usize, + /// Number of cache hits. + hits: AtomicU64, + /// Number of cache misses. + misses: AtomicU64, + /// Number of cache evictions. + evictions: AtomicU64, } impl DocumentCache { @@ -52,6 +74,9 @@ impl DocumentCache { Self { inner: Mutex::new(LruCache::new(non_zero)), capacity, + hits: AtomicU64::new(0), + misses: AtomicU64::new(0), + evictions: AtomicU64::new(0), } } @@ -65,7 +90,16 @@ impl DocumentCache { /// Returns an error if the cache lock is poisoned. pub fn get(&self, id: &str) -> Result> { let mut cache = self.lock()?; - Ok(cache.get(id).cloned()) + let result = cache.get(id).cloned(); + + // Update metrics + if result.is_some() { + self.hits.fetch_add(1, Ordering::Relaxed); + } else { + self.misses.fetch_add(1, Ordering::Relaxed); + } + + Ok(result) } /// Check if a document is in the cache. @@ -85,7 +119,18 @@ impl DocumentCache { /// Returns an error if the cache lock is poisoned. pub fn put(&self, id: String, doc: PersistedDocument) -> Result> { let mut cache = self.lock()?; - Ok(cache.put(id, doc)) + + // Track capacity before put to detect eviction + let was_full = cache.len() >= self.capacity; + + let evicted = cache.put(id, doc); + + // Track evictions + if evicted.is_some() || was_full { + self.evictions.fetch_add(1, Ordering::Relaxed); + } + + Ok(evicted) } /// Remove a document from the cache. @@ -147,15 +192,52 @@ impl DocumentCache { Ok(cache.iter().map(|(k, _)| k.clone()).collect()) } - /// Get cache statistics. + /// Get cache statistics including metrics. pub fn stats(&self) -> CacheStats { CacheStats { len: self.len(), capacity: self.capacity, utilization: self.utilization(), + hits: self.hits.load(Ordering::Relaxed), + misses: self.misses.load(Ordering::Relaxed), + evictions: self.evictions.load(Ordering::Relaxed), } } + /// Get the number of cache hits. + pub fn hits(&self) -> u64 { + self.hits.load(Ordering::Relaxed) + } + + /// Get the number of cache misses. + pub fn misses(&self) -> u64 { + self.misses.load(Ordering::Relaxed) + } + + /// Get the number of cache evictions. 
+ pub fn evictions(&self) -> u64 { + self.evictions.load(Ordering::Relaxed) + } + + /// Get the cache hit rate (0.0 to 1.0). + pub fn hit_rate(&self) -> f64 { + let hits = self.hits.load(Ordering::Relaxed); + let misses = self.misses.load(Ordering::Relaxed); + let total = hits + misses; + if total == 0 { + 0.0 + } else { + hits as f64 / total as f64 + } + } + + /// Reset all metrics counters to zero. + pub fn reset_metrics(&self) { + self.hits.store(0, Ordering::Relaxed); + self.misses.store(0, Ordering::Relaxed); + self.evictions.store(0, Ordering::Relaxed); + } + /// Lock the inner cache. fn lock(&self) -> Result>> { self.inner.lock().map_err(|_| { @@ -170,7 +252,7 @@ impl Default for DocumentCache { } } -/// Cache statistics. +/// Cache statistics including metrics. #[derive(Debug, Clone, Copy)] pub struct CacheStats { /// Number of entries in cache. @@ -179,6 +261,12 @@ pub struct CacheStats { pub capacity: usize, /// Utilization (0.0 to 1.0). pub utilization: f64, + /// Number of cache hits. + pub hits: u64, + /// Number of cache misses. + pub misses: u64, + /// Number of cache evictions. + pub evictions: u64, } #[cfg(test)] diff --git a/src/storage/lock.rs b/src/storage/lock.rs new file mode 100644 index 00000000..57931691 --- /dev/null +++ b/src/storage/lock.rs @@ -0,0 +1,271 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! File locking for workspace safety. +//! +//! Provides cross-process file locking to prevent data corruption +//! when multiple processes access the same workspace. + +use std::fs::{File, OpenOptions}; +use std::path::Path; + +use crate::error::Result; +use crate::Error; + +/// A file lock that is automatically released when dropped. +/// +/// Uses the `flock` on Unix and `LockFileEx` on Windows. +#[derive(Debug)] +pub struct FileLock { + /// The locked file handle. + file: Option, + /// Path to the lock file (for debugging). + path: std::path::PathBuf, + /// Whether the lock is held exclusively. + exclusive: bool, +} + +impl FileLock { + /// Try to acquire an file lock. + /// + /// # Arguments + /// + /// * `path` - Path to the lock file (will be created if it doesn't exist) + /// * `exclusive` - If true, acquires an exclusive (write) lock; otherwise a shared (read) lock + /// + /// # Errors + /// + /// Returns `Error::WorkspaceLocked` if the lock is held by another process. 
+    pub fn try_lock(path: impl Into<PathBuf>, exclusive: bool) -> Result<Self> {
+        let path = path.into();
+
+        // Ensure parent directory exists
+        if let Some(parent) = path.parent() {
+            std::fs::create_dir_all(parent).map_err(Error::Io)?;
+        }
+
+        // Open or create the lock file
+        let file = OpenOptions::new()
+            .read(true)
+            .write(true)
+            .create(true)
+            .truncate(false)
+            .open(&path)
+            .map_err(Error::Io)?;
+
+        // Try to acquire the lock
+        #[cfg(unix)]
+        {
+            let fd = std::os::unix::io::AsRawFd::as_raw_fd(&file);
+
+            let result = if exclusive {
+                unsafe { libc::flock(fd, libc::LOCK_EX | libc::LOCK_NB) }
+            } else {
+                unsafe { libc::flock(fd, libc::LOCK_SH | libc::LOCK_NB) }
+            };
+
+            if result != 0 {
+                return Err(Error::WorkspaceLocked);
+            }
+
+            Ok(Self {
+                file: Some(file),
+                path,
+                exclusive,
+            })
+        }
+
+        #[cfg(windows)]
+        {
+            use windows_sys::Win32::Storage::FileSystem::{
+                LockFileEx, LOCKFILE_EXCLUSIVE_LOCK, LOCKFILE_FAIL_IMMEDIATELY,
+            };
+
+            let handle = std::os::windows::io::AsRawHandle::as_raw_handle(&file);
+
+            let mut overlapped = std::mem::MaybeUninit::zeroed();
+            let result = unsafe {
+                LockFileEx(
+                    handle,
+                    if exclusive { LOCKFILE_EXCLUSIVE_LOCK } else { 0 } | LOCKFILE_FAIL_IMMEDIATELY,
+                    0,
+                    0xFFFFFFFF,
+                    0xFFFFFFFF,
+                    overlapped.as_mut_ptr(),
+                )
+            };
+
+            if result == 0 {
+                return Err(Error::WorkspaceLocked);
+            }
+
+            Ok(Self {
+                file: Some(file),
+                path,
+                exclusive,
+            })
+        }
+
+        #[cfg(not(any(unix, windows)))]
+        {
+            // Fallback: no file locking is available on this platform.
+            // Keeping the file open provides only minimal protection.
+            Ok(Self {
+                file: Some(file),
+                path,
+                exclusive,
+            })
+        }
+    }
+
+    /// Try to acquire a lock, mapping contention to `None`.
+    ///
+    /// Returns `Ok(Some(lock))` if the lock was acquired, or `Ok(None)` if it
+    /// is held by another process.
+    pub fn try_lock_no_wait(
+        path: impl Into<PathBuf>,
+        exclusive: bool,
+    ) -> Result<Option<Self>> {
+        match Self::try_lock(path.into(), exclusive) {
+            Ok(lock) => Ok(Some(lock)),
+            Err(Error::WorkspaceLocked) => Ok(None),
+            Err(e) => Err(e),
+        }
+    }
+
+    /// Check if the lock file is exclusively locked by another process.
+    ///
+    /// Probes with a shared lock, so this only detects exclusive holders.
+    pub fn is_locked(path: impl Into<PathBuf>) -> bool {
+        Self::try_lock(path.into(), false).is_err()
+    }
+
+    /// Release the lock.
+    pub fn unlock(mut self) {
+        if let Some(file) = self.file.take() {
+            // The OS releases the lock when the file handle is closed
+            drop(file);
+        }
+    }
+
+    /// Get the lock file path.
+    pub fn path(&self) -> &Path {
+        &self.path
+    }
+
+    /// Check if this is an exclusive lock.
+    pub fn is_exclusive(&self) -> bool {
+        self.exclusive
+    }
+}
+
+impl Drop for FileLock {
+    fn drop(&mut self) {
+        if let Some(file) = self.file.take() {
+            // File descriptor closed, lock automatically released
+            drop(file);
+        }
+    }
+}
+
+/// A scoped lock guard that releases the lock when dropped.
+///
+/// This is useful for ensuring the lock is released even on panic.
+pub struct ScopedLock {
+    lock: Option<FileLock>,
+}
+
+impl ScopedLock {
+    /// Acquire a scoped lock.
+    pub fn new(path: impl Into<PathBuf>, exclusive: bool) -> Result<Self> {
+        let lock = FileLock::try_lock(path, exclusive)?;
+        Ok(Self { lock: Some(lock) })
+    }
+
+    /// Release the lock early.
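+    ///
+    /// # Example
+    ///
+    /// Illustrative sketch (the lock path is an assumption):
+    ///
+    /// ```rust,ignore
+    /// let guard = ScopedLock::new("./workspace/.workspace.lock", true)?;
+    /// // ... critical section ...
+    /// guard.release(); // or simply let the guard drop
+    /// ```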
+    pub fn release(mut self) {
+        if let Some(lock) = self.lock.take() {
+            lock.unlock();
+        }
+    }
+}
+
+impl Drop for ScopedLock {
+    fn drop(&mut self) {
+        // Lock automatically released when the inner FileLock is dropped
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::TempDir;
+
+    #[test]
+    fn test_file_lock_acquire_release() {
+        let temp = TempDir::new().unwrap();
+        let lock_path = temp.path().join("test.lock");
+
+        let lock = FileLock::try_lock(&lock_path, true).unwrap();
+        assert!(lock.is_exclusive());
+
+        // Should be able to unlock
+        lock.unlock();
+    }
+
+    #[test]
+    fn test_file_lock_conflict() {
+        let temp = TempDir::new().unwrap();
+        let lock_path = temp.path().join("conflict.lock");
+
+        // Acquire exclusive lock
+        let _lock1 = FileLock::try_lock(&lock_path, true).unwrap();
+
+        // Try to acquire another exclusive lock - should fail
+        let result = FileLock::try_lock(&lock_path, true);
+        assert!(matches!(result, Err(Error::WorkspaceLocked)));
+    }
+
+    #[test]
+    fn test_file_lock_shared() {
+        let temp = TempDir::new().unwrap();
+        let lock_path = temp.path().join("shared.lock");
+
+        // Acquire shared lock
+        let lock1 = FileLock::try_lock(&lock_path, false).unwrap();
+        assert!(!lock1.is_exclusive());
+
+        // Should be able to acquire another shared lock
+        let lock2 = FileLock::try_lock(&lock_path, false).unwrap();
+        assert!(!lock2.is_exclusive());
+
+        // But an exclusive lock should fail
+        let result = FileLock::try_lock(&lock_path, true);
+        assert!(matches!(result, Err(Error::WorkspaceLocked)));
+
+        lock1.unlock();
+        lock2.unlock();
+    }
+
+    #[test]
+    fn test_scoped_lock() {
+        let temp = TempDir::new().unwrap();
+        let lock_path = temp.path().join("scoped.lock");
+
+        {
+            let _scoped = ScopedLock::new(&lock_path, true).unwrap();
+            // Lock held here
+
+            // Another lock should fail
+            let result = FileLock::try_lock(&lock_path, true);
+            assert!(matches!(result, Err(Error::WorkspaceLocked)));
+        }
+        // Lock released here
+
+        // Now it should succeed
+        let _lock = FileLock::try_lock(&lock_path, true).unwrap();
+    }
+}
diff --git a/src/storage/mod.rs b/src/storage/mod.rs
index d5adc212..0fba85ed 100644
--- a/src/storage/mod.rs
+++ b/src/storage/mod.rs
@@ -5,8 +5,9 @@
 //!
 //! This module provides:
 //! - **Workspace** — A directory-based document collection manager with LRU cache
-//! - **Persistence** — Save/load document trees and metadata
+//! - **Persistence** — Save/load document trees and metadata with atomic writes
 //! - **Cache** — LRU cache for loaded documents
+//! - **Lock** — File locking for multi-process safety
 //!
 //! # Example
 //!
@@ -28,13 +29,17 @@
 //! ```
 
 pub mod cache;
+pub mod lock;
 mod persistence;
 mod workspace;
 
 // Re-export main types
 pub use cache::DocumentCache;
+pub use lock::{FileLock, ScopedLock};
 pub use persistence::{
-    DocumentMeta, PageContent, PersistedDocument, load_document, load_index, save_document,
-    save_index,
+    DocumentMeta, PageContent, PersistedDocument, PersistenceOptions,
+    load_document, load_document_with_options, load_index, load_index_with_options,
+    save_document, save_document_with_options, save_index, save_index_with_options,
 };
-pub use workspace::{DocumentMetaEntry, Workspace};
+pub use workspace::{DocumentMetaEntry, Workspace, WorkspaceOptions};
diff --git a/src/storage/persistence.rs b/src/storage/persistence.rs
index d2870f7c..2095bcbd 100644
--- a/src/storage/persistence.rs
+++ b/src/storage/persistence.rs
@@ -2,13 +2,25 @@
 // SPDX-License-Identifier: Apache-2.0
 
 //! Persistence utilities for saving and loading document indices.
-
+//!
+//! # Features
+//!
+//! - **Atomic writes**: Write to a temp file, then rename for crash safety
+//! - **Checksum verification**: SHA-256 checksums for data integrity
+//! - **Version header**: Format version for future migrations
+
+use serde::{Deserialize, Serialize};
+use sha2::{Digest, Sha256};
-use std::io;
+use std::fs::File;
+use std::io::{BufReader, BufWriter, Write};
 use std::path::{Path, PathBuf};
 
-use crate::{DocumentTree, Error};
+use crate::document::DocumentTree;
 use crate::error::Result;
+use crate::Error;
+
+/// Current format version for persisted documents.
+const FORMAT_VERSION: u32 = 1;
 
 /// Metadata for a persisted document.
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -114,46 +126,434 @@ pub struct PageContent {
     pub content: String,
 }
 
-/// Save a document to a JSON file.
+/// Wrapper for persisted data with checksum.
+#[derive(Debug, Serialize, Deserialize)]
+struct PersistedWrapper<T> {
+    /// Format version.
+    version: u32,
+    /// SHA-256 checksum of the payload.
+    checksum: String,
+    /// The actual data.
+    payload: T,
+}
+
+/// Options for save/load operations.
+#[derive(Debug, Clone)]
+pub struct PersistenceOptions {
+    /// Use atomic writes (temp file + rename).
+    pub atomic_writes: bool,
+    /// Verify checksums on load.
+    pub verify_checksum: bool,
+}
+
+impl Default for PersistenceOptions {
+    fn default() -> Self {
+        Self {
+            atomic_writes: true,
+            verify_checksum: true,
+        }
+    }
+}
+
+impl PersistenceOptions {
+    /// Create new options with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the atomic writes option.
+    pub fn with_atomic_writes(mut self, enabled: bool) -> Self {
+        self.atomic_writes = enabled;
+        self
+    }
+
+    /// Set the checksum verification option.
+    pub fn with_verify_checksum(mut self, enabled: bool) -> Self {
+        self.verify_checksum = enabled;
+        self
+    }
+}
+
+/// Calculate the SHA-256 checksum of data as a hex string.
+fn calculate_checksum(data: &[u8]) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(data);
+    format!("{:x}", hasher.finalize())
+}
+
+/// Save a document to a JSON file with atomic write and checksum.
+///
+/// # Atomic Write
+///
+/// When `atomic_writes` is enabled (default), this function:
+/// 1. Writes to a temporary sibling file (extension replaced with `.tmp`)
+/// 2. Renames the temp file to the target (atomic on most filesystems)
+///
+/// This prevents data corruption if the process crashes during the write.
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - Serialization fails
+/// - The temp file cannot be created
+/// - The write fails
+/// - The rename fails
 pub fn save_document(path: &Path, doc: &PersistedDocument) -> Result<()> {
-    let json = serde_json::to_string_pretty(doc)
-        .map_err(|e| Error::Io(io::Error::new(io::ErrorKind::Other, e)))?;
+    save_document_with_options(path, doc, &PersistenceOptions::default())
+}
 
-    std::fs::write(path, json).map_err(|e| Error::Io(e))?;
+/// Save a document with custom options.
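+///
+/// # Example
+///
+/// Illustrative sketch (the path and document value are assumptions):
+///
+/// ```rust,ignore
+/// let opts = PersistenceOptions::new().with_atomic_writes(false);
+/// save_document_with_options(Path::new("./doc.json"), &doc, &opts)?;
+/// ```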
+pub fn save_document_with_options(
+    path: &Path,
+    doc: &PersistedDocument,
+    options: &PersistenceOptions,
+) -> Result<()> {
+    // Serialize the payload first
+    let payload_bytes = serde_json::to_vec(doc)
+        .map_err(|e| Error::Serialization(e.to_string()))?;
+
+    // Calculate checksum
+    let checksum = calculate_checksum(&payload_bytes);
+
+    // Create wrapper
+    let wrapper = PersistedWrapper {
+        version: FORMAT_VERSION,
+        checksum,
+        payload: doc.clone(),
+    };
+
+    // Serialize wrapper
+    let json = serde_json::to_string_pretty(&wrapper)
+        .map_err(|e| Error::Serialization(e.to_string()))?;
+
+    if options.atomic_writes {
+        // Atomic write: write to a temp file, then rename
+        let temp_path = path.with_extension("tmp");
+
+        // Ensure parent directory exists
+        if let Some(parent) = path.parent() {
+            std::fs::create_dir_all(parent).map_err(Error::Io)?;
+        }
+
+        // Write to temp file
+        {
+            let file = File::create(&temp_path).map_err(Error::Io)?;
+            let mut writer = BufWriter::new(file);
+            writer.write_all(json.as_bytes()).map_err(Error::Io)?;
+            writer.flush().map_err(Error::Io)?;
+        }
+
+        // Atomic rename
+        std::fs::rename(&temp_path, path).map_err(Error::Io)?;
+    } else {
+        // Direct write (not atomic)
+        std::fs::write(path, json).map_err(Error::Io)?;
+    }
 
     Ok(())
 }
 
-/// Load a document from a JSON file.
+/// Load a document from a JSON file with checksum verification.
+///
+/// # Checksum Verification
+///
+/// When `verify_checksum` is enabled (default), this function:
+/// 1. Reads the file
+/// 2. Parses the wrapper
+/// 3. Re-serializes the payload
+/// 4. Verifies that the checksum matches
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - The file doesn't exist
+/// - Parsing fails
+/// - The checksum doesn't match
+/// - The format version is unsupported (future: migration)
 pub fn load_document(path: &Path) -> Result<PersistedDocument> {
-    let json = std::fs::read_to_string(path).map_err(|e| Error::Io(e))?;
+    load_document_with_options(path, &PersistenceOptions::default())
+}
 
-    let doc: PersistedDocument = serde_json::from_str(&json)
+/// Load a document with custom options.
+pub fn load_document_with_options(
+    path: &Path,
+    options: &PersistenceOptions,
+) -> Result<PersistedDocument> {
+    if !path.exists() {
+        return Err(Error::DocumentNotFound(path.display().to_string()));
+    }
+
+    let file = File::open(path).map_err(Error::Io)?;
+    let reader = BufReader::new(file);
+
+    // Parse wrapper
+    let wrapper: PersistedWrapper<PersistedDocument> = serde_json::from_reader(reader)
         .map_err(|e| Error::Parse(format!("Failed to parse document: {}", e)))?;
 
-    Ok(doc)
+    // Check version
+    if wrapper.version != FORMAT_VERSION {
+        return Err(Error::Parse(format!(
+            "Unsupported format version: {} (expected {})",
+            wrapper.version, FORMAT_VERSION
+        )));
+    }
+
+    // Verify checksum if enabled
+    if options.verify_checksum {
+        let payload_bytes = serde_json::to_vec(&wrapper.payload)
+            .map_err(|e| Error::Serialization(e.to_string()))?;
+
+        let expected_checksum = calculate_checksum(&payload_bytes);
+
+        if wrapper.checksum != expected_checksum {
+            return Err(Error::Parse(format!(
+                "Checksum mismatch: expected {}, got {}",
+                expected_checksum, wrapper.checksum
+            )));
+        }
+    }
+
+    Ok(wrapper.payload)
 }
 
 /// Save the workspace index (metadata for all documents).
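+///
+/// # Example
+///
+/// Illustrative round trip (the path and entries are assumptions):
+///
+/// ```rust,ignore
+/// let entries = vec![DocumentMeta::new("doc-1", "Doc 1", "md")];
+/// save_index(Path::new("./_meta.json"), &entries)?;
+/// let loaded = load_index(Path::new("./_meta.json"))?;
+/// assert_eq!(loaded.len(), 1);
+/// ```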
 pub fn save_index(path: &Path, entries: &[DocumentMeta]) -> Result<()> {
-    let json = serde_json::to_string_pretty(entries)
-        .map_err(|e| Error::Io(io::Error::new(io::ErrorKind::Other, e)))?;
+    save_index_with_options(path, entries, &PersistenceOptions::default())
+}
 
-    std::fs::write(path, json).map_err(|e| Error::Io(e))?;
+/// Save the workspace index with custom options.
+pub fn save_index_with_options(
+    path: &Path,
+    entries: &[DocumentMeta],
+    options: &PersistenceOptions,
+) -> Result<()> {
+    // Serialize payload
+    let payload_bytes = serde_json::to_vec(entries)
+        .map_err(|e| Error::Serialization(e.to_string()))?;
+
+    let checksum = calculate_checksum(&payload_bytes);
+
+    let wrapper = PersistedWrapper {
+        version: FORMAT_VERSION,
+        checksum,
+        payload: entries.to_vec(),
+    };
+
+    let json = serde_json::to_string_pretty(&wrapper)
+        .map_err(|e| Error::Serialization(e.to_string()))?;
+
+    if options.atomic_writes {
+        let temp_path = path.with_extension("tmp");
+
+        // Ensure parent directory exists
+        if let Some(parent) = path.parent() {
+            std::fs::create_dir_all(parent).map_err(Error::Io)?;
+        }
+
+        // Write to temp file
+        {
+            let file = File::create(&temp_path).map_err(Error::Io)?;
+            let mut writer = BufWriter::new(file);
+            writer.write_all(json.as_bytes()).map_err(Error::Io)?;
+            writer.flush().map_err(Error::Io)?;
+        }
+
+        // Atomic rename
+        std::fs::rename(&temp_path, path).map_err(Error::Io)?;
+    } else {
+        std::fs::write(path, json).map_err(Error::Io)?;
+    }
 
     Ok(())
 }
 
 /// Load the workspace index.
 pub fn load_index(path: &Path) -> Result<Vec<DocumentMeta>> {
+    load_index_with_options(path, &PersistenceOptions::default())
+}
+
+/// Load the workspace index with custom options.
+pub fn load_index_with_options(
+    path: &Path,
+    options: &PersistenceOptions,
+) -> Result<Vec<DocumentMeta>> {
     if !path.exists() {
         return Ok(Vec::new());
     }
 
-    let json = std::fs::read_to_string(path).map_err(|e| Error::Io(e))?;
+    let file = File::open(path).map_err(Error::Io)?;
+    let reader = BufReader::new(file);
 
-    let entries: Vec<DocumentMeta> = serde_json::from_str(&json)
+    let wrapper: PersistedWrapper<Vec<DocumentMeta>> = serde_json::from_reader(reader)
         .map_err(|e| Error::Parse(format!("Failed to parse index: {}", e)))?;
 
-    Ok(entries)
+    // Check version
+    if wrapper.version != FORMAT_VERSION {
+        return Err(Error::Parse(format!(
+            "Unsupported format version: {} (expected {})",
+            wrapper.version, FORMAT_VERSION
+        )));
+    }
+
+    // Verify checksum if enabled
+    if options.verify_checksum {
+        let payload_bytes = serde_json::to_vec(&wrapper.payload)
+            .map_err(|e| Error::Serialization(e.to_string()))?;
+
+        let expected_checksum = calculate_checksum(&payload_bytes);
+
+        if wrapper.checksum != expected_checksum {
+            return Err(Error::Parse(format!(
+                "Checksum mismatch: expected {}, got {}",
+                expected_checksum, wrapper.checksum
+            )));
+        }
+    }
+
+    Ok(wrapper.payload)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::TempDir;
+
+    fn create_test_doc(id: &str) -> PersistedDocument {
+        let meta = DocumentMeta::new(id, "Test Doc", "md");
+        let tree = DocumentTree::new("Root", "Content");
+        PersistedDocument::new(meta, tree)
+    }
+
+    #[test]
+    fn test_save_and_load_document() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("test.json");
+
+        let doc = create_test_doc("doc-1");
+        save_document(&path, &doc).unwrap();
+
+        let loaded = load_document(&path).unwrap();
+        assert_eq!(loaded.meta.id, "doc-1");
+        assert_eq!(loaded.meta.name, "Test Doc");
+    }
+
+    #[test]
+    fn test_atomic_write() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("atomic.json");
temp.path().join("atomic.json"); + + let doc = create_test_doc("doc-atomic"); + let options = PersistenceOptions::new().with_atomic_writes(true); + save_document_with_options(&path, &doc, &options).unwrap(); + + // Temp file should not exist after save + assert!(!path.with_extension("tmp").exists()); + + let loaded = load_document(&path).unwrap(); + assert_eq!(loaded.meta.id, "doc-atomic"); + } + + #[test] + fn test_checksum_verification() { + let temp = TempDir::new().unwrap(); + let path = temp.path().join("checksum.json"); + + let doc = create_test_doc("doc-checksum"); + save_document(&path, &doc).unwrap(); + + // Corrupt the file + let content = std::fs::read_to_string(&path).unwrap(); + let corrupted = content.replace("doc-checksum", "doc-corrupted"); + std::fs::write(&path, corrupted).unwrap(); + + // Load should fail with checksum error + let result = load_document(&path); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!(matches!(err, Error::Parse(_))); + } + + #[test] + fn test_checksum_disabled() { + let temp = TempDir::new().unwrap(); + let path = temp.path().join("no-checksum.json"); + + let doc = create_test_doc("doc-no-check"); + save_document(&path, &doc).unwrap(); + + // Load with checksum disabled should succeed + let options = PersistenceOptions::new().with_verify_checksum(false); + let result = load_document_with_options(&path, &options); + assert!(result.is_ok()); + let loaded = result.unwrap(); + assert_eq!(loaded.meta.id, "doc-no-check"); + + // Now corrupt the checksum field specifically + let content = std::fs::read_to_string(&path).unwrap(); + // Change the checksum value but keep the payload intact + let corrupted = content.replace( + &calculate_checksum(&serde_json::to_vec(&doc).unwrap()), + "0000000000000000000000000000000000000000000000000000000000000000" + ); + std::fs::write(&path, corrupted).unwrap(); + + // Load with checksum disabled should still succeed + let result = load_document_with_options(&path, &options); + assert!(result.is_ok()); + + // Load with checksum enabled should fail + let options_enabled = PersistenceOptions::new().with_verify_checksum(true); + let result = load_document_with_options(&path, &options_enabled); + assert!(result.is_err()); + } + + #[test] + fn test_load_nonexistent() { + let result = load_document(Path::new("/nonexistent/path.json")); + assert!(result.is_err()); + assert!(result.unwrap_err().is_not_found()); + } + + #[test] + fn test_save_and_load_index() { + let temp = TempDir::new().unwrap(); + let path = temp.path().join("_meta.json"); + + let mut entries = Vec::new(); + entries.push(DocumentMeta::new("doc-1", "Doc 1", "md")); + entries.push(DocumentMeta::new("doc-2", "Doc 2", "pdf")); + + save_index(&path, &entries).unwrap(); + + let loaded = load_index(&path).unwrap(); + assert_eq!(loaded.len(), 2); + assert_eq!(loaded[0].id, "doc-1"); + assert_eq!(loaded[1].format, "pdf"); + } + + #[test] + fn test_load_empty_index() { + let temp = TempDir::new().unwrap(); + let path = temp.path().join("nonexistent.json"); + + let loaded = load_index(&path).unwrap(); + assert!(loaded.is_empty()); + } + + #[test] + fn test_checksum_calculation() { + let data1 = b"test data"; + let data2 = b"test data"; + let data3 = b"different data"; + + let checksum1 = calculate_checksum(data1); + let checksum2 = calculate_checksum(data2); + let checksum3 = calculate_checksum(data3); + + assert_eq!(checksum1, checksum2); + assert_ne!(checksum1, checksum3); + assert_eq!(checksum1.len(), 64); // SHA-256 produces 64 hex chars + } } 
diff --git a/src/storage/workspace.rs b/src/storage/workspace.rs
index 1ad67d98..63ad2a22 100644
--- a/src/storage/workspace.rs
+++ b/src/storage/workspace.rs
@@ -23,6 +23,11 @@
 //! The workspace uses interior mutability for the LRU cache:
 //! - Read operations (`get_meta`, `contains`, `list_documents`) only need `&self`
 //! - Cache updates happen internally via `Mutex`
+//!
+//! # File Locking
+//!
+//! When enabled (default), the workspace holds an exclusive file lock
+//! to prevent concurrent access from multiple processes.
 
 use std::collections::HashMap;
 use std::fs;
@@ -32,11 +37,13 @@ use serde::{Deserialize, Serialize};
 use tracing::{debug, info, warn};
 
 use super::cache::DocumentCache;
+use super::lock::FileLock;
 use super::persistence::{PersistedDocument, load_document, save_document};
 use crate::error::Result;
 use crate::Error;
 
 const META_FILE: &str = "_meta.json";
+const LOCK_FILE: &str = ".workspace.lock";
 const DEFAULT_CACHE_SIZE: usize = 100;
 
 /// Lightweight metadata entry for the index.
@@ -78,23 +85,61 @@ pub struct Workspace {
 
     /// LRU cache for loaded documents.
     cache: DocumentCache,
+
+    /// File lock for multi-process safety.
+    _lock: Option<FileLock>,
+}
+
+/// Options for workspace creation.
+#[derive(Debug, Clone)]
+pub struct WorkspaceOptions {
+    /// Enable file locking (default: true).
+    pub file_lock: bool,
+    /// LRU cache size (default: 100).
+    pub cache_size: usize,
+}
+
+impl Default for WorkspaceOptions {
+    fn default() -> Self {
+        Self {
+            file_lock: true,
+            cache_size: DEFAULT_CACHE_SIZE,
+        }
+    }
 }
 
 impl Workspace {
     /// Create a new workspace at the given path with default cache size.
     pub fn new(path: impl Into<PathBuf>) -> Result<Self> {
-        Self::with_cache_size(path, DEFAULT_CACHE_SIZE)
+        Self::with_options(path, WorkspaceOptions::default())
     }
 
     /// Create a new workspace with custom LRU cache size.
     pub fn with_cache_size(path: impl Into<PathBuf>, cache_size: usize) -> Result<Self> {
+        Self::with_options(path, WorkspaceOptions {
+            cache_size,
+            ..Default::default()
+        })
+    }
+
+    /// Create a new workspace with custom options.
+    pub fn with_options(path: impl Into<PathBuf>, options: WorkspaceOptions) -> Result<Self> {
         let root = path.into();
         fs::create_dir_all(&root).map_err(Error::Io)?;
 
+        // Acquire file lock if enabled
+        let lock = if options.file_lock {
+            let lock_path = root.join(LOCK_FILE);
+            Some(FileLock::try_lock(lock_path, true)?)
+        } else {
+            None
+        };
+
         let mut workspace = Self {
             root,
             meta_index: HashMap::new(),
-            cache: DocumentCache::with_capacity(cache_size),
+            cache: DocumentCache::with_capacity(options.cache_size),
+            _lock: lock,
         };
 
         workspace.load_meta_index()?;
@@ -103,25 +148,45 @@ impl Workspace {
 
     /// Open an existing workspace, or create it if it doesn't exist.
     pub fn open(path: impl Into<PathBuf> + Clone) -> Result<Self> {
-        Self::open_with_cache_size(path, DEFAULT_CACHE_SIZE)
+        Self::open_with_options(path, WorkspaceOptions::default())
     }
 
     /// Open with custom cache size.
     pub fn open_with_cache_size(
         path: impl Into<PathBuf> + Clone,
         cache_size: usize,
+    ) -> Result<Self> {
+        Self::open_with_options(path, WorkspaceOptions {
+            cache_size,
+            ..Default::default()
+        })
+    }
+
+    /// Open with custom options.
+    pub fn open_with_options(
+        path: impl Into<PathBuf> + Clone,
+        options: WorkspaceOptions,
     ) -> Result<Self> {
         let root = path.clone().into();
 
         if root.exists() {
+            // Acquire file lock if enabled
+            let lock = if options.file_lock {
+                let lock_path = root.join(LOCK_FILE);
+                Some(FileLock::try_lock(lock_path, true)?)
+            } else {
+                None
+            };
+
             let mut workspace = Self {
                 root,
                 meta_index: HashMap::new(),
-                cache: DocumentCache::with_capacity(cache_size),
+                cache: DocumentCache::with_capacity(options.cache_size),
+                _lock: lock,
             };
 
             workspace.load_meta_index()?;
             Ok(workspace)
         } else {
-            Self::with_cache_size(path, cache_size)
+            Self::with_options(path, options)
         }
     }
 
@@ -380,12 +445,19 @@ mod tests {
         let temp = TempDir::new().unwrap();
         let path = temp.path().join("workspace");
 
+        // Use options without file lock to allow reopening
+        let options = WorkspaceOptions {
+            file_lock: false,
+            ..Default::default()
+        };
+
         // Create new
-        let workspace = Workspace::open(&path).unwrap();
+        let workspace = Workspace::open_with_options(&path, options.clone()).unwrap();
         assert!(workspace.is_empty());
 
-        // Reopen existing
-        let workspace2 = Workspace::open(&path).unwrap();
+        // Reopen existing (drop the first workspace to release its lock)
+        drop(workspace);
+        let workspace2 = Workspace::open_with_options(&path, options).unwrap();
         assert!(workspace2.is_empty());
     }
 
diff --git a/vectorless.example.toml b/vectorless.example.toml
index 66e85e21..aa097ae6 100644
--- a/vectorless.example.toml
+++ b/vectorless.example.toml
@@ -159,6 +159,32 @@ dedup_threshold = 0.9
 # └── {doc_id_2}.json       # Document 2
 workspace_dir = "./workspace"
 
+# LRU cache size (number of documents to keep in memory)
+cache_size = 100
+
+# Enable atomic writes (temp file + rename)
+# This prevents data corruption on crash
+atomic_writes = true
+
+# Enable file locking for multi-process safety
+# Prevents concurrent access from multiple processes
+file_lock = true
+
+# Enable checksum verification for data integrity
+# Uses SHA-256 to verify file integrity on load
+checksum_enabled = true
+
+# Compression settings
+[storage.compression]
+# Enable compression for stored documents
+enabled = false
+
+# Compression algorithm: "gzip" or "zstd"
+algorithm = "gzip"
+
+# Compression level (0-9; higher = better compression, but slower)
+level = 6
+
 [concurrency]
 # Maximum concurrent LLM API calls
 # This limits how many requests can be in-flight at the same time

From 74b147edfdd81c0cea5c95485fea299dada94c00 Mon Sep 17 00:00:00 2001
From: zTgx <747674262@qq.com>
Date: Sun, 5 Apr 2026 13:25:00 +0800
Subject: [PATCH 4/8] feat(storage): add async workspace with storage backend
 abstraction

- Introduce AsyncWorkspace with RwLock-based concurrency support
- Add StorageBackend trait with FileBackend and MemoryBackend implementations
- Implement file-based storage with atomic writes and directory caching
- Add in-memory backend for testing purposes
- Include comprehensive async workspace functionality with caching
- Update Cargo.toml with flate2 compression dependency
- Export AsyncWorkspace in main library module
---
 Cargo.toml                       |   3 +
 src/lib.rs                       |   2 +-
 src/storage/async_workspace.rs   | 586 +++++++++++++++++++++++++++++++
 src/storage/backend/file.rs      | 295 ++++++++++++++++
 src/storage/backend/memory.rs    | 173 +++++++++
 src/storage/backend/mod.rs       |  35 ++
 src/storage/backend/trait_def.rs | 113 ++++++
 src/storage/codec.rs             | 241 +++++++++++++
 src/storage/migration.rs         | 383 ++++++++++++++++++++
 src/storage/mod.rs               |  15 +-
 src/storage/persistence.rs       | 125 +++++++
 src/storage/workspace.rs         | 362 +++++++++++--------
 12 files changed, 2188 insertions(+), 145 deletions(-)
 create mode 100644 src/storage/async_workspace.rs
 create mode 100644 src/storage/backend/file.rs
 create mode 100644 src/storage/backend/memory.rs
 create mode 100644 src/storage/backend/mod.rs
 create mode 100644 src/storage/backend/trait_def.rs
 create mode 100644 src/storage/codec.rs
 create mode 100644 src/storage/migration.rs

diff --git a/Cargo.toml b/Cargo.toml
index 49505200..4bc94998 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -62,6 +62,9 @@ lru = "0.12"
 # Checksum
 sha2 = "0.10"
 
+# Compression
+flate2 = "1.0"
+
 # File locking (Unix)
 [target.'cfg(unix)'.dependencies]
 libc = "0.2"
diff --git a/src/lib.rs b/src/lib.rs
index 3a283c32..cd3dc7a9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -163,7 +163,7 @@ pub use retrieval::{
 };
 
 // Storage
-pub use storage::{DocumentMeta as StorageDocumentMeta, PersistedDocument, Workspace};
+pub use storage::{AsyncWorkspace, DocumentMeta as StorageDocumentMeta, PersistedDocument, Workspace};
 
 // Throttle
 pub use throttle::{ConcurrencyConfig, ConcurrencyController, RateLimiter};
diff --git a/src/storage/async_workspace.rs b/src/storage/async_workspace.rs
new file mode 100644
index 00000000..6fb4f49e
--- /dev/null
+++ b/src/storage/async_workspace.rs
@@ -0,0 +1,586 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Async workspace management for document collections.
+//!
+//! This module provides an async version of [`Workspace`](super::Workspace)
+//! for integration with async runtimes like Tokio.
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! use vectorless::storage::AsyncWorkspace;
+//!
+//! #[tokio::main]
+//! async fn main() -> Result<()> {
+//!     let workspace = AsyncWorkspace::new("./workspace").await?;
+//!
+//!     // Add a document
+//!     workspace.add(&doc).await?;
+//!
+//!     // Load with caching
+//!     let loaded = workspace.load_and_cache("doc-1").await?;
+//!
+//!     Ok(())
+//! }
+//! ```
+
+use std::collections::HashMap;
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use serde::{Deserialize, Serialize};
+use tokio::sync::RwLock;
+use tracing::{debug, info, warn};
+
+use super::backend::{FileBackend, StorageBackend};
+use super::cache::DocumentCache;
+use super::persistence::{PersistedDocument, load_document_from_bytes, save_document_to_bytes};
+use crate::error::Result;
+use crate::Error;
+
+const META_KEY: &str = "_meta";
+const DEFAULT_CACHE_SIZE: usize = 100;
+
+/// Lightweight metadata entry for the async workspace index.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AsyncDocumentMetaEntry {
+    /// Document ID.
+    pub id: String,
+    /// Document name/title.
+    pub doc_name: String,
+    /// Document description.
+    #[serde(default)]
+    pub doc_description: Option<String>,
+    /// Document type (pdf, md, etc.).
+    pub doc_type: String,
+    /// Source file path.
+    #[serde(default)]
+    pub path: Option<String>,
+    /// Page count (for PDFs).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub page_count: Option<usize>,
+    /// Line count (for markdown).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub line_count: Option<usize>,
+}
+
+/// Options for async workspace creation.
+#[derive(Debug, Clone)]
+pub struct AsyncWorkspaceOptions {
+    /// LRU cache size (default: 100).
+    pub cache_size: usize,
+}
+
+impl Default for AsyncWorkspaceOptions {
+    fn default() -> Self {
+        Self {
+            cache_size: DEFAULT_CACHE_SIZE,
+        }
+    }
+}
+
+impl AsyncWorkspaceOptions {
+    /// Create new options with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the cache size.
+    pub fn with_cache_size(mut self, size: usize) -> Self {
+        self.cache_size = size;
+        self
+    }
+}
+
+/// Inner state for the async workspace.
+struct AsyncWorkspaceInner {
+    /// Storage backend.
+    backend: Arc<dyn StorageBackend>,
+    /// Root path (for file-based backends).
+    root: Option<PathBuf>,
+    /// Document metadata index.
+    meta_index: HashMap<String, AsyncDocumentMetaEntry>,
+    /// LRU cache for loaded documents.
+    cache: DocumentCache,
+}
+
+/// An async workspace for managing indexed documents.
+///
+/// Uses `tokio::sync::RwLock` for async-safe concurrent access.
+/// All operations are async and can be safely called from multiple tasks.
+///
+/// # Thread Safety
+///
+/// The async workspace is fully thread-safe and can be cloned cheaply
+/// (it uses `Arc` internally).
+#[derive(Clone)]
+pub struct AsyncWorkspace {
+    inner: Arc<RwLock<AsyncWorkspaceInner>>,
+}
+
+impl std::fmt::Debug for AsyncWorkspace {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("AsyncWorkspace").finish()
+    }
+}
+
+impl AsyncWorkspace {
+    /// Create a new async workspace with a storage backend.
+    pub async fn with_backend(backend: Arc<dyn StorageBackend>) -> Result<Self> {
+        Self::with_backend_and_options(backend, AsyncWorkspaceOptions::default()).await
+    }
+
+    /// Create an async workspace with backend and options.
+    pub async fn with_backend_and_options(
+        backend: Arc<dyn StorageBackend>,
+        options: AsyncWorkspaceOptions,
+    ) -> Result<Self> {
+        let mut inner = AsyncWorkspaceInner {
+            backend,
+            root: None,
+            meta_index: HashMap::new(),
+            cache: DocumentCache::with_capacity(options.cache_size),
+        };
+
+        Self::load_meta_index(&mut inner)?;
+
+        Ok(Self {
+            inner: Arc::new(RwLock::new(inner)),
+        })
+    }
+
+    /// Create a new file-based async workspace at the given path.
+    pub async fn new(path: impl Into<PathBuf>) -> Result<Self> {
+        Self::with_options(path, AsyncWorkspaceOptions::default()).await
+    }
+
+    /// Create a new async workspace with custom cache size.
+    pub async fn with_cache_size(path: impl Into<PathBuf>, cache_size: usize) -> Result<Self> {
+        Self::with_options(path, AsyncWorkspaceOptions { cache_size }).await
+    }
+
+    /// Create a new async workspace with custom options.
+    pub async fn with_options(path: impl Into<PathBuf>, options: AsyncWorkspaceOptions) -> Result<Self> {
+        let root = path.into();
+        let backend = Arc::new(FileBackend::new(&root)?);
+
+        let mut inner = AsyncWorkspaceInner {
+            backend,
+            root: Some(root),
+            meta_index: HashMap::new(),
+            cache: DocumentCache::with_capacity(options.cache_size),
+        };
+
+        Self::load_meta_index(&mut inner)?;
+
+        Ok(Self {
+            inner: Arc::new(RwLock::new(inner)),
+        })
+    }
+
+    /// Get the workspace root path (if file-based).
+    pub async fn path(&self) -> Option<PathBuf> {
+        let inner = self.inner.read().await;
+        inner.root.clone()
+    }
+
+    /// List all document IDs in the workspace.
+    pub async fn list_documents(&self) -> Vec<String> {
+        let inner = self.inner.read().await;
+        inner.meta_index.keys().cloned().collect()
+    }
+
+    /// Get metadata for a document.
+    pub async fn get_meta(&self, id: &str) -> Option<AsyncDocumentMetaEntry> {
+        let inner = self.inner.read().await;
+        inner.meta_index.get(id).cloned()
+    }
+
+    /// Check if a document exists.
+    pub async fn contains(&self, id: &str) -> bool {
+        let inner = self.inner.read().await;
+        inner.meta_index.contains_key(id)
+    }
+
+    /// Add a document to the workspace.
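+    ///
+    /// # Example
+    ///
+    /// Illustrative sketch (the document value is an assumption):
+    ///
+    /// ```rust,ignore
+    /// workspace.add(&doc).await?;
+    /// let loaded = workspace.load_and_cache(&doc.meta.id).await?;
+    /// assert!(loaded.is_some());
+    /// ```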
+    pub async fn add(&self, doc: &PersistedDocument) -> Result<()> {
+        let mut inner = self.inner.write().await;
+
+        let doc_id = doc.meta.id.clone();
+        let key = Self::doc_key(&doc_id);
+
+        // Serialize and save via the backend
+        let bytes = save_document_to_bytes(doc)?;
+        inner.backend.put(&key, &bytes)?;
+
+        // Update the meta index
+        let meta_entry = AsyncDocumentMetaEntry {
+            id: doc_id.clone(),
+            doc_name: doc.meta.name.clone(),
+            doc_description: doc.meta.description.clone(),
+            doc_type: doc.meta.format.clone(),
+            path: doc
+                .meta
+                .source_path
+                .as_ref()
+                .map(|p| p.to_string_lossy().to_string()),
+            page_count: if doc.pages.is_empty() { None } else { Some(doc.pages.len()) },
+            line_count: doc.meta.line_count,
+        };
+
+        inner.meta_index.insert(doc_id.clone(), meta_entry);
+        Self::save_meta_index(&inner)?;
+
+        // Remove any stale cached copy
+        let _ = inner.cache.remove(&doc_id);
+
+        info!("Saved document {} to async workspace", doc_id);
+        Ok(())
+    }
+
+    /// Load a document from the workspace without populating the cache.
+    ///
+    /// Returns a cached copy when one exists; otherwise the document is read
+    /// from the backend but *not* inserted into the cache, since caching
+    /// would require the write lock. Use
+    /// [`load_and_cache`](Self::load_and_cache) to populate the cache.
+    pub async fn load(&self, id: &str) -> Result<Option<PersistedDocument>> {
+        // First check if the document exists (read lock)
+        {
+            let inner = self.inner.read().await;
+            if !inner.meta_index.contains_key(id) {
+                return Ok(None);
+            }
+
+            // Check the LRU cache
+            if let Some(cached) = inner.cache.get(id)? {
+                debug!("Cache hit for document {}", id);
+                return Ok(Some(cached));
+            }
+        }
+
+        // Load from the backend (a read lock suffices for backend access)
+        let inner = self.inner.read().await;
+        let key = Self::doc_key(id);
+
+        match inner.backend.get(&key)? {
+            Some(bytes) => {
+                let doc = load_document_from_bytes(&bytes)?;
+                debug!("Loaded document {} from backend", id);
+                Ok(Some(doc))
+            }
+            None => {
+                warn!("Document {} in meta index but not in backend", id);
+                Ok(None)
+            }
+        }
+    }
+
+    /// Load a document and cache it (takes the write lock to update the cache).
+    pub async fn load_and_cache(&self, id: &str) -> Result<Option<PersistedDocument>> {
+        // First check if the document exists (read lock)
+        {
+            let inner = self.inner.read().await;
+            if !inner.meta_index.contains_key(id) {
+                return Ok(None);
+            }
+
+            // Check the LRU cache
+            if let Some(cached) = inner.cache.get(id)? {
+                debug!("Cache hit for document {}", id);
+                return Ok(Some(cached));
+            }
+        }
+
+        // Load from the backend and cache (write lock)
+        let inner = self.inner.write().await;
+        let key = Self::doc_key(id);
+
+        match inner.backend.get(&key)? {
+            Some(bytes) => {
+                let doc = load_document_from_bytes(&bytes)?;
+
+                // Add to cache
+                inner.cache.put(id.to_string(), doc.clone())?;
+
+                debug!("Loaded and cached document {}", id);
+                Ok(Some(doc))
+            }
+            None => {
+                warn!("Document {} in meta index but not in backend", id);
+                Ok(None)
+            }
+        }
+    }
+
+    /// Remove a document from the workspace.
+    pub async fn remove(&self, id: &str) -> Result<bool> {
+        let mut inner = self.inner.write().await;
+
+        if !inner.meta_index.contains_key(id) {
+            return Ok(false);
+        }
+
+        let key = Self::doc_key(id);
+        inner.backend.delete(&key)?;
+
+        inner.meta_index.remove(id);
+
+        // Remove from cache
+        let _ = inner.cache.remove(id);
+
+        Self::save_meta_index(&inner)?;
+
+        info!("Removed document {} from async workspace", id);
+        Ok(true)
+    }
+
+    /// Get the number of documents in the workspace.
+    pub async fn len(&self) -> usize {
+        let inner = self.inner.read().await;
+        inner.meta_index.len()
+    }
+
+    /// Check if the workspace is empty.
+    pub async fn is_empty(&self) -> bool {
+        let inner = self.inner.read().await;
+        inner.meta_index.is_empty()
+    }
+
+    /// Get the number of items currently in the LRU cache.
+    pub async fn cache_len(&self) -> usize {
+        let inner = self.inner.read().await;
+        inner.cache.len()
+    }
+
+    /// Get cache utilization (0.0 to 1.0).
+    pub async fn cache_utilization(&self) -> f64 {
+        let inner = self.inner.read().await;
+        inner.cache.utilization()
+    }
+
+    /// Get cache statistics.
+    pub async fn cache_stats(&self) -> super::cache::CacheStats {
+        let inner = self.inner.read().await;
+        inner.cache.stats()
+    }
+
+    /// Clear the LRU cache.
+    pub async fn clear_cache(&self) -> Result<()> {
+        let inner = self.inner.write().await;
+        inner.cache.clear()?;
+        debug!("Cleared async document cache");
+        Ok(())
+    }
+
+    /// Get the storage key for a document.
+    fn doc_key(id: &str) -> String {
+        // Use a separator that file-based backends keep as-is, so that
+        // `rebuild_meta_index` can recognize document keys after a round trip
+        // (the file backend sanitizes `:` in keys).
+        format!("doc_{}", id)
+    }
+
+    /// Load the meta index from the backend.
+    fn load_meta_index(inner: &mut AsyncWorkspaceInner) -> Result<()> {
+        match inner.backend.get(META_KEY)? {
+            Some(bytes) => {
+                let meta: HashMap<String, AsyncDocumentMetaEntry> = serde_json::from_slice(&bytes)
+                    .map_err(|e| Error::Parse(format!("Failed to parse meta index: {}", e)))?;
+                inner.meta_index = meta;
+                info!(
+                    "Loaded {} document(s) from async workspace index",
+                    inner.meta_index.len()
+                );
+            }
+            None => {
+                // Try to rebuild from existing keys
+                Self::rebuild_meta_index(inner)?;
+            }
+        }
+        Ok(())
+    }
+
+    /// Save the meta index to the backend.
+    fn save_meta_index(inner: &AsyncWorkspaceInner) -> Result<()> {
+        let bytes = serde_json::to_vec_pretty(&inner.meta_index)
+            .map_err(|e| Error::Parse(format!("Failed to serialize meta index: {}", e)))?;
+        inner.backend.put(META_KEY, &bytes)?;
+        Ok(())
+    }
+
+    /// Rebuild the meta index from existing documents.
+    fn rebuild_meta_index(inner: &mut AsyncWorkspaceInner) -> Result<()> {
+        let keys = inner.backend.keys()?;
+        let doc_keys: Vec<_> = keys
+            .iter()
+            .filter(|k| k.starts_with("doc_"))
+            .collect();
+
+        for key in doc_keys {
+            if let Some(bytes) = inner.backend.get(key)? {
+                if let Ok(doc) = load_document_from_bytes(&bytes) {
+                    let doc_id = doc.meta.id.clone();
+                    let meta_entry = AsyncDocumentMetaEntry {
+                        id: doc_id.clone(),
+                        doc_name: doc.meta.name,
+                        doc_description: doc.meta.description,
+                        doc_type: doc.meta.format,
+                        path: doc
+                            .meta
+                            .source_path
+                            .as_ref()
+                            .map(|p| p.to_string_lossy().to_string()),
+                        page_count: if doc.pages.is_empty() { None } else { Some(doc.pages.len()) },
+                        line_count: doc.meta.line_count,
+                    };
+                    inner.meta_index.insert(doc_id, meta_entry);
+                }
+            }
+        }
+
+        if !inner.meta_index.is_empty() {
+            Self::save_meta_index(inner)?;
+            info!(
+                "Rebuilt async index from {} document(s)",
+                inner.meta_index.len()
+            );
+        }
+
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::document::DocumentTree;
+
+    fn create_test_doc(id: &str) -> PersistedDocument {
+        let meta = super::super::persistence::DocumentMeta::new(id, "Test Doc", "md");
+        let tree = DocumentTree::new("Root", "Content");
+        PersistedDocument::new(meta, tree)
+    }
+
+    #[tokio::test]
+    async fn test_async_workspace_create() {
+        let backend = Arc::new(super::super::backend::MemoryBackend::new());
+        let workspace = AsyncWorkspace::with_backend(backend).await.unwrap();
+
+        assert!(workspace.is_empty().await);
+        assert_eq!(workspace.len().await, 0);
+    }
+
+    #[tokio::test]
+    async fn test_async_workspace_add_and_load() {
+        let backend = Arc::new(super::super::backend::MemoryBackend::new());
+        let workspace = AsyncWorkspace::with_backend(backend).await.unwrap();
+
+        let doc = create_test_doc("doc-1");
+        workspace.add(&doc).await.unwrap();
+
+        assert_eq!(workspace.len().await, 1);
+        assert!(workspace.contains("doc-1").await);
+
+        let loaded = workspace.load("doc-1").await.unwrap();
+        assert!(loaded.is_some());
+        assert_eq!(loaded.unwrap().meta.id, "doc-1");
+    }
+
+    #[tokio::test]
+    async fn test_async_workspace_remove() {
+        let backend = Arc::new(super::super::backend::MemoryBackend::new());
+        let workspace = AsyncWorkspace::with_backend(backend).await.unwrap();
+
+        let doc = create_test_doc("doc-1");
+        workspace.add(&doc).await.unwrap();
+
+        let removed = workspace.remove("doc-1").await.unwrap();
+        assert!(removed);
+        assert!(workspace.is_empty().await);
+
+        let removed_again = workspace.remove("doc-1").await.unwrap();
+        assert!(!removed_again);
+    }
+
+    #[tokio::test]
+    async fn test_async_workspace_cache() {
+        let backend = Arc::new(super::super::backend::MemoryBackend::new());
+        let workspace = AsyncWorkspace::with_backend(backend).await.unwrap();
+
+        let doc = create_test_doc("doc-1");
+        workspace.add(&doc).await.unwrap();
+
+        // First load with caching
+        let _ = workspace.load_and_cache("doc-1").await.unwrap();
+        let stats = workspace.cache_stats().await;
+        assert_eq!(stats.misses, 1);
+
+        // Second load should hit the cache
+        let _ = workspace.load_and_cache("doc-1").await.unwrap();
+        let stats = workspace.cache_stats().await;
+        assert_eq!(stats.hits, 1);
+    }
+
+    #[tokio::test]
+    async fn test_async_workspace_list_documents() {
+        let backend = Arc::new(super::super::backend::MemoryBackend::new());
+        let workspace = AsyncWorkspace::with_backend(backend).await.unwrap();
+
+        workspace.add(&create_test_doc("doc-1")).await.unwrap();
+        workspace.add(&create_test_doc("doc-2")).await.unwrap();
+        workspace.add(&create_test_doc("doc-3")).await.unwrap();
+
+        let docs = workspace.list_documents().await;
+        assert_eq!(docs.len(), 3);
+    }
+
+    #[tokio::test]
+    async fn test_async_workspace_get_meta() {
+        let backend = Arc::new(super::super::backend::MemoryBackend::new());
+        let workspace = AsyncWorkspace::with_backend(backend).await.unwrap();
+
+        let doc = create_test_doc("doc-1");
+        workspace.add(&doc).await.unwrap();
+
+        let meta = workspace.get_meta("doc-1").await;
+        assert!(meta.is_some());
+        let meta = meta.unwrap();
+        assert_eq!(meta.id, "doc-1");
+        assert_eq!(meta.doc_name, "Test Doc");
+        assert_eq!(meta.doc_type, "md");
+    }
+
+    #[tokio::test]
+    async fn test_async_workspace_concurrent_access() {
+        let backend = Arc::new(super::super::backend::MemoryBackend::new());
+        let workspace = Arc::new(AsyncWorkspace::with_backend(backend).await.unwrap());
+
+        // Spawn multiple concurrent tasks
+        let mut handles = vec![];
+
+        for i in 0..10 {
+            let ws = workspace.clone();
+            let handle = tokio::spawn(async move {
+                let id = format!("doc-{}", i);
+                let doc = create_test_doc(&id);
+                ws.add(&doc).await.unwrap();
+                let loaded = ws.load(&id).await.unwrap();
+                assert!(loaded.is_some());
+            });
+            handles.push(handle);
+        }
+
+        // Wait for all tasks
+        for handle in handles {
+            handle.await.unwrap();
+        }
+
+        assert_eq!(workspace.len().await, 10);
+    }
+}
diff --git a/src/storage/backend/file.rs b/src/storage/backend/file.rs
new file mode 100644
index 00000000..915d0b4c
--- /dev/null
+++ b/src/storage/backend/file.rs
@@ -0,0 +1,295 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! File system storage backend.
+
+use std::fs;
+use std::path::{Path, PathBuf};
+use std::sync::RwLock;
+
+use tracing::debug;
+
+use super::StorageBackend;
+use crate::error::Result;
+use crate::Error;
+
+/// File system storage backend.
+///
+/// Stores each key-value pair as a separate file in a directory.
+/// The sanitized key is used as the filename (with a `.bin` extension).
+///
+/// # Structure
+///
+/// ```text
+/// workspace/
+/// ├── doc_1.bin    # Entry for key "doc_1"
+/// ├── doc_2.bin    # Entry for key "doc_2"
+/// └── _meta.bin    # Entry for key "_meta" (metadata index)
+/// ```
+///
+/// # Thread Safety
+///
+/// Uses `RwLock` for thread-safe operations on the directory listing cache.
+#[derive(Debug)]
+pub struct FileBackend {
+    /// Root directory for storage.
+    root: PathBuf,
+    /// Cached directory listing (refreshed on miss).
+    cache: RwLock<Option<Vec<String>>>,
+}
+
+impl FileBackend {
+    /// Create a new file backend at the given path.
+    ///
+    /// Creates the directory if it doesn't exist.
+    pub fn new(path: impl Into<PathBuf>) -> Result<Self> {
+        let root = path.into();
+        fs::create_dir_all(&root).map_err(Error::Io)?;
+
+        Ok(Self {
+            root,
+            cache: RwLock::new(None),
+        })
+    }
+
+    /// Open an existing file backend.
+    ///
+    /// Creates the directory if it doesn't exist.
+    pub fn open(path: impl Into<PathBuf>) -> Result<Self> {
+        Self::new(path)
+    }
+
+    /// Get the root path.
+    pub fn root(&self) -> &Path {
+        &self.root
+    }
+
+    /// Convert a key to a file path.
+    fn key_to_path(&self, key: &str) -> PathBuf {
+        // Sanitize the key to prevent path traversal
+        let sanitized = key
+            .replace("..", "_")
+            .replace(['/', '\\', ':'], "_");
+        self.root.join(format!("{}.bin", sanitized))
+    }
+
+    /// Refresh the directory listing cache.
+    fn refresh_cache(&self) -> Result<Vec<String>> {
+        let entries: Vec<String> = fs::read_dir(&self.root)
+            .map_err(Error::Io)?
+            .filter_map(|entry| entry.ok())
+            .filter_map(|entry| {
+                let path = entry.path();
+                if path.extension()?.to_str()? == "bin" {
== "bin" { + path.file_stem()?.to_str().map(|s| s.to_string()) + } else { + None + } + }) + .collect(); + + // Update cache + if let Ok(mut cache) = self.cache.write() { + *cache = Some(entries.clone()); + } + + Ok(entries) + } + + /// Get cached keys or refresh cache. + fn get_keys(&self) -> Result> { + // Try to read from cache first + if let Ok(cache) = self.cache.read() { + if let Some(ref keys) = *cache { + return Ok(keys.clone()); + } + } + + // Refresh cache + self.refresh_cache() + } + + /// Invalidate the cache. + pub fn invalidate_cache(&self) { + if let Ok(mut cache) = self.cache.write() { + *cache = None; + } + } +} + +impl StorageBackend for FileBackend { + fn get(&self, key: &str) -> Result>> { + let path = self.key_to_path(key); + + if !path.exists() { + return Ok(None); + } + + let data = fs::read(&path).map_err(Error::Io)?; + debug!("Read {} bytes from {}", data.len(), key); + + Ok(Some(data)) + } + + fn put(&self, key: &str, value: &[u8]) -> Result<()> { + let path = self.key_to_path(key); + + // Use atomic write (temp file + rename) + let temp_path = path.with_extension("tmp"); + + fs::write(&temp_path, value).map_err(Error::Io)?; + fs::rename(&temp_path, &path).map_err(Error::Io)?; + + // Invalidate cache + self.invalidate_cache(); + + debug!("Wrote {} bytes to {}", value.len(), key); + Ok(()) + } + + fn delete(&self, key: &str) -> Result { + let path = self.key_to_path(key); + + if !path.exists() { + return Ok(false); + } + + fs::remove_file(&path).map_err(Error::Io)?; + + // Invalidate cache + self.invalidate_cache(); + + debug!("Deleted {}", key); + Ok(true) + } + + fn exists(&self, key: &str) -> Result { + let path = self.key_to_path(key); + Ok(path.exists()) + } + + fn keys(&self) -> Result> { + self.get_keys() + } + + fn len(&self) -> Result { + Ok(self.get_keys()?.len()) + } + + fn clear(&self) -> Result<()> { + let keys = self.get_keys()?; + + for key in &keys { + let path = self.key_to_path(key); + if path.exists() { + fs::remove_file(&path).map_err(Error::Io)?; + } + } + + // Clear cache + if let Ok(mut cache) = self.cache.write() { + *cache = None; + } + + debug!("Cleared {} entries", keys.len()); + Ok(()) + } + + fn backend_name(&self) -> &'static str { + "file" + } + + fn batch_put(&self, items: &[(&str, &[u8])]) -> Result<()> { + for (key, value) in items { + self.put(key, value)?; + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn test_file_backend_basic() { + let temp = TempDir::new().unwrap(); + let backend = FileBackend::new(temp.path()).unwrap(); + + // Put and get + backend.put("key1", b"value1").unwrap(); + let value = backend.get("key1").unwrap(); + assert_eq!(value, Some(b"value1".to_vec())); + + // Exists + assert!(backend.exists("key1").unwrap()); + assert!(!backend.exists("key2").unwrap()); + + // Delete + assert!(backend.delete("key1").unwrap()); + assert!(!backend.exists("key1").unwrap()); + assert!(!backend.delete("key1").unwrap()); // Already deleted + } + + #[test] + fn test_file_backend_keys() { + let temp = TempDir::new().unwrap(); + let backend = FileBackend::new(temp.path()).unwrap(); + + backend.put("key1", b"v1").unwrap(); + backend.put("key2", b"v2").unwrap(); + backend.put("key3", b"v3").unwrap(); + + let keys = backend.keys().unwrap(); + assert_eq!(keys.len(), 3); + assert!(keys.contains(&"key1".to_string())); + } + + #[test] + fn test_file_backend_clear() { + let temp = TempDir::new().unwrap(); + let backend = FileBackend::new(temp.path()).unwrap(); + + backend.put("key1", 
b"v1").unwrap(); + backend.put("key2", b"v2").unwrap(); + + backend.clear().unwrap(); + + assert!(backend.is_empty().unwrap()); + } + + #[test] + fn test_file_backend_batch() { + let temp = TempDir::new().unwrap(); + let backend = FileBackend::new(temp.path()).unwrap(); + + let items: Vec<(&str, &[u8])> = vec![ + ("k1", b"v1".as_slice()), + ("k2", b"v2".as_slice()), + ("k3", b"v3".as_slice()), + ]; + + backend.batch_put(&items).unwrap(); + + let results = backend.batch_get(&["k1", "k2", "k3", "k4"]).unwrap(); + assert_eq!(results.len(), 4); + assert!(results[0].is_some()); + assert!(results[3].is_none()); + } + + #[test] + fn test_file_backend_key_sanitization() { + let temp = TempDir::new().unwrap(); + let backend = FileBackend::new(temp.path()).unwrap(); + + // Keys with special characters should be sanitized + backend.put("../etc/passwd", b"malicious").unwrap(); + backend.put("path/to/file", b"nested").unwrap(); + + // Both should be stored safely + assert!(backend.exists("../etc/passwd").unwrap()); + assert!(backend.exists("path/to/file").unwrap()); + } +} diff --git a/src/storage/backend/memory.rs b/src/storage/backend/memory.rs new file mode 100644 index 00000000..013c87f9 --- /dev/null +++ b/src/storage/backend/memory.rs @@ -0,0 +1,173 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! In-memory storage backend (for testing). + +use std::collections::HashMap; +use std::sync::RwLock; + +use super::StorageBackend; +use crate::error::Result; + +/// In-memory storage backend. +/// +/// Stores all data in a `HashMap`. Useful for testing and scenarios +/// where persistence is not required. +/// +/// # Thread Safety +/// +/// Uses `RwLock` for thread-safe access to the internal map. +#[derive(Debug, Default)] +pub struct MemoryBackend { + /// Internal storage. + data: RwLock>>, +} + +impl MemoryBackend { + /// Create a new in-memory backend. + pub fn new() -> Self { + Self::default() + } + + /// Create a new in-memory backend with pre-seeded data. 
+    pub fn with_data(data: HashMap<String, Vec<u8>>) -> Self {
+        Self {
+            data: RwLock::new(data),
+        }
+    }
+}
+
+impl StorageBackend for MemoryBackend {
+    fn get(&self, key: &str) -> Result<Option<Vec<u8>>> {
+        let data = self.data.read().map_err(|_| {
+            crate::Error::Cache("Memory backend lock poisoned".to_string())
+        })?;
+        Ok(data.get(key).cloned())
+    }
+
+    fn put(&self, key: &str, value: &[u8]) -> Result<()> {
+        let mut data = self.data.write().map_err(|_| {
+            crate::Error::Cache("Memory backend lock poisoned".to_string())
+        })?;
+        data.insert(key.to_string(), value.to_vec());
+        Ok(())
+    }
+
+    fn delete(&self, key: &str) -> Result<bool> {
+        let mut data = self.data.write().map_err(|_| {
+            crate::Error::Cache("Memory backend lock poisoned".to_string())
+        })?;
+        Ok(data.remove(key).is_some())
+    }
+
+    fn exists(&self, key: &str) -> Result<bool> {
+        let data = self.data.read().map_err(|_| {
+            crate::Error::Cache("Memory backend lock poisoned".to_string())
+        })?;
+        Ok(data.contains_key(key))
+    }
+
+    fn keys(&self) -> Result<Vec<String>> {
+        let data = self.data.read().map_err(|_| {
+            crate::Error::Cache("Memory backend lock poisoned".to_string())
+        })?;
+        Ok(data.keys().cloned().collect())
+    }
+
+    fn len(&self) -> Result<usize> {
+        let data = self.data.read().map_err(|_| {
+            crate::Error::Cache("Memory backend lock poisoned".to_string())
+        })?;
+        Ok(data.len())
+    }
+
+    fn clear(&self) -> Result<()> {
+        let mut data = self.data.write().map_err(|_| {
+            crate::Error::Cache("Memory backend lock poisoned".to_string())
+        })?;
+        data.clear();
+        Ok(())
+    }
+
+    fn batch_put(&self, items: &[(&str, &[u8])]) -> Result<()> {
+        let mut data = self.data.write().map_err(|_| {
+            crate::Error::Cache("Memory backend lock poisoned".to_string())
+        })?;
+        for (key, value) in items {
+            data.insert(key.to_string(), value.to_vec());
+        }
+        Ok(())
+    }
+
+    fn backend_name(&self) -> &'static str {
+        "memory"
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_memory_backend_basic() {
+        let backend = MemoryBackend::new();
+
+        // Put and get
+        backend.put("key1", b"value1").unwrap();
+        let value = backend.get("key1").unwrap();
+        assert_eq!(value, Some(b"value1".to_vec()));
+
+        // Non-existent key
+        let missing = backend.get("missing").unwrap();
+        assert!(missing.is_none());
+    }
+
+    #[test]
+    fn test_memory_backend_delete() {
+        let backend = MemoryBackend::new();
+
+        backend.put("key1", b"value1").unwrap();
+        assert!(backend.exists("key1").unwrap());
+
+        let deleted = backend.delete("key1").unwrap();
+        assert!(deleted);
+        assert!(!backend.exists("key1").unwrap());
+
+        // Delete non-existent
+        let not_deleted = backend.delete("missing").unwrap();
+        assert!(!not_deleted);
+    }
+
+    #[test]
+    fn test_memory_backend_keys() {
+        let backend = MemoryBackend::new();
+
+        backend.put("key1", b"v1").unwrap();
+        backend.put("key2", b"v2").unwrap();
+        backend.put("key3", b"v3").unwrap();
+
+        let keys = backend.keys().unwrap();
+        assert_eq!(keys.len(), 3);
+    }
+
+    #[test]
+    fn test_memory_backend_clear() {
+        let backend = MemoryBackend::new();
+
+        backend.put("key1", b"v1").unwrap();
+        backend.put("key2", b"v2").unwrap();
+
+        backend.clear().unwrap();
+        assert!(backend.is_empty().unwrap());
+    }
+
+    #[test]
+    fn test_memory_backend_with_data() {
+        let mut initial = HashMap::new();
+        initial.insert("k1".to_string(), b"v1".to_vec());
+        initial.insert("k2".to_string(), b"v2".to_vec());
+
+        let backend = MemoryBackend::with_data(initial);
+        assert_eq!(backend.len().unwrap(), 2);
+    }
+}
diff --git a/src/storage/backend/mod.rs b/src/storage/backend/mod.rs
new file mode 100644
index 00000000..b8d7ccef
--- /dev/null
+++ b/src/storage/backend/mod.rs
@@ -0,0 +1,35 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Storage backend abstraction.
+//!
+//! This module provides a trait-based abstraction over storage backends,
+//! allowing the workspace to work with various storage systems:
+//!
+//! - **FileBackend**: File system storage (default)
+//! - **MemoryBackend**: In-memory storage (for testing)
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! use vectorless::storage::backend::{StorageBackend, FileBackend};
+//!
+//! let backend = FileBackend::new("./workspace")?;
+//!
+//! // Store data
+//! backend.put("doc-1", b"document data")?;
+//!
+//! // Retrieve data
+//! let data = backend.get("doc-1")?;
+//!
+//! // List all keys
+//! let keys = backend.keys()?;
+//! ```
+
+mod file;
+mod memory;
+mod trait_def;
+
+pub use file::FileBackend;
+pub use memory::MemoryBackend;
+pub use trait_def::StorageBackend;
diff --git a/src/storage/backend/trait_def.rs b/src/storage/backend/trait_def.rs
new file mode 100644
index 00000000..782bdac0
--- /dev/null
+++ b/src/storage/backend/trait_def.rs
@@ -0,0 +1,113 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Storage backend trait definition.
+
+use std::fmt::Debug;
+
+use crate::error::Result;
+
+/// Storage backend trait for abstracting different storage systems.
+///
+/// This trait provides a simple key-value interface for document storage.
+/// Implementations can use different underlying storage systems:
+///
+/// - File system
+/// - In-memory (for testing)
+/// - Database (SQLite, RocksDB, etc.)
+/// - Cloud storage (S3, etc.)
+///
+/// # Thread Safety
+///
+/// All implementations must be `Send + Sync` to support concurrent access.
+pub trait StorageBackend: Debug + Send + Sync {
+    /// Get a value by key.
+    ///
+    /// Returns `None` if the key doesn't exist.
+    fn get(&self, key: &str) -> Result<Option<Vec<u8>>>;
+
+    /// Store a value with the given key.
+    ///
+    /// Overwrites any existing value.
+    fn put(&self, key: &str, value: &[u8]) -> Result<()>;
+
+    /// Delete a value by key.
+    ///
+    /// Returns `true` if the value was deleted, `false` if it didn't exist.
+    fn delete(&self, key: &str) -> Result<bool>;
+
+    /// Check if a key exists.
+    fn exists(&self, key: &str) -> Result<bool>;
+
+    /// List all keys in the storage.
+    fn keys(&self) -> Result<Vec<String>>;
+
+    /// Get the number of entries in storage.
+    fn len(&self) -> Result<usize>;
+
+    /// Check if the storage is empty.
+    fn is_empty(&self) -> Result<bool> {
+        Ok(self.len()? == 0)
+    }
+
+    /// Clear all entries from storage.
+    fn clear(&self) -> Result<()>;
+
+    // ========================================================================
+    // Batch operations (optional, default implementations)
+    // ========================================================================
+
+    /// Get multiple values by keys.
+    ///
+    /// Returns a vector of options, one for each key.
+    fn batch_get(&self, keys: &[&str]) -> Result<Vec<Option<Vec<u8>>>> {
+        keys.iter().map(|k| self.get(k)).collect()
+    }
+
+    /// Store multiple key-value pairs.
+    ///
+    /// The default implementation calls `put` for each item.
+    fn batch_put(&self, items: &[(&str, &[u8])]) -> Result<()> {
+        for (key, value) in items {
+            self.put(key, value)?;
+        }
+        Ok(())
+    }
+
+    /// Delete multiple keys.
+    ///
+    /// Returns the number of keys that were actually deleted.
+    fn batch_delete(&self, keys: &[&str]) -> Result<usize> {
+        let mut count = 0;
+        for key in keys {
+            if self.delete(key)? {
+                count += 1;
+            }
+        }
+        Ok(count)
+    }
+
+    // ========================================================================
+    // Metadata operations
+    // ========================================================================
+
+    /// Get storage backend name.
+    fn backend_name(&self) -> &'static str;
+
+    /// Get storage statistics.
+    fn stats(&self) -> StorageStats {
+        StorageStats {
+            backend: self.backend_name().to_string(),
+            entries: self.len().unwrap_or(0),
+        }
+    }
+}
+
+/// Storage statistics.
+#[derive(Debug, Clone)]
+pub struct StorageStats {
+    /// Backend name.
+    pub backend: String,
+    /// Number of entries.
+    pub entries: usize,
+}
diff --git a/src/storage/codec.rs b/src/storage/codec.rs
new file mode 100644
index 00000000..3fcfd055
--- /dev/null
+++ b/src/storage/codec.rs
@@ -0,0 +1,241 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Codec abstraction for compression and decompression.
+//!
+//! This module provides a codec trait for compressing/decompressing data,
+//! with implementations for:
+//!
+//! - **Identity**: No compression (pass-through)
+//! - **Gzip**: Standard gzip compression
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! use vectorless::storage::codec::{Codec, GzipCodec};
+//!
+//! let codec = GzipCodec::new(6);
+//!
+//! let data = b"some data to compress";
+//! let compressed = codec.encode(data)?;
+//! let decompressed = codec.decode(&compressed)?;
+//!
+//! assert_eq!(data.as_slice(), decompressed.as_slice());
+//! ```
+
+use std::fmt::Debug;
+use std::io::{Read, Write};
+
+use flate2::read::GzDecoder;
+use flate2::write::GzEncoder;
+use flate2::Compression;
+
+use crate::error::Result;
+use crate::Error;
+
+/// Codec trait for compression/decompression.
+pub trait Codec: Debug + Send + Sync {
+    /// Encode (compress) data.
+    fn encode(&self, data: &[u8]) -> Result<Vec<u8>>;
+
+    /// Decode (decompress) data.
+    fn decode(&self, data: &[u8]) -> Result<Vec<u8>>;
+
+    /// Get the codec name.
+    fn name(&self) -> &'static str;
+}
+
+/// Identity codec (no compression).
+///
+/// Passes data through unchanged.
+#[derive(Debug, Clone, Copy, Default)]
+pub struct IdentityCodec;
+
+impl IdentityCodec {
+    /// Create a new identity codec.
+    pub fn new() -> Self {
+        Self::default()
+    }
+}
+
+impl Codec for IdentityCodec {
+    fn encode(&self, data: &[u8]) -> Result<Vec<u8>> {
+        Ok(data.to_vec())
+    }
+
+    fn decode(&self, data: &[u8]) -> Result<Vec<u8>> {
+        Ok(data.to_vec())
+    }
+
+    fn name(&self) -> &'static str {
+        "identity"
+    }
+}
+
+/// Gzip codec.
+///
+/// Uses the `flate2` crate for gzip compression.
+#[derive(Debug, Clone)]
+pub struct GzipCodec {
+    /// Compression level (0-9).
+    level: u32,
+}
+
+impl GzipCodec {
+    /// Create a new gzip codec with the given compression level.
+    ///
+    /// Level is clamped to 0-9:
+    /// - 0: No compression
+    /// - 1: Fastest compression
+    /// - 6: Default (good balance)
+    /// - 9: Best compression (slowest)
+    pub fn new(level: u32) -> Self {
+        Self {
+            level: level.clamp(0, 9),
+        }
+    }
+
+    /// Create a codec with fast compression (level 1).
+    pub fn fast() -> Self {
+        Self::new(1)
+    }
+
+    /// Create a codec with default compression (level 6).
+    pub fn default_level() -> Self {
+        Self::new(6)
+    }
+
+    /// Create a codec with best compression (level 9).
+    pub fn best() -> Self {
+        Self::new(9)
+    }
+}
+
+impl Default for GzipCodec {
+    fn default() -> Self {
+        Self::default_level()
+    }
+}
+
+impl Codec for GzipCodec {
+    fn encode(&self, data: &[u8]) -> Result<Vec<u8>> {
+        let mut encoder = GzEncoder::new(Vec::new(), Compression::new(self.level));
+        encoder.write_all(data).map_err(|e| Error::Parse(format!("Gzip encode error: {}", e)))?;
+        encoder.finish().map_err(|e| Error::Parse(format!("Gzip finish error: {}", e)))
+    }
+
+    fn decode(&self, data: &[u8]) -> Result<Vec<u8>> {
+        let mut decoder = GzDecoder::new(data);
+        let mut decoded = Vec::new();
+        decoder
+            .read_to_end(&mut decoded)
+            .map_err(|e| Error::Parse(format!("Gzip decode error: {}", e)))?;
+        Ok(decoded)
+    }
+
+    fn name(&self) -> &'static str {
+        "gzip"
+    }
+}
+
+/// Create a codec from configuration.
+pub fn codec_from_config(
+    enabled: bool,
+    algorithm: crate::config::CompressionAlgorithm,
+    level: u32,
+) -> Box<dyn Codec> {
+    if !enabled {
+        return Box::new(IdentityCodec::new());
+    }
+
+    match algorithm {
+        crate::config::CompressionAlgorithm::Gzip => Box::new(GzipCodec::new(level)),
+        crate::config::CompressionAlgorithm::Zstd => {
+            // Zstd not implemented yet, fallback to gzip
+            // TODO: Add zstd support when needed
+            Box::new(GzipCodec::new(level))
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_identity_codec() {
+        let codec = IdentityCodec::new();
+        let data = b"test data";
+
+        let encoded = codec.encode(data).unwrap();
+        let decoded = codec.decode(&encoded).unwrap();
+
+        assert_eq!(data.as_slice(), decoded.as_slice());
+        assert_eq!(codec.name(), "identity");
+    }
+
+    #[test]
+    fn test_gzip_codec_basic() {
+        let codec = GzipCodec::default();
+        let data = b"Hello, World! This is a test string for compression.";
+
+        let encoded = codec.encode(data).unwrap();
+        let decoded = codec.decode(&encoded).unwrap();
+
+        assert_eq!(data.as_slice(), decoded.as_slice());
+        assert_eq!(codec.name(), "gzip");
+
+        // Compressed should be smaller for repetitive data
+        // Note: For very small data, gzip overhead might make it larger
+        let repetitive = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
+        let compressed = codec.encode(repetitive).unwrap();
+        assert!(compressed.len() < repetitive.len());
+    }
+
+    #[test]
+    fn test_gzip_codec_levels() {
+        let data = b"This is test data that should compress well. ".repeat(100);
+        let data = data.into_iter().map(|b| b as u8).collect::<Vec<u8>>();
+
+        let codec_fast = GzipCodec::fast();
+        let codec_best = GzipCodec::best();
+
+        let compressed_fast = codec_fast.encode(&data).unwrap();
+        let compressed_best = codec_best.encode(&data).unwrap();
+
+        // Both should decompress to the same data
+        assert_eq!(codec_fast.decode(&compressed_fast).unwrap(), data);
+        assert_eq!(codec_best.decode(&compressed_best).unwrap(), data);
+
+        // Best compression should be smaller or equal
+        assert!(compressed_best.len() <= compressed_fast.len());
+    }
+
+    #[test]
+    fn test_gzip_empty_data() {
+        let codec = GzipCodec::default();
+        let data = b"";
+
+        let encoded = codec.encode(data).unwrap();
+        let decoded = codec.decode(&encoded).unwrap();
+
+        assert!(decoded.is_empty());
+    }
+
+    #[test]
+    fn test_codec_from_config() {
+        use crate::config::CompressionAlgorithm;
+
+        // Disabled compression
+        let codec = codec_from_config(false, CompressionAlgorithm::Gzip, 6);
+        let data = b"test";
+        let encoded = codec.encode(data).unwrap();
+        assert_eq!(encoded, data);
+
+        // Enabled compression
+        let codec = codec_from_config(true, CompressionAlgorithm::Gzip, 6);
+        let encoded = codec.encode(data).unwrap();
+        let decoded = codec.decode(&encoded).unwrap();
+        assert_eq!(decoded, data);
+    }
+}
diff --git a/src/storage/migration.rs b/src/storage/migration.rs
new file mode 100644
index 00000000..b73c0f6e
--- /dev/null
+++ b/src/storage/migration.rs
@@ -0,0 +1,383 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Version migration system for persisted data.
+//!
+//! This module provides a framework for migrating data between format versions.
+//! When the data format changes, migrations can automatically upgrade older data.
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! use vectorless::storage::migration::{Migration, Migrator, MigrationContext};
+//!
+//! // Define a migration from v1 to v2
+//! struct V1ToV2;
+//!
+//! impl Migration for V1ToV2 {
+//!     fn from_version(&self) -> u32 { 1 }
+//!     fn to_version(&self) -> u32 { 2 }
+//!     fn migrate(&self, data: &[u8], ctx: &MigrationContext) -> Result<Vec<u8>> {
+//!         // Transform data from v1 to v2 format
+//!         // ...
+//!     }
+//! }
+//!
+//! // Register migrations
+//! let mut migrator = Migrator::new();
+//! migrator.register(Box::new(V1ToV2));
+//!
+//! // Migrate data
+//! let migrated = migrator.migrate(data, 1, 2)?;
+//! ```
+
+use std::collections::HashMap;
+
+use tracing::{debug, info, warn};
+
+use crate::error::Result;
+use crate::Error;
+
+/// Current data format version.
+pub const CURRENT_VERSION: u32 = 1;
+
+/// Migration context providing additional information for migrations.
+#[derive(Debug, Clone)]
+pub struct MigrationContext {
+    /// Source version.
+    pub from_version: u32,
+    /// Target version.
+    pub to_version: u32,
+    /// Additional metadata.
+    pub metadata: HashMap<String, String>,
+}
+
+impl MigrationContext {
+    /// Create a new migration context.
+    pub fn new(from_version: u32, to_version: u32) -> Self {
+        Self {
+            from_version,
+            to_version,
+            metadata: HashMap::new(),
+        }
+    }
+
+    /// Add metadata.
+    pub fn with_metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
+        self.metadata.insert(key.into(), value.into());
+        self
+    }
+}
+
+/// Trait for data migrations.
+///
+/// A migration transforms data from one version to the next.
+pub trait Migration: Send + Sync {
+    /// Get the source version this migration applies to.
+    fn from_version(&self) -> u32;
+
+    /// Get the target version this migration produces.
+    fn to_version(&self) -> u32;
+
+    /// Get a human-readable description of this migration.
+    fn description(&self) -> &str;
+
+    /// Perform the migration.
+    ///
+    /// # Arguments
+    ///
+    /// * `data` - The data to migrate
+    /// * `ctx` - Migration context with additional information
+    ///
+    /// # Returns
+    ///
+    /// The migrated data in the new format.
+    fn migrate(&self, data: &[u8], ctx: &MigrationContext) -> Result<Vec<u8>>;
+
+    /// Check if this migration can be applied to the given data.
+    ///
+    /// Default implementation always returns true.
+    fn can_migrate(&self, _data: &[u8]) -> bool {
+        true
+    }
+}
+
+/// Migration registry and executor.
+pub struct Migrator {
+    /// Registered migrations, keyed by (from_version, to_version).
+    migrations: HashMap<(u32, u32), Box<dyn Migration>>,
+}
+
+impl Default for Migrator {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl std::fmt::Debug for Migrator {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Migrator")
+            .field("migration_count", &self.migrations.len())
+            .finish()
+    }
+}
+
+impl Migrator {
+    /// Create a new migrator.
+    pub fn new() -> Self {
+        Self {
+            migrations: HashMap::new(),
+        }
+    }
+
+    /// Register a migration.
+    pub fn register(&mut self, migration: Box<dyn Migration>) {
+        let key = (migration.from_version(), migration.to_version());
+        debug!(
+            "Registering migration: v{} -> v{}",
+            key.0, key.1
+        );
+        self.migrations.insert(key, migration);
+    }
+
+    /// Check if a migration path exists between two versions.
+    pub fn can_migrate(&self, from_version: u32, to_version: u32) -> bool {
+        if from_version == to_version {
+            return true;
+        }
+
+        // Check if we have a direct migration
+        if self.migrations.contains_key(&(from_version, to_version)) {
+            return true;
+        }
+
+        // Check if we have a path through intermediate versions
+        self.find_migration_path(from_version, to_version).is_some()
+    }
+
+    /// Find a migration path between two versions.
+    ///
+    /// Returns a sequence of version numbers to migrate through.
+    fn find_migration_path(&self, from_version: u32, to_version: u32) -> Option<Vec<u32>> {
+        if from_version == to_version {
+            return Some(vec![from_version]);
+        }
+
+        // Simple BFS to find a path
+        use std::collections::{HashSet, VecDeque};
+
+        let mut visited: HashSet<u32> = HashSet::new();
+        let mut queue: VecDeque<u32> = VecDeque::new();
+        let mut parent: HashMap<u32, u32> = HashMap::new();
+
+        queue.push_back(from_version);
+        visited.insert(from_version);
+
+        while let Some(current) = queue.pop_front() {
+            // Find all migrations from current version
+            for ((from, to), _) in &self.migrations {
+                if *from == current && !visited.contains(to) {
+                    visited.insert(*to);
+                    parent.insert(*to, current);
+                    queue.push_back(*to);
+
+                    if *to == to_version {
+                        // Reconstruct path
+                        let mut path = vec![to_version];
+                        let mut v = to_version;
+                        while let Some(&p) = parent.get(&v) {
+                            if p == from_version {
+                                path.push(p);
+                                break;
+                            }
+                            path.push(p);
+                            v = p;
+                        }
+                        path.reverse();
+                        return Some(path);
+                    }
+                }
+            }
+        }
+
+        None
+    }
+
+    /// Migrate data from one version to another.
+    ///
+    /// If a direct migration exists, it will be used.
+    /// Otherwise, the migrator will try to find a path through intermediate versions.
+    pub fn migrate(&self, data: &[u8], from_version: u32, to_version: u32) -> Result<Vec<u8>> {
+        if from_version == to_version {
+            return Ok(data.to_vec());
+        }
+
+        // Find migration path
+        let path = self.find_migration_path(from_version, to_version)
+            .ok_or_else(|| Error::VersionMismatch(format!(
+                "No migration path from v{} to v{}",
+                from_version, to_version
+            )))?;
+
+        if path.len() < 2 {
+            return Ok(data.to_vec());
+        }
+
+        info!(
+            "Migrating data from v{} to v{} via path: {:?}",
+            from_version, to_version, path
+        );
+
+        let mut current_data = data.to_vec();
+        let mut current_version = from_version;
+
+        for next_version in path.iter().skip(1) {
+            let key = (current_version, *next_version);
+            let migration = self.migrations.get(&key)
+                .ok_or_else(|| Error::VersionMismatch(format!(
+                    "Missing migration from v{} to v{}",
+                    current_version, next_version
+                )))?;
+
+            let ctx = MigrationContext::new(current_version, *next_version);
+
+            debug!(
+                "Applying migration: v{} -> v{} ({})",
+                current_version, next_version, migration.description()
+            );
+
+            current_data = migration.migrate(&current_data, &ctx)?;
+            current_version = *next_version;
+        }
+
+        Ok(current_data)
+    }
+
+    /// Get the list of registered migrations.
+    pub fn list_migrations(&self) -> Vec<(u32, u32, &str)> {
+        self.migrations
+            .values()
+            .map(|m| (m.from_version(), m.to_version(), m.description()))
+            .collect()
+    }
+}
+
+// ============================================================================
+// Built-in migrations
+// ============================================================================
+
+/// Placeholder migration for future versions.
+/// This is a template that can be copied for actual migrations.
+#[derive(Debug)]
+pub struct PlaceholderMigration {
+    from: u32,
+    to: u32,
+}
+
+impl PlaceholderMigration {
+    /// Create a new placeholder migration.
+    pub fn new(from: u32, to: u32) -> Self {
+        Self { from, to }
+    }
+}
+
+impl Migration for PlaceholderMigration {
+    fn from_version(&self) -> u32 {
+        self.from
+    }
+
+    fn to_version(&self) -> u32 {
+        self.to
+    }
+
+    fn description(&self) -> &str {
+        "Placeholder migration (no-op)"
+    }
+
+    fn migrate(&self, data: &[u8], _ctx: &MigrationContext) -> Result<Vec<u8>> {
+        warn!(
+            "Using placeholder migration from v{} to v{} - no changes made",
+            self.from, self.to
+        );
+        Ok(data.to_vec())
+    }
+}
+
+/// Create a default migrator with all built-in migrations registered.
+pub fn default_migrator() -> Migrator {
+    Migrator::new()
+    // Add migrations as needed when versions change
+    // migrator.register(Box::new(V1ToV2::new()));
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_migration_context() {
+        let ctx = MigrationContext::new(1, 2)
+            .with_metadata("key", "value");
+
+        assert_eq!(ctx.from_version, 1);
+        assert_eq!(ctx.to_version, 2);
+        assert_eq!(ctx.metadata.get("key"), Some(&"value".to_string()));
+    }
+
+    #[test]
+    fn test_migrator_no_migration_needed() {
+        let migrator = Migrator::new();
+        let data = b"test data";
+
+        let result = migrator.migrate(data, 1, 1).unwrap();
+        assert_eq!(result, data);
+    }
+
+    #[test]
+    fn test_migrator_no_path() {
+        let migrator = Migrator::new();
+        let data = b"test data";
+
+        let result = migrator.migrate(data, 1, 2);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_migrator_with_placeholder() {
+        let mut migrator = Migrator::new();
+        migrator.register(Box::new(PlaceholderMigration::new(1, 2)));
+
+        assert!(migrator.can_migrate(1, 2));
+        assert!(!migrator.can_migrate(1, 3));
+
+        let data = b"test data";
+        let result = migrator.migrate(data, 1, 2).unwrap();
+        assert_eq!(result, data);
+    }
+
+    #[test]
+    fn test_migrator_path_finding() {
+        let mut migrator = Migrator::new();
+        migrator.register(Box::new(PlaceholderMigration::new(1, 2)));
+        migrator.register(Box::new(PlaceholderMigration::new(2, 3)));
+
+        assert!(migrator.can_migrate(1, 3));
+
+        let path = migrator.find_migration_path(1, 3).unwrap();
+        assert_eq!(path, vec![1, 2, 3]);
+
+        let data = b"test data";
+        let result = migrator.migrate(data, 1, 3).unwrap();
+        assert_eq!(result, data);
+    }
+
+    #[test]
+    fn test_list_migrations() {
+        let mut migrator = Migrator::new();
+        migrator.register(Box::new(PlaceholderMigration::new(1, 2)));
+        migrator.register(Box::new(PlaceholderMigration::new(2, 3)));
+
+        let list = migrator.list_migrations();
+        assert_eq!(list.len(), 2);
+    }
+}
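`PlaceholderMigration` is deliberately a no-op. To make the trait concrete, here is a sketch (not part of the patch) of what a realistic migration might look like, assuming a hypothetical v1 JSON payload whose `name` field is renamed to `title` in v2; the struct and field names are illustrative only:

```rust
use serde_json::Value;
use vectorless::storage::{Migration, MigrationContext};
use vectorless::{Error, Result};

/// Hypothetical migration: v1 payloads store `name`, v2 renames it to `title`.
#[derive(Debug)]
struct RenameNameToTitle;

impl Migration for RenameNameToTitle {
    fn from_version(&self) -> u32 {
        1
    }

    fn to_version(&self) -> u32 {
        2
    }

    fn description(&self) -> &str {
        "Rename `name` to `title` in the persisted payload"
    }

    fn migrate(&self, data: &[u8], _ctx: &MigrationContext) -> Result<Vec<u8>> {
        let mut value: Value = serde_json::from_slice(data)
            .map_err(|e| Error::Parse(format!("v1 payload is not valid JSON: {}", e)))?;

        // Rename the field in place; leave all other fields untouched.
        if let Some(obj) = value.as_object_mut() {
            if let Some(name) = obj.remove("name") {
                obj.insert("title".to_string(), name);
            }
        }

        serde_json::to_vec(&value).map_err(|e| Error::Serialization(e.to_string()))
    }
}
```

Registered with `migrator.register(Box::new(RenameNameToTitle))`, such a step participates in the same BFS path-finding as any other migration.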
diff --git a/src/storage/mod.rs b/src/storage/mod.rs
index 0fba85ed..f8d97b1f 100644
--- a/src/storage/mod.rs
+++ b/src/storage/mod.rs
@@ -8,6 +8,7 @@
 //! - **Persistence** — Save/load document trees and metadata with atomic writes
 //! - **Cache** — LRU cache for loaded documents
 //! - **Lock** — File locking for multi-process safety
+//! - **Backend** — Storage backend abstraction (file, memory, etc.)
 //!
 //! # Example
 //!
@@ -28,18 +29,28 @@
 //! let loaded = workspace.load("doc-1")?.unwrap();
 //! ```
 
+pub mod async_workspace;
+pub mod backend;
 pub mod cache;
+pub mod codec;
 pub mod lock;
+pub mod migration;
 mod persistence;
 mod workspace;
 
 // Re-export main types
+pub use backend::{FileBackend, MemoryBackend, StorageBackend};
 pub use cache::DocumentCache;
+pub use codec::{Codec, GzipCodec, IdentityCodec, codec_from_config};
+pub use migration::{Migration, MigrationContext, Migrator, CURRENT_VERSION};
 pub use lock::{FileLock, ScopedLock};
 pub use persistence::{
     DocumentMeta, PageContent, PersistedDocument,
-    load_document, load_document_with_options, load_index, load_index_with_options,
-    save_document, save_document_with_options, save_index, save_index_with_options,
+    load_document, load_document_from_bytes, load_document_with_options,
+    load_index, load_index_from_bytes, load_index_with_options,
+    save_document, save_document_to_bytes, save_document_with_options,
+    save_index, save_index_to_bytes, save_index_with_options,
     PersistenceOptions,
 };
+pub use async_workspace::{AsyncDocumentMetaEntry, AsyncWorkspace, AsyncWorkspaceOptions};
 pub use workspace::{DocumentMetaEntry, Workspace, WorkspaceOptions};
diff --git a/src/storage/persistence.rs b/src/storage/persistence.rs
index 2095bcbd..245f33a6 100644
--- a/src/storage/persistence.rs
+++ b/src/storage/persistence.rs
@@ -415,6 +415,131 @@ pub fn load_index_with_options(
     Ok(wrapper.payload)
 }
 
+// ============================================================================
+// Bytes-based serialization (for StorageBackend integration)
+// ============================================================================
+
+/// Serialize a document to bytes (JSON with checksum wrapper).
+///
+/// This is useful for storage backends that work with byte arrays.
+pub fn save_document_to_bytes(doc: &PersistedDocument) -> Result<Vec<u8>> {
+    // Serialize the payload first
+    let payload_bytes = serde_json::to_vec(doc)
+        .map_err(|e| Error::Serialization(e.to_string()))?;
+
+    // Calculate checksum
+    let checksum = calculate_checksum(&payload_bytes);
+
+    // Create wrapper
+    let wrapper = PersistedWrapper {
+        version: FORMAT_VERSION,
+        checksum,
+        payload: doc.clone(),
+    };
+
+    // Serialize wrapper
+    serde_json::to_vec(&wrapper)
+        .map_err(|e| Error::Serialization(e.to_string()))
+}
+
+/// Deserialize a document from bytes.
+///
+/// Verifies checksum by default.
+pub fn load_document_from_bytes(data: &[u8]) -> Result<PersistedDocument> {
+    load_document_from_bytes_with_options(data, true)
+}
+
+/// Deserialize a document from bytes with optional checksum verification.
+pub fn load_document_from_bytes_with_options(
+    data: &[u8],
+    verify_checksum: bool,
+) -> Result<PersistedDocument> {
+    // Parse wrapper
+    let wrapper: PersistedWrapper<PersistedDocument> = serde_json::from_slice(data)
+        .map_err(|e| Error::Parse(format!("Failed to parse document: {}", e)))?;
+
+    // Check version
+    if wrapper.version != FORMAT_VERSION {
+        return Err(Error::VersionMismatch(format!(
+            "Expected version {}, got {}",
+            FORMAT_VERSION, wrapper.version
+        )));
+    }
+
+    // Verify checksum if enabled
+    if verify_checksum {
+        let payload_bytes = serde_json::to_vec(&wrapper.payload)
+            .map_err(|e| Error::Serialization(e.to_string()))?;
+
+        let expected_checksum = calculate_checksum(&payload_bytes);
+
+        if wrapper.checksum != expected_checksum {
+            return Err(Error::ChecksumMismatch(format!(
+                "Expected {}, got {}",
+                expected_checksum, wrapper.checksum
+            )));
+        }
+    }
+
+    Ok(wrapper.payload)
+}
+
+/// Serialize an index to bytes.
+pub fn save_index_to_bytes(entries: &[DocumentMeta]) -> Result<Vec<u8>> {
+    let payload_bytes = serde_json::to_vec(entries)
+        .map_err(|e| Error::Serialization(e.to_string()))?;
+
+    let checksum = calculate_checksum(&payload_bytes);
+
+    let wrapper = PersistedWrapper {
+        version: FORMAT_VERSION,
+        checksum,
+        payload: entries.to_vec(),
+    };
+
+    serde_json::to_vec(&wrapper)
+        .map_err(|e| Error::Serialization(e.to_string()))
+}
+
+/// Deserialize an index from bytes.
+pub fn load_index_from_bytes(data: &[u8]) -> Result<Vec<DocumentMeta>> {
+    load_index_from_bytes_with_options(data, true)
+}
+
+/// Deserialize an index from bytes with optional checksum verification.
+pub fn load_index_from_bytes_with_options(
+    data: &[u8],
+    verify_checksum: bool,
+) -> Result<Vec<DocumentMeta>> {
+    let wrapper: PersistedWrapper<Vec<DocumentMeta>> = serde_json::from_slice(data)
+        .map_err(|e| Error::Parse(format!("Failed to parse index: {}", e)))?;
+
+    // Check version
+    if wrapper.version != FORMAT_VERSION {
+        return Err(Error::VersionMismatch(format!(
+            "Expected version {}, got {}",
+            FORMAT_VERSION, wrapper.version
+        )));
+    }
+
+    // Verify checksum if enabled
+    if verify_checksum {
+        let payload_bytes = serde_json::to_vec(&wrapper.payload)
+            .map_err(|e| Error::Serialization(e.to_string()))?;
+
+        let expected_checksum = calculate_checksum(&payload_bytes);
+
+        if wrapper.checksum != expected_checksum {
+            return Err(Error::ChecksumMismatch(format!(
+                "Expected {}, got {}",
+                expected_checksum, wrapper.checksum
+            )));
+        }
+    }
+
+    Ok(wrapper.payload)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
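These byte-level helpers are the seam where codecs and backends compose. A sketch of the assumed wiring (the helper function names are illustrative; only the imported items come from this patch):

```rust
use vectorless::storage::{
    load_document_from_bytes, save_document_to_bytes, Codec, PersistedDocument, StorageBackend,
};
use vectorless::Result;

/// Persist a document through a codec into any backend:
/// wrapper (version + checksum) first, then compression.
fn store_compressed(
    backend: &dyn StorageBackend,
    codec: &dyn Codec,
    key: &str,
    doc: &PersistedDocument,
) -> Result<()> {
    let bytes = save_document_to_bytes(doc)?;
    backend.put(key, &codec.encode(&bytes)?)
}

/// Load it back, decompressing before the checksum is verified.
fn load_compressed(
    backend: &dyn StorageBackend,
    codec: &dyn Codec,
    key: &str,
) -> Result<Option<PersistedDocument>> {
    match backend.get(key)? {
        Some(bytes) => Ok(Some(load_document_from_bytes(&codec.decode(&bytes)?)?)),
        None => Ok(None),
    }
}
```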
diff --git a/src/storage/workspace.rs b/src/storage/workspace.rs
index 63ad2a22..c13e99ec 100644
--- a/src/storage/workspace.rs
+++ b/src/storage/workspace.rs
@@ -3,46 +3,45 @@
 //! Workspace management for document collections.
 //!
-//! A workspace is a directory containing indexed documents and metadata.
+//! A workspace manages indexed documents using a storage backend abstraction.
 //! Uses lazy-loading pattern with LRU cache:
 //! - Metadata index always in memory
 //! - Full documents loaded on demand with LRU eviction
 //!
-//! # Structure
+//! # Backends
 //!
-//! ```text
-//! workspace/
-//! ├── _meta.json        # Lightweight index: all document metadata
-//! ├── {doc_id_1}.json   # Document 1 full data (tree + pages)
-//! ├── {doc_id_2}.json   # Document 2 full data
-//! └── ...
-//! ```
+//! The workspace supports different storage backends:
+//! - **FileBackend**: File system storage (default)
+//! - **MemoryBackend**: In-memory storage (for testing)
 //!
-//! # Thread Safety
+//! # Example
 //!
-//! The workspace uses interior mutability for the LRU cache:
-//! - Read operations (`get_meta`, `contains`, `list_documents`) only need `&self`
-//! - Cache updates happen internally via `Mutex`
+//! ```rust,ignore
+//! use vectorless::storage::{Workspace, FileBackend};
 //!
-//! # File Locking
+//! // Default file-based workspace
+//! let mut workspace = Workspace::new("./my_workspace")?;
 //!
-//! When enabled (default), the workspace uses an exclusive file lock
-//! to prevent concurrent access from multiple processes.
+//! // Or with custom backend
+//! let backend = std::sync::Arc::new(FileBackend::new("./my_workspace")?);
+//! let mut workspace = Workspace::with_backend(backend)?;
+//! ```
 
 use std::collections::HashMap;
-use std::fs;
 use std::path::{Path, PathBuf};
+use std::sync::Arc;
 
 use serde::{Deserialize, Serialize};
 use tracing::{debug, info, warn};
 
+use super::backend::{FileBackend, StorageBackend};
 use super::cache::DocumentCache;
 use super::lock::FileLock;
-use super::persistence::{PersistedDocument, load_document, save_document};
+use super::persistence::{PersistedDocument, load_document_from_bytes, save_document_to_bytes};
 use crate::error::Result;
 use crate::Error;
 
-const META_FILE: &str = "_meta.json";
+const META_KEY: &str = "_meta";
 const LOCK_FILE: &str = ".workspace.lock";
 const DEFAULT_CACHE_SIZE: usize = 100;
 
@@ -72,28 +71,31 @@ pub struct DocumentMetaEntry {
 /// A workspace for managing indexed documents.
 ///
 /// Uses LRU cache for loaded documents to balance memory usage
-/// and access performance. The cache uses interior mutability,
-/// so read operations only require `&self`.
+/// and access performance.
+///
+/// # Thread Safety
+///
+/// The workspace is thread-safe when used with a thread-safe backend.
+/// Read operations only require `&self`.
 #[derive(Debug)]
 pub struct Workspace {
-    /// Root directory for the workspace.
-    root: PathBuf,
-
+    /// Storage backend.
+    backend: Arc<dyn StorageBackend>,
+    /// Root path (for file-based backends, used for locking).
+    root: Option<PathBuf>,
     /// Document metadata index (id -> meta).
     /// This is always loaded in memory.
     meta_index: HashMap<String, DocumentMetaEntry>,
-
     /// LRU cache for loaded documents.
     cache: DocumentCache,
-
-    /// File lock for multi-process safety.
+    /// File lock for multi-process safety (file backends only).
     _lock: Option<FileLock>,
 }
 
 /// Options for workspace creation.
 #[derive(Debug, Clone)]
 pub struct WorkspaceOptions {
-    /// Enable file locking (default: true).
+    /// Enable file locking (default: true, only for file backends).
     pub file_lock: bool,
     /// LRU cache size (default: 100).
     pub cache_size: usize,
@@ -108,8 +110,58 @@ impl Default for WorkspaceOptions {
     }
 }
 
+impl WorkspaceOptions {
+    /// Create new options with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the cache size.
+    pub fn with_cache_size(mut self, size: usize) -> Self {
+        self.cache_size = size;
+        self
+    }
+
+    /// Enable or disable file locking.
+    pub fn with_file_lock(mut self, enabled: bool) -> Self {
+        self.file_lock = enabled;
+        self
+    }
+}
+
 impl Workspace {
-    /// Create a new workspace at the given path with default cache size.
+    /// Create a new workspace with a storage backend.
+    ///
+    /// # Example
+    ///
+    /// ```rust,ignore
+    /// let backend = Arc::new(FileBackend::new("./workspace")?);
+    /// let workspace = Workspace::with_backend(backend)?;
+    /// ```
+    pub fn with_backend(backend: Arc<dyn StorageBackend>) -> Result<Self> {
+        Self::with_backend_and_options(backend, WorkspaceOptions::default())
+    }
+
+    /// Create a workspace with backend and options.
+    pub fn with_backend_and_options(
+        backend: Arc<dyn StorageBackend>,
+        options: WorkspaceOptions,
+    ) -> Result<Self> {
+        let mut workspace = Self {
+            backend,
+            root: None,
+            meta_index: HashMap::new(),
+            cache: DocumentCache::with_capacity(options.cache_size),
+            _lock: None,
+        };
+
+        workspace.load_meta_index()?;
+        Ok(workspace)
+    }
+
+    /// Create a new file-based workspace at the given path.
+    ///
+    /// This is a convenience method that creates a `FileBackend` internally.
     pub fn new(path: impl Into<PathBuf>) -> Result<Self> {
         Self::with_options(path, WorkspaceOptions::default())
     }
@@ -125,7 +177,6 @@ impl Workspace {
     /// Create a new workspace with custom options.
     pub fn with_options(path: impl Into<PathBuf>, options: WorkspaceOptions) -> Result<Self> {
         let root = path.into();
-        fs::create_dir_all(&root).map_err(Error::Io)?;
 
         // Acquire file lock if enabled
         let lock = if options.file_lock {
@@ -135,8 +186,11 @@
             None
         };
 
+        let backend = Arc::new(FileBackend::new(&root)?);
+
         let mut workspace = Self {
-            root,
+            backend,
+            root: Some(root),
             meta_index: HashMap::new(),
             cache: DocumentCache::with_capacity(options.cache_size),
             _lock: lock,
@@ -168,31 +222,37 @@ impl Workspace {
         options: WorkspaceOptions,
     ) -> Result<Self> {
         let root = path.clone().into();
-        if root.exists() {
-            // Acquire file lock if enabled
-            let lock = if options.file_lock {
-                let lock_path = root.join(LOCK_FILE);
-                Some(FileLock::try_lock(&lock_path, true)?)
-            } else {
-                None
-            };
-
-            let mut workspace = Self {
-                root,
-                meta_index: HashMap::new(),
-                cache: DocumentCache::with_capacity(options.cache_size),
-                _lock: lock,
-            };
-            workspace.load_meta_index()?;
-            Ok(workspace)
+
+        // Acquire file lock if enabled
+        let lock = if options.file_lock && root.exists() {
+            let lock_path = root.join(LOCK_FILE);
+            Some(FileLock::try_lock(&lock_path, true)?)
         } else {
-            Self::with_options(path, options)
-        }
+            None
+        };
+
+        let backend = Arc::new(FileBackend::new(&root)?);
+
+        let mut workspace = Self {
+            backend,
+            root: Some(root),
+            meta_index: HashMap::new(),
+            cache: DocumentCache::with_capacity(options.cache_size),
+            _lock: lock,
+        };
+
+        workspace.load_meta_index()?;
+        Ok(workspace)
     }
 
-    /// Get the workspace root path.
-    pub fn path(&self) -> &Path {
-        &self.root
+    /// Get the workspace root path (if file-based).
+    pub fn path(&self) -> Option<&Path> {
+        self.root.as_deref()
+    }
+
+    /// Get the storage backend.
+    pub fn backend(&self) -> &dyn StorageBackend {
+        self.backend.as_ref()
     }
 
     /// List all document IDs in the workspace.
@@ -211,17 +271,15 @@
     }
 
     /// Add a document to the workspace.
-    ///
-    /// This saves the full document to disk and updates the meta index.
-    /// The document is NOT cached (lazy loading on first access).
     pub fn add(&mut self, doc: &PersistedDocument) -> Result<()> {
         let doc_id = doc.meta.id.clone();
-        let doc_path = self.document_path(&doc_id);
+        let key = self.doc_key(&doc_id);
 
-        // Save full document to disk
-        save_document(&doc_path, doc)?;
+        // Serialize and save via backend
+        let bytes = save_document_to_bytes(doc)?;
+        self.backend.put(&key, &bytes)?;
 
-        // Update meta index (lightweight)
+        // Update meta index
         let meta_entry = DocumentMetaEntry {
             id: doc_id.clone(),
             doc_name: doc.meta.name.clone(),
@@ -239,7 +297,7 @@
         self.meta_index.insert(doc_id.clone(), meta_entry);
         self.save_meta_index()?;
 
-        // Remove from cache if present (will lazy load on next access)
+        // Remove from cache if present
         let _ = self.cache.remove(&doc_id);
 
         info!("Saved document {} to workspace", doc_id);
@@ -249,9 +307,7 @@
     /// Load a document from the workspace.
     ///
     /// Uses LRU cache: returns cached version if available,
-    /// otherwise loads from disk and caches it.
-    ///
-    /// This method only requires `&self` (interior mutability for cache).
+    /// otherwise loads from backend and caches it.
     pub fn load(&self, id: &str) -> Result<Option<PersistedDocument>> {
         if !self.contains(id) {
             return Ok(None);
@@ -263,20 +319,23 @@
             return Ok(Some(cached));
         }
 
-        // Load from disk
-        let doc_path = self.document_path(id);
-        if !doc_path.exists() {
-            warn!("Document {} in meta index but file missing", id);
-            return Ok(None);
+        // Load from backend
+        let key = self.doc_key(id);
+        match self.backend.get(&key)? {
+            Some(bytes) => {
+                let doc = load_document_from_bytes(&bytes)?;
+
+                // Add to LRU cache
+                self.cache.put(id.to_string(), doc.clone())?;
+
+                debug!("Loaded document {} from backend (cached)", id);
+                Ok(Some(doc))
+            }
+            None => {
+                warn!("Document {} in meta index but not in backend", id);
+                Ok(None)
+            }
         }
-
-        let doc = load_document(&doc_path)?;
-
-        // Add to LRU cache
-        self.cache.put(id.to_string(), doc.clone())?;
-
-        debug!("Loaded document {} from disk (cached)", id);
-        Ok(Some(doc))
     }
 
     /// Remove a document from the workspace.
@@ -285,10 +344,8 @@
             return Ok(false);
         }
 
-        let doc_path = self.document_path(id);
-        if doc_path.exists() {
-            fs::remove_file(&doc_path).map_err(Error::Io)?;
-        }
+        let key = self.doc_key(id);
+        self.backend.delete(&key)?;
 
         self.meta_index.remove(id);
 
@@ -321,6 +378,11 @@
         self.cache.utilization()
     }
 
+    /// Get cache statistics.
+    pub fn cache_stats(&self) -> super::cache::CacheStats {
+        self.cache.stats()
+    }
+
     /// Clear the LRU cache (does not remove documents from workspace).
     pub fn clear_cache(&self) -> Result<()> {
         self.cache.clear()?;
@@ -328,69 +390,50 @@
         Ok(())
     }
 
-    /// Get the path for a document file.
-    fn document_path(&self, id: &str) -> PathBuf {
-        self.root.join(format!("{}.json", id))
-    }
-
-    /// Get the path for the meta index file.
-    fn meta_path(&self) -> PathBuf {
-        self.root.join(META_FILE)
+    /// Get the storage key for a document.
+    fn doc_key(&self, id: &str) -> String {
+        format!("doc:{}", id)
    }
 
-    /// Load the meta index from disk.
+    /// Load the meta index from backend.
     fn load_meta_index(&mut self) -> Result<()> {
-        let meta_path = self.meta_path();
-
-        if !meta_path.exists() {
-            // Try to rebuild from existing files
-            self.rebuild_meta_index()?;
-            return Ok(());
+        match self.backend.get(META_KEY)? {
+            Some(bytes) => {
+                let meta: HashMap<String, DocumentMetaEntry> = serde_json::from_slice(&bytes)
+                    .map_err(|e| Error::Parse(format!("Failed to parse meta index: {}", e)))?;
+                self.meta_index = meta;
+                info!(
+                    "Loaded {} document(s) from workspace index",
+                    self.meta_index.len()
+                );
+            }
+            None => {
+                // Try to rebuild from existing keys
+                self.rebuild_meta_index()?;
+            }
         }
-
-        let content = fs::read_to_string(&meta_path).map_err(Error::Io)?;
-
-        let meta: HashMap<String, DocumentMetaEntry> = serde_json::from_str(&content)
-            .map_err(|e| Error::Parse(format!("Failed to parse meta index: {}", e)))?;
-
-        self.meta_index = meta;
-        info!(
-            "Loaded {} document(s) from workspace index",
-            self.meta_index.len()
-        );
         Ok(())
     }
 
-    /// Save the meta index to disk.
+    /// Save the meta index to backend.
     fn save_meta_index(&self) -> Result<()> {
-        let content = serde_json::to_string_pretty(&self.meta_index)
+        let bytes = serde_json::to_vec_pretty(&self.meta_index)
             .map_err(|e| Error::Parse(format!("Failed to serialize meta index: {}", e)))?;
-
-        fs::write(self.meta_path(), content).map_err(Error::Io)?;
-
+        self.backend.put(META_KEY, &bytes)?;
         Ok(())
     }
 
-    /// Rebuild the meta index from existing document files.
+    /// Rebuild the meta index from existing documents.
     fn rebuild_meta_index(&mut self) -> Result<()> {
-        let entries: Vec<_> = fs::read_dir(&self.root)
-            .map_err(Error::Io)?
-            .filter_map(|entry| entry.ok())
-            .filter(|entry| {
-                entry
-                    .path()
-                    .extension()
-                    .map(|ext| ext == "json")
-                    .unwrap_or(false)
-            })
-            .filter_map(|entry| {
-                let path = entry.path();
-                // Skip the meta file itself
-                if path.file_stem()?.to_str()? == "_meta" {
-                    return None;
-                }
-                // Try to load the document and extract metadata
-                load_document(&path).ok().map(|doc| {
+        let keys = self.backend.keys()?;
+        let doc_keys: Vec<_> = keys
+            .iter()
+            .filter(|k| k.starts_with("doc:"))
+            .collect();
+
+        for key in doc_keys {
+            if let Some(bytes) = self.backend.get(key)? {
+                if let Ok(doc) = load_document_from_bytes(&bytes) {
                     let doc_id = doc.meta.id.clone();
                     let meta_entry = DocumentMetaEntry {
                         id: doc_id.clone(),
@@ -405,19 +448,15 @@
                         page_count: if doc.pages.is_empty() { None } else { Some(doc.pages.len()) },
                         line_count: doc.meta.line_count,
                     };
-                    (doc_id, meta_entry)
-                })
-            })
-            .collect();
-
-        for (id, entry) in entries {
-            self.meta_index.insert(id, entry);
+                    self.meta_index.insert(doc_id, meta_entry);
+                }
+            }
         }
 
         if !self.meta_index.is_empty() {
             self.save_meta_index()?;
             info!(
-                "Rebuilt index from {} document file(s)",
+                "Rebuilt index from {} document(s)",
                 self.meta_index.len()
             );
         }
@@ -440,22 +479,40 @@ mod tests {
         assert_eq!(workspace.len(), 0);
     }
 
+    #[test]
+    fn test_workspace_with_memory_backend() {
+        let backend = Arc::new(super::super::backend::MemoryBackend::new());
+        let mut workspace = Workspace::with_backend(backend).unwrap();
+
+        assert!(workspace.is_empty());
+
+        // Add a document
+        let meta = super::super::persistence::DocumentMeta::new("doc-1", "Test", "md");
+        let tree = crate::document::DocumentTree::new("Root", "Content");
+        let doc = PersistedDocument::new(meta, tree);
+
+        workspace.add(&doc).unwrap();
+        assert_eq!(workspace.len(), 1);
+
+        // Load it back
+        let loaded = workspace.load("doc-1").unwrap();
+        assert!(loaded.is_some());
+        assert_eq!(loaded.unwrap().meta.id, "doc-1");
+    }
+
     #[test]
     fn test_workspace_open() {
         let temp = TempDir::new().unwrap();
         let path = temp.path().join("workspace");
 
-        // Use options without file lock to allow reopening
         let options = WorkspaceOptions {
             file_lock: false,
             ..Default::default()
         };
 
-        // Create new
         let workspace = Workspace::open_with_options(&path, options.clone()).unwrap();
         assert!(workspace.is_empty());
 
-        // Reopen existing (need to drop first workspace to release lock)
         drop(workspace);
         let workspace2 = Workspace::open_with_options(&path, options).unwrap();
         assert!(workspace2.is_empty());
@@ -472,4 +529,25 @@ mod tests {
         workspace.clear_cache().unwrap();
         assert_eq!(workspace.cache_len(), 0);
     }
+
+    #[test]
+    fn test_workspace_cache_stats() {
+        let backend = Arc::new(super::super::backend::MemoryBackend::new());
+        let mut workspace = Workspace::with_backend(backend).unwrap();
+
+        let meta = super::super::persistence::DocumentMeta::new("doc-1", "Test", "md");
+        let tree = crate::document::DocumentTree::new("Root", "Content");
+        let doc = PersistedDocument::new(meta, tree);
+        workspace.add(&doc).unwrap();
+
+        // First load - cache miss
+        let _ = workspace.load("doc-1").unwrap();
+        let stats = workspace.cache_stats();
+        assert_eq!(stats.misses, 1);
+
+        // Second load - cache hit
+        let _ = workspace.load("doc-1").unwrap();
+        let stats = workspace.cache_stats();
+        assert_eq!(stats.hits, 1);
+    }
 }
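The builder-style `WorkspaceOptions` added above is not exercised by the tests. A small sketch (not part of the patch) of combining it with a non-file backend; note that `with_backend_and_options` never takes a file lock, so the `file_lock` flag only matters for the path-based constructors:

```rust
use std::sync::Arc;

use vectorless::storage::{MemoryBackend, Workspace, WorkspaceOptions};

fn open_test_workspace() -> vectorless::Result<Workspace> {
    // Small cache and no locking: suitable for tests and ephemeral data.
    let options = WorkspaceOptions::new()
        .with_cache_size(16)
        .with_file_lock(false);
    Workspace::with_backend_and_options(Arc::new(MemoryBackend::new()), options)
}
```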
From 59425f6c18d28fc743e1c055577bff85eef8de78 Mon Sep 17 00:00:00 2001
From: zTgx <747674262@qq.com>
Date: Sun, 5 Apr 2026 13:28:16 +0800
Subject: [PATCH 5/8] refactor(storage): remove unnecessary mut bindings in
 async workspace

Remove unused `mut` keywords from inner variable bindings in
AsyncWorkspace methods, since the bindings are never mutated.

feat(storage): add unsafe_code allow attribute to lock module

Add an `unsafe_code` allow attribute to the lock module, with a comment
clarifying that the file locking implementation requires unsafe FFI
calls.

---
 src/storage/async_workspace.rs | 4 ++--
 src/storage/lock.rs            | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/storage/async_workspace.rs b/src/storage/async_workspace.rs
index 6fb4f49e..56c43373 100644
--- a/src/storage/async_workspace.rs
+++ b/src/storage/async_workspace.rs
@@ -301,7 +301,7 @@ impl AsyncWorkspace {
         }
 
         // Load from backend and cache (write lock)
-        let mut inner = self.inner.write().await;
+        let inner = self.inner.write().await;
         let key = Self::doc_key(id);
 
         match inner.backend.get(&key)? {
@@ -375,7 +375,7 @@ impl AsyncWorkspace {
 
     /// Clear the LRU cache.
     pub async fn clear_cache(&self) -> Result<()> {
-        let mut inner = self.inner.write().await;
+        let inner = self.inner.write().await;
         inner.cache.clear()?;
         debug!("Cleared async document cache");
         Ok(())
diff --git a/src/storage/lock.rs b/src/storage/lock.rs
index 57931691..66a65d46 100644
--- a/src/storage/lock.rs
+++ b/src/storage/lock.rs
@@ -6,6 +6,9 @@
 //! Provides cross-process file locking to prevent data corruption
 //! when multiple processes access the same workspace.
 
+// File locking inherently requires unsafe FFI calls
+#![allow(unsafe_code)]
+
 use std::fs::{File, OpenOptions};
 use std::path::Path;
 

From 6c597626e841ba591f5bddc6852bfe44fe8c04d9 Mon Sep 17 00:00:00 2001
From: zTgx <747674262@qq.com>
Date: Sun, 5 Apr 2026 13:44:03 +0800
Subject: [PATCH 6/8] feat(storage): add comprehensive storage examples

- Add async workspace usage example demonstrating concurrent document
  access and async LRU cache operations
- Add custom storage backend example showing how to implement the
  StorageBackend trait, with a logging memory backend implementation
- Add compression example demonstrating GzipCodec and IdentityCodec for
  compressed and uncompressed storage operations
- Add version migration example showcasing the migration system for
  upgrading data formats between versions, with multi-step migration
  support
- Add basic workspace usage example covering the core storage API,
  including document creation, loading with LRU cache, listing, and
  removal operations

These examples provide practical demonstrations of the storage module
capabilities and serve as reference implementations for common use
cases.
---
 examples/storage_async.rs       |  95 ++++++++++++++++++++++
 examples/storage_backend.rs     | 130 ++++++++++++++++++++++++++++++
 examples/storage_compression.rs |  95 ++++++++++++++++++++++
 examples/storage_migration.rs   | 138 ++++++++++++++++++++++++++++++++
 examples/storage_workspace.rs   |  99 +++++++++++++++++++++++
 5 files changed, 557 insertions(+)
 create mode 100644 examples/storage_async.rs
 create mode 100644 examples/storage_backend.rs
 create mode 100644 examples/storage_compression.rs
 create mode 100644 examples/storage_migration.rs
 create mode 100644 examples/storage_workspace.rs

diff --git a/examples/storage_async.rs b/examples/storage_async.rs
new file mode 100644
index 00000000..f7ecfaec
--- /dev/null
+++ b/examples/storage_async.rs
@@ -0,0 +1,95 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Async workspace usage example.
+//!
+//! This example demonstrates async workspace operations:
+//! - Creating an async workspace
+//! - Concurrent document access
+//! - Async LRU cache
+//!
+//! # Usage
+//!
+//! ```bash
+//! cargo run --example storage_async
+//! ```
+
+use std::sync::Arc;
+
+use vectorless::document::DocumentTree;
+use vectorless::storage::{AsyncWorkspace, DocumentMeta, PersistedDocument};
+
+fn create_doc(id: &str, name: &str) -> PersistedDocument {
+    let meta = DocumentMeta::new(id, name, "md");
+    let content = format!("Content for {}", name);
+    let tree = DocumentTree::new("Root", &content);
+    PersistedDocument::new(meta, tree)
+}
+
+#[tokio::main]
+async fn main() -> vectorless::Result<()> {
+    println!("=== Async Workspace Example ===\n");
+
+    let workspace_path = "./example_async_workspace";
+
+    // 1. Create async workspace
+    println!("1. Creating async workspace...");
+    let workspace = AsyncWorkspace::new(workspace_path).await?;
+    println!("   ✓ Created\n");
+
+    // 2. Add documents
+    println!("2. Adding documents...");
+    workspace.add(&create_doc("doc-1", "Document One")).await?;
+    workspace.add(&create_doc("doc-2", "Document Two")).await?;
+    workspace.add(&create_doc("doc-3", "Document Three")).await?;
+    println!("   ✓ Added 3 documents\n");
+
+    // 3. Concurrent access example
+    println!("3. Concurrent access from multiple tasks...");
+    let ws = Arc::new(workspace);
+
+    let mut handles = vec![];
+
+    // Spawn concurrent read tasks
+    for i in 1..=3 {
+        let ws_clone = ws.clone();
+        let handle = tokio::spawn(async move {
+            let id = format!("doc-{}", i);
+            let doc = ws_clone.load(&id).await.unwrap().unwrap();
+            println!("   [Task {}] Loaded: {}", i, doc.meta.name);
+        });
+        handles.push(handle);
+    }
+
+    // Wait for all tasks
+    for handle in handles {
+        handle.await.unwrap();
+    }
+    println!("   ✓ All concurrent loads completed\n");
+
+    // 4. Cache stats
+    println!("4. Cache statistics:");
+    let stats = ws.cache_stats().await;
+    println!("   - Hits: {}", stats.hits);
+    println!("   - Misses: {}", stats.misses);
+    println!();
+
+    // 5. Clone and share
+    println!("5. Workspace can be cloned cheaply (Arc internally)...");
+    let ws2 = ws.clone();
+    let ws3 = ws.clone();
+
+    let len1 = ws.len().await;
+    let len2 = ws2.len().await;
+    let len3 = ws3.len().await;
+
+    println!("   ws1.len() = {}, ws2.len() = {}, ws3.len() = {}", len1, len2, len3);
+    println!("   ✓ All clones share the same state\n");
+
+    // Cleanup
+    println!("Cleaning up...");
+    std::fs::remove_dir_all(workspace_path).ok();
+    println!("   ✓ Done!");
+
+    Ok(())
+}
diff --git a/examples/storage_backend.rs b/examples/storage_backend.rs
new file mode 100644
index 00000000..3b9a5fd9
--- /dev/null
+++ b/examples/storage_backend.rs
@@ -0,0 +1,130 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Custom storage backend example.
+//!
+//! This example shows how to implement a custom StorageBackend.
+//! Useful for integrating with databases, cloud storage, etc.
+//!
+//! # Usage
+//!
+//! ```bash
+//! cargo run --example storage_backend
+//! ```
+
+use std::collections::HashMap;
+use std::sync::{Arc, RwLock};
+
+use vectorless::document::DocumentTree;
+use vectorless::storage::{DocumentMeta, PersistedDocument, StorageBackend, Workspace};
+use vectorless::Result;
+
+/// A simple in-memory backend with logging.
+///
+/// This demonstrates how to implement the StorageBackend trait.
+/// In production, you might implement S3, PostgreSQL, Redis, etc.
+#[derive(Debug)]
+struct LoggingMemoryBackend {
+    name: &'static str,
+    data: RwLock<HashMap<String, Vec<u8>>>,
+}
+
+impl LoggingMemoryBackend {
+    fn new(name: &'static str) -> Self {
+        Self {
+            name,
+            data: RwLock::new(HashMap::new()),
+        }
+    }
+}
+
+impl StorageBackend for LoggingMemoryBackend {
+    fn get(&self, key: &str) -> Result<Option<Vec<u8>>> {
+        let data = self.data.read().unwrap();
+        let result = data.get(key).cloned();
+        println!("   [{}] GET '{}' -> {}", self.name, key, if result.is_some() { "found" } else { "not found" });
+        Ok(result)
+    }
+
+    fn put(&self, key: &str, value: &[u8]) -> Result<()> {
+        let mut data = self.data.write().unwrap();
+        data.insert(key.to_string(), value.to_vec());
+        println!("   [{}] PUT '{}' ({} bytes)", self.name, key, value.len());
+        Ok(())
+    }
+
+    fn delete(&self, key: &str) -> Result<bool> {
+        let mut data = self.data.write().unwrap();
+        let existed = data.remove(key).is_some();
+        println!("   [{}] DELETE '{}' -> {}", self.name, key, existed);
+        Ok(existed)
+    }
+
+    fn exists(&self, key: &str) -> Result<bool> {
+        let data = self.data.read().unwrap();
+        Ok(data.contains_key(key))
+    }
+
+    fn keys(&self) -> Result<Vec<String>> {
+        let data = self.data.read().unwrap();
+        Ok(data.keys().cloned().collect())
+    }
+
+    fn len(&self) -> Result<usize> {
+        let data = self.data.read().unwrap();
+        Ok(data.len())
+    }
+
+    fn clear(&self) -> Result<()> {
+        let mut data = self.data.write().unwrap();
+        data.clear();
+        println!("   [{}] CLEAR", self.name);
+        Ok(())
+    }
+
+    fn backend_name(&self) -> &'static str {
+        self.name
+    }
+}
+
+fn main() -> vectorless::Result<()> {
+    println!("=== Custom Storage Backend Example ===\n");
+
+    // 1. Create custom backend
+    println!("1. Creating custom backend...");
+    let backend = Arc::new(LoggingMemoryBackend::new("MyCustomBackend"));
+    println!("   ✓ Backend: {}\n", backend.backend_name());
+
+    // 2. Create workspace with custom backend
+    println!("2. Creating workspace with custom backend...");
+    let mut workspace = Workspace::with_backend(backend)?;
+    println!("   ✓ Workspace created\n");
+
+    // 3. Add a document (watch the logging)
+    println!("3. Adding document (observe backend calls):");
+    let meta = DocumentMeta::new("custom-doc", "Custom Backend Test", "md");
+    let tree = DocumentTree::new("Root", "Testing custom backend!");
+    let doc = PersistedDocument::new(meta, tree);
+    workspace.add(&doc)?;
+    println!();
+
+    // 4. Load the document
+    println!("4. Loading document:");
+    let loaded = workspace.load("custom-doc")?.unwrap();
+    println!("   ✓ Loaded: {}\n", loaded.meta.name);
+
+    // 5. Show workspace stats
+    println!("5. Workspace stats:");
+    println!("   - Documents: {}", workspace.len());
+    println!("   - Cache size: {}", workspace.cache_len());
+    println!();
+
+    println!("✓ Custom backend example complete!");
+    println!("\nTip: Implement StorageBackend to integrate with:");
+    println!("  - S3 / GCS / Azure Blob");
+    println!("  - PostgreSQL / MySQL");
+    println!("  - Redis / Memcached");
+    println!("  - Any custom storage system");
+
+    Ok(())
+}
diff --git a/examples/storage_compression.rs b/examples/storage_compression.rs
new file mode 100644
index 00000000..303f582a
--- /dev/null
+++ b/examples/storage_compression.rs
@@ -0,0 +1,95 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Compression example.
+//!
+//! This example demonstrates compression support in storage:
+//! - GzipCodec for compressed storage
+//! - IdentityCodec for uncompressed storage
+//! - Codec trait for custom compression
+//!
+//! # Usage
+//!
+//! ```bash
+//! cargo run --example storage_compression
+//! ```

+use vectorless::storage::{GzipCodec, IdentityCodec, Codec};
+use vectorless::Result;
+
+fn main() -> Result<()> {
+    println!("=== Compression Example ===\n");
+
+    // Test data
+    let original = b"Lorem ipsum dolor sit amet, consectetur adipiscing elit. \
+        Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
+        Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris.";
+    println!("Original data ({} bytes):", original.len());
+    println!("  {:?}...\n", String::from_utf8_lossy(&original[..50]));
+
+    // 1. Identity codec (no compression)
+    println!("1. IdentityCodec (no compression):");
+    let identity = IdentityCodec::new();
+
+    let identity_encoded = identity.encode(original)?;
+    let identity_decoded = identity.decode(&identity_encoded)?;
+
+    println!("   Encoded size: {} bytes", identity_encoded.len());
+    println!("   Compression ratio: {:.1}%",
+        (identity_encoded.len() as f64 / original.len() as f64) * 100.0);
+    assert_eq!(original.to_vec(), identity_decoded);
+    println!("   ✓ Roundtrip verified\n");
+
+    // 2. Gzip codec with different levels
+    println!("2. GzipCodec with different compression levels:");
+
+    for level in [1, 6, 9] {
+        let gzip = GzipCodec::new(level);
+        let compressed = gzip.encode(original)?;
+
+        println!("   Level {}: {} bytes ({:.1}% of original)",
+            level,
+            compressed.len(),
+            (compressed.len() as f64 / original.len() as f64) * 100.0);
+    }
+    println!();
+
+    // 3. Gzip roundtrip
+    println!("3. Gzip roundtrip verification:");
+    let gzip = GzipCodec::new(6);
+
+    let encoded = gzip.encode(original)?;
+    let decoded = gzip.decode(&encoded)?;
+
+    assert_eq!(original.to_vec(), decoded);
+    println!("   ✓ Encoded {} bytes -> {} bytes",
+        original.len(), encoded.len());
+    println!("   ✓ Decoded back to {} bytes", decoded.len());
+    println!("   ✓ Data integrity verified\n");
+
+    // 4. Empty data handling
+    println!("4. Edge cases:");
+    let empty: &[u8] = &[];
+
+    let empty_encoded = gzip.encode(empty)?;
+    let empty_decoded = gzip.decode(&empty_encoded)?;
+    assert!(empty_decoded.is_empty());
+    println!("   ✓ Empty data handled correctly\n");
+
+    // 5. Comparison
+    println!("5. Summary:");
+    println!("   Original:    {} bytes", original.len());
+    println!("   Identity:    {} bytes (100.0%)", identity_encoded.len());
+    println!("   Gzip (lvl6): {} bytes ({:.1}%)",
+        encoded.len(),
+        (encoded.len() as f64 / original.len() as f64) * 100.0);
+    println!();
+
+    println!("✓ Compression example complete!");
+    println!("\nUsage tips:");
+    println!("  - Use GzipCodec for large text documents");
+    println!("  - Use IdentityCodec for already-compressed data (PDF, images)");
+    println!("  - Level 6 is a good default (balance of speed vs ratio)");
+
+    Ok(())
+}
diff --git a/examples/storage_migration.rs b/examples/storage_migration.rs
new file mode 100644
index 00000000..5874046c
--- /dev/null
+++ b/examples/storage_migration.rs
@@ -0,0 +1,138 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Version migration example.
+//!
+//! This example demonstrates how to use the migration system
+//! for upgrading data formats between versions.
+//!
+//! # Usage
+//!
+//! ```bash
+//! cargo run --example storage_migration
+//! ```
+
+use vectorless::storage::{Migration, MigrationContext, Migrator};
+use vectorless::{Error, Result};
+
+/// Example migration from v1 to v2.
+///
+/// Imagine v1 stored data as plain text,
+/// and v2 adds a header prefix.
+#[derive(Debug)]
+struct V1ToV2;
+
+impl Migration for V1ToV2 {
+    fn from_version(&self) -> u32 {
+        1
+    }
+
+    fn to_version(&self) -> u32 {
+        2
+    }
+
+    fn description(&self) -> &str {
+        "Add version header to data format"
+    }
+
+    fn migrate(&self, data: &[u8], _ctx: &MigrationContext) -> Result<Vec<u8>> {
+        // Add a simple header: "V2:" prefix
+        let mut result = b"V2:".to_vec();
+        result.extend_from_slice(data);
+        Ok(result)
+    }
+}
+
+/// Example migration from v2 to v3.
+///
+/// V3 adds compression (simulated with base64-like encoding).
+#[derive(Debug)]
+struct V2ToV3;
+
+impl Migration for V2ToV3 {
+    fn from_version(&self) -> u32 {
+        2
+    }
+
+    fn to_version(&self) -> u32 {
+        3
+    }
+
+    fn description(&self) -> &str {
+        "Add compression to data format"
+    }
+
+    fn migrate(&self, data: &[u8], _ctx: &MigrationContext) -> Result<Vec<u8>> {
+        // Simulate compression by adding prefix
+        let mut result = b"V3:COMPRESSED:".to_vec();
+        result.extend_from_slice(data);
+        Ok(result)
+    }
+}
+
+fn main() -> vectorless::Result<()> {
+    println!("=== Version Migration Example ===\n");
+
+    // 1. Create migrator
+    println!("1. Creating migrator and registering migrations...");
+    let mut migrator = Migrator::new();
+    migrator.register(Box::new(V1ToV2));
+    migrator.register(Box::new(V2ToV3));
+
+    println!("   Registered migrations:");
+    for (from, to, desc) in migrator.list_migrations() {
+        println!("   - v{} -> v{}: {}", from, to, desc);
+    }
+    println!();
+
+    // 2. Check migration paths
+    println!("2. Checking migration paths:");
+    println!("   Can migrate v1 -> v2: {}", migrator.can_migrate(1, 2));
+    println!("   Can migrate v1 -> v3: {}", migrator.can_migrate(1, 3));
+    println!("   Can migrate v2 -> v3: {}", migrator.can_migrate(2, 3));
+    println!("   Can migrate v1 -> v4: {}", migrator.can_migrate(1, 4));
+    println!();
+
+    // 3. Migrate from v1 to v3 (multi-step)
+    println!("3. Migrating data from v1 to v3 (via v2):");
+    let original_data = b"Hello, World!";
+    println!("   Original (v1): {:?}", String::from_utf8_lossy(original_data));
+
+    let migrated = migrator.migrate(original_data, 1, 3)?;
+    println!("   Migrated (v3): {:?}", String::from_utf8_lossy(&migrated));
+    println!();
+
+    // 4. Direct migration
+    println!("4. Direct migration v2 -> v3:");
+    let v2_data = b"V2:Some data";
+    let v3_data = migrator.migrate(v2_data, 2, 3)?;
+    println!("   V2: {:?}", String::from_utf8_lossy(v2_data));
+    println!("   V3: {:?}", String::from_utf8_lossy(&v3_data));
+    println!();
+
+    // 5. No migration needed
+    println!("5. Same version (no migration):");
+    let data = b"Already v3";
+    let result = migrator.migrate(data, 3, 3)?;
+    assert_eq!(data.to_vec(), result);
+    println!("   ✓ Data unchanged when from == to");
+    println!();
+
+    // 6. Error case: no path
+    println!("6. Error handling (no migration path):");
+    match migrator.migrate(b"test", 1, 99) {
+        Err(Error::VersionMismatch(msg)) => {
+            println!("   Expected error: {}", msg);
+        }
+        _ => unreachable!(),
+    }
+    println!();
+
+    println!("✓ Migration example complete!");
+    println!("\nKey points:");
+    println!("  - Migrations are registered as v(N) -> v(N+1)");
+    println!("  - Migrator finds paths automatically (BFS)");
+    println!("  - Multi-step migrations are handled transparently");
+
+    Ok(())
+}
diff --git a/examples/storage_workspace.rs b/examples/storage_workspace.rs
new file mode 100644
index 00000000..9f93310c
--- /dev/null
+++ b/examples/storage_workspace.rs
@@ -0,0 +1,99 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Basic workspace usage example.
+//!
+//! This example demonstrates the core storage API:
+//! - Creating a workspace
+//! - Adding documents
+//! - Loading documents with LRU cache
+//! - Listing and removing documents
+//!
+//! # Usage
+//!
+//! ```bash
+//! cargo run --example storage_workspace
+//! ```
+
+use vectorless::document::DocumentTree;
+use vectorless::storage::{DocumentMeta, PersistedDocument, Workspace};
+
+fn main() -> vectorless::Result<()> {
+    println!("=== Storage Workspace Example ===\n");
+
+    // Create a temporary workspace
+    let workspace_path = "./example_workspace";
+
+    // 1. Create a workspace with custom cache size
+    println!("1. Creating workspace at '{}'...", workspace_path);
+    let mut workspace = Workspace::with_cache_size(workspace_path, 100)?;
+    println!("   ✓ Workspace created\n");
+
+    // 2. Create a document
+    println!("2. Creating a document...");
+    let meta = DocumentMeta::new("doc-001", "Getting Started Guide", "md")
+        .with_description("An introduction to the workspace API")
+        .with_source_path("./docs/getting-started.md");
+
+    let tree = DocumentTree::new("Introduction", "Welcome to Vectorless storage module!");
+
+    let doc = PersistedDocument::new(meta, tree);
+    println!("   ✓ Document created: {}\n", doc.meta.id);
+
+    // 3. Add document to workspace
+    println!("3. Adding document to workspace...");
+    workspace.add(&doc)?;
+    println!("   ✓ Document saved\n");
+
+    // 4. List all documents
+    println!("4. Listing documents:");
+    for id in workspace.list_documents() {
+        if let Some(meta) = workspace.get_meta(id) {
+            println!("   - {} ({})", meta.doc_name, meta.id);
+            if let Some(ref desc) = meta.doc_description {
+                println!("     Description: {}", desc);
+            }
+        }
+    }
+    println!();
+
+    // 5. Load document (uses LRU cache)
diff --git a/examples/storage_workspace.rs b/examples/storage_workspace.rs
new file mode 100644
index 00000000..9f93310c
--- /dev/null
+++ b/examples/storage_workspace.rs
@@ -0,0 +1,99 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Basic workspace usage example.
+//!
+//! This example demonstrates the core storage API:
+//! - Creating a workspace
+//! - Adding documents
+//! - Loading documents with LRU cache
+//! - Listing and removing documents
+//!
+//! # Usage
+//!
+//! ```bash
+//! cargo run --example storage_workspace
+//! ```
+
+use vectorless::document::DocumentTree;
+use vectorless::storage::{DocumentMeta, PersistedDocument, Workspace};
+
+fn main() -> vectorless::Result<()> {
+    println!("=== Storage Workspace Example ===\n");
+
+    // Create a temporary workspace
+    let workspace_path = "./example_workspace";
+
+    // 1. Create a workspace with custom cache size
+    println!("1. Creating workspace at '{}'...", workspace_path);
+    let mut workspace = Workspace::with_cache_size(workspace_path, 100)?;
+    println!(" ✓ Workspace created\n");
+
+    // 2. Create a document
+    println!("2. Creating a document...");
+    let meta = DocumentMeta::new("doc-001", "Getting Started Guide", "md")
+        .with_description("An introduction to the workspace API")
+        .with_source_path("./docs/getting-started.md");
+
+    let tree = DocumentTree::new("Introduction", "Welcome to the Vectorless storage module!");
+
+    let doc = PersistedDocument::new(meta, tree);
+    println!(" ✓ Document created: {}\n", doc.meta.id);
+
+    // 3. Add document to workspace
+    println!("3. Adding document to workspace...");
+    workspace.add(&doc)?;
+    println!(" ✓ Document saved\n");
+
+    // 4. List all documents
+    println!("4. Listing documents:");
+    for id in workspace.list_documents() {
+        if let Some(meta) = workspace.get_meta(id) {
+            println!(" - {} ({})", meta.doc_name, meta.id);
+            if let Some(ref desc) = meta.doc_description {
+                println!(" Description: {}", desc);
+            }
+        }
+    }
+    println!();
+
+    // 5. Load document (uses LRU cache)
+    println!("5. Loading document...");
+    let loaded = workspace.load("doc-001")?.expect("Document should exist");
+    println!(" ✓ Loaded: {}", loaded.meta.doc_name);
+    let root = loaded.tree.root();
+    if let Some(node) = loaded.tree.get(root) {
+        println!(" Root node title: {}", node.title);
+    }
+    println!();
+
+    // 6. Cache statistics
+    println!("6. Cache statistics:");
+    let stats = workspace.cache_stats();
+    println!(" - Hits: {}", stats.hits);
+    println!(" - Misses: {}", stats.misses);
+    println!(" - Evictions: {}", stats.evictions);
+    println!(" - Utilization: {:.1}%", workspace.cache_utilization() * 100.0);
+    println!();
+
+    // 7. Load again (should hit cache)
+    println!("7. Loading document again (should hit cache)...");
+    let _ = workspace.load("doc-001")?;
+    let stats = workspace.cache_stats();
+    println!(" ✓ Cache hits: {}", stats.hits);
+    println!();
+
+    // 8. Remove document
+    println!("8. Removing document...");
+    let removed = workspace.remove("doc-001")?;
+    println!(" ✓ Removed: {}", removed);
+    println!(" Workspace is empty: {}", workspace.is_empty());
+    println!();
+
+    // Cleanup
+    println!("Cleaning up...");
+    std::fs::remove_dir_all(workspace_path).ok();
+    println!(" ✓ Done!");
+
+    Ok(())
+}
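Steps 6 and 7 rely on standard LRU accounting: a load served from the cache counts as a hit, a load that has to touch disk counts as a miss, and inserting into a full cache evicts the least recently used entry. A toy sketch of that bookkeeping (illustrative only; `TinyLru` is not the `Workspace` cache from this patch):

```rust
// Toy LRU with the hit/miss/eviction counters the stats above report.
// Illustrative only; not the Workspace implementation from this patch.
use std::collections::{HashMap, VecDeque};

struct TinyLru {
    capacity: usize,
    map: HashMap<String, String>,
    order: VecDeque<String>, // front = least recently used
    hits: u64,
    misses: u64,
    evictions: u64,
}

impl TinyLru {
    fn new(capacity: usize) -> Self {
        Self { capacity, map: HashMap::new(), order: VecDeque::new(), hits: 0, misses: 0, evictions: 0 }
    }

    fn get(&mut self, key: &str) -> Option<&String> {
        if self.map.contains_key(key) {
            self.hits += 1;
            // Refresh recency: move the key to the back of the order queue.
            self.order.retain(|k| k != key);
            self.order.push_back(key.to_string());
            self.map.get(key)
        } else {
            self.misses += 1;
            None
        }
    }

    fn put(&mut self, key: &str, value: &str) {
        if !self.map.contains_key(key) && self.map.len() == self.capacity {
            // Evict the least recently used entry to make room.
            if let Some(oldest) = self.order.pop_front() {
                self.map.remove(&oldest);
                self.evictions += 1;
            }
        }
        self.order.retain(|k| k != key);
        self.order.push_back(key.to_string());
        self.map.insert(key.to_string(), value.to_string());
    }
}

fn main() {
    let mut cache = TinyLru::new(1);
    cache.put("doc-001", "tree bytes");
    assert!(cache.get("doc-001").is_some()); // hit, like step 7
    assert!(cache.get("doc-404").is_none()); // miss
    cache.put("doc-002", "more bytes");      // evicts doc-001 (capacity 1)
    assert_eq!((cache.hits, cache.misses, cache.evictions), (1, 1, 1));
}
```

On this model, `cache_utilization` would simply be entries held divided by capacity.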
From d4578a71958de0a0a012d5d548885267ad8d51e4 Mon Sep 17 00:00:00 2001
From: zTgx <747674262@qq.com>
Date: Sun, 5 Apr 2026 13:44:41 +0800
Subject: [PATCH 7/8] docs(README): remove redundant examples table

- Remove outdated examples table that listed all example files with descriptions
- Simplify examples section to reference examples directory directly
- Clean up formatting and reduce verbosity in documentation
---
 README.md | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 52ae5a05..a89e2f28 100644
--- a/README.md
+++ b/README.md
@@ -134,22 +134,8 @@ async fn main() -> vectorless::Result<()> {
 
 ## Examples
 
-See the [examples/](examples/) directory for complete working examples:
-
-| Example | Description |
-|---------|-------------|
-| [basic.rs](examples/basic.rs) | Minimal ~30 line example showing core API |
-| [index.rs](examples/index.rs) | Document indexing pipeline |
-| [retrieve.rs](examples/retrieve.rs) | Retrieval pipeline with options |
-| [events.rs](examples/events.rs) | Event-driven indexing with EventEmitter |
-| [session.rs](examples/session.rs) | Session management with statistics |
-| [batch_processing.rs](examples/batch_processing.rs) | Batch document processing |
-| [content_aggregation.rs](examples/content_aggregation.rs) | Content aggregation strategies |
-| [streaming.rs](examples/streaming.rs) | Streaming document processing |
-| [multi_format.rs](examples/multi_format.rs) | Multi-format document support |
-| [custom_pilot.rs](examples/custom_pilot.rs) | Custom pilot implementation |
-| [cli_tool.rs](examples/cli_tool.rs) | CLI application example |
-| [markdownflow.rs](examples/markdownflow.rs) | Markdown workflow example |
+See the [examples/](examples/) directory for complete working examples.
+
 
 ## Architecture

From 802d1409941daa52f3e649b6f0ae59d14243994b Mon Sep 17 00:00:00 2001
From: zTgx <747674262@qq.com>
Date: Sun, 5 Apr 2026 13:45:04 +0800
Subject: [PATCH 8/8] chore(release): bump version from 0.1.12 to 0.1.13

- Update package version in Cargo.toml from 0.1.12 to 0.1.13
---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 4bc94998..f7572654 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "vectorless"
-version = "0.1.12"
+version = "0.1.13"
 edition = "2024"
 authors = ["zTgx <747674262@qq.com>"]
 description = "Hierarchical, reasoning-native document intelligence engine"