diff --git a/Cargo.toml b/Cargo.toml
index 00d69362..01f983fb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "vectorless"
-version = "0.1.10"
+version = "0.1.11"
 edition = "2024"
 authors = ["zTgx "]
 description = "Hierarchical, reasoning-native document intelligence engine"
@@ -73,6 +73,7 @@ rand = "0.8"
 [dev-dependencies]
 tempfile = "3.10"
 tokio-test = "0.4"
+tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 
 [profile.release]
 opt-level = 3
diff --git a/README.md b/README.md
index 50b88351..d0649efb 100644
--- a/README.md
+++ b/README.md
@@ -11,10 +11,12 @@
-Ultra performant document intelligence engine for RAG, with core written in **Rust**. Zero vector database, zero embedding model — just LLM-powered tree navigation. Incremental indexing and multi-format support out-of-box.
+Ultra-performant document intelligence engine for RAG, with its core written in **Rust**. Zero vector database, zero embedding model — just LLM-powered tree navigation. Incremental indexing and multi-format support out of the box.
 
 ⭐ **Drop a star to help us grow!**
 
+**⚠️ Early Development**: This project is in active development. The API and features are likely to evolve, and breaking changes may occur.
+
 ## Why Vectorless?
 
@@ -109,14 +111,15 @@ cp templates/template.toml ./vectorless.toml
 Basic usage:
 
 ```rust
-use vectorless::client::{Engine, EngineBuilder};
+use vectorless::Engine;
 
 #[tokio::main]
-async fn main() -> vectorless::domain::Result<()> {
+async fn main() -> vectorless::Result<()> {
     // Create client
-    let client = EngineBuilder::new()
+    let client = Engine::builder()
         .with_workspace("./workspace")
-        .build()?;
+        .build()
+        .map_err(|e| vectorless::Error::Config(e.to_string()))?;
 
     // Index a document
     let doc_id = client.index("./document.md").await?;
@@ -133,6 +136,21 @@ async fn main() -> vectorless::domain::Result<()> {
 
 See the [examples/](examples/) directory for complete working examples:
 
+| Example | Description |
+|---------|-------------|
+| [basic.rs](examples/basic.rs) | Minimal ~30 line example showing core API |
+| [index.rs](examples/index.rs) | Document indexing pipeline |
+| [retrieve.rs](examples/retrieve.rs) | Retrieval pipeline with options |
+| [events.rs](examples/events.rs) | Event-driven indexing with EventEmitter |
+| [session.rs](examples/session.rs) | Session management with statistics |
+| [batch_processing.rs](examples/batch_processing.rs) | Batch document processing |
+| [content_aggregation.rs](examples/content_aggregation.rs) | Content aggregation strategies |
+| [streaming.rs](examples/streaming.rs) | Streaming document processing |
+| [multi_format.rs](examples/multi_format.rs) | Multi-format document support |
+| [custom_pilot.rs](examples/custom_pilot.rs) | Custom pilot implementation |
+| [cli_tool.rs](examples/cli_tool.rs) | CLI application example |
+| [markdownflow.rs](examples/markdownflow.rs) | Markdown workflow example |
+
 ## Architecture
 
 ### Pilot Architecture
@@ -141,7 +159,7 @@ See the [examples/](examples/) directory for complete working examples:
 
 ### System Overview
 
-![Architecture](docs/design/architecture-v2.svg)
+![Architecture](docs/design/architecture.svg)
 
 ## Contributing
 
diff --git a/docs/design/architecture-v2.svg b/docs/design/architecture.svg
similarity index 100%
rename from docs/design/architecture-v2.svg
rename to docs/design/architecture.svg
diff --git a/docs/design/client-module.md b/docs/design/client-module.md
new file mode 100644
index 00000000..e4ab796b
--- /dev/null
+++ b/docs/design/client-module.md
@@ -0,0
+1,794 @@
+# Client Module Refactoring Design
+
+## Overview
+
+This document describes the refactoring of the `client` module to achieve a more professional, product-level architecture with clear separation of concerns.
+
+## Current Problems
+
+### 1. God Object Anti-pattern
+`engine.rs` (600+ lines) handles too many responsibilities:
+- Document indexing
+- Document retrieval
+- Workspace management
+- Configuration management
+- Format detection
+- Page parsing
+
+### 2. Mixed Abstraction Levels
+High-level operations (`query()`) mixed with low-level utilities (`parse_page_range()`).
+
+### 3. No Session Management
+Each operation is independent; there is no way to maintain context across multiple operations.
+
+### 4. Missing Event System
+No progress callbacks or event hooks for long-running operations.
+
+### 5. Scattered State Management
+State is split across several independently shared handles (`Arc<RwLock<...>>`, `Arc<Mutex<...>>`, plain `Arc<...>`).
+
+---
+
+## Proposed Architecture
+
+### Module Structure
+
+```
+src/client/
+├── mod.rs          # Re-exports and documentation
+├── engine.rs       # Core orchestrator (simplified)
+├── builder.rs      # Builder pattern (enhanced)
+├── types.rs        # Public API types
+├── context.rs      # Request context and configuration
+├── session.rs      # Session management
+├── indexer.rs      # Document indexing operations
+├── retriever.rs    # Query and retrieval operations
+├── workspace.rs    # Workspace operations (CRUD)
+└── events.rs       # Event system and callbacks
+```
+
+### Architecture Diagram
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                           Client API                            │
+├─────────────────────────────────────────────────────────────────┤
+│                                                                 │
+│  ┌──────────────┐     ┌──────────────┐     ┌──────────────┐     │
+│  │ EngineBuilder│───▶│    Engine    │◀───│   Session    │     │
+│  └──────────────┘     └──────┬───────┘     └──────────────┘     │
+│                              │                                  │
+│               ┌──────────────┼──────────────┐                   │
+│               ▼              ▼              ▼                   │
+│        ┌─────────────┐ ┌─────────────┐ ┌─────────────┐          │
+│        │   Indexer   │ │  Retriever  │ │  Workspace  │          │
+│        │   Client    │ │   Client    │ │   Client    │          │
+│        └──────┬──────┘ └──────┬──────┘ └──────┬──────┘          │
+│               │               │               │                 │
+│               └───────────────┴───────────────┘                 │
+│                               │                                 │
+│                               ▼                                 │
+│                      ┌────────────────┐                         │
+│                      │    Context     │                         │
+│                      │ (Request State)│                         │
+│                      └────────────────┘                         │
+│                                                                 │
+│                      ┌────────────────┐                         │
+│                      │     Events     │                         │
+│                      │  (Callbacks)   │                         │
+│                      └────────────────┘                         │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## Component Design
+
+### 1. Context (`context.rs`)
+
+Request-scoped configuration and state management.
+
+```rust
+/// Request context for client operations.
+pub struct ClientContext {
+    /// Unique request ID for tracing.
+    pub request_id: Uuid,
+
+    /// Request-specific configuration overrides.
+    pub config: RequestContextConfig,
+
+    /// Event emitter for this request.
+    pub events: EventEmitter,
+
+    /// Request metadata.
+    pub metadata: HashMap<String, String>,
+
+    /// Request deadline (for timeout).
+    pub deadline: Option<Instant>,
+}
+
+/// Request-specific configuration overrides.
+pub struct RequestContextConfig {
+    /// Override top_k for retrieval.
+    pub top_k: Option<usize>,
+
+    /// Override token budget.
+    pub token_budget: Option<usize>,
+
+    /// Override content format.
+    pub content_format: Option<ContentFormat>,
+
+    /// Enable/disable features.
+    pub features: FeatureFlags,
+}
+
+/// Feature flags for request.
+pub struct FeatureFlags {
+    pub include_summaries: bool,
+    pub include_content: bool,
+    pub enable_cache: bool,
+    pub enable_sufficiency_check: bool,
+}
+```
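+
+As a quick illustration of how these overrides are meant to compose — a sketch only: the `with_*` builders mirror the Request Context example later in this document, and the fallback to `retriever_config.default_top_k` is an assumption, not settled API:
+
+```rust
+// Build a request context with per-request overrides.
+let ctx = ClientContext::new()
+    .with_top_k(10)
+    .with_deadline(Duration::from_secs(30));
+
+// Inside a stage, an unset override falls back to the client-level default
+// (`retriever_config` is the RetrieverClient's config from section 4 below).
+let top_k = ctx.config.top_k.unwrap_or(retriever_config.default_top_k);
+```
+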
+### 2. Session (`session.rs`)
+
+Multi-document session management.
+
+```rust
+/// Session for managing multiple document operations.
+pub struct Session {
+    /// Session ID.
+    pub id: Uuid,
+
+    /// Session configuration.
+    config: SessionConfig,
+
+    /// Active document contexts.
+    documents: HashMap<String, DocumentContext>,
+
+    /// Shared engine reference.
+    engine: Engine,
+
+    /// Session statistics.
+    stats: SessionStats,
+
+    /// Created at timestamp.
+    created_at: DateTime<Utc>,
+}
+
+/// Document context within a session.
+pub struct DocumentContext {
+    /// Document ID.
+    pub doc_id: String,
+
+    /// Preloaded tree (cached).
+    tree: Option<Arc<DocumentTree>>,
+
+    /// Document metadata.
+    meta: DocumentMeta,
+
+    /// Access statistics.
+    access_count: usize,
+    last_accessed: DateTime<Utc>,
+}
+
+/// Session configuration.
+pub struct SessionConfig {
+    /// Maximum documents to keep in memory.
+    pub max_cached_documents: usize,
+
+    /// Preload strategy.
+    pub preload_strategy: PreloadStrategy,
+
+    /// Cache eviction policy.
+    pub eviction_policy: EvictionPolicy,
+}
+
+impl Session {
+    /// Create a new session.
+    pub fn new(engine: Engine) -> Self;
+
+    /// Index a document into this session.
+    pub async fn index(&self, path: impl AsRef<Path>) -> Result<String>;
+
+    /// Query a document within this session.
+    pub async fn query(&self, doc_id: &str, question: &str) -> Result<RetrievalResult>;
+
+    /// Query across all documents in session.
+    pub async fn query_all(&self, question: &str) -> Result<Vec<RetrievalResult>>;
+
+    /// Get document tree (cached).
+    pub fn get_tree(&self, doc_id: &str) -> Result<Arc<DocumentTree>>;
+
+    /// Preload documents for faster access.
+    pub async fn preload(&self, doc_ids: &[&str]) -> Result<()>;
+
+    /// Clear session cache.
+    pub fn clear_cache(&self);
+
+    /// Get session statistics.
+    pub fn stats(&self) -> &SessionStats;
+}
+```
+
+### 3. Indexer Client (`indexer.rs`)
+
+Document indexing operations.
+
+```rust
+/// Document indexing client.
+pub struct IndexerClient {
+    /// Pipeline executor.
+    executor: Arc<RwLock<PipelineExecutor>>,
+
+    /// Configuration.
+    config: IndexerConfig,
+}
+
+/// Indexing configuration.
+pub struct IndexerConfig {
+    /// Default index mode.
+    pub default_mode: IndexMode,
+
+    /// Summary generation strategy.
+    pub summary_strategy: SummaryStrategy,
+
+    /// Whether to generate node IDs.
+    pub generate_ids: bool,
+
+    /// Whether to generate descriptions.
+    pub generate_descriptions: bool,
+}
+
+impl IndexerClient {
+    /// Create a new indexer client.
+    pub fn new(executor: PipelineExecutor) -> Self;
+
+    /// Index a document from file.
+    pub async fn index_file(
+        &self,
+        path: impl AsRef<Path>,
+        options: IndexOptions,
+        events: &EventEmitter,
+    ) -> Result<String>;
+
+    /// Index from raw content.
+    pub async fn index_content(
+        &self,
+        content: &str,
+        format: DocumentFormat,
+        options: IndexOptions,
+    ) -> Result<String>;
+
+    /// Detect document format.
+    pub fn detect_format(&self, path: &Path, options: &IndexOptions) -> Result<DocumentFormat>;
+
+    /// Validate document before indexing.
+    pub fn validate(&self, path: &Path) -> Result<()>;
+}
+
+/// Indexing events.
+pub enum IndexEvent {
+    /// Started indexing.
+    Started { path: String },
+
+    /// Format detected.
+    FormatDetected { format: DocumentFormat },
+
+    /// Parsing progress.
+    ParsingProgress { percent: u8 },
+
+    /// Tree building complete.
+    TreeBuilt { node_count: usize },
+
+    /// Summary generation progress.
+    SummaryProgress { completed: usize, total: usize },
+
+    /// Indexing complete.
+    Complete { doc_id: String },
+
+    /// Error occurred.
+    Error { message: String },
+}
+```
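+
+A sketch of how these events could be consumed for progress reporting, using the closure-based `on_index` helper defined in the Events section below (wiring shown for illustration, not settled API):
+
+```rust
+let events = EventEmitter::new().on_index(|e| match e {
+    IndexEvent::ParsingProgress { percent } => println!("parsing: {percent}%"),
+    IndexEvent::SummaryProgress { completed, total } => {
+        println!("summaries: {completed}/{total}")
+    }
+    IndexEvent::Complete { doc_id } => println!("indexed: {doc_id}"),
+    _ => {}
+});
+```
+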
+### 4. Retriever Client (`retriever.rs`)
+
+Query and retrieval operations.
+
+```rust
+/// Document retrieval client.
+pub struct RetrieverClient {
+    /// Pipeline retriever.
+    retriever: Arc<PipelineRetriever>,
+
+    /// Configuration.
+    config: RetrieverConfig,
+}
+
+/// Retrieval configuration.
+pub struct RetrieverConfig {
+    /// Default top_k.
+    pub default_top_k: usize,
+
+    /// Default token budget.
+    pub default_token_budget: usize,
+
+    /// Content aggregator config.
+    pub content_config: ContentAggregatorConfig,
+
+    /// Enable caching.
+    pub enable_cache: bool,
+}
+
+impl RetrieverClient {
+    /// Create a new retriever client.
+    pub fn new(retriever: PipelineRetriever) -> Self;
+
+    /// Query a document tree.
+    pub async fn query(
+        &self,
+        tree: &DocumentTree,
+        question: &str,
+        options: RetrieveOptions,
+        ctx: &ClientContext,
+    ) -> Result<RetrievalResult>;
+
+    /// Query with streaming results.
+    pub async fn query_stream(
+        &self,
+        tree: &DocumentTree,
+        question: &str,
+        options: RetrieveOptions,
+    ) -> impl Stream<Item = QueryEvent>;
+
+    /// Get similar nodes.
+    pub fn find_similar(
+        &self,
+        tree: &DocumentTree,
+        node_id: NodeId,
+        top_k: usize,
+    ) -> Result<Vec<NodeId>>;
+
+    /// Get node context (ancestors + siblings).
+    pub fn get_node_context(
+        &self,
+        tree: &DocumentTree,
+        node_id: NodeId,
+        depth: usize,
+    ) -> Result<NodeContext>;
+}
+
+/// Query events for streaming.
+pub enum QueryEvent {
+    /// Search started.
+    SearchStarted { query: String },
+
+    /// Node visited during search.
+    NodeVisited { node_id: String, title: String, score: f32 },
+
+    /// Candidate found.
+    CandidateFound { node_id: String, score: f32 },
+
+    /// Sufficiency check result.
+    SufficiencyCheck { level: SufficiencyLevel, tokens: usize },
+
+    /// Result ready.
+    ResultReady { result: RetrievalResult },
+
+    /// Query complete.
+    Complete { total_results: usize, confidence: f32 },
+}
+```
+
+### 5. Workspace Client (`workspace.rs`)
+
+Document persistence operations.
+
+```rust
+/// Workspace management client.
+pub struct WorkspaceClient {
+    /// Workspace storage.
+    workspace: Arc<RwLock<Workspace>>,
+
+    /// Configuration.
+    config: WorkspaceConfig,
+}
+
+/// Workspace configuration.
+pub struct WorkspaceConfig {
+    /// Auto-save interval (seconds).
+    pub auto_save_interval: Option<u64>,
+
+    /// Maximum cache size.
+    pub max_cache_size: usize,
+}
+
+impl WorkspaceClient {
+    /// Create a new workspace client.
+    pub fn new(workspace: Workspace) -> Self;
+
+    /// Save a document.
+    pub fn save(&self, doc: &PersistedDocument) -> Result<()>;
+
+    /// Load a document.
+    pub fn load(&self, doc_id: &str) -> Result<Option<PersistedDocument>>;
+
+    /// Remove a document.
+    pub fn remove(&self, doc_id: &str) -> Result<bool>;
+
+    /// Check if document exists.
+    pub fn exists(&self, doc_id: &str) -> Result<bool>;
+
+    /// List all documents.
+    pub fn list(&self) -> Result<Vec<String>>;
+
+    /// Get document metadata.
+    pub fn get_meta(&self, doc_id: &str) -> Result<Option<DocumentMeta>>;
+
+    /// Batch operations.
+    pub fn batch_remove(&self, doc_ids: &[&str]) -> Result<usize>;
+
+    /// Clear workspace.
+    pub fn clear(&self) -> Result<usize>;
+
+    /// Get workspace statistics.
+    pub fn stats(&self) -> WorkspaceStats;
+}
+
+/// Workspace statistics.
+pub struct WorkspaceStats {
+    pub document_count: usize,
+    pub total_size_bytes: u64,
+    pub cache_hit_rate: f32,
+    pub oldest_document: Option<DateTime<Utc>>,
+    pub newest_document: Option<DateTime<Utc>>,
+}
+```
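+
+A minimal sketch of the intended read path, assuming the `Result<Option<...>>` shape reconstructed above (where `Ok(None)` means "not found" and `Err` is reserved for storage failures):
+
+```rust
+match workspace.load("doc-1")? {
+    // Document found: report cache effectiveness so far.
+    Some(_doc) => println!("hit rate so far: {:.2}", workspace.stats().cache_hit_rate),
+    // Absence is a normal outcome, not an error.
+    None => println!("doc-1 is not in the workspace"),
+}
+```
+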
+### 6. Events (`events.rs`)
+
+Event system for callbacks and progress reporting.
+
+```rust
+/// Event emitter for client operations.
+pub struct EventEmitter {
+    /// Event handlers.
+    handlers: Vec<Box<dyn EventHandler>>,
+
+    /// Async handlers (for non-blocking events).
+    async_handlers: Vec<Arc<dyn AsyncEventHandler>>,
+}
+
+/// Event handler trait.
+pub trait EventHandler: Send + Sync {
+    fn handle(&self, event: &Event);
+}
+
+/// Async event handler trait.
+#[async_trait]
+pub trait AsyncEventHandler: Send + Sync {
+    async fn handle(&self, event: &Event);
+}
+
+/// Event types.
+#[derive(Debug, Clone)]
+pub enum Event {
+    /// Indexing events.
+    Index(IndexEvent),
+
+    /// Query events.
+    Query(QueryEvent),
+
+    /// Workspace events.
+    Workspace(WorkspaceEvent),
+
+    /// Session events.
+    Session(SessionEvent),
+}
+
+/// Workspace events.
+pub enum WorkspaceEvent {
+    DocumentSaved { doc_id: String },
+    DocumentLoaded { doc_id: String, cache_hit: bool },
+    DocumentRemoved { doc_id: String },
+    WorkspaceCleared { count: usize },
+}
+
+/// Session events.
+pub enum SessionEvent {
+    SessionCreated { session_id: Uuid },
+    DocumentAdded { doc_id: String },
+    DocumentEvicted { doc_id: String, reason: EvictionReason },
+    SessionClosed { session_id: Uuid },
+}
+
+impl EventEmitter {
+    /// Create a new event emitter.
+    pub fn new() -> Self;
+
+    /// Add a sync handler.
+    pub fn on<H: EventHandler + 'static>(mut self, handler: H) -> Self;
+
+    /// Add an async handler.
+    pub fn on_async(mut self, handler: Arc<dyn AsyncEventHandler>) -> Self;
+
+    /// Emit an event.
+    pub fn emit(&self, event: Event);
+
+    /// Emit an event asynchronously.
+    pub async fn emit_async(&self, event: Event);
+}
+
+/// Convenience handler builders.
+impl EventEmitter {
+    /// Create handler from closure.
+    pub fn on_index<F: Fn(&IndexEvent) + Send + Sync + 'static>(self, f: F) -> Self;
+
+    /// Create handler from closure.
+    pub fn on_query<F: Fn(&QueryEvent) + Send + Sync + 'static>(self, f: F) -> Self;
+
+    /// Create progress callback.
+    pub fn on_progress<F: Fn(&Progress) + Send + Sync + 'static>(self, f: F) -> Self;
+}
+
+/// Progress information.
+pub struct Progress {
+    pub operation: Operation,
+    pub current: usize,
+    pub total: usize,
+    pub message: String,
+}
+
+pub enum Operation {
+    Indexing,
+    Querying,
+    Loading,
+    Saving,
+}
+```
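+
+For example, a custom sync handler would be a small unit struct implementing `EventHandler` — a sketch against the trait as declared above (`Event` derives `Debug`, so a logging handler needs nothing else):
+
+```rust
+struct StderrLogger;
+
+impl EventHandler for StderrLogger {
+    fn handle(&self, event: &Event) {
+        // Event derives Debug, so any variant can be logged generically.
+        eprintln!("[vectorless] {event:?}");
+    }
+}
+
+let events = EventEmitter::new().on(StderrLogger);
+```
+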
+
+### 7. Simplified Engine (`engine.rs`)
+
+The main orchestrator, now much simpler.
+
+```rust
+/// The main Engine client - orchestrates sub-clients.
+pub struct Engine {
+    /// Configuration.
+    config: Arc<Config>,
+
+    /// Indexer client.
+    indexer: IndexerClient,
+
+    /// Retriever client.
+    retriever: RetrieverClient,
+
+    /// Workspace client (optional).
+    workspace: Option<WorkspaceClient>,
+
+    /// Event emitter.
+    events: EventEmitter,
+}
+
+impl Engine {
+    /// Create a builder for custom configuration.
+    pub fn builder() -> EngineBuilder;
+
+    // ============================================================
+    // Convenience Methods (delegate to sub-clients)
+    // ============================================================
+
+    /// Index a document.
+    pub async fn index(&self, path: impl AsRef<Path>) -> Result<String> {
+        self.index_with_options(path, IndexOptions::default()).await
+    }
+
+    /// Index with options.
+    pub async fn index_with_options(
+        &self,
+        path: impl AsRef<Path>,
+        options: IndexOptions,
+    ) -> Result<String>;
+
+    /// Query a document.
+    pub async fn query(&self, doc_id: &str, question: &str) -> Result<RetrievalResult>;
+
+    /// Create a session for multi-document operations.
+    pub fn session(&self) -> Session;
+
+    /// Get the indexer client.
+    pub fn indexer(&self) -> &IndexerClient;
+
+    /// Get the retriever client.
+    pub fn retriever(&self) -> &RetrieverClient;
+
+    /// Get the workspace client.
+    pub fn workspace(&self) -> Option<&WorkspaceClient>;
+
+    /// Get configuration.
+    pub fn config(&self) -> &Config;
+
+    // ============================================================
+    // Document Operations (delegate to workspace)
+    // ============================================================
+
+    /// List documents.
+    pub fn list_documents(&self) -> Vec<String>;
+
+    /// Get document structure.
+    pub fn get_structure(&self, doc_id: &str) -> Result<String>;
+
+    /// Get page content.
+    pub fn get_page_content(&self, doc_id: &str, pages: &str) -> Result<String>;
+
+    /// Remove document.
+    pub fn remove(&self, doc_id: &str) -> Result<bool>;
+
+    /// Check existence.
+    pub fn exists(&self, doc_id: &str) -> Result<bool>;
+}
+```
+
+---
+
+## API Examples
+
+### Basic Usage (Same as Before)
+
+```rust
+let client = EngineBuilder::new()
+    .with_workspace("./workspace")
+    .build()?;
+
+// Index
+let doc_id = client.index("./document.md").await?;
+
+// Query
+let result = client.query(&doc_id, "What is this?").await?;
+```
+
+### With Events
+
+```rust
+let client = EngineBuilder::new()
+    .with_workspace("./workspace")
+    .with_events(
+        EventEmitter::new()
+            .on_index(|e| match e {
+                IndexEvent::Complete { doc_id } => println!("Indexed: {}", doc_id),
+                _ => {}
+            })
+            .on_query(|e| match e {
+                QueryEvent::NodeVisited { title, score, .. } => {
+                    println!("Visited: {} (score: {:.2})", title, score);
+                }
+                _ => {}
+            })
+    )
+    .build()?;
+```
+
+### Session-Based Multi-Document
+
+```rust
+let client = EngineBuilder::new()
+    .with_workspace("./workspace")
+    .build()?;
+
+// Create session
+let session = client.session();
+
+// Index multiple documents
+let doc1 = session.index("./doc1.md").await?;
+let doc2 = session.index("./doc2.md").await?;
+let doc3 = session.index("./doc3.md").await?;
+
+// Query across all documents
+let results = session.query_all("What is the architecture?").await?;
+
+// Query single document (cached tree)
+let result = session.query(&doc1, "Summary?").await?;
+
+// Session stats
+println!("Cache hit rate: {:.2}%", session.stats().cache_hit_rate * 100.0);
+```
+
+### Streaming Query
+
+```rust
+let client = EngineBuilder::new()
+    .with_workspace("./workspace")
+    .build()?;
+
+// Stream query results
+let mut stream = client.retriever()
+    .query_stream(&tree, "What is X?", RetrieveOptions::default());
+
+while let Some(event) = stream.next().await {
+    match event {
+        QueryEvent::NodeVisited { title, score, .. } => {
+            println!("Exploring: {}", title);
+        }
+        QueryEvent::ResultReady { result } => {
+            println!("Found: {}", result.title);
+        }
+        QueryEvent::Complete { total_results, confidence } => {
+            println!("Done: {} results, confidence: {:.2}", total_results, confidence);
+        }
+        _ => {}
+    }
+}
+```
+
+### Request Context
+
+```rust
+let ctx = ClientContext::new()
+    .with_top_k(10)
+    .with_token_budget(8000)
+    .with_deadline(Duration::from_secs(30));
+
+let result = client.retriever()
+    .query(&tree, "complex question", options, &ctx)
+    .await?;
+```
+
+---
+
+## Migration Path
+
+### Phase 1: Add New Modules (Non-Breaking)
+1. Create `context.rs`, `events.rs`
+2. Create `indexer.rs`, `retriever.rs`, `workspace.rs` as wrappers
+3. Update `engine.rs` to use sub-clients internally
+4. All existing APIs remain unchanged
+
+### Phase 2: Add Session Support (Non-Breaking)
+1. Add `session.rs`
+2. Add `Engine::session()` method
+3. Add multi-document query support
+
+### Phase 3: Enhance Events (Non-Breaking)
+1. Add streaming query support
+2. Add progress callbacks
+3. Add async event handlers
+
+### Phase 4: Deprecate Old API (Breaking, Future)
+1. Mark direct workspace access as deprecated
+2. Encourage use of sub-clients
+3.
Eventually remove deprecated methods + +--- + +## File Structure After Refactoring + +``` +src/client/ +├── mod.rs # ~50 lines - exports and docs +├── engine.rs # ~150 lines - orchestration only +├── builder.rs # ~200 lines - enhanced builder +├── types.rs # ~250 lines - public types +├── context.rs # ~150 lines - request context +├── session.rs # ~200 lines - session management +├── indexer.rs # ~200 lines - indexing ops +├── retriever.rs # ~200 lines - retrieval ops +├── workspace.rs # ~150 lines - workspace ops +└── events.rs # ~200 lines - event system +``` + +Total: ~1750 lines (vs current ~1000 lines, but much better organized) + +--- + +## Benefits + +1. **Single Responsibility**: Each module has one clear purpose +2. **Testability**: Sub-clients can be tested independently +3. **Extensibility**: Easy to add new features without touching Engine +4. **Performance**: Session caching reduces redundant loads +5. **Observability**: Events provide visibility into operations +6. **API Clarity**: Clear separation between indexing, retrieval, and storage +7. **Streaming**: Support for progressive results +8. **Context Management**: Request-scoped configuration diff --git a/docs/design/content-aggregation.md b/docs/design/content-aggregation.md new file mode 100644 index 00000000..22a7d7dd --- /dev/null +++ b/docs/design/content-aggregation.md @@ -0,0 +1,361 @@ +# Content Aggregation Design + +> Version: 1.0 +> Status: Draft +> Last Updated: 2026-04-04 + +## Overview + +Content Aggregation is the final stage of the retrieval pipeline that transforms candidate nodes into structured, relevant content for the user. This document describes the design for a precision-focused, budget-aware content aggregation system. + +## Problem Statement + +### Current Implementation + +The current `aggregate_content` in `JudgeStage` collects content naively: + +``` +Candidate Node → Node's own content + ALL descendant leaf content +``` + +### Issues + +| Issue | Impact | +|-------|--------| +| **No relevance filtering** | Returns all content from subtree, including irrelevant parts | +| **No token budget** | Large documents may return tens of thousands of tokens | +| **No prioritization** | All leaf content treated equally | +| **Lost structure** | Flat concatenation loses hierarchical context | + +## Design Goals + +1. **Precision First** - Only return truly relevant content +2. **Budget Aware** - Optimize within token constraints +3. **Structure Aware** - Maintain hierarchical context +4. **Incremental** - Support progressive refinement +5. **Explainable** - Traceable selection decisions + +## Architecture + +### High-Level Flow + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Content Aggregator │ +├─────────────────────────────────────────────────────────────┤ +│ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ Relevance │ │ Budget │ │ Structure │ │ +│ │ Scorer │─▶│ Allocator │─▶│ Builder │ │ +│ └──────────────┘ └──────────────┘ └──────────────┘ │ +│ ↑ ↑ ↑ │ +│ │ │ │ │ +│ ┌──────┴──────┐ ┌──────┴──────┐ ┌──────┴──────┐ │ +│ │ Query- │ │ Token │ │ Hierarchy │ │ +│ │ Node │ │ Budget │ │ Context │ │ +│ │ Scoring │ │ Config │ │ Assembly │ │ +│ └─────────────┘ └─────────────┘ └─────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────┘ +``` + +### Processing Pipeline + +``` +Candidate Nodes + │ + ▼ +┌─────────────────┐ +│ 1. Collect │ Gather all nodes from candidates + descendants +│ Nodes │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ 2. 
Score │ Compute relevance score for each content chunk
+│    Relevance    │
+└────────┬────────┘
+         │
+         ▼
+┌─────────────────┐
+│  3. Filter      │  Remove content below relevance threshold
+│     by Score    │
+└────────┬────────┘
+         │
+         ▼
+┌─────────────────┐
+│  4. Allocate    │  Distribute token budget optimally
+│     Budget      │
+└────────┬────────┘
+         │
+         ▼
+┌─────────────────┐
+│  5. Build       │  Assemble structured output
+│     Structure   │
+└────────┬────────┘
+         │
+         ▼
+   Final Content
+```
+
+## Module Design
+
+### 1. RelevanceScorer
+
+Computes fine-grained relevance scores for content.
+
+```rust
+pub struct RelevanceScorer {
+    query_keywords: Vec<String>,
+    strategy: ScoringStrategy,
+}
+
+pub enum ScoringStrategy {
+    /// Fast keyword matching only
+    KeywordOnly,
+    /// Keyword + BM25 scoring
+    KeywordWithBM25,
+    /// Keyword + LLM reranking
+    Hybrid { rerank_top_k: usize },
+}
+
+pub struct ContentRelevance {
+    pub node_id: NodeId,
+    pub chunk: ContentChunk,
+    pub score: f32,
+    pub components: ScoreComponents,
+}
+
+pub struct ScoreComponents {
+    pub keyword_score: f32,   // Keyword match quality
+    pub depth_penalty: f32,   // Distance from candidate node
+    pub path_bonus: f32,      // Parent node relevance
+    pub density_score: f32,   // Information density
+}
+```
+
+#### Scoring Formula
+
+```
+final_score = (
+    keyword_score * 0.50 +
+    depth_penalty * 0.20 +
+    path_bonus * 0.15 +
+    density_score * 0.15
+).clamp(0.0, 1.0)
+
+where:
+    depth_penalty = 0.9^depth          // 10% penalty per level
+    path_bonus = parent_score * 0.2
+    density_score = (1 - stopword_ratio) * 0.7 + entity_ratio * 0.3
+```
+
+For example, a chunk two levels below its candidate node with `keyword_score = 0.8`, `parent_score = 0.6`, and `density_score = 0.5` scores `0.8·0.50 + 0.81·0.20 + 0.12·0.15 + 0.5·0.15 ≈ 0.66`.
+
+### 2. BudgetAllocator
+
+Distributes token budget across scored content.
+
+```rust
+pub struct BudgetAllocator {
+    total_budget: usize,
+    strategy: AllocationStrategy,
+}
+
+pub enum AllocationStrategy {
+    /// Select highest-scoring content first
+    Greedy,
+    /// Distribute proportionally to scores
+    Proportional,
+    /// Ensure each depth level has representation
+    Hierarchical { min_per_level: f32 },
+}
+
+pub struct AllocationResult {
+    pub selected: Vec<SelectedContent>,
+    pub tokens_used: usize,
+    pub remaining_budget: usize,
+}
+
+pub struct SelectedContent {
+    pub node_id: NodeId,
+    pub content: String,
+    pub tokens: usize,
+    pub score: f32,
+    pub truncation: Option<TruncationInfo>,
+}
+```
+
+#### Hierarchical Allocation
+
+```
+For each depth level (0 to max_depth):
+  1. Sort content by score
+  2. Allocate up to min_per_level budget
+  3. Continue until level budget exhausted
+  4. Move to next level
+
+Benefits:
+- Ensures context from all levels
+- Prevents shallow-only or deep-only results
+- Maintains document structure awareness
+```
+
+### 3. StructureBuilder
+
+Assembles selected content into structured output.
+
+```rust
+pub struct StructureBuilder {
+    format: OutputFormat,
+    include_metadata: bool,
+}
+
+pub enum OutputFormat {
+    Markdown,
+    Json,
+    Tree,
+    Flat,
+}
+
+pub struct StructuredContent {
+    pub content: String,
+    pub structure: Option<ContentStructure>,
+    pub metadata: ContentMetadata,
+}
+```
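+
+Chained together, the three stages look roughly like this — a sketch whose call shapes mirror `examples/content_aggregation.rs` in this PR (treating `allocate`'s second argument as the maximum chunk depth is an assumption):
+
+```rust
+// 1. Score each chunk against the query.
+let scorer = RelevanceScorer::new(query, ScoringStrategyConfig::KeywordWithBM25);
+let scored: Vec<_> = chunks.iter().map(|c| scorer.score_chunk(c, &ctx)).collect();
+
+// 2. Select the best chunks within the token budget.
+let allocator = BudgetAllocator::new(4000).with_strategy(AllocationStrategy::Greedy);
+let result = allocator.allocate(scored, max_depth); // max_depth assumed
+
+// 3. Assemble the selection into structured output.
+let builder = StructureBuilder::new(OutputFormat::Markdown);
+let structured = builder.build(result.selected, &tree);
+```
+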
+#### Markdown Output Format
+
+```markdown
+## Parent Section
+Parent content here...
+
+### Child Section A
+Child A content here...
+
+### Child Section B
+Child B content here...
+```
+
+## Configuration
+
+```toml
+[retrieval.content]
+# Maximum tokens to return
+token_budget = 4000
+
+# Minimum relevance score (0.0 - 1.0)
+min_relevance_score = 0.3
+
+# Scoring strategy: "keyword_only" | "keyword_bm25" | "hybrid"
+scoring_strategy = "keyword_bm25"
+
+# Output format: "markdown" | "json" | "tree" | "flat"
+output_format = "markdown"
+
+# Include relevance scores in output
+include_scores = false
+
+# Hierarchical allocation minimum per level
+hierarchical_min_per_level = 0.1
+```
+
+## Integration Points
+
+### JudgeStage Integration
+
+```rust
+impl JudgeStage {
+    pub fn with_content_aggregator(mut self, config: ContentAggregatorConfig) -> Self {
+        self.content_aggregator = Some(ContentAggregator::new(config));
+        self
+    }
+
+    fn aggregate_content(&self, ctx: &PipelineContext) -> (String, usize) {
+        if let Some(aggregator) = &self.content_aggregator {
+            aggregator.aggregate(&ctx.candidates, &ctx.tree, &ctx.query)
+        } else {
+            // Fallback to legacy behavior
+            self.aggregate_content_legacy(ctx)
+        }
+    }
+}
+```
+
+### RetrieveOptions Extension
+
+```rust
+impl RetrieveOptions {
+    pub fn with_content_config(mut self, config: ContentAggregatorConfig) -> Self {
+        self.content_config = Some(config);
+        self
+    }
+}
+```
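+
+So a caller that wants tighter precision would wire the config through per request — a sketch using `with_content_config` above plus the `with_token_budget`/`with_min_relevance` builders that appear in `examples/content_aggregation.rs`:
+
+```rust
+// Tighter budget and a higher relevance floor for a precision-focused query.
+let config = ContentAggregatorConfig::new()
+    .with_token_budget(2000)
+    .with_min_relevance(0.5);
+
+let options = RetrieveOptions::default().with_content_config(config);
+```
+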
+## Performance Characteristics
+
+### Latency by Strategy
+
+| Strategy | Latency | Precision | Use Case |
+|----------|---------|-----------|----------|
+| `KeywordOnly` | ~1ms | Medium | Quick preview |
+| `KeywordWithBM25` | ~5ms | High | Default choice |
+| `Hybrid` | ~200ms | Highest | Precision queries |
+
+### Memory Usage
+
+- Scorer: O(n) where n = total content length
+- Allocator: O(m) where m = number of chunks
+- Builder: O(k) where k = selected content size
+
+## Future Enhancements
+
+1. **Semantic Chunking** - Split content by semantic boundaries, not just nodes
+2. **LLM Reranking** - Use LLM to rerank top-k chunks
+3. **Query-Aware Truncation** - Truncate based on query relevance, not just length
+4. **Caching** - Cache aggregation results for repeated queries
+5. **Streaming** - Stream content as it's selected
+
+## File Structure
+
+```
+src/retrieval/content/
+├── mod.rs          # Module entry point
+├── aggregator.rs   # Main aggregator logic
+├── scorer.rs       # Relevance scoring
+├── budget.rs       # Token budget allocation
+├── builder.rs      # Structured output building
+├── truncation.rs   # Smart truncation utilities
+└── config.rs       # Configuration types
+```
+
+## Implementation Priority
+
+| Phase | Component | Priority |
+|-------|-----------|----------|
+| P0 | `RelevanceScorer` (keyword) | High |
+| P0 | `BudgetAllocator` (greedy) | High |
+| P1 | `StructureBuilder` (markdown) | Medium |
+| P1 | BM25 scoring | Medium |
+| P2 | Hybrid strategy (LLM rerank) | Low |
+| P2 | Caching layer | Low |
+
+## Testing Strategy
+
+### Unit Tests
+
+- Scorer: Test keyword extraction, BM25 calculation, density scoring
+- Allocator: Test budget distribution, truncation, edge cases
+- Builder: Test output formats, structure preservation
+
+### Integration Tests
+
+- End-to-end aggregation with real documents
+- Performance benchmarks
+- Token budget compliance
+
+### Quality Metrics
+
+- Precision@k: Relevance of top-k selected chunks
+- Recall: Coverage of relevant content
+- Latency: P50, P95, P99 response times
diff --git a/docs/design/pilot.md b/docs/design/pilot.md
index 0f907f25..d86e00a7 100644
--- a/docs/design/pilot.md
+++ b/docs/design/pilot.md
@@ -76,115 +76,6 @@ Pilot is the core intelligence component of the Vectorless retrieval system; it understands the query,
 └─────────────────────────────────────────────────────────────────────────────┘
 ```
 
-### 1.2 Core Interface Definitions
-
-```rust
-/// Search state - the context handed to the Pilot
-pub struct SearchState<'a> {
-    /// Document tree
-    pub tree: &'a DocumentTree,
-    /// User query
-    pub query: &'a str,
-    /// Current path (from the root to the current node)
-    pub path: &'a [NodeId],
-    /// Candidate child nodes
-    pub candidates: &'a [NodeId],
-    /// Nodes already visited
-    pub visited: &'a HashSet<NodeId>,
-    /// Current depth
-    pub depth: usize,
-    /// Search iteration count
-    pub iteration: usize,
-    /// Current best score
-    pub best_score: f32,
-    /// Whether we are currently backtracking
-    pub is_backtracking: bool,
-}
-
-/// Pilot trait - the core interface
-#[async_trait]
-pub trait Pilot: Send + Sync {
-    /// Get the Pilot's name
-    fn name(&self) -> &str;
-
-    /// Decide whether the Pilot should intervene
-    fn should_intervene(&self, state: &SearchState<'_>) -> bool;
-
-    /// Make a decision
-    async fn decide(&self, state: &SearchState<'_>) -> PilotDecision;
-
-    /// Guidance before the search starts
-    async fn guide_start(
-        &self,
-        tree: &DocumentTree,
-        query: &str
-    ) -> Option<PilotDecision>;
-
-    /// Get the configuration
-    fn config(&self) -> &PilotConfig;
-
-    /// Get the metrics
-    fn metrics(&self) -> &PilotMetrics;
-
-    /// Reset state (called when a new query starts)
-    fn reset(&self);
-}
-```
-
-### 1.3 Pilot Decision Types
-
-```rust
-/// The result of a Pilot decision
-#[derive(Debug, Clone)]
-pub struct PilotDecision {
-    /// Ranked candidate nodes (by recommended priority)
-    pub ranked_candidates: Vec<RankedCandidate>,
-    /// Suggested search direction
-    pub direction: SearchDirection,
-    /// Confidence (0.0 - 1.0)
-    pub confidence: f32,
-    /// Reasoning behind the decision (for explainability)
-    pub reasoning: String,
-    /// Intervention point identifier
-    pub intervention_point: InterventionPoint,
-}
-
-/// A ranked candidate node
-#[derive(Debug, Clone)]
-pub struct RankedCandidate {
-    pub node_id: NodeId,
-    pub score: f32,
-    pub reason: Option<String>,
-}
-
-/// Suggested search direction
-#[derive(Debug, Clone)]
-pub enum SearchDirection {
-    /// Keep going deeper into the current branch
-    GoDeeper {
-        reason: String,
-    },
-    /// Explore sibling nodes
-    ExploreSiblings {
-        recommended: Vec<NodeId>,
-    },
-    /// Backtrack to the parent node
-    Backtrack {
-        reason: String,
-        alternative_branches: Vec<NodeId>,
-    },
-    /// Jump to a specific node (a non-local move)
-    JumpTo {
-        target: NodeId,
-        reason: String,
-    },
-    /// The current node is the answer
-    FoundAnswer {
-        confidence: f32,
-    },
-}
-```
-
 ---
 
 ## 1.4 Information Sources for Pilot Decisions
diff --git a/docs/design/roadmap.md b/docs/design/roadmap.md
deleted file mode 100644
index 87a867a6..00000000
--- a/docs/design/roadmap.md
+++
/dev/null
@@ -1,247 +0,0 @@
-# Architecture Assessment & Roadmap
-
-> Assessment date: 2026-04-03
-> Assessed version: v0.1.7
-
-## Current Status
-
-| Metric | Status |
-|------|------|
-| **Tests** | 129 passed, 0 failed |
-| **Code size** | 17,695 lines of Rust (112 files) |
-| **Modules** | client, domain, index, retrieval, llm, parser, storage, throttle |
-| **Build** | succeeds (warnings only) |
-
-## Architecture Highlights
-
-### 1. Consistent Dual-Pipeline Design
-
-Index and Retrieval both use the same orchestrator pattern:
-- Dependency resolution (topological sort)
-- ExecutionGroup supports parallelism
-- FailurePolicy (Fail/Skip/Retry)
-- StageOutcome flow control
-
-```
-┌─────────────────────────────────────────────────────────────┐
-│                    Orchestrator Pattern                     │
-├─────────────────────────────────────────────────────────────┤
-│  Index Pipeline          │  Retrieval Pipeline              │
-│  ─────────────           │  ─────────────────               │
-│  Parse → Build →         │  Analyze → Plan →                │
-│  Enhance → Enrich →      │  Search → Judge                  │
-│  Optimize                │  (supports backtracking)         │
-└─────────────────────────────────────────────────────────────┘
-```
-
-### 2. Clear Layered Architecture
-
-```
-client (Engine) → index/retrieval → domain ← parser/llm/config
-```
-
-- **client**: high-level API that hides internal complexity
-- **domain**: core domain types with no external dependencies
-- **index/retrieval**: business logic operating on domain
-- **parser/llm/config**: infrastructure providing capabilities
-
-### 3. Good Modularity
-
-Each module has a single responsibility:
-- `parser/` - document parsing (Markdown, PDF, DOCX)
-- `llm/` - LLM clients (retry, fallback, pool)
-- `storage/` - persistence (Workspace, LRU cache)
-- `throttle/` - rate limiting
-
----
-
-## Areas for Improvement
-
-### Code Quality (Clippy Warnings)
-
-| Type | Count | Example |
-|------|------|------|
-| unused variable | 8 | `_context`, `_query`, `_strategy` |
-| dead_code | 5 | `find_stage_index`, `term_frequency` |
-| must_use | 12 | builder methods missing `#[must_use]` |
-| style | 3 | redundant else, unnecessary hashes |
-
-### Missing Features
-
-| Module | Missing | Impact |
-|------|------|------|
-| `parser/registry.rs` | HTML parser | HTML format unsupported |
-| `parser/toc/processor.rs` | structure extraction for documents without a ToC | relies on the LLM |
-| `retrieval/strategy/llm.rs` | batched prompt optimization | performance |
-
-### Architectural Limitations
-
-| Limitation | Notes |
-|------|------|
-| **Parallel execution unimplemented** | ExecutionGroup is designed, but `execute()` still runs sequentially |
-| **No strategy switching** | once Plan picks a strategy, it cannot be switched mid-search |
-| **Incremental indexing is a skeleton** | `ChangeDetector` exists but is not integrated into the pipeline |
-
----
-
-## Next-Phase Optimization Plan
-
-### Phase 1: Code Cleanup (Priority: High)
-
-**Goal**: eliminate all clippy warnings
-
-| Task | Files | Effort |
-|------|------|--------|
-| Add `#[must_use]` | builder types | ~12 sites |
-| Fix unused variables | various modules | ~8 sites |
-| Remove dead code | `search/mod.rs`, `strategy/keyword.rs` | ~5 sites |
-| Fix style issues | scattered | ~3 sites |
-
-**Acceptance criteria**: `cargo clippy` reports no warnings
-
----
-
-### Phase 2: Feature Completion (Priority: Medium)
-
-#### 2.1 HTML Parser
-
-```rust
-// src/parser/html/mod.rs (new)
-pub struct HtmlParser {
-    config: HtmlConfig,
-}
-
-impl DocumentParser for HtmlParser {
-    fn parse(&self, content: &str) -> ParseResult {
-        // use the html5ever or scraper crate
-    }
-}
-```
-
-#### 2.2 Hot Strategy Switching
-
-Current: the strategy chosen in the Plan stage is fixed.
-Target: the Search stage switches dynamically based on effectiveness.
-
-```rust
-// in SearchStage
-if current_strategy.is_struggling() {
-    ctx.switch_strategy(Strategy::more_capable());
-}
-```
-
-#### 2.3 Incremental Indexing Integration
-
-```rust
-// in PipelineExecutor
-pub fn execute_incremental(
-    &mut self,
-    input: IndexInput,
-    changes: ChangeSet,
-) -> Result {
-    // only process the changed parts
-}
-```
-
----
-
-### Phase 3: Performance Optimization (Priority: Medium)
-
-#### 3.1 Implementing Parallel Execution
-
-**Current state**: `ExecutionGroup` is designed, but `execute()` still runs sequentially
-
-```rust
-// current (sequential)
-for &stage_idx in &group.stage_indices {
-    entry.stage.execute(&mut ctx).await?;
-}
-
-// target (parallel)
-futures::future::try_join_all(
-    group.stage_indices.iter()
-        .map(|&idx| self.stages[idx].execute(&ctx))
-).await?;
-```
-
-**Challenges**:
-- `PipelineContext` must be `Send + Sync`
-- requires fine-grained locking or message passing
-
-#### 3.2 Path Cache Hit Rate
-
-```rust
-// add a hot-query cache
-pub struct PathCache {
-    entries: LruCache<String, Vec<NodeId>>,
-    hot_queries: Arc<RwLock<HashSet<String>>>,  // new
-}
-```
-
-#### 3.3 Batched LLM Calls
-
-```rust
-// current: evaluate one node at a time
-for node_id in node_ids {
-    self.evaluate_node(tree, node_id, context).await;
-}
-
-// target: evaluate in batch
-self.evaluate_nodes_batch(tree, node_ids, context).await;
-```
-
----
-
-### Phase 4: Test Enhancement (Priority: Low)
-
-| Test type | Current | Target |
-|----------|------|------|
-| Unit tests | 129 | +50 |
-| Integration tests | 0 (examples only) | +10 |
-| Property tests | 0 | +5 |
-| Coverage report | none | cargo-tarpaulin |
-
----
-
-## Execution Order
-
-```
-Phase 1 (code cleanup)
-    ↓
-Phase 3.1 (parallel execution)
-    ↓
-Phase 2 (feature completion)
-    ↓
-Phase 4 (test enhancement)
-```
-
-**Run Phase 1 (code cleanup) first** to eliminate all clippy warnings and leave the codebase cleaner.
-
----
-
-## File Change Preview
-
-### Files Involved in Phase 1
-
-```
-src/
-├── client/builder.rs              # add #[must_use]
-├── config/types.rs                # add #[must_use]
-├── domain/tree.rs                 # remove dead code
-├── index/
-│   ├── pipeline/orchestrator.rs   # remove find_stage_index
-│   └── stages/*.rs                # fix unused
-├── retrieval/
-│   ├── search/mod.rs              # remove dead code
-│   ├── strategy/keyword.rs        # remove term_frequency
-│   └── stages/*.rs                # fix unused
-└── llm/client.rs                  # fix unused max_tokens
-```
-
----
-
-## References
-
-- [Architecture v2](./architecture-v2.svg)
-- [Pipeline Design](./v2.md)
-- [RFCs](../rfcs/)
diff --git a/examples/batch_processing.rs b/examples/batch_processing.rs
new file mode 100644
index 00000000..6906189f
--- /dev/null
+++ b/examples/batch_processing.rs
@@ -0,0 +1,972 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Batch document processing example.
+//!
+//! This example demonstrates how to efficiently process
+//! multiple documents in batch mode using sessions.
+//!
+//! # Usage
+//!
+//! ```bash
+//! cargo run --example batch_processing
+//! ```
+
+use vectorless::client::EngineBuilder;
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    println!("=== Batch Document Processing Example ===\n");
+
+    // 1. Create engine and session
+    println!("Step 1: Setting up...");
+    let engine = EngineBuilder::new()
+        .with_workspace("./workspace_batch_example")
+        .build()
+        .map_err(|e| vectorless::Error::Config(e.to_string()))?;
+
+    let session = engine.session();
+    println!("  ✓ Session created: {}\n", session.id());
+
+    // 2. Create sample documents
+    println!("Step 2: Creating sample documents...");
+    let temp_dir = tempfile::tempdir()?;
+
+    let documents = vec![
+        ("intro.md", r#"# Introduction
+
+Welcome to the vectorless library. This is a document intelligence engine.
+
+## Features
+
+- Tree-based navigation
+- Multi-format support
+- Session management
+"#),
+        ("api.md", r#"# API Reference
+
+## Engine
+
+The main client for document operations.
+
+### Methods
+
+- `index(path)`: Index a document
+- `query(question)`: Query indexed content
+
+## Session
+
+Multi-document operations with caching.
+
+### Methods
+
+- `index(path)`: Index into session
+- `query_all(question)`: Query across all documents
+"#),
+        ("guide.md", r#"# User Guide
+
+## Getting Started
+
+First, create a client with workspace configuration.
+
+## Best Practices
+
+- Use sessions for multi-document operations
+- Enable caching for better performance
+- Monitor events for debugging
+"#),
+        ("advanced.md", r#"# Advanced Topics
+
+## Performance Tuning
+
+Configure retrieval parameters for optimal performance.
+
+### Parameters
+
+- `top_k`: Number of results
+- `max_tokens`: Token budget
+
+## Custom Pilots
+
+Implement custom navigation logic.
+"#),
+        ("reference.md", r#"# Reference
+
+## Configuration
+
+All configuration is done via TOML files.
+
+### Example
+
+```toml
+[retrieval]
+top_k = 5
+max_tokens = 4000
+```
+"#),
+        ("examples.md", r#"# Examples
+
+## Basic Usage
+
+Simple indexing and querying example.
+
+## Batch Processing
+
+Process multiple documents concurrently.
+
+## Session Usage
+
+Multi-document operations with caching.
+"#),
+        ("faq.md", r#"# FAQ
+
+## Common Questions
+
+**Q: How do I index a document?**
+A: Use the `engine.index(path)` method.
+
+**Q: How to query?**
+A: Use the `engine.query(doc_id, question)` method.
+
+**Q: What formats are supported?**
+A: Markdown, PDF, DOCX, HTML.
+"#),
+        ("changelog.md", r#"# Changelog
+
+## Version 0.1.0
+
+- Initial release
+- Basic indexing support
+- Simple retrieval
+
+## Version 0.2.0
+
+- Session support
+- Event system
+- Content aggregator
+"#),
+        ("contributing.md", r#"# Contributing
+
+## How to Contribute
+
+We welcome contributions! Please follow these steps:
+
+1. Fork the repository
+2. Create a feature branch
+3. Submit a pull request
+
+## Code Style
+
+- Run `cargo fmt`
+- Run `cargo clippy`
+- Add tests
+"#),
+        ("license.md", r#"# License
+
+Apache License, Version 2.0
+
+Copyright 2026 vectorless developers
+"#),
+        ("architecture.md", r#"# Architecture
+
+## Overview
+
+Vectorless uses a tree-based architecture.
+
+## Components
+
+- Parser: Document parsing
+- Indexer: Tree building
+- Retriever: Content search
+- Storage: Persistence
+"#),
+        ("security.md", r#"# Security
+
+## Security Considerations
+
+- API keys are stored securely
+- No sensitive data in logs
+- Input validation
+
+## Best Practices
+
+- Use environment variables
+- Rotate keys periodically
+"#),
+        ("performance.md", r#"# Performance
+
+## Optimization Tips
+
+- Use caching effectively
+- Configure appropriate batch sizes
+- Monitor memory usage
+
+## Benchmarks
+
+Run `cargo bench` for performance metrics.
+"#),
+        ("testing.md", r#"# Testing
+
+## Running Tests
+
+```bash
+cargo test
+```
+
+## Test Coverage
+
+- Unit tests
+- Integration tests
+- Example tests
+"#),
+        ("deployment.md", r#"# Deployment
+
+## Production Setup
+
+- Configure workspace directory
+- Set up logging
+- Monitor performance
+
+## Configuration
+
+Use TOML configuration files.
+"#),
+        ("troubleshooting.md", r#"# Troubleshooting
+
+## Common Issues
+
+### Indexing Fails
+
+Check file format and permissions.
+
+### Query Returns Empty
+
+Ensure document is indexed.
+
+### Performance Issues
+
+Reduce batch size or enable caching.
+"#),
+        ("integrations.md", r#"# Integrations
+
+## LLM Providers
+
+- OpenAI
+- Anthropic
+- Local models
+
+## Storage Backends
+
+- File system (default)
+- S3 (planned)
+"#),
+        ("migrations.md", r#"# Migrations
+
+## Version Migrations
+
+### 0.1.x to 0.2.x
+
+- Update configuration format
+- Re-index documents
+"#),
+        ("roadmap.md", r#"# Roadmap
+
+## Future Plans
+
+### Short Term
+
+- Streaming support
+- More formats
+
+### Long Term
+
+- Distributed indexing
+- Real-time updates
+"#),
+        ("credits.md", r#"# Credits
+
+## Contributors
+
+Thanks to all contributors!
+
+## Libraries
+
+Built with Rust and many open-source libraries.
+"#),
+        ("index.md", r#"# Index
+
+## Quick Links
+
+- [Introduction](intro.md)
+- [API Reference](api.md)
+- [User Guide](guide.md)
+
+## Search
+
+Use the search functionality to find specific content.
+"#),
+        ("search.md", r#"# Search
+
+## Search Functionality
+
+### Basic Search
+
+```rust
+let results = engine.query(&doc_id, "search term").await?;
+```
+
+### Advanced Search
+
+Use sessions for cross-document search.
+"#), + ("export.md", r#"# Export + +## Exporting Data + +### JSON Export + +```rust +let json = tree.to_structure_json(); +``` + +### Custom Formats + +Implement custom exporters as needed. +"#), + ("import.md", r#"# Import + +## Importing Data + +### From Files + +```rust +let doc_id = engine.index("./document.md").await?; +``` + +### From Memory + +Use the content directly with parsers. +"#), + ("validation.md", r#"# Validation + +## Input Validation + +### Document Paths + +Must exist and be readable. + +### Configuration + +Validated on load with helpful errors. + +### Queries + +Sanitized before processing. +"#), + ("formatting.md", r#"# Formatting + +## Content Formatting + +### Markdown + +Standard CommonMark with extensions. + +### Code Blocks + +Syntax highlighting support. + +### Tables + +Basic table parsing. +"#), + ("localization.md", r#"# Localization + +## Internationalization + +Currently English-only. + +## Future Support + +Planned i18n support for: +- Error messages +- UI strings +- Documentation +"#), + ("accessibility.md", r#"# Accessibility + +## Accessibility + +### Documentation + +Clear and comprehensive docs. + +### API Design + +Consistent and intuitive naming. + +### Error Messages + +Helpful and actionable. +"#), + ("glossary.md", r#"# Glossary + +## Terms + +- **Document Tree**: Hierarchical structure +- **Session**: Multi-document context +- **Workspace**: Document storage +- **Retrieval**: Content search +"#), + ("appendix.md", r#"# Appendix + +## Additional Resources + +- [GitHub Repository](https://github.com) +- [Documentation Site](https://docs.vectorless.dev) +- [Community Discord](https://discord.gg) +"#), + ("summary.md", r#"# Summary + +## Overview + +This documentation covers all aspects of vectorless. + +## Next Steps + +- Try the examples +- Join the community +- Contribute! +"#), + ("conclusion.md", r#"# Conclusion + +## Thank You + +Thanks for using vectorless! + +## Feedback + +We'd love to hear from you. Open an issue on GitHub. +"#), + ("revision.md", r#"# Revision History + +## Document Versions + +| Version | Date | Changes | +|---------|------------|---------------------------| +| 1.0 | 2026-01-01 | Initial version | +| 1.1 | 2026-02-01 | Session support | +"#), + ("feedback.md", r#"# Feedback + +## Providing Feedback + +We value your input! + +### Channels + +- GitHub Issues +- Discord Community +- Email Support + +### What to Share + +- Bug reports +- Feature requests +- Documentation improvements +"#), + ("support.md", r#"# Support + +## Getting Help + +### Documentation + +Start with the user guide. + +### Community + +Join our Discord for discussions. + +### Enterprise + +Contact us for enterprise support. +"#), + ("updates.md", r#"# Updates + +## Staying Updated + +### Version Updates + +Check the changelog for updates. + +### Security Updates + +Apply security patches promptly. + +### Deprecations + +Watch for deprecation notices. +"#), + ("resources.md", r#"# Resources + +## External Resources + +### Official + +- Documentation: docs.vectorless.dev +- GitHub: github.com/vectorless +- Discord: discord.gg/vectorless + +### Community + +- Blog posts +- Tutorial videos +- Example projects +"#), + ("contact.md", r#"# Contact + +## Contact Information + +### General Inquiries + +Email: hello@vectorless.dev + +### Security Issues + +Email: security@vectorless.dev + +### Enterprise Sales + +Email: enterprise@vectorless.dev +"#), + ("privacy.md", r#"# Privacy Policy + +## Data Handling + +Vectorless processes documents locally. 
+
+## No Tracking
+
+We don't track usage or content.
+
+## API Keys
+
+Stored securely in configuration files.
+"#),
+        ("terms.md", r#"# Terms of Service
+
+## Usage Terms
+
+By using vectorless, you agree to:
+
+- Use responsibly
+- Follow applicable laws
+- Respect rate limits
+
+## Changes
+
+Terms may be updated. Check for revisions.
+"#),
+        ("legal.md", r#"# Legal
+
+## Licensing
+
+Apache License 2.0
+
+## Copyright
+
+Copyright 2026 vectorless developers
+
+## Trademarks
+
+Vectorless is a trademark.
+"#),
+        ("versioning.md", r#"# Versioning
+
+## Semantic Versioning
+
+We follow semver:
+
+- MAJOR: Breaking changes
+- MINOR: New features
+- PATCH: Bug fixes
+
+## Current Version
+
+0.1.11
+"#),
+        ("compatibility.md", r#"# Compatibility
+
+## Supported Versions
+
+- Rust 1.70+
+- Tokio 1.x
+
+## Platform Support
+
+- Linux
+- macOS
+- Windows
+
+## Breaking Changes
+
+Documented in changelog.
+"#),
+        ("installation.md", r#"# Installation
+
+## Requirements
+
+- Rust 1.70+
+- Tokio runtime
+
+## Install
+
+```bash
+cargo install vectorless
+```
+
+## Verify
+
+```bash
+vectorless --version
+```
+"#),
+        ("quickstart.md", r#"# Quick Start
+
+## 5-Minute Setup
+
+1. Install vectorless
+2. Create a client
+3. Index a document
+4. Query!
+
+```rust
+let client = Engine::builder().build()?;
+let doc_id = client.index("./doc.md").await?;
+let result = client.query(&doc_id, "What is this?").await?;
+```
+"#),
+        ("tutorial.md", r#"# Tutorial
+
+## Introduction
+
+This tutorial covers basic usage.
+
+## Step 1: Setup
+
+Create a client with workspace.
+
+## Step 2: Index
+
+Index your first document.
+
+## Step 3: Query
+
+Ask questions about your document.
+
+## Step 4: Next
+
+Explore advanced features.
+"#),
+        ("examples_overview.md", r#"# Examples Overview
+
+## Available Examples
+
+| Example | Description |
+|-----------------|--------------------------------|
+| basic.rs | Basic usage |
+| session.rs | Multi-document operations |
+| events.rs | Event callbacks |
+| batch.rs | Batch processing |
+
+## Running Examples
+
+```bash
+cargo run --example <name>
+```
+"#),
+        ("configuration.md", r#"# Configuration
+
+## Configuration File
+
+Use `config.toml` for settings:
+
+```toml
+[storage]
+workspace_dir = "./workspace"
+
+[retrieval]
+top_k = 5
+max_tokens = 4000
+```
+
+## Environment Variables
+
+- `OPENAI_API_KEY`: LLM API key
+"#),
+        ("optimization.md", r#"# Optimization
+
+## Performance Tips
+
+- Use sessions for caching
+- Batch document indexing
+- Configure appropriate token limits
+
+## Memory Management
+
+Documents are cached in sessions.
+
+## Concurrency
+
+Use `buffer_unordered` for parallel indexing.
+"#), + ("errors.md", r#"# Error Handling + +## Error Types + +- `ConfigError`: Configuration issues +- `ParseError`: Document parsing failures +- `RetrievalError`: Query failures + +## Handling Errors + +```rust +match result { + Ok(response) => { /* success */ }, + Err(Error::Parse(msg)) => { /* handle parse error */ }, + Err(e) => { /* other error */ }, +} +``` +"#), + ("logging.md", r#"# Logging + +## Log Levels + +- ERROR: Serious issues +- WARN: Potential issues +- INFO: General information +- DEBUG: Detailed information +- TRACE: Very detailed + +## Enabling Logs + +```bash +RUST_LOG=debug cargo run +``` +"#), + ("metrics.md", r#"# Metrics + +## Available Metrics + +- Query count +- Cache hit rate +- Average query time + +## Accessing Metrics + +```rust +let stats = session.stats(); +println!("Cache hit rate: {:.1}%", stats.cache_hit_rate() * 100.0); +``` +"#), + ("health.md", r#"# Health Checks + +## Workspace Health + +Check workspace integrity: + +```rust +let docs = engine.list_documents(); +println!("{} documents indexed", docs.len()); +``` + +## Session Health + +Monitor session statistics regularly. +"#), + ("backup.md", r#"# Backup + +## Backing Up + +Copy the workspace directory: + +```bash +cp -r ./workspace ./workspace_backup +``` + +## Restoration + +Restore by copying back: + +```bash +cp -r ./workspace_backup ./workspace +``` +"#), + ("recovery.md", r#"# Recovery + +## Corrupted Documents + +Remove and re-index: + +```rust +engine.remove(&doc_id)?; +engine.index(&path).await?; +``` + +## Session Recovery + +Create a new session if issues occur. +"#), + ("monitoring.md", r#"# Monitoring + +## Production Monitoring + +Use events for real-time monitoring: + +```rust +let events = EventEmitter::new() + .on_query(|e| { + // Log to monitoring system + }); +``` + +## Alerts + +Set up alerts for error rates. +"#), + ("scaling.md", r#"# Scaling + +## Horizontal Scaling + +Run multiple instances with shared storage. + +## Vertical Scaling + +Increase resources for single instance. + +## Considerations + +- Storage backend +- Cache coordination +- Rate limiting +"#), + ("security_config.md", r#"# Security Configuration + +## API Keys + +Store securely: + +```toml +[summary] +api_key = "${OPENAI_API_KEY}" +``` + +## Network Security + +Use HTTPS for all API calls. + +## Access Control + +Implement authentication for production. +"#), + ]; + + for (name, content) in &documents { + let path = temp_dir.path().join(name); + std::fs::write(&path, content)?; + } + + println!(" ✓ Created {} sample documents\n", documents.len()); + + // 3. Batch indexing with progress + println!("Step 3: Batch indexing..."); + let start = std::time::Instant::now(); + let mut doc_ids = Vec::new(); + + for (name, _) in &documents { + let path = temp_dir.path().join(name); + match session.index(&path).await { + Ok(doc_id) => { + doc_ids.push(doc_id); + } + Err(e) => { + eprintln!(" ✗ Failed to index {}: {}", name, e); + } + } + } + + let elapsed = start.elapsed(); + println!(" ✓ Indexed {} documents in {:?}", doc_ids.len(), elapsed); + println!(" - Rate: {:.1} docs/sec", doc_ids.len() as f64 / elapsed.as_secs_f64()); + println!(); + + // 4. Show session stats + println!("Step 4: Session statistics:"); + let stats = session.stats(); + println!(" - Documents in session: {}", session.list_documents().len()); + println!(" - Queries: {}", stats.query_count.get()); + println!(); + + // 5. 
Batch query with progress
+    println!("Step 5: Batch querying...");
+    let queries = vec![
+        "What is vectorless?",
+        "How to index?",
+        "Configuration options",
+        "API methods",
+        "Performance tips",
+        "Error handling",
+        "Logging setup",
+        "Security considerations",
+        "Scaling options",
+        "Getting help",
+    ];
+
+    let start = std::time::Instant::now();
+    let mut success_count = 0;
+
+    for query in &queries {
+        match session.query_all(query).await {
+            Ok(results) => {
+                if !results.is_empty() {
+                    success_count += 1;
+                }
+            }
+            Err(e) => {
+                eprintln!("  ✗ Query failed: {}", e);
+            }
+        }
+    }
+
+    let elapsed = start.elapsed();
+    println!("  ✓ Completed {} queries in {:?}", queries.len(), elapsed);
+    println!("    - Success rate: {:.0}%", (success_count as f64 / queries.len() as f64) * 100.0);
+    println!("    - Rate: {:.1} queries/sec", queries.len() as f64 / elapsed.as_secs_f64());
+    println!();
+
+    // 6. Final statistics
+    println!("Step 6: Final statistics:");
+    let stats = session.stats();
+    println!("  - Total documents: {}", session.list_documents().len());
+    println!("  - Total queries: {}", stats.query_count.get());
+    println!("  - Cache hits: {}", stats.cache_hits.get());
+    println!("  - Cache misses: {}", stats.cache_misses.get());
+    println!(
+        "  - Cache hit rate: {:.1}%",
+        stats.cache_hit_rate() * 100.0
+    );
+    if let Some(avg_time) = stats.avg_query_time() {
+        println!("  - Avg query time: {:?}", avg_time);
+    }
+    println!("  - Session age: {:?}", session.age());
+    println!();
+
+    // 7. Cleanup
+    println!("Step 7: Cleanup...");
+    for doc_id in &doc_ids {
+        engine.remove(doc_id)?;
+    }
+    println!("  ✓ Removed {} documents\n", doc_ids.len());
+
+    println!("=== Example Complete ===");
+    Ok(())
+}
diff --git a/examples/cli_tool.rs b/examples/cli_tool.rs
new file mode 100644
index 00000000..62a05f33
--- /dev/null
+++ b/examples/cli_tool.rs
@@ -0,0 +1,122 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! CLI tool example for vectorless.
+//!
+//! This example shows how to build a command-line tool
+//! using vectorless for document indexing and querying.
+//!
+//! # What you'll learn:
+//! - How to structure a CLI application
+//! - How to handle subcommands (index, query, info)
+//! - How to manage configuration and workspace
+//! - How to provide user-friendly output
+//!
+//! # Example commands:
+//!
+//! ```bash
+//! # Index a document
+//! vectorless-cli index ./document.md
+//!
+//! # Query a document
+//! vectorless-cli query <doc-id> "What is the main topic?"
+//!
+//! # List indexed documents
+//! vectorless-cli list
+//!
+//! # Show document info
+//! vectorless-cli info <doc-id>
+//!
+//! # Delete a document
+//! vectorless-cli delete <doc-id>
+//! ```
+//!
+//! # Implementation notes:
+//!
+//! ## Recommended crates:
+//! - `clap` for argument parsing
+//! - `colored` or `termcolor` for colored output
+//! - `indicatif` for progress bars
+//! - `serde` for configuration
+//!
+//! ## Configuration file:
+//! ```toml
+//! # ~/.vectorless/config.toml
+//! [llm]
+//! provider = "openai"
+//! model = "gpt-4"
+//!
+//! [index]
+//! cache_size = 100
+//!
+//! [retrieval]
+//! max_iterations = 10
+//! ```
+//!
+//! # TODO: Implementation steps
+//!
+//! 1. Define CLI structure with clap
+//! 2. Implement index subcommand
+//! 3. Implement query subcommand
+//! 4. Implement list/info subcommands
+//! 5. Add configuration management
+//! 6.
Add colored output and progress + +// TODO: Implement CLI tool +// ``` +// use clap::{Parser, Subcommand}; +// use vectorless::client::{Engine, EngineBuilder}; +// +// #[derive(Parser)] +// #[command(name = "vectorless-cli")] +// struct Cli { +// #[command(subcommand)] +// command: Commands, +// } +// +// #[derive(Subcommand)] +// enum Commands { +// /// Index a document +// Index { +// /// Path to document +// path: PathBuf, +// }, +// /// Query an indexed document +// Query { +// /// Document ID +// doc_id: String, +// /// Query string +// query: String, +// }, +// /// List all indexed documents +// List, +// } +// +// #[tokio::main] +// async fn main() -> Result<()> { +// let cli = Cli::parse(); +// let engine = EngineBuilder::new().build()?; +// +// match cli.command { +// Commands::Index { path } => { +// let doc_id = engine.index(&path).await?; +// println!("Indexed: {}", doc_id); +// } +// Commands::Query { doc_id, query } => { +// let result = engine.query(&doc_id, &query).await?; +// println!("{}", result.content); +// } +// Commands::List => { +// // List documents +// } +// } +// +// Ok(()) +// } +// ``` + +fn main() { + // TODO: Implement full CLI tool + + println!("TODO: Implement cli_tool example"); +} diff --git a/examples/content_aggregation.rs b/examples/content_aggregation.rs new file mode 100644 index 00000000..5fe71a32 --- /dev/null +++ b/examples/content_aggregation.rs @@ -0,0 +1,175 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Content Aggregation Accuracy Example +//! +//! This example demonstrates the content aggregation module's ability to: +//! 1. Score content relevance +//! 2. Allocate token budget +//! 3. Build structured output +//! +//! # Usage +//! +//! ```bash +//! cargo run --example content_aggregation +//! ``` + +use vectorless::retrieval::content::{ + ContentAggregator, ContentAggregatorConfig, BudgetAllocator, AllocationStrategy, + StructureBuilder, OutputFormat, RelevanceScorer, ScoringStrategyConfig, + ContentChunk, ScoringContext, +}; +use vectorless::domain::NodeId; +use indextree::Arena; + +fn make_node_id() -> NodeId { + let mut arena = Arena::new(); + let node = vectorless::domain::TreeNode { + title: "Test".to_string(), + structure: String::new(), + content: String::new(), + summary: String::new(), + depth: 0, + start_index: 0, + end_index: 0, + start_page: None, + end_page: None, + node_id: None, + physical_index: None, + token_count: None, + }; + NodeId(arena.new_node(node)) +} + +fn main() { + println!("=== Content Aggregation Accuracy Demo ===\n"); + + // 1. Demonstrate Relevance Scoring + println!("1. Relevance Scoring Demo"); + println!("---------------------------"); + + let query = "What is the architecture of vectorless?"; + let scorer = RelevanceScorer::new(query, ScoringStrategyConfig::KeywordWithBM25); + + let chunks = vec![ + ContentChunk::new( + make_node_id(), + "Architecture Overview".to_string(), + "Vectorless uses a tree-based architecture for document navigation. The system consists of multiple stages: parsing, indexing, and retrieval.".to_string(), + 0, + ), + ContentChunk::new( + make_node_id(), + "Installation Guide".to_string(), + "To install vectorless, add it to your Cargo.toml file. 
Then run cargo build to compile.".to_string(), + 1, + ), + ContentChunk::new( + make_node_id(), + "Core Components".to_string(), + "The architecture includes Pilot for navigation, Judge for sufficiency checking, and multiple search algorithms like beam search and greedy search.".to_string(), + 1, + ), + ]; + + let ctx = ScoringContext::default(); + + println!("Query: \"{}\"", query); + println!("\nScored chunks:"); + for chunk in &chunks { + let relevance = scorer.score_chunk(chunk, &ctx); + println!(" - '{}' (depth {}): score {:.3}", + chunk.title, chunk.depth, relevance.score); + println!(" Components: keyword={:.2}, bm25={:.2}, depth_penalty={:.2}, density={:.2}", + relevance.components.keyword_score, + relevance.components.bm25_score, + relevance.components.depth_penalty, + relevance.components.density_score, + ); + } + + // 2. Demonstrate Budget Allocation + println!("\n\n2. Budget Allocation Demo"); + println!("---------------------------"); + + let scored: Vec<_> = chunks + .iter() + .map(|chunk| scorer.score_chunk(chunk, &ctx)) + .collect(); + + let strategies = vec![ + ("Greedy", AllocationStrategy::Greedy), + ("Hierarchical (20%/level)", AllocationStrategy::Hierarchical { min_per_level: 0.2 }), + ]; + + for (name, strategy) in strategies { + let allocator = BudgetAllocator::new(200) + .with_strategy(strategy); + + let result = allocator.allocate(scored.clone(), 2); + + println!("\n{} Strategy:", name); + println!(" Tokens used: {}/{}", result.tokens_used, 200); + println!(" Items selected: {}", result.selected.len()); + println!(" Avg score: {:.3}", result.stats.avg_score); + + for content in &result.selected { + let trunc = if content.is_truncated() { " [truncated]" } else { "" }; + println!(" - '{}' ({} tokens, score {:.2}){}", + content.title, content.tokens, content.score, trunc); + } + } + + // 3. Demonstrate Structure Building + println!("\n\n3. Structure Building Demo"); + println!("---------------------------"); + + let formats = vec![ + ("Markdown", OutputFormat::Markdown), + ("Flat", OutputFormat::Flat), + ]; + + let allocator = BudgetAllocator::new(500) + .with_strategy(AllocationStrategy::Greedy); + let result = allocator.allocate(scored.clone(), 2); + + for (name, format) in formats { + let builder = StructureBuilder::new(format); + let tree = vectorless::domain::DocumentTree::new("Test", ""); + let structured = builder.build(result.selected.clone(), &tree); + + println!("\n{} Output ({} chars, {} tokens):", name, structured.content.len(), structured.metadata.total_tokens); + let preview = if structured.content.len() > 300 { + format!("{}...", &structured.content[..300]) + } else { + structured.content.clone() + }; + println!("{}", preview.lines().take(8).collect::<Vec<_>>().join("\n")); + } + + // 4. Demonstrate Full Aggregation Pipeline + println!("\n\n4.
Full Aggregation Pipeline Demo"); + println!("-----------------------------------"); + + let configs = vec![ + ("Default (4000 tokens)", ContentAggregatorConfig::default()), + ("Conservative (1000 tokens)", ContentAggregatorConfig::new() + .with_token_budget(1000) + .with_min_relevance(0.3)), + ("High Precision (2000 tokens, 0.5 threshold)", ContentAggregatorConfig::new() + .with_token_budget(2000) + .with_min_relevance(0.5)), + ]; + + for (name, config) in configs { + println!("\n{} Config:", name); + println!(" Token budget: {}", config.token_budget); + println!(" Min relevance: {:.1}", config.min_relevance_score); + + let aggregator = ContentAggregator::new(config); + // Note: Full aggregation requires a DocumentTree with actual content + let _ = aggregator; // Suppress unused warning + } + + println!("\n=== Demo Complete ==="); +} diff --git a/examples/custom_pilot.rs b/examples/custom_pilot.rs new file mode 100644 index 00000000..bd7a730e --- /dev/null +++ b/examples/custom_pilot.rs @@ -0,0 +1,67 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Custom Pilot implementation example. +//! +//! This example demonstrates how to implement a custom Pilot +//! that provides navigation guidance during retrieval. +//! +//! # What you'll learn: +//! - How to implement the Pilot trait +//! - When to intervene (START, FORK, BACKTRACK, EVALUATE) +//! - How to provide ranked candidates +//! - How to integrate custom Pilot with the retrieval pipeline +//! +//! # Key concepts: +//! +//! ## Intervention Points +//! - START: Before search begins - analyze query, set direction +//! - FORK: At branch points - rank candidates, guide path selection +//! - BACKTRACK: When search fails - suggest alternatives +//! - EVALUATE: After content found - check sufficiency +//! +//! ## Score Merging +//! ```text +//! final_score = α × algorithm_score + β × llm_score +//! ``` +//! +//! # TODO: Implementation steps +//! +//! 1. Define your custom Pilot struct +//! 2. Implement the Pilot trait +//! 3. Configure intervention conditions +//! 4. Integrate with EngineBuilder + +// TODO: Implement custom Pilot +// ``` +// use vectorless::retrieval::pilot::{Pilot, PilotDecision, SearchState, InterventionPoint}; +// +// pub struct MyCustomPilot { +// // Your fields here +// } +// +// impl Pilot for MyCustomPilot { +// fn should_intervene(&self, state: &SearchState, point: InterventionPoint) -> bool { +// // Decide when to intervene +// todo!() +// } +// +// async fn decide(&self, state: &SearchState) -> PilotDecision { +// // Make navigation decision +// todo!() +// } +// } +// ``` + +fn main() { + // TODO: Show how to use custom Pilot with EngineBuilder + // + // let pilot = MyCustomPilot::new(); + // let engine = EngineBuilder::new() + // .with_pilot(Arc::new(pilot)) + // .build()?; + // + // // Use engine with custom Pilot guidance + + println!("TODO: Implement custom_pilot example"); +} diff --git a/examples/events.rs b/examples/events.rs new file mode 100644 index 00000000..eab7b68a --- /dev/null +++ b/examples/events.rs @@ -0,0 +1,152 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Event callbacks example. +//! +//! This example demonstrates the event system for: +//! - Monitoring indexing progress +//! - Tracking query execution +//! - Debugging retrieval behavior +//! +//! # Usage +//! +//! ```bash +//! cargo run --example events +//! 
``` + +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +use vectorless::client::{EngineBuilder, EventEmitter, IndexEvent, QueryEvent}; + +#[tokio::main] +async fn main() -> Result<(), Box<dyn std::error::Error>> { + println!("=== Event Callbacks Example ===\n"); + + // 1. Create event emitter with handlers + println!("Step 1: Setting up event handlers...\n"); + + let index_count = Arc::new(AtomicUsize::new(0)); + let query_count = Arc::new(AtomicUsize::new(0)); + let nodes_visited = Arc::new(AtomicUsize::new(0)); + + let index_count_clone = index_count.clone(); + let query_count_clone = query_count.clone(); + let nodes_visited_clone = nodes_visited.clone(); + + let events = EventEmitter::new() + // Index events + .on_index(move |e| { + match e { + IndexEvent::Started { path } => { + println!(" [INDEX] Started: {}", path); + } + IndexEvent::FormatDetected { format } => { + println!(" [INDEX] Format: {:?}", format); + } + IndexEvent::TreeBuilt { node_count } => { + println!(" [INDEX] Tree built: {} nodes", node_count); + } + IndexEvent::Complete { doc_id } => { + println!(" [INDEX] Complete: {}", &doc_id[..8]); + index_count_clone.fetch_add(1, Ordering::SeqCst); + } + IndexEvent::Error { message } => { + println!(" [INDEX] Error: {}", message); + } + _ => {} + } + }) + // Query events + .on_query(move |e| { + match e { + QueryEvent::Started { query } => { + println!(" [QUERY] Started: \"{}\"", query); + query_count_clone.fetch_add(1, Ordering::SeqCst); + } + QueryEvent::NodeVisited { title, score, .. } => { + println!(" [QUERY] Visited: \"{}\" (score: {:.2})", title, score); + nodes_visited_clone.fetch_add(1, Ordering::SeqCst); + } + QueryEvent::CandidateFound { node_id, score } => { + println!(" [QUERY] Candidate: {} (score: {:.2})", &node_id[..8], score); + } + QueryEvent::Complete { total_results, confidence } => { + println!(" [QUERY] Complete: {} results, confidence: {:.2}", total_results, confidence); + } + QueryEvent::Error { message } => { + println!(" [QUERY] Error: {}", message); + } + _ => {} + } + }); + + println!(" ✓ Event handlers configured\n"); + + // 2. Create engine with events + println!("Step 2: Creating engine with event emitter..."); + let engine = EngineBuilder::new() + .with_workspace("./workspace_events_example") + .with_events(events) + .build() + .map_err(|e| vectorless::Error::Config(e.to_string()))?; + println!(" ✓ Engine created\n"); + + // 3. Index a document (events will fire) + println!("Step 3: Indexing document (watch events)...\n"); + + let temp_dir = tempfile::tempdir()?; + let doc_content = r#"# Example Document + +## Introduction + +This is an example document for demonstrating event callbacks. + +## Features + +- Event monitoring for indexing +- Event monitoring for queries +- Progress tracking + +## Architecture + +The event system uses handlers that can be attached to the engine builder. +"#; + + let doc_path = temp_dir.path().join("example.md"); + tokio::fs::write(&doc_path, doc_content).await?; + + let doc_id = engine.index(&doc_path).await?; + println!(); + + // 4. Query the document (events will fire) + println!("Step 4: Querying document (watch events)...\n"); + + let result = engine.query(&doc_id, "What features are available?").await?; + println!(); + + // 5.
Show results + println!("Step 5: Query result:"); + println!(" - Score: {:.2}", result.score); + println!(" - Nodes: {}", result.node_ids.len()); + if !result.content.is_empty() { + let preview: String = result.content.chars().take(100).collect(); + println!(" - Content: {}...", preview); + } + println!(); + + // 6. Show statistics + println!("Step 6: Event statistics:"); + println!(" - Index events fired: {}", index_count.load(Ordering::SeqCst)); + println!(" - Query events fired: {}", query_count.load(Ordering::SeqCst)); + println!(" - Nodes visited: {}", nodes_visited.load(Ordering::SeqCst)); + println!(); + + // 7. Cleanup + println!("Step 7: Cleanup..."); + engine.remove(&doc_id)?; + println!(" ✓ Document removed\n"); + + println!("=== Example Complete ==="); + Ok(()) +} diff --git a/examples/markdownflow.rs b/examples/markdownflow.rs index 4cde85f9..ba566aa6 100644 --- a/examples/markdownflow.rs +++ b/examples/markdownflow.rs @@ -35,6 +35,9 @@ Vectorless is a document indexing and retrieval library that uses tree-based nav #[tokio::main] async fn main() -> Result<(), Box<dyn std::error::Error>> { + // Initialize tracing for debug output (set RUST_LOG=debug to see more) + tracing_subscriber::fmt::init(); + println!("=== Vectorless Markdown Flow Example ===\n"); // Step 1: Create a Vectorless client (no API key needed - LLM config is automatic) diff --git a/examples/multi_format.rs b/examples/multi_format.rs new file mode 100644 index 00000000..f146b851 --- /dev/null +++ b/examples/multi_format.rs @@ -0,0 +1,77 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Multi-format document processing example. +//! +//! This example demonstrates how to work with different +//! document formats (Markdown, PDF, DOCX, HTML). +//! +//! # What you'll learn: +//! - How to index documents of different formats +//! - How format detection works +//! - How to configure format-specific parsing options +//! - How to handle mixed-format document sets +//! +//! # Supported formats: +//! - **Markdown** (.md): Full support with ToC extraction +//! - **PDF** (.pdf): Text extraction, structure inference +//! - **DOCX** (.docx): Word document parsing +//! - **HTML** (.html, .htm): Web page parsing (planned) +//! - **Plain text** (.txt): Basic text parsing (planned) +//! +//! # Format-specific considerations: +//! +//! ## Markdown +//! - Best format for structured documents +//! - Automatic heading hierarchy detection +//! - Code block handling +//! +//! ## PDF +//! - Text extraction quality varies +//! - No explicit structure (inferred from fonts/spacing) +//! - Tables and images not supported +//! +//! ## DOCX +//! - Good structure preservation +//! - Styles mapped to hierarchy +//! - Limited formatting support +//! +//! # TODO: Implementation steps +//! +//! 1. Detect document format from extension or content +//! 2. Configure format-specific parser options +//! 3. Index documents of mixed formats +//! 4.
Query across all formats + +// TODO: Implement multi-format example +// ``` +// use vectorless::client::{Engine, EngineBuilder}; +// use vectorless::parser::DocumentFormat; +// +// async fn index_multiple_formats(engine: &Engine) -> vectorless::Result<()> { +// // Index different formats +// let md_doc = engine.index("./README.md").await?; +// let pdf_doc = engine.index("./paper.pdf").await?; +// let docx_doc = engine.index("./report.docx").await?; +// +// // Query works across all formats +// let result = engine.query(&md_doc, "What is this about?").await?; +// Ok(()) +// } +// ``` + +fn main() { + // TODO: Show multi-format indexing and querying + // + // // Index documents of different formats + // let md_id = engine.index("./docs/guide.md").await?; + // let pdf_id = engine.index("./docs/paper.pdf").await?; + // let docx_id = engine.index("./docs/report.docx").await?; + // + // // Each can be queried independently + // for doc_id in &[md_id, pdf_id, docx_id] { + // let result = engine.query(doc_id, "summary").await?; + // println!("Result: {}", result.content); + // } + + println!("TODO: Implement multi_format example"); +} diff --git a/examples/session.rs b/examples/session.rs new file mode 100644 index 00000000..25aaf3ab --- /dev/null +++ b/examples/session.rs @@ -0,0 +1,207 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Session-based multi-document operations example. +//! +//! This example demonstrates the Session API for: +//! - Managing multiple documents in a single session +//! - Cross-document queries +//! - Session caching for improved performance +//! - Session statistics +//! +//! # Usage +//! +//! ```bash +//! cargo run --example session +//! ``` + +use vectorless::client::EngineBuilder; + +#[tokio::main] +async fn main() -> Result<(), Box<dyn std::error::Error>> { + println!("=== Session-Based Multi-Document Example ===\n"); + + // 1. Create the engine + println!("Step 1: Creating engine..."); + let engine = EngineBuilder::new() + .with_workspace("./workspace_session_example") + .build() + .map_err(|e| vectorless::Error::Config(e.to_string()))?; + println!(" ✓ Engine created\n"); + + // 2. Create a session for multi-document operations + println!("Step 2: Creating session..."); + let session = engine.session(); + println!(" ✓ Session ID: {}\n", session.id()); + + // 3. Index multiple documents into the session + println!("Step 3: Indexing documents..."); + + // Create sample documents + let temp_dir = tempfile::tempdir()?; + + let doc1_content = r#"# Architecture Guide + +## Overview + +Vectorless uses a tree-based architecture for document navigation. + +## Components + +- **Indexer**: Parses documents and builds tree structure +- **Retriever**: Navigates tree to find relevant content +- **Workspace**: Manages document persistence +"#; + + let doc2_content = r#"# API Reference + +## Engine + +The main entry point for vectorless operations. + +### Methods + +- `index(path)`: Index a document +- `query(doc_id, question)`: Query a document +- `list_documents()`: List all documents + +## Session + +Multi-document operations with caching. + +### Methods + +- `index(path)`: Index into session +- `query(doc_id, question)`: Query cached document +- `query_all(question)`: Query across all documents +"#; + + let doc3_content = r#"# Configuration Guide + +## Workspace Settings + +The workspace directory stores indexed documents.
+ +```toml +[storage] +workspace_dir = "./workspace" +``` + +## Retrieval Settings + +Configure retrieval behavior: + +```toml +[retrieval] +top_k = 5 +max_tokens = 4000 +``` + +## Content Aggregator + +Control content aggregation: + +```toml +[retrieval.content] +enabled = true +token_budget = 4000 +``` +"#; + + // Write sample documents + let doc1_path = temp_dir.path().join("architecture.md"); + let doc2_path = temp_dir.path().join("api.md"); + let doc3_path = temp_dir.path().join("config.md"); + + tokio::fs::write(&doc1_path, doc1_content).await?; + tokio::fs::write(&doc2_path, doc2_content).await?; + tokio::fs::write(&doc3_path, doc3_content).await?; + + // Index into session + let doc1_id = session.index(&doc1_path).await?; + println!(" ✓ Indexed: architecture.md -> {}", &doc1_id[..8]); + + let doc2_id = session.index(&doc2_path).await?; + println!(" ✓ Indexed: api.md -> {}", &doc2_id[..8]); + + let doc3_id = session.index(&doc3_path).await?; + println!(" ✓ Indexed: config.md -> {}", &doc3_id[..8]); + println!(); + + // 4. List documents in session + println!("Step 4: Session documents:"); + for doc in session.list_documents() { + println!(" - {} ({})", doc.name, &doc.id[..8]); + } + println!(); + + // 5. Query individual documents (uses cache) + println!("Step 5: Query individual documents..."); + let query = "What methods are available?"; + + println!(" Query: \"{}\"", query); + let start = std::time::Instant::now(); + let result = session.query(&doc2_id, query).await?; + let elapsed = start.elapsed(); + println!(" - Time: {:?}", elapsed); + println!(" - Score: {:.2}", result.score); + if !result.content.is_empty() { + let preview: String = result.content.chars().take(100).collect(); + println!(" - Preview: {}...", preview); + } + println!(); + + // 6. Query same document again (should be faster due to cache) + println!("Step 6: Query cached document (should be faster)..."); + let start = std::time::Instant::now(); + let result = session.query(&doc2_id, "How to list documents?").await?; + let cached_elapsed = start.elapsed(); + println!(" - Time: {:?}", cached_elapsed); + println!(" - Score: {:.2}", result.score); + println!(); + + // 7. Query across all documents + println!("Step 7: Cross-document query..."); + let query = "How to configure the workspace?"; + println!(" Query: \"{}\"", query); + + let results = session.query_all(query).await?; + println!(" Found {} relevant documents:", results.len()); + + for (i, result) in results.iter().enumerate() { + println!( + " {}. {} (score: {:.2})", + i + 1, + &result.doc_id[..8], + result.score + ); + } + println!(); + + // 8. Show session statistics + println!("Step 8: Session statistics:"); + let stats = session.stats(); + println!(" - Documents: {}", session.list_documents().len()); + println!(" - Queries: {}", stats.query_count.get()); + println!(" - Cache hits: {}", stats.cache_hits.get()); + println!(" - Cache misses: {}", stats.cache_misses.get()); + println!( + " - Cache hit rate: {:.1}%", + stats.cache_hit_rate() * 100.0 + ); + if let Some(avg_time) = stats.avg_query_time() { + println!(" - Avg query time: {:?}", avg_time); + } + println!(" - Session age: {:?}", session.age()); + println!(); + + // 9. 
Cleanup + println!("Step 9: Cleanup..."); + engine.remove(&doc1_id)?; + engine.remove(&doc2_id)?; + engine.remove(&doc3_id)?; + println!(" ✓ Documents removed\n"); + + println!("=== Example Complete ==="); + Ok(()) +} diff --git a/examples/streaming.rs b/examples/streaming.rs new file mode 100644 index 00000000..8942110c --- /dev/null +++ b/examples/streaming.rs @@ -0,0 +1,70 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Streaming retrieval example. +//! +//! This example demonstrates how to use streaming retrieval +//! to get results incrementally as they are found. +//! +//! # What you'll learn: +//! - How to use `query_stream()` for progressive results +//! - How to handle RetrieveEvent types +//! - How to display results as they arrive +//! - How to cancel long-running queries +//! +//! # RetrieveEvent types: +//! - `Started`: Query began, shows planned strategy +//! - `NodeVisited`: A node was visited during search +//! - `ContentFound`: Relevant content was found +//! - `Backtracking`: Search is backtracking for more data +//! - `Completed`: Query finished with final results +//! - `Error`: An error occurred +//! +//! # Use cases: +//! - Interactive Q&A with real-time feedback +//! - Long-running queries on large documents +//! - Debugging retrieval behavior +//! - Building responsive UIs +//! +//! # TODO: Implementation steps +//! +//! 1. Configure engine for streaming +//! 2. Call query_stream() instead of query() +//! 3. Process events as they arrive +//! 4. Handle completion and errors + +// TODO: Implement streaming retrieval +// ``` +// use futures::StreamExt; // for `.next()`; assumes the returned stream implements `futures::Stream` +// use vectorless::client::{Engine, RetrieveEvent}; +// +// async fn streaming_query( +// engine: &Engine, +// doc_id: &DocumentId, +// query: &str, +// ) { +// let mut stream = engine.query_stream(doc_id, query).await; +// +// while let Some(event) = stream.next().await { +// match event { +// RetrieveEvent::Started { strategy } => { +// println!("Starting search with strategy: {:?}", strategy); +// } +// RetrieveEvent::ContentFound { node_id, preview } => { +// println!("Found: {} - {}", node_id, preview); +// } +// RetrieveEvent::Completed { response } => { +// println!("Done! Confidence: {}", response.confidence); +// } +// _ => {} +// } +// } +// } +// ``` + +fn main() { + // TODO: Show streaming query usage + // + // streaming_query(&engine, &doc_id, "What is the architecture?").await; + + println!("TODO: Implement streaming example"); +} diff --git a/src/client/builder.rs b/src/client/builder.rs index 243e047e..76a335cf 100644 --- a/src/client/builder.rs +++ b/src/client/builder.rs @@ -9,7 +9,8 @@ use crate::config::{Config, ConfigLoader, RetrievalConfig}; use crate::retrieval::PipelineRetriever; use crate::storage::Workspace; -use super::Engine; +use super::engine::Engine; +use super::events::EventEmitter; /// Default configuration file names to search for. const CONFIG_FILE_NAMES: &[&str] = &["vectorless.toml", "config.toml", ".vectorless.toml"]; @@ -42,6 +43,9 @@ pub struct EngineBuilder { /// Custom retrieval config. retrieval_config: Option<RetrievalConfig>, + + /// Event emitter. + events: Option<EventEmitter>, } impl EngineBuilder { @@ -53,6 +57,7 @@ config_path: None, config: None, retrieval_config: None, + events: None, } } @@ -84,6 +89,13 @@ self } + /// Set the event emitter for callbacks.
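+ /// + /// A minimal sketch of wiring handlers into the builder (handler body illustrative): + /// + /// ```rust,ignore + /// let events = EventEmitter::new().on_index(|e| println!("index event: {:?}", e)); + /// let engine = EngineBuilder::new().with_events(events).build()?; + /// ```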
+ #[must_use] + pub fn with_events(mut self, events: EventEmitter) -> Self { + self.events = Some(events); + self + } + /// Search for config file in current directory and parent directories. fn find_config_file() -> Option<PathBuf> { let current_dir = std::env::current_dir().ok()?; @@ -127,32 +139,33 @@ /// Returns a [`BuildError`] if: /// - Configuration loading fails /// - Workspace creation fails + /// - Required API key is missing pub fn build(self) -> Result<Engine, BuildError> { // Load or create configuration - let config = if let Some(config) = self.config { - // Use explicitly provided config + let mut config = if let Some(config) = self.config { config } else if let Some(path) = self.config_path { - // Load from specified path ConfigLoader::new() .file(&path) .load() .map_err(|e| BuildError::Config(e.to_string()))? } else if let Some(config_path) = Self::find_config_file() { - // Auto-detect config file ConfigLoader::new().file(&config_path).load().map_err(|e| { BuildError::Config(format!("Failed to load {}: {}", config_path.display(), e)) })? } else { - // Use defaults Config::default() }; + // Override retrieval config if provided + if let Some(retrieval_config) = self.retrieval_config { + config.retrieval = retrieval_config; + } + // Open workspace: prefer explicit path, fallback to config let workspace = if let Some(path) = &self.workspace { Some(Workspace::open(path).map_err(|e| BuildError::Workspace(e.to_string()))?) } else { - // Use workspace_dir from config Some( Workspace::open(&config.storage.workspace_dir) .map_err(|e| BuildError::Workspace(e.to_string()))?, @@ -175,25 +188,33 @@ }; // Create pipeline retriever with config - let retrieval_config = self - .retrieval_config - .unwrap_or_else(|| config.retrieval.clone()); + let retrieval_config = config.retrieval.clone(); let mut retriever = PipelineRetriever::new().with_max_iterations(retrieval_config.search.max_iterations); - // Add LLM client if API key is available in retrieval config - if let Some(ref api_key) = retrieval_config.api_key { - let llm_config = crate::llm::LlmConfig::new(&retrieval_config.model) - .with_endpoint(retrieval_config.endpoint.clone()) - .with_api_key(api_key.clone()) - .with_temperature(retrieval_config.temperature); - let llm_client = crate::llm::LlmClient::new(llm_config); - retriever = retriever.with_llm_client(llm_client); + // LLM API key is REQUIRED for retrieval (Pilot needs it for semantic navigation) + // Try retrieval config first, then fall back to summary config + let retrieval_api_key = retrieval_config.api_key.clone() + .or_else(|| config.summary.api_key.clone()) + .ok_or(BuildError::MissingApiKey)?; + + let llm_config = crate::llm::LlmConfig::new(&retrieval_config.model) + .with_endpoint(retrieval_config.endpoint.clone()) + .with_api_key(retrieval_api_key) + .with_temperature(retrieval_config.temperature); + let llm_client = crate::llm::LlmClient::new(llm_config); + retriever = retriever.with_llm_client(llm_client); + + // Configure content aggregator if enabled + if retrieval_config.content.enabled { + retriever = retriever.with_content_config( + retrieval_config.content.to_aggregator_config() + ); } - Ok(Engine::with_components( - config, workspace, retriever, executor, - )) + // Build engine + Engine::with_components(config, workspace, retriever, executor) + .map_err(|e| BuildError::Other(e.to_string())) } } @@ -213,6 +234,14 @@ pub enum BuildError { /// Workspace error. #[error("Workspace error: {0}")] Workspace(String), + + /// Missing API key for retrieval.
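+ /// + /// Typically resolved by exporting `OPENAI_API_KEY`, or by setting the key in `vectorless.toml`; a sketch with an illustrative placeholder value: + /// + /// ```toml + /// [retrieval] + /// api_key = "sk-..." + /// ```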
+ #[error("Missing API key: LLM API key is required for retrieval. Set OPENAI_API_KEY environment variable or configure retrieval.api_key")] + MissingApiKey, + + /// Other error. + #[error("{0}")] + Other(String), } #[cfg(test)] diff --git a/src/client/context.rs b/src/client/context.rs new file mode 100644 index 00000000..344c05cb --- /dev/null +++ b/src/client/context.rs @@ -0,0 +1,337 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Request context and configuration. +//! +//! This module provides request-scoped configuration and state management +//! for client operations. It allows overriding global configuration on a +//! per-request basis. +//! +//! # Example +//! +//! ```rust,ignore +//! let ctx = ClientContext::new() +//! .with_top_k(10) +//! .with_token_budget(8000) +//! .with_timeout(Duration::from_secs(30)); +//! +//! let result = client.query_with_context(&doc_id, "query", &ctx).await?; +//! ``` + +use std::collections::HashMap; +use std::time::{Duration, Instant}; + +use uuid::Uuid; + +use crate::retrieval::content::OutputFormatConfig; + +/// Request context for client operations. +/// +/// Provides request-scoped configuration overrides and metadata. +#[derive(Debug, Clone)] +pub struct ClientContext { + /// Unique request ID for tracing. + pub request_id: Uuid, + + /// Request-specific configuration overrides. + pub config: RequestContextConfig, + + /// Request metadata (custom key-value pairs). + pub metadata: HashMap, + + /// Request deadline (for timeout). + pub deadline: Option, + + /// Priority (higher = more important). + pub priority: u8, +} + +impl Default for ClientContext { + fn default() -> Self { + Self::new() + } +} + +impl ClientContext { + /// Create a new context with defaults. + pub fn new() -> Self { + Self { + request_id: Uuid::new_v4(), + config: RequestContextConfig::default(), + metadata: HashMap::new(), + deadline: None, + priority: 5, // Default priority + } + } + + /// Create a context with a specific request ID. + pub fn with_id(id: Uuid) -> Self { + Self { + request_id: id, + ..Self::new() + } + } + + /// Set the top_k override for retrieval. + pub fn with_top_k(mut self, top_k: usize) -> Self { + self.config.top_k = Some(top_k); + self + } + + /// Set the token budget override. + pub fn with_token_budget(mut self, budget: usize) -> Self { + self.config.token_budget = Some(budget); + self + } + + /// Set the content format override. + pub fn with_content_format(mut self, format: OutputFormatConfig) -> Self { + self.config.content_format = Some(format); + self + } + + /// Set whether to include summaries. + pub fn with_summaries(mut self, include: bool) -> Self { + self.config.features.include_summaries = include; + self + } + + /// Set whether to include content. + pub fn with_content(mut self, include: bool) -> Self { + self.config.features.include_content = include; + self + } + + /// Set whether to enable caching. + pub fn with_cache(mut self, enable: bool) -> Self { + self.config.features.enable_cache = enable; + self + } + + /// Set whether to enable sufficiency checking. + pub fn with_sufficiency_check(mut self, enable: bool) -> Self { + self.config.features.enable_sufficiency_check = enable; + self + } + + /// Set a timeout duration. + pub fn with_timeout(mut self, duration: Duration) -> Self { + self.deadline = Some(Instant::now() + duration); + self + } + + /// Set a deadline. 
+ pub fn with_deadline(mut self, deadline: Instant) -> Self { + self.deadline = Some(deadline); + self + } + + /// Set the priority (0-10, higher = more important). + pub fn with_priority(mut self, priority: u8) -> Self { + self.priority = priority.min(10); + self + } + + /// Add metadata. + pub fn with_metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self { + self.metadata.insert(key.into(), value.into()); + self + } + + /// Check if the request has timed out. + pub fn is_timed_out(&self) -> bool { + self.deadline + .map(|d| Instant::now() > d) + .unwrap_or(false) + } + + /// Get remaining time until deadline. + pub fn remaining_time(&self) -> Option<Duration> { + self.deadline + .map(|d| d.saturating_duration_since(Instant::now())) + } + + /// Merge with another context (other takes precedence). + pub fn merge(&self, other: &ClientContext) -> ClientContext { + let mut merged = self.clone(); + merged.request_id = other.request_id; + + if other.config.top_k.is_some() { + merged.config.top_k = other.config.top_k; + } + if other.config.token_budget.is_some() { + merged.config.token_budget = other.config.token_budget; + } + if other.config.content_format.is_some() { + merged.config.content_format = other.config.content_format.clone(); + } + if other.deadline.is_some() { + merged.deadline = other.deadline; + } + if other.priority != 5 { + merged.priority = other.priority; + } + + // Merge metadata + for (k, v) in &other.metadata { + merged.metadata.insert(k.clone(), v.clone()); + } + + // Merge feature flags + merged.config.features = FeatureFlags { + include_summaries: other.config.features.include_summaries, + include_content: other.config.features.include_content, + enable_cache: other.config.features.enable_cache, + enable_sufficiency_check: other.config.features.enable_sufficiency_check, + }; + + merged + } +} + +/// Request-specific configuration overrides. +#[derive(Debug, Clone, Default)] +pub struct RequestContextConfig { + /// Override top_k for retrieval. + pub top_k: Option<usize>, + + /// Override token budget. + pub token_budget: Option<usize>, + + /// Override content format. + pub content_format: Option<OutputFormatConfig>, + + /// Feature flags. + pub features: FeatureFlags, +} + +/// Feature flags for request. +#[derive(Debug, Clone, Copy)] +pub struct FeatureFlags { + /// Include summaries in results. + pub include_summaries: bool, + + /// Include content in results. + pub include_content: bool, + + /// Enable result caching. + pub enable_cache: bool, + + /// Enable sufficiency checking. + pub enable_sufficiency_check: bool, +} + +impl Default for FeatureFlags { + fn default() -> Self { + Self { + include_summaries: true, + include_content: true, + enable_cache: true, + enable_sufficiency_check: true, + } + } +} + +impl FeatureFlags { + /// Create with all features enabled. + pub fn all() -> Self { + Self { + include_summaries: true, + include_content: true, + enable_cache: true, + enable_sufficiency_check: true, + } + } + + /// Create with minimal features (fastest). + pub fn minimal() -> Self { + Self { + include_summaries: false, + include_content: true, + enable_cache: false, + enable_sufficiency_check: false, + } + } + + /// Create for deep analysis.
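+ /// + /// Currently identical to [`FeatureFlags::all`]; kept as a separate constructor so deep-analysis defaults can diverge later without breaking callers.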
+ pub fn deep() -> Self { + Self { + include_summaries: true, + include_content: true, + enable_cache: true, + enable_sufficiency_check: true, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_context_creation() { + let ctx = ClientContext::new(); + assert!(!ctx.request_id.is_nil()); + assert!(ctx.config.top_k.is_none()); + assert!(ctx.deadline.is_none()); + } + + #[test] + fn test_context_with_overrides() { + let ctx = ClientContext::new() + .with_top_k(10) + .with_token_budget(8000) + .with_cache(false); + + assert_eq!(ctx.config.top_k, Some(10)); + assert_eq!(ctx.config.token_budget, Some(8000)); + assert!(!ctx.config.features.enable_cache); + } + + #[test] + fn test_context_timeout() { + let ctx = ClientContext::new() + .with_timeout(Duration::from_millis(100)); + + assert!(!ctx.is_timed_out()); + assert!(ctx.remaining_time().is_some()); + } + + #[test] + fn test_context_metadata() { + let ctx = ClientContext::new() + .with_metadata("user", "test") + .with_metadata("version", "1.0"); + + assert_eq!(ctx.metadata.get("user"), Some(&"test".to_string())); + assert_eq!(ctx.metadata.get("version"), Some(&"1.0".to_string())); + } + + #[test] + fn test_context_merge() { + let ctx1 = ClientContext::new() + .with_top_k(5) + .with_metadata("key1", "value1"); + + let ctx2 = ClientContext::new() + .with_top_k(10) + .with_metadata("key2", "value2"); + + let merged = ctx1.merge(&ctx2); + + assert_eq!(merged.config.top_k, Some(10)); + assert_eq!(merged.metadata.get("key1"), Some(&"value1".to_string())); + assert_eq!(merged.metadata.get("key2"), Some(&"value2".to_string())); + } + + #[test] + fn test_feature_flags() { + let all = FeatureFlags::all(); + assert!(all.include_summaries); + assert!(all.include_content); + + let minimal = FeatureFlags::minimal(); + assert!(!minimal.include_summaries); + assert!(!minimal.enable_cache); + } +} diff --git a/src/client/engine.rs b/src/client/engine.rs index aeaa87b5..8156586e 100644 --- a/src/client/engine.rs +++ b/src/client/engine.rs @@ -1,26 +1,20 @@ // Copyright (c) 2026 vectorless developers // SPDX-License-Identifier: Apache-2.0 -//! Main Engine client for document indexing and retrieval. +//! Main Engine client - the entry point for vectorless. //! -//! This module provides the high-level API for: -//! - Indexing documents (Markdown, PDF, DOCX, HTML) -//! - Retrieving document structure -//! - Querying documents with adaptive retrieval +//! This module provides the main client for document indexing and retrieval. +//! The Engine is an orchestrator that delegates to specialized sub-clients. //! -//! # Design +//! # Architecture //! -//! The client uses **interior mutability** patterns to allow sharing across -//! async tasks while maintaining thread safety: -//! -//! - `Arc<RwLock<Workspace>>` - Thread-safe workspace access (multiple readers, single writer) -//! - `Arc<Mutex<PipelineExecutor>>` - Exclusive pipeline execution -//! - `Arc<PipelineRetriever>` - Immutable retriever (uses interior mutability internally) -//! -//! # Thread Safety -//! -//! `Engine` is `Clone + Send + Sync`. Cloning is cheap (reference count increment). -//! All clones share the same underlying resources. +//! ```text +//! Engine (Orchestrator) +//! ├── IndexerClient → Document indexing +//! ├── RetrieverClient → Query and retrieval +//! ├── WorkspaceClient → Document persistence +//! └── EventEmitter → Progress and events +//! ``` //! //! # Example //! @@ -34,13 +28,13 @@ //! .with_workspace("./my_workspace") //! .build()?; //! -//! // Clone for use in multiple tasks (cheap - just Arc clone) -//!
let client1 = client.clone(); -//! let client2 = client.clone(); -//! -//! // Can use concurrently +//! // Index a document //! let doc_id = client.index("./document.md").await?; +//! +//! // Query the document //! let result = client.query(&doc_id, "What is this?").await?; +//! +//! println!("Found: {}", result.content); //! # Ok(()) //! # } //! ``` @@ -49,16 +43,20 @@ use std::path::Path; use std::sync::{Arc, Mutex, RwLock}; use tracing::info; -use uuid::Uuid; use crate::config::Config; use crate::domain::{DocumentTree, Error, Result}; -use crate::index::{IndexInput, PipelineExecutor, PipelineOptions, SummaryStrategy}; -use crate::parser::DocumentFormat; -use crate::retrieval::{PipelineRetriever, Retriever}; -use crate::storage::{DocumentMeta as StorageMeta, PersistedDocument, Workspace}; - -use super::types::{DocumentInfo, IndexMode, IndexOptions, QueryResult}; +use crate::index::PipelineExecutor; +use crate::retrieval::{PipelineRetriever, RetrieveOptions}; +use crate::storage::Workspace; + +use super::context::ClientContext; +use super::events::EventEmitter; +use super::indexer::IndexerClient; +use super::retriever::RetrieverClient; +use super::session::Session; +use super::types::{DocumentInfo, IndexOptions, QueryResult}; +use super::workspace::WorkspaceClient; /// The main Engine client. /// @@ -68,30 +66,26 @@ /// # Cloning /// /// Cloning is cheap - it only increments reference counts (`Arc`). All clones -/// share the same underlying resources (workspace, retriever, executor). +/// share the same underlying resources. /// /// # Thread Safety /// -/// The client is `Clone + Send + Sync` and can be safely shared across -/// threads. All mutable state is protected by appropriate synchronization: -/// -/// - Workspace: `Arc<RwLock<Workspace>>` - Multiple readers, single writer -/// - Executor: `Arc<Mutex<PipelineExecutor>>` - Exclusive access during indexing -/// - Retriever: `Arc<PipelineRetriever>` - Immutable, uses internal synchronization +/// The client is `Clone + Send + Sync` and can be safely shared across threads. pub struct Engine { /// Configuration (immutable, shared). config: Arc<Config>, - /// Workspace for persistence (with built-in LRU cache). - /// Uses RwLock for concurrent read access. - workspace: Option<Arc<RwLock<Workspace>>>, + /// Indexer client for document indexing. + indexer: IndexerClient, - /// Pipeline retriever (immutable, uses interior mutability internally). - retriever: Arc<PipelineRetriever>, + /// Retriever client for queries. + retriever: RetrieverClient, - /// Pipeline executor for indexing. - /// Uses Mutex for exclusive access during pipeline execution. - executor: Arc<Mutex<PipelineExecutor>>, + /// Workspace client for persistence. + workspace: Option<WorkspaceClient>, + + /// Event emitter. + events: EventEmitter, } impl Engine { @@ -106,11 +100,47 @@ /// Note: Prefer using [`Engine::builder()`] for more control. fn new() -> Result<Self> { let config = Config::default(); + Self::with_components( + config, + None, + PipelineRetriever::new(), + PipelineExecutor::new(), + ) + } + + // ============================================================ + // Constructor (for Builder) + // ============================================================ + + /// Create a new client with the given components.
+ pub(crate) fn with_components( + config: Config, + workspace: Option<Workspace>, + retriever: PipelineRetriever, + executor: PipelineExecutor, + ) -> Result<Self> { + let config = Arc::new(config); + let events = EventEmitter::new(); + + // Create indexer client + let indexer = IndexerClient::new(executor) + .with_events(events.clone()); + + // Create retriever client + let retriever = RetrieverClient::new(retriever, Arc::clone(&config)) + .with_events(events.clone()); + + // Create workspace client (if workspace provided) + let workspace_client = workspace.map(|ws| { + WorkspaceClient::new(ws).with_events(events.clone()) + }); + Ok(Self { - config: Arc::new(config), - workspace: None, - retriever: Arc::new(PipelineRetriever::new()), - executor: Arc::new(Mutex::new(PipelineExecutor::new())), + config, + indexer, + retriever, + workspace: workspace_client, + events, }) } @@ -142,94 +172,101 @@ path: impl AsRef<Path>, options: IndexOptions, ) -> Result<String> { - let path = path.as_ref(); - let path = path.canonicalize().unwrap_or_else(|_| path.to_path_buf()); - - if !path.exists() { - return Err(Error::Parse(format!("File not found: {}", path.display()))); - } - - // Generate document ID - let doc_id = Uuid::new_v4().to_string(); - - // Detect format - let format = self.detect_format(&path, &options)?; - - info!("Indexing {:?} document: {}", format, path.display()); - - // Convert client options to pipeline options - let pipeline_options = PipelineOptions { - mode: match options.mode { - IndexMode::Auto => crate::index::IndexMode::Auto, - IndexMode::Pdf => crate::index::IndexMode::Pdf, - IndexMode::Markdown => crate::index::IndexMode::Markdown, - IndexMode::Html => crate::index::IndexMode::Html, - IndexMode::Docx => crate::index::IndexMode::Docx, - }, - generate_ids: options.generate_ids, - summary_strategy: if options.generate_summaries { - SummaryStrategy::selective(self.config.indexer.min_summary_tokens, false) - } else { - SummaryStrategy::none() - }, - generate_description: options.generate_description, - ..Default::default() - }; - - // Create pipeline input and execute (with mutex lock) - let input = IndexInput::file(&path); - let result = { - let mut executor = self - .executor - .lock() - .map_err(|_| Error::Other("Pipeline executor lock poisoned".to_string()))?; - executor.execute(input, pipeline_options).await? - }; - - // Build persisted document - let tree = result - .tree - .ok_or_else(|| Error::Parse("Document tree not generated".to_string()))?; - - let meta = StorageMeta::new(&doc_id, &result.name, format.extension()) - .with_source_path(path.to_string_lossy().to_string()) - .with_description(result.description.clone().unwrap_or_default()); - - let mut doc = PersistedDocument::new(meta, tree); - - // Add page count if available - if let Some(page_count) = result.page_count { - for i in 1..=page_count { - doc.add_page(i, ""); - } - } + let doc = self.indexer.index_with_options(path, options).await?; + let persisted = self.indexer.to_persisted(doc); // Save to workspace if configured if let Some(ref workspace) = self.workspace { - let mut ws = workspace - .write() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - ws.add(&doc)?; - info!("Saved document {} to workspace", doc_id); + workspace.save(&persisted)?; } - info!("Indexing complete. Document ID: {}", doc_id); + let doc_id = persisted.meta.id.clone(); + info!("Indexed document: {}", doc_id); Ok(doc_id) } - /// Detect document format from path and options.
- fn detect_format(&self, path: &Path, options: &IndexOptions) -> Result<DocumentFormat> { - match options.mode { - IndexMode::Auto => { - let ext = path.extension().and_then(|e| e.to_str()).unwrap_or(""); - DocumentFormat::from_extension(ext) - .ok_or_else(|| Error::Parse(format!("Unknown format: {}", ext))) - } - IndexMode::Pdf => Ok(DocumentFormat::Pdf), - IndexMode::Markdown => Ok(DocumentFormat::Markdown), - IndexMode::Html => Ok(DocumentFormat::Html), - IndexMode::Docx => Ok(DocumentFormat::Docx), + // ============================================================ + // Document Querying + // ============================================================ + + /// Query a document. + /// + /// Uses the adaptive retriever to find relevant content. + /// + /// # Errors + /// + /// Returns an error if: + /// - No workspace is configured + /// - The document is not found + /// - The retrieval fails + pub async fn query(&self, doc_id: &str, question: &str) -> Result<QueryResult> { + let tree = self.get_structure(doc_id)?; + + let options = RetrieveOptions::new() + .with_top_k(self.config.retrieval.top_k) + .with_include_content(true) + .with_include_summaries(true); + + let mut result = self.retriever.query(&tree, question, &options).await?; + result.doc_id = doc_id.to_string(); + + Ok(result) + } + + /// Query a document with context. + /// + /// Allows request-specific configuration overrides. + pub async fn query_with_context( + &self, + doc_id: &str, + question: &str, + ctx: &ClientContext, + ) -> Result<QueryResult> { + let tree = self.get_structure(doc_id)?; + + let mut options = RetrieveOptions::new() + .with_top_k(self.config.retrieval.top_k) + .with_include_content(true) + .with_include_summaries(true); + + // Apply context overrides + if let Some(top_k) = ctx.config.top_k { + options.top_k = top_k; } + if let Some(token_budget) = ctx.config.token_budget { + options.max_tokens = token_budget; + } + + let mut result = self.retriever.query_with_context(&tree, question, &options, ctx).await?; + result.doc_id = doc_id.to_string(); + + Ok(result) + } + + // ============================================================ + // Session Management + // ============================================================ + + /// Create a session for multi-document operations.
+ /// + /// Sessions provide: + /// - Automatic caching of document trees + /// - Cross-document queries + /// - Session statistics + pub fn session(&self) -> Session { + let workspace = self.workspace.clone().unwrap_or_else(|| { + WorkspaceClient::from_arc( + Arc::new(RwLock::new(Workspace::open("./temp_workspace").unwrap())), + self.events.clone(), + ) + }); + + Session::new( + self.indexer.clone(), + self.retriever.clone(), + workspace, + self.events.clone(), + ) } // ============================================================ @@ -240,24 +277,7 @@ #[must_use] pub fn list_documents(&self) -> Vec<DocumentInfo> { match &self.workspace { - Some(workspace) => { - let ws = match workspace.read() { - Ok(guard) => guard, - Err(_) => return Vec::new(), - }; - ws.list_documents() - .iter() - .filter_map(|id| ws.get_meta(id)) - .map(|meta| DocumentInfo { - id: meta.id.clone(), - name: meta.doc_name.clone(), - format: meta.doc_type.clone(), - description: meta.doc_description.clone(), - page_count: meta.page_count, - line_count: meta.line_count, - }) - .collect() - } + Some(workspace) => workspace.list().unwrap_or_default(), None => Vec::new(), } } @@ -270,18 +290,10 @@ /// - No workspace is configured /// - The document is not found pub fn get_structure(&self, doc_id: &str) -> Result<DocumentTree> { - let workspace = self - .workspace - .as_ref() + let workspace = self.workspace.as_ref() .ok_or_else(|| Error::Config("No workspace configured".to_string()))?; - // Use read lock - Workspace::load now uses interior mutability for cache - let ws = workspace - .read() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - - let doc = ws - .load(doc_id)? + let doc = workspace.load(doc_id)? .ok_or_else(|| Error::DocumentNotFound(format!("Document not found: {}", doc_id)))?; Ok(doc.tree) @@ -296,18 +308,10 @@ /// - The document is not found /// - No page content is available pub fn get_page_content(&self, doc_id: &str, pages: &str) -> Result<String> { - let workspace = self - .workspace - .as_ref() + let workspace = self.workspace.as_ref() .ok_or_else(|| Error::Config("No workspace configured".to_string()))?; - // Use read lock - Workspace::load now uses interior mutability for cache - let ws = workspace - .read() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - - let doc = ws - .load(doc_id)? + let doc = workspace.load(doc_id)? .ok_or_else(|| Error::DocumentNotFound(format!("Document not found: {}", doc_id)))?; if doc.pages.is_empty() { @@ -358,73 +362,8 @@ Ok(result) } - /// Query a document. - /// - /// Uses the adaptive retriever to find relevant content.
- /// - /// # Errors - /// - /// Returns an error if: - /// - No workspace is configured - /// - The document is not found - /// - The retrieval fails - pub async fn query(&self, doc_id: &str, question: &str) -> Result<QueryResult> { - let tree = self.get_structure(doc_id)?; - - // Build retrieve options from config - let retrieve_options = crate::retrieval::RetrieveOptions::new() - .with_top_k(self.config.retrieval.top_k) - .with_include_content(true) - .with_include_summaries(true); - - // Use adaptive retriever - let response = self - .retriever - .retrieve(&tree, question, &retrieve_options) - .await - .map_err(|e| Error::Retrieval(e.to_string()))?; - - // Extract node IDs and build content from results - let node_ids: Vec<String> = response - .results - .iter() - .filter_map(|r| r.node_id.clone()) - .collect(); - - let content_parts: Vec<String> = response - .results - .iter() - .map(|r| { - let mut parts = vec![format!("## {}", r.title)]; - - if let Some(ref summary) = r.summary { - parts.push(format!("Summary: {}", summary)); - } - - if let Some(ref content) = r.content { - parts.push(content.clone()); - } - - parts.join("\n\n") - }) - .collect(); - - let content = if content_parts.is_empty() { - response.content - } else { - content_parts.join("\n\n---\n\n") - }; - - Ok(QueryResult { - doc_id: doc_id.to_string(), - node_ids, - content, - score: response.confidence, - }) - } - // ============================================================ - // Persistence + // Persistence Operations // ============================================================ /// Load a document from the workspace into cache. @@ -435,21 +374,14 @@ /// /// Returns an error if no workspace is configured. pub fn load(&self, doc_id: &str) -> Result<bool> { - let workspace = self - .workspace - .as_ref() + let workspace = self.workspace.as_ref() .ok_or_else(|| Error::Config("No workspace configured".to_string()))?; - // Use read lock - Workspace::load now uses interior mutability for cache - let ws = workspace - .read() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - - if !ws.contains(doc_id) { + if !workspace.exists(doc_id)? { return Ok(false); } - let _ = ws.load(doc_id)?; + let _ = workspace.load(doc_id)?; Ok(true) } @@ -459,15 +391,10 @@ /// /// Returns an error if no workspace is configured. pub fn remove(&self, doc_id: &str) -> Result<bool> { - let workspace = self - .workspace - .as_ref() + let workspace = self.workspace.as_ref() .ok_or_else(|| Error::Config("No workspace configured".to_string()))?; - let mut ws = workspace - .write() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - ws.remove(doc_id) + workspace.remove(doc_id) } /// Check if a document exists in the workspace. @@ -476,15 +403,10 @@ /// /// Returns an error if no workspace is configured. pub fn exists(&self, doc_id: &str) -> Result<bool> { - let workspace = self - .workspace - .as_ref() + let workspace = self.workspace.as_ref() .ok_or_else(|| Error::Config("No workspace configured".to_string()))?; - let ws = workspace - .read() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - Ok(ws.contains(doc_id)) + workspace.exists(doc_id) } /// Get metadata for a document. /// /// # Errors /// /// Returns an error if no workspace is configured.
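/// /// A short illustrative sketch (assumes an `engine` and `doc_id` in scope): /// /// ```rust,ignore /// if let Some(info) = engine.get_metadata(&doc_id)? { /// println!("{} ({})", info.name, info.format); /// } /// ```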
pub fn get_metadata(&self, doc_id: &str) -> Result<Option<DocumentInfo>> { - let workspace = self - .workspace - .as_ref() + let workspace = self.workspace.as_ref() .ok_or_else(|| Error::Config("No workspace configured".to_string()))?; - let ws = workspace - .read() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - - Ok(ws.get_meta(doc_id).map(|meta| DocumentInfo { - id: meta.id.clone(), - name: meta.doc_name.clone(), - format: meta.doc_type.clone(), - description: meta.doc_description.clone(), - page_count: meta.page_count, - line_count: meta.line_count, - })) + workspace.get_document_info(doc_id) } /// Remove multiple documents from the workspace. @@ -520,22 +429,10 @@ /// /// Returns an error if no workspace is configured. pub fn batch_remove(&self, doc_ids: &[&str]) -> Result<usize> { - let workspace = self - .workspace - .as_ref() + let workspace = self.workspace.as_ref() .ok_or_else(|| Error::Config("No workspace configured".to_string()))?; - let mut ws = workspace - .write() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - - let mut removed = 0; - for doc_id in doc_ids { - if ws.remove(doc_id)? { - removed += 1; - } - } - Ok(removed) + workspace.batch_remove(doc_ids) } /// Remove all documents from the workspace. @@ -546,38 +443,16 @@ /// /// Returns an error if no workspace is configured. pub fn clear(&self) -> Result<usize> { - let workspace = self - .workspace - .as_ref() + let workspace = self.workspace.as_ref() .ok_or_else(|| Error::Config("No workspace configured".to_string()))?; - let mut ws = workspace - .write() - .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?; - - let doc_ids: Vec<String> = ws.list_documents().iter().map(|s| s.to_string()).collect(); - let count = doc_ids.len(); - - for doc_id in &doc_ids { - let _ = ws.remove(doc_id); - } - - Ok(count) + workspace.clear() } /// Get the number of indexed documents. #[must_use] pub fn len(&self) -> usize { - match &self.workspace { - Some(workspace) => { - let ws = match workspace.read() { - Ok(guard) => guard, - Err(_) => return 0, - }; - ws.len() - } - None => 0, - } + self.workspace.as_ref().map(|w| w.len()).unwrap_or(0) } /// Check if there are no documents. @@ -587,22 +462,27 @@ } // ============================================================ - // Internal API (for Builder) + // Sub-Client Access // ============================================================ - /// Create a new client with the given components. - pub(crate) fn with_components( - config: Config, - workspace: Option<Workspace>, - retriever: PipelineRetriever, - executor: PipelineExecutor, - ) -> Self { - Self { - config: Arc::new(config), - workspace: workspace.map(|w| Arc::new(RwLock::new(w))), - retriever: Arc::new(retriever), - executor: Arc::new(Mutex::new(executor)), - } + /// Get the indexer client. + pub fn indexer(&self) -> &IndexerClient { + &self.indexer + } + + /// Get the retriever client. + pub fn retriever(&self) -> &RetrieverClient { + &self.retriever + } + + /// Get the workspace client. + pub fn workspace(&self) -> Option<&WorkspaceClient> { + self.workspace.as_ref() + } + + /// Get the configuration.
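+ /// + /// An illustrative read of one retrieval setting: + /// + /// ```rust,ignore + /// let top_k = engine.config().retrieval.top_k; + /// ```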
+ pub fn config(&self) -> &Config { + &self.config } } @@ -610,9 +490,10 @@ impl Clone for Engine { fn clone(&self) -> Self { Self { config: Arc::clone(&self.config), - workspace: self.workspace.as_ref().map(Arc::clone), - retriever: Arc::clone(&self.retriever), - executor: Arc::clone(&self.executor), + indexer: self.indexer.clone(), + retriever: self.retriever.clone(), + workspace: self.workspace.clone(), + events: self.events.clone(), } } } @@ -631,3 +512,15 @@ impl std::fmt::Debug for Engine { .finish_non_exhaustive() } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_engine_builder() { + let builder = Engine::builder(); + // Builder exists + let _ = builder; + } +} diff --git a/src/client/events.rs b/src/client/events.rs new file mode 100644 index 00000000..a1d797c4 --- /dev/null +++ b/src/client/events.rs @@ -0,0 +1,365 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Event system for client operations. +//! +//! This module provides event types and handlers for observing +//! and reacting to client operations (indexing, querying, etc.). +//! +//! # Example +//! +//! ```rust,ignore +//! let emitter = EventEmitter::new() +//! .on_index(|e| match e { +//! IndexEvent::Complete { doc_id } => println!("Indexed: {}", doc_id), +//! _ => {} +//! }); +//! +//! let client = EngineBuilder::new() +//! .with_events(emitter) +//! .build()?; +//! ``` + +use std::sync::Arc; + +use async_trait::async_trait; +use tracing::info; + +use crate::parser::DocumentFormat; +use crate::retrieval::SufficiencyLevel; + +/// Event types for client operations. +#[derive(Debug, Clone)] +pub enum Event { + /// Indexing events. + Index(IndexEvent), + + /// Query events. + Query(QueryEvent), + + /// Workspace events. + Workspace(WorkspaceEvent), +} + +/// Indexing operation events. +#[derive(Debug, Clone)] +pub enum IndexEvent { + /// Started indexing a document. + Started { + /// File path being indexed. + path: String, + }, + + /// Document format detected. + FormatDetected { + /// Detected format. + format: DocumentFormat, + }, + + /// Parsing progress update. + ParsingProgress { + /// Percentage complete (0-100). + percent: u8, + }, + + /// Document tree built. + TreeBuilt { + /// Number of nodes in the tree. + node_count: usize, + }, + + /// Summary generation progress. + SummaryProgress { + /// Number of summaries completed. + completed: usize, + /// Total summaries to generate. + total: usize, + }, + + /// Indexing completed successfully. + Complete { + /// Generated document ID. + doc_id: String, + }, + + /// Error occurred during indexing. + Error { + /// Error message. + message: String, + }, +} + +/// Query operation events. +#[derive(Debug, Clone)] +pub enum QueryEvent { + /// Search started. + Started { + /// The query string. + query: String, + }, + + /// Node visited during search. + NodeVisited { + /// Node ID. + node_id: String, + /// Node title. + title: String, + /// Relevance score. + score: f32, + }, + + /// Candidate result found. + CandidateFound { + /// Node ID. + node_id: String, + /// Relevance score. + score: f32, + }, + + /// Sufficiency check result. + SufficiencyCheck { + /// Sufficiency level. + level: SufficiencyLevel, + /// Total tokens collected. + tokens: usize, + }, + + /// Query completed. + Complete { + /// Total results found. + total_results: usize, + /// Overall confidence score. + confidence: f32, + }, + + /// Error occurred during query. + Error { + /// Error message. 
+        message: String,
+    },
+}
+
+/// Workspace operation events.
+#[derive(Debug, Clone)]
+pub enum WorkspaceEvent {
+    /// Document saved to workspace.
+    Saved {
+        /// Document ID.
+        doc_id: String,
+    },
+
+    /// Document loaded from workspace.
+    Loaded {
+        /// Document ID.
+        doc_id: String,
+        /// Whether it was a cache hit.
+        cache_hit: bool,
+    },
+
+    /// Document removed from workspace.
+    Removed {
+        /// Document ID.
+        doc_id: String,
+    },
+
+    /// Workspace cleared.
+    Cleared {
+        /// Number of documents removed.
+        count: usize,
+    },
+}
+
+/// Sync event handler trait.
+pub trait EventHandler: Send + Sync {
+    /// Handle an event.
+    fn handle(&self, event: &Event);
+}
+
+/// Async event handler trait.
+#[async_trait]
+pub trait AsyncEventHandler: Send + Sync {
+    /// Handle an event asynchronously.
+    async fn handle(&self, event: &Event);
+}
+
+/// Type alias for sync index handler.
+pub type IndexHandler = Box<dyn Fn(&IndexEvent) + Send + Sync>;
+
+/// Type alias for sync query handler.
+pub type QueryHandler = Box<dyn Fn(&QueryEvent) + Send + Sync>;
+
+/// Type alias for sync workspace handler.
+pub type WorkspaceHandler = Box<dyn Fn(&WorkspaceEvent) + Send + Sync>;
+
+/// Event emitter for client operations.
+///
+/// Collects event handlers and dispatches events to them.
+#[derive(Default)]
+pub struct EventEmitter {
+    /// Index event handlers.
+    index_handlers: Vec<IndexHandler>,
+
+    /// Query event handlers.
+    query_handlers: Vec<QueryHandler>,
+
+    /// Workspace event handlers.
+    workspace_handlers: Vec<WorkspaceHandler>,
+
+    /// Async handlers.
+    async_handlers: Vec<Arc<dyn AsyncEventHandler>>,
+}
+
+impl EventEmitter {
+    /// Create a new event emitter with no handlers.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Add an index event handler.
+    pub fn on_index<F>(mut self, handler: F) -> Self
+    where
+        F: Fn(&IndexEvent) + Send + Sync + 'static,
+    {
+        self.index_handlers.push(Box::new(handler));
+        self
+    }
+
+    /// Add a query event handler.
+    pub fn on_query<F>(mut self, handler: F) -> Self
+    where
+        F: Fn(&QueryEvent) + Send + Sync + 'static,
+    {
+        self.query_handlers.push(Box::new(handler));
+        self
+    }
+
+    /// Add a workspace event handler.
+    pub fn on_workspace<F>(mut self, handler: F) -> Self
+    where
+        F: Fn(&WorkspaceEvent) + Send + Sync + 'static,
+    {
+        self.workspace_handlers.push(Box::new(handler));
+        self
+    }
+
+    /// Add an async event handler.
+    pub fn with_async_handler<H>(mut self, handler: Arc<H>) -> Self
+    where
+        H: AsyncEventHandler + 'static,
+    {
+        self.async_handlers.push(handler);
+        self
+    }
+
+    /// Emit an index event.
+    pub fn emit_index(&self, event: IndexEvent) {
+        for handler in &self.index_handlers {
+            handler(&event);
+        }
+        if !self.async_handlers.is_empty() {
+            // A sync emit cannot await async handlers; log the event instead.
+            let event = Event::Index(event.clone());
+            info!("Async event: {:?}", event);
+        }
+    }
+
+    /// Emit a query event.
+    pub fn emit_query(&self, event: QueryEvent) {
+        for handler in &self.query_handlers {
+            handler(&event);
+        }
+    }
+
+    /// Emit a workspace event.
+    pub fn emit_workspace(&self, event: WorkspaceEvent) {
+        for handler in &self.workspace_handlers {
+            handler(&event);
+        }
+    }
+
+    /// Check if there are any handlers registered.
+    pub fn has_handlers(&self) -> bool {
+        !self.index_handlers.is_empty()
+            || !self.query_handlers.is_empty()
+            || !self.workspace_handlers.is_empty()
+            || !self.async_handlers.is_empty()
+    }
+
+    /// Merge another emitter into this one.
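+    ///
+    /// Useful when a builder supplies default handlers and the caller adds
+    /// more. A minimal sketch using only names from this module:
+    ///
+    /// ```rust,ignore
+    /// let merged = EventEmitter::new()
+    ///     .on_index(|_| {})
+    ///     .merge(EventEmitter::new().on_query(|_| {}));
+    /// assert!(merged.has_handlers());
+    /// ```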
+    pub fn merge(mut self, other: EventEmitter) -> Self {
+        self.index_handlers.extend(other.index_handlers);
+        self.query_handlers.extend(other.query_handlers);
+        self.workspace_handlers.extend(other.workspace_handlers);
+        self.async_handlers.extend(other.async_handlers);
+        self
+    }
+}
+
+impl std::fmt::Debug for EventEmitter {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("EventEmitter")
+            .field("index_handlers", &self.index_handlers.len())
+            .field("query_handlers", &self.query_handlers.len())
+            .field("workspace_handlers", &self.workspace_handlers.len())
+            .field("async_handlers", &self.async_handlers.len())
+            .finish()
+    }
+}
+
+impl Clone for EventEmitter {
+    fn clone(&self) -> Self {
+        // Clone returns an empty emitter since we can't clone closures
+        Self::new()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::atomic::{AtomicUsize, Ordering};
+    use std::sync::Arc;
+
+    #[test]
+    fn test_event_emitter_index() {
+        let counter = Arc::new(AtomicUsize::new(0));
+        let counter_clone = counter.clone();
+
+        let emitter = EventEmitter::new().on_index(move |_e| {
+            counter_clone.fetch_add(1, Ordering::SeqCst);
+        });
+
+        emitter.emit_index(IndexEvent::Started {
+            path: "test.md".to_string(),
+        });
+        emitter.emit_index(IndexEvent::Complete {
+            doc_id: "123".to_string(),
+        });
+
+        assert_eq!(counter.load(Ordering::SeqCst), 2);
+    }
+
+    #[test]
+    fn test_event_emitter_query() {
+        let counter = Arc::new(AtomicUsize::new(0));
+        let counter_clone = counter.clone();
+
+        let emitter = EventEmitter::new().on_query(move |_e| {
+            counter_clone.fetch_add(1, Ordering::SeqCst);
+        });
+
+        emitter.emit_query(QueryEvent::Started {
+            query: "test".to_string(),
+        });
+
+        assert_eq!(counter.load(Ordering::SeqCst), 1);
+    }
+
+    #[test]
+    fn test_event_emitter_has_handlers() {
+        let empty = EventEmitter::new();
+        assert!(!empty.has_handlers());
+
+        let with_handler = EventEmitter::new().on_index(|_| {});
+        assert!(with_handler.has_handlers());
+    }
+}
diff --git a/src/client/indexer.rs b/src/client/indexer.rs
new file mode 100644
index 00000000..7f41cde8
--- /dev/null
+++ b/src/client/indexer.rs
@@ -0,0 +1,351 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Document indexing client.
+//!
+//! This module provides document indexing operations including
+//! format detection, parsing, and tree building.
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! let indexer = IndexerClient::new(executor);
+//!
+//! let doc = indexer
+//!     .index_with_options("./document.md", IndexOptions::new().with_summaries())
+//!     .await?;
+//!
+//! println!("Indexed: {}", doc.id);
+//! ```
+
+use std::path::{Path, PathBuf};
+use std::sync::{Arc, Mutex};
+
+use tracing::info;
+use uuid::Uuid;
+
+use crate::domain::{Error, Result};
+use crate::index::{IndexInput, IndexMode, PipelineExecutor, PipelineOptions, SummaryStrategy};
+use crate::parser::DocumentFormat;
+use crate::storage::{DocumentMeta, PersistedDocument};
+
+use super::context::ClientContext;
+use super::events::{EventEmitter, IndexEvent};
+use super::types::{IndexOptions, IndexMode as ClientIndexMode, IndexedDocument};
+
+/// Document indexing client.
+///
+/// Provides operations for parsing and indexing documents.
+pub struct IndexerClient {
+    /// Pipeline executor.
+    executor: Arc<Mutex<PipelineExecutor>>,
+
+    /// Event emitter.
+    events: EventEmitter,
+
+    /// Configuration.
+    config: IndexerConfig,
+}
+
+/// Indexer configuration.
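+///
+/// A sketch of overriding a single default (values illustrative; `executor`
+/// assumed to exist):
+///
+/// ```rust,ignore
+/// let config = IndexerConfig { min_summary_tokens: 50, ..Default::default() };
+/// let indexer = IndexerClient::new(executor).with_config(config);
+/// ```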
+#[derive(Debug, Clone)]
+pub struct IndexerConfig {
+    /// Minimum content tokens required to generate a summary.
+    pub min_summary_tokens: usize,
+
+    /// Whether to generate IDs by default.
+    pub generate_ids: bool,
+
+    /// Whether to generate descriptions by default.
+    pub generate_descriptions: bool,
+}
+
+impl Default for IndexerConfig {
+    fn default() -> Self {
+        Self {
+            min_summary_tokens: 20,
+            generate_ids: true,
+            generate_descriptions: false,
+        }
+    }
+}
+
+impl IndexerClient {
+    /// Create a new indexer client.
+    pub fn new(executor: PipelineExecutor) -> Self {
+        Self {
+            executor: Arc::new(Mutex::new(executor)),
+            events: EventEmitter::new(),
+            config: IndexerConfig::default(),
+        }
+    }
+
+    /// Create with event emitter.
+    pub fn with_events(mut self, events: EventEmitter) -> Self {
+        self.events = events;
+        self
+    }
+
+    /// Create with configuration.
+    pub fn with_config(mut self, config: IndexerConfig) -> Self {
+        self.config = config;
+        self
+    }
+
+    /// Create from an existing executor Arc.
+    pub(crate) fn from_arc(
+        executor: Arc<Mutex<PipelineExecutor>>,
+        events: EventEmitter,
+        config: IndexerConfig,
+    ) -> Self {
+        Self {
+            executor,
+            events,
+            config,
+        }
+    }
+
+    /// Index a document from a file path.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - The file does not exist
+    /// - The file format is not supported
+    /// - The pipeline execution fails
+    pub async fn index(&self, path: impl AsRef<Path>) -> Result<IndexedDocument> {
+        self.index_with_options(path, IndexOptions::default()).await
+    }
+
+    /// Index a document with custom options.
+    ///
+    /// # Errors
+    ///
+    /// See [`IndexerClient::index`].
+    pub async fn index_with_options(
+        &self,
+        path: impl AsRef<Path>,
+        options: IndexOptions,
+    ) -> Result<IndexedDocument> {
+        let path = path.as_ref();
+        let path = path.canonicalize().unwrap_or_else(|_| path.to_path_buf());
+
+        if !path.exists() {
+            return Err(Error::Parse(format!("File not found: {}", path.display())));
+        }
+
+        // Emit start event
+        self.events.emit_index(IndexEvent::Started {
+            path: path.display().to_string(),
+        });
+
+        // Generate document ID
+        let doc_id = Uuid::new_v4().to_string();
+
+        // Detect format
+        let format = self.detect_format(&path, &options)?;
+        self.events.emit_index(IndexEvent::FormatDetected { format });
+
+        info!("Indexing {:?} document: {}", format, path.display());
+
+        // Convert client options to pipeline options
+        let pipeline_options = PipelineOptions {
+            mode: match options.mode {
+                ClientIndexMode::Auto => IndexMode::Auto,
+                ClientIndexMode::Pdf => IndexMode::Pdf,
+                ClientIndexMode::Markdown => IndexMode::Markdown,
+                ClientIndexMode::Html => IndexMode::Html,
+                ClientIndexMode::Docx => IndexMode::Docx,
+            },
+            generate_ids: options.generate_ids,
+            summary_strategy: if options.generate_summaries {
+                SummaryStrategy::selective(self.config.min_summary_tokens, false)
+            } else {
+                SummaryStrategy::none()
+            },
+            generate_description: options.generate_description,
+            ..Default::default()
+        };
+
+        // Create pipeline input and execute
+        let input = IndexInput::file(&path);
+        let result = {
+            let mut executor = self.executor.lock()
+                .map_err(|_| Error::Other("Pipeline executor lock poisoned".to_string()))?;
+            executor.execute(input, pipeline_options).await?
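+            // Note: the std::sync::Mutex guard is held across this `.await`,
+            // so the future returned by this method is not `Send`; an
+            // async-aware mutex would be needed on a multi-threaded runtime.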
+        };
+
+        // Build indexed document
+        let tree = result
+            .tree
+            .ok_or_else(|| Error::Parse("Document tree not generated".to_string()))?;
+
+        let node_count = tree.node_count();
+        self.events.emit_index(IndexEvent::TreeBuilt { node_count });
+
+        let mut doc = IndexedDocument::new(&doc_id, format)
+            .with_name(&result.name)
+            .with_source_path(&path)
+            .with_tree(tree);
+
+        if let Some(desc) = &result.description {
+            doc = doc.with_description(desc);
+        }
+
+        if let Some(page_count) = result.page_count {
+            doc = doc.with_page_count(page_count);
+        }
+
+        info!("Indexing complete: {} ({} nodes)", doc_id, node_count);
+        self.events.emit_index(IndexEvent::Complete { doc_id });
+
+        Ok(doc)
+    }
+
+    /// Detect document format from path and options.
+    pub fn detect_format(&self, path: &Path, options: &IndexOptions) -> Result<DocumentFormat> {
+        match options.mode {
+            ClientIndexMode::Auto => {
+                let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
+                DocumentFormat::from_extension(ext)
+                    .ok_or_else(|| Error::Parse(format!("Unknown format: {}", ext)))
+            }
+            ClientIndexMode::Pdf => Ok(DocumentFormat::Pdf),
+            ClientIndexMode::Markdown => Ok(DocumentFormat::Markdown),
+            ClientIndexMode::Html => Ok(DocumentFormat::Html),
+            ClientIndexMode::Docx => Ok(DocumentFormat::Docx),
+        }
+    }
+
+    /// Validate a document before indexing.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the file doesn't exist or is not readable.
+    pub fn validate(&self, path: impl AsRef<Path>) -> Result<ValidationResult> {
+        let path = path.as_ref();
+
+        if !path.exists() {
+            return Ok(ValidationResult {
+                valid: false,
+                errors: vec![format!("File not found: {}", path.display())],
+                warnings: vec![],
+                format: None,
+                estimated_size: 0,
+            });
+        }
+
+        let metadata = std::fs::metadata(path)
+            .map_err(|e| Error::Parse(format!("Cannot read file metadata: {}", e)))?;
+
+        let estimated_size = metadata.len() as usize;
+        let mut warnings = Vec::new();
+
+        // Check file size
+        if estimated_size > 100 * 1024 * 1024 {
+            warnings.push("Large file (>100MB) may take longer to index".to_string());
+        }
+
+        // Detect format
+        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
+        let format = DocumentFormat::from_extension(ext);
+
+        if format.is_none() {
+            return Ok(ValidationResult {
+                valid: false,
+                errors: vec![format!("Unknown format: {}", ext)],
+                warnings,
+                format: None,
+                estimated_size,
+            });
+        }
+
+        Ok(ValidationResult {
+            valid: true,
+            errors: vec![],
+            warnings,
+            format,
+            estimated_size,
+        })
+    }
+
+    /// Convert IndexedDocument to PersistedDocument for storage.
+    pub fn to_persisted(&self, doc: IndexedDocument) -> PersistedDocument {
+        let meta = DocumentMeta::new(&doc.id, &doc.name, doc.format.extension())
+            .with_source_path(
+                doc.source_path
+                    .as_ref()
+                    .map(|p| p.to_string_lossy().to_string())
+                    .unwrap_or_default(),
+            )
+            .with_description(doc.description.clone().unwrap_or_default());
+
+        let mut persisted = PersistedDocument::new(
+            meta,
+            doc.tree.expect("IndexedDocument must have a tree"),
+        );
+
+        for page in doc.pages {
+            persisted.add_page(page.page, &page.content);
+        }
+
+        persisted
+    }
+
+    /// Get the underlying executor Arc (for advanced use).
+    pub(crate) fn inner(&self) -> Arc<Mutex<PipelineExecutor>> {
+        Arc::clone(&self.executor)
+    }
+}
+
+impl Clone for IndexerClient {
+    fn clone(&self) -> Self {
+        Self {
+            executor: Arc::clone(&self.executor),
+            events: self.events.clone(),
+            config: self.config.clone(),
+        }
+    }
+}
+
+/// Document validation result.
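+///
+/// Returned by [`IndexerClient::validate`]. A sketch of the intended
+/// check-then-index flow:
+///
+/// ```rust,ignore
+/// let report = indexer.validate("./paper.pdf")?;
+/// if report.valid {
+///     indexer.index("./paper.pdf").await?;
+/// } else {
+///     eprintln!("cannot index: {:?}", report.errors);
+/// }
+/// ```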
+#[derive(Debug, Clone)]
+pub struct ValidationResult {
+    /// Whether the document is valid for indexing.
+    pub valid: bool,
+
+    /// Validation errors (prevents indexing).
+    pub errors: Vec<String>,
+
+    /// Validation warnings (non-blocking).
+    pub warnings: Vec<String>,
+
+    /// Detected document format.
+    pub format: Option<DocumentFormat>,
+
+    /// Estimated file size in bytes.
+    pub estimated_size: usize,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_indexer_client_creation() {
+        let executor = PipelineExecutor::new();
+        let client = IndexerClient::new(executor);
+        assert_eq!(client.config.min_summary_tokens, 20);
+    }
+
+    #[test]
+    fn test_validate_missing_file() {
+        let executor = PipelineExecutor::new();
+        let client = IndexerClient::new(executor);
+
+        let result = client.validate("./nonexistent.md").unwrap();
+        assert!(!result.valid);
+        assert!(!result.errors.is_empty());
+    }
+}
diff --git a/src/client/mod.rs b/src/client/mod.rs
index 907d8c0e..51abecd0 100644
--- a/src/client/mod.rs
+++ b/src/client/mod.rs
@@ -6,6 +6,25 @@
 //! This module provides the main entry point for using vectorless:
 //! - [`Engine`] — The main client for indexing and querying documents
 //! - [`EngineBuilder`] — Builder pattern for client configuration
+//! - [`Session`] — Multi-document session management
+//!
+//! # Architecture
+//!
+//! The client module is organized into specialized sub-modules:
+//!
+//! ```text
+//! client/
+//! ├── mod.rs        → Re-exports and documentation
+//! ├── engine.rs     → Main orchestrator
+//! ├── builder.rs    → Builder pattern
+//! ├── types.rs      → Public API types
+//! ├── context.rs    → Request context and configuration
+//! ├── session.rs    → Session management
+//! ├── indexer.rs    → Document indexing operations
+//! ├── retriever.rs  → Query and retrieval operations
+//! ├── workspace.rs  → Workspace CRUD operations
+//! └── events.rs     → Event system and callbacks
+//! ```
 //!
 //! # Quick Start
 //!
@@ -15,11 +34,7 @@
 //! # #[tokio::main]
 //! # async fn main() -> vectorless::domain::Result<()> {
 //! // Create a client with default settings
-//! let client = Engine::new()?;
-//!
-//! // Or use the builder for custom configuration
 //! let client = EngineBuilder::new()
-//!     .with_api_key("your-api-key")
 //!     .with_workspace("./my_workspace")
 //!     .build()?;
 //!
@@ -29,6 +44,10 @@
 //! // Get document structure
 //! let structure = client.get_structure(&doc_id)?;
 //!
+//! // Query the document
+//! let result = client.query(&doc_id, "What is this?").await?;
+//! println!("{}", result.content);
+//!
 //! // List all documents
 //! for doc in client.list_documents() {
 //!     println!("{}: {}", doc.id, doc.name);
@@ -37,19 +56,117 @@
 //! # }
 //! ```
 //!
+//! # Session-Based Operations
+//!
+//! For multi-document operations, use sessions:
+//!
+//! ```rust,no_run
+//! # use vectorless::client::{Engine, EngineBuilder};
+//! # #[tokio::main]
+//! # async fn main() -> vectorless::domain::Result<()> {
+//! let client = EngineBuilder::new()
+//!     .with_workspace("./workspace")
+//!     .build()?;
+//!
+//! let session = client.session();
+//!
+//! // Index multiple documents
+//! let doc1 = session.index("./doc1.md").await?;
+//! let doc2 = session.index("./doc2.md").await?;
+//!
+//! // Query across all documents
+//! let results = session.query_all("What is the architecture?").await?;
+//! # Ok(())
+//! # }
+//! ```
+//!
+//! # Events and Progress
+//!
+//! Monitor operation progress with events:
+//!
+//! ```rust,no_run
+//! # use vectorless::client::{Engine, EngineBuilder, EventEmitter, events::IndexEvent};
+//! # #[tokio::main]
+//! # async fn main() -> vectorless::domain::Result<()> {
+//! let events = EventEmitter::new()
+//!     .on_index(|e| match e {
+//!         IndexEvent::Complete { doc_id } => println!("Indexed: {}", doc_id),
+//!         _ => {}
+//!     });
+//!
+//! let client = EngineBuilder::new()
+//!     .with_events(events)
+//!     .build()?;
+//! # Ok(())
+//! # }
+//! ```
+//!
 //! # Features
 //!
 //! - **Document Indexing** — Parse and index Markdown, PDF, and text files
 //! - **Tree-Based Structure** — Documents organized as hierarchical trees
 //! - **Workspace Persistence** — Save and load indexed documents
-//! - **Builder Pattern** — Flexible client configuration
+//! - **Session Management** — Multi-document operations with caching
+//! - **Event System** — Progress callbacks and monitoring
 
 mod builder;
+mod context;
 mod engine;
+pub mod events;
+mod indexer;
+mod retriever;
+mod session;
 mod types;
+mod workspace;
 
-// Re-export main types
-pub use types::{DocumentInfo, IndexMode, IndexOptions, IndexedDocument, PageContent, QueryResult};
+// ============================================================
+// Main Types
+// ============================================================
 
-pub use builder::{BuildError, EngineBuilder};
 pub use engine::Engine;
+pub use builder::{BuildError, EngineBuilder};
+
+// ============================================================
+// Sub-Clients
+// ============================================================
+
+pub use indexer::IndexerClient;
+pub use retriever::RetrieverClient;
+pub use workspace::WorkspaceClient;
+pub use session::Session;
+
+// ============================================================
+// Context and Events
+// ============================================================
+
+pub use context::{ClientContext, FeatureFlags, RequestContextConfig};
+pub use events::{
+    EventEmitter, Event, EventHandler, AsyncEventHandler,
+    IndexEvent, QueryEvent, WorkspaceEvent,
+};
+
+// ============================================================
+// Types
+// ============================================================
+
+pub use types::{
+    // Document types
+    IndexedDocument, PageContent,
+    // Index types
+    IndexMode, IndexOptions,
+    // Query types
+    QueryResult,
+    // Document info
+    DocumentInfo,
+    // Error types
+    ClientError,
+};
+
+// ============================================================
+// Sub-Client Types
+// ============================================================
+
+pub use indexer::{IndexerConfig, ValidationResult};
+pub use retriever::{RetrieverClientConfig, NodeContext};
+pub use workspace::{WorkspaceClientConfig, WorkspaceStats};
+pub use session::{SessionConfig, SessionStats, EvictionPolicy, PreloadStrategy};
diff --git a/src/client/retriever.rs b/src/client/retriever.rs
new file mode 100644
index 00000000..7f0099ca
--- /dev/null
+++ b/src/client/retriever.rs
@@ -0,0 +1,408 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Document retrieval client.
+//!
+//! This module provides query and retrieval operations for document content.
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! let retriever = RetrieverClient::new(pipeline_retriever, config);
+//!
+//! let result = retriever
+//!     .query(&tree, "What is this?", &RetrieveOptions::default())
+//!     .await?;
+//!
+//! println!("Found {} results", result.node_ids.len());
+//! ```
+
+use std::sync::Arc;
+
+use tracing::info;
+
+use crate::config::Config;
+use crate::domain::{DocumentTree, Error, NodeId, Result};
+use crate::retrieval::content::ContentAggregatorConfig;
+use crate::retrieval::{
+    QueryComplexity, RetrieveOptions, RetrieveResponse, RetrievalResult, Retriever, SufficiencyLevel,
+};
+
+use super::context::ClientContext;
+use super::events::{EventEmitter, QueryEvent};
+use super::types::QueryResult;
+
+/// Document retrieval client.
+///
+/// Provides operations for querying document content.
+pub struct RetrieverClient {
+    /// Pipeline retriever.
+    retriever: Arc<crate::retrieval::PipelineRetriever>,
+
+    /// Configuration reference.
+    config: Arc<Config>,
+
+    /// Event emitter.
+    events: EventEmitter,
+
+    /// Default retrieval options.
+    default_options: RetrieveOptions,
+}
+
+/// Retriever configuration.
+#[derive(Debug, Clone)]
+pub struct RetrieverClientConfig {
+    /// Default top_k for retrieval.
+    pub default_top_k: usize,
+
+    /// Default token budget.
+    pub default_token_budget: usize,
+
+    /// Content aggregator config.
+    pub content_config: Option<ContentAggregatorConfig>,
+
+    /// Enable result caching.
+    pub enable_cache: bool,
+}
+
+impl Default for RetrieverClientConfig {
+    fn default() -> Self {
+        Self {
+            default_top_k: 5,
+            default_token_budget: 4000,
+            content_config: None,
+            enable_cache: true,
+        }
+    }
+}
+
+impl RetrieverClient {
+    /// Create a new retriever client.
+    pub fn new(retriever: crate::retrieval::PipelineRetriever, config: Arc<Config>) -> Self {
+        Self {
+            retriever: Arc::new(retriever),
+            config,
+            events: EventEmitter::new(),
+            default_options: RetrieveOptions::default(),
+        }
+    }
+
+    /// Create with event emitter.
+    pub fn with_events(mut self, events: EventEmitter) -> Self {
+        self.events = events;
+        self
+    }
+
+    /// Create with configuration.
+    pub fn with_config(mut self, config: RetrieverClientConfig) -> Self {
+        self.default_options = RetrieveOptions::new()
+            .with_top_k(config.default_top_k)
+            .with_max_tokens(config.default_token_budget)
+            .with_enable_cache(config.enable_cache);
+        self
+    }
+
+    /// Create from existing retriever Arc.
+    pub(crate) fn from_arc(
+        retriever: Arc<crate::retrieval::PipelineRetriever>,
+        config: Arc<Config>,
+        events: EventEmitter,
+    ) -> Self {
+        Self {
+            retriever,
+            config,
+            events,
+            default_options: RetrieveOptions::default(),
+        }
+    }
+
+    /// Query a document tree.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - The retrieval pipeline fails
+    pub async fn query(
+        &self,
+        tree: &DocumentTree,
+        question: &str,
+        options: &RetrieveOptions,
+    ) -> Result<QueryResult> {
+        self.query_with_context(tree, question, options, &ClientContext::new()).await
+    }
+
+    /// Query with request context.
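+    ///
+    /// Per-request overrides on the context take precedence over `options`.
+    /// A minimal sketch (assumes `ClientContext`'s public fields as declared
+    /// in `context.rs`):
+    ///
+    /// ```rust,ignore
+    /// let mut ctx = ClientContext::new();
+    /// ctx.config.top_k = Some(10);
+    /// let result = retriever
+    ///     .query_with_context(&tree, "What is X?", &options, &ctx)
+    ///     .await?;
+    /// ```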
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - The retrieval pipeline fails
+    /// - The request has timed out
+    pub async fn query_with_context(
+        &self,
+        tree: &DocumentTree,
+        question: &str,
+        options: &RetrieveOptions,
+        ctx: &ClientContext,
+    ) -> Result<QueryResult> {
+        // Check timeout
+        if ctx.is_timed_out() {
+            return Err(Error::Other("Request timed out".to_string()));
+        }
+
+        self.events.emit_query(QueryEvent::Started {
+            query: question.to_string(),
+        });
+
+        info!("Querying: {:?}", question);
+
+        // Apply context overrides
+        let mut options = options.clone();
+        if let Some(top_k) = ctx.config.top_k {
+            options.top_k = top_k;
+        }
+        if let Some(token_budget) = ctx.config.token_budget {
+            options.max_tokens = token_budget;
+        }
+
+        // Execute retrieval
+        let response = self.retriever
+            .retrieve(tree, question, &options)
+            .await
+            .map_err(|e| Error::Retrieval(e.to_string()))?;
+
+        // Build result
+        let result = self.build_query_result(&response);
+
+        self.events.emit_query(QueryEvent::Complete {
+            total_results: result.node_ids.len(),
+            confidence: result.score,
+        });
+
+        Ok(result)
+    }
+
+    /// Build QueryResult from RetrieveResponse.
+    fn build_query_result(&self, response: &RetrieveResponse) -> QueryResult {
+        // Extract node IDs
+        let node_ids: Vec<String> = response
+            .results
+            .iter()
+            .filter_map(|r| r.node_id.clone())
+            .collect();
+
+        // Build content
+        let content_parts: Vec<String> = response
+            .results
+            .iter()
+            .map(|r| {
+                let mut parts = vec![format!("## {}", r.title)];
+                if let Some(ref content) = r.content {
+                    parts.push(content.clone());
+                }
+                parts.join("\n\n")
+            })
+            .collect();
+
+        let content = if content_parts.is_empty() {
+            response.content.clone()
+        } else {
+            content_parts.join("\n\n---\n\n")
+        };
+
+        QueryResult {
+            doc_id: String::new(), // Will be set by caller
+            node_ids,
+            content,
+            score: response.confidence,
+        }
+    }
+
+    /// Get similar nodes to a given node.
+    ///
+    /// Uses tree structure and content to find similar nodes.
+    pub fn find_similar(
+        &self,
+        tree: &DocumentTree,
+        node_id: NodeId,
+        top_k: usize,
+    ) -> Result<Vec<RetrievalResult>> {
+        let mut results = Vec::new();
+
+        // Get the target node's content for comparison
+        let target_content = tree
+            .get(node_id)
+            .map(|n| n.content.clone())
+            .unwrap_or_default();
+
+        if target_content.is_empty() {
+            return Ok(results);
+        }
+
+        // Extract keywords from target content
+        let target_keywords = self.extract_keywords(&target_content);
+
+        // Search all nodes for similarity
+        let root = tree.root();
+        let mut stack = vec![root];
+
+        while let Some(current_id) = stack.pop() {
+            if current_id == node_id {
+                // Skip the target node itself
+                stack.extend(tree.children(current_id));
+                continue;
+            }
+
+            if let Some(node) = tree.get(current_id) {
+                let node_keywords = self.extract_keywords(&node.content);
+                let similarity = self.calculate_similarity(&target_keywords, &node_keywords);
+
+                if similarity > 0.3 {
+                    results.push(RetrievalResult::new(&node.title)
+                        .with_node_id(format!("{:?}", current_id))
+                        .with_content(node.content.clone())
+                        .with_score(similarity)
+                        .with_depth(tree.depth(current_id)));
+                }
+            }
+
+            stack.extend(tree.children(current_id));
+        }
+
+        // Sort by score and take top_k
+        results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal));
+        results.truncate(top_k);
+
+        Ok(results)
+    }
+
+    /// Extract keywords from content.
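+    ///
+    /// Keywords are lowercased whitespace tokens longer than three characters,
+    /// capped at twenty per node; `calculate_similarity` then compares two
+    /// keyword sets by Jaccard overlap (intersection over union). For example,
+    /// `["beam", "search", "tree"]` vs `["tree", "search", "depth"]` scores
+    /// 2/4 = 0.5.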
+    fn extract_keywords(&self, content: &str) -> Vec<String> {
+        content
+            .to_lowercase()
+            .split_whitespace()
+            .filter(|w| w.len() > 3)
+            .take(20)
+            .map(|s| s.to_string())
+            .collect()
+    }
+
+    /// Calculate similarity between keyword sets.
+    fn calculate_similarity(&self, set1: &[String], set2: &[String]) -> f32 {
+        if set1.is_empty() || set2.is_empty() {
+            return 0.0;
+        }
+
+        let set1_set: std::collections::HashSet<_> = set1.iter().collect();
+        let set2_set: std::collections::HashSet<_> = set2.iter().collect();
+
+        let intersection = set1_set.intersection(&set2_set).count();
+        let union = set1_set.union(&set2_set).count();
+
+        intersection as f32 / union as f32
+    }
+
+    /// Get node context (ancestors and siblings).
+    ///
+    /// Returns the node's ancestors up to the specified depth,
+    /// along with sibling nodes at each level.
+    pub fn get_node_context(
+        &self,
+        tree: &DocumentTree,
+        node_id: NodeId,
+        ancestor_depth: usize,
+    ) -> Result<NodeContext> {
+        let mut ancestors = Vec::new();
+        let mut siblings = Vec::new();
+
+        // Get ancestors
+        let mut current_id = Some(node_id);
+        let mut depth = 0;
+
+        while let Some(id) = current_id {
+            if depth >= ancestor_depth {
+                break;
+            }
+
+            if let Some(node) = tree.get(id) {
+                ancestors.push(RetrievalResult::new(&node.title)
+                    .with_node_id(format!("{:?}", id))
+                    .with_depth(tree.depth(id)));
+
+                // Get siblings at this level
+                if let Some(parent_id) = tree.parent(id) {
+                    for child_id in tree.children(parent_id) {
+                        if child_id != id {
+                            if let Some(sibling) = tree.get(child_id) {
+                                siblings.push(RetrievalResult::new(&sibling.title)
+                                    .with_node_id(format!("{:?}", child_id))
+                                    .with_depth(tree.depth(child_id)));
+                            }
+                        }
+                    }
+                }
+            }
+
+            current_id = tree.parent(id);
+            depth += 1;
+        }
+
+        // Get the target node
+        let target = tree
+            .get(node_id)
+            .map(|n| {
+                RetrievalResult::new(&n.title)
+                    .with_node_id(format!("{:?}", node_id))
+                    .with_content(n.content.clone())
+                    .with_depth(tree.depth(node_id))
+            });
+
+        Ok(NodeContext {
+            target,
+            ancestors,
+            siblings,
+        })
+    }
+
+    /// Get the underlying retriever Arc.
+    pub(crate) fn inner(&self) -> Arc<crate::retrieval::PipelineRetriever> {
+        Arc::clone(&self.retriever)
+    }
+}
+
+impl Clone for RetrieverClient {
+    fn clone(&self) -> Self {
+        Self {
+            retriever: Arc::clone(&self.retriever),
+            config: Arc::clone(&self.config),
+            events: self.events.clone(),
+            default_options: self.default_options.clone(),
+        }
+    }
+}
+
+/// Node context information.
+#[derive(Debug, Clone)]
+pub struct NodeContext {
+    /// The target node.
+    pub target: Option<RetrievalResult>,
+
+    /// Ancestor nodes (ordered from parent to root).
+    pub ancestors: Vec<RetrievalResult>,
+
+    /// Sibling nodes at each ancestor level.
+    pub siblings: Vec<RetrievalResult>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_retriever_client_creation() {
+        let config = Arc::new(Config::default());
+        let retriever = crate::retrieval::PipelineRetriever::new();
+        let client = RetrieverClient::new(retriever, config);
+        assert!(client.default_options.top_k > 0);
+    }
+}
diff --git a/src/client/session.rs b/src/client/session.rs
new file mode 100644
index 00000000..1b5d55ef
--- /dev/null
+++ b/src/client/session.rs
@@ -0,0 +1,493 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Session management for multi-document operations.
+//!
+//! This module provides session-based document management with
+//! automatic caching and cross-document querying.
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! let session = client.session();
+//!
+//! // Index multiple documents
+//! let doc1 = session.index("./doc1.md").await?;
+//! let doc2 = session.index("./doc2.md").await?;
+//!
+//! // Query across all documents
+//! let results = session.query_all("What is X?").await?;
+//!
+//! // Query single document (uses cached tree)
+//! let result = session.query(&doc1, "Summary?").await?;
+//! ```
+
+use std::cell::Cell;
+use std::collections::HashMap;
+use std::path::Path;
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+
+use tracing::info;
+use uuid::Uuid;
+
+use crate::domain::{DocumentTree, Error, Result};
+use crate::retrieval::RetrieveOptions;
+use crate::storage::PersistedDocument;
+
+use super::context::ClientContext;
+use super::events::EventEmitter;
+use super::indexer::IndexerClient;
+use super::retriever::RetrieverClient;
+use super::types::{DocumentInfo, IndexOptions, QueryResult};
+use super::workspace::WorkspaceClient;
+
+/// Session for managing multiple documents.
+///
+/// Provides automatic caching of document trees and cross-document operations.
+pub struct Session {
+    /// Session ID.
+    pub id: Uuid,
+
+    /// Session configuration.
+    config: SessionConfig,
+
+    /// Document contexts (cached).
+    documents: HashMap<String, DocumentContext>,
+
+    /// Indexer client.
+    indexer: IndexerClient,
+
+    /// Retriever client.
+    retriever: RetrieverClient,
+
+    /// Workspace client.
+    workspace: WorkspaceClient,
+
+    /// Event emitter.
+    events: EventEmitter,
+
+    /// Session statistics.
+    stats: SessionStats,
+
+    /// Created at timestamp.
+    created_at: Instant,
+}
+
+/// Document context within a session.
+#[derive(Debug, Clone)]
+struct DocumentContext {
+    /// Document ID.
+    doc_id: String,
+
+    /// Cached document tree.
+    tree: Option<Arc<DocumentTree>>,
+
+    /// Document metadata.
+    meta: DocumentInfo,
+
+    /// Access count.
+    access_count: usize,
+
+    /// Last access time.
+    last_accessed: Instant,
+}
+
+/// Session configuration.
+#[derive(Debug, Clone)]
+pub struct SessionConfig {
+    /// Maximum documents to cache in memory.
+    pub max_cached_documents: usize,
+
+    /// Cache eviction policy.
+    pub eviction_policy: EvictionPolicy,
+
+    /// Preload strategy when indexing.
+    pub preload_strategy: PreloadStrategy,
+}
+
+impl Default for SessionConfig {
+    fn default() -> Self {
+        Self {
+            max_cached_documents: 100,
+            eviction_policy: EvictionPolicy::Lru,
+            preload_strategy: PreloadStrategy::Lazy,
+        }
+    }
+}
+
+/// Cache eviction policy.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum EvictionPolicy {
+    /// Least recently used.
+    Lru,
+    /// First in, first out.
+    Fifo,
+    /// No eviction (until session closes).
+    None,
+}
+
+/// Document preload strategy.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum PreloadStrategy {
+    /// Load trees on demand.
+    Lazy,
+    /// Load trees immediately when indexing.
+    Eager,
+}
+
+/// Session statistics.
+#[derive(Debug, Default)]
+pub struct SessionStats {
+    /// Total documents in session.
+    pub document_count: Cell<usize>,
+
+    /// Total queries made.
+    pub query_count: Cell<usize>,
+
+    /// Cache hits.
+    pub cache_hits: Cell<usize>,
+
+    /// Cache misses.
+    pub cache_misses: Cell<usize>,
+
+    /// Total query time (in microseconds).
+    total_query_time_us: Cell<u64>,
+}
+
+impl SessionStats {
+    /// Get the cache hit rate.
+    pub fn cache_hit_rate(&self) -> f32 {
+        let total = self.cache_hits.get() + self.cache_misses.get();
+        if total == 0 {
+            0.0
+        } else {
+            self.cache_hits.get() as f32 / total as f32
+        }
+    }
+
+    /// Get the total query time.
+    pub fn total_query_time(&self) -> Duration {
+        Duration::from_micros(self.total_query_time_us.get())
+    }
+
+    /// Get the average query time.
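+    ///
+    /// Returns `None` before any query has completed. Sketch:
+    ///
+    /// ```rust,ignore
+    /// if let Some(avg) = session.stats().avg_query_time() {
+    ///     println!("average query latency: {:?}", avg);
+    /// }
+    /// ```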
+    pub fn avg_query_time(&self) -> Option<Duration> {
+        let count = self.query_count.get();
+        if count == 0 {
+            None
+        } else {
+            Some(self.total_query_time() / count as u32)
+        }
+    }
+
+    /// Increment query count.
+    fn increment_query_count(&self) {
+        self.query_count.set(self.query_count.get() + 1);
+    }
+
+    /// Add query time.
+    fn add_query_time(&self, duration: Duration) {
+        self.total_query_time_us.set(
+            self.total_query_time_us.get() + duration.as_micros() as u64
+        );
+    }
+
+    /// Increment cache hits.
+    fn increment_cache_hits(&self) {
+        self.cache_hits.set(self.cache_hits.get() + 1);
+    }
+
+    /// Increment cache misses.
+    fn increment_cache_misses(&self) {
+        self.cache_misses.set(self.cache_misses.get() + 1);
+    }
+}
+
+impl Clone for SessionStats {
+    fn clone(&self) -> Self {
+        Self {
+            document_count: Cell::new(self.document_count.get()),
+            query_count: Cell::new(self.query_count.get()),
+            cache_hits: Cell::new(self.cache_hits.get()),
+            cache_misses: Cell::new(self.cache_misses.get()),
+            total_query_time_us: Cell::new(self.total_query_time_us.get()),
+        }
+    }
+}
+
+impl Session {
+    /// Create a new session.
+    pub(crate) fn new(
+        indexer: IndexerClient,
+        retriever: RetrieverClient,
+        workspace: WorkspaceClient,
+        events: EventEmitter,
+    ) -> Self {
+        Self {
+            id: Uuid::new_v4(),
+            config: SessionConfig::default(),
+            documents: HashMap::new(),
+            indexer,
+            retriever,
+            workspace,
+            events,
+            stats: SessionStats::default(),
+            created_at: Instant::now(),
+        }
+    }
+
+    /// Create with configuration.
+    pub fn with_config(mut self, config: SessionConfig) -> Self {
+        self.config = config;
+        self
+    }
+
+    /// Get the session ID.
+    pub fn id(&self) -> Uuid {
+        self.id
+    }
+
+    /// Get session age.
+    pub fn age(&self) -> Duration {
+        Instant::now().duration_since(self.created_at)
+    }
+
+    // ============================================================
+    // Document Indexing
+    // ============================================================
+
+    /// Index a document into this session.
+    ///
+    /// The document is indexed, saved to workspace, and cached in this session.
+    pub async fn index(&self, path: impl AsRef<Path>) -> Result<String> {
+        self.index_with_options(path, IndexOptions::default()).await
+    }
+
+    /// Index a document with options.
+    pub async fn index_with_options(
+        &self,
+        path: impl AsRef<Path>,
+        options: IndexOptions,
+    ) -> Result<String> {
+        // Index the document
+        let doc = self.indexer.index_with_options(path, options).await?;
+
+        // Save to workspace
+        let persisted = self.indexer.to_persisted(doc);
+        self.workspace.save(&persisted)?;
+
+        // Session-level caching still requires interior mutability; see cache_document().
+        let doc_id = persisted.meta.id.clone();
+
+        info!("Session {}: indexed document {}", self.id, doc_id);
+
+        Ok(doc_id)
+    }
+
+    // ============================================================
+    // Document Querying
+    // ============================================================
+
+    /// Query a document within this session.
+    ///
+    /// Uses the cached tree if available, otherwise loads from workspace.
+    pub async fn query(&self, doc_id: &str, question: &str) -> Result<QueryResult> {
+        self.query_with_options(doc_id, question, RetrieveOptions::default()).await
+    }
+
+    /// Query a document with options.
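+    ///
+    /// A sketch that narrows the retrieval budget for one call, using the
+    /// `RetrieveOptions` builder methods seen elsewhere in this crate:
+    ///
+    /// ```rust,ignore
+    /// let opts = RetrieveOptions::new().with_top_k(3).with_max_tokens(1500);
+    /// let result = session.query_with_options(&doc_id, "What changed?", opts).await?;
+    /// ```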
+    pub async fn query_with_options(
+        &self,
+        doc_id: &str,
+        question: &str,
+        options: RetrieveOptions,
+    ) -> Result<QueryResult> {
+        let start = Instant::now();
+
+        // Get the document tree
+        let tree = self.get_tree(doc_id).await?;
+
+        // Query
+        let mut result = self.retriever.query(&tree, question, &options).await?;
+        result.doc_id = doc_id.to_string();
+
+        // Update stats
+        self.stats.increment_query_count();
+        self.stats.add_query_time(start.elapsed());
+
+        Ok(result)
+    }
+
+    /// Query across all documents in this session.
+    ///
+    /// Searches each document and merges results.
+    pub async fn query_all(&self, question: &str) -> Result<Vec<QueryResult>> {
+        self.query_all_with_options(question, RetrieveOptions::default()).await
+    }
+
+    /// Query across all documents with options.
+    pub async fn query_all_with_options(
+        &self,
+        question: &str,
+        options: RetrieveOptions,
+    ) -> Result<Vec<QueryResult>> {
+        let doc_ids: Vec<String> = self.documents.keys().cloned().collect();
+
+        if doc_ids.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        let mut results = Vec::new();
+
+        for doc_id in &doc_ids {
+            match self.query_with_options(doc_id, question, options.clone()).await {
+                Ok(result) => {
+                    if !result.node_ids.is_empty() {
+                        results.push(result);
+                    }
+                }
+                Err(e) => {
+                    info!("Query failed for {}: {}", doc_id, e);
+                }
+            }
+        }
+
+        // Sort by score descending
+        results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal));
+
+        Ok(results)
+    }
+
+    // ============================================================
+    // Document Management
+    // ============================================================
+
+    /// Get list of documents in this session.
+    pub fn list_documents(&self) -> Vec<DocumentInfo> {
+        self.documents.values().map(|ctx| ctx.meta.clone()).collect()
+    }
+
+    /// Get a document tree (from cache or workspace).
+    pub async fn get_tree(&self, doc_id: &str) -> Result<DocumentTree> {
+        // Check cache first
+        if let Some(tree) = self.get_cached_tree(doc_id) {
+            self.stats.increment_cache_hits();
+            return Ok((*tree).clone());
+        }
+
+        self.stats.increment_cache_misses();
+
+        // Load from workspace
+        let doc = self.workspace.load(doc_id)?
+            .ok_or_else(|| Error::DocumentNotFound(format!("Document not found: {}", doc_id)))?;
+
+        let tree = doc.tree;
+
+        // Cache for future use
+        self.cache_tree(doc_id, &tree);
+
+        Ok(tree)
+    }
+
+    /// Preload documents into the session cache.
+    ///
+    /// Useful for warming up the cache before querying.
+    pub async fn preload(&self, doc_ids: &[&str]) -> Result<usize> {
+        let mut loaded = 0;
+
+        for doc_id in doc_ids {
+            if self.get_cached_tree(doc_id).is_none() {
+                if let Ok(tree) = self.get_tree(doc_id).await {
+                    self.cache_tree(doc_id, &tree);
+                    loaded += 1;
+                }
+            }
+        }
+
+        info!("Session {}: preloaded {} documents", self.id, loaded);
+        Ok(loaded)
+    }
+
+    /// Remove a document from the session.
+    pub fn remove_document(&self, doc_id: &str) -> bool {
+        // Note: This would need interior mutability for full implementation
+        false
+    }
+
+    /// Clear all documents from the session cache.
+    pub fn clear_cache(&self) {
+        // Note: This would need interior mutability for full implementation
+    }
+
+    // ============================================================
+    // Statistics
+    // ============================================================
+
+    /// Get session statistics.
+    pub fn stats(&self) -> SessionStats {
+        self.stats.clone()
+    }
+
+    /// Get the number of cached documents.
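+    ///
+    /// Counts only documents whose tree is materialized in memory (note that
+    /// `cache_tree` is still a placeholder above). A sketch of warming the
+    /// cache first:
+    ///
+    /// ```rust,ignore
+    /// session.preload(&[&doc1, &doc2]).await?;
+    /// println!("{} trees cached", session.cached_count());
+    /// ```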
+    pub fn cached_count(&self) -> usize {
+        self.documents.values().filter(|d| d.tree.is_some()).count()
+    }
+
+    // ============================================================
+    // Internal Methods
+    // ============================================================
+
+    /// Cache a document in this session.
+    fn cache_document(&self, doc: crate::client::types::IndexedDocument) {
+        // Note: This would need interior mutability for full implementation
+        // For now, this is a placeholder
+    }
+
+    /// Get a cached tree.
+    fn get_cached_tree(&self, doc_id: &str) -> Option<Arc<DocumentTree>> {
+        self.documents.get(doc_id).and_then(|ctx| ctx.tree.clone())
+    }
+
+    /// Cache a tree.
+    fn cache_tree(&self, doc_id: &str, tree: &DocumentTree) {
+        // Note: This would need interior mutability for full implementation
+    }
+}
+
+impl Clone for Session {
+    fn clone(&self) -> Self {
+        Self {
+            id: self.id,
+            config: self.config.clone(),
+            documents: self.documents.clone(),
+            indexer: self.indexer.clone(),
+            retriever: self.retriever.clone(),
+            workspace: self.workspace.clone(),
+            events: self.events.clone(),
+            stats: self.stats.clone(),
+            created_at: self.created_at,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_session_config() {
+        let config = SessionConfig::default();
+        assert_eq!(config.max_cached_documents, 100);
+        assert_eq!(config.eviction_policy, EvictionPolicy::Lru);
+    }
+
+    #[test]
+    fn test_session_stats() {
+        let stats = SessionStats::default();
+        stats.cache_hits.set(8);
+        stats.cache_misses.set(2);
+
+        assert!((stats.cache_hit_rate() - 0.8).abs() < 0.01);
+    }
+}
diff --git a/src/client/types.rs b/src/client/types.rs
index e0e68a3a..40816257 100644
--- a/src/client/types.rs
+++ b/src/client/types.rs
@@ -1,7 +1,9 @@
 // Copyright (c) 2026 vectorless developers
 // SPDX-License-Identifier: Apache-2.0
 
-//! Client type definitions.
+//! Public API types for the client module.
+//!
+//! This module contains all types exposed in the public API.
 
 use serde::{Deserialize, Serialize};
 use std::path::PathBuf;
@@ -9,6 +11,10 @@
 use crate::domain::DocumentTree;
 use crate::parser::DocumentFormat;
 
+// ============================================================
+// Document Types
+// ============================================================
+
 /// An indexed document with its tree structure and metadata.
 #[derive(Debug, Clone)]
 pub struct IndexedDocument {
@@ -116,6 +122,10 @@
     pub content: String,
 }
 
+// ============================================================
+// Index Types
+// ============================================================
+
 /// Document indexing mode.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum IndexMode {
@@ -164,7 +174,7 @@
     fn default() -> Self {
         Self {
             mode: IndexMode::Auto,
-            generate_summaries: false, // Disabled by default, requires API key
+            generate_summaries: false,
             include_text: true,
             generate_ids: true,
             generate_description: false,
@@ -189,8 +199,18 @@
         self.generate_description = true;
         self
     }
+
+    /// Set the indexing mode.
+    pub fn with_mode(mut self, mode: IndexMode) -> Self {
+        self.mode = mode;
+        self
+    }
 }
 
+// ============================================================
+// Query Types
+// ============================================================
+
 /// Result of a document query.
 #[derive(Debug, Clone)]
 pub struct QueryResult {
@@ -207,6 +227,32 @@
     pub score: f32,
 }
 
+impl QueryResult {
+    /// Create a new query result.
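+    ///
+    /// The result starts empty; callers fill in `node_ids`, `content`, and
+    /// `score`. Sketch:
+    ///
+    /// ```rust,ignore
+    /// let mut result = QueryResult::new("doc-1");
+    /// result.score = 0.9;
+    /// assert!(result.is_empty());
+    /// ```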
+    pub fn new(doc_id: impl Into<String>) -> Self {
+        Self {
+            doc_id: doc_id.into(),
+            node_ids: Vec::new(),
+            content: String::new(),
+            score: 0.0,
+        }
+    }
+
+    /// Check if the result is empty.
+    pub fn is_empty(&self) -> bool {
+        self.node_ids.is_empty()
+    }
+
+    /// Get the number of results.
+    pub fn len(&self) -> usize {
+        self.node_ids.len()
+    }
+}
+
+// ============================================================
+// Document Info Types
+// ============================================================
+
 /// Document info for listing.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct DocumentInfo {
@@ -228,3 +274,89 @@
     /// Line count (for text files).
     pub line_count: Option<usize>,
 }
+
+impl DocumentInfo {
+    /// Create a new document info.
+    pub fn new(id: impl Into<String>, name: impl Into<String>) -> Self {
+        Self {
+            id: id.into(),
+            name: name.into(),
+            format: String::new(),
+            description: None,
+            page_count: None,
+            line_count: None,
+        }
+    }
+
+    /// Set the format.
+    pub fn with_format(mut self, format: impl Into<String>) -> Self {
+        self.format = format.into();
+        self
+    }
+}
+
+// ============================================================
+// Error Types
+// ============================================================
+
+/// Client error types.
+#[derive(Debug, Clone, thiserror::Error)]
+pub enum ClientError {
+    /// Document not found.
+    #[error("Document not found: {0}")]
+    NotFound(String),
+
+    /// Invalid operation.
+    #[error("Invalid operation: {0}")]
+    InvalidOperation(String),
+
+    /// Configuration error.
+    #[error("Configuration error: {0}")]
+    Config(String),
+
+    /// Timeout error.
+    #[error("Operation timed out")]
+    Timeout,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_indexed_document() {
+        let doc = IndexedDocument::new("doc-1", DocumentFormat::Markdown)
+            .with_name("Test Document")
+            .with_description("A test document");
+
+        assert_eq!(doc.id, "doc-1");
+        assert_eq!(doc.name, "Test Document");
+        assert!(doc.tree.is_none());
+    }
+
+    #[test]
+    fn test_index_options() {
+        let options = IndexOptions::new()
+            .with_summaries()
+            .with_mode(IndexMode::Pdf);
+
+        assert!(options.generate_summaries);
+        assert_eq!(options.mode, IndexMode::Pdf);
+    }
+
+    #[test]
+    fn test_query_result() {
+        let result = QueryResult::new("doc-1");
+        assert!(result.is_empty());
+        assert_eq!(result.len(), 0);
+    }
+
+    #[test]
+    fn test_document_info() {
+        let info = DocumentInfo::new("doc-1", "Test")
+            .with_format("markdown");
+
+        assert_eq!(info.id, "doc-1");
+        assert_eq!(info.format, "markdown");
+    }
+}
diff --git a/src/client/workspace.rs b/src/client/workspace.rs
new file mode 100644
index 00000000..731a5e71
--- /dev/null
+++ b/src/client/workspace.rs
@@ -0,0 +1,372 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Workspace management client.
+//!
+//! This module provides CRUD operations for document persistence
+//! through the workspace abstraction.
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! let workspace = WorkspaceClient::new(workspace_storage);
+//!
+//! // Save a document
+//! workspace.save(&doc)?;
+//!
+//! // Load a document
+//! let doc = workspace.load("doc-id")?;
+//!
+//! // List all documents
+//! for doc in workspace.list()? {
+//!     println!("{}: {}", doc.id, doc.name);
+//! }
+//! ```
+
+use std::sync::{Arc, RwLock};
+
+use tracing::{debug, info, warn};
+
+use crate::domain::{Error, Result};
+use crate::storage::{DocumentMetaEntry, PersistedDocument, Workspace};
+
+use super::events::{EventEmitter, WorkspaceEvent};
+use super::types::DocumentInfo;
+
+/// Workspace management client.
+///
+/// Provides thread-safe CRUD operations for document persistence.
+pub struct WorkspaceClient {
+    /// Workspace storage.
+    workspace: Arc<RwLock<Workspace>>,
+
+    /// Event emitter.
+    events: EventEmitter,
+
+    /// Configuration.
+    config: WorkspaceClientConfig,
+}
+
+/// Workspace client configuration.
+#[derive(Debug, Clone)]
+pub struct WorkspaceClientConfig {
+    /// Auto-save interval in seconds (None = disabled).
+    pub auto_save_interval: Option<u64>,
+
+    /// Enable verbose logging.
+    pub verbose: bool,
+}
+
+impl Default for WorkspaceClientConfig {
+    fn default() -> Self {
+        Self {
+            auto_save_interval: None,
+            verbose: false,
+        }
+    }
+}
+
+impl WorkspaceClient {
+    /// Create a new workspace client.
+    pub fn new(workspace: Workspace) -> Self {
+        Self {
+            workspace: Arc::new(RwLock::new(workspace)),
+            events: EventEmitter::new(),
+            config: WorkspaceClientConfig::default(),
+        }
+    }
+
+    /// Create with event emitter.
+    pub fn with_events(mut self, events: EventEmitter) -> Self {
+        self.events = events;
+        self
+    }
+
+    /// Create with configuration.
+    pub fn with_config(mut self, config: WorkspaceClientConfig) -> Self {
+        self.config = config;
+        self
+    }
+
+    /// Create from an existing workspace Arc.
+    pub(crate) fn from_arc(workspace: Arc<RwLock<Workspace>>, events: EventEmitter) -> Self {
+        Self {
+            workspace,
+            events,
+            config: WorkspaceClientConfig::default(),
+        }
+    }
+
+    /// Save a document to the workspace.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the workspace write fails.
+    pub fn save(&self, doc: &PersistedDocument) -> Result<()> {
+        let doc_id = doc.meta.id.clone();
+
+        {
+            let mut ws = self.workspace.write()
+                .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?;
+            ws.add(doc)?;
+        }
+
+        info!("Saved document: {}", doc_id);
+        self.events.emit_workspace(WorkspaceEvent::Saved { doc_id });
+
+        Ok(())
+    }
+
+    /// Load a document from the workspace.
+    ///
+    /// Returns `Ok(None)` if the document doesn't exist.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the workspace read fails.
+    pub fn load(&self, doc_id: &str) -> Result<Option<PersistedDocument>> {
+        let ws = self.workspace.read()
+            .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?;
+
+        if !ws.contains(doc_id) {
+            return Ok(None);
+        }
+
+        let doc = ws.load(doc_id)?;
+        let cache_hit = doc.is_some();
+
+        if doc.is_some() {
+            debug!("Loaded document: {} (cache={})", doc_id, cache_hit);
+        }
+
+        self.events.emit_workspace(WorkspaceEvent::Loaded {
+            doc_id: doc_id.to_string(),
+            cache_hit,
+        });
+
+        Ok(doc)
+    }
+
+    /// Remove a document from the workspace.
+    ///
+    /// Returns `Ok(true)` if the document was removed, `Ok(false)` if it didn't exist.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the workspace write fails.
+    pub fn remove(&self, doc_id: &str) -> Result<bool> {
+        let removed = {
+            let mut ws = self.workspace.write()
+                .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?;
+            ws.remove(doc_id)?
+        };
+
+        if removed {
+            info!("Removed document: {}", doc_id);
+            self.events.emit_workspace(WorkspaceEvent::Removed {
+                doc_id: doc_id.to_string(),
+            });
+        }
+
+        Ok(removed)
+    }
+
+    /// Check if a document exists in the workspace.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the workspace read fails.
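+    ///
+    /// A sketch:
+    ///
+    /// ```rust,ignore
+    /// if !workspace.exists("doc-1")? {
+    ///     println!("nothing to load");
+    /// }
+    /// ```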
+    pub fn exists(&self, doc_id: &str) -> Result<bool> {
+        let ws = self.workspace.read()
+            .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?;
+        Ok(ws.contains(doc_id))
+    }
+
+    /// List all documents in the workspace.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the workspace read fails.
+    pub fn list(&self) -> Result<Vec<DocumentInfo>> {
+        let ws = self.workspace.read()
+            .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?;
+
+        Ok(ws.list_documents()
+            .iter()
+            .filter_map(|id| ws.get_meta(id))
+            .map(|meta| DocumentInfo {
+                id: meta.id.clone(),
+                name: meta.doc_name.clone(),
+                format: meta.doc_type.clone(),
+                description: meta.doc_description.clone(),
+                page_count: meta.page_count,
+                line_count: meta.line_count,
+            })
+            .collect())
+    }
+
+    /// Get document metadata without loading the full document.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the workspace read fails.
+    pub fn get_meta(&self, doc_id: &str) -> Result<Option<DocumentMetaEntry>> {
+        let ws = self.workspace.read()
+            .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?;
+        Ok(ws.get_meta(doc_id).cloned())
+    }
+
+    /// Get document info by ID.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the workspace read fails.
+    pub fn get_document_info(&self, doc_id: &str) -> Result<Option<DocumentInfo>> {
+        let ws = self.workspace.read()
+            .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?;
+
+        Ok(ws.get_meta(doc_id).map(|meta| DocumentInfo {
+            id: meta.id.clone(),
+            name: meta.doc_name.clone(),
+            format: meta.doc_type.clone(),
+            description: meta.doc_description.clone(),
+            page_count: meta.page_count,
+            line_count: meta.line_count,
+        }))
+    }
+
+    /// Remove multiple documents from the workspace.
+    ///
+    /// Returns the number of documents successfully removed.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the workspace write fails.
+    pub fn batch_remove(&self, doc_ids: &[&str]) -> Result<usize> {
+        let mut removed = 0;
+
+        {
+            let mut ws = self.workspace.write()
+                .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?;
+
+            for doc_id in doc_ids {
+                if ws.remove(doc_id)? {
+                    removed += 1;
+                    self.events.emit_workspace(WorkspaceEvent::Removed {
+                        doc_id: doc_id.to_string(),
+                    });
+                }
+            }
+        }
+
+        if removed > 0 {
+            info!("Batch removed {} documents", removed);
+        }
+
+        Ok(removed)
+    }
+
+    /// Clear all documents from the workspace.
+    ///
+    /// Returns the number of documents removed.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the workspace write fails.
+    pub fn clear(&self) -> Result<usize> {
+        let doc_ids: Vec<String>;
+
+        {
+            let ws = self.workspace.read()
+                .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?;
+            doc_ids = ws.list_documents().iter().map(|s| s.to_string()).collect();
+        }
+
+        let count = doc_ids.len();
+
+        {
+            let mut ws = self.workspace.write()
+                .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?;
+
+            for doc_id in &doc_ids {
+                let _ = ws.remove(doc_id);
+            }
+        }
+
+        if count > 0 {
+            info!("Cleared workspace: {} documents removed", count);
+            self.events.emit_workspace(WorkspaceEvent::Cleared { count });
+        }
+
+        Ok(count)
+    }
+
+    /// Get workspace statistics.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the workspace read fails.
+    pub fn stats(&self) -> Result<WorkspaceStats> {
+        let ws = self.workspace.read()
+            .map_err(|_| Error::Other("Workspace lock poisoned".to_string()))?;
+
+        Ok(WorkspaceStats {
+            document_count: ws.len(),
+        })
+    }
+
+    /// Get the number of documents in the workspace.
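+    ///
+    /// Returns 0 when the inner lock is poisoned instead of propagating the
+    /// error, which keeps `len` and `is_empty` infallible.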
+    pub fn len(&self) -> usize {
+        self.workspace.read()
+            .map(|ws| ws.len())
+            .unwrap_or(0)
+    }
+
+    /// Check if the workspace is empty.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Get the underlying workspace Arc (for advanced use).
+    pub(crate) fn inner(&self) -> Arc<RwLock<Workspace>> {
+        Arc::clone(&self.workspace)
+    }
+}
+
+impl Clone for WorkspaceClient {
+    fn clone(&self) -> Self {
+        Self {
+            workspace: Arc::clone(&self.workspace),
+            events: self.events.clone(),
+            config: self.config.clone(),
+        }
+    }
+}
+
+/// Workspace statistics.
+#[derive(Debug, Clone)]
+pub struct WorkspaceStats {
+    /// Number of documents in the workspace.
+    pub document_count: usize,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_workspace_client_creation() {
+        let workspace = Workspace::open("./test_workspace").unwrap();
+        let client = WorkspaceClient::new(workspace);
+        assert!(client.is_empty());
+    }
+
+    #[test]
+    fn test_workspace_stats() {
+        let workspace = Workspace::open("./test_workspace").unwrap();
+        let client = WorkspaceClient::new(workspace);
+
+        let stats = client.stats().unwrap();
+        assert_eq!(stats.document_count, 0);
+    }
+}
diff --git a/src/config/docs.rs b/src/config/docs.rs
new file mode 100644
index 00000000..7e2330b9
--- /dev/null
+++ b/src/config/docs.rs
@@ -0,0 +1,307 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Configuration documentation generation.
+//!
+//! This module provides utilities for generating documentation
+//! from configuration types, including markdown reference and
+//! example TOML files.
+
+use super::types::Config;
+
+/// Configuration documentation generator.
+#[derive(Debug, Clone)]
+pub struct ConfigDocs {
+    config: Config,
+}
+
+impl ConfigDocs {
+    /// Create a new documentation generator.
+    pub fn new(config: Config) -> Self {
+        Self { config }
+    }
+
+    /// Create with default configuration.
+    pub fn with_defaults() -> Self {
+        Self::new(Config::default())
+    }
+
+    /// Generate markdown documentation for the configuration.
+    pub fn to_markdown(&self) -> String {
+        let mut md = String::new();
+
+        md.push_str("# Configuration Reference\n\n");
+        md.push_str("This document describes all configuration options for vectorless.\n\n");
+        md.push_str("## Configuration File\n\n");
+        md.push_str("Configuration is loaded from a TOML file. 
Default locations:\n"); + md.push_str("- `./vectorless.toml`\n"); + md.push_str("- `./config.toml`\n"); + md.push_str("- `./.vectorless.toml`\n\n"); + + // Indexer section + md.push_str("## `[indexer]`\n\n"); + md.push_str("Controls document indexing behavior.\n\n"); + md.push_str("| Option | Type | Default | Description |\n"); + md.push_str("|--------|------|---------|-------------|\n"); + self.add_row(&mut md, "subsection_threshold", "usize", "300", + "Word count threshold for splitting sections into subsections"); + self.add_row(&mut md, "max_segment_tokens", "usize", "3000", + "Maximum tokens to send in a single segmentation request"); + self.add_row(&mut md, "max_summary_tokens", "usize", "200", + "Maximum tokens for each summary"); + self.add_row(&mut md, "min_summary_tokens", "usize", "20", + "Minimum content tokens required to generate a summary"); + md.push_str("\n"); + + // Summary section + md.push_str("## `[summary]`\n\n"); + md.push_str("LLM configuration for summary generation.\n\n"); + md.push_str("| Option | Type | Default | Description |\n"); + md.push_str("|--------|------|---------|-------------|\n"); + self.add_row(&mut md, "model", "string", "gpt-4o-mini", "Model for summarization"); + self.add_row(&mut md, "endpoint", "string", "https://api.openai.com/v1", "API endpoint"); + self.add_row(&mut md, "api_key", "string?", "null", "API key (optional, can use env var)"); + self.add_row(&mut md, "max_tokens", "usize", "200", "Maximum tokens for summary generation"); + self.add_row(&mut md, "temperature", "f32", "0.0", "Temperature for summary generation"); + md.push_str("\n"); + + // Retrieval section + md.push_str("## `[retrieval]`\n\n"); + md.push_str("Retrieval model and behavior configuration.\n\n"); + md.push_str("| Option | Type | Default | Description |\n"); + md.push_str("|--------|------|---------|-------------|\n"); + self.add_row(&mut md, "model", "string", "gpt-4o", "Model for retrieval navigation"); + self.add_row(&mut md, "endpoint", "string", "https://api.openai.com/v1", "API endpoint"); + self.add_row(&mut md, "api_key", "string?", "null", "API key (defaults to summary.api_key)"); + self.add_row(&mut md, "top_k", "usize", "3", "Number of top results to return"); + self.add_row(&mut md, "max_tokens", "usize", "1000", "Maximum tokens for retrieval context"); + self.add_row(&mut md, "temperature", "f32", "0.0", "Temperature for retrieval"); + md.push_str("\n"); + + // Retrieval.search section + md.push_str("## `[retrieval.search]`\n\n"); + md.push_str("Search algorithm configuration.\n\n"); + md.push_str("| Option | Type | Default | Description |\n"); + md.push_str("|--------|------|---------|-------------|\n"); + self.add_row(&mut md, "top_k", "usize", "5", "Number of top-k results to return"); + self.add_row(&mut md, "beam_width", "usize", "3", "Beam width for multi-path search"); + self.add_row(&mut md, "max_iterations", "usize", "10", "Maximum iterations for search algorithms"); + self.add_row(&mut md, "min_score", "f32", "0.1", "Minimum score to include a path"); + md.push_str("\n"); + + // Retrieval.sufficiency section + md.push_str("## `[retrieval.sufficiency]`\n\n"); + md.push_str("Sufficiency checker configuration.\n\n"); + md.push_str("| Option | Type | Default | Description |\n"); + md.push_str("|--------|------|---------|-------------|\n"); + self.add_row(&mut md, "min_tokens", "usize", "500", "Minimum tokens for sufficiency"); + self.add_row(&mut md, "target_tokens", "usize", "2000", "Target tokens for full sufficiency"); + self.add_row(&mut md, 
"max_tokens", "usize", "4000", "Maximum tokens before stopping"); + self.add_row(&mut md, "min_content_length", "usize", "200", "Minimum content length (characters)"); + self.add_row(&mut md, "confidence_threshold", "f32", "0.7", "Confidence threshold for LLM judge"); + md.push_str("\n"); + + // Retrieval.content section + md.push_str("## `[retrieval.content]`\n\n"); + md.push_str("Content aggregator configuration.\n\n"); + md.push_str("| Option | Type | Default | Description |\n"); + md.push_str("|--------|------|---------|-------------|\n"); + self.add_row(&mut md, "enabled", "bool", "true", "Enable content aggregator"); + self.add_row(&mut md, "token_budget", "usize", "4000", "Maximum tokens for aggregated content"); + self.add_row(&mut md, "min_relevance_score", "f32", "0.2", "Minimum relevance score threshold (0.0-1.0)"); + self.add_row(&mut md, "scoring_strategy", "string", "keyword_bm25", "Scoring strategy (keyword_only, keyword_bm25, hybrid)"); + self.add_row(&mut md, "output_format", "string", "markdown", "Output format (markdown, json, tree, flat)"); + self.add_row(&mut md, "include_scores", "bool", "false", "Include relevance scores in output"); + self.add_row(&mut md, "hierarchical_min_per_level", "f32", "0.1", "Minimum budget allocation per depth level"); + self.add_row(&mut md, "deduplicate", "bool", "true", "Enable content deduplication"); + self.add_row(&mut md, "dedup_threshold", "f32", "0.9", "Similarity threshold for deduplication"); + md.push_str("\n"); + + // Retrieval.strategy section + md.push_str("## `[retrieval.strategy]`\n\n"); + md.push_str("Strategy-specific configuration.\n\n"); + md.push_str("| Option | Type | Default | Description |\n"); + md.push_str("|--------|------|---------|-------------|\n"); + self.add_row(&mut md, "exploration_weight", "f32", "1.414", "MCTS exploration weight (√2)"); + self.add_row(&mut md, "similarity_threshold", "f32", "0.5", "Semantic similarity threshold"); + self.add_row(&mut md, "high_similarity_threshold", "f32", "0.8", "High similarity for 'answer' decision"); + self.add_row(&mut md, "low_similarity_threshold", "f32", "0.3", "Low similarity for 'explore' decision"); + md.push_str("\n"); + + // Storage section + md.push_str("## `[storage]`\n\n"); + md.push_str("Storage configuration.\n\n"); + md.push_str("| Option | Type | Default | Description |\n"); + md.push_str("|--------|------|---------|-------------|\n"); + self.add_row(&mut md, "workspace_dir", "string", "./workspace", "Workspace directory for persisted documents"); + md.push_str("\n"); + + // Concurrency section + md.push_str("## `[concurrency]`\n\n"); + md.push_str("Concurrency control configuration.\n\n"); + md.push_str("| Option | Type | Default | Description |\n"); + md.push_str("|--------|------|---------|-------------|\n"); + self.add_row(&mut md, "max_concurrent_requests", "usize", "10", "Maximum concurrent LLM API calls"); + self.add_row(&mut md, "requests_per_minute", "usize", "500", "Rate limit: requests per minute"); + self.add_row(&mut md, "enabled", "bool", "true", "Enable rate limiting"); + self.add_row(&mut md, "semaphore_enabled", "bool", "true", "Enable semaphore-based concurrency"); + md.push_str("\n"); + + // Fallback section + md.push_str("## `[fallback]`\n\n"); + md.push_str("Fallback/error recovery configuration.\n\n"); + md.push_str("| Option | Type | Default | Description |\n"); + md.push_str("|--------|------|---------|-------------|\n"); + self.add_row(&mut md, "enabled", "bool", "true", "Enable graceful degradation"); + self.add_row(&mut md, 
"models", "[string]", "[\"gpt-4o-mini\", \"glm-4-flash\"]", "Fallback models in priority order"); + self.add_row(&mut md, "endpoints", "[string]", "[]", "Fallback endpoints in priority order"); + self.add_row(&mut md, "on_rate_limit", "string", "retry_then_fallback", "Behavior on rate limit (retry, fallback, retry_then_fallback, fail)"); + self.add_row(&mut md, "on_timeout", "string", "retry_then_fallback", "Behavior on timeout"); + self.add_row(&mut md, "on_all_failed", "string", "return_error", "Behavior when all attempts fail (return_error, return_cache)"); + md.push_str("\n"); + + md + } + + fn add_row(&self, md: &mut String, name: &str, ty: &str, default: &str, desc: &str) { + md.push_str(&format!("| `{}` | {} | {} | {} |\n", name, ty, default, desc)); + } + + /// Generate an example TOML file with all options. + pub fn to_example_toml(&self) -> String { + toml::to_string_pretty(&self.config).unwrap_or_else(|e| { + format!("# Error generating TOML: {}\n\n# Using default config\n{}", + e, Self::fallback_toml()) + }) + } + + fn fallback_toml() -> String { + r#"# Vectorless Configuration Example +# Copy this file to config.toml and fill in your API keys + +[indexer] +subsection_threshold = 300 +max_segment_tokens = 3000 +max_summary_tokens = 200 +min_summary_tokens = 20 + +[summary] +model = "gpt-4o-mini" +endpoint = "https://api.openai.com/v1" +# api_key = "sk-..." +max_tokens = 200 +temperature = 0.0 + +[retrieval] +model = "gpt-4o" +endpoint = "https://api.openai.com/v1" +# api_key = "sk-..." +top_k = 3 +max_tokens = 1000 +temperature = 0.0 + +[retrieval.search] +top_k = 5 +beam_width = 3 +max_iterations = 10 +min_score = 0.1 + +[retrieval.sufficiency] +min_tokens = 500 +target_tokens = 2000 +max_tokens = 4000 +min_content_length = 200 +confidence_threshold = 0.7 + +[retrieval.cache] +max_entries = 1000 +ttl_secs = 3600 + +[retrieval.strategy] +exploration_weight = 1.414 +similarity_threshold = 0.5 +high_similarity_threshold = 0.8 +low_similarity_threshold = 0.3 + +[retrieval.content] +enabled = true +token_budget = 4000 +min_relevance_score = 0.2 +scoring_strategy = "keyword_bm25" +output_format = "markdown" +include_scores = false +hierarchical_min_per_level = 0.1 +deduplicate = true +dedup_threshold = 0.9 + +[storage] +workspace_dir = "./workspace" + +[concurrency] +max_concurrent_requests = 10 +requests_per_minute = 500 +enabled = true +semaphore_enabled = true + +[fallback] +enabled = true +models = ["gpt-4o-mini", "glm-4-flash"] +on_rate_limit = "retry_then_fallback" +on_timeout = "retry_then_fallback" +on_all_failed = "return_error" +"#.to_string() + } + + /// Generate a minimal example TOML file. 
+    pub fn to_minimal_toml(&self) -> String {
+        r#"# Minimal Vectorless Configuration
+# Most options have sensible defaults
+
+[summary]
+api_key = "your-api-key-here"
+
+[retrieval]
+top_k = 5
+"#.to_string()
+    }
+}
+
+impl Default for ConfigDocs {
+    fn default() -> Self {
+        Self::with_defaults()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_config_docs_markdown() {
+        let docs = ConfigDocs::with_defaults();
+        let md = docs.to_markdown();
+
+        assert!(md.contains("# Configuration Reference"));
+        assert!(md.contains("## `[indexer]`"));
+        assert!(md.contains("## `[retrieval]`"));
+        assert!(md.contains("## `[retrieval.content]`"));
+    }
+
+    #[test]
+    fn test_config_docs_toml() {
+        let docs = ConfigDocs::with_defaults();
+        let toml = docs.to_example_toml();
+
+        assert!(toml.contains("[indexer]"));
+        assert!(toml.contains("[retrieval]"));
+    }
+
+    #[test]
+    fn test_config_docs_minimal_toml() {
+        let docs = ConfigDocs::with_defaults();
+        let toml = docs.to_minimal_toml();
+
+        assert!(toml.contains("[summary]"));
+        assert!(toml.len() < 200); // Should be minimal
+    }
+}
diff --git a/src/config/loader.rs b/src/config/loader.rs
index e83dc229..fe2c6736 100644
--- a/src/config/loader.rs
+++ b/src/config/loader.rs
@@ -3,14 +3,46 @@
 //! Configuration loader.
 //!
-//! Loads configuration from TOML files only.
-//! All configuration comes from config files, not environment variables.
-//! This ensures configuration is explicit and traceable.
+//! Loads configuration from TOML files with optional environment variable
+//! overrides and validation.
+//!
+//! # Example
+//!
+//! ```rust,no_run
+//! use vectorless::config::{ConfigLoader, Config};
+//!
+//! // Load from file
+//! let config = ConfigLoader::new()
+//!     .file("config.toml")
+//!     .load()?;
+//!
+//! // Load with validation
+//! let config = ConfigLoader::new()
+//!     .file("config.toml")
+//!     .with_validation(true)
+//!     .load()?;
+//!
+//! // Load with environment variable override
+//! let config = ConfigLoader::new()
+//!     .file("config.toml")
+//!     .with_env("VECTORLESS_")
+//!     .load()?;
+//!
+//! // Layered configuration
+//! let config = ConfigLoader::new()
+//!     .file("default.toml")
+//!     .file("production.toml")
+//!     .with_validation(true)
+//!     .load()?;
+//! # Ok::<(), vectorless::config::ConfigError>(())
+//! ```
 
 use std::path::{Path, PathBuf};
 use thiserror::Error;
 
+use super::merge::Merge;
 use super::types::Config;
+use super::validator::ConfigValidator;
 
 /// Configuration loading errors.
 #[derive(Debug, Error)]
@@ -30,59 +62,235 @@ pub enum ConfigError {
     /// Invalid configuration value.
     #[error("Invalid configuration: {0}")]
     Invalid(String),
+
+    /// Configuration validation failed.
+    #[error("{0}")]
+    Validation(#[from] super::types::ConfigValidationError),
+
+    /// Environment variable error.
+    #[error("Environment variable error: {0}")]
+    Env(String),
 }
 
 /// Configuration loader.
-///
-/// # Example
-///
-/// ```rust,no_run
-/// use vectorless::config::{ConfigLoader, Config};
-///
-/// // Load from file
-/// let config = ConfigLoader::new()
-///     .file("config.toml")
-///     .load()?;
-///
-/// // Or use defaults
-/// let config = Config::default();
-/// # Ok::<(), vectorless::config::ConfigError>(())
-/// ```
-#[derive(Debug, Default)]
+#[derive(Debug)]
 pub struct ConfigLoader {
-    /// Configuration file path.
-    file: Option<PathBuf>,
+    /// Configuration file paths (loaded in order, later files override earlier).
+    files: Vec<PathBuf>,
+
+    /// Environment variable prefix (optional).
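+    /// e.g. `"VECTORLESS_"`, so that `VECTORLESS_SUMMARY__API_KEY`
+    /// overrides `summary.api_key`.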
+    env_prefix: Option<String>,
+
+    /// Whether to validate after loading.
+    validate: bool,
+
+    /// Custom validator (optional).
+    validator: Option<ConfigValidator>,
+}
+
+impl Default for ConfigLoader {
+    fn default() -> Self {
+        Self::new()
+    }
+}
 
 impl ConfigLoader {
     /// Create a new configuration loader with defaults.
     pub fn new() -> Self {
-        Self::default()
+        Self {
+            files: Vec::new(),
+            env_prefix: None,
+            validate: false,
+            validator: None,
+        }
     }
 
     /// Specify a configuration file to load.
+    ///
+    /// Multiple files can be specified; later files override earlier ones.
     pub fn file<P: AsRef<Path>>(mut self, path: P) -> Self {
-        self.file = Some(path.as_ref().to_path_buf());
+        self.files.push(path.as_ref().to_path_buf());
+        self
+    }
+
+    /// Specify multiple configuration files.
+    pub fn files<I, P>(mut self, paths: I) -> Self
+    where
+        I: IntoIterator<Item = P>,
+        P: AsRef<Path>,
+    {
+        self.files
+            .extend(paths.into_iter().map(|p| p.as_ref().to_path_buf()));
+        self
+    }
+
+    /// Enable environment variable override.
+    ///
+    /// Variables like `VECTORLESS_SUMMARY__API_KEY` override config values.
+    /// Use `__` (double underscore) to separate nested keys.
+    pub fn with_env(mut self, prefix: impl Into<String>) -> Self {
+        self.env_prefix = Some(prefix.into());
+        self
+    }
+
+    /// Enable or disable validation after loading.
+    pub fn with_validation(mut self, validate: bool) -> Self {
+        self.validate = validate;
+        self
+    }
+
+    /// Set a custom validator.
+    pub fn with_validator(mut self, validator: ConfigValidator) -> Self {
+        self.validator = Some(validator);
         self
     }
 
     /// Load the configuration.
     ///
-    /// If no file is specified, returns default configuration.
-    /// If file is specified but doesn't exist, returns an error.
+    /// # Behavior
+    ///
+    /// 1. Start with default configuration
+    /// 2. Load and merge each specified file (in order)
+    /// 3. Apply environment variable overrides (if enabled)
+    /// 4. Validate configuration (if enabled)
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - A specified file doesn't exist
+    /// - A file can't be parsed as valid TOML
+    /// - Validation fails (when enabled)
     pub fn load(self) -> Result<Config, ConfigError> {
-        if let Some(ref path) = self.file {
+        let mut config = Config::default();
+
+        // Load and merge each file
+        for path in &self.files {
             if path.exists() {
                 let content = std::fs::read_to_string(path)?;
-                let config: Config = toml::from_str(&content)?;
-                Ok(config)
+                let file_config: Config = toml::from_str(&content)?;
+                config.merge(&file_config, super::merge::MergeStrategy::Replace);
             } else {
-                Err(ConfigError::NotFound(path.clone()))
+                return Err(ConfigError::NotFound(path.clone()));
+            }
+        }
+
+        // Apply environment variable overrides
+        if let Some(ref prefix) = self.env_prefix {
+            self.apply_env_overrides(&mut config, prefix)?;
+        }
+
+        // Validate if requested
+        if self.validate {
+            let validator = self.validator.unwrap_or_default();
+            validator.validate(&config)?;
+        }
+
+        Ok(config)
+    }
+
+    /// Apply environment variable overrides to the configuration.
+    fn apply_env_overrides(&self, config: &mut Config, prefix: &str) -> Result<(), ConfigError> {
+        for (key, value) in std::env::vars() {
+            if !key.starts_with(prefix) {
+                continue;
+            }
+
+            // Parse the path: VECTORLESS_SUMMARY__API_KEY -> ["summary", "api_key"]
+            // (normalize to lower case so the parts match the keys in `set_by_path`)
+            let path_str = key
+                .trim_start_matches(prefix)
+                .trim_start_matches('_')
+                .to_lowercase();
+            let parts: Vec<&str> = path_str.split("__").collect();
+
+            if parts.is_empty() {
+                continue;
+            }
+
+            // Apply the override
+            self.set_by_path(config, &parts, &value)?;
+        }
+
+        Ok(())
+    }
+
+    /// Set a configuration value by path.
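+    ///
+    /// e.g. `["summary", "api_key"]` assigns to `config.summary.api_key`.
+    /// Paths that are not recognized are silently ignored.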
+    fn set_by_path(&self, config: &mut Config, path: &[&str], value: &str) -> Result<(), ConfigError> {
+        match path {
+            ["summary", "api_key"] => {
+                config.summary.api_key = Some(value.to_string());
+            }
+            ["summary", "model"] => {
+                config.summary.model = value.to_string();
+            }
+            ["summary", "endpoint"] => {
+                config.summary.endpoint = value.to_string();
+            }
+            ["summary", "max_tokens"] => {
+                config.summary.max_tokens = value.parse().map_err(|e| {
+                    ConfigError::Env(format!("Invalid max_tokens: {}", e))
+                })?;
+            }
+            ["retrieval", "api_key"] => {
+                config.retrieval.api_key = Some(value.to_string());
+            }
+            ["retrieval", "model"] => {
+                config.retrieval.model = value.to_string();
+            }
+            ["retrieval", "endpoint"] => {
+                config.retrieval.endpoint = value.to_string();
+            }
+            ["retrieval", "top_k"] => {
+                config.retrieval.top_k = value.parse().map_err(|e| {
+                    ConfigError::Env(format!("Invalid top_k: {}", e))
+                })?;
+            }
+            ["storage", "workspace_dir"] => {
+                config.storage.workspace_dir = PathBuf::from(value);
+            }
+            ["concurrency", "max_concurrent_requests"] => {
+                config.concurrency.max_concurrent_requests = value.parse().map_err(|e| {
+                    ConfigError::Env(format!("Invalid max_concurrent_requests: {}", e))
+                })?;
+            }
+            _ => {
+                // Unknown path - could log a warning
+            }
+        }
+
+        Ok(())
+    }
+}
+
+/// Default configuration file names to search for.
+pub const CONFIG_FILE_NAMES: &[&str] =
+    &["vectorless.toml", "config.toml", ".vectorless.toml"];
+
+/// Find a configuration file in current or parent directories.
+pub fn find_config_file() -> Option<PathBuf> {
+    let current_dir = std::env::current_dir().ok()?;
+
+    // Search in current directory first
+    for name in CONFIG_FILE_NAMES {
+        let path = current_dir.join(name);
+        if path.exists() {
+            return Some(path);
+        }
+    }
+
+    // Search in parent directories (up to 3 levels)
+    let mut dir = current_dir.as_path();
+    for _ in 0..3 {
+        if let Some(parent) = dir.parent() {
+            for name in CONFIG_FILE_NAMES {
+                let path = parent.join(name);
+                if path.exists() {
+                    return Some(path);
+                }
+            }
+            dir = parent;
         } else {
-            Ok(Config::default())
+            break;
         }
     }
+
+    None
 }
 
 #[cfg(test)]
@@ -106,4 +314,24 @@ mod tests {
         let config = ConfigLoader::new().load().unwrap();
         assert_eq!(config.indexer.subsection_threshold, 300);
     }
+
+    #[test]
+    fn test_config_loader_not_found() {
+        let result = ConfigLoader::new()
+            .file("nonexistent_config.toml")
+            .load();
+
+        assert!(result.is_err());
+        assert!(matches!(result.unwrap_err(), ConfigError::NotFound(_)));
+    }
+
+    #[test]
+    fn test_config_loader_with_validation() {
+        let config = ConfigLoader::new()
+            .with_validation(true)
+            .load()
+            .unwrap();
+
+        assert_eq!(config.retrieval.model, "gpt-4o");
+    }
 }
diff --git a/src/config/merge.rs b/src/config/merge.rs
new file mode 100644
index 00000000..438872b5
--- /dev/null
+++ b/src/config/merge.rs
@@ -0,0 +1,356 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Configuration merging.
+//!
+//! This module provides utilities for merging multiple configurations,
+//! enabling layered configuration from multiple sources.
+
+use super::types::{
+    CacheConfig, Config, ConcurrencyConfig, ContentAggregatorConfig, FallbackConfig,
+    IndexerConfig, RetrievalConfig, SearchConfig, StorageConfig, StrategyConfig, SufficiencyConfig,
+    SummaryConfig,
+};
+
+/// Configuration merge strategy.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum MergeStrategy {
+    /// Replace with source value.
+    Replace,
+    /// Keep existing value if present (don't overwrite).
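+    /// (In the per-field `Merge` impls below, "existing" is approximated as
+    /// "differs from the compiled-in default".)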
+ KeepExisting, + /// Recursively merge nested structures. + Recursive, +} + +/// Trait for configuration merging. +pub trait Merge { + /// Merge another configuration into this one. + fn merge(&mut self, other: &Self, strategy: MergeStrategy); +} + +impl Merge for Config { + fn merge(&mut self, other: &Self, strategy: MergeStrategy) { + self.indexer.merge(&other.indexer, strategy); + self.summary.merge(&other.summary, strategy); + self.retrieval.merge(&other.retrieval, strategy); + self.storage.merge(&other.storage, strategy); + self.concurrency.merge(&other.concurrency, strategy); + self.fallback.merge(&other.fallback, strategy); + } +} + +impl Merge for IndexerConfig { + fn merge(&mut self, other: &Self, strategy: MergeStrategy) { + if strategy == MergeStrategy::Replace || self.subsection_threshold == 300 { + self.subsection_threshold = other.subsection_threshold; + } + if strategy == MergeStrategy::Replace || self.max_segment_tokens == 3000 { + self.max_segment_tokens = other.max_segment_tokens; + } + if strategy == MergeStrategy::Replace || self.max_summary_tokens == 200 { + self.max_summary_tokens = other.max_summary_tokens; + } + if strategy == MergeStrategy::Replace || self.min_summary_tokens == 20 { + self.min_summary_tokens = other.min_summary_tokens; + } + } +} + +impl Merge for SummaryConfig { + fn merge(&mut self, other: &Self, strategy: MergeStrategy) { + if strategy == MergeStrategy::Replace || self.model == "gpt-4o-mini" { + self.model = other.model.clone(); + } + if strategy == MergeStrategy::Replace || self.endpoint == "https://api.openai.com/v1" { + self.endpoint = other.endpoint.clone(); + } + // Always merge API keys if present + if other.api_key.is_some() { + self.api_key = other.api_key.clone(); + } + if strategy == MergeStrategy::Replace || self.max_tokens == 200 { + self.max_tokens = other.max_tokens; + } + if strategy == MergeStrategy::Replace || self.temperature == 0.0 { + self.temperature = other.temperature; + } + } +} + +impl Merge for RetrievalConfig { + fn merge(&mut self, other: &Self, strategy: MergeStrategy) { + if strategy == MergeStrategy::Replace || self.model == "gpt-4o" { + self.model = other.model.clone(); + } + if strategy == MergeStrategy::Replace || self.endpoint == "https://api.openai.com/v1" { + self.endpoint = other.endpoint.clone(); + } + if other.api_key.is_some() { + self.api_key = other.api_key.clone(); + } + if strategy == MergeStrategy::Replace || self.max_tokens == 1000 { + self.max_tokens = other.max_tokens; + } + if strategy == MergeStrategy::Replace || self.temperature == 0.0 { + self.temperature = other.temperature; + } + if strategy == MergeStrategy::Replace || self.top_k == 3 { + self.top_k = other.top_k; + } + + self.search.merge(&other.search, strategy); + self.sufficiency.merge(&other.sufficiency, strategy); + self.cache.merge(&other.cache, strategy); + self.strategy.merge(&other.strategy, strategy); + self.content.merge(&other.content, strategy); + } +} + +impl Merge for SearchConfig { + fn merge(&mut self, other: &Self, strategy: MergeStrategy) { + if strategy == MergeStrategy::Replace || self.top_k == 5 { + self.top_k = other.top_k; + } + if strategy == MergeStrategy::Replace || self.beam_width == 3 { + self.beam_width = other.beam_width; + } + if strategy == MergeStrategy::Replace || self.max_iterations == 10 { + self.max_iterations = other.max_iterations; + } + if strategy == MergeStrategy::Replace || (self.min_score - 0.1).abs() < f32::EPSILON { + self.min_score = other.min_score; + } + } +} + +impl Merge for 
SufficiencyConfig { + fn merge(&mut self, other: &Self, strategy: MergeStrategy) { + if strategy == MergeStrategy::Replace || self.min_tokens == 500 { + self.min_tokens = other.min_tokens; + } + if strategy == MergeStrategy::Replace || self.target_tokens == 2000 { + self.target_tokens = other.target_tokens; + } + if strategy == MergeStrategy::Replace || self.max_tokens == 4000 { + self.max_tokens = other.max_tokens; + } + if strategy == MergeStrategy::Replace || self.min_content_length == 200 { + self.min_content_length = other.min_content_length; + } + if strategy == MergeStrategy::Replace || (self.confidence_threshold - 0.7).abs() < f32::EPSILON + { + self.confidence_threshold = other.confidence_threshold; + } + } +} + +impl Merge for CacheConfig { + fn merge(&mut self, other: &Self, strategy: MergeStrategy) { + if strategy == MergeStrategy::Replace || self.max_entries == 1000 { + self.max_entries = other.max_entries; + } + if strategy == MergeStrategy::Replace || self.ttl_secs == 3600 { + self.ttl_secs = other.ttl_secs; + } + } +} + +impl Merge for StrategyConfig { + fn merge(&mut self, other: &Self, strategy: MergeStrategy) { + if strategy == MergeStrategy::Replace + || (self.exploration_weight - 1.414).abs() < 0.001 + { + self.exploration_weight = other.exploration_weight; + } + if strategy == MergeStrategy::Replace || (self.similarity_threshold - 0.5).abs() < f32::EPSILON + { + self.similarity_threshold = other.similarity_threshold; + } + if strategy == MergeStrategy::Replace + || (self.high_similarity_threshold - 0.8).abs() < f32::EPSILON + { + self.high_similarity_threshold = other.high_similarity_threshold; + } + if strategy == MergeStrategy::Replace + || (self.low_similarity_threshold - 0.3).abs() < f32::EPSILON + { + self.low_similarity_threshold = other.low_similarity_threshold; + } + } +} + +impl Merge for ContentAggregatorConfig { + fn merge(&mut self, other: &Self, strategy: MergeStrategy) { + if other.enabled != self.enabled { + self.enabled = other.enabled; + } + if strategy == MergeStrategy::Replace || self.token_budget == 4000 { + self.token_budget = other.token_budget; + } + if strategy == MergeStrategy::Replace || (self.min_relevance_score - 0.2).abs() < f32::EPSILON + { + self.min_relevance_score = other.min_relevance_score; + } + if strategy == MergeStrategy::Replace || self.scoring_strategy == "keyword_bm25" { + self.scoring_strategy = other.scoring_strategy.clone(); + } + if strategy == MergeStrategy::Replace || self.output_format == "markdown" { + self.output_format = other.output_format.clone(); + } + if other.include_scores != self.include_scores { + self.include_scores = other.include_scores; + } + if strategy == MergeStrategy::Replace + || (self.hierarchical_min_per_level - 0.1).abs() < f32::EPSILON + { + self.hierarchical_min_per_level = other.hierarchical_min_per_level; + } + if other.deduplicate != self.deduplicate { + self.deduplicate = other.deduplicate; + } + if strategy == MergeStrategy::Replace || (self.dedup_threshold - 0.9).abs() < f32::EPSILON { + self.dedup_threshold = other.dedup_threshold; + } + } +} + +impl Merge for StorageConfig { + fn merge(&mut self, other: &Self, strategy: MergeStrategy) { + if strategy == MergeStrategy::Replace + || self.workspace_dir == std::path::PathBuf::from("./workspace") + { + self.workspace_dir = other.workspace_dir.clone(); + } + } +} + +impl Merge for ConcurrencyConfig { + fn merge(&mut self, other: &Self, strategy: MergeStrategy) { + if strategy == MergeStrategy::Replace || self.max_concurrent_requests == 10 { + 
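+            // Default-as-sentinel, as elsewhere in this module: a field still
+            // holding its built-in default (10 here) is treated as unset.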
+            self.max_concurrent_requests = other.max_concurrent_requests;
+        }
+        if strategy == MergeStrategy::Replace || self.requests_per_minute == 500 {
+            self.requests_per_minute = other.requests_per_minute;
+        }
+        if other.enabled != self.enabled {
+            self.enabled = other.enabled;
+        }
+        if other.semaphore_enabled != self.semaphore_enabled {
+            self.semaphore_enabled = other.semaphore_enabled;
+        }
+    }
+}
+
+impl Merge for FallbackConfig {
+    fn merge(&mut self, other: &Self, strategy: MergeStrategy) {
+        if other.enabled != self.enabled {
+            self.enabled = other.enabled;
+        }
+        if !other.models.is_empty() {
+            self.models = other.models.clone();
+        }
+        if !other.endpoints.is_empty() {
+            self.endpoints = other.endpoints.clone();
+        }
+        if strategy == MergeStrategy::Replace {
+            self.on_rate_limit = other.on_rate_limit;
+            self.on_timeout = other.on_timeout;
+            self.on_all_failed = other.on_all_failed;
+            self.max_retries = other.max_retries;
+            self.initial_retry_delay_ms = other.initial_retry_delay_ms;
+            self.max_retry_delay_ms = other.max_retry_delay_ms;
+            self.retry_multiplier = other.retry_multiplier;
+        }
+    }
+}
+
+/// Configuration overlay for layered configuration.
+///
+/// Allows building a configuration from multiple sources,
+/// with later overlays taking precedence.
+#[derive(Debug, Clone)]
+pub struct ConfigOverlay {
+    /// Base configuration.
+    base: Config,
+    /// Overlay configurations (applied in order).
+    overlays: Vec<Config>,
+}
+
+impl ConfigOverlay {
+    /// Create a new overlay with a base configuration.
+    pub fn new(base: Config) -> Self {
+        Self {
+            base,
+            overlays: Vec::new(),
+        }
+    }
+
+    /// Add an overlay configuration.
+    pub fn overlay(mut self, config: Config) -> Self {
+        self.overlays.push(config);
+        self
+    }
+
+    /// Resolve all overlays into a final configuration.
+    pub fn resolve(self) -> Config {
+        let mut result = self.base;
+        for overlay in self.overlays {
+            result.merge(&overlay, MergeStrategy::Replace);
+        }
+        result
+    }
+}
+
+impl Default for ConfigOverlay {
+    fn default() -> Self {
+        Self::new(Config::default())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_config_merge() {
+        let mut base = Config::default();
+        let mut overlay = Config::default();
+
+        overlay.retrieval.top_k = 10;
+        overlay.summary.model = "gpt-4o".to_string();
+
+        base.merge(&overlay, MergeStrategy::Replace);
+
+        assert_eq!(base.retrieval.top_k, 10);
+        assert_eq!(base.summary.model, "gpt-4o");
+    }
+
+    #[test]
+    fn test_config_overlay() {
+        let mut overlay1 = Config::default();
+        overlay1.retrieval.top_k = 5;
+
+        let mut overlay2 = Config::default();
+        overlay2.retrieval.top_k = 10;
+
+        let config = ConfigOverlay::new(Config::default())
+            .overlay(overlay1)
+            .overlay(overlay2)
+            .resolve();
+
+        assert_eq!(config.retrieval.top_k, 10);
+    }
+
+    #[test]
+    fn test_merge_keeps_api_keys() {
+        let mut base = Config::default();
+        let mut overlay = Config::default();
+
+        overlay.summary.api_key = Some("test-key".to_string());
+
+        base.merge(&overlay, MergeStrategy::Replace);
+
+        assert_eq!(base.summary.api_key, Some("test-key".to_string()));
+    }
+}
diff --git a/src/config/mod.rs b/src/config/mod.rs
index 23e98f4e..98ad2e8a 100644
--- a/src/config/mod.rs
+++ b/src/config/mod.rs
@@ -3,15 +3,98 @@
 //! Configuration management for vectorless.
 //!
-//! This module provides configuration loading and validation:
-//! - [`Config`] - Main configuration structure
-//! - [`IndexerConfig`] - Indexing parameters
-//! - [`SummaryConfig`] - Summarization model settings
-//! 
- [`RetrievalConfig`] - Retrieval model settings -//! - [`StorageConfig`] - Storage paths +//! This module provides comprehensive configuration loading, validation, +//! and management: +//! +//! - [`Config`] — Main configuration structure +//! - [`ConfigLoader`] — Load configuration from TOML files +//! - [`ConfigValidator`] — Validate configuration values +//! - [`ConfigDocs`] — Generate configuration documentation +//! +//! # Quick Start +//! +//! ```rust,no_run +//! use vectorless::config::{Config, ConfigLoader}; +//! +//! // Load from file +//! let config = ConfigLoader::new() +//! .file("config.toml") +//! .with_validation(true) +//! .load()?; +//! +//! // Or use defaults +//! let config = Config::default(); +//! # Ok::<(), vectorless::config::ConfigError>(()) +//! ``` +//! +//! # Layered Configuration +//! +//! Multiple configuration files can be layered: +//! +//! ```rust,no_run +//! use vectorless::config::ConfigLoader; +//! +//! let config = ConfigLoader::new() +//! .file("default.toml") // Base defaults +//! .file("production.toml") // Production overrides +//! .with_env("VECTORLESS_") // Environment overrides +//! .with_validation(true) +//! .load()?; +//! # Ok::<(), vectorless::config::ConfigError>(()) +//! ``` +//! +//! # Environment Variables +//! +//! When enabled with `with_env()`, environment variables can override config: +//! +//! | Variable | Config Path | +//! |----------|-------------| +//! | `VECTORLESS_SUMMARY__API_KEY` | `summary.api_key` | +//! | `VECTORLESS_RETRIEVAL__TOP_K` | `retrieval.top_k` | +//! | `VECTORLESS_STORAGE__WORKSPACE_DIR` | `storage.workspace_dir` | +//! +//! # Configuration Sections +//! +//! - `[indexer]` — Document indexing parameters +//! - `[summary]` — Summarization model settings +//! - `[retrieval]` — Retrieval model settings +//! - `[retrieval.search]` — Search algorithm configuration +//! - `[retrieval.sufficiency]` — Sufficiency checker settings +//! - `[retrieval.content]` — Content aggregator settings +//! - `[retrieval.strategy]` — Strategy-specific settings +//! - `[retrieval.cache]` — Cache configuration +//! - `[storage]` — Storage paths +//! - `[concurrency]` — Concurrency control +//! - `[fallback]` — Error recovery settings +mod docs; mod loader; +mod merge; mod types; +mod validator; -pub use loader::{ConfigError, ConfigLoader}; -pub use types::*; +// Re-export main types +pub use docs::ConfigDocs; +pub use loader::{find_config_file, ConfigError, ConfigLoader, CONFIG_FILE_NAMES}; +pub use merge::{ConfigOverlay, Merge, MergeStrategy}; +pub use types::{ + // Main config + Config, + // Indexer + IndexerConfig, + // LLM configs + LlmConfig, SummaryConfig, + // Retrieval configs + RetrievalConfig, SearchConfig, + // Storage and sufficiency + StorageConfig, CacheConfig, StrategyConfig, SufficiencyConfig, + // Content aggregator + ContentAggregatorConfig, + // Concurrency + ConcurrencyConfig, + // Fallback + FallbackBehavior, FallbackConfig, OnAllFailedBehavior, + // Validation + ConfigValidationError, ValidationError, Severity, +}; +pub use validator::{ConfigValidator, ValidationRule}; diff --git a/src/config/types.rs b/src/config/types.rs deleted file mode 100644 index 3a40d920..00000000 --- a/src/config/types.rs +++ /dev/null @@ -1,578 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Configuration type definitions. -//! -//! All configuration values are defined inline in `Default` trait implementations. -//! 
Configuration is loaded from TOML files only - no environment variable magic. - -use serde::{Deserialize, Serialize}; -use std::path::PathBuf; - -/// Main configuration for vectorless. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Config { - /// Indexer configuration. - #[serde(default)] - pub indexer: IndexerConfig, - - /// Summary model configuration. - #[serde(default)] - pub summary: SummaryConfig, - - /// Retrieval model configuration. - #[serde(default)] - pub retrieval: RetrievalConfig, - - /// Storage configuration. - #[serde(default)] - pub storage: StorageConfig, - - /// Concurrency control configuration. - #[serde(default)] - pub concurrency: ConcurrencyConfig, - - /// Fallback/error recovery configuration. - #[serde(default)] - pub fallback: FallbackConfig, -} - -impl Default for Config { - fn default() -> Self { - Self { - indexer: IndexerConfig::default(), - summary: SummaryConfig::default(), - retrieval: RetrievalConfig::default(), - storage: StorageConfig::default(), - concurrency: ConcurrencyConfig::default(), - fallback: FallbackConfig::default(), - } - } -} - -/// Indexer configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct IndexerConfig { - /// Word count threshold for splitting sections into subsections. - #[serde(default)] - pub subsection_threshold: usize, - - /// Maximum tokens to send in a single segmentation request. - #[serde(default)] - pub max_segment_tokens: usize, - - /// Maximum tokens for each summary. - #[serde(default)] - pub max_summary_tokens: usize, - - /// Minimum content tokens required to generate a summary. - #[serde(default)] - pub min_summary_tokens: usize, -} - -impl Default for IndexerConfig { - fn default() -> Self { - Self { - subsection_threshold: 300, - max_segment_tokens: 3000, - max_summary_tokens: 200, - min_summary_tokens: 20, - } - } -} - -/// Generic LLM configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LlmConfig { - /// Model name (e.g., "gpt-4o-mini", "claude-3-haiku"). - #[serde(default)] - pub model: String, - - /// API endpoint. - #[serde(default)] - pub endpoint: String, - - /// API key. - #[serde(default)] - pub api_key: Option, - - /// Maximum tokens for responses. - #[serde(default)] - pub max_tokens: usize, - - /// Temperature for generation. - #[serde(default)] - pub temperature: f32, -} - -impl Default for LlmConfig { - fn default() -> Self { - Self { - model: "gpt-4o-mini".to_string(), - endpoint: "https://api.openai.com/v1".to_string(), - api_key: None, - max_tokens: 1000, - temperature: 0.0, - } - } -} - -impl LlmConfig { - /// Create a new LLM config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Set the model. - pub fn with_model(mut self, model: impl Into) -> Self { - self.model = model.into(); - self - } - - /// Set the endpoint. - pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { - self.endpoint = endpoint.into(); - self - } - - /// Set the API key. - pub fn with_api_key(mut self, api_key: impl Into) -> Self { - self.api_key = Some(api_key.into()); - self - } - - /// Get the API key from config. - pub fn get_api_key(&self) -> Option<&str> { - self.api_key.as_deref() - } -} - -/// Summary model configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct SummaryConfig { - /// Model name for summarization. - #[serde(default)] - pub model: String, - - /// API endpoint for summary model. - #[serde(default)] - pub endpoint: String, - - /// API key. 
- #[serde(default)] - pub api_key: Option, - - /// Maximum tokens for summary generation. - #[serde(default)] - pub max_tokens: usize, - - /// Temperature for summary generation. - #[serde(default)] - pub temperature: f32, -} - -impl Default for SummaryConfig { - fn default() -> Self { - Self { - model: "gpt-4o-mini".to_string(), - endpoint: "https://api.openai.com/v1".to_string(), - api_key: None, - max_tokens: 200, - temperature: 0.0, - } - } -} - -/// Retrieval model configuration (for navigation). -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct RetrievalConfig { - /// Model name for retrieval/navigation. - #[serde(default)] - pub model: String, - - /// API endpoint for retrieval model. - #[serde(default)] - pub endpoint: String, - - /// API key. - #[serde(default)] - pub api_key: Option, - - /// Maximum tokens for retrieval context. - #[serde(default)] - pub max_tokens: usize, - - /// Temperature for retrieval. - #[serde(default)] - pub temperature: f32, - - /// Number of top-k results to return. - #[serde(default)] - pub top_k: usize, - - /// Search algorithm configuration. - #[serde(default)] - pub search: SearchConfig, - - /// Sufficiency checker configuration. - #[serde(default)] - pub sufficiency: SufficiencyConfig, - - /// Cache configuration. - #[serde(default)] - pub cache: CacheConfig, - - /// Strategy-specific configuration. - #[serde(default)] - pub strategy: StrategyConfig, -} - -impl Default for RetrievalConfig { - fn default() -> Self { - Self { - model: "gpt-4o".to_string(), - endpoint: "https://api.openai.com/v1".to_string(), - api_key: None, - max_tokens: 1000, - temperature: 0.0, - top_k: 3, - search: SearchConfig::default(), - sufficiency: SufficiencyConfig::default(), - cache: CacheConfig::default(), - strategy: StrategyConfig::default(), - } - } -} - -/// Search algorithm configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct SearchConfig { - /// Number of top-k results to return. - #[serde(default)] - pub top_k: usize, - - /// Beam width for multi-path search. - #[serde(default)] - pub beam_width: usize, - - /// Maximum iterations for search algorithms. - #[serde(default)] - pub max_iterations: usize, - - /// Minimum score to include a path. - #[serde(default)] - pub min_score: f32, -} - -impl Default for SearchConfig { - fn default() -> Self { - Self { - top_k: 5, - beam_width: 3, - max_iterations: 10, - min_score: 0.1, - } - } -} - -/// Sufficiency checker configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct SufficiencyConfig { - /// Minimum tokens for sufficiency. - #[serde(default)] - pub min_tokens: usize, - - /// Target tokens for full sufficiency. - #[serde(default)] - pub target_tokens: usize, - - /// Maximum tokens before stopping. - #[serde(default)] - pub max_tokens: usize, - - /// Minimum content length (characters). - #[serde(default)] - pub min_content_length: usize, - - /// Confidence threshold for LLM judge. - #[serde(default)] - pub confidence_threshold: f32, -} - -impl Default for SufficiencyConfig { - fn default() -> Self { - Self { - min_tokens: 500, - target_tokens: 2000, - max_tokens: 4000, - min_content_length: 200, - confidence_threshold: 0.7, - } - } -} - -/// Cache configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct CacheConfig { - /// Maximum number of cache entries. - #[serde(default)] - pub max_entries: usize, - - /// Time-to-live for cache entries (seconds). 
- #[serde(default)] - pub ttl_secs: u64, -} - -impl Default for CacheConfig { - fn default() -> Self { - Self { - max_entries: 1000, - ttl_secs: 3600, - } - } -} - -/// Strategy-specific configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct StrategyConfig { - /// MCTS exploration weight (sqrt(2) ≈ 1.414). - #[serde(default)] - pub exploration_weight: f32, - - /// Semantic similarity threshold. - #[serde(default)] - pub similarity_threshold: f32, - - /// High similarity threshold for "answer" decision. - #[serde(default)] - pub high_similarity_threshold: f32, - - /// Low similarity threshold for "explore" decision. - #[serde(default)] - pub low_similarity_threshold: f32, -} - -impl Default for StrategyConfig { - fn default() -> Self { - Self { - exploration_weight: 1.414, - similarity_threshold: 0.5, - high_similarity_threshold: 0.8, - low_similarity_threshold: 0.3, - } - } -} - -/// Storage configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct StorageConfig { - /// Workspace directory for persisted documents. - #[serde(default)] - pub workspace_dir: PathBuf, -} - -impl Default for StorageConfig { - fn default() -> Self { - Self { - workspace_dir: PathBuf::from("./workspace"), - } - } -} - -/// Concurrency control configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConcurrencyConfig { - /// Maximum concurrent LLM API calls. - #[serde(default)] - pub max_concurrent_requests: usize, - - /// Rate limit: requests per minute. - #[serde(default)] - pub requests_per_minute: usize, - - /// Whether rate limiting is enabled. - #[serde(default = "default_true")] - pub enabled: bool, - - /// Whether semaphore-based concurrency limiting is enabled. - #[serde(default = "default_true")] - pub semaphore_enabled: bool, -} - -fn default_true() -> bool { - true -} - -impl Default for ConcurrencyConfig { - fn default() -> Self { - Self { - max_concurrent_requests: 10, - requests_per_minute: 500, - enabled: true, - semaphore_enabled: true, - } - } -} - -impl ConcurrencyConfig { - /// Create a new config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Set the maximum concurrent requests. - pub fn with_max_concurrent_requests(mut self, max: usize) -> Self { - self.max_concurrent_requests = max; - self - } - - /// Set the requests per minute rate limit. - pub fn with_requests_per_minute(mut self, rpm: usize) -> Self { - self.requests_per_minute = rpm; - self - } - - /// Enable or disable rate limiting. - pub fn with_enabled(mut self, enabled: bool) -> Self { - self.enabled = enabled; - self - } - - /// Enable or disable semaphore. - pub fn with_semaphore_enabled(mut self, enabled: bool) -> Self { - self.semaphore_enabled = enabled; - self - } - - /// Convert to the runtime concurrency config. - pub fn to_runtime_config(&self) -> crate::throttle::ConcurrencyConfig { - crate::throttle::ConcurrencyConfig { - max_concurrent_requests: self.max_concurrent_requests, - requests_per_minute: self.requests_per_minute, - enabled: self.enabled, - semaphore_enabled: self.semaphore_enabled, - } - } -} - -impl From for crate::throttle::ConcurrencyConfig { - fn from(config: ConcurrencyConfig) -> Self { - config.to_runtime_config() - } -} - -/// Fallback behavior when encountering errors. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -pub enum FallbackBehavior { - /// Only retry with the same model/endpoint. - Retry, - /// Immediately switch to fallback model/endpoint. 
-    Fallback,
-    /// Retry first, then fallback if still failing.
-    RetryThenFallback,
-    /// Fail immediately without retry or fallback.
-    Fail,
-}
-
-impl Default for FallbackBehavior {
-    fn default() -> Self {
-        Self::RetryThenFallback
-    }
-}
-
-/// Behavior when all fallback attempts fail.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
-#[serde(rename_all = "snake_case")]
-pub enum OnAllFailedBehavior {
-    /// Return the error to the caller.
-    ReturnError,
-    /// Try to return cached result if available.
-    ReturnCache,
-}
-
-impl Default for OnAllFailedBehavior {
-    fn default() -> Self {
-        Self::ReturnError
-    }
-}
-
-/// Fallback configuration for error recovery.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct FallbackConfig {
-    /// Whether fallback is enabled.
-    #[serde(default = "default_true")]
-    pub enabled: bool,
-
-    /// Fallback models in priority order.
-    #[serde(default)]
-    pub models: Vec<String>,
-
-    /// Fallback endpoints in priority order.
-    #[serde(default)]
-    pub endpoints: Vec<String>,
-
-    /// Behavior on rate limit error (429).
-    #[serde(default)]
-    pub on_rate_limit: FallbackBehavior,
-
-    /// Behavior on timeout error.
-    #[serde(default)]
-    pub on_timeout: FallbackBehavior,
-
-    /// Behavior when all attempts fail.
-    #[serde(default)]
-    pub on_all_failed: OnAllFailedBehavior,
-}
-
-impl Default for FallbackConfig {
-    fn default() -> Self {
-        Self {
-            enabled: true,
-            models: vec!["gpt-4o-mini".to_string(), "glm-4-flash".to_string()],
-            endpoints: vec![],
-            on_rate_limit: FallbackBehavior::RetryThenFallback,
-            on_timeout: FallbackBehavior::RetryThenFallback,
-            on_all_failed: OnAllFailedBehavior::ReturnError,
-        }
-    }
-}
-
-impl FallbackConfig {
-    /// Create a new fallback config with defaults.
-    pub fn new() -> Self {
-        Self::default()
-    }
-
-    /// Disable fallback entirely.
-    pub fn disabled() -> Self {
-        Self {
-            enabled: false,
-            ..Self::default()
-        }
-    }
-
-    /// Set fallback models.
-    pub fn with_models(mut self, models: Vec<String>) -> Self {
-        self.models = models;
-        self
-    }
-
-    /// Set fallback endpoints.
-    pub fn with_endpoints(mut self, endpoints: Vec<String>) -> Self {
-        self.endpoints = endpoints;
-        self
-    }
-
-    /// Set behavior on rate limit.
-    pub fn with_on_rate_limit(mut self, behavior: FallbackBehavior) -> Self {
-        self.on_rate_limit = behavior;
-        self
-    }
-
-    /// Set behavior on timeout.
-    pub fn with_on_timeout(mut self, behavior: FallbackBehavior) -> Self {
-        self.on_timeout = behavior;
-        self
-    }
-}
diff --git a/src/config/types/concurrency.rs b/src/config/types/concurrency.rs
new file mode 100644
index 00000000..c4172ba8
--- /dev/null
+++ b/src/config/types/concurrency.rs
@@ -0,0 +1,122 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Concurrency control configuration types.
+
+use serde::{Deserialize, Serialize};
+
+/// Concurrency control configuration.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ConcurrencyConfig {
+    /// Maximum concurrent LLM API calls.
+    #[serde(default = "default_max_concurrent_requests")]
+    pub max_concurrent_requests: usize,
+
+    /// Rate limit: requests per minute.
+    #[serde(default = "default_requests_per_minute")]
+    pub requests_per_minute: usize,
+
+    /// Whether rate limiting is enabled.
+    #[serde(default = "default_true")]
+    pub enabled: bool,
+
+    /// Whether semaphore-based concurrency limiting is enabled.
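+    ///
+    /// The semaphore caps the number of in-flight requests, while
+    /// `requests_per_minute` caps request rate; the two mechanisms are
+    /// toggled independently (`enabled` vs. `semaphore_enabled`).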
+    #[serde(default = "default_true")]
+    pub semaphore_enabled: bool,
+}
+
+fn default_max_concurrent_requests() -> usize {
+    10
+}
+
+fn default_requests_per_minute() -> usize {
+    500
+}
+
+fn default_true() -> bool {
+    true
+}
+
+impl Default for ConcurrencyConfig {
+    fn default() -> Self {
+        Self {
+            max_concurrent_requests: default_max_concurrent_requests(),
+            requests_per_minute: default_requests_per_minute(),
+            enabled: default_true(),
+            semaphore_enabled: default_true(),
+        }
+    }
+}
+
+impl ConcurrencyConfig {
+    /// Create a new config with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the maximum concurrent requests.
+    pub fn with_max_concurrent_requests(mut self, max: usize) -> Self {
+        self.max_concurrent_requests = max;
+        self
+    }
+
+    /// Set the requests per minute rate limit.
+    pub fn with_requests_per_minute(mut self, rpm: usize) -> Self {
+        self.requests_per_minute = rpm;
+        self
+    }
+
+    /// Enable or disable rate limiting.
+    pub fn with_enabled(mut self, enabled: bool) -> Self {
+        self.enabled = enabled;
+        self
+    }
+
+    /// Enable or disable semaphore.
+    pub fn with_semaphore_enabled(mut self, enabled: bool) -> Self {
+        self.semaphore_enabled = enabled;
+        self
+    }
+
+    /// Convert to the runtime concurrency config.
+    pub fn to_runtime_config(&self) -> crate::throttle::ConcurrencyConfig {
+        crate::throttle::ConcurrencyConfig {
+            max_concurrent_requests: self.max_concurrent_requests,
+            requests_per_minute: self.requests_per_minute,
+            enabled: self.enabled,
+            semaphore_enabled: self.semaphore_enabled,
+        }
+    }
+}
+
+impl From<ConcurrencyConfig> for crate::throttle::ConcurrencyConfig {
+    fn from(config: ConcurrencyConfig) -> Self {
+        config.to_runtime_config()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_concurrency_config_defaults() {
+        let config = ConcurrencyConfig::default();
+        assert_eq!(config.max_concurrent_requests, 10);
+        assert_eq!(config.requests_per_minute, 500);
+        assert!(config.enabled);
+        assert!(config.semaphore_enabled);
+    }
+
+    #[test]
+    fn test_concurrency_config_builder() {
+        let config = ConcurrencyConfig::new()
+            .with_max_concurrent_requests(20)
+            .with_requests_per_minute(1000)
+            .with_enabled(false);
+
+        assert_eq!(config.max_concurrent_requests, 20);
+        assert_eq!(config.requests_per_minute, 1000);
+        assert!(!config.enabled);
+    }
+}
diff --git a/src/config/types/content.rs b/src/config/types/content.rs
new file mode 100644
index 00000000..62741cd7
--- /dev/null
+++ b/src/config/types/content.rs
@@ -0,0 +1,222 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Content aggregator configuration types.
+
+use serde::{Deserialize, Serialize};
+
+/// Content aggregator configuration.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ContentAggregatorConfig {
+    /// Whether content aggregator is enabled.
+    /// When disabled, uses simple content collection (legacy behavior).
+    #[serde(default = "default_true")]
+    pub enabled: bool,
+
+    /// Maximum tokens for aggregated content.
+    #[serde(default = "default_token_budget")]
+    pub token_budget: usize,
+
+    /// Minimum relevance score threshold (0.0 - 1.0).
+    /// Content below this threshold will be filtered out.
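+    /// e.g. with the default of `0.2`, a node scoring `0.15` is dropped.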
+    #[serde(default = "default_min_relevance_score")]
+    pub min_relevance_score: f32,
+
+    /// Scoring strategy: "keyword_only" | "keyword_bm25" | "hybrid"
+    #[serde(default = "default_scoring_strategy")]
+    pub scoring_strategy: String,
+
+    /// Output format: "markdown" | "json" | "tree" | "flat"
+    #[serde(default = "default_output_format")]
+    pub output_format: String,
+
+    /// Include relevance scores in output.
+    #[serde(default)]
+    pub include_scores: bool,
+
+    /// Minimum budget allocation per depth level (0.0 - 1.0).
+    /// Ensures each tree level gets representation.
+    #[serde(default = "default_hierarchical_min_per_level")]
+    pub hierarchical_min_per_level: f32,
+
+    /// Enable content deduplication.
+    #[serde(default = "default_true")]
+    pub deduplicate: bool,
+
+    /// Similarity threshold for deduplication (0.0 - 1.0).
+    /// Higher = more aggressive deduplication.
+    #[serde(default = "default_dedup_threshold")]
+    pub dedup_threshold: f32,
+}
+
+fn default_true() -> bool {
+    true
+}
+
+fn default_token_budget() -> usize {
+    4000
+}
+
+fn default_min_relevance_score() -> f32 {
+    0.2
+}
+
+fn default_scoring_strategy() -> String {
+    "keyword_bm25".to_string()
+}
+
+fn default_output_format() -> String {
+    "markdown".to_string()
+}
+
+fn default_hierarchical_min_per_level() -> f32 {
+    0.1
+}
+
+fn default_dedup_threshold() -> f32 {
+    0.9
+}
+
+impl Default for ContentAggregatorConfig {
+    fn default() -> Self {
+        Self {
+            enabled: default_true(),
+            token_budget: default_token_budget(),
+            min_relevance_score: default_min_relevance_score(),
+            scoring_strategy: default_scoring_strategy(),
+            output_format: default_output_format(),
+            include_scores: false,
+            hierarchical_min_per_level: default_hierarchical_min_per_level(),
+            deduplicate: default_true(),
+            dedup_threshold: default_dedup_threshold(),
+        }
+    }
+}
+
+impl ContentAggregatorConfig {
+    /// Create a new config with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Disable content aggregator (use legacy behavior).
+    pub fn disabled() -> Self {
+        Self {
+            enabled: false,
+            ..Self::default()
+        }
+    }
+
+    /// Set the token budget.
+    pub fn with_token_budget(mut self, budget: usize) -> Self {
+        self.token_budget = budget;
+        self
+    }
+
+    /// Set the minimum relevance score.
+    pub fn with_min_relevance(mut self, score: f32) -> Self {
+        self.min_relevance_score = score.clamp(0.0, 1.0);
+        self
+    }
+
+    /// Set the scoring strategy.
+    pub fn with_scoring_strategy(mut self, strategy: impl Into<String>) -> Self {
+        self.scoring_strategy = strategy.into();
+        self
+    }
+
+    /// Set the output format.
+    pub fn with_output_format(mut self, format: impl Into<String>) -> Self {
+        self.output_format = format.into();
+        self
+    }
+
+    /// Enable/disable score inclusion.
+    pub fn with_include_scores(mut self, include: bool) -> Self {
+        self.include_scores = include;
+        self
+    }
+
+    /// Enable/disable deduplication.
+    pub fn with_deduplicate(mut self, dedupe: bool) -> Self {
+        self.deduplicate = dedupe;
+        self
+    }
+
+    /// Convert to the retrieval content aggregator config.
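+    ///
+    /// Unrecognized `scoring_strategy` or `output_format` strings fall back
+    /// to `KeywordWithBM25` and `Markdown` rather than returning an error.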
+ pub fn to_aggregator_config(&self) -> crate::retrieval::content::ContentAggregatorConfig { + use crate::retrieval::content::{ + ContentAggregatorConfig as RetrievalContentConfig, OutputFormatConfig, + ScoringStrategyConfig, + }; + + let scoring_strategy = match self.scoring_strategy.as_str() { + "keyword_only" => ScoringStrategyConfig::KeywordOnly, + "hybrid" => ScoringStrategyConfig::Hybrid, + _ => ScoringStrategyConfig::KeywordWithBM25, + }; + + let output_format = match self.output_format.as_str() { + "json" => OutputFormatConfig::Json, + "tree" => OutputFormatConfig::Tree, + "flat" => OutputFormatConfig::Flat, + _ => OutputFormatConfig::Markdown, + }; + + RetrievalContentConfig { + token_budget: self.token_budget, + min_relevance_score: self.min_relevance_score, + scoring_strategy, + output_format, + include_scores: self.include_scores, + hierarchical_min_per_level: self.hierarchical_min_per_level, + deduplicate: self.deduplicate, + dedup_threshold: self.dedup_threshold, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_content_aggregator_config_defaults() { + let config = ContentAggregatorConfig::default(); + assert!(config.enabled); + assert_eq!(config.token_budget, 4000); + assert_eq!(config.min_relevance_score, 0.2); + assert_eq!(config.scoring_strategy, "keyword_bm25"); + assert_eq!(config.output_format, "markdown"); + assert!(config.deduplicate); + } + + #[test] + fn test_content_aggregator_config_disabled() { + let config = ContentAggregatorConfig::disabled(); + assert!(!config.enabled); + } + + #[test] + fn test_content_aggregator_config_builder() { + let config = ContentAggregatorConfig::new() + .with_token_budget(8000) + .with_min_relevance(0.5) + .with_scoring_strategy("hybrid") + .with_output_format("json"); + + assert_eq!(config.token_budget, 8000); + assert_eq!(config.min_relevance_score, 0.5); + assert_eq!(config.scoring_strategy, "hybrid"); + assert_eq!(config.output_format, "json"); + } + + #[test] + fn test_min_relevance_clamping() { + let config = ContentAggregatorConfig::new().with_min_relevance(1.5); + assert_eq!(config.min_relevance_score, 1.0); + + let config = ContentAggregatorConfig::new().with_min_relevance(-0.5); + assert_eq!(config.min_relevance_score, 0.0); + } +} diff --git a/src/config/types/fallback.rs b/src/config/types/fallback.rs new file mode 100644 index 00000000..fa199b30 --- /dev/null +++ b/src/config/types/fallback.rs @@ -0,0 +1,233 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Fallback and error recovery configuration types. + +use serde::{Deserialize, Serialize}; + +/// Fallback behavior when encountering errors. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum FallbackBehavior { + /// Only retry with the same model/endpoint. + Retry, + /// Immediately switch to fallback model/endpoint. + Fallback, + /// Retry first, then fallback if still failing. + RetryThenFallback, + /// Fail immediately without retry or fallback. + Fail, +} + +impl Default for FallbackBehavior { + fn default() -> Self { + Self::RetryThenFallback + } +} + +/// Behavior when all fallback attempts fail. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum OnAllFailedBehavior { + /// Return the error to the caller. + ReturnError, + /// Try to return cached result if available. 
+    ReturnCache,
+}
+
+impl Default for OnAllFailedBehavior {
+    fn default() -> Self {
+        Self::ReturnError
+    }
+}
+
+/// Fallback configuration for error recovery.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct FallbackConfig {
+    /// Whether fallback is enabled.
+    #[serde(default = "default_true")]
+    pub enabled: bool,
+
+    /// Fallback models in priority order.
+    #[serde(default = "default_fallback_models")]
+    pub models: Vec<String>,
+
+    /// Fallback endpoints in priority order.
+    #[serde(default)]
+    pub endpoints: Vec<String>,
+
+    /// Behavior on rate limit error (429).
+    #[serde(default)]
+    pub on_rate_limit: FallbackBehavior,
+
+    /// Behavior on timeout error.
+    #[serde(default)]
+    pub on_timeout: FallbackBehavior,
+
+    /// Behavior when all attempts fail.
+    #[serde(default)]
+    pub on_all_failed: OnAllFailedBehavior,
+
+    /// Maximum retry attempts.
+    #[serde(default = "default_max_retries")]
+    pub max_retries: usize,
+
+    /// Initial retry delay in milliseconds.
+    #[serde(default = "default_initial_retry_delay_ms")]
+    pub initial_retry_delay_ms: u64,
+
+    /// Maximum retry delay in milliseconds.
+    #[serde(default = "default_max_retry_delay_ms")]
+    pub max_retry_delay_ms: u64,
+
+    /// Retry delay multiplier (exponential backoff).
+    #[serde(default = "default_retry_multiplier")]
+    pub retry_multiplier: f32,
+}
+
+fn default_fallback_models() -> Vec<String> {
+    vec!["gpt-4o-mini".to_string(), "glm-4-flash".to_string()]
+}
+
+fn default_max_retries() -> usize {
+    3
+}
+
+fn default_initial_retry_delay_ms() -> u64 {
+    1000
+}
+
+fn default_max_retry_delay_ms() -> u64 {
+    30000
+}
+
+fn default_retry_multiplier() -> f32 {
+    2.0
+}
+
+impl Default for FallbackConfig {
+    fn default() -> Self {
+        Self {
+            enabled: default_true(),
+            models: default_fallback_models(),
+            endpoints: Vec::new(),
+            on_rate_limit: FallbackBehavior::default(),
+            on_timeout: FallbackBehavior::default(),
+            on_all_failed: OnAllFailedBehavior::default(),
+            max_retries: default_max_retries(),
+            initial_retry_delay_ms: default_initial_retry_delay_ms(),
+            max_retry_delay_ms: default_max_retry_delay_ms(),
+            retry_multiplier: default_retry_multiplier(),
+        }
+    }
+}
+
+fn default_true() -> bool {
+    true
+}
+
+impl FallbackConfig {
+    /// Create a new fallback config with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Disable fallback entirely.
+    pub fn disabled() -> Self {
+        Self {
+            enabled: false,
+            ..Self::default()
+        }
+    }
+
+    /// Set fallback models.
+    pub fn with_models(mut self, models: Vec<String>) -> Self {
+        self.models = models;
+        self
+    }
+
+    /// Set fallback endpoints.
+    pub fn with_endpoints(mut self, endpoints: Vec<String>) -> Self {
+        self.endpoints = endpoints;
+        self
+    }
+
+    /// Set behavior on rate limit.
+    pub fn with_on_rate_limit(mut self, behavior: FallbackBehavior) -> Self {
+        self.on_rate_limit = behavior;
+        self
+    }
+
+    /// Set behavior on timeout.
+    pub fn with_on_timeout(mut self, behavior: FallbackBehavior) -> Self {
+        self.on_timeout = behavior;
+        self
+    }
+
+    /// Set behavior when all attempts fail.
+    pub fn with_on_all_failed(mut self, behavior: OnAllFailedBehavior) -> Self {
+        self.on_all_failed = behavior;
+        self
+    }
+
+    /// Set maximum retries.
+    pub fn with_max_retries(mut self, max: usize) -> Self {
+        self.max_retries = max;
+        self
+    }
+
+    /// Calculate retry delay with exponential backoff.
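+    ///
+    /// With the defaults (1000 ms initial delay, 2.0 multiplier, 30 s cap)
+    /// this yields 1 s, 2 s, 4 s, 8 s, ... For example:
+    ///
+    /// ```rust
+    /// use vectorless::config::FallbackConfig;
+    ///
+    /// let config = FallbackConfig::default();
+    /// assert_eq!(config.calculate_retry_delay(0).as_millis(), 1000);
+    /// assert_eq!(config.calculate_retry_delay(2).as_millis(), 4000);
+    /// ```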
+ pub fn calculate_retry_delay(&self, attempt: usize) -> std::time::Duration { + let delay_ms = if attempt == 0 { + self.initial_retry_delay_ms + } else { + let delay = self.initial_retry_delay_ms as f32 + * self.retry_multiplier.powi(attempt as i32); + delay.min(self.max_retry_delay_ms as f32) as u64 + }; + std::time::Duration::from_millis(delay_ms) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_fallback_config_defaults() { + let config = FallbackConfig::default(); + assert!(config.enabled); + assert_eq!(config.models.len(), 2); + assert_eq!(config.on_rate_limit, FallbackBehavior::RetryThenFallback); + assert_eq!(config.max_retries, 3); + } + + #[test] + fn test_fallback_config_disabled() { + let config = FallbackConfig::disabled(); + assert!(!config.enabled); + } + + #[test] + fn test_fallback_behavior_serde() { + let behavior = FallbackBehavior::RetryThenFallback; + let json = serde_json::to_string(&behavior).unwrap(); + assert_eq!(json, "\"retry_then_fallback\""); + + let decoded: FallbackBehavior = serde_json::from_str(&json).unwrap(); + assert_eq!(decoded, behavior); + } + + #[test] + fn test_retry_delay_calculation() { + let config = FallbackConfig::default(); + + let d0 = config.calculate_retry_delay(0); + let d1 = config.calculate_retry_delay(1); + let d2 = config.calculate_retry_delay(2); + + assert_eq!(d0.as_millis(), 1000); + assert_eq!(d1.as_millis(), 2000); + assert_eq!(d2.as_millis(), 4000); + } +} diff --git a/src/config/types/indexer.rs b/src/config/types/indexer.rs new file mode 100644 index 00000000..6353122a --- /dev/null +++ b/src/config/types/indexer.rs @@ -0,0 +1,108 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Indexer configuration types. + +use serde::{Deserialize, Serialize}; + +/// Indexer configuration. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct IndexerConfig { + /// Word count threshold for splitting sections into subsections. + #[serde(default = "default_subsection_threshold")] + pub subsection_threshold: usize, + + /// Maximum tokens to send in a single segmentation request. + #[serde(default = "default_max_segment_tokens")] + pub max_segment_tokens: usize, + + /// Maximum tokens for each summary. + #[serde(default = "default_max_summary_tokens")] + pub max_summary_tokens: usize, + + /// Minimum content tokens required to generate a summary. + #[serde(default = "default_min_summary_tokens")] + pub min_summary_tokens: usize, +} + +fn default_subsection_threshold() -> usize { + 300 +} + +fn default_max_segment_tokens() -> usize { + 3000 +} + +fn default_max_summary_tokens() -> usize { + 200 +} + +fn default_min_summary_tokens() -> usize { + 20 +} + +impl Default for IndexerConfig { + fn default() -> Self { + Self { + subsection_threshold: default_subsection_threshold(), + max_segment_tokens: default_max_segment_tokens(), + max_summary_tokens: default_max_summary_tokens(), + min_summary_tokens: default_min_summary_tokens(), + } + } +} + +impl IndexerConfig { + /// Create a new indexer config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Set the subsection threshold. + pub fn with_subsection_threshold(mut self, threshold: usize) -> Self { + self.subsection_threshold = threshold; + self + } + + /// Set the maximum segment tokens. + pub fn with_max_segment_tokens(mut self, tokens: usize) -> Self { + self.max_segment_tokens = tokens; + self + } + + /// Set the maximum summary tokens. 
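+    ///
+    /// Illustrative builder usage (the threshold and token values are
+    /// arbitrary, not recommendations):
+    ///
+    /// ```ignore
+    /// let config = IndexerConfig::new()
+    ///     .with_subsection_threshold(500)
+    ///     .with_max_summary_tokens(300);
+    /// ```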
+    pub fn with_max_summary_tokens(mut self, tokens: usize) -> Self {
+        self.max_summary_tokens = tokens;
+        self
+    }
+
+    /// Set the minimum summary tokens.
+    pub fn with_min_summary_tokens(mut self, tokens: usize) -> Self {
+        self.min_summary_tokens = tokens;
+        self
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_indexer_config_defaults() {
+        let config = IndexerConfig::default();
+        assert_eq!(config.subsection_threshold, 300);
+        assert_eq!(config.max_segment_tokens, 3000);
+        assert_eq!(config.max_summary_tokens, 200);
+        assert_eq!(config.min_summary_tokens, 20);
+    }
+
+    #[test]
+    fn test_indexer_config_builder() {
+        let config = IndexerConfig::new()
+            .with_subsection_threshold(500)
+            .with_max_summary_tokens(300);
+
+        assert_eq!(config.subsection_threshold, 500);
+        assert_eq!(config.max_summary_tokens, 300);
+    }
+}
diff --git a/src/config/types/llm.rs b/src/config/types/llm.rs
new file mode 100644
index 00000000..a98ee7d3
--- /dev/null
+++ b/src/config/types/llm.rs
@@ -0,0 +1,218 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! LLM configuration types for summary and retrieval.
+
+use serde::{Deserialize, Serialize};
+
+/// Generic LLM configuration.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LlmConfig {
+    /// Model name (e.g., "gpt-4o-mini", "claude-3-haiku").
+    #[serde(default = "default_model")]
+    pub model: String,
+
+    /// API endpoint.
+    #[serde(default = "default_endpoint")]
+    pub endpoint: String,
+
+    /// API key.
+    #[serde(default)]
+    pub api_key: Option<String>,
+
+    /// Maximum tokens for responses.
+    #[serde(default = "default_max_tokens")]
+    pub max_tokens: usize,
+
+    /// Temperature for generation.
+    #[serde(default = "default_temperature")]
+    pub temperature: f32,
+}
+
+fn default_model() -> String {
+    "gpt-4o-mini".to_string()
+}
+
+fn default_endpoint() -> String {
+    "https://api.openai.com/v1".to_string()
+}
+
+fn default_max_tokens() -> usize {
+    1000
+}
+
+fn default_temperature() -> f32 {
+    0.0
+}
+
+impl Default for LlmConfig {
+    fn default() -> Self {
+        Self {
+            model: default_model(),
+            endpoint: default_endpoint(),
+            api_key: None,
+            max_tokens: default_max_tokens(),
+            temperature: default_temperature(),
+        }
+    }
+}
+
+impl LlmConfig {
+    /// Create a new LLM config with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the model.
+    pub fn with_model(mut self, model: impl Into<String>) -> Self {
+        self.model = model.into();
+        self
+    }
+
+    /// Set the endpoint.
+    pub fn with_endpoint(mut self, endpoint: impl Into<String>) -> Self {
+        self.endpoint = endpoint.into();
+        self
+    }
+
+    /// Set the API key.
+    pub fn with_api_key(mut self, api_key: impl Into<String>) -> Self {
+        self.api_key = Some(api_key.into());
+        self
+    }
+
+    /// Set the maximum tokens.
+    pub fn with_max_tokens(mut self, max_tokens: usize) -> Self {
+        self.max_tokens = max_tokens;
+        self
+    }
+
+    /// Set the temperature.
+    pub fn with_temperature(mut self, temperature: f32) -> Self {
+        self.temperature = temperature;
+        self
+    }
+
+    /// Get the API key from config.
+    pub fn get_api_key(&self) -> Option<&str> {
+        self.api_key.as_deref()
+    }
+}
+
+/// Summary model configuration.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SummaryConfig {
+    /// Model name for summarization.
+    #[serde(default = "default_summary_model")]
+    pub model: String,
+
+    /// API endpoint for summary model.
+    #[serde(default = "default_endpoint")]
+    pub endpoint: String,
+
+    /// API key.
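+    ///
+    /// If unset, summary generation is effectively disabled; the
+    /// dependency validator reports this as an info-level finding.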
+    #[serde(default)]
+    pub api_key: Option<String>,
+
+    /// Maximum tokens for summary generation.
+    #[serde(default = "default_max_summary_tokens")]
+    pub max_tokens: usize,
+
+    /// Temperature for summary generation.
+    #[serde(default = "default_temperature")]
+    pub temperature: f32,
+}
+
+fn default_summary_model() -> String {
+    "gpt-4o-mini".to_string()
+}
+
+fn default_max_summary_tokens() -> usize {
+    200
+}
+
+impl Default for SummaryConfig {
+    fn default() -> Self {
+        Self {
+            model: default_summary_model(),
+            endpoint: default_endpoint(),
+            api_key: None,
+            max_tokens: default_max_summary_tokens(),
+            temperature: default_temperature(),
+        }
+    }
+}
+
+impl SummaryConfig {
+    /// Create a new summary config with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the model.
+    pub fn with_model(mut self, model: impl Into<String>) -> Self {
+        self.model = model.into();
+        self
+    }
+
+    /// Set the endpoint.
+    pub fn with_endpoint(mut self, endpoint: impl Into<String>) -> Self {
+        self.endpoint = endpoint.into();
+        self
+    }
+
+    /// Set the API key.
+    pub fn with_api_key(mut self, api_key: impl Into<String>) -> Self {
+        self.api_key = Some(api_key.into());
+        self
+    }
+
+    /// Set the maximum tokens.
+    pub fn with_max_tokens(mut self, max_tokens: usize) -> Self {
+        self.max_tokens = max_tokens;
+        self
+    }
+
+    /// Convert to generic LLM config.
+    pub fn to_llm_config(&self) -> LlmConfig {
+        LlmConfig {
+            model: self.model.clone(),
+            endpoint: self.endpoint.clone(),
+            api_key: self.api_key.clone(),
+            max_tokens: self.max_tokens,
+            temperature: self.temperature,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_llm_config_defaults() {
+        let config = LlmConfig::default();
+        assert_eq!(config.model, "gpt-4o-mini");
+        assert_eq!(config.endpoint, "https://api.openai.com/v1");
+        assert!(config.api_key.is_none());
+    }
+
+    #[test]
+    fn test_llm_config_builder() {
+        let config = LlmConfig::new()
+            .with_model("gpt-4o")
+            .with_api_key("test-key")
+            .with_max_tokens(2000);
+
+        assert_eq!(config.model, "gpt-4o");
+        assert_eq!(config.api_key, Some("test-key".to_string()));
+        assert_eq!(config.max_tokens, 2000);
+    }
+
+    #[test]
+    fn test_summary_config() {
+        let config = SummaryConfig::default();
+        assert_eq!(config.model, "gpt-4o-mini");
+        assert_eq!(config.max_tokens, 200);
+    }
+}
diff --git a/src/config/types/mod.rs b/src/config/types/mod.rs
new file mode 100644
index 00000000..a824ee3f
--- /dev/null
+++ b/src/config/types/mod.rs
@@ -0,0 +1,336 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Configuration type definitions.
+//!
+//! All configuration values are defined inline in `Default` trait implementations.
+//! Configuration is loaded from TOML files only - no environment variable magic.
+
+mod content;
+mod concurrency;
+mod fallback;
+mod indexer;
+mod llm;
+mod retrieval;
+mod storage;
+
+use serde::{Deserialize, Serialize};
+
+pub use content::ContentAggregatorConfig;
+pub use concurrency::ConcurrencyConfig;
+pub use fallback::{FallbackBehavior, FallbackConfig, OnAllFailedBehavior};
+pub use indexer::IndexerConfig;
+pub use llm::{LlmConfig, SummaryConfig};
+pub use retrieval::{RetrievalConfig, SearchConfig};
+pub use storage::{CacheConfig, StorageConfig, StrategyConfig, SufficiencyConfig};
+
+/// Main configuration for vectorless.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Config {
+    /// Indexer configuration.
+    #[serde(default)]
+    pub indexer: IndexerConfig,
+
+    /// Summary model configuration.
+ #[serde(default)] + pub summary: SummaryConfig, + + /// Retrieval model configuration. + #[serde(default)] + pub retrieval: RetrievalConfig, + + /// Storage configuration. + #[serde(default)] + pub storage: StorageConfig, + + /// Concurrency control configuration. + #[serde(default)] + pub concurrency: ConcurrencyConfig, + + /// Fallback/error recovery configuration. + #[serde(default)] + pub fallback: FallbackConfig, +} + +impl Default for Config { + fn default() -> Self { + Self { + indexer: IndexerConfig::default(), + summary: SummaryConfig::default(), + retrieval: RetrievalConfig::default(), + storage: StorageConfig::default(), + concurrency: ConcurrencyConfig::default(), + fallback: FallbackConfig::default(), + } + } +} + +impl Config { + /// Create a new configuration with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Set the indexer configuration. + pub fn with_indexer(mut self, indexer: IndexerConfig) -> Self { + self.indexer = indexer; + self + } + + /// Set the summary configuration. + pub fn with_summary(mut self, summary: SummaryConfig) -> Self { + self.summary = summary; + self + } + + /// Set the retrieval configuration. + pub fn with_retrieval(mut self, retrieval: RetrievalConfig) -> Self { + self.retrieval = retrieval; + self + } + + /// Set the storage configuration. + pub fn with_storage(mut self, storage: StorageConfig) -> Self { + self.storage = storage; + self + } + + /// Set the concurrency configuration. + pub fn with_concurrency(mut self, concurrency: ConcurrencyConfig) -> Self { + self.concurrency = concurrency; + self + } + + /// Set the fallback configuration. + pub fn with_fallback(mut self, fallback: FallbackConfig) -> Self { + self.fallback = fallback; + self + } + + /// Validate the configuration. 
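+    ///
+    /// A minimal sketch of the intended call pattern:
+    ///
+    /// ```ignore
+    /// let config = Config::new()
+    ///     .with_retrieval(RetrievalConfig::new().with_top_k(5));
+    /// config.validate().expect("configuration should be valid");
+    /// ```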
+    pub fn validate(&self) -> Result<(), ConfigValidationError> {
+        let mut errors = Vec::new();
+
+        // Validate indexer
+        if self.indexer.subsection_threshold == 0 {
+            errors.push(ValidationError::error(
+                "indexer.subsection_threshold",
+                "Subsection threshold must be greater than 0",
+            ));
+        }
+
+        // Validate summary
+        if self.summary.max_tokens == 0 {
+            errors.push(ValidationError::error(
+                "summary.max_tokens",
+                "Summary max tokens must be greater than 0",
+            ));
+        }
+
+        // Validate retrieval
+        if self.retrieval.top_k == 0 {
+            errors.push(ValidationError::error(
+                "retrieval.top_k",
+                "Top K must be greater than 0",
+            ));
+        }
+
+        if self.retrieval.temperature < 0.0 || self.retrieval.temperature > 2.0 {
+            errors.push(ValidationError::warning(
+                "retrieval.temperature",
+                "Temperature outside typical range [0.0, 2.0]",
+            ).with_actual(self.retrieval.temperature.to_string()));
+        }
+
+        // Validate content aggregator
+        if self.retrieval.content.token_budget == 0 {
+            errors.push(ValidationError::error(
+                "retrieval.content.token_budget",
+                "Token budget must be greater than 0",
+            ));
+        }
+
+        if self.retrieval.content.min_relevance_score < 0.0
+            || self.retrieval.content.min_relevance_score > 1.0
+        {
+            errors.push(ValidationError::error(
+                "retrieval.content.min_relevance_score",
+                "Min relevance score must be between 0.0 and 1.0",
+            )
+            .with_expected("0.0 - 1.0")
+            .with_actual(self.retrieval.content.min_relevance_score.to_string()));
+        }
+
+        // Validate concurrency
+        if self.concurrency.max_concurrent_requests == 0 {
+            errors.push(ValidationError::error(
+                "concurrency.max_concurrent_requests",
+                "Max concurrent requests must be greater than 0",
+            ));
+        }
+
+        // Validate fallback
+        if self.fallback.enabled && self.fallback.models.is_empty() {
+            errors.push(ValidationError::warning(
+                "fallback.models",
+                "Fallback enabled but no fallback models configured",
+            ));
+        }
+
+        if errors.is_empty() {
+            Ok(())
+        } else {
+            Err(ConfigValidationError { errors })
+        }
+    }
+}
+
+/// Configuration validation error.
+#[derive(Debug, Clone, thiserror::Error)]
+#[error("Configuration validation failed with {} error(s)", self.errors.len())]
+pub struct ConfigValidationError {
+    /// Validation errors.
+    pub errors: Vec<ValidationError>,
+}
+
+/// A single validation error.
+#[derive(Debug, Clone)]
+pub struct ValidationError {
+    /// Field path (e.g., "retrieval.content.token_budget").
+    pub path: String,
+
+    /// Error message.
+    pub message: String,
+
+    /// Expected value/range.
+    pub expected: Option<String>,
+
+    /// Actual value.
+    pub actual: Option<String>,
+
+    /// Severity level.
+    pub severity: Severity,
+}
+
+impl ValidationError {
+    /// Create an error-level validation error.
+    pub fn error(path: impl Into<String>, message: impl Into<String>) -> Self {
+        Self {
+            path: path.into(),
+            message: message.into(),
+            expected: None,
+            actual: None,
+            severity: Severity::Error,
+        }
+    }
+
+    /// Create a warning-level validation error.
+    pub fn warning(path: impl Into<String>, message: impl Into<String>) -> Self {
+        Self {
+            path: path.into(),
+            message: message.into(),
+            expected: None,
+            actual: None,
+            severity: Severity::Warning,
+        }
+    }
+
+    /// Create an info-level validation error.
+    pub fn info(path: impl Into<String>, message: impl Into<String>) -> Self {
+        Self {
+            path: path.into(),
+            message: message.into(),
+            expected: None,
+            actual: None,
+            severity: Severity::Info,
+        }
+    }
+
+    /// Set the expected value.
+    pub fn with_expected(mut self, expected: impl Into<String>) -> Self {
+        self.expected = Some(expected.into());
+        self
+    }
+
+    /// Set the actual value.
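+    ///
+    /// Typically chained onto a constructor, mirroring the display test below:
+    ///
+    /// ```ignore
+    /// let err = ValidationError::error("retrieval.top_k", "Top K must be greater than 0")
+    ///     .with_expected(">= 1")
+    ///     .with_actual("0");
+    /// ```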
+    pub fn with_actual(mut self, actual: impl Into<String>) -> Self {
+        self.actual = Some(actual.into());
+        self
+    }
+}
+
+impl std::fmt::Display for ValidationError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let severity = match self.severity {
+            Severity::Error => "ERROR",
+            Severity::Warning => "WARNING",
+            Severity::Info => "INFO",
+        };
+        write!(f, "[{}] {}: {}", severity, self.path, self.message)?;
+        if let Some(ref expected) = self.expected {
+            write!(f, " (expected: {})", expected)?;
+        }
+        if let Some(ref actual) = self.actual {
+            write!(f, " (actual: {})", actual)?;
+        }
+        Ok(())
+    }
+}
+
+/// Validation severity level.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Severity {
+    /// Error - must fix.
+    Error,
+    /// Warning - should fix.
+    Warning,
+    /// Info - suggestion.
+    Info,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_config_defaults() {
+        let config = Config::default();
+        assert_eq!(config.indexer.subsection_threshold, 300);
+        assert_eq!(config.summary.model, "gpt-4o-mini");
+        assert_eq!(config.retrieval.model, "gpt-4o");
+        assert_eq!(config.concurrency.max_concurrent_requests, 10);
+    }
+
+    #[test]
+    fn test_config_validation_success() {
+        let config = Config::default();
+        assert!(config.validate().is_ok());
+    }
+
+    #[test]
+    fn test_config_validation_errors() {
+        let mut config = Config::default();
+        config.retrieval.content.token_budget = 0;
+        config.retrieval.content.min_relevance_score = 1.5;
+
+        let result = config.validate();
+        assert!(result.is_err());
+
+        let err = result.unwrap_err();
+        assert!(!err.errors.is_empty());
+    }
+
+    #[test]
+    fn test_validation_error_display() {
+        let err = ValidationError::error("test.field", "Invalid value")
+            .with_expected(">= 1")
+            .with_actual("0");
+
+        let display = format!("{}", err);
+        assert!(display.contains("ERROR"));
+        assert!(display.contains("test.field"));
+        assert!(display.contains("expected"));
+    }
+}
diff --git a/src/config/types/retrieval.rs b/src/config/types/retrieval.rs
new file mode 100644
index 00000000..d111b686
--- /dev/null
+++ b/src/config/types/retrieval.rs
@@ -0,0 +1,219 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Retrieval configuration types.
+
+use serde::{Deserialize, Serialize};
+
+use super::content::ContentAggregatorConfig;
+use super::storage::{CacheConfig, StrategyConfig, SufficiencyConfig};
+
+/// Retrieval model configuration (for navigation).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct RetrievalConfig {
+    /// Model name for retrieval/navigation.
+    #[serde(default = "default_retrieval_model")]
+    pub model: String,
+
+    /// API endpoint for retrieval model.
+    #[serde(default = "default_endpoint")]
+    pub endpoint: String,
+
+    /// API key.
+    #[serde(default)]
+    pub api_key: Option<String>,
+
+    /// Maximum tokens for retrieval context.
+    #[serde(default = "default_max_retrieval_tokens")]
+    pub max_tokens: usize,
+
+    /// Temperature for retrieval.
+    #[serde(default = "default_temperature")]
+    pub temperature: f32,
+
+    /// Number of top-k results to return.
+    #[serde(default = "default_top_k")]
+    pub top_k: usize,
+
+    /// Search algorithm configuration.
+    #[serde(default)]
+    pub search: SearchConfig,
+
+    /// Sufficiency checker configuration.
+    #[serde(default)]
+    pub sufficiency: SufficiencyConfig,
+
+    /// Cache configuration.
+    #[serde(default)]
+    pub cache: CacheConfig,
+
+    /// Strategy-specific configuration.
+    #[serde(default)]
+    pub strategy: StrategyConfig,
+
+    /// Content aggregator configuration.
+    #[serde(default)]
+    pub content: ContentAggregatorConfig,
+}
+
+fn default_retrieval_model() -> String {
+    "gpt-4o".to_string()
+}
+
+fn default_endpoint() -> String {
+    "https://api.openai.com/v1".to_string()
+}
+
+fn default_max_retrieval_tokens() -> usize {
+    1000
+}
+
+fn default_temperature() -> f32 {
+    0.0
+}
+
+fn default_top_k() -> usize {
+    3
+}
+
+impl Default for RetrievalConfig {
+    fn default() -> Self {
+        Self {
+            model: default_retrieval_model(),
+            endpoint: default_endpoint(),
+            api_key: None,
+            max_tokens: default_max_retrieval_tokens(),
+            temperature: default_temperature(),
+            top_k: default_top_k(),
+            search: SearchConfig::default(),
+            sufficiency: SufficiencyConfig::default(),
+            cache: CacheConfig::default(),
+            strategy: StrategyConfig::default(),
+            content: ContentAggregatorConfig::default(),
+        }
+    }
+}
+
+impl RetrievalConfig {
+    /// Create a new retrieval config with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the model.
+    pub fn with_model(mut self, model: impl Into<String>) -> Self {
+        self.model = model.into();
+        self
+    }
+
+    /// Set the endpoint.
+    pub fn with_endpoint(mut self, endpoint: impl Into<String>) -> Self {
+        self.endpoint = endpoint.into();
+        self
+    }
+
+    /// Set the API key.
+    pub fn with_api_key(mut self, api_key: impl Into<String>) -> Self {
+        self.api_key = Some(api_key.into());
+        self
+    }
+
+    /// Set the top_k.
+    pub fn with_top_k(mut self, top_k: usize) -> Self {
+        self.top_k = top_k;
+        self
+    }
+}
+
+/// Search algorithm configuration.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SearchConfig {
+    /// Number of top-k results to return.
+    #[serde(default = "default_search_top_k")]
+    pub top_k: usize,
+
+    /// Beam width for multi-path search.
+    #[serde(default = "default_beam_width")]
+    pub beam_width: usize,
+
+    /// Maximum iterations for search algorithms.
+    #[serde(default = "default_max_iterations")]
+    pub max_iterations: usize,
+
+    /// Minimum score to include a path.
+    #[serde(default = "default_min_score")]
+    pub min_score: f32,
+}
+
+fn default_search_top_k() -> usize {
+    5
+}
+
+fn default_beam_width() -> usize {
+    3
+}
+
+fn default_max_iterations() -> usize {
+    10
+}
+
+fn default_min_score() -> f32 {
+    0.1
+}
+
+impl Default for SearchConfig {
+    fn default() -> Self {
+        Self {
+            top_k: default_search_top_k(),
+            beam_width: default_beam_width(),
+            max_iterations: default_max_iterations(),
+            min_score: default_min_score(),
+        }
+    }
+}
+
+impl SearchConfig {
+    /// Create new search config with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the top_k.
+    pub fn with_top_k(mut self, top_k: usize) -> Self {
+        self.top_k = top_k;
+        self
+    }
+
+    /// Set the beam width.
+    pub fn with_beam_width(mut self, width: usize) -> Self {
+        self.beam_width = width;
+        self
+    }
+
+    /// Set the max iterations.
+    pub fn with_max_iterations(mut self, max: usize) -> Self {
+        self.max_iterations = max;
+        self
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_retrieval_config_defaults() {
+        let config = RetrievalConfig::default();
+        assert_eq!(config.model, "gpt-4o");
+        assert_eq!(config.top_k, 3);
+        assert_eq!(config.search.top_k, 5);
+    }
+
+    #[test]
+    fn test_search_config_defaults() {
+        let config = SearchConfig::default();
+        assert_eq!(config.top_k, 5);
+        assert_eq!(config.beam_width, 3);
+        assert_eq!(config.max_iterations, 10);
+    }
+}
diff --git a/src/config/types/storage.rs b/src/config/types/storage.rs
new file mode 100644
index 00000000..0dc55ed9
--- /dev/null
+++ b/src/config/types/storage.rs
@@ -0,0 +1,274 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Storage and sufficiency configuration types.
+
+use serde::{Deserialize, Serialize};
+use std::path::PathBuf;
+
+/// Storage configuration.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct StorageConfig {
+    /// Workspace directory for persisted documents.
+    #[serde(default = "default_workspace_dir")]
+    pub workspace_dir: PathBuf,
+}
+
+fn default_workspace_dir() -> PathBuf {
+    PathBuf::from("./workspace")
+}
+
+impl Default for StorageConfig {
+    fn default() -> Self {
+        Self {
+            workspace_dir: default_workspace_dir(),
+        }
+    }
+}
+
+impl StorageConfig {
+    /// Create new storage config with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the workspace directory.
+    pub fn with_workspace_dir(mut self, dir: impl Into<PathBuf>) -> Self {
+        self.workspace_dir = dir.into();
+        self
+    }
+}
+
+/// Sufficiency checker configuration.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SufficiencyConfig {
+    /// Minimum tokens for sufficiency.
+    #[serde(default = "default_min_tokens")]
+    pub min_tokens: usize,
+
+    /// Target tokens for full sufficiency.
+    #[serde(default = "default_target_tokens")]
+    pub target_tokens: usize,
+
+    /// Maximum tokens before stopping.
+    #[serde(default = "default_max_tokens")]
+    pub max_tokens: usize,
+
+    /// Minimum content length (characters).
+    #[serde(default = "default_min_content_length")]
+    pub min_content_length: usize,
+
+    /// Confidence threshold for LLM judge.
+    #[serde(default = "default_confidence_threshold")]
+    pub confidence_threshold: f32,
+}
+
+fn default_min_tokens() -> usize {
+    500
+}
+
+fn default_target_tokens() -> usize {
+    2000
+}
+
+fn default_max_tokens() -> usize {
+    4000
+}
+
+fn default_min_content_length() -> usize {
+    200
+}
+
+fn default_confidence_threshold() -> f32 {
+    0.7
+}
+
+impl Default for SufficiencyConfig {
+    fn default() -> Self {
+        Self {
+            min_tokens: default_min_tokens(),
+            target_tokens: default_target_tokens(),
+            max_tokens: default_max_tokens(),
+            min_content_length: default_min_content_length(),
+            confidence_threshold: default_confidence_threshold(),
+        }
+    }
+}
+
+impl SufficiencyConfig {
+    /// Create new sufficiency config with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the minimum tokens.
+    pub fn with_min_tokens(mut self, tokens: usize) -> Self {
+        self.min_tokens = tokens;
+        self
+    }
+
+    /// Set the target tokens.
+    pub fn with_target_tokens(mut self, tokens: usize) -> Self {
+        self.target_tokens = tokens;
+        self
+    }
+
+    /// Set the maximum tokens.
+    pub fn with_max_tokens(mut self, tokens: usize) -> Self {
+        self.max_tokens = tokens;
+        self
+    }
+
+    /// Set the confidence threshold.
+ pub fn with_confidence_threshold(mut self, threshold: f32) -> Self { + self.confidence_threshold = threshold; + self + } +} + +/// Cache configuration. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CacheConfig { + /// Maximum number of cache entries. + #[serde(default = "default_max_entries")] + pub max_entries: usize, + + /// Time-to-live for cache entries (seconds). + #[serde(default = "default_ttl_secs")] + pub ttl_secs: u64, +} + +fn default_max_entries() -> usize { + 1000 +} + +fn default_ttl_secs() -> u64 { + 3600 +} + +impl Default for CacheConfig { + fn default() -> Self { + Self { + max_entries: default_max_entries(), + ttl_secs: default_ttl_secs(), + } + } +} + +impl CacheConfig { + /// Create new cache config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Set the maximum entries. + pub fn with_max_entries(mut self, max: usize) -> Self { + self.max_entries = max; + self + } + + /// Set the TTL in seconds. + pub fn with_ttl_secs(mut self, secs: u64) -> Self { + self.ttl_secs = secs; + self + } +} + +/// Strategy-specific configuration. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct StrategyConfig { + /// MCTS exploration weight (sqrt(2) ≈ 1.414). + #[serde(default = "default_exploration_weight")] + pub exploration_weight: f32, + + /// Semantic similarity threshold. + #[serde(default = "default_similarity_threshold")] + pub similarity_threshold: f32, + + /// High similarity threshold for "answer" decision. + #[serde(default = "default_high_similarity_threshold")] + pub high_similarity_threshold: f32, + + /// Low similarity threshold for "explore" decision. + #[serde(default = "default_low_similarity_threshold")] + pub low_similarity_threshold: f32, +} + +fn default_exploration_weight() -> f32 { + 1.414 +} + +fn default_similarity_threshold() -> f32 { + 0.5 +} + +fn default_high_similarity_threshold() -> f32 { + 0.8 +} + +fn default_low_similarity_threshold() -> f32 { + 0.3 +} + +impl Default for StrategyConfig { + fn default() -> Self { + Self { + exploration_weight: default_exploration_weight(), + similarity_threshold: default_similarity_threshold(), + high_similarity_threshold: default_high_similarity_threshold(), + low_similarity_threshold: default_low_similarity_threshold(), + } + } +} + +impl StrategyConfig { + /// Create new strategy config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Set the exploration weight. + pub fn with_exploration_weight(mut self, weight: f32) -> Self { + self.exploration_weight = weight; + self + } + + /// Set the similarity threshold. 
+    pub fn with_similarity_threshold(mut self, threshold: f32) -> Self {
+        self.similarity_threshold = threshold;
+        self
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_storage_config_defaults() {
+        let config = StorageConfig::default();
+        assert_eq!(config.workspace_dir, PathBuf::from("./workspace"));
+    }
+
+    #[test]
+    fn test_sufficiency_config_defaults() {
+        let config = SufficiencyConfig::default();
+        assert_eq!(config.min_tokens, 500);
+        assert_eq!(config.target_tokens, 2000);
+        assert_eq!(config.max_tokens, 4000);
+    }
+
+    #[test]
+    fn test_cache_config_defaults() {
+        let config = CacheConfig::default();
+        assert_eq!(config.max_entries, 1000);
+        assert_eq!(config.ttl_secs, 3600);
+    }
+
+    #[test]
+    fn test_strategy_config_defaults() {
+        let config = StrategyConfig::default();
+        assert!((config.exploration_weight - 1.414).abs() < 0.001);
+        assert_eq!(config.similarity_threshold, 0.5);
+    }
+}
diff --git a/src/config/validator.rs b/src/config/validator.rs
new file mode 100644
index 00000000..8a3596fd
--- /dev/null
+++ b/src/config/validator.rs
@@ -0,0 +1,359 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Configuration validation.
+//!
+//! This module provides comprehensive validation for configuration values,
+//! including range checks, consistency checks, and dependency validation.
+
+use super::types::{Config, ConfigValidationError, Severity, ValidationError};
+
+/// Configuration validator.
+#[derive(Debug, Default)]
+pub struct ConfigValidator {
+    /// Validation rules to apply.
+    rules: Vec<Box<dyn ValidationRule>>,
+}
+
+impl ConfigValidator {
+    /// Create a new validator with default rules.
+    pub fn new() -> Self {
+        Self {
+            rules: vec![
+                Box::new(RangeValidator),
+                Box::new(ConsistencyValidator),
+                Box::new(DependencyValidator),
+            ],
+        }
+    }
+
+    /// Add a custom validation rule.
+    pub fn with_rule(mut self, rule: Box<dyn ValidationRule>) -> Self {
+        self.rules.push(rule);
+        self
+    }
+
+    /// Validate the configuration.
+    pub fn validate(&self, config: &Config) -> Result<(), ConfigValidationError> {
+        let mut errors = Vec::new();
+
+        for rule in &self.rules {
+            rule.validate(config, &mut errors);
+        }
+
+        // Only fail on errors, not warnings or info
+        let has_errors = errors.iter().any(|e| e.severity == Severity::Error);
+
+        if has_errors {
+            Err(ConfigValidationError { errors })
+        } else {
+            Ok(())
+        }
+    }
+}
+
+/// Trait for validation rules.
+pub trait ValidationRule: std::fmt::Debug + Send + Sync {
+    /// Validate the configuration, appending errors if found.
+    fn validate(&self, config: &Config, errors: &mut Vec<ValidationError>);
+}
+
+/// Validates value ranges.
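+///
+/// These are simple per-field checks, e.g. `top_k > 0` and
+/// `min_relevance_score` within `[0.0, 1.0]`.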
+#[derive(Debug)]
+struct RangeValidator;
+
+impl ValidationRule for RangeValidator {
+    fn validate(&self, config: &Config, errors: &mut Vec<ValidationError>) {
+        // Indexer ranges
+        if config.indexer.subsection_threshold == 0 {
+            errors.push(ValidationError::error(
+                "indexer.subsection_threshold",
+                "Subsection threshold must be greater than 0",
+            ));
+        }
+
+        if config.indexer.subsection_threshold > 10000 {
+            errors.push(ValidationError::warning(
+                "indexer.subsection_threshold",
+                "Subsection threshold is very high, may impact performance",
+            ).with_actual(config.indexer.subsection_threshold.to_string()));
+        }
+
+        // Summary ranges
+        if config.summary.max_tokens == 0 {
+            errors.push(ValidationError::error(
+                "summary.max_tokens",
+                "Summary max tokens must be greater than 0",
+            ));
+        }
+
+        if config.summary.temperature < 0.0 || config.summary.temperature > 2.0 {
+            errors.push(ValidationError::warning(
+                "summary.temperature",
+                "Temperature outside typical range [0.0, 2.0]",
+            ).with_actual(config.summary.temperature.to_string()));
+        }
+
+        // Retrieval ranges
+        if config.retrieval.top_k == 0 {
+            errors.push(ValidationError::error(
+                "retrieval.top_k",
+                "Top K must be greater than 0",
+            ));
+        }
+
+        if config.retrieval.search.beam_width == 0 {
+            errors.push(ValidationError::error(
+                "retrieval.search.beam_width",
+                "Beam width must be greater than 0",
+            ));
+        }
+
+        // Content aggregator ranges
+        if config.retrieval.content.token_budget == 0 {
+            errors.push(ValidationError::error(
+                "retrieval.content.token_budget",
+                "Token budget must be greater than 0",
+            ));
+        }
+
+        if config.retrieval.content.min_relevance_score < 0.0
+            || config.retrieval.content.min_relevance_score > 1.0
+        {
+            errors.push(ValidationError::error(
+                "retrieval.content.min_relevance_score",
+                "Min relevance score must be between 0.0 and 1.0",
+            )
+            .with_expected("0.0 - 1.0")
+            .with_actual(config.retrieval.content.min_relevance_score.to_string()));
+        }
+
+        if config.retrieval.content.hierarchical_min_per_level < 0.0
+            || config.retrieval.content.hierarchical_min_per_level > 1.0
+        {
+            errors.push(ValidationError::error(
+                "retrieval.content.hierarchical_min_per_level",
+                "Hierarchical min per level must be between 0.0 and 1.0",
+            ));
+        }
+
+        // Concurrency ranges
+        if config.concurrency.max_concurrent_requests == 0 {
+            errors.push(ValidationError::error(
+                "concurrency.max_concurrent_requests",
+                "Max concurrent requests must be greater than 0",
+            ));
+        }
+
+        if config.concurrency.requests_per_minute == 0 {
+            errors.push(ValidationError::error(
+                "concurrency.requests_per_minute",
+                "Requests per minute must be greater than 0",
+            ));
+        }
+
+        // Fallback ranges
+        if config.fallback.max_retries == 0 {
+            errors.push(ValidationError::warning(
+                "fallback.max_retries",
+                "Max retries is 0, fallback will not retry",
+            ));
+        }
+    }
+}
+
+/// Validates configuration consistency.
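+///
+/// Cross-field checks, e.g. the sufficiency thresholds must satisfy
+/// `min_tokens <= target_tokens <= max_tokens`.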
+#[derive(Debug)]
+struct ConsistencyValidator;
+
+impl ValidationRule for ConsistencyValidator {
+    fn validate(&self, config: &Config, errors: &mut Vec<ValidationError>) {
+        // Check if summary tokens are reasonable
+        if config.summary.max_tokens > config.indexer.max_segment_tokens {
+            errors.push(ValidationError::warning(
+                "summary.max_tokens",
+                "Summary max tokens exceeds max segment tokens",
+            )
+            .with_expected(format!("<= {}", config.indexer.max_segment_tokens))
+            .with_actual(config.summary.max_tokens.to_string()));
+        }
+
+        // Check if content token budget is reasonable
+        if config.retrieval.content.token_budget > 100000 {
+            errors.push(ValidationError::warning(
+                "retrieval.content.token_budget",
+                "Token budget is very high, may cause performance issues",
+            ).with_actual(config.retrieval.content.token_budget.to_string()));
+        }
+
+        // Check if sufficiency thresholds are consistent
+        if config.retrieval.sufficiency.min_tokens > config.retrieval.sufficiency.target_tokens {
+            errors.push(ValidationError::error(
+                "retrieval.sufficiency.min_tokens",
+                "Min tokens cannot exceed target tokens",
+            )
+            .with_expected(format!("<= {}", config.retrieval.sufficiency.target_tokens))
+            .with_actual(config.retrieval.sufficiency.min_tokens.to_string()));
+        }
+
+        if config.retrieval.sufficiency.target_tokens > config.retrieval.sufficiency.max_tokens {
+            errors.push(ValidationError::error(
+                "retrieval.sufficiency.target_tokens",
+                "Target tokens cannot exceed max tokens",
+            )
+            .with_expected(format!("<= {}", config.retrieval.sufficiency.max_tokens))
+            .with_actual(config.retrieval.sufficiency.target_tokens.to_string()));
+        }
+
+        // Check scoring strategy validity
+        let valid_strategies = ["keyword_only", "keyword_bm25", "hybrid"];
+        if !valid_strategies.contains(&config.retrieval.content.scoring_strategy.as_str()) {
+            errors.push(ValidationError::error(
+                "retrieval.content.scoring_strategy",
+                "Invalid scoring strategy",
+            )
+            .with_expected(format!("one of: {:?}", valid_strategies))
+            .with_actual(config.retrieval.content.scoring_strategy.clone()));
+        }
+
+        // Check output format validity
+        let valid_formats = ["markdown", "json", "tree", "flat"];
+        if !valid_formats.contains(&config.retrieval.content.output_format.as_str()) {
+            errors.push(ValidationError::error(
+                "retrieval.content.output_format",
+                "Invalid output format",
+            )
+            .with_expected(format!("one of: {:?}", valid_formats))
+            .with_actual(config.retrieval.content.output_format.clone()));
+        }
+    }
+}
+
+/// Validates configuration dependencies.
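+///
+/// Checks that enabled features have what they need, e.g. a rate-limit
+/// behavior of `Fallback` requires at least one fallback model.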
+#[derive(Debug)]
+struct DependencyValidator;
+
+impl ValidationRule for DependencyValidator {
+    fn validate(&self, config: &Config, errors: &mut Vec<ValidationError>) {
+        // Check if API key is available when summaries are needed
+        if config.summary.api_key.is_none() {
+            // Check if any feature requires LLM
+            if config.indexer.max_summary_tokens > 0 {
+                errors.push(ValidationError::info(
+                    "summary.api_key",
+                    "No API key configured, summary generation will be disabled",
+                ));
+            }
+        }
+
+        // Check fallback configuration
+        if config.fallback.enabled {
+            if config.fallback.models.is_empty() && config.fallback.endpoints.is_empty() {
+                errors.push(ValidationError::warning(
+                    "fallback.models",
+                    "Fallback enabled but no fallback models or endpoints configured",
+                ));
+            }
+
+            // Check retry behavior consistency
+            if matches!(
+                config.fallback.on_rate_limit,
+                super::types::FallbackBehavior::Fallback
+            ) && config.fallback.models.is_empty()
+            {
+                errors.push(ValidationError::error(
+                    "fallback.models",
+                    "Rate limit behavior is 'fallback' but no fallback models configured",
+                ));
+            }
+        }
+
+        // Check cache configuration
+        if config.retrieval.cache.max_entries == 0 {
+            errors.push(ValidationError::warning(
+                "retrieval.cache.max_entries",
+                "Cache disabled (max_entries = 0), performance may be impacted",
+            ));
+        }
+
+        // Check strategy configuration
+        if config.retrieval.strategy.exploration_weight <= 0.0 {
+            errors.push(ValidationError::error(
+                "retrieval.strategy.exploration_weight",
+                "Exploration weight must be positive",
+            ).with_actual(config.retrieval.strategy.exploration_weight.to_string()));
+        }
+
+        // Check similarity thresholds are ordered correctly
+        if config.retrieval.strategy.low_similarity_threshold
+            >= config.retrieval.strategy.high_similarity_threshold
+        {
+            errors.push(ValidationError::error(
+                "retrieval.strategy.low_similarity_threshold",
+                "Low similarity threshold must be less than high similarity threshold",
+            )
+            .with_expected(format!(
+                "< {}",
+                config.retrieval.strategy.high_similarity_threshold
+            ))
+            .with_actual(config.retrieval.strategy.low_similarity_threshold.to_string()));
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_validator_valid_config() {
+        let config = Config::default();
+        let validator = ConfigValidator::new();
+        // Default config should pass validation (no errors, warnings are ok)
+        let result = validator.validate(&config);
+        assert!(result.is_ok(), "Default config should pass validation");
+    }
+
+    #[test]
+    fn test_validator_catches_range_errors() {
+        let mut config = Config::default();
+        config.retrieval.content.token_budget = 0;
+        config.retrieval.content.min_relevance_score = 1.5;
+
+        let validator = ConfigValidator::new();
+        let result = validator.validate(&config);
+
+        assert!(result.is_err());
+        let err = result.unwrap_err();
+        assert!(err.errors.iter().any(|e| e.path.contains("token_budget")));
+    }
+
+    #[test]
+    fn test_validator_catches_consistency_errors() {
+        let mut config = Config::default();
+        config.retrieval.sufficiency.min_tokens = 3000;
+        config.retrieval.sufficiency.target_tokens = 2000;
+
+        let validator = ConfigValidator::new();
+        let result = validator.validate(&config);
+
+        assert!(result.is_err());
+        let err = result.unwrap_err();
+        assert!(err.errors.iter().any(|e| e.path.contains("min_tokens")));
+    }
+
+    #[test]
+    fn test_validator_catches_dependency_warnings() {
+        let mut config = Config::default();
+        config.fallback.enabled = true;
+        config.fallback.models.clear();
+
+        let validator = ConfigValidator::new();
+        let result = validator.validate(&config);
+
+        // Should succeed but with warnings
+        if let Err(err) = result {
+            assert!(err.errors.iter().any(|e| e.path.contains("fallback.models")));
+        }
+    }
+}
diff --git a/src/domain/mod.rs b/src/domain/mod.rs
index d5aa3e5c..75970a12 100644
--- a/src/domain/mod.rs
+++ b/src/domain/mod.rs
@@ -24,4 +24,4 @@ pub use error::{Error, Result};
 pub use node::{NodeId, TreeNode};
 pub use toc::{TocConfig, TocEntry, TocNode, TocView};
 pub use token::{estimate_tokens, estimate_tokens_batch, estimate_tokens_fast};
-pub use tree::{DocumentStructure, DocumentTree, StructureNode};
+pub use tree::{DocumentStructure, DocumentTree, RetrievalIndex, StructureNode};
diff --git a/src/domain/node.rs b/src/domain/node.rs
index ea9939b2..04359572 100644
--- a/src/domain/node.rs
+++ b/src/domain/node.rs
@@ -53,6 +53,15 @@ pub struct TreeNode {
     /// Title of this section.
     pub title: String,
 
+    /// Hierarchical structure index (e.g., "1", "1.1", "1.2.3").
+    ///
+    /// This provides a human-readable path to the node and is useful for:
+    /// - LLM navigation (easier to understand "go to section 2.1.3")
+    /// - Table of contents display
+    /// - Cross-referencing
+    #[serde(default)]
+    pub structure: String,
+
     /// Raw text content (populated at leaves).
     #[serde(default)]
     pub content: String,
@@ -93,6 +102,7 @@ impl Default for TreeNode {
     fn default() -> Self {
         Self {
             title: String::new(),
+            structure: String::new(),
             content: String::new(),
             summary: String::new(),
             depth: 0,
diff --git a/src/domain/tree.rs b/src/domain/tree.rs
index 1f63bbff..94f138a3 100644
--- a/src/domain/tree.rs
+++ b/src/domain/tree.rs
@@ -4,7 +4,9 @@
 //! Document tree using arena-based allocation.
 //!
 //! This structure provides better memory locality and simpler
-//! lifetime management compared to `Rc<RefCell<Node>>`.
+//! lifetime management compared to `Rc<RefCell<TreeNode>>`.
+
+use std::collections::HashMap;
 
 use indextree::Arena;
 use serde::{Deserialize, Serialize};
@@ -39,6 +41,172 @@ pub struct DocumentStructure {
     pub structure: Vec<StructureNode>,
 }
 
+/// Pre-computed index for efficient retrieval operations.
+///
+/// Built once after the document tree is fully constructed.
+/// Provides O(1) access to commonly needed traversal data.
+#[derive(Debug, Clone)]
+pub struct RetrievalIndex {
+    /// All leaf nodes in the tree.
+    leaves: Vec<NodeId>,
+
+    /// Nodes grouped by depth level.
+    /// level_index[0] = root, level_index[1] = level 1 nodes, etc.
+    level_index: Vec<Vec<NodeId>>,
+
+    /// Path from root to each node (inclusive).
+    path_cache: HashMap<NodeId, Vec<NodeId>>,
+
+    /// Siblings for each node (excluding self).
+    siblings_cache: HashMap<NodeId, Vec<NodeId>>,
+
+    /// Structure string to NodeId mapping.
+    /// e.g., "1.2.3" -> NodeId
+    structure_index: HashMap<String, NodeId>,
+
+    /// Page number to NodeId mapping.
+    /// Maps each page to the most specific (deepest) node containing it.
+    page_index: HashMap<usize, NodeId>,
+
+    /// NodeId to page range mapping.
+    node_page_range: HashMap<NodeId, (usize, usize)>,
+
+    /// Total node count.
+    node_count: usize,
+
+    /// Maximum depth in the tree.
+    max_depth: usize,
+}
+
+impl RetrievalIndex {
+    /// Get all leaf nodes.
+    pub fn leaves(&self) -> &[NodeId] {
+        &self.leaves
+    }
+
+    /// Get nodes at a specific depth level.
+    ///
+    /// Returns None if the level doesn't exist.
+    pub fn level(&self, depth: usize) -> Option<&[NodeId]> {
+        self.level_index.get(depth).map(|v| v.as_slice())
+    }
+
+    /// Get all levels.
+    pub fn levels(&self) -> &[Vec<NodeId>] {
+        &self.level_index
+    }
+
+    /// Get the path from root to a node (inclusive).
+    ///
+    /// Returns None if the node is not in the index.
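+    ///
+    /// # Example
+    /// ```ignore
+    /// if let Some(path) = index.path_to(node_id) {
+    ///     // path[0] is the root; the last element is `node_id` itself.
+    /// }
+    /// ```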
+    pub fn path_to(&self, node: NodeId) -> Option<&[NodeId]> {
+        self.path_cache.get(&node).map(|v| v.as_slice())
+    }
+
+    /// Get siblings of a node (excluding the node itself).
+    ///
+    /// Returns None if the node is not in the index or has no siblings.
+    pub fn siblings(&self, node: NodeId) -> Option<&[NodeId]> {
+        self.siblings_cache.get(&node).map(|v| v.as_slice())
+    }
+
+    /// Find a node by its structure index.
+    ///
+    /// # Example
+    /// ```ignore
+    /// // Find section 2.1.3
+    /// let node = index.find_by_structure("2.1.3");
+    /// ```
+    pub fn find_by_structure(&self, structure: &str) -> Option<NodeId> {
+        self.structure_index.get(structure).copied()
+    }
+
+    /// Find the most specific node containing a page number.
+    ///
+    /// Returns the deepest node whose page range contains the given page.
+    pub fn find_by_page(&self, page: usize) -> Option<NodeId> {
+        self.page_index.get(&page).copied()
+    }
+
+    /// Find all nodes whose page range overlaps with the given range.
+    ///
+    /// This is useful for retrieving all content that spans a range of pages.
+    ///
+    /// # Example
+    /// ```ignore
+    /// // Find all nodes covering pages 10-15
+    /// let nodes = index.find_nodes_by_page_range(10, 15);
+    /// ```
+    pub fn find_nodes_by_page_range(&self, start: usize, end: usize) -> Vec<NodeId> {
+        let mut result = Vec::new();
+        for (&node_id, &(node_start, node_end)) in &self.node_page_range {
+            // Check if ranges overlap: node_start <= end && start <= node_end
+            if node_start <= end && start <= node_end {
+                result.push(node_id);
+            }
+        }
+        // Sort by start page for consistent ordering
+        result.sort_by_key(|&id| {
+            self.node_page_range.get(&id).map(|(s, _)| *s).unwrap_or(0)
+        });
+        result
+    }
+
+    /// Get all page numbers covered by a node.
+    ///
+    /// Returns None if the node has no page information.
+    pub fn get_pages_for_node(&self, node: NodeId) -> Option<Vec<usize>> {
+        let (start, end) = self.node_page_range.get(&node)?;
+        Some((*start..=*end).collect())
+    }
+
+    /// Get the page range for a node.
+    pub fn page_range(&self, node: NodeId) -> Option<(usize, usize)> {
+        self.node_page_range.get(&node).copied()
+    }
+
+    /// Get all nodes that are leaves within a page range.
+    ///
+    /// This returns only leaf nodes (nodes with no children) that
+    /// overlap with the given page range.
+    pub fn find_leaves_by_page_range(&self, start: usize, end: usize) -> Vec<NodeId> {
+        let leaves_set: std::collections::HashSet<NodeId> = self.leaves.iter().copied().collect();
+        self.find_nodes_by_page_range(start, end)
+            .into_iter()
+            .filter(|id| leaves_set.contains(id))
+            .collect()
+    }
+
+    /// Get the total number of pages in the document.
+    pub fn total_pages(&self) -> usize {
+        self.node_page_range
+            .values()
+            .map(|(_, end)| *end)
+            .max()
+            .unwrap_or(0)
+    }
+
+    /// Get all structure indices.
+    pub fn structures(&self) -> &HashMap<String, NodeId> {
+        &self.structure_index
+    }
+
+    /// Get the total number of nodes.
+    pub fn node_count(&self) -> usize {
+        self.node_count
+    }
+
+    /// Get the maximum depth in the tree.
+    pub fn max_depth(&self) -> usize {
+        self.max_depth
+    }
+
+    /// Get the number of levels.
+    pub fn level_count(&self) -> usize {
+        self.level_index.len()
+    }
+}
+
 /// A hierarchical document tree structure.
 ///
 /// Uses an arena-based tree representation for efficient traversal
@@ -50,6 +218,10 @@ pub struct DocumentTree {
 
     /// The root node ID.
     root_id: NodeId,
+
+    /// Cached leaf nodes (rebuilt on demand).
+    #[serde(skip)]
+    leaves_cache: Option<Vec<NodeId>>,
 }
 
 impl DocumentTree {
@@ -58,6 +230,7 @@
         let mut arena = Arena::new();
         let root_data = TreeNode {
             title: title.to_string(),
+            structure: String::new(), // Root has no structure index
             content: content.to_string(),
             summary: String::new(),
             depth: 0,
@@ -71,9 +244,13 @@
         };
         let root_id = arena.new_node(root_data);
 
+        // Root is initially a leaf
+        let leaves_cache = Some(vec![NodeId(root_id)]);
+
         Self {
             arena,
             root_id: NodeId(root_id),
+            leaves_cache,
         }
     }
 
@@ -81,7 +258,11 @@
     ///
     /// This is useful for deserialization and testing.
     pub fn from_raw(arena: Arena<TreeNode>, root_id: NodeId) -> Self {
-        Self { arena, root_id }
+        Self {
+            arena,
+            root_id,
+            leaves_cache: None, // Will be rebuilt on demand
+        }
     }
 
     /// Get the root node ID.
@@ -111,10 +292,28 @@
     /// Add a child node to the specified parent.
     ///
     /// Returns the ID of the newly created child node.
+    /// The structure is automatically calculated based on siblings.
     pub fn add_child(&mut self, parent: NodeId, title: &str, content: &str) -> NodeId {
         let parent_depth = self.arena.get(parent.0).map(|n| n.get().depth).unwrap_or(0);
+        let parent_structure = self
+            .arena
+            .get(parent.0)
+            .map(|n| n.get().structure.clone())
+            .unwrap_or_default();
+
+        // Calculate child index (1-based)
+        let child_index = parent.0.children(&self.arena).count() + 1;
+
+        // Calculate structure: parent_structure.child_index
+        let child_structure = if parent_structure.is_empty() {
+            child_index.to_string()
+        } else {
+            format!("{}.{}", parent_structure, child_index)
+        };
+
         let child_data = TreeNode {
             title: title.to_string(),
+            structure: child_structure,
             content: content.to_string(),
             summary: String::new(),
             depth: parent_depth + 1,
@@ -128,6 +327,15 @@
         };
         let child_id = self.arena.new_node(child_data);
         parent.0.append(child_id, &mut self.arena);
+
+        // Update leaves cache
+        if let Some(ref mut cache) = self.leaves_cache {
+            // Remove parent from leaves (it's no longer a leaf)
+            cache.retain(|&id| id != parent);
+            // Add child to leaves
+            cache.push(NodeId(child_id));
+        }
+
         NodeId(child_id)
     }
 
@@ -155,9 +363,27 @@
         id.0.children(&self.arena).next().is_none()
     }
 
+    /// Get the number of children of a node.
+    ///
+    /// This is more efficient than `children().len()` as it doesn't allocate.
+    pub fn child_count(&self, id: NodeId) -> usize {
+        id.0.children(&self.arena).count()
+    }
+
+    /// Get the children of a node as an iterator.
+    ///
+    /// Use this instead of `children()` when you only need to iterate,
+    /// as it avoids allocating a Vec.
+    pub fn children_iter(&self, id: NodeId) -> impl Iterator<Item = NodeId> + '_ {
+        id.0.children(&self.arena).map(NodeId)
+    }
+
     /// Get the children of a node.
+    ///
+    /// Returns a Vec for cases where you need owned access to the children.
+    /// Consider using `children_iter()` if you only need to iterate.
     pub fn children(&self, id: NodeId) -> Vec<NodeId> {
-        id.0.children(&self.arena).map(NodeId).collect()
+        self.children_iter(id).collect()
     }
 
     /// Get the parent of a node.
@@ -167,12 +393,87 @@
         id.0.parent(&self.arena).map(NodeId)
     }
 
+    /// Get the siblings of a node (excluding the node itself).
+    ///
+    /// Returns an empty iterator for the root node.
+    pub fn siblings_iter(&self, id: NodeId) -> impl Iterator<Item = NodeId> + '_ {
+        // indextree's `preceding_siblings`/`following_siblings` both yield the
+        // node itself first, so skip it on each side.
+        id.0.preceding_siblings(&self.arena)
+            .skip(1)
+            .chain(id.0.following_siblings(&self.arena).skip(1))
+            .map(NodeId)
+    }
+
+    /// Get the ancestors of a node from parent to root.
+    ///
+    /// Returns an empty iterator for the root node.
+    pub fn ancestors_iter(&self, id: NodeId) -> impl Iterator<Item = NodeId> + '_ {
+        // indextree's `ancestors` yields the node itself first; skip it so the
+        // iterator really starts at the parent.
+        id.0.ancestors(&self.arena).skip(1).map(NodeId)
+    }
+
+    /// Get the path from root to a node (inclusive).
+    ///
+    /// Returns the path as a Vec starting from the root.
+    pub fn path_from_root(&self, id: NodeId) -> Vec<NodeId> {
+        let mut path: Vec<NodeId> = self.ancestors_iter(id).collect();
+        path.reverse();
+        path.push(id);
+        path
+    }
+
+    /// Get the depth of a node (root = 0).
+    pub fn depth(&self, id: NodeId) -> usize {
+        self.get(id).map(|n| n.depth).unwrap_or(0)
+    }
+
+    /// Get the first child of a node.
+    ///
+    /// Returns None if the node has no children.
+    pub fn first_child(&self, id: NodeId) -> Option<NodeId> {
+        self.children_iter(id).next()
+    }
+
+    /// Get the last child of a node.
+    ///
+    /// Returns None if the node has no children.
+    pub fn last_child(&self, id: NodeId) -> Option<NodeId> {
+        self.children_iter(id).last()
+    }
+
     /// Get all leaf nodes in the tree.
+    ///
+    /// Uses cached leaves if available, otherwise rebuilds the cache.
     pub fn leaves(&self) -> Vec<NodeId> {
-        self.traverse()
+        if let Some(ref cache) = self.leaves_cache {
+            return cache.clone();
+        }
+
+        // Rebuild cache on demand
+        let leaves: Vec<NodeId> = self
+            .traverse()
             .into_iter()
             .filter(|id| self.is_leaf(*id))
-            .collect()
+            .collect();
+
+        // Note: Can't mutate self here, caller should use rebuild_leaves_cache()
+        leaves
+    }
+
+    /// Rebuild the leaves cache.
+    ///
+    /// Call this after deserialization or batch modifications.
+    pub fn rebuild_leaves_cache(&mut self) {
+        self.leaves_cache = Some(
+            self.traverse()
+                .into_iter()
+                .filter(|id| self.is_leaf(*id))
+                .collect(),
+        );
+    }
+
+    /// Invalidate the leaves cache.
+    ///
+    /// Called automatically by mutation methods.
+    pub fn invalidate_leaves_cache(&mut self) {
+        self.leaves_cache = None;
     }
 
     /// Get all nodes in the tree (depth-first order).
@@ -210,6 +511,13 @@
     }
 
+    /// Update a node's structure index.
+    pub fn set_structure(&mut self, id: NodeId, structure: &str) {
+        if let Some(node) = self.get_mut(id) {
+            node.structure = structure.to_string();
+        }
+    }
+
     /// Set page boundaries for a node.
     pub fn set_page_boundaries(&mut self, id: NodeId, start: usize, end: usize) {
         if let Some(node) = self.get_mut(id) {
@@ -244,6 +552,62 @@
         }
     }
 
+    /// Find a node by its structure index.
+    ///
+    /// This convenience method performs a linear scan over the tree.
+    /// For repeated queries, build a RetrievalIndex once.
+    pub fn find_by_structure(&self, structure: &str) -> Option<NodeId> {
+        // Linear search - for repeated use, build RetrievalIndex
+        for node_id in self.traverse() {
+            if let Some(node) = self.get(node_id) {
+                if node.structure == structure {
+                    return Some(node_id);
+                }
+            }
+        }
+        None
+    }
+
+    /// Find the most specific node containing a page.
+    ///
+    /// This convenience method performs a linear scan over the tree.
+    /// For repeated queries, build a RetrievalIndex once.
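+    ///
+    /// # Example
+    /// ```ignore
+    /// // Deepest node covering page 42, if any node carries page data.
+    /// if let Some(node_id) = tree.find_by_page(42) {
+    ///     // ...
+    /// }
+    /// ```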
+    pub fn find_by_page(&self, page: usize) -> Option<NodeId> {
+        let mut best_match: Option<(NodeId, usize)> = None;
+
+        // Find the deepest node containing this page
+        for node_id in self.traverse() {
+            if let Some((start, end)) = self.page_range(node_id) {
+                if page >= start && page <= end {
+                    let depth = self.get(node_id).map(|n| n.depth).unwrap_or(0);
+                    match &best_match {
+                        None => best_match = Some((node_id, depth)),
+                        Some((_, best_depth)) if depth > *best_depth => {
+                            best_match = Some((node_id, depth));
+                        }
+                        _ => {}
+                    }
+                }
+            }
+        }
+
+        best_match.map(|(id, _)| id)
+    }
+
+    /// Get all nodes whose page range overlaps with the given range.
+    pub fn find_nodes_by_page_range(&self, start: usize, end: usize) -> Vec<NodeId> {
+        self.traverse()
+            .into_iter()
+            .filter(|&id| {
+                if let Some((node_start, node_end)) = self.page_range(id) {
+                    node_start <= end && start <= node_end
+                } else {
+                    false
+                }
+            })
+            .collect()
+    }
+
     /// Set the node ID (identifier string).
     pub fn set_node_id(&mut self, id: NodeId, node_id: &str) {
         if let Some(node) = self.get_mut(id) {
@@ -274,6 +638,128 @@
         }
     }
 
+    /// Build a retrieval index for efficient operations.
+    ///
+    /// This should be called once after the tree is fully constructed.
+    /// The index provides O(1) access to commonly needed traversal data.
+    ///
+    /// # Example
+    ///
+    /// ```ignore
+    /// let tree = /* build tree */;
+    /// let index = tree.build_retrieval_index();
+    ///
+    /// // Fast access to leaves
+    /// for leaf in index.leaves() {
+    ///     // process leaf
+    /// }
+    ///
+    /// // Fast path lookup
+    /// if let Some(path) = index.path_to(node_id) {
+    ///     // path[0] = root, path[-1] = node_id
+    /// }
+    ///
+    /// // Fast structure lookup
+    /// if let Some(node) = index.find_by_structure("2.1.3") {
+    ///     // Found section 2.1.3
+    /// }
+    ///
+    /// // Fast page lookup
+    /// if let Some(node) = index.find_by_page(42) {
+    ///     // Found node containing page 42
+    /// }
+    /// ```
+    pub fn build_retrieval_index(&self) -> RetrievalIndex {
+        let mut leaves = Vec::new();
+        let mut level_index: Vec<Vec<NodeId>> = Vec::new();
+        let mut path_cache: HashMap<NodeId, Vec<NodeId>> = HashMap::new();
+        let mut siblings_cache: HashMap<NodeId, Vec<NodeId>> = HashMap::new();
+        let mut structure_index: HashMap<String, NodeId> = HashMap::new();
+        let mut page_index: HashMap<usize, NodeId> = HashMap::new();
+        let mut node_page_range: HashMap<NodeId, (usize, usize)> = HashMap::new();
+        let mut max_depth = 0;
+        let node_count = self.node_count();
+
+        // BFS to build level index
+        let mut current_level = vec![self.root_id];
+
+        // Initialize root path
+        path_cache.insert(self.root_id, vec![self.root_id]);
+
+        while !current_level.is_empty() {
+            level_index.push(current_level.clone());
+
+            let mut next_level = Vec::new();
+
+            for &node_id in &current_level {
+                let children: Vec<NodeId> = self.children(node_id);
+
+                // Get node data
+                if let Some(node) = self.get(node_id) {
+                    max_depth = max_depth.max(node.depth);
+
+                    // Build structure index
+                    if !node.structure.is_empty() {
+                        structure_index.insert(node.structure.clone(), node_id);
+                    }
+
+                    // Build page index and page range
+                    if let (Some(start), Some(end)) = (node.start_page, node.end_page) {
+                        node_page_range.insert(node_id, (start, end));
+
+                        // Map each page to this node (will be overwritten by deeper nodes)
+                        for page in start..=end {
+                            page_index.insert(page, node_id);
+                        }
+                    }
+                }
+
+                // Check if leaf
+                if children.is_empty() {
+                    leaves.push(node_id);
+                }
+
+                // Build siblings cache for children
+                if children.len() > 1 {
+                    for (i, &child) in children.iter().enumerate() {
+                        let siblings: Vec<NodeId> = children
+                            .iter()
+                            .enumerate()
                            .filter(|(j, _)| *j != i)
+                            .map(|(_, &c)| c)
+                            .collect();
+                        siblings_cache.insert(child, siblings);
+                    }
+                }
+
+                // Build path cache for children
+                if let Some(parent_path) = path_cache.get(&node_id).cloned() {
+                    for &child in &children {
+                        let mut child_path = parent_path.clone();
+                        child_path.push(child);
+                        path_cache.insert(child, child_path);
+                    }
+                }
+
+                next_level.extend(children);
+            }
+
+            current_level = next_level;
+        }
+
+        RetrievalIndex {
+            leaves,
+            level_index,
+            path_cache,
+            siblings_cache,
+            structure_index,
+            page_index,
+            node_page_range,
+            node_count,
+            max_depth,
+        }
+    }
+
     /// Recursively build structure nodes starting from the given node.
     fn build_structure_nodes(&self, node_id: NodeId) -> Vec<StructureNode> {
         let children = self.children(node_id);
diff --git a/src/parser/markdown/parser.rs b/src/parser/markdown/parser.rs
index 7e1f3a2d..366be1be 100644
--- a/src/parser/markdown/parser.rs
+++ b/src/parser/markdown/parser.rs
@@ -320,12 +320,14 @@ fn finish_current_node(
     config: &MarkdownConfig,
     current_line: usize,
 ) -> Option<TreeNode> {
-    // Handle preamble content
+    // Handle preamble content (content before first heading)
     if nodes.is_empty() && !content_buffer.trim().is_empty() {
         if config.create_preamble_node {
             let content = content_buffer.trim();
             *preamble_content = content.to_string();
         }
+        // Clear the buffer after storing as preamble to avoid duplication
+        content_buffer.clear();
     }
 
     // Finish current heading node
diff --git a/src/retrieval/content/aggregator.rs b/src/retrieval/content/aggregator.rs
new file mode 100644
index 00000000..9edb625b
--- /dev/null
+++ b/src/retrieval/content/aggregator.rs
@@ -0,0 +1,402 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Main content aggregator combining all components.
+//!
+//! This module provides the main [`ContentAggregator`] that orchestrates
+//! scoring, budget allocation, and structure building.
+
+use std::collections::HashMap;
+
+use tracing::{debug, info};
+
+use crate::domain::{DocumentTree, NodeId, estimate_tokens};
+
+use super::budget::{AllocationResult, AllocationStrategy, BudgetAllocator, SelectedContent};
+use super::builder::{ContentMetadata, StructureBuilder, StructuredContent};
+use super::config::{ContentAggregatorConfig, OutputFormatConfig, ScoringStrategyConfig};
+use super::scorer::{
+    ContentChunk, ContentRelevance, RelevanceScorer, ScoreComponents, ScoringContext,
+};
+
+/// Candidate node from retrieval.
+#[derive(Debug, Clone)]
+pub struct CandidateNode {
+    /// Node ID.
+    pub node_id: NodeId,
+    /// Relevance score from search.
+    pub score: f32,
+    /// Depth in tree.
+    pub depth: usize,
+}
+
+impl CandidateNode {
+    /// Create a new candidate.
+    #[must_use]
+    pub fn new(node_id: NodeId, score: f32, depth: usize) -> Self {
+        Self { node_id, score, depth }
+    }
+}
+
+/// Result of content aggregation.
+#[derive(Debug, Clone)]
+pub struct AggregationResult {
+    /// Aggregated content string.
+    pub content: String,
+    /// Total tokens used.
+    pub tokens_used: usize,
+    /// Number of nodes included.
+    pub nodes_included: usize,
+    /// Average relevance score.
+    pub avg_score: f32,
+    /// Whether content was truncated due to budget.
+    pub was_truncated: bool,
+    /// Metadata about the aggregation.
+    pub metadata: ContentMetadata,
+}
+
+impl AggregationResult {
+    /// Check if result is empty.
+    #[must_use]
+    pub fn is_empty(&self) -> bool {
+        self.content.is_empty()
+    }
+}
+
+/// Content aggregator combining scoring, allocation, and building.
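+///
+/// A minimal usage sketch (the candidate list would normally come from the
+/// search stage; the node id, score, and depth below are illustrative):
+///
+/// ```ignore
+/// let aggregator = ContentAggregator::with_defaults();
+/// let candidates = vec![CandidateNode::new(node_id, 0.9, 1)];
+/// let result = aggregator.aggregate(&candidates, &tree, "query text");
+/// println!("{} tokens across {} nodes", result.tokens_used, result.nodes_included);
+/// ```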
+#[derive(Debug)] +pub struct ContentAggregator { + /// Configuration. + config: ContentAggregatorConfig, +} + +impl ContentAggregator { + /// Create a new content aggregator. + #[must_use] + pub fn new(config: ContentAggregatorConfig) -> Self { + Self { config } + } + + /// Create aggregator with default configuration. + #[must_use] + pub fn with_defaults() -> Self { + Self::new(ContentAggregatorConfig::default()) + } + + /// Aggregate content from candidate nodes. + /// + /// # Arguments + /// + /// * `candidates` - Candidate nodes from retrieval + /// * `tree` - Document tree + /// * `query` - Query string for relevance scoring + /// + /// # Returns + /// + /// Aggregated content within token budget. + #[must_use] + pub fn aggregate( + &self, + candidates: &[CandidateNode], + tree: &DocumentTree, + query: &str, + ) -> AggregationResult { + let start = std::time::Instant::now(); + + // Step 1: Collect all content chunks from candidates and their descendants + let chunks = self.collect_chunks(candidates, tree); + debug!( + "Collected {} content chunks from {} candidates", + chunks.len(), + candidates.len() + ); + + if chunks.is_empty() { + return AggregationResult { + content: String::new(), + tokens_used: 0, + nodes_included: 0, + avg_score: 0.0, + was_truncated: false, + metadata: ContentMetadata::default(), + }; + } + + // Step 2: Score all chunks for relevance + let scorer = RelevanceScorer::new(query, self.config.scoring_strategy); + let scoring_ctx = self.build_scoring_context(&chunks); + let scored = scorer.score_chunks(&chunks, &scoring_ctx); + + // Filter by minimum score + let filtered: Vec<_> = scored + .into_iter() + .filter(|r| r.score >= self.config.min_relevance_score) + .collect(); + + debug!( + "Scored {} chunks, {} passed threshold {:.2}", + chunks.len(), + filtered.len(), + self.config.min_relevance_score + ); + + if filtered.is_empty() { + // Fall back to returning best candidate content + return self.fallback_result(candidates, tree); + } + + // Step 3: Allocate token budget + let max_depth = filtered.iter().map(|r| r.chunk.depth).max().unwrap_or(0); + let strategy = self.get_allocation_strategy(); + let allocator = BudgetAllocator::new(self.config.token_budget) + .with_strategy(strategy); + + let allocation = allocator.allocate(filtered, max_depth); + + info!( + "Allocated {} tokens to {} items (strategy: {:?})", + allocation.tokens_used, + allocation.selected.len(), + self.config.scoring_strategy + ); + + // Step 4: Build structured output + let builder = StructureBuilder::from_config( + self.config.output_format, + self.config.include_scores, + ); + + let structured = builder.build(allocation.selected.clone(), tree); + + // Build result + let was_truncated = allocation.selected.iter().any(|s| s.is_truncated()); + + AggregationResult { + content: structured.content, + tokens_used: allocation.tokens_used, + nodes_included: allocation.selected.len(), + avg_score: allocation.stats.avg_score, + was_truncated, + metadata: structured.metadata, + } + } + + /// Collect content chunks from candidates and descendants. 
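+    ///
+    /// Each candidate contributes its own content plus the content of its
+    /// leaf descendants; the `visited` map keeps overlapping candidates from
+    /// being collected twice.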
+    fn collect_chunks(
+        &self,
+        candidates: &[CandidateNode],
+        tree: &DocumentTree,
+    ) -> Vec<ContentChunk> {
+        let mut chunks = Vec::new();
+        let mut visited: HashMap<NodeId, bool> = HashMap::new();
+
+        for candidate in candidates {
+            // Add candidate's own content
+            if let Some(node) = tree.get(candidate.node_id) {
+                if !node.content.is_empty() {
+                    chunks.push(ContentChunk::new(
+                        candidate.node_id,
+                        node.title.clone(),
+                        node.content.clone(),
+                        candidate.depth,
+                    ));
+                    visited.insert(candidate.node_id, true);
+                }
+
+                // Collect leaf descendants
+                self.collect_descendant_chunks(
+                    candidate.node_id,
+                    tree,
+                    candidate.depth,
+                    &mut chunks,
+                    &mut visited,
+                );
+            }
+        }
+
+        chunks
+    }
+
+    /// Collect chunks from descendant nodes.
+    fn collect_descendant_chunks(
+        &self,
+        parent_id: NodeId,
+        tree: &DocumentTree,
+        parent_depth: usize,
+        chunks: &mut Vec<ContentChunk>,
+        visited: &mut HashMap<NodeId, bool>,
+    ) {
+        let children = tree.children(parent_id);
+
+        for child_id in children {
+            if visited.contains_key(&child_id) {
+                continue;
+            }
+            visited.insert(child_id, true);
+
+            if let Some(node) = tree.get(child_id) {
+                let child_depth = parent_depth + 1;
+
+                if tree.is_leaf(child_id) {
+                    // Leaf node - add its content
+                    if !node.content.is_empty() {
+                        chunks.push(ContentChunk::new(
+                            child_id,
+                            node.title.clone(),
+                            node.content.clone(),
+                            child_depth,
+                        ));
+                    }
+                } else {
+                    // Non-leaf - recurse
+                    self.collect_descendant_chunks(child_id, tree, child_depth, chunks, visited);
+                }
+            }
+        }
+    }
+
+    /// Build scoring context from chunks.
+    fn build_scoring_context(&self, chunks: &[ContentChunk]) -> ScoringContext {
+        let total_len: usize = chunks.iter().map(|c| c.content.len()).sum();
+        let avg_len = if chunks.is_empty() {
+            100.0
+        } else {
+            total_len as f32 / chunks.len() as f32
+        };
+
+        // Build document frequency map
+        let mut doc_freq: HashMap<String, usize> = HashMap::new();
+        for chunk in chunks {
+            let mut seen_in_doc = std::collections::HashSet::new();
+            for word in chunk.content.to_lowercase().split_whitespace() {
+                if !seen_in_doc.contains(word) {
+                    *doc_freq.entry(word.to_string()).or_insert(0) += 1;
+                    seen_in_doc.insert(word);
+                }
+            }
+        }
+
+        ScoringContext {
+            avg_doc_len: avg_len,
+            doc_count: chunks.len(),
+            doc_freq,
+            parent_score: None,
+        }
+    }
+
+    /// Get allocation strategy from config.
+    fn get_allocation_strategy(&self) -> AllocationStrategy {
+        AllocationStrategy::Hierarchical {
+            min_per_level: self.config.hierarchical_min_per_level,
+        }
+    }
+
+    /// Fallback result when no content passes threshold.
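+    ///
+    /// Falls back to the best candidate's content, then its summary, and
+    /// finally an empty result if neither is available.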
+ fn fallback_result( + &self, + candidates: &[CandidateNode], + tree: &DocumentTree, + ) -> AggregationResult { + // Return best candidate's content + if let Some(best) = candidates.first() { + if let Some(node) = tree.get(best.node_id) { + let content = if !node.content.is_empty() { + node.content.clone() + } else if !node.summary.is_empty() { + node.summary.clone() + } else { + String::new() + }; + + let tokens = estimate_tokens(&content); + + return AggregationResult { + content: format!("## {}\n\n{}", node.title, content), + tokens_used: tokens, + nodes_included: 1, + avg_score: best.score, + was_truncated: false, + metadata: ContentMetadata { + total_tokens: tokens, + node_count: 1, + avg_score: best.score, + max_depth: best.depth, + }, + }; + } + } + + AggregationResult { + content: String::new(), + tokens_used: 0, + nodes_included: 0, + avg_score: 0.0, + was_truncated: false, + metadata: ContentMetadata::default(), + } + } +} + +impl Default for ContentAggregator { + fn default() -> Self { + Self::with_defaults() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use indextree::Arena; + + fn make_test_node_id() -> NodeId { + let mut arena = Arena::new(); + let node = crate::domain::TreeNode { + title: "Test".to_string(), + structure: String::new(), + content: String::new(), + summary: String::new(), + depth: 0, + start_index: 0, + end_index: 0, + start_page: None, + end_page: None, + node_id: None, + physical_index: None, + token_count: None, + }; + NodeId(arena.new_node(node)) + } + + #[test] + fn test_aggregator_creation() { + let config = ContentAggregatorConfig::default(); + let aggregator = ContentAggregator::new(config); + assert_eq!(aggregator.config.token_budget, 4000); + } + + #[test] + fn test_aggregator_with_defaults() { + let aggregator = ContentAggregator::with_defaults(); + assert_eq!(aggregator.config.token_budget, 4000); + } + + #[test] + fn test_empty_candidates() { + let aggregator = ContentAggregator::with_defaults(); + let tree = DocumentTree::new("Test", ""); + + let result = aggregator.aggregate(&[], &tree, "test query"); + + assert!(result.is_empty()); + assert_eq!(result.tokens_used, 0); + } + + #[test] + fn test_candidate_node_creation() { + let node_id = make_test_node_id(); + let candidate = CandidateNode::new(node_id, 0.8, 2); + + assert_eq!(candidate.score, 0.8); + assert_eq!(candidate.depth, 2); + } +} diff --git a/src/retrieval/content/budget.rs b/src/retrieval/content/budget.rs new file mode 100644 index 00000000..fa91e9c0 --- /dev/null +++ b/src/retrieval/content/budget.rs @@ -0,0 +1,624 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Token budget allocation for content aggregation. +//! +//! This module provides budget-aware content selection that optimizes +//! token usage while maximizing relevance. + +use std::collections::HashMap; + +use crate::domain::{estimate_tokens, NodeId}; + +use super::scorer::ContentRelevance; + +/// Allocation strategy for distributing token budget. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum AllocationStrategy { + /// Select highest-scoring content first until budget exhausted. + Greedy, + /// Distribute budget proportionally to relevance scores. + Proportional, + /// Ensure each depth level has minimum representation. 
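+    ///
+    /// For example, `min_per_level: 0.1` reserves roughly 10% of the total
+    /// budget for each depth level before leftover budget is spent greedily.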
+ Hierarchical { + /// Minimum fraction of budget per level (0.0 - 1.0) + min_per_level: f32, + }, +} + +impl Default for AllocationStrategy { + fn default() -> Self { + Self::Hierarchical { min_per_level: 0.1 } + } +} + +/// Information about content truncation. +#[derive(Debug, Clone)] +pub struct TruncationInfo { + /// Original content length in characters. + pub original_len: usize, + /// Truncated content length in characters. + pub truncated_len: usize, + /// Reason for truncation. + pub reason: TruncationReason, +} + +/// Reason for content truncation. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TruncationReason { + /// Content exceeded remaining budget. + BudgetExceeded, + /// Content tail had low relevance. + LowRelevanceTail, +} + +/// A selected content item after budget allocation. +#[derive(Debug, Clone)] +pub struct SelectedContent { + /// Node ID. + pub node_id: NodeId, + /// Node title. + pub title: String, + /// Selected content text. + pub content: String, + /// Token count of selected content. + pub tokens: usize, + /// Relevance score. + pub score: f32, + /// Depth in tree. + pub depth: usize, + /// Truncation info if content was truncated. + pub truncation: Option, +} + +impl SelectedContent { + /// Check if content was truncated. + #[must_use] + pub fn is_truncated(&self) -> bool { + self.truncation.is_some() + } +} + +/// Statistics about the allocation process. +#[derive(Debug, Clone, Default)] +pub struct AllocationStats { + /// Total content items considered. + pub items_considered: usize, + /// Items selected for output. + pub items_selected: usize, + /// Items truncated. + pub items_truncated: usize, + /// Items filtered (below threshold). + pub items_filtered: usize, + /// Average score of selected items. + pub avg_score: f32, +} + +/// Result of budget allocation. +#[derive(Debug, Clone)] +pub struct AllocationResult { + /// Selected content items. + pub selected: Vec, + /// Total tokens used. + pub tokens_used: usize, + /// Remaining token budget. + pub remaining_budget: usize, + /// Allocation statistics. + pub stats: AllocationStats, +} + +impl AllocationResult { + /// Check if any content was selected. + #[must_use] + pub fn is_empty(&self) -> bool { + self.selected.is_empty() + } + + /// Get number of selected items. + #[must_use] + pub fn len(&self) -> usize { + self.selected.len() + } +} + +/// Token budget allocator. +#[derive(Debug)] +pub struct BudgetAllocator { + /// Total token budget. + total_budget: usize, + /// Minimum reserve budget (for fallback). + min_reserve: usize, + /// Allocation strategy. + strategy: AllocationStrategy, + /// Minimum relevance score threshold. + min_score: f32, +} + +impl BudgetAllocator { + /// Create a new allocator with the specified budget. + #[must_use] + pub fn new(budget: usize) -> Self { + Self { + total_budget: budget, + min_reserve: budget / 10, + strategy: AllocationStrategy::default(), + min_score: 0.0, + } + } + + /// Set the allocation strategy. + #[must_use] + pub fn with_strategy(mut self, strategy: AllocationStrategy) -> Self { + self.strategy = strategy; + self + } + + /// Set minimum relevance score threshold. + #[must_use] + pub fn with_min_score(mut self, min_score: f32) -> Self { + self.min_score = min_score; + self + } + + /// Allocate budget to scored content. 
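+    ///
+    /// # Example
+    ///
+    /// A minimal sketch; `scored` is assumed to come from
+    /// `RelevanceScorer::score_chunks`, and `max_depth` from the deepest
+    /// chunk in that set.
+    ///
+    /// ```ignore
+    /// let allocator = BudgetAllocator::new(1000)
+    ///     .with_strategy(AllocationStrategy::Greedy)
+    ///     .with_min_score(0.3);
+    /// let result = allocator.allocate(scored, max_depth);
+    /// assert!(result.tokens_used <= 1000);
+    /// ```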
+ #[must_use] + pub fn allocate( + &self, + scored_content: Vec, + max_depth: usize, + ) -> AllocationResult { + // Filter by minimum score + let filtered: Vec<_> = scored_content + .into_iter() + .filter(|c| c.score >= self.min_score) + .collect(); + + let stats = AllocationStats { + items_considered: filtered.len(), + ..Default::default() + }; + + match &self.strategy { + AllocationStrategy::Greedy => self.allocate_greedy(filtered, stats), + AllocationStrategy::Proportional => self.allocate_proportional(filtered, stats), + AllocationStrategy::Hierarchical { min_per_level } => { + self.allocate_hierarchical(filtered, max_depth, *min_per_level, stats) + } + } + } + + /// Greedy allocation: select highest-scoring content first. + fn allocate_greedy( + &self, + mut content: Vec, + mut stats: AllocationStats, + ) -> AllocationResult { + // Sort by score descending + content.sort_by(|a, b| { + b.score + .partial_cmp(&a.score) + .unwrap_or(std::cmp::Ordering::Equal) + }); + + let mut selected = Vec::new(); + let mut tokens_used = 0; + + for relevance in content { + let tokens = relevance.chunk.token_count(); + + if tokens_used + tokens <= self.total_budget { + selected.push(SelectedContent { + node_id: relevance.chunk.node_id, + title: relevance.chunk.title, + content: relevance.chunk.content, + tokens, + score: relevance.score, + depth: relevance.chunk.depth, + truncation: None, + }); + tokens_used += tokens; + } else { + // Try to fit truncated content + let remaining = self.total_budget - tokens_used; + if remaining >= 50 { + // Minimum useful content + if let Some(truncated) = self.truncate_content(&relevance.chunk.content, remaining) { + let truncated_tokens = estimate_tokens(&truncated); + selected.push(SelectedContent { + node_id: relevance.chunk.node_id, + title: relevance.chunk.title, + content: truncated, + tokens: truncated_tokens, + score: relevance.score, + depth: relevance.chunk.depth, + truncation: Some(TruncationInfo { + original_len: relevance.chunk.content.len(), + truncated_len: remaining, + reason: TruncationReason::BudgetExceeded, + }), + }); + tokens_used += truncated_tokens; + stats.items_truncated += 1; + } + } + break; + } + } + + stats.items_selected = selected.len(); + stats.avg_score = if selected.is_empty() { + 0.0 + } else { + selected.iter().map(|s| s.score).sum::() / selected.len() as f32 + }; + + AllocationResult { + selected, + tokens_used, + remaining_budget: self.total_budget - tokens_used, + stats, + } + } + + /// Proportional allocation: distribute budget by score ratio. 
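+    ///
+    /// Each item is offered a share of the budget proportional to its score,
+    /// roughly `max(total_budget * score / total_score, 50)` tokens, and is
+    /// truncated if its content exceeds that share.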
+ fn allocate_proportional( + &self, + content: Vec, + mut stats: AllocationStats, + ) -> AllocationResult { + let total_score: f32 = content.iter().map(|c| c.score).sum(); + if total_score == 0.0 { + return AllocationResult { + selected: Vec::new(), + tokens_used: 0, + remaining_budget: self.total_budget, + stats, + }; + } + + let mut selected = Vec::new(); + let mut tokens_used = 0; + + for relevance in content { + // Calculate proportional budget + let proportion = relevance.score / total_score; + let allocated_budget = ((self.total_budget as f32 * proportion) as usize).max(50); + + let content_tokens = relevance.chunk.token_count(); + + if content_tokens <= allocated_budget { + // Full content fits + if tokens_used + content_tokens <= self.total_budget { + selected.push(SelectedContent { + node_id: relevance.chunk.node_id, + title: relevance.chunk.title, + content: relevance.chunk.content, + tokens: content_tokens, + score: relevance.score, + depth: relevance.chunk.depth, + truncation: None, + }); + tokens_used += content_tokens; + } + } else { + // Truncate to allocated budget + let remaining = self.total_budget - tokens_used; + if remaining >= 50 && remaining >= allocated_budget / 2 { + if let Some(truncated) = self.truncate_content(&relevance.chunk.content, remaining.min(allocated_budget)) { + let truncated_tokens = estimate_tokens(&truncated); + let truncated_len = truncated.len(); + selected.push(SelectedContent { + node_id: relevance.chunk.node_id, + title: relevance.chunk.title, + content: truncated, + tokens: truncated_tokens, + score: relevance.score, + depth: relevance.chunk.depth, + truncation: Some(TruncationInfo { + original_len: relevance.chunk.content.len(), + truncated_len, + reason: TruncationReason::BudgetExceeded, + }), + }); + tokens_used += truncated_tokens; + stats.items_truncated += 1; + } + } + } + } + + stats.items_selected = selected.len(); + stats.avg_score = if selected.is_empty() { + 0.0 + } else { + selected.iter().map(|s| s.score).sum::() / selected.len() as f32 + }; + + AllocationResult { + selected, + tokens_used, + remaining_budget: self.total_budget - tokens_used, + stats, + } + } + + /// Hierarchical allocation: ensure each depth level has representation. 
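+    ///
+    /// Runs two passes: a level-by-level pass (shallow to deep) that grants
+    /// each depth at least `min_per_level` of the budget, then a greedy pass
+    /// that spends any budget above the reserve on the highest-scoring
+    /// remaining items.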
+ fn allocate_hierarchical( + &self, + content: Vec, + max_depth: usize, + min_per_level: f32, + mut stats: AllocationStats, + ) -> AllocationResult { + // Group content by depth + let mut by_depth: HashMap> = HashMap::new(); + for c in content { + by_depth + .entry(c.chunk.depth) + .or_default() + .push(c); + } + + // Sort each level by score + for (_depth, items) in by_depth.iter_mut() { + items.sort_by(|a, b| { + b.score + .partial_cmp(&a.score) + .unwrap_or(std::cmp::Ordering::Equal) + }); + } + + let per_level_budget = (self.total_budget as f32 * min_per_level) as usize; + let mut selected = Vec::new(); + let mut tokens_used = 0; + + // Process from shallow to deep + for depth in 0..=max_depth { + if tokens_used >= self.total_budget { + break; + } + + if let Some(level_content) = by_depth.get(&depth) { + let mut level_used = 0; + + for relevance in level_content { + if tokens_used >= self.total_budget { + break; + } + + let tokens = relevance.chunk.token_count(); + + // Check if we should include this content + let can_include_full = tokens_used + tokens <= self.total_budget; + let level_budget_ok = level_used < per_level_budget || depth == 0; + + if can_include_full && level_budget_ok { + selected.push(SelectedContent { + node_id: relevance.chunk.node_id, + title: relevance.chunk.title.clone(), + content: relevance.chunk.content.clone(), + tokens, + score: relevance.score, + depth, + truncation: None, + }); + tokens_used += tokens; + level_used += tokens; + } else if level_used < per_level_budget { + // Try truncated version + let remaining = (self.total_budget - tokens_used).min(per_level_budget - level_used); + if remaining >= 50 { + if let Some(truncated) = self.truncate_content(&relevance.chunk.content, remaining) { + let truncated_tokens = estimate_tokens(&truncated); + selected.push(SelectedContent { + node_id: relevance.chunk.node_id, + title: relevance.chunk.title.clone(), + content: truncated, + tokens: truncated_tokens, + score: relevance.score, + depth, + truncation: Some(TruncationInfo { + original_len: relevance.chunk.content.len(), + truncated_len: remaining, + reason: TruncationReason::BudgetExceeded, + }), + }); + tokens_used += truncated_tokens; + level_used += truncated_tokens; + stats.items_truncated += 1; + } + } + } + } + } + } + + // Second pass: fill remaining budget with highest-scoring content + if tokens_used < self.total_budget - self.min_reserve { + let mut all_remaining: Vec<_> = by_depth + .values() + .flat_map(|v| v.iter()) + .filter(|c| !selected.iter().any(|s| s.node_id == c.chunk.node_id)) + .collect(); + + all_remaining.sort_by(|a, b| { + b.score + .partial_cmp(&a.score) + .unwrap_or(std::cmp::Ordering::Equal) + }); + + for relevance in all_remaining { + if tokens_used >= self.total_budget - self.min_reserve { + break; + } + + let tokens = relevance.chunk.token_count(); + if tokens_used + tokens <= self.total_budget { + selected.push(SelectedContent { + node_id: relevance.chunk.node_id, + title: relevance.chunk.title.clone(), + content: relevance.chunk.content.clone(), + tokens, + score: relevance.score, + depth: relevance.chunk.depth, + truncation: None, + }); + tokens_used += tokens; + } + } + } + + stats.items_selected = selected.len(); + stats.avg_score = if selected.is_empty() { + 0.0 + } else { + selected.iter().map(|s| s.score).sum::() / selected.len() as f32 + }; + + AllocationResult { + selected, + tokens_used, + remaining_budget: self.total_budget - tokens_used, + stats, + } + } + + /// Truncate content to fit within token budget. 
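+    ///
+    /// Uses the rough heuristic of 1 token ≈ 4 characters and prefers to cut
+    /// at a sentence boundary, then a word boundary, before hard-truncating.
+    /// The cut is a byte offset, so this assumes mostly ASCII content; text
+    /// with multi-byte UTF-8 near the boundary would need a char-boundary
+    /// check before slicing.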
+ fn truncate_content(&self, content: &str, max_tokens: usize) -> Option { + if max_tokens < 20 { + return None; + } + + // Approximate: 1 token ≈ 4 characters (for English) + let max_chars = max_tokens * 4; + + if content.len() <= max_chars { + return Some(content.to_string()); + } + + // Try to break at sentence boundary + let truncated = &content[..max_chars]; + + // Find last sentence boundary + if let Some(pos) = truncated.rfind(|c| c == '.' || c == '!' || c == '?') { + Some(format!("{}...", &truncated[..=pos])) + } else if let Some(pos) = truncated.rfind(' ') { + // Fall back to word boundary + Some(format!("{}...", &truncated[..pos])) + } else { + // Hard truncate + Some(format!("{}...", truncated)) + } + } +} + +impl Default for BudgetAllocator { + fn default() -> Self { + Self::new(4000) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::retrieval::content::{ContentChunk, ScoreComponents}; + use indextree::Arena; + + fn make_test_node_id() -> NodeId { + let mut arena = Arena::new(); + let node = crate::domain::TreeNode { + title: "Test".to_string(), + structure: String::new(), + content: String::new(), + summary: String::new(), + depth: 0, + start_index: 0, + end_index: 0, + start_page: None, + end_page: None, + node_id: None, + physical_index: None, + token_count: None, + }; + NodeId(arena.new_node(node)) + } + + fn make_relevance(content: &str, score: f32, depth: usize) -> ContentRelevance { + let chunk = ContentChunk::new( + make_test_node_id(), + "Test".to_string(), + content.to_string(), + depth, + ); + ContentRelevance::new(chunk, score, ScoreComponents::default()) + } + + #[test] + fn test_allocator_creation() { + let allocator = BudgetAllocator::new(1000); + assert_eq!(allocator.total_budget, 1000); + } + + #[test] + fn test_greedy_allocation() { + let allocator = BudgetAllocator::new(100) + .with_strategy(AllocationStrategy::Greedy); + + let content = vec![ + make_relevance("High score content with enough text", 0.9, 0), + make_relevance("Low score content", 0.3, 0), + ]; + + let result = allocator.allocate(content, 1); + assert!(!result.is_empty()); + assert!(result.tokens_used <= 100); + } + + #[test] + fn test_min_score_filter() { + let allocator = BudgetAllocator::new(1000) + .with_min_score(0.5); + + let content = vec![ + make_relevance("Good content", 0.8, 0), + make_relevance("Bad content", 0.2, 0), + ]; + + let result = allocator.allocate(content, 1); + assert_eq!(result.selected.len(), 1); + } + + #[test] + fn test_truncation() { + let allocator = BudgetAllocator::new(50); + let truncated = allocator.truncate_content( + "This is a very long piece of content. It has multiple sentences. 
We want to test truncation at sentence boundary.", + 25, // Need at least 20 tokens for truncation + ); + + assert!(truncated.is_some()); + let text = truncated.unwrap(); + // Should truncate and add ellipsis + assert!(text.len() < 200); // Should be truncated + } + + #[test] + fn test_hierarchical_allocation() { + let allocator = BudgetAllocator::new(200) + .with_strategy(AllocationStrategy::Hierarchical { min_per_level: 0.2 }); + + let content = vec![ + make_relevance("Depth 0 content", 0.9, 0), + make_relevance("Depth 1 content A", 0.7, 1), + make_relevance("Depth 1 content B", 0.6, 1), + make_relevance("Depth 2 content", 0.8, 2), + ]; + + let result = allocator.allocate(content, 2); + + // Should have content from multiple depths + let depths: std::collections::HashSet = + result.selected.iter().map(|s| s.depth).collect(); + assert!(depths.len() >= 2); + } +} diff --git a/src/retrieval/content/builder.rs b/src/retrieval/content/builder.rs new file mode 100644 index 00000000..c3b5792f --- /dev/null +++ b/src/retrieval/content/builder.rs @@ -0,0 +1,522 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Structure builder for aggregated content. +//! +//! This module transforms selected content into structured output formats. + +use serde::{Deserialize, Serialize}; + +use crate::domain::DocumentTree; + +use super::budget::SelectedContent; +use super::config::OutputFormatConfig; + +/// Output format for structured content. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum OutputFormat { + /// Markdown format with headers. + #[default] + Markdown, + /// JSON format. + Json, + /// Tree format. + Tree, + /// Flat text format. + Flat, +} + +impl From for OutputFormat { + fn from(config: OutputFormatConfig) -> Self { + match config { + OutputFormatConfig::Markdown => Self::Markdown, + OutputFormatConfig::Json => Self::Json, + OutputFormatConfig::Tree => Self::Tree, + OutputFormatConfig::Flat => Self::Flat, + } + } +} + +/// Tree node in the content structure. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ContentTreeNode { + /// Node title. + pub title: String, + /// Node content (if any). + pub content: Option, + /// Relevance score. + pub score: f32, + /// Child nodes. + pub children: Vec, +} + +impl ContentTreeNode { + /// Create a new tree node. + #[must_use] + pub fn new(title: String) -> Self { + Self { + title, + content: None, + score: 0.0, + children: Vec::new(), + } + } + + /// Add content to this node. + #[must_use] + pub fn with_content(mut self, content: String, score: f32) -> Self { + self.content = Some(content); + self.score = score; + self + } + + /// Add a child node. + pub fn add_child(&mut self, child: ContentTreeNode) { + self.children.push(child); + } +} + +/// Content tree structure. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ContentTree { + /// Root node. + pub root: ContentTreeNode, + /// Total nodes in tree. + pub total_nodes: usize, +} + +impl ContentTree { + /// Create a new content tree. + #[must_use] + pub fn new(root: ContentTreeNode) -> Self { + Self { + total_nodes: 1, + root, + } + } +} + +/// Metadata about aggregated content. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct ContentMetadata { + /// Total tokens in content. + pub total_tokens: usize, + /// Number of nodes included. + pub node_count: usize, + /// Average relevance score. + pub avg_score: f32, + /// Maximum depth included. + pub max_depth: usize, +} + +/// Structured content result. 
+#[derive(Debug, Clone)] +pub struct StructuredContent { + /// Formatted content string. + pub content: String, + /// Optional tree structure. + pub structure: Option, + /// Content metadata. + pub metadata: ContentMetadata, +} + +impl StructuredContent { + /// Check if content is empty. + #[must_use] + pub fn is_empty(&self) -> bool { + self.content.is_empty() + } + + /// Get content length in characters. + #[must_use] + pub fn len(&self) -> usize { + self.content.len() + } +} + +/// Builder for creating structured content output. +#[derive(Debug)] +pub struct StructureBuilder { + /// Output format. + format: OutputFormat, + /// Include metadata in output. + include_metadata: bool, + /// Include scores in output. + include_scores: bool, +} + +impl StructureBuilder { + /// Create a new structure builder. + #[must_use] + pub fn new(format: OutputFormat) -> Self { + Self { + format, + include_metadata: false, + include_scores: false, + } + } + + /// Create builder from config. + #[must_use] + pub fn from_config(format: OutputFormatConfig, include_scores: bool) -> Self { + Self { + format: OutputFormat::from(format), + include_metadata: false, + include_scores, + } + } + + /// Enable metadata in output. + #[must_use] + pub fn with_metadata(mut self) -> Self { + self.include_metadata = true; + self + } + + /// Enable scores in output. + #[must_use] + pub fn with_scores(mut self) -> Self { + self.include_scores = true; + self + } + + /// Build structured content from selected items. + #[must_use] + pub fn build( + &self, + selected: Vec, + tree: &DocumentTree, + ) -> StructuredContent { + if selected.is_empty() { + return StructuredContent { + content: String::new(), + structure: None, + metadata: ContentMetadata::default(), + }; + } + + // Calculate metadata + let total_tokens: usize = selected.iter().map(|s| s.tokens).sum(); + let avg_score = selected.iter().map(|s| s.score).sum::() / selected.len() as f32; + let max_depth = selected.iter().map(|s| s.depth).max().unwrap_or(0); + + let metadata = ContentMetadata { + total_tokens, + node_count: selected.len(), + avg_score, + max_depth, + }; + + // Build based on format + let (content, structure) = match &self.format { + OutputFormat::Markdown => self.build_markdown(selected, tree), + OutputFormat::Json => self.build_json(selected, tree), + OutputFormat::Tree => self.build_tree_format(selected, tree), + OutputFormat::Flat => self.build_flat(selected), + }; + + StructuredContent { + content, + structure, + metadata, + } + } + + /// Build Markdown format output. + fn build_markdown( + &self, + selected: Vec, + _tree: &DocumentTree, + ) -> (String, Option) { + let mut sections = Vec::new(); + let mut current_depth = 0; + + // Sort by depth to maintain hierarchy + let mut sorted = selected; + sorted.sort_by(|a, b| a.depth.cmp(&b.depth)); + + for content in sorted { + // Adjust heading level based on depth + let heading_level = (content.depth + 1).min(6); + let heading = "#".repeat(heading_level); + + let mut section = format!("{} {}", heading, content.title); + + if self.include_scores { + section.push_str(&format!(" *(score: {:.2})*", content.score)); + } + + section.push_str("\n\n"); + section.push_str(&content.content); + + if content.is_truncated() { + section.push_str("\n\n*[content truncated]*"); + } + + sections.push(section); + current_depth = current_depth.max(content.depth); + } + + (sections.join("\n\n---\n\n"), None) + } + + /// Build JSON format output. 
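+    ///
+    /// The output shape is illustrated below (values are placeholders):
+    ///
+    /// ```text
+    /// {
+    ///   "sections": [
+    ///     { "title": "…", "content": "…", "score": 0.9, "depth": 0, "truncated": false }
+    ///   ]
+    /// }
+    /// ```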
+ fn build_json( + &self, + selected: Vec, + _tree: &DocumentTree, + ) -> (String, Option) { + #[derive(Serialize)] + struct JsonOutput<'a> { + sections: Vec>, + } + + #[derive(Serialize)] + struct JsonSection<'a> { + title: &'a str, + content: &'a str, + score: f32, + depth: usize, + truncated: bool, + } + + let sections: Vec<_> = selected + .iter() + .map(|s| JsonSection { + title: &s.title, + content: &s.content, + score: s.score, + depth: s.depth, + truncated: s.is_truncated(), + }) + .collect(); + + let output = JsonOutput { sections }; + let content = serde_json::to_string_pretty(&output).unwrap_or_default(); + + (content, None) + } + + /// Build tree format output. + fn build_tree_format( + &self, + selected: Vec, + tree: &DocumentTree, + ) -> (String, Option) { + // Build tree structure + let mut root = ContentTreeNode::new("Content".to_string()); + let mut node_count = 0; + + // Group by parent + use std::collections::HashMap; + let mut by_parent: HashMap, Vec<&SelectedContent>> = + HashMap::new(); + + for content in &selected { + let parent = tree.get(content.node_id).and_then(|_| { + // Find parent in selected + selected + .iter() + .find(|s| s.depth < content.depth) + .map(|s| Some(s.node_id)) + .unwrap_or(None) + }); + by_parent.entry(parent).or_default().push(content); + } + + // Build tree recursively + fn build_node( + content: &SelectedContent, + all_by_parent: &HashMap, Vec<&SelectedContent>>, + ) -> ContentTreeNode { + let mut node = ContentTreeNode::new(content.title.clone()) + .with_content(content.content.clone(), content.score); + + if let Some(children) = all_by_parent.get(&Some(content.node_id)) { + for child in children { + node.add_child(build_node(child, all_by_parent)); + } + } + + node + } + + // Add top-level items + if let Some(top_level) = by_parent.get(&None) { + for content in top_level { + let node = build_node(content, &by_parent); + node_count += count_nodes(&node); + root.add_child(node); + } + } + + // Build string representation + let content = render_tree(&root, 0); + + let tree_structure = ContentTree { + root, + total_nodes: node_count, + }; + + (content, Some(tree_structure)) + } + + /// Build flat format output. + fn build_flat(&self, selected: Vec) -> (String, Option) { + let parts: Vec<_> = selected + .iter() + .map(|c| { + let mut part = format!("[{}] {}", c.title, c.content); + if self.include_scores { + part = format!("[{}] (score: {:.2}) {}", c.title, c.score, c.content); + } + part + }) + .collect(); + + (parts.join("\n\n"), None) + } +} + +impl Default for StructureBuilder { + fn default() -> Self { + Self::new(OutputFormat::default()) + } +} + +/// Count nodes in a tree. +fn count_nodes(node: &ContentTreeNode) -> usize { + 1 + node.children.iter().map(count_nodes).sum::() +} + +/// Render tree as string. 
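+///
+/// Output is illustrative, e.g. for a root with one scored child:
+///
+/// ```text
+/// ├─ Content (score: 0.00)
+///   ├─ Section 1 (score: 0.90)
+///   │ First hundred characters of the section body…
+/// ```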
+fn render_tree(node: &ContentTreeNode, depth: usize) -> String { + let indent = " ".repeat(depth); + let mut result = format!("{}├─ {} (score: {:.2})\n", indent, node.title, node.score); + + if let Some(ref content) = node.content { + let preview = if content.len() > 100 { + format!("{}...", &content[..100]) + } else { + content.clone() + }; + result.push_str(&format!("{}│ {}\n", indent, preview.replace('\n', " "))); + } + + for child in &node.children { + result.push_str(&render_tree(child, depth + 1)); + } + + result +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::domain::NodeId; + use indextree::Arena; + + fn make_test_node_id() -> NodeId { + let mut arena = Arena::new(); + let node = crate::domain::TreeNode { + title: "Test".to_string(), + structure: String::new(), + content: String::new(), + summary: String::new(), + depth: 0, + start_index: 0, + end_index: 0, + start_page: None, + end_page: None, + node_id: None, + physical_index: None, + token_count: None, + }; + NodeId(arena.new_node(node)) + } + + fn make_selected(title: &str, content: &str, score: f32, depth: usize) -> SelectedContent { + SelectedContent { + node_id: make_test_node_id(), + title: title.to_string(), + content: content.to_string(), + tokens: 50, + score, + depth, + truncation: None, + } + } + + #[test] + fn test_markdown_builder() { + let builder = StructureBuilder::new(OutputFormat::Markdown); + let selected = vec![ + make_selected("Section 1", "Content 1", 0.9, 0), + make_selected("Section 2", "Content 2", 0.8, 1), + ]; + + // Create a minimal tree for testing + let tree = DocumentTree::new("Test", ""); + + let result = builder.build(selected, &tree); + + assert!(!result.is_empty()); + assert!(result.content.contains("Section 1")); + assert!(result.content.contains("Section 2")); + assert!(result.content.contains("# Section 1")); + assert!(result.content.contains("## Section 2")); + } + + #[test] + fn test_flat_builder() { + let builder = StructureBuilder::new(OutputFormat::Flat); + let selected = vec![ + make_selected("Section 1", "Content 1", 0.9, 0), + ]; + + let tree = DocumentTree::new("Test", ""); + let result = builder.build(selected, &tree); + + assert!(result.content.contains("[Section 1]")); + assert!(result.content.contains("Content 1")); + } + + #[test] + fn test_builder_with_scores() { + let builder = StructureBuilder::new(OutputFormat::Markdown) + .with_scores(); + + let selected = vec![ + make_selected("Section 1", "Content 1", 0.95, 0), + ]; + + let tree = DocumentTree::new("Test", ""); + let result = builder.build(selected, &tree); + + assert!(result.content.contains("score: 0.95")); + } + + #[test] + fn test_empty_selected() { + let builder = StructureBuilder::new(OutputFormat::Markdown); + let tree = DocumentTree::new("Test", ""); + let result = builder.build(Vec::new(), &tree); + + assert!(result.is_empty()); + assert_eq!(result.metadata.node_count, 0); + } + + #[test] + fn test_content_tree_node() { + let mut root = ContentTreeNode::new("Root".to_string()) + .with_content("Root content".to_string(), 0.9); + + let child = ContentTreeNode::new("Child".to_string()) + .with_content("Child content".to_string(), 0.8); + + root.add_child(child); + + assert_eq!(root.children.len(), 1); + assert_eq!(root.score, 0.9); + } +} diff --git a/src/retrieval/content/config.rs b/src/retrieval/content/config.rs new file mode 100644 index 00000000..f9bc38b6 --- /dev/null +++ b/src/retrieval/content/config.rs @@ -0,0 +1,158 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: 
Apache-2.0 + +//! Configuration types for content aggregation. + +use serde::{Deserialize, Serialize}; + +/// Configuration for content aggregation. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ContentAggregatorConfig { + /// Maximum tokens to return in aggregated content. + pub token_budget: usize, + + /// Minimum relevance score threshold (0.0 - 1.0). + /// Content below this threshold will be filtered out. + pub min_relevance_score: f32, + + /// Scoring strategy for relevance computation. + pub scoring_strategy: ScoringStrategyConfig, + + /// Output format for aggregated content. + pub output_format: OutputFormatConfig, + + /// Include relevance scores in output metadata. + pub include_scores: bool, + + /// Minimum budget allocation per depth level (for hierarchical strategy). + /// Value between 0.0 and 1.0, representing fraction of total budget. + pub hierarchical_min_per_level: f32, + + /// Enable content deduplication. + pub deduplicate: bool, + + /// Similarity threshold for deduplication (0.0 - 1.0). + pub dedup_threshold: f32, +} + +impl Default for ContentAggregatorConfig { + fn default() -> Self { + Self { + token_budget: 4000, + min_relevance_score: 0.2, + scoring_strategy: ScoringStrategyConfig::KeywordWithBM25, + output_format: OutputFormatConfig::Markdown, + include_scores: false, + hierarchical_min_per_level: 0.1, + deduplicate: true, + dedup_threshold: 0.9, + } + } +} + +impl ContentAggregatorConfig { + /// Create a new config with default values. + #[must_use] + pub fn new() -> Self { + Self::default() + } + + /// Set the token budget. + #[must_use] + pub fn with_token_budget(mut self, budget: usize) -> Self { + self.token_budget = budget; + self + } + + /// Set the minimum relevance score. + #[must_use] + pub fn with_min_relevance(mut self, score: f32) -> Self { + self.min_relevance_score = score.clamp(0.0, 1.0); + self + } + + /// Set the scoring strategy. + #[must_use] + pub fn with_scoring_strategy(mut self, strategy: ScoringStrategyConfig) -> Self { + self.scoring_strategy = strategy; + self + } + + /// Set the output format. + #[must_use] + pub fn with_output_format(mut self, format: OutputFormatConfig) -> Self { + self.output_format = format; + self + } +} + +/// Scoring strategy configuration. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ScoringStrategyConfig { + /// Fast keyword matching only. + KeywordOnly, + /// Keyword matching with BM25 scoring. + KeywordWithBM25, + /// Hybrid: keyword + LLM reranking for top candidates. + Hybrid, +} + +impl Default for ScoringStrategyConfig { + fn default() -> Self { + Self::KeywordWithBM25 + } +} + +/// Output format configuration. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum OutputFormatConfig { + /// Markdown format with headers. + Markdown, + /// JSON format. + Json, + /// Tree format. + Tree, + /// Flat text format. 
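+    /// Sections are rendered as `[Title] content` blocks separated by blank
+    /// lines.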
+ Flat, +} + +impl Default for OutputFormatConfig { + fn default() -> Self { + Self::Markdown + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_config() { + let config = ContentAggregatorConfig::default(); + assert_eq!(config.token_budget, 4000); + assert_eq!(config.min_relevance_score, 0.2); + } + + #[test] + fn test_config_builder() { + let config = ContentAggregatorConfig::new() + .with_token_budget(2000) + .with_min_relevance(0.5); + + assert_eq!(config.token_budget, 2000); + assert_eq!(config.min_relevance_score, 0.5); + } + + #[test] + fn test_min_relevance_clamped() { + let config = ContentAggregatorConfig::new() + .with_min_relevance(1.5); + assert_eq!(config.min_relevance_score, 1.0); + + let config = ContentAggregatorConfig::new() + .with_min_relevance(-0.5); + assert_eq!(config.min_relevance_score, 0.0); + } +} diff --git a/src/retrieval/content/mod.rs b/src/retrieval/content/mod.rs new file mode 100644 index 00000000..2a78f801 --- /dev/null +++ b/src/retrieval/content/mod.rs @@ -0,0 +1,46 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Content aggregation module for retrieval results. +//! +//! This module provides precision-focused, budget-aware content aggregation +//! that transforms candidate nodes into structured, relevant content. +//! +//! # Architecture +//! +//! ```text +//! ┌─────────────────────────────────────────────────────────────┐ +//! │ Content Aggregator │ +//! ├─────────────────────────────────────────────────────────────┤ +//! │ RelevanceScorer → BudgetAllocator → StructureBuilder │ +//! └─────────────────────────────────────────────────────────────┘ +//! ``` +//! +//! # Example +//! +//! ```rust,ignore +//! use vectorless::retrieval::content::{ContentAggregator, ContentAggregatorConfig}; +//! +//! let config = ContentAggregatorConfig { +//! token_budget: 4000, +//! min_relevance_score: 0.3, +//! ..Default::default() +//! }; +//! +//! let aggregator = ContentAggregator::new(config); +//! let result = aggregator.aggregate(&candidates, &tree, &query); +//! ``` + +mod aggregator; +mod budget; +mod builder; +mod config; +mod scorer; + +pub use aggregator::{ContentAggregator, AggregationResult, CandidateNode}; +pub use budget::{BudgetAllocator, AllocationStrategy, AllocationResult, SelectedContent}; +pub use builder::{StructureBuilder, OutputFormat, StructuredContent, ContentTree}; +pub use config::{ContentAggregatorConfig, OutputFormatConfig, ScoringStrategyConfig}; +pub use scorer::{ + RelevanceScorer, ContentRelevance, ScoreComponents, ContentChunk, ScoringContext, +}; diff --git a/src/retrieval/content/scorer.rs b/src/retrieval/content/scorer.rs new file mode 100644 index 00000000..ba04a6ce --- /dev/null +++ b/src/retrieval/content/scorer.rs @@ -0,0 +1,439 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Relevance scoring for content chunks. +//! +//! This module provides fine-grained relevance scoring for content, +//! combining keyword matching, BM25, and optional LLM reranking. + +use std::collections::HashMap; + +use crate::domain::{estimate_tokens, NodeId}; + +use super::config::ScoringStrategyConfig; + +/// Content chunk for scoring. +#[derive(Debug, Clone)] +pub struct ContentChunk { + /// Node ID this chunk belongs to. + pub node_id: NodeId, + /// Title of the node. + pub title: String, + /// Content text. + pub content: String, + /// Depth in tree (0 = root level). 
+ pub depth: usize, +} + +impl ContentChunk { + /// Create a new content chunk. + #[must_use] + pub fn new(node_id: NodeId, title: String, content: String, depth: usize) -> Self { + Self { + node_id, + title, + content, + depth, + } + } + + /// Estimate token count for this chunk. + #[must_use] + pub fn token_count(&self) -> usize { + estimate_tokens(&self.content) + } +} + +/// Relevance score components. +#[derive(Debug, Clone, Default)] +pub struct ScoreComponents { + /// Keyword match score (0.0 - 1.0). + pub keyword_score: f32, + /// BM25 score (normalized). + pub bm25_score: f32, + /// Depth penalty (deeper = lower score). + pub depth_penalty: f32, + /// Path bonus from parent relevance. + pub path_bonus: f32, + /// Information density score. + pub density_score: f32, +} + +impl ScoreComponents { + /// Compute final weighted score. + #[must_use] + pub fn final_score(&self) -> f32 { + // Weight formula from design doc + let score = self.keyword_score * 0.35 + + self.bm25_score * 0.25 + + self.depth_penalty * 0.15 + + self.path_bonus * 0.10 + + self.density_score * 0.15; + + score.clamp(0.0, 1.0) + } +} + +/// Relevance score result for a content chunk. +#[derive(Debug, Clone)] +pub struct ContentRelevance { + /// The content chunk that was scored. + pub chunk: ContentChunk, + /// Final relevance score (0.0 - 1.0). + pub score: f32, + /// Score breakdown by component. + pub components: ScoreComponents, +} + +impl ContentRelevance { + /// Create a new relevance result. + #[must_use] + pub fn new(chunk: ContentChunk, score: f32, components: ScoreComponents) -> Self { + Self { + chunk, + score, + components, + } + } +} + +/// Context for scoring operations. +#[derive(Debug, Clone)] +pub struct ScoringContext { + /// Average document length for BM25. + pub avg_doc_len: f32, + /// Total document count for IDF. + pub doc_count: usize, + /// Document frequency for terms. + pub doc_freq: HashMap, + /// Parent node score (for path bonus). + pub parent_score: Option, +} + +impl Default for ScoringContext { + fn default() -> Self { + Self { + avg_doc_len: 100.0, + doc_count: 1, + doc_freq: HashMap::new(), + parent_score: None, + } + } +} + +/// Relevance scorer for content chunks. +#[derive(Debug)] +pub struct RelevanceScorer { + /// Query keywords extracted from the query. + query_keywords: Vec, + /// Scoring strategy to use. + strategy: ScoringStrategyConfig, + /// BM25 parameters. + k1: f32, + b: f32, +} + +impl RelevanceScorer { + /// Create a new scorer with keywords. + #[must_use] + pub fn new(query: &str, strategy: ScoringStrategyConfig) -> Self { + let query_keywords = extract_keywords(query); + Self { + query_keywords, + strategy, + k1: 1.2, + b: 0.75, + } + } + + /// Create a scorer with pre-extracted keywords. + #[must_use] + pub fn with_keywords(keywords: Vec, strategy: ScoringStrategyConfig) -> Self { + Self { + query_keywords: keywords, + strategy, + k1: 1.2, + b: 0.75, + } + } + + /// Score a content chunk. + #[must_use] + pub fn score_chunk(&self, chunk: &ContentChunk, ctx: &ScoringContext) -> ContentRelevance { + let mut components = ScoreComponents::default(); + + // 1. Keyword score + components.keyword_score = self.compute_keyword_score(&chunk.content); + + // 2. BM25 score (if enabled) + if matches!(self.strategy, ScoringStrategyConfig::KeywordWithBM25 | ScoringStrategyConfig::Hybrid) { + components.bm25_score = self.compute_bm25_score(&chunk.content, ctx); + } + + // 3. Depth penalty (10% per level) + components.depth_penalty = 0.9_f32.powi(chunk.depth as i32); + + // 4. 
Path bonus + components.path_bonus = ctx.parent_score.map(|s| s * 0.2).unwrap_or(0.0); + + // 5. Density score + components.density_score = compute_density(&chunk.content); + + let final_score = components.final_score(); + + ContentRelevance::new(chunk.clone(), final_score, components) + } + + /// Score multiple chunks. + pub fn score_chunks<'a>( + &self, + chunks: &'a [ContentChunk], + ctx: &ScoringContext, + ) -> Vec { + chunks + .iter() + .map(|chunk| self.score_chunk(chunk, ctx)) + .collect() + } + + /// Compute keyword overlap score. + fn compute_keyword_score(&self, content: &str) -> f32 { + if self.query_keywords.is_empty() { + return 0.5; // Neutral score if no keywords + } + + let content_lower = content.to_lowercase(); + let content_words: std::collections::HashSet<&str> = content_lower + .split_whitespace() + .collect(); + + let matches = self + .query_keywords + .iter() + .filter(|kw| { + let kw_lower = kw.to_lowercase(); + content_words.iter().any(|&w| w.contains(&kw_lower)) + || content_lower.contains(&kw_lower) + }) + .count(); + + matches as f32 / self.query_keywords.len() as f32 + } + + /// Compute BM25 score. + fn compute_bm25_score(&self, content: &str, ctx: &ScoringContext) -> f32 { + if self.query_keywords.is_empty() { + return 0.0; + } + + let doc_len = content.split_whitespace().count() as f32; + let mut score = 0.0; + + for term in &self.query_keywords { + let term_lower = term.to_lowercase(); + let tf = content + .to_lowercase() + .matches(&term_lower) + .count() as f32; + + if tf == 0.0 { + continue; + } + + // IDF calculation + let df = ctx.doc_freq.get(&term_lower).copied().unwrap_or(1) as f32; + let idf = ((ctx.doc_count as f32 - df + 0.5) / (df + 0.5) + 1.0).ln(); + + // BM25 formula + let numerator = tf * (self.k1 + 1.0); + let denominator = tf + self.k1 * (1.0 - self.b + self.b * doc_len / ctx.avg_doc_len); + + score += idf * numerator / denominator; + } + + // Normalize to [0, 1] + let max_possible_score = self.query_keywords.len() as f32 * 5.0; // Rough upper bound + (score / max_possible_score).clamp(0.0, 1.0) + } + + /// Get the query keywords. + #[must_use] + pub fn keywords(&self) -> &[String] { + &self.query_keywords + } +} + +/// Extract keywords from a query string. 
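+///
+/// # Example
+///
+/// ```ignore
+/// let kws = extract_keywords("How does the budget allocator work?");
+/// // Stop words ("how", "does", "the") and single characters are dropped.
+/// assert_eq!(kws, ["budget", "allocator", "work"]);
+/// ```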
+fn extract_keywords(query: &str) -> Vec { + // Common English stop words + const STOPWORDS: &[&str] = &[ + "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", + "have", "has", "had", "do", "does", "did", "will", "would", "could", + "should", "may", "might", "must", "shall", "can", "need", "dare", + "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by", + "from", "as", "into", "through", "during", "before", "after", + "above", "below", "between", "under", "again", "further", "then", + "once", "here", "there", "when", "where", "why", "how", "all", + "each", "few", "more", "most", "other", "some", "such", "no", "nor", + "not", "only", "own", "same", "so", "than", "too", "very", "just", + "and", "but", "if", "or", "because", "until", "while", "about", + "what", "which", "who", "whom", "this", "that", "these", "those", + "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", + "your", "yours", "yourself", "yourselves", "he", "him", "his", + "himself", "she", "her", "hers", "herself", "it", "its", "itself", + "they", "them", "their", "theirs", "themselves", + ]; + + query + .to_lowercase() + .split(|c: char| !c.is_alphanumeric()) + .filter(|s| { + let s = *s; + !s.is_empty() && s.len() > 1 && !STOPWORDS.contains(&s) + }) + .map(String::from) + .collect() +} + +/// Compute information density of content. +fn compute_density(content: &str) -> f32 { + let words: Vec<&str> = content.split_whitespace().collect(); + if words.is_empty() { + return 0.0; + } + + // Stopword ratio (lower is better) + const STOPWORDS: &[&str] = &[ + "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", + "have", "has", "had", "do", "does", "did", "will", "would", "could", + "should", "may", "might", "must", "shall", "can", "to", "of", "in", + "for", "on", "with", "at", "by", "from", "and", "but", "or", "as", + ]; + + let stopword_count = words + .iter() + .filter(|w| STOPWORDS.contains(&w.to_lowercase().as_str())) + .count(); + + let stopword_ratio = stopword_count as f32 / words.len() as f32; + + // Entity-like ratio (capitalized, numbers, special terms) + let entity_count = words + .iter() + .filter(|w| { + w.chars() + .any(|c| c.is_numeric() || c.is_uppercase()) + }) + .count(); + + let entity_ratio = entity_count as f32 / words.len() as f32; + + // Combined density score + (1.0 - stopword_ratio) * 0.7 + entity_ratio * 0.3 +} + +#[cfg(test)] +mod tests { + use super::*; + use indextree::Arena; + + fn make_test_node_id() -> NodeId { + let mut arena = Arena::new(); + let node = crate::domain::TreeNode { + title: "Test".to_string(), + structure: String::new(), + content: String::new(), + summary: String::new(), + depth: 0, + start_index: 0, + end_index: 0, + start_page: None, + end_page: None, + node_id: None, + physical_index: None, + token_count: None, + }; + NodeId(arena.new_node(node)) + } + + #[test] + fn test_keyword_extraction() { + let keywords = extract_keywords("What is the architecture of vectorless?"); + assert!(keywords.contains(&"architecture".to_string())); + assert!(keywords.contains(&"vectorless".to_string())); + assert!(!keywords.contains(&"what".to_string())); // stopword + assert!(!keywords.contains(&"the".to_string())); // stopword + } + + #[test] + fn test_keyword_score() { + let scorer = RelevanceScorer::new( + "vectorless architecture", + ScoringStrategyConfig::KeywordOnly, + ); + + let chunk = ContentChunk::new( + make_test_node_id(), + "Test".to_string(), + "Vectorless has a unique architecture for document retrieval.".to_string(), + 0, + 
); + + let ctx = ScoringContext::default(); + let score = scorer.compute_keyword_score(&chunk.content); + + assert!(score > 0.5); // Should match both keywords + } + + #[test] + fn test_density_score() { + // High density content + let high_density = "Rust 1.85+ requires Cargo.toml configuration with [dependencies]"; + let score = compute_density(high_density); + assert!(score > 0.5); + + // Low density content (many stopwords) + let low_density = "This is a test of the system with some words in it"; + let score = compute_density(low_density); + assert!(score < 0.7); + } + + #[test] + fn test_depth_penalty() { + let shallow = ContentChunk::new( + make_test_node_id(), + "Test".to_string(), + "Content".to_string(), + 0, + ); + + let deep = ContentChunk::new( + make_test_node_id(), + "Test".to_string(), + "Content".to_string(), + 5, + ); + + let scorer = RelevanceScorer::new("test", ScoringStrategyConfig::KeywordOnly); + let ctx = ScoringContext::default(); + + let shallow_score = scorer.score_chunk(&shallow, &ctx); + let deep_score = scorer.score_chunk(&deep, &ctx); + + assert!(shallow_score.components.depth_penalty > deep_score.components.depth_penalty); + } + + #[test] + fn test_score_components_final_score() { + let components = ScoreComponents { + keyword_score: 0.8, + bm25_score: 0.6, + depth_penalty: 0.9, + path_bonus: 0.1, + density_score: 0.5, + }; + + let final_score = components.final_score(); + assert!(final_score > 0.0 && final_score <= 1.0); + } +} diff --git a/src/retrieval/context.rs b/src/retrieval/context.rs index ba0edb34..595c9083 100644 --- a/src/retrieval/context.rs +++ b/src/retrieval/context.rs @@ -434,7 +434,7 @@ impl ContextBuilder { sections.push(section); } - for child_id in tree.children(node_id) { + for child_id in tree.children_iter(node_id) { self.collect_sections(tree, child_id, current_depth + 1, max_depth, sections); } } @@ -463,7 +463,7 @@ impl ContextBuilder { sections.push(section); } - for child_id in tree.children(node_id) { + for child_id in tree.children_iter(node_id) { Box::pin(self.collect_sections_async( tree, child_id, diff --git a/src/retrieval/mod.rs b/src/retrieval/mod.rs index 5ff07413..565d0fa8 100644 --- a/src/retrieval/mod.rs +++ b/src/retrieval/mod.rs @@ -54,6 +54,7 @@ mod types; pub mod cache; pub mod complexity; +pub mod content; pub mod pilot; pub mod pipeline; pub mod search; @@ -98,6 +99,13 @@ pub use complexity::ComplexityDetector; // Cache exports pub use cache::PathCache; +// Content aggregation exports +pub use content::{ + AggregationResult, AllocationResult, AllocationStrategy, BudgetAllocator, ContentAggregator, + ContentAggregatorConfig, ContentChunk, ContentRelevance, OutputFormat, RelevanceScorer, + ScoreComponents, ScoringStrategyConfig, SelectedContent, StructureBuilder, StructuredContent, +}; + // Pilot exports pub use pilot::{ BudgetConfig, InterventionConfig, InterventionPoint, Pilot, PilotConfig, PilotDecision, diff --git a/src/retrieval/pilot/decision.rs b/src/retrieval/pilot/decision.rs index 09c76add..69a117d6 100644 --- a/src/retrieval/pilot/decision.rs +++ b/src/retrieval/pilot/decision.rs @@ -245,6 +245,7 @@ mod tests { for i in 0..count { let node = crate::domain::TreeNode { title: format!("Node {}", i), + structure: String::new(), content: String::new(), summary: String::new(), depth: 0, diff --git a/src/retrieval/pilot/llm_pilot.rs b/src/retrieval/pilot/llm_pilot.rs index 9342ffa4..c163396a 100644 --- a/src/retrieval/pilot/llm_pilot.rs +++ b/src/retrieval/pilot/llm_pilot.rs @@ -298,6 +298,7 @@ impl Pilot for 
LlmPilot { // Check budget if !self.has_budget() { + debug!("Budget exhausted, cannot guide start"); return None; } @@ -308,7 +309,14 @@ impl Pilot for LlmPilot { let candidates = tree.children(tree.root()); // Make LLM call - Some(self.call_llm(InterventionPoint::Start, &context, &candidates).await) + let decision = self.call_llm(InterventionPoint::Start, &context, &candidates).await; + info!( + "Pilot start guidance: confidence={}, candidates={}", + decision.confidence, + decision.ranked_candidates.len() + ); + + Some(decision) } async fn guide_backtrack( @@ -358,6 +366,7 @@ mod tests { for i in 0..count { let node = crate::domain::TreeNode { title: format!("Node {}", i), + structure: String::new(), content: String::new(), summary: String::new(), depth: 0, diff --git a/src/retrieval/pilot/parser.rs b/src/retrieval/pilot/parser.rs index 0447a259..9bb0bd48 100644 --- a/src/retrieval/pilot/parser.rs +++ b/src/retrieval/pilot/parser.rs @@ -350,6 +350,7 @@ mod tests { for i in 0..count { let node = crate::domain::TreeNode { title: format!("Node {}", i), + structure: String::new(), content: String::new(), summary: String::new(), depth: 0, diff --git a/src/retrieval/pipeline/context.rs b/src/retrieval/pipeline/context.rs index 5dafaf36..b12d3d9f 100644 --- a/src/retrieval/pipeline/context.rs +++ b/src/retrieval/pipeline/context.rs @@ -10,7 +10,7 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::Instant; -use crate::domain::{DocumentTree, NodeId}; +use crate::domain::{DocumentTree, NodeId, RetrievalIndex}; use crate::retrieval::pilot::Pilot; use crate::retrieval::types::{ NavigationStep, QueryComplexity, RetrieveOptions, RetrieveResponse, SearchPath, @@ -195,6 +195,8 @@ pub struct PipelineContext { pub query: String, /// Document tree to search. pub tree: Arc, + /// Pre-computed retrieval index for efficient operations. + pub retrieval_index: Option, /// Retrieval options. pub options: RetrieveOptions, /// Optional Pilot for navigation guidance. @@ -254,9 +256,13 @@ impl PipelineContext { query: impl Into, options: RetrieveOptions, ) -> Self { + // Build retrieval index for efficient operations + let retrieval_index = Some(tree.build_retrieval_index()); + Self { query: query.into(), tree, + retrieval_index, options, pilot: None, complexity: None, diff --git a/src/retrieval/pipeline_retriever.rs b/src/retrieval/pipeline_retriever.rs index 084ad53d..e51d187a 100644 --- a/src/retrieval/pipeline_retriever.rs +++ b/src/retrieval/pipeline_retriever.rs @@ -9,6 +9,7 @@ use async_trait::async_trait; use std::sync::Arc; +use super::content::ContentAggregatorConfig; use super::pipeline::RetrievalOrchestrator; use super::retriever::{CostEstimate, Retriever, RetrieverError, RetrieverResult}; use super::stages::{AnalyzeStage, JudgeStage, PlanStage, SearchStage}; @@ -16,6 +17,7 @@ use super::strategy::LlmStrategy; use super::types::{RetrieveOptions, RetrieveResponse}; use crate::domain::DocumentTree; use crate::llm::LlmClient; +use crate::retrieval::pilot::{LlmPilot, PilotConfig}; /// Pipeline-based retriever using the stage architecture. /// @@ -37,6 +39,8 @@ pub struct PipelineRetriever { llm_client: Option, max_backtracks: usize, max_iterations: usize, + /// Content aggregator configuration. + content_config: Option, } impl Default for PipelineRetriever { @@ -52,6 +56,7 @@ impl PipelineRetriever { llm_client: None, max_backtracks: 5, max_iterations: 10, + content_config: None, } } @@ -73,6 +78,15 @@ impl PipelineRetriever { self } + /// Set content aggregator configuration. 
diff --git a/src/retrieval/pipeline_retriever.rs b/src/retrieval/pipeline_retriever.rs
index 084ad53d..e51d187a 100644
--- a/src/retrieval/pipeline_retriever.rs
+++ b/src/retrieval/pipeline_retriever.rs
@@ -9,6 +9,7 @@
 use async_trait::async_trait;
 use std::sync::Arc;
 
+use super::content::ContentAggregatorConfig;
 use super::pipeline::RetrievalOrchestrator;
 use super::retriever::{CostEstimate, Retriever, RetrieverError, RetrieverResult};
 use super::stages::{AnalyzeStage, JudgeStage, PlanStage, SearchStage};
@@ -16,6 +17,7 @@ use super::strategy::LlmStrategy;
 use super::types::{RetrieveOptions, RetrieveResponse};
 use crate::domain::DocumentTree;
 use crate::llm::LlmClient;
+use crate::retrieval::pilot::{LlmPilot, PilotConfig};
 
 /// Pipeline-based retriever using the stage architecture.
 ///
@@ -37,6 +39,8 @@ pub struct PipelineRetriever {
     llm_client: Option<LlmClient>,
     max_backtracks: usize,
     max_iterations: usize,
+    /// Content aggregator configuration.
+    content_config: Option<ContentAggregatorConfig>,
 }
 
 impl Default for PipelineRetriever {
@@ -52,6 +56,7 @@
             llm_client: None,
             max_backtracks: 5,
             max_iterations: 10,
+            content_config: None,
         }
     }
 
@@ -73,6 +78,15 @@
         self
     }
 
+    /// Set content aggregator configuration.
+    ///
+    /// When enabled, the Judge stage uses precision-focused content
+    /// aggregation with relevance scoring and token budget control.
+    pub fn with_content_config(mut self, config: ContentAggregatorConfig) -> Self {
+        self.content_config = Some(config);
+        self
+    }
+
     /// Build the orchestrator with all stages.
     fn build_orchestrator(&self) -> RetrievalOrchestrator {
         let mut orchestrator = RetrievalOrchestrator::new()
@@ -89,18 +103,24 @@
         }
         orchestrator = orchestrator.stage(plan_stage);
 
-        // Add search stage
+        // Add search stage with Pilot for semantic navigation
         let mut search_stage = SearchStage::new();
         if let Some(ref client) = self.llm_client {
-            search_stage = search_stage.with_llm_strategy(LlmStrategy::new(client.clone()));
+            // Create LLM-based Pilot for semantic navigation guidance
+            let pilot = LlmPilot::new(client.clone(), PilotConfig::default());
+            search_stage = search_stage.with_pilot(Arc::new(pilot));
         }
         orchestrator = orchestrator.stage(search_stage);
 
-        // Add judge stage
+        // Add judge stage with optional content aggregator
         let mut judge_stage = JudgeStage::new();
         if let Some(ref client) = self.llm_client {
             judge_stage = judge_stage.with_llm_judge(client.clone());
         }
+        // Configure content aggregator if provided
+        if let Some(ref config) = self.content_config {
+            judge_stage = judge_stage.with_content_aggregator(config.clone());
+        }
         orchestrator = orchestrator.stage(judge_stage);
 
         orchestrator
@@ -158,6 +178,7 @@ impl Clone for PipelineRetriever {
             llm_client: self.llm_client.clone(),
             max_backtracks: self.max_backtracks,
             max_iterations: self.max_iterations,
+            content_config: self.content_config.clone(),
         }
     }
 }
@@ -180,4 +201,11 @@ mod tests {
         assert_eq!(cloned.name(), "pipeline");
         assert_eq!(cloned.max_backtracks, 3);
     }
+
+    #[test]
+    fn test_pipeline_retriever_with_content_config() {
+        let config = ContentAggregatorConfig::default();
+        let retriever = PipelineRetriever::new().with_content_config(config);
+        assert!(retriever.content_config.is_some());
+    }
 }
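With the builder additions above, wiring the aggregator into a retriever is one extra call. A sketch, assuming `llm_client: LlmClient` already exists and that the earlier builder methods are named `with_llm_client` and `with_max_backtracks` (only `with_content_config` is visible in this diff):

```rust,ignore
let retriever = PipelineRetriever::new()
    .with_llm_client(llm_client)
    .with_max_backtracks(3)
    .with_content_config(ContentAggregatorConfig::default());
```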
diff --git a/src/retrieval/search/beam.rs b/src/retrieval/search/beam.rs
index 63cdcec1..2dec5e40 100644
--- a/src/retrieval/search/beam.rs
+++ b/src/retrieval/search/beam.rs
@@ -12,7 +12,7 @@ use tracing::{debug, trace};
 
 use super::super::RetrievalContext;
 use super::super::types::{NavigationDecision, NavigationStep, SearchPath};
-use super::scorer::NodeScorer;
+use super::scorer::{NodeScorer, ScoringContext};
 use super::{SearchConfig, SearchResult, SearchTree};
 use crate::domain::{DocumentTree, NodeId};
 use crate::retrieval::pilot::{Pilot, SearchState};
@@ -28,34 +28,36 @@
 /// (when multiple candidates are available) to get semantic guidance
 /// on which branches are most relevant to the query.
 pub struct BeamSearch {
-    scorer: NodeScorer,
     beam_width: usize,
 }
 
 impl BeamSearch {
     /// Create a new beam search with default beam width.
     pub fn new() -> Self {
-        Self {
-            scorer: NodeScorer::new(Default::default()),
-            beam_width: 3,
-        }
+        Self { beam_width: 3 }
     }
 
     /// Create beam search with specified width.
     pub fn with_width(width: usize) -> Self {
         Self {
-            scorer: NodeScorer::new(Default::default()),
             beam_width: width.max(1),
         }
     }
 
-    /// Score candidates using the algorithm's scorer.
-    fn score_candidates(
+    /// Create a scorer for the given query.
+    fn create_scorer(&self, query: &str) -> NodeScorer {
+        NodeScorer::new(ScoringContext::new(query))
+    }
+
+    /// Score candidates using a query-specific scorer.
+    fn score_candidates_with_query(
         &self,
         tree: &DocumentTree,
         candidates: &[NodeId],
+        query: &str,
     ) -> Vec<(NodeId, f32)> {
-        self.scorer.score_and_sort(tree, candidates)
+        let scorer = self.create_scorer(query);
+        scorer.score_and_sort(tree, candidates)
     }
 
     /// Merge algorithm scores with Pilot decision.
@@ -67,7 +69,9 @@
         tree: &DocumentTree,
         candidates: &[NodeId],
         pilot_decision: &crate::retrieval::pilot::PilotDecision,
+        query: &str,
     ) -> Vec<(NodeId, f32)> {
+        let scorer = self.create_scorer(query);
         let alpha = 0.4;
         let beta = 0.6 * pilot_decision.confidence;
 
@@ -81,7 +85,7 @@
         let mut merged: Vec<(NodeId, f32)> = candidates
             .iter()
             .map(|&node_id| {
-                let algo_score = self.scorer.score(tree, node_id);
+                let algo_score = scorer.score(tree, node_id);
                 let pilot_score = pilot_scores.get(&node_id).copied().unwrap_or(0.0);
 
                 // Weighted combination
@@ -138,18 +142,18 @@
 
             // Use Pilot's ranked order if available
             if guidance.has_candidates() {
-                self.merge_with_pilot_decision(tree, &root_children, &guidance)
+                self.merge_with_pilot_decision(tree, &root_children, &guidance, &context.query)
             } else {
-                self.score_candidates(tree, &root_children)
+                self.score_candidates_with_query(tree, &root_children, &context.query)
             }
         } else {
-            self.score_candidates(tree, &root_children)
+            self.score_candidates_with_query(tree, &root_children, &context.query)
        }
    } else {
-        self.score_candidates(tree, &root_children)
+        self.score_candidates_with_query(tree, &root_children, &context.query)
    }
} else {
-    self.score_candidates(tree, &root_children)
+    self.score_candidates_with_query(tree, &root_children, &context.query)
};
 
         let mut current_beam: Vec = initial_candidates
@@ -211,16 +215,16 @@
                         );
 
                         // Merge algorithm scores with Pilot decision
-                        self.merge_with_pilot_decision(tree, &children, &decision)
+                        self.merge_with_pilot_decision(tree, &children, &decision, &context.query)
                     }
                 }
             } else {
                 // No intervention, use algorithm scoring
-                self.score_candidates(tree, &children)
+                self.score_candidates_with_query(tree, &children, &context.query)
             }
         } else {
             // No Pilot, use algorithm scoring
-            self.score_candidates(tree, &children)
+            self.score_candidates_with_query(tree, &children, &context.query)
         };
 
@@ -268,6 +272,16 @@
             }
         }
 
+        // Fallback: if no results found, add best candidates regardless of score
+        if result.paths.is_empty() && config.min_score > 0.0 {
+            debug!("No results above min_score, adding best candidates as fallback");
+            // Re-score initial candidates and take top-k
+            let all_candidates = self.score_candidates_with_query(tree, &tree.children(tree.root()), &context.query);
+            for (node_id, score) in all_candidates.into_iter().take(config.top_k) {
+                result.paths.push(SearchPath::from_node(node_id, score));
+            }
+        }
+
         // Sort final results by score
         result.paths.sort_by(|a, b| {
             b.score
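The merge in `merge_with_pilot_decision` is a normalized weighted blend: with `alpha = 0.4` and `beta = 0.6 * confidence`, the final score is `(alpha * algo + beta * pilot) / (alpha + beta)`, so a low-confidence Pilot pulls the result back toward the algorithmic score. A self-contained illustration of just that arithmetic:

```rust
// Standalone copy of the blend used by merge_with_pilot_decision in this diff.
fn blended_score(algo: f32, pilot: f32, confidence: f32) -> f32 {
    let alpha = 0.4;
    let beta = 0.6 * confidence;
    if beta > 0.0 {
        (alpha * algo + beta * pilot) / (alpha + beta)
    } else {
        algo // no Pilot signal: fall back to the algorithmic score
    }
}

fn main() {
    // At full confidence: (0.4 * 0.5 + 0.6 * 0.9) / 1.0 = 0.74
    assert!((blended_score(0.5, 0.9, 1.0) - 0.74).abs() < 1e-6);
    // At zero confidence the Pilot is ignored entirely.
    assert_eq!(blended_score(0.5, 0.9, 0.0), 0.5);
}
```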
diff --git a/src/retrieval/search/greedy.rs b/src/retrieval/search/greedy.rs
index f016a066..ad9fd8d8 100644
--- a/src/retrieval/search/greedy.rs
+++ b/src/retrieval/search/greedy.rs
@@ -7,27 +7,85 @@
 //! When a Pilot is provided, it can provide semantic guidance at decision points.
 
 use async_trait::async_trait;
+use tracing::{debug, trace};
 
 use super::super::RetrievalContext;
 use super::super::types::{NavigationDecision, NavigationStep, SearchPath};
-use super::scorer::NodeScorer;
+use super::scorer::{NodeScorer, ScoringContext};
 use super::{SearchConfig, SearchResult, SearchTree};
-use crate::domain::DocumentTree;
-use crate::retrieval::pilot::Pilot;
+use crate::domain::{DocumentTree, NodeId};
+use crate::retrieval::pilot::{Pilot, SearchState};
 
 /// Greedy search - always follows the best single path.
 ///
 /// Fast but may miss relevant content in other branches.
-pub struct GreedySearch {
-    scorer: NodeScorer,
-}
+/// When a Pilot is provided, it can guide the search at key decision points.
+pub struct GreedySearch;
 
 impl GreedySearch {
     /// Create a new greedy search.
     pub fn new() -> Self {
-        Self {
-            scorer: NodeScorer::new(Default::default()),
+        Self
+    }
+
+    /// Create a scorer for the given query.
+    fn create_scorer(&self, query: &str) -> NodeScorer {
+        NodeScorer::new(ScoringContext::new(query))
+    }
+
+    /// Score candidates using a query-specific scorer.
+    fn score_candidates_with_query(
+        &self,
+        tree: &DocumentTree,
+        candidates: &[NodeId],
+        query: &str,
+    ) -> Vec<(NodeId, f32)> {
+        let scorer = self.create_scorer(query);
+        scorer.score_and_sort(tree, candidates)
+    }
+
+    /// Merge algorithm scores with Pilot decision.
+    fn merge_with_pilot_decision(
+        &self,
+        tree: &DocumentTree,
+        candidates: &[NodeId],
+        pilot_decision: &crate::retrieval::pilot::PilotDecision,
+        query: &str,
+    ) -> Vec<(NodeId, f32)> {
+        let scorer = self.create_scorer(query);
+        let alpha = 0.4;
+        let beta = 0.6 * pilot_decision.confidence;
+
+        // Build a map from node_id to pilot score
+        let mut pilot_scores: std::collections::HashMap<NodeId, f32> = std::collections::HashMap::new();
+        for ranked in &pilot_decision.ranked_candidates {
+            pilot_scores.insert(ranked.node_id, ranked.score);
         }
+
+        // Merge scores
+        let mut merged: Vec<(NodeId, f32)> = candidates
+            .iter()
+            .map(|&node_id| {
+                let algo_score = scorer.score(tree, node_id);
+                let pilot_score = pilot_scores.get(&node_id).copied().unwrap_or(0.0);
+
+                // Weighted combination
+                let final_score = if beta > 0.0 {
+                    (alpha * algo_score + beta * pilot_score) / (alpha + beta)
+                } else {
+                    algo_score
+                };
+
+                (node_id, final_score)
+            })
+            .collect();
+
+        // Sort by merged score
+        merged.sort_by(|a, b| {
+            b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)
+        });
+
+        merged
     }
 }
 
@@ -44,13 +102,15 @@
         tree: &DocumentTree,
         context: &RetrievalContext,
         config: &SearchConfig,
-        _pilot: Option<&dyn Pilot>,
+        pilot: Option<&dyn Pilot>,
     ) -> SearchResult {
-        // Note: Pilot integration for GreedySearch can be added in Phase 2
-        // For now, we keep the original behavior
         let mut result = SearchResult::default();
         let mut current_path = SearchPath::new();
         let mut current_node = tree.root();
+        let mut visited: std::collections::HashSet<NodeId> = std::collections::HashSet::new();
+
+        // Track Pilot interventions
+        let mut pilot_interventions = 0;
 
         for iteration in 0..config.max_iterations {
             result.iterations = iteration + 1;
@@ -67,8 +127,40 @@
                 break;
             }
 
-            // Score all children
-            let scored_children = self.scorer.score_and_sort(tree, &children);
+            // ========== Pilot Integration Point ==========
+            let scored_children = if let Some(p) = pilot {
+                // Build search state for Pilot
+                let state = SearchState::new(
+                    tree,
+                    &context.query,
+                    &current_path.nodes,
+                    &children,
+                    &visited,
+                );
+
+                // Check if Pilot wants to intervene
+                if p.should_intervene(&state) {
+                    trace!("Pilot intervening at greedy decision point with {} candidates", children.len());
+
+                    let decision = p.decide(&state).await;
+                    pilot_interventions += 1;
+                    debug!(
+                        "Pilot decision: confidence={}, direction={:?}",
+                        decision.confidence,
+                        std::mem::discriminant(&decision.direction)
+                    );
+
+                    // Merge algorithm scores with Pilot decision
+                    self.merge_with_pilot_decision(tree, &children, &decision, &context.query)
+                } else {
+                    // No intervention, use algorithm scoring
+                    self.score_candidates_with_query(tree, &children, &context.query)
+                }
+            } else {
+                // No Pilot, use algorithm scoring
+                self.score_candidates_with_query(tree, &children, &context.query)
+            };
+            // ==============================================
 
             // Find the best child that meets minimum score
             let mut best_child = None;
@@ -83,6 +175,8 @@
             }
 
             if let Some(child_id) = best_child {
+                visited.insert(child_id);
+
                 // Record navigation step
                 let child_node = tree.get(child_id);
                 result.trace.push(NavigationStep {
@@ -105,13 +199,18 @@
                 break;
             }
         } else {
-            // No good children found
+            // No good children found - add current path as result
             current_path.leaf = Some(current_node);
-            result.paths.push(current_path);
+            if current_path.score > 0.0 {
+                result.paths.push(current_path);
+            }
             break;
         }
     }
 
+    // Record Pilot interventions
+    result.pilot_interventions = pilot_interventions;
+
     result
 }
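End to end, the new Pilot path in greedy search can be exercised like this. A hedged sketch: the `SearchTree` trait method name (`search`) and the `Default` impl for `SearchConfig` are assumptions; `LlmPilot::new`, `GreedySearch::new`, and `result.pilot_interventions` appear in this diff:

```rust,ignore
let pilot = LlmPilot::new(llm_client, PilotConfig::default());
let search = GreedySearch::new();
let result = search
    .search(&tree, &context, &SearchConfig::default(), Some(&pilot))
    .await;
println!("pilot interventions: {}", result.pilot_interventions);
```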
diff --git a/src/retrieval/stages/judge.rs b/src/retrieval/stages/judge.rs
index f22806db..9cc11e68 100644
--- a/src/retrieval/stages/judge.rs
+++ b/src/retrieval/stages/judge.rs
@@ -12,6 +12,7 @@
 use tracing::{info, warn};
 use crate::domain::estimate_tokens;
 use crate::llm::LlmClient;
+use crate::retrieval::content::{ContentAggregator, ContentAggregatorConfig};
 use crate::retrieval::pipeline::{FailurePolicy, PipelineContext, RetrievalStage, StageOutcome};
 use crate::retrieval::sufficiency::{LlmJudge, SufficiencyChecker, ThresholdChecker};
 use crate::retrieval::types::{RetrievalResult, RetrieveResponse, SufficiencyLevel};
@@ -23,18 +24,26 @@
 /// 2. Checks if content is sufficient to answer the query
 /// 3. Can trigger additional search iterations if needed
 ///
+/// # Content Aggregation
+///
+/// By default, uses simple content collection. For precision-focused
+/// aggregation with token budget control, use `with_content_aggregator()`.
+///
 /// # Example
 ///
 /// ```rust,ignore
 /// let stage = JudgeStage::new()
 ///     .with_llm_judge(llm_client)
-///     .with_max_iterations(3);
+///     .with_max_iterations(3)
+///     .with_content_aggregator(ContentAggregatorConfig::default());
 /// ```
 pub struct JudgeStage {
     threshold_checker: ThresholdChecker,
     llm_judge: Option<LlmJudge>,
     max_iterations: usize,
     use_llm_judge: bool,
+    /// Optional content aggregator for precision-focused aggregation.
+    content_aggregator: Option<ContentAggregator>,
 }
 
 impl Default for JudgeStage {
@@ -51,6 +60,7 @@
             llm_judge: None,
             max_iterations: 3,
             use_llm_judge: false,
+            content_aggregator: None,
         }
     }
 
@@ -67,8 +77,58 @@
         self
     }
 
+    /// Add content aggregator for precision-focused aggregation.
+    ///
+    /// When enabled, content aggregation uses:
+    /// - Relevance scoring (keyword + BM25)
+    /// - Token budget allocation
+    /// - Hierarchical content selection
+    pub fn with_content_aggregator(mut self, config: ContentAggregatorConfig) -> Self {
+        self.content_aggregator = Some(ContentAggregator::new(config));
+        self
+    }
+
+    /// Enable content aggregator with default configuration.
+    pub fn with_default_content_aggregator(mut self) -> Self {
+        self.content_aggregator = Some(ContentAggregator::with_defaults());
+        self
+    }
+
+    /// Aggregate content from candidates.
+    ///
+    /// When content aggregator is enabled:
+    /// - Uses relevance scoring for content selection
+    /// - Respects token budget
+    /// - Prioritizes high-relevance content
+    ///
+    /// Otherwise falls back to simple collection:
+    /// - Collects node's own content + descendant leaf content
     fn aggregate_content(&self, ctx: &PipelineContext) -> (String, usize) {
+        // Use ContentAggregator if configured
+        if let Some(ref aggregator) = self.content_aggregator {
+            use crate::retrieval::content::CandidateNode;
+
+            let candidates: Vec<CandidateNode> = ctx.candidates
+                .iter()
+                .map(|c| CandidateNode::new(c.node_id, c.score, c.depth))
+                .collect();
+
+            let result = aggregator.aggregate(&candidates, &ctx.tree, &ctx.query);
+            info!(
+                "ContentAggregator: {} nodes, {} tokens, avg score {:.2}",
+                result.nodes_included,
+                result.tokens_used,
+                result.avg_score
+            );
+            return (result.content, result.tokens_used);
+        }
+
+        // Fallback: simple content collection
+        self.aggregate_content_simple(ctx)
+    }
+
+    /// Simple content aggregation (legacy behavior).
+    fn aggregate_content_simple(&self, ctx: &PipelineContext) -> (String, usize) {
         let mut content_parts = Vec::new();
         let mut total_tokens = 0;
 
@@ -77,13 +137,25 @@
             // Add title
             content_parts.push(format!("## {}\n", node.title));
 
-            // Add summary if available, otherwise content preview
-            if !node.summary.is_empty() {
+            // Always collect all content: own content + descendant leaf content
+            let mut has_content = false;
+
+            // Add node's own content if available
+            if !node.content.is_empty() {
+                content_parts.push(format!("{}\n\n", node.content));
+                has_content = true;
+            }
+
+            // Also collect content from leaf descendants (for intermediate nodes)
+            let leaf_content = self.collect_leaf_content(&ctx.tree, candidate.node_id);
+            if !leaf_content.is_empty() {
+                content_parts.push(format!("{}\n\n", leaf_content));
+                has_content = true;
+            }
+
+            // Fall back to summary only if no content available
+            if !has_content && !node.summary.is_empty() {
                 content_parts.push(format!("{}\n\n", node.summary));
-            } else if !node.content.is_empty() {
-                // Limit content preview
-                let preview: String = node.content.chars().take(500).collect();
-                content_parts.push(format!("{}\n\n", preview));
             }
 
             // Estimate tokens
@@ -94,6 +166,38 @@
 
         (content_parts.join(""), total_tokens)
     }
+    /// Collect content from leaf descendants of a node (excluding the node itself).
+    fn collect_leaf_content(&self, tree: &crate::domain::DocumentTree, node_id: crate::domain::NodeId) -> String {
+        let mut content_parts = Vec::new();
+
+        // Start with children, not the node itself
+        let children = tree.children(node_id);
+        if children.is_empty() {
+            // Node is already a leaf, no descendants to collect
+            return String::new();
+        }
+
+        // Push in reverse so pop() yields leaves in document order
+        let mut stack: Vec<crate::domain::NodeId> = children.into_iter().rev().collect();
+
+        while let Some(current_id) = stack.pop() {
+            let current_children = tree.children(current_id);
+
+            if current_children.is_empty() {
+                // Leaf node - collect its content
+                if let Some(node) = tree.get(current_id) {
+                    if !node.content.is_empty() {
+                        content_parts.push(format!("### {}\n{}", node.title, node.content));
+                    }
+                }
+            } else {
+                // Non-leaf node - add children to stack (reversed to keep document order)
+                stack.extend(current_children.into_iter().rev());
+            }
+        }
+
+        content_parts.join("\n\n")
+    }
+
     /// Check sufficiency level.
     fn check_sufficiency(&self, ctx: &PipelineContext) -> SufficiencyLevel {
         if !ctx.options.sufficiency_check {
@@ -118,14 +222,34 @@
 
         for candidate in &ctx.candidates {
             if let Some(node) = ctx.tree.get(candidate.node_id) {
+                // Build content: node's own content + all descendant leaf content
+                let content = if ctx.options.include_content {
+                    let mut content_parts = Vec::new();
+
+                    // Add node's own content
+                    if !node.content.is_empty() {
+                        content_parts.push(node.content.clone());
+                    }
+
+                    // Add content from leaf descendants
+                    let leaf_content = self.collect_leaf_content(&ctx.tree, candidate.node_id);
+                    if !leaf_content.is_empty() {
+                        content_parts.push(leaf_content);
+                    }
+
+                    if content_parts.is_empty() {
+                        None
+                    } else {
+                        Some(content_parts.join("\n\n"))
+                    }
+                } else {
+                    None
+                };
+
                 results.push(RetrievalResult {
                     node_id: Some(format!("{:?}", candidate.node_id)),
                     title: node.title.clone(),
-                    content: if ctx.options.include_content {
-                        Some(node.content.clone())
-                    } else {
-                        None
-                    },
+                    content,
                     summary: if ctx.options.include_summaries {
                         Some(node.summary.clone())
                     } else {
diff --git a/src/retrieval/stages/search.rs b/src/retrieval/stages/search.rs
index 0283de23..e9addfe7 100644
--- a/src/retrieval/stages/search.rs
+++ b/src/retrieval/stages/search.rs
@@ -147,7 +147,7 @@
             // Get node info
             if let Some(node) = tree.get(leaf_id) {
                 let depth = node.depth;
-                let is_leaf = tree.children(leaf_id).is_empty();
+                let is_leaf = tree.is_leaf(leaf_id);
 
                 candidates.push(CandidateNode::new(leaf_id, path.score, depth, is_leaf));
             }
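The stack-based walk in `collect_leaf_content` pushes children in reverse so that `pop()` yields leaves in document order. A runnable toy version of the same traversal over an index-based tree:

```rust
fn main() {
    // children[i] lists the child indices of node i; leaves have none.
    // Node 0 is the root with children 1 and 4; node 1 has leaves 2 and 3.
    let children: Vec<Vec<usize>> = vec![vec![1, 4], vec![2, 3], vec![], vec![], vec![]];

    let mut leaves = Vec::new();
    let mut stack: Vec<usize> = children[0].iter().rev().copied().collect();
    while let Some(id) = stack.pop() {
        if children[id].is_empty() {
            leaves.push(id); // leaf: would collect its content here
        } else {
            // push in reverse so the leftmost child is popped first
            stack.extend(children[id].iter().rev().copied());
        }
    }
    assert_eq!(leaves, vec![2, 3, 4]); // leaves visited in document order
}
```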
diff --git a/templates/template.toml b/templates/template.toml
index 5ea61bce..66e85e21 100644
--- a/templates/template.toml
+++ b/templates/template.toml
@@ -108,6 +108,47 @@ high_similarity_threshold = 0.8
 # Low similarity threshold for "explore" decision
 low_similarity_threshold = 0.3
 
+# Content aggregator configuration
+# Controls how retrieved content is aggregated and returned
+[retrieval.content]
+# Enable/disable content aggregator
+# When disabled, uses simple content collection (legacy behavior)
+enabled = true
+
+# Maximum tokens for aggregated content
+token_budget = 4000
+
+# Minimum relevance score threshold (0.0 - 1.0)
+# Content below this threshold will be filtered out
+min_relevance_score = 0.2
+
+# Scoring strategy: "keyword_only" | "keyword_bm25" | "hybrid"
+# - keyword_only: Fast keyword matching (no BM25)
+# - keyword_bm25: Keyword + BM25 scoring (recommended)
+# - hybrid: Keyword + LLM reranking (most accurate, slower)
+scoring_strategy = "keyword_bm25"
+
+# Output format: "markdown" | "json" | "tree" | "flat"
+# - markdown: Structured markdown with headers (default)
+# - json: JSON format for programmatic use
+# - tree: Tree structure preserving hierarchy
+# - flat: Flat text format
+output_format = "markdown"
+
+# Include relevance scores in output (useful for debugging)
+include_scores = false
+
+# Minimum budget allocation per depth level (0.0 - 1.0)
+# Ensures each tree level gets representation
+hierarchical_min_per_level = 0.1
+
+# Enable content deduplication
+deduplicate = true
+
+# Similarity threshold for deduplication (0.0 - 1.0)
+# Higher = more aggressive deduplication
+dedup_threshold = 0.9
+
 [storage]
 # Workspace directory for persisted documents
 #
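For consumers that deserialize this template, the new `[retrieval.content]` table maps onto a plain struct. An illustrative mirror (the struct is not part of this diff; it assumes the `serde` and `toml` crates):

```rust
use serde::Deserialize;

// Illustrative only: field names follow the [retrieval.content] table above.
#[derive(Debug, Deserialize)]
struct ContentSection {
    enabled: bool,
    token_budget: usize,
    min_relevance_score: f32,
    scoring_strategy: String,
    output_format: String,
    include_scores: bool,
    hierarchical_min_per_level: f32,
    deduplicate: bool,
    dedup_threshold: f32,
}

fn main() {
    let raw = r#"
enabled = true
token_budget = 4000
min_relevance_score = 0.2
scoring_strategy = "keyword_bm25"
output_format = "markdown"
include_scores = false
hierarchical_min_per_level = 0.1
deduplicate = true
dedup_threshold = 0.9
"#;
    let cfg: ContentSection = toml::from_str(raw).expect("valid [retrieval.content] table");
    assert!(cfg.enabled && cfg.token_budget == 4000);
}
```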