-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.py
More file actions
49 lines (42 loc) · 1.76 KB
/
config.py
File metadata and controls
49 lines (42 loc) · 1.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"""Configuration for the book encoding pipeline."""
from pathlib import Path
from typing import Optional

from pydantic import BaseModel, Field
class ChunkingConfig(BaseModel):
    """Semantic chunking configuration.

    Numeric fields carry range constraints so that an out-of-range value
    (zero/negative size, overlap fraction of 1.0 or more) raises a
    validation error when the config is built instead of being silently
    accepted.
    """

    # Target tokens per child chunk; must be positive.
    chunk_size: int = Field(512, gt=0)
    # Parent context size; must be positive.
    parent_chunk_size: int = Field(2048, gt=0)
    # Overlap for context continuity, expressed as a fraction (0.25 = 25%).
    # Bounded to [0, 1): a fraction of 1.0 would mean consecutive chunks
    # overlap completely.
    overlap: float = Field(0.25, ge=0.0, lt=1.0)
    # Minimum viable chunk; must not be negative.
    min_chunk_size: int = Field(100, ge=0)
    # Don't split mid-sentence.
    respect_sentences: bool = True
class EmbeddingConfig(BaseModel):
    """Settings describing the embedding model and how it is run."""

    # Fast, efficient model (384 dims, ~90MB).
    # NOTE(review): some pydantic v2 releases treat the ``model_`` prefix as a
    # protected namespace and warn on this field name; confirm against the
    # pinned pydantic version.
    model_name: str = "all-MiniLM-L6-v2"

    # Vector width; 384 matches the dimensionality noted for the default model.
    dimension: int = 384

    normalize: bool = True
    batch_size: int = 32

    # "cuda" by default; "cpu" is the alternative.
    device: str = "cuda"
class VectorDBConfig(BaseModel):
    """Settings for the vector database: collection, storage path, metric."""

    # Name of the collection holding the encoded book.
    collection_name: str = "cybersecurity_book"

    # Relative path by default.
    # NOTE(review): presumably resolved against the working directory; confirm.
    persist_directory: str = "./vector_db"

    # Metric name passed as a plain string; not validated here.
    distance_metric: str = "cosine"
class MetadataConfig(BaseModel):
    """Switches for the metadata enrichment steps (all enabled by default)."""

    extract_entities: bool = True
    generate_summaries: bool = True

    # Tag cybersecurity severity levels.
    tag_severity: bool = True

    # Extract CVE patterns.
    extract_cves: bool = True
class PipelineConfig(BaseModel):
    """Top-level pipeline configuration.

    Bundles the per-stage configuration models together with the book path
    and a few pipeline-wide feature switches.
    """

    # Path to the input book; left unset (None) by default.
    book_path: Optional[str] = None

    # Per-stage settings, each defaulting to that model's own defaults.
    chunking: ChunkingConfig = ChunkingConfig()
    embedding: EmbeddingConfig = EmbeddingConfig()
    vector_db: VectorDBConfig = VectorDBConfig()
    metadata: MetadataConfig = MetadataConfig()

    # --- Advanced options ---
    use_parent_document_retrieval: bool = True
    use_hybrid_search: bool = True
    # Experimental; disabled by default.
    use_hyperbolic_embeddings: bool = False
# Module-level default configuration: a PipelineConfig built with no
# arguments, so every field takes its declared default.
DEFAULT_CONFIG: PipelineConfig = PipelineConfig()