# Source: nm727/Data-Embedding, config.py (main branch)
"""Configuration for the book encoding pipeline."""
from pathlib import Path
from pydantic import BaseModel
from typing import Optional

class ChunkingConfig(BaseModel):
    """Semantic chunking configuration.

    Groups the size and overlap settings for child chunks and their
    larger parent contexts. Only field types and defaults are declared
    here: this class defines no range checks (e.g. on ``overlap``) and no
    cross-field checks (e.g. ``min_chunk_size`` against ``chunk_size``).
    """
    chunk_size: int = 512  # Target tokens per child chunk
    # NOTE(review): presumably measured in tokens like chunk_size — confirm with the chunker.
    parent_chunk_size: int = 2048  # Parent context size
    # Expressed as a fraction, not a percentage or a token count.
    overlap: float = 0.25  # 25% overlap for context continuity
    # NOTE(review): unit is not stated here; presumably tokens — confirm with the chunker.
    min_chunk_size: int = 100  # Minimum viable chunk
    respect_sentences: bool = True  # Don't split mid-sentence

class EmbeddingConfig(BaseModel):
    """Embedding model configuration.

    Names the embedding model and the settings it is run with.
    ``dimension`` is a separate field and is not derived from
    ``model_name`` in this class, so anyone overriding one of them has to
    keep the other in agreement.
    """
    # NOTE(review): on some pydantic 2.x releases, field names starting with
    # "model_" trigger a protected-namespace warning — confirm against the
    # installed pydantic version.
    model_name: str = "all-MiniLM-L6-v2"  # Fast, efficient model (384 dims, ~90MB)
    dimension: int = 384  # Vector size of the default model above
    # NOTE(review): presumably unit-normalizes the output vectors — confirm in the encoder.
    normalize: bool = True
    batch_size: int = 32  # NOTE(review): presumably inputs per encode call — confirm
    # The default targets a GPU; nothing in this class falls back to "cpu"
    # when CUDA is unavailable, so CPU-only callers must override it.
    device: str = "cuda"  # or "cpu"

class VectorDBConfig(BaseModel):
    """Vector database configuration.

    Holds the collection name, the persistence directory, and the
    distance metric as plain strings. Nothing beyond the ``str`` type is
    checked here; in particular ``distance_metric`` is not restricted to
    a fixed set of names.
    """
    collection_name: str = "cybersecurity_book"
    # Relative path. NOTE(review): presumably resolved against the process
    # working directory by whatever opens the store — confirm.
    persist_directory: str = "./vector_db"
    # Free-form string; the accepted values depend on the vector store in use.
    distance_metric: str = "cosine"

class MetadataConfig(BaseModel):
    """Metadata enrichment configuration.

    Four boolean switches, all enabled by default. This class only stores
    the flags; the enrichment steps themselves live elsewhere.
    """
    # NOTE(review): presumably named-entity extraction — confirm with the enrichment code.
    extract_entities: bool = True
    generate_summaries: bool = True
    tag_severity: bool = True  # Tag cybersecurity severity levels
    extract_cves: bool = True  # Extract CVE patterns

class PipelineConfig(BaseModel):
    """Master pipeline configuration.

    Aggregates the four section configs (chunking, embedding, vector
    database, metadata) together with the book path and three top-level
    feature flags. Every field has a default, so ``PipelineConfig()`` can
    be built with no arguments.
    """
    # No book is selected by default.
    # NOTE(review): how a None path is handled is decided by the caller — confirm.
    book_path: Optional[str] = None
    # NOTE(review): the four defaults below are instances created once, when
    # this class body is executed. Pydantic is documented to copy such
    # defaults for each new model instance — confirm for the installed
    # version, or consider Field(default_factory=...) to make it explicit.
    chunking: ChunkingConfig = ChunkingConfig()
    embedding: EmbeddingConfig = EmbeddingConfig()
    vector_db: VectorDBConfig = VectorDBConfig()
    metadata: MetadataConfig = MetadataConfig()

    # Advanced options
    use_parent_document_retrieval: bool = True
    use_hybrid_search: bool = True
    use_hyperbolic_embeddings: bool = False  # Experimental

# Default configuration instance: built once at import time with every field
# left at its declared default (so book_path is None). This is a single
# module-level object shared by all importers.
DEFAULT_CONFIG: PipelineConfig = PipelineConfig()