From 8b60e71c7b668f05b28bcbf6c44e0a86685ed3ad Mon Sep 17 00:00:00 2001 From: "Mike@work" Date: Tue, 17 Feb 2026 10:10:15 -0500 Subject: [PATCH] merge conflict --- .gitignore | 2 + src/catalog/catalog/config/__init__.py | 31 +++ .../catalog/config/defaults/default.yaml | 110 +++++++++++ .../config/defaults/environments/dev.yaml | 13 ++ .../config/defaults/environments/prod.yaml | 17 ++ .../config/defaults/environments/test.yaml | 23 +++ src/catalog/catalog/config/loader.py | 157 +++++++++++++++ src/catalog/catalog/config/schema.py | 163 +++++++++++++++ src/catalog/catalog/core/settings.py | 38 +++- src/catalog/catalog/instance.py | 0 src/catalog/tests/idx/unit/config/__init__.py | 0 .../tests/idx/unit/config/test_loader.py | 187 ++++++++++++++++++ .../tests/idx/unit/config/test_schema.py | 158 +++++++++++++++ 13 files changed, 895 insertions(+), 4 deletions(-) create mode 100644 src/catalog/catalog/config/__init__.py create mode 100644 src/catalog/catalog/config/defaults/default.yaml create mode 100644 src/catalog/catalog/config/defaults/environments/dev.yaml create mode 100644 src/catalog/catalog/config/defaults/environments/prod.yaml create mode 100644 src/catalog/catalog/config/defaults/environments/test.yaml create mode 100644 src/catalog/catalog/config/loader.py create mode 100644 src/catalog/catalog/config/schema.py create mode 100644 src/catalog/catalog/instance.py create mode 100644 src/catalog/tests/idx/unit/config/__init__.py create mode 100644 src/catalog/tests/idx/unit/config/test_loader.py create mode 100644 src/catalog/tests/idx/unit/config/test_schema.py diff --git a/.gitignore b/.gitignore index 394f34a..0d110ba 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,8 @@ node_modules .wt.activate .wt.agent-env .wt.port +**/.testmondata-* + .ropeproject/ .ruff_cache/ diff --git a/src/catalog/catalog/config/__init__.py b/src/catalog/catalog/config/__init__.py new file mode 100644 index 0000000..27047fe --- /dev/null +++ 
b/src/catalog/catalog/config/__init__.py @@ -0,0 +1,31 @@ +"""catalog.config - Application-level Hydra configuration. + +Provides YAML-driven application configuration that mirrors all settings +defined in ``catalog.core.settings``. Uses Hydra's ``compose()`` API for +variable interpolation and config composition. + +Priority (highest to lowest): + 1. Environment variables (``IDX_*``) + 2. Hydra overrides (passed programmatically) + 3. YAML config file values + 4. Pydantic field defaults + +Example usage:: + + from catalog.config import load_app_config + + # Load default config + settings = load_app_config() + + # Load with environment-specific overrides + settings = load_app_config(overrides=["+environment=dev"]) + + # Load from a custom config directory + settings = load_app_config(config_dir=Path("my/configs")) +""" + +from catalog.config.loader import load_app_config + +__all__ = [ + "load_app_config", +] diff --git a/src/catalog/catalog/config/defaults/default.yaml b/src/catalog/catalog/config/defaults/default.yaml new file mode 100644 index 0000000..7c52284 --- /dev/null +++ b/src/catalog/catalog/config/defaults/default.yaml @@ -0,0 +1,110 @@ +# catalog application configuration - default settings +# +# All values here match the Pydantic defaults in catalog.core.settings. +# Override specific values in environment-specific configs (dev.yaml, etc.) +# or via IDX_* environment variables (highest priority). 
+# +# Hydra variable interpolation is supported: +# ${oc.env:HOME} resolves to $HOME +# ${log_level} cross-references within this config +# +# Usage: +# from catalog.config import load_app_config +# settings = load_app_config() # loads this file +# settings = load_app_config("default", overrides=["+environment=dev"]) + +# -- Paths ------------------------------------------------------------------ + +database_path: ${oc.env:HOME}/.idx/catalog.db +vector_store_path: ${oc.env:HOME}/.idx/vector_store +cache_path: ${oc.env:HOME}/.idx/cache + +# -- Models ------------------------------------------------------------------ + +embedding_model: BAAI/bge-small-en-v1.5 +transformers_model: cross-encoder/ms-marco-MiniLM-L-6-v2 + +# -- Logging ----------------------------------------------------------------- + +log_level: INFO + +# -- Multi-database paths ---------------------------------------------------- + +databases: + catalog_path: ${oc.env:HOME}/.idx/catalog.db + content_path: ${oc.env:HOME}/.idx/content.db + +# -- Embedding --------------------------------------------------------------- + +embedding: + backend: mlx + model_name: mlx-community/all-MiniLM-L6-v2-bf16 + batch_size: 32 + embedding_dim: 384 + +# -- Langfuse observability -------------------------------------------------- + +langfuse: + enabled: false + public_key: null + secret_key: null + host: null + +# -- Performance ------------------------------------------------------------- + +performance: + batch_size: 100 + concurrency: 4 + embedding_batch_size: 32 + chunk_max_bytes: 2048 + chunk_min_bytes: 128 + +# -- Qdrant ------------------------------------------------------------------ + +qdrant: + collection_name: catalog_vectors + +# -- RAG pipeline ------------------------------------------------------------ + +rag: + # Chunking + chunk_size: 800 + chunk_overlap: 120 + chunk_fallback_enabled: true + chunk_chars_per_token: 4 + + # Embedding prefixes + embed_batch_size: 32 + embed_fallback_enabled: true + 
embed_prefix_query: "task: search result | query: " + embed_prefix_doc: "title: {title} | text: " + + # Query expansion + expansion_enabled: true + expansion_max_lex: 3 + expansion_max_vec: 3 + expansion_include_hyde: true + + # RRF fusion + rrf_k: 60 + rrf_original_weight: 2.0 + rrf_expansion_weight: 1.0 + rrf_rank1_bonus: 0.05 + rrf_rank23_bonus: 0.02 + + # Reranking + rerank_top_n: 10 + rerank_candidates: 40 + rerank_cache_enabled: true + + # Caching + cache_ttl_hours: 168 + + # Retrieval + vector_top_k: 20 + fts_top_k: 20 + fusion_top_k: 30 + + # Snippets + snippet_max_lines: 10 + snippet_context_lines: 2 diff --git a/src/catalog/catalog/config/defaults/environments/dev.yaml b/src/catalog/catalog/config/defaults/environments/dev.yaml new file mode 100644 index 0000000..ce0cd86 --- /dev/null +++ b/src/catalog/catalog/config/defaults/environments/dev.yaml @@ -0,0 +1,13 @@ +# Development environment overrides +# +# Loaded via: load_app_config(overrides=["+environment=dev"]) +# Or automatically when CATALOG_ENV=dev + +log_level: DEBUG + +performance: + batch_size: 10 + concurrency: 2 + +rag: + rerank_cache_enabled: false diff --git a/src/catalog/catalog/config/defaults/environments/prod.yaml b/src/catalog/catalog/config/defaults/environments/prod.yaml new file mode 100644 index 0000000..cba59b3 --- /dev/null +++ b/src/catalog/catalog/config/defaults/environments/prod.yaml @@ -0,0 +1,17 @@ +# Production environment overrides +# +# Loaded via: load_app_config(overrides=["+environment=prod"]) +# Or automatically when CATALOG_ENV=prod + +log_level: WARNING + +performance: + batch_size: 200 + concurrency: 8 + embedding_batch_size: 64 + +rag: + rerank_cache_enabled: true + cache_ttl_hours: 336 + rerank_candidates: 60 + rerank_top_n: 15 diff --git a/src/catalog/catalog/config/defaults/environments/test.yaml b/src/catalog/catalog/config/defaults/environments/test.yaml new file mode 100644 index 0000000..4f3c678 --- /dev/null +++ 
b/src/catalog/catalog/config/defaults/environments/test.yaml @@ -0,0 +1,23 @@ +# Test environment overrides +# +# Loaded via: load_app_config(overrides=["+environment=test"]) +# Or automatically when CATALOG_ENV=test + +log_level: WARNING + +databases: + catalog_path: /tmp/idx-test/catalog.db + content_path: /tmp/idx-test/content.db + +database_path: /tmp/idx-test/catalog.db +vector_store_path: /tmp/idx-test/vector_store +cache_path: /tmp/idx-test/cache + +performance: + batch_size: 5 + concurrency: 1 + embedding_batch_size: 8 + +rag: + rerank_cache_enabled: false + expansion_enabled: false diff --git a/src/catalog/catalog/config/loader.py b/src/catalog/catalog/config/loader.py new file mode 100644 index 0000000..cf63637 --- /dev/null +++ b/src/catalog/catalog/config/loader.py @@ -0,0 +1,157 @@ +"""catalog.config.loader - Hydra-based application configuration loader. + +Follows the same pattern as ``catalog.ingest.job.DatasetJob.from_yaml()``: +uses Hydra's ``compose()`` API (not ``@hydra.main()``) for YAML loading +with variable interpolation and config composition. + +The loader produces ``catalog.core.settings.Settings`` instances by: +1. Loading YAML via Hydra into an ``AppConfig`` Pydantic model +2. Converting the validated config to a dict +3. Constructing a ``Settings`` instance using a custom source priority + so that environment variables always override YAML values: + ``env vars > YAML config > Pydantic defaults`` +""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from agentlayer.logging import get_logger + +from catalog.config.schema import AppConfig + +__all__ = [ + "load_app_config", + "load_app_config_from_file", +] + +logger = get_logger(__name__) + +_DEFAULT_CONFIG_DIR = Path(__file__).parent / "defaults" + + +def _build_settings_from_yaml(config_dict: dict[str, Any]) -> "Settings": + """Construct a Settings instance with YAML values below env vars in priority. 
+ + Uses ``settings_customise_sources`` to ensure the priority order is: + ``env vars > YAML (init kwargs) > field defaults``. + + By default pydantic-settings treats init kwargs as highest priority. + This function creates a temporary subclass that demotes init kwargs + below environment variables. + + Args: + config_dict: Validated config dict from AppConfig.to_settings_dict(). + + Returns: + Settings instance with correct source priority. + """ + from pydantic_settings import BaseSettings + + from catalog.core.settings import Settings + + class _YamlBackedSettings(Settings): + """Settings subclass that prioritizes env vars over init kwargs.""" + + @classmethod + def settings_customise_sources( + cls, + settings_cls: type[BaseSettings], + init_settings: Any, + env_settings: Any, + dotenv_settings: Any, + file_secret_settings: Any, + ) -> tuple: + # env vars first, then init kwargs (YAML values), then dotenv and secrets + return (env_settings, init_settings, dotenv_settings, file_secret_settings) + + settings = _YamlBackedSettings(**config_dict) + settings.ensure_directories() + return settings + + +def load_app_config( + config_name: str = "default", + config_dir: Path | None = None, + overrides: list[str] | None = None, +) -> "Settings": + """Load application configuration from YAML via Hydra and return Settings. + + Loads the named YAML config file using Hydra's ``compose()`` API, + validates it into an ``AppConfig`` model, then constructs a + ``Settings`` instance. Environment variables (``IDX_*``) always + override YAML-supplied values. + + Args: + config_name: Stem of the YAML file to load (without ``.yaml``). + Defaults to ``"default"`` which loads ``defaults/default.yaml``. + config_dir: Directory containing config files. Defaults to the + ``defaults/`` directory shipped with this module. + overrides: Hydra override strings, e.g. ``["log_level=DEBUG"]``. + + Returns: + A fully-configured ``Settings`` instance. 
+ + Raises: + FileNotFoundError: If the resolved config directory does not exist. + """ + from hydra import compose, initialize_config_dir + from hydra.core.global_hydra import GlobalHydra + from omegaconf import OmegaConf + + if config_dir is None: + config_dir = _DEFAULT_CONFIG_DIR + + config_dir = config_dir.resolve() + if not config_dir.exists(): + raise FileNotFoundError(f"Config directory not found: {config_dir}") + + # Clear any previous Hydra state (compose() requires a clean GlobalHydra) + GlobalHydra.instance().clear() + + try: + with initialize_config_dir(config_dir=str(config_dir), version_base=None): + cfg = compose(config_name=config_name, overrides=overrides or []) + + raw = OmegaConf.to_container(cfg, resolve=True) + logger.debug(f"Loaded Hydra config '{config_name}' from {config_dir}") + + # Validate through AppConfig to catch schema errors early + app_config = AppConfig.model_validate(raw) + config_dict = app_config.to_settings_dict() + + return _build_settings_from_yaml(config_dict) + + finally: + GlobalHydra.instance().clear() + + +def load_app_config_from_file( + path: Path, + overrides: list[str] | None = None, +) -> "Settings": + """Load application configuration from an arbitrary YAML file path. + + Convenience wrapper around :func:`load_app_config` for loading + a specific file rather than a named config from a directory. + + Args: + path: Absolute or relative path to a YAML config file. + overrides: Hydra override strings. + + Returns: + A fully-configured ``Settings`` instance. + + Raises: + FileNotFoundError: If the file does not exist. 
+ """ + path = path.resolve() + if not path.exists(): + raise FileNotFoundError(f"Config file not found: {path}") + + return load_app_config( + config_name=path.stem, + config_dir=path.parent, + overrides=overrides, + ) diff --git a/src/catalog/catalog/config/schema.py b/src/catalog/catalog/config/schema.py new file mode 100644 index 0000000..e0cc25d --- /dev/null +++ b/src/catalog/catalog/config/schema.py @@ -0,0 +1,163 @@ +"""catalog.config.schema - Pydantic models for YAML-based application config. + +Mirrors the structure of ``catalog.core.settings`` but uses plain +``BaseModel`` instead of ``BaseSettings``. These models validate the +raw dict produced by Hydra/OmegaConf before merging with environment +variable overrides. + +The ``AppConfig`` top-level model can be converted to the keyword arguments +for ``catalog.core.settings.Settings`` via :meth:`AppConfig.to_settings_dict`. +""" + +from __future__ import annotations + +from pathlib import Path +from typing import Literal + +from pydantic import BaseModel, Field + +__all__ = [ + "AppConfig", + "DatabasesConfig", + "EmbeddingConfig", + "LangfuseConfig", + "PerformanceConfig", + "QdrantConfig", + "RAGConfig", +] + + +class LangfuseConfig(BaseModel): + """Langfuse observability configuration.""" + + enabled: bool = False + public_key: str | None = None + secret_key: str | None = None + host: str | None = None + + +class EmbeddingConfig(BaseModel): + """Embedding model configuration.""" + + backend: Literal["mlx", "huggingface"] = "mlx" + model_name: str = "mlx-community/all-MiniLM-L6-v2-bf16" + batch_size: int = 32 + embedding_dim: int = 384 + + +class PerformanceConfig(BaseModel): + """Batch processing and concurrency configuration.""" + + batch_size: int = 100 + concurrency: int = 4 + embedding_batch_size: int = 32 + chunk_max_bytes: int = 2048 + chunk_min_bytes: int = 128 + + +class QdrantConfig(BaseModel): + """Qdrant vector store configuration.""" + + collection_name: str = "catalog_vectors" + + +class 
DatabasesConfig(BaseModel): + """Multi-database path configuration.""" + + catalog_path: Path = Field(default_factory=lambda: Path("~/.idx/catalog.db").expanduser()) + content_path: Path = Field(default_factory=lambda: Path("~/.idx/content.db").expanduser()) + + +class RAGConfig(BaseModel): + """RAG pipeline configuration.""" + + # Chunking + chunk_size: int = 800 + chunk_overlap: int = 120 + chunk_fallback_enabled: bool = True + chunk_chars_per_token: int = 4 + + # Embedding + embed_batch_size: int = 32 + embed_fallback_enabled: bool = True + embed_prefix_query: str = "task: search result | query: " + embed_prefix_doc: str = "title: {title} | text: " + + # Query expansion + expansion_enabled: bool = True + expansion_max_lex: int = 3 + expansion_max_vec: int = 3 + expansion_include_hyde: bool = True + + # RRF fusion + rrf_k: int = 60 + rrf_original_weight: float = 2.0 + rrf_expansion_weight: float = 1.0 + rrf_rank1_bonus: float = 0.05 + rrf_rank23_bonus: float = 0.02 + + # Reranking + rerank_top_n: int = 10 + rerank_candidates: int = 40 + rerank_cache_enabled: bool = True + + # Caching + cache_ttl_hours: int = 168 + + # Retrieval + vector_top_k: int = 20 + fts_top_k: int = 20 + fusion_top_k: int = 30 + + # Snippets + snippet_max_lines: int = 10 + snippet_context_lines: int = 2 + + +class AppConfig(BaseModel): + """Top-level application configuration loaded from YAML. + + Mirrors the structure of ``catalog.core.settings.Settings`` exactly, + allowing a 1:1 mapping between YAML keys and Settings fields. + + All fields carry the same defaults as their Settings counterparts, + so partial YAML files work correctly: only specified keys override + the defaults. 
+ """ + + # Direct fields + database_path: Path = Field(default_factory=lambda: Path("~/.idx/catalog.db").expanduser()) + vector_store_path: Path = Field(default_factory=lambda: Path("~/.idx/vector_store").expanduser()) + cache_path: Path = Field(default_factory=lambda: Path("~/.idx/cache").expanduser()) + embedding_model: str = "BAAI/bge-small-en-v1.5" + transformers_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2" + log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO" + + # Nested configs + databases: DatabasesConfig = Field(default_factory=DatabasesConfig) + embedding: EmbeddingConfig = Field(default_factory=EmbeddingConfig) + langfuse: LangfuseConfig = Field(default_factory=LangfuseConfig) + performance: PerformanceConfig = Field(default_factory=PerformanceConfig) + qdrant: QdrantConfig = Field(default_factory=QdrantConfig) + rag: RAGConfig = Field(default_factory=RAGConfig) + + def to_settings_dict(self) -> dict: + """Convert to a dict suitable for constructing Settings. + + Serializes all fields to a dict (nested models become nested dicts) + that ``Settings(**d)`` can consume. Path objects are converted + to strings for pydantic-settings compatibility. + + Returns: + Dictionary with all config values. + """ + data = self.model_dump() + # Convert Path objects to strings for Settings constructor + for key in ("database_path", "vector_store_path", "cache_path"): + if isinstance(data.get(key), Path): + data[key] = str(data[key]) + dbs = data.get("databases", {}) + for key in ("catalog_path", "content_path"): + if isinstance(dbs.get(key), Path): + dbs[key] = str(dbs[key]) + return data diff --git a/src/catalog/catalog/core/settings.py b/src/catalog/catalog/core/settings.py index 3a3a63f..4fafbb9 100644 --- a/src/catalog/catalog/core/settings.py +++ b/src/catalog/catalog/core/settings.py @@ -1,6 +1,7 @@ """catalog.core.settings - Library configuration via pydantic-settings. 
-Supports environment variables first; config-file support is deferred. +Supports both environment variables and YAML config files (via Hydra). +Environment variables always take highest priority. All settings use the SUBSTRATE_ prefix for environment variables. @@ -12,14 +13,26 @@ (e.g. dev.toml, prod.toml) is read from the catalog.core package. Values in that file override Pydantic defaults only; environment variables still take precedence. -Example usage: +Configuration sources (highest to lowest priority): + 1. Environment variables (IDX_*) + 2. YAML config file (when CATALOG_CONFIG_PATH is set) + 3. Pydantic field defaults + +Example usage:: + from catalog.core.settings import get_settings settings = get_settings() print(settings.config_root) print(settings.databases.catalog_path) +YAML config usage:: + + export CATALOG_CONFIG_PATH=/path/to/config.yaml + # get_settings() will load YAML then overlay env vars + Environment variables: + CATALOG_CONFIG_PATH - Path to YAML config file (optional) SUBSTRATE_ENVIRONMENT - Application environment: dev (default), prod, staging, test SUBSTRATE_CONFIG_ROOT - Override config root for current environment SUBSTRATE_CONFIG_ROOT_DEV, SUBSTRATE_CONFIG_ROOT_PROD, etc. - Per-environment config root @@ -38,6 +51,7 @@ import os import tempfile import tomllib + from functools import lru_cache from pathlib import Path from typing import Literal @@ -722,15 +736,31 @@ def get_settings() -> Settings: """Get the singleton Settings instance. Returns a cached Settings instance, creating it on first call. - The settings are loaded from environment variables. + + When the ``CATALOG_CONFIG_PATH`` environment variable is set, the + settings are loaded from the specified YAML file via Hydra (with + environment variables overlaid on top). Otherwise, settings are + loaded purely from environment variables and Pydantic defaults. Returns: The singleton Settings instance. 
- Example: + Example:: + settings = get_settings() print(settings.database_path) + + # Or with YAML config: + # export CATALOG_CONFIG_PATH=~/.idx/config.yaml + settings = get_settings() """ + config_path = os.environ.get("CATALOG_CONFIG_PATH") + + if config_path: + from catalog.config.loader import load_app_config_from_file + + return load_app_config_from_file(Path(config_path)) + settings = Settings() settings.ensure_directories() return settings diff --git a/src/catalog/catalog/instance.py b/src/catalog/catalog/instance.py new file mode 100644 index 0000000..e69de29 diff --git a/src/catalog/tests/idx/unit/config/__init__.py b/src/catalog/tests/idx/unit/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/catalog/tests/idx/unit/config/test_loader.py b/src/catalog/tests/idx/unit/config/test_loader.py new file mode 100644 index 0000000..b1dbfbb --- /dev/null +++ b/src/catalog/tests/idx/unit/config/test_loader.py @@ -0,0 +1,187 @@ +"""Tests for catalog.config.loader module.""" + +import os +from pathlib import Path +from unittest import mock + +import pytest +from catalog.config.loader import load_app_config, load_app_config_from_file +from catalog.core.settings import Settings + + +class TestLoadAppConfig: + """Tests for load_app_config function.""" + + def test_loads_default_config(self) -> None: + """Loading default config produces a valid Settings instance.""" + settings = load_app_config() + assert isinstance(settings, Settings) + assert settings.log_level == "INFO" + assert settings.rag.chunk_size == 800 + assert settings.embedding.backend == "mlx" + + def test_hydra_overrides(self) -> None: + """Hydra overrides modify specific settings.""" + settings = load_app_config(overrides=["log_level=DEBUG"]) + assert settings.log_level == "DEBUG" + # Other values unchanged + assert settings.rag.chunk_size == 800 + + def test_nested_hydra_overrides(self) -> None: + """Hydra overrides work on nested fields.""" + settings = 
load_app_config(overrides=[ + "rag.chunk_size=1000", + "performance.batch_size=50", + ]) + assert settings.rag.chunk_size == 1000 + assert settings.performance.batch_size == 50 + + def test_env_vars_override_yaml(self) -> None: + """Environment variables take priority over YAML config values.""" + env = {"IDX_LOG_LEVEL": "ERROR"} + with mock.patch.dict(os.environ, env, clear=False): + settings = load_app_config() + assert settings.log_level == "ERROR" + + def test_nested_env_vars_override_yaml(self) -> None: + """Nested environment variables override YAML nested values.""" + env = {"IDX_RAG__CHUNK_SIZE": "999"} + with mock.patch.dict(os.environ, env, clear=False): + settings = load_app_config() + assert settings.rag.chunk_size == 999 + + def test_missing_config_dir_raises(self, tmp_path: Path) -> None: + """Non-existent config directory raises FileNotFoundError.""" + with pytest.raises(FileNotFoundError, match="Config directory not found"): + load_app_config(config_dir=tmp_path / "nonexistent") + + def test_custom_config_dir(self, tmp_path: Path) -> None: + """Loading from a custom config directory works.""" + yaml_content = """\ +log_level: WARNING +rag: + chunk_size: 500 +""" + config_file = tmp_path / "custom.yaml" + config_file.write_text(yaml_content) + + settings = load_app_config(config_name="custom", config_dir=tmp_path) + assert settings.log_level == "WARNING" + assert settings.rag.chunk_size == 500 + + def test_multiple_calls_produce_independent_settings(self) -> None: + """Each call to load_app_config produces an independent Settings.""" + s1 = load_app_config() + s2 = load_app_config(overrides=["log_level=DEBUG"]) + assert s1.log_level == "INFO" + assert s2.log_level == "DEBUG" + + +class TestLoadAppConfigFromFile: + """Tests for load_app_config_from_file function.""" + + def test_loads_from_file_path(self, tmp_path: Path) -> None: + """Loads settings from an explicit YAML file path.""" + yaml_content = """\ +log_level: DEBUG +performance: + batch_size: 
25 + concurrency: 8 +""" + config_file = tmp_path / "myconfig.yaml" + config_file.write_text(yaml_content) + + settings = load_app_config_from_file(config_file) + assert settings.log_level == "DEBUG" + assert settings.performance.batch_size == 25 + assert settings.performance.concurrency == 8 + + def test_missing_file_raises(self, tmp_path: Path) -> None: + """Missing file raises FileNotFoundError.""" + with pytest.raises(FileNotFoundError, match="Config file not found"): + load_app_config_from_file(tmp_path / "nope.yaml") + + def test_partial_config_uses_defaults(self, tmp_path: Path) -> None: + """A partial YAML file fills in missing fields with defaults.""" + yaml_content = """\ +log_level: ERROR +""" + config_file = tmp_path / "partial.yaml" + config_file.write_text(yaml_content) + + settings = load_app_config_from_file(config_file) + assert settings.log_level == "ERROR" + # Everything else uses Pydantic defaults + assert settings.rag.chunk_size == 800 + assert settings.embedding.backend == "mlx" + assert settings.performance.batch_size == 100 + + def test_with_overrides(self, tmp_path: Path) -> None: + """Hydra overrides apply on top of the file.""" + yaml_content = """\ +log_level: INFO +rag: + chunk_size: 600 +""" + config_file = tmp_path / "base.yaml" + config_file.write_text(yaml_content) + + settings = load_app_config_from_file( + config_file, + overrides=["rag.chunk_size=1200"], + ) + assert settings.rag.chunk_size == 1200 + + def test_hydra_interpolation(self, tmp_path: Path) -> None: + """Hydra variable interpolation works in file configs.""" + yaml_content = """\ +log_level: DEBUG +embedding_model: BAAI/bge-small-en-v1.5 +transformers_model: cross-encoder/${embedding_model} +""" + config_file = tmp_path / "interp.yaml" + config_file.write_text(yaml_content) + + settings = load_app_config_from_file(config_file) + assert settings.transformers_model == "cross-encoder/BAAI/bge-small-en-v1.5" + + +class TestGetSettingsWithConfigPath: + """Tests for 
get_settings() with CATALOG_CONFIG_PATH.""" + + def test_config_path_env_loads_yaml(self, tmp_path: Path) -> None: + """Setting CATALOG_CONFIG_PATH causes get_settings to load YAML.""" + from catalog.core.settings import get_settings + + yaml_content = """\ +log_level: WARNING +rag: + chunk_size: 750 +""" + config_file = tmp_path / "app.yaml" + config_file.write_text(yaml_content) + + env = {"CATALOG_CONFIG_PATH": str(config_file)} + with mock.patch.dict(os.environ, env, clear=False): + get_settings.cache_clear() + try: + settings = get_settings() + assert settings.log_level == "WARNING" + assert settings.rag.chunk_size == 750 + finally: + get_settings.cache_clear() + + def test_without_config_path_uses_env_vars(self) -> None: + """Without CATALOG_CONFIG_PATH, get_settings uses env vars only.""" + from catalog.core.settings import get_settings + + # Ensure no CATALOG_CONFIG_PATH is set + env = {k: v for k, v in os.environ.items() if k != "CATALOG_CONFIG_PATH"} + with mock.patch.dict(os.environ, env, clear=True): + get_settings.cache_clear() + try: + settings = get_settings() + assert isinstance(settings, Settings) + assert settings.log_level == "INFO" + finally: + get_settings.cache_clear() diff --git a/src/catalog/tests/idx/unit/config/test_schema.py b/src/catalog/tests/idx/unit/config/test_schema.py new file mode 100644 index 0000000..6e934df --- /dev/null +++ b/src/catalog/tests/idx/unit/config/test_schema.py @@ -0,0 +1,158 @@ +"""Tests for catalog.config.schema module.""" + +from pathlib import Path + +from catalog.config.schema import ( + AppConfig, + DatabasesConfig, + EmbeddingConfig, + LangfuseConfig, + PerformanceConfig, + QdrantConfig, + RAGConfig, +) + + +class TestLangfuseConfig: + """Tests for LangfuseConfig model.""" + + def test_defaults(self) -> None: + """LangfuseConfig defaults match LangfuseSettings.""" + cfg = LangfuseConfig() + assert cfg.enabled is False + assert cfg.public_key is None + assert cfg.secret_key is None + assert cfg.host is None 
+ + +class TestEmbeddingConfig: + """Tests for EmbeddingConfig model.""" + + def test_defaults(self) -> None: + """EmbeddingConfig defaults match EmbeddingSettings.""" + cfg = EmbeddingConfig() + assert cfg.backend == "mlx" + assert cfg.model_name == "mlx-community/all-MiniLM-L6-v2-bf16" + assert cfg.batch_size == 32 + assert cfg.embedding_dim == 384 + + def test_custom_values(self) -> None: + """EmbeddingConfig accepts custom values.""" + cfg = EmbeddingConfig( + backend="huggingface", + model_name="BAAI/bge-small-en-v1.5", + batch_size=64, + embedding_dim=768, + ) + assert cfg.backend == "huggingface" + assert cfg.batch_size == 64 + + +class TestPerformanceConfig: + """Tests for PerformanceConfig model.""" + + def test_defaults(self) -> None: + """PerformanceConfig defaults match PerformanceSettings.""" + cfg = PerformanceConfig() + assert cfg.batch_size == 100 + assert cfg.concurrency == 4 + assert cfg.embedding_batch_size == 32 + assert cfg.chunk_max_bytes == 2048 + assert cfg.chunk_min_bytes == 128 + + +class TestQdrantConfig: + """Tests for QdrantConfig model.""" + + def test_defaults(self) -> None: + """QdrantConfig defaults match QdrantSettings.""" + cfg = QdrantConfig() + assert cfg.collection_name == "catalog_vectors" + + +class TestDatabasesConfig: + """Tests for DatabasesConfig model.""" + + def test_defaults(self) -> None: + """DatabasesConfig defaults produce expanded home paths.""" + cfg = DatabasesConfig() + assert cfg.catalog_path == Path("~/.idx/catalog.db").expanduser() + assert cfg.content_path == Path("~/.idx/content.db").expanduser() + + +class TestRAGConfig: + """Tests for RAGConfig model.""" + + def test_defaults(self) -> None: + """RAGConfig defaults match RAGSettings.""" + cfg = RAGConfig() + assert cfg.chunk_size == 800 + assert cfg.chunk_overlap == 120 + assert cfg.rrf_k == 60 + assert cfg.expansion_enabled is True + assert cfg.rerank_top_n == 10 + assert cfg.cache_ttl_hours == 168 + assert cfg.vector_top_k == 20 + assert 
cfg.snippet_max_lines == 10 + + def test_custom_values(self) -> None: + """RAGConfig accepts custom values.""" + cfg = RAGConfig(chunk_size=1000, rrf_k=80, expansion_enabled=False) + assert cfg.chunk_size == 1000 + assert cfg.rrf_k == 80 + assert cfg.expansion_enabled is False + + +class TestAppConfig: + """Tests for AppConfig model.""" + + def test_defaults(self) -> None: + """AppConfig defaults match Settings defaults.""" + cfg = AppConfig() + assert cfg.log_level == "INFO" + assert cfg.embedding_model == "BAAI/bge-small-en-v1.5" + assert cfg.transformers_model == "cross-encoder/ms-marco-MiniLM-L-6-v2" + assert isinstance(cfg.databases, DatabasesConfig) + assert isinstance(cfg.rag, RAGConfig) + assert isinstance(cfg.embedding, EmbeddingConfig) + + def test_from_partial_dict(self) -> None: + """AppConfig can be constructed from a partial dict (missing keys use defaults).""" + cfg = AppConfig.model_validate({ + "log_level": "DEBUG", + "rag": {"chunk_size": 1000}, + }) + assert cfg.log_level == "DEBUG" + assert cfg.rag.chunk_size == 1000 + # Other RAG fields retain defaults + assert cfg.rag.rrf_k == 60 + # Top-level defaults retained + assert cfg.embedding_model == "BAAI/bge-small-en-v1.5" + + def test_to_settings_dict(self) -> None: + """to_settings_dict produces a dict with all keys.""" + cfg = AppConfig() + d = cfg.to_settings_dict() + assert "log_level" in d + assert "databases" in d + assert "rag" in d + assert "embedding" in d + assert "performance" in d + assert "qdrant" in d + assert "langfuse" in d + # Path fields are converted to strings + assert isinstance(d["database_path"], str) + assert isinstance(d["vector_store_path"], str) + assert isinstance(d["cache_path"], str) + + def test_to_settings_dict_preserves_values(self) -> None: + """to_settings_dict preserves custom values through round-trip.""" + cfg = AppConfig.model_validate({ + "log_level": "DEBUG", + "rag": {"chunk_size": 999}, + "performance": {"batch_size": 50}, + }) + d = cfg.to_settings_dict() 
+ assert d["log_level"] == "DEBUG" + assert d["rag"]["chunk_size"] == 999 + assert d["performance"]["batch_size"] == 50