Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ node_modules
.wt.activate
.wt.agent-env
.wt.port
**/.testmondata-*

.ropeproject/
.ruff_cache/

Expand Down
31 changes: 31 additions & 0 deletions src/catalog/catalog/config/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""catalog.config - Application-level Hydra configuration.

Provides YAML-driven application configuration that mirrors all settings
defined in ``catalog.core.settings``. Uses Hydra's ``compose()`` API for
variable interpolation and config composition.

Priority (highest to lowest):
1. Environment variables (``IDX_*``)
2. Hydra overrides (passed programmatically)
3. YAML config file values
4. Pydantic field defaults

Example usage::

    from pathlib import Path

    from catalog.config import load_app_config

    # Load default config
    settings = load_app_config()

    # Load with environment-specific overrides
    settings = load_app_config(overrides=["+environment=dev"])

    # Load from a custom config directory
    settings = load_app_config(config_dir=Path("my/configs"))
"""

# Re-export the loader's full public API so callers can import both entry
# points from the package root; ``loader.__all__`` declares both as public.
from catalog.config.loader import load_app_config, load_app_config_from_file

__all__ = [
    "load_app_config",
    "load_app_config_from_file",
]
110 changes: 110 additions & 0 deletions src/catalog/catalog/config/defaults/default.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# catalog application configuration - default settings
#
# All values here match the Pydantic defaults in catalog.core.settings.
# Override specific values in environment-specific configs (dev.yaml, etc.)
# or via IDX_* environment variables (highest priority).
#
# Hydra variable interpolation is supported:
# ${oc.env:HOME} resolves to $HOME
# ${log_level} cross-references within this config
#
# Usage:
# from catalog.config import load_app_config
# settings = load_app_config() # loads this file
# settings = load_app_config("default", overrides=["+environment=dev"])

# -- Paths ------------------------------------------------------------------

database_path: ${oc.env:HOME}/.idx/catalog.db
vector_store_path: ${oc.env:HOME}/.idx/vector_store
cache_path: ${oc.env:HOME}/.idx/cache

# -- Models ------------------------------------------------------------------

embedding_model: BAAI/bge-small-en-v1.5
transformers_model: cross-encoder/ms-marco-MiniLM-L-6-v2

# -- Logging -----------------------------------------------------------------

log_level: INFO

# -- Multi-database paths ----------------------------------------------------

databases:
catalog_path: ${oc.env:HOME}/.idx/catalog.db
content_path: ${oc.env:HOME}/.idx/content.db

# -- Embedding ---------------------------------------------------------------

embedding:
backend: mlx
model_name: mlx-community/all-MiniLM-L6-v2-bf16
batch_size: 32
embedding_dim: 384

# -- Langfuse observability --------------------------------------------------

langfuse:
enabled: false
public_key: null
secret_key: null
host: null

# -- Performance -------------------------------------------------------------

performance:
batch_size: 100
concurrency: 4
embedding_batch_size: 32
chunk_max_bytes: 2048
chunk_min_bytes: 128

# -- Qdrant ------------------------------------------------------------------

qdrant:
collection_name: catalog_vectors

# -- RAG pipeline ------------------------------------------------------------

rag:
# Chunking
chunk_size: 800
chunk_overlap: 120
chunk_fallback_enabled: true
chunk_chars_per_token: 4

# Embedding prefixes
embed_batch_size: 32
embed_fallback_enabled: true
embed_prefix_query: "task: search result | query: "
embed_prefix_doc: "title: {title} | text: "

# Query expansion
expansion_enabled: true
expansion_max_lex: 3
expansion_max_vec: 3
expansion_include_hyde: true

# RRF fusion
rrf_k: 60
rrf_original_weight: 2.0
rrf_expansion_weight: 1.0
rrf_rank1_bonus: 0.05
rrf_rank23_bonus: 0.02

# Reranking
rerank_top_n: 10
rerank_candidates: 40
rerank_cache_enabled: true

# Caching
cache_ttl_hours: 168

# Retrieval
vector_top_k: 20
fts_top_k: 20
fusion_top_k: 30

# Snippets
snippet_max_lines: 10
snippet_context_lines: 2
13 changes: 13 additions & 0 deletions src/catalog/catalog/config/defaults/environments/dev.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Development environment overrides
#
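# Loaded via: load_app_config(overrides=["+environment=dev"])
# Or automatically when CATALOG_ENV=dev
#
# NOTE(review): this file lives under environments/ (plural) while the
# override above names the group "environment". Hydra derives config group
# names from directory names, so confirm the two agree. Also confirm where
# CATALOG_ENV is read; catalog.config.loader does not reference it.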

log_level: DEBUG

performance:
batch_size: 10
concurrency: 2

rag:
rerank_cache_enabled: false
17 changes: 17 additions & 0 deletions src/catalog/catalog/config/defaults/environments/prod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Production environment overrides
#
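# Loaded via: load_app_config(overrides=["+environment=prod"])
# Or automatically when CATALOG_ENV=prod
#
# NOTE(review): this file lives under environments/ (plural) while the
# override above names the group "environment". Hydra derives config group
# names from directory names, so confirm the two agree. Also confirm where
# CATALOG_ENV is read; catalog.config.loader does not reference it.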

log_level: WARNING

performance:
batch_size: 200
concurrency: 8
embedding_batch_size: 64

rag:
rerank_cache_enabled: true
cache_ttl_hours: 336
rerank_candidates: 60
rerank_top_n: 15
23 changes: 23 additions & 0 deletions src/catalog/catalog/config/defaults/environments/test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Test environment overrides
#
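# Loaded via: load_app_config(overrides=["+environment=test"])
# Or automatically when CATALOG_ENV=test
#
# NOTE(review): this file lives under environments/ (plural) while the
# override above names the group "environment". Hydra derives config group
# names from directory names, so confirm the two agree. Also confirm where
# CATALOG_ENV is read; catalog.config.loader does not reference it.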

log_level: WARNING

databases:
catalog_path: /tmp/idx-test/catalog.db
content_path: /tmp/idx-test/content.db

database_path: /tmp/idx-test/catalog.db
vector_store_path: /tmp/idx-test/vector_store
cache_path: /tmp/idx-test/cache

performance:
batch_size: 5
concurrency: 1
embedding_batch_size: 8

rag:
rerank_cache_enabled: false
expansion_enabled: false
157 changes: 157 additions & 0 deletions src/catalog/catalog/config/loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
"""catalog.config.loader - Hydra-based application configuration loader.

Follows the same pattern as ``catalog.ingest.job.DatasetJob.from_yaml()``:
uses Hydra's ``compose()`` API (not ``@hydra.main()``) for YAML loading
with variable interpolation and config composition.

The loader produces ``catalog.core.settings.Settings`` instances by:
1. Loading YAML via Hydra into an ``AppConfig`` Pydantic model
2. Converting the validated config to a dict
3. Constructing a ``Settings`` instance using a custom source priority
so that environment variables always override YAML values:
``env vars > YAML config > Pydantic defaults``
"""

from __future__ import annotations

from pathlib import Path
from typing import Any

from agentlayer.logging import get_logger

from catalog.config.schema import AppConfig

# Names exported by ``from catalog.config.loader import *``.
__all__ = [
    "load_app_config",
    "load_app_config_from_file",
]

logger = get_logger(__name__)

# The ``defaults/`` folder next to this module; load_app_config() falls back
# to it when no ``config_dir`` is supplied.
_DEFAULT_CONFIG_DIR = Path(__file__).parent / "defaults"


def _build_settings_from_yaml(config_dict: dict[str, Any]) -> "Settings":
    """Build a ``Settings`` object from YAML-derived values, letting env vars win.

    The YAML values are handed to pydantic-settings as init kwargs, which it
    ranks above every other source by default. A throwaway ``Settings``
    subclass reorders the sources so the effective priority becomes
    ``env vars > YAML (init kwargs) > dotenv > file secrets``.

    Args:
        config_dict: Validated config dict from AppConfig.to_settings_dict().

    Returns:
        The constructed settings object, returned after its
        ``ensure_directories()`` method has been called.
    """
    from pydantic_settings import BaseSettings

    from catalog.core.settings import Settings

    class _YamlBackedSettings(Settings):
        """Settings subclass that prioritizes env vars over init kwargs."""

        @classmethod
        def settings_customise_sources(
            cls,
            settings_cls: type[BaseSettings],
            init_settings: Any,
            env_settings: Any,
            dotenv_settings: Any,
            file_secret_settings: Any,
        ) -> tuple:
            # Position in the tuple is the priority: earlier sources override
            # later ones, so env vars come ahead of the YAML-supplied kwargs.
            # NOTE(review): dotenv values rank below YAML here - confirm that
            # is intended.
            prioritised_sources = (
                env_settings,
                init_settings,
                dotenv_settings,
                file_secret_settings,
            )
            return prioritised_sources

    yaml_backed = _YamlBackedSettings(**config_dict)
    yaml_backed.ensure_directories()
    return yaml_backed


def load_app_config(
config_name: str = "default",
config_dir: Path | None = None,
overrides: list[str] | None = None,
) -> "Settings":
"""Load application configuration from YAML via Hydra and return Settings.

Loads the named YAML config file using Hydra's ``compose()`` API,
validates it into an ``AppConfig`` model, then constructs a
``Settings`` instance. Environment variables (``IDX_*``) always
override YAML-supplied values.

Args:
config_name: Stem of the YAML file to load (without ``.yaml``).
Defaults to ``"default"`` which loads ``defaults/default.yaml``.
config_dir: Directory containing config files. Defaults to the
``defaults/`` directory shipped with this module.
overrides: Hydra override strings, e.g. ``["log_level=DEBUG"]``.

Returns:
A fully-configured ``Settings`` instance.

Raises:
FileNotFoundError: If the resolved config directory does not exist.
"""
from hydra import compose, initialize_config_dir
from hydra.core.global_hydra import GlobalHydra
from omegaconf import OmegaConf

if config_dir is None:
config_dir = _DEFAULT_CONFIG_DIR

config_dir = config_dir.resolve()
if not config_dir.exists():
raise FileNotFoundError(f"Config directory not found: {config_dir}")

# Clear any previous Hydra state (compose() requires a clean GlobalHydra)
GlobalHydra.instance().clear()

try:
with initialize_config_dir(config_dir=str(config_dir), version_base=None):
cfg = compose(config_name=config_name, overrides=overrides or [])

raw = OmegaConf.to_container(cfg, resolve=True)
logger.debug(f"Loaded Hydra config '{config_name}' from {config_dir}")

# Validate through AppConfig to catch schema errors early
app_config = AppConfig.model_validate(raw)
config_dict = app_config.to_settings_dict()

return _build_settings_from_yaml(config_dict)

finally:
GlobalHydra.instance().clear()


def load_app_config_from_file(
path: Path,
overrides: list[str] | None = None,
) -> "Settings":
"""Load application configuration from an arbitrary YAML file path.

Convenience wrapper around :func:`load_app_config` for loading
a specific file rather than a named config from a directory.

Args:
path: Absolute or relative path to a YAML config file.
overrides: Hydra override strings.

Returns:
A fully-configured ``Settings`` instance.

Raises:
FileNotFoundError: If the file does not exist.
"""
path = path.resolve()
if not path.exists():
raise FileNotFoundError(f"Config file not found: {path}")

return load_app_config(
config_name=path.stem,
config_dir=path.parent,
overrides=overrides,
)
Loading