diff --git a/.gitignore b/.gitignore index f72f3bd..643bf8b 100644 --- a/.gitignore +++ b/.gitignore @@ -215,4 +215,5 @@ cython_debug/ # logs *.log -*.log.*reports/ +*.log.* +reports/ diff --git a/README.md b/README.md index ab537be..4ba0d65 100644 --- a/README.md +++ b/README.md @@ -59,10 +59,16 @@ The library is built on a **three-layer architecture** with a framework-agnostic ## πŸš€ **Quick Start** ```python -from views_evaluation import PandasAdapter, NativeEvaluator +from views_evaluation import EvaluationFrame, NativeEvaluator +import numpy as np -# 1. Convert DataFrames β†’ EvaluationFrame -ef = PandasAdapter.from_dataframes(actual=actuals, predictions=predictions_list, target="ged_sb_best") +# 1. Construct EvaluationFrame with NumPy arrays +ef = EvaluationFrame( + y_true=y_true_array, + y_pred=y_pred_array, # shape (N, S) where S >= 1 + identifiers={'time': times, 'unit': units, 'origin': origins, 'step': steps}, + metadata={'target': 'ged_sb_best'}, +) # 2. Configure and evaluate config = { @@ -89,7 +95,7 @@ VIEWS Evaluation ensures **forecasting accuracy and model robustness** as the ** ### **Pipeline Integration:** 1. **Model Predictions** β†’ -2. **PandasAdapter** (DataFrame β†’ EvaluationFrame) β†’ +2. **EvaluationFrame** (validated NumPy container) β†’ 3. **NativeEvaluator** (metrics computation) β†’ 4. **EvaluationReport** (structured results) @@ -195,7 +201,7 @@ config = { --- * **Data Integrity Checks**: Validates input arrays for shape consistency, NaN/infinity, and required identifiers. -* **Automatic Index Matching**: `PandasAdapter` aligns actual and predicted values based on MultiIndex structures. +* **Framework-Agnostic Core**: All evaluation operates on pure NumPy arrays via `EvaluationFrame`. * **Metric Catalog & Profiles**: Hyperparameters are managed through named evaluation profiles with a Chain of Responsibility resolver (model overrides β†’ profile β†’ fail loud). 
--- @@ -223,11 +229,11 @@ Level 0 β€” Pure Core (NumPy + SciPy only, zero framework imports) Profiles Named hyperparameter sets (base, hydranet_ucdp, ...) Level 1 β€” Bridge / Adapter - PandasAdapter DataFrame β†’ EvaluationFrame conversion (PHASE-3-DELETE) + EvaluationFrame Validated NumPy data container EvaluationReport Results container with DataFrame/dict export Level 2 β€” Legacy Orchestrator - EvaluationManager Deprecated wrapper; delegates to Level 0 + MetricCatalog Genome registry and parameter resolver ``` **Key design decisions:** @@ -244,7 +250,7 @@ views-evaluation/ β”œβ”€β”€ views_evaluation/ β”‚ β”œβ”€β”€ __init__.py # Public API exports β”‚ β”œβ”€β”€ adapters/ -β”‚ β”‚ └── pandas.py # PandasAdapter (PHASE-3-DELETE) +β”‚ β”‚ └── __init__.py # Reserved for future framework bridges β”‚ β”œβ”€β”€ evaluation/ β”‚ β”‚ β”œβ”€β”€ config_schema.py # EvaluationConfig TypedDict β”‚ β”‚ β”œβ”€β”€ evaluation_frame.py # Core data container diff --git a/documentation/ADRs/000_use_of_adrs.md b/documentation/ADRs/000_use_of_adrs.md index 1dc830f..3bbbf14 100644 --- a/documentation/ADRs/000_use_of_adrs.md +++ b/documentation/ADRs/000_use_of_adrs.md @@ -2,7 +2,9 @@ **Status:** Accepted **Date:** 2026-02-25 -**Deciders:** Project maintainers, Gemini CLI +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors --- diff --git a/documentation/ADRs/001_silicon_based_agent_protocol.md b/documentation/ADRs/001_silicon_based_agent_protocol.md index 8ce90b7..28822a2 100644 --- a/documentation/ADRs/001_silicon_based_agent_protocol.md +++ b/documentation/ADRs/001_silicon_based_agent_protocol.md @@ -2,7 +2,9 @@ **Status:** Accepted **Date:** 2026-02-25 -**Deciders:** Project maintainers, Gemini CLI +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors --- diff --git a/documentation/ADRs/010_ontology_of_evaluation.md b/documentation/ADRs/010_ontology_of_evaluation.md index 475bd98..fd4e00f 100644 --- 
a/documentation/ADRs/010_ontology_of_evaluation.md +++ b/documentation/ADRs/010_ontology_of_evaluation.md @@ -2,7 +2,9 @@ **Status:** Accepted **Date:** 2026-02-25 -**Deciders:** Project maintainers, Gemini CLI +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors --- diff --git a/documentation/ADRs/011_topology_and_dependency_rules.md b/documentation/ADRs/011_topology_and_dependency_rules.md index ff1f89c..b482fc7 100644 --- a/documentation/ADRs/011_topology_and_dependency_rules.md +++ b/documentation/ADRs/011_topology_and_dependency_rules.md @@ -2,7 +2,9 @@ **Status:** Accepted **Date:** 2026-02-25 -**Deciders:** Project maintainers, Gemini CLI +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors --- @@ -10,7 +12,7 @@ In complex evaluation systems, architectural fragility often emerges not from incorrect logic, but from uncontrolled dependencies between components. -The Evaluation repository pre-Feb 2026 suffered from "Pandas-heavy" coupling. Higher-level logic (EvaluationManager) depended on Pandas `MultiIndex` internals for alignment, which constrained our ability to scale probabilistic forecasts (N, S) due to memory/performance limits of Pandas' "lists-in-cells." +The Evaluation repository pre-Feb 2026 suffered from "Pandas-heavy" coupling. Higher-level logic (e.g., Pipeline Core) depended on Pandas `MultiIndex` internals for alignment, which constrained our ability to scale probabilistic forecasts (N, S) due to memory/performance limits of Pandas' "lists-in-cells." Without explicit topology rules, we risk high-level math modules beginning to depend on implementation details (e.g., NumPy indexing vs Xarray coordinates). @@ -29,8 +31,8 @@ Violations are architectural defects. The Evaluation Core is the lowest-level layer (most stable). - **Level 0: Evaluation Core** (Pure NumPy, `EvaluationFrame`, `NativeEvaluator`). No external imports except `numpy` and `scipy`. 
-- **Level 1: Adapters** (Framework-specific bridges like `PandasAdapter`). May depend on Level 0. -- **Level 2: Orchestration** (e.g., `EvaluationManager`, Pipeline Core). May depend on Level 1 and Level 0. +- **Level 1: Adapters** (Framework-specific bridges, reserved for future use). May depend on Level 0. +- **Level 2: Orchestration** (e.g., Pipeline Core β€” external to this repo). May depend on Level 1 and Level 0. Dependency direction must always flow **toward the Core**. @@ -38,7 +40,7 @@ Dependency direction must always flow **toward the Core**. - Math kernels importing `pandas` or `polars`. - `EvaluationFrame` containing anything other than NumPy arrays. -- Higher-level modules (e.g., `EvaluationManager`) passing DataFrames directly into metric functions. +- Higher-level modules (e.g., external orchestrators) passing DataFrames directly into metric functions. If a dependency feels β€œconvenient but wrong,” it probably is. diff --git a/documentation/ADRs/012_authority_over_inference.md b/documentation/ADRs/012_authority_over_inference.md index 3023700..dfdc260 100644 --- a/documentation/ADRs/012_authority_over_inference.md +++ b/documentation/ADRs/012_authority_over_inference.md @@ -2,7 +2,9 @@ **Status:** Accepted **Date:** 2026-02-25 -**Deciders:** Project maintainers, Gemini CLI +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors --- @@ -58,5 +60,5 @@ the system **must fail loudly and immediately**. - Improves debuggability: we can inspect the `EvaluationFrame` and see exactly what the system *thinks* it is evaluating. ### Negative -- Requires more metadata in the `EvaluationFrame` and `PandasAdapter`. +- Requires more metadata in the `EvaluationFrame` and external adapters. - Some "convenient" hacks are disallowed. 
diff --git a/documentation/ADRs/013_observability_and_explicit_failure.md b/documentation/ADRs/013_observability_and_explicit_failure.md index 8c26e75..ebc7344 100644 --- a/documentation/ADRs/013_observability_and_explicit_failure.md +++ b/documentation/ADRs/013_observability_and_explicit_failure.md @@ -2,7 +2,9 @@ **Status:** Accepted **Date:** 2026-02-25 -**Deciders:** Project maintainers, Gemini CLI +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors --- diff --git a/documentation/ADRs/014_boundary_contracts_and_validation.md b/documentation/ADRs/014_boundary_contracts_and_validation.md index dd6a70e..dc46108 100644 --- a/documentation/ADRs/014_boundary_contracts_and_validation.md +++ b/documentation/ADRs/014_boundary_contracts_and_validation.md @@ -2,7 +2,9 @@ **Status:** Accepted **Date:** 2026-02-25 -**Deciders:** Project maintainers, Gemini CLI +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors --- @@ -25,7 +27,7 @@ Every boundary between components (e.g., Adapter β†’ Core) must define: - Declared invariants. ### 2. Validation at Entry -All configuration and external inputs must be validated at the system boundary (e.g., in `EvaluationManager` or `Adapters`). +All configuration and external inputs must be validated at the system boundary (e.g., in the `EvaluationFrame` constructor or `NativeEvaluator`). - Before execution begins. - Before orchestration proceeds. 
diff --git a/documentation/ADRs/020_multi_perspective_testing.md b/documentation/ADRs/020_multi_perspective_testing.md index c06ef07..3071a24 100644 --- a/documentation/ADRs/020_multi_perspective_testing.md +++ b/documentation/ADRs/020_multi_perspective_testing.md @@ -2,7 +2,9 @@ **Status:** Accepted **Date:** 2026-02-25 -**Deciders:** Project maintainers, Gemini CLI +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors --- diff --git a/documentation/ADRs/021_intent_contracts_for_classes.md b/documentation/ADRs/021_intent_contracts_for_classes.md index 6c5a5d0..cd983ed 100644 --- a/documentation/ADRs/021_intent_contracts_for_classes.md +++ b/documentation/ADRs/021_intent_contracts_for_classes.md @@ -2,7 +2,9 @@ **Status:** Accepted **Date:** 2026-02-25 -**Deciders:** Project maintainers, Gemini CLI +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors --- @@ -14,7 +16,7 @@ To prevent semantic drift, non-trivial classes require an explicit declaration o ## Decision -All **non-trivial and substantial classes** (e.g., `EvaluationFrame`, `NativeEvaluator`, `PandasAdapter`) must have an explicit **intent contract**. +All **non-trivial and substantial classes** (e.g., `EvaluationFrame`, `NativeEvaluator`, `EvaluationReport`) must have an explicit **intent contract**. An intent contract is a short, human-readable description of: - **Purpose**: what the class is for. 
diff --git a/documentation/ADRs/022_evolution_and_stability.md b/documentation/ADRs/022_evolution_and_stability.md index 4148c26..9715272 100644 --- a/documentation/ADRs/022_evolution_and_stability.md +++ b/documentation/ADRs/022_evolution_and_stability.md @@ -3,6 +3,8 @@ **Status:** Proposed (Deferred) **Date:** 2026-02-25 **Deciders:** β€” +**Consulted:** β€” +**Informed:** All contributors --- diff --git a/documentation/ADRs/023_technical_risk_register.md b/documentation/ADRs/023_technical_risk_register.md new file mode 100644 index 0000000..1111dad --- /dev/null +++ b/documentation/ADRs/023_technical_risk_register.md @@ -0,0 +1,69 @@ +# ADR-023: Technical Risk Register + +**Status:** Accepted +**Date:** 2026-03-31 +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors + +--- + +## Context + +As the views-evaluation codebase matures through its EvaluationFrame refactor and metric catalog implementation, structural risks have been identified through repo-assimilation and expert review. Without a centralized, living register of these risks, concerns are scattered across reports, post-mortems, and tribal knowledge. + +A formalized risk register ensures that architectural concerns are: +- tracked with consistent metadata, +- prioritized by severity, +- linked to their source of discovery, +- and revisited systematically. + +--- + +## Decision + +This repository maintains a **Technical Risk Register** at `reports/technical_risk_register.md` as a first-class governance artifact. + +### Concern Format + +Each entry uses: +- **ID:** `C-xx` for concerns, `D-xx` for disagreements +- **Tier:** 1 (critical) through 4 (informational) +- **Trigger:** The specific circumstance under which the risk becomes actionable +- **Source:** How the concern was identified (e.g. 
repo-assimilation, expert review, falsification audit) + +### Tier Definitions + +| Tier | Severity | Response | +|------|----------|----------| +| 1 | Critical β€” blocks release or causes data corruption | Must be resolved before next release | +| 2 | High β€” significant architectural risk | Must have a mitigation plan within one sprint | +| 3 | Medium β€” known weakness, bounded impact | Track and address opportunistically | +| 4 | Low/Informational β€” minor or cosmetic | Document and revisit during tech debt cleanup | + +### Lifecycle + +- Concerns are opened during expert reviews, tech debt audits, repo-assimilation, and falsification audits. +- Concerns are closed when the risk is resolved, mitigated, or explicitly accepted with rationale. +- The register header tracks the total count for quick reference. + +--- + +## Consequences + +### Positive +- Centralized visibility of all known risks +- Consistent prioritization and tracking +- Prevents risks from being forgotten between conversations + +### Negative +- Requires discipline to keep updated +- Risk of register staleness if not reviewed regularly + +--- + +## References + +- `reports/technical_risk_register.md` +- Repo-assimilation output (2026-03-31) +- `reports/technical_debt_backlog.md` (related but focuses on actionable debt, not structural risks) diff --git a/documentation/ADRs/030_evaluation_strategy.md b/documentation/ADRs/030_evaluation_strategy.md index 4af22a3..ec5c88d 100644 --- a/documentation/ADRs/030_evaluation_strategy.md +++ b/documentation/ADRs/030_evaluation_strategy.md @@ -1,12 +1,10 @@ # ADR-030: Evaluation Strategy -| ADR Info | Details | -|---------------------|-------------------| -| Subject | Evaluation Strategy | -| ADR Number | 030 | -| Status | Accepted | -| Author | Xiaolong, Mihai| -| Date | 16.07.2025 | +**Status:** Accepted +**Date:** 2025-07-16 +**Deciders:** Xiaolong, Mihai +**Consulted:** β€” +**Informed:** All contributors ## Context To ensure reliable and realistic 
model performance assessment, our forecasting framework supports both **offline** and **online** evaluation strategies. These strategies serve complementary purposes: offline evaluation simulates the forecasting process retrospectively, while online evaluation assesses actual deployed forecasts against observed data. diff --git a/documentation/ADRs/031_evaluation_metrics.md b/documentation/ADRs/031_evaluation_metrics.md index ec302f2..f6090bc 100644 --- a/documentation/ADRs/031_evaluation_metrics.md +++ b/documentation/ADRs/031_evaluation_metrics.md @@ -1,12 +1,10 @@ # ADR-031: Evaluation Metrics -| ADR Info | Details | -|---------------------|--------------------| -| Subject | Evaluation Metrics | -| ADR Number | 031 | -| Status | Accepted | -| Author | Xiaolong | -| Date | 12.09.2024 | +**Status:** Accepted +**Date:** 2024-09-12 +**Deciders:** Xiaolong +**Consulted:** β€” +**Informed:** All contributors ## Context In the context of the VIEWS pipeline, it is necessary to evaluate the models using a robust set of metrics that account for the characteristics of conflict data, such as right-skewness and zero-inflation in the outcome variable. diff --git a/documentation/ADRs/032_metric_calculation_schemas.md b/documentation/ADRs/032_metric_calculation_schemas.md index 32894d3..a0b1c12 100644 --- a/documentation/ADRs/032_metric_calculation_schemas.md +++ b/documentation/ADRs/032_metric_calculation_schemas.md @@ -1,12 +1,10 @@ # ADR-032: Metric Calculation Schemas -| ADR Info | Details | -|---------------------|-------------------| -| Subject | Metric Calculation | -| ADR Number | 032 | -| Status | Accepted| -| Author | Mihai, Xiaolong| -| Date | 31.10.2024 | +**Status:** Accepted +**Date:** 2024-10-31 +**Deciders:** Mihai, Xiaolong +**Consulted:** β€” +**Informed:** All contributors ## Context Traditional machine learning metrics do not directly translate to time-series forecasting across multiple horizons. A standardized approach to regrouping data is necessary. 
diff --git a/documentation/ADRs/040_evaluation_input_schema.md b/documentation/ADRs/040_evaluation_input_schema.md index 4d5730e..e8fde7a 100644 --- a/documentation/ADRs/040_evaluation_input_schema.md +++ b/documentation/ADRs/040_evaluation_input_schema.md @@ -1,18 +1,16 @@ # ADR-040: Evaluation Input Schema -| ADR Info | Details | -|---------------------|-------------------------| -| Subject | Evaluation Input Schema | -| ADR Number | 040 | -| Status | Accepted | -| Author | Xiaolong | -| Date | 16.06.2025 | +**Status:** Accepted +**Date:** 2025-06-16 +**Deciders:** Xiaolong +**Consulted:** β€” +**Informed:** All contributors ## Context A consistent input format is required to compare model performance across the VIEWS pipeline. -Two integration paths exist: the native path (primary) and the legacy path (`EvaluationManager`, -deprecated per ADR-011). +The native path via `EvaluationFrame` is the sole integration path. The legacy +`EvaluationManager` path was removed in Phase 3. ## Decision @@ -42,9 +40,9 @@ Prediction type (point vs. sample) is determined structurally from the number of No name-based inference occurs (ADR-012). Callers must ensure all cells in a prediction column have the same number of values. 
-### Native Path Invariants (PandasAdapter) +### Native Path Invariants -When using `PandasAdapter`, the following identifiers are synthesised automatically: +When constructing an `EvaluationFrame`, the following identifiers must be provided: | Identifier | Source | |------------|--------------------------------------------------| diff --git a/documentation/ADRs/041_evaluation_output_schema.md b/documentation/ADRs/041_evaluation_output_schema.md index 50f1559..9dfe890 100644 --- a/documentation/ADRs/041_evaluation_output_schema.md +++ b/documentation/ADRs/041_evaluation_output_schema.md @@ -1,12 +1,10 @@ # ADR-041: Evaluation Output Schema -| ADR Info | Details | -|---------------------|-------------------| -| Subject | Evaluation Output Schema | -| ADR Number | 041 | -| Status | Proposed | -| Author | Xiaolong | -| Date | 16.06.2025 | +**Status:** Proposed +**Date:** 2025-06-16 +**Deciders:** Xiaolong +**Consulted:** β€” +**Informed:** All contributors ## Context Standardized reports are necessary for comparing ensemble models against constituent models and baselines. diff --git a/documentation/ADRs/042_metric_catalog.md b/documentation/ADRs/042_metric_catalog.md index ae662a4..810412e 100644 --- a/documentation/ADRs/042_metric_catalog.md +++ b/documentation/ADRs/042_metric_catalog.md @@ -1,12 +1,10 @@ # ADR-042: Metric Catalog and Named Evaluation Profiles -| ADR Info | Details | -|---------------------|-------------------| -| Subject | Metric hyperparameter management | -| ADR Number | 042 | -| Status | accepted | -| Author | Claude (silicon-based agent) | -| Date | 11.03.2026 | +**Status:** Accepted +**Date:** 2026-03-11 +**Deciders:** Project maintainers +**Consulted:** Claude (silicon-based agent) +**Informed:** All contributors ## Context @@ -93,8 +91,8 @@ the profile once, all models using it get the update. ## Additional Notes -- Legacy dispatch dicts (REGRESSION_POINT_NATIVE, etc.) 
are retained for EvaluationManager - backward compatibility (PHASE-3-DELETE). +- Legacy dispatch dicts (REGRESSION_POINT_NATIVE, etc.) were removed in Phase 3. + METRIC_MEMBERSHIP is the single source of truth for (task, pred_type) β†’ metric mapping. - The base profile ships with views-evaluation and provides values that match the previous function-signature defaults, ensuring zero behavioral change for existing integrations. - Profile values for twCRPS threshold and QIS quantile levels are subject to alignment diff --git a/documentation/ADRs/README.md b/documentation/ADRs/README.md index 136cae3..fed5346 100644 --- a/documentation/ADRs/README.md +++ b/documentation/ADRs/README.md @@ -24,6 +24,7 @@ We follow a hierarchical numbering scheme to organize decisions from the most fo - **020**: [Multi-Perspective Testing](020_multi_perspective_testing.md) - **021**: [Intent Contracts for Classes](021_intent_contracts_for_classes.md) - **022**: [Evolution and Stability](022_evolution_and_stability.md) +- **023**: [Technical Risk Register](023_technical_risk_register.md) ### 03x: Domain Strategy & Methodology *The mathematical and strategic core of conflict evaluation.* @@ -38,6 +39,23 @@ We follow a hierarchical numbering scheme to organize decisions from the most fo --- +## Governance Structure + +- **Ontology (010)** defines what exists. +- **Topology (011)** defines structural direction. +- **Authority (012)** defines who owns meaning. +- **Observability (013)** enforces failure semantics. +- **Boundary Contracts (014)** define interaction rules. +- **Testing (020)** verifies system integrity. +- **Intent Contracts (021)** bind class-level behavior. +- **Evolution (022)** (deferred) β€” rules for stability. +- **Risk Register (023)** tracks structural concerns. +- **Silicon Agent Protocol (001)** constrains automated modification. + +Together with domain ADRs (030–042), these define the invariant layer of the system. + +--- + ## Contributing To add a new ADR: 1. 
Identify the appropriate group for the decision. diff --git a/documentation/ADRs/adr_template.md b/documentation/ADRs/adr_template.md index 3374bbd..eb6a261 100644 --- a/documentation/ADRs/adr_template.md +++ b/documentation/ADRs/adr_template.md @@ -1,54 +1,134 @@ -# ADR Template -(Fine name should follow convention NNN-short-title.md) +# ADR-XXXX: -## Title -*Refinement of Model Configuration Files Structure* +**Status:** Proposed | Accepted | Superseded | Deprecated +**Date:** YYYY-MM-DD +**Deciders:** +**Consulted:** +**Informed:** -| ADR Info | Details | -|---------------------|-------------------| -| Subject | [Insert Subject] | -| ADR Number | [Insert Number (NNN)] | -| Status | [Insert Status (proposed, accepted, rejected, deprecated)] | -| Author | [Insert Author (Name)] | -| Date | [Insert Date (DD.MM.YYYY)] | +--- ## Context -*Describe the issue that necessitated the decision, including any factors considered during the decision-making process. This should provide a clear understanding of the challenges or opportunities addressed by the ADR.* + +Describe the problem that motivated this decision. + +Include: +- What is *not working* or *no longer tenable* +- Relevant technical, organizational, or scientific constraints +- Prior assumptions that turned out to be wrong +- Why this decision matters *now* (and not later) + +This section should make it obvious to a future reader **why a decision was needed at all**. + +--- ## Decision -*Detail the decision that was made, including any alternatives that were considered and the reasons for choosing the implemented solution. Provide enough technical specifics to justify the approach.* -### Overview -*Overview of the decision in a clear and concise manner.* +State the decision **clearly and unambiguously**. -## Consequences -*Discuss the positive and negative effects of the decision. Include both immediate outcomes and long-term implications for the project's architecture. 
Highlight how the decision aligns with the challenges outlined in the context.* +- What is being decided? +- What is explicitly *in scope*? +- What is explicitly *out of scope*? -**Positive Effects:** -- List the benefits of the decision. +Use assertive language. +This is the **source of truth**. -**Negative Effects:** -- List the potential drawbacks or challenges introduced by the decision. +--- ## Rationale -*Explain the reasoning behind the decision, including any specific advantages that influenced the choice. This section should reflect the factors mentioned in the context.* -### Considerations -*List any considerations that were part of the decision-making process, such as potential risks, dependency issues, or impacts on existing systems.* +Explain *why this option was chosen* over alternatives. + +Include: +- Key design principles or values (e.g. correctness > convenience) +- Trade-offs consciously accepted +- Alignment with long-term architecture or research goals +- Why this decision reduces risk, ambiguity, or technical debt + +This is where future disagreements get defused. + +--- + +## Considered Alternatives + +List the main alternatives that were seriously considered. + +For each alternative: +- Brief description +- Why it was *not* chosen +- Any conditions under which it might be revisited + +Example format: + +### Alternative A: +- **Pros:** +- **Cons:** +- **Reason for rejection:** + +--- + +## Consequences + +Describe the consequences of this decision. + +### Positive +- Benefits unlocked +- Simplifications introduced +- Risks reduced + +### Negative +- New constraints imposed +- Short-term pain +- Technical debt explicitly accepted + +Be honest. This section builds trust. + +--- + +## Implementation Notes + +Concrete guidance for implementation. 
+ +Include: +- Where the decision should be enforced (code, config, docs, tests) +- Migration strategy (if applicable) +- Required follow-up tasks or refactors +- Guardrails to prevent regression + +If nothing is required yet, say so explicitly. + +--- + +## Validation & Monitoring + +How will we know this decision was correct? + +Examples: +- Tests or invariants that should hold +- Metrics or signals to watch +- Failure modes that would trigger reconsideration + +This turns the ADR into a *living* artifact. + +--- + +## Open Questions -## Additional Notes -*Include any additional information that might be relevant to the decision, such as implications for development workflows, future maintenance, or related decisions.* +List unresolved questions or known unknowns. -## Feedback and Suggestions -*Invite team members or stakeholders to provide feedback or suggest improvements on the decision or its implementation.* +- What do we still not know? +- What depends on future work or data? +- What should be revisited later? --- -**How to Use This Template:** +## References -1. **Copy the contents** of this template into a new markdown file within the ADR folder. -2. **Fill in each section** with specific details about the architectural decision being documented. -3. **Save the file** with a name that follows the naming convention (`NNN-subject-title.md` where `NNN` is the ADR number). -4. **Submit for review** through your project's standard process for documentation updates. +Links to: +- PRs +- Issues +- Design docs +- Papers +- Slack threads / meeting notes -This template ensures that each architectural decision is well-documented, providing a clear and consistent record that supports project development and facilitates understanding across the team. \ No newline at end of file +Future readers should be able to reconstruct the full story. 
diff --git a/documentation/CICs/EvaluationFrame.md b/documentation/CICs/EvaluationFrame.md index 00256de..30c96ed 100644 --- a/documentation/CICs/EvaluationFrame.md +++ b/documentation/CICs/EvaluationFrame.md @@ -2,7 +2,7 @@ **Status:** Active **Owner:** Evaluation Core -**Last reviewed:** 2026-03-13 +**Last reviewed:** 2026-04-02 **Related ADRs:** ADR-010 (Ontology), ADR-011 (Topology), ADR-012 (Authority) --- @@ -66,7 +66,7 @@ The canonical, framework-agnostic internal representation of a forecasting evalu ## 8. Step Semantics -The `step` identifier represents **positional lead time** (1-indexed), not an absolute calendar month. Step 1 is the first month of each forecast origin's prediction window, step 2 is the second, and so on. This is assigned positionally by adapters (e.g., `PandasAdapter.from_dataframes()`) based on the order of unique time values within each origin sequence. +The `step` identifier represents **positional lead time** (1-indexed), not an absolute calendar month. Step 1 is the first month of each forecast origin's prediction window, step 2 is the second, and so on. This is assigned positionally by adapters (e.g., views-pipeline-core's `EvaluationAdapter`) based on the order of unique time values within each origin sequence. **Consequence:** Step 1 in origin A and step 1 in origin B typically refer to *different* calendar months. When `NativeEvaluator` groups data by step, it collects the "diagonals" of the parallelogram β€” all first-month-ahead predictions together, all second-month-ahead together, etc. This is the correct semantic for forecast-horizon evaluation. @@ -91,3 +91,38 @@ ef = EvaluationFrame( month_groups = ef.get_group_indices('time') sub_ef = ef.select_indices(month_groups[100]) ``` + +--- + +## 10. Examples of Incorrect Usage + +- Constructing an `EvaluationFrame` directly with ragged sample arrays (varying S per row). External adapters should guard against this, but direct construction validates only ndim. 
+- Passing DataFrames or Series instead of NumPy arrays β€” the class has zero knowledge of Pandas. +- Omitting required identifier keys (e.g. passing only `time` and `unit` without `origin` and `step`). +- Storing derived or mutable state on an `EvaluationFrame` instance after construction. + +--- + +## 11. Test Alignment + +- **Green:** `tests/test_evaluation_frame.py::TestEvaluationFrameGreen` β€” construction, properties, grouping, selection. +- **Beige:** `tests/test_evaluation_frame.py::TestEvaluationFrameBeige` β€” single-row frames, large sample counts, multi-unit grouping. +- **Red:** `tests/test_evaluation_frame.py::TestEvaluationFrameRed` β€” shape mismatches, NaN/Inf/None in data and identifiers, missing keys. +- **Adversarial:** `tests/test_adversarial_inputs.py::TestAdversarialNativeInputs` β€” NaN/Inf boundary rejection. + +--- + +## 12. Known Deviations + +- **Rectangular sample invariant not enforced:** Direct construction does not validate that all rows of `y_pred` have the same number of samples. Only well-designed external adapters guard against ragged arrays. A directly-constructed frame with ragged `y_pred` would cause indexing errors deep in metric calculations. (Risk register C-03) +- **Integer identifier NaN not checked:** Validation checks float and object identifiers for NaN/None, but integer-typed identifiers are not checked (NumPy integers cannot represent NaN, so this is safe in practice but not explicitly documented). +- **No immutability enforcement:** The contract claims "State Immutability" via new-instance methods, but `y_true`, `y_pred`, and `identifiers` are publicly mutable attributes. Nothing prevents `ef.y_true[0] = 999` after construction. + +--- + +## End of Contract + +This document defines the **intended meaning** of `EvaluationFrame`. + +Changes to behavior that violate this intent are bugs. +Changes to intent must update this contract. 
diff --git a/documentation/CICs/EvaluationReport.md b/documentation/CICs/EvaluationReport.md index 9d1b414..8aa34f9 100644 --- a/documentation/CICs/EvaluationReport.md +++ b/documentation/CICs/EvaluationReport.md @@ -2,7 +2,7 @@ **Status:** Active **Owner:** Evaluation Core -**Last reviewed:** 2026-03-13 +**Last reviewed:** 2026-04-02 **Related ADRs:** ADR-010 (Ontology), ADR-041 (Output Schema) --- @@ -61,7 +61,7 @@ A structured, framework-agnostic container for evaluation results. It decouples ## 7. Boundaries and Interactions - **Upstream**: Produced by **NativeEvaluator**. -- **Downstream**: Consumed by **EvaluationManager**, Pipeline Core, or reporting tools. +- **Downstream**: Consumed by Pipeline Core or reporting tools. --- @@ -73,3 +73,42 @@ df = report.to_dataframe(schema="month") # pd.DataFrame data = report.to_dict() # nested dict schema = report.get_schema_results("month") # dict β†’ typed metrics dataclass ``` + +--- + +## 9. Examples of Incorrect Usage + +- Calling `to_dataframe(schema='raw')` β€” this is deprecated and returns the internal dict, not a DataFrame. Use `to_dict()['schemas']` instead. +- Adding a new metric to `METRIC_CATALOG` without adding a corresponding field to the typed metrics dataclass β€” the FM1 guard will raise `ValueError`. +- Treating the report as mutable and modifying `_results` after construction. + +--- + +## 10. Test Alignment + +- **Green:** `tests/test_evaluation_report.py` β€” construction, schema access, to_dict, to_dataframe. +- **Beige:** `tests/test_evaluation_report.py` β€” empty schemas, single-entry schemas. +- **Red:** `tests/test_evaluation_report.py` β€” missing schema keys, field mismatch (FM1 guard). + +--- + +## 11. Evolution Notes + +- The `to_dataframe()` method imports Pandas lazily. After Phase 3, this method may be removed or moved to an adapter. +- The `_metrics_map` mapping 4 (task, pred_type) combinations to dataclass types is stable but must be extended if new task types are added. 
+ +--- + +## 12. Known Deviations + +- **Lazy Pandas import:** `to_dataframe()` imports `pandas` at call time, which means the Level 1 bridge concern leaks into what is otherwise a Level 0 component. This is a pragmatic compromise for backward compatibility. +- **Legacy dataclass coupling:** `get_schema_results()` wraps results in legacy dataclass instances (`RegressionPointEvaluationMetrics`, etc.) from `metrics.py`. If a metric is computed but has no field in the dataclass, the FM1 guard raises. This means new metrics require coordinated updates to both `metric_catalog.py` and `metrics.py`. + +--- + +## End of Contract + +This document defines the **intended meaning** of `EvaluationReport`. + +Changes to behavior that violate this intent are bugs. +Changes to intent must update this contract. diff --git a/documentation/CICs/MetricCatalog.md b/documentation/CICs/MetricCatalog.md new file mode 100644 index 0000000..c4040c1 --- /dev/null +++ b/documentation/CICs/MetricCatalog.md @@ -0,0 +1,137 @@ +# Class Intent Contract: MetricCatalog + +**Status:** Active +**Owner:** Evaluation Core +**Last reviewed:** 2026-03-31 +**Related ADRs:** ADR-042 (Metric Catalog), ADR-012 (Authority), ADR-013 (Observability) + +--- + +## 1. Purpose + +A genome registry and Chain of Responsibility resolver for evaluation metric hyperparameters. Declares what each metric requires (its genome) but provides NO default values. Values are supplied by named profiles and/or per-model overrides. + +--- + +## 2. Non-Goals (Explicit Exclusions) + +- This module does **not** compute metrics (that is the role of metric calculator functions). +- This module does **not** supply default hyperparameter values (that is the role of named profiles in `views_evaluation/profiles/`). +- This module does **not** validate data shapes or content (that is the role of `EvaluationFrame`). +- This module does **not** know about DataFrames or any external data framework. + +--- + +## 3. 
Responsibilities and Guarantees + +- **Genome Declaration:** Each `MetricSpec` guarantees an immutable declaration of which hyperparameters a metric requires (the `genome` tuple) and whether the metric is implemented. +- **Membership Declaration:** `METRIC_MEMBERSHIP` guarantees a complete mapping of `(task, pred_type)` pairs to valid metric name sets. +- **Chain of Responsibility Resolution:** `resolve_metric_params()` guarantees that hyperparameters are resolved in strict order: model overrides β†’ named profile β†’ fail loud. No silent defaults. +- **Fail-Loud on Missing Params:** Guarantees `ValueError` if a required parameter is missing from both overrides and profile. +- **Fail-Loud on None Values:** Guarantees `ValueError` if a resolved parameter is `None`. +- **Fail-Loud on Unknown Params:** Guarantees `ValueError` if model overrides contain parameters not in the metric's genome. +- **Fail-Loud on Unimplemented Metrics:** Guarantees `ValueError` with clear message if an unimplemented metric is requested. + +--- + +## 4. Inputs and Assumptions + +- **`resolve_metric_params(metric_name, model_overrides, profile)`:** + - `metric_name` must exist in `METRIC_CATALOG`. + - `model_overrides` is a dict of per-metric parameter overrides (may be empty). + - `profile` is a named evaluation profile dict (e.g. `BASE_PROFILE`). +- **Metric functions:** Each function referenced by a `MetricSpec` must accept `(y_true, y_pred, **resolved_params)`. +- **Genome completeness:** All hyperparameters required by a metric function must be declared in the spec's `genome` tuple. + +--- + +## 5. Outputs and Side Effects + +- **`resolve_metric_params()`** returns a `Dict[str, Any]` of resolved hyperparameters ready to pass as `**kwargs` to the metric function. Empty dict for metrics with no genome. +- **No side effects.** The module is purely declarative; no state mutation, no I/O, no logging. + +--- + +## 6. 
Failure Modes and Loudness + +- `ValueError` if `metric_name` is unknown (not in `METRIC_CATALOG`). +- `ValueError` if metric is not implemented (`spec.implemented == False`). +- `ValueError` if a genome parameter is missing from both overrides and profile. +- `ValueError` if a resolved parameter is `None`. +- `ValueError` if overrides contain unknown parameters not in the genome. +- `ValueError` if overrides are provided for a metric with empty genome. + +All failures are immediate and explicit. No warnings, no fallbacks, no silent degradation. + +--- + +## 7. Boundaries and Interactions + +- **Upstream:** Consumed by `NativeEvaluator._calculate_metrics()`. +- **Internal:** Imports metric functions from `native_metric_calculators.py`. +- **Downstream:** Named profiles (`views_evaluation/profiles/`) supply values consumed by the resolver. +- **Isolation:** Must not import Pandas, Polars, or any external data framework. Only depends on `native_metric_calculators` and standard library. + +--- + +## 8. Examples of Correct Usage + +```python +from views_evaluation.evaluation.metric_catalog import METRIC_CATALOG, resolve_metric_params +from views_evaluation.profiles.base import BASE_PROFILE + +# Resolve params for twCRPS using base profile +params = resolve_metric_params("twCRPS", {}, BASE_PROFILE) +# β†’ {"threshold": 0.0} + +# Override threshold for a specific model +params = resolve_metric_params("twCRPS", {"threshold": 2.0}, BASE_PROFILE) +# β†’ {"threshold": 2.0} + +# Metrics with no genome return empty dict +params = resolve_metric_params("MSE", {}, BASE_PROFILE) +# β†’ {} +``` + +--- + +## 9. Examples of Incorrect Usage + +- Hardcoding hyperparameter defaults inside metric function signatures β€” the catalog pattern requires all values to come from profiles or overrides. +- Calling `resolve_metric_params` with `None` as the profile β€” a real profile dict is always required. 
+- Adding a new metric to `METRIC_CATALOG` without adding its genome params to at least one profile β€” all callers will get `ValueError`. +- Passing overrides for metrics with empty genome (e.g. `resolve_metric_params("MSE", {"power": 1.5}, profile)`) β€” raises `ValueError`. + +--- + +## 10. Test Alignment + +- **Green:** `tests/test_metric_catalog.py` β€” registry snapshot integrity, resolver happy path, genome completeness checks. +- **Beige:** `tests/test_metric_catalog.py` β€” partial overrides, profile-only resolution, edge case param values. +- **Red:** `tests/test_metric_catalog.py` β€” unknown metrics, unimplemented metrics, missing params, None values, unknown overrides. +- **Correctness:** `tests/test_metric_correctness.py` β€” golden-value tests (5 tests; coverage gap noted). + +--- + +## 11. Evolution Notes + +- New metrics are added by: (1) implementing the function in `native_metric_calculators.py`, (2) adding a `MetricSpec` to `METRIC_CATALOG`, (3) adding to `METRIC_MEMBERSHIP`, (4) adding genome values to relevant profiles, (5) adding a field to the typed metrics dataclass in `metrics.py`. +- The legacy dispatch dicts were removed in Phase 3. `METRIC_MEMBERSHIP` is the single source of truth. +- Profile structure is stable; new profiles are added by creating a new file in `profiles/`. + +--- + +## 12. Known Deviations + +- **No profile completeness validation:** There is no mechanism to verify that a profile provides values for all metrics with non-empty genomes. A profile missing a metric's params will only fail at evaluation time, not at profile registration. +- **Weak golden-value coverage:** Only 5 tests in `test_metric_correctness.py` verify metric functions against independently computed known answers. Most metrics lack this verification (see risk register C-07). +- **Breaking rename:** The legacy `Brier` metric (unimplemented placeholder) was replaced by `Brier_sample` and `Brier_point` (implemented). 
The field in `ClassificationSampleEvaluationMetrics` was renamed from `Brier` to `Brier_sample`. External consumers accessing `.Brier` on classification sample results must update to `.Brier_sample`. + +--- + +## End of Contract + +This document defines the **intended meaning** of the MetricCatalog module (`MetricSpec`, `METRIC_CATALOG`, `METRIC_MEMBERSHIP`, `resolve_metric_params`). + +Changes to behavior that violate this intent are bugs. +Changes to intent must update this contract. diff --git a/documentation/CICs/NativeEvaluator.md b/documentation/CICs/NativeEvaluator.md index 414f150..e69011d 100644 --- a/documentation/CICs/NativeEvaluator.md +++ b/documentation/CICs/NativeEvaluator.md @@ -2,7 +2,7 @@ **Status:** Active **Owner:** Evaluation Core -**Last reviewed:** 2026-03-13 +**Last reviewed:** 2026-04-02 **Related ADRs:** ADR-010 (Ontology), ADR-011 (Topology), ADR-032 (Schemas), ADR-042 (Metric Catalog) --- @@ -62,7 +62,7 @@ A stateless "Pure Math" engine that executes the three standard Views evaluation ## 7. Boundaries and Interactions -- **Upstream**: Called directly or via legacy `EvaluationManager` (PHASE-3-DELETE). +- **Upstream**: Called directly by evaluation orchestrators (e.g. views-pipeline-core). - **Internal**: Depends on `EvaluationFrame` and `MetricCalculators`. - **Isolation**: Must not depend on any IO or dataframe frameworks. @@ -79,3 +79,45 @@ month_df = report.to_dataframe('month') # pd.DataFrame indexed by group k step_dict = report.to_dict()['schemas']['step'] # raw nested dict schema = report.get_schema_results('time_series') # dict β†’ typed metrics dataclass ``` + +--- + +## 9. Examples of Incorrect Usage + +- Passing a raw dict instead of an `EvaluationFrame` β€” the evaluator expects validated frames, not ad-hoc data. +- Requesting metrics that are not valid for the (task, pred_type) combination β€” e.g. asking for `CRPS` on a point prediction. This will fail loud. 
+- Omitting `evaluation_profile` from config and expecting hardcoded defaults β€” the resolver requires explicit profile selection.
+- Relying on the default `legacy_compatibility=False` without understanding that step-wise results will include steps not present in all origins.
+
+---
+
+## 10. Test Alignment
+
+- **Green:** `tests/test_native_evaluator.py` β€” three-schema evaluation, legacy compat, metric dispatch.
+- **Beige:** `tests/test_native_evaluator.py` β€” sparse step configs, single-origin frames.
+- **Red:** `tests/test_native_evaluator.py`, `tests/test_adversarial_inputs.py` β€” undeclared targets, unimplemented metrics.
+- **Integration:** `tests/test_adversarial_inputs.py` β€” the same adversarial cases (undeclared targets, unimplemented metrics) exercised end-to-end, plus NaN/Inf defense-in-depth.
+
+---
+
+## 11. Evolution Notes
+
+- `legacy_compatibility` default was flipped to `False` in Phase 3. The flag is retained for callers that need truncation behavior.
+- Config validation may be added to `__init__` to catch structural config errors at construction time rather than at evaluation time (currently a known gap β€” risk register C-02).
+- The `EvaluationReport` return type is stable; the internal `_calculate_metrics` dispatch may evolve as the `MetricCatalog` grows.
+
+---
+
+## 12. Known Deviations
+
+- **No config validation at init:** `NativeEvaluator.__init__` only validates the profile name. Missing or malformed config keys cause cryptic errors at evaluation time rather than at construction. (Risk register C-02)
+- **sklearn/scipy in "pure core":** The `NativeEvaluator` dispatches to metric functions that import `sklearn` and `scipy` at module level. This contradicts the stated goal of a zero-external-dep Level 0 core (ADR-011). (Risk register C-05)
+
+---
+
+## End of Contract
+
+This document defines the **intended meaning** of `NativeEvaluator`.
+
+Changes to behavior that violate this intent are bugs.
+Changes to intent must update this contract.
diff --git a/documentation/CICs/PandasAdapter.md b/documentation/CICs/PandasAdapter.md deleted file mode 100644 index 289c2d2..0000000 --- a/documentation/CICs/PandasAdapter.md +++ /dev/null @@ -1,69 +0,0 @@ -# Class Intent Contract: PandasAdapter - -**Status:** Deprecated (PHASE-3-DELETE) -**Owner:** Adapters Layer -**Last reviewed:** 2026-03-13 -**Related ADRs:** ADR-010 (Ontology), ADR-011 (Topology), ADR-012 (Authority), ADR-040 (Input Schema) - ---- - -## 1. Purpose - -A framework-specific bridge that transforms Pandas DataFrames into the canonical `EvaluationFrame`. It encapsulates all the "dirty" logic of alignment, reindexing, and list-extraction. - ---- - -## 2. Non-Goals (Explicit Exclusions) - -- This class does **not** calculate metrics. -- This class does **not** persist data. -- This class does **not** handle other frameworks (like Polars). -- This class does **not** own the authoritative math core. - ---- - -## 3. Responsibilities and Guarantees - -- **MultiIndex Translation**: Guarantees that Pandas index levels (time, unit) are correctly mapped to `EvaluationFrame` identifiers. -- **Alignment (Truth Duplication)**: Responsible for performing the intersection of indices and duplicating `actuals` to match the sequence-based structure of `predictions`. -- **Sample Extraction**: Guarantees that "lists-in-cells" are correctly exploded into dense 2D NumPy arrays. -- **Metadata Declaration**: Responsible for explicitly declaring task and prediction types (as per ADR-012). - ---- - -## 4. Inputs and Assumptions - -- **Pandas Objects**: Expects `pd.DataFrame` and `List[pd.DataFrame]`. -- **Naming Conventions**: Assumes `month_id` and `entity_id` structure in MultiIndex. -- **Rectangular Samples**: Assumes that all prediction cells in a given task contain the same number of samples (or scalars). - ---- - -## 5. Outputs and Side Effects - -- **EvaluationFrame**: Produces a single, pre-aligned, flattened `EvaluationFrame`. - ---- - -## 6. 
Failure Modes and Loudness - -- Silently skips prediction DataFrames whose index has no overlap with actuals (continues to the next sequence). -- Raises `ValueError` if sample lengths are inconsistent across cells. -- Fails loud if the input is not a DataFrame. - ---- - -## 7. Boundaries and Interactions - -- **Upstream**: Called by users or legacy `EvaluationManager` (PHASE-3-DELETE). -- **Downstream**: Produces input for `EvaluationFrame`. -- **Isolation**: This is one of the few places where a `pandas` import is allowed. -- **Deprecation**: Emits `DeprecationWarning` on use. Will be removed from this repo in Phase 3; adapters belong in the calling repository (e.g. `views-pipeline-core`). - ---- - -## 8. Examples of Correct Usage - -```python -ef = PandasAdapter.from_dataframes(actual_df, [pred_df1, pred_df2], "target_name") -``` diff --git a/documentation/CICs/README.md b/documentation/CICs/README.md index afc7f7e..c581396 100644 --- a/documentation/CICs/README.md +++ b/documentation/CICs/README.md @@ -50,11 +50,10 @@ Contracts must be clear enough that: ## Active Contracts -- `EvaluationFrame.md` -- `NativeEvaluator.md` -- `EvaluationReport.md` -- `PandasAdapter.md` (PHASE-3-DELETE) -- (Add entries as they are created) +- `EvaluationFrame.md` β€” Canonical NumPy data container +- `NativeEvaluator.md` β€” Pure math evaluation engine +- `EvaluationReport.md` β€” Structured result container +- `MetricCatalog.md` β€” Genome registry and parameter resolver --- @@ -65,5 +64,6 @@ Intent Contracts are governed by: - ADR-021 (Intent Contracts for Classes) - ADR-012 (Authority over Inference) - ADR-020 (Multi-Perspective Testing) +- ADR-042 (Metric Catalog) If a class changes meaning, its Intent Contract must be updated. 
diff --git a/documentation/INSTANTIATION_CHECKLIST.md b/documentation/INSTANTIATION_CHECKLIST.md new file mode 100644 index 0000000..b7277dd --- /dev/null +++ b/documentation/INSTANTIATION_CHECKLIST.md @@ -0,0 +1,72 @@ +# Instantiation Checklist + +Use this checklist to track the base_docs governance adoption for views-evaluation. + +--- + +## Before You Start + +- [x] Decide which adoption phase you're targeting +- [x] Identify your project's ontological categories (ADR-010) + +--- + +## ADR Adaptation + +### All adopted ADRs +- [x] Update Status from `--template--` to `Proposed` or `Accepted` +- [x] Fill in Date, Deciders, Consulted, Informed fields + +### Per-ADR adaptation notes +- [x] **ADR-000:** Updated path reference to `documentation/ADRs/` +- [x] **ADR-010 (base 001):** Defined project's ontological categories (EvaluationFrame, NativeEvaluator, etc.) +- [x] **ADR-011 (base 002):** Defined 3-level layering and forbidden dependency patterns +- [x] **ADR-012 (base 003):** Adapted forbidden behavior examples to evaluation domain (no sniffing, no type inference) +- [x] **ADR-020 (base 005):** Adapted test taxonomy for forecasting evaluation domain +- [x] **ADR-021 (base 006):** No domain adaptation needed (criteria are universal) +- [x] **ADR-001 (base 007):** Adapted silicon agent rules to views-evaluation tooling +- [x] **ADR-014 (base 009):** Adapted boundary examples to Adapter-Core and Config-Runtime boundaries +- [x] **ADR-023:** Created technical risk register ADR + +--- + +## CICs + +- [x] Replace placeholder active contracts list in `CICs/README.md` with project contracts +- [x] Create intent contracts for non-trivial classes: + - [x] EvaluationFrame.md + - [x] NativeEvaluator.md + - [x] EvaluationReport.md + - [x] PandasAdapter.md (removed in Phase 3) + - [x] MetricCatalog.md + +--- + +## Contributor Protocols + +- [x] Review and adapt `contributor_protocols/silicon_based_agents.md` for project tooling +- [x] Review and adapt 
`contributor_protocols/carbon_based_agents.md` for the project team
+- [x] Adapt `contributor_protocols/hardened_protocol_template.md` for the numerical computation domain
+
+---
+
+## Standards
+
+- [x] Review `standards/logging_and_observability_standard.md` β€” adapted scope for Level 0 pure-math exception propagation
+- [x] Review `standards/physical_architecture_standard.md` β€” includes critical bundling assessment
+
+---
+
+## Risk Register
+
+- [x] Created `reports/technical_risk_register.md` seeded with 9 concerns from repo-assimilation
+- [x] Created ADR-023 governing the risk register
+
+---
+
+## Final Verification
+
+- [x] No files still have Status `--template--` (except ADR-022 which is intentionally deferred)
+- [ ] No phantom references to non-existent files
+- [ ] All cross-ADR references resolve correctly
+- [ ] Run `validate_docs.sh` to check internal consistency
diff --git a/documentation/contributor_protocols/hardened_protocol_template.md b/documentation/contributor_protocols/hardened_protocol_template.md
new file mode 100644
index 0000000..25ca178
--- /dev/null
+++ b/documentation/contributor_protocols/hardened_protocol_template.md
@@ -0,0 +1,79 @@
+# The Hardened Protocol: Contributor Governance for Numerical Evaluation
+
+This document defines the mandatory engineering and mathematical standards for the `views-evaluation` repository. Adherence to this protocol is required for all contributions that affect metric computation, data transformation, or evaluation logic to guarantee scientific integrity and reproducibility.
+
+---
+
+## 1. Core Principles
+
+### A. The Authority of Declarations (ADR-012)
+**"Never infer; only trust declarations."**
+All meaningful semantics (task types, prediction types, metric hyperparameters, step identifiers) must be explicitly declared in configuration or the `EvaluationFrame`.
+- **Prohibited:** Type-sniffing from cell contents, step inference from row position without explicit assignment, scaling inference from target name prefixes. +- **Requirement:** If a parameter affects metric computation (e.g. twCRPS threshold, Coverage alpha), it must be a declared gene in the `MetricCatalog` genome and resolved via Chain of Responsibility. + +### B. The Fail-Loud Mandate (ADR-013) +**"A crash is a successful defense of scientific integrity."** +Silent failures, implicit fallbacks, and "best-effort" corrections are forbidden. +- **Requirement:** Violations of data, configuration, or semantic invariants must raise explicit `ValueError` immediately. +- **Prohibited:** Using `np.nan_to_num`, silent clipping, "sensible defaults" for critical metric parameters, or downgrading errors to warnings. + +### C. The Numerical Airlock (EvaluationFrame._validate) +All data entering the evaluation system must pass through the `EvaluationFrame` validation boundary. +- **Requirement:** Reject NaN and Inf values in observations and predictions at construction time. +- **Requirement:** Reject NaN/None in all identifier arrays at construction time. +- **Requirement:** Enforce shape consistency: `y_true` (N,), `y_pred` (N, S), all identifiers (N,). + +### D. The Metric Genome Contract (ADR-042) +**"No silent defaults."** +Every metric hyperparameter must be declared in the `MetricSpec.genome` tuple and resolved explicitly. +- **Requirement:** New metrics must declare all required hyperparameters in their genome. +- **Requirement:** Metric functions must use keyword-only arguments without defaults for genome parameters. +- **Prohibited:** Hardcoding default values in metric function signatures. + +--- + +## 2. Contributor Requirements + +### Adding a New Metric +1. **Implement the function** in `native_metric_calculators.py` with keyword-only args for genome parameters. +2. **Register in catalog:** Add a `MetricSpec` to `METRIC_CATALOG` in `metric_catalog.py`. +3. 
**Declare membership:** Add the metric name to the appropriate set in `METRIC_MEMBERSHIP`. +4. **Add to profile:** Add genome parameter values to `BASE_PROFILE` (and other relevant profiles). +5. **Add dataclass field:** Add the metric as `Optional[float] = None` to the appropriate typed metrics dataclass in `metrics.py`. +6. **Write tests:** Include at minimum one golden-value test and one red-team test. + +### Modifying an Existing Metric +1. **Update the CIC** if the change affects behavior described in the intent contract. +2. **Verify parity** by running the full Green/Beige/Red test suite. +3. **Update golden-value tests** if numerical output changes. + +--- + +## 3. Mandatory Testing Taxonomy (ADR-020) + +Every Pull Request affecting metric computation must include tests covering: + +### Green Team (Stability & Correctness) +- **Goal:** Ensure the metric produces correct values for known inputs. +- **Examples:** Golden-value tests against analytical solutions, CRPS parity with `properscoring`, bit-identical results across schemas. + +### Beige Team (Configuration & Human Error) +- **Goal:** Catch failures caused by common configuration mistakes or missing parameters. +- **Examples:** Missing genome parameters in profile, requesting unimplemented metrics, mismatched task/pred_type combinations. + +### Red Team (Adversarial) +- **Goal:** Expose failure modes by deliberately trying to make the system produce wrong results silently. +- **Examples:** NaN injection in predictions, Inf in observations, ragged sample arrays, zero-variance inputs. + +--- + +## 4. Operational Invariants + +- **Shape Guard Defense-in-Depth:** All metric functions call `_guard_shapes()` even though `EvaluationFrame._validate()` has already checked. This is deliberate double-checking, not redundancy to remove. +- **Profile Consistency:** All profiles must provide values for all metrics with non-empty genomes that may be requested in evaluations using that profile. 
+- **Schema Reproducibility:** Month-wise, time-series-wise, and step-wise schemas must produce identical results regardless of the order of input rows (grouping is by identifier value, not position). + +--- + +**"In this repository, we value explicit correctness over convenient execution."** diff --git a/documentation/evaluation_concepts.md b/documentation/evaluation_concepts.md index 53814a3..b5dc8f8 100644 --- a/documentation/evaluation_concepts.md +++ b/documentation/evaluation_concepts.md @@ -35,7 +35,7 @@ This parallelogram is the fundamental data structure that is analyzed by the thr ## 3. The Three Evaluation Schemas -The `EvaluationManager` assesses the predictive parallelogram by "slicing" it in three different ways. Each schema groups the data differently to answer a unique question about model performance. +The evaluation framework assesses the predictive parallelogram by "slicing" it in three different ways. Each schema groups the data differently to answer a unique question about model performance. ### Schema 1: Time-series-wise Evaluation diff --git a/documentation/integration_guide.md b/documentation/integration_guide.md index 0631dc6..3efb466 100644 --- a/documentation/integration_guide.md +++ b/documentation/integration_guide.md @@ -8,40 +8,31 @@ what the library does and does not do with your data. ## 1. 
Architecture Overview -The library has three layers: +The library is a pure-math evaluation engine with two core components: ``` - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Adapters (Bridge Layer) β”‚ - β”‚ PandasAdapter β€” converts List[DataFrame] β”‚ - β”‚ to EvaluationFrame; synthesises identifiersβ”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ EvaluationFrame (Core) β”‚ + β”‚ Pure NumPy container: y_true, y_pred, β”‚ + β”‚ identifiers {time, unit, origin, step} β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ EvaluationFrame (Core) β”‚ - β”‚ Pure NumPy container: y_true, y_pred, β”‚ - β”‚ identifiers {time, unit, origin, step} β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ NativeEvaluator (Pure Math) β”‚ + β”‚ Stateless engine: executes month-wise, β”‚ + β”‚ sequence-wise, and step-wise schemas β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ 
NativeEvaluator (Pure Math) β”‚ - β”‚ Stateless engine: executes month-wise, β”‚ - β”‚ sequence-wise, and step-wise schemas β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ EvaluationReport (Results) β”‚ - β”‚ Framework-agnostic results container; β”‚ - β”‚ exposes to_dict(), to_dataframe(), β”‚ - β”‚ get_schema_results() β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ EvaluationReport (Results) β”‚ + β”‚ Framework-agnostic results container; β”‚ + β”‚ exposes to_dict(), to_dataframe(), β”‚ + β”‚ get_schema_results() β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ ``` -`EvaluationManager` is a **legacy orchestrator** that wraps all four layers behind a single -`evaluate()` call. It is retained for backward compatibility and will be removed in Phase 3 of the -orchestrator migration (see `reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md`). - -**New integrations should use the native API (Β§2). The legacy API is documented in Β§3.** +Callers (e.g. views-pipeline-core) are responsible for constructing `EvaluationFrame` from their +own data formats. This library has no knowledge of Pandas, Polars, or any external data framework. --- @@ -56,7 +47,7 @@ pip install pandas numpy # only needed to prepare input DataFrames ### 2.2. 
Identifier Glossary -All evaluation logic operates on four identifiers that `PandasAdapter` synthesises from your input. +All evaluation logic operates on four identifiers that must be provided in the `EvaluationFrame`. Understanding them is required: | Identifier | Type | Meaning | @@ -114,57 +105,45 @@ use `[1, 2, ..., 12]`. Sparse configs (e.g. `[1, 3, 6, 12]`) evaluate only those ```python import numpy as np -import pandas as pd -from views_evaluation.evaluation.adapters.pandas import PandasAdapter -from views_evaluation.evaluation.native_evaluator import NativeEvaluator - -# --- 1. Prepare actuals --- -actuals_index = pd.MultiIndex.from_product( - [range(500, 513), [101, 102]], - names=['month_id', 'country_id'] +from views_evaluation import EvaluationFrame, NativeEvaluator + +# --- 1. Construct EvaluationFrame from NumPy arrays --- +ef = EvaluationFrame( + y_true=y_true_array, # shape (N,) + y_pred=y_pred_array, # shape (N, S) where S >= 1 + identifiers={ + 'time': time_ids, # shape (N,) β€” calendar month ids + 'unit': unit_ids, # shape (N,) β€” spatial entity ids + 'origin': origin_ids, # shape (N,) β€” sequence index + 'step': step_ids, # shape (N,) β€” 1-indexed lead time + }, + metadata={'target': 'ged_sb_best'}, ) -actuals = pd.DataFrame( - {'ged_sb_best': np.random.randint(0, 20, size=26)}, - index=actuals_index -) - -# --- 2. Prepare predictions list (2 sequences, 12 steps each) --- -target = 'ged_sb_best' -pred_col = f'pred_{target}' -predictions_list = [] -for origin_offset in range(2): - months = range(500 + origin_offset, 512 + origin_offset) - idx = pd.MultiIndex.from_product([months, [101, 102]], names=['month_id', 'country_id']) - preds = pd.DataFrame({pred_col: [[v] for v in np.random.rand(len(idx)) * 20]}, index=idx) - predictions_list.append(preds) - -# --- 3. Configure --- +# --- 2. 
Configure --- config = { 'steps': list(range(1, 13)), - 'regression_targets': [target], + 'regression_targets': ['ged_sb_best'], 'regression_point_metrics': ['MSE', 'RMSLE', 'Pearson'], } -# --- 4. Adapt and evaluate --- -ef = PandasAdapter.from_dataframes(actual=actuals, predictions=predictions_list, target=target) - +# --- 3. Evaluate --- evaluator = NativeEvaluator(config) -report = evaluator.evaluate(ef) # legacy_compatibility=True by default +report = evaluator.evaluate(ef) -# --- 5. Access results --- -print(report.to_dataframe('step')) # step-wise DataFrame (MSE, RMSLE, Pearson per step) +# --- 4. Access results --- +print(report.to_dict()) # full nested dict +print(report.to_dataframe('step')) # step-wise DataFrame print(report.to_dataframe('month')) # month-wise DataFrame print(report.to_dataframe('time_series')) # sequence-wise DataFrame -print(report.to_dict()) # full nested dict ``` ### 2.6. The `legacy_compatibility` Flag -`NativeEvaluator.evaluate(ef, legacy_compatibility=True)` (default) caps step-wise evaluation to +`NativeEvaluator.evaluate(ef, legacy_compatibility=True)` caps step-wise evaluation to the shortest sequence in the frame. If origin 0 has 12 steps and origin 1 has only 10 steps, -legacy mode evaluates steps 1–10 and leaves steps 11–12 empty. This reproduces a historic zip -truncation behaviour required for parity with the legacy system. +legacy mode evaluates steps 1–10 and leaves steps 11–12 empty. The default is `False` (evaluate +all steps with available data). Set `legacy_compatibility=False` to evaluate all steps that have any data, regardless of whether shorter sequences exist. @@ -189,57 +168,13 @@ report.get_schema_results('month') # dict mapping key β†’ typed metrics datacla --- -## 3. The Legacy API (`EvaluationManager`) - -> **Deprecation notice:** `EvaluationManager` will be removed in Phase 3 of the orchestrator -> migration. New integrations must use the native API (Β§2). 
This section is retained for teams -> currently using the legacy path. - -### 3.1. Differences from the Native API - -- Accepts the same DataFrame inputs and config as Β§2. -- Applies **inverse transforms** based on target name prefixes: - - `ln_` prefix: applies `exp(x) - 1` to both actuals and predictions - - `lx_` prefix: applies a custom inverse log transform - - `lr_` prefix: no transform (raw values) - - No prefix: no transform - This behaviour is **absent** from the native path, which always operates on data as provided. -- Returns a dict of `{schema: (dict, DataFrame)}` tuples, not an `EvaluationReport`. -- `legacy_compatibility` is hardcoded to `True` (cannot be changed). - -### 3.2. Usage - -```python -from views_evaluation.evaluation.evaluation_manager import EvaluationManager - -manager = EvaluationManager() -config = { - 'steps': [1, 2, 3], - 'regression_targets': ['lr_ged_sb_best'], - 'regression_point_metrics': ['MSE', 'RMSLE', 'Pearson'] -} - -results = manager.evaluate( - actual=actuals, # same format as Β§2.3 - predictions=predictions_list, - target='lr_ged_sb_best', - config=config -) - -# Access results (tuple format β€” not EvaluationReport) -step_df = results['step'][1] # index 1 = DataFrame -step_dict = results['step'][0] # index 0 = raw dict -``` - ---- - -## 4. What This Library Does NOT Do +## 3. What This Library Does NOT Do -- **Does not load or save data.** Pass DataFrames in; get an `EvaluationReport` (or dict) out. +- **Does not load or save data.** Construct `EvaluationFrame` from NumPy arrays; get an `EvaluationReport` out. +- **Does not perform data alignment or adaptation.** Callers (e.g. views-pipeline-core's `EvaluationAdapter`) are responsible for aligning actuals with predictions and synthesising identifiers. - **Does not enforce k=12 or 36-month sequences.** The VIEWS standard (ADR-030) recommends k=12 rolling origins over 36-month evaluation windows, but this library accepts any sequence count and length. 
-- **Does not validate spatial or temporal alignment.** The adapter performs index intersection, but - it does not verify that sequences are in chronological order or that all origins cover the same - calendar range. +- **Does not validate spatial or temporal alignment.** It verifies shape consistency and NaN/Inf + rejection, but does not verify that sequences are chronologically ordered. - **Does not produce output files.** Persistence is handled by `views-pipeline-core` per ADR-041. diff --git a/documentation/standards/logging_and_observability_standard.md b/documentation/standards/logging_and_observability_standard.md index 3efcaf0..4191a05 100644 --- a/documentation/standards/logging_and_observability_standard.md +++ b/documentation/standards/logging_and_observability_standard.md @@ -125,7 +125,7 @@ The following must be logged: * Configuration summaries * All structural failures -> **Scope note:** Level 0 pure-math classes (`EvaluationFrame`, `NativeEvaluator`, `EvaluationReport`) rely on exception propagation per ADR-013 and do not maintain their own loggers. Logging responsibility for these components sits at the orchestration layer (e.g. `EvaluationManager` or calling code in `views-pipeline-core`). +> **Scope note:** Level 0 pure-math classes (`EvaluationFrame`, `NativeEvaluator`, `EvaluationReport`) rely on exception propagation per ADR-013 and do not maintain their own loggers. Logging responsibility for these components sits at the orchestration layer (e.g. calling code in `views-pipeline-core` or equivalent orchestrators). ### 5.2 Optional Logging diff --git a/documentation/standards/physical_architecture_standard.md b/documentation/standards/physical_architecture_standard.md new file mode 100644 index 0000000..e7c1dca --- /dev/null +++ b/documentation/standards/physical_architecture_standard.md @@ -0,0 +1,127 @@ +# Physical Architecture Standard + +**Status:** Active +**Governing ADRs:** ADR-010 (Ontology), ADR-011 (Topology) + +--- + +## 1. 
The 1-Class-1-File Standard + +**Every non-trivial class must live in its own file named after the class in `snake_case`.** + +- **Correct:** `EvaluationFrame` lives in `evaluation_frame.py`. +- **Correct:** `NativeEvaluator` lives in `native_evaluator.py`. +- **Exception:** Trivial data containers directly related to a class may coexist in the same file. + +--- + +## 2. Directory Ontology + +Files must be located in directories that match their functional category: + +``` +views_evaluation/ +β”œβ”€β”€ evaluation/ # Core evaluation logic (Level 0) +β”‚ β”œβ”€β”€ evaluation_frame.py +β”‚ β”œβ”€β”€ native_evaluator.py +β”‚ β”œβ”€β”€ metric_catalog.py +β”‚ β”œβ”€β”€ native_metric_calculators.py +β”‚ β”œβ”€β”€ evaluation_report.py +β”‚ β”œβ”€β”€ metrics.py +β”‚ └── config_schema.py +β”œβ”€β”€ adapters/ # Reserved for future framework bridges +β”‚ └── __init__.py +└── profiles/ # Named evaluation profiles + β”œβ”€β”€ base.py + └── hydranet_ucdp.py +``` + +--- + +## 3. Current State Assessment β€” Bundling + +### Compliant + +| File | Contents | Verdict | +|------|----------|---------| +| `evaluation_frame.py` | `EvaluationFrame` (1 class) | Compliant | +| `native_evaluator.py` | `NativeEvaluator` (1 class) | Compliant | +| `metric_catalog.py` | `MetricSpec` + `METRIC_CATALOG` + `METRIC_MEMBERSHIP` + `resolve_metric_params` | Cohesive module β€” spec, registries, and resolver form a single concept | +| `evaluation_report.py` | `EvaluationReport` (1 class) | Compliant | +| `config_schema.py` | `EvaluationConfig` (1 TypedDict) | Compliant | + +### Defensible Exception + +| File | Contents | Verdict | +|------|----------|---------| +| `metrics.py` | 5 dataclasses: `BaseEvaluationMetrics` + 4 typed 2x2 containers | Defensible β€” trivial data containers sharing a base class. Splitting into 5 files would create fragmentation without improving discoverability. 
| 
+
+### Identified Challenge
+
+| File | Contents | Concern |
+|------|----------|---------|
+| `native_metric_calculators.py` | 437 lines: `_guard_shapes` (shared guard), 15+ implemented metric functions spanning 4 categories, 4 placeholder stubs, 4 legacy dispatch dicts, 1 legacy alias | **Bundling challenge** |
+
+**Analysis of `native_metric_calculators.py`:**
+
+This file bundles heterogeneous concerns:
+
+1. **Shared utility** (`_guard_shapes`) — used by all metrics, should arguably be its own module or remain as a private helper.
+
+2. **Four metric families:**
+   - Regression point: MSE, MSLE, RMSLE, EMD, Pearson, MTD, y_hat_bar, MCR
+   - Regression sample: CRPS, twCRPS, MIS, QIS, QS_sample, Coverage, Ignorance
+   - Classification point: AP, Brier_point, QS_point
+   - Classification sample: Brier_sample
+
+3. **Placeholder stubs** for unimplemented metrics (SD, pEMDiv, Variogram, Jeffreys).
+
+4. **Legacy dispatch dicts and alias** — duplicate the `METRIC_MEMBERSHIP` registry and are slated for removal in Phase 3, after which `METRIC_MEMBERSHIP` becomes the single source of truth.
+
+**Why this is a challenge:**
+- The file mixes 4 distinct metric families. Adding a new regression-sample metric requires editing a 437-line file that also contains classification metrics.
+- The legacy dispatch dicts at the bottom duplicate the `METRIC_MEMBERSHIP` registry (risk C-01).
+- The file has the highest line count of any source module and the most heterogeneous responsibility set.
+
+**Why splitting is not straightforward:**
+- All metric functions share `_guard_shapes`. Splitting would either duplicate it or create a shared utility module.
+- The `MetricCatalog` imports all 22 functions from this single module. Splitting would require updating the catalog's import block.
+- Functions are stateless and flat — they are not classes, so the 1-class-1-file rule does not directly apply. 
+ +**Recommendation (for future consideration):** +If and when the file exceeds ~600 lines or the metric count exceeds ~30, consider splitting into: +``` +evaluation/ +β”œβ”€β”€ metric_calculators/ +β”‚ β”œβ”€β”€ __init__.py (re-exports all functions) +β”‚ β”œβ”€β”€ _guard.py (_guard_shapes) +β”‚ β”œβ”€β”€ regression_point.py +β”‚ β”œβ”€β”€ regression_sample.py +β”‚ β”œβ”€β”€ classification.py +β”‚ └── placeholders.py +``` + +This is a **future evolution path**, not a current mandate. The current bundling is tolerable but approaches the threshold where it creates friction. + +--- + +## 4. Import Conventions + +- **Explicit imports:** Avoid `from module import *`. +- **Circular dependency guard:** Follow ADR-011 layering. Level 0 modules must not import from Level 1 or Level 2. +- **Lazy imports for Pandas:** Pandas is imported inside methods (e.g. `to_dataframe()`) rather than at module level in Level 0/1 code. + +--- + +## 5. Enforcement + +Compliance with this standard is assessed during: +- Code review +- Repo-assimilation audits +- Tech debt cleanup cycles + +PRs that introduce new multi-class files or significantly expand existing bundled files should document the justification. + +--- + +**"The structure of the files is as rigorous as the logic of the code."** diff --git a/documentation/validate_docs.sh b/documentation/validate_docs.sh new file mode 100755 index 0000000..8bd3b9b --- /dev/null +++ b/documentation/validate_docs.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash +# Validates internal consistency of base_docs documentation set. +# Exit 0 if clean, exit 1 if issues found. + +set -uo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +errors=0 + +echo "=== base_docs validation ===" +echo "" + +# 1. 
Check for unfilled template placeholders in accepted/active files
+# (skip files whose names contain "template" — those are expected to have placeholders)
+# These are warnings only (non-blocking) since in the template repo some
+# files are legitimately Accepted with placeholder dates.
+echo "--- Checking for template placeholders in accepted/active files ---"
+warnings=0
+while IFS= read -r file; do
+  [[ -z "$file" ]] && continue
+  [[ "$file" == *template* ]] && continue
+  if grep -q 'YYYY-MM-DD' "$file"; then
+    echo "  WARN: Unfilled date placeholder in $file"
+    warnings=$((warnings + 1))
+  fi
+  if grep -q '<Deciders>' "$file"; then
+    echo "  WARN: Unfilled deciders placeholder in $file"
+    warnings=$((warnings + 1))
+  fi
+  if grep -q '<ClassName>' "$file"; then
+    echo "  WARN: Unfilled ClassName placeholder in $file"
+    warnings=$((warnings + 1))
+  fi
+done < <(grep -rl 'Status:.*\(Accepted\|Active\)' --include='*.md' . 2>/dev/null || true)
+if [ "$warnings" -eq 0 ]; then
+  echo "  OK"
+fi
+
+# 2. Verify CIC active contracts exist (skip blockquote/example lines)
+echo "--- Checking CIC active contract references ---"
+if [ -f "CICs/README.md" ]; then
+  while IFS= read -r line; do
+    [[ -z "$line" ]] && continue
+    contract=$(echo "$line" | sed -n 's/^- `\(.*\.md\)`.*$/\1/p')
+    if [ -n "$contract" ] && [ ! -f "CICs/$contract" ]; then
+      echo "  ERROR: CIC contract listed but missing: CICs/$contract"
+      errors=$((errors + 1))
+    fi
+  done < <(grep -E '^- `[A-Z].*\.md`' CICs/README.md 2>/dev/null | grep -v '>' || true)
+fi
+
+# 3. 
Cross-ADR reference integrity (constitutional ADRs 000-009 only; +# higher numbers are project-specific and not expected in the template repo) +echo "--- Checking cross-ADR references (constitutional: 000-009) ---" +while IFS= read -r ref; do + [[ -z "$ref" ]] && continue + file=$(echo "$ref" | cut -d: -f1) + adr_num=$(echo "$ref" | grep -oP 'ADR-00\K[0-9]' | head -1) + if [ -n "$adr_num" ]; then + match_count=$(find ADRs -name "00${adr_num}_*.md" 2>/dev/null | wc -l) + if [ "$match_count" -eq 0 ]; then + echo " ERROR: $file references ADR-00${adr_num} but no matching file found" + errors=$((errors + 1)) + fi + fi +done < <(grep -rn 'ADR-00[0-9]' --include='*.md' . 2>/dev/null || true) + +# 4. Check that referenced protocol files exist +echo "--- Checking protocol file references ---" +while IFS= read -r ref; do + [[ -z "$ref" ]] && continue + file=$(echo "$ref" | cut -d: -f1) + proto=$(echo "$ref" | grep -oP 'contributor_protocols/[a-z_]+\.md' | head -1) + if [ -n "$proto" ] && [ ! -f "$proto" ]; then + echo " ERROR: $file references $proto but file does not exist" + errors=$((errors + 1)) + fi +done < <(grep -rn 'contributor_protocols/' --include='*.md' . 2>/dev/null || true) + +# 5. Report template status markers +echo "--- Checking template status markers ---" +template_count=$(grep -rl '\-\-template\-\-' --include='*.md' . 
2>/dev/null | wc -l) +echo " INFO: $template_count files still have --template-- status (expected in template repo)" + +echo "" +if [ "$errors" -gt 0 ]; then + echo "=== FAILED: $errors issue(s) found ===" + exit 1 +else + echo "=== PASSED: no issues found ===" + exit 0 +fi diff --git a/examples/evaluate_native_prototype.py b/examples/evaluate_native_prototype.py index 1e252ed..c131e36 100644 --- a/examples/evaluate_native_prototype.py +++ b/examples/evaluate_native_prototype.py @@ -1,56 +1,50 @@ +""" +Example: EvaluationFrame grouping and metric computation + +Demonstrates how EvaluationFrame provides month-wise, step-wise, +and origin-wise grouping for evaluation schemas. +""" import numpy as np -import pandas as pd -from views_evaluation.evaluation.adapters import PandasAdapter from views_evaluation.evaluation.evaluation_frame import EvaluationFrame + def mock_metrics_mse(ef: EvaluationFrame) -> float: - """A 'native' metric that uses broadcasting.""" - # y_true (N,) broadcasts to (N, S) - # y_pred (N, S) - # result (N, S) -> mean(axis=1) -> (N,) -> mean() -> scalar + """A native metric using broadcasting: mean((y_true - y_pred)^2).""" errors = (ef.y_true[:, np.newaxis] - ef.y_pred) ** 2 return np.mean(errors) -def run_parity_demo(): - # 1. Create dummy data mimicking the current structure - index = pd.MultiIndex.from_product([[100, 101, 102], [1, 2]], names=['month', 'unit']) - actual = pd.DataFrame({'target': np.random.rand(6)}, index=index) - - # Two sequences (overlapping) - pred_0 = pd.DataFrame({'pred_target': [[x, x+0.1] for x in np.random.rand(4)]}, - index=index[:4]) - pred_1 = pd.DataFrame({'pred_target': [[x, x+0.1] for x in np.random.rand(4)]}, - index=index[2:]) - - print("--- 1. 
Adapter Phase ---") - ef = PandasAdapter.from_dataframes(actual, [pred_0, pred_1], "target") + +def run_demo(): + rng = np.random.default_rng(42) + + # Build 2 overlapping sequences, 3 months each, 2 units + rows, y_true_list, y_pred_list = [], [], [] + for origin in range(2): + for step_idx, month in enumerate(range(100 + origin, 103 + origin)): + for unit in [1, 2]: + rows.append((month, unit, origin, step_idx + 1)) + y_true_list.append(rng.random()) + y_pred_list.append([rng.random(), rng.random()]) # 2-sample ensemble + + ef = EvaluationFrame( + y_true=np.array(y_true_list), + y_pred=np.array(y_pred_list), + identifiers={ + 'time': np.array([r[0] for r in rows]), + 'unit': np.array([r[1] for r in rows]), + 'origin': np.array([r[2] for r in rows]), + 'step': np.array([r[3] for r in rows]), + }, + metadata={'target': 'target'}, + ) print(ef) - print("\n--- 2. Schema Preservation ---") - - # Month-wise - print("\nMonth-wise Groups:") - month_groups = ef.get_group_indices('time') - for month, idx in month_groups.items(): - sub_ef = ef.select_indices(idx) - mse = mock_metrics_mse(sub_ef) - print(f" Month {month}: {sub_ef.n_rows} rows, MSE={mse:.4f}") - - # Step-wise - print("\nStep-wise Groups:") - step_groups = ef.get_group_indices('step') - for step, idx in step_groups.items(): - sub_ef = ef.select_indices(idx) - mse = mock_metrics_mse(sub_ef) - print(f" Step {step}: {sub_ef.n_rows} rows, MSE={mse:.4f}") - - # Sequence-wise (Origin-wise) - print("\nSequence-wise Groups:") - origin_groups = ef.get_group_indices('origin') - for origin, idx in origin_groups.items(): - sub_ef = ef.select_indices(idx) - mse = mock_metrics_mse(sub_ef) - print(f" Sequence {origin}: {sub_ef.n_rows} rows, MSE={mse:.4f}") + for group_key in ['time', 'step', 'origin']: + print(f"\n{group_key.title()}-wise Groups:") + for val, idx in ef.get_group_indices(group_key).items(): + sub = ef.select_indices(idx) + print(f" {group_key}={val}: {sub.n_rows} rows, MSE={mock_metrics_mse(sub):.4f}") + if 
__name__ == "__main__": - run_parity_demo() + run_demo() diff --git a/examples/quickstart.ipynb b/examples/quickstart.ipynb deleted file mode 100644 index 84b36d7..0000000 --- a/examples/quickstart.ipynb +++ /dev/null @@ -1,371 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Quick Start\n", - "In this notebook, we go over the main functionalities of the library\n", - "\n", - "## Table of Contents\n", - "1. [Installation](#installation)\n", - "2. [Importing Libraries](#importing-libraries)\n", - "3. [Creating Sample Data](#creating-sample-data)\n", - "4. [Initializing EvaluationManager](#initializing-evaluationmanager)\n", - "5. [Evaluating Predictions](#evaluating-predictions)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Installation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We recommend using some virtual environment" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "With pip" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`pip install views-evaluation`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Importing Libraries\n", - "First, let's import a few things:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from views_evaluation.evaluation.evaluation_manager import EvaluationManager\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating Sample Data\n", - "\n", - "Let's create some sample data for actual values and predictions." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "index = pd.MultiIndex.from_tuples(\n", - " [(99, 1), (99, 2), (100, 1), (100, 2), (101, 1), (101, 2), (102, 1), (102, 2)],\n", - " names=[\"month\", \"country\"],\n", - ")\n", - "index_0 = pd.MultiIndex.from_tuples(\n", - " [(100, 1), (100, 2), (101, 1), (101, 2)],\n", - " names=[\"month\", \"country\"],\n", - ")\n", - "index_1 = pd.MultiIndex.from_tuples(\n", - " [(101, 1), (101, 2), (102, 1), (102, 2)],\n", - " names=[\"month\", \"country\"],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# Actual data\n", - "df_actual = pd.DataFrame(\n", - " {\n", - " \"lr_target\": [0, 1, 1, 2, 2, 3, 3, 4],\n", - " \"covariate_1\": [3, 2, 4, 5, 2, 6, 8, 5],\n", - " },\n", - " index=index,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# Point predictions\n", - "df1_point = pd.DataFrame({\"pred_lr_target\": [1, 3, 5, 7]}, index=index_0)\n", - "df2_point = pd.DataFrame({\"pred_lr_target\": [2, 4, 6, 8]}, index=index_1)\n", - "dfs_point = [df1_point, df2_point]\n", - "\n", - "# Uncertainty\n", - "df1_uncertainty = pd.DataFrame(\n", - " {\"pred_lr_target\": [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6]]}, index=index_0\n", - ")\n", - "df2_uncertainty = pd.DataFrame(\n", - " {\"pred_lr_target\": [[4, 6, 8], [5, 7, 9], [6, 8, 10], [7, 9, 11]]}, index=index_1\n", - ")\n", - "dfs_uncertainty = [df1_uncertainty, df2_uncertainty]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initializing EvaluationManager\n", - "Now, we can initialize the `EvaluationManager` with the metrics we want to evaluate.\n", - "Point evaluation supports the following metrics:\n", - "- RMSLE\n", - "- CRPS\n", - "- Average Precision\n", - "\n", - "Uncertainty evaluation supports the following metric:\n", - "- CRPS" - ] - }, - { 
- "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "metrics_list = ['RMSLE', 'CRPS', 'MIS'] # Add other metrics as needed\n", - "evaluation_manager = EvaluationManager(metrics_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Evaluating Predictions" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Metric MIS is not a default metric, skipping...\n", - "Metric MIS is not a default metric, skipping...\n", - "Metric MIS is not a default metric, skipping...\n" - ] - } - ], - "source": [ - "config = {\"steps\": [1, 2]}\n", - "point_evaluation_results = evaluation_manager.evaluate(df_actual, dfs_point, target='lr_target', config=config)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( RMSLE CRPS\n", - " month100 0.203422 0.5\n", - " month101 0.502668 2.0\n", - " month102 0.573874 3.5,\n", - " RMSLE CRPS\n", - " step01 0.182040 0.5\n", - " step02 0.636311 3.5,\n", - " RMSLE CRPS\n", - " ts00 0.510800 2.0\n", - " ts01 0.420849 2.0)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "point_evaluation_results['month'][1], point_evaluation_results['step'][1], point_evaluation_results['time_series'][1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Metrics will be **ignored** if not in the supported metric list" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Metric RMSLE is not a default metric, skipping...\n", - "Metric RMSLE is not a default metric, skipping...\n", - "Metric RMSLE is not a default metric, skipping...\n" - ] - } - ], - "source": [ - "uncertainty_evaluation_results = 
evaluation_manager.evaluate(df_actual, dfs_uncertainty, target='lr_target', config=config)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( CRPS MIS\n", - " month100 0.555556 3.90\n", - " month101 2.333333 65.85\n", - " month102 4.111111 127.80,\n", - " CRPS MIS\n", - " step01 1.833333 45.85\n", - " step02 2.833333 85.85,\n", - " CRPS MIS\n", - " ts00 1.055556 23.9\n", - " ts01 3.611111 107.8)" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "uncertainty_evaluation_results['month'][1], uncertainty_evaluation_results['step'][1], uncertainty_evaluation_results['time_series'][1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you are only interested in one of the evaluation schemas, you can call the corresponding function" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Metric MIS is not a default metric, skipping...\n" - ] - } - ], - "source": [ - "# Get the evaluation type, i.e., uncertainty or point\n", - "actual = EvaluationManager.transform_data(\n", - " EvaluationManager.convert_to_array(df_actual, \"lr_target\"), 'lr_target'\n", - " )\n", - "predictions = [\n", - " EvaluationManager.transform_data(\n", - " EvaluationManager.convert_to_array(pred, \"pred_lr_target\"), \"pred_lr_target\"\n", - " )\n", - " for pred in dfs_point\n", - "]\n", - "is_uncertainty = EvaluationManager.get_evaluation_type(predictions, 'pred_lr_target')\n", - "month_point_evaluation_results = evaluation_manager.month_wise_evaluation(actual, predictions, target='lr_target', is_uncertainty=is_uncertainty)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " RMSLE CRPS\n", - "month100 0.203422 
0.5\n", - "month101 0.502668 2.0\n", - "month102 0.573874 3.5\n" - ] - } - ], - "source": [ - "print(month_point_evaluation_results[1])" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'step01': PointEvaluationMetrics(MSE=None, MSLE=None, RMSLE=0.18203984406117593, CRPS=0.5, AP=None, EMD=None, SD=None, pEMDiv=None, Pearson=None, Variogram=None, y_hat_bar=None),\n", - " 'step02': PointEvaluationMetrics(MSE=None, MSLE=None, RMSLE=0.636311445241193, CRPS=3.5, AP=None, EMD=None, SD=None, pEMDiv=None, Pearson=None, Variogram=None, y_hat_bar=None)}" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "point_evaluation_results['step'][0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "testenv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/using_native_api.py b/examples/using_native_api.py index 84bd00d..90075df 100644 --- a/examples/using_native_api.py +++ b/examples/using_native_api.py @@ -1,52 +1,47 @@ """ Example: Using the Native Evaluation API -This script demonstrates the modern, performant way to evaluate forecasts -using the NativeEvaluator and EvaluationFrame. This path is up to 14x -faster for probabilistic forecasts. +This script demonstrates how to evaluate forecasts using the +NativeEvaluator and EvaluationFrame with pure NumPy arrays. 
""" -import pandas as pd -from views_evaluation import PandasAdapter, NativeEvaluator +import numpy as np +from views_evaluation import EvaluationFrame, NativeEvaluator -# 1. Prepare dummy data (The legacy format) -index = pd.MultiIndex.from_product([[100, 101], [1, 2]], names=['month', 'unit']) -actuals = pd.DataFrame({'target': [0, 1, 0, 1]}, index=index) -# 3-sample ensemble predictions -preds = [ - pd.DataFrame({'pred_target': [[0.1, 0.2, 0.05], [0.8, 0.9, 0.7]]}, index=index[:2]), - pd.DataFrame({'pred_target': [[0.1, 0.15, 0.2], [0.7, 0.8, 0.9]]}, index=index[2:]) -] +# 1. Prepare data as NumPy arrays +n = 4 +y_true = np.array([0.0, 1.0, 0.0, 1.0]) +y_pred = np.array([ + [0.1, 0.2, 0.05], + [0.8, 0.9, 0.7], + [0.1, 0.15, 0.2], + [0.7, 0.8, 0.9], +]) # shape (4, 3) β€” 3-sample ensemble -# 2. Configure metrics +identifiers = { + 'time': np.array([100, 100, 101, 101]), + 'unit': np.array([1, 2, 1, 2]), + 'origin': np.array([0, 0, 1, 1]), + 'step': np.array([1, 1, 1, 1]), +} + +# 2. Construct EvaluationFrame (validates shapes, NaN, identifiers) +ef = EvaluationFrame(y_true, y_pred, identifiers, metadata={'target': 'target'}) +print(f"EvaluationFrame: {ef.n_rows} rows, {ef.n_samples} samples") + +# 3. Configure and evaluate config = { 'steps': [1], 'regression_targets': ['target'], - 'regression_sample_metrics': ['CRPS', 'Ignorance'] + 'regression_sample_metrics': ['CRPS', 'Ignorance'], } - -print("--- Step 1: Adapt Data ---") -# The adapter performs alignment, truth-duplication, and list-extraction. -# This step can be moved to the Orchestration layer (Pipeline Core) in the future. -ef = PandasAdapter.from_dataframes(actuals, preds, "target") -print(f"Adapted data: {ef.n_rows} rows, {ef.n_samples} samples") - -print("") -print("--- Step 2: Evaluate ---") -# The evaluator is stateless and pure math (no pandas). 
evaluator = NativeEvaluator(config) report = evaluator.evaluate(ef) print("Evaluation complete.") -print("") -print("--- Step 3: Export Results ---") -# Convert to DataFrames only when needed for reporting -month_df = report.to_dataframe(schema="month") -print("Month-wise results:") -print(month_df) +# 4. Export results +print("\nMonth-wise results (dict):") +print(report.to_dict()['schemas']['month']) -# Or export to pure dict for JSON serialization -json_friendly_dict = report.to_dict() -print("") -print("JSON Export (Sample):") -print(f"Target: {json_friendly_dict['target']}") -print(f"Schemas found: {list(json_friendly_dict['schemas'].keys())}") +print("\nFull export:") +d = report.to_dict() +print(f"Target: {d['target']}, Schemas: {list(d['schemas'].keys())}") diff --git a/pyproject.toml b/pyproject.toml index 99d03c4..bed92eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "views_evaluation" -version = "0.4.0" +version = "0.5.0" description = "" authors = [ "Xiaolong Sun ", @@ -13,7 +13,10 @@ readme = "README.md" python = ">=3.11,<3.15" scikit-learn = "^1.6.0" numpy = "^1.26.4" -pandas = "^1.5.3" # PHASE-3-DELETE: will become optional once EvaluationManager/PandasAdapter are removed +pandas = {version = "^1.5.3", optional = true} + +[tool.poetry.extras] +dataframe = ["pandas"] [tool.poetry.group.dev.dependencies] pytest = "^8.3.3" diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index 7144b73..0000000 --- a/tests/conftest.py +++ /dev/null @@ -1,64 +0,0 @@ -""" -PHASE-3-DELETE: Pytest fixtures for the legacy EvaluationManager test suite. -Will be deleted when Phase 3 of the orchestrator migration is complete. 
-See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md -""" -import pandas as pd -import numpy as np -import pytest - -# A fixture to generate mock data for tests -@pytest.fixture -def mock_data_factory(): - def _generate( - target_name="lr_ged_sb_best", - point_predictions_as_list=True, - num_sequences=2, - num_steps=3, - num_locations=2, - start_month=500, - ): - pred_col_name = f"pred_{target_name}" - loc_id_name = "country_id" - - # 1. Actuals DataFrame - actuals_index = pd.MultiIndex.from_product( - [range(start_month, start_month + num_sequences + num_steps), range(num_locations)], - names=['month_id', loc_id_name] - ) - actuals = pd.DataFrame( - {target_name: np.random.randint(0, 50, size=len(actuals_index))}, - index=actuals_index - ) - - # 2. Predictions List - predictions_list = [] - for i in range(num_sequences): - preds_index = pd.MultiIndex.from_product( - [range(start_month + i, start_month + i + num_steps), range(num_locations)], - names=['month_id', loc_id_name] - ) - - if point_predictions_as_list: - # Canonical format: list of single floats - pred_values = [[val] for val in np.random.rand(len(preds_index)) * 50] - else: - # Non-canonical format: raw floats - pred_values = [val for val in np.random.rand(len(preds_index)) * 50] - - preds = pd.DataFrame( - {pred_col_name: pred_values}, - index=preds_index - ) - predictions_list.append(preds) - - # 3. 
Config - config = { - 'steps': list(range(1, num_steps + 1)), - 'regression_targets': [target_name], - 'regression_point_metrics': ['MSE', 'RMSLE', 'Pearson'], - } - - return actuals, predictions_list, target_name, config - - return _generate diff --git a/tests/test_adversarial_inputs.py b/tests/test_adversarial_inputs.py index 3a9110d..21c890c 100644 --- a/tests/test_adversarial_inputs.py +++ b/tests/test_adversarial_inputs.py @@ -1,229 +1,16 @@ -import pandas as pd import numpy as np import pytest -from views_evaluation.evaluation.evaluation_manager import EvaluationManager from views_evaluation.evaluation.evaluation_frame import EvaluationFrame from views_evaluation.evaluation.native_evaluator import NativeEvaluator -@pytest.fixture -def adversarial_data_factory(mock_data_factory): - """A fixture that extends the mock_data_factory to create adversarial data.""" - def _generate( - target_name="lr_ged_sb_best", - num_sequences=1, - num_steps=1, - num_locations=1, - start_month=500, - actuals_value=10.0, - predictions_value=[[10.0]], - ): - pred_col_name = f"pred_{target_name}" - loc_id_name = "country_id" - - # 1. Actuals DataFrame - actuals_index = pd.MultiIndex.from_product( - [range(start_month, start_month + num_steps), range(num_locations)], - names=['month_id', loc_id_name] - ) - actuals = pd.DataFrame( - {target_name: actuals_value}, - index=actuals_index - ) - - # 2. Predictions List - predictions_list = [] - preds_index = pd.MultiIndex.from_product( - [range(start_month, start_month + num_steps), range(num_locations)], - names=['month_id', loc_id_name] - ) - preds = pd.DataFrame( - {pred_col_name: predictions_value}, - index=preds_index - ) - predictions_list.append(preds) - - # 3. 
Config - config = { - 'steps': list(range(1, num_steps + 1)), - 'regression_targets': [target_name], - 'regression_point_metrics': ['RMSLE'], - } - - return actuals, predictions_list, target_name, config - - return _generate - - -class TestAdversarialInputs: - """ - A test suite for Phase 2: Adversarial and Edge-Case Testing. - These tests probe for robustness and predictable failure modes. - """ - - def test_corrupted_numerical_data_nan_in_actuals(self, adversarial_data_factory): - """ - Tests behavior when np.nan is present in the actuals data. - Expected: A ValueError should be raised by the underlying sklearn metric. - """ - # Arrange - actuals, predictions, target, config = adversarial_data_factory( - actuals_value=np.nan, - predictions_value=[[10.0]] - ) - manager = EvaluationManager() - - # Act & Assert - with pytest.raises(ValueError, match="Input contains NaN"): - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - - def test_corrupted_numerical_data_nan_in_predictions(self, adversarial_data_factory): - """ - Tests behavior when np.nan is present in the predictions data. - Expected: A ValueError should be raised by the underlying sklearn metric. - """ - # Arrange - actuals, predictions, target, config = adversarial_data_factory( - actuals_value=10.0, - predictions_value=[[np.nan]] - ) - manager = EvaluationManager() - - # Act & Assert - with pytest.raises(ValueError, match="Input contains NaN"): - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - - def test_corrupted_numerical_data_inf_in_actuals(self, adversarial_data_factory): - """ - Tests behavior when np.inf is present in the actuals data. - Expected: A ValueError should be raised. 
- """ - # Arrange - actuals, predictions, target, config = adversarial_data_factory( - actuals_value=np.inf, - predictions_value=[[10.0]] - ) - manager = EvaluationManager() - - # Act & Assert - with pytest.raises(ValueError, match="Input contains infinity"): - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - - def test_corrupted_numerical_data_inf_in_predictions(self, adversarial_data_factory): - """ - Tests behavior when np.inf is present in the predictions data. - Expected: A ValueError should be raised. - """ - # Arrange - actuals, predictions, target, config = adversarial_data_factory( - actuals_value=10.0, - predictions_value=[[np.inf]] - ) - manager = EvaluationManager() - - # Act & Assert - with pytest.raises(ValueError, match="Input contains infinity"): - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - - def test_malformed_structural_data_empty_predictions_list(self, adversarial_data_factory): - """ - Tests behavior when an empty list is passed for predictions. - Expected: A ValueError should be raised by pandas.concat. - """ - # Arrange - actuals, _, target, config = adversarial_data_factory() - empty_predictions = [] - manager = EvaluationManager() - - # Act & Assert - with pytest.raises(ValueError, match="No objects to concatenate"): - manager.evaluate( - actual=actuals, - predictions=empty_predictions, - target=target, - config=config - ) - - def test_malformed_structural_data_empty_actuals_df(self, adversarial_data_factory): - """ - Tests behavior when an empty DataFrame is passed for actuals. - Expected: A KeyError should be raised when trying to access the target column. 
- """ - # Arrange - _, predictions, target, config = adversarial_data_factory() - empty_actuals = pd.DataFrame() - manager = EvaluationManager() - - # Act & Assert - with pytest.raises(KeyError): - manager.evaluate( - actual=empty_actuals, - predictions=predictions, - target=target, - config=config - ) - - def test_malformed_structural_data_non_overlapping_indices(self, adversarial_data_factory): - """ - Tests behavior when actuals and predictions have no overlapping indices. - Expected: A ValueError should be raised by np.concatenate in the metric calculator. - """ - # Arrange - # Create actuals starting at month 500 - actuals, _, target, config = adversarial_data_factory(start_month=500, num_locations=1) - - # Create predictions starting at month 600, ensuring no overlap - pred_col_name = f"pred_{target}" - # Correctly create a 2-level MultiIndex - preds_index = pd.MultiIndex.from_product( - [range(600, 602), [10]], # Non-overlapping range for month_id - names=['month_id', "country_id"] - ) - preds = pd.DataFrame({pred_col_name: [[10.0]] * 2}, index=preds_index) - predictions_non_overlapping = [preds] - - manager = EvaluationManager() - - # Act & Assert - with pytest.raises(ValueError, match="need at least one array to concatenate"): - manager.evaluate( - actual=actuals, - predictions=predictions_non_overlapping, - target=target, - config=config - ) - class TestAdversarialNativeInputs: """ Adversarial tests targeting EvaluationFrame + NativeEvaluator directly. - These tests are the native-path equivalents of TestAdversarialInputs above. - They must survive Phase 3 of the orchestrator migration (ADR-011), when - EvaluationManager and PandasAdapter are removed from this repository. - - Every test here asserts ADR-013 (Fail-Loud) behaviour from the entry - points that will remain after the migration. + Every test here asserts ADR-013 (Fail-Loud) behaviour from the + permanent native-path entry points. 
""" @staticmethod @@ -317,3 +104,23 @@ def test_unimplemented_metric_raises_clear_value_error(self): with pytest.raises(ValueError, match="not yet implemented"): NativeEvaluator(config).evaluate(ef) + def test_nan_rejected_before_brier_executes(self): + """Defense-in-depth: EvaluationFrame rejects NaN so Brier's NaN-swallowing + comparison semantics can never be triggered through the normal evaluation path.""" + with pytest.raises(ValueError, match="NaN"): + EvaluationFrame( + y_true=np.array([np.nan, 1.0]), + y_pred=np.array([[0.5], [0.8]]), + identifiers=self._simple_ids(2), + metadata={'target': 'cls_target'}, + ) + + def test_inf_rejected_before_metric_executes(self): + """Defense-in-depth: EvaluationFrame rejects Inf before any metric function runs.""" + with pytest.raises(ValueError, match="infinity"): + EvaluationFrame( + y_true=np.array([np.inf, 1.0]), + y_pred=np.array([[0.5], [0.8]]), + identifiers=self._simple_ids(2), + metadata={'target': 'cls_target'}, + ) diff --git a/tests/test_data_contract.py b/tests/test_data_contract.py deleted file mode 100644 index 7e53490..0000000 --- a/tests/test_data_contract.py +++ /dev/null @@ -1,80 +0,0 @@ -""" -PHASE-3-DELETE: Tests data contract enforcement for the legacy EvaluationManager path. -Will be deleted when Phase 3 of the orchestrator migration is complete. 
-See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md -""" -import pandas as pd -import pytest -from views_evaluation.evaluation.evaluation_manager import EvaluationManager - -@pytest.fixture -def mock_data(): - target = "lr_target" - index = pd.MultiIndex.from_tuples([(100, 1), (101, 1)], names=["month", "id"]) - actual = pd.DataFrame({target: [10, 20]}, index=index) - config = { - "steps": [1, 2], - "regression_targets": [target], - "regression_point_metrics": ["MSE"], - } - return actual, target, config, index - -def test_missing_pred_column(mock_data): - actual, target, config, index = mock_data - # Column name is wrong - pred_df = pd.DataFrame({"wrong_name": [[10.5], [19.5]]}, index=index) - manager = EvaluationManager() - - with pytest.raises(ValueError, match=f"must contain the column named 'pred_{target}'"): - manager.evaluate(actual, [pred_df], target, config) - -def test_extra_columns_raises_error(mock_data): - """Verify that extra columns now raise a ValueError per the documentation.""" - actual, target, config, index = mock_data - pred_df = pd.DataFrame({ - f"pred_{target}": [[10.5], [19.5]], - "extra_garbage": [1, 2] - }, index=index) - manager = EvaluationManager() - - with pytest.raises(ValueError, match="must contain exactly one column"): - manager.evaluate(actual, [pred_df], target, config) - -def test_duplicate_pred_columns_raises_error(mock_data): - """Verify that duplicate target columns cause a failure (currently a crash).""" - actual, target, config, index = mock_data - df1 = pd.DataFrame({f"pred_{target}": [[10.5], [19.5]]}, index=index) - df2 = pd.DataFrame({f"pred_{target}": [[11.0], [20.0]]}, index=index) - pred_df = pd.concat([df1, df2], axis=1) - - manager = EvaluationManager() - - # We expect a failure. Note: Ideally we want a custom ValueError from our validator. - # Currently it raises a numpy/pandas ValueError during calculation. 
- with pytest.raises(ValueError): - manager.evaluate(actual, [pred_df], target, config) - -def test_zero_index_overlap_graceful_failure(mock_data): - """Verify behavior when actuals and predictions have no common months.""" - actual, target, config, _ = mock_data - # Preds are for months 200, 201 (no overlap with 100, 101) - index_no_overlap = pd.MultiIndex.from_tuples([(200, 1), (201, 1)], names=["month", "id"]) - pred_df = pd.DataFrame({f"pred_{target}": [[10.5], [19.5]]}, index=index_no_overlap) - - manager = EvaluationManager() - - # Currently, this crashes in np.concatenate inside the metric calculator. - # We want it to either raise a clear error or return NaNs. - with pytest.raises((ValueError, KeyError)): - manager.evaluate(actual, [pred_df], target, config) - -def test_mixed_point_and_sample_types(mock_data): - actual, target, config, index = mock_data - # First is point, second is sample - pred1 = pd.DataFrame({f"pred_{target}": [[10.5], [19.5]]}, index=index) - pred2 = pd.DataFrame({f"pred_{target}": [[10, 11, 12], [19, 20, 21]]}, index=index) - - manager = EvaluationManager() - - with pytest.raises(ValueError, match="Mix of evaluation types detected"): - manager.evaluate(actual, [pred1, pred2], target, config) diff --git a/tests/test_documentation_contracts.py b/tests/test_documentation_contracts.py deleted file mode 100644 index 8c096d2..0000000 --- a/tests/test_documentation_contracts.py +++ /dev/null @@ -1,286 +0,0 @@ -""" -PHASE-3-DELETE: Tests documentation contracts for the legacy EvaluationManager path. -Will be deleted when Phase 3 of the orchestrator migration is complete. -See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md -""" -import pandas as pd -import numpy as np -import pytest - -from views_evaluation.evaluation.evaluation_manager import EvaluationManager - - -class TestDocumentationContracts: - """ - A test suite to verify the claims made in the project's documentation. 
- """ - - def test_eval_lib_imp_actuals_schema_prefix_requirement_succeeds(self, mock_data_factory): - """ - Verifies Section 3.1 of eval_lib_imp.md. - Claim: Evaluation succeeds if the target name has a valid prefix. - """ - # Arrange - target_with_prefix = "lr_ged_sb_best" - actuals, predictions, target, config = mock_data_factory(target_name=target_with_prefix) - manager = EvaluationManager() - - # Act & Assert - try: - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - except ValueError as e: - pytest.fail(f"Evaluation failed unexpectedly with a valid prefix: {e}") - - def test_eval_lib_imp_actuals_schema_prefix_requirement_fails(self, mock_data_factory): - """ - Verifies updated behaviour from Section 3.1 of eval_lib_imp.md. - Old claim: Evaluation fails if the target name is missing a valid prefix. - New behaviour: The new EvaluationManager no longer validates prefixes in evaluate(). - transform_data() issues a warning for unknown prefixes but applies an identity - transform and continues. Evaluation therefore *succeeds* with an unknown prefix as - long as the target is declared in the config. - """ - # Arrange - target_without_prefix = "ged_sb_best" - actuals, predictions, target, config = mock_data_factory(target_name=target_without_prefix) - manager = EvaluationManager() - - # Act & Assert β€” should now succeed (prefix validation removed from evaluate()) - try: - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - except Exception as e: - pytest.fail( - f"evaluate() raised unexpectedly for a target with no recognised prefix: {e}" - ) - - def test_eval_lib_imp_predictions_schema_point_canonical_succeeds(self, mock_data_factory): - """ - Verifies Section 3.2 of eval_lib_imp.md. - Claim: Evaluation succeeds if point predictions are canonical (list of single float). 
- """ - # Arrange - actuals, predictions, target, config = mock_data_factory(point_predictions_as_list=True) - manager = EvaluationManager() - - # Act & Assert - try: - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - except ValueError as e: - pytest.fail(f"Evaluation failed unexpectedly with canonical point predictions: {e}") - - def test_eval_lib_imp_predictions_schema_point_non_canonical_succeeds_due_to_implicit_conversion(self, mock_data_factory): - """ - Verifies Section 3.2 of eval_lib_imp.md by demonstrating a divergence. - Claim: Documentation states evaluation fails if point predictions are non-canonical (raw float). - Observed: Evaluation *succeeds* due to implicit conversion in EvaluationManager, making documentation inaccurate. - """ - # Arrange - actuals, predictions, target, config = mock_data_factory(point_predictions_as_list=False) - manager = EvaluationManager() - - # Act & Assert - try: - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - except ValueError as e: - pytest.fail(f"Evaluation *should have succeeded* with non-canonical point predictions due to implicit conversion, but failed with: {e}") - - def test_evaluation_manager_implicitly_converts_raw_floats_to_arrays(self, mock_data_factory): - """ - Explicitly verifies the implicit conversion of raw float predictions to np.ndarray([float]) - by EvaluationManager's internal _process_data method. - This behavior contradicts eval_lib_imp.md's claim that raw floats should cause an error. 
- """ - # Arrange - actuals, predictions, target, config = mock_data_factory(point_predictions_as_list=False) - manager = EvaluationManager() - - # Act - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - - # Assert - # After evaluate, internal predictions should be processed - processed_predictions = manager.predictions - # Check that the first value in the first DataFrame of processed_predictions is now a np.ndarray - assert isinstance(processed_predictions[0].iloc[0, 0], np.ndarray) - # Check that its length is 1 (single element) - assert len(processed_predictions[0].iloc[0, 0]) == 1 - - def test_eval_lib_imp_api_contract_missing_steps_config_fails(self, mock_data_factory): - """ - Verifies Section 4.2 of eval_lib_imp.md. - Claim: The `evaluate` method's `config` parameter *must* contain the key 'steps'. - """ - # Arrange - actuals, predictions, target, _ = mock_data_factory() # Use _ to ignore the default config - manager = EvaluationManager() - invalid_config = {} # Missing 'steps' key - - # Act & Assert - with pytest.raises(KeyError, match="'steps'"): - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=invalid_config - ) - - def test_eval_lib_imp_data_state_coherency_no_inverse_transform(self, mock_data_factory): - """ - Verifies Section 3.4 of eval_lib_imp.md. - Claim: EvaluationManager does NOT perform inverse transformations on prediction data (producer's responsibility). 
- """ - # Arrange - target_name = "lr_some_var" # lr_ prefix means raw, no transform by EM - pred_col_name = f"pred_{target_name}" - loc_id_name = "country_id" - - # Create actuals (raw counts) - actuals_index = pd.MultiIndex.from_product([[500], [10]], names=['month_id', loc_id_name]) - actuals = pd.DataFrame( - {target_name: [100]}, # Actual value is 100 - index=actuals_index - ) - - # Create predictions that are log-transformed, but named as 'lr_' to indicate raw input - # So, if EM were to inverse transform, it would be wrong, but it shouldn't inverse transform - predictions_list = [] - preds_index = pd.MultiIndex.from_product([[500], [10]], names=['month_id', loc_id_name]) - # Prediction of log(100+1) - 1, which is approximately 4.6 (ln(101)-1) - # If EM doesn't inverse transform, RMSLE will be calculated with 4.6 vs 100 - # If EM incorrectly inverse transformed, it would see 4.6, transform it back, then calculate RMSLE - pred_values_log_transformed = [[np.log1p(100)]] # Represents log(100+1) - predictions_df = pd.DataFrame( - {pred_col_name: pred_values_log_transformed}, - index=preds_index - ) - predictions_list.append(predictions_df) - - manager = EvaluationManager() - - # We need a config with steps and the new required keys - config = { - 'steps': [1], - 'regression_targets': [target_name], - 'regression_point_metrics': ['RMSLE'], - } - - # Act - results = manager.evaluate( - actual=actuals, - predictions=predictions_list, - target=target_name, - config=config - ) - - # Assert - # Get the RMSLE for the step-wise evaluation - rmsle = results['step'][1]['RMSLE'][0] # Access the DataFrame, then RMSLE column, then first value - - # If EM incorrectly inverse-transformed, RMSLE would be close to 0 - # If EM correctly *doesn't* inverse-transform, RMSLE is calculated with actual=100 and pred=log1p(100) - # log1p(100) is approx 4.615 - # RMSLE(100, 4.615) is large. - - # A simple check: if RMSLE is very small, it means inverse transform *did* happen. 
- # We expect it to be large. - assert rmsle > 1.0 # Arbitrary large threshold to show it's not a small error - - def test_r2darts2_report_point_prediction_format_succeeds(self, mock_data_factory): - """ - Verifies Section B.1 of the plan (from r2darts2_full_imp_report.md). - Claim: views-r2darts2 produces point predictions as a list (e.g., [[25.5]]). - """ - # Arrange - # Use mock_data_factory with point_predictions_as_list=True to simulate r2darts2 output - actuals, predictions, target, config = mock_data_factory(point_predictions_as_list=True) - manager = EvaluationManager() - - # Act & Assert - try: - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - except ValueError as e: - pytest.fail(f"Evaluation failed unexpectedly when processing r2darts2-like canonical point predictions: {e}") - - def test_stepshifter_report_point_prediction_format_succeeds_despite_raw_float_output(self, mock_data_factory): - """ - Verifies Section C.1 of the plan (from stepshifter_full_imp_report.md). - Claim: views-stepshifter produces point predictions as raw np.float64 values (contradicts eval_lib_imp.md). - Observed: EvaluationManager implicitly converts and processes successfully. - """ - # Arrange - # Use mock_data_factory with point_predictions_as_list=False to simulate stepshifter output - actuals, predictions, target, config = mock_data_factory(point_predictions_as_list=False) - manager = EvaluationManager() - - # Act & Assert - try: - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - except ValueError as e: - pytest.fail(f"Evaluation *should have succeeded* with stepshifter-like raw float predictions due to implicit conversion, but failed with: {e}") - - def test_stepshifter_report_reconciliation_fix_succeeds(self, mock_data_factory): - """ - Verifies Section C.2 of the plan (from stepshifter_full_imp_report.md). 
- Claim: Applying the reconciliation fix (float -> list) to stepshifter's raw float output - should allow EvaluationManager to process the data successfully. - """ - # Arrange - # Simulate stepshifter output (raw floats) - actuals, predictions_raw_floats, target, config = mock_data_factory(point_predictions_as_list=False) - manager = EvaluationManager() - - # Apply the reconciliation logic as described in the report - # "Wrap every cell value in a list to conform to the canonical standard." - reconciled_predictions = [df.applymap(lambda x: [x]) for df in predictions_raw_floats] - - # Act & Assert - try: - manager.evaluate( - actual=actuals, - predictions=reconciled_predictions, - target=target, - config=config - ) - except ValueError as e: - pytest.fail(f"Evaluation failed unexpectedly after applying stepshifter's reconciliation fix: {e}") - - - - - - diff --git a/tests/test_evaluation_frame.py b/tests/test_evaluation_frame.py index 000de43..5f9724c 100644 --- a/tests/test_evaluation_frame.py +++ b/tests/test_evaluation_frame.py @@ -218,6 +218,18 @@ def test_y_pred_row_mismatch_raises(self): with pytest.raises(ValueError, match="mismatch"): EvaluationFrame(np.ones(5), np.ones((4, 1)), _make_identifiers(5)) + def test_y_pred_1d_raises(self): + """1D y_pred must be rejected β€” callers must provide (N, S) shape.""" + n = 3 + with pytest.raises(ValueError, match="y_pred must be 2D"): + EvaluationFrame(np.ones(n), np.ones(n), _make_identifiers(n)) + + def test_y_pred_3d_raises(self): + """3D y_pred must be rejected.""" + n = 2 + with pytest.raises(ValueError, match="y_pred must be 2D"): + EvaluationFrame(np.ones(n), np.ones((n, 3, 2)), _make_identifiers(n)) + def test_nan_in_y_true_raises(self): n = 4 with pytest.raises(ValueError, match="NaN"): diff --git a/tests/test_evaluation_manager.py b/tests/test_evaluation_manager.py deleted file mode 100644 index da4ad53..0000000 --- a/tests/test_evaluation_manager.py +++ /dev/null @@ -1,515 +0,0 @@ -""" -PHASE-3-DELETE: Tests 
the legacy EvaluationManager path. -Will be deleted when Phase 3 of the orchestrator migration is complete. -See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md -""" -import logging -import pandas as pd -import numpy as np -import pytest -from sklearn.metrics import root_mean_squared_log_error, average_precision_score -import properscoring as ps -from views_evaluation.evaluation.evaluation_manager import EvaluationManager -from views_evaluation.evaluation.native_metric_calculators import ( - REGRESSION_POINT_NATIVE, - REGRESSION_SAMPLE_NATIVE, -) -from views_evaluation.evaluation.metrics import ( - RegressionPointEvaluationMetrics, - RegressionSampleEvaluationMetrics, -) - - -@pytest.fixture -def mock_index(): - index_0 = pd.MultiIndex.from_tuples( - [ - (100, 1), - (100, 2), - (101, 1), - (101, 2), - (102, 1), - (102, 2), - ], - names=["month", "country"], - ) - index_1 = pd.MultiIndex.from_tuples( - [ - (101, 1), - (101, 2), - (102, 1), - (102, 2), - (103, 1), - (103, 2), - ], - names=["month", "country"], - ) - return [index_0, index_1] - - -@pytest.fixture -def mock_actual(): - index = pd.MultiIndex.from_tuples( - [ - (99, 1), - (99, 2), - (100, 1), - (100, 2), - (101, 1), - (101, 2), - (102, 1), - (102, 2), - (103, 1), - (103, 2), - (104, 1), - (104, 2), - ], - names=["month", "country"], - ) - df = pd.DataFrame( - { - "target": [0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0, 5.0, 6.0], - "covariate_1": [3.0, 2.0, 4.0, 5.0, 2.0, 6.0, 8.0, 5.0, 3.0, 2.0, 9.0, 4.0], - }, - index=index, - ) - return EvaluationManager.convert_to_array(df, "target") - - -@pytest.fixture -def mock_point_predictions(mock_index): - df1 = pd.DataFrame({"pred_target": [1.0, 3.0, 5.0, 7.0, 9.0, 7.0]}, index=mock_index[0]) - df2 = pd.DataFrame({"pred_target": [2.0, 4.0, 6.0, 8.0, 10.0, 8.0]}, index=mock_index[1]) - return [EvaluationManager.convert_to_array(df1, "pred_target"), EvaluationManager.convert_to_array(df2, "pred_target")] - - -@pytest.fixture 
-def mock_sample_predictions(mock_index): - df1 = pd.DataFrame( - { - "pred_target": [ - [1.0, 2.0, 3.0], - [2.0, 3.0, 4.0], - [3.0, 4.0, 5.0], - [4.0, 5.0, 6.0], - [5.0, 6.0, 7.0], - [6.0, 7.0, 8.0], - ] - }, - index=mock_index[0], - ) - df2 = pd.DataFrame( - { - "pred_target": [ - [4.0, 6.0, 8.0], - [5.0, 7.0, 9.0], - [6.0, 8.0, 10.0], - [7.0, 9.0, 11.0], - [8.0, 10.0, 12.0], - [9.0, 11.0, 13.0], - ] - }, - index=mock_index[1], - ) - return [EvaluationManager.convert_to_array(df1, "pred_target"), EvaluationManager.convert_to_array(df2, "pred_target")] - - -def test_validate_dataframes_valid_type(mock_point_predictions): - with pytest.raises(TypeError): - EvaluationManager.validate_predictions( - mock_point_predictions[0], "target" - ) - - -def test_validate_dataframes_valid_columns(mock_point_predictions): - with pytest.raises(ValueError): - EvaluationManager.validate_predictions( - mock_point_predictions, "y" - ) - -def test_get_evaluation_type(): - # Test case 1: All DataFrames for sample evaluation - predictions_sample = [ - pd.DataFrame({'pred_target': [[1.0, 2.0], [3.0, 4.0]]}), - pd.DataFrame({'pred_target': [[5.0, 6.0], [7.0, 8.0]]}), - ] - assert EvaluationManager.get_evaluation_type(predictions_sample, "pred_target") is True - - # Test case 2: All DataFrames for point evaluation - predictions_point = [ - pd.DataFrame({'pred_target': [[1.0], [2.0]]}), - pd.DataFrame({'pred_target': [[3.0], [4.0]]}), - ] - assert EvaluationManager.get_evaluation_type(predictions_point, "pred_target") is False - - # Test case 3: Mixed evaluation types - predictions_mixed = [ - pd.DataFrame({'pred_target': [[1.0, 2.0], [3.0, 4.0]]}), - pd.DataFrame({'pred_target': [[5.0], [6.0]]}), - ] - with pytest.raises(ValueError): - EvaluationManager.get_evaluation_type(predictions_mixed, "pred_target") - - # Test case 4: Single element lists - predictions_single_element = [ - pd.DataFrame({'pred_target': [[1.0], [2.0]]}), - pd.DataFrame({'pred_target': [[3.0], [4.0]]}), - ] - assert 
EvaluationManager.get_evaluation_type(predictions_single_element, "pred_target") is False - - -def test_match_actual_pred_point( - mock_actual, mock_point_predictions, mock_sample_predictions, mock_index -): - df_matched = [ - pd.DataFrame({"target": [[1.0], [2.0], [2.0], [3.0], [3.0], [4.0]]}, index=mock_index[0]), - pd.DataFrame({"target": [[2.0], [3.0], [3.0], [4.0], [4.0], [5.0]]}, index=mock_index[1]), - ] - for i in range(len(df_matched)): - df_matched_actual_point, df_matched_point = ( - EvaluationManager._match_actual_pred( - mock_actual, mock_point_predictions[i], "target" - ) - ) - df_matched_actual_sample, df_matched_sample = ( - EvaluationManager._match_actual_pred( - mock_actual, mock_sample_predictions[i], "target" - ) - ) - assert df_matched[i].equals(df_matched_actual_point) - assert df_matched_point.equals(mock_point_predictions[i]) - assert df_matched[i].equals(df_matched_actual_sample) - assert df_matched_sample.equals(mock_sample_predictions[i]) - - -def test_split_dfs_by_step(mock_point_predictions, mock_sample_predictions): - df_splitted_point = [ - EvaluationManager.convert_to_array(pd.DataFrame( - {"pred_target": [[1.0], [3.0], [2.0], [4.0]]}, - index=pd.MultiIndex.from_tuples( - [(100, 1), (100, 2), (101, 1), (101, 2)], names=["month", "country"] - ), - ), "pred_target"), - EvaluationManager.convert_to_array(pd.DataFrame( - {"pred_target": [[5.0], [7.0], [6.0], [8.0]]}, - index=pd.MultiIndex.from_tuples( - [(101, 1), (101, 2), (102, 1), (102, 2)], names=["month", "country"] - ), - ), "pred_target"), - EvaluationManager.convert_to_array(pd.DataFrame( - {"pred_target": [[9.0], [7.0], [10.0], [8.0]]}, - index=pd.MultiIndex.from_tuples( - [(102, 1), (102, 2), (103, 1), (103, 2)], names=["month", "country"] - ), - ), "pred_target"), - ] - df_splitted_sample = [ - EvaluationManager.convert_to_array(pd.DataFrame( - {"pred_target": [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [4.0, 6.0, 8.0], [5.0, 7.0, 9.0]]}, - index=pd.MultiIndex.from_tuples( - [(100, 1), 
(100, 2), (101, 1), (101, 2)], names=["month", "country"] - ), - ), "pred_target"), - EvaluationManager.convert_to_array(pd.DataFrame( - {"pred_target": [[3.0, 4.0, 5.0], [4.0, 5.0, 6.0], [6.0, 8.0, 10.0], [7.0, 9.0, 11.0]]}, - index=pd.MultiIndex.from_tuples( - [(101, 1), (101, 2), (102, 1), (102, 2)], names=["month", "country"] - ), - ), "pred_target"), - EvaluationManager.convert_to_array(pd.DataFrame( - {"pred_target": [[5.0, 6.0, 7.0], [6.0, 7.0, 8.0], [8.0, 10.0, 12.0], [9.0, 11.0, 13.0]]}, - index=pd.MultiIndex.from_tuples( - [(102, 1), (102, 2), (103, 1), (103, 2)], names=["month", "country"] - ), - ), "pred_target"), - ] - df_splitted_point_test = EvaluationManager._split_dfs_by_step( - mock_point_predictions - ) - df_splitted_sample_test = EvaluationManager._split_dfs_by_step( - mock_sample_predictions - ) - for df1, df2 in zip(df_splitted_point, df_splitted_point_test): - assert df1.equals(df2) - for df1, df2 in zip(df_splitted_sample, df_splitted_sample_test): - assert df1.equals(df2) - - -def test_step_wise_evaluation_point(mock_actual, mock_point_predictions): - manager = EvaluationManager() - evaluation_dict, df_evaluation = manager.step_wise_evaluation( - mock_actual, mock_point_predictions, "target", [1, 2, 3], - metrics_list=["RMSLE"], - metric_functions=REGRESSION_POINT_NATIVE, - metrics_cls=RegressionPointEvaluationMetrics, - ) - - actuals = [[1, 2, 2, 3], [2, 3, 3, 4], [3, 4, 4, 5]] - preds = [[1, 3, 2, 4], [5, 7, 6, 8], [9, 7, 10, 8]] - df_evaluation_test = pd.DataFrame( - { - "RMSLE": [ - root_mean_squared_log_error(actual, pred) - for (actual, pred) in zip(actuals, preds) - ], - }, - index=["step01", "step02", "step03"], - ) - - assert ["step01", "step02", "step03"] == list(evaluation_dict.keys()) - assert np.allclose(df_evaluation, df_evaluation_test, atol=0.000001) - - -def test_step_wise_evaluation_sample(mock_actual, mock_sample_predictions): - manager = EvaluationManager() - evaluation_dict, df_evaluation = manager.step_wise_evaluation( 
- mock_actual, mock_sample_predictions, "target", [1, 2, 3], - metrics_list=["CRPS"], - metric_functions=REGRESSION_SAMPLE_NATIVE, - metrics_cls=RegressionSampleEvaluationMetrics, - ) - actuals = [[1, 2, 2, 3], [2, 3, 3, 4], [3, 4, 4, 5]] - preds = [ - [[1, 2, 3], [2, 3, 4], [4, 6, 8], [5, 7, 9]], - [[3, 4, 5], [4, 5, 6], [6, 8, 10], [7, 9, 11]], - [[5, 6, 7], [6, 7, 8], [8, 10, 12], [9, 11, 13]], - ] - df_evaluation_test = pd.DataFrame( - { - "CRPS": [ - ps.crps_ensemble(actual, pred).mean() - for (actual, pred) in zip(actuals, preds) - ], - }, - index=["step01", "step02", "step03"], - ) - - assert ["step01", "step02", "step03"] == list(evaluation_dict.keys()) - assert np.allclose(df_evaluation, df_evaluation_test, atol=0.000001) - - -def test_time_series_wise_evaluation_point(mock_actual, mock_point_predictions): - manager = EvaluationManager() - evaluation_dict, df_evaluation = manager.time_series_wise_evaluation( - mock_actual, mock_point_predictions, "target", - metrics_list=["RMSLE"], - metric_functions=REGRESSION_POINT_NATIVE, - metrics_cls=RegressionPointEvaluationMetrics, - ) - - actuals = [[1, 2, 2, 3, 3, 4], [2, 3, 3, 4, 4, 5]] - preds = [1, 3, 5, 7, 9, 7], [2, 4, 6, 8, 10, 8] - df_evaluation_test = pd.DataFrame( - { - "RMSLE": [ - root_mean_squared_log_error(actual, pred) - for (actual, pred) in zip(actuals, preds) - ], - }, - index=["ts00", "ts01"], - ) - - assert ["ts00", "ts01"] == list(evaluation_dict.keys()) - assert np.allclose(df_evaluation, df_evaluation_test, atol=0.000001) - - -def test_time_series_wise_evaluation_sample(mock_actual, mock_sample_predictions): - manager = EvaluationManager() - evaluation_dict, df_evaluation = manager.time_series_wise_evaluation( - mock_actual, mock_sample_predictions, "target", - metrics_list=["CRPS"], - metric_functions=REGRESSION_SAMPLE_NATIVE, - metrics_cls=RegressionSampleEvaluationMetrics, - ) - - actuals = [[1, 2, 2, 3, 3, 4], [2, 3, 3, 4, 4, 5]] - preds = [ - [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6], 
[5, 6, 7], [6, 7, 8]], - [[4, 6, 8], [5, 7, 9], [6, 8, 10], [7, 9, 11], [8, 10, 12], [9, 11, 13]], - ] - df_evaluation_test = pd.DataFrame( - { - "CRPS": [ - ps.crps_ensemble(actual, pred).mean() - for (actual, pred) in zip(actuals, preds) - ], - }, - index=["ts00", "ts01"], - ) - - assert ["ts00", "ts01"] == list(evaluation_dict.keys()) - assert np.allclose(df_evaluation, df_evaluation_test, atol=0.000001) - - -def test_month_wise_evaluation_point(mock_actual, mock_point_predictions): - manager = EvaluationManager() - evaluation_dict, df_evaluation = manager.month_wise_evaluation( - mock_actual, mock_point_predictions, "target", - metrics_list=["RMSLE"], - metric_functions=REGRESSION_POINT_NATIVE, - metrics_cls=RegressionPointEvaluationMetrics, - ) - - actuals = [[1, 2], [2, 3, 2, 3], [3, 4, 3, 4], [4, 5]] - preds = [[1, 3], [5, 7, 2, 4], [9, 7, 6, 8], [10, 8]] - df_evaluation_test = pd.DataFrame({ - "RMSLE": [ - root_mean_squared_log_error(actual, pred) - for (actual, pred) in zip(actuals, preds) - ], - }, - index=["month100", "month101", "month102", "month103"], - ) - - assert ["month100", "month101", "month102", "month103"] == list( - evaluation_dict.keys() - ) - assert np.allclose(df_evaluation, df_evaluation_test, atol=0.000001) - - -def test_month_wise_evaluation_sample(mock_actual, mock_sample_predictions): - manager = EvaluationManager() - evaluation_dict, df_evaluation = manager.month_wise_evaluation( - mock_actual, mock_sample_predictions, "target", - metrics_list=["CRPS"], - metric_functions=REGRESSION_SAMPLE_NATIVE, - metrics_cls=RegressionSampleEvaluationMetrics, - ) - - actuals = [[1, 2], [2, 3, 2, 3], [3, 4, 3, 4], [4, 5]] - preds = [ - [[1, 2, 3], [2, 3, 4]], - [[3, 4, 5], [4, 5, 6], [4, 6, 8], [5, 7, 9]], - [[5, 6, 7], [6, 7, 8], [6, 8, 10], [7, 9, 11]], - [[8, 10, 12], [9, 11, 13]], - ] - df_evaluation_test = pd.DataFrame( - { - "CRPS": [ - ps.crps_ensemble(actual, pred).mean() - for (actual, pred) in zip(actuals, preds) - ], - }, - 
index=["month100", "month101", "month102", "month103"], - ) - - assert ["month100", "month101", "month102", "month103"] == list( - evaluation_dict.keys() - ) - assert np.allclose(df_evaluation, df_evaluation_test, atol=0.000001) - - -def test_calculate_ap_point_predictions(): - """ - Test calculate_ap with pre-binarised actuals (0/1) and probability scores as predictions. - """ - # Binary actuals: 1 = positive class, 0 = negative class - actual_binary = [1, 0, 1, 0] - # Probability scores for the positive class - pred_scores = [0.9, 0.4, 0.3, 0.1] - - matched_actual = pd.DataFrame({'target': [[v] for v in actual_binary]}) - matched_pred = pd.DataFrame({'pred_target': [[v] for v in pred_scores]}) - - from views_evaluation.evaluation.native_metric_calculators import calculate_ap - ap_score = calculate_ap(matched_actual, matched_pred, 'target') - - expected_ap = average_precision_score(actual_binary, pred_scores) - - assert abs(ap_score - expected_ap) < 0.01 - - -def test_calculate_ap_sample_predictions(): - """ - Test calculate_ap with pre-binarised actuals and distributional probability scores. - Each prediction is a list of probability samples; actuals are 0/1. 
- """ - # Binary actuals: 1 = positive, 0 = negative - actual_binary = [1, 0, 1, 0] - # Distributional probability predictions (multiple samples per observation) - pred_scores = [ - [0.8, 0.9, 0.95], - [0.3, 0.4, 0.45], - [0.2, 0.25, 0.35], - [0.05, 0.1, 0.15], - ] - - matched_actual = pd.DataFrame({'target': [[v] for v in actual_binary]}) - matched_pred = pd.DataFrame({'pred_target': pred_scores}) - - from views_evaluation.evaluation.native_metric_calculators import calculate_ap - ap_score = calculate_ap(matched_actual, matched_pred, 'target') - - # Expected: actuals expanded to match samples, predictions are the raw samples - actual_expanded = np.repeat(actual_binary, [len(p) for p in pred_scores]) - pred_flat = np.concatenate(pred_scores) - expected_ap = average_precision_score(actual_expanded, pred_flat) - - assert abs(ap_score - expected_ap) < 0.01 - - -# --------------------------------------------------------------------------- -# New tests for config normalisation and validation -# --------------------------------------------------------------------------- - -def test_normalise_config_legacy_targets_key(caplog): - """Legacy 'targets' key should be translated to 'regression_targets' with a warning.""" - config = {'steps': [1], 'targets': ['my_target'], 'regression_point_metrics': ['MSE']} - with caplog.at_level(logging.WARNING): - normalised = EvaluationManager._normalise_config(config) - assert 'regression_targets' in normalised - assert 'targets' not in normalised - assert any('DEPRECATED' in r.message for r in caplog.records) - - -def test_normalise_config_legacy_metrics_key(caplog): - """Legacy 'metrics' key should be translated to 'regression_point_metrics' with a warning.""" - config = {'steps': [1], 'regression_targets': ['t'], 'metrics': ['MSE']} - with caplog.at_level(logging.WARNING): - normalised = EvaluationManager._normalise_config(config) - assert 'regression_point_metrics' in normalised - assert 'metrics' not in normalised - assert 
any('DEPRECATED' in r.message for r in caplog.records) - - -def test_validate_config_missing_steps(): - with pytest.raises(KeyError, match="steps"): - EvaluationManager._validate_config({'regression_targets': ['t'], 'regression_point_metrics': ['MSE']}) - - -def test_validate_config_missing_all_targets(): - with pytest.raises(KeyError): - EvaluationManager._validate_config({'steps': [1]}) - - -def test_validate_config_regression_targets_without_metrics(): - with pytest.raises(KeyError, match="regression_point_metrics"): - EvaluationManager._validate_config({'steps': [1], 'regression_targets': ['t']}) - - -def test_validate_config_classification_targets_without_metrics(): - with pytest.raises(KeyError, match="classification_point_metrics"): - EvaluationManager._validate_config({'steps': [1], 'classification_targets': ['t']}) - - -def test_evaluate_target_not_in_config(mock_actual, mock_point_predictions): - manager = EvaluationManager() - config = { - 'steps': [1, 2, 3], - 'regression_targets': ['some_other_target'], - 'regression_point_metrics': ['RMSLE'], - } - with pytest.raises(ValueError, match="not declared in config"): - manager.evaluate(mock_actual, mock_point_predictions, 'target', config) - - -def test_evaluate_invalid_metric_for_task_type(mock_actual, mock_point_predictions): - """AP is a classification metric β€” declaring it under regression_point_metrics should raise.""" - manager = EvaluationManager() - config = { - 'steps': [1, 2, 3], - 'regression_targets': ['target'], - 'regression_point_metrics': ['AP'], # AP is not a regression metric - } - with pytest.raises(ValueError, match="not valid for"): - manager.evaluate(mock_actual, mock_point_predictions, 'target', config) diff --git a/tests/test_evaluation_schemas.py b/tests/test_evaluation_schemas.py deleted file mode 100644 index d08ca8f..0000000 --- a/tests/test_evaluation_schemas.py +++ /dev/null @@ -1,197 +0,0 @@ -""" -PHASE-3-DELETE: Tests legacy EvaluationManager schema grouping logic via 
mocks. -Will be deleted when Phase 3 of the orchestrator migration is complete. -See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md - -Original docstring: -This test suite rigorously verifies the grouping logic of the three evaluation -schemas (step-wise, time-series-wise, and month-wise) as described in the -core project documentation. -""" -import pytest -import pandas as pd -from unittest.mock import MagicMock, patch - -from views_evaluation.evaluation.evaluation_manager import EvaluationManager -from views_evaluation.evaluation.metrics import RegressionPointEvaluationMetrics - -@pytest.fixture -def schema_test_data(): - """ - Generates a predictable, non-random "predictive parallelogram" for testing. - - - 3 sequences (t0, t1, t2) - - 4 steps per sequence (s1, s2, s3, s4) - - 2 locations (l0, l1) - - Start month: 100 - - Parallelogram structure (value is month_id): - l0 l1 (Sequence 0) - t0_s1: 100 100 - t0_s2: 101 101 - t0_s3: 102 102 - t0_s4: 103 103 - ... ... (Sequence 1) - t1_s1: 101 101 - t1_s2: 102 102 - t1_s3: 103 103 - t1_s4: 104 104 - ... ... (Sequence 2) - t2_s1: 102 102 - t2_s2: 103 103 - t2_s3: 104 104 - t2_s4: 105 105 - """ - target_name = "lr_test_target" - pred_col_name = f"pred_{target_name}" - loc_id_name = "location_id" - num_sequences = 3 - num_steps = 4 - num_locations = 2 - start_month = 100 - - # 1. Actuals DataFrame (covering all possible months) - actuals_index = pd.MultiIndex.from_product( - [range(start_month, start_month + num_sequences + num_steps), range(num_locations)], - names=['month_id', loc_id_name] - ) - # Use month_id as the value for easy checking - actuals_values = [idx[0] for idx in actuals_index] - actuals = pd.DataFrame({target_name: actuals_values}, index=actuals_index) - - # 2. 
Predictions List - predictions_list = [] - for i in range(num_sequences): - preds_index = pd.MultiIndex.from_product( - [range(start_month + i, start_month + i + num_steps), range(num_locations)], - names=['month_id', loc_id_name] - ) - # Use month_id as the prediction value for easy checking. Wrap in a list. - pred_values = [[idx[0]] for idx in preds_index] - preds = pd.DataFrame({pred_col_name: pred_values}, index=preds_index) - predictions_list.append(preds) - - # 3. Config - config = {'steps': list(range(1, num_steps + 1))} - - return actuals, predictions_list, target_name, config - - -def get_months_from_mock_call(call): - """Helper to extract unique month_ids from a mock call's DataFrame argument.""" - df = call[0][1] # call[0] is args, [1] is the matched_pred dataframe - return sorted(df.index.get_level_values('month_id').unique().tolist()) - - -def test_step_wise_schema_grouping(schema_test_data): - """ - Verify that step-wise evaluation groups data by forecast horizon (diagonals). 
- """ - actuals, preds, target, config = schema_test_data - manager = EvaluationManager() - mock_metric_func = MagicMock() - - with patch.dict(manager.regression_point_functions, {"RMSLE": mock_metric_func}): - actuals, preds = manager._process_data(actuals, preds, target) - manager.step_wise_evaluation( - actuals, preds, target, config["steps"], - metrics_list=["RMSLE"], - metric_functions=manager.regression_point_functions, - metrics_cls=RegressionPointEvaluationMetrics, - ) - - # Expected groupings for steps (diagonals of the parallelogram) - expected_step_months = { - # step 1: (t0_s1, t1_s1, t2_s1) -> months (100, 101, 102) - 0: [100, 101, 102], - # step 2: (t0_s2, t1_s2, t2_s2) -> months (101, 102, 103) - 1: [101, 102, 103], - # step 3: (t0_s3, t1_s3, t2_s3) -> months (102, 103, 104) - 2: [102, 103, 104], - # step 4: (t0_s4, t1_s4, t2_s4) -> months (103, 104, 105) - 3: [103, 104, 105], - } - - assert mock_metric_func.call_count == len(expected_step_months) - - for i, expected_months in expected_step_months.items(): - call = mock_metric_func.call_args_list[i] - observed_months = get_months_from_mock_call(call) - assert observed_months == expected_months, f"Mismatch on step {i+1}" - - -def test_time_series_wise_schema_grouping(schema_test_data): - """ - Verify that time-series-wise evaluation groups data by forecast run (columns). 
- """ - actuals, preds, target, config = schema_test_data - manager = EvaluationManager() - mock_metric_func = MagicMock() - - with patch.dict(manager.regression_point_functions, {"RMSLE": mock_metric_func}): - actuals, preds = manager._process_data(actuals, preds, target) - manager.time_series_wise_evaluation( - actuals, preds, target, - metrics_list=["RMSLE"], - metric_functions=manager.regression_point_functions, - metrics_cls=RegressionPointEvaluationMetrics, - ) - - # Expected groupings for time-series (columns of the parallelogram) - expected_ts_months = { - # sequence 0: months 100, 101, 102, 103 - 0: [100, 101, 102, 103], - # sequence 1: months 101, 102, 103, 104 - 1: [101, 102, 103, 104], - # sequence 2: months 102, 103, 104, 105 - 2: [102, 103, 104, 105], - } - - assert mock_metric_func.call_count == len(expected_ts_months) - - for i, expected_months in expected_ts_months.items(): - call = mock_metric_func.call_args_list[i] - observed_months = get_months_from_mock_call(call) - assert observed_months == expected_months, f"Mismatch on time-series {i}" - - -def test_month_wise_schema_grouping(schema_test_data): - """ - Verify that month-wise evaluation groups data by calendar month (rows). - """ - actuals, preds, target, config = schema_test_data - manager = EvaluationManager() - mock_metric_func = MagicMock() - - with patch.dict(manager.regression_point_functions, {"RMSLE": mock_metric_func}): - actuals, preds = manager._process_data(actuals, preds, target) - manager.month_wise_evaluation( - actuals, preds, target, - metrics_list=["RMSLE"], - metric_functions=manager.regression_point_functions, - metrics_cls=RegressionPointEvaluationMetrics, - ) - - # For month-wise, each call corresponds to one month. - # We check that each month was called and that the data in the call is correct. 
- observed_calls = {} - for call in mock_metric_func.call_args_list: - df_pred = call[0][1] - month = get_months_from_mock_call(call)[0] - # Check that dataframe only contains data for its specified month - assert all(m == month for m in get_months_from_mock_call(call)) - observed_calls[month] = df_pred - - # Expected months in the full parallelogram - expected_months = [100, 101, 102, 103, 104, 105] - assert sorted(observed_calls.keys()) == expected_months - - # Check the number of predictions for a few key months - # Month 100: Only from sequence 0 (2 locations) - assert len(observed_calls[100]) == 2 - # Month 101: From sequence 0 and 1 (2 locs * 2 seqs = 4) - assert len(observed_calls[101]) == 4 - # Month 102: From sequence 0, 1, and 2 (2 locs * 3 seqs = 6) - assert len(observed_calls[102]) == 6 - # Month 105: Only from sequence 2 (2 locations) - assert len(observed_calls[105]) == 2 diff --git a/tests/test_metric_calculators.py b/tests/test_metric_calculators.py index 3e75bcf..f411968 100644 --- a/tests/test_metric_calculators.py +++ b/tests/test_metric_calculators.py @@ -1,8 +1,8 @@ import pytest import numpy as np -import pandas as pd from views_evaluation.evaluation.native_metric_calculators import ( calculate_mse_native, + calculate_msle_native, calculate_rmsle_native, calculate_crps_native, calculate_twcrps_native, @@ -15,128 +15,101 @@ calculate_mean_interval_score_native, calculate_mtd_native, calculate_mcr_native, - REGRESSION_POINT_NATIVE, - REGRESSION_SAMPLE_NATIVE, - CLASSIFICATION_POINT_NATIVE, - CLASSIFICATION_SAMPLE_NATIVE, + calculate_brier_sample_native, + calculate_brier_point_native, + calculate_qs_sample_native, + calculate_qs_point_native, ) +from views_evaluation.evaluation.metric_catalog import METRIC_MEMBERSHIP -@pytest.fixture -def sample_data(): - """Create sample data for testing.""" - actual = pd.DataFrame({ - 'target': [[1.0], [2.0], [3.0], [4.0]] - }) - pred = pd.DataFrame({ - 'pred_target': [[1.1], [1.9], [3.1], [3.9]] - }) - return 
actual, pred - - -@pytest.fixture -def sample_sample_data(): - """Create sample sample data for testing.""" - actual = pd.DataFrame({ - 'target': [[1.0], [2.0], [3.0], [4.0]] - }) - pred = pd.DataFrame({ - 'pred_target': [[1.0, 1.1, 1.2], [1.8, 2.0, 2.2], [2.9, 3.0, 3.1], [3.8, 4.0, 4.2]] - }) - return actual, pred - - -def test_calculate_mse_native(sample_data): - """Test MSE calculation.""" - actual, pred = sample_data - result = calculate_mse_native(actual, pred, 'target') +# Point-prediction test data (N=4, S=1) +_POINT_Y_TRUE = np.array([1.0, 2.0, 3.0, 4.0]) +_POINT_Y_PRED = np.array([[1.1], [1.9], [3.1], [3.9]]) + +# Sample-prediction test data (N=4, S=3) +_SAMPLE_Y_TRUE = np.array([1.0, 2.0, 3.0, 4.0]) +_SAMPLE_Y_PRED = np.array([[1.0, 1.1, 1.2], [1.8, 2.0, 2.2], [2.9, 3.0, 3.1], [3.8, 4.0, 4.2]]) + + +def test_calculate_mse_native(): + """Test MSE calculation with pure NumPy arrays.""" + result = calculate_mse_native(_POINT_Y_TRUE, _POINT_Y_PRED) assert isinstance(result, float) assert result >= 0 -def test_calculate_rmsle_native_point(sample_data): +def test_calculate_rmsle_native_point(): """Test RMSLE calculation.""" - actual, pred = sample_data - result = calculate_rmsle_native(actual, pred, 'target') + result = calculate_rmsle_native(_POINT_Y_TRUE, _POINT_Y_PRED) assert isinstance(result, float) assert result >= 0 -def test_calculate_crps_native_point(sample_data): - """Test CRPS calculation.""" - actual, pred = sample_data - result = calculate_crps_native(actual, pred, 'target') +def test_calculate_crps_native_point(): + """Test CRPS calculation with point predictions.""" + result = calculate_crps_native(_POINT_Y_TRUE, _POINT_Y_PRED) assert isinstance(result, float) assert result >= 0 -def test_calculate_crps_native_sample(sample_sample_data): - """Test CRPS calculation.""" - actual, pred = sample_sample_data - result = calculate_crps_native(actual, pred, 'target') +def test_calculate_crps_native_sample(): + """Test CRPS calculation with sample 
predictions.""" + result = calculate_crps_native(_SAMPLE_Y_TRUE, _SAMPLE_Y_PRED) assert isinstance(result, float) assert result >= 0 def test_calculate_ap_native(): - """Test Average Precision calculation with pre-binarised actuals and probability scores.""" - # Binary actuals (0/1) and probability scores as predictions - actual = pd.DataFrame({'target': [[1], [0], [1], [0]]}) - pred = pd.DataFrame({'pred_target': [[0.9], [0.4], [0.3], [0.1]]}) - result = calculate_ap_native(actual, pred, 'target') + """Test Average Precision with binary actuals and probability scores.""" + y_true = np.array([1.0, 0.0, 1.0, 0.0]) + y_pred = np.array([[0.9], [0.4], [0.3], [0.1]]) + result = calculate_ap_native(y_true, y_pred) assert isinstance(result, float) assert 0 <= result <= 1 -def test_calculate_emd_native(sample_data): +def test_calculate_emd_native(): """Test Earth Mover's Distance calculation.""" - actual, pred = sample_data - result = calculate_emd_native(actual, pred, 'target') + result = calculate_emd_native(_POINT_Y_TRUE, _POINT_Y_PRED) assert isinstance(result, float) assert result >= 0 -def test_calculate_pearson_native(sample_data): +def test_calculate_pearson_native(): """Test Pearson correlation calculation.""" - actual, pred = sample_data - result = calculate_pearson_native(actual, pred, 'target') + result = calculate_pearson_native(_POINT_Y_TRUE, _POINT_Y_PRED) assert isinstance(result, float) assert -1 <= result <= 1 -def test_calculate_mtd_native(sample_data): +def test_calculate_mtd_native(): """Test Mean Tweedie Deviance calculation.""" - actual, pred = sample_data - result = calculate_mtd_native(actual, pred, 'target', power=1.5) + result = calculate_mtd_native(_POINT_Y_TRUE, _POINT_Y_PRED, power=1.5) assert isinstance(result, float) assert result >= 0 -def test_calculate_mtd_native_with_power(sample_data): - """Test Mean Tweedie Deviance calculation with different power values.""" - actual, pred = sample_data - # Test with power=1.5 (compound Poisson-Gamma) 
- result_15 = calculate_mtd_native(actual, pred, 'target', power=1.5) +def test_calculate_mtd_native_with_power(): + """Test Mean Tweedie Deviance with different power values.""" + result_15 = calculate_mtd_native(_POINT_Y_TRUE, _POINT_Y_PRED, power=1.5) assert isinstance(result_15, float) assert result_15 >= 0 - # Test with power=2 (Gamma) - result_2 = calculate_mtd_native(actual, pred, 'target', power=2.0) + result_2 = calculate_mtd_native(_POINT_Y_TRUE, _POINT_Y_PRED, power=2.0) assert isinstance(result_2, float) assert result_2 >= 0 -def test_calculate_coverage_native_sample(sample_sample_data): - """Test Coverage calculation.""" - actual, pred = sample_sample_data - result = calculate_coverage_native(actual, pred, 'target', alpha=0.1) +def test_calculate_coverage_native_sample(): + """Test Coverage calculation with sample predictions.""" + result = calculate_coverage_native(_SAMPLE_Y_TRUE, _SAMPLE_Y_PRED, alpha=0.1) assert isinstance(result, float) assert 0 <= result <= 1 -def test_calculate_ignorance_score_native_sample(sample_sample_data): +def test_calculate_ignorance_score_native_sample(): """Test Ignorance Score calculation.""" - actual, pred = sample_sample_data result = calculate_ignorance_score_native( - actual, pred, 'target', + _SAMPLE_Y_TRUE, _SAMPLE_Y_PRED, bins=[0, 0.5, 2.5, 5.5, 10.5, 25.5, 50.5, 100.5, 250.5, 500.5, 1000.5], low_bin=0, high_bin=10000, ) @@ -144,106 +117,59 @@ def test_calculate_ignorance_score_native_sample(sample_sample_data): assert result >= 0 -def test_calculate_mis_sample(sample_sample_data): +def test_calculate_mis_sample(): """Test Mean Interval Score calculation.""" - actual, pred = sample_sample_data - result = calculate_mean_interval_score_native(actual, pred, 'target', alpha=0.05) + result = calculate_mean_interval_score_native(_SAMPLE_Y_TRUE, _SAMPLE_Y_PRED, alpha=0.05) assert isinstance(result, float) assert result >= 0 -def test_point_metric_functions(): - """Test that all point metric functions are available in 
the deprecated REGRESSION_POINT_NATIVE.""" - expected_metrics = [ - "MSE", "MSLE", "RMSLE", "EMD", "SD", "pEMDiv", "Pearson", "Variogram", "MTD", "y_hat_bar" - ] - - - for metric in expected_metrics: - assert metric in REGRESSION_POINT_NATIVE - assert callable(REGRESSION_POINT_NATIVE[metric]) - - -def test_sample_metric_functions(): - """Test that all sample metric functions are available in the deprecated REGRESSION_SAMPLE_NATIVE.""" - expected_metrics = ["CRPS", "twCRPS", "MIS", "QIS", "Ignorance", "Coverage", "y_hat_bar", "MCR_sample"] - - for metric in expected_metrics: - assert metric in REGRESSION_SAMPLE_NATIVE - assert callable(REGRESSION_SAMPLE_NATIVE[metric]) - - -def test_regression_point_metric_functions(): - """Test that all regression point metric functions are available in REGRESSION_POINT_NATIVE.""" - expected_metrics = ["MSE", "MSLE", "RMSLE", "EMD", "SD", "pEMDiv", "Pearson", "Variogram", "MTD", "y_hat_bar"] - - for metric in expected_metrics: - assert metric in REGRESSION_POINT_NATIVE - assert callable(REGRESSION_POINT_NATIVE[metric]) +def test_metric_membership_regression_point(): + """METRIC_MEMBERSHIP contains expected regression point metrics.""" + members = METRIC_MEMBERSHIP[("regression", "point")] + for m in ["MSE", "MSLE", "RMSLE", "EMD", "Pearson", "MTD", "y_hat_bar", "MCR_point", "QS_point"]: + assert m in members + assert "AP" not in members + assert "CRPS" not in members - # AP must NOT be in regression point functions - assert "AP" not in REGRESSION_POINT_NATIVE - # CRPS must NOT be in regression point functions - assert "CRPS" not in REGRESSION_POINT_NATIVE +def test_metric_membership_regression_sample(): + """METRIC_MEMBERSHIP contains expected regression sample metrics.""" + members = METRIC_MEMBERSHIP[("regression", "sample")] + for m in ["CRPS", "twCRPS", "MIS", "QIS", "Coverage", "Ignorance", "y_hat_bar", "QS_sample", "MCR_sample"]: + assert m in members + assert "AP" not in members -def 
test_regression_sample_metric_functions(): - """Test that all regression sample metric functions are available.""" - expected_metrics = ["CRPS", "twCRPS", "MIS", "QIS", "Coverage", "Ignorance", "y_hat_bar"] - for metric in expected_metrics: - assert metric in REGRESSION_SAMPLE_NATIVE - assert callable(REGRESSION_SAMPLE_NATIVE[metric]) +def test_metric_membership_classification_point(): + """METRIC_MEMBERSHIP contains expected classification point metrics.""" + members = METRIC_MEMBERSHIP[("classification", "point")] + assert "AP" in members + assert "Brier_point" in members + assert "RMSLE" not in members - # AP must NOT be in regression sample functions - assert "AP" not in REGRESSION_SAMPLE_NATIVE - -def test_classification_point_metric_functions(): - """Test that AP is in CLASSIFICATION_POINT_NATIVE.""" - assert "AP" in CLASSIFICATION_POINT_NATIVE - assert callable(CLASSIFICATION_POINT_NATIVE["AP"]) - - # RMSLE must NOT be in classification point functions - assert "RMSLE" not in CLASSIFICATION_POINT_NATIVE - - -def test_classification_sample_metric_functions(): - """Test that classification sample metric functions are available.""" - expected_metrics = ["CRPS", "twCRPS", "Brier", "Jeffreys"] - - for metric in expected_metrics: - assert metric in CLASSIFICATION_SAMPLE_NATIVE - assert callable(CLASSIFICATION_SAMPLE_NATIVE[metric]) - - # RMSLE must NOT be in classification sample functions - assert "RMSLE" not in CLASSIFICATION_SAMPLE_NATIVE +def test_metric_membership_classification_sample(): + """METRIC_MEMBERSHIP contains expected classification sample metrics.""" + members = METRIC_MEMBERSHIP[("classification", "sample")] + for m in ["CRPS", "twCRPS", "Brier_sample", "Jeffreys"]: + assert m in members + assert "RMSLE" not in members def test_not_implemented_metrics(): - """Test that unimplemented metrics raise NotImplementedError.""" - actual = pd.DataFrame({'target': [[1.0]]}) - pred = pd.DataFrame({'pred_target': [[1.0]]}) - + """Test that unimplemented 
metrics raise ValueError with clear message.""" from views_evaluation.evaluation.native_metric_calculators import ( - calculate_brier_native, calculate_jeffreys_native, calculate_sd_native, calculate_pEMDiv_native, calculate_variogram_native, ) - unimplemented_functions = [ - calculate_brier_native, - calculate_jeffreys_native, - calculate_sd_native, - calculate_pEMDiv_native, - calculate_variogram_native, - ] - - for func in unimplemented_functions: + for func in [calculate_jeffreys_native, calculate_sd_native, + calculate_pEMDiv_native, calculate_variogram_native]: with pytest.raises(ValueError, match="not yet implemented"): - func(actual, pred, 'target') + func(np.array([1.0]), np.array([[1.0]])) # --------------------------------------------------------------------------- @@ -326,10 +252,9 @@ def test_parity_wide_spread(self): class TestTwCRPS: - def test_twcrps_basic_smoke(self, sample_sample_data): + def test_twcrps_basic_smoke(self): """twCRPS produces a non-negative float.""" - actual, pred = sample_sample_data - result = calculate_twcrps_native(actual, pred, 'target', threshold=0.0) + result = calculate_twcrps_native(_SAMPLE_Y_TRUE, _SAMPLE_Y_PRED, threshold=0.0) assert isinstance(result, float) assert result >= 0 @@ -360,11 +285,10 @@ def test_twcrps_threshold_changes_result(self): # They should differ for data straddling the threshold assert twcrps != pytest.approx(crps, abs=1e-5) - def test_twcrps_in_dispatch_dicts(self): - """twCRPS must be in both regression and classification sample dispatch dicts.""" - assert "twCRPS" in REGRESSION_SAMPLE_NATIVE - assert "twCRPS" in CLASSIFICATION_SAMPLE_NATIVE - assert callable(REGRESSION_SAMPLE_NATIVE["twCRPS"]) + def test_twcrps_in_metric_membership(self): + """twCRPS must be in both regression and classification sample membership.""" + assert "twCRPS" in METRIC_MEMBERSHIP[("regression", "sample")] + assert "twCRPS" in METRIC_MEMBERSHIP[("classification", "sample")] # 
--------------------------------------------------------------------------- @@ -373,11 +297,10 @@ def test_twcrps_in_dispatch_dicts(self): class TestQuantileIntervalScore: - def test_qis_basic_smoke(self, sample_sample_data): + def test_qis_basic_smoke(self): """QIS produces a non-negative float.""" - actual, pred = sample_sample_data result = calculate_quantile_interval_score_native( - actual, pred, 'target', lower_quantile=0.025, upper_quantile=0.975, + _SAMPLE_Y_TRUE, _SAMPLE_Y_PRED, lower_quantile=0.025, upper_quantile=0.975, ) assert isinstance(result, float) assert result >= 0 @@ -471,8 +394,8 @@ def test_qis_golden_value_with_violation(self): y_pred = np.array([[1.0, 2.0, 3.0, 4.0, 5.0]]) q_lo, q_hi = 0.1, 0.9 - lower = float(np.quantile(y_pred, q_lo, axis=1)) - upper = float(np.quantile(y_pred, q_hi, axis=1)) + lower = np.quantile(y_pred, q_lo, axis=1).item() + upper = np.quantile(y_pred, q_hi, axis=1).item() width = upper - lower upper_penalty = (2 / (1 - q_hi)) * (20.0 - upper) @@ -503,10 +426,233 @@ def test_qis_perfect_coverage_minimal_score(self): expected_width = float(np.mean(upper - lower)) assert result == pytest.approx(expected_width, abs=1e-10) - def test_qis_in_dispatch_dict(self): - """QIS must be in regression sample dispatch dict.""" - assert "QIS" in REGRESSION_SAMPLE_NATIVE - assert callable(REGRESSION_SAMPLE_NATIVE["QIS"]) + def test_qis_in_metric_membership(self): + """QIS must be in regression sample membership.""" + assert "QIS" in METRIC_MEMBERSHIP[("regression", "sample")] + + +# --------------------------------------------------------------------------- +# Green: Golden-value correctness tests β€” hand-computed expected values (ADR-020) +# --------------------------------------------------------------------------- + +class TestGoldenValues: + """Verify numerical correctness of all implemented metrics against hand-computed or oracle values.""" + + def test_mse_known_errors(self): + """y_true=[1,2,3], y_pred=[[2],[3],[4]] β†’ 
errors=[1,1,1], MSE=1.0.""" + result = calculate_mse_native(np.array([1.0, 2.0, 3.0]), np.array([[2.0], [3.0], [4.0]])) + assert result == pytest.approx(1.0, abs=1e-10) + + def test_msle_known_values(self): + """y_true=[e-1], y_pred=[[0]] β†’ log1p(e-1)=1, log1p(0)=0, MSLE=1.0.""" + result = calculate_msle_native(np.array([np.e - 1]), np.array([[0.0]])) + assert result == pytest.approx(1.0, abs=1e-10) + + def test_rmsle_is_sqrt_msle(self): + """RMSLE = sqrt(MSLE) for the same input.""" + y_true = np.array([np.e - 1]) + y_pred = np.array([[0.0]]) + msle = calculate_msle_native(y_true, y_pred) + rmsle = calculate_rmsle_native(y_true, y_pred) + assert rmsle == pytest.approx(np.sqrt(msle), abs=1e-10) + + def test_emd_point_prediction(self): + """y_true=[0], y_pred=[[5]] β†’ wasserstein_distance([5],[0]) = 5.0.""" + result = calculate_emd_native(np.array([0.0]), np.array([[5.0]])) + assert result == pytest.approx(5.0, abs=1e-10) + + def test_pearson_perfect_correlation(self): + """y_true=[1,2,3], y_pred=[[1],[2],[3]] β†’ r = 1.0.""" + result = calculate_pearson_native(np.array([1.0, 2.0, 3.0]), np.array([[1.0], [2.0], [3.0]])) + assert result == pytest.approx(1.0, abs=1e-10) + + def test_pearson_perfect_negative(self): + """y_true=[1,2,3], y_pred=[[3],[2],[1]] β†’ r = -1.0.""" + result = calculate_pearson_native(np.array([1.0, 2.0, 3.0]), np.array([[3.0], [2.0], [1.0]])) + assert result == pytest.approx(-1.0, abs=1e-10) + + def test_mtd_known_tweedie(self): + """Tweedie deviance with power=2 reduces to (y/mu - ln(y/mu) - 1) * 2.""" + from sklearn.metrics import mean_tweedie_deviance + y_true = np.array([1.0, 2.0, 3.0]) + y_pred = np.array([[2.0], [2.0], [2.0]]) + expected = mean_tweedie_deviance( + np.repeat(y_true, 1), y_pred.flatten(), power=2 + ) + result = calculate_mtd_native(y_true, y_pred, power=2) + assert result == pytest.approx(expected, abs=1e-10) + + def test_mcr_perfect_calibration(self): + """mean(y_pred) == mean(y_true) β†’ MCR = 1.0.""" + y_true = 
np.array([2.0, 4.0, 6.0]) + y_pred = np.array([[2.0], [4.0], [6.0]]) + result = calculate_mcr_native(y_true, y_pred) + assert result == pytest.approx(1.0, abs=1e-10) + + def test_mcr_double_overprediction(self): + """mean(y_pred) = 2 * mean(y_true) β†’ MCR = 2.0.""" + y_true = np.array([1.0, 2.0, 3.0]) + y_pred = np.array([[2.0], [4.0], [6.0]]) + result = calculate_mcr_native(y_true, y_pred) + assert result == pytest.approx(2.0, abs=1e-10) + + def test_ignorance_known_bin_distribution(self): + """Hand-computed Ignorance: 5 ensemble members, 3 bins, known distribution. + + bins=[0,4,8,12], preds=[1,3,5,7,9] β†’ bin counts [2,2,1] + smoothed=[3,3,2], total=8. Truth 5.0 β†’ bin 1, prob=3/8. + Score = -log2(3/8) = log2(8/3). + """ + y_true = np.array([5.0]) + y_pred = np.array([[1.0, 3.0, 5.0, 7.0, 9.0]]) + result = calculate_ignorance_score_native( + y_true, y_pred, bins=[0, 4, 8, 12], low_bin=0, high_bin=12, + ) + expected = np.log2(8.0 / 3.0) + assert result == pytest.approx(expected, abs=1e-10) + + def test_ap_oracle_sklearn(self): + """AP matches sklearn.metrics.average_precision_score.""" + from sklearn.metrics import average_precision_score + y_true = np.array([1.0, 0.0, 1.0, 0.0]) + y_pred = np.array([[0.9], [0.1], [0.8], [0.2]]) + result = calculate_ap_native(y_true, y_pred) + # AP native repeats y_true for S columns, flattens y_pred + expected = average_precision_score( + np.repeat(y_true, 1), y_pred.flatten() + ) + assert result == pytest.approx(expected, abs=1e-10) + + def test_coverage_all_inside(self): + """All obs inside the central interval β†’ coverage = 1.0.""" + y_true = np.array([5.0]) + y_pred = np.array([[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]]) + result = calculate_coverage_native(y_true, y_pred, alpha=0.1) + assert result == pytest.approx(1.0, abs=1e-10) + + def test_coverage_all_outside(self): + """Obs far outside the interval β†’ coverage = 0.0.""" + y_true = np.array([100.0]) + y_pred = np.array([[1.0, 2.0, 3.0, 4.0, 5.0]]) + result = 
calculate_coverage_native(y_true, y_pred, alpha=0.1) + assert result == pytest.approx(0.0, abs=1e-10) + + def test_mis_obs_inside_interval(self): + """Obs inside interval β†’ MIS = interval width only (no penalty).""" + y_true = np.array([5.0]) + y_pred = np.array([[0.0, 2.0, 4.0, 5.0, 6.0, 8.0, 10.0]]) + alpha = 0.1 + lower = np.quantile([0, 2, 4, 5, 6, 8, 10], alpha / 2) + upper = np.quantile([0, 2, 4, 5, 6, 8, 10], 1 - alpha / 2) + expected = upper - lower # no penalty since obs is inside + result = calculate_mean_interval_score_native(y_true, y_pred, alpha=alpha) + assert result == pytest.approx(expected, abs=1e-10) + + def test_crps_point_prediction_equals_absolute_error(self): + """CRPS of 1-member ensemble = |y - x|.""" + result = calculate_crps_native(np.array([5.0]), np.array([[8.0]])) + assert result == pytest.approx(3.0, abs=1e-10) + + def test_twcrps_zero_threshold_equals_crps(self): + """twCRPS with threshold=0 on non-negative data = CRPS.""" + y_true = np.array([5.0, 10.0]) + y_pred = np.array([[3.0, 7.0], [8.0, 12.0]]) + crps = calculate_crps_native(y_true, y_pred) + twcrps = calculate_twcrps_native(y_true, y_pred, threshold=0.0) + assert twcrps == pytest.approx(crps, abs=1e-10) + + def test_qis_symmetric_equals_mis(self): + """QIS with symmetric quantiles (alpha/2, 1-alpha/2) equals MIS.""" + y_true = np.array([5.0, 15.0]) + y_pred = np.array([[1.0, 3.0, 5.0, 7.0, 9.0], [10.0, 12.0, 14.0, 16.0, 18.0]]) + alpha = 0.1 + mis = calculate_mean_interval_score_native(y_true, y_pred, alpha=alpha) + qis = calculate_quantile_interval_score_native( + y_true, y_pred, lower_quantile=alpha / 2, upper_quantile=1 - alpha / 2 + ) + assert qis == pytest.approx(mis, abs=1e-10) + + +# --------------------------------------------------------------------------- +# Green: Brier Score golden-value tests (ADR-020) +# --------------------------------------------------------------------------- + +class TestBrierScore: + + def test_brier_sample_golden_value(self): + 
"""Hand-computed Brier sample: threshold=1, mixed binary outcomes.""" + y_true = np.array([0.0, 2.0, 5.0]) + y_pred = np.array([[0.5, 1.5], [0.5, 1.5], [4.0, 6.0]]) + # y_binary = [0, 1, 1] (0 < 1, 2 > 1, 5 > 1) + # p_hat = [0.5, 0.5, 1.0] (fraction of ensemble > threshold) + # Brier = mean([(0.5-0)^2, (0.5-1)^2, (1.0-1)^2]) = mean([0.25, 0.25, 0]) = 1/6 + result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + assert result == pytest.approx(1.0 / 6.0, abs=1e-10) + + def test_brier_point_golden_value(self): + """Hand-computed Brier point: threshold=1, probabilities vs binary outcomes.""" + y_true = np.array([0.0, 2.0, 5.0]) + y_pred = np.array([[0.1], [0.7], [0.9]]) + # y_binary = [0, 1, 1] + # p_hat = [0.1, 0.7, 0.9] (point prediction as probability) + # Brier = mean([(0.1-0)^2, (0.7-1)^2, (0.9-1)^2]) = mean([0.01, 0.09, 0.01]) = 11/300 + result = calculate_brier_point_native(y_true, y_pred, threshold=1.0) + assert result == pytest.approx(11.0 / 300.0, abs=1e-10) + + def test_brier_sample_perfect(self): + """All above threshold, all ensemble members above β†’ p_hat=1, y_binary=1, Brier=0.""" + y_true = np.array([5.0, 10.0]) + y_pred = np.array([[2.0, 3.0], [2.0, 3.0]]) + result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + assert result == pytest.approx(0.0, abs=1e-10) + + def test_brier_point_perfect(self): + """p_hat matches y_binary exactly β†’ Brier=0.""" + y_true = np.array([0.0, 2.0]) # binary=[0, 1] at threshold=1 + y_pred = np.array([[0.0], [1.0]]) # perfect probability predictions + result = calculate_brier_point_native(y_true, y_pred, threshold=1.0) + assert result == pytest.approx(0.0, abs=1e-10) + + +# --------------------------------------------------------------------------- +# Green: Quantile Score (pinball loss) golden-value tests (ADR-020) +# --------------------------------------------------------------------------- + +class TestQuantileScore: + + def test_qs_sample_golden_value_at_median(self): + """Median 
matches observation β†’ QS = 0.""" + y_true = np.array([3.0]) + y_pred = np.array([[1.0, 2.0, 3.0, 4.0, 5.0]]) + # median of [1,2,3,4,5] = 3.0, diff = 3-3 = 0, QS = 0 + result = calculate_qs_sample_native(y_true, y_pred, quantile=0.5) + assert result == pytest.approx(0.0, abs=1e-10) + + def test_qs_point_golden_value_overprediction(self): + """Point overpredicts: y=3, q=5, quantile=0.9 β†’ (1-0.9)*(5-3) = 0.2.""" + y_true = np.array([3.0]) + y_pred = np.array([[5.0]]) + # diff = 3 - 5 = -2 < 0 β†’ branch: -diff * (1-quantile) = 2 * 0.1 = 0.2 + result = calculate_qs_point_native(y_true, y_pred, quantile=0.9) + assert result == pytest.approx(0.2, abs=1e-10) + + def test_qs_sample_underprediction(self): + """Sample underpredicts: y=10, q=2.0 at quantile=0.9 β†’ 0.9*(10-2) = 7.2.""" + y_true = np.array([10.0]) + y_pred = np.array([[1.0, 2.0, 3.0]]) + # quantile(0.9) of [1,2,3] = 2.8 via linear interpolation + q = np.quantile([1.0, 2.0, 3.0], 0.9) # = 2.8 + expected = 0.9 * (10.0 - q) + result = calculate_qs_sample_native(y_true, y_pred, quantile=0.9) + assert result == pytest.approx(expected, abs=1e-10) + + def test_qs_point_underprediction(self): + """Point underpredicts: y=10, y_hat=2, quantile=0.9 β†’ 0.9*(10-2) = 7.2.""" + y_true = np.array([10.0]) + y_pred = np.array([[2.0]]) + # diff = 10 - 2 = 8 β‰₯ 0 β†’ branch: diff * quantile = 8 * 0.9 = 7.2 + result = calculate_qs_point_native(y_true, y_pred, quantile=0.9) + assert result == pytest.approx(7.2, abs=1e-10) # --------------------------------------------------------------------------- @@ -609,6 +755,74 @@ def test_large_alpha(self): assert np.isfinite(result) +class TestBrierScoreBeige: + + def test_single_observation(self): + """Brier handles N=1, S=1 without error.""" + result = calculate_brier_sample_native(np.array([2.0]), np.array([[3.0]]), threshold=1.0) + assert np.isfinite(result) + + def test_large_ensemble_stable(self): + """Brier is stable with S=1000 samples.""" + rng = np.random.default_rng(42) + 
y_true = np.array([0.0, 5.0, 10.0]) + y_pred = rng.normal(loc=y_true[:, None], scale=2.0, size=(3, 1000)) + result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + assert np.isfinite(result) + assert 0 <= result <= 1 # Brier is bounded [0, 1] + + def test_threshold_at_exact_data_value(self): + """Threshold equals an observation β€” no crash.""" + y_true = np.array([5.0, 5.0]) + y_pred = np.array([[4.0, 6.0], [4.0, 6.0]]) + result = calculate_brier_sample_native(y_true, y_pred, threshold=5.0) + assert np.isfinite(result) + + def test_all_above_threshold(self): + """All y_true above threshold β€” y_binary all 1, finite result.""" + y_true = np.array([10.0, 20.0]) + y_pred = np.array([[0.5, 1.5], [0.5, 1.5]]) + result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + assert np.isfinite(result) + + def test_all_below_threshold(self): + """All y_true below threshold β€” y_binary all 0, finite result.""" + y_true = np.array([0.0, 0.5]) + y_pred = np.array([[0.5, 1.5], [0.5, 1.5]]) + result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + assert np.isfinite(result) + + +class TestQuantileScoreBeige: + + def test_single_observation(self): + """QS handles N=1, S=1 without error.""" + result = calculate_qs_sample_native(np.array([1.0]), np.array([[1.0]]), quantile=0.5) + assert np.isfinite(result) + + def test_large_ensemble_stable(self): + """QS is stable with S=1000 samples.""" + rng = np.random.default_rng(42) + y_true = np.array([5.0, 10.0, 0.0]) + y_pred = rng.normal(loc=y_true[:, None], scale=1.0, size=(3, 1000)) + result = calculate_qs_sample_native(y_true, y_pred, quantile=0.99) + assert np.isfinite(result) + assert result >= 0 + + def test_extreme_quantile_near_one(self): + """Quantile very close to 1 β€” finite result.""" + y_true = np.array([5.0]) + y_pred = np.array([[1.0, 2.0, 3.0, 4.0, 5.0]]) + result = calculate_qs_sample_native(y_true, y_pred, quantile=0.999) + assert np.isfinite(result) + + def 
test_extreme_quantile_near_zero(self): + """Quantile very close to 0 β€” finite result.""" + y_true = np.array([5.0]) + result = calculate_qs_point_native(y_true, np.array([[2.0]]), quantile=0.001) + assert np.isfinite(result) + + class TestMCRBeige: def test_single_observation(self): @@ -756,3 +970,87 @@ def test_negative_y_true_valid(self): y_pred = np.array([[4.0], [4.0]]) result = calculate_mcr_native(y_true, y_pred) assert result == -2.0 + + +class TestBrierScoreRed: + + def test_nan_in_y_true_swallowed_by_comparison(self): + """NaN in y_true is swallowed by '>' comparison (NaN > x β†’ False). + + Unlike arithmetic metrics, Brier's binarization step converts NaN to + False (0.0) rather than propagating. This is NumPy's standard comparison + semantics. The EvaluationFrame boundary should reject NaN before it + reaches here (defense-in-depth). + """ + y_true = np.array([np.nan, 1.0]) + y_pred = np.array([[1.0], [1.0]]) + result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + # NaN is treated as below-threshold (False), so result is finite, not NaN + assert np.isfinite(result) + + def test_nan_in_y_pred_swallowed_by_comparison(self): + """NaN in y_pred is swallowed by '>' comparison in p_hat computation.""" + y_true = np.array([1.0, 1.0]) + y_pred = np.array([[np.nan], [1.0]]) + result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + assert np.isfinite(result) + + def test_negative_threshold_accepted(self): + """Negative threshold is mathematically valid.""" + y_true = np.array([1.0, 2.0]) + y_pred = np.array([[1.0, 2.0], [2.0, 3.0]]) + result = calculate_brier_sample_native(y_true, y_pred, threshold=-5.0) + assert np.isfinite(result) + + +class TestQuantileScoreRed: + + def test_nan_in_y_true_propagates(self): + """NaN in y_true propagates to result.""" + y_true = np.array([np.nan, 1.0]) + y_pred = np.array([[1.0], [1.0]]) + result = calculate_qs_sample_native(y_true, y_pred, quantile=0.5) + assert np.isnan(result) + + def 
test_nan_in_y_pred_propagates(self): + """NaN in y_pred propagates to result.""" + y_true = np.array([1.0, 1.0]) + y_pred = np.array([[np.nan], [1.0]]) + result = calculate_qs_point_native(y_true, y_pred, quantile=0.5) + assert np.isnan(result) + + +# --------------------------------------------------------------------------- +# Red: Extreme-value tests (ADR-020) +# --------------------------------------------------------------------------- + +class TestExtremeValues: + """Test metric behavior near float64 limits β€” no overflow, no silent corruption.""" + + def test_mse_large_matching_values(self): + """Large but equal values β†’ MSE = 0, not overflow.""" + result = calculate_mse_native(np.array([1e150]), np.array([[1e150]])) + assert result == pytest.approx(0.0, abs=1e-10) + + def test_crps_large_ensemble_values(self): + """CRPS with large ensemble values remains finite.""" + y_true = np.array([1e50]) + y_pred = np.array([[0.9e50, 1.0e50, 1.1e50]]) + result = calculate_crps_native(y_true, y_pred) + assert np.isfinite(result) + assert result >= 0 + + def test_brier_extreme_threshold(self): + """Threshold at 1e300: all values below β†’ y_binary all 0, p_hat all 0, Brier = 0.""" + y_true = np.array([1.0, 2.0]) + y_pred = np.array([[0.5, 1.5], [0.5, 1.5]]) + result = calculate_brier_sample_native(y_true, y_pred, threshold=1e300) + assert result == pytest.approx(0.0, abs=1e-10) + + def test_coverage_tiny_ensemble_spread(self): + """Extremely narrow ensemble β†’ interval width ~ 0, coverage depends on obs position.""" + base = 1e-15 + y_true = np.array([base]) + y_pred = np.array([[base - 1e-30, base + 1e-30]]) + result = calculate_coverage_native(y_true, y_pred, alpha=0.1) + assert np.isfinite(result) diff --git a/tests/test_metric_catalog.py b/tests/test_metric_catalog.py index 8b1c44f..e621ce0 100644 --- a/tests/test_metric_catalog.py +++ b/tests/test_metric_catalog.py @@ -16,10 +16,6 @@ from views_evaluation.profiles import PROFILES from 
views_evaluation.profiles.base import BASE_PROFILE from views_evaluation.evaluation.native_metric_calculators import ( - REGRESSION_POINT_NATIVE, - REGRESSION_SAMPLE_NATIVE, - CLASSIFICATION_POINT_NATIVE, - CLASSIFICATION_SAMPLE_NATIVE, calculate_mcr_native, ) import numpy as np @@ -147,23 +143,6 @@ def test_every_membership_metric_in_catalog(self): f"Metric '{metric_name}' in METRIC_MEMBERSHIP{key} but not in METRIC_CATALOG" ) - def test_catalog_functions_match_legacy_dispatch_dicts(self): - """METRIC_CATALOG functions must match the legacy dispatch dict entries.""" - legacy_dicts = { - ("regression", "point"): REGRESSION_POINT_NATIVE, - ("regression", "sample"): REGRESSION_SAMPLE_NATIVE, - ("classification", "point"): CLASSIFICATION_POINT_NATIVE, - ("classification", "sample"): CLASSIFICATION_SAMPLE_NATIVE, - } - for key, legacy_dict in legacy_dicts.items(): - for metric_name, legacy_func in legacy_dict.items(): - assert metric_name in METRIC_CATALOG, ( - f"Legacy dict {key} has '{metric_name}' not in METRIC_CATALOG" - ) - assert METRIC_CATALOG[metric_name].function is legacy_func, ( - f"Function mismatch for '{metric_name}' between catalog and legacy dict {key}" - ) - def test_base_profile_covers_all_implemented_genomes(self): """BASE_PROFILE must provide values for every genome param of every implemented metric.""" for metric_name, spec in METRIC_CATALOG.items(): @@ -365,7 +344,7 @@ def test_evaluator_rejects_unknown_profile(self): def test_registry_snapshot_integrity(self): """Registries have expected sizes β€” catches accidental mutation or deletion.""" - assert len(METRIC_CATALOG) == 21 + assert len(METRIC_CATALOG) == 24 assert len(METRIC_MEMBERSHIP) == 4 assert len(PROFILES) >= 2 diff --git a/tests/test_metric_correctness.py b/tests/test_metric_correctness.py deleted file mode 100644 index d86dcf1..0000000 --- a/tests/test_metric_correctness.py +++ /dev/null @@ -1,239 +0,0 @@ -""" -PHASE-3-DELETE: Tests metric correctness through the legacy 
EvaluationManager path. -Will be deleted when Phase 3 of the orchestrator migration is complete. -See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md -""" -import pandas as pd -import numpy as np -import pytest - -from views_evaluation.evaluation.evaluation_manager import EvaluationManager - -class TestMetricCorrectness: - """ - A test suite for Phase 3: Data-Centric & Metric-Specific Validation. - These tests verify the numerical correctness of the metric calculators - using 'golden datasets' with pre-calculated, known outcomes. - """ - - def test_rmsle_golden_dataset_perfect_match(self): - """ - Tests the RMSLE calculation with a perfect match. - Expected: RMSLE should be 0.0. - """ - # Arrange - target_name = "lr_test" - pred_col_name = f"pred_{target_name}" - - # Create a simple, non-random dataset - actuals_index = pd.MultiIndex.from_product([[500], [10, 20]], names=['month_id', 'country_id']) - actuals = pd.DataFrame({target_name: [100, 50]}, index=actuals_index) - - # Predictions are identical to actuals - predictions_df = pd.DataFrame({pred_col_name: [[100.0], [50.0]]}, index=actuals_index) - predictions = [predictions_df] - - config = { - 'steps': [1], - 'regression_targets': [target_name], - 'regression_point_metrics': ['RMSLE'], - } - manager = EvaluationManager() - - # Act - results = manager.evaluate( - actual=actuals, - predictions=predictions, - target=target_name, - config=config - ) - - # Assert - # Check all evaluation schemas for correctness - rmsle_step = results['step'][1]['RMSLE'].iloc[0] - rmsle_ts = results['time_series'][1]['RMSLE'].iloc[0] - rmsle_month = results['month'][1]['RMSLE'].iloc[0] - - assert rmsle_step == 0.0 - assert rmsle_ts == 0.0 - assert rmsle_month == 0.0 - - def test_rmsle_golden_dataset_simple_mismatch(self): - """ - Tests the RMSLE calculation with a simple, known mismatch. - actual = e - 1, pred = 0. - log(actual + 1) = log(e) = 1. - log(pred + 1) = log(1) = 0. 
- RMSLE = sqrt((1-0)^2) = 1. - Expected: RMSLE should be 1.0. - """ - # Arrange - target_name = "lr_test" - pred_col_name = f"pred_{target_name}" - - actual_val = np.e - 1 - pred_val = 0.0 - - actuals_index = pd.MultiIndex.from_product([[500], [10]], names=['month_id', 'country_id']) - actuals = pd.DataFrame({target_name: [actual_val]}, index=actuals_index) - - predictions_df = pd.DataFrame({pred_col_name: [[pred_val]]}, index=actuals_index) - predictions = [predictions_df] - - config = { - 'steps': [1], - 'regression_targets': [target_name], - 'regression_point_metrics': ['RMSLE'], - } - manager = EvaluationManager() - - # Act - results = manager.evaluate( - actual=actuals, - predictions=predictions, - target=target_name, - config=config - ) - - # Assert - rmsle_step = results['step'][1]['RMSLE'].iloc[0] - - assert rmsle_step == pytest.approx(1.0) - - def test_ap_metric_with_prebinarised_inputs(self): - """ - Tests the AP (Average Precision) metric with pre-binarised actuals and probability - scores as predictions. AP is a classification metric; actuals must already be - binary (0/1) before reaching evaluate(). No threshold kwarg is accepted. 
- """ - # Arrange - target_name = "cls_binary" - pred_col_name = f"pred_{target_name}" - - # Pre-binarised actuals and probability scores - y_true_binary = [0, 1, 1, 0] - y_scores = [0.1, 0.4, 0.35, 0.8] - - actuals_index = pd.MultiIndex.from_product( - [[500], [10, 20, 30, 40]], names=['month_id', 'country_id'] - ) - actuals = pd.DataFrame({target_name: y_true_binary}, index=actuals_index) - predictions_df = pd.DataFrame( - {pred_col_name: [[s] for s in y_scores]}, index=actuals_index - ) - predictions = [predictions_df] - - config = { - 'steps': [1], - 'classification_targets': [target_name], - 'classification_point_metrics': ['AP'], - } - manager = EvaluationManager() - - # Act - results = manager.evaluate( - actual=actuals, - predictions=predictions, - target=target_name, - config=config - ) - - ap_step = results['step'][1]['AP'].iloc[0] - - # Expected AP from sklearn with the raw probability scores as the ranking signal - from sklearn.metrics import average_precision_score - expected_ap = average_precision_score(y_true_binary, y_scores) - - assert ap_step == pytest.approx(expected_ap) - - def test_crps_golden_dataset_point_prediction(self): - """ - Tests the CRPS calculation for point predictions (single-value ensemble). - Expected: CRPS matches properscoring for a 1-sample ensemble. - """ - # Arrange - target_name = "lr_test_crps_point" - pred_col_name = f"pred_{target_name}" - - # Simple dataset: one actual, one prediction - actual_val = 5.0 - pred_val = 6.0 - - actuals_index = pd.MultiIndex.from_product([[500], [10]], names=['month_id', 'country_id']) - actuals = pd.DataFrame({target_name: [actual_val]}, index=actuals_index) - - # Single-value prediction β†’ point prediction, use regression_sample_metrics - # by providing a multi-element ensemble so it's detected as sample type. 
- # Use the same scalar as a 3-sample degenerate ensemble for CRPS: - predictions_df = pd.DataFrame({pred_col_name: [[pred_val, pred_val, pred_val]]}, index=actuals_index) - predictions = [predictions_df] - - config = { - 'steps': [1], - 'regression_targets': [target_name], - 'regression_point_metrics': ['RMSLE'], # required by _validate_config - 'regression_sample_metrics': ['CRPS'], # routed to because predictions are multi-element - } - manager = EvaluationManager() - - # Act - results = manager.evaluate( - actual=actuals, - predictions=predictions, - target=target_name, - config=config - ) - - # Assert - crps_step = results['step'][1]['CRPS'].iloc[0] - - # Calculate expected CRPS using properscoring for the degenerate 3-sample ensemble - import properscoring as ps - expected_crps = ps.crps_ensemble(actual_val, np.array([pred_val, pred_val, pred_val])) - - assert crps_step == pytest.approx(expected_crps) - - def test_crps_golden_dataset_sample_prediction(self): - """ - Tests the CRPS calculation for sample predictions (ensemble of multiple values). - Expected: CRPS for sample predictions matches properscoring. 
- """ - # Arrange - target_name = "lr_test_crps_sample" - pred_col_name = f"pred_{target_name}" - - # Simple dataset: one actual, one prediction ensemble - actual_val = 5.0 - prediction_ensemble = [3.0, 4.0, 5.0, 6.0, 7.0] # A simple ensemble - - actuals_index = pd.MultiIndex.from_product([[500], [10]], names=['month_id', 'country_id']) - actuals = pd.DataFrame({target_name: [actual_val]}, index=actuals_index) - - # Sample prediction is a list of multiple values - predictions_df = pd.DataFrame({pred_col_name: [prediction_ensemble]}, index=actuals_index) - predictions = [predictions_df] - - config = { - 'steps': [1], - 'regression_targets': [target_name], - 'regression_point_metrics': ['RMSLE'], # required by _validate_config - 'regression_sample_metrics': ['CRPS'], # routed to because predictions are multi-element - } - manager = EvaluationManager() - - # Act - results = manager.evaluate( - actual=actuals, - predictions=predictions, - target=target_name, - config=config - ) - - # Assert - crps_step = results['step'][1]['CRPS'].iloc[0] - - # Calculate expected CRPS using properscoring for the ensemble - import properscoring as ps - expected_crps = ps.crps_ensemble(actual_val, np.array(prediction_ensemble)) - - assert crps_step == pytest.approx(expected_crps) diff --git a/tests/test_native_evaluator.py b/tests/test_native_evaluator.py index d60ca2e..a0fa14f 100644 --- a/tests/test_native_evaluator.py +++ b/tests/test_native_evaluator.py @@ -223,6 +223,48 @@ def test_single_origin_single_step(self): assert 'ts00' in d['time_series'] assert 'step01' in d['step'] + def test_multi_target_regression_and_classification(self): + """Config with both target types; each evaluated separately via EvaluationFrame metadata.""" + n = 4 + config = { + 'steps': [1, 2], + 'regression_targets': ['ged_sb'], + 'classification_targets': ['by_sb'], + 'regression_point_metrics': ['MSE'], + 'classification_point_metrics': ['AP'], + } + # Evaluate regression target + ef_reg = EvaluationFrame( 
+ y_true=np.array([1.0, 2.0, 3.0, 4.0]), + y_pred=np.array([[1.1], [2.1], [3.1], [4.1]]), + identifiers={ + 'time': np.array([100, 100, 101, 101]), + 'unit': np.array([1, 2, 1, 2]), + 'origin': np.zeros(n, dtype=int), + 'step': np.array([1, 1, 2, 2]), + }, + metadata={'target': 'ged_sb'}, + ) + report_reg = NativeEvaluator(config).evaluate(ef_reg) + assert report_reg.task == 'regression' + assert 'MSE' in report_reg.to_dict()['schemas']['month']['month100'] + + # Evaluate classification target + ef_cls = EvaluationFrame( + y_true=np.array([0.0, 1.0, 0.0, 1.0]), + y_pred=np.array([[0.2], [0.8], [0.3], [0.7]]), + identifiers={ + 'time': np.array([100, 100, 101, 101]), + 'unit': np.array([1, 2, 1, 2]), + 'origin': np.zeros(n, dtype=int), + 'step': np.array([1, 1, 2, 2]), + }, + metadata={'target': 'by_sb'}, + ) + report_cls = NativeEvaluator(config).evaluate(ef_cls) + assert report_cls.task == 'classification' + assert 'AP' in report_cls.to_dict()['schemas']['month']['month100'] + def test_classification_target(self): n = 6 ef = EvaluationFrame( @@ -246,6 +288,65 @@ def test_classification_target(self): assert report.pred_type == 'point' assert 'month100' in report.to_dict()['schemas']['month'] + def test_classification_sample_brier(self): + """Brier_sample and CRPS work for classification sample predictions.""" + n = 6 + ef = EvaluationFrame( + y_true=np.array([0.0, 1.0, 0.0, 1.0, 0.0, 1.0]), + y_pred=np.random.default_rng(42).uniform(0, 2, size=(n, 20)), + identifiers={ + 'time': np.array([100, 100, 101, 101, 102, 102]), + 'unit': np.array([1, 2, 1, 2, 1, 2]), + 'origin': np.zeros(n, dtype=int), + 'step': np.array([1, 1, 2, 2, 3, 3]), + }, + metadata={'target': 'by_sb_best'}, + ) + config = { + 'steps': [1, 2, 3], + 'classification_targets': ['by_sb_best'], + 'classification_sample_metrics': ['Brier_sample', 'CRPS'], + } + report = NativeEvaluator(config).evaluate(ef) + assert report.task == 'classification' + assert report.pred_type == 'sample' + d = 
report.to_dict()['schemas'] + assert 'Brier_sample' in d['month']['month100'] + assert 'CRPS' in d['month']['month100'] + + def test_classification_point_brier(self): + """AP and Brier_point work together for classification point predictions.""" + n = 6 + ef = EvaluationFrame( + y_true=np.array([0.0, 1.0, 0.0, 1.0, 0.0, 1.0]), + y_pred=np.array([[0.2], [0.8], [0.3], [0.7], [0.4], [0.6]]), + identifiers={ + 'time': np.array([100, 100, 101, 101, 102, 102]), + 'unit': np.array([1, 2, 1, 2, 1, 2]), + 'origin': np.zeros(n, dtype=int), + 'step': np.array([1, 1, 2, 2, 3, 3]), + }, + metadata={'target': 'by_sb_best'}, + ) + config = { + 'steps': [1, 2, 3], + 'classification_targets': ['by_sb_best'], + 'classification_point_metrics': ['AP', 'Brier_point'], + } + report = NativeEvaluator(config).evaluate(ef) + d = report.to_dict()['schemas'] + assert 'AP' in d['step']['step01'] + assert 'Brier_point' in d['step']['step01'] + + def test_evaluate_twice_produces_identical_results(self): + """NativeEvaluator is stateless β€” same input yields same output.""" + ef = _make_parallelogram_ef(n_origins=2, n_steps=3, n_units=2) + config = _regression_point_config(steps=[1, 2, 3]) + evaluator = NativeEvaluator(config) + report1 = evaluator.evaluate(ef) + report2 = evaluator.evaluate(ef) + assert report1.to_dict() == report2.to_dict() + def test_sample_predictions_produce_point_pred_type_false(self): n = 4 ef = EvaluationFrame( @@ -307,6 +408,16 @@ def test_invalid_metric_name_raises_value_error(self): with pytest.raises(ValueError, match="not valid"): NativeEvaluator(config).evaluate(ef) + def test_empty_config_accepted_at_init_fails_at_evaluate(self): + """Empty config is accepted at init (C-02 known gap) but fails at evaluate(). + + NativeEvaluator.__init__ only validates profile name (defaults to 'base'). + Structural config errors surface at evaluate() time, not construction. 
+ """ + ef = _make_parallelogram_ef(n_origins=1, n_steps=2, n_units=2) + evaluator = NativeEvaluator({}) # does NOT raise β€” C-02 + with pytest.raises((ValueError, KeyError)): + evaluator.evaluate(ef) def test_classification_metric_on_regression_target_raises(self): """AP is only valid for classification; using it with regression_targets must fail.""" ef = _make_parallelogram_ef(n_origins=1, n_steps=2, n_units=2) diff --git a/tests/test_parity_adapter_transfer.py b/tests/test_parity_adapter_transfer.py deleted file mode 100644 index 8145acc..0000000 --- a/tests/test_parity_adapter_transfer.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -PHASE-3-DELETE: Tests parity between internal and external PandasAdapter adaptation. -Will be deleted when Phase 3 of the orchestrator migration is complete. -See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md -""" -import pytest -import pandas as pd -from views_evaluation import EvaluationManager, PandasAdapter, NativeEvaluator - -def test_parity_internal_vs_external_adaptation(): - """ - PROVING PARITY FOR UPSTREAMING: - This test verifies that adapting a DataFrame to an EvaluationFrame - OUTSIDE of the EvaluationManager produces identical results to - letting the Manager handle it internally. - """ - # 1. Setup Data - index = pd.MultiIndex.from_product([[100, 101], [1, 2]], names=['month', 'unit']) - actuals = pd.DataFrame({'target': [0, 1, 0, 1]}, index=index) - preds = [pd.DataFrame({'pred_target': [0.1, 0.8, 0.15, 0.7]}, index=index)] - - config = { - 'steps': [1], - 'regression_targets': ['target'], - 'regression_point_metrics': ['MSE'] - } - - manager = EvaluationManager() - - # 2. PATH A: Internal Adaptation (The status quo) - # Manager receives DataFrames, adapts internally. - results_internal = manager.evaluate(actuals, preds, "target", config) - - # 3. PATH B: External Adaptation (The future) - # Simulation of Orchestrator running the adapter. 
- ef_external = PandasAdapter.from_dataframes(actuals, preds, "target") - - # We use the NativeEvaluator directly to simulate the final state - evaluator = NativeEvaluator(config) - report_external = evaluator.evaluate(ef_external) - - # 4. Bit-wise Parity Check - for schema in ["month", "time_series", "step"]: - df_internal = results_internal[schema][1] - df_external = report_external.to_dataframe(schema) - - pd.testing.assert_frame_equal(df_internal, df_external, - obj=f"Divergence in schema: {schema}") - - print("Parity Proven: External adaptation matches internal adaptation 100%.") - -def test_shadow_verification_mode(): - """Verifies that verify_parity=True catches mismatches and allows matches.""" - index = pd.MultiIndex.from_product([[100], [1]], names=['month', 'unit']) - actuals = pd.DataFrame({'target': [1]}, index=index) - preds = [pd.DataFrame({'pred_target': [0.9]}, index=index)] - config = {'steps': [1], 'regression_targets': ['target'], 'regression_point_metrics': ['MSE']} - - manager = EvaluationManager() - ef_external = PandasAdapter.from_dataframes(actuals, preds, "target") - - # 1. Matching case - should pass silently - manager.evaluate(actuals, preds, "target", config, ef=ef_external, verify_parity=True) - - # 2. Mismatching case - should raise ValueError - ef_corrupted = PandasAdapter.from_dataframes(actuals, preds, "target") - ef_corrupted.y_true = ef_corrupted.y_true * 2 # Corrupt the data - - with pytest.raises(ValueError, match="Parity Failure"): - manager.evaluate(actuals, preds, "target", config, ef=ef_corrupted, verify_parity=True) - diff --git a/tests/test_parity_beige.py b/tests/test_parity_beige.py deleted file mode 100644 index 62d2d00..0000000 --- a/tests/test_parity_beige.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -PHASE-3-DELETE: Parity edge-case tests between the legacy EvaluationManager and native paths. -Will be deleted when Phase 3 of the orchestrator migration is complete. 
-See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md -""" -import pytest -import pandas as pd -import numpy as np -from views_evaluation.evaluation.evaluation_manager import EvaluationManager -from views_evaluation.adapters.pandas import PandasAdapter -from views_evaluation.evaluation.native_evaluator import NativeEvaluator -from tests.test_parity_green import assert_parity - -@pytest.fixture -def beige_data_ragged(): - """Ragged sequences and missing months.""" - index = pd.MultiIndex.from_product([[100, 101, 102, 103], [1, 2]], names=['month', 'unit']) - actual = pd.DataFrame({'target': np.random.rand(8)}, index=index) - - # Sequence 0: Month 100 and 101 (complete) - pred_0 = pd.DataFrame({'pred_target': np.random.rand(4)}, index=index[:4]) - - # Sequence 1: Month 101 and 102, but Month 101 Unit 2 is MISSING - idx_1 = index[2:6].drop((101, 2)) - pred_1 = pd.DataFrame({'pred_target': np.random.rand(3)}, index=idx_1) - - # Sequence 2: Only Month 103 - pred_2 = pd.DataFrame({'pred_target': np.random.rand(2)}, index=index[6:]) - - config = { - 'steps': [1, 2], - 'regression_targets': ['target'], - 'regression_point_metrics': ['MSE'] - } - - return actual, [pred_0, pred_1, pred_2], "target", config - -def test_parity_beige_ragged(beige_data_ragged): - actual, predictions, target, config = beige_data_ragged - - # 1. Run Legacy - manager = EvaluationManager() - legacy_results = manager.evaluate(actual, predictions, target, config) - - # 2. Run Native - ef = PandasAdapter.from_dataframes(actual, predictions, target) - native_evaluator = NativeEvaluator(config) - native_results = native_evaluator.evaluate(ef) - - # 3. 
Assert Parity - assert_parity(legacy_results, native_results) diff --git a/tests/test_parity_green.py b/tests/test_parity_green.py deleted file mode 100644 index 91187cd..0000000 --- a/tests/test_parity_green.py +++ /dev/null @@ -1,101 +0,0 @@ -""" -PHASE-3-DELETE: Parity tests between the legacy EvaluationManager and native paths. -Will be deleted when Phase 3 of the orchestrator migration is complete. -See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md -""" -import pytest -import pandas as pd -from views_evaluation.evaluation.evaluation_manager import EvaluationManager -from views_evaluation.adapters.pandas import PandasAdapter -from views_evaluation.evaluation.native_evaluator import NativeEvaluator - -def assert_parity(legacy_results, native_report, tolerance=1e-9): - """ - Asserts bit-wise (or within tolerance) parity between legacy and native results. - legacy_results: output of EvaluationManager.evaluate() - native_report: EvaluationReport object from NativeEvaluator.evaluate() - """ - for schema in ["month", "time_series", "step"]: - legacy_df = legacy_results[schema][1] - native_df = native_report.to_dataframe(schema) - - # Check index parity - pd.testing.assert_index_equal(legacy_df.index, native_df.index) - - # Check column parity (might be slight differences in names if not careful) - pd.testing.assert_index_equal(legacy_df.columns, native_df.columns) - - # Check value parity - pd.testing.assert_frame_equal(legacy_df, native_df, atol=tolerance) - -@pytest.fixture -def green_data(): - """Clean, overlapping rolling origin data.""" - index = pd.MultiIndex.from_product([[100, 101, 102], [1, 2]], names=['month', 'unit']) - actual = pd.DataFrame({'target': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]}, index=index) - - # Sequence 0: Steps 1-2 for all units - pred_0 = pd.DataFrame({'pred_target': [0.11, 0.21, 0.31, 0.41]}, index=index[:4]) - # Sequence 1: Steps 1-2 for all units, starting from month 101 - pred_1 = pd.DataFrame({'pred_target': 
[0.32, 0.42, 0.52, 0.62]}, index=index[2:]) - - config = { - 'steps': [1, 2], - 'regression_targets': ['target'], - 'regression_point_metrics': ['MSE'] - } - - return actual, [pred_0, pred_1], "target", config - -@pytest.fixture -def green_data_samples(): - """Clean, overlapping rolling origin data with samples.""" - index = pd.MultiIndex.from_product([[100, 101, 102], [1, 2]], names=['month', 'unit']) - actual = pd.DataFrame({'target': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]}, index=index) - - # 3 samples per prediction - pred_0 = pd.DataFrame({ - 'pred_target': [[0.1, 0.12, 0.08], [0.2, 0.22, 0.18], [0.3, 0.32, 0.28], [0.4, 0.42, 0.38]] - }, index=index[:4]) - pred_1 = pd.DataFrame({ - 'pred_target': [[0.31, 0.33, 0.29], [0.41, 0.43, 0.39], [0.51, 0.53, 0.49], [0.61, 0.63, 0.59]] - }, index=index[2:]) - - config = { - 'steps': [1, 2], - 'regression_targets': ['target'], - 'regression_point_metrics': [], - 'regression_sample_metrics': ['CRPS'] - } - - return actual, [pred_0, pred_1], "target", config - -def test_parity_green_happy_path(green_data): - actual, predictions, target, config = green_data - - # 1. Run Legacy - manager = EvaluationManager() - legacy_results = manager.evaluate(actual, predictions, target, config) - - # 2. Run Native (New Path) - ef = PandasAdapter.from_dataframes(actual, predictions, target) - native_evaluator = NativeEvaluator(config) - native_results = native_evaluator.evaluate(ef) - - # 3. 
Assert Parity - assert_parity(legacy_results, native_results) - -def test_parity_green_ignorance(green_data_samples): - actual, predictions, target, config = green_data_samples - # Update config to use Ignorance score - config['regression_sample_metrics'] = ['Ignorance'] - - manager = EvaluationManager() - legacy_results = manager.evaluate(actual, predictions, target, config) - - ef = PandasAdapter.from_dataframes(actual, predictions, target) - native_evaluator = NativeEvaluator(config) - native_results = native_evaluator.evaluate(ef) - - assert_parity(legacy_results, native_results) - diff --git a/tests/test_parity_red.py b/tests/test_parity_red.py deleted file mode 100644 index f217233..0000000 --- a/tests/test_parity_red.py +++ /dev/null @@ -1,120 +0,0 @@ -""" -PHASE-3-DELETE: Parity error-case tests between the legacy EvaluationManager and native paths. -Will be deleted when Phase 3 of the orchestrator migration is complete. -See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md -""" -import pytest -import pandas as pd -import numpy as np -from views_evaluation.evaluation.evaluation_manager import EvaluationManager -from views_evaluation.adapters.pandas import PandasAdapter -from views_evaluation.evaluation.native_evaluator import NativeEvaluator -from tests.test_parity_green import assert_parity - -@pytest.fixture -def red_data_unordered(): - """Predictions are mis-ordered in the DataFrame.""" - index = pd.MultiIndex.from_product([[100, 101], [1, 2]], names=['month', 'unit']) - actual = pd.DataFrame({'target': [0.1, 0.2, 0.3, 0.4]}, index=index) - - # Sequence 0: SHUFFLED rows - shuffled_idx = index[[3, 0, 2, 1]] - pred_0 = pd.DataFrame({'pred_target': [0.41, 0.11, 0.31, 0.21]}, index=shuffled_idx) - - config = { - 'steps': [1, 2], - 'regression_targets': ['target'], - 'regression_point_metrics': ['MSE'] - } - - return actual, [pred_0], "target", config - -@pytest.fixture -def red_data_coordinates(): - """Mismatched coordinates 
(extra units/months).""" - index_actual = pd.MultiIndex.from_product([[100, 101], [1, 2]], names=['month', 'unit']) - actual = pd.DataFrame({'target': [0.1, 0.2, 0.3, 0.4]}, index=index_actual) - - # Extra unit 3 (not in actuals) - index_pred = pd.MultiIndex.from_product([[100, 101], [1, 2, 3]], names=['month', 'unit']) - pred_0 = pd.DataFrame({'pred_target': [0.11, 0.21, 0.99, 0.31, 0.41, 0.99]}, index=index_pred) - - config = { - 'steps': [1, 2], - 'regression_targets': ['target'], - 'regression_point_metrics': ['MSE'] - } - - return actual, [pred_0], "target", config - -@pytest.fixture -def red_data_inconsistent_samples(): - """Ragged sample lengths (e.g. some rows have 2 samples, some have 3).""" - index = pd.MultiIndex.from_product([[100], [1, 2]], names=['month', 'unit']) - actual = pd.DataFrame({'target': [0.1, 0.2]}, index=index) - - # Row 1 has 2 samples, Row 2 has 3 samples - pred_0 = pd.DataFrame({ - 'pred_target': [[0.1, 0.12], [0.2, 0.22, 0.24]] - }, index=index) - - config = { - 'steps': [1], - 'regression_targets': ['target'], - 'regression_point_metrics': [], - 'regression_sample_metrics': ['CRPS'] - } - - return actual, [pred_0], "target", config - -@pytest.fixture -def red_data_nan_index(): - """NaNs in index levels.""" - # Note: Use object dtype for index to allow NaN mixed with ints - idx_actual = pd.MultiIndex.from_tuples([(100, 1), (101, 1), (np.nan, 2)], names=['month', 'unit']) - actual = pd.DataFrame({'target': [0.1, 0.2, 0.3]}, index=idx_actual) - - idx_pred = pd.MultiIndex.from_tuples([(100, 1), (101, 1), (np.nan, 2)], names=['month', 'unit']) - pred_0 = pd.DataFrame({'pred_target': [0.11, 0.21, 0.31]}, index=idx_pred) - - config = { - 'steps': [1, 2], - 'regression_targets': ['target'], - 'regression_point_metrics': ['MSE'] - } - - return actual, [pred_0], "target", config - -def test_parity_red_unordered(red_data_unordered): - actual, predictions, target, config = red_data_unordered - manager = EvaluationManager() - legacy_results = 
manager.evaluate(actual, predictions, target, config) - ef = PandasAdapter.from_dataframes(actual, predictions, target) - native_evaluator = NativeEvaluator(config) - native_results = native_evaluator.evaluate(ef) - assert_parity(legacy_results, native_results) - -def test_parity_red_coordinates(red_data_coordinates): - actual, predictions, target, config = red_data_coordinates - manager = EvaluationManager() - legacy_results = manager.evaluate(actual, predictions, target, config) - ef = PandasAdapter.from_dataframes(actual, predictions, target) - native_evaluator = NativeEvaluator(config) - native_results = native_evaluator.evaluate(ef) - assert_parity(legacy_results, native_results) - -def test_fail_loud_inconsistent_samples(red_data_inconsistent_samples): - actual, predictions, target, config = red_data_inconsistent_samples - manager = EvaluationManager() - # The new implementation raises a more descriptive error from the adapter - with pytest.raises(ValueError, match="Inconsistent list lengths"): - manager.evaluate(actual, predictions, target, config) - - -def test_fail_loud_nan_index(red_data_nan_index): - actual, predictions, target, config = red_data_nan_index - manager = EvaluationManager() - # The new implementation fails early in the adapter if NaNs are detected - with pytest.raises(ValueError, match="NaN detected in 'time' index level"): - manager.evaluate(actual, predictions, target, config) - diff --git a/views_evaluation/__init__.py b/views_evaluation/__init__.py index 64c263b..91a728e 100644 --- a/views_evaluation/__init__.py +++ b/views_evaluation/__init__.py @@ -1,6 +1,4 @@ -# ── Permanent public API ───────────────────────────────────────────────────── -# These classes are the stable, long-term interface of this library. -# They will remain after Phase 3 of the orchestrator migration. 
+# ── Public API ──────────────────────────────────────────────────────────────── from views_evaluation.evaluation.evaluation_frame import EvaluationFrame from views_evaluation.evaluation.native_evaluator import NativeEvaluator from views_evaluation.evaluation.evaluation_report import EvaluationReport @@ -13,16 +11,7 @@ from views_evaluation.evaluation.config_schema import EvaluationConfig from views_evaluation.profiles import PROFILES -# ── Temporary (PHASE-3-DELETE) ──────────────────────────────────────────────── -# These classes exist for backward compatibility and parity testing while the -# orchestrator migration (ADR-011, report 10) completes in views-pipeline-core. -# They will be removed once upstream parity is confirmed. Do not build new -# integrations on them. -from views_evaluation.evaluation.evaluation_manager import EvaluationManager -from views_evaluation.adapters.pandas import PandasAdapter - __all__ = [ - # Permanent "EvaluationFrame", "NativeEvaluator", "EvaluationReport", @@ -32,7 +21,4 @@ "resolve_metric_params", "EvaluationConfig", "PROFILES", - # Temporary β€” PHASE-3-DELETE - "EvaluationManager", - "PandasAdapter", ] diff --git a/views_evaluation/adapters/pandas.py b/views_evaluation/adapters/pandas.py deleted file mode 100644 index 5941578..0000000 --- a/views_evaluation/adapters/pandas.py +++ /dev/null @@ -1,150 +0,0 @@ -""" -PHASE-3-DELETE -This module is TEMPORARY and will be deleted in Phase 3 of the orchestrator migration. -See: reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md - -After Phase 3: - - Adapters live in views-pipeline-core (or in the calling repository) - - This repo has no knowledge of any specific data framework - - This file will not exist in this repository - -Do not add new functionality to this file. 
-""" -import warnings -import numpy as np -import pandas as pd -from typing import List -from views_evaluation.evaluation.evaluation_frame import EvaluationFrame - -class PandasAdapter: - """ - Adapter to convert Pandas DataFrames into the native EvaluationFrame. - - This class 'knows' about Pandas, allowing the rest of the core - to remain pure. - """ - - @staticmethod - def from_dataframes( - actual: pd.DataFrame, - predictions: List[pd.DataFrame], - target: str, - ) -> EvaluationFrame: - """ - Convert the current List[DataFrame] structure into a single EvaluationFrame. - - Args: - actual: DataFrame with MultiIndex [time, unit] - predictions: List of DataFrames with MultiIndex [time, unit] - target: The name of the target column - """ - - warnings.warn( - "PandasAdapter is deprecated and will be removed from this repo in Phase 3. " - "Adapters belong in the calling repository (e.g. views-pipeline-core).", - DeprecationWarning, - stacklevel=2, - ) - all_y_true = [] - all_y_pred = [] - all_times = [] - all_units = [] - all_origins = [] - all_steps = [] - - pred_col = f"pred_{target}" - - if target not in actual.columns: - raise KeyError(f"Target column '{target}' not found in actuals.") - - if not predictions: - # Align with legacy expected error message - raise ValueError("No objects to concatenate") - - for i, df in enumerate(predictions): - # 1. Align/Match Actuals (duplicated logic from EvaluationManager) - common_idx = actual.index.intersection(df.index) - if common_idx.empty: - continue - - matched_pred = df.loc[common_idx] - matched_actual = actual.loc[common_idx, target] - - # 2. 
Extract Data - # Note: We assume all cells have the same number of samples - # This is where we explode the 'list-in-cell' - sample_lists = matched_pred[pred_col].tolist() - - # ADR-012: Validate rectangular samples - lengths = [len(x) if isinstance(x, (list, np.ndarray)) else 1 for x in sample_lists] - if len(set(lengths)) > 1: - # Align with legacy expected error message - raise ValueError( - f"Inconsistent list lengths in sample evaluation. " - f"Found lengths {set(lengths)}" - ) - - samples = np.array(sample_lists) - if samples.ndim == 1: # Point forecasts - samples = samples.reshape(-1, 1) - - n_rows = len(matched_actual) - - # Legacy Actuals might be list-like (e.g. [0.1]) - actual_vals = matched_actual.values - if actual_vals.dtype == object: - # Coerce to scalars - actual_vals = np.array([ - x[0] if isinstance(x, (list, np.ndarray)) and len(x) > 0 else x - for x in actual_vals - ]) - - all_y_true.append(actual_vals) - all_y_pred.append(samples) - - # 3. Extract Identifiers - times = matched_pred.index.get_level_values(0).values - units = matched_pred.index.get_level_values(1).values - - # ADR-012: No NaNs in identifiers - if np.any(pd.isna(times)): - raise ValueError(f"NaN detected in 'time' index level of sequence {i}.") - if np.any(pd.isna(units)): - raise ValueError(f"NaN detected in 'unit' index level of sequence {i}.") - - all_times.append(times) - all_units.append(units) - - # 4. 
Synthesize Origin and Step - # Origin is the list index - all_origins.append(np.full(n_rows, i)) - - # Step is positional lead-time per unique month in the sequence - unique_times = matched_pred.index.get_level_values(0).unique() - time_to_step = {t: step_idx + 1 for step_idx, t in enumerate(unique_times)} - steps = np.array([time_to_step[t] for t in times]) - all_steps.append(steps) - - if not all_y_true: - # ADR-013: Fail-Loud on zero overlap - raise ValueError("need at least one array to concatenate") - - # ADR-012: Ensure all sequences have consistent sample counts - sample_counts = [y.shape[1] for y in all_y_pred] - if len(set(sample_counts)) > 1: - raise ValueError( - "Mix of evaluation types detected: some sequences contain point forecasts, others contain samples. " - "Please ensure all sequences are consistent in their evaluation type." - ) - - return EvaluationFrame( - y_true=np.concatenate(all_y_true), - y_pred=np.concatenate(all_y_pred), - identifiers={ - 'time': np.concatenate(all_times), - 'unit': np.concatenate(all_units), - 'origin': np.concatenate(all_origins), - 'step': np.concatenate(all_steps), - }, - metadata={'target': target} - ) diff --git a/views_evaluation/evaluation/deprecation_msgs.py b/views_evaluation/evaluation/deprecation_msgs.py deleted file mode 100644 index dcbbbc7..0000000 --- a/views_evaluation/evaluation/deprecation_msgs.py +++ /dev/null @@ -1,40 +0,0 @@ - -import warnings - -def raise_legacy_scale_msg() -> None: - - """ - Emit a highly visible warning banner for legacy scale-detection behavior - that should eventually be removed, but does not currently break execution. - """ - - default_msg = """ -Currently, the evaluation package infers target scaling (e.g. log, linear) -from the target variable name (lr_, ln_, lx_). - -This is problematic because: - -1) Target scaling is a MODEL parameter and must live with the model, - not be inferred from target names. 
- -2) Adding new scales would require updating a hard-coded list in the - evaluation package, which is brittle and volatile. - -3) Target prefixes (lr_, ln_, lx_) are not guarantees of scaling β€” - at best they are hints, and can lead to silent errors. - -As such, this behavior should be removed. -Targets should always be assumed unscaled. -""" - - banner = ( - "\n" - + "#" * 78 + "\n" - + "#{:^76}#\n".format("LEGACY SCALE DETECTION β€” SHOULD BE REMOVED") - + "#" * 78 + "\n" - + (default_msg).strip() + "\n" - + "#" * 78 - ) - - # Use UserWarning so it is always shown (DeprecationWarning is often suppressed) - warnings.warn(banner, UserWarning, stacklevel=2) diff --git a/views_evaluation/evaluation/evaluation_frame.py b/views_evaluation/evaluation/evaluation_frame.py index f8cacb5..90f955e 100644 --- a/views_evaluation/evaluation/evaluation_frame.py +++ b/views_evaluation/evaluation/evaluation_frame.py @@ -27,7 +27,13 @@ def _validate(y_true: np.ndarray, y_pred: np.ndarray, identifiers: Dict[str, np. n_rows = len(y_true) if y_pred.shape[0] != n_rows: raise ValueError(f"y_pred rows ({y_pred.shape[0]}) mismatch y_true ({n_rows})") - + + # Rectangular sample validation: y_pred must be a dense 2D array + if y_pred.ndim != 2: + raise ValueError( + f"y_pred must be 2D (N, S), got {y_pred.ndim}D with shape {y_pred.shape}" + ) + # ADR-013: Fail-Loud on corrupted numerical data # Align with legacy test expectations for error messages def check_corrupted(arr, name): diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py deleted file mode 100644 index 8da94cb..0000000 --- a/views_evaluation/evaluation/evaluation_manager.py +++ /dev/null @@ -1,733 +0,0 @@ -""" -PHASE-3-DELETE -This module is TEMPORARY and will be deleted in Phase 3 of the orchestrator migration. 
-See: reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md - -After Phase 3: - - Adapters live in views-pipeline-core (or in the calling repository) - - EvaluationManager is fully replaced by NativeEvaluator in pipeline-core - - This file will not exist in this repository - -Do not add new functionality to this file. -""" -from typing import List, Tuple -import logging -import warnings -import pandas as pd -import numpy as np -from views_evaluation.adapters.pandas import PandasAdapter -from views_evaluation.evaluation.native_evaluator import NativeEvaluator -from views_evaluation.evaluation.evaluation_frame import EvaluationFrame -from views_evaluation.evaluation.metrics import ( - BaseEvaluationMetrics, -) -from views_evaluation.evaluation.native_metric_calculators import ( - REGRESSION_POINT_NATIVE, - REGRESSION_SAMPLE_NATIVE, - CLASSIFICATION_POINT_NATIVE, - CLASSIFICATION_SAMPLE_NATIVE, -) - -logger = logging.getLogger(__name__) - - -class EvaluationManager: - """ - A class for calculating metrics on time series predictions - Refer to https://github.com/prio-data/views_pipeline/blob/eval_docs/documentation/evaluation/schema.MD for more details on three evaluation schemas. - """ - - def __init__(self): - """ - Initialize the EvaluationManager. - - Metrics to compute and targets to evaluate are declared in the config - passed to evaluate(). No metric list is accepted here. - """ - - warnings.warn( - "EvaluationManager is deprecated and will be removed in Phase 3 of the " - "orchestrator migration. Use NativeEvaluator directly with an adapter. 
" - "See documentation/integration_guide.md.", - DeprecationWarning, - stacklevel=2, - ) - self.regression_point_functions = REGRESSION_POINT_NATIVE - self.regression_sample_functions = REGRESSION_SAMPLE_NATIVE - self.classification_point_functions = CLASSIFICATION_POINT_NATIVE - self.classification_sample_functions = CLASSIFICATION_SAMPLE_NATIVE - - - @staticmethod - def transform_data(df: pd.DataFrame, target: str | list[str]) -> pd.DataFrame: - """ - DEPRECATED. Apply legacy inverse transformations based on target name prefix. - - This method will be removed once all model repos have migrated to returning - predictions on the original scale. Do not add new logic here. - """ - if isinstance(target, str): - target = [target] - for t in target: - if t.startswith("ln") or t.startswith("pred_ln"): - df[[t]] = df[[t]].applymap(lambda x: np.exp(x) - 1) - elif t.startswith("lx") or t.startswith("pred_lx"): - df[[t]] = df[[t]].applymap(lambda x: np.exp(x) - np.exp(100)) - elif t.startswith("lr") or t.startswith("pred_lr"): - pass # identity β€” lr_ targets are already on the original scale - else: - logger.warning( - f"transform_data: unrecognised prefix for target '{t}'. " - "Applying identity (no transformation). " - "If this target requires inverse transformation it must be applied " - "by the model manager before calling evaluate(). " - "This fallback will be removed when transform_data is deprecated." - ) - return df - - @staticmethod - def convert_to_array(df: pd.DataFrame, target: str | list[str]) -> pd.DataFrame: - """ - Convert columns in a DataFrame to numpy arrays. - - Args: - df (pd.DataFrame): The input DataFrame with columns that may contain lists. - - Returns: - pd.DataFrame: A new DataFrame with columns converted to numpy arrays. 
- """ - converted = df.copy() - if isinstance(target, str): - target = [target] - - for t in target: - converted[t] = converted[t].apply( - lambda x: ( - x - if isinstance(x, np.ndarray) - else (np.array(x) if isinstance(x, list) else np.array([x])) - ) - ) - return converted - - @staticmethod - def convert_to_scalar(df: pd.DataFrame, target: str | list[str]) -> pd.DataFrame: - """ - Convert columns in a DataFrame to scalar values by taking the mean of the list. - """ - converted = df.copy() - if isinstance(target, str): - target = [target] - for t in target: - converted[t] = converted[t].apply( - lambda x: np.mean(x) if isinstance(x, (list, np.ndarray)) else x - ) - return converted - - @staticmethod - def get_evaluation_type(predictions: List[pd.DataFrame], target: str) -> bool: - """ - Validates the values in each DataFrame in the list. - The return value indicates whether all DataFrames are for sample evaluation. - - Args: - predictions (List[pd.DataFrame]): A list of DataFrames to check. - - Returns: - bool: True if all DataFrames are for sample evaluation, - False if all DataFrame are for point evaluation. - - Raises: - ValueError: If there is a mix of single and multiple values in the lists, - or if uncertainty lists have different lengths. - """ - is_sample = False - is_point = False - sample_length = None - - for df in predictions: - for value in df[target].values.flatten(): - if not (isinstance(value, np.ndarray) or isinstance(value, list)): - raise ValueError( - "All values must be lists or numpy arrays. Convert the data." - ) - - if len(value) > 1: - is_sample = True - # For sample evaluation, check that all lists have the same length - if sample_length is None: - sample_length = len(value) - elif len(value) != sample_length: - raise ValueError( - f"Inconsistent list lengths in sample evaluation. 
" - f"Found lengths {sample_length} and {len(value)}" - ) - elif len(value) == 1: - is_point = True - else: - raise ValueError("Empty lists are not allowed") - - if is_sample and is_point: - raise ValueError( - "Mix of evaluation types detected: some rows contain single values, others contain multiple values. " - "Please ensure all rows are consistent in their evaluation type" - ) - - return is_sample - - @staticmethod - def validate_predictions(predictions: List[pd.DataFrame], target: str): - """ - Checks if the predictions are valid DataFrames. - - Each DataFrame must have exactly one column named `pred_column_name`. - - Args: - predictions (List[pd.DataFrame]): A list of DataFrames containing the predictions. - target (str): The target column in the actual DataFrame. - """ - pred_column_name = f"pred_{target}" - - if not isinstance(predictions, list): - raise TypeError("Predictions must be a list of DataFrames.") - - for i, df in enumerate(predictions): - - if not isinstance(df, pd.DataFrame): - raise TypeError(f"Predictions[{i}] must be a DataFrame.") - - if df.empty: - raise ValueError(f"Predictions[{i}] must not be empty.") - - if len(df.columns) != 1: - raise ValueError( - f"Predictions[{i}] must contain exactly one column, but found {len(df.columns)}: {list(df.columns)}" # <-------- - ) - - if pred_column_name not in df.columns: - raise ValueError( - f"Predictions[{i}] must contain the column named '{pred_column_name}'. Columns found: {list(df.columns)}" - ) - - @staticmethod - def _match_actual_pred( - actual: pd.DataFrame, pred: pd.DataFrame, target: str - ) -> Tuple[pd.DataFrame, pd.DataFrame]: - """ - Matches the actual and predicted DataFrames based on the index and target column. - - Parameters: - - actual: pd.DataFrame with a MultiIndex (e.g., month, level). - - pred: pd.DataFrame with a MultiIndex that may contain duplicated indices. - - target: str, the target column in actual. - - Returns: - - matched_actual: pd.DataFrame aligned with pred. 
- - matched_pred: pd.DataFrame aligned with actual. - """ - actual_target = actual[[target]] - common_indices = actual_target.index.intersection(pred.index) - matched_pred = pred[pred.index.isin(common_indices)].copy() - - # Create matched_actual by reindexing actual_target to match pred's index structure - # This will duplicate rows in actual where pred has duplicate indices - matched_actual = actual_target.reindex(matched_pred.index) - - matched_actual = matched_actual.sort_index() - matched_pred = matched_pred.sort_index() - - return matched_actual, matched_pred - - - @staticmethod - def _split_dfs_by_step(dfs: list) -> list: - """Β¨ - This function splits a list of DataFrames into a list of DataFrames by step, where the key is the step. - For example, assume df0 has month_id from 100 to 102, df1 has month_id from 101 to 103, and df2 has month_id from 102 to 104. - This function returns a list of three dataframes, with the first dataframe having month_id 100 from df0, month_id 101 from df1, and month_id 102 from df; - the second dataframe having month_id 101 from df0, month_id 102 from df1, and month_id 103 from df2; and the third dataframe having month_id 102 from df1 and month_id 104 from df2. - - Args: - dfs (list): List of DataFrames with overlapping time ranges. - - Returns: - dict (list): A list of DataFrames where each contains one unique month_id from each input DataFrame. - """ - time_id = dfs[0].index.names[0] - all_month_ids = [df.index.get_level_values(0).unique() for df in dfs] - - grouped_month_ids = list(zip(*all_month_ids)) - - result_dfs = [] - for group in grouped_month_ids: - combined = pd.concat( - [df.loc[month_id] for df, month_id in zip(dfs, group)], - keys=group, - names=[time_id], - ) - result_dfs.append(combined) - - return result_dfs - - def _process_data( - self, actual: pd.DataFrame, predictions: List[pd.DataFrame], target: str - ): - """ - Process the data for evaluation. 
- """ - actual = EvaluationManager.transform_data( - EvaluationManager.convert_to_array(actual, target), target - ) - predictions = [ - EvaluationManager.transform_data( - EvaluationManager.convert_to_array(pred, f"pred_{target}"), - f"pred_{target}", - ) - for pred in predictions - ] - return actual, predictions - - @staticmethod - def _normalise_config(config: dict) -> dict: - """ - Translate legacy config keys to canonical keys, warning loudly. - - Legacy key 'targets' β†’ 'regression_targets' - Legacy key 'metrics' β†’ 'regression_point_metrics' - Legacy key 'regression_uncertainty_metrics' β†’ 'regression_sample_metrics' - Legacy key 'classification_uncertainty_metrics' β†’ 'classification_sample_metrics' - """ - canonical = config.copy() - if "targets" in config and "regression_targets" not in config: - logger.warning( - "Config key 'targets' is DEPRECATED and will be rejected in a future " - "version. It has been treated as 'regression_targets'. " - "Update your config." - ) - canonical["regression_targets"] = canonical.pop("targets") - if "metrics" in config and "regression_point_metrics" not in config: - logger.warning( - "Config key 'metrics' is DEPRECATED and will be rejected in a future " - "version. It has been treated as 'regression_point_metrics'. " - "Update your config." - ) - canonical["regression_point_metrics"] = canonical.pop("metrics") - - if "regression_uncertainty_metrics" in config and "regression_sample_metrics" not in config: - logger.warning( - "Config key 'regression_uncertainty_metrics' is DEPRECATED and will be rejected in a future " - "version. It has been treated as 'regression_sample_metrics'. " - "Update your config." 
- ) - canonical["regression_sample_metrics"] = canonical.pop("regression_uncertainty_metrics") - - if "classification_uncertainty_metrics" in config and "classification_sample_metrics" not in config: - logger.warning( - "Config key 'classification_uncertainty_metrics' is DEPRECATED and will be rejected in a future " - "version. It has been treated as 'classification_sample_metrics'. " - "Update your config." - ) - canonical["classification_sample_metrics"] = canonical.pop("classification_uncertainty_metrics") - - return canonical - - @staticmethod - def _validate_config(config: dict) -> None: - """ - Fail loud and fast on an invalid or incomplete config. - - Raises KeyError if required keys are absent. - """ - if "steps" not in config: - raise KeyError("Config must contain 'steps'.") - has_regression = bool(config.get("regression_targets")) - has_classification = bool(config.get("classification_targets")) - if not has_regression and not has_classification: - raise KeyError( - "Config must declare at least one of 'regression_targets' or " - "'classification_targets'." - ) - if has_regression and not ( - config.get("regression_point_metrics") or config.get("regression_sample_metrics") - ): - raise KeyError( - "Config declares 'regression_targets' but has neither " - "'regression_point_metrics' nor 'regression_sample_metrics'." - ) - if has_classification and not ( - config.get("classification_point_metrics") or config.get("classification_sample_metrics") - ): - raise KeyError( - "Config declares 'classification_targets' but has neither " - "'classification_point_metrics' nor 'classification_sample_metrics'." 
- ) - - # Validate that metrics are valid for the task type (ADR-014) - from views_evaluation.evaluation.native_metric_calculators import ( - REGRESSION_POINT_NATIVE, REGRESSION_SAMPLE_NATIVE, - CLASSIFICATION_POINT_NATIVE, CLASSIFICATION_SAMPLE_NATIVE - ) - - for metric in config.get("regression_point_metrics", []): - if metric not in REGRESSION_POINT_NATIVE or metric == "AP": - raise ValueError(f"Metric '{metric}' is not valid for regression point tasks.") - - for metric in config.get("regression_sample_metrics", []): - if metric not in REGRESSION_SAMPLE_NATIVE: - raise ValueError(f"Metric '{metric}' is not valid for regression sample tasks.") - for metric in config.get("classification_point_metrics", []): - if metric not in CLASSIFICATION_POINT_NATIVE: - raise ValueError(f"Metric '{metric}' is not valid for classification point tasks.") - for metric in config.get("classification_sample_metrics", []): - if metric not in CLASSIFICATION_SAMPLE_NATIVE: - raise ValueError(f"Metric '{metric}' is not valid for classification sample tasks.") - - - def step_wise_evaluation( - self, - actual: pd.DataFrame, - predictions: List[pd.DataFrame], - target: str, - steps: List[int], - metrics_list: List[str], - metric_functions: dict, - metrics_cls: type, - **kwargs, - ): - """ - Evaluates the predictions step-wise and calculates the specified metrics. - - Args: - actual (pd.DataFrame): The actual values. - predictions (List[pd.DataFrame]): A list of DataFrames containing the predictions. - target (str): The target column in the actual DataFrame. - steps (List[int]): The steps to evaluate. - metrics_list (List[str]): Metrics to compute, declared in config. - metric_functions (dict): Dispatch dict for the resolved task/pred type. - metrics_cls (type): Dataclass to use for result storage. - - Returns: - Tuple: A tuple containing the evaluation dictionary and the evaluation DataFrame. 
- """ - evaluation_dict = metrics_cls.make_step_wise_evaluation_dict(steps=max(steps)) - result_dfs = EvaluationManager._split_dfs_by_step(predictions) - - step_matched_data = {} - for i, pred in enumerate(result_dfs): - step = i + 1 - matched_actual, matched_pred = EvaluationManager._match_actual_pred( - actual, pred, target - ) - step_matched_data[step] = (matched_actual, matched_pred) - - for metric in metrics_list: - for step, (matched_actual, matched_pred) in step_matched_data.items(): - evaluation_dict[f"step{str(step).zfill(2)}"].__setattr__( - metric, - metric_functions[metric]( - matched_actual, matched_pred, target, **kwargs - ), - ) - - return ( - evaluation_dict, - metrics_cls.evaluation_dict_to_dataframe(evaluation_dict), - ) - - def time_series_wise_evaluation( - self, - actual: pd.DataFrame, - predictions: List[pd.DataFrame], - target: str, - metrics_list: List[str], - metric_functions: dict, - metrics_cls: type, - **kwargs, - ): - """ - Evaluates the predictions time series-wise and calculates the specified metrics. - - Args: - actual (pd.DataFrame): The actual values. - predictions (List[pd.DataFrame]): A list of DataFrames containing the predictions. - target (str): The target column in the actual DataFrame. - metrics_list (List[str]): Metrics to compute, declared in config. - metric_functions (dict): Dispatch dict for the resolved task/pred type. - metrics_cls (type): Dataclass to use for result storage. - - Returns: - Tuple: A tuple containing the evaluation dictionary and the evaluation DataFrame. 
- """ - evaluation_dict = metrics_cls.make_time_series_wise_evaluation_dict( - len(predictions) - ) - - ts_matched_data = {} - for i, pred in enumerate(predictions): - matched_actual, matched_pred = EvaluationManager._match_actual_pred( - actual, pred, target - ) - ts_matched_data[i] = (matched_actual, matched_pred) - - for metric in metrics_list: - for i, (matched_actual, matched_pred) in ts_matched_data.items(): - evaluation_dict[f"ts{str(i).zfill(2)}"].__setattr__( - metric, - metric_functions[metric]( - matched_actual, matched_pred, target, **kwargs - ), - ) - - return ( - evaluation_dict, - metrics_cls.evaluation_dict_to_dataframe(evaluation_dict), - ) - - def month_wise_evaluation( - self, - actual: pd.DataFrame, - predictions: List[pd.DataFrame], - target: str, - metrics_list: List[str], - metric_functions: dict, - metrics_cls: type, - **kwargs, - ): - """ - Evaluates the predictions month-wise and calculates the specified metrics. - - Args: - actual (pd.DataFrame): The actual values. - predictions (List[pd.DataFrame]): A list of DataFrames containing the predictions. - target (str): The target column in the actual DataFrame. - metrics_list (List[str]): Metrics to compute, declared in config. - metric_functions (dict): Dispatch dict for the resolved task/pred type. - metrics_cls (type): Dataclass to use for result storage. - - Returns: - Tuple: A tuple containing the evaluation dictionary and the evaluation DataFrame. 
- """ - pred_concat = pd.concat(predictions) - month_range = pred_concat.index.get_level_values(0).unique() - month_start = int(month_range.min()) - month_end = int(month_range.max()) - - evaluation_dict = metrics_cls.make_month_wise_evaluation_dict( - month_start, month_end - ) - - matched_actual, matched_pred = EvaluationManager._match_actual_pred( - actual, pred_concat, target - ) - - g = matched_pred.groupby(level=matched_pred.index.names[0], sort=False, observed=True) - groups = g.indices # dict: {month -> np.ndarray of row positions} - - for metric in metrics_list: - for month, pos in groups.items(): - value = metric_functions[metric]( - matched_actual.iloc[pos], - matched_pred.iloc[pos], - target, - **kwargs, - ) - evaluation_dict[f"month{str(month)}"].__setattr__(metric, value) - - return ( - evaluation_dict, - metrics_cls.evaluation_dict_to_dataframe(evaluation_dict), - ) - - def evaluate( - self, - actual: pd.DataFrame = None, - predictions: List[pd.DataFrame] = None, - target: str = None, - config: dict = None, - ef: EvaluationFrame = None, - verify_parity: bool = False, - **kwargs, - ): - """ - Evaluate predictions. Supports legacy DataFrame inputs OR Native EvaluationFrame. - - Args: - actual (pd.DataFrame): Optional. Legacy actuals. - predictions (List[pd.DataFrame]): Optional. Legacy predictions. - target (str): Target column name. - config (dict): Evaluation configuration. - ef (EvaluationFrame): Optional. Pre-adapted native frame. - verify_parity (bool): If True and both ef and legacy inputs are provided, - verifies bit-wise parity between them. 
- """ - config = EvaluationManager._normalise_config(config) - EvaluationManager._validate_config(config) - - if ef is not None: - # PATH B: Direct Native Evaluation - if not isinstance(ef, EvaluationFrame): - raise TypeError("Provided 'ef' must be an EvaluationFrame instance.") - target = ef.metadata.get('target', target) - - if verify_parity and actual is not None and predictions is not None: - # ADR-024 Shadow Run: Verify external adaptation matches internal - ef_internal = PandasAdapter.from_dataframes(actual, predictions, target) - # Check data parity - if not np.array_equal(ef.y_true, ef_internal.y_true): - raise ValueError("Parity Failure: y_true mismatch between external and internal adaptation.") - if not np.array_equal(ef.y_pred, ef_internal.y_pred): - raise ValueError("Parity Failure: y_pred mismatch between external and internal adaptation.") - for key in ef.identifiers: - if not np.array_equal(ef.identifiers[key], ef_internal.identifiers[key]): - raise ValueError(f"Parity Failure: identifier '{key}' mismatch.") - else: - # PATH A: Legacy Adaptation - if actual is None or predictions is None or target is None: - raise ValueError("If 'ef' is not provided, 'actual', 'predictions', and 'target' are required.") - - EvaluationManager.validate_predictions(predictions, target) - # ADR-010: Adapt legacy DataFrames to canonical EvaluationFrame - ef = PandasAdapter.from_dataframes(actual, predictions, target) - - # Restore internal state for backward compatibility with reflective tests - self.actual, self.predictions = self._process_data(actual, predictions, target) - - self.is_sample = ef.is_sample - - - # ADR-010: Delegate to the NativeEvaluator (Pure Math Engine) - evaluator = NativeEvaluator(config) - - # Phase 1: Enable legacy compatibility to maintain bit-wise parity - # (Reproduces truncation bugs and positional step assumptions) - try: - report = evaluator.evaluate(ef, legacy_compatibility=True) - # Map report back to legacy dictionary structure for 
backward compatibility - return { - schema: (report.get_schema_results(schema), report.to_dataframe(schema)) - for schema in ["month", "time_series", "step"] - } - except ValueError as e: - # Re-wrap error message to match legacy test expectations if needed - if "Target" in str(e) and "not found in config" in str(e): - raise ValueError(f"Target '{target}' is not declared in config") - raise e - - - - - @staticmethod - def filter_step_wise_evaluation( - step_wise_evaluation_results: dict, - filter_steps: list[int] = [1, 3, 6, 12, 36], - ): - """ - Filter step-wise evaluation results to include only specific steps. - - Args: - step_wise_evaluation_results (dict): The step-wise evaluation results containing evaluation dict and DataFrame. - filter_steps (list[int]): List of step numbers to include in the filtered results. Defaults to [1, 3, 6, 12, 36]. - - Returns: - dict: A dictionary containing the filtered evaluation dictionary and DataFrame for the selected steps. - """ - step_wise_evaluation_dict = step_wise_evaluation_results[0] - step_wise_evaluation_df = step_wise_evaluation_results[1] - - selected_keys = [f"step{str(step).zfill(2)}" for step in filter_steps] - - filtered_evaluation_dict = { - key: step_wise_evaluation_dict[key] - for key in selected_keys - if key in step_wise_evaluation_dict - } - - filtered_evaluation_df = step_wise_evaluation_df.loc[ - step_wise_evaluation_df.index.isin(selected_keys) - ] - - return (filtered_evaluation_dict, filtered_evaluation_df) - - @staticmethod - def aggregate_month_wise_evaluation( - month_wise_evaluation_results: dict, - aggregation_period: int = 6, - aggregation_type: str = "mean", - ): - """ - Aggregate month-wise evaluation results by grouping months into periods and applying aggregation. - - Args: - month_wise_evaluation_results (dict): The month-wise evaluation results containing evaluation dict and DataFrame. - aggregation_period (int): Number of months to group together for aggregation. 
- aggregation_type (str): Type of aggregation to apply. - Returns: - dict: A dictionary containing the aggregated evaluation dictionary and DataFrame. - """ - month_wise_evaluation_dict = month_wise_evaluation_results[0] - month_wise_evaluation_df = month_wise_evaluation_results[1] - - available_months = [ - int(month.replace("month", "")) for month in month_wise_evaluation_df.index - ] - available_months.sort() - - if len(available_months) < aggregation_period: - raise ValueError( - f"Not enough months to aggregate. Available months: {available_months}, aggregation period: {aggregation_period}" - ) - - aggregated_dict = {} - aggregated_data = [] - - for i in range(0, len(available_months), aggregation_period): - period_months = available_months[i : i + aggregation_period] - period_start = period_months[0] - period_end = period_months[-1] - period_key = f"month_{period_start}_{period_end}" - - period_metrics = [] - for month in period_months: - month_key = f"month{month}" - if month_key in month_wise_evaluation_dict: - period_metrics.append(month_wise_evaluation_dict[month_key]) - - if period_metrics: - aggregated_metrics = {} - for metric_name in period_metrics[0].__annotations__.keys(): - metric_values = [ - getattr(metric, metric_name) - for metric in period_metrics - if getattr(metric, metric_name) is not None - ] - - if metric_values: - if aggregation_type == "mean": - aggregated_value = np.mean(metric_values) - elif aggregation_type == "median": - aggregated_value = np.median(metric_values) - else: - raise ValueError( - f"Unsupported aggregation type: {aggregation_type}" - ) - - aggregated_metrics[metric_name] = aggregated_value - else: - aggregated_metrics[metric_name] = None - - if hasattr(period_metrics[0], "__class__"): - aggregated_eval_metrics = period_metrics[0].__class__( - **aggregated_metrics - ) - else: - aggregated_eval_metrics = aggregated_metrics - - aggregated_dict[period_key] = aggregated_eval_metrics - - aggregated_data.append({"month_id": 
period_key, **aggregated_metrics}) - - if aggregated_data: - aggregated_df = BaseEvaluationMetrics.evaluation_dict_to_dataframe( - aggregated_dict - ) - - return (aggregated_dict, aggregated_df) diff --git a/views_evaluation/evaluation/metric_catalog.py b/views_evaluation/evaluation/metric_catalog.py index 861cfeb..7eae282 100644 --- a/views_evaluation/evaluation/metric_catalog.py +++ b/views_evaluation/evaluation/metric_catalog.py @@ -31,10 +31,13 @@ calculate_coverage_native, calculate_mean_interval_score_native, calculate_ignorance_score_native, + calculate_brier_sample_native, + calculate_brier_point_native, + calculate_qs_sample_native, + calculate_qs_point_native, calculate_sd_native, calculate_pEMDiv_native, calculate_variogram_native, - calculate_brier_native, calculate_jeffreys_native, ) @@ -81,20 +84,29 @@ class MetricSpec: "Ignorance": MetricSpec(function=calculate_ignorance_score_native, genome=("bins", "low_bin", "high_bin")), + # ── Quantile Score (Pinball Loss) ──────────────────────────────────── + "QS_sample": MetricSpec(function=calculate_qs_sample_native, genome=("quantile",)), + "QS_point": MetricSpec(function=calculate_qs_point_native, genome=("quantile",)), + + # ── Brier Score ─────────────────────────────────────────────────────── + "Brier_sample": MetricSpec(function=calculate_brier_sample_native, genome=("threshold",)), + "Brier_point": MetricSpec(function=calculate_brier_point_native, genome=("threshold",)), + # ── Classification ──────────────────────────────────────────────────── "AP": MetricSpec(function=calculate_ap_native, genome=()), - "Brier": MetricSpec(function=calculate_brier_native, genome=(), implemented=False), "Jeffreys": MetricSpec(function=calculate_jeffreys_native, genome=(), implemented=False), } METRIC_MEMBERSHIP: Dict[Tuple[str, str], set] = { ("regression", "point"): {"MSE", "MSLE", "RMSLE", "EMD", "Pearson", "MTD", - "y_hat_bar", "MCR_point", "SD", "pEMDiv", "Variogram"}, - ("regression", "sample"): {"CRPS", 
"twCRPS", "MIS", "QIS", "Coverage", - "Ignorance", "y_hat_bar", "MCR_sample"}, - ("classification", "point"): {"AP"}, - ("classification", "sample"): {"CRPS", "twCRPS", "Brier", "Jeffreys"}, + "y_hat_bar", "MCR_point", "QS_point", + "SD", "pEMDiv", "Variogram"}, + ("regression", "sample"): {"CRPS", "twCRPS", "MIS", "QIS", "QS_sample", + "Coverage", "Ignorance", + "y_hat_bar", "MCR_sample"}, + ("classification", "point"): {"AP", "Brier_point"}, + ("classification", "sample"): {"CRPS", "twCRPS", "Brier_sample", "Jeffreys"}, } diff --git a/views_evaluation/evaluation/metrics.py b/views_evaluation/evaluation/metrics.py index 83512cf..c3491f5 100644 --- a/views_evaluation/evaluation/metrics.py +++ b/views_evaluation/evaluation/metrics.py @@ -99,64 +99,8 @@ def evaluation_dict_to_dataframe(evaluation_dict: dict): return df.loc[:, df.notna().any()] -@dataclass -class PointEvaluationMetrics(BaseEvaluationMetrics): - """ - A data class for storing and managing point evaluation metrics for time series forecasting models. - - Attributes: - RMSLE (Optional[float]): Root Mean Squared Logarithmic Error. - CRPS (Optional[float]): Continuous Ranked Probability Score. - AP (Optional[float]): Average Precision. - Brier (Optional[float]): Brier Score. - Jeffreys (Optional[float]): Jeffreys Divergence. - Coverage (Optional[float]): Coverage (Histograms). - EMD (Optional[float]): Earth Mover Distance. - SD (Optional[float]): Sinkhorn Distance. - pEMDiv (Optional[float]): pseudo-Earth Mover Divergence. - Pearson (Optional[float]): Pearson Correlation. - Variogram (Optional[float]): Variogram. 
- """ - - MSE: Optional[float] = None - MSLE: Optional[float] = None - RMSLE: Optional[float] = None - CRPS: Optional[float] = None - AP: Optional[float] = None - EMD: Optional[float] = None - SD: Optional[float] = None - pEMDiv: Optional[float] = None - Pearson: Optional[float] = None - Variogram: Optional[float] = None - MTD: Optional[float] = None - y_hat_bar: Optional[float] = None - - -@dataclass -class SampleEvaluationMetrics(BaseEvaluationMetrics): - """ - A data class for storing and managing sample-based evaluation metrics for time series forecasting models. - - Attributes: - CRPS (Optional[float]): Continuous Ranked Probability Score. - """ - - CRPS: Optional[float] = None - twCRPS: Optional[float] = None - MIS: Optional[float] = None - QIS: Optional[float] = None - Ignorance: Optional[float] = None - Coverage: Optional[float] = None - pEMDiv: Optional[float] = None - Brier: Optional[float] = None - Jeffreys: Optional[float] = None - y_hat_bar: Optional[float] = None - - # --------------------------------------------------------------------------- -# New 2Γ—2 dataclasses: {regression, classification} Γ— {point, sample} -# These replace PointEvaluationMetrics and SampleEvaluationMetrics for -# all new code. The legacy classes above are retained for backward compat. 
+# 2Γ—2 dataclasses: {regression, classification} Γ— {point, sample} # --------------------------------------------------------------------------- @dataclass @@ -173,16 +117,18 @@ class RegressionPointEvaluationMetrics(BaseEvaluationMetrics): MTD: Optional[float] = None y_hat_bar: Optional[float] = None MCR_point: Optional[float] = None + QS_point: Optional[float] = None @dataclass class RegressionSampleEvaluationMetrics(BaseEvaluationMetrics): """Metrics for regression targets evaluated with sample-based predictions.""" - CRPS: Optional[float] = None - twCRPS: Optional[float] = None - MIS: Optional[float] = None - QIS: Optional[float] = None - Coverage: Optional[float] = None + CRPS: Optional[float] = None + twCRPS: Optional[float] = None + MIS: Optional[float] = None + QIS: Optional[float] = None + QS_sample: Optional[float] = None + Coverage: Optional[float] = None Ignorance: Optional[float] = None y_hat_bar: Optional[float] = None MCR_sample: Optional[float] = None @@ -191,13 +137,14 @@ class RegressionSampleEvaluationMetrics(BaseEvaluationMetrics): @dataclass class ClassificationPointEvaluationMetrics(BaseEvaluationMetrics): """Metrics for classification targets evaluated with point (probability) predictions.""" - AP: Optional[float] = None + AP: Optional[float] = None + Brier_point: Optional[float] = None @dataclass class ClassificationSampleEvaluationMetrics(BaseEvaluationMetrics): """Metrics for classification targets evaluated with sample-based predictions.""" - CRPS: Optional[float] = None - twCRPS: Optional[float] = None - Brier: Optional[float] = None - Jeffreys: Optional[float] = None + CRPS: Optional[float] = None + twCRPS: Optional[float] = None + Brier_sample: Optional[float] = None + Jeffreys: Optional[float] = None diff --git a/views_evaluation/evaluation/native_evaluator.py b/views_evaluation/evaluation/native_evaluator.py index 60aa7c9..bfb9a82 100644 --- a/views_evaluation/evaluation/native_evaluator.py +++ 
b/views_evaluation/evaluation/native_evaluator.py @@ -73,7 +73,7 @@ def _calculate_metrics(self, ef: EvaluationFrame, metrics_list: List[str], results[m] = spec.function(ef.y_true, ef.y_pred, **resolved) return results - def evaluate(self, ef: EvaluationFrame, legacy_compatibility: bool = True) -> EvaluationReport: + def evaluate(self, ef: EvaluationFrame, legacy_compatibility: bool = False) -> EvaluationReport: metrics_list, task, pred_type = self._resolve_task_and_metrics(ef) results = {} diff --git a/views_evaluation/evaluation/native_metric_calculators.py b/views_evaluation/evaluation/native_metric_calculators.py index 76f4db5..f04ea8a 100644 --- a/views_evaluation/evaluation/native_metric_calculators.py +++ b/views_evaluation/evaluation/native_metric_calculators.py @@ -238,59 +238,164 @@ def calculate_quantile_interval_score_native( return float(np.mean(qis)) +# ── Brier Score ─────────────────────────────────────────────────────────────── + +def calculate_brier_sample_native( + y_true: np.ndarray, + y_pred: np.ndarray, + target=None, + *, + threshold: float, + **kwargs, +) -> float: + """ + Brier Score for sample-based predictions binarized at a threshold. + + Binarises truth at the threshold, computes event probability from + the fraction of ensemble members exceeding the threshold, then + returns the mean squared error between predicted probability and + binary outcome. + + Brier = mean((p_hat - y_binary)^2) + + where p_hat = mean(y_pred > threshold, axis=1) and + y_binary = (y_true > threshold). + + Note: NaN values in y_true or y_pred are silently converted to + below-threshold (False) by NumPy comparison semantics. Callers + must validate inputs via EvaluationFrame. + + Args: + threshold: Onset threshold for binarisation. Must be provided + explicitly via evaluation profile or model config. 
+ """ + y_true, y_pred = _guard_shapes(y_true, y_pred) + y_binary = (y_true > threshold).astype(float) + p_hat = np.mean(y_pred > threshold, axis=1) + return float(np.mean((p_hat - y_binary) ** 2)) + + +def calculate_brier_point_native( + y_true: np.ndarray, + y_pred: np.ndarray, + target=None, + *, + threshold: float, + **kwargs, +) -> float: + """ + Brier Score for point (probability) predictions binarized at a threshold. + + Binarises truth at the threshold, uses the point prediction + directly as the predicted probability. y_pred values should be + in [0, 1] for meaningful results; values outside this range + produce a mathematically valid but semantically misleading score. + + Brier = mean((y_pred - y_binary)^2) + + For point predictions, y_pred is (N, 1) after _guard_shapes. + The single column is the predicted probability. + + Note: NaN values in y_true or y_pred are silently converted to + below-threshold (False) by NumPy comparison semantics. Callers + must validate inputs via EvaluationFrame. + + Args: + threshold: Onset threshold for binarisation. + """ + y_true, y_pred = _guard_shapes(y_true, y_pred) + y_binary = (y_true > threshold).astype(float) + p_hat = y_pred[:, 0] # Point prediction: single column + return float(np.mean((p_hat - y_binary) ** 2)) + + +# ── Quantile Score (Pinball Loss) ───────────────────────────────────────────── + +def calculate_qs_sample_native( + y_true: np.ndarray, + y_pred: np.ndarray, + target=None, + *, + quantile: float, + **kwargs, +) -> float: + """ + Quantile Score (pinball loss) for sample-based predictions. + + Extracts the specified quantile from the forecast ensemble, then + computes the asymmetric pinball loss. + + QS = mean(max(alpha * (y - q), (1 - alpha) * (q - y))) + + where q = np.quantile(y_pred, quantile, axis=1). + + Args: + quantile: Quantile level in (0, 1). E.g. 0.99 for QS99. 
+ """ + y_true, y_pred = _guard_shapes(y_true, y_pred) + q = np.quantile(y_pred, quantile, axis=1) + diff = y_true - q + scores = np.where( + diff >= 0, + diff * quantile, + -diff * (1 - quantile), + ) + return float(np.mean(scores)) + + +def calculate_qs_point_native( + y_true: np.ndarray, + y_pred: np.ndarray, + target=None, + *, + quantile: float, + **kwargs, +) -> float: + """ + Quantile Score (pinball loss) for point predictions. + + The point prediction is treated as the quantile estimate directly. + Computes the asymmetric pinball loss. + + QS = mean(max(alpha * (y - y_hat), (1 - alpha) * (y_hat - y))) + + For point predictions, y_pred is (N, 1) after _guard_shapes. + + Args: + quantile: Quantile level in (0, 1). E.g. 0.99 for QS99. + """ + y_true, y_pred = _guard_shapes(y_true, y_pred) + q = y_pred[:, 0] + diff = y_true - q + scores = np.where( + diff >= 0, + diff * quantile, + -diff * (1 - quantile), + ) + return float(np.mean(scores)) + + # Placeholder functions for metrics that are planned but not yet implemented. -# ADR-013: Raise ValueError (not NotImplementedError) so callers get a consistent, -# user-facing message rather than a bare exception type. +# ADR-013: Raise ValueError (not NotImplementedError) so callers get a +# consistent, user-facing message rather than a bare exception type. def calculate_sd_native(*args, **kwargs): - raise ValueError("Metric 'SD' is defined but not yet implemented. Remove it from your config.") + raise ValueError( + "Metric 'SD' is defined but not yet implemented." + " Remove it from your config." + ) def calculate_pEMDiv_native(*args, **kwargs): - raise ValueError("Metric 'pEMDiv' is defined but not yet implemented. Remove it from your config.") + raise ValueError( + "Metric 'pEMDiv' is defined but not yet implemented." + " Remove it from your config." + ) def calculate_variogram_native(*args, **kwargs): - raise ValueError("Metric 'Variogram' is defined but not yet implemented. 
Remove it from your config.") -def calculate_brier_native(*args, **kwargs): - raise ValueError("Metric 'Brier' is defined but not yet implemented. Remove it from your config.") + raise ValueError( + "Metric 'Variogram' is defined but not yet implemented." + " Remove it from your config." + ) def calculate_jeffreys_native(*args, **kwargs): - raise ValueError("Metric 'Jeffreys' is defined but not yet implemented. Remove it from your config.") - -# PHASE-3-DELETE: Legacy alias retained for test_evaluation_manager.py -calculate_ap = calculate_ap_native - -# Dispatch dicts (Framework Agnostic) -REGRESSION_POINT_NATIVE = { - "MSE": calculate_mse_native, - "MSLE": calculate_msle_native, - "RMSLE": calculate_rmsle_native, - "EMD": calculate_emd_native, - "Pearson": calculate_pearson_native, - "MTD": calculate_mtd_native, - "y_hat_bar": calculate_mean_prediction_native, - "MCR_point": calculate_mcr_native, - "SD": calculate_sd_native, - "pEMDiv": calculate_pEMDiv_native, - "Variogram": calculate_variogram_native, -} - - - - -REGRESSION_SAMPLE_NATIVE = { - "CRPS": calculate_crps_native, - "twCRPS": calculate_twcrps_native, - "MIS": calculate_mean_interval_score_native, - "QIS": calculate_quantile_interval_score_native, - "Coverage": calculate_coverage_native, - "Ignorance": calculate_ignorance_score_native, - "y_hat_bar": calculate_mean_prediction_native, - "MCR_sample": calculate_mcr_native, -} - -CLASSIFICATION_POINT_NATIVE = { - "AP": calculate_ap_native, -} - -CLASSIFICATION_SAMPLE_NATIVE = { - "CRPS": calculate_crps_native, - "twCRPS": calculate_twcrps_native, - "Brier": calculate_brier_native, - "Jeffreys": calculate_jeffreys_native, -} + raise ValueError( + "Metric 'Jeffreys' is defined but not yet implemented." + " Remove it from your config." 
+ ) + diff --git a/views_evaluation/profiles/base.py b/views_evaluation/profiles/base.py index 450200e..652367b 100644 --- a/views_evaluation/profiles/base.py +++ b/views_evaluation/profiles/base.py @@ -21,11 +21,15 @@ """ BASE_PROFILE = { - "MTD": {"power": 1.5}, - "twCRPS": {"threshold": 0.0}, - "MIS": {"alpha": 0.05}, - "QIS": {"lower_quantile": 0.025, "upper_quantile": 0.975}, - "Coverage": {"alpha": 0.1}, + "MTD": {"power": 1.5}, + "twCRPS": {"threshold": 0.0}, + "MIS": {"alpha": 0.05}, + "QIS": {"lower_quantile": 0.025, "upper_quantile": 0.975}, + "QS_sample": {"quantile": 0.99}, + "QS_point": {"quantile": 0.99}, + "Brier_sample": {"threshold": 1.0}, + "Brier_point": {"threshold": 1.0}, + "Coverage": {"alpha": 0.1}, "Ignorance": { "bins": [0, 0.5, 2.5, 5.5, 10.5, 25.5, 50.5, 100.5, 250.5, 500.5, 1000.5], "low_bin": 0,