diff --git a/.gitignore b/.gitignore index f72f3bd..643bf8b 100644 --- a/.gitignore +++ b/.gitignore @@ -215,4 +215,5 @@ cython_debug/ # logs *.log -*.log.*reports/ +*.log.* +reports/ diff --git a/README.md b/README.md index ab537be..4ba0d65 100644 --- a/README.md +++ b/README.md @@ -59,10 +59,16 @@ The library is built on a **three-layer architecture** with a framework-agnostic ## πŸš€ **Quick Start** ```python -from views_evaluation import PandasAdapter, NativeEvaluator +from views_evaluation import EvaluationFrame, NativeEvaluator +import numpy as np -# 1. Convert DataFrames β†’ EvaluationFrame -ef = PandasAdapter.from_dataframes(actual=actuals, predictions=predictions_list, target="ged_sb_best") +# 1. Construct EvaluationFrame with NumPy arrays +ef = EvaluationFrame( + y_true=y_true_array, + y_pred=y_pred_array, # shape (N, S) where S >= 1 + identifiers={'time': times, 'unit': units, 'origin': origins, 'step': steps}, + metadata={'target': 'ged_sb_best'}, +) # 2. Configure and evaluate config = { @@ -89,7 +95,7 @@ VIEWS Evaluation ensures **forecasting accuracy and model robustness** as the ** ### **Pipeline Integration:** 1. **Model Predictions** β†’ -2. **PandasAdapter** (DataFrame β†’ EvaluationFrame) β†’ +2. **EvaluationFrame** (validated NumPy container) β†’ 3. **NativeEvaluator** (metrics computation) β†’ 4. **EvaluationReport** (structured results) @@ -195,7 +201,7 @@ config = { --- * **Data Integrity Checks**: Validates input arrays for shape consistency, NaN/infinity, and required identifiers. -* **Automatic Index Matching**: `PandasAdapter` aligns actual and predicted values based on MultiIndex structures. +* **Framework-Agnostic Core**: All evaluation operates on pure NumPy arrays via `EvaluationFrame`. * **Metric Catalog & Profiles**: Hyperparameters are managed through named evaluation profiles with a Chain of Responsibility resolver (model overrides β†’ profile β†’ fail loud). 
--- @@ -223,11 +229,11 @@ Level 0 β€” Pure Core (NumPy + SciPy only, zero framework imports) Profiles Named hyperparameter sets (base, hydranet_ucdp, ...) Level 1 β€” Bridge / Adapter - PandasAdapter DataFrame β†’ EvaluationFrame conversion (PHASE-3-DELETE) + EvaluationFrame Validated NumPy data container EvaluationReport Results container with DataFrame/dict export Level 2 β€” Legacy Orchestrator - EvaluationManager Deprecated wrapper; delegates to Level 0 + MetricCatalog Genome registry and parameter resolver ``` **Key design decisions:** @@ -244,7 +250,7 @@ views-evaluation/ β”œβ”€β”€ views_evaluation/ β”‚ β”œβ”€β”€ __init__.py # Public API exports β”‚ β”œβ”€β”€ adapters/ -β”‚ β”‚ └── pandas.py # PandasAdapter (PHASE-3-DELETE) +β”‚ β”‚ └── __init__.py # Reserved for future framework bridges β”‚ β”œβ”€β”€ evaluation/ β”‚ β”‚ β”œβ”€β”€ config_schema.py # EvaluationConfig TypedDict β”‚ β”‚ β”œβ”€β”€ evaluation_frame.py # Core data container diff --git a/documentation/ADRs/000_use_of_adrs.md b/documentation/ADRs/000_use_of_adrs.md index 1dc830f..3bbbf14 100644 --- a/documentation/ADRs/000_use_of_adrs.md +++ b/documentation/ADRs/000_use_of_adrs.md @@ -2,7 +2,9 @@ **Status:** Accepted **Date:** 2026-02-25 -**Deciders:** Project maintainers, Gemini CLI +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors --- diff --git a/documentation/ADRs/001_silicon_based_agent_protocol.md b/documentation/ADRs/001_silicon_based_agent_protocol.md index 8ce90b7..28822a2 100644 --- a/documentation/ADRs/001_silicon_based_agent_protocol.md +++ b/documentation/ADRs/001_silicon_based_agent_protocol.md @@ -2,7 +2,9 @@ **Status:** Accepted **Date:** 2026-02-25 -**Deciders:** Project maintainers, Gemini CLI +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors --- diff --git a/documentation/ADRs/010_ontology_of_evaluation.md b/documentation/ADRs/010_ontology_of_evaluation.md index 475bd98..fd4e00f 100644 --- 
a/documentation/ADRs/010_ontology_of_evaluation.md +++ b/documentation/ADRs/010_ontology_of_evaluation.md @@ -2,7 +2,9 @@ **Status:** Accepted **Date:** 2026-02-25 -**Deciders:** Project maintainers, Gemini CLI +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors --- diff --git a/documentation/ADRs/011_topology_and_dependency_rules.md b/documentation/ADRs/011_topology_and_dependency_rules.md index ff1f89c..b482fc7 100644 --- a/documentation/ADRs/011_topology_and_dependency_rules.md +++ b/documentation/ADRs/011_topology_and_dependency_rules.md @@ -2,7 +2,9 @@ **Status:** Accepted **Date:** 2026-02-25 -**Deciders:** Project maintainers, Gemini CLI +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors --- @@ -10,7 +12,7 @@ In complex evaluation systems, architectural fragility often emerges not from incorrect logic, but from uncontrolled dependencies between components. -The Evaluation repository pre-Feb 2026 suffered from "Pandas-heavy" coupling. Higher-level logic (EvaluationManager) depended on Pandas `MultiIndex` internals for alignment, which constrained our ability to scale probabilistic forecasts (N, S) due to memory/performance limits of Pandas' "lists-in-cells." +The Evaluation repository pre-Feb 2026 suffered from "Pandas-heavy" coupling. Higher-level logic (e.g., Pipeline Core) depended on Pandas `MultiIndex` internals for alignment, which constrained our ability to scale probabilistic forecasts (N, S) due to memory/performance limits of Pandas' "lists-in-cells." Without explicit topology rules, we risk high-level math modules beginning to depend on implementation details (e.g., NumPy indexing vs Xarray coordinates). @@ -29,8 +31,8 @@ Violations are architectural defects. The Evaluation Core is the lowest-level layer (most stable). - **Level 0: Evaluation Core** (Pure NumPy, `EvaluationFrame`, `NativeEvaluator`). No external imports except `numpy` and `scipy`. 
-- **Level 1: Adapters** (Framework-specific bridges like `PandasAdapter`). May depend on Level 0. -- **Level 2: Orchestration** (e.g., `EvaluationManager`, Pipeline Core). May depend on Level 1 and Level 0. +- **Level 1: Adapters** (Framework-specific bridges, reserved for future use). May depend on Level 0. +- **Level 2: Orchestration** (e.g., Pipeline Core β€” external to this repo). May depend on Level 1 and Level 0. Dependency direction must always flow **toward the Core**. @@ -38,7 +40,7 @@ Dependency direction must always flow **toward the Core**. - Math kernels importing `pandas` or `polars`. - `EvaluationFrame` containing anything other than NumPy arrays. -- Higher-level modules (e.g., `EvaluationManager`) passing DataFrames directly into metric functions. +- Higher-level modules (e.g., external orchestrators) passing DataFrames directly into metric functions. If a dependency feels β€œconvenient but wrong,” it probably is. diff --git a/documentation/ADRs/012_authority_over_inference.md b/documentation/ADRs/012_authority_over_inference.md index 3023700..dfdc260 100644 --- a/documentation/ADRs/012_authority_over_inference.md +++ b/documentation/ADRs/012_authority_over_inference.md @@ -2,7 +2,9 @@ **Status:** Accepted **Date:** 2026-02-25 -**Deciders:** Project maintainers, Gemini CLI +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors --- @@ -58,5 +60,5 @@ the system **must fail loudly and immediately**. - Improves debuggability: we can inspect the `EvaluationFrame` and see exactly what the system *thinks* it is evaluating. ### Negative -- Requires more metadata in the `EvaluationFrame` and `PandasAdapter`. +- Requires more metadata in the `EvaluationFrame` and external adapters. - Some "convenient" hacks are disallowed. 
diff --git a/documentation/ADRs/013_observability_and_explicit_failure.md b/documentation/ADRs/013_observability_and_explicit_failure.md index 8c26e75..ebc7344 100644 --- a/documentation/ADRs/013_observability_and_explicit_failure.md +++ b/documentation/ADRs/013_observability_and_explicit_failure.md @@ -2,7 +2,9 @@ **Status:** Accepted **Date:** 2026-02-25 -**Deciders:** Project maintainers, Gemini CLI +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors --- diff --git a/documentation/ADRs/014_boundary_contracts_and_validation.md b/documentation/ADRs/014_boundary_contracts_and_validation.md index dd6a70e..dc46108 100644 --- a/documentation/ADRs/014_boundary_contracts_and_validation.md +++ b/documentation/ADRs/014_boundary_contracts_and_validation.md @@ -2,7 +2,9 @@ **Status:** Accepted **Date:** 2026-02-25 -**Deciders:** Project maintainers, Gemini CLI +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors --- @@ -25,7 +27,7 @@ Every boundary between components (e.g., Adapter β†’ Core) must define: - Declared invariants. ### 2. Validation at Entry -All configuration and external inputs must be validated at the system boundary (e.g., in `EvaluationManager` or `Adapters`). +All configuration and external inputs must be validated at the system boundary (e.g., in the `EvaluationFrame` constructor or `NativeEvaluator`). - Before execution begins. - Before orchestration proceeds. 
diff --git a/documentation/ADRs/020_multi_perspective_testing.md b/documentation/ADRs/020_multi_perspective_testing.md index c06ef07..3071a24 100644 --- a/documentation/ADRs/020_multi_perspective_testing.md +++ b/documentation/ADRs/020_multi_perspective_testing.md @@ -2,7 +2,9 @@ **Status:** Accepted **Date:** 2026-02-25 -**Deciders:** Project maintainers, Gemini CLI +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors --- diff --git a/documentation/ADRs/021_intent_contracts_for_classes.md b/documentation/ADRs/021_intent_contracts_for_classes.md index 6c5a5d0..cd983ed 100644 --- a/documentation/ADRs/021_intent_contracts_for_classes.md +++ b/documentation/ADRs/021_intent_contracts_for_classes.md @@ -2,7 +2,9 @@ **Status:** Accepted **Date:** 2026-02-25 -**Deciders:** Project maintainers, Gemini CLI +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors --- @@ -14,7 +16,7 @@ To prevent semantic drift, non-trivial classes require an explicit declaration o ## Decision -All **non-trivial and substantial classes** (e.g., `EvaluationFrame`, `NativeEvaluator`, `PandasAdapter`) must have an explicit **intent contract**. +All **non-trivial and substantial classes** (e.g., `EvaluationFrame`, `NativeEvaluator`, `EvaluationReport`) must have an explicit **intent contract**. An intent contract is a short, human-readable description of: - **Purpose**: what the class is for. 
diff --git a/documentation/ADRs/022_evolution_and_stability.md b/documentation/ADRs/022_evolution_and_stability.md index 4148c26..9715272 100644 --- a/documentation/ADRs/022_evolution_and_stability.md +++ b/documentation/ADRs/022_evolution_and_stability.md @@ -3,6 +3,8 @@ **Status:** Proposed (Deferred) **Date:** 2026-02-25 **Deciders:** β€” +**Consulted:** β€” +**Informed:** All contributors --- diff --git a/documentation/ADRs/023_technical_risk_register.md b/documentation/ADRs/023_technical_risk_register.md new file mode 100644 index 0000000..1111dad --- /dev/null +++ b/documentation/ADRs/023_technical_risk_register.md @@ -0,0 +1,69 @@ +# ADR-023: Technical Risk Register + +**Status:** Accepted +**Date:** 2026-03-31 +**Deciders:** Project maintainers +**Consulted:** β€” +**Informed:** All contributors + +--- + +## Context + +As the views-evaluation codebase matures through its EvaluationFrame refactor and metric catalog implementation, structural risks have been identified through repo-assimilation and expert review. Without a centralized, living register of these risks, concerns are scattered across reports, post-mortems, and tribal knowledge. + +A formalized risk register ensures that architectural concerns are: +- tracked with consistent metadata, +- prioritized by severity, +- linked to their source of discovery, +- and revisited systematically. + +--- + +## Decision + +This repository maintains a **Technical Risk Register** at `reports/technical_risk_register.md` as a first-class governance artifact. + +### Concern Format + +Each entry uses: +- **ID:** `C-xx` for concerns, `D-xx` for disagreements +- **Tier:** 1 (critical) through 4 (informational) +- **Trigger:** The specific circumstance under which the risk becomes actionable +- **Source:** How the concern was identified (e.g. 
repo-assimilation, expert review, falsification audit) + +### Tier Definitions + +| Tier | Severity | Response | +|------|----------|----------| +| 1 | Critical β€” blocks release or causes data corruption | Must be resolved before next release | +| 2 | High β€” significant architectural risk | Must have a mitigation plan within one sprint | +| 3 | Medium β€” known weakness, bounded impact | Track and address opportunistically | +| 4 | Low/Informational β€” minor or cosmetic | Document and revisit during tech debt cleanup | + +### Lifecycle + +- Concerns are opened during expert reviews, tech debt audits, repo-assimilation, and falsification audits. +- Concerns are closed when the risk is resolved, mitigated, or explicitly accepted with rationale. +- The register header tracks the total count for quick reference. + +--- + +## Consequences + +### Positive +- Centralized visibility of all known risks +- Consistent prioritization and tracking +- Prevents risks from being forgotten between conversations + +### Negative +- Requires discipline to keep updated +- Risk of register staleness if not reviewed regularly + +--- + +## References + +- `reports/technical_risk_register.md` +- Repo-assimilation output (2026-03-31) +- `reports/technical_debt_backlog.md` (related but focuses on actionable debt, not structural risks) diff --git a/documentation/ADRs/030_evaluation_strategy.md b/documentation/ADRs/030_evaluation_strategy.md index 4af22a3..ec5c88d 100644 --- a/documentation/ADRs/030_evaluation_strategy.md +++ b/documentation/ADRs/030_evaluation_strategy.md @@ -1,12 +1,10 @@ # ADR-030: Evaluation Strategy -| ADR Info | Details | -|---------------------|-------------------| -| Subject | Evaluation Strategy | -| ADR Number | 030 | -| Status | Accepted | -| Author | Xiaolong, Mihai| -| Date | 16.07.2025 | +**Status:** Accepted +**Date:** 2025-07-16 +**Deciders:** Xiaolong, Mihai +**Consulted:** β€” +**Informed:** All contributors ## Context To ensure reliable and realistic 
model performance assessment, our forecasting framework supports both **offline** and **online** evaluation strategies. These strategies serve complementary purposes: offline evaluation simulates the forecasting process retrospectively, while online evaluation assesses actual deployed forecasts against observed data. diff --git a/documentation/ADRs/031_evaluation_metrics.md b/documentation/ADRs/031_evaluation_metrics.md index ec302f2..f6090bc 100644 --- a/documentation/ADRs/031_evaluation_metrics.md +++ b/documentation/ADRs/031_evaluation_metrics.md @@ -1,12 +1,10 @@ # ADR-031: Evaluation Metrics -| ADR Info | Details | -|---------------------|--------------------| -| Subject | Evaluation Metrics | -| ADR Number | 031 | -| Status | Accepted | -| Author | Xiaolong | -| Date | 12.09.2024 | +**Status:** Accepted +**Date:** 2024-09-12 +**Deciders:** Xiaolong +**Consulted:** β€” +**Informed:** All contributors ## Context In the context of the VIEWS pipeline, it is necessary to evaluate the models using a robust set of metrics that account for the characteristics of conflict data, such as right-skewness and zero-inflation in the outcome variable. diff --git a/documentation/ADRs/032_metric_calculation_schemas.md b/documentation/ADRs/032_metric_calculation_schemas.md index 32894d3..a0b1c12 100644 --- a/documentation/ADRs/032_metric_calculation_schemas.md +++ b/documentation/ADRs/032_metric_calculation_schemas.md @@ -1,12 +1,10 @@ # ADR-032: Metric Calculation Schemas -| ADR Info | Details | -|---------------------|-------------------| -| Subject | Metric Calculation | -| ADR Number | 032 | -| Status | Accepted| -| Author | Mihai, Xiaolong| -| Date | 31.10.2024 | +**Status:** Accepted +**Date:** 2024-10-31 +**Deciders:** Mihai, Xiaolong +**Consulted:** β€” +**Informed:** All contributors ## Context Traditional machine learning metrics do not directly translate to time-series forecasting across multiple horizons. A standardized approach to regrouping data is necessary. 
diff --git a/documentation/ADRs/040_evaluation_input_schema.md b/documentation/ADRs/040_evaluation_input_schema.md index 4d5730e..e8fde7a 100644 --- a/documentation/ADRs/040_evaluation_input_schema.md +++ b/documentation/ADRs/040_evaluation_input_schema.md @@ -1,18 +1,16 @@ # ADR-040: Evaluation Input Schema -| ADR Info | Details | -|---------------------|-------------------------| -| Subject | Evaluation Input Schema | -| ADR Number | 040 | -| Status | Accepted | -| Author | Xiaolong | -| Date | 16.06.2025 | +**Status:** Accepted +**Date:** 2025-06-16 +**Deciders:** Xiaolong +**Consulted:** β€” +**Informed:** All contributors ## Context A consistent input format is required to compare model performance across the VIEWS pipeline. -Two integration paths exist: the native path (primary) and the legacy path (`EvaluationManager`, -deprecated per ADR-011). +The native path via `EvaluationFrame` is the sole integration path. The legacy +`EvaluationManager` path was removed in Phase 3. ## Decision @@ -42,9 +40,9 @@ Prediction type (point vs. sample) is determined structurally from the number of No name-based inference occurs (ADR-012). Callers must ensure all cells in a prediction column have the same number of values. 
-### Native Path Invariants (PandasAdapter) +### Native Path Invariants -When using `PandasAdapter`, the following identifiers are synthesised automatically: +When constructing an `EvaluationFrame`, the following identifiers must be provided: | Identifier | Source | |------------|--------------------------------------------------| diff --git a/documentation/ADRs/041_evaluation_output_schema.md b/documentation/ADRs/041_evaluation_output_schema.md index 50f1559..9dfe890 100644 --- a/documentation/ADRs/041_evaluation_output_schema.md +++ b/documentation/ADRs/041_evaluation_output_schema.md @@ -1,12 +1,10 @@ # ADR-041: Evaluation Output Schema -| ADR Info | Details | -|---------------------|-------------------| -| Subject | Evaluation Output Schema | -| ADR Number | 041 | -| Status | Proposed | -| Author | Xiaolong | -| Date | 16.06.2025 | +**Status:** Proposed +**Date:** 2025-06-16 +**Deciders:** Xiaolong +**Consulted:** β€” +**Informed:** All contributors ## Context Standardized reports are necessary for comparing ensemble models against constituent models and baselines. diff --git a/documentation/ADRs/042_metric_catalog.md b/documentation/ADRs/042_metric_catalog.md index ae662a4..810412e 100644 --- a/documentation/ADRs/042_metric_catalog.md +++ b/documentation/ADRs/042_metric_catalog.md @@ -1,12 +1,10 @@ # ADR-042: Metric Catalog and Named Evaluation Profiles -| ADR Info | Details | -|---------------------|-------------------| -| Subject | Metric hyperparameter management | -| ADR Number | 042 | -| Status | accepted | -| Author | Claude (silicon-based agent) | -| Date | 11.03.2026 | +**Status:** Accepted +**Date:** 2026-03-11 +**Deciders:** Project maintainers +**Consulted:** Claude (silicon-based agent) +**Informed:** All contributors ## Context @@ -93,8 +91,8 @@ the profile once, all models using it get the update. ## Additional Notes -- Legacy dispatch dicts (REGRESSION_POINT_NATIVE, etc.) 
are retained for EvaluationManager - backward compatibility (PHASE-3-DELETE). +- Legacy dispatch dicts (REGRESSION_POINT_NATIVE, etc.) were removed in Phase 3. + METRIC_MEMBERSHIP is the single source of truth for (task, pred_type) β†’ metric mapping. - The base profile ships with views-evaluation and provides values that match the previous function-signature defaults, ensuring zero behavioral change for existing integrations. - Profile values for twCRPS threshold and QIS quantile levels are subject to alignment diff --git a/documentation/ADRs/README.md b/documentation/ADRs/README.md index 136cae3..fed5346 100644 --- a/documentation/ADRs/README.md +++ b/documentation/ADRs/README.md @@ -24,6 +24,7 @@ We follow a hierarchical numbering scheme to organize decisions from the most fo - **020**: [Multi-Perspective Testing](020_multi_perspective_testing.md) - **021**: [Intent Contracts for Classes](021_intent_contracts_for_classes.md) - **022**: [Evolution and Stability](022_evolution_and_stability.md) +- **023**: [Technical Risk Register](023_technical_risk_register.md) ### 03x: Domain Strategy & Methodology *The mathematical and strategic core of conflict evaluation.* @@ -38,6 +39,23 @@ We follow a hierarchical numbering scheme to organize decisions from the most fo --- +## Governance Structure + +- **Ontology (010)** defines what exists. +- **Topology (011)** defines structural direction. +- **Authority (012)** defines who owns meaning. +- **Observability (013)** enforces failure semantics. +- **Boundary Contracts (014)** define interaction rules. +- **Testing (020)** verifies system integrity. +- **Intent Contracts (021)** bind class-level behavior. +- **Evolution (022)** (deferred) β€” rules for stability. +- **Risk Register (023)** tracks structural concerns. +- **Silicon Agent Protocol (001)** constrains automated modification. + +Together with domain ADRs (030–042), these define the invariant layer of the system. + +--- + ## Contributing To add a new ADR: 1. 
Identify the appropriate group for the decision. diff --git a/documentation/ADRs/adr_template.md b/documentation/ADRs/adr_template.md index 3374bbd..eb6a261 100644 --- a/documentation/ADRs/adr_template.md +++ b/documentation/ADRs/adr_template.md @@ -1,54 +1,134 @@ -# ADR Template -(Fine name should follow convention NNN-short-title.md) +# ADR-XXXX: -## Title -*Refinement of Model Configuration Files Structure* +**Status:** Proposed | Accepted | Superseded | Deprecated +**Date:** YYYY-MM-DD +**Deciders:** +**Consulted:** +**Informed:** -| ADR Info | Details | -|---------------------|-------------------| -| Subject | [Insert Subject] | -| ADR Number | [Insert Number (NNN)] | -| Status | [Insert Status (proposed, accepted, rejected, deprecated)] | -| Author | [Insert Author (Name)] | -| Date | [Insert Date (DD.MM.YYYY)] | +--- ## Context -*Describe the issue that necessitated the decision, including any factors considered during the decision-making process. This should provide a clear understanding of the challenges or opportunities addressed by the ADR.* + +Describe the problem that motivated this decision. + +Include: +- What is *not working* or *no longer tenable* +- Relevant technical, organizational, or scientific constraints +- Prior assumptions that turned out to be wrong +- Why this decision matters *now* (and not later) + +This section should make it obvious to a future reader **why a decision was needed at all**. + +--- ## Decision -*Detail the decision that was made, including any alternatives that were considered and the reasons for choosing the implemented solution. Provide enough technical specifics to justify the approach.* -### Overview -*Overview of the decision in a clear and concise manner.* +State the decision **clearly and unambiguously**. -## Consequences -*Discuss the positive and negative effects of the decision. Include both immediate outcomes and long-term implications for the project's architecture. 
Highlight how the decision aligns with the challenges outlined in the context.* +- What is being decided? +- What is explicitly *in scope*? +- What is explicitly *out of scope*? -**Positive Effects:** -- List the benefits of the decision. +Use assertive language. +This is the **source of truth**. -**Negative Effects:** -- List the potential drawbacks or challenges introduced by the decision. +--- ## Rationale -*Explain the reasoning behind the decision, including any specific advantages that influenced the choice. This section should reflect the factors mentioned in the context.* -### Considerations -*List any considerations that were part of the decision-making process, such as potential risks, dependency issues, or impacts on existing systems.* +Explain *why this option was chosen* over alternatives. + +Include: +- Key design principles or values (e.g. correctness > convenience) +- Trade-offs consciously accepted +- Alignment with long-term architecture or research goals +- Why this decision reduces risk, ambiguity, or technical debt + +This is where future disagreements get defused. + +--- + +## Considered Alternatives + +List the main alternatives that were seriously considered. + +For each alternative: +- Brief description +- Why it was *not* chosen +- Any conditions under which it might be revisited + +Example format: + +### Alternative A: +- **Pros:** +- **Cons:** +- **Reason for rejection:** + +--- + +## Consequences + +Describe the consequences of this decision. + +### Positive +- Benefits unlocked +- Simplifications introduced +- Risks reduced + +### Negative +- New constraints imposed +- Short-term pain +- Technical debt explicitly accepted + +Be honest. This section builds trust. + +--- + +## Implementation Notes + +Concrete guidance for implementation. 
+ +Include: +- Where the decision should be enforced (code, config, docs, tests) +- Migration strategy (if applicable) +- Required follow-up tasks or refactors +- Guardrails to prevent regression + +If nothing is required yet, say so explicitly. + +--- + +## Validation & Monitoring + +How will we know this decision was correct? + +Examples: +- Tests or invariants that should hold +- Metrics or signals to watch +- Failure modes that would trigger reconsideration + +This turns the ADR into a *living* artifact. + +--- + +## Open Questions -## Additional Notes -*Include any additional information that might be relevant to the decision, such as implications for development workflows, future maintenance, or related decisions.* +List unresolved questions or known unknowns. -## Feedback and Suggestions -*Invite team members or stakeholders to provide feedback or suggest improvements on the decision or its implementation.* +- What do we still not know? +- What depends on future work or data? +- What should be revisited later? --- -**How to Use This Template:** +## References -1. **Copy the contents** of this template into a new markdown file within the ADR folder. -2. **Fill in each section** with specific details about the architectural decision being documented. -3. **Save the file** with a name that follows the naming convention (`NNN-subject-title.md` where `NNN` is the ADR number). -4. **Submit for review** through your project's standard process for documentation updates. +Links to: +- PRs +- Issues +- Design docs +- Papers +- Slack threads / meeting notes -This template ensures that each architectural decision is well-documented, providing a clear and consistent record that supports project development and facilitates understanding across the team. \ No newline at end of file +Future readers should be able to reconstruct the full story. 
diff --git a/documentation/CICs/EvaluationFrame.md b/documentation/CICs/EvaluationFrame.md index 00256de..30c96ed 100644 --- a/documentation/CICs/EvaluationFrame.md +++ b/documentation/CICs/EvaluationFrame.md @@ -2,7 +2,7 @@ **Status:** Active **Owner:** Evaluation Core -**Last reviewed:** 2026-03-13 +**Last reviewed:** 2026-04-02 **Related ADRs:** ADR-010 (Ontology), ADR-011 (Topology), ADR-012 (Authority) --- @@ -66,7 +66,7 @@ The canonical, framework-agnostic internal representation of a forecasting evalu ## 8. Step Semantics -The `step` identifier represents **positional lead time** (1-indexed), not an absolute calendar month. Step 1 is the first month of each forecast origin's prediction window, step 2 is the second, and so on. This is assigned positionally by adapters (e.g., `PandasAdapter.from_dataframes()`) based on the order of unique time values within each origin sequence. +The `step` identifier represents **positional lead time** (1-indexed), not an absolute calendar month. Step 1 is the first month of each forecast origin's prediction window, step 2 is the second, and so on. This is assigned positionally by adapters (e.g., views-pipeline-core's `EvaluationAdapter`) based on the order of unique time values within each origin sequence. **Consequence:** Step 1 in origin A and step 1 in origin B typically refer to *different* calendar months. When `NativeEvaluator` groups data by step, it collects the "diagonals" of the parallelogram β€” all first-month-ahead predictions together, all second-month-ahead together, etc. This is the correct semantic for forecast-horizon evaluation. @@ -91,3 +91,38 @@ ef = EvaluationFrame( month_groups = ef.get_group_indices('time') sub_ef = ef.select_indices(month_groups[100]) ``` + +--- + +## 10. Examples of Incorrect Usage + +- Constructing an `EvaluationFrame` directly with ragged sample arrays (varying S per row). External adapters should guard against this, but direct construction validates only ndim. 
+- Passing DataFrames or Series instead of NumPy arrays β€” the class has zero knowledge of Pandas. +- Omitting required identifier keys (e.g. passing only `time` and `unit` without `origin` and `step`). +- Storing derived or mutable state on an `EvaluationFrame` instance after construction. + +--- + +## 11. Test Alignment + +- **Green:** `tests/test_evaluation_frame.py::TestEvaluationFrameGreen` β€” construction, properties, grouping, selection. +- **Beige:** `tests/test_evaluation_frame.py::TestEvaluationFrameBeige` β€” single-row frames, large sample counts, multi-unit grouping. +- **Red:** `tests/test_evaluation_frame.py::TestEvaluationFrameRed` β€” shape mismatches, NaN/Inf/None in data and identifiers, missing keys. +- **Adversarial:** `tests/test_adversarial_inputs.py::TestAdversarialNativeInputs` β€” NaN/Inf boundary rejection. + +--- + +## 12. Known Deviations + +- **Rectangular sample invariant not enforced:** Direct construction does not validate that all rows of `y_pred` have the same number of samples. Only well-designed external adapters guard against ragged arrays. A directly-constructed frame with ragged `y_pred` would cause indexing errors deep in metric calculations. (Risk register C-03) +- **Integer identifier NaN not checked:** Validation checks float and object identifiers for NaN/None, but integer-typed identifiers are not checked (NumPy integers cannot represent NaN, so this is safe in practice but not explicitly documented). +- **No immutability enforcement:** The contract claims "State Immutability" via new-instance methods, but `y_true`, `y_pred`, and `identifiers` are publicly mutable attributes. Nothing prevents `ef.y_true[0] = 999` after construction. + +--- + +## End of Contract + +This document defines the **intended meaning** of `EvaluationFrame`. + +Changes to behavior that violate this intent are bugs. +Changes to intent must update this contract. 
diff --git a/documentation/CICs/EvaluationReport.md b/documentation/CICs/EvaluationReport.md index 9d1b414..8aa34f9 100644 --- a/documentation/CICs/EvaluationReport.md +++ b/documentation/CICs/EvaluationReport.md @@ -2,7 +2,7 @@ **Status:** Active **Owner:** Evaluation Core -**Last reviewed:** 2026-03-13 +**Last reviewed:** 2026-04-02 **Related ADRs:** ADR-010 (Ontology), ADR-041 (Output Schema) --- @@ -61,7 +61,7 @@ A structured, framework-agnostic container for evaluation results. It decouples ## 7. Boundaries and Interactions - **Upstream**: Produced by **NativeEvaluator**. -- **Downstream**: Consumed by **EvaluationManager**, Pipeline Core, or reporting tools. +- **Downstream**: Consumed by Pipeline Core or reporting tools. --- @@ -73,3 +73,42 @@ df = report.to_dataframe(schema="month") # pd.DataFrame data = report.to_dict() # nested dict schema = report.get_schema_results("month") # dict β†’ typed metrics dataclass ``` + +--- + +## 9. Examples of Incorrect Usage + +- Calling `to_dataframe(schema='raw')` β€” this is deprecated and returns the internal dict, not a DataFrame. Use `to_dict()['schemas']` instead. +- Adding a new metric to `METRIC_CATALOG` without adding a corresponding field to the typed metrics dataclass β€” the FM1 guard will raise `ValueError`. +- Treating the report as mutable and modifying `_results` after construction. + +--- + +## 10. Test Alignment + +- **Green:** `tests/test_evaluation_report.py` β€” construction, schema access, to_dict, to_dataframe. +- **Beige:** `tests/test_evaluation_report.py` β€” empty schemas, single-entry schemas. +- **Red:** `tests/test_evaluation_report.py` β€” missing schema keys, field mismatch (FM1 guard). + +--- + +## 11. Evolution Notes + +- The `to_dataframe()` method imports Pandas lazily. After Phase 3, this method may be removed or moved to an adapter. +- The `_metrics_map` mapping 4 (task, pred_type) combinations to dataclass types is stable but must be extended if new task types are added. 
+ +--- + +## 12. Known Deviations + +- **Lazy Pandas import:** `to_dataframe()` imports `pandas` at call time, which means the Level 1 bridge concern leaks into what is otherwise a Level 0 component. This is a pragmatic compromise for backward compatibility. +- **Legacy dataclass coupling:** `get_schema_results()` wraps results in legacy dataclass instances (`RegressionPointEvaluationMetrics`, etc.) from `metrics.py`. If a metric is computed but has no field in the dataclass, the FM1 guard raises. This means new metrics require coordinated updates to both `metric_catalog.py` and `metrics.py`. + +--- + +## End of Contract + +This document defines the **intended meaning** of `EvaluationReport`. + +Changes to behavior that violate this intent are bugs. +Changes to intent must update this contract. diff --git a/documentation/CICs/MetricCatalog.md b/documentation/CICs/MetricCatalog.md new file mode 100644 index 0000000..c4040c1 --- /dev/null +++ b/documentation/CICs/MetricCatalog.md @@ -0,0 +1,137 @@ +# Class Intent Contract: MetricCatalog + +**Status:** Active +**Owner:** Evaluation Core +**Last reviewed:** 2026-03-31 +**Related ADRs:** ADR-042 (Metric Catalog), ADR-012 (Authority), ADR-013 (Observability) + +--- + +## 1. Purpose + +A genome registry and Chain of Responsibility resolver for evaluation metric hyperparameters. Declares what each metric requires (its genome) but provides NO default values. Values are supplied by named profiles and/or per-model overrides. + +--- + +## 2. Non-Goals (Explicit Exclusions) + +- This module does **not** compute metrics (that is the role of metric calculator functions). +- This module does **not** supply default hyperparameter values (that is the role of named profiles in `views_evaluation/profiles/`). +- This module does **not** validate data shapes or content (that is the role of `EvaluationFrame`). +- This module does **not** know about DataFrames or any external data framework. + +--- + +## 3. 
Responsibilities and Guarantees + +- **Genome Declaration:** Each `MetricSpec` guarantees an immutable declaration of which hyperparameters a metric requires (the `genome` tuple) and whether the metric is implemented. +- **Membership Declaration:** `METRIC_MEMBERSHIP` guarantees a complete mapping of `(task, pred_type)` pairs to valid metric name sets. +- **Chain of Responsibility Resolution:** `resolve_metric_params()` guarantees that hyperparameters are resolved in strict order: model overrides β†’ named profile β†’ fail loud. No silent defaults. +- **Fail-Loud on Missing Params:** Guarantees `ValueError` if a required parameter is missing from both overrides and profile. +- **Fail-Loud on None Values:** Guarantees `ValueError` if a resolved parameter is `None`. +- **Fail-Loud on Unknown Params:** Guarantees `ValueError` if model overrides contain parameters not in the metric's genome. +- **Fail-Loud on Unimplemented Metrics:** Guarantees `ValueError` with clear message if an unimplemented metric is requested. + +--- + +## 4. Inputs and Assumptions + +- **`resolve_metric_params(metric_name, model_overrides, profile)`:** + - `metric_name` must exist in `METRIC_CATALOG`. + - `model_overrides` is a dict of per-metric parameter overrides (may be empty). + - `profile` is a named evaluation profile dict (e.g. `BASE_PROFILE`). +- **Metric functions:** Each function referenced by a `MetricSpec` must accept `(y_true, y_pred, **resolved_params)`. +- **Genome completeness:** All hyperparameters required by a metric function must be declared in the spec's `genome` tuple. + +--- + +## 5. Outputs and Side Effects + +- **`resolve_metric_params()`** returns a `Dict[str, Any]` of resolved hyperparameters ready to pass as `**kwargs` to the metric function. Empty dict for metrics with no genome. +- **No side effects.** The module is purely declarative; no state mutation, no I/O, no logging. + +--- + +## 6. 
Failure Modes and Loudness + +- `ValueError` if `metric_name` is unknown (not in `METRIC_CATALOG`). +- `ValueError` if metric is not implemented (`spec.implemented == False`). +- `ValueError` if a genome parameter is missing from both overrides and profile. +- `ValueError` if a resolved parameter is `None`. +- `ValueError` if overrides contain unknown parameters not in the genome. +- `ValueError` if overrides are provided for a metric with empty genome. + +All failures are immediate and explicit. No warnings, no fallbacks, no silent degradation. + +--- + +## 7. Boundaries and Interactions + +- **Upstream:** Consumed by `NativeEvaluator._calculate_metrics()`. +- **Internal:** Imports metric functions from `native_metric_calculators.py`. +- **Downstream:** Named profiles (`views_evaluation/profiles/`) supply values consumed by the resolver. +- **Isolation:** Must not import Pandas, Polars, or any external data framework. Only depends on `native_metric_calculators` and standard library. + +--- + +## 8. Examples of Correct Usage + +```python +from views_evaluation.evaluation.metric_catalog import METRIC_CATALOG, resolve_metric_params +from views_evaluation.profiles.base import BASE_PROFILE + +# Resolve params for twCRPS using base profile +params = resolve_metric_params("twCRPS", {}, BASE_PROFILE) +# β†’ {"threshold": 0.0} + +# Override threshold for a specific model +params = resolve_metric_params("twCRPS", {"threshold": 2.0}, BASE_PROFILE) +# β†’ {"threshold": 2.0} + +# Metrics with no genome return empty dict +params = resolve_metric_params("MSE", {}, BASE_PROFILE) +# β†’ {} +``` + +--- + +## 9. Examples of Incorrect Usage + +- Hardcoding hyperparameter defaults inside metric function signatures β€” the catalog pattern requires all values to come from profiles or overrides. +- Calling `resolve_metric_params` with `None` as the profile β€” a real profile dict is always required. 
+- Adding a new metric to `METRIC_CATALOG` without adding its genome params to at least one profile β€” all callers will get `ValueError`. +- Passing overrides for metrics with empty genome (e.g. `resolve_metric_params("MSE", {"power": 1.5}, profile)`) β€” raises `ValueError`. + +--- + +## 10. Test Alignment + +- **Green:** `tests/test_metric_catalog.py` β€” registry snapshot integrity, resolver happy path, genome completeness checks. +- **Beige:** `tests/test_metric_catalog.py` β€” partial overrides, profile-only resolution, edge case param values. +- **Red:** `tests/test_metric_catalog.py` β€” unknown metrics, unimplemented metrics, missing params, None values, unknown overrides. +- **Correctness:** `tests/test_metric_correctness.py` β€” golden-value tests (5 tests; coverage gap noted). + +--- + +## 11. Evolution Notes + +- New metrics are added by: (1) implementing the function in `native_metric_calculators.py`, (2) adding a `MetricSpec` to `METRIC_CATALOG`, (3) adding to `METRIC_MEMBERSHIP`, (4) adding genome values to relevant profiles, (5) adding a field to the typed metrics dataclass in `metrics.py`. +- The legacy dispatch dicts were removed in Phase 3. `METRIC_MEMBERSHIP` is the single source of truth. +- Profile structure is stable; new profiles are added by creating a new file in `profiles/`. + +--- + +## 12. Known Deviations + +- **No profile completeness validation:** There is no mechanism to verify that a profile provides values for all metrics with non-empty genomes. A profile missing a metric's params will only fail at evaluation time, not at profile registration. +- **Weak golden-value coverage:** Only 5 tests in `test_metric_correctness.py` verify metric functions against independently computed known answers. Most metrics lack this verification (see risk register C-07). +- **Breaking rename:** The legacy `Brier` metric (unimplemented placeholder) was replaced by `Brier_sample` and `Brier_point` (implemented). 
The field in `ClassificationSampleEvaluationMetrics` was renamed from `Brier` to `Brier_sample`. External consumers accessing `.Brier` on classification sample results must update to `.Brier_sample`. + +--- + +## End of Contract + +This document defines the **intended meaning** of the MetricCatalog module (`MetricSpec`, `METRIC_CATALOG`, `METRIC_MEMBERSHIP`, `resolve_metric_params`). + +Changes to behavior that violate this intent are bugs. +Changes to intent must update this contract. diff --git a/documentation/CICs/NativeEvaluator.md b/documentation/CICs/NativeEvaluator.md index 414f150..e69011d 100644 --- a/documentation/CICs/NativeEvaluator.md +++ b/documentation/CICs/NativeEvaluator.md @@ -2,7 +2,7 @@ **Status:** Active **Owner:** Evaluation Core -**Last reviewed:** 2026-03-13 +**Last reviewed:** 2026-04-02 **Related ADRs:** ADR-010 (Ontology), ADR-011 (Topology), ADR-032 (Schemas), ADR-042 (Metric Catalog) --- @@ -62,7 +62,7 @@ A stateless "Pure Math" engine that executes the three standard Views evaluation ## 7. Boundaries and Interactions -- **Upstream**: Called directly or via legacy `EvaluationManager` (PHASE-3-DELETE). +- **Upstream**: Called directly by evaluation orchestrators (e.g. views-pipeline-core). - **Internal**: Depends on `EvaluationFrame` and `MetricCalculators`. - **Isolation**: Must not depend on any IO or dataframe frameworks. @@ -79,3 +79,45 @@ month_df = report.to_dataframe('month') # pd.DataFrame indexed by group k step_dict = report.to_dict()['schemas']['step'] # raw nested dict schema = report.get_schema_results('time_series') # dict β†’ typed metrics dataclass ``` + +--- + +## 9. Examples of Incorrect Usage + +- Passing a raw dict instead of an `EvaluationFrame` β€” the evaluator expects validated frames, not ad-hoc data. +- Requesting metrics that are not valid for the (task, pred_type) combination β€” e.g. asking for `CRPS` on a point prediction. This will fail loud. 
+- Omitting `evaluation_profile` from config and expecting hardcoded defaults β€” the resolver requires explicit profile selection.
+- Relying on the default `legacy_compatibility=False` without understanding that step-wise results will include steps not present in all origins.
+
+---
+
+## 10. Test Alignment
+
+- **Green:** `tests/test_native_evaluator.py` β€” three-schema evaluation, legacy compat, metric dispatch.
+- **Beige:** `tests/test_native_evaluator.py` β€” sparse step configs, single-origin frames.
+- **Red:** `tests/test_native_evaluator.py`, `tests/test_adversarial_inputs.py` β€” undeclared targets, unimplemented metrics.
+- **Integration:** `tests/test_adversarial_inputs.py` β€” the same adversarial cases (undeclared targets, unimplemented metrics) exercised end-to-end, plus NaN/Inf defense-in-depth.
+
+---
+
+## 11. Evolution Notes
+
+- `legacy_compatibility` default was flipped to `False` in Phase 3. The flag is retained for callers that need truncation behavior.
+- Config validation may be added to `__init__` to catch structural config errors at construction time rather than at evaluation time (currently a known gap β€” risk register C-02).
+- The `EvaluationReport` return type is stable; the internal `_calculate_metrics` dispatch may evolve as the `MetricCatalog` grows.
+
+---
+
+## 12. Known Deviations
+
+- **No config validation at init:** `NativeEvaluator.__init__` only validates the profile name. Missing or malformed config keys cause cryptic errors at evaluation time rather than at construction. (Risk register C-02)
+- **sklearn/scipy in "pure core":** The `NativeEvaluator` dispatches to metric functions that import `sklearn` and `scipy` at module level. This contradicts the stated goal of a zero-external-dep Level 0 core (ADR-011). (Risk register C-05)
+
+---
+
+## End of Contract
+
+This document defines the **intended meaning** of `NativeEvaluator`.
+
+Changes to behavior that violate this intent are bugs.
+Changes to intent must update this contract.
diff --git a/documentation/CICs/PandasAdapter.md b/documentation/CICs/PandasAdapter.md deleted file mode 100644 index 289c2d2..0000000 --- a/documentation/CICs/PandasAdapter.md +++ /dev/null @@ -1,69 +0,0 @@ -# Class Intent Contract: PandasAdapter - -**Status:** Deprecated (PHASE-3-DELETE) -**Owner:** Adapters Layer -**Last reviewed:** 2026-03-13 -**Related ADRs:** ADR-010 (Ontology), ADR-011 (Topology), ADR-012 (Authority), ADR-040 (Input Schema) - ---- - -## 1. Purpose - -A framework-specific bridge that transforms Pandas DataFrames into the canonical `EvaluationFrame`. It encapsulates all the "dirty" logic of alignment, reindexing, and list-extraction. - ---- - -## 2. Non-Goals (Explicit Exclusions) - -- This class does **not** calculate metrics. -- This class does **not** persist data. -- This class does **not** handle other frameworks (like Polars). -- This class does **not** own the authoritative math core. - ---- - -## 3. Responsibilities and Guarantees - -- **MultiIndex Translation**: Guarantees that Pandas index levels (time, unit) are correctly mapped to `EvaluationFrame` identifiers. -- **Alignment (Truth Duplication)**: Responsible for performing the intersection of indices and duplicating `actuals` to match the sequence-based structure of `predictions`. -- **Sample Extraction**: Guarantees that "lists-in-cells" are correctly exploded into dense 2D NumPy arrays. -- **Metadata Declaration**: Responsible for explicitly declaring task and prediction types (as per ADR-012). - ---- - -## 4. Inputs and Assumptions - -- **Pandas Objects**: Expects `pd.DataFrame` and `List[pd.DataFrame]`. -- **Naming Conventions**: Assumes `month_id` and `entity_id` structure in MultiIndex. -- **Rectangular Samples**: Assumes that all prediction cells in a given task contain the same number of samples (or scalars). - ---- - -## 5. Outputs and Side Effects - -- **EvaluationFrame**: Produces a single, pre-aligned, flattened `EvaluationFrame`. - ---- - -## 6. 
Failure Modes and Loudness - -- Silently skips prediction DataFrames whose index has no overlap with actuals (continues to the next sequence). -- Raises `ValueError` if sample lengths are inconsistent across cells. -- Fails loud if the input is not a DataFrame. - ---- - -## 7. Boundaries and Interactions - -- **Upstream**: Called by users or legacy `EvaluationManager` (PHASE-3-DELETE). -- **Downstream**: Produces input for `EvaluationFrame`. -- **Isolation**: This is one of the few places where a `pandas` import is allowed. -- **Deprecation**: Emits `DeprecationWarning` on use. Will be removed from this repo in Phase 3; adapters belong in the calling repository (e.g. `views-pipeline-core`). - ---- - -## 8. Examples of Correct Usage - -```python -ef = PandasAdapter.from_dataframes(actual_df, [pred_df1, pred_df2], "target_name") -``` diff --git a/documentation/CICs/README.md b/documentation/CICs/README.md index afc7f7e..c581396 100644 --- a/documentation/CICs/README.md +++ b/documentation/CICs/README.md @@ -50,11 +50,10 @@ Contracts must be clear enough that: ## Active Contracts -- `EvaluationFrame.md` -- `NativeEvaluator.md` -- `EvaluationReport.md` -- `PandasAdapter.md` (PHASE-3-DELETE) -- (Add entries as they are created) +- `EvaluationFrame.md` β€” Canonical NumPy data container +- `NativeEvaluator.md` β€” Pure math evaluation engine +- `EvaluationReport.md` β€” Structured result container +- `MetricCatalog.md` β€” Genome registry and parameter resolver --- @@ -65,5 +64,6 @@ Intent Contracts are governed by: - ADR-021 (Intent Contracts for Classes) - ADR-012 (Authority over Inference) - ADR-020 (Multi-Perspective Testing) +- ADR-042 (Metric Catalog) If a class changes meaning, its Intent Contract must be updated. 
diff --git a/documentation/INSTANTIATION_CHECKLIST.md b/documentation/INSTANTIATION_CHECKLIST.md new file mode 100644 index 0000000..b7277dd --- /dev/null +++ b/documentation/INSTANTIATION_CHECKLIST.md @@ -0,0 +1,72 @@ +# Instantiation Checklist + +Use this checklist to track the base_docs governance adoption for views-evaluation. + +--- + +## Before You Start + +- [x] Decide which adoption phase you're targeting +- [x] Identify your project's ontological categories (ADR-010) + +--- + +## ADR Adaptation + +### All adopted ADRs +- [x] Update Status from `--template--` to `Proposed` or `Accepted` +- [x] Fill in Date, Deciders, Consulted, Informed fields + +### Per-ADR adaptation notes +- [x] **ADR-000:** Updated path reference to `documentation/ADRs/` +- [x] **ADR-010 (base 001):** Defined project's ontological categories (EvaluationFrame, NativeEvaluator, etc.) +- [x] **ADR-011 (base 002):** Defined 3-level layering and forbidden dependency patterns +- [x] **ADR-012 (base 003):** Adapted forbidden behavior examples to evaluation domain (no sniffing, no type inference) +- [x] **ADR-020 (base 005):** Adapted test taxonomy for forecasting evaluation domain +- [x] **ADR-021 (base 006):** No domain adaptation needed (criteria are universal) +- [x] **ADR-001 (base 007):** Adapted silicon agent rules to views-evaluation tooling +- [x] **ADR-014 (base 009):** Adapted boundary examples to Adapter-Core and Config-Runtime boundaries +- [x] **ADR-023:** Created technical risk register ADR + +--- + +## CICs + +- [x] Replace placeholder active contracts list in `CICs/README.md` with project contracts +- [x] Create intent contracts for non-trivial classes: + - [x] EvaluationFrame.md + - [x] NativeEvaluator.md + - [x] EvaluationReport.md + - [x] PandasAdapter.md (removed in Phase 3) + - [x] MetricCatalog.md + +--- + +## Contributor Protocols + +- [x] Review and adapt `contributor_protocols/silicon_based_agents.md` for project tooling +- [x] Review and adapt 
`contributor_protocols/carbon_based_agents.md` for the project team
+- [x] Adapt `contributor_protocols/hardened_protocol_template.md` for the numerical computation domain
+
+---
+
+## Standards
+
+- [x] Review `standards/logging_and_observability_standard.md` β€” adapted scope for Level 0 pure-math exception propagation
+- [x] Review `standards/physical_architecture_standard.md` β€” includes critical bundling assessment
+
+---
+
+## Risk Register
+
+- [x] Created `reports/technical_risk_register.md` seeded with 9 concerns from repo-assimilation
+- [x] Created ADR-023 governing the risk register
+
+---
+
+## Final Verification
+
+- [x] No files still have Status `--template--` (except ADR-022 which is intentionally deferred)
+- [ ] No phantom references to non-existent files
+- [ ] All cross-ADR references resolve correctly
+- [ ] Run `validate_docs.sh` to check internal consistency
diff --git a/documentation/contributor_protocols/hardened_protocol_template.md b/documentation/contributor_protocols/hardened_protocol_template.md
new file mode 100644
index 0000000..25ca178
--- /dev/null
+++ b/documentation/contributor_protocols/hardened_protocol_template.md
@@ -0,0 +1,79 @@
+# The Hardened Protocol: Contributor Governance for Numerical Evaluation
+
+This document defines the mandatory engineering and mathematical standards for the `views-evaluation` repository. Adherence to this protocol is required for all contributions that affect metric computation, data transformation, or evaluation logic to guarantee scientific integrity and reproducibility.
+
+---
+
+## 1. Core Principles
+
+### A. The Authority of Declarations (ADR-012)
+**"Never infer; only trust declarations."**
+All meaningful semantics (task types, prediction types, metric hyperparameters, step identifiers) must be explicitly declared in configuration or the `EvaluationFrame`.
+- **Prohibited:** Type-sniffing from cell contents, step inference from row position without explicit assignment, scaling inference from target name prefixes. +- **Requirement:** If a parameter affects metric computation (e.g. twCRPS threshold, Coverage alpha), it must be a declared gene in the `MetricCatalog` genome and resolved via Chain of Responsibility. + +### B. The Fail-Loud Mandate (ADR-013) +**"A crash is a successful defense of scientific integrity."** +Silent failures, implicit fallbacks, and "best-effort" corrections are forbidden. +- **Requirement:** Violations of data, configuration, or semantic invariants must raise explicit `ValueError` immediately. +- **Prohibited:** Using `np.nan_to_num`, silent clipping, "sensible defaults" for critical metric parameters, or downgrading errors to warnings. + +### C. The Numerical Airlock (EvaluationFrame._validate) +All data entering the evaluation system must pass through the `EvaluationFrame` validation boundary. +- **Requirement:** Reject NaN and Inf values in observations and predictions at construction time. +- **Requirement:** Reject NaN/None in all identifier arrays at construction time. +- **Requirement:** Enforce shape consistency: `y_true` (N,), `y_pred` (N, S), all identifiers (N,). + +### D. The Metric Genome Contract (ADR-042) +**"No silent defaults."** +Every metric hyperparameter must be declared in the `MetricSpec.genome` tuple and resolved explicitly. +- **Requirement:** New metrics must declare all required hyperparameters in their genome. +- **Requirement:** Metric functions must use keyword-only arguments without defaults for genome parameters. +- **Prohibited:** Hardcoding default values in metric function signatures. + +--- + +## 2. Contributor Requirements + +### Adding a New Metric +1. **Implement the function** in `native_metric_calculators.py` with keyword-only args for genome parameters. +2. **Register in catalog:** Add a `MetricSpec` to `METRIC_CATALOG` in `metric_catalog.py`. +3. 
**Declare membership:** Add the metric name to the appropriate set in `METRIC_MEMBERSHIP`. +4. **Add to profile:** Add genome parameter values to `BASE_PROFILE` (and other relevant profiles). +5. **Add dataclass field:** Add the metric as `Optional[float] = None` to the appropriate typed metrics dataclass in `metrics.py`. +6. **Write tests:** Include at minimum one golden-value test and one red-team test. + +### Modifying an Existing Metric +1. **Update the CIC** if the change affects behavior described in the intent contract. +2. **Verify parity** by running the full Green/Beige/Red test suite. +3. **Update golden-value tests** if numerical output changes. + +--- + +## 3. Mandatory Testing Taxonomy (ADR-020) + +Every Pull Request affecting metric computation must include tests covering: + +### Green Team (Stability & Correctness) +- **Goal:** Ensure the metric produces correct values for known inputs. +- **Examples:** Golden-value tests against analytical solutions, CRPS parity with `properscoring`, bit-identical results across schemas. + +### Beige Team (Configuration & Human Error) +- **Goal:** Catch failures caused by common configuration mistakes or missing parameters. +- **Examples:** Missing genome parameters in profile, requesting unimplemented metrics, mismatched task/pred_type combinations. + +### Red Team (Adversarial) +- **Goal:** Expose failure modes by deliberately trying to make the system produce wrong results silently. +- **Examples:** NaN injection in predictions, Inf in observations, ragged sample arrays, zero-variance inputs. + +--- + +## 4. Operational Invariants + +- **Shape Guard Defense-in-Depth:** All metric functions call `_guard_shapes()` even though `EvaluationFrame._validate()` has already checked. This is deliberate double-checking, not redundancy to remove. +- **Profile Consistency:** All profiles must provide values for all metrics with non-empty genomes that may be requested in evaluations using that profile. 
+- **Schema Reproducibility:** Month-wise, time-series-wise, and step-wise schemas must produce identical results regardless of the order of input rows (grouping is by identifier value, not position). + +--- + +**"In this repository, we value explicit correctness over convenient execution."** diff --git a/documentation/evaluation_concepts.md b/documentation/evaluation_concepts.md index 53814a3..b5dc8f8 100644 --- a/documentation/evaluation_concepts.md +++ b/documentation/evaluation_concepts.md @@ -35,7 +35,7 @@ This parallelogram is the fundamental data structure that is analyzed by the thr ## 3. The Three Evaluation Schemas -The `EvaluationManager` assesses the predictive parallelogram by "slicing" it in three different ways. Each schema groups the data differently to answer a unique question about model performance. +The evaluation framework assesses the predictive parallelogram by "slicing" it in three different ways. Each schema groups the data differently to answer a unique question about model performance. ### Schema 1: Time-series-wise Evaluation diff --git a/documentation/integration_guide.md b/documentation/integration_guide.md index 0631dc6..3efb466 100644 --- a/documentation/integration_guide.md +++ b/documentation/integration_guide.md @@ -8,40 +8,31 @@ what the library does and does not do with your data. ## 1. 
Architecture Overview -The library has three layers: +The library is a pure-math evaluation engine with two core components: ``` - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Adapters (Bridge Layer) β”‚ - β”‚ PandasAdapter β€” converts List[DataFrame] β”‚ - β”‚ to EvaluationFrame; synthesises identifiersβ”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ EvaluationFrame (Core) β”‚ + β”‚ Pure NumPy container: y_true, y_pred, β”‚ + β”‚ identifiers {time, unit, origin, step} β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ EvaluationFrame (Core) β”‚ - β”‚ Pure NumPy container: y_true, y_pred, β”‚ - β”‚ identifiers {time, unit, origin, step} β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ NativeEvaluator (Pure Math) β”‚ + β”‚ Stateless engine: executes month-wise, β”‚ + β”‚ sequence-wise, and step-wise schemas β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ 
NativeEvaluator (Pure Math) β”‚ - β”‚ Stateless engine: executes month-wise, β”‚ - β”‚ sequence-wise, and step-wise schemas β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ EvaluationReport (Results) β”‚ - β”‚ Framework-agnostic results container; β”‚ - β”‚ exposes to_dict(), to_dataframe(), β”‚ - β”‚ get_schema_results() β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ EvaluationReport (Results) β”‚ + β”‚ Framework-agnostic results container; β”‚ + β”‚ exposes to_dict(), to_dataframe(), β”‚ + β”‚ get_schema_results() β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ ``` -`EvaluationManager` is a **legacy orchestrator** that wraps all four layers behind a single -`evaluate()` call. It is retained for backward compatibility and will be removed in Phase 3 of the -orchestrator migration (see `reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md`). - -**New integrations should use the native API (Β§2). The legacy API is documented in Β§3.** +Callers (e.g. views-pipeline-core) are responsible for constructing `EvaluationFrame` from their +own data formats. This library has no knowledge of Pandas, Polars, or any external data framework. --- @@ -56,7 +47,7 @@ pip install pandas numpy # only needed to prepare input DataFrames ### 2.2. 
Identifier Glossary -All evaluation logic operates on four identifiers that `PandasAdapter` synthesises from your input. +All evaluation logic operates on four identifiers that must be provided in the `EvaluationFrame`. Understanding them is required: | Identifier | Type | Meaning | @@ -114,57 +105,45 @@ use `[1, 2, ..., 12]`. Sparse configs (e.g. `[1, 3, 6, 12]`) evaluate only those ```python import numpy as np -import pandas as pd -from views_evaluation.evaluation.adapters.pandas import PandasAdapter -from views_evaluation.evaluation.native_evaluator import NativeEvaluator - -# --- 1. Prepare actuals --- -actuals_index = pd.MultiIndex.from_product( - [range(500, 513), [101, 102]], - names=['month_id', 'country_id'] +from views_evaluation import EvaluationFrame, NativeEvaluator + +# --- 1. Construct EvaluationFrame from NumPy arrays --- +ef = EvaluationFrame( + y_true=y_true_array, # shape (N,) + y_pred=y_pred_array, # shape (N, S) where S >= 1 + identifiers={ + 'time': time_ids, # shape (N,) β€” calendar month ids + 'unit': unit_ids, # shape (N,) β€” spatial entity ids + 'origin': origin_ids, # shape (N,) β€” sequence index + 'step': step_ids, # shape (N,) β€” 1-indexed lead time + }, + metadata={'target': 'ged_sb_best'}, ) -actuals = pd.DataFrame( - {'ged_sb_best': np.random.randint(0, 20, size=26)}, - index=actuals_index -) - -# --- 2. Prepare predictions list (2 sequences, 12 steps each) --- -target = 'ged_sb_best' -pred_col = f'pred_{target}' -predictions_list = [] -for origin_offset in range(2): - months = range(500 + origin_offset, 512 + origin_offset) - idx = pd.MultiIndex.from_product([months, [101, 102]], names=['month_id', 'country_id']) - preds = pd.DataFrame({pred_col: [[v] for v in np.random.rand(len(idx)) * 20]}, index=idx) - predictions_list.append(preds) - -# --- 3. Configure --- +# --- 2. 
Configure --- config = { 'steps': list(range(1, 13)), - 'regression_targets': [target], + 'regression_targets': ['ged_sb_best'], 'regression_point_metrics': ['MSE', 'RMSLE', 'Pearson'], } -# --- 4. Adapt and evaluate --- -ef = PandasAdapter.from_dataframes(actual=actuals, predictions=predictions_list, target=target) - +# --- 3. Evaluate --- evaluator = NativeEvaluator(config) -report = evaluator.evaluate(ef) # legacy_compatibility=True by default +report = evaluator.evaluate(ef) -# --- 5. Access results --- -print(report.to_dataframe('step')) # step-wise DataFrame (MSE, RMSLE, Pearson per step) +# --- 4. Access results --- +print(report.to_dict()) # full nested dict +print(report.to_dataframe('step')) # step-wise DataFrame print(report.to_dataframe('month')) # month-wise DataFrame print(report.to_dataframe('time_series')) # sequence-wise DataFrame -print(report.to_dict()) # full nested dict ``` ### 2.6. The `legacy_compatibility` Flag -`NativeEvaluator.evaluate(ef, legacy_compatibility=True)` (default) caps step-wise evaluation to +`NativeEvaluator.evaluate(ef, legacy_compatibility=True)` caps step-wise evaluation to the shortest sequence in the frame. If origin 0 has 12 steps and origin 1 has only 10 steps, -legacy mode evaluates steps 1–10 and leaves steps 11–12 empty. This reproduces a historic zip -truncation behaviour required for parity with the legacy system. +legacy mode evaluates steps 1–10 and leaves steps 11–12 empty. The default is `False` (evaluate +all steps with available data). Set `legacy_compatibility=False` to evaluate all steps that have any data, regardless of whether shorter sequences exist. @@ -189,57 +168,13 @@ report.get_schema_results('month') # dict mapping key β†’ typed metrics datacla --- -## 3. The Legacy API (`EvaluationManager`) - -> **Deprecation notice:** `EvaluationManager` will be removed in Phase 3 of the orchestrator -> migration. New integrations must use the native API (Β§2). 
This section is retained for teams -> currently using the legacy path. - -### 3.1. Differences from the Native API - -- Accepts the same DataFrame inputs and config as Β§2. -- Applies **inverse transforms** based on target name prefixes: - - `ln_` prefix: applies `exp(x) - 1` to both actuals and predictions - - `lx_` prefix: applies a custom inverse log transform - - `lr_` prefix: no transform (raw values) - - No prefix: no transform - This behaviour is **absent** from the native path, which always operates on data as provided. -- Returns a dict of `{schema: (dict, DataFrame)}` tuples, not an `EvaluationReport`. -- `legacy_compatibility` is hardcoded to `True` (cannot be changed). - -### 3.2. Usage - -```python -from views_evaluation.evaluation.evaluation_manager import EvaluationManager - -manager = EvaluationManager() -config = { - 'steps': [1, 2, 3], - 'regression_targets': ['lr_ged_sb_best'], - 'regression_point_metrics': ['MSE', 'RMSLE', 'Pearson'] -} - -results = manager.evaluate( - actual=actuals, # same format as Β§2.3 - predictions=predictions_list, - target='lr_ged_sb_best', - config=config -) - -# Access results (tuple format β€” not EvaluationReport) -step_df = results['step'][1] # index 1 = DataFrame -step_dict = results['step'][0] # index 0 = raw dict -``` - ---- - -## 4. What This Library Does NOT Do +## 3. What This Library Does NOT Do -- **Does not load or save data.** Pass DataFrames in; get an `EvaluationReport` (or dict) out. +- **Does not load or save data.** Construct `EvaluationFrame` from NumPy arrays; get an `EvaluationReport` out. +- **Does not perform data alignment or adaptation.** Callers (e.g. views-pipeline-core's `EvaluationAdapter`) are responsible for aligning actuals with predictions and synthesising identifiers. - **Does not enforce k=12 or 36-month sequences.** The VIEWS standard (ADR-030) recommends k=12 rolling origins over 36-month evaluation windows, but this library accepts any sequence count and length. 
-- **Does not validate spatial or temporal alignment.** The adapter performs index intersection, but - it does not verify that sequences are in chronological order or that all origins cover the same - calendar range. +- **Does not validate spatial or temporal alignment.** It verifies shape consistency and NaN/Inf + rejection, but does not verify that sequences are chronologically ordered. - **Does not produce output files.** Persistence is handled by `views-pipeline-core` per ADR-041. diff --git a/documentation/standards/logging_and_observability_standard.md b/documentation/standards/logging_and_observability_standard.md index 3efcaf0..4191a05 100644 --- a/documentation/standards/logging_and_observability_standard.md +++ b/documentation/standards/logging_and_observability_standard.md @@ -125,7 +125,7 @@ The following must be logged: * Configuration summaries * All structural failures -> **Scope note:** Level 0 pure-math classes (`EvaluationFrame`, `NativeEvaluator`, `EvaluationReport`) rely on exception propagation per ADR-013 and do not maintain their own loggers. Logging responsibility for these components sits at the orchestration layer (e.g. `EvaluationManager` or calling code in `views-pipeline-core`). +> **Scope note:** Level 0 pure-math classes (`EvaluationFrame`, `NativeEvaluator`, `EvaluationReport`) rely on exception propagation per ADR-013 and do not maintain their own loggers. Logging responsibility for these components sits at the orchestration layer (e.g. calling code in `views-pipeline-core` or equivalent orchestrators). ### 5.2 Optional Logging diff --git a/documentation/standards/physical_architecture_standard.md b/documentation/standards/physical_architecture_standard.md new file mode 100644 index 0000000..e7c1dca --- /dev/null +++ b/documentation/standards/physical_architecture_standard.md @@ -0,0 +1,127 @@ +# Physical Architecture Standard + +**Status:** Active +**Governing ADRs:** ADR-010 (Ontology), ADR-011 (Topology) + +--- + +## 1. 
The 1-Class-1-File Standard + +**Every non-trivial class must live in its own file named after the class in `snake_case`.** + +- **Correct:** `EvaluationFrame` lives in `evaluation_frame.py`. +- **Correct:** `NativeEvaluator` lives in `native_evaluator.py`. +- **Exception:** Trivial data containers directly related to a class may coexist in the same file. + +--- + +## 2. Directory Ontology + +Files must be located in directories that match their functional category: + +``` +views_evaluation/ +β”œβ”€β”€ evaluation/ # Core evaluation logic (Level 0) +β”‚ β”œβ”€β”€ evaluation_frame.py +β”‚ β”œβ”€β”€ native_evaluator.py +β”‚ β”œβ”€β”€ metric_catalog.py +β”‚ β”œβ”€β”€ native_metric_calculators.py +β”‚ β”œβ”€β”€ evaluation_report.py +β”‚ β”œβ”€β”€ metrics.py +β”‚ └── config_schema.py +β”œβ”€β”€ adapters/ # Reserved for future framework bridges +β”‚ └── __init__.py +└── profiles/ # Named evaluation profiles + β”œβ”€β”€ base.py + └── hydranet_ucdp.py +``` + +--- + +## 3. Current State Assessment β€” Bundling + +### Compliant + +| File | Contents | Verdict | +|------|----------|---------| +| `evaluation_frame.py` | `EvaluationFrame` (1 class) | Compliant | +| `native_evaluator.py` | `NativeEvaluator` (1 class) | Compliant | +| `metric_catalog.py` | `MetricSpec` + `METRIC_CATALOG` + `METRIC_MEMBERSHIP` + `resolve_metric_params` | Cohesive module β€” spec, registries, and resolver form a single concept | +| `evaluation_report.py` | `EvaluationReport` (1 class) | Compliant | +| `config_schema.py` | `EvaluationConfig` (1 TypedDict) | Compliant | + +### Defensible Exception + +| File | Contents | Verdict | +|------|----------|---------| +| `metrics.py` | 5 dataclasses: `BaseEvaluationMetrics` + 4 typed 2x2 containers | Defensible β€” trivial data containers sharing a base class. Splitting into 5 files would create fragmentation without improving discoverability. 
| 
+
+### Identified Challenge
+
+| File | Contents | Concern |
+|------|----------|---------|
+| `native_metric_calculators.py` | 437 lines: `_guard_shapes` (shared guard), 15+ implemented metric functions spanning 4 categories, 4 placeholder stubs, 4 legacy dispatch dicts, 1 legacy alias | **Bundling challenge** |
+
+**Analysis of `native_metric_calculators.py`:**
+
+This file bundles heterogeneous concerns:
+
+1. **Shared utility** (`_guard_shapes`) — used by all metrics, should arguably be its own module or remain as a private helper.
+
+2. **Four metric families:**
+   - Regression point: MSE, MSLE, RMSLE, EMD, Pearson, MTD, y_hat_bar, MCR
+   - Regression sample: CRPS, twCRPS, MIS, QIS, QS_sample, Coverage, Ignorance
+   - Classification point: AP, Brier_point, QS_point
+   - Classification sample: Brier_sample
+
+3. **Placeholder stubs** for unimplemented metrics (SD, pEMDiv, Variogram, Jeffreys).
+
+4. **Legacy dispatch dicts and alias** — duplicate the `METRIC_MEMBERSHIP` registry and are slated for removal in Phase 3, after which `METRIC_MEMBERSHIP` becomes the single source of truth.
+
+**Why this is a challenge:**
+- The file mixes 4 distinct metric families. Adding a new regression-sample metric requires editing a 437-line file that also contains classification metrics.
+- The legacy dispatch dicts at the bottom duplicate the `METRIC_MEMBERSHIP` registry (risk C-01).
+- The file has the highest line count of any source module and the most heterogeneous responsibility set.
+
+**Why splitting is not straightforward:**
+- All metric functions share `_guard_shapes`. Splitting would either duplicate it or create a shared utility module.
+- The `MetricCatalog` imports all 22 functions from this single module. Splitting would require updating the catalog's import block.
+- Functions are stateless and flat — they are not classes, so the 1-class-1-file rule does not directly apply. 
+ +**Recommendation (for future consideration):** +If and when the file exceeds ~600 lines or the metric count exceeds ~30, consider splitting into: +``` +evaluation/ +β”œβ”€β”€ metric_calculators/ +β”‚ β”œβ”€β”€ __init__.py (re-exports all functions) +β”‚ β”œβ”€β”€ _guard.py (_guard_shapes) +β”‚ β”œβ”€β”€ regression_point.py +β”‚ β”œβ”€β”€ regression_sample.py +β”‚ β”œβ”€β”€ classification.py +β”‚ └── placeholders.py +``` + +This is a **future evolution path**, not a current mandate. The current bundling is tolerable but approaches the threshold where it creates friction. + +--- + +## 4. Import Conventions + +- **Explicit imports:** Avoid `from module import *`. +- **Circular dependency guard:** Follow ADR-011 layering. Level 0 modules must not import from Level 1 or Level 2. +- **Lazy imports for Pandas:** Pandas is imported inside methods (e.g. `to_dataframe()`) rather than at module level in Level 0/1 code. + +--- + +## 5. Enforcement + +Compliance with this standard is assessed during: +- Code review +- Repo-assimilation audits +- Tech debt cleanup cycles + +PRs that introduce new multi-class files or significantly expand existing bundled files should document the justification. + +--- + +**"The structure of the files is as rigorous as the logic of the code."** diff --git a/documentation/validate_docs.sh b/documentation/validate_docs.sh new file mode 100755 index 0000000..8bd3b9b --- /dev/null +++ b/documentation/validate_docs.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash +# Validates internal consistency of base_docs documentation set. +# Exit 0 if clean, exit 1 if issues found. + +set -uo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +errors=0 + +echo "=== base_docs validation ===" +echo "" + +# 1. 
Check for unfilled template placeholders in accepted/active files
+# (skip files whose names contain "template" — those are expected to have placeholders)
+# These are warnings only (non-blocking) since in the template repo some
+# files are legitimately Accepted with placeholder dates.
+echo "--- Checking for template placeholders in accepted/active files ---"
+warnings=0
+while IFS= read -r file; do
+  [[ -z "$file" ]] && continue
+  [[ "$file" == *template* ]] && continue
+  if grep -q 'YYYY-MM-DD' "$file"; then
+    echo "  WARN: Unfilled date placeholder in $file"
+    warnings=$((warnings + 1))
+  fi
+  if grep -q '<Deciders>' "$file"; then
+    echo "  WARN: Unfilled deciders placeholder in $file"
+    warnings=$((warnings + 1))
+  fi
+  if grep -q '<ClassName>' "$file"; then
+    echo "  WARN: Unfilled ClassName placeholder in $file"
+    warnings=$((warnings + 1))
+  fi
+done < <(grep -rl 'Status:.*\(Accepted\|Active\)' --include='*.md' . 2>/dev/null || true)
+if [ "$warnings" -eq 0 ]; then
+  echo "  OK"
+fi
+
+# 2. Verify CIC active contracts exist (skip blockquote/example lines)
+echo "--- Checking CIC active contract references ---"
+if [ -f "CICs/README.md" ]; then
+  while IFS= read -r line; do
+    [[ -z "$line" ]] && continue
+    contract=$(echo "$line" | sed -n 's/^- `\(.*\.md\)`.*$/\1/p')
+    if [ -n "$contract" ] && [ ! -f "CICs/$contract" ]; then
+      echo "  ERROR: CIC contract listed but missing: CICs/$contract"
+      errors=$((errors + 1))
+    fi
+  done < <(grep -E '^- `[A-Z].*\.md`' CICs/README.md 2>/dev/null | grep -v '>' || true)
+fi
+
+# 3. 
Cross-ADR reference integrity (constitutional ADRs 000-009 only; +# higher numbers are project-specific and not expected in the template repo) +echo "--- Checking cross-ADR references (constitutional: 000-009) ---" +while IFS= read -r ref; do + [[ -z "$ref" ]] && continue + file=$(echo "$ref" | cut -d: -f1) + adr_num=$(echo "$ref" | grep -oP 'ADR-00\K[0-9]' | head -1) + if [ -n "$adr_num" ]; then + match_count=$(find ADRs -name "00${adr_num}_*.md" 2>/dev/null | wc -l) + if [ "$match_count" -eq 0 ]; then + echo " ERROR: $file references ADR-00${adr_num} but no matching file found" + errors=$((errors + 1)) + fi + fi +done < <(grep -rn 'ADR-00[0-9]' --include='*.md' . 2>/dev/null || true) + +# 4. Check that referenced protocol files exist +echo "--- Checking protocol file references ---" +while IFS= read -r ref; do + [[ -z "$ref" ]] && continue + file=$(echo "$ref" | cut -d: -f1) + proto=$(echo "$ref" | grep -oP 'contributor_protocols/[a-z_]+\.md' | head -1) + if [ -n "$proto" ] && [ ! -f "$proto" ]; then + echo " ERROR: $file references $proto but file does not exist" + errors=$((errors + 1)) + fi +done < <(grep -rn 'contributor_protocols/' --include='*.md' . 2>/dev/null || true) + +# 5. Report template status markers +echo "--- Checking template status markers ---" +template_count=$(grep -rl '\-\-template\-\-' --include='*.md' . 
2>/dev/null | wc -l) +echo " INFO: $template_count files still have --template-- status (expected in template repo)" + +echo "" +if [ "$errors" -gt 0 ]; then + echo "=== FAILED: $errors issue(s) found ===" + exit 1 +else + echo "=== PASSED: no issues found ===" + exit 0 +fi diff --git a/examples/evaluate_native_prototype.py b/examples/evaluate_native_prototype.py index 1e252ed..c131e36 100644 --- a/examples/evaluate_native_prototype.py +++ b/examples/evaluate_native_prototype.py @@ -1,56 +1,50 @@ +""" +Example: EvaluationFrame grouping and metric computation + +Demonstrates how EvaluationFrame provides month-wise, step-wise, +and origin-wise grouping for evaluation schemas. +""" import numpy as np -import pandas as pd -from views_evaluation.evaluation.adapters import PandasAdapter from views_evaluation.evaluation.evaluation_frame import EvaluationFrame + def mock_metrics_mse(ef: EvaluationFrame) -> float: - """A 'native' metric that uses broadcasting.""" - # y_true (N,) broadcasts to (N, S) - # y_pred (N, S) - # result (N, S) -> mean(axis=1) -> (N,) -> mean() -> scalar + """A native metric using broadcasting: mean((y_true - y_pred)^2).""" errors = (ef.y_true[:, np.newaxis] - ef.y_pred) ** 2 return np.mean(errors) -def run_parity_demo(): - # 1. Create dummy data mimicking the current structure - index = pd.MultiIndex.from_product([[100, 101, 102], [1, 2]], names=['month', 'unit']) - actual = pd.DataFrame({'target': np.random.rand(6)}, index=index) - - # Two sequences (overlapping) - pred_0 = pd.DataFrame({'pred_target': [[x, x+0.1] for x in np.random.rand(4)]}, - index=index[:4]) - pred_1 = pd.DataFrame({'pred_target': [[x, x+0.1] for x in np.random.rand(4)]}, - index=index[2:]) - - print("--- 1. 
Adapter Phase ---") - ef = PandasAdapter.from_dataframes(actual, [pred_0, pred_1], "target") + +def run_demo(): + rng = np.random.default_rng(42) + + # Build 2 overlapping sequences, 3 months each, 2 units + rows, y_true_list, y_pred_list = [], [], [] + for origin in range(2): + for step_idx, month in enumerate(range(100 + origin, 103 + origin)): + for unit in [1, 2]: + rows.append((month, unit, origin, step_idx + 1)) + y_true_list.append(rng.random()) + y_pred_list.append([rng.random(), rng.random()]) # 2-sample ensemble + + ef = EvaluationFrame( + y_true=np.array(y_true_list), + y_pred=np.array(y_pred_list), + identifiers={ + 'time': np.array([r[0] for r in rows]), + 'unit': np.array([r[1] for r in rows]), + 'origin': np.array([r[2] for r in rows]), + 'step': np.array([r[3] for r in rows]), + }, + metadata={'target': 'target'}, + ) print(ef) - print("\n--- 2. Schema Preservation ---") - - # Month-wise - print("\nMonth-wise Groups:") - month_groups = ef.get_group_indices('time') - for month, idx in month_groups.items(): - sub_ef = ef.select_indices(idx) - mse = mock_metrics_mse(sub_ef) - print(f" Month {month}: {sub_ef.n_rows} rows, MSE={mse:.4f}") - - # Step-wise - print("\nStep-wise Groups:") - step_groups = ef.get_group_indices('step') - for step, idx in step_groups.items(): - sub_ef = ef.select_indices(idx) - mse = mock_metrics_mse(sub_ef) - print(f" Step {step}: {sub_ef.n_rows} rows, MSE={mse:.4f}") - - # Sequence-wise (Origin-wise) - print("\nSequence-wise Groups:") - origin_groups = ef.get_group_indices('origin') - for origin, idx in origin_groups.items(): - sub_ef = ef.select_indices(idx) - mse = mock_metrics_mse(sub_ef) - print(f" Sequence {origin}: {sub_ef.n_rows} rows, MSE={mse:.4f}") + for group_key in ['time', 'step', 'origin']: + print(f"\n{group_key.title()}-wise Groups:") + for val, idx in ef.get_group_indices(group_key).items(): + sub = ef.select_indices(idx) + print(f" {group_key}={val}: {sub.n_rows} rows, MSE={mock_metrics_mse(sub):.4f}") + if 
__name__ == "__main__": - run_parity_demo() + run_demo() diff --git a/examples/quickstart.ipynb b/examples/quickstart.ipynb deleted file mode 100644 index 84b36d7..0000000 --- a/examples/quickstart.ipynb +++ /dev/null @@ -1,371 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Quick Start\n", - "In this notebook, we go over the main functionalities of the library\n", - "\n", - "## Table of Contents\n", - "1. [Installation](#installation)\n", - "2. [Importing Libraries](#importing-libraries)\n", - "3. [Creating Sample Data](#creating-sample-data)\n", - "4. [Initializing EvaluationManager](#initializing-evaluationmanager)\n", - "5. [Evaluating Predictions](#evaluating-predictions)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Installation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We recommend using some virtual environment" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "With pip" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`pip install views-evaluation`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Importing Libraries\n", - "First, let's import a few things:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from views_evaluation.evaluation.evaluation_manager import EvaluationManager\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating Sample Data\n", - "\n", - "Let's create some sample data for actual values and predictions." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "index = pd.MultiIndex.from_tuples(\n", - " [(99, 1), (99, 2), (100, 1), (100, 2), (101, 1), (101, 2), (102, 1), (102, 2)],\n", - " names=[\"month\", \"country\"],\n", - ")\n", - "index_0 = pd.MultiIndex.from_tuples(\n", - " [(100, 1), (100, 2), (101, 1), (101, 2)],\n", - " names=[\"month\", \"country\"],\n", - ")\n", - "index_1 = pd.MultiIndex.from_tuples(\n", - " [(101, 1), (101, 2), (102, 1), (102, 2)],\n", - " names=[\"month\", \"country\"],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# Actual data\n", - "df_actual = pd.DataFrame(\n", - " {\n", - " \"lr_target\": [0, 1, 1, 2, 2, 3, 3, 4],\n", - " \"covariate_1\": [3, 2, 4, 5, 2, 6, 8, 5],\n", - " },\n", - " index=index,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# Point predictions\n", - "df1_point = pd.DataFrame({\"pred_lr_target\": [1, 3, 5, 7]}, index=index_0)\n", - "df2_point = pd.DataFrame({\"pred_lr_target\": [2, 4, 6, 8]}, index=index_1)\n", - "dfs_point = [df1_point, df2_point]\n", - "\n", - "# Uncertainty\n", - "df1_uncertainty = pd.DataFrame(\n", - " {\"pred_lr_target\": [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6]]}, index=index_0\n", - ")\n", - "df2_uncertainty = pd.DataFrame(\n", - " {\"pred_lr_target\": [[4, 6, 8], [5, 7, 9], [6, 8, 10], [7, 9, 11]]}, index=index_1\n", - ")\n", - "dfs_uncertainty = [df1_uncertainty, df2_uncertainty]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initializing EvaluationManager\n", - "Now, we can initialize the `EvaluationManager` with the metrics we want to evaluate.\n", - "Point evaluation supports the following metrics:\n", - "- RMSLE\n", - "- CRPS\n", - "- Average Precision\n", - "\n", - "Uncertainty evaluation supports the following metric:\n", - "- CRPS" - ] - }, - { 
- "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "metrics_list = ['RMSLE', 'CRPS', 'MIS'] # Add other metrics as needed\n", - "evaluation_manager = EvaluationManager(metrics_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Evaluating Predictions" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Metric MIS is not a default metric, skipping...\n", - "Metric MIS is not a default metric, skipping...\n", - "Metric MIS is not a default metric, skipping...\n" - ] - } - ], - "source": [ - "config = {\"steps\": [1, 2]}\n", - "point_evaluation_results = evaluation_manager.evaluate(df_actual, dfs_point, target='lr_target', config=config)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( RMSLE CRPS\n", - " month100 0.203422 0.5\n", - " month101 0.502668 2.0\n", - " month102 0.573874 3.5,\n", - " RMSLE CRPS\n", - " step01 0.182040 0.5\n", - " step02 0.636311 3.5,\n", - " RMSLE CRPS\n", - " ts00 0.510800 2.0\n", - " ts01 0.420849 2.0)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "point_evaluation_results['month'][1], point_evaluation_results['step'][1], point_evaluation_results['time_series'][1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Metrics will be **ignored** if not in the supported metric list" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Metric RMSLE is not a default metric, skipping...\n", - "Metric RMSLE is not a default metric, skipping...\n", - "Metric RMSLE is not a default metric, skipping...\n" - ] - } - ], - "source": [ - "uncertainty_evaluation_results = 
evaluation_manager.evaluate(df_actual, dfs_uncertainty, target='lr_target', config=config)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( CRPS MIS\n", - " month100 0.555556 3.90\n", - " month101 2.333333 65.85\n", - " month102 4.111111 127.80,\n", - " CRPS MIS\n", - " step01 1.833333 45.85\n", - " step02 2.833333 85.85,\n", - " CRPS MIS\n", - " ts00 1.055556 23.9\n", - " ts01 3.611111 107.8)" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "uncertainty_evaluation_results['month'][1], uncertainty_evaluation_results['step'][1], uncertainty_evaluation_results['time_series'][1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you are only interested in one of the evaluation schemas, you can call the corresponding function" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Metric MIS is not a default metric, skipping...\n" - ] - } - ], - "source": [ - "# Get the evaluation type, i.e., uncertainty or point\n", - "actual = EvaluationManager.transform_data(\n", - " EvaluationManager.convert_to_array(df_actual, \"lr_target\"), 'lr_target'\n", - " )\n", - "predictions = [\n", - " EvaluationManager.transform_data(\n", - " EvaluationManager.convert_to_array(pred, \"pred_lr_target\"), \"pred_lr_target\"\n", - " )\n", - " for pred in dfs_point\n", - "]\n", - "is_uncertainty = EvaluationManager.get_evaluation_type(predictions, 'pred_lr_target')\n", - "month_point_evaluation_results = evaluation_manager.month_wise_evaluation(actual, predictions, target='lr_target', is_uncertainty=is_uncertainty)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " RMSLE CRPS\n", - "month100 0.203422 
0.5\n", - "month101 0.502668 2.0\n", - "month102 0.573874 3.5\n" - ] - } - ], - "source": [ - "print(month_point_evaluation_results[1])" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'step01': PointEvaluationMetrics(MSE=None, MSLE=None, RMSLE=0.18203984406117593, CRPS=0.5, AP=None, EMD=None, SD=None, pEMDiv=None, Pearson=None, Variogram=None, y_hat_bar=None),\n", - " 'step02': PointEvaluationMetrics(MSE=None, MSLE=None, RMSLE=0.636311445241193, CRPS=3.5, AP=None, EMD=None, SD=None, pEMDiv=None, Pearson=None, Variogram=None, y_hat_bar=None)}" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "point_evaluation_results['step'][0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "testenv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/using_native_api.py b/examples/using_native_api.py index 84bd00d..90075df 100644 --- a/examples/using_native_api.py +++ b/examples/using_native_api.py @@ -1,52 +1,47 @@ """ Example: Using the Native Evaluation API -This script demonstrates the modern, performant way to evaluate forecasts -using the NativeEvaluator and EvaluationFrame. This path is up to 14x -faster for probabilistic forecasts. +This script demonstrates how to evaluate forecasts using the +NativeEvaluator and EvaluationFrame with pure NumPy arrays. 
""" -import pandas as pd -from views_evaluation import PandasAdapter, NativeEvaluator +import numpy as np +from views_evaluation import EvaluationFrame, NativeEvaluator -# 1. Prepare dummy data (The legacy format) -index = pd.MultiIndex.from_product([[100, 101], [1, 2]], names=['month', 'unit']) -actuals = pd.DataFrame({'target': [0, 1, 0, 1]}, index=index) -# 3-sample ensemble predictions -preds = [ - pd.DataFrame({'pred_target': [[0.1, 0.2, 0.05], [0.8, 0.9, 0.7]]}, index=index[:2]), - pd.DataFrame({'pred_target': [[0.1, 0.15, 0.2], [0.7, 0.8, 0.9]]}, index=index[2:]) -] +# 1. Prepare data as NumPy arrays +n = 4 +y_true = np.array([0.0, 1.0, 0.0, 1.0]) +y_pred = np.array([ + [0.1, 0.2, 0.05], + [0.8, 0.9, 0.7], + [0.1, 0.15, 0.2], + [0.7, 0.8, 0.9], +]) # shape (4, 3) β€” 3-sample ensemble -# 2. Configure metrics +identifiers = { + 'time': np.array([100, 100, 101, 101]), + 'unit': np.array([1, 2, 1, 2]), + 'origin': np.array([0, 0, 1, 1]), + 'step': np.array([1, 1, 1, 1]), +} + +# 2. Construct EvaluationFrame (validates shapes, NaN, identifiers) +ef = EvaluationFrame(y_true, y_pred, identifiers, metadata={'target': 'target'}) +print(f"EvaluationFrame: {ef.n_rows} rows, {ef.n_samples} samples") + +# 3. Configure and evaluate config = { 'steps': [1], 'regression_targets': ['target'], - 'regression_sample_metrics': ['CRPS', 'Ignorance'] + 'regression_sample_metrics': ['CRPS', 'Ignorance'], } - -print("--- Step 1: Adapt Data ---") -# The adapter performs alignment, truth-duplication, and list-extraction. -# This step can be moved to the Orchestration layer (Pipeline Core) in the future. -ef = PandasAdapter.from_dataframes(actuals, preds, "target") -print(f"Adapted data: {ef.n_rows} rows, {ef.n_samples} samples") - -print("") -print("--- Step 2: Evaluate ---") -# The evaluator is stateless and pure math (no pandas). 
evaluator = NativeEvaluator(config) report = evaluator.evaluate(ef) print("Evaluation complete.") -print("") -print("--- Step 3: Export Results ---") -# Convert to DataFrames only when needed for reporting -month_df = report.to_dataframe(schema="month") -print("Month-wise results:") -print(month_df) +# 4. Export results +print("\nMonth-wise results (dict):") +print(report.to_dict()['schemas']['month']) -# Or export to pure dict for JSON serialization -json_friendly_dict = report.to_dict() -print("") -print("JSON Export (Sample):") -print(f"Target: {json_friendly_dict['target']}") -print(f"Schemas found: {list(json_friendly_dict['schemas'].keys())}") +print("\nFull export:") +d = report.to_dict() +print(f"Target: {d['target']}, Schemas: {list(d['schemas'].keys())}") diff --git a/pyproject.toml b/pyproject.toml index 99d03c4..bed92eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "views_evaluation" -version = "0.4.0" +version = "0.5.0" description = "" authors = [ "Xiaolong Sun ", @@ -13,7 +13,10 @@ readme = "README.md" python = ">=3.11,<3.15" scikit-learn = "^1.6.0" numpy = "^1.26.4" -pandas = "^1.5.3" # PHASE-3-DELETE: will become optional once EvaluationManager/PandasAdapter are removed +pandas = {version = "^1.5.3", optional = true} + +[tool.poetry.extras] +dataframe = ["pandas"] [tool.poetry.group.dev.dependencies] pytest = "^8.3.3" diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index 7144b73..0000000 --- a/tests/conftest.py +++ /dev/null @@ -1,64 +0,0 @@ -""" -PHASE-3-DELETE: Pytest fixtures for the legacy EvaluationManager test suite. -Will be deleted when Phase 3 of the orchestrator migration is complete. 
-See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md -""" -import pandas as pd -import numpy as np -import pytest - -# A fixture to generate mock data for tests -@pytest.fixture -def mock_data_factory(): - def _generate( - target_name="lr_ged_sb_best", - point_predictions_as_list=True, - num_sequences=2, - num_steps=3, - num_locations=2, - start_month=500, - ): - pred_col_name = f"pred_{target_name}" - loc_id_name = "country_id" - - # 1. Actuals DataFrame - actuals_index = pd.MultiIndex.from_product( - [range(start_month, start_month + num_sequences + num_steps), range(num_locations)], - names=['month_id', loc_id_name] - ) - actuals = pd.DataFrame( - {target_name: np.random.randint(0, 50, size=len(actuals_index))}, - index=actuals_index - ) - - # 2. Predictions List - predictions_list = [] - for i in range(num_sequences): - preds_index = pd.MultiIndex.from_product( - [range(start_month + i, start_month + i + num_steps), range(num_locations)], - names=['month_id', loc_id_name] - ) - - if point_predictions_as_list: - # Canonical format: list of single floats - pred_values = [[val] for val in np.random.rand(len(preds_index)) * 50] - else: - # Non-canonical format: raw floats - pred_values = [val for val in np.random.rand(len(preds_index)) * 50] - - preds = pd.DataFrame( - {pred_col_name: pred_values}, - index=preds_index - ) - predictions_list.append(preds) - - # 3. 
Config - config = { - 'steps': list(range(1, num_steps + 1)), - 'regression_targets': [target_name], - 'regression_point_metrics': ['MSE', 'RMSLE', 'Pearson'], - } - - return actuals, predictions_list, target_name, config - - return _generate diff --git a/tests/test_adversarial_inputs.py b/tests/test_adversarial_inputs.py index 3a9110d..21c890c 100644 --- a/tests/test_adversarial_inputs.py +++ b/tests/test_adversarial_inputs.py @@ -1,229 +1,16 @@ -import pandas as pd import numpy as np import pytest -from views_evaluation.evaluation.evaluation_manager import EvaluationManager from views_evaluation.evaluation.evaluation_frame import EvaluationFrame from views_evaluation.evaluation.native_evaluator import NativeEvaluator -@pytest.fixture -def adversarial_data_factory(mock_data_factory): - """A fixture that extends the mock_data_factory to create adversarial data.""" - def _generate( - target_name="lr_ged_sb_best", - num_sequences=1, - num_steps=1, - num_locations=1, - start_month=500, - actuals_value=10.0, - predictions_value=[[10.0]], - ): - pred_col_name = f"pred_{target_name}" - loc_id_name = "country_id" - - # 1. Actuals DataFrame - actuals_index = pd.MultiIndex.from_product( - [range(start_month, start_month + num_steps), range(num_locations)], - names=['month_id', loc_id_name] - ) - actuals = pd.DataFrame( - {target_name: actuals_value}, - index=actuals_index - ) - - # 2. Predictions List - predictions_list = [] - preds_index = pd.MultiIndex.from_product( - [range(start_month, start_month + num_steps), range(num_locations)], - names=['month_id', loc_id_name] - ) - preds = pd.DataFrame( - {pred_col_name: predictions_value}, - index=preds_index - ) - predictions_list.append(preds) - - # 3. 
Config - config = { - 'steps': list(range(1, num_steps + 1)), - 'regression_targets': [target_name], - 'regression_point_metrics': ['RMSLE'], - } - - return actuals, predictions_list, target_name, config - - return _generate - - -class TestAdversarialInputs: - """ - A test suite for Phase 2: Adversarial and Edge-Case Testing. - These tests probe for robustness and predictable failure modes. - """ - - def test_corrupted_numerical_data_nan_in_actuals(self, adversarial_data_factory): - """ - Tests behavior when np.nan is present in the actuals data. - Expected: A ValueError should be raised by the underlying sklearn metric. - """ - # Arrange - actuals, predictions, target, config = adversarial_data_factory( - actuals_value=np.nan, - predictions_value=[[10.0]] - ) - manager = EvaluationManager() - - # Act & Assert - with pytest.raises(ValueError, match="Input contains NaN"): - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - - def test_corrupted_numerical_data_nan_in_predictions(self, adversarial_data_factory): - """ - Tests behavior when np.nan is present in the predictions data. - Expected: A ValueError should be raised by the underlying sklearn metric. - """ - # Arrange - actuals, predictions, target, config = adversarial_data_factory( - actuals_value=10.0, - predictions_value=[[np.nan]] - ) - manager = EvaluationManager() - - # Act & Assert - with pytest.raises(ValueError, match="Input contains NaN"): - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - - def test_corrupted_numerical_data_inf_in_actuals(self, adversarial_data_factory): - """ - Tests behavior when np.inf is present in the actuals data. - Expected: A ValueError should be raised. 
- """ - # Arrange - actuals, predictions, target, config = adversarial_data_factory( - actuals_value=np.inf, - predictions_value=[[10.0]] - ) - manager = EvaluationManager() - - # Act & Assert - with pytest.raises(ValueError, match="Input contains infinity"): - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - - def test_corrupted_numerical_data_inf_in_predictions(self, adversarial_data_factory): - """ - Tests behavior when np.inf is present in the predictions data. - Expected: A ValueError should be raised. - """ - # Arrange - actuals, predictions, target, config = adversarial_data_factory( - actuals_value=10.0, - predictions_value=[[np.inf]] - ) - manager = EvaluationManager() - - # Act & Assert - with pytest.raises(ValueError, match="Input contains infinity"): - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - - def test_malformed_structural_data_empty_predictions_list(self, adversarial_data_factory): - """ - Tests behavior when an empty list is passed for predictions. - Expected: A ValueError should be raised by pandas.concat. - """ - # Arrange - actuals, _, target, config = adversarial_data_factory() - empty_predictions = [] - manager = EvaluationManager() - - # Act & Assert - with pytest.raises(ValueError, match="No objects to concatenate"): - manager.evaluate( - actual=actuals, - predictions=empty_predictions, - target=target, - config=config - ) - - def test_malformed_structural_data_empty_actuals_df(self, adversarial_data_factory): - """ - Tests behavior when an empty DataFrame is passed for actuals. - Expected: A KeyError should be raised when trying to access the target column. 
- """ - # Arrange - _, predictions, target, config = adversarial_data_factory() - empty_actuals = pd.DataFrame() - manager = EvaluationManager() - - # Act & Assert - with pytest.raises(KeyError): - manager.evaluate( - actual=empty_actuals, - predictions=predictions, - target=target, - config=config - ) - - def test_malformed_structural_data_non_overlapping_indices(self, adversarial_data_factory): - """ - Tests behavior when actuals and predictions have no overlapping indices. - Expected: A ValueError should be raised by np.concatenate in the metric calculator. - """ - # Arrange - # Create actuals starting at month 500 - actuals, _, target, config = adversarial_data_factory(start_month=500, num_locations=1) - - # Create predictions starting at month 600, ensuring no overlap - pred_col_name = f"pred_{target}" - # Correctly create a 2-level MultiIndex - preds_index = pd.MultiIndex.from_product( - [range(600, 602), [10]], # Non-overlapping range for month_id - names=['month_id', "country_id"] - ) - preds = pd.DataFrame({pred_col_name: [[10.0]] * 2}, index=preds_index) - predictions_non_overlapping = [preds] - - manager = EvaluationManager() - - # Act & Assert - with pytest.raises(ValueError, match="need at least one array to concatenate"): - manager.evaluate( - actual=actuals, - predictions=predictions_non_overlapping, - target=target, - config=config - ) - class TestAdversarialNativeInputs: """ Adversarial tests targeting EvaluationFrame + NativeEvaluator directly. - These tests are the native-path equivalents of TestAdversarialInputs above. - They must survive Phase 3 of the orchestrator migration (ADR-011), when - EvaluationManager and PandasAdapter are removed from this repository. - - Every test here asserts ADR-013 (Fail-Loud) behaviour from the entry - points that will remain after the migration. + Every test here asserts ADR-013 (Fail-Loud) behaviour from the + permanent native-path entry points. 
""" @staticmethod @@ -317,3 +104,23 @@ def test_unimplemented_metric_raises_clear_value_error(self): with pytest.raises(ValueError, match="not yet implemented"): NativeEvaluator(config).evaluate(ef) + def test_nan_rejected_before_brier_executes(self): + """Defense-in-depth: EvaluationFrame rejects NaN so Brier's NaN-swallowing + comparison semantics can never be triggered through the normal evaluation path.""" + with pytest.raises(ValueError, match="NaN"): + EvaluationFrame( + y_true=np.array([np.nan, 1.0]), + y_pred=np.array([[0.5], [0.8]]), + identifiers=self._simple_ids(2), + metadata={'target': 'cls_target'}, + ) + + def test_inf_rejected_before_metric_executes(self): + """Defense-in-depth: EvaluationFrame rejects Inf before any metric function runs.""" + with pytest.raises(ValueError, match="infinity"): + EvaluationFrame( + y_true=np.array([np.inf, 1.0]), + y_pred=np.array([[0.5], [0.8]]), + identifiers=self._simple_ids(2), + metadata={'target': 'cls_target'}, + ) diff --git a/tests/test_data_contract.py b/tests/test_data_contract.py deleted file mode 100644 index 7e53490..0000000 --- a/tests/test_data_contract.py +++ /dev/null @@ -1,80 +0,0 @@ -""" -PHASE-3-DELETE: Tests data contract enforcement for the legacy EvaluationManager path. -Will be deleted when Phase 3 of the orchestrator migration is complete. 
-See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md -""" -import pandas as pd -import pytest -from views_evaluation.evaluation.evaluation_manager import EvaluationManager - -@pytest.fixture -def mock_data(): - target = "lr_target" - index = pd.MultiIndex.from_tuples([(100, 1), (101, 1)], names=["month", "id"]) - actual = pd.DataFrame({target: [10, 20]}, index=index) - config = { - "steps": [1, 2], - "regression_targets": [target], - "regression_point_metrics": ["MSE"], - } - return actual, target, config, index - -def test_missing_pred_column(mock_data): - actual, target, config, index = mock_data - # Column name is wrong - pred_df = pd.DataFrame({"wrong_name": [[10.5], [19.5]]}, index=index) - manager = EvaluationManager() - - with pytest.raises(ValueError, match=f"must contain the column named 'pred_{target}'"): - manager.evaluate(actual, [pred_df], target, config) - -def test_extra_columns_raises_error(mock_data): - """Verify that extra columns now raise a ValueError per the documentation.""" - actual, target, config, index = mock_data - pred_df = pd.DataFrame({ - f"pred_{target}": [[10.5], [19.5]], - "extra_garbage": [1, 2] - }, index=index) - manager = EvaluationManager() - - with pytest.raises(ValueError, match="must contain exactly one column"): - manager.evaluate(actual, [pred_df], target, config) - -def test_duplicate_pred_columns_raises_error(mock_data): - """Verify that duplicate target columns cause a failure (currently a crash).""" - actual, target, config, index = mock_data - df1 = pd.DataFrame({f"pred_{target}": [[10.5], [19.5]]}, index=index) - df2 = pd.DataFrame({f"pred_{target}": [[11.0], [20.0]]}, index=index) - pred_df = pd.concat([df1, df2], axis=1) - - manager = EvaluationManager() - - # We expect a failure. Note: Ideally we want a custom ValueError from our validator. - # Currently it raises a numpy/pandas ValueError during calculation. 
- with pytest.raises(ValueError): - manager.evaluate(actual, [pred_df], target, config) - -def test_zero_index_overlap_graceful_failure(mock_data): - """Verify behavior when actuals and predictions have no common months.""" - actual, target, config, _ = mock_data - # Preds are for months 200, 201 (no overlap with 100, 101) - index_no_overlap = pd.MultiIndex.from_tuples([(200, 1), (201, 1)], names=["month", "id"]) - pred_df = pd.DataFrame({f"pred_{target}": [[10.5], [19.5]]}, index=index_no_overlap) - - manager = EvaluationManager() - - # Currently, this crashes in np.concatenate inside the metric calculator. - # We want it to either raise a clear error or return NaNs. - with pytest.raises((ValueError, KeyError)): - manager.evaluate(actual, [pred_df], target, config) - -def test_mixed_point_and_sample_types(mock_data): - actual, target, config, index = mock_data - # First is point, second is sample - pred1 = pd.DataFrame({f"pred_{target}": [[10.5], [19.5]]}, index=index) - pred2 = pd.DataFrame({f"pred_{target}": [[10, 11, 12], [19, 20, 21]]}, index=index) - - manager = EvaluationManager() - - with pytest.raises(ValueError, match="Mix of evaluation types detected"): - manager.evaluate(actual, [pred1, pred2], target, config) diff --git a/tests/test_documentation_contracts.py b/tests/test_documentation_contracts.py deleted file mode 100644 index 8c096d2..0000000 --- a/tests/test_documentation_contracts.py +++ /dev/null @@ -1,286 +0,0 @@ -""" -PHASE-3-DELETE: Tests documentation contracts for the legacy EvaluationManager path. -Will be deleted when Phase 3 of the orchestrator migration is complete. -See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md -""" -import pandas as pd -import numpy as np -import pytest - -from views_evaluation.evaluation.evaluation_manager import EvaluationManager - - -class TestDocumentationContracts: - """ - A test suite to verify the claims made in the project's documentation. 
- """ - - def test_eval_lib_imp_actuals_schema_prefix_requirement_succeeds(self, mock_data_factory): - """ - Verifies Section 3.1 of eval_lib_imp.md. - Claim: Evaluation succeeds if the target name has a valid prefix. - """ - # Arrange - target_with_prefix = "lr_ged_sb_best" - actuals, predictions, target, config = mock_data_factory(target_name=target_with_prefix) - manager = EvaluationManager() - - # Act & Assert - try: - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - except ValueError as e: - pytest.fail(f"Evaluation failed unexpectedly with a valid prefix: {e}") - - def test_eval_lib_imp_actuals_schema_prefix_requirement_fails(self, mock_data_factory): - """ - Verifies updated behaviour from Section 3.1 of eval_lib_imp.md. - Old claim: Evaluation fails if the target name is missing a valid prefix. - New behaviour: The new EvaluationManager no longer validates prefixes in evaluate(). - transform_data() issues a warning for unknown prefixes but applies an identity - transform and continues. Evaluation therefore *succeeds* with an unknown prefix as - long as the target is declared in the config. - """ - # Arrange - target_without_prefix = "ged_sb_best" - actuals, predictions, target, config = mock_data_factory(target_name=target_without_prefix) - manager = EvaluationManager() - - # Act & Assert β€” should now succeed (prefix validation removed from evaluate()) - try: - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - except Exception as e: - pytest.fail( - f"evaluate() raised unexpectedly for a target with no recognised prefix: {e}" - ) - - def test_eval_lib_imp_predictions_schema_point_canonical_succeeds(self, mock_data_factory): - """ - Verifies Section 3.2 of eval_lib_imp.md. - Claim: Evaluation succeeds if point predictions are canonical (list of single float). 
- """ - # Arrange - actuals, predictions, target, config = mock_data_factory(point_predictions_as_list=True) - manager = EvaluationManager() - - # Act & Assert - try: - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - except ValueError as e: - pytest.fail(f"Evaluation failed unexpectedly with canonical point predictions: {e}") - - def test_eval_lib_imp_predictions_schema_point_non_canonical_succeeds_due_to_implicit_conversion(self, mock_data_factory): - """ - Verifies Section 3.2 of eval_lib_imp.md by demonstrating a divergence. - Claim: Documentation states evaluation fails if point predictions are non-canonical (raw float). - Observed: Evaluation *succeeds* due to implicit conversion in EvaluationManager, making documentation inaccurate. - """ - # Arrange - actuals, predictions, target, config = mock_data_factory(point_predictions_as_list=False) - manager = EvaluationManager() - - # Act & Assert - try: - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - except ValueError as e: - pytest.fail(f"Evaluation *should have succeeded* with non-canonical point predictions due to implicit conversion, but failed with: {e}") - - def test_evaluation_manager_implicitly_converts_raw_floats_to_arrays(self, mock_data_factory): - """ - Explicitly verifies the implicit conversion of raw float predictions to np.ndarray([float]) - by EvaluationManager's internal _process_data method. - This behavior contradicts eval_lib_imp.md's claim that raw floats should cause an error. 
- """ - # Arrange - actuals, predictions, target, config = mock_data_factory(point_predictions_as_list=False) - manager = EvaluationManager() - - # Act - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - - # Assert - # After evaluate, internal predictions should be processed - processed_predictions = manager.predictions - # Check that the first value in the first DataFrame of processed_predictions is now a np.ndarray - assert isinstance(processed_predictions[0].iloc[0, 0], np.ndarray) - # Check that its length is 1 (single element) - assert len(processed_predictions[0].iloc[0, 0]) == 1 - - def test_eval_lib_imp_api_contract_missing_steps_config_fails(self, mock_data_factory): - """ - Verifies Section 4.2 of eval_lib_imp.md. - Claim: The `evaluate` method's `config` parameter *must* contain the key 'steps'. - """ - # Arrange - actuals, predictions, target, _ = mock_data_factory() # Use _ to ignore the default config - manager = EvaluationManager() - invalid_config = {} # Missing 'steps' key - - # Act & Assert - with pytest.raises(KeyError, match="'steps'"): - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=invalid_config - ) - - def test_eval_lib_imp_data_state_coherency_no_inverse_transform(self, mock_data_factory): - """ - Verifies Section 3.4 of eval_lib_imp.md. - Claim: EvaluationManager does NOT perform inverse transformations on prediction data (producer's responsibility). 
- """ - # Arrange - target_name = "lr_some_var" # lr_ prefix means raw, no transform by EM - pred_col_name = f"pred_{target_name}" - loc_id_name = "country_id" - - # Create actuals (raw counts) - actuals_index = pd.MultiIndex.from_product([[500], [10]], names=['month_id', loc_id_name]) - actuals = pd.DataFrame( - {target_name: [100]}, # Actual value is 100 - index=actuals_index - ) - - # Create predictions that are log-transformed, but named as 'lr_' to indicate raw input - # So, if EM were to inverse transform, it would be wrong, but it shouldn't inverse transform - predictions_list = [] - preds_index = pd.MultiIndex.from_product([[500], [10]], names=['month_id', loc_id_name]) - # Prediction of log(100+1) - 1, which is approximately 4.6 (ln(101)-1) - # If EM doesn't inverse transform, RMSLE will be calculated with 4.6 vs 100 - # If EM incorrectly inverse transformed, it would see 4.6, transform it back, then calculate RMSLE - pred_values_log_transformed = [[np.log1p(100)]] # Represents log(100+1) - predictions_df = pd.DataFrame( - {pred_col_name: pred_values_log_transformed}, - index=preds_index - ) - predictions_list.append(predictions_df) - - manager = EvaluationManager() - - # We need a config with steps and the new required keys - config = { - 'steps': [1], - 'regression_targets': [target_name], - 'regression_point_metrics': ['RMSLE'], - } - - # Act - results = manager.evaluate( - actual=actuals, - predictions=predictions_list, - target=target_name, - config=config - ) - - # Assert - # Get the RMSLE for the step-wise evaluation - rmsle = results['step'][1]['RMSLE'][0] # Access the DataFrame, then RMSLE column, then first value - - # If EM incorrectly inverse-transformed, RMSLE would be close to 0 - # If EM correctly *doesn't* inverse-transform, RMSLE is calculated with actual=100 and pred=log1p(100) - # log1p(100) is approx 4.615 - # RMSLE(100, 4.615) is large. - - # A simple check: if RMSLE is very small, it means inverse transform *did* happen. 
- # We expect it to be large. - assert rmsle > 1.0 # Arbitrary large threshold to show it's not a small error - - def test_r2darts2_report_point_prediction_format_succeeds(self, mock_data_factory): - """ - Verifies Section B.1 of the plan (from r2darts2_full_imp_report.md). - Claim: views-r2darts2 produces point predictions as a list (e.g., [[25.5]]). - """ - # Arrange - # Use mock_data_factory with point_predictions_as_list=True to simulate r2darts2 output - actuals, predictions, target, config = mock_data_factory(point_predictions_as_list=True) - manager = EvaluationManager() - - # Act & Assert - try: - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - except ValueError as e: - pytest.fail(f"Evaluation failed unexpectedly when processing r2darts2-like canonical point predictions: {e}") - - def test_stepshifter_report_point_prediction_format_succeeds_despite_raw_float_output(self, mock_data_factory): - """ - Verifies Section C.1 of the plan (from stepshifter_full_imp_report.md). - Claim: views-stepshifter produces point predictions as raw np.float64 values (contradicts eval_lib_imp.md). - Observed: EvaluationManager implicitly converts and processes successfully. - """ - # Arrange - # Use mock_data_factory with point_predictions_as_list=False to simulate stepshifter output - actuals, predictions, target, config = mock_data_factory(point_predictions_as_list=False) - manager = EvaluationManager() - - # Act & Assert - try: - manager.evaluate( - actual=actuals, - predictions=predictions, - target=target, - config=config - ) - except ValueError as e: - pytest.fail(f"Evaluation *should have succeeded* with stepshifter-like raw float predictions due to implicit conversion, but failed with: {e}") - - def test_stepshifter_report_reconciliation_fix_succeeds(self, mock_data_factory): - """ - Verifies Section C.2 of the plan (from stepshifter_full_imp_report.md). 
- Claim: Applying the reconciliation fix (float -> list) to stepshifter's raw float output - should allow EvaluationManager to process the data successfully. - """ - # Arrange - # Simulate stepshifter output (raw floats) - actuals, predictions_raw_floats, target, config = mock_data_factory(point_predictions_as_list=False) - manager = EvaluationManager() - - # Apply the reconciliation logic as described in the report - # "Wrap every cell value in a list to conform to the canonical standard." - reconciled_predictions = [df.applymap(lambda x: [x]) for df in predictions_raw_floats] - - # Act & Assert - try: - manager.evaluate( - actual=actuals, - predictions=reconciled_predictions, - target=target, - config=config - ) - except ValueError as e: - pytest.fail(f"Evaluation failed unexpectedly after applying stepshifter's reconciliation fix: {e}") - - - - - - diff --git a/tests/test_evaluation_frame.py b/tests/test_evaluation_frame.py index 000de43..5f9724c 100644 --- a/tests/test_evaluation_frame.py +++ b/tests/test_evaluation_frame.py @@ -218,6 +218,18 @@ def test_y_pred_row_mismatch_raises(self): with pytest.raises(ValueError, match="mismatch"): EvaluationFrame(np.ones(5), np.ones((4, 1)), _make_identifiers(5)) + def test_y_pred_1d_raises(self): + """1D y_pred must be rejected β€” callers must provide (N, S) shape.""" + n = 3 + with pytest.raises(ValueError, match="y_pred must be 2D"): + EvaluationFrame(np.ones(n), np.ones(n), _make_identifiers(n)) + + def test_y_pred_3d_raises(self): + """3D y_pred must be rejected.""" + n = 2 + with pytest.raises(ValueError, match="y_pred must be 2D"): + EvaluationFrame(np.ones(n), np.ones((n, 3, 2)), _make_identifiers(n)) + def test_nan_in_y_true_raises(self): n = 4 with pytest.raises(ValueError, match="NaN"): diff --git a/tests/test_evaluation_manager.py b/tests/test_evaluation_manager.py deleted file mode 100644 index da4ad53..0000000 --- a/tests/test_evaluation_manager.py +++ /dev/null @@ -1,515 +0,0 @@ -""" -PHASE-3-DELETE: Tests 
the legacy EvaluationManager path. -Will be deleted when Phase 3 of the orchestrator migration is complete. -See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md -""" -import logging -import pandas as pd -import numpy as np -import pytest -from sklearn.metrics import root_mean_squared_log_error, average_precision_score -import properscoring as ps -from views_evaluation.evaluation.evaluation_manager import EvaluationManager -from views_evaluation.evaluation.native_metric_calculators import ( - REGRESSION_POINT_NATIVE, - REGRESSION_SAMPLE_NATIVE, -) -from views_evaluation.evaluation.metrics import ( - RegressionPointEvaluationMetrics, - RegressionSampleEvaluationMetrics, -) - - -@pytest.fixture -def mock_index(): - index_0 = pd.MultiIndex.from_tuples( - [ - (100, 1), - (100, 2), - (101, 1), - (101, 2), - (102, 1), - (102, 2), - ], - names=["month", "country"], - ) - index_1 = pd.MultiIndex.from_tuples( - [ - (101, 1), - (101, 2), - (102, 1), - (102, 2), - (103, 1), - (103, 2), - ], - names=["month", "country"], - ) - return [index_0, index_1] - - -@pytest.fixture -def mock_actual(): - index = pd.MultiIndex.from_tuples( - [ - (99, 1), - (99, 2), - (100, 1), - (100, 2), - (101, 1), - (101, 2), - (102, 1), - (102, 2), - (103, 1), - (103, 2), - (104, 1), - (104, 2), - ], - names=["month", "country"], - ) - df = pd.DataFrame( - { - "target": [0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0, 5.0, 6.0], - "covariate_1": [3.0, 2.0, 4.0, 5.0, 2.0, 6.0, 8.0, 5.0, 3.0, 2.0, 9.0, 4.0], - }, - index=index, - ) - return EvaluationManager.convert_to_array(df, "target") - - -@pytest.fixture -def mock_point_predictions(mock_index): - df1 = pd.DataFrame({"pred_target": [1.0, 3.0, 5.0, 7.0, 9.0, 7.0]}, index=mock_index[0]) - df2 = pd.DataFrame({"pred_target": [2.0, 4.0, 6.0, 8.0, 10.0, 8.0]}, index=mock_index[1]) - return [EvaluationManager.convert_to_array(df1, "pred_target"), EvaluationManager.convert_to_array(df2, "pred_target")] - - -@pytest.fixture 
-def mock_sample_predictions(mock_index): - df1 = pd.DataFrame( - { - "pred_target": [ - [1.0, 2.0, 3.0], - [2.0, 3.0, 4.0], - [3.0, 4.0, 5.0], - [4.0, 5.0, 6.0], - [5.0, 6.0, 7.0], - [6.0, 7.0, 8.0], - ] - }, - index=mock_index[0], - ) - df2 = pd.DataFrame( - { - "pred_target": [ - [4.0, 6.0, 8.0], - [5.0, 7.0, 9.0], - [6.0, 8.0, 10.0], - [7.0, 9.0, 11.0], - [8.0, 10.0, 12.0], - [9.0, 11.0, 13.0], - ] - }, - index=mock_index[1], - ) - return [EvaluationManager.convert_to_array(df1, "pred_target"), EvaluationManager.convert_to_array(df2, "pred_target")] - - -def test_validate_dataframes_valid_type(mock_point_predictions): - with pytest.raises(TypeError): - EvaluationManager.validate_predictions( - mock_point_predictions[0], "target" - ) - - -def test_validate_dataframes_valid_columns(mock_point_predictions): - with pytest.raises(ValueError): - EvaluationManager.validate_predictions( - mock_point_predictions, "y" - ) - -def test_get_evaluation_type(): - # Test case 1: All DataFrames for sample evaluation - predictions_sample = [ - pd.DataFrame({'pred_target': [[1.0, 2.0], [3.0, 4.0]]}), - pd.DataFrame({'pred_target': [[5.0, 6.0], [7.0, 8.0]]}), - ] - assert EvaluationManager.get_evaluation_type(predictions_sample, "pred_target") is True - - # Test case 2: All DataFrames for point evaluation - predictions_point = [ - pd.DataFrame({'pred_target': [[1.0], [2.0]]}), - pd.DataFrame({'pred_target': [[3.0], [4.0]]}), - ] - assert EvaluationManager.get_evaluation_type(predictions_point, "pred_target") is False - - # Test case 3: Mixed evaluation types - predictions_mixed = [ - pd.DataFrame({'pred_target': [[1.0, 2.0], [3.0, 4.0]]}), - pd.DataFrame({'pred_target': [[5.0], [6.0]]}), - ] - with pytest.raises(ValueError): - EvaluationManager.get_evaluation_type(predictions_mixed, "pred_target") - - # Test case 4: Single element lists - predictions_single_element = [ - pd.DataFrame({'pred_target': [[1.0], [2.0]]}), - pd.DataFrame({'pred_target': [[3.0], [4.0]]}), - ] - assert 
EvaluationManager.get_evaluation_type(predictions_single_element, "pred_target") is False - - -def test_match_actual_pred_point( - mock_actual, mock_point_predictions, mock_sample_predictions, mock_index -): - df_matched = [ - pd.DataFrame({"target": [[1.0], [2.0], [2.0], [3.0], [3.0], [4.0]]}, index=mock_index[0]), - pd.DataFrame({"target": [[2.0], [3.0], [3.0], [4.0], [4.0], [5.0]]}, index=mock_index[1]), - ] - for i in range(len(df_matched)): - df_matched_actual_point, df_matched_point = ( - EvaluationManager._match_actual_pred( - mock_actual, mock_point_predictions[i], "target" - ) - ) - df_matched_actual_sample, df_matched_sample = ( - EvaluationManager._match_actual_pred( - mock_actual, mock_sample_predictions[i], "target" - ) - ) - assert df_matched[i].equals(df_matched_actual_point) - assert df_matched_point.equals(mock_point_predictions[i]) - assert df_matched[i].equals(df_matched_actual_sample) - assert df_matched_sample.equals(mock_sample_predictions[i]) - - -def test_split_dfs_by_step(mock_point_predictions, mock_sample_predictions): - df_splitted_point = [ - EvaluationManager.convert_to_array(pd.DataFrame( - {"pred_target": [[1.0], [3.0], [2.0], [4.0]]}, - index=pd.MultiIndex.from_tuples( - [(100, 1), (100, 2), (101, 1), (101, 2)], names=["month", "country"] - ), - ), "pred_target"), - EvaluationManager.convert_to_array(pd.DataFrame( - {"pred_target": [[5.0], [7.0], [6.0], [8.0]]}, - index=pd.MultiIndex.from_tuples( - [(101, 1), (101, 2), (102, 1), (102, 2)], names=["month", "country"] - ), - ), "pred_target"), - EvaluationManager.convert_to_array(pd.DataFrame( - {"pred_target": [[9.0], [7.0], [10.0], [8.0]]}, - index=pd.MultiIndex.from_tuples( - [(102, 1), (102, 2), (103, 1), (103, 2)], names=["month", "country"] - ), - ), "pred_target"), - ] - df_splitted_sample = [ - EvaluationManager.convert_to_array(pd.DataFrame( - {"pred_target": [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [4.0, 6.0, 8.0], [5.0, 7.0, 9.0]]}, - index=pd.MultiIndex.from_tuples( - [(100, 1), 
(100, 2), (101, 1), (101, 2)], names=["month", "country"] - ), - ), "pred_target"), - EvaluationManager.convert_to_array(pd.DataFrame( - {"pred_target": [[3.0, 4.0, 5.0], [4.0, 5.0, 6.0], [6.0, 8.0, 10.0], [7.0, 9.0, 11.0]]}, - index=pd.MultiIndex.from_tuples( - [(101, 1), (101, 2), (102, 1), (102, 2)], names=["month", "country"] - ), - ), "pred_target"), - EvaluationManager.convert_to_array(pd.DataFrame( - {"pred_target": [[5.0, 6.0, 7.0], [6.0, 7.0, 8.0], [8.0, 10.0, 12.0], [9.0, 11.0, 13.0]]}, - index=pd.MultiIndex.from_tuples( - [(102, 1), (102, 2), (103, 1), (103, 2)], names=["month", "country"] - ), - ), "pred_target"), - ] - df_splitted_point_test = EvaluationManager._split_dfs_by_step( - mock_point_predictions - ) - df_splitted_sample_test = EvaluationManager._split_dfs_by_step( - mock_sample_predictions - ) - for df1, df2 in zip(df_splitted_point, df_splitted_point_test): - assert df1.equals(df2) - for df1, df2 in zip(df_splitted_sample, df_splitted_sample_test): - assert df1.equals(df2) - - -def test_step_wise_evaluation_point(mock_actual, mock_point_predictions): - manager = EvaluationManager() - evaluation_dict, df_evaluation = manager.step_wise_evaluation( - mock_actual, mock_point_predictions, "target", [1, 2, 3], - metrics_list=["RMSLE"], - metric_functions=REGRESSION_POINT_NATIVE, - metrics_cls=RegressionPointEvaluationMetrics, - ) - - actuals = [[1, 2, 2, 3], [2, 3, 3, 4], [3, 4, 4, 5]] - preds = [[1, 3, 2, 4], [5, 7, 6, 8], [9, 7, 10, 8]] - df_evaluation_test = pd.DataFrame( - { - "RMSLE": [ - root_mean_squared_log_error(actual, pred) - for (actual, pred) in zip(actuals, preds) - ], - }, - index=["step01", "step02", "step03"], - ) - - assert ["step01", "step02", "step03"] == list(evaluation_dict.keys()) - assert np.allclose(df_evaluation, df_evaluation_test, atol=0.000001) - - -def test_step_wise_evaluation_sample(mock_actual, mock_sample_predictions): - manager = EvaluationManager() - evaluation_dict, df_evaluation = manager.step_wise_evaluation( 
- mock_actual, mock_sample_predictions, "target", [1, 2, 3], - metrics_list=["CRPS"], - metric_functions=REGRESSION_SAMPLE_NATIVE, - metrics_cls=RegressionSampleEvaluationMetrics, - ) - actuals = [[1, 2, 2, 3], [2, 3, 3, 4], [3, 4, 4, 5]] - preds = [ - [[1, 2, 3], [2, 3, 4], [4, 6, 8], [5, 7, 9]], - [[3, 4, 5], [4, 5, 6], [6, 8, 10], [7, 9, 11]], - [[5, 6, 7], [6, 7, 8], [8, 10, 12], [9, 11, 13]], - ] - df_evaluation_test = pd.DataFrame( - { - "CRPS": [ - ps.crps_ensemble(actual, pred).mean() - for (actual, pred) in zip(actuals, preds) - ], - }, - index=["step01", "step02", "step03"], - ) - - assert ["step01", "step02", "step03"] == list(evaluation_dict.keys()) - assert np.allclose(df_evaluation, df_evaluation_test, atol=0.000001) - - -def test_time_series_wise_evaluation_point(mock_actual, mock_point_predictions): - manager = EvaluationManager() - evaluation_dict, df_evaluation = manager.time_series_wise_evaluation( - mock_actual, mock_point_predictions, "target", - metrics_list=["RMSLE"], - metric_functions=REGRESSION_POINT_NATIVE, - metrics_cls=RegressionPointEvaluationMetrics, - ) - - actuals = [[1, 2, 2, 3, 3, 4], [2, 3, 3, 4, 4, 5]] - preds = [1, 3, 5, 7, 9, 7], [2, 4, 6, 8, 10, 8] - df_evaluation_test = pd.DataFrame( - { - "RMSLE": [ - root_mean_squared_log_error(actual, pred) - for (actual, pred) in zip(actuals, preds) - ], - }, - index=["ts00", "ts01"], - ) - - assert ["ts00", "ts01"] == list(evaluation_dict.keys()) - assert np.allclose(df_evaluation, df_evaluation_test, atol=0.000001) - - -def test_time_series_wise_evaluation_sample(mock_actual, mock_sample_predictions): - manager = EvaluationManager() - evaluation_dict, df_evaluation = manager.time_series_wise_evaluation( - mock_actual, mock_sample_predictions, "target", - metrics_list=["CRPS"], - metric_functions=REGRESSION_SAMPLE_NATIVE, - metrics_cls=RegressionSampleEvaluationMetrics, - ) - - actuals = [[1, 2, 2, 3, 3, 4], [2, 3, 3, 4, 4, 5]] - preds = [ - [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6], 
[5, 6, 7], [6, 7, 8]], - [[4, 6, 8], [5, 7, 9], [6, 8, 10], [7, 9, 11], [8, 10, 12], [9, 11, 13]], - ] - df_evaluation_test = pd.DataFrame( - { - "CRPS": [ - ps.crps_ensemble(actual, pred).mean() - for (actual, pred) in zip(actuals, preds) - ], - }, - index=["ts00", "ts01"], - ) - - assert ["ts00", "ts01"] == list(evaluation_dict.keys()) - assert np.allclose(df_evaluation, df_evaluation_test, atol=0.000001) - - -def test_month_wise_evaluation_point(mock_actual, mock_point_predictions): - manager = EvaluationManager() - evaluation_dict, df_evaluation = manager.month_wise_evaluation( - mock_actual, mock_point_predictions, "target", - metrics_list=["RMSLE"], - metric_functions=REGRESSION_POINT_NATIVE, - metrics_cls=RegressionPointEvaluationMetrics, - ) - - actuals = [[1, 2], [2, 3, 2, 3], [3, 4, 3, 4], [4, 5]] - preds = [[1, 3], [5, 7, 2, 4], [9, 7, 6, 8], [10, 8]] - df_evaluation_test = pd.DataFrame({ - "RMSLE": [ - root_mean_squared_log_error(actual, pred) - for (actual, pred) in zip(actuals, preds) - ], - }, - index=["month100", "month101", "month102", "month103"], - ) - - assert ["month100", "month101", "month102", "month103"] == list( - evaluation_dict.keys() - ) - assert np.allclose(df_evaluation, df_evaluation_test, atol=0.000001) - - -def test_month_wise_evaluation_sample(mock_actual, mock_sample_predictions): - manager = EvaluationManager() - evaluation_dict, df_evaluation = manager.month_wise_evaluation( - mock_actual, mock_sample_predictions, "target", - metrics_list=["CRPS"], - metric_functions=REGRESSION_SAMPLE_NATIVE, - metrics_cls=RegressionSampleEvaluationMetrics, - ) - - actuals = [[1, 2], [2, 3, 2, 3], [3, 4, 3, 4], [4, 5]] - preds = [ - [[1, 2, 3], [2, 3, 4]], - [[3, 4, 5], [4, 5, 6], [4, 6, 8], [5, 7, 9]], - [[5, 6, 7], [6, 7, 8], [6, 8, 10], [7, 9, 11]], - [[8, 10, 12], [9, 11, 13]], - ] - df_evaluation_test = pd.DataFrame( - { - "CRPS": [ - ps.crps_ensemble(actual, pred).mean() - for (actual, pred) in zip(actuals, preds) - ], - }, - 
index=["month100", "month101", "month102", "month103"], - ) - - assert ["month100", "month101", "month102", "month103"] == list( - evaluation_dict.keys() - ) - assert np.allclose(df_evaluation, df_evaluation_test, atol=0.000001) - - -def test_calculate_ap_point_predictions(): - """ - Test calculate_ap with pre-binarised actuals (0/1) and probability scores as predictions. - """ - # Binary actuals: 1 = positive class, 0 = negative class - actual_binary = [1, 0, 1, 0] - # Probability scores for the positive class - pred_scores = [0.9, 0.4, 0.3, 0.1] - - matched_actual = pd.DataFrame({'target': [[v] for v in actual_binary]}) - matched_pred = pd.DataFrame({'pred_target': [[v] for v in pred_scores]}) - - from views_evaluation.evaluation.native_metric_calculators import calculate_ap - ap_score = calculate_ap(matched_actual, matched_pred, 'target') - - expected_ap = average_precision_score(actual_binary, pred_scores) - - assert abs(ap_score - expected_ap) < 0.01 - - -def test_calculate_ap_sample_predictions(): - """ - Test calculate_ap with pre-binarised actuals and distributional probability scores. - Each prediction is a list of probability samples; actuals are 0/1. 
- """ - # Binary actuals: 1 = positive, 0 = negative - actual_binary = [1, 0, 1, 0] - # Distributional probability predictions (multiple samples per observation) - pred_scores = [ - [0.8, 0.9, 0.95], - [0.3, 0.4, 0.45], - [0.2, 0.25, 0.35], - [0.05, 0.1, 0.15], - ] - - matched_actual = pd.DataFrame({'target': [[v] for v in actual_binary]}) - matched_pred = pd.DataFrame({'pred_target': pred_scores}) - - from views_evaluation.evaluation.native_metric_calculators import calculate_ap - ap_score = calculate_ap(matched_actual, matched_pred, 'target') - - # Expected: actuals expanded to match samples, predictions are the raw samples - actual_expanded = np.repeat(actual_binary, [len(p) for p in pred_scores]) - pred_flat = np.concatenate(pred_scores) - expected_ap = average_precision_score(actual_expanded, pred_flat) - - assert abs(ap_score - expected_ap) < 0.01 - - -# --------------------------------------------------------------------------- -# New tests for config normalisation and validation -# --------------------------------------------------------------------------- - -def test_normalise_config_legacy_targets_key(caplog): - """Legacy 'targets' key should be translated to 'regression_targets' with a warning.""" - config = {'steps': [1], 'targets': ['my_target'], 'regression_point_metrics': ['MSE']} - with caplog.at_level(logging.WARNING): - normalised = EvaluationManager._normalise_config(config) - assert 'regression_targets' in normalised - assert 'targets' not in normalised - assert any('DEPRECATED' in r.message for r in caplog.records) - - -def test_normalise_config_legacy_metrics_key(caplog): - """Legacy 'metrics' key should be translated to 'regression_point_metrics' with a warning.""" - config = {'steps': [1], 'regression_targets': ['t'], 'metrics': ['MSE']} - with caplog.at_level(logging.WARNING): - normalised = EvaluationManager._normalise_config(config) - assert 'regression_point_metrics' in normalised - assert 'metrics' not in normalised - assert 
any('DEPRECATED' in r.message for r in caplog.records) - - -def test_validate_config_missing_steps(): - with pytest.raises(KeyError, match="steps"): - EvaluationManager._validate_config({'regression_targets': ['t'], 'regression_point_metrics': ['MSE']}) - - -def test_validate_config_missing_all_targets(): - with pytest.raises(KeyError): - EvaluationManager._validate_config({'steps': [1]}) - - -def test_validate_config_regression_targets_without_metrics(): - with pytest.raises(KeyError, match="regression_point_metrics"): - EvaluationManager._validate_config({'steps': [1], 'regression_targets': ['t']}) - - -def test_validate_config_classification_targets_without_metrics(): - with pytest.raises(KeyError, match="classification_point_metrics"): - EvaluationManager._validate_config({'steps': [1], 'classification_targets': ['t']}) - - -def test_evaluate_target_not_in_config(mock_actual, mock_point_predictions): - manager = EvaluationManager() - config = { - 'steps': [1, 2, 3], - 'regression_targets': ['some_other_target'], - 'regression_point_metrics': ['RMSLE'], - } - with pytest.raises(ValueError, match="not declared in config"): - manager.evaluate(mock_actual, mock_point_predictions, 'target', config) - - -def test_evaluate_invalid_metric_for_task_type(mock_actual, mock_point_predictions): - """AP is a classification metric β€” declaring it under regression_point_metrics should raise.""" - manager = EvaluationManager() - config = { - 'steps': [1, 2, 3], - 'regression_targets': ['target'], - 'regression_point_metrics': ['AP'], # AP is not a regression metric - } - with pytest.raises(ValueError, match="not valid for"): - manager.evaluate(mock_actual, mock_point_predictions, 'target', config) diff --git a/tests/test_evaluation_schemas.py b/tests/test_evaluation_schemas.py deleted file mode 100644 index d08ca8f..0000000 --- a/tests/test_evaluation_schemas.py +++ /dev/null @@ -1,197 +0,0 @@ -""" -PHASE-3-DELETE: Tests legacy EvaluationManager schema grouping logic via 
mocks. -Will be deleted when Phase 3 of the orchestrator migration is complete. -See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md - -Original docstring: -This test suite rigorously verifies the grouping logic of the three evaluation -schemas (step-wise, time-series-wise, and month-wise) as described in the -core project documentation. -""" -import pytest -import pandas as pd -from unittest.mock import MagicMock, patch - -from views_evaluation.evaluation.evaluation_manager import EvaluationManager -from views_evaluation.evaluation.metrics import RegressionPointEvaluationMetrics - -@pytest.fixture -def schema_test_data(): - """ - Generates a predictable, non-random "predictive parallelogram" for testing. - - - 3 sequences (t0, t1, t2) - - 4 steps per sequence (s1, s2, s3, s4) - - 2 locations (l0, l1) - - Start month: 100 - - Parallelogram structure (value is month_id): - l0 l1 (Sequence 0) - t0_s1: 100 100 - t0_s2: 101 101 - t0_s3: 102 102 - t0_s4: 103 103 - ... ... (Sequence 1) - t1_s1: 101 101 - t1_s2: 102 102 - t1_s3: 103 103 - t1_s4: 104 104 - ... ... (Sequence 2) - t2_s1: 102 102 - t2_s2: 103 103 - t2_s3: 104 104 - t2_s4: 105 105 - """ - target_name = "lr_test_target" - pred_col_name = f"pred_{target_name}" - loc_id_name = "location_id" - num_sequences = 3 - num_steps = 4 - num_locations = 2 - start_month = 100 - - # 1. Actuals DataFrame (covering all possible months) - actuals_index = pd.MultiIndex.from_product( - [range(start_month, start_month + num_sequences + num_steps), range(num_locations)], - names=['month_id', loc_id_name] - ) - # Use month_id as the value for easy checking - actuals_values = [idx[0] for idx in actuals_index] - actuals = pd.DataFrame({target_name: actuals_values}, index=actuals_index) - - # 2. 
Predictions List - predictions_list = [] - for i in range(num_sequences): - preds_index = pd.MultiIndex.from_product( - [range(start_month + i, start_month + i + num_steps), range(num_locations)], - names=['month_id', loc_id_name] - ) - # Use month_id as the prediction value for easy checking. Wrap in a list. - pred_values = [[idx[0]] for idx in preds_index] - preds = pd.DataFrame({pred_col_name: pred_values}, index=preds_index) - predictions_list.append(preds) - - # 3. Config - config = {'steps': list(range(1, num_steps + 1))} - - return actuals, predictions_list, target_name, config - - -def get_months_from_mock_call(call): - """Helper to extract unique month_ids from a mock call's DataFrame argument.""" - df = call[0][1] # call[0] is args, [1] is the matched_pred dataframe - return sorted(df.index.get_level_values('month_id').unique().tolist()) - - -def test_step_wise_schema_grouping(schema_test_data): - """ - Verify that step-wise evaluation groups data by forecast horizon (diagonals). 
- """ - actuals, preds, target, config = schema_test_data - manager = EvaluationManager() - mock_metric_func = MagicMock() - - with patch.dict(manager.regression_point_functions, {"RMSLE": mock_metric_func}): - actuals, preds = manager._process_data(actuals, preds, target) - manager.step_wise_evaluation( - actuals, preds, target, config["steps"], - metrics_list=["RMSLE"], - metric_functions=manager.regression_point_functions, - metrics_cls=RegressionPointEvaluationMetrics, - ) - - # Expected groupings for steps (diagonals of the parallelogram) - expected_step_months = { - # step 1: (t0_s1, t1_s1, t2_s1) -> months (100, 101, 102) - 0: [100, 101, 102], - # step 2: (t0_s2, t1_s2, t2_s2) -> months (101, 102, 103) - 1: [101, 102, 103], - # step 3: (t0_s3, t1_s3, t2_s3) -> months (102, 103, 104) - 2: [102, 103, 104], - # step 4: (t0_s4, t1_s4, t2_s4) -> months (103, 104, 105) - 3: [103, 104, 105], - } - - assert mock_metric_func.call_count == len(expected_step_months) - - for i, expected_months in expected_step_months.items(): - call = mock_metric_func.call_args_list[i] - observed_months = get_months_from_mock_call(call) - assert observed_months == expected_months, f"Mismatch on step {i+1}" - - -def test_time_series_wise_schema_grouping(schema_test_data): - """ - Verify that time-series-wise evaluation groups data by forecast run (columns). 
- """ - actuals, preds, target, config = schema_test_data - manager = EvaluationManager() - mock_metric_func = MagicMock() - - with patch.dict(manager.regression_point_functions, {"RMSLE": mock_metric_func}): - actuals, preds = manager._process_data(actuals, preds, target) - manager.time_series_wise_evaluation( - actuals, preds, target, - metrics_list=["RMSLE"], - metric_functions=manager.regression_point_functions, - metrics_cls=RegressionPointEvaluationMetrics, - ) - - # Expected groupings for time-series (columns of the parallelogram) - expected_ts_months = { - # sequence 0: months 100, 101, 102, 103 - 0: [100, 101, 102, 103], - # sequence 1: months 101, 102, 103, 104 - 1: [101, 102, 103, 104], - # sequence 2: months 102, 103, 104, 105 - 2: [102, 103, 104, 105], - } - - assert mock_metric_func.call_count == len(expected_ts_months) - - for i, expected_months in expected_ts_months.items(): - call = mock_metric_func.call_args_list[i] - observed_months = get_months_from_mock_call(call) - assert observed_months == expected_months, f"Mismatch on time-series {i}" - - -def test_month_wise_schema_grouping(schema_test_data): - """ - Verify that month-wise evaluation groups data by calendar month (rows). - """ - actuals, preds, target, config = schema_test_data - manager = EvaluationManager() - mock_metric_func = MagicMock() - - with patch.dict(manager.regression_point_functions, {"RMSLE": mock_metric_func}): - actuals, preds = manager._process_data(actuals, preds, target) - manager.month_wise_evaluation( - actuals, preds, target, - metrics_list=["RMSLE"], - metric_functions=manager.regression_point_functions, - metrics_cls=RegressionPointEvaluationMetrics, - ) - - # For month-wise, each call corresponds to one month. - # We check that each month was called and that the data in the call is correct. 
- observed_calls = {} - for call in mock_metric_func.call_args_list: - df_pred = call[0][1] - month = get_months_from_mock_call(call)[0] - # Check that dataframe only contains data for its specified month - assert all(m == month for m in get_months_from_mock_call(call)) - observed_calls[month] = df_pred - - # Expected months in the full parallelogram - expected_months = [100, 101, 102, 103, 104, 105] - assert sorted(observed_calls.keys()) == expected_months - - # Check the number of predictions for a few key months - # Month 100: Only from sequence 0 (2 locations) - assert len(observed_calls[100]) == 2 - # Month 101: From sequence 0 and 1 (2 locs * 2 seqs = 4) - assert len(observed_calls[101]) == 4 - # Month 102: From sequence 0, 1, and 2 (2 locs * 3 seqs = 6) - assert len(observed_calls[102]) == 6 - # Month 105: Only from sequence 2 (2 locations) - assert len(observed_calls[105]) == 2 diff --git a/tests/test_metric_calculators.py b/tests/test_metric_calculators.py index 3e75bcf..f411968 100644 --- a/tests/test_metric_calculators.py +++ b/tests/test_metric_calculators.py @@ -1,8 +1,8 @@ import pytest import numpy as np -import pandas as pd from views_evaluation.evaluation.native_metric_calculators import ( calculate_mse_native, + calculate_msle_native, calculate_rmsle_native, calculate_crps_native, calculate_twcrps_native, @@ -15,128 +15,101 @@ calculate_mean_interval_score_native, calculate_mtd_native, calculate_mcr_native, - REGRESSION_POINT_NATIVE, - REGRESSION_SAMPLE_NATIVE, - CLASSIFICATION_POINT_NATIVE, - CLASSIFICATION_SAMPLE_NATIVE, + calculate_brier_sample_native, + calculate_brier_point_native, + calculate_qs_sample_native, + calculate_qs_point_native, ) +from views_evaluation.evaluation.metric_catalog import METRIC_MEMBERSHIP -@pytest.fixture -def sample_data(): - """Create sample data for testing.""" - actual = pd.DataFrame({ - 'target': [[1.0], [2.0], [3.0], [4.0]] - }) - pred = pd.DataFrame({ - 'pred_target': [[1.1], [1.9], [3.1], [3.9]] - }) - return 
actual, pred - - -@pytest.fixture -def sample_sample_data(): - """Create sample sample data for testing.""" - actual = pd.DataFrame({ - 'target': [[1.0], [2.0], [3.0], [4.0]] - }) - pred = pd.DataFrame({ - 'pred_target': [[1.0, 1.1, 1.2], [1.8, 2.0, 2.2], [2.9, 3.0, 3.1], [3.8, 4.0, 4.2]] - }) - return actual, pred - - -def test_calculate_mse_native(sample_data): - """Test MSE calculation.""" - actual, pred = sample_data - result = calculate_mse_native(actual, pred, 'target') +# Point-prediction test data (N=4, S=1) +_POINT_Y_TRUE = np.array([1.0, 2.0, 3.0, 4.0]) +_POINT_Y_PRED = np.array([[1.1], [1.9], [3.1], [3.9]]) + +# Sample-prediction test data (N=4, S=3) +_SAMPLE_Y_TRUE = np.array([1.0, 2.0, 3.0, 4.0]) +_SAMPLE_Y_PRED = np.array([[1.0, 1.1, 1.2], [1.8, 2.0, 2.2], [2.9, 3.0, 3.1], [3.8, 4.0, 4.2]]) + + +def test_calculate_mse_native(): + """Test MSE calculation with pure NumPy arrays.""" + result = calculate_mse_native(_POINT_Y_TRUE, _POINT_Y_PRED) assert isinstance(result, float) assert result >= 0 -def test_calculate_rmsle_native_point(sample_data): +def test_calculate_rmsle_native_point(): """Test RMSLE calculation.""" - actual, pred = sample_data - result = calculate_rmsle_native(actual, pred, 'target') + result = calculate_rmsle_native(_POINT_Y_TRUE, _POINT_Y_PRED) assert isinstance(result, float) assert result >= 0 -def test_calculate_crps_native_point(sample_data): - """Test CRPS calculation.""" - actual, pred = sample_data - result = calculate_crps_native(actual, pred, 'target') +def test_calculate_crps_native_point(): + """Test CRPS calculation with point predictions.""" + result = calculate_crps_native(_POINT_Y_TRUE, _POINT_Y_PRED) assert isinstance(result, float) assert result >= 0 -def test_calculate_crps_native_sample(sample_sample_data): - """Test CRPS calculation.""" - actual, pred = sample_sample_data - result = calculate_crps_native(actual, pred, 'target') +def test_calculate_crps_native_sample(): + """Test CRPS calculation with sample 
predictions.""" + result = calculate_crps_native(_SAMPLE_Y_TRUE, _SAMPLE_Y_PRED) assert isinstance(result, float) assert result >= 0 def test_calculate_ap_native(): - """Test Average Precision calculation with pre-binarised actuals and probability scores.""" - # Binary actuals (0/1) and probability scores as predictions - actual = pd.DataFrame({'target': [[1], [0], [1], [0]]}) - pred = pd.DataFrame({'pred_target': [[0.9], [0.4], [0.3], [0.1]]}) - result = calculate_ap_native(actual, pred, 'target') + """Test Average Precision with binary actuals and probability scores.""" + y_true = np.array([1.0, 0.0, 1.0, 0.0]) + y_pred = np.array([[0.9], [0.4], [0.3], [0.1]]) + result = calculate_ap_native(y_true, y_pred) assert isinstance(result, float) assert 0 <= result <= 1 -def test_calculate_emd_native(sample_data): +def test_calculate_emd_native(): """Test Earth Mover's Distance calculation.""" - actual, pred = sample_data - result = calculate_emd_native(actual, pred, 'target') + result = calculate_emd_native(_POINT_Y_TRUE, _POINT_Y_PRED) assert isinstance(result, float) assert result >= 0 -def test_calculate_pearson_native(sample_data): +def test_calculate_pearson_native(): """Test Pearson correlation calculation.""" - actual, pred = sample_data - result = calculate_pearson_native(actual, pred, 'target') + result = calculate_pearson_native(_POINT_Y_TRUE, _POINT_Y_PRED) assert isinstance(result, float) assert -1 <= result <= 1 -def test_calculate_mtd_native(sample_data): +def test_calculate_mtd_native(): """Test Mean Tweedie Deviance calculation.""" - actual, pred = sample_data - result = calculate_mtd_native(actual, pred, 'target', power=1.5) + result = calculate_mtd_native(_POINT_Y_TRUE, _POINT_Y_PRED, power=1.5) assert isinstance(result, float) assert result >= 0 -def test_calculate_mtd_native_with_power(sample_data): - """Test Mean Tweedie Deviance calculation with different power values.""" - actual, pred = sample_data - # Test with power=1.5 (compound Poisson-Gamma) 
- result_15 = calculate_mtd_native(actual, pred, 'target', power=1.5) +def test_calculate_mtd_native_with_power(): + """Test Mean Tweedie Deviance with different power values.""" + result_15 = calculate_mtd_native(_POINT_Y_TRUE, _POINT_Y_PRED, power=1.5) assert isinstance(result_15, float) assert result_15 >= 0 - # Test with power=2 (Gamma) - result_2 = calculate_mtd_native(actual, pred, 'target', power=2.0) + result_2 = calculate_mtd_native(_POINT_Y_TRUE, _POINT_Y_PRED, power=2.0) assert isinstance(result_2, float) assert result_2 >= 0 -def test_calculate_coverage_native_sample(sample_sample_data): - """Test Coverage calculation.""" - actual, pred = sample_sample_data - result = calculate_coverage_native(actual, pred, 'target', alpha=0.1) +def test_calculate_coverage_native_sample(): + """Test Coverage calculation with sample predictions.""" + result = calculate_coverage_native(_SAMPLE_Y_TRUE, _SAMPLE_Y_PRED, alpha=0.1) assert isinstance(result, float) assert 0 <= result <= 1 -def test_calculate_ignorance_score_native_sample(sample_sample_data): +def test_calculate_ignorance_score_native_sample(): """Test Ignorance Score calculation.""" - actual, pred = sample_sample_data result = calculate_ignorance_score_native( - actual, pred, 'target', + _SAMPLE_Y_TRUE, _SAMPLE_Y_PRED, bins=[0, 0.5, 2.5, 5.5, 10.5, 25.5, 50.5, 100.5, 250.5, 500.5, 1000.5], low_bin=0, high_bin=10000, ) @@ -144,106 +117,59 @@ def test_calculate_ignorance_score_native_sample(sample_sample_data): assert result >= 0 -def test_calculate_mis_sample(sample_sample_data): +def test_calculate_mis_sample(): """Test Mean Interval Score calculation.""" - actual, pred = sample_sample_data - result = calculate_mean_interval_score_native(actual, pred, 'target', alpha=0.05) + result = calculate_mean_interval_score_native(_SAMPLE_Y_TRUE, _SAMPLE_Y_PRED, alpha=0.05) assert isinstance(result, float) assert result >= 0 -def test_point_metric_functions(): - """Test that all point metric functions are available in 
the deprecated REGRESSION_POINT_NATIVE.""" - expected_metrics = [ - "MSE", "MSLE", "RMSLE", "EMD", "SD", "pEMDiv", "Pearson", "Variogram", "MTD", "y_hat_bar" - ] - - - for metric in expected_metrics: - assert metric in REGRESSION_POINT_NATIVE - assert callable(REGRESSION_POINT_NATIVE[metric]) - - -def test_sample_metric_functions(): - """Test that all sample metric functions are available in the deprecated REGRESSION_SAMPLE_NATIVE.""" - expected_metrics = ["CRPS", "twCRPS", "MIS", "QIS", "Ignorance", "Coverage", "y_hat_bar", "MCR_sample"] - - for metric in expected_metrics: - assert metric in REGRESSION_SAMPLE_NATIVE - assert callable(REGRESSION_SAMPLE_NATIVE[metric]) - - -def test_regression_point_metric_functions(): - """Test that all regression point metric functions are available in REGRESSION_POINT_NATIVE.""" - expected_metrics = ["MSE", "MSLE", "RMSLE", "EMD", "SD", "pEMDiv", "Pearson", "Variogram", "MTD", "y_hat_bar"] - - for metric in expected_metrics: - assert metric in REGRESSION_POINT_NATIVE - assert callable(REGRESSION_POINT_NATIVE[metric]) +def test_metric_membership_regression_point(): + """METRIC_MEMBERSHIP contains expected regression point metrics.""" + members = METRIC_MEMBERSHIP[("regression", "point")] + for m in ["MSE", "MSLE", "RMSLE", "EMD", "Pearson", "MTD", "y_hat_bar", "MCR_point", "QS_point"]: + assert m in members + assert "AP" not in members + assert "CRPS" not in members - # AP must NOT be in regression point functions - assert "AP" not in REGRESSION_POINT_NATIVE - # CRPS must NOT be in regression point functions - assert "CRPS" not in REGRESSION_POINT_NATIVE +def test_metric_membership_regression_sample(): + """METRIC_MEMBERSHIP contains expected regression sample metrics.""" + members = METRIC_MEMBERSHIP[("regression", "sample")] + for m in ["CRPS", "twCRPS", "MIS", "QIS", "Coverage", "Ignorance", "y_hat_bar", "QS_sample", "MCR_sample"]: + assert m in members + assert "AP" not in members -def 
test_regression_sample_metric_functions(): - """Test that all regression sample metric functions are available.""" - expected_metrics = ["CRPS", "twCRPS", "MIS", "QIS", "Coverage", "Ignorance", "y_hat_bar"] - for metric in expected_metrics: - assert metric in REGRESSION_SAMPLE_NATIVE - assert callable(REGRESSION_SAMPLE_NATIVE[metric]) +def test_metric_membership_classification_point(): + """METRIC_MEMBERSHIP contains expected classification point metrics.""" + members = METRIC_MEMBERSHIP[("classification", "point")] + assert "AP" in members + assert "Brier_point" in members + assert "RMSLE" not in members - # AP must NOT be in regression sample functions - assert "AP" not in REGRESSION_SAMPLE_NATIVE - -def test_classification_point_metric_functions(): - """Test that AP is in CLASSIFICATION_POINT_NATIVE.""" - assert "AP" in CLASSIFICATION_POINT_NATIVE - assert callable(CLASSIFICATION_POINT_NATIVE["AP"]) - - # RMSLE must NOT be in classification point functions - assert "RMSLE" not in CLASSIFICATION_POINT_NATIVE - - -def test_classification_sample_metric_functions(): - """Test that classification sample metric functions are available.""" - expected_metrics = ["CRPS", "twCRPS", "Brier", "Jeffreys"] - - for metric in expected_metrics: - assert metric in CLASSIFICATION_SAMPLE_NATIVE - assert callable(CLASSIFICATION_SAMPLE_NATIVE[metric]) - - # RMSLE must NOT be in classification sample functions - assert "RMSLE" not in CLASSIFICATION_SAMPLE_NATIVE +def test_metric_membership_classification_sample(): + """METRIC_MEMBERSHIP contains expected classification sample metrics.""" + members = METRIC_MEMBERSHIP[("classification", "sample")] + for m in ["CRPS", "twCRPS", "Brier_sample", "Jeffreys"]: + assert m in members + assert "RMSLE" not in members def test_not_implemented_metrics(): - """Test that unimplemented metrics raise NotImplementedError.""" - actual = pd.DataFrame({'target': [[1.0]]}) - pred = pd.DataFrame({'pred_target': [[1.0]]}) - + """Test that unimplemented 
metrics raise ValueError with clear message.""" from views_evaluation.evaluation.native_metric_calculators import ( - calculate_brier_native, calculate_jeffreys_native, calculate_sd_native, calculate_pEMDiv_native, calculate_variogram_native, ) - unimplemented_functions = [ - calculate_brier_native, - calculate_jeffreys_native, - calculate_sd_native, - calculate_pEMDiv_native, - calculate_variogram_native, - ] - - for func in unimplemented_functions: + for func in [calculate_jeffreys_native, calculate_sd_native, + calculate_pEMDiv_native, calculate_variogram_native]: with pytest.raises(ValueError, match="not yet implemented"): - func(actual, pred, 'target') + func(np.array([1.0]), np.array([[1.0]])) # --------------------------------------------------------------------------- @@ -326,10 +252,9 @@ def test_parity_wide_spread(self): class TestTwCRPS: - def test_twcrps_basic_smoke(self, sample_sample_data): + def test_twcrps_basic_smoke(self): """twCRPS produces a non-negative float.""" - actual, pred = sample_sample_data - result = calculate_twcrps_native(actual, pred, 'target', threshold=0.0) + result = calculate_twcrps_native(_SAMPLE_Y_TRUE, _SAMPLE_Y_PRED, threshold=0.0) assert isinstance(result, float) assert result >= 0 @@ -360,11 +285,10 @@ def test_twcrps_threshold_changes_result(self): # They should differ for data straddling the threshold assert twcrps != pytest.approx(crps, abs=1e-5) - def test_twcrps_in_dispatch_dicts(self): - """twCRPS must be in both regression and classification sample dispatch dicts.""" - assert "twCRPS" in REGRESSION_SAMPLE_NATIVE - assert "twCRPS" in CLASSIFICATION_SAMPLE_NATIVE - assert callable(REGRESSION_SAMPLE_NATIVE["twCRPS"]) + def test_twcrps_in_metric_membership(self): + """twCRPS must be in both regression and classification sample membership.""" + assert "twCRPS" in METRIC_MEMBERSHIP[("regression", "sample")] + assert "twCRPS" in METRIC_MEMBERSHIP[("classification", "sample")] # 
--------------------------------------------------------------------------- @@ -373,11 +297,10 @@ def test_twcrps_in_dispatch_dicts(self): class TestQuantileIntervalScore: - def test_qis_basic_smoke(self, sample_sample_data): + def test_qis_basic_smoke(self): """QIS produces a non-negative float.""" - actual, pred = sample_sample_data result = calculate_quantile_interval_score_native( - actual, pred, 'target', lower_quantile=0.025, upper_quantile=0.975, + _SAMPLE_Y_TRUE, _SAMPLE_Y_PRED, lower_quantile=0.025, upper_quantile=0.975, ) assert isinstance(result, float) assert result >= 0 @@ -471,8 +394,8 @@ def test_qis_golden_value_with_violation(self): y_pred = np.array([[1.0, 2.0, 3.0, 4.0, 5.0]]) q_lo, q_hi = 0.1, 0.9 - lower = float(np.quantile(y_pred, q_lo, axis=1)) - upper = float(np.quantile(y_pred, q_hi, axis=1)) + lower = np.quantile(y_pred, q_lo, axis=1).item() + upper = np.quantile(y_pred, q_hi, axis=1).item() width = upper - lower upper_penalty = (2 / (1 - q_hi)) * (20.0 - upper) @@ -503,10 +426,233 @@ def test_qis_perfect_coverage_minimal_score(self): expected_width = float(np.mean(upper - lower)) assert result == pytest.approx(expected_width, abs=1e-10) - def test_qis_in_dispatch_dict(self): - """QIS must be in regression sample dispatch dict.""" - assert "QIS" in REGRESSION_SAMPLE_NATIVE - assert callable(REGRESSION_SAMPLE_NATIVE["QIS"]) + def test_qis_in_metric_membership(self): + """QIS must be in regression sample membership.""" + assert "QIS" in METRIC_MEMBERSHIP[("regression", "sample")] + + +# --------------------------------------------------------------------------- +# Green: Golden-value correctness tests β€” hand-computed expected values (ADR-020) +# --------------------------------------------------------------------------- + +class TestGoldenValues: + """Verify numerical correctness of all implemented metrics against hand-computed or oracle values.""" + + def test_mse_known_errors(self): + """y_true=[1,2,3], y_pred=[[2],[3],[4]] β†’ 
errors=[1,1,1], MSE=1.0.""" + result = calculate_mse_native(np.array([1.0, 2.0, 3.0]), np.array([[2.0], [3.0], [4.0]])) + assert result == pytest.approx(1.0, abs=1e-10) + + def test_msle_known_values(self): + """y_true=[e-1], y_pred=[[0]] β†’ log1p(e-1)=1, log1p(0)=0, MSLE=1.0.""" + result = calculate_msle_native(np.array([np.e - 1]), np.array([[0.0]])) + assert result == pytest.approx(1.0, abs=1e-10) + + def test_rmsle_is_sqrt_msle(self): + """RMSLE = sqrt(MSLE) for the same input.""" + y_true = np.array([np.e - 1]) + y_pred = np.array([[0.0]]) + msle = calculate_msle_native(y_true, y_pred) + rmsle = calculate_rmsle_native(y_true, y_pred) + assert rmsle == pytest.approx(np.sqrt(msle), abs=1e-10) + + def test_emd_point_prediction(self): + """y_true=[0], y_pred=[[5]] β†’ wasserstein_distance([5],[0]) = 5.0.""" + result = calculate_emd_native(np.array([0.0]), np.array([[5.0]])) + assert result == pytest.approx(5.0, abs=1e-10) + + def test_pearson_perfect_correlation(self): + """y_true=[1,2,3], y_pred=[[1],[2],[3]] β†’ r = 1.0.""" + result = calculate_pearson_native(np.array([1.0, 2.0, 3.0]), np.array([[1.0], [2.0], [3.0]])) + assert result == pytest.approx(1.0, abs=1e-10) + + def test_pearson_perfect_negative(self): + """y_true=[1,2,3], y_pred=[[3],[2],[1]] β†’ r = -1.0.""" + result = calculate_pearson_native(np.array([1.0, 2.0, 3.0]), np.array([[3.0], [2.0], [1.0]])) + assert result == pytest.approx(-1.0, abs=1e-10) + + def test_mtd_known_tweedie(self): + """Tweedie deviance with power=2 reduces to (y/mu - ln(y/mu) - 1) * 2.""" + from sklearn.metrics import mean_tweedie_deviance + y_true = np.array([1.0, 2.0, 3.0]) + y_pred = np.array([[2.0], [2.0], [2.0]]) + expected = mean_tweedie_deviance( + np.repeat(y_true, 1), y_pred.flatten(), power=2 + ) + result = calculate_mtd_native(y_true, y_pred, power=2) + assert result == pytest.approx(expected, abs=1e-10) + + def test_mcr_perfect_calibration(self): + """mean(y_pred) == mean(y_true) β†’ MCR = 1.0.""" + y_true = 
np.array([2.0, 4.0, 6.0]) + y_pred = np.array([[2.0], [4.0], [6.0]]) + result = calculate_mcr_native(y_true, y_pred) + assert result == pytest.approx(1.0, abs=1e-10) + + def test_mcr_double_overprediction(self): + """mean(y_pred) = 2 * mean(y_true) β†’ MCR = 2.0.""" + y_true = np.array([1.0, 2.0, 3.0]) + y_pred = np.array([[2.0], [4.0], [6.0]]) + result = calculate_mcr_native(y_true, y_pred) + assert result == pytest.approx(2.0, abs=1e-10) + + def test_ignorance_known_bin_distribution(self): + """Hand-computed Ignorance: 5 ensemble members, 3 bins, known distribution. + + bins=[0,4,8,12], preds=[1,3,5,7,9] β†’ bin counts [2,2,1] + smoothed=[3,3,2], total=8. Truth 5.0 β†’ bin 1, prob=3/8. + Score = -log2(3/8) = log2(8/3). + """ + y_true = np.array([5.0]) + y_pred = np.array([[1.0, 3.0, 5.0, 7.0, 9.0]]) + result = calculate_ignorance_score_native( + y_true, y_pred, bins=[0, 4, 8, 12], low_bin=0, high_bin=12, + ) + expected = np.log2(8.0 / 3.0) + assert result == pytest.approx(expected, abs=1e-10) + + def test_ap_oracle_sklearn(self): + """AP matches sklearn.metrics.average_precision_score.""" + from sklearn.metrics import average_precision_score + y_true = np.array([1.0, 0.0, 1.0, 0.0]) + y_pred = np.array([[0.9], [0.1], [0.8], [0.2]]) + result = calculate_ap_native(y_true, y_pred) + # AP native repeats y_true for S columns, flattens y_pred + expected = average_precision_score( + np.repeat(y_true, 1), y_pred.flatten() + ) + assert result == pytest.approx(expected, abs=1e-10) + + def test_coverage_all_inside(self): + """All obs inside the central interval β†’ coverage = 1.0.""" + y_true = np.array([5.0]) + y_pred = np.array([[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]]) + result = calculate_coverage_native(y_true, y_pred, alpha=0.1) + assert result == pytest.approx(1.0, abs=1e-10) + + def test_coverage_all_outside(self): + """Obs far outside the interval β†’ coverage = 0.0.""" + y_true = np.array([100.0]) + y_pred = np.array([[1.0, 2.0, 3.0, 4.0, 5.0]]) + result = 
calculate_coverage_native(y_true, y_pred, alpha=0.1) + assert result == pytest.approx(0.0, abs=1e-10) + + def test_mis_obs_inside_interval(self): + """Obs inside interval β†’ MIS = interval width only (no penalty).""" + y_true = np.array([5.0]) + y_pred = np.array([[0.0, 2.0, 4.0, 5.0, 6.0, 8.0, 10.0]]) + alpha = 0.1 + lower = np.quantile([0, 2, 4, 5, 6, 8, 10], alpha / 2) + upper = np.quantile([0, 2, 4, 5, 6, 8, 10], 1 - alpha / 2) + expected = upper - lower # no penalty since obs is inside + result = calculate_mean_interval_score_native(y_true, y_pred, alpha=alpha) + assert result == pytest.approx(expected, abs=1e-10) + + def test_crps_point_prediction_equals_absolute_error(self): + """CRPS of 1-member ensemble = |y - x|.""" + result = calculate_crps_native(np.array([5.0]), np.array([[8.0]])) + assert result == pytest.approx(3.0, abs=1e-10) + + def test_twcrps_zero_threshold_equals_crps(self): + """twCRPS with threshold=0 on non-negative data = CRPS.""" + y_true = np.array([5.0, 10.0]) + y_pred = np.array([[3.0, 7.0], [8.0, 12.0]]) + crps = calculate_crps_native(y_true, y_pred) + twcrps = calculate_twcrps_native(y_true, y_pred, threshold=0.0) + assert twcrps == pytest.approx(crps, abs=1e-10) + + def test_qis_symmetric_equals_mis(self): + """QIS with symmetric quantiles (alpha/2, 1-alpha/2) equals MIS.""" + y_true = np.array([5.0, 15.0]) + y_pred = np.array([[1.0, 3.0, 5.0, 7.0, 9.0], [10.0, 12.0, 14.0, 16.0, 18.0]]) + alpha = 0.1 + mis = calculate_mean_interval_score_native(y_true, y_pred, alpha=alpha) + qis = calculate_quantile_interval_score_native( + y_true, y_pred, lower_quantile=alpha / 2, upper_quantile=1 - alpha / 2 + ) + assert qis == pytest.approx(mis, abs=1e-10) + + +# --------------------------------------------------------------------------- +# Green: Brier Score golden-value tests (ADR-020) +# --------------------------------------------------------------------------- + +class TestBrierScore: + + def test_brier_sample_golden_value(self): + 
"""Hand-computed Brier sample: threshold=1, mixed binary outcomes.""" + y_true = np.array([0.0, 2.0, 5.0]) + y_pred = np.array([[0.5, 1.5], [0.5, 1.5], [4.0, 6.0]]) + # y_binary = [0, 1, 1] (0 < 1, 2 > 1, 5 > 1) + # p_hat = [0.5, 0.5, 1.0] (fraction of ensemble > threshold) + # Brier = mean([(0.5-0)^2, (0.5-1)^2, (1.0-1)^2]) = mean([0.25, 0.25, 0]) = 1/6 + result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + assert result == pytest.approx(1.0 / 6.0, abs=1e-10) + + def test_brier_point_golden_value(self): + """Hand-computed Brier point: threshold=1, probabilities vs binary outcomes.""" + y_true = np.array([0.0, 2.0, 5.0]) + y_pred = np.array([[0.1], [0.7], [0.9]]) + # y_binary = [0, 1, 1] + # p_hat = [0.1, 0.7, 0.9] (point prediction as probability) + # Brier = mean([(0.1-0)^2, (0.7-1)^2, (0.9-1)^2]) = mean([0.01, 0.09, 0.01]) = 11/300 + result = calculate_brier_point_native(y_true, y_pred, threshold=1.0) + assert result == pytest.approx(11.0 / 300.0, abs=1e-10) + + def test_brier_sample_perfect(self): + """All above threshold, all ensemble members above β†’ p_hat=1, y_binary=1, Brier=0.""" + y_true = np.array([5.0, 10.0]) + y_pred = np.array([[2.0, 3.0], [2.0, 3.0]]) + result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + assert result == pytest.approx(0.0, abs=1e-10) + + def test_brier_point_perfect(self): + """p_hat matches y_binary exactly β†’ Brier=0.""" + y_true = np.array([0.0, 2.0]) # binary=[0, 1] at threshold=1 + y_pred = np.array([[0.0], [1.0]]) # perfect probability predictions + result = calculate_brier_point_native(y_true, y_pred, threshold=1.0) + assert result == pytest.approx(0.0, abs=1e-10) + + +# --------------------------------------------------------------------------- +# Green: Quantile Score (pinball loss) golden-value tests (ADR-020) +# --------------------------------------------------------------------------- + +class TestQuantileScore: + + def test_qs_sample_golden_value_at_median(self): + """Median 
matches observation β†’ QS = 0.""" + y_true = np.array([3.0]) + y_pred = np.array([[1.0, 2.0, 3.0, 4.0, 5.0]]) + # median of [1,2,3,4,5] = 3.0, diff = 3-3 = 0, QS = 0 + result = calculate_qs_sample_native(y_true, y_pred, quantile=0.5) + assert result == pytest.approx(0.0, abs=1e-10) + + def test_qs_point_golden_value_overprediction(self): + """Point overpredicts: y=3, q=5, quantile=0.9 β†’ (1-0.9)*(5-3) = 0.2.""" + y_true = np.array([3.0]) + y_pred = np.array([[5.0]]) + # diff = 3 - 5 = -2 < 0 β†’ branch: -diff * (1-quantile) = 2 * 0.1 = 0.2 + result = calculate_qs_point_native(y_true, y_pred, quantile=0.9) + assert result == pytest.approx(0.2, abs=1e-10) + + def test_qs_sample_underprediction(self): + """Sample underpredicts: y=10, q=2.0 at quantile=0.9 β†’ 0.9*(10-2) = 7.2.""" + y_true = np.array([10.0]) + y_pred = np.array([[1.0, 2.0, 3.0]]) + # quantile(0.9) of [1,2,3] = 2.8 via linear interpolation + q = np.quantile([1.0, 2.0, 3.0], 0.9) # = 2.8 + expected = 0.9 * (10.0 - q) + result = calculate_qs_sample_native(y_true, y_pred, quantile=0.9) + assert result == pytest.approx(expected, abs=1e-10) + + def test_qs_point_underprediction(self): + """Point underpredicts: y=10, y_hat=2, quantile=0.9 β†’ 0.9*(10-2) = 7.2.""" + y_true = np.array([10.0]) + y_pred = np.array([[2.0]]) + # diff = 10 - 2 = 8 β‰₯ 0 β†’ branch: diff * quantile = 8 * 0.9 = 7.2 + result = calculate_qs_point_native(y_true, y_pred, quantile=0.9) + assert result == pytest.approx(7.2, abs=1e-10) # --------------------------------------------------------------------------- @@ -609,6 +755,74 @@ def test_large_alpha(self): assert np.isfinite(result) +class TestBrierScoreBeige: + + def test_single_observation(self): + """Brier handles N=1, S=1 without error.""" + result = calculate_brier_sample_native(np.array([2.0]), np.array([[3.0]]), threshold=1.0) + assert np.isfinite(result) + + def test_large_ensemble_stable(self): + """Brier is stable with S=1000 samples.""" + rng = np.random.default_rng(42) + 
y_true = np.array([0.0, 5.0, 10.0]) + y_pred = rng.normal(loc=y_true[:, None], scale=2.0, size=(3, 1000)) + result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + assert np.isfinite(result) + assert 0 <= result <= 1 # Brier is bounded [0, 1] + + def test_threshold_at_exact_data_value(self): + """Threshold equals an observation β€” no crash.""" + y_true = np.array([5.0, 5.0]) + y_pred = np.array([[4.0, 6.0], [4.0, 6.0]]) + result = calculate_brier_sample_native(y_true, y_pred, threshold=5.0) + assert np.isfinite(result) + + def test_all_above_threshold(self): + """All y_true above threshold β€” y_binary all 1, finite result.""" + y_true = np.array([10.0, 20.0]) + y_pred = np.array([[0.5, 1.5], [0.5, 1.5]]) + result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + assert np.isfinite(result) + + def test_all_below_threshold(self): + """All y_true below threshold β€” y_binary all 0, finite result.""" + y_true = np.array([0.0, 0.5]) + y_pred = np.array([[0.5, 1.5], [0.5, 1.5]]) + result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + assert np.isfinite(result) + + +class TestQuantileScoreBeige: + + def test_single_observation(self): + """QS handles N=1, S=1 without error.""" + result = calculate_qs_sample_native(np.array([1.0]), np.array([[1.0]]), quantile=0.5) + assert np.isfinite(result) + + def test_large_ensemble_stable(self): + """QS is stable with S=1000 samples.""" + rng = np.random.default_rng(42) + y_true = np.array([5.0, 10.0, 0.0]) + y_pred = rng.normal(loc=y_true[:, None], scale=1.0, size=(3, 1000)) + result = calculate_qs_sample_native(y_true, y_pred, quantile=0.99) + assert np.isfinite(result) + assert result >= 0 + + def test_extreme_quantile_near_one(self): + """Quantile very close to 1 β€” finite result.""" + y_true = np.array([5.0]) + y_pred = np.array([[1.0, 2.0, 3.0, 4.0, 5.0]]) + result = calculate_qs_sample_native(y_true, y_pred, quantile=0.999) + assert np.isfinite(result) + + def 
test_extreme_quantile_near_zero(self): + """Quantile very close to 0 β€” finite result.""" + y_true = np.array([5.0]) + result = calculate_qs_point_native(y_true, np.array([[2.0]]), quantile=0.001) + assert np.isfinite(result) + + class TestMCRBeige: def test_single_observation(self): @@ -756,3 +970,87 @@ def test_negative_y_true_valid(self): y_pred = np.array([[4.0], [4.0]]) result = calculate_mcr_native(y_true, y_pred) assert result == -2.0 + + +class TestBrierScoreRed: + + def test_nan_in_y_true_swallowed_by_comparison(self): + """NaN in y_true is swallowed by '>' comparison (NaN > x β†’ False). + + Unlike arithmetic metrics, Brier's binarization step converts NaN to + False (0.0) rather than propagating. This is NumPy's standard comparison + semantics. The EvaluationFrame boundary should reject NaN before it + reaches here (defense-in-depth). + """ + y_true = np.array([np.nan, 1.0]) + y_pred = np.array([[1.0], [1.0]]) + result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + # NaN is treated as below-threshold (False), so result is finite, not NaN + assert np.isfinite(result) + + def test_nan_in_y_pred_swallowed_by_comparison(self): + """NaN in y_pred is swallowed by '>' comparison in p_hat computation.""" + y_true = np.array([1.0, 1.0]) + y_pred = np.array([[np.nan], [1.0]]) + result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + assert np.isfinite(result) + + def test_negative_threshold_accepted(self): + """Negative threshold is mathematically valid.""" + y_true = np.array([1.0, 2.0]) + y_pred = np.array([[1.0, 2.0], [2.0, 3.0]]) + result = calculate_brier_sample_native(y_true, y_pred, threshold=-5.0) + assert np.isfinite(result) + + +class TestQuantileScoreRed: + + def test_nan_in_y_true_propagates(self): + """NaN in y_true propagates to result.""" + y_true = np.array([np.nan, 1.0]) + y_pred = np.array([[1.0], [1.0]]) + result = calculate_qs_sample_native(y_true, y_pred, quantile=0.5) + assert np.isnan(result) + + def 
test_nan_in_y_pred_propagates(self): + """NaN in y_pred propagates to result.""" + y_true = np.array([1.0, 1.0]) + y_pred = np.array([[np.nan], [1.0]]) + result = calculate_qs_point_native(y_true, y_pred, quantile=0.5) + assert np.isnan(result) + + +# --------------------------------------------------------------------------- +# Red: Extreme-value tests (ADR-020) +# --------------------------------------------------------------------------- + +class TestExtremeValues: + """Test metric behavior near float64 limits β€” no overflow, no silent corruption.""" + + def test_mse_large_matching_values(self): + """Large but equal values β†’ MSE = 0, not overflow.""" + result = calculate_mse_native(np.array([1e150]), np.array([[1e150]])) + assert result == pytest.approx(0.0, abs=1e-10) + + def test_crps_large_ensemble_values(self): + """CRPS with large ensemble values remains finite.""" + y_true = np.array([1e50]) + y_pred = np.array([[0.9e50, 1.0e50, 1.1e50]]) + result = calculate_crps_native(y_true, y_pred) + assert np.isfinite(result) + assert result >= 0 + + def test_brier_extreme_threshold(self): + """Threshold at 1e300: all values below β†’ y_binary all 0, p_hat all 0, Brier = 0.""" + y_true = np.array([1.0, 2.0]) + y_pred = np.array([[0.5, 1.5], [0.5, 1.5]]) + result = calculate_brier_sample_native(y_true, y_pred, threshold=1e300) + assert result == pytest.approx(0.0, abs=1e-10) + + def test_coverage_tiny_ensemble_spread(self): + """Extremely narrow ensemble β†’ interval width ~ 0, coverage depends on obs position.""" + base = 1e-15 + y_true = np.array([base]) + y_pred = np.array([[base - 1e-30, base + 1e-30]]) + result = calculate_coverage_native(y_true, y_pred, alpha=0.1) + assert np.isfinite(result) diff --git a/tests/test_metric_catalog.py b/tests/test_metric_catalog.py index 8b1c44f..e621ce0 100644 --- a/tests/test_metric_catalog.py +++ b/tests/test_metric_catalog.py @@ -16,10 +16,6 @@ from views_evaluation.profiles import PROFILES from 
views_evaluation.profiles.base import BASE_PROFILE from views_evaluation.evaluation.native_metric_calculators import ( - REGRESSION_POINT_NATIVE, - REGRESSION_SAMPLE_NATIVE, - CLASSIFICATION_POINT_NATIVE, - CLASSIFICATION_SAMPLE_NATIVE, calculate_mcr_native, ) import numpy as np @@ -147,23 +143,6 @@ def test_every_membership_metric_in_catalog(self): f"Metric '{metric_name}' in METRIC_MEMBERSHIP{key} but not in METRIC_CATALOG" ) - def test_catalog_functions_match_legacy_dispatch_dicts(self): - """METRIC_CATALOG functions must match the legacy dispatch dict entries.""" - legacy_dicts = { - ("regression", "point"): REGRESSION_POINT_NATIVE, - ("regression", "sample"): REGRESSION_SAMPLE_NATIVE, - ("classification", "point"): CLASSIFICATION_POINT_NATIVE, - ("classification", "sample"): CLASSIFICATION_SAMPLE_NATIVE, - } - for key, legacy_dict in legacy_dicts.items(): - for metric_name, legacy_func in legacy_dict.items(): - assert metric_name in METRIC_CATALOG, ( - f"Legacy dict {key} has '{metric_name}' not in METRIC_CATALOG" - ) - assert METRIC_CATALOG[metric_name].function is legacy_func, ( - f"Function mismatch for '{metric_name}' between catalog and legacy dict {key}" - ) - def test_base_profile_covers_all_implemented_genomes(self): """BASE_PROFILE must provide values for every genome param of every implemented metric.""" for metric_name, spec in METRIC_CATALOG.items(): @@ -365,7 +344,7 @@ def test_evaluator_rejects_unknown_profile(self): def test_registry_snapshot_integrity(self): """Registries have expected sizes β€” catches accidental mutation or deletion.""" - assert len(METRIC_CATALOG) == 21 + assert len(METRIC_CATALOG) == 24 assert len(METRIC_MEMBERSHIP) == 4 assert len(PROFILES) >= 2 diff --git a/tests/test_metric_correctness.py b/tests/test_metric_correctness.py deleted file mode 100644 index d86dcf1..0000000 --- a/tests/test_metric_correctness.py +++ /dev/null @@ -1,239 +0,0 @@ -""" -PHASE-3-DELETE: Tests metric correctness through the legacy 
EvaluationManager path. -Will be deleted when Phase 3 of the orchestrator migration is complete. -See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md -""" -import pandas as pd -import numpy as np -import pytest - -from views_evaluation.evaluation.evaluation_manager import EvaluationManager - -class TestMetricCorrectness: - """ - A test suite for Phase 3: Data-Centric & Metric-Specific Validation. - These tests verify the numerical correctness of the metric calculators - using 'golden datasets' with pre-calculated, known outcomes. - """ - - def test_rmsle_golden_dataset_perfect_match(self): - """ - Tests the RMSLE calculation with a perfect match. - Expected: RMSLE should be 0.0. - """ - # Arrange - target_name = "lr_test" - pred_col_name = f"pred_{target_name}" - - # Create a simple, non-random dataset - actuals_index = pd.MultiIndex.from_product([[500], [10, 20]], names=['month_id', 'country_id']) - actuals = pd.DataFrame({target_name: [100, 50]}, index=actuals_index) - - # Predictions are identical to actuals - predictions_df = pd.DataFrame({pred_col_name: [[100.0], [50.0]]}, index=actuals_index) - predictions = [predictions_df] - - config = { - 'steps': [1], - 'regression_targets': [target_name], - 'regression_point_metrics': ['RMSLE'], - } - manager = EvaluationManager() - - # Act - results = manager.evaluate( - actual=actuals, - predictions=predictions, - target=target_name, - config=config - ) - - # Assert - # Check all evaluation schemas for correctness - rmsle_step = results['step'][1]['RMSLE'].iloc[0] - rmsle_ts = results['time_series'][1]['RMSLE'].iloc[0] - rmsle_month = results['month'][1]['RMSLE'].iloc[0] - - assert rmsle_step == 0.0 - assert rmsle_ts == 0.0 - assert rmsle_month == 0.0 - - def test_rmsle_golden_dataset_simple_mismatch(self): - """ - Tests the RMSLE calculation with a simple, known mismatch. - actual = e - 1, pred = 0. - log(actual + 1) = log(e) = 1. - log(pred + 1) = log(1) = 0. 
- RMSLE = sqrt((1-0)^2) = 1. - Expected: RMSLE should be 1.0. - """ - # Arrange - target_name = "lr_test" - pred_col_name = f"pred_{target_name}" - - actual_val = np.e - 1 - pred_val = 0.0 - - actuals_index = pd.MultiIndex.from_product([[500], [10]], names=['month_id', 'country_id']) - actuals = pd.DataFrame({target_name: [actual_val]}, index=actuals_index) - - predictions_df = pd.DataFrame({pred_col_name: [[pred_val]]}, index=actuals_index) - predictions = [predictions_df] - - config = { - 'steps': [1], - 'regression_targets': [target_name], - 'regression_point_metrics': ['RMSLE'], - } - manager = EvaluationManager() - - # Act - results = manager.evaluate( - actual=actuals, - predictions=predictions, - target=target_name, - config=config - ) - - # Assert - rmsle_step = results['step'][1]['RMSLE'].iloc[0] - - assert rmsle_step == pytest.approx(1.0) - - def test_ap_metric_with_prebinarised_inputs(self): - """ - Tests the AP (Average Precision) metric with pre-binarised actuals and probability - scores as predictions. AP is a classification metric; actuals must already be - binary (0/1) before reaching evaluate(). No threshold kwarg is accepted. 
- """ - # Arrange - target_name = "cls_binary" - pred_col_name = f"pred_{target_name}" - - # Pre-binarised actuals and probability scores - y_true_binary = [0, 1, 1, 0] - y_scores = [0.1, 0.4, 0.35, 0.8] - - actuals_index = pd.MultiIndex.from_product( - [[500], [10, 20, 30, 40]], names=['month_id', 'country_id'] - ) - actuals = pd.DataFrame({target_name: y_true_binary}, index=actuals_index) - predictions_df = pd.DataFrame( - {pred_col_name: [[s] for s in y_scores]}, index=actuals_index - ) - predictions = [predictions_df] - - config = { - 'steps': [1], - 'classification_targets': [target_name], - 'classification_point_metrics': ['AP'], - } - manager = EvaluationManager() - - # Act - results = manager.evaluate( - actual=actuals, - predictions=predictions, - target=target_name, - config=config - ) - - ap_step = results['step'][1]['AP'].iloc[0] - - # Expected AP from sklearn with the raw probability scores as the ranking signal - from sklearn.metrics import average_precision_score - expected_ap = average_precision_score(y_true_binary, y_scores) - - assert ap_step == pytest.approx(expected_ap) - - def test_crps_golden_dataset_point_prediction(self): - """ - Tests the CRPS calculation for point predictions (single-value ensemble). - Expected: CRPS matches properscoring for a 1-sample ensemble. - """ - # Arrange - target_name = "lr_test_crps_point" - pred_col_name = f"pred_{target_name}" - - # Simple dataset: one actual, one prediction - actual_val = 5.0 - pred_val = 6.0 - - actuals_index = pd.MultiIndex.from_product([[500], [10]], names=['month_id', 'country_id']) - actuals = pd.DataFrame({target_name: [actual_val]}, index=actuals_index) - - # Single-value prediction β†’ point prediction, use regression_sample_metrics - # by providing a multi-element ensemble so it's detected as sample type. 
- # Use the same scalar as a 3-sample degenerate ensemble for CRPS: - predictions_df = pd.DataFrame({pred_col_name: [[pred_val, pred_val, pred_val]]}, index=actuals_index) - predictions = [predictions_df] - - config = { - 'steps': [1], - 'regression_targets': [target_name], - 'regression_point_metrics': ['RMSLE'], # required by _validate_config - 'regression_sample_metrics': ['CRPS'], # routed to because predictions are multi-element - } - manager = EvaluationManager() - - # Act - results = manager.evaluate( - actual=actuals, - predictions=predictions, - target=target_name, - config=config - ) - - # Assert - crps_step = results['step'][1]['CRPS'].iloc[0] - - # Calculate expected CRPS using properscoring for the degenerate 3-sample ensemble - import properscoring as ps - expected_crps = ps.crps_ensemble(actual_val, np.array([pred_val, pred_val, pred_val])) - - assert crps_step == pytest.approx(expected_crps) - - def test_crps_golden_dataset_sample_prediction(self): - """ - Tests the CRPS calculation for sample predictions (ensemble of multiple values). - Expected: CRPS for sample predictions matches properscoring. 
- """ - # Arrange - target_name = "lr_test_crps_sample" - pred_col_name = f"pred_{target_name}" - - # Simple dataset: one actual, one prediction ensemble - actual_val = 5.0 - prediction_ensemble = [3.0, 4.0, 5.0, 6.0, 7.0] # A simple ensemble - - actuals_index = pd.MultiIndex.from_product([[500], [10]], names=['month_id', 'country_id']) - actuals = pd.DataFrame({target_name: [actual_val]}, index=actuals_index) - - # Sample prediction is a list of multiple values - predictions_df = pd.DataFrame({pred_col_name: [prediction_ensemble]}, index=actuals_index) - predictions = [predictions_df] - - config = { - 'steps': [1], - 'regression_targets': [target_name], - 'regression_point_metrics': ['RMSLE'], # required by _validate_config - 'regression_sample_metrics': ['CRPS'], # routed to because predictions are multi-element - } - manager = EvaluationManager() - - # Act - results = manager.evaluate( - actual=actuals, - predictions=predictions, - target=target_name, - config=config - ) - - # Assert - crps_step = results['step'][1]['CRPS'].iloc[0] - - # Calculate expected CRPS using properscoring for the ensemble - import properscoring as ps - expected_crps = ps.crps_ensemble(actual_val, np.array(prediction_ensemble)) - - assert crps_step == pytest.approx(expected_crps) diff --git a/tests/test_native_evaluator.py b/tests/test_native_evaluator.py index d60ca2e..a0fa14f 100644 --- a/tests/test_native_evaluator.py +++ b/tests/test_native_evaluator.py @@ -223,6 +223,48 @@ def test_single_origin_single_step(self): assert 'ts00' in d['time_series'] assert 'step01' in d['step'] + def test_multi_target_regression_and_classification(self): + """Config with both target types; each evaluated separately via EvaluationFrame metadata.""" + n = 4 + config = { + 'steps': [1, 2], + 'regression_targets': ['ged_sb'], + 'classification_targets': ['by_sb'], + 'regression_point_metrics': ['MSE'], + 'classification_point_metrics': ['AP'], + } + # Evaluate regression target + ef_reg = EvaluationFrame( 
+ y_true=np.array([1.0, 2.0, 3.0, 4.0]), + y_pred=np.array([[1.1], [2.1], [3.1], [4.1]]), + identifiers={ + 'time': np.array([100, 100, 101, 101]), + 'unit': np.array([1, 2, 1, 2]), + 'origin': np.zeros(n, dtype=int), + 'step': np.array([1, 1, 2, 2]), + }, + metadata={'target': 'ged_sb'}, + ) + report_reg = NativeEvaluator(config).evaluate(ef_reg) + assert report_reg.task == 'regression' + assert 'MSE' in report_reg.to_dict()['schemas']['month']['month100'] + + # Evaluate classification target + ef_cls = EvaluationFrame( + y_true=np.array([0.0, 1.0, 0.0, 1.0]), + y_pred=np.array([[0.2], [0.8], [0.3], [0.7]]), + identifiers={ + 'time': np.array([100, 100, 101, 101]), + 'unit': np.array([1, 2, 1, 2]), + 'origin': np.zeros(n, dtype=int), + 'step': np.array([1, 1, 2, 2]), + }, + metadata={'target': 'by_sb'}, + ) + report_cls = NativeEvaluator(config).evaluate(ef_cls) + assert report_cls.task == 'classification' + assert 'AP' in report_cls.to_dict()['schemas']['month']['month100'] + def test_classification_target(self): n = 6 ef = EvaluationFrame( @@ -246,6 +288,65 @@ def test_classification_target(self): assert report.pred_type == 'point' assert 'month100' in report.to_dict()['schemas']['month'] + def test_classification_sample_brier(self): + """Brier_sample and CRPS work for classification sample predictions.""" + n = 6 + ef = EvaluationFrame( + y_true=np.array([0.0, 1.0, 0.0, 1.0, 0.0, 1.0]), + y_pred=np.random.default_rng(42).uniform(0, 2, size=(n, 20)), + identifiers={ + 'time': np.array([100, 100, 101, 101, 102, 102]), + 'unit': np.array([1, 2, 1, 2, 1, 2]), + 'origin': np.zeros(n, dtype=int), + 'step': np.array([1, 1, 2, 2, 3, 3]), + }, + metadata={'target': 'by_sb_best'}, + ) + config = { + 'steps': [1, 2, 3], + 'classification_targets': ['by_sb_best'], + 'classification_sample_metrics': ['Brier_sample', 'CRPS'], + } + report = NativeEvaluator(config).evaluate(ef) + assert report.task == 'classification' + assert report.pred_type == 'sample' + d = 
report.to_dict()['schemas'] + assert 'Brier_sample' in d['month']['month100'] + assert 'CRPS' in d['month']['month100'] + + def test_classification_point_brier(self): + """AP and Brier_point work together for classification point predictions.""" + n = 6 + ef = EvaluationFrame( + y_true=np.array([0.0, 1.0, 0.0, 1.0, 0.0, 1.0]), + y_pred=np.array([[0.2], [0.8], [0.3], [0.7], [0.4], [0.6]]), + identifiers={ + 'time': np.array([100, 100, 101, 101, 102, 102]), + 'unit': np.array([1, 2, 1, 2, 1, 2]), + 'origin': np.zeros(n, dtype=int), + 'step': np.array([1, 1, 2, 2, 3, 3]), + }, + metadata={'target': 'by_sb_best'}, + ) + config = { + 'steps': [1, 2, 3], + 'classification_targets': ['by_sb_best'], + 'classification_point_metrics': ['AP', 'Brier_point'], + } + report = NativeEvaluator(config).evaluate(ef) + d = report.to_dict()['schemas'] + assert 'AP' in d['step']['step01'] + assert 'Brier_point' in d['step']['step01'] + + def test_evaluate_twice_produces_identical_results(self): + """NativeEvaluator is stateless β€” same input yields same output.""" + ef = _make_parallelogram_ef(n_origins=2, n_steps=3, n_units=2) + config = _regression_point_config(steps=[1, 2, 3]) + evaluator = NativeEvaluator(config) + report1 = evaluator.evaluate(ef) + report2 = evaluator.evaluate(ef) + assert report1.to_dict() == report2.to_dict() + def test_sample_predictions_produce_point_pred_type_false(self): n = 4 ef = EvaluationFrame( @@ -307,6 +408,16 @@ def test_invalid_metric_name_raises_value_error(self): with pytest.raises(ValueError, match="not valid"): NativeEvaluator(config).evaluate(ef) + def test_empty_config_accepted_at_init_fails_at_evaluate(self): + """Empty config is accepted at init (C-02 known gap) but fails at evaluate(). + + NativeEvaluator.__init__ only validates profile name (defaults to 'base'). + Structural config errors surface at evaluate() time, not construction. 
+ """ + ef = _make_parallelogram_ef(n_origins=1, n_steps=2, n_units=2) + evaluator = NativeEvaluator({}) # does NOT raise β€” C-02 + with pytest.raises((ValueError, KeyError)): + evaluator.evaluate(ef) def test_classification_metric_on_regression_target_raises(self): """AP is only valid for classification; using it with regression_targets must fail.""" ef = _make_parallelogram_ef(n_origins=1, n_steps=2, n_units=2) diff --git a/tests/test_parity_adapter_transfer.py b/tests/test_parity_adapter_transfer.py deleted file mode 100644 index 8145acc..0000000 --- a/tests/test_parity_adapter_transfer.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -PHASE-3-DELETE: Tests parity between internal and external PandasAdapter adaptation. -Will be deleted when Phase 3 of the orchestrator migration is complete. -See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md -""" -import pytest -import pandas as pd -from views_evaluation import EvaluationManager, PandasAdapter, NativeEvaluator - -def test_parity_internal_vs_external_adaptation(): - """ - PROVING PARITY FOR UPSTREAMING: - This test verifies that adapting a DataFrame to an EvaluationFrame - OUTSIDE of the EvaluationManager produces identical results to - letting the Manager handle it internally. - """ - # 1. Setup Data - index = pd.MultiIndex.from_product([[100, 101], [1, 2]], names=['month', 'unit']) - actuals = pd.DataFrame({'target': [0, 1, 0, 1]}, index=index) - preds = [pd.DataFrame({'pred_target': [0.1, 0.8, 0.15, 0.7]}, index=index)] - - config = { - 'steps': [1], - 'regression_targets': ['target'], - 'regression_point_metrics': ['MSE'] - } - - manager = EvaluationManager() - - # 2. PATH A: Internal Adaptation (The status quo) - # Manager receives DataFrames, adapts internally. - results_internal = manager.evaluate(actuals, preds, "target", config) - - # 3. PATH B: External Adaptation (The future) - # Simulation of Orchestrator running the adapter. 
- ef_external = PandasAdapter.from_dataframes(actuals, preds, "target") - - # We use the NativeEvaluator directly to simulate the final state - evaluator = NativeEvaluator(config) - report_external = evaluator.evaluate(ef_external) - - # 4. Bit-wise Parity Check - for schema in ["month", "time_series", "step"]: - df_internal = results_internal[schema][1] - df_external = report_external.to_dataframe(schema) - - pd.testing.assert_frame_equal(df_internal, df_external, - obj=f"Divergence in schema: {schema}") - - print("Parity Proven: External adaptation matches internal adaptation 100%.") - -def test_shadow_verification_mode(): - """Verifies that verify_parity=True catches mismatches and allows matches.""" - index = pd.MultiIndex.from_product([[100], [1]], names=['month', 'unit']) - actuals = pd.DataFrame({'target': [1]}, index=index) - preds = [pd.DataFrame({'pred_target': [0.9]}, index=index)] - config = {'steps': [1], 'regression_targets': ['target'], 'regression_point_metrics': ['MSE']} - - manager = EvaluationManager() - ef_external = PandasAdapter.from_dataframes(actuals, preds, "target") - - # 1. Matching case - should pass silently - manager.evaluate(actuals, preds, "target", config, ef=ef_external, verify_parity=True) - - # 2. Mismatching case - should raise ValueError - ef_corrupted = PandasAdapter.from_dataframes(actuals, preds, "target") - ef_corrupted.y_true = ef_corrupted.y_true * 2 # Corrupt the data - - with pytest.raises(ValueError, match="Parity Failure"): - manager.evaluate(actuals, preds, "target", config, ef=ef_corrupted, verify_parity=True) - diff --git a/tests/test_parity_beige.py b/tests/test_parity_beige.py deleted file mode 100644 index 62d2d00..0000000 --- a/tests/test_parity_beige.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -PHASE-3-DELETE: Parity edge-case tests between the legacy EvaluationManager and native paths. -Will be deleted when Phase 3 of the orchestrator migration is complete. 
-See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md -""" -import pytest -import pandas as pd -import numpy as np -from views_evaluation.evaluation.evaluation_manager import EvaluationManager -from views_evaluation.adapters.pandas import PandasAdapter -from views_evaluation.evaluation.native_evaluator import NativeEvaluator -from tests.test_parity_green import assert_parity - -@pytest.fixture -def beige_data_ragged(): - """Ragged sequences and missing months.""" - index = pd.MultiIndex.from_product([[100, 101, 102, 103], [1, 2]], names=['month', 'unit']) - actual = pd.DataFrame({'target': np.random.rand(8)}, index=index) - - # Sequence 0: Month 100 and 101 (complete) - pred_0 = pd.DataFrame({'pred_target': np.random.rand(4)}, index=index[:4]) - - # Sequence 1: Month 101 and 102, but Month 101 Unit 2 is MISSING - idx_1 = index[2:6].drop((101, 2)) - pred_1 = pd.DataFrame({'pred_target': np.random.rand(3)}, index=idx_1) - - # Sequence 2: Only Month 103 - pred_2 = pd.DataFrame({'pred_target': np.random.rand(2)}, index=index[6:]) - - config = { - 'steps': [1, 2], - 'regression_targets': ['target'], - 'regression_point_metrics': ['MSE'] - } - - return actual, [pred_0, pred_1, pred_2], "target", config - -def test_parity_beige_ragged(beige_data_ragged): - actual, predictions, target, config = beige_data_ragged - - # 1. Run Legacy - manager = EvaluationManager() - legacy_results = manager.evaluate(actual, predictions, target, config) - - # 2. Run Native - ef = PandasAdapter.from_dataframes(actual, predictions, target) - native_evaluator = NativeEvaluator(config) - native_results = native_evaluator.evaluate(ef) - - # 3. 
Assert Parity - assert_parity(legacy_results, native_results) diff --git a/tests/test_parity_green.py b/tests/test_parity_green.py deleted file mode 100644 index 91187cd..0000000 --- a/tests/test_parity_green.py +++ /dev/null @@ -1,101 +0,0 @@ -""" -PHASE-3-DELETE: Parity tests between the legacy EvaluationManager and native paths. -Will be deleted when Phase 3 of the orchestrator migration is complete. -See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md -""" -import pytest -import pandas as pd -from views_evaluation.evaluation.evaluation_manager import EvaluationManager -from views_evaluation.adapters.pandas import PandasAdapter -from views_evaluation.evaluation.native_evaluator import NativeEvaluator - -def assert_parity(legacy_results, native_report, tolerance=1e-9): - """ - Asserts bit-wise (or within tolerance) parity between legacy and native results. - legacy_results: output of EvaluationManager.evaluate() - native_report: EvaluationReport object from NativeEvaluator.evaluate() - """ - for schema in ["month", "time_series", "step"]: - legacy_df = legacy_results[schema][1] - native_df = native_report.to_dataframe(schema) - - # Check index parity - pd.testing.assert_index_equal(legacy_df.index, native_df.index) - - # Check column parity (might be slight differences in names if not careful) - pd.testing.assert_index_equal(legacy_df.columns, native_df.columns) - - # Check value parity - pd.testing.assert_frame_equal(legacy_df, native_df, atol=tolerance) - -@pytest.fixture -def green_data(): - """Clean, overlapping rolling origin data.""" - index = pd.MultiIndex.from_product([[100, 101, 102], [1, 2]], names=['month', 'unit']) - actual = pd.DataFrame({'target': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]}, index=index) - - # Sequence 0: Steps 1-2 for all units - pred_0 = pd.DataFrame({'pred_target': [0.11, 0.21, 0.31, 0.41]}, index=index[:4]) - # Sequence 1: Steps 1-2 for all units, starting from month 101 - pred_1 = pd.DataFrame({'pred_target': 
[0.32, 0.42, 0.52, 0.62]}, index=index[2:]) - - config = { - 'steps': [1, 2], - 'regression_targets': ['target'], - 'regression_point_metrics': ['MSE'] - } - - return actual, [pred_0, pred_1], "target", config - -@pytest.fixture -def green_data_samples(): - """Clean, overlapping rolling origin data with samples.""" - index = pd.MultiIndex.from_product([[100, 101, 102], [1, 2]], names=['month', 'unit']) - actual = pd.DataFrame({'target': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]}, index=index) - - # 3 samples per prediction - pred_0 = pd.DataFrame({ - 'pred_target': [[0.1, 0.12, 0.08], [0.2, 0.22, 0.18], [0.3, 0.32, 0.28], [0.4, 0.42, 0.38]] - }, index=index[:4]) - pred_1 = pd.DataFrame({ - 'pred_target': [[0.31, 0.33, 0.29], [0.41, 0.43, 0.39], [0.51, 0.53, 0.49], [0.61, 0.63, 0.59]] - }, index=index[2:]) - - config = { - 'steps': [1, 2], - 'regression_targets': ['target'], - 'regression_point_metrics': [], - 'regression_sample_metrics': ['CRPS'] - } - - return actual, [pred_0, pred_1], "target", config - -def test_parity_green_happy_path(green_data): - actual, predictions, target, config = green_data - - # 1. Run Legacy - manager = EvaluationManager() - legacy_results = manager.evaluate(actual, predictions, target, config) - - # 2. Run Native (New Path) - ef = PandasAdapter.from_dataframes(actual, predictions, target) - native_evaluator = NativeEvaluator(config) - native_results = native_evaluator.evaluate(ef) - - # 3. 
Assert Parity - assert_parity(legacy_results, native_results) - -def test_parity_green_ignorance(green_data_samples): - actual, predictions, target, config = green_data_samples - # Update config to use Ignorance score - config['regression_sample_metrics'] = ['Ignorance'] - - manager = EvaluationManager() - legacy_results = manager.evaluate(actual, predictions, target, config) - - ef = PandasAdapter.from_dataframes(actual, predictions, target) - native_evaluator = NativeEvaluator(config) - native_results = native_evaluator.evaluate(ef) - - assert_parity(legacy_results, native_results) - diff --git a/tests/test_parity_red.py b/tests/test_parity_red.py deleted file mode 100644 index f217233..0000000 --- a/tests/test_parity_red.py +++ /dev/null @@ -1,120 +0,0 @@ -""" -PHASE-3-DELETE: Parity error-case tests between the legacy EvaluationManager and native paths. -Will be deleted when Phase 3 of the orchestrator migration is complete. -See reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md -""" -import pytest -import pandas as pd -import numpy as np -from views_evaluation.evaluation.evaluation_manager import EvaluationManager -from views_evaluation.adapters.pandas import PandasAdapter -from views_evaluation.evaluation.native_evaluator import NativeEvaluator -from tests.test_parity_green import assert_parity - -@pytest.fixture -def red_data_unordered(): - """Predictions are mis-ordered in the DataFrame.""" - index = pd.MultiIndex.from_product([[100, 101], [1, 2]], names=['month', 'unit']) - actual = pd.DataFrame({'target': [0.1, 0.2, 0.3, 0.4]}, index=index) - - # Sequence 0: SHUFFLED rows - shuffled_idx = index[[3, 0, 2, 1]] - pred_0 = pd.DataFrame({'pred_target': [0.41, 0.11, 0.31, 0.21]}, index=shuffled_idx) - - config = { - 'steps': [1, 2], - 'regression_targets': ['target'], - 'regression_point_metrics': ['MSE'] - } - - return actual, [pred_0], "target", config - -@pytest.fixture -def red_data_coordinates(): - """Mismatched coordinates 
(extra units/months).""" - index_actual = pd.MultiIndex.from_product([[100, 101], [1, 2]], names=['month', 'unit']) - actual = pd.DataFrame({'target': [0.1, 0.2, 0.3, 0.4]}, index=index_actual) - - # Extra unit 3 (not in actuals) - index_pred = pd.MultiIndex.from_product([[100, 101], [1, 2, 3]], names=['month', 'unit']) - pred_0 = pd.DataFrame({'pred_target': [0.11, 0.21, 0.99, 0.31, 0.41, 0.99]}, index=index_pred) - - config = { - 'steps': [1, 2], - 'regression_targets': ['target'], - 'regression_point_metrics': ['MSE'] - } - - return actual, [pred_0], "target", config - -@pytest.fixture -def red_data_inconsistent_samples(): - """Ragged sample lengths (e.g. some rows have 2 samples, some have 3).""" - index = pd.MultiIndex.from_product([[100], [1, 2]], names=['month', 'unit']) - actual = pd.DataFrame({'target': [0.1, 0.2]}, index=index) - - # Row 1 has 2 samples, Row 2 has 3 samples - pred_0 = pd.DataFrame({ - 'pred_target': [[0.1, 0.12], [0.2, 0.22, 0.24]] - }, index=index) - - config = { - 'steps': [1], - 'regression_targets': ['target'], - 'regression_point_metrics': [], - 'regression_sample_metrics': ['CRPS'] - } - - return actual, [pred_0], "target", config - -@pytest.fixture -def red_data_nan_index(): - """NaNs in index levels.""" - # Note: Use object dtype for index to allow NaN mixed with ints - idx_actual = pd.MultiIndex.from_tuples([(100, 1), (101, 1), (np.nan, 2)], names=['month', 'unit']) - actual = pd.DataFrame({'target': [0.1, 0.2, 0.3]}, index=idx_actual) - - idx_pred = pd.MultiIndex.from_tuples([(100, 1), (101, 1), (np.nan, 2)], names=['month', 'unit']) - pred_0 = pd.DataFrame({'pred_target': [0.11, 0.21, 0.31]}, index=idx_pred) - - config = { - 'steps': [1, 2], - 'regression_targets': ['target'], - 'regression_point_metrics': ['MSE'] - } - - return actual, [pred_0], "target", config - -def test_parity_red_unordered(red_data_unordered): - actual, predictions, target, config = red_data_unordered - manager = EvaluationManager() - legacy_results = 
manager.evaluate(actual, predictions, target, config) - ef = PandasAdapter.from_dataframes(actual, predictions, target) - native_evaluator = NativeEvaluator(config) - native_results = native_evaluator.evaluate(ef) - assert_parity(legacy_results, native_results) - -def test_parity_red_coordinates(red_data_coordinates): - actual, predictions, target, config = red_data_coordinates - manager = EvaluationManager() - legacy_results = manager.evaluate(actual, predictions, target, config) - ef = PandasAdapter.from_dataframes(actual, predictions, target) - native_evaluator = NativeEvaluator(config) - native_results = native_evaluator.evaluate(ef) - assert_parity(legacy_results, native_results) - -def test_fail_loud_inconsistent_samples(red_data_inconsistent_samples): - actual, predictions, target, config = red_data_inconsistent_samples - manager = EvaluationManager() - # The new implementation raises a more descriptive error from the adapter - with pytest.raises(ValueError, match="Inconsistent list lengths"): - manager.evaluate(actual, predictions, target, config) - - -def test_fail_loud_nan_index(red_data_nan_index): - actual, predictions, target, config = red_data_nan_index - manager = EvaluationManager() - # The new implementation fails early in the adapter if NaNs are detected - with pytest.raises(ValueError, match="NaN detected in 'time' index level"): - manager.evaluate(actual, predictions, target, config) - diff --git a/views_evaluation/__init__.py b/views_evaluation/__init__.py index 64c263b..91a728e 100644 --- a/views_evaluation/__init__.py +++ b/views_evaluation/__init__.py @@ -1,6 +1,4 @@ -# ── Permanent public API ───────────────────────────────────────────────────── -# These classes are the stable, long-term interface of this library. -# They will remain after Phase 3 of the orchestrator migration. 
+# ── Public API ──────────────────────────────────────────────────────────────── from views_evaluation.evaluation.evaluation_frame import EvaluationFrame from views_evaluation.evaluation.native_evaluator import NativeEvaluator from views_evaluation.evaluation.evaluation_report import EvaluationReport @@ -13,16 +11,7 @@ from views_evaluation.evaluation.config_schema import EvaluationConfig from views_evaluation.profiles import PROFILES -# ── Temporary (PHASE-3-DELETE) ──────────────────────────────────────────────── -# These classes exist for backward compatibility and parity testing while the -# orchestrator migration (ADR-011, report 10) completes in views-pipeline-core. -# They will be removed once upstream parity is confirmed. Do not build new -# integrations on them. -from views_evaluation.evaluation.evaluation_manager import EvaluationManager -from views_evaluation.adapters.pandas import PandasAdapter - __all__ = [ - # Permanent "EvaluationFrame", "NativeEvaluator", "EvaluationReport", @@ -32,7 +21,4 @@ "resolve_metric_params", "EvaluationConfig", "PROFILES", - # Temporary β€” PHASE-3-DELETE - "EvaluationManager", - "PandasAdapter", ] diff --git a/views_evaluation/adapters/pandas.py b/views_evaluation/adapters/pandas.py deleted file mode 100644 index 5941578..0000000 --- a/views_evaluation/adapters/pandas.py +++ /dev/null @@ -1,150 +0,0 @@ -""" -PHASE-3-DELETE -This module is TEMPORARY and will be deleted in Phase 3 of the orchestrator migration. -See: reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md - -After Phase 3: - - Adapters live in views-pipeline-core (or in the calling repository) - - This repo has no knowledge of any specific data framework - - This file will not exist in this repository - -Do not add new functionality to this file. 
-""" -import warnings -import numpy as np -import pandas as pd -from typing import List -from views_evaluation.evaluation.evaluation_frame import EvaluationFrame - -class PandasAdapter: - """ - Adapter to convert Pandas DataFrames into the native EvaluationFrame. - - This class 'knows' about Pandas, allowing the rest of the core - to remain pure. - """ - - @staticmethod - def from_dataframes( - actual: pd.DataFrame, - predictions: List[pd.DataFrame], - target: str, - ) -> EvaluationFrame: - """ - Convert the current List[DataFrame] structure into a single EvaluationFrame. - - Args: - actual: DataFrame with MultiIndex [time, unit] - predictions: List of DataFrames with MultiIndex [time, unit] - target: The name of the target column - """ - - warnings.warn( - "PandasAdapter is deprecated and will be removed from this repo in Phase 3. " - "Adapters belong in the calling repository (e.g. views-pipeline-core).", - DeprecationWarning, - stacklevel=2, - ) - all_y_true = [] - all_y_pred = [] - all_times = [] - all_units = [] - all_origins = [] - all_steps = [] - - pred_col = f"pred_{target}" - - if target not in actual.columns: - raise KeyError(f"Target column '{target}' not found in actuals.") - - if not predictions: - # Align with legacy expected error message - raise ValueError("No objects to concatenate") - - for i, df in enumerate(predictions): - # 1. Align/Match Actuals (duplicated logic from EvaluationManager) - common_idx = actual.index.intersection(df.index) - if common_idx.empty: - continue - - matched_pred = df.loc[common_idx] - matched_actual = actual.loc[common_idx, target] - - # 2. 
Extract Data - # Note: We assume all cells have the same number of samples - # This is where we explode the 'list-in-cell' - sample_lists = matched_pred[pred_col].tolist() - - # ADR-012: Validate rectangular samples - lengths = [len(x) if isinstance(x, (list, np.ndarray)) else 1 for x in sample_lists] - if len(set(lengths)) > 1: - # Align with legacy expected error message - raise ValueError( - f"Inconsistent list lengths in sample evaluation. " - f"Found lengths {set(lengths)}" - ) - - samples = np.array(sample_lists) - if samples.ndim == 1: # Point forecasts - samples = samples.reshape(-1, 1) - - n_rows = len(matched_actual) - - # Legacy Actuals might be list-like (e.g. [0.1]) - actual_vals = matched_actual.values - if actual_vals.dtype == object: - # Coerce to scalars - actual_vals = np.array([ - x[0] if isinstance(x, (list, np.ndarray)) and len(x) > 0 else x - for x in actual_vals - ]) - - all_y_true.append(actual_vals) - all_y_pred.append(samples) - - # 3. Extract Identifiers - times = matched_pred.index.get_level_values(0).values - units = matched_pred.index.get_level_values(1).values - - # ADR-012: No NaNs in identifiers - if np.any(pd.isna(times)): - raise ValueError(f"NaN detected in 'time' index level of sequence {i}.") - if np.any(pd.isna(units)): - raise ValueError(f"NaN detected in 'unit' index level of sequence {i}.") - - all_times.append(times) - all_units.append(units) - - # 4. 
Synthesize Origin and Step - # Origin is the list index - all_origins.append(np.full(n_rows, i)) - - # Step is positional lead-time per unique month in the sequence - unique_times = matched_pred.index.get_level_values(0).unique() - time_to_step = {t: step_idx + 1 for step_idx, t in enumerate(unique_times)} - steps = np.array([time_to_step[t] for t in times]) - all_steps.append(steps) - - if not all_y_true: - # ADR-013: Fail-Loud on zero overlap - raise ValueError("need at least one array to concatenate") - - # ADR-012: Ensure all sequences have consistent sample counts - sample_counts = [y.shape[1] for y in all_y_pred] - if len(set(sample_counts)) > 1: - raise ValueError( - "Mix of evaluation types detected: some sequences contain point forecasts, others contain samples. " - "Please ensure all sequences are consistent in their evaluation type." - ) - - return EvaluationFrame( - y_true=np.concatenate(all_y_true), - y_pred=np.concatenate(all_y_pred), - identifiers={ - 'time': np.concatenate(all_times), - 'unit': np.concatenate(all_units), - 'origin': np.concatenate(all_origins), - 'step': np.concatenate(all_steps), - }, - metadata={'target': target} - ) diff --git a/views_evaluation/evaluation/deprecation_msgs.py b/views_evaluation/evaluation/deprecation_msgs.py deleted file mode 100644 index dcbbbc7..0000000 --- a/views_evaluation/evaluation/deprecation_msgs.py +++ /dev/null @@ -1,40 +0,0 @@ - -import warnings - -def raise_legacy_scale_msg() -> None: - - """ - Emit a highly visible warning banner for legacy scale-detection behavior - that should eventually be removed, but does not currently break execution. - """ - - default_msg = """ -Currently, the evaluation package infers target scaling (e.g. log, linear) -from the target variable name (lr_, ln_, lx_). - -This is problematic because: - -1) Target scaling is a MODEL parameter and must live with the model, - not be inferred from target names. 
- -2) Adding new scales would require updating a hard-coded list in the - evaluation package, which is brittle and volatile. - -3) Target prefixes (lr_, ln_, lx_) are not guarantees of scaling β€” - at best they are hints, and can lead to silent errors. - -As such, this behavior should be removed. -Targets should always be assumed unscaled. -""" - - banner = ( - "\n" - + "#" * 78 + "\n" - + "#{:^76}#\n".format("LEGACY SCALE DETECTION β€” SHOULD BE REMOVED") - + "#" * 78 + "\n" - + (default_msg).strip() + "\n" - + "#" * 78 - ) - - # Use UserWarning so it is always shown (DeprecationWarning is often suppressed) - warnings.warn(banner, UserWarning, stacklevel=2) diff --git a/views_evaluation/evaluation/evaluation_frame.py b/views_evaluation/evaluation/evaluation_frame.py index f8cacb5..90f955e 100644 --- a/views_evaluation/evaluation/evaluation_frame.py +++ b/views_evaluation/evaluation/evaluation_frame.py @@ -27,7 +27,13 @@ def _validate(y_true: np.ndarray, y_pred: np.ndarray, identifiers: Dict[str, np. n_rows = len(y_true) if y_pred.shape[0] != n_rows: raise ValueError(f"y_pred rows ({y_pred.shape[0]}) mismatch y_true ({n_rows})") - + + # Rectangular sample validation: y_pred must be a dense 2D array + if y_pred.ndim != 2: + raise ValueError( + f"y_pred must be 2D (N, S), got {y_pred.ndim}D with shape {y_pred.shape}" + ) + # ADR-013: Fail-Loud on corrupted numerical data # Align with legacy test expectations for error messages def check_corrupted(arr, name): diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py deleted file mode 100644 index 8da94cb..0000000 --- a/views_evaluation/evaluation/evaluation_manager.py +++ /dev/null @@ -1,733 +0,0 @@ -""" -PHASE-3-DELETE -This module is TEMPORARY and will be deleted in Phase 3 of the orchestrator migration. 
-See: reports/2026-02-25_evaluation_frame_refactor/10_orchestrator_migration_plan.md - -After Phase 3: - - Adapters live in views-pipeline-core (or in the calling repository) - - EvaluationManager is fully replaced by NativeEvaluator in pipeline-core - - This file will not exist in this repository - -Do not add new functionality to this file. -""" -from typing import List, Tuple -import logging -import warnings -import pandas as pd -import numpy as np -from views_evaluation.adapters.pandas import PandasAdapter -from views_evaluation.evaluation.native_evaluator import NativeEvaluator -from views_evaluation.evaluation.evaluation_frame import EvaluationFrame -from views_evaluation.evaluation.metrics import ( - BaseEvaluationMetrics, -) -from views_evaluation.evaluation.native_metric_calculators import ( - REGRESSION_POINT_NATIVE, - REGRESSION_SAMPLE_NATIVE, - CLASSIFICATION_POINT_NATIVE, - CLASSIFICATION_SAMPLE_NATIVE, -) - -logger = logging.getLogger(__name__) - - -class EvaluationManager: - """ - A class for calculating metrics on time series predictions - Refer to https://github.com/prio-data/views_pipeline/blob/eval_docs/documentation/evaluation/schema.MD for more details on three evaluation schemas. - """ - - def __init__(self): - """ - Initialize the EvaluationManager. - - Metrics to compute and targets to evaluate are declared in the config - passed to evaluate(). No metric list is accepted here. - """ - - warnings.warn( - "EvaluationManager is deprecated and will be removed in Phase 3 of the " - "orchestrator migration. Use NativeEvaluator directly with an adapter. 
" - "See documentation/integration_guide.md.", - DeprecationWarning, - stacklevel=2, - ) - self.regression_point_functions = REGRESSION_POINT_NATIVE - self.regression_sample_functions = REGRESSION_SAMPLE_NATIVE - self.classification_point_functions = CLASSIFICATION_POINT_NATIVE - self.classification_sample_functions = CLASSIFICATION_SAMPLE_NATIVE - - - @staticmethod - def transform_data(df: pd.DataFrame, target: str | list[str]) -> pd.DataFrame: - """ - DEPRECATED. Apply legacy inverse transformations based on target name prefix. - - This method will be removed once all model repos have migrated to returning - predictions on the original scale. Do not add new logic here. - """ - if isinstance(target, str): - target = [target] - for t in target: - if t.startswith("ln") or t.startswith("pred_ln"): - df[[t]] = df[[t]].applymap(lambda x: np.exp(x) - 1) - elif t.startswith("lx") or t.startswith("pred_lx"): - df[[t]] = df[[t]].applymap(lambda x: np.exp(x) - np.exp(100)) - elif t.startswith("lr") or t.startswith("pred_lr"): - pass # identity β€” lr_ targets are already on the original scale - else: - logger.warning( - f"transform_data: unrecognised prefix for target '{t}'. " - "Applying identity (no transformation). " - "If this target requires inverse transformation it must be applied " - "by the model manager before calling evaluate(). " - "This fallback will be removed when transform_data is deprecated." - ) - return df - - @staticmethod - def convert_to_array(df: pd.DataFrame, target: str | list[str]) -> pd.DataFrame: - """ - Convert columns in a DataFrame to numpy arrays. - - Args: - df (pd.DataFrame): The input DataFrame with columns that may contain lists. - - Returns: - pd.DataFrame: A new DataFrame with columns converted to numpy arrays. 
- """ - converted = df.copy() - if isinstance(target, str): - target = [target] - - for t in target: - converted[t] = converted[t].apply( - lambda x: ( - x - if isinstance(x, np.ndarray) - else (np.array(x) if isinstance(x, list) else np.array([x])) - ) - ) - return converted - - @staticmethod - def convert_to_scalar(df: pd.DataFrame, target: str | list[str]) -> pd.DataFrame: - """ - Convert columns in a DataFrame to scalar values by taking the mean of the list. - """ - converted = df.copy() - if isinstance(target, str): - target = [target] - for t in target: - converted[t] = converted[t].apply( - lambda x: np.mean(x) if isinstance(x, (list, np.ndarray)) else x - ) - return converted - - @staticmethod - def get_evaluation_type(predictions: List[pd.DataFrame], target: str) -> bool: - """ - Validates the values in each DataFrame in the list. - The return value indicates whether all DataFrames are for sample evaluation. - - Args: - predictions (List[pd.DataFrame]): A list of DataFrames to check. - - Returns: - bool: True if all DataFrames are for sample evaluation, - False if all DataFrame are for point evaluation. - - Raises: - ValueError: If there is a mix of single and multiple values in the lists, - or if uncertainty lists have different lengths. - """ - is_sample = False - is_point = False - sample_length = None - - for df in predictions: - for value in df[target].values.flatten(): - if not (isinstance(value, np.ndarray) or isinstance(value, list)): - raise ValueError( - "All values must be lists or numpy arrays. Convert the data." - ) - - if len(value) > 1: - is_sample = True - # For sample evaluation, check that all lists have the same length - if sample_length is None: - sample_length = len(value) - elif len(value) != sample_length: - raise ValueError( - f"Inconsistent list lengths in sample evaluation. 
" - f"Found lengths {sample_length} and {len(value)}" - ) - elif len(value) == 1: - is_point = True - else: - raise ValueError("Empty lists are not allowed") - - if is_sample and is_point: - raise ValueError( - "Mix of evaluation types detected: some rows contain single values, others contain multiple values. " - "Please ensure all rows are consistent in their evaluation type" - ) - - return is_sample - - @staticmethod - def validate_predictions(predictions: List[pd.DataFrame], target: str): - """ - Checks if the predictions are valid DataFrames. - - Each DataFrame must have exactly one column named `pred_column_name`. - - Args: - predictions (List[pd.DataFrame]): A list of DataFrames containing the predictions. - target (str): The target column in the actual DataFrame. - """ - pred_column_name = f"pred_{target}" - - if not isinstance(predictions, list): - raise TypeError("Predictions must be a list of DataFrames.") - - for i, df in enumerate(predictions): - - if not isinstance(df, pd.DataFrame): - raise TypeError(f"Predictions[{i}] must be a DataFrame.") - - if df.empty: - raise ValueError(f"Predictions[{i}] must not be empty.") - - if len(df.columns) != 1: - raise ValueError( - f"Predictions[{i}] must contain exactly one column, but found {len(df.columns)}: {list(df.columns)}" # <-------- - ) - - if pred_column_name not in df.columns: - raise ValueError( - f"Predictions[{i}] must contain the column named '{pred_column_name}'. Columns found: {list(df.columns)}" - ) - - @staticmethod - def _match_actual_pred( - actual: pd.DataFrame, pred: pd.DataFrame, target: str - ) -> Tuple[pd.DataFrame, pd.DataFrame]: - """ - Matches the actual and predicted DataFrames based on the index and target column. - - Parameters: - - actual: pd.DataFrame with a MultiIndex (e.g., month, level). - - pred: pd.DataFrame with a MultiIndex that may contain duplicated indices. - - target: str, the target column in actual. - - Returns: - - matched_actual: pd.DataFrame aligned with pred. 
- - matched_pred: pd.DataFrame aligned with actual. - """ - actual_target = actual[[target]] - common_indices = actual_target.index.intersection(pred.index) - matched_pred = pred[pred.index.isin(common_indices)].copy() - - # Create matched_actual by reindexing actual_target to match pred's index structure - # This will duplicate rows in actual where pred has duplicate indices - matched_actual = actual_target.reindex(matched_pred.index) - - matched_actual = matched_actual.sort_index() - matched_pred = matched_pred.sort_index() - - return matched_actual, matched_pred - - - @staticmethod - def _split_dfs_by_step(dfs: list) -> list: - """Β¨ - This function splits a list of DataFrames into a list of DataFrames by step, where the key is the step. - For example, assume df0 has month_id from 100 to 102, df1 has month_id from 101 to 103, and df2 has month_id from 102 to 104. - This function returns a list of three dataframes, with the first dataframe having month_id 100 from df0, month_id 101 from df1, and month_id 102 from df; - the second dataframe having month_id 101 from df0, month_id 102 from df1, and month_id 103 from df2; and the third dataframe having month_id 102 from df1 and month_id 104 from df2. - - Args: - dfs (list): List of DataFrames with overlapping time ranges. - - Returns: - dict (list): A list of DataFrames where each contains one unique month_id from each input DataFrame. - """ - time_id = dfs[0].index.names[0] - all_month_ids = [df.index.get_level_values(0).unique() for df in dfs] - - grouped_month_ids = list(zip(*all_month_ids)) - - result_dfs = [] - for group in grouped_month_ids: - combined = pd.concat( - [df.loc[month_id] for df, month_id in zip(dfs, group)], - keys=group, - names=[time_id], - ) - result_dfs.append(combined) - - return result_dfs - - def _process_data( - self, actual: pd.DataFrame, predictions: List[pd.DataFrame], target: str - ): - """ - Process the data for evaluation. 
- """ - actual = EvaluationManager.transform_data( - EvaluationManager.convert_to_array(actual, target), target - ) - predictions = [ - EvaluationManager.transform_data( - EvaluationManager.convert_to_array(pred, f"pred_{target}"), - f"pred_{target}", - ) - for pred in predictions - ] - return actual, predictions - - @staticmethod - def _normalise_config(config: dict) -> dict: - """ - Translate legacy config keys to canonical keys, warning loudly. - - Legacy key 'targets' β†’ 'regression_targets' - Legacy key 'metrics' β†’ 'regression_point_metrics' - Legacy key 'regression_uncertainty_metrics' β†’ 'regression_sample_metrics' - Legacy key 'classification_uncertainty_metrics' β†’ 'classification_sample_metrics' - """ - canonical = config.copy() - if "targets" in config and "regression_targets" not in config: - logger.warning( - "Config key 'targets' is DEPRECATED and will be rejected in a future " - "version. It has been treated as 'regression_targets'. " - "Update your config." - ) - canonical["regression_targets"] = canonical.pop("targets") - if "metrics" in config and "regression_point_metrics" not in config: - logger.warning( - "Config key 'metrics' is DEPRECATED and will be rejected in a future " - "version. It has been treated as 'regression_point_metrics'. " - "Update your config." - ) - canonical["regression_point_metrics"] = canonical.pop("metrics") - - if "regression_uncertainty_metrics" in config and "regression_sample_metrics" not in config: - logger.warning( - "Config key 'regression_uncertainty_metrics' is DEPRECATED and will be rejected in a future " - "version. It has been treated as 'regression_sample_metrics'. " - "Update your config." 
- ) - canonical["regression_sample_metrics"] = canonical.pop("regression_uncertainty_metrics") - - if "classification_uncertainty_metrics" in config and "classification_sample_metrics" not in config: - logger.warning( - "Config key 'classification_uncertainty_metrics' is DEPRECATED and will be rejected in a future " - "version. It has been treated as 'classification_sample_metrics'. " - "Update your config." - ) - canonical["classification_sample_metrics"] = canonical.pop("classification_uncertainty_metrics") - - return canonical - - @staticmethod - def _validate_config(config: dict) -> None: - """ - Fail loud and fast on an invalid or incomplete config. - - Raises KeyError if required keys are absent. - """ - if "steps" not in config: - raise KeyError("Config must contain 'steps'.") - has_regression = bool(config.get("regression_targets")) - has_classification = bool(config.get("classification_targets")) - if not has_regression and not has_classification: - raise KeyError( - "Config must declare at least one of 'regression_targets' or " - "'classification_targets'." - ) - if has_regression and not ( - config.get("regression_point_metrics") or config.get("regression_sample_metrics") - ): - raise KeyError( - "Config declares 'regression_targets' but has neither " - "'regression_point_metrics' nor 'regression_sample_metrics'." - ) - if has_classification and not ( - config.get("classification_point_metrics") or config.get("classification_sample_metrics") - ): - raise KeyError( - "Config declares 'classification_targets' but has neither " - "'classification_point_metrics' nor 'classification_sample_metrics'." 
- ) - - # Validate that metrics are valid for the task type (ADR-014) - from views_evaluation.evaluation.native_metric_calculators import ( - REGRESSION_POINT_NATIVE, REGRESSION_SAMPLE_NATIVE, - CLASSIFICATION_POINT_NATIVE, CLASSIFICATION_SAMPLE_NATIVE - ) - - for metric in config.get("regression_point_metrics", []): - if metric not in REGRESSION_POINT_NATIVE or metric == "AP": - raise ValueError(f"Metric '{metric}' is not valid for regression point tasks.") - - for metric in config.get("regression_sample_metrics", []): - if metric not in REGRESSION_SAMPLE_NATIVE: - raise ValueError(f"Metric '{metric}' is not valid for regression sample tasks.") - for metric in config.get("classification_point_metrics", []): - if metric not in CLASSIFICATION_POINT_NATIVE: - raise ValueError(f"Metric '{metric}' is not valid for classification point tasks.") - for metric in config.get("classification_sample_metrics", []): - if metric not in CLASSIFICATION_SAMPLE_NATIVE: - raise ValueError(f"Metric '{metric}' is not valid for classification sample tasks.") - - - def step_wise_evaluation( - self, - actual: pd.DataFrame, - predictions: List[pd.DataFrame], - target: str, - steps: List[int], - metrics_list: List[str], - metric_functions: dict, - metrics_cls: type, - **kwargs, - ): - """ - Evaluates the predictions step-wise and calculates the specified metrics. - - Args: - actual (pd.DataFrame): The actual values. - predictions (List[pd.DataFrame]): A list of DataFrames containing the predictions. - target (str): The target column in the actual DataFrame. - steps (List[int]): The steps to evaluate. - metrics_list (List[str]): Metrics to compute, declared in config. - metric_functions (dict): Dispatch dict for the resolved task/pred type. - metrics_cls (type): Dataclass to use for result storage. - - Returns: - Tuple: A tuple containing the evaluation dictionary and the evaluation DataFrame. 
- """ - evaluation_dict = metrics_cls.make_step_wise_evaluation_dict(steps=max(steps)) - result_dfs = EvaluationManager._split_dfs_by_step(predictions) - - step_matched_data = {} - for i, pred in enumerate(result_dfs): - step = i + 1 - matched_actual, matched_pred = EvaluationManager._match_actual_pred( - actual, pred, target - ) - step_matched_data[step] = (matched_actual, matched_pred) - - for metric in metrics_list: - for step, (matched_actual, matched_pred) in step_matched_data.items(): - evaluation_dict[f"step{str(step).zfill(2)}"].__setattr__( - metric, - metric_functions[metric]( - matched_actual, matched_pred, target, **kwargs - ), - ) - - return ( - evaluation_dict, - metrics_cls.evaluation_dict_to_dataframe(evaluation_dict), - ) - - def time_series_wise_evaluation( - self, - actual: pd.DataFrame, - predictions: List[pd.DataFrame], - target: str, - metrics_list: List[str], - metric_functions: dict, - metrics_cls: type, - **kwargs, - ): - """ - Evaluates the predictions time series-wise and calculates the specified metrics. - - Args: - actual (pd.DataFrame): The actual values. - predictions (List[pd.DataFrame]): A list of DataFrames containing the predictions. - target (str): The target column in the actual DataFrame. - metrics_list (List[str]): Metrics to compute, declared in config. - metric_functions (dict): Dispatch dict for the resolved task/pred type. - metrics_cls (type): Dataclass to use for result storage. - - Returns: - Tuple: A tuple containing the evaluation dictionary and the evaluation DataFrame. 
- """ - evaluation_dict = metrics_cls.make_time_series_wise_evaluation_dict( - len(predictions) - ) - - ts_matched_data = {} - for i, pred in enumerate(predictions): - matched_actual, matched_pred = EvaluationManager._match_actual_pred( - actual, pred, target - ) - ts_matched_data[i] = (matched_actual, matched_pred) - - for metric in metrics_list: - for i, (matched_actual, matched_pred) in ts_matched_data.items(): - evaluation_dict[f"ts{str(i).zfill(2)}"].__setattr__( - metric, - metric_functions[metric]( - matched_actual, matched_pred, target, **kwargs - ), - ) - - return ( - evaluation_dict, - metrics_cls.evaluation_dict_to_dataframe(evaluation_dict), - ) - - def month_wise_evaluation( - self, - actual: pd.DataFrame, - predictions: List[pd.DataFrame], - target: str, - metrics_list: List[str], - metric_functions: dict, - metrics_cls: type, - **kwargs, - ): - """ - Evaluates the predictions month-wise and calculates the specified metrics. - - Args: - actual (pd.DataFrame): The actual values. - predictions (List[pd.DataFrame]): A list of DataFrames containing the predictions. - target (str): The target column in the actual DataFrame. - metrics_list (List[str]): Metrics to compute, declared in config. - metric_functions (dict): Dispatch dict for the resolved task/pred type. - metrics_cls (type): Dataclass to use for result storage. - - Returns: - Tuple: A tuple containing the evaluation dictionary and the evaluation DataFrame. 
- """ - pred_concat = pd.concat(predictions) - month_range = pred_concat.index.get_level_values(0).unique() - month_start = int(month_range.min()) - month_end = int(month_range.max()) - - evaluation_dict = metrics_cls.make_month_wise_evaluation_dict( - month_start, month_end - ) - - matched_actual, matched_pred = EvaluationManager._match_actual_pred( - actual, pred_concat, target - ) - - g = matched_pred.groupby(level=matched_pred.index.names[0], sort=False, observed=True) - groups = g.indices # dict: {month -> np.ndarray of row positions} - - for metric in metrics_list: - for month, pos in groups.items(): - value = metric_functions[metric]( - matched_actual.iloc[pos], - matched_pred.iloc[pos], - target, - **kwargs, - ) - evaluation_dict[f"month{str(month)}"].__setattr__(metric, value) - - return ( - evaluation_dict, - metrics_cls.evaluation_dict_to_dataframe(evaluation_dict), - ) - - def evaluate( - self, - actual: pd.DataFrame = None, - predictions: List[pd.DataFrame] = None, - target: str = None, - config: dict = None, - ef: EvaluationFrame = None, - verify_parity: bool = False, - **kwargs, - ): - """ - Evaluate predictions. Supports legacy DataFrame inputs OR Native EvaluationFrame. - - Args: - actual (pd.DataFrame): Optional. Legacy actuals. - predictions (List[pd.DataFrame]): Optional. Legacy predictions. - target (str): Target column name. - config (dict): Evaluation configuration. - ef (EvaluationFrame): Optional. Pre-adapted native frame. - verify_parity (bool): If True and both ef and legacy inputs are provided, - verifies bit-wise parity between them. 
- """ - config = EvaluationManager._normalise_config(config) - EvaluationManager._validate_config(config) - - if ef is not None: - # PATH B: Direct Native Evaluation - if not isinstance(ef, EvaluationFrame): - raise TypeError("Provided 'ef' must be an EvaluationFrame instance.") - target = ef.metadata.get('target', target) - - if verify_parity and actual is not None and predictions is not None: - # ADR-024 Shadow Run: Verify external adaptation matches internal - ef_internal = PandasAdapter.from_dataframes(actual, predictions, target) - # Check data parity - if not np.array_equal(ef.y_true, ef_internal.y_true): - raise ValueError("Parity Failure: y_true mismatch between external and internal adaptation.") - if not np.array_equal(ef.y_pred, ef_internal.y_pred): - raise ValueError("Parity Failure: y_pred mismatch between external and internal adaptation.") - for key in ef.identifiers: - if not np.array_equal(ef.identifiers[key], ef_internal.identifiers[key]): - raise ValueError(f"Parity Failure: identifier '{key}' mismatch.") - else: - # PATH A: Legacy Adaptation - if actual is None or predictions is None or target is None: - raise ValueError("If 'ef' is not provided, 'actual', 'predictions', and 'target' are required.") - - EvaluationManager.validate_predictions(predictions, target) - # ADR-010: Adapt legacy DataFrames to canonical EvaluationFrame - ef = PandasAdapter.from_dataframes(actual, predictions, target) - - # Restore internal state for backward compatibility with reflective tests - self.actual, self.predictions = self._process_data(actual, predictions, target) - - self.is_sample = ef.is_sample - - - # ADR-010: Delegate to the NativeEvaluator (Pure Math Engine) - evaluator = NativeEvaluator(config) - - # Phase 1: Enable legacy compatibility to maintain bit-wise parity - # (Reproduces truncation bugs and positional step assumptions) - try: - report = evaluator.evaluate(ef, legacy_compatibility=True) - # Map report back to legacy dictionary structure for 
backward compatibility - return { - schema: (report.get_schema_results(schema), report.to_dataframe(schema)) - for schema in ["month", "time_series", "step"] - } - except ValueError as e: - # Re-wrap error message to match legacy test expectations if needed - if "Target" in str(e) and "not found in config" in str(e): - raise ValueError(f"Target '{target}' is not declared in config") - raise e - - - - - @staticmethod - def filter_step_wise_evaluation( - step_wise_evaluation_results: dict, - filter_steps: list[int] = [1, 3, 6, 12, 36], - ): - """ - Filter step-wise evaluation results to include only specific steps. - - Args: - step_wise_evaluation_results (dict): The step-wise evaluation results containing evaluation dict and DataFrame. - filter_steps (list[int]): List of step numbers to include in the filtered results. Defaults to [1, 3, 6, 12, 36]. - - Returns: - dict: A dictionary containing the filtered evaluation dictionary and DataFrame for the selected steps. - """ - step_wise_evaluation_dict = step_wise_evaluation_results[0] - step_wise_evaluation_df = step_wise_evaluation_results[1] - - selected_keys = [f"step{str(step).zfill(2)}" for step in filter_steps] - - filtered_evaluation_dict = { - key: step_wise_evaluation_dict[key] - for key in selected_keys - if key in step_wise_evaluation_dict - } - - filtered_evaluation_df = step_wise_evaluation_df.loc[ - step_wise_evaluation_df.index.isin(selected_keys) - ] - - return (filtered_evaluation_dict, filtered_evaluation_df) - - @staticmethod - def aggregate_month_wise_evaluation( - month_wise_evaluation_results: dict, - aggregation_period: int = 6, - aggregation_type: str = "mean", - ): - """ - Aggregate month-wise evaluation results by grouping months into periods and applying aggregation. - - Args: - month_wise_evaluation_results (dict): The month-wise evaluation results containing evaluation dict and DataFrame. - aggregation_period (int): Number of months to group together for aggregation. 
- aggregation_type (str): Type of aggregation to apply. - Returns: - dict: A dictionary containing the aggregated evaluation dictionary and DataFrame. - """ - month_wise_evaluation_dict = month_wise_evaluation_results[0] - month_wise_evaluation_df = month_wise_evaluation_results[1] - - available_months = [ - int(month.replace("month", "")) for month in month_wise_evaluation_df.index - ] - available_months.sort() - - if len(available_months) < aggregation_period: - raise ValueError( - f"Not enough months to aggregate. Available months: {available_months}, aggregation period: {aggregation_period}" - ) - - aggregated_dict = {} - aggregated_data = [] - - for i in range(0, len(available_months), aggregation_period): - period_months = available_months[i : i + aggregation_period] - period_start = period_months[0] - period_end = period_months[-1] - period_key = f"month_{period_start}_{period_end}" - - period_metrics = [] - for month in period_months: - month_key = f"month{month}" - if month_key in month_wise_evaluation_dict: - period_metrics.append(month_wise_evaluation_dict[month_key]) - - if period_metrics: - aggregated_metrics = {} - for metric_name in period_metrics[0].__annotations__.keys(): - metric_values = [ - getattr(metric, metric_name) - for metric in period_metrics - if getattr(metric, metric_name) is not None - ] - - if metric_values: - if aggregation_type == "mean": - aggregated_value = np.mean(metric_values) - elif aggregation_type == "median": - aggregated_value = np.median(metric_values) - else: - raise ValueError( - f"Unsupported aggregation type: {aggregation_type}" - ) - - aggregated_metrics[metric_name] = aggregated_value - else: - aggregated_metrics[metric_name] = None - - if hasattr(period_metrics[0], "__class__"): - aggregated_eval_metrics = period_metrics[0].__class__( - **aggregated_metrics - ) - else: - aggregated_eval_metrics = aggregated_metrics - - aggregated_dict[period_key] = aggregated_eval_metrics - - aggregated_data.append({"month_id": 
period_key, **aggregated_metrics}) - - if aggregated_data: - aggregated_df = BaseEvaluationMetrics.evaluation_dict_to_dataframe( - aggregated_dict - ) - - return (aggregated_dict, aggregated_df) diff --git a/views_evaluation/evaluation/metric_catalog.py b/views_evaluation/evaluation/metric_catalog.py index 861cfeb..7eae282 100644 --- a/views_evaluation/evaluation/metric_catalog.py +++ b/views_evaluation/evaluation/metric_catalog.py @@ -31,10 +31,13 @@ calculate_coverage_native, calculate_mean_interval_score_native, calculate_ignorance_score_native, + calculate_brier_sample_native, + calculate_brier_point_native, + calculate_qs_sample_native, + calculate_qs_point_native, calculate_sd_native, calculate_pEMDiv_native, calculate_variogram_native, - calculate_brier_native, calculate_jeffreys_native, ) @@ -81,20 +84,29 @@ class MetricSpec: "Ignorance": MetricSpec(function=calculate_ignorance_score_native, genome=("bins", "low_bin", "high_bin")), + # ── Quantile Score (Pinball Loss) ──────────────────────────────────── + "QS_sample": MetricSpec(function=calculate_qs_sample_native, genome=("quantile",)), + "QS_point": MetricSpec(function=calculate_qs_point_native, genome=("quantile",)), + + # ── Brier Score ─────────────────────────────────────────────────────── + "Brier_sample": MetricSpec(function=calculate_brier_sample_native, genome=("threshold",)), + "Brier_point": MetricSpec(function=calculate_brier_point_native, genome=("threshold",)), + # ── Classification ──────────────────────────────────────────────────── "AP": MetricSpec(function=calculate_ap_native, genome=()), - "Brier": MetricSpec(function=calculate_brier_native, genome=(), implemented=False), "Jeffreys": MetricSpec(function=calculate_jeffreys_native, genome=(), implemented=False), } METRIC_MEMBERSHIP: Dict[Tuple[str, str], set] = { ("regression", "point"): {"MSE", "MSLE", "RMSLE", "EMD", "Pearson", "MTD", - "y_hat_bar", "MCR_point", "SD", "pEMDiv", "Variogram"}, - ("regression", "sample"): {"CRPS", 
"twCRPS", "MIS", "QIS", "Coverage", - "Ignorance", "y_hat_bar", "MCR_sample"}, - ("classification", "point"): {"AP"}, - ("classification", "sample"): {"CRPS", "twCRPS", "Brier", "Jeffreys"}, + "y_hat_bar", "MCR_point", "QS_point", + "SD", "pEMDiv", "Variogram"}, + ("regression", "sample"): {"CRPS", "twCRPS", "MIS", "QIS", "QS_sample", + "Coverage", "Ignorance", + "y_hat_bar", "MCR_sample"}, + ("classification", "point"): {"AP", "Brier_point"}, + ("classification", "sample"): {"CRPS", "twCRPS", "Brier_sample", "Jeffreys"}, } diff --git a/views_evaluation/evaluation/metrics.py b/views_evaluation/evaluation/metrics.py index 83512cf..c3491f5 100644 --- a/views_evaluation/evaluation/metrics.py +++ b/views_evaluation/evaluation/metrics.py @@ -99,64 +99,8 @@ def evaluation_dict_to_dataframe(evaluation_dict: dict): return df.loc[:, df.notna().any()] -@dataclass -class PointEvaluationMetrics(BaseEvaluationMetrics): - """ - A data class for storing and managing point evaluation metrics for time series forecasting models. - - Attributes: - RMSLE (Optional[float]): Root Mean Squared Logarithmic Error. - CRPS (Optional[float]): Continuous Ranked Probability Score. - AP (Optional[float]): Average Precision. - Brier (Optional[float]): Brier Score. - Jeffreys (Optional[float]): Jeffreys Divergence. - Coverage (Optional[float]): Coverage (Histograms). - EMD (Optional[float]): Earth Mover Distance. - SD (Optional[float]): Sinkhorn Distance. - pEMDiv (Optional[float]): pseudo-Earth Mover Divergence. - Pearson (Optional[float]): Pearson Correlation. - Variogram (Optional[float]): Variogram. 
- """ - - MSE: Optional[float] = None - MSLE: Optional[float] = None - RMSLE: Optional[float] = None - CRPS: Optional[float] = None - AP: Optional[float] = None - EMD: Optional[float] = None - SD: Optional[float] = None - pEMDiv: Optional[float] = None - Pearson: Optional[float] = None - Variogram: Optional[float] = None - MTD: Optional[float] = None - y_hat_bar: Optional[float] = None - - -@dataclass -class SampleEvaluationMetrics(BaseEvaluationMetrics): - """ - A data class for storing and managing sample-based evaluation metrics for time series forecasting models. - - Attributes: - CRPS (Optional[float]): Continuous Ranked Probability Score. - """ - - CRPS: Optional[float] = None - twCRPS: Optional[float] = None - MIS: Optional[float] = None - QIS: Optional[float] = None - Ignorance: Optional[float] = None - Coverage: Optional[float] = None - pEMDiv: Optional[float] = None - Brier: Optional[float] = None - Jeffreys: Optional[float] = None - y_hat_bar: Optional[float] = None - - # --------------------------------------------------------------------------- -# New 2Γ—2 dataclasses: {regression, classification} Γ— {point, sample} -# These replace PointEvaluationMetrics and SampleEvaluationMetrics for -# all new code. The legacy classes above are retained for backward compat. 
+# 2Γ—2 dataclasses: {regression, classification} Γ— {point, sample} # --------------------------------------------------------------------------- @dataclass @@ -173,16 +117,18 @@ class RegressionPointEvaluationMetrics(BaseEvaluationMetrics): MTD: Optional[float] = None y_hat_bar: Optional[float] = None MCR_point: Optional[float] = None + QS_point: Optional[float] = None @dataclass class RegressionSampleEvaluationMetrics(BaseEvaluationMetrics): """Metrics for regression targets evaluated with sample-based predictions.""" - CRPS: Optional[float] = None - twCRPS: Optional[float] = None - MIS: Optional[float] = None - QIS: Optional[float] = None - Coverage: Optional[float] = None + CRPS: Optional[float] = None + twCRPS: Optional[float] = None + MIS: Optional[float] = None + QIS: Optional[float] = None + QS_sample: Optional[float] = None + Coverage: Optional[float] = None Ignorance: Optional[float] = None y_hat_bar: Optional[float] = None MCR_sample: Optional[float] = None @@ -191,13 +137,14 @@ class RegressionSampleEvaluationMetrics(BaseEvaluationMetrics): @dataclass class ClassificationPointEvaluationMetrics(BaseEvaluationMetrics): """Metrics for classification targets evaluated with point (probability) predictions.""" - AP: Optional[float] = None + AP: Optional[float] = None + Brier_point: Optional[float] = None @dataclass class ClassificationSampleEvaluationMetrics(BaseEvaluationMetrics): """Metrics for classification targets evaluated with sample-based predictions.""" - CRPS: Optional[float] = None - twCRPS: Optional[float] = None - Brier: Optional[float] = None - Jeffreys: Optional[float] = None + CRPS: Optional[float] = None + twCRPS: Optional[float] = None + Brier_sample: Optional[float] = None + Jeffreys: Optional[float] = None diff --git a/views_evaluation/evaluation/native_evaluator.py b/views_evaluation/evaluation/native_evaluator.py index 60aa7c9..bfb9a82 100644 --- a/views_evaluation/evaluation/native_evaluator.py +++ 
b/views_evaluation/evaluation/native_evaluator.py @@ -73,7 +73,7 @@ def _calculate_metrics(self, ef: EvaluationFrame, metrics_list: List[str], results[m] = spec.function(ef.y_true, ef.y_pred, **resolved) return results - def evaluate(self, ef: EvaluationFrame, legacy_compatibility: bool = True) -> EvaluationReport: + def evaluate(self, ef: EvaluationFrame, legacy_compatibility: bool = False) -> EvaluationReport: metrics_list, task, pred_type = self._resolve_task_and_metrics(ef) results = {} diff --git a/views_evaluation/evaluation/native_metric_calculators.py b/views_evaluation/evaluation/native_metric_calculators.py index 76f4db5..f04ea8a 100644 --- a/views_evaluation/evaluation/native_metric_calculators.py +++ b/views_evaluation/evaluation/native_metric_calculators.py @@ -238,59 +238,164 @@ def calculate_quantile_interval_score_native( return float(np.mean(qis)) +# ── Brier Score ─────────────────────────────────────────────────────────────── + +def calculate_brier_sample_native( + y_true: np.ndarray, + y_pred: np.ndarray, + target=None, + *, + threshold: float, + **kwargs, +) -> float: + """ + Brier Score for sample-based predictions binarized at a threshold. + + Binarises truth at the threshold, computes event probability from + the fraction of ensemble members exceeding the threshold, then + returns the mean squared error between predicted probability and + binary outcome. + + Brier = mean((p_hat - y_binary)^2) + + where p_hat = mean(y_pred > threshold, axis=1) and + y_binary = (y_true > threshold). + + Note: NaN values in y_true or y_pred are silently converted to + below-threshold (False) by NumPy comparison semantics. Callers + must validate inputs via EvaluationFrame. + + Args: + threshold: Onset threshold for binarisation. Must be provided + explicitly via evaluation profile or model config. 
+ """ + y_true, y_pred = _guard_shapes(y_true, y_pred) + y_binary = (y_true > threshold).astype(float) + p_hat = np.mean(y_pred > threshold, axis=1) + return float(np.mean((p_hat - y_binary) ** 2)) + + +def calculate_brier_point_native( + y_true: np.ndarray, + y_pred: np.ndarray, + target=None, + *, + threshold: float, + **kwargs, +) -> float: + """ + Brier Score for point (probability) predictions binarized at a threshold. + + Binarises truth at the threshold, uses the point prediction + directly as the predicted probability. y_pred values should be + in [0, 1] for meaningful results; values outside this range + produce a mathematically valid but semantically misleading score. + + Brier = mean((y_pred - y_binary)^2) + + For point predictions, y_pred is (N, 1) after _guard_shapes. + The single column is the predicted probability. + + Note: NaN values in y_true or y_pred are silently converted to + below-threshold (False) by NumPy comparison semantics. Callers + must validate inputs via EvaluationFrame. + + Args: + threshold: Onset threshold for binarisation. + """ + y_true, y_pred = _guard_shapes(y_true, y_pred) + y_binary = (y_true > threshold).astype(float) + p_hat = y_pred[:, 0] # Point prediction: single column + return float(np.mean((p_hat - y_binary) ** 2)) + + +# ── Quantile Score (Pinball Loss) ───────────────────────────────────────────── + +def calculate_qs_sample_native( + y_true: np.ndarray, + y_pred: np.ndarray, + target=None, + *, + quantile: float, + **kwargs, +) -> float: + """ + Quantile Score (pinball loss) for sample-based predictions. + + Extracts the specified quantile from the forecast ensemble, then + computes the asymmetric pinball loss. + + QS = mean(max(alpha * (y - q), (1 - alpha) * (q - y))) + + where q = np.quantile(y_pred, quantile, axis=1). + + Args: + quantile: Quantile level in (0, 1). E.g. 0.99 for QS99. 
+ """ + y_true, y_pred = _guard_shapes(y_true, y_pred) + q = np.quantile(y_pred, quantile, axis=1) + diff = y_true - q + scores = np.where( + diff >= 0, + diff * quantile, + -diff * (1 - quantile), + ) + return float(np.mean(scores)) + + +def calculate_qs_point_native( + y_true: np.ndarray, + y_pred: np.ndarray, + target=None, + *, + quantile: float, + **kwargs, +) -> float: + """ + Quantile Score (pinball loss) for point predictions. + + The point prediction is treated as the quantile estimate directly. + Computes the asymmetric pinball loss. + + QS = mean(max(alpha * (y - y_hat), (1 - alpha) * (y_hat - y))) + + For point predictions, y_pred is (N, 1) after _guard_shapes. + + Args: + quantile: Quantile level in (0, 1). E.g. 0.99 for QS99. + """ + y_true, y_pred = _guard_shapes(y_true, y_pred) + q = y_pred[:, 0] + diff = y_true - q + scores = np.where( + diff >= 0, + diff * quantile, + -diff * (1 - quantile), + ) + return float(np.mean(scores)) + + # Placeholder functions for metrics that are planned but not yet implemented. -# ADR-013: Raise ValueError (not NotImplementedError) so callers get a consistent, -# user-facing message rather than a bare exception type. +# ADR-013: Raise ValueError (not NotImplementedError) so callers get a +# consistent, user-facing message rather than a bare exception type. def calculate_sd_native(*args, **kwargs): - raise ValueError("Metric 'SD' is defined but not yet implemented. Remove it from your config.") + raise ValueError( + "Metric 'SD' is defined but not yet implemented." + " Remove it from your config." + ) def calculate_pEMDiv_native(*args, **kwargs): - raise ValueError("Metric 'pEMDiv' is defined but not yet implemented. Remove it from your config.") + raise ValueError( + "Metric 'pEMDiv' is defined but not yet implemented." + " Remove it from your config." + ) def calculate_variogram_native(*args, **kwargs): - raise ValueError("Metric 'Variogram' is defined but not yet implemented. 
Remove it from your config.") -def calculate_brier_native(*args, **kwargs): - raise ValueError("Metric 'Brier' is defined but not yet implemented. Remove it from your config.") + raise ValueError( + "Metric 'Variogram' is defined but not yet implemented." + " Remove it from your config." + ) def calculate_jeffreys_native(*args, **kwargs): - raise ValueError("Metric 'Jeffreys' is defined but not yet implemented. Remove it from your config.") - -# PHASE-3-DELETE: Legacy alias retained for test_evaluation_manager.py -calculate_ap = calculate_ap_native - -# Dispatch dicts (Framework Agnostic) -REGRESSION_POINT_NATIVE = { - "MSE": calculate_mse_native, - "MSLE": calculate_msle_native, - "RMSLE": calculate_rmsle_native, - "EMD": calculate_emd_native, - "Pearson": calculate_pearson_native, - "MTD": calculate_mtd_native, - "y_hat_bar": calculate_mean_prediction_native, - "MCR_point": calculate_mcr_native, - "SD": calculate_sd_native, - "pEMDiv": calculate_pEMDiv_native, - "Variogram": calculate_variogram_native, -} - - - - -REGRESSION_SAMPLE_NATIVE = { - "CRPS": calculate_crps_native, - "twCRPS": calculate_twcrps_native, - "MIS": calculate_mean_interval_score_native, - "QIS": calculate_quantile_interval_score_native, - "Coverage": calculate_coverage_native, - "Ignorance": calculate_ignorance_score_native, - "y_hat_bar": calculate_mean_prediction_native, - "MCR_sample": calculate_mcr_native, -} - -CLASSIFICATION_POINT_NATIVE = { - "AP": calculate_ap_native, -} - -CLASSIFICATION_SAMPLE_NATIVE = { - "CRPS": calculate_crps_native, - "twCRPS": calculate_twcrps_native, - "Brier": calculate_brier_native, - "Jeffreys": calculate_jeffreys_native, -} + raise ValueError( + "Metric 'Jeffreys' is defined but not yet implemented." + " Remove it from your config." 
+ ) + diff --git a/views_evaluation/profiles/base.py b/views_evaluation/profiles/base.py index 450200e..652367b 100644 --- a/views_evaluation/profiles/base.py +++ b/views_evaluation/profiles/base.py @@ -21,11 +21,15 @@ """ BASE_PROFILE = { - "MTD": {"power": 1.5}, - "twCRPS": {"threshold": 0.0}, - "MIS": {"alpha": 0.05}, - "QIS": {"lower_quantile": 0.025, "upper_quantile": 0.975}, - "Coverage": {"alpha": 0.1}, + "MTD": {"power": 1.5}, + "twCRPS": {"threshold": 0.0}, + "MIS": {"alpha": 0.05}, + "QIS": {"lower_quantile": 0.025, "upper_quantile": 0.975}, + "QS_sample": {"quantile": 0.99}, + "QS_point": {"quantile": 0.99}, + "Brier_sample": {"threshold": 1.0}, + "Brier_point": {"threshold": 1.0}, + "Coverage": {"alpha": 0.1}, "Ignorance": { "bins": [0, 0.5, 2.5, 5.5, 10.5, 25.5, 50.5, 100.5, 250.5, 500.5, 1000.5], "low_bin": 0,