From 1c970fe94c9ad2468d1168030838ed68c0f6f065 Mon Sep 17 00:00:00 2001 From: Polichinl Date: Thu, 9 Apr 2026 08:34:28 +0200 Subject: [PATCH 1/2] =?UTF-8?q?feat:=20explicit=20Brier=20score=20variants?= =?UTF-8?q?=20for=20the=202=C3=972=20evaluation=20matrix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace Brier_sample/Brier_point with three task-explicit variants: - Brier_cls_point: classification point (y_pred is probability) - Brier_cls_sample: classification sample (average MC Dropout probabilities) - Brier_rgs_sample: regression sample (binarise count samples at threshold) Brier_rgs_point intentionally omitted — regression point estimates are not probabilities. The _cls_/_rgs_ infix makes the task context self-documenting. Critical fix: Brier_cls_sample uses mean(y_pred) instead of mean(y_pred > threshold), which was broken for probability samples (all probabilities > 0 → p_hat ≈ 1.0). Profile defaults: classification Brier threshold=0.0 (PDF: event y > 0), regression Brier threshold=1.0 (binarise at 1 fatality). Catalog size 24 → 25. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- documentation/CICs/MetricCatalog.md | 6 +- tests/test_metric_calculators.py | 106 ++++++++++++------ tests/test_metric_catalog.py | 2 +- tests/test_native_evaluator.py | 14 +-- views_evaluation/evaluation/metric_catalog.py | 18 +-- views_evaluation/evaluation/metrics.py | 15 +-- .../evaluation/native_metric_calculators.py | 90 ++++++++++----- views_evaluation/profiles/base.py | 5 +- views_evaluation/profiles/hydranet_ucdp.py | 7 +- 9 files changed, 165 insertions(+), 98 deletions(-) diff --git a/documentation/CICs/MetricCatalog.md b/documentation/CICs/MetricCatalog.md index 25b4d6c..ba529f8 100644 --- a/documentation/CICs/MetricCatalog.md +++ b/documentation/CICs/MetricCatalog.md @@ -113,6 +113,7 @@ params = resolve_metric_params("MSE", {}, BASE_PROFILE) - **Red:** `tests/test_metric_catalog.py` — unknown metrics, unimplemented metrics, missing params, None values, unknown overrides. - **Red (bounds):** `tests/test_metric_catalog.py::TestResolveMetricParamsBoundsRed` — 7 tests for out-of-range alpha/quantile and crossed QIS quantiles. - **Correctness:** `tests/test_metric_calculators.py::TestGoldenValues` — 17 golden-value tests for all implemented metrics. +- **Correctness (Brier/QS):** `tests/test_metric_calculators.py::TestBrierScore` + `TestQuantileScore` — 10 golden-value tests for the 3 Brier variants and 2 QS variants. --- @@ -122,14 +123,15 @@ params = resolve_metric_params("MSE", {}, BASE_PROFILE) - The legacy dispatch dicts were removed in Phase 3. `METRIC_MEMBERSHIP` is the single source of truth. - Profile structure is stable; new profiles are added by creating a new file in `profiles/`. - Bounds validation added for probability/proportion parameters (2026-04-04, C-18): `alpha`, `quantile`, `lower_quantile`, `upper_quantile` must be in (0, 1). Cross-parameter validation for QIS quantile ordering. 
+- Explicit Brier variants added (2026-04-09): `Brier_sample`/`Brier_point` replaced by three task-explicit variants: `Brier_cls_point`, `Brier_cls_sample`, `Brier_rgs_sample`. The `_cls_`/`_rgs_` infix denotes the task context (classification vs. regression). `Brier_rgs_point` is intentionally omitted — a regression point estimate is not a probability. `Brier_cls_sample` averages probability samples (`mean(y_pred)`); `Brier_rgs_sample` binarises count samples (`mean(y_pred > threshold)`). Catalog size: 24 → 25. --- ## 12. Known Deviations - **No profile completeness validation:** There is no mechanism to verify that a profile provides values for all metrics with non-empty genomes. A profile missing a metric's params will only fail at evaluation time, not at profile registration. -- **Golden-value coverage complete:** 17 tests in `tests/test_metric_calculators.py::TestGoldenValues` plus 8 Brier/QS golden-value tests cover all implemented metrics (C-07 closed 2026-04-02). -- **Breaking rename:** The legacy `Brier` metric (unimplemented placeholder) was replaced by `Brier_sample` and `Brier_point` (implemented). The field in `ClassificationSampleEvaluationMetrics` was renamed from `Brier` to `Brier_sample`. External consumers accessing `.Brier` on classification sample results must update to `.Brier_sample`. +- **Golden-value coverage complete:** 17 tests in `tests/test_metric_calculators.py::TestGoldenValues` plus 10 Brier/QS golden-value tests cover all implemented metrics (C-07 closed 2026-04-02). +- **Breaking rename (2026-04-09):** `Brier_sample` and `Brier_point` were replaced by `Brier_cls_point`, `Brier_cls_sample`, and `Brier_rgs_sample`. Dataclass fields in `ClassificationPointEvaluationMetrics`, `ClassificationSampleEvaluationMetrics`, and `RegressionSampleEvaluationMetrics` were renamed/added accordingly. External consumers accessing `.Brier_sample` or `.Brier_point` must update. 
--- diff --git a/tests/test_metric_calculators.py b/tests/test_metric_calculators.py index f411968..7f1528f 100644 --- a/tests/test_metric_calculators.py +++ b/tests/test_metric_calculators.py @@ -15,8 +15,9 @@ calculate_mean_interval_score_native, calculate_mtd_native, calculate_mcr_native, - calculate_brier_sample_native, - calculate_brier_point_native, + calculate_brier_cls_point_native, + calculate_brier_cls_sample_native, + calculate_brier_rgs_sample_native, calculate_qs_sample_native, calculate_qs_point_native, ) @@ -145,14 +146,14 @@ def test_metric_membership_classification_point(): """METRIC_MEMBERSHIP contains expected classification point metrics.""" members = METRIC_MEMBERSHIP[("classification", "point")] assert "AP" in members - assert "Brier_point" in members + assert "Brier_cls_point" in members assert "RMSLE" not in members def test_metric_membership_classification_sample(): """METRIC_MEMBERSHIP contains expected classification sample metrics.""" members = METRIC_MEMBERSHIP[("classification", "sample")] - for m in ["CRPS", "twCRPS", "Brier_sample", "Jeffreys"]: + for m in ["CRPS", "twCRPS", "Brier_cls_sample", "Jeffreys"]: assert m in members assert "RMSLE" not in members @@ -579,38 +580,55 @@ def test_qis_symmetric_equals_mis(self): class TestBrierScore: - def test_brier_sample_golden_value(self): - """Hand-computed Brier sample: threshold=1, mixed binary outcomes.""" + def test_brier_rgs_sample_golden_value(self): + """Hand-computed Brier rgs_sample: threshold=1, mixed binary outcomes.""" y_true = np.array([0.0, 2.0, 5.0]) y_pred = np.array([[0.5, 1.5], [0.5, 1.5], [4.0, 6.0]]) # y_binary = [0, 1, 1] (0 < 1, 2 > 1, 5 > 1) # p_hat = [0.5, 0.5, 1.0] (fraction of ensemble > threshold) # Brier = mean([(0.5-0)^2, (0.5-1)^2, (1.0-1)^2]) = mean([0.25, 0.25, 0]) = 1/6 - result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=1.0) assert result == pytest.approx(1.0 / 6.0, 
abs=1e-10) - def test_brier_point_golden_value(self): - """Hand-computed Brier point: threshold=1, probabilities vs binary outcomes.""" + def test_brier_cls_point_golden_value(self): + """Hand-computed Brier cls_point: threshold=1, probabilities vs binary outcomes.""" y_true = np.array([0.0, 2.0, 5.0]) y_pred = np.array([[0.1], [0.7], [0.9]]) # y_binary = [0, 1, 1] # p_hat = [0.1, 0.7, 0.9] (point prediction as probability) # Brier = mean([(0.1-0)^2, (0.7-1)^2, (0.9-1)^2]) = mean([0.01, 0.09, 0.01]) = 11/300 - result = calculate_brier_point_native(y_true, y_pred, threshold=1.0) + result = calculate_brier_cls_point_native(y_true, y_pred, threshold=1.0) assert result == pytest.approx(11.0 / 300.0, abs=1e-10) - def test_brier_sample_perfect(self): + def test_brier_cls_sample_golden_value(self): + """Hand-computed Brier cls_sample: average probability samples, threshold=0.""" + y_true = np.array([1.0, 0.0]) + y_pred = np.array([[0.9, 0.8], [0.1, 0.2]]) + # y_binary = [1, 0] (1 > 0, 0 not > 0) + # p_hat = [mean(0.9, 0.8), mean(0.1, 0.2)] = [0.85, 0.15] + # Brier = mean([(0.85-1)^2, (0.15-0)^2]) = mean([0.0225, 0.0225]) = 0.0225 + result = calculate_brier_cls_sample_native(y_true, y_pred, threshold=0.0) + assert result == pytest.approx(0.0225, abs=1e-10) + + def test_brier_rgs_sample_perfect(self): """All above threshold, all ensemble members above → p_hat=1, y_binary=1, Brier=0.""" y_true = np.array([5.0, 10.0]) y_pred = np.array([[2.0, 3.0], [2.0, 3.0]]) - result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=1.0) assert result == pytest.approx(0.0, abs=1e-10) - def test_brier_point_perfect(self): + def test_brier_cls_point_perfect(self): """p_hat matches y_binary exactly → Brier=0.""" y_true = np.array([0.0, 2.0]) # binary=[0, 1] at threshold=1 y_pred = np.array([[0.0], [1.0]]) # perfect probability predictions - result = calculate_brier_point_native(y_true, y_pred, threshold=1.0) + 
result = calculate_brier_cls_point_native(y_true, y_pred, threshold=1.0) + assert result == pytest.approx(0.0, abs=1e-10) + + def test_brier_cls_sample_perfect(self): + """Perfect probability samples → Brier=0.""" + y_true = np.array([1.0, 0.0]) + y_pred = np.array([[1.0, 1.0], [0.0, 0.0]]) + result = calculate_brier_cls_sample_native(y_true, y_pred, threshold=0.0) assert result == pytest.approx(0.0, abs=1e-10) @@ -757,40 +775,49 @@ def test_large_alpha(self): class TestBrierScoreBeige: - def test_single_observation(self): - """Brier handles N=1, S=1 without error.""" - result = calculate_brier_sample_native(np.array([2.0]), np.array([[3.0]]), threshold=1.0) + def test_rgs_single_observation(self): + """Brier rgs_sample handles N=1, S=1 without error.""" + result = calculate_brier_rgs_sample_native(np.array([2.0]), np.array([[3.0]]), threshold=1.0) assert np.isfinite(result) - def test_large_ensemble_stable(self): - """Brier is stable with S=1000 samples.""" + def test_rgs_large_ensemble_stable(self): + """Brier rgs_sample is stable with S=1000 samples.""" rng = np.random.default_rng(42) y_true = np.array([0.0, 5.0, 10.0]) y_pred = rng.normal(loc=y_true[:, None], scale=2.0, size=(3, 1000)) - result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=1.0) assert np.isfinite(result) assert 0 <= result <= 1 # Brier is bounded [0, 1] - def test_threshold_at_exact_data_value(self): + def test_rgs_threshold_at_exact_data_value(self): """Threshold equals an observation — no crash.""" y_true = np.array([5.0, 5.0]) y_pred = np.array([[4.0, 6.0], [4.0, 6.0]]) - result = calculate_brier_sample_native(y_true, y_pred, threshold=5.0) + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=5.0) assert np.isfinite(result) - def test_all_above_threshold(self): + def test_rgs_all_above_threshold(self): """All y_true above threshold — y_binary all 1, finite result.""" y_true = 
np.array([10.0, 20.0]) y_pred = np.array([[0.5, 1.5], [0.5, 1.5]]) - result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=1.0) assert np.isfinite(result) - def test_all_below_threshold(self): + def test_rgs_all_below_threshold(self): """All y_true below threshold — y_binary all 0, finite result.""" y_true = np.array([0.0, 0.5]) y_pred = np.array([[0.5, 1.5], [0.5, 1.5]]) - result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=1.0) + assert np.isfinite(result) + + def test_cls_sample_large_ensemble_stable(self): + """Brier cls_sample is stable with S=1000 probability samples.""" + rng = np.random.default_rng(42) + y_true = np.array([1.0, 0.0, 1.0]) + y_pred = rng.beta(a=2, b=2, size=(3, 1000)) # probabilities in [0, 1] + result = calculate_brier_cls_sample_native(y_true, y_pred, threshold=0.0) assert np.isfinite(result) + assert 0 <= result <= 1 class TestQuantileScoreBeige: @@ -974,7 +1001,7 @@ def test_negative_y_true_valid(self): class TestBrierScoreRed: - def test_nan_in_y_true_swallowed_by_comparison(self): + def test_rgs_nan_in_y_true_swallowed_by_comparison(self): """NaN in y_true is swallowed by '>' comparison (NaN > x → False). 
Unlike arithmetic metrics, Brier's binarization step converts NaN to @@ -984,24 +1011,31 @@ def test_nan_in_y_true_swallowed_by_comparison(self): """ y_true = np.array([np.nan, 1.0]) y_pred = np.array([[1.0], [1.0]]) - result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) - # NaN is treated as below-threshold (False), so result is finite, not NaN + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=1.0) assert np.isfinite(result) - def test_nan_in_y_pred_swallowed_by_comparison(self): + def test_rgs_nan_in_y_pred_swallowed_by_comparison(self): """NaN in y_pred is swallowed by '>' comparison in p_hat computation.""" y_true = np.array([1.0, 1.0]) y_pred = np.array([[np.nan], [1.0]]) - result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=1.0) assert np.isfinite(result) - def test_negative_threshold_accepted(self): - """Negative threshold is mathematically valid.""" + def test_rgs_negative_threshold_accepted(self): + """Negative threshold is mathematically valid for regression Brier.""" y_true = np.array([1.0, 2.0]) y_pred = np.array([[1.0, 2.0], [2.0, 3.0]]) - result = calculate_brier_sample_native(y_true, y_pred, threshold=-5.0) + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=-5.0) assert np.isfinite(result) + def test_cls_sample_nan_in_y_pred_propagates(self): + """NaN in probability samples propagates via mean() — not swallowed.""" + y_true = np.array([1.0, 1.0]) + y_pred = np.array([[np.nan], [0.5]]) + result = calculate_brier_cls_sample_native(y_true, y_pred, threshold=0.0) + # mean([nan]) = nan, (nan - 1)^2 = nan → result is nan + assert np.isnan(result) + class TestQuantileScoreRed: @@ -1040,11 +1074,11 @@ def test_crps_large_ensemble_values(self): assert np.isfinite(result) assert result >= 0 - def test_brier_extreme_threshold(self): + def test_brier_rgs_extreme_threshold(self): """Threshold at 1e300: all values below 
→ y_binary all 0, p_hat all 0, Brier = 0.""" y_true = np.array([1.0, 2.0]) y_pred = np.array([[0.5, 1.5], [0.5, 1.5]]) - result = calculate_brier_sample_native(y_true, y_pred, threshold=1e300) + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=1e300) assert result == pytest.approx(0.0, abs=1e-10) def test_coverage_tiny_ensemble_spread(self): diff --git a/tests/test_metric_catalog.py b/tests/test_metric_catalog.py index e1bec4e..bf42f88 100644 --- a/tests/test_metric_catalog.py +++ b/tests/test_metric_catalog.py @@ -386,7 +386,7 @@ def test_evaluator_rejects_unknown_profile(self): def test_registry_snapshot_integrity(self): """Registries have expected sizes — catches accidental mutation or deletion.""" - assert len(METRIC_CATALOG) == 24 + assert len(METRIC_CATALOG) == 25 assert len(METRIC_MEMBERSHIP) == 4 assert len(PROFILES) >= 2 diff --git a/tests/test_native_evaluator.py b/tests/test_native_evaluator.py index b8fb3e8..04d72a2 100644 --- a/tests/test_native_evaluator.py +++ b/tests/test_native_evaluator.py @@ -289,11 +289,11 @@ def test_classification_target(self): assert 'month100' in report.to_dict()['schemas']['month'] def test_classification_sample_brier(self): - """Brier_sample and CRPS work for classification sample predictions.""" + """Brier_cls_sample and CRPS work for classification sample predictions.""" n = 6 ef = EvaluationFrame( y_true=np.array([0.0, 1.0, 0.0, 1.0, 0.0, 1.0]), - y_pred=np.random.default_rng(42).uniform(0, 2, size=(n, 20)), + y_pred=np.random.default_rng(42).uniform(0, 1, size=(n, 20)), # probability samples in [0,1] identifiers={ 'time': np.array([100, 100, 101, 101, 102, 102]), 'unit': np.array([1, 2, 1, 2, 1, 2]), @@ -305,17 +305,17 @@ def test_classification_sample_brier(self): config = { 'steps': [1, 2, 3], 'classification_targets': ['by_sb_best'], - 'classification_sample_metrics': ['Brier_sample', 'CRPS'], + 'classification_sample_metrics': ['Brier_cls_sample', 'CRPS'], } report = 
NativeEvaluator(config).evaluate(ef) assert report.task == 'classification' assert report.pred_type == 'sample' d = report.to_dict()['schemas'] - assert 'Brier_sample' in d['month']['month100'] + assert 'Brier_cls_sample' in d['month']['month100'] assert 'CRPS' in d['month']['month100'] def test_classification_point_brier(self): - """AP and Brier_point work together for classification point predictions.""" + """AP and Brier_cls_point work together for classification point predictions.""" n = 6 ef = EvaluationFrame( y_true=np.array([0.0, 1.0, 0.0, 1.0, 0.0, 1.0]), @@ -331,12 +331,12 @@ def test_classification_point_brier(self): config = { 'steps': [1, 2, 3], 'classification_targets': ['by_sb_best'], - 'classification_point_metrics': ['AP', 'Brier_point'], + 'classification_point_metrics': ['AP', 'Brier_cls_point'], } report = NativeEvaluator(config).evaluate(ef) d = report.to_dict()['schemas'] assert 'AP' in d['step']['step01'] - assert 'Brier_point' in d['step']['step01'] + assert 'Brier_cls_point' in d['step']['step01'] def test_evaluate_twice_produces_identical_results(self): """NativeEvaluator is stateless — same input yields same output.""" diff --git a/views_evaluation/evaluation/metric_catalog.py b/views_evaluation/evaluation/metric_catalog.py index ed6f50a..adaca24 100644 --- a/views_evaluation/evaluation/metric_catalog.py +++ b/views_evaluation/evaluation/metric_catalog.py @@ -31,8 +31,9 @@ calculate_coverage_native, calculate_mean_interval_score_native, calculate_ignorance_score_native, - calculate_brier_sample_native, - calculate_brier_point_native, + calculate_brier_cls_point_native, + calculate_brier_cls_sample_native, + calculate_brier_rgs_sample_native, calculate_qs_sample_native, calculate_qs_point_native, calculate_sd_native, @@ -91,9 +92,10 @@ class MetricSpec: "QS_sample": MetricSpec(function=calculate_qs_sample_native, genome=("quantile",)), "QS_point": MetricSpec(function=calculate_qs_point_native, genome=("quantile",)), - # ── Brier Score 
─────────────────────────────────────────────────────── - "Brier_sample": MetricSpec(function=calculate_brier_sample_native, genome=("threshold",)), - "Brier_point": MetricSpec(function=calculate_brier_point_native, genome=("threshold",)), + # ── Brier Score (3 explicit variants for the 2×2 matrix) ──────────── + "Brier_cls_point": MetricSpec(function=calculate_brier_cls_point_native, genome=("threshold",)), + "Brier_cls_sample": MetricSpec(function=calculate_brier_cls_sample_native, genome=("threshold",)), + "Brier_rgs_sample": MetricSpec(function=calculate_brier_rgs_sample_native, genome=("threshold",)), # ── Classification ──────────────────────────────────────────────────── "AP": MetricSpec(function=calculate_ap_native, genome=()), @@ -107,9 +109,9 @@ class MetricSpec: "SD", "pEMDiv", "Variogram"}, ("regression", "sample"): {"CRPS", "twCRPS", "MIS", "QIS", "QS_sample", "Coverage", "Ignorance", - "y_hat_bar", "MCR_sample"}, - ("classification", "point"): {"AP", "Brier_point"}, - ("classification", "sample"): {"CRPS", "twCRPS", "Brier_sample", "Jeffreys"}, + "y_hat_bar", "MCR_sample", "Brier_rgs_sample"}, + ("classification", "point"): {"AP", "Brier_cls_point"}, + ("classification", "sample"): {"CRPS", "twCRPS", "Brier_cls_sample", "Jeffreys"}, } diff --git a/views_evaluation/evaluation/metrics.py b/views_evaluation/evaluation/metrics.py index c3491f5..eaa80b5 100644 --- a/views_evaluation/evaluation/metrics.py +++ b/views_evaluation/evaluation/metrics.py @@ -131,20 +131,21 @@ class RegressionSampleEvaluationMetrics(BaseEvaluationMetrics): Coverage: Optional[float] = None Ignorance: Optional[float] = None y_hat_bar: Optional[float] = None - MCR_sample: Optional[float] = None + MCR_sample: Optional[float] = None + Brier_rgs_sample: Optional[float] = None @dataclass class ClassificationPointEvaluationMetrics(BaseEvaluationMetrics): """Metrics for classification targets evaluated with point (probability) predictions.""" - AP: Optional[float] = None - Brier_point: 
Optional[float] = None + AP: Optional[float] = None + Brier_cls_point: Optional[float] = None @dataclass class ClassificationSampleEvaluationMetrics(BaseEvaluationMetrics): """Metrics for classification targets evaluated with sample-based predictions.""" - CRPS: Optional[float] = None - twCRPS: Optional[float] = None - Brier_sample: Optional[float] = None - Jeffreys: Optional[float] = None + CRPS: Optional[float] = None + twCRPS: Optional[float] = None + Brier_cls_sample: Optional[float] = None + Jeffreys: Optional[float] = None diff --git a/views_evaluation/evaluation/native_metric_calculators.py b/views_evaluation/evaluation/native_metric_calculators.py index 6c80707..3757507 100644 --- a/views_evaluation/evaluation/native_metric_calculators.py +++ b/views_evaluation/evaluation/native_metric_calculators.py @@ -218,8 +218,44 @@ def calculate_quantile_interval_score_native( # ── Brier Score ─────────────────────────────────────────────────────────────── +# +# Three explicit variants for the 2×2 evaluation matrix: +# Brier_cls_point — classification point: y_pred is a probability +# Brier_cls_sample — classification sample: y_pred are probability samples (MC Dropout) +# Brier_rgs_sample — regression sample: y_pred are count/magnitude samples +# +# Brier_rgs_point is intentionally omitted: a regression point estimate +# is not a probability, so calling the result a Brier score is misleading. + +def calculate_brier_cls_point_native( + y_true: np.ndarray, + y_pred: np.ndarray, + target=None, + *, + threshold: float, + **kwargs, +) -> float: + """ + Brier Score for classification point (probability) predictions. -def calculate_brier_sample_native( + Binarises truth at the threshold, uses the point prediction + directly as the predicted probability. + + Brier = mean((y_pred - y_binary)^2) + + y_pred values should be in [0, 1] for meaningful results. + For point predictions, y_pred is (N, 1) after _guard_shapes. 
+ + Args: + threshold: Onset threshold for binarising y_true. + """ + y_true, y_pred = _guard_shapes(y_true, y_pred) + y_binary = (y_true > threshold).astype(float) + p_hat = y_pred[:, 0] + return float(np.mean((p_hat - y_binary) ** 2)) + + +def calculate_brier_cls_sample_native( y_true: np.ndarray, y_pred: np.ndarray, target=None, @@ -228,33 +264,30 @@ def calculate_brier_sample_native( **kwargs, ) -> float: """ - Brier Score for sample-based predictions binarized at a threshold. + Brier Score for classification probability samples (e.g. MC Dropout). - Binarises truth at the threshold, computes event probability from - the fraction of ensemble members exceeding the threshold, then - returns the mean squared error between predicted probability and - binary outcome. + Each sample in y_pred is a probability in [0, 1]. The posterior mean + probability is used as the point estimate: - Brier = mean((p_hat - y_binary)^2) + Brier = mean((mean(y_pred, axis=1) - y_binary)^2) - where p_hat = mean(y_pred > threshold, axis=1) and - y_binary = (y_true > threshold). + where y_binary = (y_true > threshold). - Note: NaN values in y_true or y_pred are silently converted to - below-threshold (False) by NumPy comparison semantics. Callers - must validate inputs via EvaluationFrame. + This is the correct formulation for probability samples — averaging + probabilities preserves calibration information. Binarising probability + samples at a threshold (as Brier_rgs_sample does for count data) would + destroy discrimination. Args: - threshold: Onset threshold for binarisation. Must be provided - explicitly via evaluation profile or model config. + threshold: Onset threshold for binarising y_true. 
""" y_true, y_pred = _guard_shapes(y_true, y_pred) y_binary = (y_true > threshold).astype(float) - p_hat = np.mean(y_pred > threshold, axis=1) + p_hat = np.mean(y_pred, axis=1) return float(np.mean((p_hat - y_binary) ** 2)) -def calculate_brier_point_native( +def calculate_brier_rgs_sample_native( y_true: np.ndarray, y_pred: np.ndarray, target=None, @@ -263,28 +296,25 @@ def calculate_brier_point_native( **kwargs, ) -> float: """ - Brier Score for point (probability) predictions binarized at a threshold. + Brier Score for regression (count/magnitude) samples. - Binarises truth at the threshold, uses the point prediction - directly as the predicted probability. y_pred values should be - in [0, 1] for meaningful results; values outside this range - produce a mathematically valid but semantically misleading score. - - Brier = mean((y_pred - y_binary)^2) + Binarises both truth and each sample at the threshold, then + estimates the event probability from the fraction of ensemble + members exceeding the threshold. - For point predictions, y_pred is (N, 1) after _guard_shapes. - The single column is the predicted probability. + Brier = mean((p_hat - y_binary)^2) - Note: NaN values in y_true or y_pred are silently converted to - below-threshold (False) by NumPy comparison semantics. Callers - must validate inputs via EvaluationFrame. + where p_hat = mean(y_pred > threshold, axis=1) and + y_binary = (y_true > threshold). Args: - threshold: Onset threshold for binarisation. + threshold: Onset threshold for binarisation of both y_true + and y_pred. Must be provided explicitly via + evaluation profile or model config. 
""" y_true, y_pred = _guard_shapes(y_true, y_pred) y_binary = (y_true > threshold).astype(float) - p_hat = y_pred[:, 0] # Point prediction: single column + p_hat = np.mean(y_pred > threshold, axis=1) return float(np.mean((p_hat - y_binary) ** 2)) diff --git a/views_evaluation/profiles/base.py b/views_evaluation/profiles/base.py index 652367b..f858d54 100644 --- a/views_evaluation/profiles/base.py +++ b/views_evaluation/profiles/base.py @@ -27,8 +27,9 @@ "QIS": {"lower_quantile": 0.025, "upper_quantile": 0.975}, "QS_sample": {"quantile": 0.99}, "QS_point": {"quantile": 0.99}, - "Brier_sample": {"threshold": 1.0}, - "Brier_point": {"threshold": 1.0}, + "Brier_cls_point": {"threshold": 0.0}, + "Brier_cls_sample": {"threshold": 0.0}, + "Brier_rgs_sample": {"threshold": 1.0}, "Coverage": {"alpha": 0.1}, "Ignorance": { "bins": [0, 0.5, 2.5, 5.5, 10.5, 25.5, 50.5, 100.5, 250.5, 500.5, 1000.5], diff --git a/views_evaluation/profiles/hydranet_ucdp.py b/views_evaluation/profiles/hydranet_ucdp.py index 42464fe..07abe36 100644 --- a/views_evaluation/profiles/hydranet_ucdp.py +++ b/views_evaluation/profiles/hydranet_ucdp.py @@ -1,11 +1,8 @@ """ Evaluation profile for HydraNet models on UCDP targets. -Covers probabilistic evaluation metrics that require hyperparameters: -twCRPS, MIS, QIS. All other metrics (CRPS, AP, Brier, etc.) have -empty genomes and need no profile entries. - -Values are placeholders inheriting from base — adjust per domain needs. +Inherits all genome values from BASE_PROFILE (including Brier thresholds, +QS quantile, Coverage alpha, etc.) and overrides where domain needs differ. 
""" from views_evaluation.profiles.base import BASE_PROFILE From 8e0811056cb37c1144a95298fc0195f7db182562 Mon Sep 17 00:00:00 2001 From: Polichinl Date: Thu, 9 Apr 2026 09:28:31 +0200 Subject: [PATCH 2/2] fix: set Brier threshold defaults to 0.0 (hurdle event y > 0) All three Brier variants now default to threshold=0.0 in the base profile, matching the Pre-Release Note 05 definition: Brier evaluates the binary event "any fatality occurred" (y > 0). On integer-valued UCDP data, y > 0 and y >= 1 are equivalent. NOTE(review): for Brier_rgs_sample the threshold also binarises the prediction samples (p_hat = mean(y_pred > threshold)), so the y > 0 / y >= 1 equivalence holds only if the samples are integer-valued as well. Continuous strictly-positive samples at threshold=0.0 would give p_hat ≈ 1 for every unit — the same degeneracy PATCH 1/2 fixed for probability samples. Confirm the regression sample space is integer counts before relying on this default. Co-Authored-By: Claude Opus 4.6 (1M context) --- views_evaluation/profiles/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/views_evaluation/profiles/base.py b/views_evaluation/profiles/base.py index f858d54..12dca09 100644 --- a/views_evaluation/profiles/base.py +++ b/views_evaluation/profiles/base.py @@ -27,9 +27,9 @@ "QIS": {"lower_quantile": 0.025, "upper_quantile": 0.975}, "QS_sample": {"quantile": 0.99}, "QS_point": {"quantile": 0.99}, - "Brier_cls_point": {"threshold": 0.0}, - "Brier_cls_sample": {"threshold": 0.0}, - "Brier_rgs_sample": {"threshold": 1.0}, + "Brier_cls_point": {"threshold": 0.0}, # hurdle event: any fatality (y > 0) + "Brier_cls_sample": {"threshold": 0.0}, # hurdle event: any fatality (y > 0) + "Brier_rgs_sample": {"threshold": 0.0}, # hurdle event: any fatality (y > 0) "Coverage": {"alpha": 0.1}, "Ignorance": { "bins": [0, 0.5, 2.5, 5.5, 10.5, 25.5, 50.5, 100.5, 250.5, 500.5, 1000.5],