From 1c970fe94c9ad2468d1168030838ed68c0f6f065 Mon Sep 17 00:00:00 2001 From: Polichinl Date: Thu, 9 Apr 2026 08:34:28 +0200 Subject: [PATCH 1/2] =?UTF-8?q?feat:=20explicit=20Brier=20score=20variants?= =?UTF-8?q?=20for=20the=202=C3=972=20evaluation=20matrix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace Brier_sample/Brier_point with three task-explicit variants: - Brier_cls_point: classification point (y_pred is probability) - Brier_cls_sample: classification sample (average MC Dropout probabilities) - Brier_rgs_sample: regression sample (binarise count samples at threshold) Brier_rgs_point intentionally omitted — regression point estimates are not probabilities. The _cls_/_rgs_ infix makes the task context self-documenting. Critical fix: Brier_cls_sample uses mean(y_pred) instead of mean(y_pred > threshold), which was broken for probability samples (all probabilities > 0 → p_hat ≈ 1.0). Profile defaults: classification Brier threshold=0.0 (PDF: event y > 0), regression Brier threshold=1.0 (binarise at 1 fatality). Catalog size 24 → 25. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- documentation/CICs/MetricCatalog.md | 6 +- tests/test_metric_calculators.py | 106 ++++++++++++------ tests/test_metric_catalog.py | 2 +- tests/test_native_evaluator.py | 14 +-- views_evaluation/evaluation/metric_catalog.py | 18 +-- views_evaluation/evaluation/metrics.py | 15 +-- .../evaluation/native_metric_calculators.py | 90 ++++++++++----- views_evaluation/profiles/base.py | 5 +- views_evaluation/profiles/hydranet_ucdp.py | 7 +- 9 files changed, 165 insertions(+), 98 deletions(-) diff --git a/documentation/CICs/MetricCatalog.md b/documentation/CICs/MetricCatalog.md index 25b4d6c..ba529f8 100644 --- a/documentation/CICs/MetricCatalog.md +++ b/documentation/CICs/MetricCatalog.md @@ -113,6 +113,7 @@ params = resolve_metric_params("MSE", {}, BASE_PROFILE) - **Red:** `tests/test_metric_catalog.py` — unknown metrics, unimplemented metrics, missing params, None values, unknown overrides. - **Red (bounds):** `tests/test_metric_catalog.py::TestResolveMetricParamsBoundsRed` — 7 tests for out-of-range alpha/quantile and crossed QIS quantiles. - **Correctness:** `tests/test_metric_calculators.py::TestGoldenValues` — 17 golden-value tests for all implemented metrics. +- **Correctness (Brier/QS):** `tests/test_metric_calculators.py::TestBrierScore` + `TestQuantileScore` — 10 golden-value tests for the 3 Brier variants and 2 QS variants. --- @@ -122,14 +123,15 @@ params = resolve_metric_params("MSE", {}, BASE_PROFILE) - The legacy dispatch dicts were removed in Phase 3. `METRIC_MEMBERSHIP` is the single source of truth. - Profile structure is stable; new profiles are added by creating a new file in `profiles/`. - Bounds validation added for probability/proportion parameters (2026-04-04, C-18): `alpha`, `quantile`, `lower_quantile`, `upper_quantile` must be in (0, 1). Cross-parameter validation for QIS quantile ordering. 
+- Explicit Brier variants added (2026-04-09): `Brier_sample`/`Brier_point` replaced by three task-explicit variants: `Brier_cls_point`, `Brier_cls_sample`, `Brier_rgs_sample`. The `_cls_`/`_rgs_` infix denotes the task context (classification vs. regression). `Brier_rgs_point` is intentionally omitted — a regression point estimate is not a probability. `Brier_cls_sample` averages probability samples (`mean(y_pred)`); `Brier_rgs_sample` binarises count samples (`mean(y_pred > threshold)`). Catalog size: 24 → 25. --- ## 12. Known Deviations - **No profile completeness validation:** There is no mechanism to verify that a profile provides values for all metrics with non-empty genomes. A profile missing a metric's params will only fail at evaluation time, not at profile registration. -- **Golden-value coverage complete:** 17 tests in `tests/test_metric_calculators.py::TestGoldenValues` plus 8 Brier/QS golden-value tests cover all implemented metrics (C-07 closed 2026-04-02). -- **Breaking rename:** The legacy `Brier` metric (unimplemented placeholder) was replaced by `Brier_sample` and `Brier_point` (implemented). The field in `ClassificationSampleEvaluationMetrics` was renamed from `Brier` to `Brier_sample`. External consumers accessing `.Brier` on classification sample results must update to `.Brier_sample`. +- **Golden-value coverage complete:** 17 tests in `tests/test_metric_calculators.py::TestGoldenValues` plus 10 Brier/QS golden-value tests cover all implemented metrics (C-07 closed 2026-04-02). +- **Breaking rename (2026-04-09):** `Brier_sample` and `Brier_point` were replaced by `Brier_cls_point`, `Brier_cls_sample`, and `Brier_rgs_sample`. Dataclass fields in `ClassificationPointEvaluationMetrics`, `ClassificationSampleEvaluationMetrics`, and `RegressionSampleEvaluationMetrics` were renamed/added accordingly. External consumers accessing `.Brier_sample` or `.Brier_point` must update. 
--- diff --git a/tests/test_metric_calculators.py b/tests/test_metric_calculators.py index f411968..7f1528f 100644 --- a/tests/test_metric_calculators.py +++ b/tests/test_metric_calculators.py @@ -15,8 +15,9 @@ calculate_mean_interval_score_native, calculate_mtd_native, calculate_mcr_native, - calculate_brier_sample_native, - calculate_brier_point_native, + calculate_brier_cls_point_native, + calculate_brier_cls_sample_native, + calculate_brier_rgs_sample_native, calculate_qs_sample_native, calculate_qs_point_native, ) @@ -145,14 +146,14 @@ def test_metric_membership_classification_point(): """METRIC_MEMBERSHIP contains expected classification point metrics.""" members = METRIC_MEMBERSHIP[("classification", "point")] assert "AP" in members - assert "Brier_point" in members + assert "Brier_cls_point" in members assert "RMSLE" not in members def test_metric_membership_classification_sample(): """METRIC_MEMBERSHIP contains expected classification sample metrics.""" members = METRIC_MEMBERSHIP[("classification", "sample")] - for m in ["CRPS", "twCRPS", "Brier_sample", "Jeffreys"]: + for m in ["CRPS", "twCRPS", "Brier_cls_sample", "Jeffreys"]: assert m in members assert "RMSLE" not in members @@ -579,38 +580,55 @@ def test_qis_symmetric_equals_mis(self): class TestBrierScore: - def test_brier_sample_golden_value(self): - """Hand-computed Brier sample: threshold=1, mixed binary outcomes.""" + def test_brier_rgs_sample_golden_value(self): + """Hand-computed Brier rgs_sample: threshold=1, mixed binary outcomes.""" y_true = np.array([0.0, 2.0, 5.0]) y_pred = np.array([[0.5, 1.5], [0.5, 1.5], [4.0, 6.0]]) # y_binary = [0, 1, 1] (0 < 1, 2 > 1, 5 > 1) # p_hat = [0.5, 0.5, 1.0] (fraction of ensemble > threshold) # Brier = mean([(0.5-0)^2, (0.5-1)^2, (1.0-1)^2]) = mean([0.25, 0.25, 0]) = 1/6 - result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=1.0) assert result == pytest.approx(1.0 / 6.0, 
abs=1e-10) - def test_brier_point_golden_value(self): - """Hand-computed Brier point: threshold=1, probabilities vs binary outcomes.""" + def test_brier_cls_point_golden_value(self): + """Hand-computed Brier cls_point: threshold=1, probabilities vs binary outcomes.""" y_true = np.array([0.0, 2.0, 5.0]) y_pred = np.array([[0.1], [0.7], [0.9]]) # y_binary = [0, 1, 1] # p_hat = [0.1, 0.7, 0.9] (point prediction as probability) # Brier = mean([(0.1-0)^2, (0.7-1)^2, (0.9-1)^2]) = mean([0.01, 0.09, 0.01]) = 11/300 - result = calculate_brier_point_native(y_true, y_pred, threshold=1.0) + result = calculate_brier_cls_point_native(y_true, y_pred, threshold=1.0) assert result == pytest.approx(11.0 / 300.0, abs=1e-10) - def test_brier_sample_perfect(self): + def test_brier_cls_sample_golden_value(self): + """Hand-computed Brier cls_sample: average probability samples, threshold=0.""" + y_true = np.array([1.0, 0.0]) + y_pred = np.array([[0.9, 0.8], [0.1, 0.2]]) + # y_binary = [1, 0] (1 > 0, 0 not > 0) + # p_hat = [mean(0.9, 0.8), mean(0.1, 0.2)] = [0.85, 0.15] + # Brier = mean([(0.85-1)^2, (0.15-0)^2]) = mean([0.0225, 0.0225]) = 0.0225 + result = calculate_brier_cls_sample_native(y_true, y_pred, threshold=0.0) + assert result == pytest.approx(0.0225, abs=1e-10) + + def test_brier_rgs_sample_perfect(self): """All above threshold, all ensemble members above → p_hat=1, y_binary=1, Brier=0.""" y_true = np.array([5.0, 10.0]) y_pred = np.array([[2.0, 3.0], [2.0, 3.0]]) - result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=1.0) assert result == pytest.approx(0.0, abs=1e-10) - def test_brier_point_perfect(self): + def test_brier_cls_point_perfect(self): """p_hat matches y_binary exactly → Brier=0.""" y_true = np.array([0.0, 2.0]) # binary=[0, 1] at threshold=1 y_pred = np.array([[0.0], [1.0]]) # perfect probability predictions - result = calculate_brier_point_native(y_true, y_pred, threshold=1.0) + 
result = calculate_brier_cls_point_native(y_true, y_pred, threshold=1.0) + assert result == pytest.approx(0.0, abs=1e-10) + + def test_brier_cls_sample_perfect(self): + """Perfect probability samples → Brier=0.""" + y_true = np.array([1.0, 0.0]) + y_pred = np.array([[1.0, 1.0], [0.0, 0.0]]) + result = calculate_brier_cls_sample_native(y_true, y_pred, threshold=0.0) assert result == pytest.approx(0.0, abs=1e-10) @@ -757,40 +775,49 @@ def test_large_alpha(self): class TestBrierScoreBeige: - def test_single_observation(self): - """Brier handles N=1, S=1 without error.""" - result = calculate_brier_sample_native(np.array([2.0]), np.array([[3.0]]), threshold=1.0) + def test_rgs_single_observation(self): + """Brier rgs_sample handles N=1, S=1 without error.""" + result = calculate_brier_rgs_sample_native(np.array([2.0]), np.array([[3.0]]), threshold=1.0) assert np.isfinite(result) - def test_large_ensemble_stable(self): - """Brier is stable with S=1000 samples.""" + def test_rgs_large_ensemble_stable(self): + """Brier rgs_sample is stable with S=1000 samples.""" rng = np.random.default_rng(42) y_true = np.array([0.0, 5.0, 10.0]) y_pred = rng.normal(loc=y_true[:, None], scale=2.0, size=(3, 1000)) - result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=1.0) assert np.isfinite(result) assert 0 <= result <= 1 # Brier is bounded [0, 1] - def test_threshold_at_exact_data_value(self): + def test_rgs_threshold_at_exact_data_value(self): """Threshold equals an observation — no crash.""" y_true = np.array([5.0, 5.0]) y_pred = np.array([[4.0, 6.0], [4.0, 6.0]]) - result = calculate_brier_sample_native(y_true, y_pred, threshold=5.0) + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=5.0) assert np.isfinite(result) - def test_all_above_threshold(self): + def test_rgs_all_above_threshold(self): """All y_true above threshold — y_binary all 1, finite result.""" y_true = 
np.array([10.0, 20.0]) y_pred = np.array([[0.5, 1.5], [0.5, 1.5]]) - result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=1.0) assert np.isfinite(result) - def test_all_below_threshold(self): + def test_rgs_all_below_threshold(self): """All y_true below threshold — y_binary all 0, finite result.""" y_true = np.array([0.0, 0.5]) y_pred = np.array([[0.5, 1.5], [0.5, 1.5]]) - result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=1.0) + assert np.isfinite(result) + + def test_cls_sample_large_ensemble_stable(self): + """Brier cls_sample is stable with S=1000 probability samples.""" + rng = np.random.default_rng(42) + y_true = np.array([1.0, 0.0, 1.0]) + y_pred = rng.beta(a=2, b=2, size=(3, 1000)) # probabilities in [0, 1] + result = calculate_brier_cls_sample_native(y_true, y_pred, threshold=0.0) assert np.isfinite(result) + assert 0 <= result <= 1 class TestQuantileScoreBeige: @@ -974,7 +1001,7 @@ def test_negative_y_true_valid(self): class TestBrierScoreRed: - def test_nan_in_y_true_swallowed_by_comparison(self): + def test_rgs_nan_in_y_true_swallowed_by_comparison(self): """NaN in y_true is swallowed by '>' comparison (NaN > x → False). 
Unlike arithmetic metrics, Brier's binarization step converts NaN to @@ -984,24 +1011,31 @@ def test_nan_in_y_true_swallowed_by_comparison(self): """ y_true = np.array([np.nan, 1.0]) y_pred = np.array([[1.0], [1.0]]) - result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) - # NaN is treated as below-threshold (False), so result is finite, not NaN + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=1.0) assert np.isfinite(result) - def test_nan_in_y_pred_swallowed_by_comparison(self): + def test_rgs_nan_in_y_pred_swallowed_by_comparison(self): """NaN in y_pred is swallowed by '>' comparison in p_hat computation.""" y_true = np.array([1.0, 1.0]) y_pred = np.array([[np.nan], [1.0]]) - result = calculate_brier_sample_native(y_true, y_pred, threshold=1.0) + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=1.0) assert np.isfinite(result) - def test_negative_threshold_accepted(self): - """Negative threshold is mathematically valid.""" + def test_rgs_negative_threshold_accepted(self): + """Negative threshold is mathematically valid for regression Brier.""" y_true = np.array([1.0, 2.0]) y_pred = np.array([[1.0, 2.0], [2.0, 3.0]]) - result = calculate_brier_sample_native(y_true, y_pred, threshold=-5.0) + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=-5.0) assert np.isfinite(result) + def test_cls_sample_nan_in_y_pred_propagates(self): + """NaN in probability samples propagates via mean() — not swallowed.""" + y_true = np.array([1.0, 1.0]) + y_pred = np.array([[np.nan], [0.5]]) + result = calculate_brier_cls_sample_native(y_true, y_pred, threshold=0.0) + # mean([nan]) = nan, (nan - 1)^2 = nan → result is nan + assert np.isnan(result) + class TestQuantileScoreRed: @@ -1040,11 +1074,11 @@ def test_crps_large_ensemble_values(self): assert np.isfinite(result) assert result >= 0 - def test_brier_extreme_threshold(self): + def test_brier_rgs_extreme_threshold(self): """Threshold at 1e300: all values below 
→ y_binary all 0, p_hat all 0, Brier = 0.""" y_true = np.array([1.0, 2.0]) y_pred = np.array([[0.5, 1.5], [0.5, 1.5]]) - result = calculate_brier_sample_native(y_true, y_pred, threshold=1e300) + result = calculate_brier_rgs_sample_native(y_true, y_pred, threshold=1e300) assert result == pytest.approx(0.0, abs=1e-10) def test_coverage_tiny_ensemble_spread(self): diff --git a/tests/test_metric_catalog.py b/tests/test_metric_catalog.py index e1bec4e..bf42f88 100644 --- a/tests/test_metric_catalog.py +++ b/tests/test_metric_catalog.py @@ -386,7 +386,7 @@ def test_evaluator_rejects_unknown_profile(self): def test_registry_snapshot_integrity(self): """Registries have expected sizes — catches accidental mutation or deletion.""" - assert len(METRIC_CATALOG) == 24 + assert len(METRIC_CATALOG) == 25 assert len(METRIC_MEMBERSHIP) == 4 assert len(PROFILES) >= 2 diff --git a/tests/test_native_evaluator.py b/tests/test_native_evaluator.py index b8fb3e8..04d72a2 100644 --- a/tests/test_native_evaluator.py +++ b/tests/test_native_evaluator.py @@ -289,11 +289,11 @@ def test_classification_target(self): assert 'month100' in report.to_dict()['schemas']['month'] def test_classification_sample_brier(self): - """Brier_sample and CRPS work for classification sample predictions.""" + """Brier_cls_sample and CRPS work for classification sample predictions.""" n = 6 ef = EvaluationFrame( y_true=np.array([0.0, 1.0, 0.0, 1.0, 0.0, 1.0]), - y_pred=np.random.default_rng(42).uniform(0, 2, size=(n, 20)), + y_pred=np.random.default_rng(42).uniform(0, 1, size=(n, 20)), # probability samples in [0,1] identifiers={ 'time': np.array([100, 100, 101, 101, 102, 102]), 'unit': np.array([1, 2, 1, 2, 1, 2]), @@ -305,17 +305,17 @@ def test_classification_sample_brier(self): config = { 'steps': [1, 2, 3], 'classification_targets': ['by_sb_best'], - 'classification_sample_metrics': ['Brier_sample', 'CRPS'], + 'classification_sample_metrics': ['Brier_cls_sample', 'CRPS'], } report = 
NativeEvaluator(config).evaluate(ef) assert report.task == 'classification' assert report.pred_type == 'sample' d = report.to_dict()['schemas'] - assert 'Brier_sample' in d['month']['month100'] + assert 'Brier_cls_sample' in d['month']['month100'] assert 'CRPS' in d['month']['month100'] def test_classification_point_brier(self): - """AP and Brier_point work together for classification point predictions.""" + """AP and Brier_cls_point work together for classification point predictions.""" n = 6 ef = EvaluationFrame( y_true=np.array([0.0, 1.0, 0.0, 1.0, 0.0, 1.0]), @@ -331,12 +331,12 @@ def test_classification_point_brier(self): config = { 'steps': [1, 2, 3], 'classification_targets': ['by_sb_best'], - 'classification_point_metrics': ['AP', 'Brier_point'], + 'classification_point_metrics': ['AP', 'Brier_cls_point'], } report = NativeEvaluator(config).evaluate(ef) d = report.to_dict()['schemas'] assert 'AP' in d['step']['step01'] - assert 'Brier_point' in d['step']['step01'] + assert 'Brier_cls_point' in d['step']['step01'] def test_evaluate_twice_produces_identical_results(self): """NativeEvaluator is stateless — same input yields same output.""" diff --git a/views_evaluation/evaluation/metric_catalog.py b/views_evaluation/evaluation/metric_catalog.py index ed6f50a..adaca24 100644 --- a/views_evaluation/evaluation/metric_catalog.py +++ b/views_evaluation/evaluation/metric_catalog.py @@ -31,8 +31,9 @@ calculate_coverage_native, calculate_mean_interval_score_native, calculate_ignorance_score_native, - calculate_brier_sample_native, - calculate_brier_point_native, + calculate_brier_cls_point_native, + calculate_brier_cls_sample_native, + calculate_brier_rgs_sample_native, calculate_qs_sample_native, calculate_qs_point_native, calculate_sd_native, @@ -91,9 +92,10 @@ class MetricSpec: "QS_sample": MetricSpec(function=calculate_qs_sample_native, genome=("quantile",)), "QS_point": MetricSpec(function=calculate_qs_point_native, genome=("quantile",)), - # ── Brier Score 
─────────────────────────────────────────────────────── - "Brier_sample": MetricSpec(function=calculate_brier_sample_native, genome=("threshold",)), - "Brier_point": MetricSpec(function=calculate_brier_point_native, genome=("threshold",)), + # ── Brier Score (3 explicit variants for the 2×2 matrix) ──────────── + "Brier_cls_point": MetricSpec(function=calculate_brier_cls_point_native, genome=("threshold",)), + "Brier_cls_sample": MetricSpec(function=calculate_brier_cls_sample_native, genome=("threshold",)), + "Brier_rgs_sample": MetricSpec(function=calculate_brier_rgs_sample_native, genome=("threshold",)), # ── Classification ──────────────────────────────────────────────────── "AP": MetricSpec(function=calculate_ap_native, genome=()), @@ -107,9 +109,9 @@ class MetricSpec: "SD", "pEMDiv", "Variogram"}, ("regression", "sample"): {"CRPS", "twCRPS", "MIS", "QIS", "QS_sample", "Coverage", "Ignorance", - "y_hat_bar", "MCR_sample"}, - ("classification", "point"): {"AP", "Brier_point"}, - ("classification", "sample"): {"CRPS", "twCRPS", "Brier_sample", "Jeffreys"}, + "y_hat_bar", "MCR_sample", "Brier_rgs_sample"}, + ("classification", "point"): {"AP", "Brier_cls_point"}, + ("classification", "sample"): {"CRPS", "twCRPS", "Brier_cls_sample", "Jeffreys"}, } diff --git a/views_evaluation/evaluation/metrics.py b/views_evaluation/evaluation/metrics.py index c3491f5..eaa80b5 100644 --- a/views_evaluation/evaluation/metrics.py +++ b/views_evaluation/evaluation/metrics.py @@ -131,20 +131,21 @@ class RegressionSampleEvaluationMetrics(BaseEvaluationMetrics): Coverage: Optional[float] = None Ignorance: Optional[float] = None y_hat_bar: Optional[float] = None - MCR_sample: Optional[float] = None + MCR_sample: Optional[float] = None + Brier_rgs_sample: Optional[float] = None @dataclass class ClassificationPointEvaluationMetrics(BaseEvaluationMetrics): """Metrics for classification targets evaluated with point (probability) predictions.""" - AP: Optional[float] = None - Brier_point: 
Optional[float] = None + AP: Optional[float] = None + Brier_cls_point: Optional[float] = None @dataclass class ClassificationSampleEvaluationMetrics(BaseEvaluationMetrics): """Metrics for classification targets evaluated with sample-based predictions.""" - CRPS: Optional[float] = None - twCRPS: Optional[float] = None - Brier_sample: Optional[float] = None - Jeffreys: Optional[float] = None + CRPS: Optional[float] = None + twCRPS: Optional[float] = None + Brier_cls_sample: Optional[float] = None + Jeffreys: Optional[float] = None diff --git a/views_evaluation/evaluation/native_metric_calculators.py b/views_evaluation/evaluation/native_metric_calculators.py index 6c80707..3757507 100644 --- a/views_evaluation/evaluation/native_metric_calculators.py +++ b/views_evaluation/evaluation/native_metric_calculators.py @@ -218,8 +218,44 @@ def calculate_quantile_interval_score_native( # ── Brier Score ─────────────────────────────────────────────────────────────── +# +# Three explicit variants for the 2×2 evaluation matrix: +# Brier_cls_point — classification point: y_pred is a probability +# Brier_cls_sample — classification sample: y_pred are probability samples (MC Dropout) +# Brier_rgs_sample — regression sample: y_pred are count/magnitude samples +# +# Brier_rgs_point is intentionally omitted: a regression point estimate +# is not a probability, so calling the result a Brier score is misleading. + +def calculate_brier_cls_point_native( + y_true: np.ndarray, + y_pred: np.ndarray, + target=None, + *, + threshold: float, + **kwargs, +) -> float: + """ + Brier Score for classification point (probability) predictions. -def calculate_brier_sample_native( + Binarises truth at the threshold, uses the point prediction + directly as the predicted probability. + + Brier = mean((y_pred - y_binary)^2) + + y_pred values should be in [0, 1] for meaningful results. + For point predictions, y_pred is (N, 1) after _guard_shapes. 
+ + Args: + threshold: Onset threshold for binarising y_true. + """ + y_true, y_pred = _guard_shapes(y_true, y_pred) + y_binary = (y_true > threshold).astype(float) + p_hat = y_pred[:, 0] + return float(np.mean((p_hat - y_binary) ** 2)) + + +def calculate_brier_cls_sample_native( y_true: np.ndarray, y_pred: np.ndarray, target=None, @@ -228,33 +264,30 @@ def calculate_brier_sample_native( **kwargs, ) -> float: """ - Brier Score for sample-based predictions binarized at a threshold. + Brier Score for classification probability samples (e.g. MC Dropout). - Binarises truth at the threshold, computes event probability from - the fraction of ensemble members exceeding the threshold, then - returns the mean squared error between predicted probability and - binary outcome. + Each sample in y_pred is a probability in [0, 1]. The posterior mean + probability is used as the point estimate: - Brier = mean((p_hat - y_binary)^2) + Brier = mean((mean(y_pred, axis=1) - y_binary)^2) - where p_hat = mean(y_pred > threshold, axis=1) and - y_binary = (y_true > threshold). + where y_binary = (y_true > threshold). - Note: NaN values in y_true or y_pred are silently converted to - below-threshold (False) by NumPy comparison semantics. Callers - must validate inputs via EvaluationFrame. + This is the correct formulation for probability samples — averaging + probabilities preserves calibration information. Binarising probability + samples at a threshold (as Brier_rgs_sample does for count data) would + destroy discrimination. Args: - threshold: Onset threshold for binarisation. Must be provided - explicitly via evaluation profile or model config. + threshold: Onset threshold for binarising y_true. 
""" y_true, y_pred = _guard_shapes(y_true, y_pred) y_binary = (y_true > threshold).astype(float) - p_hat = np.mean(y_pred > threshold, axis=1) + p_hat = np.mean(y_pred, axis=1) return float(np.mean((p_hat - y_binary) ** 2)) -def calculate_brier_point_native( +def calculate_brier_rgs_sample_native( y_true: np.ndarray, y_pred: np.ndarray, target=None, @@ -263,28 +296,25 @@ def calculate_brier_point_native( **kwargs, ) -> float: """ - Brier Score for point (probability) predictions binarized at a threshold. + Brier Score for regression (count/magnitude) samples. - Binarises truth at the threshold, uses the point prediction - directly as the predicted probability. y_pred values should be - in [0, 1] for meaningful results; values outside this range - produce a mathematically valid but semantically misleading score. - - Brier = mean((y_pred - y_binary)^2) + Binarises both truth and each sample at the threshold, then + estimates the event probability from the fraction of ensemble + members exceeding the threshold. - For point predictions, y_pred is (N, 1) after _guard_shapes. - The single column is the predicted probability. + Brier = mean((p_hat - y_binary)^2) - Note: NaN values in y_true or y_pred are silently converted to - below-threshold (False) by NumPy comparison semantics. Callers - must validate inputs via EvaluationFrame. + where p_hat = mean(y_pred > threshold, axis=1) and + y_binary = (y_true > threshold). Args: - threshold: Onset threshold for binarisation. + threshold: Onset threshold for binarisation of both y_true + and y_pred. Must be provided explicitly via + evaluation profile or model config. 
""" y_true, y_pred = _guard_shapes(y_true, y_pred) y_binary = (y_true > threshold).astype(float) - p_hat = y_pred[:, 0] # Point prediction: single column + p_hat = np.mean(y_pred > threshold, axis=1) return float(np.mean((p_hat - y_binary) ** 2)) diff --git a/views_evaluation/profiles/base.py b/views_evaluation/profiles/base.py index 652367b..f858d54 100644 --- a/views_evaluation/profiles/base.py +++ b/views_evaluation/profiles/base.py @@ -27,8 +27,9 @@ "QIS": {"lower_quantile": 0.025, "upper_quantile": 0.975}, "QS_sample": {"quantile": 0.99}, "QS_point": {"quantile": 0.99}, - "Brier_sample": {"threshold": 1.0}, - "Brier_point": {"threshold": 1.0}, + "Brier_cls_point": {"threshold": 0.0}, + "Brier_cls_sample": {"threshold": 0.0}, + "Brier_rgs_sample": {"threshold": 1.0}, "Coverage": {"alpha": 0.1}, "Ignorance": { "bins": [0, 0.5, 2.5, 5.5, 10.5, 25.5, 50.5, 100.5, 250.5, 500.5, 1000.5], diff --git a/views_evaluation/profiles/hydranet_ucdp.py b/views_evaluation/profiles/hydranet_ucdp.py index 42464fe..07abe36 100644 --- a/views_evaluation/profiles/hydranet_ucdp.py +++ b/views_evaluation/profiles/hydranet_ucdp.py @@ -1,11 +1,8 @@ """ Evaluation profile for HydraNet models on UCDP targets. -Covers probabilistic evaluation metrics that require hyperparameters: -twCRPS, MIS, QIS. All other metrics (CRPS, AP, Brier, etc.) have -empty genomes and need no profile entries. - -Values are placeholders inheriting from base — adjust per domain needs. +Inherits all genome values from BASE_PROFILE (including Brier thresholds, +QS quantile, Coverage alpha, etc.) and overrides where domain needs differ. 
""" from views_evaluation.profiles.base import BASE_PROFILE From 8e0811056cb37c1144a95298fc0195f7db182562 Mon Sep 17 00:00:00 2001 From: Polichinl Date: Thu, 9 Apr 2026 09:28:31 +0200 Subject: [PATCH 2/2] fix: set Brier threshold defaults to 0.0 (hurdle event y > 0) All three Brier variants now default to threshold=0.0 in the base profile, matching the Pre-Release Note 05 definition: Brier evaluates the binary event "any fatality occurred" (y > 0). On integer-valued UCDP data, y > 0 and y >= 1 are equivalent. NOTE(review): for Brier_rgs_sample the threshold also binarises the prediction samples (p_hat = mean(y_pred > threshold)), so the y > 0 / y >= 1 equivalence holds only if the samples are integer-valued as well. Continuous strictly-positive samples at threshold=0.0 would give p_hat ≈ 1 for every unit — the same degeneracy PATCH 1/2 fixed for probability samples. Confirm the regression sample space is integer counts before relying on this default. Co-Authored-By: Claude Opus 4.6 (1M context) --- views_evaluation/profiles/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/views_evaluation/profiles/base.py b/views_evaluation/profiles/base.py index f858d54..12dca09 100644 --- a/views_evaluation/profiles/base.py +++ b/views_evaluation/profiles/base.py @@ -27,9 +27,9 @@ "QIS": {"lower_quantile": 0.025, "upper_quantile": 0.975}, "QS_sample": {"quantile": 0.99}, "QS_point": {"quantile": 0.99}, - "Brier_cls_point": {"threshold": 0.0}, - "Brier_cls_sample": {"threshold": 0.0}, - "Brier_rgs_sample": {"threshold": 1.0}, + "Brier_cls_point": {"threshold": 0.0}, # hurdle event: any fatality (y > 0) + "Brier_cls_sample": {"threshold": 0.0}, # hurdle event: any fatality (y > 0) + "Brier_rgs_sample": {"threshold": 0.0}, # hurdle event: any fatality (y > 0) "Coverage": {"alpha": 0.1}, "Ignorance": { "bins": [0, 0.5, 2.5, 5.5, 10.5, 25.5, 50.5, 100.5, 250.5, 500.5, 1000.5],