# Source: USEPA/pf_python_modelbuilding (main branch) - StatsCalculator.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import math
from typing import Dict, Iterable, Optional, Union

import pandas as pd

# class StatsConstants:
#     COVERAGE = "Coverage"
#     MAE = "MAE"
#     PEARSON_RSQ = "PearsonRSQ"
#     RMSE = "RMSE"
#     Q2 = "Q2"
#     R2 = "R2"
#
#     TAG_TEST = "_Test"
#     TAG_TRAINING = "_Training"
#     TAG_CV = "_CV"
#
#     Q2_TEST = Q2 + TAG_TEST
#     R2_TRAINING = R2+TAG_TRAINING

from util import predict_constants as pc


# Safe divisions (match Java behavior but avoid ZeroDivisionError)
def safe_div(n, d):
    """Divide ``n`` by ``d``, yielding NaN instead of failing on a zero divisor.

    A falsy denominator (e.g. ``0``, ``0.0`` or ``None``) short-circuits to
    NaN without attempting the division. A ``ZeroDivisionError`` raised by
    the division itself is likewise mapped to NaN.
    """
    nan = float('nan')
    if not d:
        return nan
    try:
        quotient = n / d
    except ZeroDivisionError:
        return nan
    return quotient


def calculate_mean_exp_training(df_training: pd.DataFrame):
    """Return the mean of the ``exp`` column over fully populated rows.

    Only rows in which both ``exp`` and ``pred`` are non-missing contribute
    to the mean.
    """
    # Row-wise flag: True where neither 'exp' nor 'pred' is missing
    has_both = df_training[['exp', 'pred']].notna().all(axis=1)
    exp_values = df_training.loc[has_both, 'exp']
    return exp_values.mean()


def calculate_continuous_statistics(df: pd.DataFrame, mean_exp_training: float, tag: str,
                                    ad_measure_final: Optional[Union[str, Iterable[str]]] = None) -> Dict[str, float]:
    """Compute regression statistics comparing the ``exp`` and ``pred`` columns.

    Args:
        df: Frame with ``exp`` and ``pred`` columns. When ``ad_measure_final``
            is given it must also contain one ``AD_<name>`` column per
            measure, where spaces in the name are replaced by underscores.
        mean_exp_training: Reference mean used for the total sum of squares
            in the coefficient of determination.
        tag: Suffix appended to every statistic key. It also selects which
            coefficient-of-determination key is emitted (``pc.TAG_TEST``,
            ``pc.TAG_TRAINING`` or ``pc.TAG_EXTERNAL``); for any other tag
            that statistic is omitted.
        ad_measure_final: ``None``, a single measure name, or an iterable of
            measure names. With ``None`` coverage is predicted rows over rows
            with an ``exp`` value; otherwise it is the fraction of predicted
            rows whose ``AD_*`` columns are all ``True``.

    Returns:
        Mapping from statistic key to value (coverage, MAE, Pearson r^2,
        RMSE and, depending on ``tag``, Q2 or R2).

    Raises:
        ValueError: If no row has both ``exp`` and ``pred``.
    """
    # Error metrics need both an experimental and a predicted value
    valid_df = df.dropna(subset=['exp', 'pred'])

    count_total = len(df.dropna(subset=['exp']))
    count_predicted = len(valid_df)

    if count_predicted == 0:
        raise ValueError("No valid predictions available for calculation.")

    exp = valid_df['exp']
    pred = valid_df['pred']

    mean_exp = exp.mean()
    mean_pred = pred.mean()

    mae = (exp - pred).abs().mean()

    # Terms for the squared Pearson correlation
    term_xy = ((exp - mean_exp) * (pred - mean_pred)).sum()
    term_xx = ((exp - mean_exp) ** 2).sum()
    term_yy = ((pred - mean_pred) ** 2).sum()

    # Residual and total sums of squares; the total is taken around the
    # supplied reference mean rather than the mean of this frame
    ss = ((exp - pred) ** 2).sum()
    ss_total = ((exp - mean_exp_training) ** 2).sum()

    if ad_measure_final is None:
        coverage = safe_div(count_predicted, count_total)
    else:
        # A single name passed as a plain string must be treated as one
        # measure, not iterated character by character
        if isinstance(ad_measure_final, str):
            ad_measures = [ad_measure_final]
        else:
            ad_measures = ad_measure_final
        cols_ad = [f"AD_{ad.replace(' ', '_')}" for ad in ad_measures]

        # A row counts as covered only if every AD measure flags it True
        mask_all_true = valid_df[cols_ad].eq(True).all(axis=1)
        coverage = safe_div(mask_all_true.sum(), count_predicted)

    pearson_rsq = safe_div(term_xy ** 2, term_xx * term_yy)
    # safe_div yields NaN when ss_total is zero, so coeff_det becomes NaN too
    coeff_det = 1 - safe_div(ss, ss_total)
    rmse = math.sqrt(safe_div(ss, count_predicted))

    model_statistic_values = {
        pc.COVERAGE + tag: coverage,
        pc.MAE + tag: mae,
        pc.PEARSON_RSQ + tag: pearson_rsq,
        pc.RMSE + tag: rmse,
    }

    if tag == pc.TAG_TEST:
        model_statistic_values[pc.Q2_TEST] = coeff_det
    elif tag == pc.TAG_TRAINING:
        model_statistic_values[pc.R2_TRAINING] = coeff_det
    elif tag == pc.TAG_EXTERNAL:
        model_statistic_values[pc.Q2_EXTERNAL] = coeff_det

    return model_statistic_values


def calculate_binary_statistics(df: pd.DataFrame, cutoff: float, tag: str,
                                ad_measure_final: Optional[Union[str, Iterable[str]]] = None) -> Dict[str, float]:
    """Compute binary classification statistics for ``exp`` vs. thresholded ``pred``.

    Args:
        df: Frame with ``exp`` (compared against 1 and 0) and ``pred``
            columns. When ``ad_measure_final`` is given it must also contain
            one ``AD_<name>`` column per measure, where spaces in the name
            are replaced by underscores.
        cutoff: Predictions ``>= cutoff`` are classified as positive (1).
        tag: Suffix appended to every statistic key.
        ad_measure_final: ``None``, a single measure name, or an iterable of
            measure names. With ``None`` coverage is predicted rows over rows
            with an ``exp`` value; otherwise it is the fraction of rows with
            an ``exp`` value whose ``AD_*`` columns are all ``True``.

    Returns:
        Mapping from statistic key to value (coverage, concordance,
        sensitivity, specificity, balanced accuracy). When no row has a
        prediction, every metric except coverage is NaN.
    """
    # Keep only rows with a known expected label
    valid = df.dropna(subset=['exp'])
    count_total = len(valid)

    # Among those, keep only rows with a prediction
    predicted = valid.dropna(subset=['pred'])
    count_predicted = len(predicted)

    # Nothing to score: report coverage and NaN for every other metric
    if count_predicted == 0:
        nan = float('nan')
        return {
            pc.COVERAGE + tag: safe_div(count_predicted, count_total),
            pc.CONCORDANCE + tag: nan,
            pc.SENSITIVITY + tag: nan,
            pc.SPECIFICITY + tag: nan,
            pc.BALANCED_ACCURACY + tag: nan,
        }

    # Binary predictions using the cutoff
    pred_bin = (predicted['pred'] >= cutoff).astype(int)

    # Positives/negatives are counted only among rows that have predictions
    exp_vals = predicted['exp']
    pos_mask = (exp_vals == 1)
    neg_mask = (exp_vals == 0)

    count_positive = int(pos_mask.sum())
    count_negative = int(neg_mask.sum())

    tp = int((pos_mask & (pred_bin == 1)).sum())
    tn = int((neg_mask & (pred_bin == 0)).sum())
    count_true = tp + tn

    if ad_measure_final is None:
        coverage = safe_div(count_predicted, count_total)
    else:
        # A single name passed as a plain string must be treated as one
        # measure, not iterated character by character
        if isinstance(ad_measure_final, str):
            ad_measures = [ad_measure_final]
        else:
            ad_measures = ad_measure_final
        cols_ad = [f"AD_{ad.replace(' ', '_')}" for ad in ad_measures]

        # A row counts as covered only if every AD measure flags it True
        mask_all_true = valid[cols_ad].eq(True).all(axis=1)
        coverage = safe_div(mask_all_true.sum(), count_total)

    concordance = safe_div(count_true, count_predicted)
    sensitivity = safe_div(tp, count_positive)
    specificity = safe_div(tn, count_negative)
    # NaN propagates here when either class is absent from the predicted rows
    balanced_accuracy = (sensitivity + specificity) / 2.0

    return {
        pc.COVERAGE + tag: coverage,
        pc.CONCORDANCE + tag: concordance,
        pc.SENSITIVITY + tag: sensitivity,
        pc.SPECIFICITY + tag: specificity,
        pc.BALANCED_ACCURACY + tag: balanced_accuracy,
    }