-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathStatsCalculator.py
More file actions
166 lines (130 loc) · 5.36 KB
/
StatsCalculator.py
File metadata and controls
166 lines (130 loc) · 5.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import math
from typing import Dict, Iterable, Optional, Union

import pandas as pd
# class StatsConstants:
# COVERAGE = "Coverage"
# MAE = "MAE"
# PEARSON_RSQ = "PearsonRSQ"
# RMSE = "RMSE"
# Q2 = "Q2"
# R2 = "R2"
#
# TAG_TEST = "_Test"
# TAG_TRAINING = "_Training"
# TAG_CV = "_CV"
#
# Q2_TEST = Q2 + TAG_TEST
# R2_TRAINING = R2+TAG_TRAINING
from util import predict_constants as pc
# Safe divisions (match Java behavior but avoid ZeroDivisionError)
def safe_div(n, d):
    """Divide ``n`` by ``d``, returning NaN instead of failing on a zero divisor.

    A falsy divisor (``0``, ``0.0``, ``None``, ...) short-circuits to NaN
    without attempting the division.
    """
    if not d:
        return float('nan')
    try:
        quotient = n / d
    except ZeroDivisionError:
        # Defensive fallback in case the division itself still reports a
        # zero divisor for a value that tested truthy.
        return float('nan')
    return quotient
def calculate_mean_exp_training(df_training: pd.DataFrame):
    """Return the mean of ``exp`` over training rows that have both values.

    Rows where either ``exp`` or ``pred`` is NaN are excluded before
    averaging.
    """
    has_both = df_training['exp'].notna() & df_training['pred'].notna()
    return df_training.loc[has_both, 'exp'].mean()
def calculate_continuous_statistics(
    df: pd.DataFrame,
    mean_exp_training: float,
    tag: str,
    ad_measure_final: Optional[Union[str, Iterable[str]]] = None,
) -> Dict[str, float]:
    """Compute regression statistics for one set of predictions.

    Only rows where both ``exp`` and ``pred`` are present enter the error
    metrics.

    Args:
        df: Frame with ``exp`` and ``pred`` columns. When
            ``ad_measure_final`` is given it must also contain one
            ``AD_<name>`` column per measure (spaces in the name replaced
            by underscores).
        mean_exp_training: Reference mean used in the total sum of squares
            for the coefficient of determination.
        tag: Suffix appended to every statistic key; it also selects the
            key under which the coefficient of determination is stored.
        ad_measure_final: ``None``, a single measure name, or an iterable
            of measure names. A lone string is treated as one name rather
            than being iterated character by character.

    Returns:
        A dict holding coverage, MAE, Pearson r-squared and RMSE under
        ``pc.<NAME> + tag``. The coefficient of determination is added
        under ``pc.Q2_TEST``, ``pc.R2_TRAINING`` or ``pc.Q2_EXTERNAL`` when
        ``tag`` equals the matching ``pc.TAG_*`` constant; for any other
        tag it is omitted.

    Raises:
        ValueError: If no row has both ``exp`` and ``pred``.
    """
    valid_df = df.dropna(subset=['exp', 'pred'])
    count_total = len(df.dropna(subset=['exp']))
    count_predicted = len(valid_df)
    if count_predicted == 0:
        raise ValueError("No valid predictions available for calculation.")

    exp = valid_df['exp']
    pred = valid_df['pred']
    exp_dev = exp - exp.mean()
    pred_dev = pred - pred.mean()
    residual = exp - pred

    mae = residual.abs().mean()

    # Terms for the squared Pearson correlation.
    term_xy = (exp_dev * pred_dev).sum()
    term_xx = (exp_dev ** 2).sum()
    term_yy = (pred_dev ** 2).sum()

    # Residual and total sums of squares; the total is taken around the
    # training mean passed in by the caller, not this frame's own mean.
    ss = (residual ** 2).sum()
    ss_total = ((exp - mean_exp_training) ** 2).sum()

    if ad_measure_final is None:
        coverage = safe_div(count_predicted, count_total)
    else:
        if isinstance(ad_measure_final, str):
            ad_measure_final = [ad_measure_final]
        ad_columns = [f"AD_{ad.replace(' ', '_')}" for ad in ad_measure_final]
        # A row counts as covered only when every AD column is True.
        inside_all = valid_df[ad_columns].eq(True).all(axis=1)
        # NOTE(review): this denominator is the number of rows with both
        # exp and pred, whereas the branch above divides by the number of
        # rows with exp -- confirm which denominator is intended.
        coverage = safe_div(inside_all.sum(), count_predicted)

    pearson_rsq = safe_div(term_xy ** 2, term_xx * term_yy)
    # safe_div yields NaN for a zero total, so the result is NaN as well.
    coeff_det = 1 - safe_div(ss, ss_total)
    rmse = math.sqrt(safe_div(ss, count_predicted))

    model_statistic_values = {
        pc.COVERAGE + tag: coverage,
        pc.MAE + tag: mae,
        pc.PEARSON_RSQ + tag: pearson_rsq,
        pc.RMSE + tag: rmse,
    }

    if tag == pc.TAG_TEST:
        model_statistic_values[pc.Q2_TEST] = coeff_det
    elif tag == pc.TAG_TRAINING:
        model_statistic_values[pc.R2_TRAINING] = coeff_det
    elif tag == pc.TAG_EXTERNAL:
        model_statistic_values[pc.Q2_EXTERNAL] = coeff_det

    return model_statistic_values
def calculate_binary_statistics(
    df: pd.DataFrame,
    cutoff: float,
    tag: str,
    ad_measure_final: Optional[Union[str, Iterable[str]]] = None,
) -> Dict[str, float]:
    """Compute classification statistics for one set of predictions.

    Predictions are binarised as ``pred >= cutoff``. Positives and negatives
    are counted only among rows that have a prediction, where ``exp == 1``
    is positive and ``exp == 0`` is negative.

    Args:
        df: Frame with ``exp`` and ``pred`` columns. When
            ``ad_measure_final`` is given it must also contain one
            ``AD_<name>`` column per measure (spaces in the name replaced
            by underscores).
        cutoff: Threshold at or above which ``pred`` counts as class 1.
        tag: Suffix appended to every statistic key.
        ad_measure_final: ``None``, a single measure name, or an iterable
            of measure names. A lone string is treated as one name rather
            than being iterated character by character.

    Returns:
        A dict holding coverage, concordance, sensitivity, specificity and
        balanced accuracy under ``pc.<NAME> + tag``. When no row has a
        prediction, every metric except coverage is NaN.
    """
    # Rows with a known expected label, then the subset that was predicted.
    valid = df.dropna(subset=['exp'])
    count_total = len(valid)
    predicted = valid.dropna(subset=['pred'])
    count_predicted = len(predicted)

    if count_predicted == 0:
        nan = float('nan')
        # NOTE(review): this early return does not consult
        # ad_measure_final; coverage is always predicted / total here.
        return {
            pc.COVERAGE + tag: safe_div(count_predicted, count_total),
            pc.CONCORDANCE + tag: nan,
            pc.SENSITIVITY + tag: nan,
            pc.SPECIFICITY + tag: nan,
            pc.BALANCED_ACCURACY + tag: nan,
        }

    pred_bin = (predicted['pred'] >= cutoff).astype(int)
    exp_vals = predicted['exp']

    pos_mask = exp_vals == 1
    neg_mask = exp_vals == 0
    count_positive = int(pos_mask.sum())
    count_negative = int(neg_mask.sum())

    tp = int((pos_mask & (pred_bin == 1)).sum())
    tn = int((neg_mask & (pred_bin == 0)).sum())

    if ad_measure_final is None:
        coverage = safe_div(count_predicted, count_total)
    else:
        if isinstance(ad_measure_final, str):
            ad_measure_final = [ad_measure_final]
        ad_columns = [f"AD_{ad.replace(' ', '_')}" for ad in ad_measure_final]
        # A row counts as covered only when every AD column is True; the
        # denominator is every row with an expected label.
        inside_all = valid[ad_columns].eq(True).all(axis=1)
        coverage = safe_div(inside_all.sum(), count_total)

    concordance = safe_div(tp + tn, count_predicted)
    sensitivity = safe_div(tp, count_positive)
    specificity = safe_div(tn, count_negative)
    # NaN in either component propagates to the balanced accuracy.
    balanced_accuracy = (sensitivity + specificity) / 2.0

    return {
        pc.COVERAGE + tag: coverage,
        pc.CONCORDANCE + tag: concordance,
        pc.SENSITIVITY + tag: sensitivity,
        pc.SPECIFICITY + tag: specificity,
        pc.BALANCED_ACCURACY + tag: balanced_accuracy,
    }