-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_augmentation.py
More file actions
124 lines (101 loc) · 5.3 KB
/
data_augmentation.py
File metadata and controls
124 lines (101 loc) · 5.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import pandas as pd
import numpy as np
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler, QuantileTransformer, RobustScaler, PowerTransformer
import os
def check_and_replace_infs(df, replacement_value=1e6):
df.replace([np.inf, -np.inf], replacement_value, inplace=True)
df.fillna(replacement_value, inplace=True)
return df
def transform_and_append(df, transformation, transformation_name, apply_to_whole_df=False):
transformed_data = df.copy()
if apply_to_whole_df:
transformed_numeric = transformation(transformed_data.select_dtypes(include=[np.number]))
else:
transformed_numeric = transformed_data.select_dtypes(include=[np.number]).apply(transformation)
for col in transformed_numeric.columns:
transformed_numeric[col] = transformed_numeric[col].astype(transformed_data[col].dtype)
transformed_numeric = check_and_replace_infs(transformed_numeric)
transformed_data.update(transformed_numeric)
transformed_data['Environment'] = transformed_data['Environment'] + f'_{transformation_name}'
return transformed_data
def boxcox_transform(x):
x_positive = x + 1 - np.min(x)
transformed, _ = boxcox(x_positive)
return transformed
def clr_transform(x):
gm = np.exp(np.mean(np.log(x + 1))) # Adding 1 to avoid log(0)
return np.log((x + 1) / gm) # Adding 1 to avoid log(0)
def mr_transform(df):
transformed_data = df.copy()
counts = transformed_data.select_dtypes(include=[np.number])
geometric_means = np.exp(np.mean(np.log(counts + 1), axis=0))
size_factors = counts.divide(geometric_means, axis=1).median(axis=0)
normalized_counts = counts.divide(size_factors, axis=1)
for col in normalized_counts.columns:
normalized_counts[col] = normalized_counts[col].astype(transformed_data[col].dtype)
normalized_counts = check_and_replace_infs(normalized_counts)
transformed_data.update(normalized_counts)
transformed_data['Environment'] = transformed_data['Environment'] + '_mr'
return transformed_data
def zscore_transform(x):
return (x - np.mean(x)) / np.std(x)
def quantile_transform(df):
qt = QuantileTransformer(output_distribution='normal', random_state=0)
transformed = qt.fit_transform(df.select_dtypes(include=[np.number]))
transformed_df = pd.DataFrame(transformed, columns=df.select_dtypes(include=[np.number]).columns)
transformed_df = check_and_replace_infs(transformed_df)
return transformed_df
def minmax_transform(x):
return (x - np.min(x)) / (np.max(x) - np.min(x))
def robust_transform(df):
rs = RobustScaler()
transformed = rs.fit_transform(df.select_dtypes(include=[np.number]))
transformed_df = pd.DataFrame(transformed, columns=df.select_dtypes(include=[np.number]).columns)
transformed_df = check_and_replace_infs(transformed_df)
return transformed_df
def yeo_johnson_transform(df):
pt = PowerTransformer(method='yeo-johnson')
transformed = pt.fit_transform(df.select_dtypes(include=[np.number]))
transformed_df = pd.DataFrame(transformed, columns=df.select_dtypes(include=[np.number]).columns)
transformed_df = check_and_replace_infs(transformed_df)
return transformed_df
# Mapping augmentation name to function
AUGMENTATION_FUNCTIONS = {
"log": lambda df: transform_and_append(df, np.log1p, 'log'),
"sqrt": lambda df: transform_and_append(df, np.sqrt, 'sqrt'),
"boxcox": lambda df: transform_and_append(df, boxcox_transform, 'boxcox'),
"clr": lambda df: transform_and_append(df, clr_transform, 'clr'),
"mr": mr_transform,
"zscore": lambda df: transform_and_append(df, zscore_transform, 'zscore'),
"quantile": lambda df: transform_and_append(df, quantile_transform, 'quantile', apply_to_whole_df=True),
"minmax": lambda df: transform_and_append(df, minmax_transform, 'minmax'),
"robust": lambda df: transform_and_append(df, robust_transform, 'robust', apply_to_whole_df=True),
"yeojohnson": lambda df: transform_and_append(df, yeo_johnson_transform, 'yeojohnson', apply_to_whole_df=True)
}
def apply_augmentations(df, augmentations, original_file_path):
# Construct augmented filename based on augmentations
base_filename = os.path.splitext(os.path.basename(original_file_path))[0]
aug_suffix = "_" + "_".join(augmentations) if augmentations else ""
output_filename = f"{base_filename}{aug_suffix}.csv"
output_path = os.path.join("data", "ml_data", output_filename)
# If file already exists, load and return it
if os.path.exists(output_path):
print(f"Augmented file found: {output_path}. Skipping augmentations.")
return pd.read_csv(output_path)
# Otherwise, perform augmentations
print(f"No existing file for augmentations {augmentations}. Running augmentations...")
augmented_dfs = [df]
for aug in augmentations:
if aug in AUGMENTATION_FUNCTIONS:
augmented_df = AUGMENTATION_FUNCTIONS[aug](df)
augmented_dfs.append(augmented_df)
else:
print(f"Warning: Augmentation '{aug}' not recognized and will be skipped.")
combined = pd.concat(augmented_dfs, axis=0)
combined.reset_index(drop=True, inplace=True)
# Save to CSV
os.makedirs(os.path.dirname(output_path), exist_ok=True)
combined.to_csv(output_path, index=False)
print(f"Augmented data saved to {output_path}")
return combined