Selection.py
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.datasets import load_iris
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
# Load dataset
data = pd.read_csv('/Users/kylow/Dev/Data Mining/DM_Project_24.csv')
print(data.columns)
data = data.drop(index=0)  # drop the row with index label 0
X = data.drop('Target (Col 106)', axis=1)
Y = data['Target (Col 106)']
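# Sanity check (a minimal sketch): the SimpleImputer steps below assume X
# contains missing values, so confirm how many NaNs are actually present
# and inspect the class balance before running cross-validation.
print("Missing values in X:", X.isna().sum().sum())
print("Class balance:\n", Y.value_counts())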
# Define preprocessing pipelines: impute-then-scale (A-D) and scale-then-impute (E-H)
preprocessing_options = {
    # Impute first, then normalize
    'Method A': Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),    # Mean imputation
        ('scaler', StandardScaler())                    # Standard scaling
    ]),
    'Method B': Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),    # Mean imputation
        ('scaler', MinMaxScaler())                      # Min-max scaling
    ]),
    'Method C': Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),  # Median imputation
        ('scaler', StandardScaler())                    # Standard scaling
    ]),
    'Method D': Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),  # Median imputation
        ('scaler', MinMaxScaler())                      # Min-max scaling
    ]),
    # Normalize first, then impute
    'Method E': Pipeline(steps=[
        ('scaler', StandardScaler()),                   # Standard scaling
        ('imputer', SimpleImputer(strategy='mean'))     # Mean imputation
    ]),
    'Method F': Pipeline(steps=[
        ('scaler', StandardScaler()),                   # Standard scaling
        ('imputer', SimpleImputer(strategy='median'))   # Median imputation
    ]),
    'Method G': Pipeline(steps=[
        ('scaler', MinMaxScaler()),                     # Min-max scaling
        ('imputer', SimpleImputer(strategy='mean'))     # Mean imputation
    ]),
    'Method H': Pipeline(steps=[
        ('scaler', MinMaxScaler()),                     # Min-max scaling
        ('imputer', SimpleImputer(strategy='median'))   # Median imputation
    ]),
}
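# Optional check (a sketch, assuming all feature columns are numeric): run one
# preprocessing pipeline on its own to confirm it yields a NaN-free matrix
# before nesting it inside the model pipeline.
X_checked = preprocessing_options['Method A'].fit_transform(X)
print("NaNs after Method A:", np.isnan(X_checked).sum())  # expected: 0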
models = {
    'KNN': KNeighborsClassifier(algorithm='auto', leaf_size=10, metric='euclidean',
                                n_neighbors=11, p=1, weights='uniform'),
}
# Outer CV for model selection
outer_cv = KFold(n_splits=5, shuffle=True, random_state=79)
results = {}
# Evaluate each combination of preprocessing method and model
for preprocessing_name, preprocessing in preprocessing_options.items():
    for model_name, model in models.items():
        pipeline = Pipeline(steps=[('preprocessing', preprocessing),
                                   ('model', model)])
        # Inner CV for hyperparameter tuning (define param_grid as needed;
        # an example grid is sketched above)
        param_grid = {}  # empty grid: GridSearchCV fits the pipeline as-is
        grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='f1', error_score='raise')
        # Outer CV: cross_val_score clones and refits grid_search on each fold,
        # so no separate fit on the full dataset is needed here
        scores = cross_val_score(grid_search, X, Y, cv=outer_cv, scoring='f1')
        # Store and report the mean outer-fold F1 score
        results[(preprocessing_name, model_name)] = scores.mean()
        print(preprocessing_name, model_name, scores.mean())
# Display results
best_combination = max(results, key=results.get)
print("Best combination:", best_combination, "with score:", results[best_combination])