# sfs.py — sequential feature selection for sleep-stage classification
# (forked from DeepSleepUCDenver/sleep_models)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.preprocessing import scale, normalize
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.semi_supervised import label_propagation
from sklearn.semi_supervised import LabelSpreading
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix
n_features = 15
# Read am partition the matrix
def load_data():
    """Load the feature/stage matrix and split it by label availability.

    Reads ./feature_stage_data_all.ftr, takes every column after the
    first three as the feature matrix, L2-normalizes each row, and
    partitions the rows by whether their 'stage' label is present.

    Returns:
        (x_obs, y_obs, x_nuls): features and stages for the labeled rows,
        plus the features of the rows whose stage is NaN (unlabeled).
    """
    frame = pd.read_feather('./feature_stage_data_all.ftr')
    features = normalize(frame[frame.columns[3:]].values)
    stages = frame['stage'].values
    # NaN in 'stage' marks an unlabeled epoch.
    unlabeled = np.isnan(stages)
    labeled = ~unlabeled
    return features[labeled], stages[labeled], features[unlabeled]
x_obs, y_obs, x_nuls = load_data()
def do_sfs(x_tr, y_tr):
    """Run sequential floating forward selection with an RBF SVM.

    Selects `n_features` features by 5-fold cross-validated accuracy.

    Args:
        x_tr: training feature matrix.
        y_tr: training stage labels.

    Returns:
        The fitted SequentialFeatureSelector (mlxtend `fit` returns self).
    """
    selector = sfs(
        svm.SVC(kernel='rbf'),
        k_features=n_features,
        forward=True,
        floating=True,
        verbose=2,
        scoring='accuracy',
        cv=5,
    )
    return selector.fit(x_tr, y_tr)
# Do some initial splitting: balance the five stages by downsampling
# each to the size of the rarest one, then hold out a 20% test set.
x, y = shuffle(x_obs, y_obs, random_state=42)
smpnum = min(int(np.sum(y == i)) for i in range(1, 6))
x_btr = np.concatenate([x[y == i][:smpnum] for i in range(1, 6)])
y_btr = np.concatenate([y[y == i][:smpnum] for i in range(1, 6)])
x_tr, x_te, y_tr, y_te = train_test_split(x_btr, y_btr, test_size=0.20)
best = do_sfs(x_tr, y_tr)
# Examine the results: accuracy curve over subset sizes.
plot = plot_sfs(best.get_metric_dict())
plot[1].figure.savefig("SFS-" + str(n_features) + ".png")
# Report the cross-validated accuracy at every subset size.
# BUG fix: the original iterated only range(1, 11) although n_features=15,
# and discarded each avg_score instead of reporting it.
for i in range(1, n_features + 1):
    print(i, best.get_metric_dict()[i]['avg_score'])
# BUG fix: the original called test_svm(x_all, y_all) here, but test_svm is
# never defined and x_all/y_all are only created later in the script, so the
# call raised NameError unconditionally; removed.
# Make a more select dataset: re-load everything and keep only the
# SFS-selected feature columns, then label the unlabeled rows.
x_obs, y_obs, x_nuls = load_data()
keep = list(best.k_feature_idx_)
np.save('sfs_features', keep)
# keep = np.load('sfs_features.npy')
x_obs = x_obs[:, keep]
x_nuls = x_nuls[:, keep]
# Apply LabelSpreading to propagate stage labels onto the NaN-stage rows.
label_spread = LabelSpreading(kernel='knn', alpha=0.8)
label_spread.fit(x_obs, y_obs)
x_all = np.concatenate([x_obs, x_nuls], axis=0)
y_all = np.concatenate([y_obs, label_spread.predict(x_nuls)], axis=0)
# Balance the five stages by downsampling to the rarest one, then split.
x, y = shuffle(x_all, y_all, random_state=42)
smpnum = min(int(np.sum(y == i)) for i in range(1, 6))
x_btr = np.concatenate([x[y == i][:smpnum] for i in range(1, 6)])
y_btr = np.concatenate([y[y == i][:smpnum] for i in range(1, 6)])
x_tr, x_te, y_tr, y_te = train_test_split(x_btr, y_btr, test_size=0.20)
# Train and evaluate an RBF SVM on the reduced feature set.
mod = svm.SVC(kernel='rbf')
mod.fit(x_tr, y_tr)
# BUG fix: the original computed the test accuracy and discarded it.
print("test accuracy:", mod.score(x_te, y_te))
disp = plot_confusion_matrix(mod, x_te, y_te,
                             cmap=plt.cm.Blues,
                             normalize='true')
disp.ax_.set_title("RBF Kernel with " + str(n_features) + " best features")
# BUG fix: the original saved the figure returned by a second disp.plot()
# call, which re-draws into a fresh figure without the title and with the
# default colormap; save the styled figure directly instead.
disp.figure_.savefig("CM-SVM-RBF-" + str(n_features) + ".png")