Automation_scripts/rfc.py at main · elichter/Automation_scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# This module will test random forest classifier. Already did regression, so
# classifier will be attempted to see how it performs comparatively.

# First, do all imports
from sklearn.ensemble import RandomForestClassifier as rfc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split as tts
from sklearn.tree import plot_tree
import skopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score

# Import the data.
df = pd.read_csv('df2.csv', usecols=['Reference Position', 'Type',
        'Length', 'Reference', 'Allele', 'Frequency', 'Strain', 'Gene_name',
            'Gene_type', 'Gene_element' ,'Gender'])

# Since the 'Reference' and 'Allele' columns represent what should be in the
# reference genome, and what is actually there respectively, they can be
# combined into a single column. This combined column called 'Mut' will be
# created and the original 'Reference' and 'Allele' columns will be dropped.
# This is done to make the dataset simpler and removing unnecessary columns.
df['Mut'] = df['Reference'] + df['Allele']

# Convert the data to binary using pandas get_dummies. This converts
# categorical data (such as names of samples, mutations, etc) into numerical
# data without ordering them. It does so using one-hot encoding. This is
# acheived by creating columns for each and putting 0s in rows where something
# doesn't apply, and 1s in rows where things do apply.
features = pd.get_dummies(df)

# Create variables 'S', 'G', 'M', 'N', 'T', 'E', and 'Y' to represent Strain,
# Gender, Mut, Gene_name, ' Gene_type, Gene_element, and  Type respectively.
# This way, these can be generalized, and the new columns created for them from
# the get_dummies command above do not need to be called explictly
# individually.
S = [col for col in features if col.startswith('Strain')]
G = [col for col in features if col.startswith('Gender')]
M = [col for col in features if col.startswith('Mut')]
N = [col for col in features if col.startswith('Gene_name')]
T = [col for col in features if col.startswith('Gene_type')]
E = [col for col in features if col.startswith('Gene_element')]
Y = [col for col in features if col.startswith('Type')]
df.drop(['Reference', 'Allele'], axis=1, inplace=True)

# Drop the Strain variable from the dataframe because that is what we want to
# predict - x, and create the response variable, y
x = features.drop(S, axis=1)
y = features[S]

# Split the data into train and test
x_train, x_test, y_train, y_test = tts(x, y, train_size=0.7, random_state=42)

# Fit the data to the random forest classifier.
cf = rfc(random_state=42, n_jobs=-1, n_estimators=10000, oob_score=True)
cf.fit(x_train, y_train)

# Check the oob score
print('The oob score is:', cf.oob_score_)

# Pull out one tree from the forest
tree = cf.estimators_[5]

# Visualize the results.
plt.figure(figsize=(100,50))
plot_tree(tree,feature_names =
        x.columns,class_names=[S],filled=True)
plt.savefig('classifier.png')

# Optimize hyperparameters. Here we will use the Bayesian optimization from
# pierpaolo28.github.io/blog/blog25/
space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.quniform('max_depth', 10, 1200, 10),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2',
            None]),
        'min_samples_leaf': hp.uniform ('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200])
        }

def objective(space):
    model = rfc(criterion = space['criterion'], max_depth = space['max_depth'],
            max_features = space['max_features'], min_samples_leaf =
            space['min_samples_leaf'], min_samples_split =
            space['min_samples_split'], n_estimators = space['n_estimators'],)

    accuracy = cross_val_score(model, x_train, y_train, cv = 4).mean()
    return{'loss': -accuracy, 'status': STATUS_OK}

trials = Trials()
best = fmin(fn=objective, space = space, algo = tpe.suggest, max_evals = 80,
        trials = trials)
#print(best)
crit = {0: 'entropy', 1: 'gini'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 10, 1: 50, 2: 300, 3: 750, 4: 1200}
train_forest = rfc(criterion = crit[best['criterion']], max_depth=
        best['max_depth'], max_features = feat[best['max_features']],
        min_samples_leaf = best['min_samples_leaf'], min_samples_split =
        best['min_samples_split'], n_estimators =
        est[best['n_estimators']]).fit(x_train, y_train)
print(best)
treeee = train_forest.estimators_[7]
plt.figure(figsize=(100,50))
plot_tree(treeee, feature_names = x.columns, class_names=[S], filled=True)
plt.savefig('bayesin.png')
#y_test = np.argmax(y_test, axis = 1)
prediction_forest = train_forest.predict(x_test)
print(confusion_matrix(y_test.values.argmax(axis=1),
    prediction_forest.argmax(axis=1)))
#print(classification_report(y_test, prediction_forest))
acc5 = accuracy_score(y_test, prediction_forest)
print(acc5)