forked from ryll-spectre/PD-Webpage-Classifier
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathWebpageClassifier.py
More file actions
123 lines (99 loc) · 3.98 KB
/
WebpageClassifier.py
File metadata and controls
123 lines (99 loc) · 3.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import os
from pandas import DataFrame
import numpy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix, f1_score
"""
Trains a classifier using features extracted from webpage URLs anmd titles
in order to predict whether a future webpage is relevant or irrelevant.
Uses multinomial naive Bayes and produces a confusion matrix and F1 score to
display the results of tweaking the parameters of the classifier.
NOTE: User of program must edit paths accordingly
"""
NEWLINE = '\n'
SKIP_FILES = {'cmds'}
"""
Takes a file path, iterates recursively through all files in path,
and yields textual content from crawler contained in the path. Allows for moving
of data into directories
Args:
path: path containing parsed keywords from crawl results
"""
def read_files(path):
for root, dir_names, file_names in os.walk(path):
for path in dir_names:
read_files(os.path.join(root, path))
for file_name in file_names:
if file_name not in SKIP_FILES:
file_path = os.path.join(root, file_name)
if os.path.isfile(file_path):
lines = []
f = open(file_path, encoding="latin-1")
for line in f:
lines.append(line)
f.close()
content = NEWLINE.join(lines)
#print(content)
yield file_path, content
"""
Uses Panda library to turn data into ideal array format
for classifier
Args:
path: path containing parsed keywords from crawl results
classification: marker indicating relevant or not relevant
Returns:
data_frame: DataFrame from all files in path
"""
def build_data_frame(path, classification):
rows = []
index = []
for file_name, text in read_files(path):
rows.append({'text': text, 'class': classification})
index.append(file_name)
data_frame = DataFrame(rows, index=index)
return data_frame
# classifications of data
R = 'relevant'
NR = 'not relevant'
# training set of relevant and not relevant webpages
# arranged by (PATH, CLASSIFICATION)
SOURCES = [
('/Users/yjiang/Documents/nutch_data/classification/YES', R),
('/Users/yjiang/Documents/nutch_data/classification/NO', NR),
]
# add training set to data frame
data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
data = data.append(build_data_frame(path, classification))
data = data.reindex(numpy.random.permutation(data.index)) # shuffle the dataset
# Pipeline consists of a count vectorizer and a classifier to
# learn vocabulary and extract word count features
pipeline = Pipeline([
('count_vectorizer', CountVectorizer(ngram_range=(1, 2))), # looking for n-gram frequency
('classifier', MultinomialNB())])
pipeline.fit(data['text'].values, data['class'].values)
# examples = ['neo jpl', 'venus NASA', 'black hole']
# pipeline.predict(examples)
# save the trained model to use in AccuracyCalc.py
from sklearn.externals import joblib
joblib.dump(pipeline, 'multinomial_classifier.pkl')
k_fold = KFold(n=len(data), n_folds=6)
scores = []
confusion = numpy.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold:
train_text = data.iloc[train_indices]['text'].values
train_y = data.iloc[train_indices]['class'].values
test_text = data.iloc[test_indices]['text'].values
test_y = data.iloc[test_indices]['class'].values
pipeline.fit(train_text, train_y)
predictions = pipeline.predict(test_text)
confusion += confusion_matrix(test_y, predictions)
score = f1_score(test_y, predictions, pos_label=R)
scores.append(score)
print('Total pages classified:', len(data))
print('Score:', sum(scores)/len(scores))
print('Confusion matrix:')
print(confusion)