-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgeneralClassifier.py
More file actions
68 lines (55 loc) · 2.13 KB
/
Copy pathgeneralClassifier.py
File metadata and controls
68 lines (55 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import classificationPreprocessor as cp
import datasetRetriever as dr
import langid
import nltk
import sklearn
from functools import partial
from nltk.corpus import words
from sklearn.naive_bayes import GaussianNB
def trainGeneralClassifier(splitFraction=0.532):
    """Train the general classifier and print its accuracy on held-out data.

    Feature grams and labelled training/test data are obtained from the
    ``datasetRetriever`` module. The leading ``splitFraction`` share of the
    test set is appended to the training data; the remaining part of the
    test set is used only for the accuracy figure printed by ``printAcc``.

    Args:
        splitFraction: Share of the test set (between 0 and 1 inclusive)
            that is folded into the training data. Defaults to 0.532, the
            value that used to be hard-coded here.

    Returns:
        The classifier produced by ``trainNBClassifier``.

    Raises:
        ValueError: If ``splitFraction`` is outside the range [0, 1].
    """
    if not 0 <= splitFraction <= 1:
        raise ValueError(
            "splitFraction must be between 0 and 1, got %r" % (splitFraction,)
        )

    # NOTE(review): the meaning of 200 is defined by datasetRetriever.init_all
    # and is not visible from this file -- confirm before changing it.
    dr.init_all(200)

    # Get features, labelled bigrams, test and training sets
    dr.prepareDatasets()
    bigramFeatures, labelledBigrams = dr.getTrainingDataset()
    featureExtractor = partial(extractFeatures, gramFeatures=bigramFeatures)
    training_set = nltk.classify.apply_features(featureExtractor, labelledBigrams)
    labelledTest = dr.getTestDataset()
    test_set = nltk.classify.apply_features(featureExtractor, labelledTest)

    # Everything before splitPoint joins the training data; everything from
    # splitPoint onwards is kept back for the accuracy measurement.
    splitPoint = int(len(test_set) * splitFraction)
    clf = trainNBClassifier(training_set + test_set[:splitPoint])
    printAcc(clf, test_set[splitPoint:])
    return clf
# Build the presence-feature dict for one document (a collection of grams)
def extractFeatures(doc, gramFeatures):
    """Map every feature gram to whether it occurs in ``doc``.

    Args:
        doc: Iterable of hashable grams making up one document.
        gramFeatures: Iterable of grams to test for; only the first two
            items of each gram are used to build the feature name.

    Returns:
        A dict keyed by ``'contains((<gram[0]>, <gram[1]>))'`` whose values
        are True when the gram is present in ``doc`` and False otherwise.
    """
    # A set gives constant-time membership tests for the lookups below.
    present = set(doc)
    return {
        'contains(({0}, {1}))'.format(gram[0], gram[1]): gram in present
        for gram in gramFeatures
    }
# Naive Bayes classifier - thin wrapper around the nltk implementation.
# NOTE(review): the previous comment mentioned "manual IDF"; no IDF weighting
# happens in this function -- presumably it is applied elsewhere, confirm.
def trainNBClassifier(training_set):
    """Train an nltk Naive Bayes classifier on ``training_set`` and return it.

    Args:
        training_set: Training data passed unchanged to
            ``nltk.NaiveBayesClassifier.train``.

    Returns:
        The trained classifier object returned by nltk.
    """
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    return classifier
def printAcc(classifier, test_set):
    """Print the classifier's accuracy on ``test_set`` as a percentage.

    The accuracy is computed with ``nltk.classify.accuracy`` and written to
    standard output with two decimal places followed by ``%``.

    Args:
        classifier: Classifier to evaluate.
        test_set: Data passed unchanged to ``nltk.classify.accuracy``.
    """
    accuracy = nltk.classify.accuracy(classifier, test_set)
    # The parenthesised single-argument form prints the same text under the
    # Python 2 print statement and is also a valid Python 3 print() call.
    print("%.2f%%" % (accuracy * 100))
def predict(tweet, clf):
    """Classify ``tweet`` with ``clf`` and return a readable verdict.

    Args:
        tweet: Input passed to ``dr.getBigrams`` to obtain its bigrams.
        clf: Classifier exposing ``classify(features)``; its result,
            converted with ``str``, must be one of '0'..'4'.

    Returns:
        The verdict string for the predicted label, or None when
        ``dr.getBigrams`` returns None for the tweet.

    Raises:
        KeyError: If the predicted label is not one of '0'..'4'.
    """
    verdicts = {
        '0': "Definitely not sarcastic",
        '1': "Probably not sarcastic",
        '2': "Can't say",
        '3': "Probably sarcastic",
        '4': "Definitely sarcastic",
    }

    tweetBigrams = dr.getBigrams(tweet)
    # Fetched before the guard below so the sequence of datasetRetriever
    # calls is the same whether or not the tweet yields bigrams.
    bigramFeatures, _ = dr.getTrainingDataset()

    if tweetBigrams is None:
        return None

    features = extractFeatures(tweetBigrams, gramFeatures=bigramFeatures)
    label = clf.classify(features)
    return verdicts[str(label)]
# Turn one document (a collection of grams) into a presence-feature dict.
# NOTE: this module defines extractFeatures twice; this later definition is
# the one bound to the name once the module has finished importing.
def extractFeatures(doc, gramFeatures):
    """Report, for each feature gram, whether ``doc`` contains it.

    Args:
        doc: Iterable of hashable grams making up one document.
        gramFeatures: Iterable of grams to look for; the feature name is
            built from the first two items of each gram.

    Returns:
        A dict mapping ``'contains((<gram[0]>, <gram[1]>))'`` to True or
        False depending on whether that gram occurs in ``doc``.
    """
    seen = set(doc)
    template = 'contains(({0}, {1}))'
    return dict(
        (template.format(gram[0], gram[1]), gram in seen)
        for gram in gramFeatures
    )