# Perry/generalClassifier.py (CptFoobar/Perry repository, master branch)
import classificationPreprocessor as cp
import datasetRetriever as dr
import langid
import nltk
import sklearn
from functools import partial
from nltk.corpus import words
from sklearn.naive_bayes import GaussianNB

def trainGeneralClassifier():
    """Build, evaluate and return the general bigram classifier.

    Steps, in order:
      1. Initialise and prepare the datasets through ``datasetRetriever``.
      2. Turn the labelled training and test bigrams into feature sets
         with ``extractFeatures`` bound to the training bigram features.
      3. Train on the training set plus the leading part of the test set.
      4. Print the accuracy measured on the rest of the test set.

    Returns the classifier produced by ``trainNBClassifier``.
    """
    # NOTE(review): the meaning of the 200 argument is defined in
    # datasetRetriever and is not visible from this module -- confirm.
    dr.init_all(200)
    dr.prepareDatasets()

    gramFeatures, labelledTrain = dr.getTrainingDataset()
    toFeatures = partial(extractFeatures, gramFeatures=gramFeatures)
    trainFeatures = nltk.classify.apply_features(toFeatures, labelledTrain)

    labelledHeldOut = dr.getTestDataset()
    heldOutFeatures = nltk.classify.apply_features(toFeatures, labelledHeldOut)

    # The first 53.2% of the test set is folded into the training data;
    # only the remainder is used to measure accuracy.
    cut = int(len(heldOutFeatures) * 0.532)
    classifier = trainNBClassifier(trainFeatures + heldOutFeatures[:cut])
    printAcc(classifier, heldOutFeatures[cut:])
    return classifier

# Build the presence/absence feature dict for one document's bigrams.
# NOTE: an identical definition of this function appears again further down
# in this module; the later one is the binding that survives import.
def extractFeatures(doc, gramFeatures):
    """Map every feature gram to whether it occurs in ``doc``.

    doc          -- iterable of hashable grams found in the document.
    gramFeatures -- iterable of grams; only items 0 and 1 of each gram are
                    used to build the key ``'contains((g0, g1))'``.

    Returns a dict from those keys to booleans.
    """
    present = set(doc)
    return {
        'contains(({0}, {1}))'.format(gram[0], gram[1]): (gram in present)
        for gram in gramFeatures
    }


# Naive Bayes classifier, nltk implementation.
def trainNBClassifier(training_set):
    """Train and return an ``nltk.NaiveBayesClassifier`` on ``training_set``.

    ``training_set`` is handed to ``nltk.NaiveBayesClassifier.train``
    unchanged.

    NOTE(review): the previous comment here said "using manual IDF"; no IDF
    weighting is applied inside this function -- confirm whether it happens
    upstream.
    """
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    return classifier


def printAcc(classifier, test_set):
    """Print the classifier's accuracy on ``test_set`` as a percentage.

    The accuracy comes from ``nltk.classify.accuracy`` and is printed with
    two decimals followed by a percent sign, e.g. ``87.50%``.
    """
    accuracy = nltk.classify.accuracy(classifier, test_set)
    # A single parenthesised argument prints identically under the Python 2
    # print statement and the Python 3 print() function, so this line no
    # longer makes the module a SyntaxError on Python 3.
    print("%.2f%%" % (accuracy * 100))


def predict(tweet, clf):
    """Classify ``tweet`` with ``clf`` and return a readable verdict.

    The tweet's bigrams come from ``dr.getBigrams``; when that returns
    ``None`` this function returns ``None`` as well. Otherwise the bigrams
    are converted to features against the training bigram features and the
    label returned by ``clf.classify`` is stringified and looked up in the
    verdict table below (a label outside '0'..'4' raises ``KeyError``).
    """
    verdicts = {
        '0': "Definitely not sarcastic",
        '1': "Probably not sarcastic",
        '2': "Can't say",
        '3': "Probably sarcastic",
        '4': "Definitely sarcastic",
    }

    grams = dr.getBigrams(tweet)
    # Only the feature list is needed here; the labelled bigrams are ignored.
    knownGrams, _ = dr.getTrainingDataset()
    if grams is None:
        return None

    label = clf.classify(extractFeatures(grams, gramFeatures=knownGrams))
    return verdicts[str(label)]


# Build the presence/absence feature dict for one document's bigrams.
# NOTE: this repeats an identical definition from earlier in the module;
# being the later one, it is the binding in effect after import.
def extractFeatures(doc, gramFeatures):
    """Report, for each feature gram, whether ``doc`` contains it.

    doc          -- iterable of hashable grams found in the document.
    gramFeatures -- iterable of grams; items 0 and 1 of each gram form the
                    key ``'contains((g0, g1))'``.

    Returns a dict mapping each key to True or False.
    """
    keyTemplate = 'contains(({0}, {1}))'
    seen = set(doc)
    result = {}
    for candidate in gramFeatures:
        label = keyTemplate.format(candidate[0], candidate[1])
        result[label] = candidate in seen
    return result