-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclassificationPreprocessor.py
More file actions
115 lines (93 loc) · 3.42 KB
/
classificationPreprocessor.py
File metadata and controls
115 lines (93 loc) · 3.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
from tweetTemplate import *
from HTMLParser import HTMLParser
import datasetRetriever
import langid
import re
import string
import pickle
import sys
class HTMLStripper(HTMLParser):
    """HTML parser that discards markup and keeps only the text between tags.

    Feed markup with ``feed()`` and read the accumulated text back with
    ``get_data()``.
    """

    def __init__(self):
        # Run the base-class initializer instead of calling reset() alone:
        # it performs the reset itself and also sets up any extra parser
        # state that newer HTMLParser versions require before feed() works.
        # The explicit call form (rather than super()) also works where
        # HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        # Text fragments, in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Record one chunk of text reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return every text chunk collected so far, joined into one string."""
        return ''.join(self.fed)
def stripHtmlTags(html):
    """Return the text of ``html`` with the markup removed.

    The input is fed through a fresh HTMLStripper and the text that the
    stripper collected is returned.
    """
    stripper = HTMLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text
def preprocess(topic):
    """Filter and clean the raw tweet dataset for ``topic``.

    Loads ``../data/<topic>_tweets_dataset.tds`` through datasetRetriever,
    keeps the tweets that langid classifies as 'en' with confidence above
    0.99 and that processTweet accepts, and pickles each kept tweet to
    ``../data/<topic>_processed_dataset.tds``. Progress is printed every
    1000 tweets and a summary is printed at the end.

    topic -- dataset name used to build both file paths
    """
    dataDir = "../data/"
    print("Processing tweets, topic: " + topic)
    tweetSet = datasetRetriever.getTweetDataset(dataDir + topic + "_tweets_dataset.tds")
    # The context manager closes the output file even when classification
    # or pickling raises part-way through the dataset.
    with open(dataDir + topic + "_processed_dataset.tds", 'w') as outFile:
        for count, tweet in enumerate(tweetSet, 1):
            # Collapse runs of whitespace before language detection.
            tweet = ' '.join(tweet.split())
            lang, conf = langid.classify(tweet)
            if count % 1000 == 0:
                print("At {0}th tweet.".format(str(count)))
            if lang == 'en' and conf > 0.99:
                goodTweet, tweet = processTweet(tweet)
                if goodTweet:
                    # NOTE(review): processTweet already returns its text
                    # wrapped in tweetTemplate, so this wraps it a second
                    # time -- confirm tweetTemplate is meant to accept that.
                    pickle.dump(tweetTemplate(tweet), outFile)
    totalCount = len(tweetSet)
    # Reload the file just written to count how many tweets survived.
    tweetSet = datasetRetriever.getTweetDataset(dataDir + topic + "_processed_dataset.tds")
    print("Processing completete. \nTotal:" + str(totalCount) + " tweets. Usable: " + str(len(tweetSet)))
# A URL is an optional scheme or www./ftp. prefix, a dotted lowercase host,
# and an optional path or query. The path stops at the first whitespace and
# the lookahead requires the token to end there, so only the URL itself is
# replaced and the words that follow it are kept.
_URL_RE = re.compile(
    r"((https?|ftp)://|(www|ftp)\.)?[a-z0-9-]+(\.[a-z0-9-]+)+([/?]\S*)?(?!\S)")

# @mentions.
_REF_RE = re.compile(r"@\w+")

try:
    # UCS-4
    _EMOJI_RE = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
except re.error:
    # UCS-2
    _EMOJI_RE = re.compile(u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])')


def processTweet(tweet):
    """Normalise one tweet and decide whether it is worth keeping.

    Returns a ``(goodTweet, result)`` pair.

    Tweets containing "Yes!News" and tweets starting with "RT" are rejected
    immediately as ``(False, None)``. For every other tweet, URLs are
    replaced by ``URL``, non-ASCII characters and emoji are removed,
    @mentions are replaced by ``REF`` and HTML tags are stripped. The result
    is then ``tweetTemplate(<cleaned, whitespace-normalised text>)``, and
    ``goodTweet`` is True only when the cleaned text has at least 4 words
    and at least 3 words remain once minimizedTweet has removed the
    placeholder tokens.
    """
    # Drop tweets from Yes!News.. they use #noticas hashtags which adds noise to dataset
    if "Yes!News" in tweet:
        return False, None
    # Drop retweets. We might already have them
    if tweet.startswith("RT"):
        return False, None

    # Replace URLs
    tweet = _URL_RE.sub("URL", tweet)
    # Remove Non-ASCII
    tweet = ''.join(ch for ch in tweet if ord(ch) < 128)
    # Remove Unicode emojis. The substitution result must be assigned back,
    # otherwise this step has no effect. After the ASCII filter above it is
    # only a safety net.
    tweet = _EMOJI_RE.sub('', tweet)
    # Replace @refs
    tweet = _REF_RE.sub("REF", tweet)
    # Remove HTML tags
    tweet = stripHtmlTags(tweet)

    words = tweet.split()
    # Require enough words overall, and enough real words once the
    # URL/TAG/REF placeholders are removed.
    goodTweet = len(words) >= 4 and len(minimizedTweet(tweet).split()) >= 3
    return goodTweet, tweetTemplate(' '.join(words))
def minimizedTweet(tweet):
    """Return ``tweet`` with every URL, TAG and REF placeholder removed.

    The placeholders are removed one after another in that order, so a
    token that only appears once an earlier one is gone is removed as well.
    """
    for placeholder in ("URL", "TAG", "REF"):
        tweet = re.sub(placeholder, '', tweet)
    return tweet
def main():
    """Run preprocessing for the topic named by the single command-line flag.

    Exactly one argument is expected. A recognised flag starts
    preprocessing for its topic; anything else prints the usage text.
    """
    topicsByFlag = {
        '--london': "london",
        '--sarcasm': "sarcasm",
        '--sf': "sf",
    }
    args = sys.argv[1:]
    if len(args) == 1 and args[0] in topicsByFlag:
        preprocess(topicsByFlag[args[0]])
    else:
        printUsage()
def printUsage():
    """Print the command-line usage summary and the supported topic flags."""
    # The script name shown here is this file's actual name
    # (classificationPreprocessor.py), so the command can be run as printed.
    print("Usage: python classificationPreprocessor.py --<topic>\n"
          "Topics:\n--london: Process tweets from London\n"
          "--sarcasm: Process tweets containing Sarcasm indicator tags\n"
          "--sf: Process Global tweet corpus")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()