normalize_tweets.py
# coding: utf-8
# Normalize (spell-correct) tweets: each tweet is cleaned and tokenized, tokens
# absent from a reference dictionary are flagged as misspelled, correction
# candidates are generated, and the candidate whose embedding best matches the
# tweet's context (via context2vec) is kept.
# Note: written for Python 2 (it relies on unicode()/str.encode()).
import numpy as np
import sys
import re
import nltk
from context2vec.common.model_reader import ModelReader
regex_str = [
    r'<[^>]+>',                          # HTML tags
    r'(?:@[\w_]+)',                      # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",    # hash-tags
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',        # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",         # words with - and '
    r'(?:[\w_]+)',                       # other words
    r'(?:\S)'                            # anything else
]
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
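# Illustrative (untested) example of the tokenizer on a toy tweet:
#   tokenize("@user check #nlp at 3.5!")
#   -> ['@user', 'check', '#nlp', 'at', '3.5', '!']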
def remove_emoji(tweet):
    # Drop characters outside the Basic Multilingual Plane (emoji, etc.).
    regx = re.compile(u"[^\U00000000-\U0000d7ff\U0000e000-\U0000ffff]", flags=re.UNICODE)
    return regx.sub(u'', unicode(tweet, 'utf-8')).encode('utf8')

def tokenize(tweet):
    return tokens_re.findall(tweet)

def remove_tweet_RT(tweet):
    # Drop the retweet prefix: rewrite "RT @" as "@" and remove the first "@...:" span.
    cleaned = re.sub('RT @', '@', tweet, count=1)
    return re.sub(r'@.*?:', '', cleaned, count=1)

def remove_hex(tweet):
    # Strip any remaining non-ASCII bytes.
    return re.sub(r'[^\x00-\x7f]', r'', tweet)

def remove_url(tweet):
    return re.sub(r'http\S+', '', tweet)
def preprocess(tweet):
    tweet = remove_url(tweet)
    tweet = remove_tweet_RT(tweet)
    tweet = remove_emoji(tweet)
    tweet = remove_hex(tweet)
    tokens = tokenize(tweet)
    return tokens
def get_misspelled_words(tokens):
    # A token is considered misspelled if neither it nor its lowercased form
    # appears in the global dictionary `dico`.
    misspelled_words = []
    for i, word in enumerate(tokens):
        if (word.lower() not in dico) and (word not in dico):
            misspelled_words.append((i, word))
    return misspelled_words
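# Illustrative (untested) example, assuming 'lke' is not in the dictionary
# while 'I' and 'dogs' are:
#   get_misspelled_words(['I', 'lke', 'dogs']) -> [(1, 'lke')]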
def levenshtein(word1, word2):
    # Standard dynamic-programming edit distance (insertions, deletions, substitutions).
    l1 = len(word1)
    l2 = len(word2)
    dist = [[0 for x in range(l2 + 1)] for x in range(l1 + 1)]
    for i in range(l1 + 1):
        for j in range(l2 + 1):
            if i == 0:
                dist[i][j] = j
            elif j == 0:
                dist[i][j] = i
            elif word1[i - 1] == word2[j - 1]:
                dist[i][j] = dist[i - 1][j - 1]
            else:
                dist[i][j] = 1 + min(dist[i][j - 1],
                                     dist[i - 1][j],
                                     dist[i - 1][j - 1])
    return dist[l1][l2]
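# Quick sanity checks (not executed here):
#   levenshtein("helo", "hello") == 1      (one insertion)
#   levenshtein("kitten", "sitting") == 3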
def best_proposition(tokens, target_pos, propositions):
    # Score each candidate by the dot product between its target embedding and
    # the normalized context2vec vector of the context around `target_pos`
    # (shifted so scores are non-negative); return the best-scoring candidate.
    best_sim = None
    best_answer = None
    context_v = None
    for proposition in propositions:
        if len(tokens) <= 1:
            raise Exception("Can't find context for target word.")
        if proposition in word2index:
            target_v = w[word2index[proposition]]
            context_v = model.context2vec(tokens, target_pos)
            context_v = context_v / np.sqrt((context_v * context_v).sum())
            sim = (np.dot(target_v.tolist(), context_v.tolist()) + 1) / 2
        else:
            # Candidate unknown to the context2vec vocabulary: lowest possible score.
            sim = 0
        if best_sim is None or sim > best_sim:
            best_sim = sim
            best_answer = proposition
    return best_answer
def find_closest(word):
    # Candidate generation used when fast=True: every dictionary word within
    # Levenshtein distance 2 of `word` (single-character tokens are skipped).
    res = []
    if len(word) < 2:
        return res
    for other in dico:
        if levenshtein(word, other) < 3:
            res.append(other.encode('utf8'))
    return res
def damerau_levenshtein(word):
    # Generate every string at Damerau-Levenshtein distance 1 from `word`
    # (deletions, adjacent transpositions, substitutions, insertions) and keep
    # those that are dictionary words.
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    words = set(deletes + transposes + replaces + inserts)
    return [cand for cand in words if cand in dico]
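# Illustrative (untested) example: candidates for "teh" should include "the"
# (transposition) and "ten" (substitution), since both appear in the Brown corpus.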
def correct_tokens(tokens, fast=True):
    # Replace each misspelled token in place by its best-scoring correction.
    misspelled_words = get_misspelled_words(tokens)
    for (target_pos, word) in misspelled_words:
        if (len(word) == 1) and (not word.isalpha()):
            # Punctuation and other single non-letter tokens are left untouched.
            continue
        if fast:
            propositions = find_closest(word)
        else:
            propositions = damerau_levenshtein(word)
        if (len(propositions) > 0) and (len(tokens) > 1):
            correction = best_proposition(tokens, target_pos, propositions + [word])
            tokens[target_pos] = correction
    return tokens
def correct_tweet(tweet, fast=True):
    tokens = preprocess(tweet)
    return ' '.join(correct_tokens(tokens, fast))

def read_content(path):
    with open(path) as f:
        content = f.readlines()
    return [x.strip() for x in content]
def correct_and_save_tweets(dictionary, c2v_model_path, input_path, output_path, fast=True):
    # The helper functions above read dico, w, word2index, index2word and model
    # as module-level globals, so bind them here before correcting anything.
    global dico, w, word2index, index2word, model
    dico = dictionary
    model_reader = ModelReader(c2v_model_path)
    w = model_reader.w
    word2index = model_reader.word2index
    index2word = model_reader.index2word
    model = model_reader.model
    tweets = read_content(input_path)
    tweets = tweets[:10]  # only the first 10 tweets are processed here
    with open(output_path, 'w') as f:
        for tweet in tweets:
            f.write(correct_tweet(tweet, fast) + '\n')
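# Illustrative (untested) call with hypothetical paths, assuming a Brown-corpus
# dictionary built as in the __main__ block below:
#   correct_and_save_tweets(set(brown.words()), 'models/context2vec.params',
#                           'data/tweets.txt', 'data/tweets_normalized.txt', fast=True)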
if __name__ == '__main__':
    # Usage: python normalize_tweets.py <c2v_model_path> <input_path> <output_path> <fast: true|false>
    # Load corpus to create the dictionary
    print("Loading brown corpus...")
    nltk.download('brown')
    from nltk.corpus import brown
    dico = set(brown.words())
    print("Brown corpus loaded!")
    # Load context2vec model
    print("Loading context2vec Model...")
    c2v_model_path = sys.argv[1]
    model_reader = ModelReader(c2v_model_path)
    w = model_reader.w
    word2index = model_reader.word2index
    index2word = model_reader.index2word
    model = model_reader.model
    print("Context2vec Model loaded!")
    # Load tweets
    print("Loading tweets...")
    input_path = sys.argv[2]
    tweets = read_content(input_path)
    tweets = tweets[:1000]  # cap at the first 1000 tweets
    print("Tweets loaded!")
    # 'true' selects the fast candidate generator (find_closest);
    # anything else selects damerau_levenshtein.
    fast = sys.argv[4].lower() == 'true'
    # Compute and save normalized tweets
    print("Computing normalized tweets...")
    output_path = sys.argv[3]
    with open(output_path, 'w') as f:
        for tweet in tweets:
            f.write(correct_tweet(tweet, fast) + '\n')
    print("Normalized tweets computed!")