# coding: utf-8
"""This file is quite vague script (sorry for that!), just an interface for all normalizing techniques,
programmed in num and numword classes"""
__author__ = 'soshial'

def dictionary_check(word):
    """Checks the word and its variations for presence in the dictionary"""
    # the function should transform "The orthopaedics's co-ordination of aedicule is well-kno-wn." into "The orthopedics's coordination of edicule is well-known."
    # todo support unfolding of unambiguous abbreviations
    # todo SPELLING VARIANTS support on VADIM's side!
    # todo turn Latin words such as 'etc.', 'e.g.' etc. into a pronounceable form
    def spelling_variant_check(words):
        """Checks whether any of @words is in our substitution list"""
        for word_variant in words:
            word_variant = word_variant.lower()
            if word_variant in other_words:
                return principal_words[other_words[word_variant]].lower()
        return word
    word = unicode(word)
    if word.upper() in dictionary:
        return spelling_variant_check([word])
    clean_word = regex.sub("[^\w'.-]", "", word).upper()  # the word cleaned of everything except apostrophes, hyphens and dots
    # checking acronyms like "U.S.A."
    if len(word) > 2 and ".".join(list(word)) in dictionary: return ".".join(list(word))
    # checking abbreviations like "Mr., Mrs., Dr., ..."
    if len(word) < 5 and not word in dictionary and word + "." in dictionary: return word + "."
    # checking and splitting hyphenated words like "out-of-order"
    dehyphenized_set = set(clean_word.split("-"))
    if len(dehyphenized_set) == len(dehyphenized_set & dictionary): return regex.sub("-", " ", word)  # 'out-of-order' -> 'out of order'
    variants = {clean_word, regex.sub("-", "", word).upper()}  # the set of all variants of the word
    # generating all possible variants; not applicable, imho
    # for example, for the word "file" @variants would be: set(['FIL-E', 'FIL E', 'FI-LE', 'FI LE', 'FILE', 'F-ILE', 'F ILE'])
    # i = 1
    # for letter in clean_word:
    #     if i == len(clean_word): break
    #     variants.add(clean_word[:i] + ' ' + clean_word[i:])
    #     variants.add(clean_word[:i] + '-' + clean_word[i:])
    #     variants.add(clean_word[:i] + '\'' + clean_word[i:])
    #     i += 1
    entered = False  # @entered shows whether any variant was found in the @dictionary
    for var in variants & dictionary:  # some variant is in the dictionary
        entered = True
        if word.lower() == var.lower(): return word
    if not entered:  # no variants are in the dictionary
        return spelling_variant_check(variants)
    else:
        return var.lower()  # the loop was entered but the original word did not match, so we return one of the variants
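
# a minimal sketch of the intended behaviour, assuming the globals loaded by init_dic
# (the dictionary entries and equivalence maps below are hypothetical):
#   dictionary = {'OUT', 'OF', 'ORDER'}; other_words = {'colour': 46}; principal_words = {46: 'color'}
#   dictionary_check(u'out-of-order')  # -> u'out of order' (every hyphen-separated part is in the dictionary)
#   dictionary_check(u'colour')        # -> 'color' (spelling-variant substitution)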

# initializing the database
def init_dic(lang):
    global config
    import MySQLdb
    conn = MySQLdb.connect(host=config.get('db', 'host'), user=config.get('db', 'user'),
                           passwd=config.get('db', 'passwd'), db=config.get('db', 'db'))
    cursor = conn.cursor()
    principal_words = {}
    other_words = {}
    cursor.execute("SET NAMES 'utf8' COLLATE 'utf8_general_ci'")
    cursor.execute("SELECT word_id, word_spelling, principal FROM %s_word_equivalents WHERE lang = '%s' AND (relation = 1 OR principal > 0)" % (lang, lang))
    while True:
        row = cursor.fetchone()
        if row is None:
            break
        if row[2]: principal_words[int(row[0])] = row[1]  # {46: 'analyzer'}
        else: other_words[row[1]] = int(row[0])  # {'analyser': 46}
    # logger.info("Equivalent words imported: %d" % cursor.rowcount)
    dictionary = set()
    cursor.execute("SELECT word FROM %s_dict WHERE state = 0" % lang)
    while True:
        row = cursor.fetchone()
        if row is None:
            break
        dictionary.add(row[0])
    # logger.info("Dictionary entries imported: %d" % cursor.rowcount)
    cursor.close()
    conn.close()
    return dictionary, principal_words, other_words
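
# the [db] credentials (and the [lms] path used further below for Russian) come from
# normalization.cfg; a sketch of what that file presumably looks like, all values being placeholders:
#   [db]
#   host = localhost
#   user = norm
#   passwd = secret
#   db = normalization
#   [lms]
#   aot_path = /home/soshial/aot/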

def omit_brackets(words):
    counter = 0
    if words.count("(") != words.count(")") or words.count("[") != words.count("]") or words.count("{") != words.count("}"):
        return []  # all brackets should be paired
    while '(' in words and ')' in words and counter < 4:
        words = words[:words.index("(")] + words[words.index(")") + 1:]
        counter += 1
    while '[' in words and ']' in words and counter < 4:
        words = words[:words.index("[")] + words[words.index("]") + 1:]
        counter += 1
    while '{' in words and '}' in words and counter < 4:
        words = words[:words.index("{")] + words[words.index("}") + 1:]
        counter += 1
    if counter > 3: words = []  # too many bracketed groups - drop the sentence
    return words
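
# an illustration on a token list (bracket tokens come from the word tokenizer):
#   omit_brackets(['the', '(', 'old', ')', 'house'])  # -> ['the', 'house']
#   omit_brackets(['a', '('])                         # -> [] (unpaired bracket)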

def restore_apostrophes(words, sentence):
    i = 0  # token counter
    pointer, max_pointer = 0, 0  # point to the place where the substring is found
    import string
    for word in words:
        if word.startswith("'") and i > 0:
            pointer = string.find(sentence, words[i - 1] + word, max_pointer)
            if pointer != -1:
                max_pointer = max(max_pointer, pointer)
                words = words[:i - 1] + [words[i - 1] + word] + words[i + 1:]
                i -= 1  # we concatenated two neighbouring words, so @i has to stay the same
        i += 1
    return words
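
# an illustration: the tokenizer may split "don't" into ['don', "'t"]; since "don" + "'t"
# occurs verbatim in the source sentence, the pieces get glued back together:
#   restore_apostrophes(['don', "'t", 'panic'], u"don't panic")  # -> ["don't", 'panic']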

def split_twodigits(word):
    """Splits hyphenated two-digit numerals: 'twenty-one' -> 'twenty one'"""
    numbers = {'tens': ['twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety'],
               'units': ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']}
    word = unicode(word)
    if re.search(' ', word):  # composite word with spaces
        word_ret = ''
        for word_part in word.split(' '): word_ret += split_twodigits(word_part) + ' '
        return word_ret.strip()
    else:  # any simple word without spaces
        if not word == '' and re.search('-', word) and \
                word.split('-')[0].lower() in numbers['tens'] and word.split('-')[1].lower() in numbers['units']:
            return ' '.join(word.split('-'))  # hyphenated numeral
        else: return word  # anything else is left intact
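
# an illustration:
#   split_twodigits(u'twenty-one')            # -> u'twenty one'
#   split_twodigits(u'one hundred forty-two') # -> u'one hundred forty two'
#   split_twodigits(u're-use')                # -> u're-use' (not a numeral, left intact)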

def abridgement_fix(words):
    i = 0  # token counter
    days_of_week = {'Mon': 'Monday', 'Tue': 'Tuesday', 'Wed': 'Wednesday', 'Thu': 'Thursday', 'Fri': 'Friday', 'Sat': 'Saturday', 'Sun': 'Sunday'}
    months = {'Jan': 'January', 'Feb': 'February', 'Mar': 'March', 'Apr': 'April', 'Jun': 'June', 'Jul': 'July', 'Aug': 'August', 'Sep': 'September', 'Oct': 'October', 'Nov': 'November', 'Dec': 'December'}
    array_isdigit = lambda ar: (len(ar) == 1 and ar[0].isdigit()) or (len(ar) > 1 and (ar[0].isdigit() or array_isdigit(ar[1:])))
    for word in words:
        neighbourhood = words[max(0, i - 4):min(i + 4, len(words))]
        word = split_twodigits(word)
        if (word == "#" or word == u"№") and i + 1 < len(words) and words[i + 1].isdigit():
            words = words[:i] + [word + words[i + 1]] + words[i + 2:]  # merging #1/№3
        # fixing the time representation
        if word in ['p.m.', 'a.m.', 'P.M.', 'A.M.', 'GMT', 'UTC'] and words[i - 1] == "00" and words[i - 2] == ":" and words[i - 3].isdigit():
            words = words[:i - 1] + words[i:]  # 4:00 a.m. sounds like 4 a.m.
            i -= 1
        if word in ['p.m.', 'a.m.', 'P.M.', 'A.M.', 'GMT', 'UTC', 'PM', 'pm', 'AM', 'am'] and re.match("(\d{1,2})\.(\d{2})", words[i - 1]):
            matches = re.match("(\d{1,2})\.(\d{2})", words[i - 1])
            words = words[:i - 1] + [matches.group(1)] + [matches.group(2)] + words[i + 1:]
            i += 1
        # expanding months and days of the week
        if (word in days_of_week or word.endswith('.') and word[:-1] in days_of_week) and array_isdigit(neighbourhood):
            if word.endswith('.'): words[i] = days_of_week[word[:-1]]
            else: words[i] = days_of_week[word]
        if (word in months or word.endswith('.') and word[:-1] in months) and array_isdigit(neighbourhood):
            if word.endswith('.'): words[i] = months[word[:-1]]
            else: words[i] = months[word]
        i += 1
    return words
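
# an illustration (a digit somewhere in the 4-token neighbourhood is required for the expansion):
#   abridgement_fix(['Fri', ',', 'Dec.', '25'])      # -> ['Friday', ',', 'December', '25']
#   abridgement_fix(['at', '4', ':', '00', 'p.m.'])  # -> ['at', '4', ':', 'p.m.']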

def garbage_stats(words):
    digit_num, word_num = 0, 0
    for word in words:
        if re.search("\d", word):
            digit_num += 1
        elif re.search(re.compile("\w", re.UNICODE), word):
            word_num += 1
    if word_num == 0: return 1
    return float(digit_num) / word_num  # cast before dividing, otherwise integer division truncates to zero
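
# an illustration: the ratio of digit-bearing tokens to plain word tokens:
#   garbage_stats(['room', '101', 'is', 'closed'])  # -> 0.333... (one digit token, three word tokens)
#   garbage_stats(['42', ':', '17'])                # -> 1 (no word tokens at all)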

def typographics(sentence):
    sentence = re.sub(re.compile(u"[`’']", re.UNICODE), u"'", sentence)  # apostrophe variants -> plain apostrophe
    sentence = re.sub(re.compile(u"[‘„“«»”]", re.UNICODE), u'"', sentence)  # quotation-mark variants -> plain double quote
    sentence = re.sub(re.compile(u"[‒―−–‐]", re.UNICODE), u'-', sentence)  # figure dash, horizontal bar, minus sign, en dash and hyphen -> common hyphen-minus
    sentence = re.sub(re.compile(u"(\s?\.){3}", re.UNICODE), u'...', sentence)  # spaced dots -> ellipsis
    return sentence
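
# an illustration:
#   typographics(u'“Hello” – it’s here . . .')  # -> u'"Hello" - it's here...'
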
# processing script arguments
import sys
arguments = sys.argv
lang = arguments[1] # en|ru|fr...
path_in = arguments[2] # '/home/soshial/text-normalization/in/'
path_out = arguments[3] # '/home/soshial/text-normalization/out/'
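# usage sketch, assumed from the argument handling above:
#   python main.py en /home/soshial/text-normalization/in/ /home/soshial/text-normalization/out/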
do_logging = True
import ConfigParser
config = ConfigParser.RawConfigParser()
config.read('normalization.cfg')
import re,regex
# initializing logging
import logging
logger = logging.getLogger('norm')
hdlr = logging.FileHandler('./norm.log')
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.INFO)
# initializing linguistic components
# NB! nltk.download()  # on the first run, uncomment this line and download the punkt tokenizer models, which the program needs  # todo add to readme
from nltk.tokenize import *
import nltk.data as nltk_data
exec("import num_"+lang+" as num; numw = num.Num"+lang.title()+"(lang,logger)") # import num_ru as num; numw = num.NumRu(lang,logger)
sent_detector = nltk_data.load('tokenizers/punkt/english.pickle')
if lang == "en": dictionary,principal_words,other_words = init_dic(lang)
# working with files
import glob, os, codecs, time
#year = False
# main loop
while True:
    if glob.glob(os.path.join(path_in, '*.meta')) == []: time.sleep(1); print "sleep"  # if there are no files in the folder, sleep for a second
    if os.path.isfile(path_in + "die"): os.remove(path_in + "die"); quit()
    for fullpath_inmeta in glob.glob(os.path.join(path_in, '*.meta')):
        # fullpath_inmeta - the ./in/_____.meta file; filename_intxt - the corresponding input text file
        filename_intxt = fullpath_inmeta.split('/')[-1].split('.')[-2] + '.txt'
        print "processing", filename_intxt
        logger.info('processing ' + path_in + filename_intxt)
        file_intxt = codecs.open(path_in + filename_intxt, "r", "utf-8")
        text_in = typographics(file_intxt.read())  # @text_in holds the text to process
        if lang == "ru":  # just adding all ru files to the 'file.list' for AOT to process
            file_list = codecs.open(config.get('lms', 'aot_path') + "ru_file.list", 'w+', 'utf-8-sig')
            file_list.write(filename_intxt + "\n")
            file_list.close()
        file_outtxt = codecs.open(path_out + filename_intxt, 'w', 'utf-8-sig')  # output file as ./out/_____.txt
        sentences = sent_detector.tokenize(text_in)
        for sentence_in in sentences:
            if do_logging: print sentence_in
            sentence_out = ''; omit = False; i = 0; words = []
            try:
                if re.search(u"[/|]", sentence_in): omit = True
                # omitting the part of the phrase that looks like a play/transcript character name (MICHELE: ...)
                # has ':' AND the prefix is all-uppercase or titlecase AND the first character after ':' is uppercase
                if regex.search(":", sentence_in) and (sentence_in.split(':')[0].isupper() or sentence_in.split(':')[0].istitle()) and sentence_in.split(':')[1].strip()[0].isupper():
                    sentence_in = ':'.join(sentence_in.split(':')[1:])
                if regex.search(u"\p{L}{2,}[\.?!…]\p{L}{2,}", sentence_in):
                    omit = True
                    print "omitted because of the dot!"
                    logger.info('Dot inside the word problem with sentence:\n____' + sentence_in)
                    continue
                words = PunktWordTokenizer().tokenize(sentence_in)
                if words[-1].endswith('.'): words[-1] = words[-1][:-1]  # removing the dot at the end of the phrase
                words = omit_brackets(words)  # we don't need anything placed in brackets - it worsens the LM
                words = abridgement_fix(words)
                words = restore_apostrophes(words, sentence_in)  # the word tokenizer deliberately splits words with inner apostrophes - we try to fix this
                if len(words) == 0:
                    omit = True
                # garbage concentration
                if garbage_stats(words) > 1.0 / 3:  # floating-point threshold; a literal 1/3 would be integer zero
                    omit = True
                    logger.info('Garbage problem with sentence:\n____' + sentence_in)
                    continue
            except Exception, ExText:
                logger.info('Preprocessing exception: ' + unicode(ExText))
            for word in words:
                word = word.replace(u"|", u"").replace(u"*", u"")  # stripping pipes and asterisks
                if re.search("\d", word):  # if no digits are present, the token is passed through unchanged
                    # getting the neighbouring words
                    d = 1
                    neighbours_left = words[max(0, i - d):i]
                    neighbours_right = words[i + 1:min(i + d, len(words)) + 1]
                    if i - 1 >= 0 and words[i - 1] == "-": word = "-" + word; neighbours_left = words[max(0, i - d - 1):i - 1]  # negative numbers
                    try:
                        word = numw.check_and_convert_into_number(word, {"left": neighbours_left, "right": neighbours_right})
                        if lang == 'en': word = split_twodigits(word)
                        # if lang == 'ru': word = u"* " + word + u" *"
                    except StandardError, error_message:
                        omit = True
                        logger.info('Problem with : ' + unicode(error_message) + ' // word: ' + unicode(word))
                        continue
                if omit: break  # if something went wrong during the conversion, we omit the whole sentence
                if re.search(re.compile("\w", re.UNICODE), unicode(word)):
                    if lang == 'en': word = dictionary_check(word)
                    # if lang == 'ru' and word in {u'г.', u'Г.', u'гг.', u'ГГ.'}: year = True
                    sentence_out += word.upper() + ' '
                i += 1
            if not omit:
                sentence_out = regex.sub(u"[^\p{L}. *'-]", "", sentence_out)
                # if lang == 'ru' and year == True:
                #     sentence_out = regex.sub(u" г\.| Г\.", u" ГОДУ|ГОД|ГОДА|ГОДОМ|ГОДЕ", sentence_out)
                #     sentence_out = regex.sub(u" гг\.| ГГ\.", u" ГОДЫ|ГОДОВ|ЛЕТ|ГОДАМ|ГОДАХ|ГОДАМИ", sentence_out)
                #     year = False
                if do_logging: print sentence_out
                file_outtxt.write(sentence_out.strip() + "\n")
            else:
                if do_logging: print " missed"
            if do_logging: print
        file_outtxt.close()
        # quit()  # debug: stop after the first file (note that it skips the .meta bookkeeping below)
        os.remove(fullpath_inmeta)  # removing ./in/_____.meta so the loop doesn't process it again
        # os.remove(path_in + filename_intxt)  # removing the original ./in/_____.txt
        file_meta = codecs.open(path_out + fullpath_inmeta.split('/')[-1], 'w', 'utf-8')  # creating ./out/_____.meta
        file_meta.close()

# canonicalization:
# "co-operation" → "cooperation", "valour" → "valor", "should've" → "should have" / redis?
#
# NER
# tagger = nltk.data.load('chunkers/maxent_ne_chunker/english_ace.pickle')
# tagger.parse([('Guido', 'NNP'), ('lives', 'NNS'), ('in', 'IN'), ('Seattle', 'NNP')] )
# gives: Tree('S', [Tree('NE', [('Guido', 'NNP')]), ('lives', 'NNS'), ('in', 'IN'), Tree('NE', [('Seattle', 'NNP')])])