-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtext_clean_utils.py
More file actions
393 lines (321 loc) · 13.8 KB
/
text_clean_utils.py
File metadata and controls
393 lines (321 loc) · 13.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
# -*- coding: utf-8 -*-
"""Set of helper functions for text_clean.py"""
# Allow compatibility between Python 2.7 and 3.x
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from io import open
#####
import itertools # Used to recombine n_groups into one message
import pickle # Packs and unpacks pre-stored Python objects
import re # Used to parse texts
import sys # # Checks user's Python version
import nltk # Supports a variety of NLP functions
# Symbol characters that get stripped out of messages
SYMBOLS = set('#[(]}_@+)/%`>\\~^-&$=*<{')
# Punctuation characters that are preserved in messages
PUNCTUATION = set('!",\':.;?|')
# Single letters accepted as words on their own
VALID_ONE_LETTERS = set('aik')
# Pickled lookup objects are stored separately for Python 3.x and Python 2.7.
# Order inside each tuple: English words, valid two letter words,
# valid consonant digraphs, valid vowel digraphs.
_PICKLE_PATHS = {
    3: (
        'pkl_objects/3_x/english_words.pkl',
        'pkl_objects/3_x/valid_two_letters.pkl',
        'pkl_objects/3_x/valid_cons_digraphs.pkl',
        'pkl_objects/3_x/valid_vowel_digraphs.pkl',
    ),
    2: (
        'pkl_objects/2_7/english_words.pkl',
        'pkl_objects/2_7/valid_two_letters.pkl',
        'pkl_objects/2_7/valid_cons_digraphs.pkl',
        'pkl_objects/2_7/valid_vowel_digraphs.pkl',
    ),
}
# Select the paths matching the running interpreter's major version
_pickle_paths = _PICKLE_PATHS.get(sys.version_info.major)
# Any other major version loads nothing and leaves the four names undefined
if _pickle_paths is not None:
    _loaded_objects = []
    # Paths are relative, so they resolve against the current working directory
    for _pickle_path in _pickle_paths:
        with open(_pickle_path, 'rb') as handle:
            _loaded_objects.append(pickle.load(handle))
    (ENGLISH_WORDS, VALID_TWO_LETTERS,
     VALID_CONS_DIGRAPHS, VALID_VOWEL_DIGRAPHS) = _loaded_objects
def str_to_bool(option):
    """Translate the option strings 'True' and 'False' into booleans.

    Raises ValueError for any other value.
    """
    # Walk the accepted spellings and hand back the matching boolean
    for text, value in (('True', True), ('False', False)):
        if option == text:
            return value
    # Nothing matched, so the option is not usable
    raise ValueError("Options can only be True or False")
def _read_input(input_file):
    """Load the conversation file and tokenize it message by message.

    The file is read as UTF-8 text, cut into messages at every '|' divider,
    and each stripped message is tokenized with nltk.word_tokenize.
    Returns a list holding one token list per message.
    """
    # Pull the whole conversation into memory as one string
    with open(input_file, 'r', encoding='utf-8') as source:
        raw_text = source.read()
    # Tokenize each '|'-separated message after trimming its whitespace
    tokenized_messages = []
    for raw_message in raw_text.split('|'):
        tokenized_messages.append(nltk.word_tokenize(raw_message.strip()))
    return tokenized_messages
def _join_punctuation(cleaned_message):
"""Properly joins punctuation marks at the ends of words"""
# Allow iteration with over the message list using "next" built-in
cleaned_message = iter(cleaned_message)
# Take the first token in the message list
previous_token = next(cleaned_message)
# Loop over each token in message list, starting from second token
for current_token in cleaned_message:
# Concatenate token with previous if current token has punctuation mark
if re.search(r'[{}]'.format(".,;?!'"), current_token):
previous_token += current_token
# Otherwise, return previus token unchanged
else:
yield previous_token
# Move to next token and repeat
previous_token = current_token
# Return final token unchanged since no punctuation mark can follow
yield previous_token
def _is_a_long_string(token):
"""Checks if token is greater than length 13"""
# Extract characters from string, ignore punctuation
res = re.search(r'\w+', token)
if res:
if len(res.group()) > 13:
return True
return False
def _is_number(token):
"""Checks if token is an integer or float"""
# Check for standard integer (4) or float (4.90)
try:
float(token)
return True
except ValueError:
# Check for comma-separated integers or floats (4,900)
if re.match(r'\d+,\d+', token):
return True
# Check for numbers that end in punctuation
if re.match(r'\d+[{}]'.format(".,;?!'"), token):
return True
return False
def _is_time(token):
"""Special case check for HH:MM time tokens
Used because time token fails alpha and numeric check
E.g. "4:00".isalpha() and "4:00".isnumeric() are False
"""
if re.match(r'^([0-9]|0[0-9]|1[0-9]|2[0-3]):[0-5][0-9]$', token):
return True
return False
def _is_ordinal(token):
"""Special case check for ordinals
Used because ordinals fail mixed type check
E.g. "5th" and "101st" are considered to have mixed types (int and string)
"""
if re.match(r'\d+(st|nd|rd|th)$', token):
return True
return False
def _has_apostrophe(token):
"""Special case check for tokens containing apostrophe
Used because possessives and contractions fail alpha check
E.g "brother's".isalpha() or "they're".isalpha() are False
"""
if "'" in token:
return True
return False
def _is_mixed_type(token):
    """Reports whether token mixes letters, digits, symbols, or punctuation

    Currency ($5.00), percentage (10%), time (4:00), ordinal (5th), and
    apostrophe (they're) tokens are special cases and never count as mixed.
    """
    # Special cases are evaluated in order; the first one that applies wins
    is_special_case = (
        (token[0] == '$' and _is_number(token[1:]))
        or (token[-1] == '%' and _is_number(token[:-1]))
        or _is_time(token)
        or _is_ordinal(token)
        or _has_apostrophe(token)
    )
    if is_special_case:
        return False
    # A normal token is either purely alphabetic or a number; anything else
    # is mixed, e.g. "Math123", "$John$", "Roger_Jones"
    return not (token.isalpha() or _is_number(token))
def _has_repeating_chars(token):
"""Checks if token contains repeating characters, e.g. 'Repppppeat'"""
# Extract characters from string, ignore punctuation
res = re.search(r'\w+', token)
if res:
# Normalize case so mixed-case repeating characters are caught
token = res.group().lower()
# Check if three or more characters are found repeating
if any(token[i] == token[i - 1] and token[i] == token[i + 1]
for i in range(1, len(token) - 1)):
# Remove if repetition is found
return True
# Otherwise, keep the token
return False
def _remove_n_groups(message, length):
"""Removes groups n words long that repeat"""
# Create list of indexes to used to partition message
indexes = [i for i in range(len(message) + 1) if i % length == 0]
# Partition message into groups n characters long
n_groups = [message[i: i + length] for i in indexes]
# Start with an empty list and build up
keep_groups = []
# Check each group except for the last
for i, group in enumerate(n_groups[:-1]):
# Ignore group if it repeats
if n_groups[i] == n_groups[i + 1]:
continue
# Otherwise, keep the group
else:
keep_groups.append(group)
# Keep the last group
keep_groups.append(n_groups[-1])
# Chain groups together into one list
keep_groups = list(itertools.chain.from_iterable(keep_groups))
# Return groups that did not repeat
return keep_groups
def _is_invalid_pair(pair):
    """Reports whether a pair of characters is an invalid digraph

    A pair that mixes the two character classes (one from 'aeiouy', one
    from outside it) is always valid. A pair made only of characters from
    one class is invalid unless it appears in VALID_CONS_DIGRAPHS or
    VALID_VOWEL_DIGRAPHS.
    """
    # Matches strings made entirely of non-vowels or entirely of vowels
    single_class = re.match(r'[^aeiouy]+$|[aeiouy]+$', pair)
    # Mixed pairs never count as invalid
    if not single_class:
        return False
    # Single-class pairs must be whitelisted in one of the digraph sets
    return (pair not in VALID_CONS_DIGRAPHS
            and pair not in VALID_VOWEL_DIGRAPHS)
def _is_gibberish_word(word):
    """Decides whether a lowercase, purely alphabetic word is gibberish"""
    # The explicit whitelists decide one and two letter words. They are
    # consulted before the all-consonant rule so that whitelisted entries
    # without a vowel (e.g. 'k' in VALID_ONE_LETTERS) are not erased.
    if len(word) == 1:
        return word not in VALID_ONE_LETTERS
    if len(word) == 2:
        return word not in VALID_TWO_LETTERS
    # Longer words without any vowel (y counts as a vowel) are gibberish
    if re.match(r'[^aeiouy]+$', word):
        return True
    # Known English words are always kept
    if word in ENGLISH_WORDS:
        return False
    # Break word into overlapping pairs of characters
    # E.g. 'test' >> ['te', 'es', 'st']
    pairs = [word[j: j + 2] for j in range(len(word) - 1)]
    # One invalid pair is enough to reject the whole word
    return any(_is_invalid_pair(pair) for pair in pairs)


def _remove_gibberish(cleaned_message):
    """Removes gibberish language from a message

    Takes a list of tokens and returns a new list without the alphabetic
    tokens judged to be gibberish; tokens that are not purely alphabetic
    (punctuation, numbers, ...) are kept as they are. The input list is
    not modified.

    Returns the empty string '' instead of a list when nothing but
    punctuation and symbol characters is left.
    """
    final_message = []
    for token in cleaned_message:
        # Blank tokens carry no content and are dropped
        if token == '':
            continue
        # Compare case-insensitively, but keep the original spelling
        lowered = token.lower()
        if lowered.isalpha() and _is_gibberish_word(lowered):
            continue
        final_message.append(token)
    # Return empty if all that is left is random punctuation and symbols
    if all(char in PUNCTUATION or char in SYMBOLS
           for char in ''.join(final_message)):
        return ''
    # Return tokens that were kept
    return final_message
def _clean_message(message):
    """Cleans an individual message

    Takes the list of tokens of one message and filters it token by token:
    punctuation is kept, immediately repeated tokens are collapsed, math
    symbols between two numbers are kept, and mixed type tokens, overly
    long strings, and tokens with repeating characters are dropped.
    Afterwards repeated groups of four, three, and two tokens are removed
    and the result is passed through the gibberish filter.

    Returns whatever _remove_gibberish returns for the filtered tokens.
    """
    # Each symbol needs its own entry: writing '/' '=' without a comma
    # would silently concatenate them into the single string '/='
    math_symbols = {'+', '-', '*', '/', '='}
    # Iteratively construct new message from scratch
    cleaned_message = []
    # Index of the last token, computed once instead of once per token
    last_index = len(message) - 1
    for i, token in enumerate(message):
        # Keep all punctuation tokens
        if all(char in PUNCTUATION for char in token):
            cleaned_message.append(token)
            continue
        # Checks that look at the following token stop before the last one
        if i < last_index:
            # Collapse runs of the same token into a single instance
            # E.g. "John John John" >>> "John"
            if token == message[i + 1]:
                continue
            # Keep a math symbol if the tokens on both sides are numbers.
            # Requires i > 0 because message[-1] would wrap around to the
            # last token instead of referring to a left-hand neighbour.
            if (i > 0 and token in math_symbols
                    and _is_number(message[i - 1])
                    and _is_number(message[i + 1])):
                cleaned_message.append(token)
                continue
        # Check and ignore mixed type tokens, e.g. John23, max$$$, @nne
        # Currencies ($5.00) and percentages (10%) will be kept
        if _is_mixed_type(token):
            continue
        # Ignore strings with more than 13 characters
        if _is_a_long_string(token):
            continue
        # Check and ignore tokens with repeating characters
        # E.g. "aaaaaahhhhh", "boooom", "rrrrrrrogerrrrrr"
        if _has_repeating_chars(token):
            continue
        # If token isn't ignored by this point, keep it in the new message
        cleaned_message.append(token)
    # Successively remove quad-groups, tri-groups, and bi-groups; a message
    # needs at least two full groups before a repeat is possible
    if len(cleaned_message) >= 8:
        cleaned_message = _remove_n_groups(cleaned_message, 4)  # Quad-groups
    if len(cleaned_message) >= 6:
        cleaned_message = _remove_n_groups(cleaned_message, 3)  # Tri-groups
    if len(cleaned_message) >= 4:
        cleaned_message = _remove_n_groups(cleaned_message, 2)  # Bi-groups
    # Remove gibberish tokens and return the tokens that were kept
    return _remove_gibberish(cleaned_message)
def clean_text(current_input):
    """Reads messages file, splits it into individual messages, cleans each one,
    and recombines them into string of consecutive cleaned messages

    current_input is the path of the messages file. Returns one string in
    which the cleaned messages are joined by ' | '. A message that is empty,
    or from which cleaning removed everything, contributes an empty string.
    """
    # Fetch messages from file
    messages = _read_input(current_input)
    # Iteratively add new messages as they are generated
    cleaned_messages = []
    for message in messages:
        # Only messages with content are sent through the cleaner
        cleaned_message = _clean_message(message) if message else ''
        if cleaned_message:
            cleaned_messages.append(
                ' '.join(_join_punctuation(cleaned_message)))
        else:
            # Nothing left to join. Skipping the join avoids asking the
            # punctuation joiner for the first token of an empty message,
            # which fails with RuntimeError on Python 3.7+.
            cleaned_messages.append('')
    # Join each message with a pipe divider for final output
    return ' | '.join(cleaned_messages)