Text-Cleaner/text_clean_utils.py at master · shellshock1911/Text-Cleaner · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
# -*- coding: utf-8 -*-

"""Set of helper functions for text_clean.py"""

# Allow compatibility between Python 2.7 and 3.x
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from io import open
#####

import itertools # Used to recombine n_groups into one message
import pickle # Packs and unpacks pre-stored Python objects
import re # Used to parse texts
import sys # # Checks user's Python version

import nltk # Supports a variety of NLP functions

# Symbol characters the cleaner treats as removable noise
# (each character of the string becomes one member of the set)
SYMBOLS = set('#[(]}_@+)/%`>\\~^-&$=*<{')

# Punctuation characters the cleaner keeps
PUNCTUATION = set('!",\':.;?|')

# Letters accepted as complete one-letter words
VALID_ONE_LETTERS = set('aik')

def _load_pickle(name):

    """Loads the pre-stored object ``pkl_objects/<version>/<name>.pkl``.

    ``<version>`` is ``2_7`` when running under Python 2 and ``3_x``
    otherwise. The path is relative to the current working directory, so the
    module must be imported from the directory that contains ``pkl_objects``.

    Raises whatever ``open`` / ``pickle.load`` raise when the file is missing
    or unreadable.
    """

    version_dir = '2_7' if sys.version_info.major == 2 else '3_x'
    path = 'pkl_objects/{}/{}.pkl'.format(version_dir, name)
    with open(path, 'rb') as handle:
        # NOTE: unpickling can execute arbitrary code; these files must only
        # ever come from the project itself, never from user input.
        return pickle.load(handle)

# Collection of valid English words
ENGLISH_WORDS = _load_pickle('english_words')

# Collection of valid two letter words
VALID_TWO_LETTERS = _load_pickle('valid_two_letters')

# Collection of valid consonant digraphs
VALID_CONS_DIGRAPHS = _load_pickle('valid_cons_digraphs')

# Collection of valid vowel digraphs
VALID_VOWEL_DIGRAPHS = _load_pickle('valid_vowel_digraphs')

def str_to_bool(option):

    """Maps the option strings 'True' / 'False' onto real booleans.

    Raises ValueError for any other value.
    """

    # Reject everything that is not one of the two accepted spellings
    if option not in ('True', 'False'):
        raise ValueError("Options can only be True or False")
    # Only the two accepted strings reach this point
    return option == 'True'

def _read_input(input_file):

    """Loads the conversation file and tokenizes each message in it.

    The file is read as UTF-8 in one go, split on the '|' divider, and every
    stripped piece is passed through nltk.word_tokenize. Returns a list with
    one tokenizer result per message, in file order.
    """

    with open(input_file, 'r', encoding='utf-8') as source:
        raw_text = source.read()

    tokenized = []
    for chunk in raw_text.split('|'):
        # Surrounding whitespace is dropped before tokenizing
        tokenized.append(nltk.word_tokenize(chunk.strip()))

    return tokenized

def _join_punctuation(cleaned_message):

    """Properly joins punctuation marks at the ends of words"""

    # Allow iteration with over the message list using "next" built-in
    cleaned_message = iter(cleaned_message)
    # Take the first token in the message list
    previous_token = next(cleaned_message)
    # Loop over each token in message list, starting from second token
    for current_token in cleaned_message:
        # Concatenate token with previous if current token has punctuation mark
        if re.search(r'[{}]'.format(".,;?!'"), current_token):
            previous_token += current_token
        # Otherwise, return previus token unchanged
        else:
            yield previous_token
            # Move to next token and repeat
            previous_token = current_token
    # Return final token unchanged since no punctuation mark can follow
    yield previous_token

def _is_a_long_string(token):

    """Checks if token is greater than length 13"""

    # Extract characters from string, ignore punctuation
    res = re.search(r'\w+', token)
    if res:
        if len(res.group()) > 13:
            return True
    return False

def _is_number(token):

    """Checks if token is an integer or float"""

    # Check for standard integer (4) or float (4.90)
    try:
        float(token)
        return True
    except ValueError:
        # Check for comma-separated integers or floats (4,900)
        if re.match(r'\d+,\d+', token):
            return True
        # Check for numbers that end in punctuation
        if re.match(r'\d+[{}]'.format(".,;?!'"), token):
            return True
        return False

def _is_time(token):

    """Special case check for HH:MM time tokens
    Used because time token fails alpha and numeric check
    E.g. "4:00".isalpha() and "4:00".isnumeric() are False
    """

    if re.match(r'^([0-9]|0[0-9]|1[0-9]|2[0-3]):[0-5][0-9]$', token):
        return True
    return False

def _is_ordinal(token):

    """Special case check for ordinals
    Used because ordinals fail mixed type check
    E.g. "5th" and "101st" are considered to have mixed types (int and string)
    """

    if re.match(r'\d+(st|nd|rd|th)$', token):
        return True
    return False

def _has_apostrophe(token):

    """Special case check for tokens containing apostrophe
    Used because possessives and contractions fail alpha check
    E.g "brother's".isalpha() or "they're".isalpha() are False
    """

    if "'" in token:
        return True
    return False

def _is_mixed_type(token):

    """Checks if token contains mix of alpha, numeric, symbols, or punctuation.

    Returns False for the accepted special cases (currency such as $5.00,
    percentages such as 10%, HH:MM times, ordinals, tokens containing an
    apostrophe) and for tokens that are purely alphabetic or numeric.
    Returns True for everything else, e.g. "Math123", "$John$",
    "Roger_Jones". An empty token is reported as mixed.
    """

    # startswith / endswith are used instead of token[0] / token[-1] so that
    # an empty token falls through to the final check rather than raising
    # IndexError.
    # Keep currency special case
    if token.startswith('$') and _is_number(token[1:]):
        return False
    # Keep percentage special case
    if token.endswith('%') and _is_number(token[:-1]):
        return False
    # Keep time token, ordinal token, and apostrophe-in-token special cases
    if _is_time(token) or _is_ordinal(token) or _has_apostrophe(token):
        return False
    # Anything that is neither alphabetic nor numeric is mixed;
    # this also catches strings that contain symbols
    if not token.isalpha() and not _is_number(token):
        return True
    # Otherwise, token is a normal token, such as word or number
    return False

def _has_repeating_chars(token):

    """Checks if token contains repeating characters, e.g. 'Repppppeat'"""

    # Extract characters from string, ignore punctuation
    res = re.search(r'\w+', token)
    if res:
        # Normalize case so mixed-case repeating characters are caught
        token = res.group().lower()
        # Check if three or more characters are found repeating
        if any(token[i] == token[i - 1] and token[i] == token[i + 1]
               for i in range(1, len(token) - 1)):
            # Remove if repetition is found
            return True
    # Otherwise, keep the token
    return False

def _remove_n_groups(message, length):

    """Removes groups n words long that repeat"""

    # Create list of indexes to used to partition message
    indexes = [i for i in range(len(message) + 1) if i % length == 0]
    # Partition message into groups n characters long
    n_groups = [message[i: i + length] for i in indexes]
    # Start with an empty list and build up
    keep_groups = []
    # Check each group except for the last
    for i, group in enumerate(n_groups[:-1]):
        # Ignore group if it repeats
        if n_groups[i] == n_groups[i + 1]:
            continue
        # Otherwise, keep the group
        else:
            keep_groups.append(group)
    # Keep the last group
    keep_groups.append(n_groups[-1])
    # Chain groups together into one list
    keep_groups = list(itertools.chain.from_iterable(keep_groups))
    # Return groups that did not repeat
    return keep_groups

def _is_invalid_pair(pair):

    """Reports whether a pair of characters is an invalid digraph.

    Only pairs made entirely of non-vowel characters or entirely of vowels
    (a, e, i, o, u, y) are judged; a pair that mixes both is always valid.
    A judged pair is invalid when it appears in neither VALID_CONS_DIGRAPHS
    nor VALID_VOWEL_DIGRAPHS.
    """

    # Mixed consonant/vowel pairs are never rejected
    if not re.match(r'[^aeiouy]+$|[aeiouy]+$', pair):
        return False
    # Homogeneous pairs must be listed in one of the known digraph collections
    is_known = pair in VALID_CONS_DIGRAPHS or pair in VALID_VOWEL_DIGRAPHS
    return not is_known

def _remove_gibberish(cleaned_message):

    """Removes gibberish tokens from a message.

    Alphabetic tokens are judged case-insensitively; everything else is left
    alone. Rejected tokens are overwritten with '' in `cleaned_message`
    itself (the caller's list is modified). Returns the list of surviving
    tokens, or the empty string '' when nothing but punctuation and symbol
    characters is left.
    """

    for position, original in enumerate(cleaned_message):
        word = original.lower()
        # Punctuation, numbers and other non-alphabetic tokens are not judged
        if not word.isalpha():
            continue
        # NOTE(review): the consonant-only rule runs before the one and two
        # letter allow-lists, so 'k' from VALID_ONE_LETTERS can never be
        # accepted here - confirm the intended order.
        if re.match(r'[^aeiouy]+$', word):
            # Tokens without any vowel are always rejected
            reject = True
        elif len(word) == 1:
            reject = word not in VALID_ONE_LETTERS
        elif len(word) == 2:
            reject = word not in VALID_TWO_LETTERS
        elif word in ENGLISH_WORDS:
            # Known English words are accepted as they are
            reject = False
        else:
            # Unknown word: reject if any adjacent character pair is invalid
            # E.g. 'test' is checked as 'te', 'es', 'st'
            reject = any(_is_invalid_pair(word[start: start + 2])
                         for start in range(len(word) - 1))
        if reject:
            cleaned_message[position] = ''

    # Drop the tokens that were blanked out
    survivors = [tok for tok in cleaned_message if tok != '']
    # A message consisting solely of punctuation/symbols carries no content
    leftover_chars = ''.join(survivors)
    if all(char in PUNCTUATION or char in SYMBOLS for char in leftover_chars):
        return ''
    return survivors


def _clean_message(message):

    """Cleans an individual message.

    `message` is a list of tokens. Tokens are filtered one by one, then
    repeated groups of 4, 3 and 2 tokens are collapsed, and finally gibberish
    tokens are removed. Returns whatever _remove_gibberish returns: the list
    of surviving tokens, or '' when no real content is left.
    """

    # Operators kept when they sit between two numbers, e.g. "2 + 2 = 4".
    # Every element needs its own comma: adjacent string literals such as
    # '/' '=' are concatenated into the single element '/='.
    math_symbols = {'+', '-', '*', '/', '='}

    # Iteratively construct new message from scratch
    cleaned_message = []
    # Establish length here once so it doesn't have to be computed per token
    message_len = len(message)
    for i, token in enumerate(message):
        # Keep all punctuation tokens
        if all(char in PUNCTUATION for char in token):
            cleaned_message.append(token)
            continue
        # Checks that need a following token; skipped for the last token
        if i < message_len - 1:
            # Skip a token that is identical to the one right after it, so a
            # run of repeats contributes a single copy
            # E.g. "John John John" >>> "John"
            if message[i] == message[i + 1]:
                continue
            # Keep if token is a math symbol and the tokens on both sides are
            # numbers. i > 0 guards the left neighbour: message[-1] would
            # otherwise silently wrap around to the last token.
            if token in math_symbols and i > 0 and \
                    _is_number(message[i - 1]) and _is_number(message[i + 1]):
                cleaned_message.append(token)
                continue
        # Check and ignore mixed type tokens, e.g. John23, max$$$, @nne
        # Currencies ($5.00) and percentages (10%) will be kept
        if _is_mixed_type(token):
            continue
        # Ignore strings with more than 13 characters
        if _is_a_long_string(token):
            continue
        # Check and ignore tokens with repeating characters
        # E.g. "aaaaaahhhhh", "boooom", "rrrrrrrogerrrrrr"
        if _has_repeating_chars(token):
            continue
        # If token isn't ignored by this point, keep it in the new message
        cleaned_message.append(token)

    # Successively remove quad-groups, tri-groups, and bi-groups

    if len(cleaned_message) >= 8: # Must have length of 8 minimum
        cleaned_message = _remove_n_groups(cleaned_message, 4) # Quad-groups

    if len(cleaned_message) >= 6: # Must have length of 6 minimum
        cleaned_message = _remove_n_groups(cleaned_message, 3) # Tri-groups

    if len(cleaned_message) >= 4: # Must have length of 4 minimum
        cleaned_message = _remove_n_groups(cleaned_message, 2) # Bi-groups

    # Remove gibberish tokens
    final_message = _remove_gibberish(cleaned_message)

    # Return tokens that were kept
    return final_message

def clean_text(current_input):

    """Cleans every message found in the file `current_input`.

    The file is read and split into tokenized messages, each message is
    cleaned, its tokens are re-joined with single spaces (punctuation glued
    back onto words), and the results are returned as one string with
    ' | ' between consecutive messages.
    """

    rendered = []
    for message in _read_input(current_input):
        # A message without tokens is carried through as empty content
        tokens = _clean_message(message) if message else ''
        rendered.append(' '.join(_join_punctuation(tokens)))

    # One string for the whole conversation, messages divided by a pipe
    return ' | '.join(rendered)