-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_process_word_embedding.py
More file actions
153 lines (127 loc) · 5.71 KB
/
data_process_word_embedding.py
File metadata and controls
153 lines (127 loc) · 5.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""
Code to process text data, build the word embeddings, and ready it for NLP pipeline
"""
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from os import path
import string
import re
from multiprocessing import Pool
def process_subdata(input_pack):
    """Clean one contiguous slice of the corpus (runs in a worker process).

    input_pack is a ``(frame, initial)`` pair: ``frame`` is a DataFrame
    slice with a ``text`` column and ``initial`` is the offset of the
    slice's first row in the full corpus (kept for interface
    compatibility; the cleaning itself no longer needs it).

    Returns the frame with every entry lowercased and stripped of all
    characters that are neither alphanumeric nor a space.
    """
    frame, initial = input_pack
    print(len(frame.text))

    def _clean(text):
        # Keep letters/digits/spaces only, then lowercase.
        return ''.join(ch for ch in text if ch.isalnum() or ch == " ").lower()

    # Assign through .loc on a copy instead of chained item-assignment
    # (corpus.text[i+initial] = ...), which raises SettingWithCopyWarning
    # and can silently fail to write through on a slice; mapping by value
    # also removes the assumption that index labels equal position+initial.
    frame = frame.copy()
    frame.loc[:, "text"] = frame["text"].map(_clean)
    return frame
# Number of subreddit CSV files to load, and worker processes for cleaning.
number_subreddits = 20
num_threads = 30
# subreddit_labels[k] is the subreddit id (0..number_subreddits-1) of corpus row k.
subreddit_labels = []
# Load the data in; we only need the text, so read column 1, skip the
# original header row, and use the more descriptive column name 'text'.
subreddits = []
for i in range(number_subreddits):
    frame = pd.read_csv('data/' + str(i) + '_raw_data.csv', header=None,
                        names=['text'], skiprows=[0], usecols=[1],
                        lineterminator='\n')
    subreddits.append(frame)
    # One label per row of this file; extend() replaces the original
    # per-row while-loop append.
    subreddit_labels.extend([i] * len(frame))
# Concatenate all the subreddit texts into one corpus dataframe.
corpus = pd.concat(subreddits, axis=0).reset_index(drop=True)
#corpus.to_csv('data/word_embedding/combined_text.csv')
# Strip punctuation / non-alphanumeric characters and lowercase
# everything, in parallel. This takes a while, so the result is cached
# as checkpoint1 and reloaded on subsequent runs.
# NOTE(review): creating a Pool at module level without an
# `if __name__ == "__main__":` guard only works on fork-start platforms
# (Linux); under spawn (Windows, recent macOS) each worker would re-run
# this whole script — confirm the target platform.
if not path.exists('data/word_embeddings/checkpoint1.csv'):
    print(len(corpus))
    # Split the corpus into num_threads contiguous chunks; the last
    # chunk absorbs the remainder rows.
    section = len(corpus) // num_threads
    sub_corpus = []
    for i in range(num_threads):
        end = len(corpus) if i == num_threads - 1 else (i + 1) * section
        sub_corpus.append([corpus[i * section:end], section * i])
    # Context manager guarantees the pool is terminated even if map()
    # raises (the original p.close() would be skipped on error).
    with Pool(processes=num_threads) as p:
        output = p.map(process_subdata, sub_corpus)
    corpus = pd.concat(output)
    corpus.to_csv('data/word_embeddings/checkpoint1.csv')
else:
    corpus = pd.read_csv('data/word_embeddings/checkpoint1.csv')
print("Arriving to checkpoint1")
# Now, we need to build our word embeddings. We build a frequency table of all the words in the corpus
# and discard words numbering below 50. The rest, we assign a unique index.
# Begin by splitting the sentence by words in the entire corpus dataframe
# Note, if you haven't already, ensure you have run nltk.download('punkt')
i = 0
tokenized_corpus = []
while i < len(corpus.text):
try:
tokenized_corpus.append(corpus.text[i].split(" "))
except:
tokenized_corpus.append([])
i += 1
#tokenized_corpus.to_csv('data/word_embedding/tokenized_text.csv')
# Frequency table over every token in the corpus. word_freq is a
# pd.Series whose index is the word and whose value is that word's
# count, sorted most-frequent first; it drives the rare-word cutoff
# below. Use word_freq.index to enumerate the vocabulary.
flat_tokens = np.concatenate(tokenized_corpus)
word_freq = pd.Series(flat_tokens).value_counts()
print("Successfully tokenized and count vocabulary")
word_freq.to_csv("data/word_embeddings/dict.csv")
# vocab lists every distinct word, most frequent first. Words occurring
# at least `threshold` times keep their frequency rank as a unique
# integer id; the rest collapse to -1 (out-of-vocabulary marker).
vocab = word_freq.index
threshold = 100
# Read counts positionally via .to_numpy(): the original `word_freq[i]`
# integer lookup on a string-indexed Series relied on a positional
# fallback that is removed in pandas 2.x (it raises KeyError there).
# The per-word progress print is dropped as pure noise.
counts = word_freq.to_numpy()
unique_int = [i if counts[i] >= threshold else -1 for i in range(len(vocab))]
# Key-value mapping: word -> integer id (or -1 for rare words).
dictionary = dict(zip(vocab, unique_int))
# Use the dictionary to replace each word in the corpus with its integer
# id; tokens missing from the dictionary are left unchanged.
for sentence in tokenized_corpus:
    sentence[:] = [dictionary.get(token, token) for token in sentence]
#tokenized_corpus.to_csv('./data/word_embedding/integer_tokenized_text.csv')
print(len(tokenized_corpus))
# Pair each integer-encoded document with its subreddit label.
label_frame = pd.DataFrame(subreddit_labels, columns=['label'])
text_frame = pd.DataFrame({"text": tokenized_corpus})
processed_with_label = pd.concat([text_frame, label_frame], axis=1)
# As a final step, shuffle once (fixed seed for reproducibility) and
# carve the corpus into test / validation / train splits, then persist
# each split as CSV and as separate feature/label numpy arrays.
processed_with_label = processed_with_label.sample(frac=1, random_state=42).reset_index(drop=True)
test_ratio = 0.05
valid_ratio = 0.1
# Compute the split boundaries once instead of inlining the arithmetic
# into every slice.
total = len(processed_with_label)
test_end = int(total * test_ratio)
valid_end = int(total * (test_ratio + valid_ratio))
test_set = processed_with_label.iloc[0:test_end]
valid_set = processed_with_label.iloc[test_end:valid_end]
train_set = processed_with_label.iloc[valid_end:]
print(len(test_set))
print(len(train_set))
print(len(valid_set))
train_set.to_csv('data/word_embeddings/train.csv')
valid_set.to_csv('data/word_embeddings/valid.csv')
test_set.to_csv('data/word_embeddings/test.csv')
# Separate the sentences and the labels into their own numpy arrays.
train_X = train_set['text'].to_numpy()
train_y = train_set['label'].to_numpy()
test_X = test_set['text'].to_numpy()
test_y = test_set['label'].to_numpy()
valid_X = valid_set['text'].to_numpy()
valid_y = valid_set['label'].to_numpy()
np.save('data/word_embeddings/train_X.npy', train_X)
np.save('data/word_embeddings/train_y.npy', train_y)
np.save('data/word_embeddings/test_X.npy', test_X)
np.save('data/word_embeddings/test_y.npy', test_y)
np.save('data/word_embeddings/valid_X.npy', valid_X)
np.save('data/word_embeddings/valid_y.npy', valid_y)