-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_process_char_embedding.py
More file actions
131 lines (106 loc) · 4.86 KB
/
data_process_char_embedding.py
File metadata and controls
131 lines (106 loc) · 4.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from os import path
import string
import re
import ast
from multiprocessing import Pool
num_threads = 30
# Lookup table mapping each ASCII character from ' ' (code 32) through 'z'
# (code 122) to a consecutive integer id starting at 0, i.e. chr(c) -> c - 32.
# Generated instead of hand-maintained so the mapping cannot drift.
dictionary = {chr(code): code - 32 for code in range(32, 123)}
print(dictionary)
def process_subdata(input_pack, char_map=None):
    """Encode every text row of a corpus slice as a list of character ids.

    Parameters
    ----------
    input_pack : [pandas.DataFrame, int]
        The corpus slice (must have a ``text`` column) and the offset of the
        slice's first row label within the full corpus; the offset is used
        for label-based indexing into the slice.
    char_map : dict or None
        Character -> id mapping. Defaults to the module-level ``dictionary``
        (kept as the default so existing ``Pool.map`` callers are unchanged).

    Returns
    -------
    pandas.DataFrame
        The same slice, mutated in place, with each ``text`` entry replaced
        by a list of ints: the mapped id for known characters, -1 for any
        character outside the mapping.
    """
    corpus, initial = input_pack
    table = dictionary if char_map is None else char_map
    for i in range(len(corpus.text)):
        row = i + initial
        # Build the id list directly. The previous join-to-CSV-string +
        # ast.literal_eval round-trip crashed on rows with fewer than two
        # characters: literal_eval('') raises SyntaxError and
        # list(literal_eval('65')) raises TypeError on the bare int.
        # Using .at (rather than chained corpus.text[row] = ...) guarantees
        # the cell is mutated in place even under pandas copy-on-write, and
        # stores the list as a single object-dtype cell value.
        corpus.at[row, 'text'] = [table.get(ch, -1)
                                  for ch in corpus.at[row, 'text'].lower()]
        if i % 1000 == 0:
            print(i)  # coarse per-worker progress indicator
    return corpus
number_subreddits = 20
subreddit_labels = []
subreddits = []
# Load the raw text for each subreddit file: only column 1 is needed, the
# original header row is skipped, and the single column is named 'text' for
# readability. Labels are emitted in lockstep — one subreddit id per row read.
for sub_id in range(number_subreddits):
    frame = pd.read_csv('data/' + str(sub_id) + '_raw_data.csv', header=None,
                        names=['text'], skiprows=[0], usecols=[1],
                        lineterminator='\n')
    subreddits.append(frame)
    subreddit_labels.extend([sub_id] * len(frame))
# Stack every subreddit's rows into one corpus with a fresh 0..N-1 index.
corpus = pd.concat(subreddits, axis=0).reset_index(drop=True)
# 97 total characters
# The following iterates over each sentence in the corpus, turns all characters lower case, and changes the characters into
# a unique integer. It also turns all non-ascii characters into -1
# Note, this takes some time to run
if not path.exists('data/char_embeddings2/checkpoint1.csv'):
    # No checkpoint yet: split the corpus into num_threads contiguous chunks
    # and encode them in parallel worker processes.
    sub_corpus = []
    i = 0
    print(len(corpus))
    # Rows per worker (integer division; the remainder goes to the last chunk).
    section = len(corpus) // num_threads
    while i < num_threads:
        end = (i + 1) * section
        if i == num_threads - 1:
            # The last worker also absorbs the leftover rows.
            end = len(corpus)
        # Pack each slice with its starting row offset; process_subdata uses
        # the offset for label-based indexing into the slice.
        sub_corpus.append([corpus[i * section:end], section * i])
        i += 1
    # NOTE(review): Pool is created at module top level with no
    # `if __name__ == "__main__":` guard — this breaks on platforms using the
    # "spawn" start method (Windows/macOS); confirm this only runs on Linux.
    p = Pool(processes=num_threads)
    output = p.map(process_subdata, sub_corpus)
    p.close()
    # Reassemble the encoded chunks; original order is preserved by map().
    corpus = pd.concat(output)
    corpus.to_csv('data/char_embeddings2/checkpoint1.csv')
else:
    # NOTE(review): reloading the checkpoint yields 'text' entries as the
    # *string* repr of the id lists (and an extra index column), unlike the
    # fresh-encode branch where they are real lists — downstream code may need
    # ast.literal_eval here; verify before relying on this path.
    corpus = pd.read_csv('data/char_embeddings2/checkpoint1.csv')
# Now, we have to pad all the rows to the same length (of longest sentence)
# and we pad all the shorter sentences with -1 at the end
'''padded_corpus = pd.DataFrame(corpus['text'].values.tolist()).agg(list, 1)
for i in range(len(padded_corpus)):
    padded_corpus[i] = [-1 if pd.isnull(x) else x for x in padded_corpus[i]]
padded_corpus.to_csv('./data/char_embedding/padded_int_char_encoded_text.csv')
'''
# One label per corpus row, in the same order the subreddit files were loaded.
df = pd.DataFrame(subreddit_labels, columns=['label'])
# Attach the labels as a 'label' column next to the encoded 'text' column.
# NOTE(review): axis=1 concat aligns on the index, so this relies on corpus
# having a clean 0..N-1 index — verify this holds after the checkpoint-reload
# branch above.
processed_with_label = pd.concat([corpus, df], axis=1)
# Final step: shuffle once with a fixed seed (reproducible), carve the frame
# into test / validation / train partitions, and persist every split both as
# CSV and as separate feature/label NumPy arrays.
processed_with_label = processed_with_label.sample(frac=1, random_state=42).reset_index(drop=True)
test_ratio = 0.05
valid_ratio = 0.1
total_rows = len(processed_with_label)
# Cut points: [0, test_cut) -> test, [test_cut, valid_cut) -> valid, rest -> train.
test_cut = int(total_rows * test_ratio)
valid_cut = int(total_rows * (test_ratio + valid_ratio))
test_set = processed_with_label[0:test_cut]
valid_set = processed_with_label[test_cut:valid_cut]
train_set = processed_with_label[valid_cut:]
print(len(test_set))
print(len(train_set))
print(len(valid_set))
train_set.to_csv('data/char_embeddings2/train.csv')
valid_set.to_csv('data/char_embeddings2/valid.csv')
test_set.to_csv('data/char_embeddings2/test.csv')
# Separate the encoded sentences (X) from the labels (y) for each split.
train_X = train_set['text'].to_numpy()
train_y = train_set['label'].to_numpy()
test_X = test_set['text'].to_numpy()
test_y = test_set['label'].to_numpy()
valid_X = valid_set['text'].to_numpy()
valid_y = valid_set['label'].to_numpy()
for array_name, array in (('train_X', train_X), ('train_y', train_y),
                          ('test_X', test_X), ('test_y', test_y),
                          ('valid_X', valid_X), ('valid_y', valid_y)):
    np.save('data/char_embeddings2/' + array_name + '.npy', array)