-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_process_char_embedding.py
More file actions
131 lines (106 loc) · 4.86 KB
/
data_process_char_embedding.py
File metadata and controls
131 lines (106 loc) · 4.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from os import path
import string
import re
import ast
from multiprocessing import Pool
num_threads = 30
# Lookup table mapping each ASCII character from ' ' (code 32) through 'z'
# (code 122) to a consecutive integer id starting at 0, i.e. chr(c) -> c - 32.
# Generated instead of hand-maintained so the mapping cannot drift.
dictionary = {chr(code): code - 32 for code in range(32, 123)}
print(dictionary)
def process_subdata(input_pack, char_map=None):
    """Encode every text row of a corpus slice as a list of character ids.

    Parameters
    ----------
    input_pack : [pandas.DataFrame, int]
        The corpus slice (must have a ``text`` column) and the offset of the
        slice's first row label within the full corpus; the offset is used
        for label-based indexing into the slice.
    char_map : dict or None
        Character -> id mapping. Defaults to the module-level ``dictionary``
        (kept as the default so existing ``Pool.map`` callers are unchanged).

    Returns
    -------
    pandas.DataFrame
        The same slice, mutated in place, with each ``text`` entry replaced
        by a list of ints: the mapped id for known characters, -1 for any
        character outside the mapping.
    """
    corpus, initial = input_pack
    table = dictionary if char_map is None else char_map
    for i in range(len(corpus.text)):
        row = i + initial
        # Build the id list directly. The previous join-to-CSV-string +
        # ast.literal_eval round-trip crashed on rows with fewer than two
        # characters: literal_eval('') raises SyntaxError and
        # list(literal_eval('65')) raises TypeError on the bare int.
        # Using .at (rather than chained corpus.text[row] = ...) guarantees
        # the cell is mutated in place even under pandas copy-on-write, and
        # stores the list as a single object-dtype cell value.
        corpus.at[row, 'text'] = [table.get(ch, -1)
                                  for ch in corpus.at[row, 'text'].lower()]
        if i % 1000 == 0:
            print(i)  # coarse per-worker progress indicator
    return corpus
number_subreddits = 20
subreddit_labels = []
subreddits = []
# Load the raw text for each subreddit file: only column 1 is needed, the
# original header row is skipped, and the single column is named 'text' for
# readability. Labels are emitted in lockstep — one subreddit id per row read.
for sub_id in range(number_subreddits):
    frame = pd.read_csv('data/' + str(sub_id) + '_raw_data.csv', header=None,
                        names=['text'], skiprows=[0], usecols=[1],
                        lineterminator='\n')
    subreddits.append(frame)
    subreddit_labels.extend([sub_id] * len(frame))
# Stack every subreddit's rows into one corpus with a fresh 0..N-1 index.
corpus = pd.concat(subreddits, axis=0).reset_index(drop=True)
# 97 total characters
# The following iterates over each sentence in the corpus, turns all characters lower case, and changes the characters into
# a unique integer. It also turns all non-ascii characters into -1
# Note, this takes some time to run
if not path.exists('data/char_embeddings2/checkpoint1.csv'):
    # No checkpoint yet: split the corpus into num_threads contiguous chunks
    # and encode them in parallel worker processes.
    sub_corpus = []
    i = 0
    print(len(corpus))
    # Rows per worker (integer division; the remainder goes to the last chunk).
    section = len(corpus) // num_threads
    while i < num_threads:
        end = (i + 1) * section
        if i == num_threads - 1:
            # The last worker also absorbs the leftover rows.
            end = len(corpus)
        # Pack each slice with its starting row offset; process_subdata uses
        # the offset for label-based indexing into the slice.
        sub_corpus.append([corpus[i * section:end], section * i])
        i += 1
    # NOTE(review): Pool is created at module top level with no
    # `if __name__ == "__main__":` guard — this breaks on platforms using the
    # "spawn" start method (Windows/macOS); confirm this only runs on Linux.
    p = Pool(processes=num_threads)
    output = p.map(process_subdata, sub_corpus)
    p.close()
    # Reassemble the encoded chunks; original order is preserved by map().
    corpus = pd.concat(output)
    corpus.to_csv('data/char_embeddings2/checkpoint1.csv')
else:
    # NOTE(review): reloading the checkpoint yields 'text' entries as the
    # *string* repr of the id lists (and an extra index column), unlike the
    # fresh-encode branch where they are real lists — downstream code may need
    # ast.literal_eval here; verify before relying on this path.
    corpus = pd.read_csv('data/char_embeddings2/checkpoint1.csv')
# Now, we have to pad all the rows to the same length (of longest sentence)
# and we pad all the shorter sentences with -1 at the end
'''padded_corpus = pd.DataFrame(corpus['text'].values.tolist()).agg(list, 1)
for i in range(len(padded_corpus)):
    padded_corpus[i] = [-1 if pd.isnull(x) else x for x in padded_corpus[i]]
padded_corpus.to_csv('./data/char_embedding/padded_int_char_encoded_text.csv')
'''
# One label per corpus row, in the same order the subreddit files were loaded.
df = pd.DataFrame(subreddit_labels, columns=['label'])
# Attach the labels as a 'label' column next to the encoded 'text' column.
# NOTE(review): axis=1 concat aligns on the index, so this relies on corpus
# having a clean 0..N-1 index — verify this holds after the checkpoint-reload
# branch above.
processed_with_label = pd.concat([corpus, df], axis=1)
# Final step: shuffle once with a fixed seed (reproducible), carve the frame
# into test / validation / train partitions, and persist every split both as
# CSV and as separate feature/label NumPy arrays.
processed_with_label = processed_with_label.sample(frac=1, random_state=42).reset_index(drop=True)
test_ratio = 0.05
valid_ratio = 0.1
total_rows = len(processed_with_label)
# Cut points: [0, test_cut) -> test, [test_cut, valid_cut) -> valid, rest -> train.
test_cut = int(total_rows * test_ratio)
valid_cut = int(total_rows * (test_ratio + valid_ratio))
test_set = processed_with_label[0:test_cut]
valid_set = processed_with_label[test_cut:valid_cut]
train_set = processed_with_label[valid_cut:]
print(len(test_set))
print(len(train_set))
print(len(valid_set))
train_set.to_csv('data/char_embeddings2/train.csv')
valid_set.to_csv('data/char_embeddings2/valid.csv')
test_set.to_csv('data/char_embeddings2/test.csv')
# Separate the encoded sentences (X) from the labels (y) for each split.
train_X = train_set['text'].to_numpy()
train_y = train_set['label'].to_numpy()
test_X = test_set['text'].to_numpy()
test_y = test_set['label'].to_numpy()
valid_X = valid_set['text'].to_numpy()
valid_y = valid_set['label'].to_numpy()
for array_name, array in (('train_X', train_X), ('train_y', train_y),
                          ('test_X', test_X), ('test_y', test_y),
                          ('valid_X', valid_X), ('valid_y', valid_y)):
    np.save('data/char_embeddings2/' + array_name + '.npy', array)