-
Notifications
You must be signed in to change notification settings - Fork 22
Expand file tree
/
Copy pathutil.py
More file actions
205 lines (171 loc) · 6.08 KB
/
util.py
File metadata and controls
205 lines (171 loc) · 6.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import numpy as np
import scipy as sc
from prettyprint import pp
import os
import re
# Key under which every token entry of the dictionary stores the token's
# unique integer index (the remaining keys of an entry are class labels).
idx_lbl = 'idx'
# Label presumably meant for document-frequency data; no function visible in
# this file reads it -- TODO confirm whether other modules use it.
dfreq_lbl = "docfreq"
# Matches either a run of ASCII letters or an unsigned number with an optional
# decimal part. Group 1 captures the whole match, which lets tokenizeDoc pad
# every match with spaces through the r' \1 ' substitution.
pattern = re.compile(r'([a-zA-Z]+|[0-9]+(\.[0-9]+)?)')
def tokenizeDoc(doc_address, min_len=0, remove_numerics=True):
    """
    Read a text file and split it into lowercase tokens.

    The text is lowercased, every run of ASCII letters and every number is
    padded with spaces (using the module-level ``pattern``), punctuation and,
    optionally, digits are deleted, and the result is split on whitespace.

    Parameters
    ----------
    doc_address: str
        path to the file that is going to be tokenized
    min_len: int
        length threshold: only tokens strictly longer than ``min_len`` are
        kept. The default of zero keeps every non-empty token. It should
        always be non-negative.
    remove_numerics: boolean
        when True, digit characters are deleted, so purely numeric tokens
        disappear. When False, digits are kept; a decimal point is still
        removed as punctuation, so "3.14" becomes "314".

    Returns
    -------
    tokens: list
        tokens of the document in order of appearance. An empty list is
        returned (and an error message printed) when the file cannot be read.
    """
    from string import punctuation, digits

    try:
        # The context manager closes the handle even when read() fails.
        with open(doc_address) as f:
            raw = f.read().lower()
    except (IOError, OSError, UnicodeError):
        # Best effort: an unreadable document yields no tokens instead of
        # aborting the caller's loop over a whole corpus. Only I/O and
        # decoding problems are absorbed; programming errors still surface.
        print("Error: %s couldn't be opened!" % doc_address)
        return []

    text = pattern.sub(r' \1 ', raw.replace('\n', ' '))

    if remove_numerics:
        unwanted = set(punctuation + digits)
    else:
        unwanted = set(punctuation)
    # A character filter replaces str.translate(None, chars): that call form
    # only exists for Python 2 byte strings, while this runs on 2 and 3.
    cleaned = ''.join([ch for ch in text if ch not in unwanted])

    # split() without an argument splits on any whitespace, so tabs and
    # carriage returns never become tokens and no empty strings are produced.
    return [word for word in cleaned.split() if len(word) > min_len]
def createDictionary(classes, tokens_pool):
    """
    Build the token dictionary (vocabulary) from a pool of tokenized documents.

    Parameters
    ----------
    classes: list
        list of the names of the classes of documents
    tokens_pool: dictionary
        maps each class name to a list of documents, every document being
        a list of tokens

    Returns
    -------
    token_dict: dictionary
        maps each token to a dictionary that holds the token's unique index
        under ``idx_lbl`` and, for every class the token occurs in, the
        number of occurrences counted over all documents of that class.
        Indices are handed out in order of first appearance; the tokens are
        not sorted.
    """
    vocabulary = {}
    for label in classes:
        for document in tokens_pool[label]:
            for word in document:
                if word not in vocabulary:
                    # First sighting: the next free index equals the number
                    # of tokens registered so far.
                    vocabulary[word] = {idx_lbl: len(vocabulary), label: 1}
                    continue
                counts = vocabulary[word]
                counts[label] = counts.get(label, 0) + 1
    return vocabulary
def createTokenPool(classes, paths):
    """
    Tokenize every document of every class and collect the results.

    Parameters
    ----------
    classes: list
        list of the names of the classes documents belong to
    paths: dictionary
        maps each class name to a list of paths to its documents

    Returns
    -------
    token_pool: dictionary
        maps each class name to a list with one entry per document, in the
        order of ``paths``; each entry is the list of tokens that
        tokenizeDoc (called with its default settings) returns for it
    """
    pool = {}
    for label in classes:
        pool[label] = [tokenizeDoc(doc_path) for doc_path in paths[label]]
    return pool
def saveDictToFile(tdict, filename, class_titles=None):
    """
    Save a token dictionary to a csv file, one row per token.

    Every row holds the token, its index and one ``label:count`` cell for
    each class the token has a count for.

    Parameters
    ----------
    tdict: dictionary
        maps each token to a dictionary holding its index under ``idx_lbl``
        and its per-class counts under the class labels
    filename: str
        name of the dictionary file
    class_titles: list, optional
        class labels to write, in column order; labels missing from an entry
        are skipped for that row. When omitted, every key of an entry other
        than ``idx_lbl`` is written, in sorted order.

    Returns
    -------
    None
    """
    import csv

    # NOTE: the class list used to be read from a global named class_titles
    # that is not defined anywhere in the visible module (a NameError for any
    # non-empty dictionary); it is now an optional argument with a fallback.

    # "with" guarantees that the rows are flushed and the handle is closed.
    with open(filename, "w") as out:
        w = csv.writer(out)
        for key, val in tdict.items():
            if class_titles is None:
                labels = sorted(lbl for lbl in val if lbl != idx_lbl)
            else:
                labels = [cl for cl in class_titles if cl in val]
            row = [key, val[idx_lbl]]
            row.extend(lbl + ':' + str(val[lbl]) for lbl in labels)
            w.writerow(row)
def readFileToDict(filename):
    """
    Create a token dictionary from a csv dictionary file.

    Each row of the file contains a token, its integer index and any number
    of ``label:count`` cells. Rows that do not follow this layout are skipped.

    Parameters
    ----------
    filename: str
        name of the dictionary file

    Returns
    -------
    tdict: dictionary
        maps each token to a dictionary holding its index under ``idx_lbl``
        and an integer count under every label found in the row
    """
    import csv

    tdict = {}
    # codecs.open() without an encoding is a plain open(); "with" closes the
    # handle, which the original never did.
    with open(filename, 'r') as source:
        for row in csv.reader(source):
            try:
                entry = {idx_lbl: int(row[1])}
                for cell in row[2:]:
                    # Split on the last colon only, so a label that itself
                    # contains ':' still parses.
                    lbl, cnt = cell.rsplit(':', 1)
                    entry[lbl] = int(cnt)
            except (IndexError, ValueError):
                # Malformed row (too short, non-numeric index or count, or a
                # cell without ':'): skip it entirely.
                continue
            # Store the entry only once the whole row parsed, so a bad row
            # can never leave a half-built entry without an index behind.
            tdict[row[0]] = entry
    return tdict
def train_test_split(ratio, classes, files):
    """
    Split the files of every class into a training part and a testing part.

    The split is positional: for each class the leading
    ``int(ratio * number_of_files)`` files go to the training set and the
    remaining files go to the testing set. No shuffling is done.

    Parameters
    ----------
    ratio: float
        ratio of total documents in each class assigned to the training set
    classes: list
        list of label classes
    files: dictionary
        a dictionary with list of files for each class

    Returns
    -------
    train_dict: dictionary
        a dictionary with lists of documents in the training set for each class
    test_dict: dictionary
        a dictionary with lists of documents in the testing set for each class
    """
    training, testing = {}, {}
    for label in classes:
        documents = files[label]
        cut = int(ratio * len(documents))
        training[label], testing[label] = documents[:cut], documents[cut:]
    return training, testing