-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodel.py
More file actions
102 lines (85 loc) · 3.08 KB
/
model.py
File metadata and controls
102 lines (85 loc) · 3.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import json
import nltk
import time
import random
# Debug output switch: dprint() only prints while this equals 1.
DEBUG = 1


def dprint(explanation, msg):
    """Print ``explanation: msg`` on stdout when debug output is enabled.

    Args:
        explanation: Text label placed before the colon.
        msg: Any value; it is converted with ``str()`` before printing.
    """
    if DEBUG != 1:
        return
    print(": ".join((explanation, str(msg))))
def main():
    """Split the data set 9:1, train the model on the larger part and save it.

    Reads ``complete.json`` and writes ``training_set.json``,
    ``validation_set.json`` and ``model.json``, all relative to the current
    working directory. The training time is reported through ``dprint``.
    """
    # Pre data collection and setup
    with open('complete.json') as json_data:
        data = json.load(json_data)

    # Split training and validation in a 9:1 ratio.
    validation_set_size = round(len(data) / 10)

    # Draw the validation keys uniformly without replacement. This keeps the
    # exact 9:1 sizes while giving every record the same chance of being held
    # out; a per-record coin flip with quotas would instead push the tail of
    # the file into whichever set still had room.
    validation_keys = set(random.sample(list(data), validation_set_size))

    training_set = {}
    validation_set = {}
    # Walk the data in its original order so both output files keep it.
    for key in data:
        if key in validation_keys:
            validation_set[key] = data[key]
        else:
            training_set[key] = data[key]

    with open('training_set.json', 'w') as f:
        json.dump(training_set, f)
    with open('validation_set.json', 'w') as f:
        json.dump(validation_set, f)

    # Train model and time the training
    start_time = time.time()
    model = train(training_set)
    dprint("Time for model to train", time.time() - start_time)

    with open('model.json', 'w') as f:
        json.dump(model, f)
#Training our prediction model
def train(training_set):
    """Count words per legislator and vote category; write bill frequencies.

    The text of every bill is tokenized with ``nltk.word_tokenize``. Each
    token occurrence is lower-cased and counted once for every legislator
    listed under that vote, in the bucket of the category they voted.
    "Aye" is grouped with "Yea" and "No" with "Nay".

    Args:
        training_set: Mapping of vote key to record. Each record is read as
            ``record["bill"]["text"]`` and ``record["votes"]``; the latter
            maps a vote label to an iterable of legislator dicts that carry
            an ``"id"`` entry.

    Returns:
        dict: ``model[legislator_id][category][word]`` is the number of
        occurrences of ``word`` in bills where that legislator voted
        ``category``. ``model[legislator_id][category]["total_wc !@#"]`` is
        the total number of word occurrences counted in that bucket.

    Side effects:
        Writes ``idf.json`` in the current working directory: for every
        token the number of bills it appears in, plus ``"total_wc !@#"`` set
        to ``len(training_set)``.
    """
    # Indexed using a unique string so the total word count cannot collide
    # with a real token.
    total_key = "total_wc !@#"
    # Vote labels in the order they are processed, with the category each one
    # is logged under.
    label_to_category = (
        ("Nay", "Nay"),
        ("No", "Nay"),
        ("Yea", "Yea"),
        ("Aye", "Yea"),
        ("Not Voting", "Not Voting"),
    )

    model = {}
    idf = {}
    counter = 0
    for vote in training_set:
        record = training_set[vote]
        # Splits text into words
        full_text = nltk.word_tokenize(record["bill"]["text"])

        # Count the number of bills each word appears in.
        # NOTE(review): these keys keep the token's original case while the
        # model below is keyed by the lower-cased token - confirm which form
        # the consumer of idf.json looks up.
        for word in set(full_text):
            idf[word] = idf.get(word, 0) + 1

        if not full_text:
            # Nothing to count; do not create empty buckets for this vote.
            continue

        # Resolve once per vote which count buckets every word goes into.
        # A legislator listed more than once is deliberately kept more than
        # once, so each listing contributes its own count.
        votes = record["votes"]
        buckets = []
        for label, category in label_to_category:
            if label in votes:
                for legislator in votes[label]:
                    per_legislator = model.setdefault(legislator["id"], {})
                    buckets.append(per_legislator.setdefault(category, {}))

        # Iterate over words in text
        for word in full_text:
            word = word.lower()
            counter += 1
            if counter % 10000 == 0:
                dprint("Words Trained On", counter)
            for bucket in buckets:
                # Increment word count for this label/word pair.
                bucket[word] = bucket.get(word, 0) + 1
                # Accumulate the running total under its own key; reading it
                # back under any other key would reset it to 1 every time.
                bucket[total_key] = bucket.get(total_key, 0) + 1

    idf[total_key] = len(training_set)
    with open('idf.json', 'w') as f:
        json.dump(idf, f)
    return model
# Run the split/train/save pipeline only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()