-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathrun_model.py
More file actions
73 lines (58 loc) · 2.73 KB
/
run_model.py
File metadata and controls
73 lines (58 loc) · 2.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import pandas as pd
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
import json
from preprocessing import clean_tweets
from models import lda, biterm, guided_lda
from util import util
#from models.btm import BitermModel
# Extra stop words handed to clean_tweets.preprocess (via extra_stopwords):
# hashtags plus a few bare tokens. Each entry is listed once; a set would
# collapse repeats anyway.
HASHTAGS = {
    # hashtags
    '#newfoundland',
    '#nlblizzard',
    '#nlblizzard2020',
    '#nlstm',
    '#nlstm2020',
    '#nlstorm2020',
    '#nlweather',
    '#nlwhiteout',
    '#nlwx',
    '#snowmageddon',
    '#snowmaggedon2020',
    '#snowpocalypse2020',
    '#stmageddon2020',
    '#stormageddon2020',
    # bare tokens (no leading '#')
    'newfoundland',
    'snow',
    'snowmaggedon2020',
    'st',
}
if __name__ == '__main__':
    # Script entry point: load the tweet CSV, preprocess it with the extra
    # stop words in HASHTAGS, and print a short preview of raw text next to
    # its tokenized form. The model-training experiments further down are
    # disabled and kept as comments.
    #
    # Usage: python run_model.py [path/to/tweets.csv]
    import sys

    # The settings below are only read by the disabled experiments further
    # down; they are kept so those experiments can be re-enabled as-is.
    num_topics = [5, 10, 15, 20]
    seed_topic_list = [['traffic', 'road'], ['love', 'family']]
    crisis_nlp_seeds = [
        ['dead', 'injured', 'hurt', 'people', 'casualties'],
        ['missing', 'trapped', 'found', 'people'],
        ['home', 'building', 'roads', 'traffic', 'bridges', 'electricity'],
        ['donations', 'volunteers', 'shelter', 'supplies'],
        ['warning', 'advice'],
        ['thoughts', 'prayers'],
    ]
    seed_conf = 0.15
    seed_conf_list = [0.15, 0.30, 0.50, 0.75]

    # Input CSV: first command-line argument when given, otherwise the
    # path this script has always used.
    default_csv = '/home/smacawi/smacawi/repositories/tweet-classifier/nlwx_2020_hashtags_no_rt.csv'
    csv_path = sys.argv[1] if len(sys.argv) > 1 else default_csv

    winter_tweets = pd.read_csv(csv_path)
    winter_tweets_cleaned = clean_tweets.preprocess(df=winter_tweets, extra_stopwords=HASHTAGS)
    # Renumber rows 0..n-1 so the label lookup in the preview loop lines up
    # with the positions in ready_data.
    winter_tweets_cleaned.reset_index(inplace=True, drop=True)
    ready_data = winter_tweets_cleaned['tokenized_text'].values.tolist()

    # Preview at most 10 rows; capping at len(ready_data) avoids a
    # KeyError/IndexError when fewer than 10 rows are available.
    for d in range(min(10, len(ready_data))):
        print(winter_tweets_cleaned['text'].loc[d], ready_data[d])

    # ---- disabled experiments (uncomment to run) -------------------------

    # get best LDA model
    # lda.get_best(ready_data, 1, 20, 1)

    # train best lda by coherence
    # best_num = 7
    # lda_model = lda.build_model(ready_data, num_topics = best_num, include_vis = True)
    # # save top words for each topic to csv
    # lda.top_vocab(lda_model, 10)

    # trains 5, 10, 15, 20 topics
    # for val in num_topics:
    #     print("Building model with {} topics...".format(val))
    #     guided_lda_model = guided_lda.build_model(ready_data, num_topics = val, seed_topic_list = seed_topic_list, seed_conf = seed_conf)
    #     #lda_model = lda.build_model(ready_data, num_topics = val)
    #     #biterm_model = biterm.build_model(ready_data, num_topics = val)

    # guided lda
    # for conf in seed_conf_list:
    #     guided_lda_model = guided_lda.build_model(ready_data, num_topics=9, seed_topic_list=crisis_nlp_seeds,
    #                                               seed_conf=conf)

    # self-implemented biterm
    # NOTE: also needs the commented-out `from models.btm import BitermModel`
    # import at the top of the file to be re-enabled.
    # K = 15
    # W = None
    # alpha = 0.5
    # beta = 0.5
    # n_iter = 1000
    # save_step = 100
    # btm = BitermModel.BitermModel(K, W, alpha, beta, n_iter, save_step)
    # btm.run(ready_data, "btm_")