forked from serenayj/evoquer
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess_query_simpl_trans.py
More file actions
142 lines (116 loc) · 3.76 KB
/
preprocess_query_simpl_trans.py
File metadata and controls
142 lines (116 loc) · 3.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import benepar
# Shared constituency parser, constructed once at import time with the
# "benepar_en2" model name; parse_line and process_line both call parser.parse.
parser = benepar.Parser("benepar_en2")
def parse_line(line):
    """Parse ``line`` and return its last VP subtree.

    The module-level ``parser`` produces the tree; its subtrees are scanned
    in traversal order and the final one labelled 'VP' is kept.

    Returns None when the parse contains no VP.
    """
    last_vp = None
    for subtree in parser.parse(line).subtrees():
        if subtree.label() == 'VP':
            last_vp = subtree
    return last_vp
# Process queries with a constituency parser and extract the main verb
# (a verb that is not inside a subordinate clause).
# The main objects (nouns) are extracted as well.
# Sample query; process_line(t) is run on it at module level further down.
t = "person turns off the light as they're leaving"
from nltk.tree import ParentedTree
def is_sbar(tr, v_cnt):
    """Tell whether ``tr`` sits below an SBAR node, given a non-zero ``v_cnt``.

    Walks the parent chain of ``tr`` (via ``parent()``) until it runs out.
    Returns True as soon as an ancestor labelled 'SBAR' is met while
    ``v_cnt`` is not 0; otherwise returns False. With ``v_cnt == 0`` the
    result is therefore always False.
    """
    ancestor = tr.parent()
    while ancestor:
        if ancestor.label() == 'SBAR' and v_cnt != 0:
            return True
        ancestor = ancestor.parent()
    return False
# Node labels that find_verb treats as verbs (these look like Penn Treebank
# POS tags -- confirm against the parser's tag set).
verbs_tags = ['VBZ', 'VBP', 'VB', 'VBD','VBG', 'VBN']
# Node labels that find_noun treats as nouns; only 'NNS' and 'NN' are accepted.
nouns_tags = ['NNS', 'NN']
def find_verb(item):
    """Locate the first verb-tagged subtree of ``item``.

    Subtrees are visited in the order ``item.subtrees()`` yields them and the
    first whose label is listed in ``verbs_tags`` wins.

    Returns a pair ``(leaves, subtree)`` for the match. When nothing matches,
    the pair is ``(None, last_subtree_visited)``.
    """
    for node in item.subtrees():
        if node.label() in verbs_tags:
            return node.leaves(), node
    # No verb tag anywhere: the second element is simply the last node seen.
    return None, node
def find_noun(item):
    """Return the leaves of the first noun-tagged subtree of ``item``.

    A subtree counts as a noun when its label is listed in ``nouns_tags``.
    Returns None when no subtree qualifies.
    """
    matches = (
        node.leaves()
        for node in item.subtrees()
        if node.label() in nouns_tags
    )
    return next(matches, None)
# Extract main verbs and nouns as translation phrase
def process_line(t):
    """Parse one query and collect its verbs and nouns as a short phrase.

    Only the text after the last "##" in ``t`` is parsed. Every VP subtree of
    the parse is visited in traversal order. For each one, the results of
    ``find_verb`` and ``find_noun`` are recorded the first time they are
    seen, and non-empty results are added to the phrase.

    Returns a tuple ``(verbs, nouns, out)``:
      * ``verbs`` / ``nouns`` -- the distinct results in first-seen order;
        each entry is a list of words, or None when a VP had no match.
      * ``out`` -- the phrase: each recorded word list joined by spaces and
        preceded by a single space (so a non-empty phrase starts with " ").
    """
    query = t.split("##")[-1]
    parsed = ParentedTree.convert(parser.parse(query))
    verbs, nouns, pieces = [], [], []

    for vp in parsed.subtrees():
        if vp.label() != 'VP':
            continue
        # NOTE: the is_sbar() check for subordinate clauses is switched off,
        # so every VP contributes, not only those outside an SBAR.
        verb, _ = find_verb(vp)
        if verb not in verbs:
            verbs.append(verb)
            if verb:
                pieces.append(verb)
        noun = find_noun(vp)
        if noun not in nouns:
            nouns.append(noun)
            if noun:
                pieces.append(noun)

    return verbs, nouns, "".join(" " + " ".join(words) for words in pieces)
# Smoke-run the extraction on the sample query ``t`` at import time.
verbs, nouns, out = process_line(t)
stem_queries_verb = {}
from nltk.stem.wordnet import WordNetLemmatizer
# Lemmatize every token of the sample phrase as a verb (POS 'v'), reusing a
# single lemmatizer instance rather than building one per token.
_lemmatizer = WordNetLemmatizer()
words = [_lemmatizer.lemmatize(w, 'v') for w in out.split()]
def process_pip(line):
    """Turn one annotation line into a list of lemmatized tokens.

    The line goes through ``process_line``; the resulting phrase is split on
    whitespace and every token -- nouns included -- is lemmatized with the
    WordNet lemmatizer using POS 'v'.

    Returns the list of lemmas (empty when the phrase is empty).
    """
    # Only the concatenated phrase is needed; the verb/noun lists are unused.
    _, _, out = process_line(line)
    # One lemmatizer for the whole line instead of a new instance per token.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w, 'v') for w in out.split()]
# Read the test and train annotation files; every line is later passed to
# process_pip. NOTE: these are absolute paths on the original author's machine.
# The files are opened with ``with`` so the handles are closed after reading.
with open("/Users/yanjungao/Desktop/VPMT/data/charades/annotations/charades_sta_test.txt") as f:
    test_lines = f.readlines()
with open("/Users/yanjungao/Desktop/VPMT/data/charades/annotations/charades_sta_train.txt") as f:
    train_lines = f.readlines()

# Key every line by its position in the file. enumerate() gives each line its
# own id even when the same text occurs more than once (list.index() would map
# all duplicates to the first occurrence) and avoids a linear scan per line.
test_words = {_id: process_pip(l) for _id, l in enumerate(test_lines)}
train_words = {_id: process_pip(l) for _id, l in enumerate(train_lines)}

# Flatten the token lists of both splits, then deduplicate.
vocabs_all = []
for split_words in (test_words, train_words):
    for words in split_words.values():
        vocabs_all.extend(words)
vocabs = set(vocabs_all)

# Word ids start at 1 because 0 is reserved for 'PAD'. Sorting makes the id
# assignment reproducible across runs; raw set iteration order is not.
vocab_idx = {word: idx for idx, word in enumerate(sorted(vocabs), start=1)}
vocab_idx['PAD'] = 0
# The two markers take the next two free ids after the words and 'PAD'.
vocab_idx['<sos>'] = len(vocab_idx)
vocab_idx['<eos>'] = len(vocab_idx)
# Reverse lookup: id -> token.
idx_vocab = {v: k for k, v in vocab_idx.items()}
def label_index(queries, vocab_idx):
    """Encode each token list in ``queries`` as a list of vocabulary ids.

    Every token list is first wrapped with '<sos>' at the front and '<eos>'
    at the end. This is done in place, so the lists held by ``queries`` are
    modified. Each token is then looked up in ``vocab_idx``.

    Returns a new dict with the same keys as ``queries`` and the id lists as
    values. Raises KeyError when a token is missing from ``vocab_idx``.
    """
    encoded = {}
    for key, tokens in queries.items():
        # In-place on purpose: callers see the markers in their own lists.
        tokens[:0] = ['<sos>']
        tokens.extend(['<eos>'])
        encoded[key] = [vocab_idx[tok] for tok in tokens]
    return encoded
import json

# Encode both splits as id sequences. label_index also inserts '<sos>'/'<eos>'
# into the word lists themselves, so the *_words dumped below carry them too.
train_idx = label_index(train_words, vocab_idx)
test_idx = label_index(test_words, vocab_idx)

# Each output file holds a two-element JSON array.
for out_path, payload in (
    ('train_translate.json', [train_idx, train_words]),
    ('test_translate.json', [test_idx, test_words]),
    ('vocab_translate.json', [vocab_idx, idx_vocab]),
):
    with open(out_path, 'w') as f:
        json.dump(payload, f)