# prepare_data.py
import os

import tensorflow as tf
import tensorflow_hub as hub

import bert
from bert import tokenization

import input_builder

MAX_SEQ_LENGTH = 128

os.environ['TFHUB_CACHE_DIR'] = '/home/djjindal/bert/script-learning'
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# This is a path to an uncased (all lowercase) version of BERT.
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def tokenize_if_small_enough(ds, sentences="True", no_context="True"):
    """Yield features for up to 10,000 examples, dropping over-long ones.

    The flags are string-valued ("True"/"False") to match the string
    comparisons in tokenize_dataset_dict and convert_single_example2 below.
    """
    # Examples that tokenize past MAX_SEQ_LENGTH trip an assertion in
    # convert_single_example2 and are silently skipped here.
    for _, d in zip(range(10000), ds):
        try:
            yield tokenize_dataset_dict(d, sentences, no_context)
        except AssertionError:
            continue

def create_tokenizer_from_hub_module():
    """Get the vocab file and casing info from the Hub module."""
    # Uses the TF1 graph/session API, matching the hub.Module interface.
    with tf.Graph().as_default():
        bert_module = hub.Module(BERT_MODEL_HUB)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                                  tokenization_info["do_lower_case"]])
    return bert.tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)


# Built once at import time; the functions below close over this tokenizer.
tokenizer = create_tokenizer_from_hub_module()

def get_token_ids(sentence, tokenizer, entity):
    """WordPiece-tokenize either a plain sentence or an (s, v, o) triple.

    In a triple, a missing (None) slot is replaced by the chain's entity.
    """
    tokens = []
    if isinstance(sentence, str):
        for orig_token in sentence.split(" "):
            tokens.extend(tokenizer.tokenize(orig_token))
    elif isinstance(sentence, (tuple, list)):
        for svo in sentence:
            if svo is None:
                svo = entity
            for orig_token in svo.split(" "):
                tokens.extend(tokenizer.tokenize(orig_token))
    return tokens
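
# Illustration only (not from the original file): the exact WordPiece splits
# depend on the loaded vocab; 'em ##bed ##ding ##s' is the split the uncased
# BERT vocab is documented to produce for "embeddings".
#
#   get_token_ids("tom likes embeddings", tokenizer, None)
#   -> ['tom', 'likes', 'em', '##bed', '##ding', '##s']
#
#   get_token_ids(("tom", None, "milk"), tokenizer, "tom")
#   -> ['tom', 'tom', 'milk']   # the None slot is filled by the entity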
"""# Make Data InputFeatures"""
#If candidates is list of strings, entity can be None
def convert_single_example2(event_chain, candidates, entity, label, max_seq_length,
                            no_context, tokenizer):
    """Build one InputFeatures holding a padded sequence per candidate ending.

    Each candidate sequence is laid out as
        [CLS] <event-chain tokens> [SEP] <candidate tokens>
    with segment id 0 over the context and 1 over the candidate.
    """
    tokens_e = []
    input_id_list = []
    input_mask_list = []
    segment_id_list = []
    tokens_e.append("[CLS]")
    # Fill token ids from the event chain (skipped when the string flag
    # no_context is "True").
    if no_context != "True":
        for event in event_chain:
            tokens_e.extend(get_token_ids(event, tokenizer, entity))
    tokens_e.append("[SEP]")
    segment_ids_e = [0] * len(tokens_e)
    for candidate in candidates:
        tokens = list(tokens_e)
        segment_ids = list(segment_ids_e)
        candidate_tokens = get_token_ids(candidate, tokenizer, entity)
        tokens.extend(candidate_tokens)
        segment_ids.extend([1] * len(candidate_tokens))
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        # Over-long examples raise here and are dropped by tokenize_if_small_enough.
        assert len(input_ids) <= max_seq_length
        assert len(segment_ids) <= max_seq_length
        # Zero-pad everything out to max_seq_length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
        input_id_list.append(input_ids)
        input_mask_list.append(input_mask)
        segment_id_list.append(segment_ids)
    return input_builder.InputFeatures(
        input_ids=input_id_list,
        input_mask=input_mask_list,
        segment_ids=segment_id_list,
        label_id=label + 1,  # index of the correct candidate, shifted by one
        is_real_example=True)
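
# Shape sketch (the numbers are hypothetical): with 5 candidates and
# max_seq_length=128, the returned InputFeatures holds 5x128 lists for
# input_ids, input_mask, and segment_ids, plus a scalar label_id in [1, 5].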

def tokenize_dataset_dict(ec_dict, sentence, no_context):
    """Convert one event-chain dict into InputFeatures.

    The string flag `sentence` selects raw sentences ("True") or SVO triples
    (anything else) as the context; the final element is dropped, since the
    candidate endings stand in for it.
    """
    train_sents = ec_dict['sentences']
    train_triples = ec_dict['triples']
    candidates = ec_dict['candidates']
    correct_ending = ec_dict['correct']
    entity = ec_dict['entity']
    if sentence == "True":
        return convert_single_example2(train_sents[:-1], candidates, entity,
                                       correct_ending, MAX_SEQ_LENGTH, no_context, tokenizer)
    return convert_single_example2(train_triples[:-1], candidates, entity,
                                   correct_ending, MAX_SEQ_LENGTH, no_context, tokenizer)
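

# Minimal usage sketch, not part of the original pipeline: the example dict and
# its values are made up, but follow the schema tokenize_dataset_dict reads
# ('sentences', 'triples', 'candidates', 'correct', 'entity').
if __name__ == "__main__":
    example = {
        'sentences': ["tom went to the store", "tom picked up milk", "tom paid the cashier"],
        'triples': [("tom", "went", "store"), ("tom", "picked up", "milk"), ("tom", "paid", None)],
        'candidates': ["tom paid the cashier", "tom flew a kite"],
        'correct': 0,
        'entity': "tom",
    }
    # Use sentences as context; include the event chain (no_context="False").
    features = list(tokenize_if_small_enough([example], sentences="True", no_context="False"))
    print("prepared %d examples" % len(features))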