-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathencoder_decoder.py
More file actions
210 lines (180 loc) · 10.3 KB
/
encoder_decoder.py
File metadata and controls
210 lines (180 loc) · 10.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
from __future__ import print_function

import argparse
import pickle as pkl
import sys
import time
import xml.etree.cElementTree as et

import numpy
import numpy as np
import pandas

sys.path.append('../')
from models.pipeline import *
from models.classifier import *
from models.transformer import *
def load_copa(filepath=None):
    """Parse a COPA XML file into its items.

    Returns a tuple (premises, alts, answers, modes):
    - premises: list of premise strings (one per item)
    - alts: list of [alternative1, alternative2] string pairs
    - answers: numpy array of 0-based gold indices into each alts pair
    - modes: list of the items' "asks-for" attributes ('cause' or 'effect')
    """
    corpus = et.parse(filepath).getroot()
    premises, alts, answers, modes = [], [], [], []
    for item in corpus:
        modes.append(item.attrib["asks-for"])
        # the XML labels answers as "1"/"2"; shift to 0-based indices
        answers.append(int(item.attrib["most-plausible-alternative"]) - 1)
        premises.append(item.find("p").text)
        alts.append([item.find("a1").text, item.find("a2").text])
    return premises, alts, numpy.array(answers), modes
def get_copa_scores(model, premises, alts, modes):
    """Score both alternatives of each COPA item and pick the higher one.

    For 'cause' questions the alternative is the hypothesized antecedent,
    so the model scores (alternative -> premise); otherwise the premise is
    the antecedent and the model scores (premise -> alternative).

    Returns (alt1_scores, alt2_scores, pred_alts) where pred_alts[i] is 0
    when alternative 1 scored at least as high as alternative 2, else 1
    (numpy.argmax resolves ties to the first row, i.e. alternative 1).
    """
    alt1_pairs = []
    alt2_pairs = []
    for premise, (alt1, alt2), mode in zip(premises, alts, modes):
        if mode == 'cause':
            # alternative precedes (causes) the premise
            alt1_pairs.append([alt1, premise])
            alt2_pairs.append([alt2, premise])
        else:
            # premise precedes (causes) the alternative
            alt1_pairs.append([premise, alt1])
            alt2_pairs.append([premise, alt2])
    alt1_scores = model.predict(seqs1=[pair[0] for pair in alt1_pairs],
                                seqs2=[pair[1] for pair in alt1_pairs])
    alt2_scores = model.predict(seqs1=[pair[0] for pair in alt2_pairs],
                                seqs2=[pair[1] for pair in alt2_pairs])
    # row 0 = alt1 scores, row 1 = alt2 scores; argmax over rows selects
    # the higher-scoring alternative per item.  (Uses `numpy` consistently;
    # the original mixed it with an undefined `np` alias and carried dead
    # commented-out code.)
    stacked = numpy.stack((numpy.asarray(alt1_scores),
                           numpy.asarray(alt2_scores)))
    pred_alts = numpy.argmax(stacked, axis=0)
    return alt1_scores, alt2_scores, pred_alts
def get_copa_accuracy(pred_alts, answers):
    """Return the fraction of predicted alternatives matching the gold answers."""
    matches = numpy.asarray(pred_alts) == numpy.asarray(answers)
    return numpy.mean(matches)
def eval_copa(model, data_filepath):
    """Load COPA items from data_filepath, score them with model, and print/return accuracy."""
    premises, alts, answers, modes = load_copa(filepath=data_filepath)
    _, _, pred_alts = get_copa_scores(model, premises, alts, modes)
    accuracy = get_copa_accuracy(pred_alts, answers)
    print("COPA accuracy: {:.3f}".format(accuracy))
    return accuracy
def get_seqs(filepath, header=None, chunk_size=None):
    """Read the first column of a CSV file as a list of sequences.

    When chunk_size is truthy, returns a generator that yields one list of
    up to chunk_size sequences per chunk; otherwise returns a single list.

    NOTE(review): the two paths use different encodings ('utf-8' for
    chunked reads, 'ISO-8859-1' for whole-file reads) — this looks
    unintentional; confirm against the actual data files before unifying.
    """
    if not chunk_size:
        frame = pandas.read_csv(filepath, encoding='ISO-8859-1', header=header)
        return frame.iloc[:, 0].values.tolist()
    reader = pandas.read_csv(filepath, encoding='utf-8', header=header,
                             chunksize=chunk_size)
    return (chunk.iloc[:, 0].values.tolist() for chunk in reader)
def load_model(filepath):
    """Restore a previously saved EncoderDecoderPipeline from filepath."""
    return EncoderDecoderPipeline.load(filepath=filepath)
def preprocess(args):
    """Build the transformer/model pipeline, prepare training pairs, and save both.

    In chunked mode (args.chunk_size truthy) the model is trained
    incrementally while streaming the training data.  Otherwise the whole
    corpus is loaded at once and the (untrained) model plus the extracted
    sentence pairs are pickled to disk for the training run in __main__.
    """
    # Sequence preprocessing: lemmatize segments and keep only content words
    transformer = SequenceTransformer(min_freq=args.min_freq, lemmatize=True, filepath=args.save_filepath,
                                      # fine-grained POS tags: adjectives, nouns, adverbs, particles, verbs
                                      include_tags=['JJ', 'JJR', 'JJS', 'NN', 'NNS', 'RB', 'RBR', 'RBS', 'RP',
                                                    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])
    # Encoder-decoder model
    classifier = EncoderDecoder(filepath=args.save_filepath, recurrent=args.recurrent, batch_size=args.batch_size,
                                n_hidden_nodes=args.n_hidden_nodes)
    model = EncoderDecoderPipeline(transformer, classifier)
    if args.chunk_size:  # load training data in chunks
        if not transformer.lexicon:
            for seqs in get_seqs(args.train_seqs, chunk_size=args.chunk_size):
                transformer.make_lexicon(seqs)
        for epoch in range(args.n_epochs):
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            print("EPOCH:", epoch + 1)
            for seqs in get_seqs(args.train_seqs, chunk_size=args.chunk_size):
                # `not args.segment_sents` replaces the original's
                # `False if args.segment_sents else True` (equivalent)
                seq_pairs = get_adj_sent_pairs(seqs, segment_clauses=not args.segment_sents,
                                               max_distance=args.max_pair_distance, max_sent_length=args.max_length)
                model.fit(seqs1=[pair[0] for pair in seq_pairs], seqs2=[pair[1] for pair in seq_pairs],
                          max_length=args.max_length,
                          eval_fn=lambda model: eval_copa(model, data_filepath=args.val_items), n_epochs=1)
    else:  # load entire training data at once
        # load ROCStories
        seqs = get_seqs(args.train_seqs, chunk_size=None)
        if not transformer.lexicon:
            # build the lexicon from the ROCStories sequences
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            print('Making lexicon...')
            transformer.make_lexicon(seqs)
        # pair neighboring sentences (looking forward only), skipping
        # sentences longer than max_length
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        print('Getting adj sent pairs...')
        seq_pairs = get_adj_sent_pairs(seqs, segment_clauses=not args.segment_sents,
                                       max_distance=args.max_pair_distance, max_sent_length=args.max_length)
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        print('Saving model and sent pairs...')
        # `with` closes the files on exit; the original's explicit
        # f.close() calls after each dump were redundant double-closes
        with open('./checkpoints/model.pkl', 'wb') as f:
            pkl.dump(model, f)
        with open('./dataset/processed/pairs.pkl', 'wb') as f:
            pkl.dump(seq_pairs, f)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Train an encoder-decoder model that predicts causally related sentences "
                    "in the Choice of Plausible Alternatives (COPA) framework")
    parser.add_argument("--train_seqs",
                        help="Specify filename (.csv) containing text used as training data.",
                        type=str, default='dataset/raw/stories.csv')
    parser.add_argument("--val_items",
                        help="Specify filename (XML) containing COPA items in validation set.",
                        type=str, default='dataset/raw/copa-dev.xml')
    parser.add_argument("--test_items",
                        help="Specify filename (XML) containing COPA items in test set.",
                        type=str, default='dataset/raw/copa-test.xml')
    parser.add_argument("--save_filepath",
                        help="Specify the directory filepath where the trained model should be stored.",
                        type=str, default='checkpoints')
    parser.add_argument("--min_freq", "-freq",
                        help="Specify frequency threshold for including words in model lexicon, "
                             "such that only words that appear in the training sequences at least "
                             "this number of times will be added (all other words will be mapped to "
                             "a generic <UNKNOWN> token). Default is 5.",
                        required=False, type=int, default=5)
    parser.add_argument("--segment_sents", "-sent",
                        help="Specify if the segments in the input-output pairs should be sentences rather than "
                             "intrasentential clauses (see paper). If not given, clause-based segmentation will be used.",
                        required=False, action='store_true')
    parser.add_argument("--max_length", "-len",
                        help="Specify the maximum length of the input and output segements in the training data "
                             "(in terms of number of words). Pairs with longer sequences will be filtered. Default is 20.",
                        required=False, type=int, default=20)
    parser.add_argument("--max_pair_distance", "-dist",
                        help="Specify the distance window in which neighboring segments will be joined into input-output pairs. "
                             "For example, if this parameter is 3, "
                             "all segments that are separated by 3 or fewer segments in a particular training text "
                             "will be added as pairs. Default is 4.",
                        required=False, type=int, default=4)
    parser.add_argument("--recurrent", "-rec",
                        help="Specify if the model should use RNN (GRU) layers. If not specified, "
                             "feed-forward layers will be used, and the sequential ordering of words in the segments will be ignored.",
                        required=False, action='store_true')
    parser.add_argument("--batch_size", "-batch",
                        help="Specify number of sequences in batch during training. Default is 100.",
                        required=False, type=int, default=100)
    parser.add_argument("--n_hidden_nodes", "-hid",
                        help="Specify number of dimensions in the encoder and decoder layers. Default is 500.",
                        required=False, type=int, default=500)
    parser.add_argument("--n_epochs", "-epoch",
                        help="Specify the number of epochs the model should be trained for. Default is 50.",
                        required=False, type=int, default=50)
    parser.add_argument("--chunk_size", "-chunk",
                        help="If dataset is large, specify this parameter to load training sequences in chunks of "
                             "this size instead of all at once to avoid memory issues."
                             "For smaller datasets (e.g. the ROCStories corpus), "
                             "it is much faster to load entire dataset prior to training. This will be done by default if chunk size is not given.",
                        required=False, type=int, default=0)
    args = parser.parse_args()
    # Build the pipeline and pickle the model and sentence pairs to disk
    # (in chunked mode preprocess() trains the model itself).
    preprocess(args)
    # Reload what preprocess() saved.  The `with` blocks close the files;
    # the original's explicit f.close() calls after each load were
    # redundant double-closes.
    with open('./checkpoints/model.pkl', 'rb') as f:
        model = pkl.load(f)
    with open('./dataset/processed/pairs.pkl', 'rb') as f:
        seq_pairs = pkl.load(f)
    # Train on the saved pairs, evaluating on the COPA validation set
    # after each epoch via eval_fn.
    model.fit(seqs1=[pair[0] for pair in seq_pairs], seqs2=[pair[1] for pair in seq_pairs],
              max_length=args.max_length,
              eval_fn=lambda model: eval_copa(model, data_filepath=args.val_items), n_epochs=args.n_epochs)
    # Evaluate model on test set after training
    print("\ntest accuracy:")
    test_accuracy = eval_copa(model, data_filepath=args.test_items)