-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmi_process.py
More file actions
executable file
·124 lines (110 loc) · 3.03 KB
/
mi_process.py
File metadata and controls
executable file
·124 lines (110 loc) · 3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import codecs
import json
import math
import csv
# Sense labels tracked by mi_wps(): each one gets a counter there, and the
# grand total used in the mutual-information computation is summed over them.
SENSES = [
    'Expansion.List',
    'Expansion.Conjunction',
    'Expansion.Instantiation',
    'Contingency.Cause',
    'Temporal.Asynchronous',
    'Comparison.Contrast',
    'Expansion.Restatement',
    'Temporal.Synchrony',
    'Contingency.Pragmatic cause',
    'Comparison.Concession',
    'Expansion.Alternative',
]
def read_data(file_name):
    """Load the implicit relations from a JSON-lines file.

    Each non-blank line of *file_name* is parsed as one JSON object; only
    objects whose ``'Type'`` field equals ``'Implicit'`` are kept.

    Args:
        file_name: Path to a UTF-8 encoded file with one JSON object per line.

    Returns:
        A list of the parsed objects with ``Type == 'Implicit'``, in file
        order.

    Raises:
        ValueError: If a non-blank line is not valid JSON.
        KeyError: If a parsed object has no ``'Type'`` field.
    """
    data = []
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line (e.g. a stray empty line at the end of the file)
            # is not a JSON document; skip it instead of crashing the load.
            if not line.strip():
                continue
            obj = json.loads(line)
            if obj['Type'] == 'Implicit':
                data.append(obj)
    return data
def get_all_wp(fname, length=-1):
    """Read word pairs from a text file, one pair per line.

    Args:
        fname: Path of the word-pair file.
        length: Stop before the line with this zero-based index, i.e. read
            at most ``length`` lines. The default ``-1`` never matches a
            line index, so the whole file is read.

    Returns:
        A dict mapping each word-pair string (line terminator removed) to
        the zero-based index of the line it was read from. If a pair occurs
        more than once, the index of its last occurrence wins.
    """
    all_pairs = {}
    # ``with`` guarantees the handle is closed even if reading fails.
    with open(fname) as wp_file:
        for lineno, line in enumerate(wp_file):
            if lineno == length:
                break
            # Strip only the line terminator so that a final line without a
            # trailing newline does not lose its last character.
            pair = line.rstrip('\r\n')
            if not pair:
                # Blank lines carry no word pair; they still consume a
                # line index because ``enumerate`` counts them.
                continue
            all_pairs[pair] = lineno
    return all_pairs
def mycmp(x, y):
    """Compare two records by their third element, largest first.

    Args:
        x: An indexable record; ``x[2]`` is the sort value.
        y: An indexable record; ``y[2]`` is the sort value.

    Returns:
        ``-1`` if ``x[2] > y[2]``, ``1`` if ``x[2] < y[2]`` and ``0`` when
        they are equal, so sorting with this comparator yields descending
        order of the third element.
    """
    left, right = x[2], y[2]
    # Equivalent to ``-cmp(left, right)`` but does not depend on the ``cmp``
    # builtin, which only exists in Python 2.
    return (right > left) - (right < left)
def _mi_term(count, row_total, col_total, total):
    """Return one contingency-table cell's contribution to mutual information (bits)."""
    if count == 0:
        # An empty cell contributes nothing, and log(0) is undefined.
        return 0.0
    probability = count * 1.0 / total
    return probability * math.log(total * count * 1.0 / (row_total * col_total), 2)


def mi_wps():
    """Score word pairs by their mutual information with relation senses.

    Steps:
      1. Load up to 10000 word pairs from ``word_pairs.txt`` and the
         implicit relations from ``train_pdtb.json``.
      2. For every relation, form ``arg1_arg2`` from each Arg1 lemma and
         each Arg2 lemma; for every pair that is in the word-pair list,
         count the pair, and count each of the relation's senses both
         globally and per pair.
      3. Dump the per-pair sense counts to ``wp_sense_dict.json``.
      4. For every word pair seen at least once, compute the mutual
         information between "pair present" and "sense present" for each
         sense it co-occurred with, and keep the best-scoring sense.
      5. Write ``<pair> <sense> <score>`` lines, highest score first, to the
         output file.

    Progress and intermediate values are printed to stdout. Returns None.

    Raises:
        KeyError: If a relation carries a sense that is not in ``SENSES``.
    """
    wps = get_all_wp('word_pairs.txt', 10000)
    wps_list = list(wps)
    all_jsons = read_data('train_pdtb.json')
    wp_sense_dict = {}                      # pair -> {sense: co-occurrence count}
    sense_dict = dict.fromkeys(SENSES, 0)   # sense -> count over all pair hits
    wp_dict = dict.fromkeys(wps_list, 0)    # pair -> number of hits
    wp_res_list = []
    hit_wp = 0

    for index, relation in enumerate(all_jsons):
        if index % 100 == 0:
            print('index %d' % index)
            print(hit_wp)
        for arg1 in relation['Arg1']['Lemma']:
            for arg2 in relation['Arg2']['Lemma']:
                wp = '%s_%s' % (arg1, arg2)
                # Dict membership is O(1); scanning the list of pairs here
                # would cost a full pass per lemma combination.
                if wp not in wp_dict:
                    continue
                hit_wp += 1
                wp_dict[wp] += 1
                sense_counts = wp_sense_dict.setdefault(wp, {})
                for sense in relation['Sense']:
                    sense_dict[sense] += 1
                    sense_counts[sense] = sense_counts.get(sense, 0) + 1
    print('pre process finished...\n')

    with open('wp_sense_dict.json', 'w') as f:
        f.write(json.dumps(wp_sense_dict))

    total = sum(sense_dict[sense] for sense in SENSES)
    print(total)

    for wp in wps_list:
        if wp_dict[wp] == 0:
            # Pair never occurred in the data: report it and move on.
            print(wp)
            continue
        mi = []
        for sense, n11 in wp_sense_dict[wp].items():
            # 2x2 contingency table: first index = pair present (1) or
            # absent (0), second index = sense present (1) or absent (0).
            n01 = sense_dict[sense] - n11
            n10 = wp_dict[wp] - n11
            n00 = total - n11 - n10 - n01
            n1_ = n11 + n10
            n_1 = n11 + n01
            n0_ = n01 + n00
            n_0 = n10 + n00
            # Each term is (n/total) * log2(total*n / (row*col)); the log
            # multiplies the probability rather than dividing it.
            mi_value = (_mi_term(n11, n1_, n_1, total)
                        + _mi_term(n01, n0_, n_1, total)
                        + _mi_term(n10, n1_, n_0, total)
                        + _mi_term(n00, n0_, n_0, total))
            mi.append((sense, mi_value))
        if not mi:
            continue
        # max() returns the first entry among equal scores.
        mi_max = max(mi, key=lambda pair: pair[1])
        wp_res_list.append((wp, mi_max[0], mi_max[1]))
        print(mi_max)

    # Stable sort, highest score first; ties keep their existing order.
    wp_res_list.sort(key=lambda res: res[2], reverse=True)
    print('process finished...\n')

    # NOTE(review): the file name 'word_pairs, mi' looks like a typo (maybe
    # 'word_pairs.mi'); kept unchanged because other tools may read it.
    with open('word_pairs, mi', 'w') as wp_mi_file:
        for wp_res in wp_res_list:
            wp_mi_file.write(str(wp_res[0]) + ' ' + str(wp_res[1]) + ' ' + str(wp_res[2]) + '\n')
# The bare string literal below holds an unfinished stub for mi_prule();
# being a string, it is never executed as code.
'''
def mi_prule():
'''
# No __main__ guard: the analysis runs whenever this module is executed
# or imported.
mi_wps()