-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfilter_freebase.py
More file actions
89 lines (71 loc) · 2.59 KB
/
filter_freebase.py
File metadata and controls
89 lines (71 loc) · 2.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import gzip
import pickle
# Gzip-compressed dump read by the filter, and the gzip file it writes.
in_file = "freebase-latest.gz"
out_file = "freebase-filtered.gz"

# Type key (second column of numeric_properties.txt) -> XML Schema datatype
# IRI that is attached to literal objects of the corresponding predicates.
datatype_string = {
    "type.int": "<http://www.w3.org/2001/XMLSchema#integer>",
    "type.float": "<http://www.w3.org/2001/XMLSchema#float>",
    "type.boolean": "<http://www.w3.org/2001/XMLSchema#boolean>",
}

# Predicates that are kept even when they begin with one of the prefixes in
# PREDICATE_START_UNNECESARRY.
ENTITY_GET_LABEL = [
    b"<http://rdf.freebase.com/ns/type.object.name>",
    b"<http://rdf.freebase.com/ns/common.topic.alias>",
    b"<http://rdf.freebase.com/key/en>",
    b"<http://rdf.freebase.com/key/wikipedia.en>",
    b"<http://rdf.freebase.com/key/wikipedia.en_title>",
    b"<http://www.w3.org/2000/01/rdf-schema#label>",
]

# Predicates whose triples are kept only when the object ends with b"@en".
LBL_DESCR_LANG_ID = [
    b"<http://www.w3.org/2000/01/rdf-schema#label>",
    b"<http://rdf.freebase.com/ns/type.object.name>",
    b"<http://rdf.freebase.com/ns/common.topic.description>",
]

# Triples whose predicate starts with any of these are dropped.
PREDICATES_TYPEOBJECT = [
    b"<http://rdf.freebase.com/ns/type.object.key>",
    b"<http://rdf.freebase.com/ns/type.object.id>",
    b"<http://rdf.freebase.com/ns/type.object.permission>",
]

# Triples whose predicate starts with any of these are dropped, unless the
# predicate is listed in ENTITY_GET_LABEL.
# NOTE(review): none of the Freebase prefixes below contain the "/ns/" path
# segment that predicates such as ".../ns/common.topic.alias" carry, so a
# startswith() test will not match ns-scoped predicates -- confirm intended.
PREDICATE_START_UNNECESARRY = [
    b"<http://rdf.freebase.com/key",
    b"<http://rdf.freebase.com/dataworld",
    b"<http://rdf.freebase.com/freebase",
    b"<http://rdf.freebase.com/user",
    b"<http://rdf.freebase.com/base",
    b"<http://rdf.freebase.com/common",
    b"<http://www.w3.org/1999/02/22-rdf-syntax-ns",
]
def build_type_map(lines, datatypes):
    """Parse ``predicate<TAB>type`` rows into a bytes-to-bytes datatype lookup.

    The dump is processed as bytes, so both keys and values are encoded here:
    a ``str`` key can never equal a ``bytes`` predicate name, and
    ``bytes + str`` raises ``TypeError``.

    Args:
        lines: Iterable of text rows, each ``<predicate name><TAB><type key>``.
        datatypes: Mapping from type key (e.g. ``"type.int"``) to the datatype
            IRI string that should be attached to literals.

    Returns:
        Dict mapping each predicate name (bytes) to its datatype IRI (bytes).

    Raises:
        ValueError: If a non-blank row does not have exactly two fields.
        KeyError: If a row uses a type key that is missing from *datatypes*.
    """
    mapping = {}
    for row in lines:
        row = row.strip()
        if not row:
            # Tolerate blank rows (e.g. a trailing empty line).
            continue
        pred_name, type_key = row.split("\t")
        mapping[pred_name.encode("utf-8")] = datatypes[type_key].encode("utf-8")
    return mapping


def predicate_local_name(pred):
    """Return the part of *pred* after its last ``/``, minus the final byte.

    For ``b"<http://host/ns/a.b.c>"`` this yields ``b"a.b.c"`` (the final
    byte dropped is the closing ``>``).
    """
    return pred[pred.rfind(b"/") + 1:-1]


def annotate_object(obj, datatype):
    """Return *obj* with ``^^datatype`` attached unless it already has one.

    Args:
        obj: Object field of a triple, as bytes.
        datatype: Datatype IRI (bytes, including the angle brackets).

    Returns:
        *obj* unchanged when it already contains ``^^``; otherwise the object
        (wrapped in double quotes if it contains none) followed by
        ``^^`` and *datatype*.
    """
    if b"^^" in obj:
        return obj
    if b'"' in obj:
        # NOTE(review): a quoted literal carrying a language tag would get the
        # datatype appended after the tag -- confirm such objects cannot occur
        # for the predicates listed in numeric_properties.txt.
        return obj + b"^^" + datatype
    return b'"' + obj + b'"^^' + datatype


def filter_triples(lines, type_map):
    """Yield the lines of *lines* that survive filtering, with typed literals.

    A line is dropped when its predicate starts with a prefix from
    ``PREDICATE_START_UNNECESARRY`` (unless the predicate is listed in
    ``ENTITY_GET_LABEL``), starts with a prefix from ``PREDICATES_TYPEOBJECT``,
    or is listed in ``LBL_DESCR_LANG_ID`` while the object does not end with
    ``b"@en"``.  For kept lines whose predicate local name is in *type_map*,
    the object is passed through :func:`annotate_object`.

    Prints the running line number every 1,000,000 input lines.

    Args:
        lines: Iterable of bytes lines with four tab-separated fields
            (subject, predicate, object, remainder).
        type_map: Mapping from predicate local name (bytes) to datatype IRI
            (bytes), as produced by :func:`build_type_map`.

    Raises:
        ValueError: If a non-blank line does not have exactly four fields.
    """
    # Hoisted out of the loop: startswith() accepts a tuple of prefixes, and
    # frozensets give constant-time membership tests.
    drop_prefixes = tuple(PREDICATE_START_UNNECESARRY)
    always_keep = frozenset(ENTITY_GET_LABEL)
    typeobject_prefixes = tuple(PREDICATES_TYPEOBJECT)
    english_only = frozenset(LBL_DESCR_LANG_ID)

    for line_num, line in enumerate(lines, start=1):
        if line_num % 1000000 == 0:
            print(line_num)
        # Iterated lines keep their newline, so test the stripped content.
        if not line.strip():
            continue

        subj, pred, obj, rest = line.split(b"\t")
        if pred.startswith(drop_prefixes) and pred not in always_keep:
            continue
        if pred.startswith(typeobject_prefixes):
            continue
        if pred in english_only and not obj.endswith(b"@en"):
            continue

        datatype = type_map.get(predicate_local_name(pred))
        if datatype is not None:
            line = b"\t".join([subj, pred, annotate_object(obj, datatype), rest])
        yield line


def main():
    """Filter ``in_file`` into ``out_file`` and report how many triples were kept."""
    with open("numeric_properties.txt", "r", encoding="utf-8") as f_props:
        type_map = build_type_map(f_props, datatype_string)

    added = 0
    # Context managers guarantee the gzip streams are flushed and closed even
    # if processing fails part-way through.
    with gzip.open(in_file, "rb") as f_in, gzip.open(out_file, "wb") as f_out:
        for kept_line in filter_triples(f_in, type_map):
            f_out.write(kept_line)
            added += 1
    print(f"Added: {added} triplets")


if __name__ == "__main__":
    main()