-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess.py
More file actions
44 lines (40 loc) · 1.8 KB
/
preprocess.py
File metadata and controls
44 lines (40 loc) · 1.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import nltk
def tokenizeAndFilter(sentence):
    """Tokenize *sentence* and drop filler words.

    The sentence is split with ``nltk.word_tokenize``. A token ending in
    ``.`` or a space loses that last character, and a ``vector`` / ``Vector``
    token directly followed by ``space`` / ``Space`` is rewritten to
    ``"Vector Space"`` (the following ``space`` token itself is left in the
    list). Tokens found in the omit set, including the empty strings left
    behind when a lone ``.`` is stripped, are then removed.

    Args:
        sentence: The input sentence as a string.

    Returns:
        A list of the remaining token strings, in their original order.
    """
    # Blank input has nothing to tokenize; skip the tokenizer entirely.
    if not sentence.strip():
        return []

    # TODO: make a custom tokenizer so that important domain-related chunks
    # can be tokenized as single units.
    tokenized = nltk.word_tokenize(sentence)
    lastIndex = len(tokenized) - 1
    for i, token in enumerate(tokenized):
        # endswith() is safe on an empty token, unlike token[-1].
        if token.endswith((".", " ")):
            tokenized[i] = token[:-1]
        elif (
            token in ("vector", "Vector")
            # Guard the look-ahead: "vector" may be the final token.
            and i < lastIndex
            and tokenized[i + 1] in ("space", "Space")
        ):
            tokenized[i] = "Vector Space"

    # NOTE(review): the multi-word entries ("there are", "there is")
    # presumably never match, since tokens are single words -- confirm.
    omitWords = {"The", "there are", "there is", "draw", "make", "construct",
                 "Construct", "Make", "Draw", "is", "an", "Given", "given",
                 ",", " ", ""}
    return [w for w in tokenized if w not in omitWords]
def seqLabel(tokenizedSent):
    """Attach a label to each recognised token of a tokenized sentence.

    Tokens are checked in order against, first match wins:

    * entity types      -> ``(capitalized token, "entityType")``
    * relations         -> ``(capitalized token, relation kind)``
    * directional words -> ``(token, capitalized token)``
    * tokens of at most two characters -> ``(token, "name")`` the first time
      the exact token is seen, ``(token, "declaredBefore")`` afterwards

    Tokens matching none of these produce no entry in the result.

    Args:
        tokenizedSent: An iterable of token strings.

    Returns:
        A list of ``(text, label)`` tuples in token order.
    """
    types = {"Function", "Set", "Vector Space", "Vector"}
    # just have names as set name defines function
    relations = {
        "Injection": "BinRelFunc",
        "Bijection": "BinRelFunc",
        "Surjection": "BinRelFunc",
        "Orthogonal": "BinRelVec",
        "Intersection": "BinRelSet",
        "+": "BinRelVec",
        "=": "BinRelVec",
    }
    directional = {"From", "To", "In"}
    named = set()  # short tokens already emitted as a "name"
    label = []
    for word in tokenizedSent:
        capitalizedWord = word.capitalize()
        if capitalizedWord in types or word in types:
            # NOTE(review): capitalize() lower-cases everything after the
            # first character, so the "Vector Space" token is emitted as
            # "Vector space" here -- confirm that consumers expect this.
            label.append((capitalizedWord, "entityType"))
        elif capitalizedWord in relations:
            label.append((capitalizedWord, relations[capitalizedWord]))
        elif capitalizedWord in directional:
            label.append((word, capitalizedWord))
        elif len(word) <= 2:
            # Short tokens are treated as identifiers; the lookup is
            # case-sensitive because the raw token is what gets stored.
            if word in named:
                label.append((word, "declaredBefore"))
            else:
                label.append((word, "name"))
                named.add(word)
    return label