sem_parser/preprocess.py at master · Adityav369/sem_parser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import nltk


def tokenizeAndFilter(sentence):
    """
    Tokenize ``sentence`` with NLTK and drop filler tokens.

    Processing steps, in order:

    1. Split the sentence with ``nltk.word_tokenize``.
    2. Strip one trailing ``.`` or space from every token that ends in one.
    3. When a ``vector``/``Vector`` token is immediately followed by
       ``space``/``Space``, replace the first of the two with the single
       token ``"Vector Space"``. The following ``space`` token itself is
       left in the list.
    4. Remove every token that appears in the stop-word set below
       (this also removes tokens that step 2 reduced to ``""``).

    :param sentence: the raw sentence to tokenize.
    :return: list of the remaining tokens, in their original order.
    """
    # TODO: make a custom tokenizer so important domain-related chunks
    # can be tokenized as one unit
    tokenized = nltk.word_tokenize(sentence)
    lastIndex = len(tokenized) - 1
    for i, token in enumerate(tokenized):
        # endswith() is also safe for an empty token, unlike token[-1].
        if token.endswith((".", " ")):
            tokenized[i] = token[:-1]
        # Only look ahead when a next token exists; without this guard a
        # sentence ending in "vector" raised IndexError.
        if token in ("vector", "Vector") and i < lastIndex:
            if tokenized[i + 1] in ("space", "Space"):
                tokenized[i] = "Vector Space"
    omitWords = {"The", "there are", "there is", "draw", "make", "construct",
                 "Construct", "Make", "Draw", "is", "an", "Given", "given", ",", " ", ""}
    return [w for w in tokenized if w not in omitWords]


def seqLabel(tokenizedSent):
    """
    Attach a label to each recognised token of a tokenized sentence.

    Every token is compared (mostly via its ``str.capitalize()`` form)
    against a few fixed vocabularies, checked in this order:

    * entity types  -> ``(capitalized, "entityType")``
    * relations     -> ``(capitalized, <relation kind>)``
    * directionals  -> ``(token, capitalized)``
    * tokens of at most two characters are treated as names:
      ``(token, "name")`` the first time the token is seen and
      ``(token, "declaredBefore")`` on every later occurrence.

    Tokens that fit none of these categories are skipped.

    :param tokenizedSent: iterable of string tokens.
    :return: list of ``(text, label)`` tuples in input order.
    """
    entityTypes = {"Function", "Set", "Vector Space", "Vector"}
    # Only names are stored here; the relation name determines its kind.
    relationKinds = {
        "Injection": "BinRelFunc",
        "Bijection": "BinRelFunc",
        "Surjection": "BinRelFunc",
        "Orthogonal": "BinRelVec",
        "Intersection": "BinRelSet",
        "+": "BinRelVec",
        "=": "BinRelVec",
    }
    directionWords = {"From", "To", "In"}

    seenNames = set()
    labelled = []
    for raw in tokenizedSent:
        cap = raw.capitalize()

        if raw in entityTypes or cap in entityTypes:
            # NOTE(review): the capitalized form is what gets emitted, so
            # the token "Vector Space" is labelled as "Vector space" --
            # confirm downstream consumers expect that spelling.
            labelled.append((cap, "entityType"))
            continue

        kind = relationKinds.get(cap)
        if kind is not None:
            labelled.append((cap, kind))
            continue

        if cap in directionWords:
            labelled.append((raw, cap))
            continue

        if len(raw) > 2:
            # Longer unknown words carry no label.
            continue

        tag = "declaredBefore" if raw in seenNames else "name"
        seenNames.add(raw)
        labelled.append((raw, tag))
    return labelled