-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathquery.py
More file actions
109 lines (97 loc) · 4.41 KB
/
query.py
File metadata and controls
109 lines (97 loc) · 4.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# coding=utf-8
# Binary operators understood by the query language: '|' is union (OR) and
# '&' is intersection (AND). '~' (negation) and parentheses are handled
# separately by the parser.
POSSIBLE_OPERATORS = '|&'


class Tree:
    """Node of a boolean query tree evaluated against an inverted index.

    A node carries an ``operator`` ('&', '|', '~' or None) and a list of
    operands in ``childrens``. An operand is a raw term (string), a nested
    ``Tree``, or, once ``prepare``/``execute`` have run, a ``set`` of
    document identifiers. ``parent`` points at the enclosing node (None for
    the root).

    Typical use::

        tree = Tree()
        Tree.parse(tree, "(foo | bar) & ~baz")
        documents = tree.query(inv_index, tokenizer, normalizer)
    """

    def __init__(self, parent=None):
        self.operator = None
        self.childrens = []
        self.parent = parent

    def normalize(self, tokenizer, normalizer):
        """Replace every raw term by its first normalized token, in place.

        ``tokenizer.tokenize(term, normalizer)`` must return an iterator; its
        first item replaces the term. A term for which the iterator is empty
        is removed from the query. A nested node left without operands is
        removed as well, and a binary node left with fewer than two operands
        becomes a pass-through node (``operator = None``), so the remaining
        operand alone decides the result.

        Operands that are already sets are left untouched, which matches
        what ``prepare`` does with them.
        """
        kept = []
        for child in self.childrens:
            if isinstance(child, Tree):
                child.normalize(tokenizer, normalizer)
                if child.childrens:
                    kept.append(child)
                continue
            if isinstance(child, set):
                kept.append(child)
                continue
            try:
                token = next(tokenizer.tokenize(child, normalizer))
            except StopIteration:
                # The term normalized to nothing: drop it from the query
                # instead of letting an un-normalized term through.
                continue
            kept.append(token)
        self.childrens = kept
        if len(kept) < 2 and self.operator != '~':
            # A binary operator cannot be applied to a single operand.
            self.operator = None

    def prepare(self, inv_index):
        """Replace every term by the set of documents that contain it.

        Terms are looked up in ``inv_index.inverted_index``; the keys of the
        matching entry form the set. Unknown terms become an empty set.
        Nested nodes are prepared recursively and existing sets are kept.
        """
        for position, child in enumerate(self.childrens):
            if isinstance(child, Tree):
                child.prepare(inv_index)
            elif not isinstance(child, set):
                postings = inv_index.inverted_index.get(child, {})
                self.childrens[position] = set(postings.keys())

    def execute(self, inv_index):
        """Evaluate the node and return the resulting set of documents.

        Nested nodes are evaluated first and replaced by their result. A node
        without operands evaluates to an empty set. Returns None only for a
        node whose operator/operand combination is not recognised.
        """
        for position, child in enumerate(self.childrens):
            if isinstance(child, Tree):
                self.childrens[position] = child.execute(inv_index)
        operands = self.childrens
        if not operands:
            return set()
        if self.operator == '&':
            return operands[0] & operands[1]
        if self.operator == '|':
            return operands[0] | operands[1]
        if self.operator == '~':
            # NOTE(review): this relies on inv_index supporting
            # ``inv_index - set`` (presumably "all documents minus operand");
            # confirm against the index class.
            return inv_index - operands[0]
        if self.operator is None and len(operands) == 1:
            return operands[0]
        return None

    def query(self, inv_index, tokenizer, normalizer):
        """Normalize, prepare and evaluate the tree; return the result."""
        self.normalize(tokenizer, normalizer)
        self.prepare(inv_index)
        return self.execute(inv_index)

    @staticmethod
    def parse(tree, query_string):
        """Populate ``tree`` from the textual query ``query_string``.

        Grammar (spaces are removed before parsing)::

            expression := operand (operator+ operand)*
            operand    := '~' operand | '(' expression ')' | term
            operator   := '|' | '&'

        A term starts with any character other than ``( ) ~ | &`` and runs
        up to the next '|', '&' or ')'. Binary operators have equal
        precedence and associate to the left: ``a&b|c`` yields a '|' node
        whose first operand is the node for ``a&b``. Each parenthesised
        group and each negation becomes its own child node. A run of
        operators counts as its last one (``a&&b`` equals ``a&b``), and a
        missing ')' at the end of the query is tolerated.

        ``tree`` is normally a fresh, empty node. If it already has
        operands, the query must start with an operator and is appended to
        them. An empty or all-space query leaves ``tree`` untouched.

        Raises:
            ValueError: on a missing operand or operator, an empty group, or
                a ')' without a matching '('.
        """
        text = query_string.replace(' ', '')
        if not text:
            return
        Tree._parse_expression(tree, text, 0, False)

    @staticmethod
    def _parse_expression(tree, text, pos, nested):
        """Fill ``tree`` from ``text[pos:]``; return the position reached.

        Parsing stops at the end of ``text`` or, when ``nested`` is true, at
        the ')' closing the current group (which is not consumed).
        """
        operator = None
        while pos < len(text):
            char = text[pos]
            if char == ')':
                if not nested:
                    raise ValueError(
                        "unbalanced ')' at position %d in %r" % (pos, text))
                break
            if char in POSSIBLE_OPERATORS:
                operator = char  # in a run of operators the last one wins
                pos += 1
                continue
            operand, pos = Tree._parse_operand(tree, text, pos)
            Tree._attach(tree, operator, operand)
            operator = None
        if operator is not None:
            raise ValueError(
                "operator %r has no right operand in %r" % (operator, text))
        return pos

    @staticmethod
    def _parse_operand(parent, text, pos):
        """Parse one operand at ``text[pos]``; return (operand, next_pos)."""
        if pos >= len(text) or text[pos] in POSSIBLE_OPERATORS + ')':
            raise ValueError(
                "missing operand at position %d in %r" % (pos, text))
        char = text[pos]
        if char == '~':
            node = Tree(parent)
            node.operator = '~'
            operand, pos = Tree._parse_operand(node, text, pos + 1)
            node.childrens.append(operand)
            return node, pos
        if char == '(':
            node = Tree(parent)
            pos = Tree._parse_expression(node, text, pos + 1, True)
            if not node.childrens:
                raise ValueError("empty group in %r" % (text,))
            if pos < len(text):
                pos += 1  # consume ')'; a missing one at the end is tolerated
            return node, pos
        end = pos + 1
        while end < len(text) and text[end] not in POSSIBLE_OPERATORS + ')':
            end += 1
        return text[pos:end], end

    @staticmethod
    def _attach(tree, operator, operand):
        """Add ``operand`` to ``tree``, combined through ``operator``."""
        if not tree.childrens:
            if operator is not None:
                raise ValueError(
                    "operator %r has no left operand" % (operator,))
            tree.childrens.append(operand)
            return
        if operator is None:
            raise ValueError("missing operator before %r" % (operand,))
        if len(tree.childrens) > 1:
            # The node is full: push its current content one level down so
            # that it becomes the left operand of the new operator.
            inner = Tree(tree)
            inner.operator = tree.operator
            inner.childrens = tree.childrens
            for child in inner.childrens:
                if isinstance(child, Tree):
                    child.parent = inner
            tree.childrens = [inner]
        tree.operator = operator
        tree.childrens.append(operand)