-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path#MaxLogHash.py#
More file actions
121 lines (107 loc) · 3.5 KB
/
#MaxLogHash.py#
File metadata and controls
121 lines (107 loc) · 3.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import random
import math
import argparse
import time
import sys
import mmh3
import numpy as np
from heapq import heapify,heappop,heappush
import time
totalShingles = (1 << 32) - 1
def MaxLog(seed):
randomNoA = hash_parameter(k)
randomNoB = hash_parameter(k)
for item in stream:
# print "item"
# print item
if item [0] in maxShingleID.keys():
max_hash_val_list = maxShingleID[item[0]][0]
max_hash_sig_list = maxShingleID[item[0]][1]
# print max_hash_val_list
# print max_hash_sig_list
for x in range(0, k):
temp = ((randomNoA[x] * mmh3.hash(str(item[1]),seed) + randomNoB[x]) % totalShingles)
temp = temp / float(totalShingles)
log_temp = - math.log(temp,2)
hash_val = math.ceil(log_temp)
if hash_val > max_hash_val_list[x]:
max_hash_val_list[x] = hash_val
max_hash_sig_list[x] = 1
elif hash_val == max_hash_val_list[x]:
max_hash_sig_list[x] = 0
maxShingleID [item[0]][0]= max_hash_val_list
maxShingleID [item[0]][1]= max_hash_sig_list
else:
max_hash_val_list = [-1] * k
max_hash_sig_list = [0] * k
max_hash_res_new = []
max_hash_res_new.append(max_hash_val_list)
max_hash_res_new.append(max_hash_sig_list)
maxShingleID[item[0]] = max_hash_res_new
return
def hash_parameter(k):
randList = []
randIndex = random.randint(0, totalShingles - 1)
randList.append(randIndex)
while k > 0:
while randIndex in randList:
randIndex = random.randint(0, totalShingles - 1)
randList.append(randIndex)
k = k - 1
return randList
def estimate():
con = 0
for x in range(0, k):
if (maxShingleID['setA'][0][x] > maxShingleID['setB'][0][x] and maxShingleID['setA'][1][x] ==1 ):
con = con + 1
elif (maxShingleID['setA'][0][x] < maxShingleID['setB'][0][x] and maxShingleID['setB'][1][x] ==1 ):
con = con + 1
# print con
num = float(k)
# print num
jaccard_sim = 1.0 - con*(1/num)*(1/0.7213)
return jaccard_sim
if __name__ == '__main__':
random_seed = 1
card = 100000
jaccard_true = 0.9
k = int(128)
total_num = card * 2
sim = (2 * jaccard_true) / (1 + jaccard_true)
maxShingleID = {}
the_same_index = total_num / 2 * sim
setA_uni_index = total_num / 2 * 1
setB_uni_index = total_num / 2 * (2 - sim)
stream = []
setA = []
setB = []
#synthetic data
for num in range(total_num):
if num <= the_same_index:
stream.append(['setA', num])
setA.append(num)
stream.append(['setB', num])
setB.append(num)
elif num <= setA_uni_index:
stream.append(['setA', num])
setA.append(num)
elif num <= setB_uni_index:
stream.append(['setB', num])
setB.append(num)
else:
break
mltimes = time.time()
MaxLog(random_seed)
mltimef = time.time()
jtimes = time.time()
jaccard_est = estimate()
jtimef = time.time()
#print('Len Stream: ', len(stream))
#print('setA: ', setA)
#print('setB', setB)
print(jaccard_true, jaccard_est)
mltime = mltimef - mltimes
jtime = jtimef - jtimes
print('Max Log Time: ', mltime)
print('Jaccard Time: ', jtime)
print('Total:', mltime + jtime)