# mintt/utils.py at master · data-iitd/mintt · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# graph agg 3

from torch.nn import Linear
from torch_geometric.data import TemporalData
from torch_geometric.loader import TemporalDataLoader
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
import torch
from torch import Tensor
import pandas as pd
import numpy as np
from torch_geometric.nn import Node2Vec
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os
from tqdm import tqdm
from numpy.linalg import norm
# print(torch.__version__)
import random
import json
import ast
# def reduce_data(data,keep=.1):
#     src,dst,t,msg = data.src,data.dst,data.t,data.msg
#     return TemporalData(src=data.src[:len_], dst=data.dst[:len_], t=data.t[:len_], msg=data.msg[:len_])

def filter_data(data,src_nodes):
    """Keep only the events whose source AND destination are both in `src_nodes`.

    Args:
        data: object exposing `src`, `dst`, `t` and `msg` tensors; the result
            is rebuilt as a `TemporalData`.
        src_nodes: collection of node ids to keep.

    Returns:
        A new `TemporalData` containing only the selected events.
    """
    # np.isin replaces np.in1d (deprecated since NumPy 2.0). The resulting
    # boolean membership mask is the same for 1-D src/dst arrays
    # (assumes src/dst are 1-D -- TODO confirm against callers).
    src_kept = np.isin(data.src.cpu().numpy(), src_nodes)
    dst_kept = np.isin(data.dst.cpu().numpy(), src_nodes)
    # Logical AND of the two masks (the original spelled this as a product).
    keep = src_kept & dst_kept
    return TemporalData(src=data.src[keep], dst=data.dst[keep], t=data.t[keep], msg=data.msg[keep])

def calculate_rank(lst,item):
    """Return the 1-based position of the first row of `lst` whose first element equals `item`.

    Returns 0 when no row matches (including when `lst` is empty).
    """
    positions = (
        position
        for position, entry in enumerate(lst, start=1)
        if entry[0] == item
    )
    # The default of 0 signals "item is not in the list".
    return next(positions, 0)

def read_interaction_data_old(file_path,city,randomize_features=False,dataset="4square"):
    """Load the interaction table, edge features and node features for one city.

    Reads `ml_{dataset}_{city}.csv`, `ml_{dataset}_{city}.npy` and
    `ml_{dataset}_{city}_node.npy` from the directory `file_path`.

    Args:
        file_path: directory containing the three files (string).
        city: city key used in the file names.
        randomize_features: when true, the loaded node features are replaced
            by uniform random values with the same first two dimensions.
        dataset: dataset key used in the file names.

    Returns:
        Tuple `(graph_df, edge_features, node_features)`.
    """
    stem = file_path + f'/ml_{dataset}_{city}'
    graph_df = pd.read_csv(stem + '.csv')
    edge_features = np.load(stem + '.npy')
    node_features = np.load(stem + '_node.npy')

    if not randomize_features:
        return graph_df, edge_features, node_features

    n_rows = node_features.shape[0]
    n_cols = node_features.shape[1]
    return graph_df, edge_features, np.random.rand(n_rows, n_cols)

def read_interaction_data(file_path,city,randomize_features=False,dataset="4square"):
    """Read the saved graph CSV plus the edge and node feature arrays of a city.

    The three files live in `file_path` and are named
    `ml_{dataset}_{city}.csv`, `ml_{dataset}_{city}.npy` and
    `ml_{dataset}_{city}_node.npy`.

    Args:
        file_path: directory holding the files (string).
        city: city key used in the file names.
        randomize_features: if true, return uniform random node features
            (same first two dimensions) instead of the stored ones.
        dataset: dataset key used in the file names.

    Returns:
        Tuple `(graph_df, edge_features, node_features)`.
    """
    base = file_path + f'/ml_{dataset}_{city}'
    graph_df = pd.read_csv(f'{base}.csv')
    # Edge features are loaded first, then node features.
    edge_features, node_features = (
        np.load(f'{base}{suffix}') for suffix in ('.npy', '_node.npy')
    )

    if randomize_features:
        dims = node_features.shape
        node_features = np.random.rand(dims[0], dims[1])

    return graph_df, edge_features, node_features


def seed_everything(seed: int):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also stores the seed in the `PYTHONHASHSEED` environment variable and
    switches cuDNN to deterministic mode with benchmarking turned off.

    Args:
        seed: the seed value applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Module-wide default seed. It is applied right here, at import time, so
# simply importing this module seeds the Python, NumPy and PyTorch RNGs.
SEED = 2019
seed_everything(SEED)

def get_global_cats(dataset="4square"):
    """Load the category lookup tables of a dataset from `data/Meta/`.

    Args:
        dataset: `"wiki"` or `"yelp"` select their own files; any other value
            falls back to the 4square files.

    Returns:
        Tuple `(categories_map, categories_inv)` parsed from
        `<prefix>_categories_map.json` and `<prefix>_categories_inv.json`,
        or None (after printing a message) when the map file does not exist.
    """
    prefix = dataset if dataset in ("wiki", "yelp") else "4square"
    map_path = f"data/Meta/{prefix}_categories_map.json"
    inv_path = f"data/Meta/{prefix}_categories_inv.json"

    # Only the map file is probed; the inverse file is opened unconditionally.
    if not os.path.isfile(map_path):
        print("categories not found..")
        return None

    with open(map_path, 'r') as map_file:
        categories_map = json.load(map_file)
    with open(inv_path, 'r') as inv_file:
        categories_inv = json.load(inv_file)
    return categories_map, categories_inv


def save_dataframe(city, verbose=True, source = True, useSaved = True):
    """Return the saved 4square interaction table of `city`, if there is one.

    Despite the name, nothing is written: the function only reads
    `data/ml_4square_{city}.csv`.

    Args:
        city: city key used in the file name.
        verbose: unused.
        source: unused.
        useSaved: when false, the saved file is ignored even if present.

    Returns:
        The DataFrame read from the CSV, or None when the file is missing or
        `useSaved` is false.
    """
    csv_path = f"data/ml_4square_{city}.csv"

    if not os.path.isfile(csv_path) or not useSaved:
        print("dataframe not found, it has not been saved yet...")
        return None

    print("Found already, reading and returning")
    return pd.read_csv(csv_path)

def save_dataframe_wiki(language, verbose=False, source = False, useSaved = True):
    """Return the saved wiki table of `language`, if there is one.

    Despite the name, nothing is written: the function only reads
    `data/df_wiki_<language>.csv`.

    Args:
        language: language key (string) used in the file name.
        verbose: unused.
        source: unused.
        useSaved: when false, the saved file is ignored even if present.

    Returns:
        The DataFrame read from the CSV, or None when the file is missing or
        `useSaved` is false.
    """
    csv_path = "data/df_wiki_" + language + ".csv"

    use_cached = os.path.isfile(csv_path) and useSaved
    if use_cached:
        return pd.read_csv(csv_path)

    print("dataframe not found, it has not been saved yet...")
    return None


def save_dataframe_yelp(city, useSaved = True):
    """Return the saved yelp interaction table of `city`, if there is one.

    Despite the name, nothing is written: the function only reads
    `data/ml_yelp_{city}.csv`. The path is printed before it is probed.

    Args:
        city: city key used in the file name.
        useSaved: when false, the saved file is ignored even if present.

    Returns:
        The DataFrame read from the CSV, or None when the file is missing or
        `useSaved` is false.
    """
    check_path = f"data/ml_yelp_{city}.csv"
    print(check_path)

    # `useSaved` used to be accepted but ignored here; it is now honoured the
    # same way save_dataframe and save_dataframe_wiki honour it. The default
    # (True) keeps the previous behaviour.
    if os.path.isfile(check_path) and useSaved:
        return pd.read_csv(check_path)

    print("dataframe not found, it has not been saved yet...")
    return None


def returnEdgeIndexes(df_city, item_cat_df=None, totCats = 512, verbose=True, dataset="4square"):
    """Build user<->item and item<->category edge indices, plus user/item edge weights.

    Args:
        df_city: interaction table with columns `u`, `i`, `label` and
            `category`. For `dataset == "yelp"` the column `categories_list`
            (string form of a Python list of category names) is read instead
            and copied into a `category` column of the passed frame.
        item_cat_df: optional table with an `i` column; when given, the item
            id offset (`busStart`) and `num_poi` are taken from it instead of
            from `df_city`.
        totCats: unused; kept for call compatibility.
        verbose: unused; the diagnostic prints are unconditional.
        dataset: dataset key forwarded to `get_global_cats`; `"yelp"` selects
            the multi-category code path.

    Returns:
        Tuple `(b2u_edge_idx, u2b_edge_idx, b2c_edge_idx, c2b_edge_idx,
        [num_cat, num_poi, num_usr], b2u_edge_wt, u2b_edge_wt, usrStart,
        busStart)`. The index tensors have shape `(2, n_edges)` with user ids
        shifted by `usrStart` and item ids shifted by `busStart`; the weight
        tensors are float32.
    """
    categories_map, _ = get_global_cats(dataset)

    minUsrSrc = min(df_city['u'])
    maxUsrSrc = max(df_city['u'])

    # The item id range comes from item_cat_df when supplied, otherwise from
    # the interactions themselves.
    item_ids = df_city['i'] if item_cat_df is None else item_cat_df['i']
    minBusSrc = min(item_ids)
    maxBusSrc = max(item_ids)

    usrStart = minUsrSrc
    busStart = minBusSrc
    num_cat = len(categories_map)
    num_poi = maxBusSrc - minBusSrc + 1
    num_usr = maxUsrSrc - minUsrSrc + 1 # 0 node is dummy (for tgn) and treated as user, but this has nothing to do here

    print(minUsrSrc,maxUsrSrc,minBusSrc,maxBusSrc,num_cat,num_poi,num_usr)

    if dataset == 'yelp':
        # NOTE: this adds a `category` column to the caller's frame
        # (pre-existing behaviour, kept for compatibility).
        df_city['category']  = df_city['categories_list']

    # One row per (user, item) pair; `label` becomes the number of interactions.
    df_city = df_city.groupby(['u','i'],as_index=False).agg({'category': 'first', 'label': 'count'})
    df_city['edge_count'] = df_city['label']
    print(df_city.shape)

    # Edge weight = pair count / total count of the user (u2b) or of the item
    # (b2u). Vectorised equivalent of building per-group dicts and looking
    # each row up again.
    edge_count = df_city['edge_count']
    df_city['u2b_edge_wt'] = edge_count / df_city.groupby('u')['edge_count'].transform('sum')
    df_city['b2u_edge_wt'] = edge_count / df_city.groupby('i')['edge_count'].transform('sum')

    usr_offsets = df_city['u'].values - minUsrSrc
    poi_offsets = df_city['i'].values - busStart
    u2b_edge_idx = np.column_stack([usr_offsets, poi_offsets])
    b2u_edge_idx = np.column_stack([poi_offsets, usr_offsets])

    u2b_edge_wt = np.array(df_city['u2b_edge_wt'])
    b2u_edge_wt = np.array(df_city['b2u_edge_wt'])

    print(u2b_edge_idx.shape,b2u_edge_idx.shape,u2b_edge_wt.shape,b2u_edge_wt.shape)

    print(df_city['i'].dtype)

    if dataset == "yelp":
        # Categories are stored as the string form of a list; parse them and
        # map every name to its index.
        df_city['category'] = df_city['category'].apply(ast.literal_eval)
        df_city['cat_idx'] = df_city['category'].apply(
            lambda names: [categories_map[name] for name in names]
        )
    else:
        # Explicit copy so the column assignment below does not act on a
        # filtered slice (avoids pandas' SettingWithCopyWarning).
        known_category = df_city['category'].isin(categories_map)
        df_city = df_city.loc[known_category].copy()
        df_city['cat_idx'] = df_city['category'].map(categories_map)
        df_city = df_city[['i','cat_idx']].drop_duplicates(keep='first')

    print("After keeping only i and cats", df_city.shape)

    if dataset == "yelp":
        # One row per (item, category) pair.
        df_unique = df_city.explode('cat_idx').drop_duplicates(subset=['i', 'cat_idx']).copy()
        df_unique['temp'] = df_unique['i'] - busStart
        b2c_edge_idx = df_unique[['temp', 'cat_idx']].to_numpy().astype(int)
        c2b_edge_idx = df_unique[['cat_idx', 'temp']].to_numpy().astype(int)
    else:
        cat_poi_offsets = df_city['i'].values - busStart
        cat_ids = df_city['cat_idx'].values
        b2c_edge_idx = np.column_stack([cat_poi_offsets, cat_ids])
        c2b_edge_idx = np.column_stack([cat_ids, cat_poi_offsets])
        print(b2c_edge_idx.shape,c2b_edge_idx.shape)

    # (n_edges, 2) arrays -> (2, n_edges) tensors.
    b2u_edge_idx = torch.from_numpy(np.transpose(b2u_edge_idx))
    u2b_edge_idx = torch.from_numpy(np.transpose(u2b_edge_idx))
    b2c_edge_idx = torch.from_numpy(np.transpose(b2c_edge_idx))
    c2b_edge_idx = torch.from_numpy(np.transpose(c2b_edge_idx))
    b2u_edge_wt = torch.from_numpy(b2u_edge_wt).to(torch.float32)
    u2b_edge_wt = torch.from_numpy(u2b_edge_wt).to(torch.float32)
    print(b2u_edge_idx.shape,u2b_edge_idx.shape,b2c_edge_idx.shape,c2b_edge_idx.shape,b2u_edge_wt.shape,u2b_edge_wt.shape)

    return b2u_edge_idx, u2b_edge_idx, b2c_edge_idx, c2b_edge_idx , [num_cat, num_poi, num_usr],b2u_edge_wt,u2b_edge_wt,usrStart,busStart