-
Notifications
You must be signed in to change notification settings - Fork 24
Expand file tree
/
Copy pathmake_datasets.py
More file actions
144 lines (94 loc) · 3.75 KB
/
make_datasets.py
File metadata and controls
144 lines (94 loc) · 3.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import pandas as pd
def make_datasets(data, len_Seq, len_Tag, len_Pred):
    """Build sliding-window train/test sequence datasets from an interaction log.

    Steps:
      1. Keep only items that have more than 5 interactions.
      2. Remap item ids and user ids to contiguous integers starting at 1
         (in order of first appearance); id 0 is reserved via the key ``-1``
         in each mapping.
      3. Sort interactions by ``user`` then ``timestamps`` and collect each
         user's item sequence.
      4. For every user, hold out the last ``len_Pred`` items as the test
         target, with the (up to) ``len_Seq`` items before them as the test
         input. The remaining prefix is cut into training windows of
         ``len_Seq`` input items followed by ``len_Tag`` target items.

    Args:
        data: DataFrame with at least the columns ``'user'``, ``'item'`` and
            ``'timestamps'``. The caller's frame is not modified.
        len_Seq: Number of items in each input sequence.
        len_Tag: Number of target items for each training sample.
        len_Pred: Number of trailing items per user held out for testing.

    Returns:
        A tuple ``(d_train, d_test, d_info)`` where ``d_train`` and ``d_test``
        are DataFrames with columns ``user``, ``seq`` and ``target``, and
        ``d_info`` is ``(all_user_count, all_item_count, items_usr_clicked,
        user_map, item_map)``. Both counts include the reserved id 0.
        ``items_usr_clicked`` maps each remapped user id to that user's
        training-side item list.
    """
    # Drop rarely-seen items: keep only items with more than 5 interactions.
    item_counts = (
        data.groupby('item')['user'].count()
        .reset_index()
        .rename(columns={'user': 'item_count'})
    )
    data = pd.merge(data, item_counts, how='left', on='item')
    data = data[data['item_count'] > 5].drop(['item_count'], axis=1)

    # Remap item ids to 1..N; key -1 reserves id 0.
    item_unique = data['item'].unique().tolist()
    item_map = dict(zip(item_unique, range(1, len(item_unique) + 1)))
    item_map[-1] = 0
    all_item_count = len(item_map)
    data['item'] = data['item'].map(item_map)

    # Remap user ids to 1..M; key -1 reserves id 0.
    user_unique = data['user'].unique().tolist()
    user_map = dict(zip(user_unique, range(1, len(user_unique) + 1)))
    user_map[-1] = 0
    # Fixed: the user count must come from user_map, not item_map.
    all_user_count = len(user_map)
    data['user'] = data['user'].map(user_map)

    # Build per-user item sequences in chronological order.
    data = data.sort_values(by=['user', 'timestamps']).reset_index(drop=True)
    user_sessions = (
        data.groupby('user')['item']
        .apply(lambda x: x.tolist())
        .reset_index()
        .rename(columns={'item': 'item_list'})
    )

    train_users = []
    train_seqs = []
    train_targets = []
    test_users = []
    test_seqs = []
    test_targets = []
    items_usr_clicked = {}

    for user, items in zip(user_sessions['user'], user_sessions['item_list']):
        # Explicit split index instead of negative slicing, so that
        # len_Pred == 0 holds out nothing (items[-0:] would be the whole list).
        split = max(0, len(items) - len_Pred)

        test_users.append(user)
        test_seqs.append(items[max(0, split - len_Seq):split])
        test_targets.append(items[split:])

        train_build_items = items[:split]
        items_usr_clicked[user] = train_build_items

        # Slide a window over the training prefix: len_Seq inputs followed
        # by len_Tag targets. i >= len_Seq, so i - len_Seq is never negative.
        for i in range(len_Seq, len(train_build_items) - len_Tag + 1):
            train_users.append(user)
            train_seqs.append(train_build_items[i - len_Seq:i])
            train_targets.append(train_build_items[i:i + len_Tag])

    d_train = pd.DataFrame(
        {'user': train_users, 'seq': train_seqs, 'target': train_targets}
    )
    d_test = pd.DataFrame(
        {'user': test_users, 'seq': test_seqs, 'target': test_targets}
    )
    d_info = (all_user_count, all_item_count, items_usr_clicked, user_map, item_map)
    return d_train, d_test, d_info
if __name__ == '__main__':
    # The previous call, make_datasets(5, 3, 2), omitted the required `data`
    # argument and raised TypeError before doing any work.
    # NOTE(review): the input path and the four-column layout
    # (user, item, rating, timestamps; whitespace-separated, no header) are
    # assumptions for a ratings file -- TODO confirm against the real data.
    ratings = pd.read_csv(
        'input/u.data',
        sep=r'\s+',
        header=None,
        names=['user', 'item', 'rating', 'timestamps'],
    )
    make_datasets(ratings, len_Seq=5, len_Tag=3, len_Pred=2)
'''
src data
196 242 3 881250949
186 302 3 891717742
22 377 1 878887116
244 51 2 880606923
166 346 1 886397596
298 474 4 884182806
115 265 2 881171488
253 465 5 891628467
305 451 3 886324817
6 86 3 883603013
62 257 2 879372434
286 1014 5 879781125
200 222 5 876042340
210 40 3 891035994
224 29 3 888104457
train_data
seq target user
0 [1, 290, 492, 381, 752] [467, 523, 11] 1
1 [290, 492, 381, 752, 467] [523, 11, 673] 1
2 [492, 381, 752, 467, 523] [11, 673, 1046] 1
3 [381, 752, 467, 523, 11] [673, 1046, 650] 1
4 [752, 467, 523, 11, 673] [1046, 650, 378] 1
5 [467, 523, 11, 673, 1046] [650, 378, 180] 1
6 [523, 11, 673, 1046, 650] [378, 180, 390] 1
7 [11, 673, 1046, 650, 378] [180, 390, 666] 1
8 [673, 1046, 650, 378, 180] [390, 666, 513] 1
9 [1046, 650, 378, 180, 390] [666, 513, 432] 1
test_data
seq target user
0 [633, 657, 1007, 948, 364] [522, 0, 0] 1
1 [26, 247, 49, 531, 146] [32, 0, 0] 2
2 [459, 477, 369, 770, 15] [306, 0, 0] 3
3 [1093, 946, 1101, 690, 1211] [526, 0, 0] 4
4 [732, 266, 669, 188, 253] [986, 0, 0] 5
5 [410, 446, 104, 782, 96] [26, 0, 0] 6
6 [146, 817, 536, 694, 186] [525, 0, 0] 7
7 [395, 669, 281, 289, 98] [731, 0, 0] 8
8 [588, 671, 369, 292, 304] [250, 0, 0] 9
9 [472, 222, 82, 716, 8] [131, 0, 0] 10
'''