-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy path07-process-test-file.py
96 lines (78 loc) · 3.5 KB
/
07-process-test-file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import sys
import pickle
import pandas as pd
# Command-line arguments: argv[1] is the suffix that completes the test-set
# log file names, argv[2] is the session length this run is restricted to.
one_file, track_len = sys.argv[1], int(sys.argv[2])

# Pre-built lookup tables, unpickled in this order. `track_ids_slnos` is joined
# onto the logs via `track_id_clean`; `track_features` is only passed through
# into the output pickle.
track_features, track_ids_slnos = (
    pd.read_pickle(table_path)
    for table_path in (
        './data/track_features.pkl.gz',
        './data/track_ids_slnos.pkl.gz',
    )
)
def process_for_feats(for_feats):
    """Turn a raw session-log frame into a cleaned feature frame.

    The following transformations are applied:

    * the flag columns ``skip_1``, ``skip_2``, ``skip_3``, ``not_skipped``,
      ``hist_user_behavior_is_shuffle`` and ``premium`` are cast to ``int64``;
    * ``date`` is parsed with ``pd.to_datetime`` and replaced by the derived
      columns ``wkdy`` (pandas ``dayofweek``, Monday == 0), ``day``, ``month``
      and ``year``;
    * ``track_id_clean`` is dropped;
    * a fixed set of values in ``hist_user_behavior_reason_start`` and
      ``hist_user_behavior_reason_end`` is relabelled to ``'merged'``;
    * rows are sorted by ``session_id`` then ``session_position`` and the
      index is reset to 0..n-1.

    Args:
        for_feats: pandas DataFrame holding at least the columns named above
            plus ``session_id`` and ``session_position``.

    Returns:
        A new DataFrame with the transformations applied. The frame passed in
        is left unmodified.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    flag_columns = (
        'skip_1',
        'skip_2',
        'skip_3',
        'not_skipped',
        'hist_user_behavior_is_shuffle',
        'premium',
    )
    # Values folded into the single label 'merged', per column.
    reasons_to_merge = {
        'hist_user_behavior_reason_start': [
            'endplay', 'popup', 'uriopen', 'clickside'
        ],
        'hist_user_behavior_reason_end': [
            'clickrow', 'appload', 'popup', 'uriopen', 'clickside', 'logout'
        ],
    }

    # astype() hands back a new frame, so every edit below lands on our own
    # data and the caller's frame is never touched.
    feats = for_feats.astype({column: 'int64' for column in flag_columns})

    # Parse the date once and derive the calendar features from it; the
    # column order (wkdy, day, month, year) is part of the output layout.
    dates = pd.to_datetime(feats['date'])
    feats['wkdy'] = dates.dt.dayofweek
    feats['day'] = dates.dt.day
    feats['month'] = dates.dt.month
    feats['year'] = dates.dt.year
    feats = feats.drop(columns=['date', 'track_id_clean'])

    for column, values in reasons_to_merge.items():
        feats.loc[feats[column].isin(values), column] = 'merged'

    ordered = feats.sort_values(['session_id', 'session_position'])
    return ordered.reset_index(drop=True)
def _load_sessions(csv_path):
    """Read one test-set log and prepare it for per-session processing.

    Keeps only rows whose ``session_length`` equals ``track_len``, inner-joins
    ``track_ids_slnos`` on ``track_id_clean`` (rows without a match are
    dropped), and returns the rows ordered by session and position with a
    fresh 0-based index.
    """
    log = pd.read_csv(csv_path)
    log = log.loc[log['session_length'] == track_len]
    log = log.merge(track_ids_slnos, on=['track_id_clean'], how='inner')
    ordered = log.sort_values(['session_id', 'session_position'])
    return ordered.reset_index(drop=True)


def _lists_by_session(frame, column):
    """Return one plain Python list of `column` values per ``session_id``.

    Groups come out in groupby's default (sorted) ``session_id`` order.
    """
    per_session = frame.groupby('session_id')[column].apply(
        lambda values: values.tolist()
    )
    return per_session.tolist()


# First log: the "prehistory" rows, turned into cleaned features.
train_feats = process_for_feats(
    _load_sessions('./data/test_set/log_prehistory_{}'.format(one_file))
)

# Second log: the "input" rows, kept as loaded (no feature processing).
train_df = _load_sessions('./data/test_set/log_input_{}'.format(one_file))

# Feature columns fed to get_dummies; the order fixes the column layout of
# the encoded frame.
cols_to_select = [
    'context_switch',
    'context_type',
    'day',
    'hist_user_behavior_is_shuffle',
    'hist_user_behavior_n_seekback',
    'hist_user_behavior_n_seekfwd',
    'hist_user_behavior_reason_end',
    'hist_user_behavior_reason_start',
    'hour_of_day',
    'long_pause_before_play',
    'month',
    'no_pause_before_play',
    'not_skipped',
    'premium',
    'session_position',
    'short_pause_before_play',
    'skip_1',
    'skip_2',
    'skip_3',
    'wkdy',
]
# Encoded before the index shift below, so this frame keeps the 0-based index.
train_feats_dummies = pd.get_dummies(train_feats[cols_to_select])

# Re-key the feature rows with a 1-based index named 'index'.
# NOTE(review): presumably 0 is reserved downstream (e.g. for padding) -
# confirm against the consumer of the pickle.
train_feats = train_feats.reset_index(drop=False)
train_feats['index'] = train_feats['index'] + 1
train_feats = train_feats.set_index('index', drop=True, verify_integrity=True)

# Per-session sequences: row ids, track serial numbers and skip_2 flags from
# the prehistory rows, and track serial numbers from the input rows.
train_seq = _lists_by_session(train_feats.reset_index(), 'index')
train_track_seq = _lists_by_session(train_feats, 'track_slno')
train_pre_pred = _lists_by_session(train_feats, 'skip_2')
train_to_pred_tracks = _lists_by_session(train_df, 'track_slno')

# Everything goes into one pickle; the tuple order is the contract with
# whatever reads this file.
output_path = './tmp/{}-{}.pkl'.format(one_file, track_len)
payload = (
    train_feats, train_feats_dummies,
    track_features, train_seq,
    train_track_seq, train_to_pred_tracks,
    train_df, train_pre_pred,
)
with open(output_path, 'wb') as out_file:
    pickle.dump(payload, out_file)