In [48]:
import sys
sys.path.append("../")
In [1]:
import numpy as np
import pandas as pd
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
In [2]:
import os

Clicks Dataset

In [3]:
%%time
df_clicks = pd.read_csv("/tf/notebooks/data/yoochoose/yoochoose-clicks.dat", sep=",", header=None,
                       dtype={0:np.int32, 1:str, 2:np.int64, 3:str})
CPU times: user 19.4 s, sys: 2.98 s, total: 22.3 s
Wall time: 23.1 s
In [4]:
df_clicks.shape
Out[4]:
(33003944, 4)
In [5]:
df_clicks.head()
Out[5]:
0 1 2 3
0 1 2014-04-07T10:51:09.277Z 214536502 0
1 1 2014-04-07T10:54:09.868Z 214536500 0
2 1 2014-04-07T10:54:46.998Z 214536506 0
3 1 2014-04-07T10:57:00.306Z 214577561 0
4 2 2014-04-07T13:56:37.614Z 214662742 0
In [6]:
# df_clicks.iloc[:,3].unique()

Create category code

In [7]:
def assign_cat(x):
    if x == "S":
        return "PROMO"
    elif np.int(x) == 0:
        return "NONE"
    elif np.int(x) < 13:
        return "CATEGORY"
    else:
        return "BRAND"
In [8]:
df_clicks["item_type"] = df_clicks.iloc[:, 3].map(assign_cat)
In [9]:
# df_clicks.head()
In [10]:
df_clicks.columns = ["SessionId", "TimeStr", "ItemId", "Item_Type_Code", "Item_Type"]
In [11]:
df_clicks.SessionId.nunique()
Out[11]:
9249729
In [12]:
df_buys = pd.read_csv("/tf/notebooks/data/yoochoose/yoochoose-buys.dat", header=None, sep=",",
                     dtype={0:np.int32, 1:str, 2:np.int64, 3:np.int64, 4:np.int64})
In [13]:
df_buys.head(1)
Out[13]:
0 1 2 3 4
0 420374 2014-04-06T18:44:58.314Z 214537888 12462 1
In [14]:
df_buys.columns = ["SessionId", "TimeStr", "ItemId", "Price", "Quantity"]
In [15]:
df_buys["Action"] = "BUY"
In [16]:
df_buys.head(1)
Out[16]:
SessionId TimeStr ItemId Price Quantity Action
0 420374 2014-04-06T18:44:58.314Z 214537888 12462 1 BUY
In [17]:
df = pd.merge(left=df_clicks, right=df_buys, how="left", on=["SessionId", "ItemId"])
In [18]:
del df_clicks
df_clicks = pd.DataFrame()
In [19]:
df.drop_duplicates(inplace=True)

Sub-select the data

In [20]:
df.SessionId.nunique(), df.ItemId.nunique()
Out[20]:
(9249729, 52739)
In [21]:
SESSION_THRESHOLD = 25
ITEM_THRESHOLD = 1500
In [22]:
session_lengths = df.groupby(["SessionId"]).size()
In [23]:
session_gt_threshold = (session_lengths[session_lengths>SESSION_THRESHOLD]).reset_index()
In [24]:
session_gt_threshold.head(1)
Out[24]:
SessionId 0
0 33 29
In [25]:
df_w_session_threshold = df[df.SessionId.isin(session_gt_threshold.SessionId)]
In [26]:
df_w_session_threshold.shape
Out[26]:
(1532814, 9)
In [27]:
item_lengths = df_w_session_threshold.groupby(["ItemId"]).size()
item_gt_threshold = (item_lengths[item_lengths>ITEM_THRESHOLD]).reset_index()
In [28]:
item_gt_threshold.head(1)
Out[28]:
ItemId 0
0 214587317 1592
In [29]:
df_final = df_w_session_threshold[df_w_session_threshold.ItemId.isin(item_gt_threshold.ItemId)]
In [30]:
df_final.shape
Out[30]:
(80143, 9)
In [31]:
df_final.head(2)
Out[31]:
SessionId TimeStr_x ItemId Item_Type_Code Item_Type TimeStr_y Price Quantity Action
295 87 2014-04-07T06:29:05.905Z 214820392 0 NONE 2014-04-07T06:57:22.749Z 627.0 1.0 BUY
296 87 2014-04-07T06:29:15.640Z 214820392 0 NONE 2014-04-07T06:57:22.749Z 627.0 1.0 BUY
In [32]:
# To check list of notebooks/kernels currently running
# http://<machine>.ml.pipal.in/tree/notebooks#running
In [33]:
df_final.columns
Out[33]:
Index(['SessionId', 'TimeStr_x', 'ItemId', 'Item_Type_Code', 'Item_Type',
       'TimeStr_y', 'Price', 'Quantity', 'Action'],
      dtype='object')
In [35]:
df_final.drop(["Item_Type_Code", "Item_Type", "TimeStr_y", "Price", "Quantity"], axis=1, inplace=True)
/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py:4102: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
In [36]:
df_final.to_csv("/tf/notebooks/data/yoochoose/df_processed.csv", index=False)
In [38]:
df_final.head(1)
Out[38]:
SessionId TimeStr_x ItemId Action
295 87 2014-04-07T06:29:05.905Z 214820392 BUY
In [40]:
df_final.Action.fillna("CLICK", inplace=True)
/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py:6287: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
In [41]:
df_final.head()
Out[41]:
SessionId TimeStr_x ItemId Action
295 87 2014-04-07T06:29:05.905Z 214820392 BUY
296 87 2014-04-07T06:29:15.640Z 214820392 BUY
303 87 2014-04-07T06:47:46.542Z 214819760 CLICK
1492 496 2014-04-07T16:56:22.584Z 214716110 CLICK
1496 496 2014-04-07T16:58:03.005Z 214819762 CLICK
In [42]:
# Fixing time column:Converting it into a continuous value
df_final["TimeStr_x"] = df_final.TimeStr_x.apply(lambda x: dt.datetime.strptime(x, 
                                                "%Y-%m-%dT%H:%M:%S.%fZ").timestamp())
#This doesn't really matter - since the time is not in UTC
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
In [43]:
df_final.head()
Out[43]:
SessionId TimeStr_x ItemId Action
295 87 1.396852e+09 214820392 BUY
296 87 1.396852e+09 214820392 BUY
303 87 1.396853e+09 214819760 CLICK
1492 496 1.396890e+09 214716110 CLICK
1496 496 1.396890e+09 214819762 CLICK

Encode Rating

In [44]:
df_final["Rating"] = df_final.Action.apply(lambda x: 5 if (x=="BUY") else 1)
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.

Sort each Session by time

In [45]:
df_final.sort_values(by=["SessionId", "TimeStr_x"], inplace=True)
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
In [46]:
df_final.head()
Out[46]:
SessionId TimeStr_x ItemId Action Rating
295 87 1.396852e+09 214820392 BUY 5
296 87 1.396852e+09 214820392 BUY 5
303 87 1.396853e+09 214819760 CLICK 1
1492 496 1.396890e+09 214716110 CLICK 1
1496 496 1.396890e+09 214819762 CLICK 1

Prepare data for the models

In [49]:
from reco.preprocess import encode_user_item
In [53]:
DATA, user_encoder, item_encoder = encode_user_item(df_final, "SessionId", "ItemId", 
                                                   "Rating", "TimeStr_x")
Number of users:  15463
Number of items:  34
In [54]:
DATA.USER = DATA.USER + 1
DATA.ITEM = DATA.ITEM + 1
In [55]:
DATA.dtypes
Out[55]:
SessionId      int32
TIMESTAMP    float64
ItemId         int64
Action        object
RATING         int64
USER           int64
ITEM           int64
dtype: object
In [56]:
DATA.RATING = DATA.RATING.astype(np.int32)
DATA.USER = DATA.USER.astype(np.int32)
DATA.ITEM = DATA.ITEM.astype(np.int32)
In [57]:
from spotlight.interactions import Interactions
In [62]:
df_for_interactions =(DATA.USER.values, DATA.ITEM.values, DATA.RATING, DATA.TIMESTAMP)
In [63]:
df_interaction = Interactions(*df_for_interactions)

Train/Test split

In [64]:
from spotlight.cross_validation import user_based_train_test_split
In [65]:
train, test = user_based_train_test_split(df_interaction, test_percentage=0.2)

Implicit Model (Ranking-based)

In [66]:
from spotlight.factorization.implicit import ImplicitFactorizationModel
In [67]:
model_implicit = ImplicitFactorizationModel(n_iter=3, loss="bpr")
In [68]:
%%time
model_implicit.fit(train)
CPU times: user 13.2 s, sys: 273 ms, total: 13.5 s
Wall time: 4.38 s

Prediction

In [69]:
user_to_reco = test.user_ids[0]
In [70]:
user_to_reco
Out[70]:
7
In [71]:
pred_for_user = model_implicit.predict(user_to_reco)
In [72]:
pred_for_user
Out[72]:
array([-2.0355577 ,  0.22583865,  1.4562953 ,  1.009843  , -0.32925776,
        1.5589361 , -0.70531   , -1.227006  , -0.79263   , -0.6278971 ,
        0.5904501 ,  1.9308102 ,  0.6674701 ,  0.10251044,  0.15111214,
       -1.2623167 ,  0.11576879, -0.48451066,  0.32283413, -0.80832785,
        0.2570512 , -0.05693956,  0.7529382 , -1.0873412 ,  0.18263072,
       -0.19784877, -0.2705483 ,  0.15113422, -0.3896907 , -0.72124124,
        0.98997295,  0.40136272, -0.67114127, -0.31846702, -0.470914  ],
      dtype=float32)
In [74]:
rec_item_ids = (-pred_for_user).argsort()
In [75]:
rec_item_ids
Out[75]:
array([11,  5,  2,  3, 30, 22, 12, 10, 31, 18, 20,  1, 24, 27, 14, 16, 13,
       21, 25, 26, 33,  4, 28, 34, 17,  9, 32,  6, 29,  8, 19, 23,  7, 15,
        0])
In [76]:
ground_truth = test.item_ids[0]
In [77]:
ground_truth
Out[77]:
15

Evaluation

In [78]:
from spotlight.evaluation import precision_recall_score
In [80]:
(pk, rk) = precision_recall_score(model_implicit, test, k=5)
In [82]:
pk, np.mean(pk)
Out[82]:
(array([0. , 0. , 0. , ..., 0. , 0.2, 0. ]), 0.10234049374799613)
In [83]:
rk, np.mean(rk)
Out[83]:
(array([0., 0., 0., ..., 0., 1., 0.]), 0.1964059585813353)

Sequence Models

In [84]:
from spotlight.sequence.representations import CNNNet
from spotlight.sequence.implicit import ImplicitSequenceModel
In [85]:
max_sequence_length = 150
min_sequence_length = 30
step_size = 200
In [86]:
train_sequence = train.to_sequence(max_sequence_length= max_sequence_length,
                                  min_sequence_length = min_sequence_length,
                                  step_size = step_size)

test_sequence = test.to_sequence(max_sequence_length= max_sequence_length,
                                  min_sequence_length = min_sequence_length,
                                  step_size = step_size)
In [88]:
train_sequence.sequences.shape
Out[88]:
(247, 150)
In [89]:
nnet = CNNNet(train_sequence.num_items,
             embedding_dim=128,
             kernel_width=3,
             dilation=[1,1,1,1],
             num_layers=2,
             nonlinearity="relu",
             residual_connections=False)
In [91]:
model_sequence = ImplicitSequenceModel(
    loss = "bpr",
    representation= nnet,
    batch_size= 32,
    learning_rate= 0.1, 
    l2 = 0.0,
    n_iter= 2)
In [92]:
%%time
model_sequence.fit(train_sequence)
CPU times: user 8.1 s, sys: 19.3 ms, total: 8.12 s
Wall time: 3.82 s

Prediction

In [110]:
test_sequence.sequences[10][0:149], test_sequence.sequences[10][149]
Out[110]:
(array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 16, 16,
        16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 25, 25, 25,
        25, 24, 24, 24, 18, 18, 24, 24, 24, 24, 25, 25, 16], dtype=int32), 16)
In [111]:
sequence_prediction = model_sequence.predict(test_sequence.sequences[0][0:149])
In [112]:
rec_item_ids = (-sequence_prediction).argsort()
In [113]:
rec_item_ids
Out[113]:
array([34,  9, 15, 31, 24, 19, 33, 21, 27, 18, 25, 29, 12, 17, 20,  0,  7,
       23, 26, 13, 16, 14,  1, 32, 30,  6, 22, 10,  2, 11, 28,  3,  8,  4,
        5])
In [ ]: