import sys
sys.path.append("../")

import numpy as np
import pandas as pd
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import os

Clicks Dataset¶

%%time
df_clicks = pd.read_csv("/tf/notebooks/data/yoochoose/yoochoose-clicks.dat", sep=",", header=None,
                       dtype={0:np.int32, 1:str, 2:np.int64, 3:str})

CPU times: user 19.4 s, sys: 2.98 s, total: 22.3 s
Wall time: 23.1 s

df_clicks.shape

(33003944, 4)

df_clicks.head()

# df_clicks.iloc[:,3].unique()

Create category code¶

def assign_cat(x):
    if x == "S":
        return "PROMO"
    elif np.int(x) == 0:
        return "NONE"
    elif np.int(x) < 13:
        return "CATEGORY"
    else:
        return "BRAND"

df_clicks["item_type"] = df_clicks.iloc[:, 3].map(assign_cat)

# df_clicks.head()

df_clicks.columns = ["SessionId", "TimeStr", "ItemId", "Item_Type_Code", "Item_Type"]

df_clicks.SessionId.nunique()

9249729

df_buys = pd.read_csv("/tf/notebooks/data/yoochoose/yoochoose-buys.dat", header=None, sep=",",
                     dtype={0:np.int32, 1:str, 2:np.int64, 3:np.int64, 4:np.int64})

df_buys.head(1)

df_buys.columns = ["SessionId", "TimeStr", "ItemId", "Price", "Quantity"]

df_buys["Action"] = "BUY"

df_buys.head(1)

df = pd.merge(left=df_clicks, right=df_buys, how="left", on=["SessionId", "ItemId"])

del df_clicks
df_clicks = pd.DataFrame()

df.drop_duplicates(inplace=True)

Sub-select the data¶

df.SessionId.nunique(), df.ItemId.nunique()

(9249729, 52739)

SESSION_THRESHOLD = 25
ITEM_THRESHOLD = 1500

session_lengths = df.groupby(["SessionId"]).size()

session_gt_threshold = (session_lengths[session_lengths>SESSION_THRESHOLD]).reset_index()

session_gt_threshold.head(1)

df_w_session_threshold = df[df.SessionId.isin(session_gt_threshold.SessionId)]

df_w_session_threshold.shape

(1532814, 9)

item_lengths = df_w_session_threshold.groupby(["ItemId"]).size()
item_gt_threshold = (item_lengths[item_lengths>ITEM_THRESHOLD]).reset_index()

item_gt_threshold.head(1)

df_final = df_w_session_threshold[df_w_session_threshold.ItemId.isin(item_gt_threshold.ItemId)]

df_final.shape

(80143, 9)

df_final.head(2)

# To check list of notebooks/kernels currently running
# http://<machine>.ml.pipal.in/tree/notebooks#running

df_final.columns

Index(['SessionId', 'TimeStr_x', 'ItemId', 'Item_Type_Code', 'Item_Type',
       'TimeStr_y', 'Price', 'Quantity', 'Action'],
      dtype='object')

df_final.drop(["Item_Type_Code", "Item_Type", "TimeStr_y", "Price", "Quantity"], axis=1, inplace=True)

/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py:4102: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

df_final.to_csv("/tf/notebooks/data/yoochoose/df_processed.csv", index=False)

df_final.head(1)

df_final.Action.fillna("CLICK", inplace=True)

/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py:6287: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)

df_final.head()

# Fixing time column:Converting it into a continuous value
df_final["TimeStr_x"] = df_final.TimeStr_x.apply(lambda x: dt.datetime.strptime(x, 
                                                "%Y-%m-%dT%H:%M:%S.%fZ").timestamp())
#This doesn't really matter - since the time is not in UTC

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

df_final.head()

Encode Rating¶

df_final["Rating"] = df_final.Action.apply(lambda x: 5 if (x=="BUY") else 1)

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.

Sort each Session by time¶

df_final.sort_values(by=["SessionId", "TimeStr_x"], inplace=True)

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.

df_final.head()

Prepare data for the models¶

from reco.preprocess import encode_user_item

DATA, user_encoder, item_encoder = encode_user_item(df_final, "SessionId", "ItemId", 
                                                   "Rating", "TimeStr_x")

Number of users:  15463
Number of items:  34

DATA.USER = DATA.USER + 1
DATA.ITEM = DATA.ITEM + 1

DATA.dtypes

SessionId      int32
TIMESTAMP    float64
ItemId         int64
Action        object
RATING         int64
USER           int64
ITEM           int64
dtype: object

DATA.RATING = DATA.RATING.astype(np.int32)
DATA.USER = DATA.USER.astype(np.int32)
DATA.ITEM = DATA.ITEM.astype(np.int32)

from spotlight.interactions import Interactions

df_for_interactions =(DATA.USER.values, DATA.ITEM.values, DATA.RATING, DATA.TIMESTAMP)

df_interaction = Interactions(*df_for_interactions)

Train/Test split¶

from spotlight.cross_validation import user_based_train_test_split

train, test = user_based_train_test_split(df_interaction, test_percentage=0.2)

Implicit Model (Ranking-based)¶

from spotlight.factorization.implicit import ImplicitFactorizationModel

model_implicit = ImplicitFactorizationModel(n_iter=3, loss="bpr")

%%time
model_implicit.fit(train)

CPU times: user 13.2 s, sys: 273 ms, total: 13.5 s
Wall time: 4.38 s

Prediction¶

user_to_reco = test.user_ids[0]

user_to_reco

7

pred_for_user = model_implicit.predict(user_to_reco)

pred_for_user

array([-2.0355577 ,  0.22583865,  1.4562953 ,  1.009843  , -0.32925776,
        1.5589361 , -0.70531   , -1.227006  , -0.79263   , -0.6278971 ,
        0.5904501 ,  1.9308102 ,  0.6674701 ,  0.10251044,  0.15111214,
       -1.2623167 ,  0.11576879, -0.48451066,  0.32283413, -0.80832785,
        0.2570512 , -0.05693956,  0.7529382 , -1.0873412 ,  0.18263072,
       -0.19784877, -0.2705483 ,  0.15113422, -0.3896907 , -0.72124124,
        0.98997295,  0.40136272, -0.67114127, -0.31846702, -0.470914  ],
      dtype=float32)

rec_item_ids = (-pred_for_user).argsort()

rec_item_ids

array([11,  5,  2,  3, 30, 22, 12, 10, 31, 18, 20,  1, 24, 27, 14, 16, 13,
       21, 25, 26, 33,  4, 28, 34, 17,  9, 32,  6, 29,  8, 19, 23,  7, 15,
        0])

ground_truth = test.item_ids[0]

ground_truth

15

Evaluation¶

from spotlight.evaluation import precision_recall_score

(pk, rk) = precision_recall_score(model_implicit, test, k=5)

pk, np.mean(pk)

(array([0. , 0. , 0. , ..., 0. , 0.2, 0. ]), 0.10234049374799613)

rk, np.mean(rk)

(array([0., 0., 0., ..., 0., 1., 0.]), 0.1964059585813353)

Sequence Models¶

from spotlight.sequence.representations import CNNNet
from spotlight.sequence.implicit import ImplicitSequenceModel

max_sequence_length = 150
min_sequence_length = 30
step_size = 200

train_sequence = train.to_sequence(max_sequence_length= max_sequence_length,
                                  min_sequence_length = min_sequence_length,
                                  step_size = step_size)

test_sequence = test.to_sequence(max_sequence_length= max_sequence_length,
                                  min_sequence_length = min_sequence_length,
                                  step_size = step_size)

train_sequence.sequences.shape

(247, 150)

nnet = CNNNet(train_sequence.num_items,
             embedding_dim=128,
             kernel_width=3,
             dilation=[1,1,1,1],
             num_layers=2,
             nonlinearity="relu",
             residual_connections=False)

model_sequence = ImplicitSequenceModel(
    loss = "bpr",
    representation= nnet,
    batch_size= 32,
    learning_rate= 0.1, 
    l2 = 0.0,
    n_iter= 2)

%%time
model_sequence.fit(train_sequence)

CPU times: user 8.1 s, sys: 19.3 ms, total: 8.12 s
Wall time: 3.82 s

Prediction¶

test_sequence.sequences[10][0:149], test_sequence.sequences[10][149]

(array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 16, 16,
        16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 25, 25, 25,
        25, 24, 24, 24, 18, 18, 24, 24, 24, 24, 25, 25, 16], dtype=int32), 16)

sequence_prediction = model_sequence.predict(test_sequence.sequences[0][0:149])

rec_item_ids = (-sequence_prediction).argsort()

rec_item_ids

array([34,  9, 15, 31, 24, 19, 33, 21, 27, 18, 25, 29, 12, 17, 20,  0,  7,
       23, 26, 13, 16, 14,  1, 32, 30,  6, 22, 10,  2, 11, 28,  3,  8,  4,
        5])

	0	1	2
0	1	2014-04-07T10:51:09.277Z	214536502
1	1	2014-04-07T10:54:09.868Z	214536500
2	1	2014-04-07T10:54:46.998Z	214536506
3	1	2014-04-07T10:57:00.306Z	214577561
4	2	2014-04-07T13:56:37.614Z	214662742

	SessionId	TimeStr_x	ItemId	Action
295	87	2014-04-07T06:29:05.905Z	214820392	BUY
296	87	2014-04-07T06:29:15.640Z	214820392	BUY
303	87	2014-04-07T06:47:46.542Z	214819760	CLICK
1492	496	2014-04-07T16:56:22.584Z	214716110	CLICK
1496	496	2014-04-07T16:58:03.005Z	214819762	CLICK

	SessionId	TimeStr_x	ItemId	Action
295	87	1.396852e+09	214820392	BUY
296	87	1.396852e+09	214820392	BUY
303	87	1.396853e+09	214819760	CLICK
1492	496	1.396890e+09	214716110	CLICK
1496	496	1.396890e+09	214819762	CLICK

	SessionId	TimeStr_x	ItemId	Action	Rating
295	87	1.396852e+09	214820392	BUY	5
296	87	1.396852e+09	214820392	BUY	5
303	87	1.396853e+09	214819760	CLICK	1
1492	496	1.396890e+09	214716110	CLICK	1
1496	496	1.396890e+09	214819762	CLICK	1

	SessionId	TimeStr_x	ItemId	Item_Type_Code	Item_Type	TimeStr_y	Price	Quantity	Action
295	87	2014-04-07T06:29:05.905Z	214820392	0	NONE	2014-04-07T06:57:22.749Z	627.0	1.0	BUY
296	87	2014-04-07T06:29:15.640Z	214820392	0	NONE	2014-04-07T06:57:22.749Z	627.0	1.0	BUY