import sys
sys.path.append("../")
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
%%time
# Raw click log, one row per click event.  The .dat file has no header, so
# columns are positional: 0=session id, 1=timestamp string, 2=item id,
# 3=category field (per assign_cat below: "S", "0", a small number, or a
# larger brand-style id) — read as str so "S" survives.
df_clicks = pd.read_csv("/tf/notebooks/data/yoochoose/yoochoose-clicks.dat", sep=",", header=None,
dtype={0:np.int32, 1:str, 2:np.int64, 3:str})
df_clicks.shape
df_clicks.head()
# df_clicks.iloc[:,3].unique()
def assign_cat(x):
    """Map the raw yoochoose category field (column 3) to a coarse label.

    Encoding observed in the data: the literal "S" marks a special/promo
    click, "0" means no category, 1-12 are real category ids, and anything
    larger is treated as a brand id.

    Parameters
    ----------
    x : str
        Raw category value (the column is read with dtype=str).

    Returns
    -------
    str
        One of "PROMO", "NONE", "CATEGORY", "BRAND".
    """
    if x == "S":
        return "PROMO"
    # np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # int is the correct (and equivalent) spelling.
    value = int(x)
    if value == 0:
        return "NONE"
    if value < 13:
        return "CATEGORY"
    return "BRAND"
# Derive the coarse category label for every click from positional column 3.
df_clicks["item_type"] = df_clicks.iloc[:, 3].map(assign_cat)
# df_clicks.head()
# Give the positional columns (plus the new label) readable names.
df_clicks.columns = ["SessionId", "TimeStr", "ItemId", "Item_Type_Code", "Item_Type"]
df_clicks.SessionId.nunique()
# Raw buy events; positional columns: 0=session id, 1=timestamp string,
# 2=item id, 3=price, 4=quantity.
df_buys = pd.read_csv("/tf/notebooks/data/yoochoose/yoochoose-buys.dat", header=None, sep=",",
dtype={0:np.int32, 1:str, 2:np.int64, 3:np.int64, 4:np.int64})
df_buys.head(1)
df_buys.columns = ["SessionId", "TimeStr", "ItemId", "Price", "Quantity"]
# Tag every buy row so that, after the left join below, clicks with no
# matching buy can be identified by a NaN Action.
df_buys["Action"] = "BUY"
df_buys.head(1)
# Left-join buys onto clicks: every click row survives; rows whose
# (SessionId, ItemId) also appears in the buys file pick up
# TimeStr_y/Price/Quantity/Action from the buy side.
df = pd.merge(left=df_clicks, right=df_buys, how="left", on=["SessionId", "ItemId"])
# Free the large clicks frame; rebinding the name to an empty frame keeps
# df_clicks defined for any later cell that still references it.
del df_clicks
df_clicks = pd.DataFrame()
# The join fans out when a (SessionId, ItemId) pair matches several buy rows;
# drop the exact-duplicate rows that produces.
df.drop_duplicates(inplace=True)
df.SessionId.nunique(), df.ItemId.nunique()
# Filter to long sessions and frequently-seen items — presumably to keep only
# entities with enough interactions for the models below; confirm thresholds.
SESSION_THRESHOLD = 25
ITEM_THRESHOLD = 1500
# Events per session.
session_lengths = df.groupby(["SessionId"]).size()
session_gt_threshold = (session_lengths[session_lengths>SESSION_THRESHOLD]).reset_index()
session_gt_threshold.head(1)
# Keep only rows belonging to sessions with more than SESSION_THRESHOLD events.
df_w_session_threshold = df[df.SessionId.isin(session_gt_threshold.SessionId)]
df_w_session_threshold.shape
# Item popularity is counted *after* the session filter, so the two filters compound.
item_lengths = df_w_session_threshold.groupby(["ItemId"]).size()
item_gt_threshold = (item_lengths[item_lengths>ITEM_THRESHOLD]).reset_index()
item_gt_threshold.head(1)
df_final = df_w_session_threshold[df_w_session_threshold.ItemId.isin(item_gt_threshold.ItemId)]
df_final.shape
df_final.head(2)
# To check list of notebooks/kernels currently running
# http://<machine>.ml.pipal.in/tree/notebooks#running
df_final.columns
# df_final is a boolean-mask slice of df_w_session_threshold, so the original
# inplace drop/fillna mutated a view — that raises SettingWithCopyWarning and
# can silently no-op under pandas copy-on-write.  Rebinding the name to the
# result of the non-inplace calls is equivalent and safe.
df_final = df_final.drop(["Item_Type_Code", "Item_Type", "TimeStr_y", "Price", "Quantity"], axis=1)
df_final.to_csv("/tf/notebooks/data/yoochoose/df_processed.csv", index=False)
df_final.head(1)
# Clicks that never matched a buy carry NaN in Action; label them as plain clicks.
# NOTE(review): the CSV above is written *before* this fill, so the saved file
# still contains NaN Actions — confirm that is intended.
df_final["Action"] = df_final["Action"].fillna("CLICK")
df_final.head()
# Fixing time column: converting it into a continuous value (POSIX epoch float).
df_final["TimeStr_x"] = df_final.TimeStr_x.apply(lambda x: dt.datetime.strptime(x,
"%Y-%m-%dT%H:%M:%S.%fZ").timestamp())
# This doesn't really matter - since the time is not in UTC.
# NOTE(review): strptime yields a naive datetime interpreted in *local* time
# even though the 'Z' suffix suggests UTC, so absolute epochs are offset;
# relative ordering within the data is still correct.
df_final.head()
# Implicit-feedback rating: a purchase counts 5, a plain click counts 1.
df_final["Rating"] = df_final.Action.apply(lambda x: 5 if (x=="BUY") else 1)
# Non-inplace sort: df_final originates as a filtered slice, and
# sort_values(inplace=True) on such a frame triggers SettingWithCopyWarning;
# rebinding the name is equivalent and warning-free.
df_final = df_final.sort_values(by=["SessionId", "TimeStr_x"])
df_final.head()
from reco.preprocess import encode_user_item
# Re-index sessions and items to dense 0-based integer ids.
# NOTE(review): encode_user_item's return schema (a frame with
# USER/ITEM/RATING/TIMESTAMP columns plus the two fitted encoders) is inferred
# from the usage below — confirm against reco.preprocess.
DATA, user_encoder, item_encoder = encode_user_item(df_final, "SessionId", "ItemId",
"Rating", "TimeStr_x")
# Shift ids to start at 1 — presumably because spotlight treats id 0 as
# padding in sequence form; confirm against spotlight docs.
DATA.USER = DATA.USER + 1
DATA.ITEM = DATA.ITEM + 1
DATA.dtypes
# Downcast to int32 for the spotlight Interactions constructor below.
DATA.RATING = DATA.RATING.astype(np.int32)
DATA.USER = DATA.USER.astype(np.int32)
DATA.ITEM = DATA.ITEM.astype(np.int32)
from spotlight.interactions import Interactions
# Positional args are (user_ids, item_ids, ratings, timestamps).  Pass plain
# numpy arrays for every field — the original mixed .values ndarrays with raw
# pandas Series for the last two; normalising to .values is consistent and is
# the array form Interactions works with.
df_for_interactions = (DATA.USER.values, DATA.ITEM.values,
                       DATA.RATING.values, DATA.TIMESTAMP.values)
df_interaction = Interactions(*df_for_interactions)
from spotlight.cross_validation import user_based_train_test_split
# Hold out 20% of users (not interactions) for evaluation.
train, test = user_based_train_test_split(df_interaction, test_percentage=0.2)
from spotlight.factorization.implicit import ImplicitFactorizationModel
# Implicit matrix factorization with BPR loss; 3 epochs keeps the run short.
model_implicit = ImplicitFactorizationModel(n_iter=3, loss="bpr")
%%time
model_implicit.fit(train)
# User id of the first *interaction* row in the test set (not the first user).
user_to_reco = test.user_ids[0]
user_to_reco
# Score of every item for that user.
pred_for_user = model_implicit.predict(user_to_reco)
pred_for_user
# argsort of the negated scores = item ids ranked best-first.
rec_item_ids = (-pred_for_user).argsort()
rec_item_ids
# Item of that same first test interaction, for eyeballing against the ranking.
ground_truth = test.item_ids[0]
ground_truth
from spotlight.evaluation import precision_recall_score
# Per-user precision@5 / recall@5 arrays; the means summarise across users.
(pk, rk) = precision_recall_score(model_implicit, test, k=5)
pk, np.mean(pk)
rk, np.mean(rk)
from spotlight.sequence.representations import CNNNet
from spotlight.sequence.implicit import ImplicitSequenceModel
# Sliding-window sequence datasets: windows of up to 150 events per user,
# discarding sequences shorter than 30.  step_size (200) exceeds
# max_sequence_length, so consecutive windows do not overlap.
max_sequence_length = 150
min_sequence_length = 30
step_size = 200
train_sequence = train.to_sequence(max_sequence_length= max_sequence_length,
min_sequence_length = min_sequence_length,
step_size = step_size)
test_sequence = test.to_sequence(max_sequence_length= max_sequence_length,
min_sequence_length = min_sequence_length,
step_size = step_size)
train_sequence.sequences.shape
# Convolutional sequence representation over item embeddings.
nnet = CNNNet(train_sequence.num_items,
embedding_dim=128,
kernel_width=3,
dilation=[1,1,1,1],
num_layers=2,
nonlinearity="relu",
residual_connections=False)
# Next-item model trained with BPR loss on the CNN representation;
# 2 epochs keeps the demo quick.
model_sequence = ImplicitSequenceModel(
loss = "bpr",
representation= nnet,
batch_size= 32,
learning_rate= 0.1,
l2 = 0.0,
n_iter= 2)
%%time
model_sequence.fit(train_sequence)
# Inspect one held-out window: its first 149 events alongside its final event.
test_sequence.sequences[10][0:149], test_sequence.sequences[10][149]
# Rank all items as candidates for the next event after a 149-event prefix.
# NOTE(review): this scores sequence 0 while the inspection above looked at
# sequence 10 — confirm the index mismatch is intentional.
sequence_prediction = model_sequence.predict(test_sequence.sequences[0][0:149])
rec_item_ids = (-sequence_prediction).argsort()
rec_item_ids