Classical matrix-factorisation model for collaborative filtering, popularised by the Netflix Prize competition
import sys
sys.path.append("../")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
Dataset from https://grouplens.org/datasets/movielens/100k/
# Load the MovieLens 100K ratings table (user_id, movie_id, rating, unix_timestamp).
df_ratings = pd.read_csv("/tf/notebooks/data/data/ratings.csv")
df_ratings.head()
df_ratings.shape
# Sparsity: fraction of the user x item matrix that is actually observed
# (number of ratings divided by the size of the full interaction matrix).
df_ratings.shape[0]/ (df_ratings.user_id.nunique() * df_ratings.movie_id.nunique())
from reco.preprocess import encode_user_item
#encode_user_item??
# Re-index raw user/movie ids into contiguous integer codes (USER / ITEM columns)
# suitable for Embedding-layer lookups; keep the fitted encoders to map back later.
DATA, user_encoder, item_encoder = encode_user_item(df_ratings, "user_id", "movie_id",
"rating", "unix_timestamp")
DATA.head()
from reco.preprocess import user_split, random_split
#user_split??
#train, val, test = user_split(DATA, [0.6, 0.2 ,0.2])
# 80/20 random split of the interactions into train and test sets.
train, test = random_split(DATA, [0.8, 0.2])
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Add, Dot, Activation
from keras.regularizers import l2
from keras.utils import plot_model
def ExplicitMF(n_users, n_items, n_factors):
    """Build and compile a plain matrix-factorisation model.

    The predicted rating is the dot product of a user latent vector and
    an item latent vector, both learned as Embedding tables.

    Args:
        n_users: number of distinct users (item-axis size of the user table).
        n_items: number of distinct items (item-axis size of the item table).
        n_factors: latent dimensionality shared by both embeddings.

    Returns:
        A compiled Keras Model taking [user_ids, item_ids] and predicting
        a rating (MSE loss, Adam optimizer).
    """
    # Item tower: id -> latent vector.
    item_in = Input(shape=[1], name="Item")
    item_latent = Flatten(name="FlattenItemE")(
        Embedding(n_items, n_factors,
                  embeddings_regularizer=l2(1e-6),
                  name="ItemEmbedding")(item_in))

    # User tower: id -> latent vector.
    user_in = Input(shape=[1], name="User")
    user_latent = Flatten(name="FlattenUserE")(
        Embedding(n_users, n_factors,
                  embeddings_regularizer=l2(1e-6),
                  name="UserEmbedding")(user_in))

    # Predicted rating = inner product of the two latent vectors.
    pred = Dot(axes=1, name="DotProduct")([item_latent, user_latent])

    mf_model = Model([user_in, item_in], pred)
    mf_model.compile(loss="mean_squared_error", optimizer="adam")
    return mf_model
# Embedding table sizes: one row per distinct user / item.
n_users = DATA.USER.nunique()
n_items = DATA.ITEM.nunique()
n_factors = 40
model = ExplicitMF(n_users, n_items, n_factors)
plot_model(model, show_layer_names=True, show_shapes=True)
model.summary()
# Parameter counts of the two embedding tables (users*factors, items*factors).
n_users * 40, n_items * 40
%%time
# Train the plain MF model; 20% of the training rows are held out for validation.
output = model.fit([train.USER, train.ITEM], train.RATING,
batch_size=64, epochs=10, verbose=1, validation_split=0.2)
from reco.vis import metrics
# Plot the training history (loss curves).
metrics(output.history)
from reco.evaluate import get_embedding
# Pull the learned item-embedding matrix (n_items x n_factors) out of the model.
item_embedding = model.get_layer("ItemEmbedding").get_weights()[0]
item_embedding.shape
from reco.recommend import get_similar, show_similar
from sklearn.neighbors import NearestNeighbors
def get_similar(embedding, k):
    """Find the k nearest neighbours of every row of `embedding`.

    Fits a ball-tree on the embedding matrix and queries it with the same
    matrix, so each row's neighbour list is drawn from the full item set
    (typically including the row itself at distance zero).

    Args:
        embedding: 2-D array of latent vectors, one row per item.
        k: number of neighbours to return per row.

    Returns:
        (distances, indices) arrays of shape (n_rows, k).
    """
    knn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree")
    knn.fit(embedding)
    return knn.kneighbors(embedding)
# Nearest 5 neighbours of every item in embedding space.
item_distance, item_similar_indices = get_similar(item_embedding, 5)
item_similar_indices
import matplotlib.image as mpimage
def show_similar(item_index, item_similar_indices, item_encoder):
    """Plot the poster images of the items most similar to `item_index`.

    Args:
        item_index: encoded item id whose neighbours to display.
        item_similar_indices: array from get_similar(); row i holds the
            encoded ids of item i's nearest neighbours.
        item_encoder: fitted encoder mapping encoded ids back to the
            original movie ids (poster files are named by movie id).
    """
    similar_ids = item_similar_indices[item_index]
    movie_ids = item_encoder.inverse_transform(similar_ids)

    images = []
    for movie_id in movie_ids:
        img_path = '/tf/notebooks/data/data/posters/' + str(movie_id) + '.jpg'
        images.append(mpimage.imread(img_path))

    plt.figure(figsize=(20,10))
    columns = 5
    # BUG FIX: the original used `len(images) / columns + 1`, which is float
    # division in Python 3 — plt.subplot requires an integer row count.
    rows = len(images) // columns + 1
    for i, image in enumerate(images):
        plt.subplot(rows, columns, i + 1)
        plt.axis('off')
        plt.imshow(image)
# Display the posters most similar to encoded item 0.
show_similar(0, item_similar_indices, item_encoder)
Embedding: add non-negativity constraints to the embedding layers
from keras.layers import Lambda
# Observed rating bounds, used to rescale the sigmoid output back to rating units.
max_rating = DATA.RATING.max()
min_rating = DATA.RATING.min()
max_rating, min_rating
def ExplicitMF_bias(n_users, n_items, n_factors, rating_min=None, rating_max=None):
    """Matrix factorisation with per-user and per-item bias terms.

    Predicted rating = sigmoid(dot(user, item) + user_bias + item_bias),
    rescaled from (0, 1) to the rating range ("scaling trick").

    Args:
        n_users: number of distinct users.
        n_items: number of distinct items.
        n_factors: latent dimensionality of both embedding tables.
        rating_min: lower bound for the output rescaling; defaults to the
            module-level `min_rating` (backward compatible with the
            original 3-argument call).
        rating_max: upper bound; defaults to the module-level `max_rating`.

    Returns:
        A compiled Keras Model (MSE loss, Adam optimizer).
    """
    # GENERALIZATION: the original Lambda closed over the module globals
    # directly; accepting them as parameters makes the builder reusable
    # with other datasets while keeping the old call signature working.
    lo = min_rating if rating_min is None else rating_min
    hi = max_rating if rating_max is None else rating_max

    # Item latent factors.
    item_input = Input(shape=[1], name="Item")
    item_embedding = Embedding(n_items, n_factors,
                               embeddings_regularizer=l2(1e-6),
                               name="ItemEmbedding")(item_input)
    item_vec = Flatten(name="FlattenItemE")(item_embedding)

    # User latent factors.
    user_input = Input(shape=[1], name="User")
    user_embedding = Embedding(n_users, n_factors,
                               embeddings_regularizer=l2(1e-6),
                               name="UserEmbedding")(user_input)
    user_vec = Flatten(name="FlattenUserE")(user_embedding)

    # Per-user scalar bias.
    user_bias = Embedding(n_users, 1,
                          embeddings_regularizer=l2(1e-6),
                          name="UserBias")(user_input)
    user_bias_vec = Flatten(name="FlattenUserBiasE")(user_bias)

    # Per-item scalar bias.
    item_bias = Embedding(n_items, 1,
                          embeddings_regularizer=l2(1e-6),
                          name="ItemBias")(item_input)
    item_bias_vec = Flatten(name="FlattenItemBiasE")(item_bias)

    # Interaction term plus the two biases.
    DotProduct = Dot(axes=1, name="DotProduct")([item_vec, user_vec])
    AddBias = Add(name="AddBias")([DotProduct, user_bias_vec, item_bias_vec])

    # Scaling trick: squash to (0, 1) then stretch to [lo, hi].
    y = Activation("sigmoid")(AddBias)
    rating_output = Lambda(lambda x: x * (hi - lo) + lo)(y)

    model = Model([user_input, item_input], rating_output)
    model.compile(loss="mean_squared_error", optimizer="adam")
    return model
# Build and inspect the bias-augmented MF model with 40 latent factors.
n_factors = 40
model_bias = ExplicitMF_bias(n_users, n_items, n_factors)
model_bias.summary()
plot_model(model_bias, show_layer_names=True, show_shapes=True)
%%time
# Train the bias-augmented model; 20% of training rows held out for validation.
output_bias = model_bias.fit([train.USER, train.ITEM], train.RATING,
batch_size=128, epochs=5, verbose=1, validation_split=0.2)
metrics(output_bias.history)
from keras.layers import Concatenate, Dense, Dropout
def ExplicitMF_bias_concat(n_users, n_items, n_factors, rating_min=None, rating_max=None):
    """Neural variant: concatenate user/item embeddings and feed them
    through Dense layers instead of a plain dot product; per-user and
    per-item biases are added before the sigmoid rescaling.

    Args:
        n_users: number of distinct users.
        n_items: number of distinct items.
        n_factors: latent dimensionality of both embedding tables.
        rating_min: lower bound for the output rescaling; defaults to the
            module-level `min_rating` (backward compatible with the
            original 3-argument call).
        rating_max: upper bound; defaults to the module-level `max_rating`.

    Returns:
        A compiled Keras Model (MSE loss, Adam optimizer).
    """
    # GENERALIZATION: take the rating range as parameters rather than
    # closing over module globals; old calls are unaffected.
    lo = min_rating if rating_min is None else rating_min
    hi = max_rating if rating_max is None else rating_max

    # Item latent factors.
    item_input = Input(shape=[1], name="Item")
    item_embedding = Embedding(n_items, n_factors,
                               embeddings_initializer="he_normal",
                               embeddings_regularizer=l2(1e-6),
                               name="ItemEmbedding")(item_input)
    item_vec = Flatten(name="FlattenItemE")(item_embedding)

    # User latent factors.
    user_input = Input(shape=[1], name="User")
    user_embedding = Embedding(n_users, n_factors,
                               embeddings_regularizer=l2(1e-6),
                               embeddings_initializer="he_normal",
                               name="UserEmbedding")(user_input)
    user_vec = Flatten(name="FlattenUserE")(user_embedding)

    # Per-user scalar bias.
    user_bias = Embedding(n_users, 1,
                          embeddings_regularizer=l2(1e-6),
                          embeddings_initializer="he_normal",
                          name="UserBias")(user_input)
    user_bias_vec = Flatten(name="FlattenUserBiasE")(user_bias)

    # Per-item scalar bias.
    item_bias = Embedding(n_items, 1,
                          embeddings_regularizer=l2(1e-6),
                          embeddings_initializer="he_normal",
                          name="ItemBias")(item_input)
    item_bias_vec = Flatten(name="FlattenItemBiasE")(item_bias)

    # Concatenate the two latent vectors and pass through Dense layers.
    concat = Concatenate(name="Concat")([item_vec, user_vec])
    concatD = Dropout(0.5)(concat)
    # NOTE(review): dense_1 has no activation, so the two Dense layers
    # compose linearly — confirm whether a ReLU was intended here.
    dense_1 = Dense(32, kernel_initializer="he_normal")(concatD)
    dense_1_drop = Dropout(0.5)(dense_1)
    dense_2 = Dense(1, kernel_initializer="he_normal")(dense_1_drop)

    # Add the bias terms to the network output.
    AddBias = Add(name="AddBias")([dense_2, user_bias_vec, item_bias_vec])

    # Scaling trick: squash to (0, 1) then stretch to [lo, hi].
    y = Activation("sigmoid")(AddBias)
    rating_output = Lambda(lambda x: x * (hi - lo) + lo)(y)

    model = Model([user_input, item_input], rating_output)
    model.compile(loss="mean_squared_error", optimizer="adam")
    return model
# Only 2 latent factors here — the Dense layers carry most of the modelling capacity.
n_factors = 2
model_concat = ExplicitMF_bias_concat(n_users, n_items, n_factors)
model_concat.summary()
plot_model(model_concat, show_layer_names=True, show_shapes=True)
trainU, testU = user_split(DATA, [0.8, 0.2])
%%time
output_concat = model.fit([trainU.USER, trainU.ITEM], trainU.RATING,
batch_size=128, verbose=1, epochs=3,
validation_data=([testU.USER, testU.ITEM], testU.RATING))
metrics(output_concat.history)