import sys
sys.path.append("../")

import numpy as np
import pandas as pd

from keras.models import Model
from keras.applications.vgg16 import VGG16
from keras.applications.inception_v3 import InceptionV3
from keras.applications.resnet50 import ResNet50
from keras.models import Model
from os import listdir
from keras.preprocessing.image import load_img, img_to_array

# VGG16 with ImageNet weights. include_top=False omits the classifier head,
# so the network ends at block5_pool (see the summary printed below) and
# accepts inputs of any spatial size.
base_model = VGG16(weights="imagenet", include_top=False)

# Print the layer-by-layer architecture and parameter counts.
base_model.summary()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_5 (InputLayer)         (None, None, None, 3)     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0         
_________________________________________________________________
block3_conv1 (Conv2D)        (None, None, None, 256)   295168    
_________________________________________________________________
block3_conv2 (Conv2D)        (None, None, None, 256)   590080    
_________________________________________________________________
block3_conv3 (Conv2D)        (None, None, None, 256)   590080    
_________________________________________________________________
block3_pool (MaxPooling2D)   (None, None, None, 256)   0         
_________________________________________________________________
block4_conv1 (Conv2D)        (None, None, None, 512)   1180160   
_________________________________________________________________
block4_conv2 (Conv2D)        (None, None, None, 512)   2359808   
_________________________________________________________________
block4_conv3 (Conv2D)        (None, None, None, 512)   2359808   
_________________________________________________________________
block4_pool (MaxPooling2D)   (None, None, None, 512)   0         
_________________________________________________________________
block5_conv1 (Conv2D)        (None, None, None, 512)   2359808   
_________________________________________________________________
block5_conv2 (Conv2D)        (None, None, None, 512)   2359808   
_________________________________________________________________
block5_conv3 (Conv2D)        (None, None, None, 512)   2359808   
_________________________________________________________________
block5_pool (MaxPooling2D)   (None, None, None, 512)   0         
=================================================================
Total params: 14,714,688
Trainable params: 14,714,688
Non-trainable params: 0
_________________________________________________________________

def preprocess_image(x):
    """Rescale pixel intensities from the [0, 255] range to [-1, 1].

    Args:
        x: Array-like of pixel values. Integer input is converted to
            float32 first; floating-point input keeps its dtype.

    Returns:
        A new array of the same shape holding ``(x / 255 - 0.5) * 2``.
        The argument itself is never modified.
    """
    # NOTE(review): this is the [-1, 1] scaling, while the notebook feeds the
    # result to VGG16; Keras ships a dedicated
    # keras.applications.vgg16.preprocess_input -- confirm which is intended.
    pixels = np.asarray(x)
    if not np.issubdtype(pixels.dtype, np.floating):
        # In-place true division cannot write into an integer array, so
        # promote to float before scaling.
        pixels = pixels.astype(np.float32)
    # Out-of-place arithmetic: same operations in the same order as before,
    # but the caller's array is left untouched.
    return (pixels / 255. - 0.5) * 2.

# os.listdir returns entries in arbitrary order, so sort the file names to
# make the slice below select the same posters on every run and machine.
images = sorted(listdir("/tf/notebooks/data/data/posters/"))

selected_images = images[1000:1500] # Let's learn features on a subset of the images

# Try the extraction pipeline on the first selected poster:
# load at 299x299 -> array -> add batch axis -> rescale -> predict.
image = img_to_array(
    load_img("/tf/notebooks/data/data/posters/"+selected_images[0], target_size=(299,299))
)
image = preprocess_image(np.expand_dims(image, axis=0))

# Flatten the model output into a single 1-D feature vector.
feature = base_model.predict(image).ravel()

feature

array([0.        , 0.        , 0.        , ..., 0.        , 0.35005143,
       0.        ], dtype=float32)

# Length of the flattened feature vector. The 41472 reported below equals
# 9 * 9 * 512, consistent with the 512-channel block5_pool output.
feature.shape

(41472,)

def load_photos_predict(directory, names=None, target_size=(299, 299)):
    """Extract a flattened ``base_model`` feature vector for each image.

    Args:
        directory: Folder that contains the image files.
        names: Iterable of file names inside ``directory``. When omitted,
            the module-level ``selected_images`` list is used, which is what
            this function always did before the parameter existed.
        target_size: ``(height, width)`` each image is resized to on load.

    Returns:
        A list with one 1-D feature array per name, in iteration order.
        An empty ``names`` yields an empty list.
    """
    import os  # local import: the file only imports ``listdir`` from ``os``

    if names is None:
        names = selected_images

    features = []
    for name in names:
        # os.path.join avoids a doubled separator when ``directory`` already
        # ends with "/".
        filename = os.path.join(directory, name)
        # Load the image, convert it to an array and add the batch axis
        # that ``predict`` expects.
        batch = np.expand_dims(
            img_to_array(load_img(filename, target_size=target_size)), axis=0
        )
        batch = preprocess_image(batch)
        # Flatten the model output for this image into one vector.
        features.append(base_model.predict(batch).ravel())

    return features

%%time
# Extract one feature vector per poster in selected_images
# (load_photos_predict calls base_model.predict once per image).
poster_features = load_photos_predict("/tf/notebooks/data/data/posters/")

CPU times: user 7min 47s, sys: 1.46 s, total: 7min 48s
Wall time: 2min 4s

Nearest Neighbor

from reco.recommend import get_similar
from sklearn.neighbors import NearestNeighbors, VALID_METRICS
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import PIL

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

def get_similar(embedding, k):
    """Look up the ``k`` nearest rows of ``embedding`` for every row.

    A brute-force cosine-distance index is fitted on ``embedding`` and then
    queried with that same data.

    Args:
        embedding: 2-D array-like with one row per item.
        k: Number of neighbours to return per row.

    Returns:
        The ``(distances, indices)`` pair produced by
        ``NearestNeighbors.kneighbors``.
    """
    neighbour_index = NearestNeighbors(algorithm="brute", metric="cosine",
                                       n_neighbors=k)
    neighbour_index.fit(embedding)
    return neighbour_index.kneighbors(embedding)

%%time
# Index only the first 100 poster feature vectors and fetch 5 neighbours each.
item_distances, item_similar_indices = get_similar(poster_features[:100], 5)

CPU times: user 89.3 ms, sys: 12.1 ms, total: 101 ms
Wall time: 56.5 ms

# Neighbour indices for the first five posters; in the output below,
# column 0 is the query row itself.
item_similar_indices[:5]

array([[ 0, 87, 40, 91, 47],
       [ 1, 98,  6, 91, 40],
       [ 2, 33, 46,  5, 57],
       [ 3, 68, 87, 40, 69],
       [ 4,  6, 23, 64, 40]])

def show_similar(item_index, item_similar_indices, filenames=None,
                 poster_dir='/tf/notebooks/data/data/posters/', columns=5):
    """Plot the posters of the items most similar to ``item_index``.

    Args:
        item_index: Row of ``item_similar_indices`` to display.
        item_similar_indices: 2-D array-like; row ``i`` holds the indices of
            the neighbours of item ``i``.
        filenames: Optional sequence mapping a neighbour index to a poster
            file name inside ``poster_dir``. When omitted, neighbour ``i`` is
            shown as ``<i + 1>.jpg``, the mapping this function always used.
        poster_dir: Prefix prepended to every file name; it is concatenated
            as a plain string, so it must end with a path separator.
        columns: Number of posters per row of the figure.
    """
    neighbour_ids = item_similar_indices[item_index]

    posters = []
    for neighbour_id in neighbour_ids:
        if filenames is None:
            poster_name = str(neighbour_id + 1) + '.jpg'
        else:
            poster_name = filenames[neighbour_id]
        posters.append(mpimg.imread(poster_dir + poster_name))

    # plt.subplot needs an integer row count. Ceil division gives exactly as
    # many rows as required, where the old ``len / columns + 1`` produced a
    # float and always reserved one extra row.
    rows = max(1, -(-len(posters) // columns))

    plt.figure(figsize=(20,10))
    for position, poster in enumerate(posters, start=1):
        plt.subplot(rows, columns, position)
        plt.axis('off')
        plt.imshow(poster)

# NOTE(review): these indices are positions within poster_features[:100]
# (i.e. within selected_images), whereas show_similar loads "<index + 1>.jpg".
# The posters displayed may therefore not be the ones the neighbours were
# computed from -- confirm the index-to-file mapping.
show_similar(2, item_similar_indices)

Text Features

# Per-movie metadata table; its "overview" column holds a free-text summary.
item_features = pd.read_csv("/tf/notebooks/data/data/item_features.csv")

item_features.head()

# Overview text of the first row.
item_features["overview"][0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

# Number of rows whose overview is missing.
item_features.overview.isna().sum()

9

# fill the missing value with some dummy text
# Assign the result back to the column: calling fillna(inplace=True) on the
# attribute-accessed Series is a chained in-place update, which does not
# modify item_features when pandas Copy-on-Write is in effect.

item_features["overview"] = item_features["overview"].fillna("Missing value")

Get the sentence vector

import spacy

# Load the "en_core_web_lg" spaCy pipeline used for the text embeddings.
nlp = spacy.load("en_core_web_lg")

# Run the pipeline on the first overview.
doc = nlp(item_features["overview"][0])

# Dimensionality of the document vector (300, per the output below).
doc.vector.shape

(300,)

def word_vec(sentence):
    """Return the spaCy document vector of ``sentence``.

    The text is run through the module-level ``nlp`` pipeline and the
    resulting ``Doc.vector`` is handed back unchanged.
    """
    return nlp(sentence).vector

%%time
# Embed every overview; each element of the resulting Series is one
# document vector returned by word_vec.
overview_embedding = item_features["overview"].apply(word_vec)

CPU times: user 24.6 s, sys: 38 ms, total: 24.6 s
Wall time: 24.6 s

# Peek at the first two embeddings (the Series stores whole vectors as objects).
overview_embedding[:2]

0    [0.02493808, 0.17454928, -0.09294575, -0.04020...
1    [-0.14874268, 0.054975014, -0.01273541, 0.0032...
Name: overview, dtype: object

# Converting overview_embedding into a pandas dataframe:
# one row per overview, one column per embedding dimension.

overview_embedding_list = [vector.tolist() for vector in overview_embedding]

overview_embedding_df = pd.DataFrame(overview_embedding_list)

overview_embedding_df.head()

%%time
# Five nearest neighbours per overview embedding; this rebinds
# item_distances / item_similar_indices from the poster-based results.
item_distances, item_similar_indices = get_similar(overview_embedding_df, 5)

CPU times: user 288 ms, sys: 153 ms, total: 441 ms
Wall time: 120 ms

# Neighbour indices are row positions in overview_embedding_df.
item_similar_indices

array([[   0, 1597, 1212, 1363, 1412],
       [   1,  228, 1596,  847,  772],
       [   2,  974,  494, 1397,  451],
       ...,
       [1734, 1733, 1467, 1466,   35],
       [1735, 1369,  745,   47,   25],
       [1736,  973,  619, 1637, 1638]])

# Indices here are row positions in item_features, and show_similar displays
# "<index + 1>.jpg". That lines up with movie_id in the first rows of the
# table below; presumably it holds for every row -- verify.
show_similar(0, item_similar_indices)

	adult	backdrop_path	belongs_to_collection	budget	genres	homepage	id	imdb_id	original_language	original_title	...	runtime	spoken_languages	status	tagline	title	video	vote_average	vote_count	movie_id	tmdb_id
0	False	/dji4Fm0gCDVb9DQQMRvAI8YNnTz.jpg	{'id': 10194, 'name': 'Toy Story Collection', ...	30000000	[{'id': 16, 'name': 'Animation'}, {'id': 35, '...	http://toystory.disney.com/toy-story	862	tt0114709	en	Toy Story	...	81.0	[{'iso_639_1': 'en', 'name': 'English'}]	Released	NaN	Toy Story	False	7.9	10878	1	862
1	False	/dA9I0Vd9OZzRQ2GyGcsFXdKGMz3.jpg	{'id': 645, 'name': 'James Bond Collection', '...	58000000	[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...	http://www.mgm.com/view/movie/757/Goldeneye/	710	tt0113189	en	GoldenEye	...	130.0	[{'iso_639_1': 'en', 'name': 'English'}, {'iso...	Released	No limits. No fears. No substitutes.	GoldenEye	False	6.8	2037	2	710
2	False	/3EqYpbGCE9S5GddU2K4cYzP5UmI.jpg	NaN	4000000	[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...	NaN	5	tt0113101	en	Four Rooms	...	98.0	[{'iso_639_1': 'en', 'name': 'English'}]	Released	Twelve outrageous guests. Four scandalous requ...	Four Rooms	False	6.1	1251	3	5
3	False	/g1BfxcnplYEveGqS1ttfhdRBgbR.jpg	{'id': 91698, 'name': 'Chili Palmer Collection...	30250000	[{'id': 35, 'name': 'Comedy'}, {'id': 53, 'nam...	NaN	8012	tt0113161	en	Get Shorty	...	105.0	[{'iso_639_1': 'en', 'name': 'English'}]	Released	The mob is tough, but it’s nothing like show b...	Get Shorty	False	6.5	501	4	8012
4	False	/gE1DHa82NZVb9B2Lx2cLNz43Iwd.jpg	NaN	0	[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...	NaN	1710	tt0112722	en	Copycat	...	124.0	[{'iso_639_1': 'en', 'name': 'English'}, {'iso...	Released	One man is copying the most notorious killers ...	Copycat	False	6.5	424	5	1710

	0	1	2	3	4	5	6	7	8	9	...	290	291	292	293	294	295	296	297	298	299
0	0.024938	0.174549	-0.092946	-0.040207	0.024558	0.021060	0.045996	-0.143893	0.022363	2.109395	...	-0.164214	0.073254	0.071066	0.005870	-0.047621	-0.005115	0.025926	-0.035420	-0.059641	-0.004321
1	-0.148743	0.054975	-0.012735	0.003243	-0.082071	-0.177124	-0.135892	0.036500	-0.000987	1.804925	...	-0.193255	0.056461	0.038040	-0.049742	-0.059125	0.078672	0.028826	0.006573	-0.070430	-0.000461
2	0.024798	0.201679	-0.035214	-0.044371	0.065224	0.011609	-0.008056	-0.141306	0.003372	2.377326	...	-0.125685	0.043005	0.087619	-0.073365	0.035089	-0.006726	-0.003821	-0.012937	-0.014347	0.046458
3	-0.044721	0.126973	-0.104857	-0.019217	0.138141	0.091052	0.026251	-0.202099	0.047526	2.029502	...	-0.125688	0.038220	0.011013	0.041688	0.000462	-0.014254	-0.015342	-0.110975	-0.009556	0.053125
4	0.080059	-0.017512	-0.176883	0.082077	0.056057	-0.022036	-0.042583	-0.109661	0.073444	2.139982	...	-0.184866	0.007139	0.070511	0.089315	0.028993	-0.071459	0.047445	-0.044927	-0.102735	-0.100472