In [109]:
import sys
sys.path.append("../")
In [110]:
import numpy as np
import pandas as pd
In [ ]:
from keras.models import Model
from keras.applications.vgg16 import VGG16
from keras.applications.inception_v3 import InceptionV3
from keras.applications.resnet50 import ResNet50
from keras.models import Model
from os import listdir
from keras.preprocessing.image import load_img, img_to_array
In [111]:
base_model = VGG16(weights="imagenet", include_top=False)
In [125]:
base_model.summary()
Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_5 (InputLayer)         (None, None, None, 3)     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0         
_________________________________________________________________
block3_conv1 (Conv2D)        (None, None, None, 256)   295168    
_________________________________________________________________
block3_conv2 (Conv2D)        (None, None, None, 256)   590080    
_________________________________________________________________
block3_conv3 (Conv2D)        (None, None, None, 256)   590080    
_________________________________________________________________
block3_pool (MaxPooling2D)   (None, None, None, 256)   0         
_________________________________________________________________
block4_conv1 (Conv2D)        (None, None, None, 512)   1180160   
_________________________________________________________________
block4_conv2 (Conv2D)        (None, None, None, 512)   2359808   
_________________________________________________________________
block4_conv3 (Conv2D)        (None, None, None, 512)   2359808   
_________________________________________________________________
block4_pool (MaxPooling2D)   (None, None, None, 512)   0         
_________________________________________________________________
block5_conv1 (Conv2D)        (None, None, None, 512)   2359808   
_________________________________________________________________
block5_conv2 (Conv2D)        (None, None, None, 512)   2359808   
_________________________________________________________________
block5_conv3 (Conv2D)        (None, None, None, 512)   2359808   
_________________________________________________________________
block5_pool (MaxPooling2D)   (None, None, None, 512)   0         
=================================================================
Total params: 14,714,688
Trainable params: 14,714,688
Non-trainable params: 0
_________________________________________________________________
In [113]:
def preprocess_image(x):
    x /= 255.
    x -= 0.5
    x *= 2.
    return x
In [114]:
images = []
for name in listdir("/tf/notebooks/data/data/posters/"):
    images.append(name)
In [115]:
selected_images = images[1000:1500] # Let's learn features on a subset of the images
In [116]:
image = load_img("/tf/notebooks/data/data/posters/"+selected_images[0], target_size=(299,299))
In [117]:
image = img_to_array(image)
In [118]:
image = np.expand_dims(image, axis=0)
In [119]:
image = preprocess_image(image)
In [120]:
feature = base_model.predict(image).ravel()
In [121]:
feature
Out[121]:
array([0.        , 0.        , 0.        , ..., 0.        , 0.35005143,
       0.        ], dtype=float32)
In [ ]:
 
In [122]:
feature.shape
Out[122]:
(41472,)
In [123]:
def load_photos_predict(directory):
    images = []
    for name in selected_images:
        filename = directory + '/' + name
        image = load_img(filename, target_size=(299, 299))
        # convert the image pixels to a numpy array
        image = img_to_array(image)
        # reshape data for the model
        image = np.expand_dims(image, axis=0)
        # prepare the image for the  model
        image = preprocess_image(image)
        feature = base_model.predict(image).ravel()
        images.append(feature)
        
    return images
In [124]:
%%time
poster_features = load_photos_predict("/tf/notebooks/data/data/posters/")
CPU times: user 7min 47s, sys: 1.46 s, total: 7min 48s
Wall time: 2min 4s

Nearest Neighbor

In [126]:
from reco.recommend import get_similar
from sklearn.neighbors import NearestNeighbors, VALID_METRICS
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import PIL

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
In [127]:
def get_similar(embedding, k):
    model_similar_items = NearestNeighbors(n_neighbors=k, metric="cosine",
                                           algorithm="brute").fit(embedding)
    distances, indices = model_similar_items.kneighbors(embedding)
    
    return distances, indices
In [128]:
%%time
item_distances, item_similar_indices = get_similar(poster_features[:100], 5)
CPU times: user 89.3 ms, sys: 12.1 ms, total: 101 ms
Wall time: 56.5 ms
In [129]:
item_similar_indices[:5]
Out[129]:
array([[ 0, 87, 40, 91, 47],
       [ 1, 98,  6, 91, 40],
       [ 2, 33, 46,  5, 57],
       [ 3, 68, 87, 40, 69],
       [ 4,  6, 23, 64, 40]])
In [130]:
def show_similar(item_index, item_similar_indices):
        
    movie_ids = item_similar_indices[item_index]
    #movie_ids = item_encoder.inverse_transform(s)

    images = []
    for movie_id in movie_ids:
        img_path = '/tf/notebooks/data/data/posters/' + str(movie_id+1) + '.jpg'
        images.append(mpimg.imread(img_path))

    plt.figure(figsize=(20,10))
    columns = 5
    for i, image in enumerate(images):
        plt.subplot(len(images) / columns + 1, columns, i + 1)
        plt.axis('off')
        plt.imshow(image)
In [132]:
show_similar(2, item_similar_indices)

Text Features

In [134]:
item_features = pd.read_csv("/tf/notebooks/data/data/item_features.csv")
In [135]:
item_features.head()
Out[135]:
adult backdrop_path belongs_to_collection budget genres homepage id imdb_id original_language original_title ... runtime spoken_languages status tagline title video vote_average vote_count movie_id tmdb_id
0 False /dji4Fm0gCDVb9DQQMRvAI8YNnTz.jpg {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... http://toystory.disney.com/toy-story 862 tt0114709 en Toy Story ... 81.0 [{'iso_639_1': 'en', 'name': 'English'}] Released NaN Toy Story False 7.9 10878 1 862
1 False /dA9I0Vd9OZzRQ2GyGcsFXdKGMz3.jpg {'id': 645, 'name': 'James Bond Collection', '... 58000000 [{'id': 12, 'name': 'Adventure'}, {'id': 28, '... http://www.mgm.com/view/movie/757/Goldeneye/ 710 tt0113189 en GoldenEye ... 130.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released No limits. No fears. No substitutes. GoldenEye False 6.8 2037 2 710
2 False /3EqYpbGCE9S5GddU2K4cYzP5UmI.jpg NaN 4000000 [{'id': 80, 'name': 'Crime'}, {'id': 35, 'name... NaN 5 tt0113101 en Four Rooms ... 98.0 [{'iso_639_1': 'en', 'name': 'English'}] Released Twelve outrageous guests. Four scandalous requ... Four Rooms False 6.1 1251 3 5
3 False /g1BfxcnplYEveGqS1ttfhdRBgbR.jpg {'id': 91698, 'name': 'Chili Palmer Collection... 30250000 [{'id': 35, 'name': 'Comedy'}, {'id': 53, 'nam... NaN 8012 tt0113161 en Get Shorty ... 105.0 [{'iso_639_1': 'en', 'name': 'English'}] Released The mob is tough, but it’s nothing like show b... Get Shorty False 6.5 501 4 8012
4 False /gE1DHa82NZVb9B2Lx2cLNz43Iwd.jpg NaN 0 [{'id': 18, 'name': 'Drama'}, {'id': 53, 'name... NaN 1710 tt0112722 en Copycat ... 124.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released One man is copying the most notorious killers ... Copycat False 6.5 424 5 1710

5 rows × 27 columns

In [137]:
item_features["overview"][0]
Out[137]:
"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."
In [141]:
item_features.overview.isna().sum()
Out[141]:
9
In [142]:
# fill the missing value with some dummy text
In [143]:
item_features.overview.fillna("Missing value", inplace=True)

Get the sentence vector

In [144]:
import spacy
In [145]:
nlp = spacy.load("en_core_web_lg")
In [146]:
doc = nlp(item_features["overview"][0])
In [149]:
doc.vector.shape
Out[149]:
(300,)
In [151]:
def word_vec(sentence):
    doc = nlp(sentence)
    return doc.vector
In [152]:
%%time
overview_embedding = item_features["overview"].apply(word_vec)
CPU times: user 24.6 s, sys: 38 ms, total: 24.6 s
Wall time: 24.6 s
In [155]:
overview_embedding[:2]
Out[155]:
0    [0.02493808, 0.17454928, -0.09294575, -0.04020...
1    [-0.14874268, 0.054975014, -0.01273541, 0.0032...
Name: overview, dtype: object
In [156]:
# Converting overview_embedding into a pandas dataframe
In [157]:
overview_embedding_list = []
for vec in overview_embedding:
    overview_embedding_list.append(vec.tolist())
    
overview_embedding_df = pd.DataFrame(overview_embedding_list)

overview_embedding_df.head()
Out[157]:
0 1 2 3 4 5 6 7 8 9 ... 290 291 292 293 294 295 296 297 298 299
0 0.024938 0.174549 -0.092946 -0.040207 0.024558 0.021060 0.045996 -0.143893 0.022363 2.109395 ... -0.164214 0.073254 0.071066 0.005870 -0.047621 -0.005115 0.025926 -0.035420 -0.059641 -0.004321
1 -0.148743 0.054975 -0.012735 0.003243 -0.082071 -0.177124 -0.135892 0.036500 -0.000987 1.804925 ... -0.193255 0.056461 0.038040 -0.049742 -0.059125 0.078672 0.028826 0.006573 -0.070430 -0.000461
2 0.024798 0.201679 -0.035214 -0.044371 0.065224 0.011609 -0.008056 -0.141306 0.003372 2.377326 ... -0.125685 0.043005 0.087619 -0.073365 0.035089 -0.006726 -0.003821 -0.012937 -0.014347 0.046458
3 -0.044721 0.126973 -0.104857 -0.019217 0.138141 0.091052 0.026251 -0.202099 0.047526 2.029502 ... -0.125688 0.038220 0.011013 0.041688 0.000462 -0.014254 -0.015342 -0.110975 -0.009556 0.053125
4 0.080059 -0.017512 -0.176883 0.082077 0.056057 -0.022036 -0.042583 -0.109661 0.073444 2.139982 ... -0.184866 0.007139 0.070511 0.089315 0.028993 -0.071459 0.047445 -0.044927 -0.102735 -0.100472

5 rows × 300 columns

In [158]:
%%time
item_distances, item_similar_indices = get_similar(overview_embedding_df, 5)
CPU times: user 288 ms, sys: 153 ms, total: 441 ms
Wall time: 120 ms
In [159]:
item_similar_indices
Out[159]:
array([[   0, 1597, 1212, 1363, 1412],
       [   1,  228, 1596,  847,  772],
       [   2,  974,  494, 1397,  451],
       ...,
       [1734, 1733, 1467, 1466,   35],
       [1735, 1369,  745,   47,   25],
       [1736,  973,  619, 1637, 1638]])
In [161]:
show_similar(0, item_similar_indices)
In [ ]: