Can we cluster movies that go together based on
IMDB for Western Movies
import pandas as pd
import numpy as np
# Load the IMDB movie dataset from the course notes site (requires network access).
url = "https://notes.pipal.in/2018/airwatch-ml/IMDB-Movie-Data.csv"
df = pd.read_csv(url)
# Inspect the original column labels and the (rows, columns) shape.
df.columns
df.shape
# Short single-word column names so columns can be read as attributes
# (e.g. df.Metascore below).
# NOTE(review): the assignment is positional; it assumes the CSV has exactly
# these 12 columns in this order -- confirm against the df.columns output above.
colnames = ['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year',
'Runtime', 'Rating', 'Votes', 'Revenue', 'Metascore']
df.columns = colnames
df.head()
# Count missing values per column.
df.isnull().sum()
# Mean Metascore over the rows that have one (pandas skips NaN by default).
df.Metascore.mean()
import matplotlib.pyplot as plt
%matplotlib inline
# Pairwise correlations between the numeric columns only. The frame also holds
# text columns (Title, Genre, ...); pandas >= 2.0 raises on those instead of
# silently dropping them, so select the numeric columns explicitly.
df.select_dtypes(include="number").corr()
# Visual check of how Metascore relates to Rating before using Rating to
# predict missing Metascore values.
plt.scatter(df.Metascore, df.Rating)
from sklearn.linear_model import LinearRegression

# Training data: only the rows where both Rating and Metascore are present.
df_clean = df.loc[:, ['Rating', 'Metascore']].dropna()

# Simple linear model predicting Metascore from Rating; fit() returns the
# fitted estimator itself, so it can be bound directly.
metascore_model = LinearRegression().fit(df_clean[['Rating']], df_clean['Metascore'])

# Slope and intercept of the fitted line.
(metascore_model.coef_, metascore_model.intercept_)
# Work on a deep copy so the raw frame `df` keeps its missing values.
df2 = df.copy(deep=True)
# Boolean mask of the rows whose Metascore is missing.
missing_indices = df2.Metascore.isnull()
# Fill the gaps with the regression's prediction from Rating.
# A single .loc assignment is used instead of chained indexing
# (df2['Metascore'][mask] = ...): chained assignment writes through an
# intermediate object, triggers SettingWithCopyWarning and leaves df2
# unchanged when pandas Copy-on-Write is active.
if missing_indices.any():  # predict() rejects an empty input
    df2.loc[missing_indices, 'Metascore'] = metascore_model.predict(
        df2.loc[missing_indices, ['Rating']]
    )
# Metascore should now report 0 missing values in df2 ...
df2.isnull().sum()
# ... while the original frame is untouched.
df.isnull().sum()
df2.head()
# Number of Genres
# Each movie's Genre is a comma-separated string; split it, flatten to one
# genre per row and count the distinct values.
df.Genre.str.split(",", expand=True).stack().unique().shape
# One-Hot Encoding for Genres
# After stack() the index is (movie row, position in the genre list).
# get_dummies() gives one indicator row per (movie, genre) pair; summing per
# movie (index level 0) collapses them into a single 0/1 row per movie.
# groupby(level=0).sum() replaces .sum(level=0), which was removed in pandas 2.0.
genre = (
    df.Genre
    .str
    .split(",", expand=True)
    .stack()
    .str
    .get_dummies()
    .groupby(level=0)
    .sum()
)
# Scale the Continuous Values
from sklearn.preprocessing import StandardScaler

# The four numeric features used for clustering, taken from df2 so that
# Metascore has no gaps.
df_cont = df2.loc[:, ["Runtime", "Rating", "Votes", "Metascore"]]
df_cont.head()

# Standardise each column (zero mean, unit variance) so no single feature
# dominates the distance computation; fit() returns the fitted scaler.
SS = StandardScaler().fit(df_cont)
df_cont_scaled = SS.transform(df_cont)
df_cont_scaled
Similar clusters of movies based on genre and the scaled continuous features
# Feature matrix: one-hot genre columns followed by the scaled numeric columns.
X = np.c_[np.array(genre), df_cont_scaled]
X.shape
from sklearn.cluster import KMeans
# n_clusters=8 and n_init=10 spell out the defaults this notebook was written
# against, so newer scikit-learn releases behave the same. random_state is
# pinned because cluster ids are referenced by number below
# (get_top_movies(3)) and the fitted model is saved to disk; without a seed
# the ids change on every run.
model_km = KMeans(n_clusters=8, n_init=10, random_state=0)
model_km.fit(X)
# Cluster id assigned to each movie, in the same row order as df2.
model_km.labels_
# Ten best-rated titles of cluster 1.
df2[model_km.labels_ == 1].sort_values(by="Rating", ascending=False).head(10).Title
def get_top_movies(cluster_id):
    """Return up to ten of the best-rated movies in a cluster.

    Args:
        cluster_id: Cluster label as found in ``model_km.labels_``.

    Returns:
        A list of dicts with keys ``"id"`` (row label in ``df2``),
        ``"title"`` and ``"director"``, ordered by descending Rating.
    """
    # labels_ is aligned row-by-row with df2, so it works as a boolean mask.
    in_cluster = df2[model_km.labels_ == cluster_id]
    best = in_cluster.sort_values(by="Rating", ascending=False).head(10)
    return [
        {"id": idx, "title": title, "director": director}
        for idx, title, director in zip(best.index, best.Title, best.Director)
    ]
# Small demo of the numpy indexing used by get_top_movies.
x = np.asarray([1, 2, 3, 4])
y = np.asarray(["A", "B", "B", "A"])
# Integer list: pick the elements at positions 1 and 2.
x[[1, 2]]
# Boolean list: keep the elements where the mask is True.
x[[True, False, False, True]]
# The comparison builds such a mask, selecting x where y equals "A".
x[y == "A"]
# Best-rated movies (at most ten) among those labelled with cluster id 3.
get_top_movies(3)
Q: How to get the cluster id?
# Feature rows of the first five movies.
X[:5]
# predict() maps each feature row to a cluster id.
model_km.predict(X[:5])
# The same five movies, for comparison with the ids above.
df2.head()
get_top_movies(3)
How to find the related movies given a movie?
def get_related_movies(movie_id):
    """Return the top movies of the cluster that a given movie belongs to.

    Args:
        movie_id: Row position of the movie, used to index
            ``model_km.labels_``.

    Returns:
        The list produced by ``get_top_movies`` for that movie's cluster.
    """
    return get_top_movies(model_km.labels_[movie_id])
# Look at the first rows to pick a movie, then list the movies related to row 0.
df2.head()
get_related_movies(0)
Save the data frame first.
# Write the imputed frame to disk; index=False leaves the row index out of the file.
df2.to_csv("movies.csv", index=False)
How to save the model?
import joblib
# Persist the fitted KMeans model (including labels_) so it can be loaded
# later without re-fitting.
joblib.dump(model_km, "model_km.model")
Let us move the required code to a python file.
%%file movies.py
import pandas as pd
import joblib
import sys
# Load the saved data and model once, at import time. Both paths are relative,
# so this module must be run from the directory containing the two files.
# NOTE(review): get_top_movies masks df with model_km.labels_, so the row
# order of movies.csv must match the data the model was fitted on.
df = pd.read_csv("movies.csv")
model_km = joblib.load("model_km.model")
def get_top_movies(cluster_id):
    """Return up to ten of the best-rated movies in a cluster.

    Args:
        cluster_id: Cluster label as found in ``model_km.labels_``.

    Returns:
        A list of dicts with keys ``"id"`` (row label in ``df``),
        ``"title"`` and ``"director"``, ordered by descending Rating.
    """
    # labels_ lines up with the rows of df, so the comparison is a row mask.
    members = df.loc[model_km.labels_ == cluster_id]
    ranked = members.sort_values(by="Rating", ascending=False)
    top = []
    for row in ranked.iloc[:10].itertuples():
        top.append({"id": row.Index, "title": row.Title, "director": row.Director})
    return top
def get_related_movies(movie_id):
    """Return the top movies of the cluster that a given movie belongs to.

    Args:
        movie_id: Zero-based row position of the movie in ``movies.csv``.

    Returns:
        The list produced by ``get_top_movies`` for that movie's cluster.

    Raises:
        IndexError: If ``movie_id`` is outside ``0 .. number of movies - 1``.
    """
    n_movies = len(model_km.labels_)
    # Reject negative ids explicitly: numpy would otherwise wrap them around
    # and silently answer for a different movie (e.g. -1 -> the last row).
    if not 0 <= movie_id < n_movies:
        raise IndexError(
            "movie_id must be between 0 and %d, got %r" % (n_movies - 1, movie_id)
        )
    cluster_id = model_km.labels_[movie_id]
    return get_top_movies(cluster_id)
def main():
    """Command-line entry point: print the movies related to MOVIE_ID.

    Reads the movie id from the first command-line argument. Exits with a
    short message and a non-zero status when the argument is missing or is
    not an integer, instead of failing with a traceback.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: python movies.py MOVIE_ID")
    try:
        movie_id = int(sys.argv[1])
    except ValueError:
        sys.exit("MOVIE_ID must be an integer, got %r" % sys.argv[1])
    print(get_related_movies(movie_id))


if __name__ == "__main__":
    main()
!python movies.py 2
Install firefly.
pip install firefly-python
And run the following in your terminal.
firefly movies.get_related_movies
That starts the function as a web API.
Once it is running, you can call it from Python using the firefly client.
import firefly
# Client for the API server.
# NOTE(review): assumes the server started with
# `firefly movies.get_related_movies` is listening on 127.0.0.1:8000 -- confirm.
api = firefly.Client("http://127.0.0.1:8000/")
# Remote call to get_related_movies; the argument is passed by keyword.
api.get_related_movies(movie_id=5)