VMware Bangalore
June 18-20, 2018
Amit Kapoor • Anand Chitipothu • Bargava Subramanian
Notes of this workshop are available online at: https://bit.ly/vmware-ml
Home | Day 1 | Day 2 - Iris | Day 2 - Housing | Day 3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Raise the bottom edge of the subplots to 15% of the figure height for all figures.
plt.rcParams['figure.subplot.bottom'] = 0.15
%matplotlib inline
# Load the rent data (zipped JSON) directly from the workshop URL.
url = "https://notes.pipal.in/2018/vmware-ml/rent-data.json.zip"
df = pd.read_json(url)
# Remove outliers: drop rows priced above 200000 or with more than 7 bathrooms.
df.drop(df[df.price > 200000].index, inplace=True)
df.drop(df[df.bathrooms > 7].index, inplace=True)
# Log-transform the price; the result is stored next to the raw price.
df["price_log"] = np.log(df.price)
# Feature creation: length of each row's `photos` and `features` values.
df["photos_num"] = df.photos.apply(len)
df["features_num"] = df.features.apply(len)
# Spot check: length of the `photos` value in the row labelled 0.
len(df.loc[0, "photos"])
# Replace the index with a fresh 0..n-1 one (the old index is kept as a column).
df.reset_index(inplace = True)
# Select the columns that are exported and used for modelling below.
data = df[["bathrooms", "bedrooms", "latitude", "longitude", "created", \
"photos_num", "features_num", "price", "price_log", "interest_level"]]
data.head()
# Write the selected columns to CSV, omitting the index.
data.to_csv("data-interactive.csv", index=False)
# Encode interest level: map the interest_level labels to integer class ids.
# NOTE: LabelEncoder numbers the classes in sorted label order (see le.classes_).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(df.interest_level)
interest_level_encoded = le.transform(df.interest_level)
# Target vector used by the classifiers below.
y = interest_level_encoded
# Feature columns before scaling; this column order is the model's input order.
X_unscaled = data[["bathrooms", "bedrooms", "latitude", "longitude", \
"photos_num", "features_num", "price_log"]]
# Scale my data using Standard Scaler
from sklearn.preprocessing import StandardScaler
ss = StandardScaler().fit(X_unscaled)
X = ss.transform(X_unscaled)
# Model Creation - Decision Tree (depth capped at 6, fixed seed for repeatability)
from sklearn.tree import DecisionTreeClassifier
model_dt = DecisionTreeClassifier(max_depth=6, random_state=42)
model_dt.fit(X,y)
# Model Validation: 5-fold cross-validation scored with negative log loss,
# using all available cores (n_jobs=-1); the mean over the folds is reported.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model_dt, X, y, scoring="neg_log_loss", cv=5, n_jobs=-1)
np.mean(scores)
# Model Tuning (HyperParameter Tuning): grid search over max_depth with the
# same scoring; train scores are kept as well (return_train_score=True).
from sklearn.model_selection import GridSearchCV
parameters = {'max_depth':[2, 5, 6, 7, 8, 15]}
clf2 = GridSearchCV(model_dt, parameters, scoring="neg_log_loss", return_train_score=True)
clf2.fit(X, y)
clf2.best_params_
# Model Creation - Random Forest (100 trees, depth capped at 15, fixed seed)
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(max_depth=15, random_state=42, n_estimators=100)
model_rf.fit(X,y)
# Model Validation: 5-fold cross-validation scored with negative log loss,
# using all available cores (n_jobs=-1); the mean over the folds is reported.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model_rf, X, y, scoring="neg_log_loss", cv=5, n_jobs=-1)
np.mean(scores)
# Model Tuning (HyperParameter Tuning): grid search over every combination of
# n_estimators and max_depth. Note that this rebinds `clf2` and `parameters`.
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators':[10, 50, 100], 'max_depth':[2, 5, 6, 7, 8, 9, 10, 15]}
clf2 = GridSearchCV(model_rf, parameters, scoring="neg_log_loss", return_train_score=True)
clf2.fit(X, y)
clf2.best_params_
# Feature importances of the fitted forest, listed in the order of X_unscaled.columns.
model_rf.feature_importances_
X_unscaled.columns
To use this model anywhere else, we need to save the following things:
- model_rf - the RandomForest model
- ss - the StandardScaler transformer used when training the model
- le - the LabelEncoder used to encode the target labels (interest_level) before training
import joblib
# Persist the fitted model, scaler and label encoder, one file each.
joblib.dump(model_rf, "model.pkl")
joblib.dump(ss, "ss.pkl")
joblib.dump(le, "le.pkl")
# Column order the scaler and model were fitted on; inputs must follow it.
X_unscaled.columns
!ls -l *.pkl
# Load the saved artifacts back; `ss` and `le` are rebound to the loaded copies.
model = joblib.load("model.pkl")
ss = joblib.load("ss.pkl")
le = joblib.load("le.pkl")
# take 100 random rows (no random_state is given, so the sample differs per run)
test_data = X_unscaled.sample(100)
test_data.head()
def predict(data):
    """Predict interest-level labels for rows of unscaled features.

    `data` is scaled with the loaded StandardScaler (`ss`), classified with
    the loaded model (`model`), and the resulting class ids are converted
    back to their original labels with the loaded LabelEncoder (`le`).
    The raw class ids are printed before the labels are returned.
    """
    encoded = model.predict(ss.transform(data))
    print(encoded)
    return le.inverse_transform(encoded)
# Try the loaded model + scaler + encoder on the sampled rows.
predict(test_data)
# Bundle scaling and the classifier into a single estimator so that raw,
# unscaled features can be passed straight to fit/predict.
from sklearn.pipeline import Pipeline
pipe = Pipeline([
('scale', StandardScaler()),
('clf', RandomForestClassifier(max_depth=15, random_state=42, n_estimators=100))
])
# Fitted on the encoded target `y`, so predictions are integer class ids.
pipe.fit(X_unscaled, y)
pipe.predict(test_data)
The pipeline can be serialized as a file.
# Persist the whole fitted pipeline (scaler + classifier) as a single file.
joblib.dump(pipe, "pipe.pkl")
# Peek at the first row to see which input values the model expects.
# `.ix` was removed in pandas 1.0; the index was reset to 0..n-1 earlier,
# so the label-based `.loc[0]` selects the same row.
X_unscaled.loc[0]
Install firefly using:
pip install firefly-python
%%file predict.py
import joblib
import numpy as np
# Load the fitted pipeline once at import time so every predict() call reuses it.
# NOTE: joblib.load unpickles the file; only load files from a trusted source.
pipe = joblib.load("pipe.pkl")
def predict(bathrooms, bedrooms, lat, lon, nphotos, nfeatures, price):
    """Predict the interest from potential customers given the features of a house.

    The arguments describe a single listing. They are assembled in the column
    order the pipeline was trained on: bathrooms, bedrooms, latitude,
    longitude, number of photos, number of features, log of the price.

    Args:
        bathrooms: number of bathrooms.
        bedrooms: number of bedrooms.
        lat: latitude of the listing.
        lon: longitude of the listing.
        nphotos: number of photos in the listing.
        nfeatures: number of features in the listing.
        price: raw (not log-transformed) price; must be positive.

    Returns:
        The predicted interest level: "high", "low" or "medium".

    Raises:
        ValueError: if ``price`` is not positive (its logarithm is undefined).
    """
    if price <= 0:
        raise ValueError("price must be a positive number")
    price_log = np.log(price)
    row = [bathrooms, bedrooms, lat, lon, nphotos, nfeatures, price_log]
    dataset = [row]
    y = pipe.predict(dataset)
    # The pipeline was fitted on LabelEncoder output, and LabelEncoder numbers
    # its classes in sorted label order: 0 -> "high", 1 -> "low", 2 -> "medium".
    classes = ["high", "low", "medium"]
    return classes[y[0]]
if __name__ == "__main__":
    # Smoke test: predict the interest level for one sample listing.
    sample_listing = {
        "bathrooms": 2,
        "bedrooms": 3,
        "lat": 40,
        "lon": -73,
        "nphotos": 5,
        "nfeatures": 0,
        "price": 3000,
    }
    print(predict(**sample_listing))
!python predict.py
To run the predict function as an API, run the following in your command-line / terminal:
$ firefly predict.predict
http://127.0.0.1:8000/
firefly is called with modulename.funcname as argument and it starts running an API.
import firefly
# Connect to the firefly server started above (it must already be running).
api = firefly.Client("http://127.0.0.1:8000/")
# Call the remote predict function with the same keyword arguments as the
# local example in predict.py.
api.predict(bathrooms=2, bedrooms=3,
lat=40, lon=-73,
nphotos=5, nfeatures=0,
price=3000)