Practical Machine Learning - Day 3 - Housing¶

VMware Bangalore
June 18-20, 2018

Amit Kapoor • Anand Chitipothu • Bargava Subramanian

Notes of this workshop are available online at: https://bit.ly/vmware-ml

Home | Day 1 | Day 2 - Iris | Day 2 - Housing | Day 3

Housing - Classification¶

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.subplot.bottom'] = 0.15
%matplotlib inline

ACQUIRE¶

# Load data: read the zipped JSON file at the workshop URL into a DataFrame
url = "https://notes.pipal.in/2018/vmware-ml/rent-data.json.zip"
df = pd.read_json(url)

REFINE¶

# Remove outliers: drop rows priced above 200000 or with more than 7 bathrooms
df.drop(index=df.loc[df["price"] > 200000].index, inplace=True)
df.drop(index=df.loc[df["bathrooms"] > 7].index, inplace=True)

TRANSFORM¶

# Shape: add the natural log of the price as a new column
df["price_log"] = np.log(df["price"])
# Feature creation: count the photos and the listed features of each row
df["photos_num"] = df["photos"].map(len)
df["features_num"] = df["features"].map(len)

len(df.loc[0, "photos"])

5

df.reset_index(inplace = True)

# Columns kept for the exported CSV and for building the model inputs
data = df[[
    "bathrooms", "bedrooms", "latitude", "longitude", "created",
    "photos_num", "features_num", "price", "price_log", "interest_level",
]]

data.head()

data.to_csv("data-interactive.csv", index=False)

# Encode interest level as integer class codes.
# As the decoded predictions further down show, the codes come out as
# 0 -> 'high', 1 -> 'low', 2 -> 'medium'.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df.interest_level)
interest_level_encoded = le.transform(df.interest_level)

# Target: the encoded interest level; features: the seven numeric columns
y = interest_level_encoded
X_unscaled = data[[
    "bathrooms", "bedrooms", "latitude", "longitude",
    "photos_num", "features_num", "price_log",
]]

# Scale the data using StandardScaler (fit on the unscaled features, then transform them)
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(X_unscaled)
X = ss.transform(X_unscaled)

MODEL¶

# Model Creation - Decision Tree
from sklearn.tree import DecisionTreeClassifier
model_dt = DecisionTreeClassifier(max_depth=6, random_state=42)
model_dt.fit(X,y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

# Model Validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model_dt, X, y, scoring="neg_log_loss", cv=5, n_jobs=-1)
np.mean(scores)

-0.6966771036075812

# Model Tuning (HyperParameter Tuning)
from sklearn.model_selection import GridSearchCV
parameters = {'max_depth':[2, 5, 6, 7, 8, 15]}
clf2 = GridSearchCV(model_dt, parameters, scoring="neg_log_loss", return_train_score=True)
clf2.fit(X, y)
clf2.best_params_

{'max_depth': 6}

RandomForest¶

# Model Creation - Random Forest 
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(max_depth=15, random_state=42, n_estimators=100)
model_rf.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

# Model Validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model_rf, X, y, scoring="neg_log_loss", cv=5, n_jobs=-1)
np.mean(scores)

-0.6271560455278088

# Model Tuning (HyperParameter Tuning)
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators':[10, 50, 100], 'max_depth':[2, 5, 6, 7, 8, 9, 10, 15]}
clf2 = GridSearchCV(model_rf, parameters, scoring="neg_log_loss", return_train_score=True)
clf2.fit(X, y)
clf2.best_params_

{'max_depth': 15, 'n_estimators': 100}

model_rf.feature_importances_

array([0.02637304, 0.09473909, 0.16961835, 0.16825723, 0.13159474,
       0.11033417, 0.29908338])

X_unscaled.columns

Index(['bathrooms', 'bedrooms', 'latitude', 'longitude', 'photos_num',
       'features_num', 'price_log'],
      dtype='object')

Model Serialization¶

To use this model anywhere else, we need to save the following things:

model_rf - the RandomForest model
ss - the StandardScaler transformer used when training the model
le - the LabelEncoder used to encode the target labels (interest_level) before training

import joblib

joblib.dump(model_rf, "model.pkl")

['model.pkl']

joblib.dump(ss, "ss.pkl")

['ss.pkl']

joblib.dump(le, "le.pkl")

['le.pkl']

X_unscaled.columns

Index(['bathrooms', 'bedrooms', 'latitude', 'longitude', 'photos_num',
       'features_num', 'price_log'],
      dtype='object')

!ls -l *.pkl

-rw-r--r--@ 1 amitkaps  staff       552 Jun 20 14:47 le.pkl
-rw-r--r--@ 1 amitkaps  staff  43275898 Jun 20 14:47 model.pkl
-rw-r--r--@ 1 amitkaps  staff       650 Jun 20 14:47 ss.pkl

Loading the model¶

model = joblib.load("model.pkl")
ss = joblib.load("ss.pkl")
le = joblib.load("le.pkl")

# take 100 random rows
test_data = X_unscaled.sample(100)

test_data.head()

def predict(data):
    """Scale the given feature rows, classify them and return decoded labels.

    Relies on the notebook-level ``ss`` (scaler), ``model`` (classifier) and
    ``le`` (label encoder). The encoded predictions are printed before they
    are converted back to their original label values.
    """
    encoded = model.predict(ss.transform(data))
    print(encoded)
    return le.inverse_transform(encoded)

predict(test_data)

[1 1 1 1 1 1 1 1 1 1 2 2 1 2 2 2 1 1 2 1 1 1 1 1 2 1 1 1 1 2 2 2 1 1 2 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1
 0 2 1 1 1 1 2 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1]

/Users/amitkaps/miniconda3/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:

array(['low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
       'low', 'medium', 'medium', 'low', 'medium', 'medium', 'medium',
       'low', 'low', 'medium', 'low', 'low', 'low', 'low', 'low',
       'medium', 'low', 'low', 'low', 'low', 'medium', 'medium', 'medium',
       'low', 'low', 'medium', 'low', 'low', 'low', 'low', 'low', 'low',
       'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
       'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
       'low', 'low', 'low', 'low', 'low', 'low', 'low', 'medium',
       'medium', 'low', 'low', 'low', 'low', 'low', 'low', 'high',
       'medium', 'low', 'low', 'low', 'low', 'medium', 'low', 'low',
       'low', 'low', 'medium', 'medium', 'low', 'low', 'low', 'low',
       'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low'],
      dtype=object)

Build a pipeline¶

from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('scale', StandardScaler()),
    ('clf', RandomForestClassifier(max_depth=15, random_state=42, n_estimators=100))
])

pipe.fit(X_unscaled, y)

Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

pipe.predict(test_data)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 1, 1,
       1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 1, 1, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

The pipeline can be serialized as a file.

joblib.dump(pipe, "pipe.pkl")

['pipe.pkl']

# `.ix` is deprecated and was removed in pandas 1.0; use label-based `.loc`
# (the index was reset earlier, so label 0 is the first row).
X_unscaled.loc[0]

/Users/amitkaps/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.

bathrooms        1.500000
bedrooms         3.000000
latitude        40.714500
longitude      -73.942500
photos_num       5.000000
features_num     0.000000
price_log        8.006368
Name: 0, dtype: float64

Deploying Model as an API¶

Install firefly using:

pip install firefly-python

%%file predict.py
import joblib
import numpy as np

pipe = joblib.load("pipe.pkl")

def predict(bathrooms, bedrooms, lat, lon, nphotos, nfeatures, price):
    """Predict the interest level of potential customers for one house.

    The arguments are assembled into a single feature row in the order the
    pipeline was trained on: bathrooms, bedrooms, latitude, longitude,
    number of photos, number of features and log(price).

    Returns one of "high", "low" or "medium".
    """
    # The pipeline was trained on log(price), so apply the same transform.
    price_log = np.log(price)
    row = [bathrooms, bedrooms, lat, lon, nphotos, nfeatures, price_log]
    dataset = [row]
    y = pipe.predict(dataset)
    # The pipeline predicts LabelEncoder codes, and LabelEncoder numbers the
    # classes in sorted order: 0 -> "high", 1 -> "low", 2 -> "medium".
    # The previous ["low", "medium", "high"] list returned the wrong label.
    classes = ["high", "low", "medium"]
    return classes[int(y[0])]

if __name__ == "__main__":
    print(predict(bathrooms=2, bedrooms=3, 
                  lat=40, lon=-73, 
                  nphotos=5, nfeatures=0, 
                  price=3000))

Overwriting predict.py

!python predict.py

medium

To run the predict function as an API, run the following in your command-line / terminal:

    $ firefly predict.predict
    http://127.0.0.1:8000/

firefly is called with modulename.funcname as argument and it starts running an API.

import firefly

api = firefly.Client("http://127.0.0.1:8000/")

api.predict(bathrooms=2, bedrooms=3, 
                  lat=40, lon=-73, 
                  nphotos=5, nfeatures=0, 
                  price=3000)

'medium'

	bathrooms	bedrooms	latitude	longitude	created	photos_num	features_num	price	price_log	interest_level
0	1.5	3	40.7145	-73.9425	2016-06-24 07:54:24	5	0	3000	8.006368	medium
1	1.0	2	40.7947	-73.9667	2016-06-12 12:19:27	11	5	5465	8.606119	low
2	1.0	1	40.7388	-74.0018	2016-04-17 03:26:41	8	4	2850	7.955074	high
3	1.0	1	40.7539	-73.9677	2016-04-18 02:22:02	3	2	3275	8.094073	low
4	1.0	4	40.8241	-73.9493	2016-04-28 01:32:41	3	1	3350	8.116716	low

	bathrooms	bedrooms	latitude	longitude	photos_num	features_num	price_log
29144	1.0	1	40.7832	-73.9520	4	3	7.729735
5034	1.0	0	40.7775	-73.9784	3	3	7.673223
8325	1.0	2	40.7756	-73.9829	7	3	7.935587
24553	1.0	1	40.7242	-73.9946	3	1	8.124151
44732	1.0	0	40.6782	-73.9750	1	3	7.783224