Practical Machine Learning - Day 3 - Housing

VMware Bangalore
June 18-20, 2018

Amit kapoor • Anand Chitipothu • Bargava Subramanian

Notes of this workshop are available online at: https://bit.ly/vmware-ml

Home | Day 1 | Day 2 - Iris | Day 2 - Housing | Day 3

Housing - Classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.subplot.bottom'] = 0.15
%matplotlib inline

ACQUIRE

In [20]:
# load data
url = "https://notes.pipal.in/2018/vmware-ml/rent-data.json.zip"
df = pd.read_json(url)

REFINE

In [22]:
# Removed Outlier
df.drop(df[df.price > 200000].index, inplace=True)
df.drop(df[df.bathrooms > 7].index, inplace=True)

TRANSFORM

In [23]:
# Shape
df["price_log"] = np.log(df.price)
# Feature Creation
df["photos_num"] = df.photos.apply(len)
df["features_num"] = df.features.apply(len)
In [42]:
len(df.loc[0, "photos"])
Out[42]:
5
In [37]:
df.reset_index(inplace = True)
In [38]:
data = df[["bathrooms", "bedrooms", "latitude", "longitude", "created", \
         "photos_num", "features_num", "price", "price_log", "interest_level"]]
In [39]:
data.head()
Out[39]:
bathrooms bedrooms latitude longitude created photos_num features_num price price_log interest_level
0 1.5 3 40.7145 -73.9425 2016-06-24 07:54:24 5 0 3000 8.006368 medium
1 1.0 2 40.7947 -73.9667 2016-06-12 12:19:27 11 5 5465 8.606119 low
2 1.0 1 40.7388 -74.0018 2016-04-17 03:26:41 8 4 2850 7.955074 high
3 1.0 1 40.7539 -73.9677 2016-04-18 02:22:02 3 2 3275 8.094073 low
4 1.0 4 40.8241 -73.9493 2016-04-28 01:32:41 3 1 3350 8.116716 low
In [32]:
data.to_csv("data-interactive.csv", index=False)
In [43]:
# Encode interest level
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(df.interest_level)
interest_level_encoded = le.transform(df.interest_level)
In [48]:
y = interest_level_encoded
X_unscaled = data[["bathrooms", "bedrooms", "latitude", "longitude", \
                         "photos_num", "features_num", "price_log"]] 
In [51]:
# Scale my date using Standard Scaler
from sklearn.preprocessing import StandardScaler
ss = StandardScaler().fit(X_unscaled)
X = ss.transform(X_unscaled)

MODEL

In [99]:
# Model Creation - Decision Tree
from sklearn.tree import DecisionTreeClassifier
model_dt = DecisionTreeClassifier(max_depth=6, random_state=42)
model_dt.fit(X,y)
Out[99]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
In [100]:
# Model Validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model_dt, X, y, scoring="neg_log_loss", cv=5, n_jobs=-1)
np.mean(scores)
Out[100]:
-0.6966771036075812
In [101]:
# Model Tuning (HyperParameter Tuning)
from sklearn.model_selection import GridSearchCV
parameters = {'max_depth':[2, 5, 6, 7, 8, 15]}
clf2 = GridSearchCV(model_dt, parameters, scoring="neg_log_loss", return_train_score=True)
clf2.fit(X, y)
clf2.best_params_
Out[101]:
{'max_depth': 6}

RandomForest

In [111]:
# Model Creation - Random Forest 
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(max_depth=15, random_state=42, n_estimators=100)
model_rf.fit(X,y)
Out[111]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
In [112]:
# Model Validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model_rf, X, y, scoring="neg_log_loss", cv=5, n_jobs=-1)
np.mean(scores)
Out[112]:
-0.6271560455278088
In [110]:
# Model Tuning (HyperParameter Tuning)
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators':[10, 50, 100], 'max_depth':[2, 5, 6, 7, 8, 9, 10, 15]}
clf2 = GridSearchCV(model_rf, parameters, scoring="neg_log_loss", return_train_score=True)
clf2.fit(X, y)
clf2.best_params_
Out[110]:
{'max_depth': 15, 'n_estimators': 100}
In [113]:
model_rf.feature_importances_
Out[113]:
array([0.02637304, 0.09473909, 0.16961835, 0.16825723, 0.13159474,
       0.11033417, 0.29908338])
In [116]:
X_unscaled.columns
Out[116]:
Index(['bathrooms', 'bedrooms', 'latitude', 'longitude', 'photos_num',
       'features_num', 'price_log'],
      dtype='object')

Model Serialization

To use this model anywhere else, we need to save the following things:

  • model_rf - the RandomForest model
  • ss - the StandardScaler transformer used when training the model
  • le - the LabelEncoder used to encode the input data before training
In [117]:
import joblib
In [118]:
joblib.dump(model_rf, "model.pkl")
Out[118]:
['model.pkl']
In [119]:
joblib.dump(ss, "ss.pkl")
Out[119]:
['ss.pkl']
In [120]:
joblib.dump(le, "le.pkl")
Out[120]:
['le.pkl']
In [121]:
X_unscaled.columns
Out[121]:
Index(['bathrooms', 'bedrooms', 'latitude', 'longitude', 'photos_num',
       'features_num', 'price_log'],
      dtype='object')
In [123]:
!ls -l *.pkl
-rw-r--r--@ 1 amitkaps  staff       552 Jun 20 14:47 le.pkl
-rw-r--r--@ 1 amitkaps  staff  43275898 Jun 20 14:47 model.pkl
-rw-r--r--@ 1 amitkaps  staff       650 Jun 20 14:47 ss.pkl

Loading the model

In [125]:
model = joblib.load("model.pkl")
ss = joblib.load("ss.pkl")
le = joblib.load("le.pkl")
In [127]:
# take 100 random rows
test_data = X_unscaled.sample(100)
In [129]:
test_data.head()
Out[129]:
bathrooms bedrooms latitude longitude photos_num features_num price_log
29144 1.0 1 40.7832 -73.9520 4 3 7.729735
5034 1.0 0 40.7775 -73.9784 3 3 7.673223
8325 1.0 2 40.7756 -73.9829 7 3 7.935587
24553 1.0 1 40.7242 -73.9946 3 1 8.124151
44732 1.0 0 40.6782 -73.9750 1 3 7.783224
In [145]:
def predict(data):
    data_scaled = ss.transform(data)
    y = model.predict(data_scaled)
    print(y)
    y2 = le.inverse_transform(y)
    return y2
In [146]:
predict(test_data)
[1 1 1 1 1 1 1 1 1 1 2 2 1 2 2 2 1 1 2 1 1 1 1 1 2 1 1 1 1 2 2 2 1 1 2 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1
 0 2 1 1 1 1 2 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1]
/Users/amitkaps/miniconda3/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
Out[146]:
array(['low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
       'low', 'medium', 'medium', 'low', 'medium', 'medium', 'medium',
       'low', 'low', 'medium', 'low', 'low', 'low', 'low', 'low',
       'medium', 'low', 'low', 'low', 'low', 'medium', 'medium', 'medium',
       'low', 'low', 'medium', 'low', 'low', 'low', 'low', 'low', 'low',
       'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
       'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
       'low', 'low', 'low', 'low', 'low', 'low', 'low', 'medium',
       'medium', 'low', 'low', 'low', 'low', 'low', 'low', 'high',
       'medium', 'low', 'low', 'low', 'low', 'medium', 'low', 'low',
       'low', 'low', 'medium', 'medium', 'low', 'low', 'low', 'low',
       'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low'],
      dtype=object)

Build a pipeline

In [147]:
from sklearn.pipeline import Pipeline
In [148]:
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('clf', RandomForestClassifier(max_depth=15, random_state=42, n_estimators=100))
])
In [150]:
pipe.fit(X_unscaled, y)
Out[150]:
Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])
In [151]:
pipe.predict(test_data)
Out[151]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 1, 1,
       1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 1, 1, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

The pipeline can be serialized as a file.

In [152]:
joblib.dump(pipe, "pipe.pkl")
Out[152]:
['pipe.pkl']
In [154]:
X_unscaled.ix[0]
/Users/amitkaps/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
Out[154]:
bathrooms        1.500000
bedrooms         3.000000
latitude        40.714500
longitude      -73.942500
photos_num       5.000000
features_num     0.000000
price_log        8.006368
Name: 0, dtype: float64

Deploying Model as an API

Install firefly using:

pip install firefly-python
In [160]:
%%file predict.py
import joblib
import numpy as np

pipe = joblib.load("pipe.pkl")

def predict(bathrooms, bedrooms, lat, lon, nphotos, nfeatures, price):
    """Predict the interest from the protential customers given the features of the house.
    
    This expects the house to have the following values as a list.    
    """
    price_log = np.log(price)
    row = [bathrooms, bedrooms, lat, lon, nphotos, nfeatures, price_log]
    dataset = [row]
    y = pipe.predict(dataset)
    classes = ["low", "medium", "high"]
    return classes[y[0]]

if __name__ == "__main__":
    print(predict(bathrooms=2, bedrooms=3, 
                  lat=40, lon=-73, 
                  nphotos=5, nfeatures=0, 
                  price=3000))
Overwriting predict.py
In [161]:
!python predict.py
medium

To run the predict function as an API, run the following in your command-line / terminal:

    $ firefly predict.predict
    http://127.0.0.1:8000/

firefly is called with modulename.funcname as argument and it starts running an API.

In [162]:
import firefly
In [163]:
api = firefly.Client("http://127.0.0.1:8000/")
In [164]:
api.predict(bathrooms=2, bedrooms=3, 
                  lat=40, lon=-73, 
                  nphotos=5, nfeatures=0, 
                  price=3000)
Out[164]:
'medium'