Practical Machine Learning

03 - Cars

Notes: https://notes.pipal.in/2018/vmware-ml2/

1 - Introduction | Freight Optimization | Cars

Frame

  • Predict whether a car is a Hatchback or a Sedan.
- y: type
- X: price, kmpl, bhp, brand

Acquire

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
In [2]:
df = pd.read_csv("https://notes.pipal.in/2018/vmware-ml2/cars_small.csv")
In [3]:
df.head()
Out[3]:
brand model price kmpl bhp type
0 Chevrolet Beat 421 18.6 79 Hatchback
1 Chevrolet Sail 551 18.2 82 Sedan
2 Chevrolet Sail Hatchback 468 18.2 82 Hatchback
3 Chevrolet Spark 345 16.2 62 Hatchback
4 Fiat Linea Classic 612 14.9 89 Sedan

Transform (Pre-Processing)

Preprocessing y

In [4]:
from sklearn.preprocessing import LabelEncoder
In [5]:
# Instantiate the encoder
le = LabelEncoder()
In [6]:
# Fit the encoder
le.fit(df.type)
Out[6]:
LabelEncoder()
In [7]:
le.classes_
Out[7]:
array(['Hatchback', 'Sedan'], dtype=object)
In [8]:
# Transform the data
y = le.transform(df.type)

Preprocessing X

In [9]:
from sklearn.preprocessing import StandardScaler
In [10]:
# Instantiate a scaler
sc = StandardScaler()
In [11]:
sc.fit(df[["price", "kmpl"]])
Out[11]:
StandardScaler(copy=True, with_mean=True, with_std=True)
In [12]:
X = sc.transform(df[["price", "kmpl"]])

Explore

In [13]:
df.plot(kind="scatter", x = "kmpl", y = "price", c=y, cmap="viridis");
In [14]:
plt.scatter(x = X[:,0], y = X[:,1], c=y, cmap="plasma");
In [15]:
plt.scatter(y = y,x  = X[:,0])
Out[15]:
<matplotlib.collections.PathCollection at 0x10cbfa5f8>

Model

In [16]:
from sklearn.tree import DecisionTreeClassifier
In [17]:
# Instantiate a model (max_depth=1 limits the tree to a single split)
tree = DecisionTreeClassifier(max_depth=1)
In [18]:
# Fit the model
tree.fit(X,y)
Out[18]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
In [19]:
#!pip install modelvis
In [20]:
import modelvis
In [21]:
modelvis.plot_decision_boundaries(tree, pd.DataFrame(X), y, show_input=True, probability=True)
In [22]:
modelvis.render_tree(tree)
Out[22]:
Tree 0 X 0 ≤ -0.157 gini = 0.49 samples = 42 value = [24, 18] 1 gini = 0.0 samples = 19 value = [19, 0] 0->1 True 2 gini = 0.34 samples = 23 value = [5, 18] 0->2 False
In [23]:
tree.predict(X)
Out[23]:
array([0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1])
In [24]:
tree.predict_proba(X)
Out[24]:
array([[ 1.       ,  0.       ],
       [ 0.2173913,  0.7826087],
       [ 1.       ,  0.       ],
       [ 1.       ,  0.       ],
       [ 0.2173913,  0.7826087],
       [ 0.2173913,  0.7826087],
       [ 0.2173913,  0.7826087],
       [ 0.2173913,  0.7826087],
       [ 1.       ,  0.       ],
       [ 0.2173913,  0.7826087],
       [ 1.       ,  0.       ],
       [ 1.       ,  0.       ],
       [ 1.       ,  0.       ],
       [ 0.2173913,  0.7826087],
       [ 0.2173913,  0.7826087],
       [ 0.2173913,  0.7826087],
       [ 1.       ,  0.       ],
       [ 1.       ,  0.       ],
       [ 1.       ,  0.       ],
       [ 0.2173913,  0.7826087],
       [ 1.       ,  0.       ],
       [ 1.       ,  0.       ],
       [ 1.       ,  0.       ],
       [ 0.2173913,  0.7826087],
       [ 0.2173913,  0.7826087],
       [ 1.       ,  0.       ],
       [ 1.       ,  0.       ],
       [ 1.       ,  0.       ],
       [ 0.2173913,  0.7826087],
       [ 1.       ,  0.       ],
       [ 0.2173913,  0.7826087],
       [ 0.2173913,  0.7826087],
       [ 0.2173913,  0.7826087],
       [ 0.2173913,  0.7826087],
       [ 0.2173913,  0.7826087],
       [ 1.       ,  0.       ],
       [ 0.2173913,  0.7826087],
       [ 0.2173913,  0.7826087],
       [ 0.2173913,  0.7826087],
       [ 0.2173913,  0.7826087],
       [ 1.       ,  0.       ],
       [ 0.2173913,  0.7826087]])
In [26]:
print(modelvis.render_tree_as_code(tree))
def predict(row):
    """Your decision-tree model wrote this code."""
    # 42 samples; value=[24, 18]; class=0
    if row[0] < -0.1565473973751068:
        # 19 samples; value=[19, 0]; class=0
        return 0
    else:
        # 23 samples; value=[5, 18]; class=1
        return 1

Full Model

In [28]:
# Inputs for the full model: the three numeric columns; target: the car type label
X_raw = df[["price", "kmpl", "bhp"]]
y_raw = df["type"]
In [37]:
# Standardise the raw features in one step; the fitted scaler stays in `scaleX`
from sklearn.preprocessing import StandardScaler
scaleX = StandardScaler()
X = scaleX.fit_transform(X_raw)
In [36]:
# Encode the string class labels as integers; the fitted encoder stays in `labelY`
from sklearn.preprocessing import LabelEncoder
labelY = LabelEncoder()
y = labelY.fit_transform(y_raw)
In [38]:
# Model Creation: fit a decision tree with default hyper-parameters
from sklearn.tree import DecisionTreeClassifier
modelTree = DecisionTreeClassifier().fit(X, y)
modelTree
Out[38]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
In [45]:
# Model Validation: mean accuracy over 6 cross-validation folds
from sklearn.model_selection import cross_val_score
score = cross_val_score(modelTree, X, y, scoring="accuracy", cv=6)
score.mean()
Out[45]:
0.80952380952380965
In [44]:
# Model Tuning: grid-search the tree depth over 1..7
from sklearn.model_selection import GridSearchCV
parameters = {"max_depth": list(range(1, 8))}
clf = GridSearchCV(modelTree, parameters, return_train_score=True).fit(X, y)
clf.best_params_
Out[44]:
{'max_depth': 1}
In [48]:
model = clf.best_estimator_
model
Out[48]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

Saving the Model

In [46]:
import joblib
In [59]:
# Persist the tuned model together with the fitted scaler and label encoder:
# all three are needed to turn raw inputs into a decoded prediction.
joblib.dump(model, "model.pkl")
joblib.dump(scaleX, "scalex.pkl")
joblib.dump(labelY, "labely.pkl")
Out[59]:
['labely.pkl']

The model, together with the scaler and label encoder, can now be loaded from the saved .pkl files at any time.

In [60]:
# Reload the three saved artefacts from disk.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
model2 = joblib.load("model.pkl")
scaleX2 = joblib.load("scalex.pkl")
labely2 = joblib.load("labely.pkl")
In [53]:
model2.predict(X[:10])
Out[53]:
array([0, 1, 0, 0, 1, 1, 1, 1, 0, 1])
In [55]:
df.head()
Out[55]:
brand model price kmpl bhp type
0 Chevrolet Beat 421 18.6 79 Hatchback
1 Chevrolet Sail 551 18.2 82 Sedan
2 Chevrolet Sail Hatchback 468 18.2 82 Hatchback
3 Chevrolet Spark 345 16.2 62 Hatchback
4 Fiat Linea Classic 612 14.9 89 Sedan
In [70]:
def predict(price, kmpl, bhp):
    """Predict the type label of a single car from price, kmpl and bhp.

    The three values are scaled with ``scaleX2``, classified with
    ``model2`` and the predicted class is decoded with ``labely2``.

    Returns the decoded label of the one prediction.
    """
    # One row, columns in the order price, kmpl, bhp
    features = [[price, kmpl, bhp]]
    scaled = scaleX2.transform(features)
    encoded = model2.predict(scaled)
    labels = labely2.inverse_transform(encoded)
    return labels[0]
In [72]:
predict(550, 18.2, 82)
Out[72]:
'Sedan'

Running Model as API

Install firefly using:

pip install firefly-python

Then, with the predict function saved in a file named cars.py, serve it as an API using:

firefly cars.predict
In [ ]:
!pip install firefly-python
In [78]:
# Client for the firefly server expected at 127.0.0.1:8000
# (start it first with `firefly cars.predict`).
import firefly
cars_api = firefly.Client("http://127.0.0.1:8000/")
In [79]:
cars_api.predict(price=550, kmpl=18.2, bhp=82)
Out[79]:
'Sedan'
In [80]:
help(cars_api.predict)
Help on function predict in module firefly.client:

predict(*args, **kwargs)
    Predicts the model of a car using its price, milage and horse power.

In [81]:
# NOTE(review): `square` is not defined anywhere in this notebook; presumably the
# server was started with an additional function exposed — confirm.
cars_api.square(x=10)
Out[81]:
100