Practical Machine Learning

04 - Ensemble

Notes: https://notes.pipal.in/2018/vmware-ml2/

1 - Introduction | 2 - Freight Optimization | 3 - Cars | 4 - Ensemble

import pandas as pd
import numpy as np

# Load the cars data set and separate the predictors from the target.
df = pd.read_csv("https://notes.pipal.in/2018/vmware-ml2/cars_small.csv")

# Work on a copy so that later edits to X leave df untouched.
X = df.loc[:, ["kmpl", "bhp", "type"]].copy()
y = df["price"]

# Encode the categorical "type" feature of X as integer codes.
# Note: this transforms a predictor column, not the target y. The encoder
# keeps its original name (labelY) so code that refers to it still works.
from sklearn.preprocessing import LabelEncoder
labelY = LabelEncoder()
# Select the column by name rather than by position so the encoding does not
# silently hit the wrong column if the feature list is reordered.
X["type"] = labelY.fit_transform(X["type"])

Decision Tree

# Fit a depth-limited regression tree on the full data set
from sklearn.tree import DecisionTreeRegressor

modelTree = DecisionTreeRegressor(max_depth=5)
modelTree.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

from sklearn.metrics import mean_squared_error

# Predict on the training data first: y_pred must exist before it is scored,
# otherwise running the cells top to bottom fails with a NameError.
y_pred = modelTree.predict(X)

# Negated so the value is on the same scale as the "neg_mean_squared_error" scorer.
-1 * mean_squared_error(y, y_pred)

-542.23044217687072

Manually derive cross validation

from sklearn.model_selection import cross_validate

# 6-fold cross validation, reporting scores on both the training and held-out folds.
cross_validate(
    modelTree,
    X,
    y,
    cv=6,
    scoring="neg_mean_squared_error",
    return_train_score=True,
)

{'fit_time': array([ 0.00307202,  0.00298691,  0.00282598,  0.00216413,  0.00273514,
         0.00200081]),
 'score_time': array([ 0.00103879,  0.000983  ,  0.00080395,  0.00074577,  0.00082469,
         0.00064111]),
 'test_score': array([ -6269.28539683,  -8995.6431746 , -10935.5       ,  -1947.03571429,
        -14307.71428571, -16148.36111111]),
 'train_score': array([-349.55095238, -266.22761905, -267.14428571, -210.03809524,
        -381.40142857, -553.04761905])}

# Validate the tree: negative MSE on each of the 6 folds, then the average
from sklearn.model_selection import cross_val_score

score = cross_val_score(
    modelTree, X, y, scoring="neg_mean_squared_error", cv=6
)
score.mean()

-8202.5806878306885

# Per-fold scores returned by cross_val_score (negative MSE, one value per fold)
score

array([ -6519.14253968,  -8995.6431746 , -17594.5       ,  -1947.03571429,
       -13979.85714286, -16148.36111111])

# Recompute the training MSE by hand to cross-check sklearn's metric.
y_pred = modelTree.predict(X)

output = pd.DataFrame({"actual": y, "predicted": y_pred})

output["error"] = output["predicted"] - output["actual"]
output["squared_error"] = output["error"] * output["error"]

# Stored under its own name: binding the result to `mean_squared_error`
# would replace the sklearn function of that name imported earlier, and any
# later call to it would fail because a float is not callable.
manual_mse = output["squared_error"].mean()
manual_mse

542.2304421768708

Random Forest

# Build a forest of 50 depth-limited trees and fit it on the full data set
from sklearn.ensemble import RandomForestRegressor

modelForest = RandomForestRegressor(n_estimators=50, max_depth=5)
modelForest.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

# Validate the forest: negative MSE on each of the 6 folds, then the average
from sklearn.model_selection import cross_val_score

score = cross_val_score(
    modelForest, X, y, scoring="neg_mean_squared_error", cv=6
)
score.mean()

-5976.1764531801564

# Relative importance of each feature, one value per column of X
modelForest.feature_importances_

array([ 0.25064836,  0.59346933,  0.15588231])