In [2]:
import pandas as pd
import numpy as np
In [3]:
# Location of the cars dataset loaded by this notebook.
CARS_CSV_URL = "https://notes.pipal.in/2018/vmware-ml2/cars_small.csv"

# Download the CSV and parse it into a DataFrame.
df = pd.read_csv(CARS_CSV_URL)
In [28]:
# Feature matrix: the kmpl, bhp and type columns, copied so that later column
# assignments on X do not write back into df.
feature_columns = ["kmpl", "bhp", "type"]
X = df[feature_columns].copy()

# Target: the price column.
y = df["price"]
In [29]:
# Encode the categorical "type" feature as integer codes. Note that this
# transforms a column of X (a feature), not the target y.
from sklearn.preprocessing import LabelEncoder
labelY = LabelEncoder()
# The column is selected by name rather than by position, so the cell keeps
# working if the column order of X changes; fit_transform learns the label
# set and encodes the column in one step.
X["type"] = labelY.fit_transform(X["type"])
# NOTE(review): the integer codes give "type" an ordering the categories may
# not have; confirm this is acceptable for the models fitted below.

Decision Tree

In [69]:
# Model creation: a regression tree limited to depth 5, fitted on all of X / y.
from sklearn.tree import DecisionTreeRegressor

modelTree = DecisionTreeRegressor(
    max_depth=5,
)
# NOTE(review): random_state is left unset. The two cross-validation outputs
# further down differ on some folds, presumably for that reason; consider
# fixing a seed if reproducible scores are needed.
modelTree.fit(X, y)
Out[69]:
DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
In [73]:
from sklearn.metrics import mean_squared_error
In [77]:
# Negated training-set MSE of the fitted tree, using the same sign convention
# as the "neg_mean_squared_error" scorer. The predictions are computed here
# rather than read from `y_pred`, which is only assigned in a later cell, so
# this cell also works when the notebook is run from top to bottom.
-1 * mean_squared_error(y, modelTree.predict(X))
Out[77]:
-542.23044217687072

Cross validation and manual derivation of the mean squared error

In [122]:
from sklearn.model_selection import cross_validate

# 6-fold cross validation of the tree, scored by negated MSE. Train scores
# are requested as well, so each fold's training-part score appears next to
# its held-out score.
cross_validate(
    modelTree,
    X,
    y,
    cv=6,
    scoring="neg_mean_squared_error",
    return_train_score=True,
)
Out[122]:
{'fit_time': array([ 0.00307202,  0.00298691,  0.00282598,  0.00216413,  0.00273514,
         0.00200081]),
 'score_time': array([ 0.00103879,  0.000983  ,  0.00080395,  0.00074577,  0.00082469,
         0.00064111]),
 'test_score': array([ -6269.28539683,  -8995.6431746 , -10935.5       ,  -1947.03571429,
        -14307.71428571, -16148.36111111]),
 'train_score': array([-349.55095238, -266.22761905, -267.14428571, -210.03809524,
        -381.40142857, -553.04761905])}
In [114]:
# Model validation: negated MSE of the tree on each of 6 folds, then averaged.
from sklearn.model_selection import cross_val_score

score = cross_val_score(
    modelTree,
    X,
    y,
    scoring="neg_mean_squared_error",
    cv=6,
)
score.mean()
Out[114]:
-8202.5806878306885
In [111]:
# Display the individual per-fold scores currently stored in `score`.
score
Out[111]:
array([ -6519.14253968,  -8995.6431746 , -17594.5       ,  -1947.03571429,
       -13979.85714286, -16148.36111111])
In [40]:
# Predict on the same X the tree was fitted on, so these are training-set
# predictions rather than held-out ones.
y_pred = modelTree.predict(X)
In [50]:
# Put the actual prices and the tree's predictions side by side.
output = pd.DataFrame(
    {
        "actual": y,
        "predicted": y_pred,
    }
)
In [58]:
# Per-row residual (prediction minus actual) and its square.
output["error"] = output["predicted"].sub(output["actual"])
output["squared_error"] = output["error"].mul(output["error"])
In [61]:
# Training-set MSE derived by hand: the mean of the squared residuals.
# It is stored under its own name because assigning it to
# `mean_squared_error` would replace the sklearn function of that name
# imported in this notebook, and any later call to that function would then
# fail with a TypeError.
manual_mse = output.squared_error.mean()
manual_mse
Out[61]:
542.2304421768708

Random Forest

In [33]:
# Model creation: a forest of 50 regression trees, each limited to depth 5,
# fitted on all of X / y.
from sklearn.ensemble import RandomForestRegressor

modelForest = RandomForestRegressor(
    n_estimators=50,
    max_depth=5,
)
# NOTE(review): random_state is left unset, so the fitted forest and every
# score derived from it can presumably change between runs; consider fixing a
# seed if reproducible numbers are needed.
modelForest.fit(X, y)
Out[33]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
In [34]:
# Model validation: negated MSE of the forest on each of 6 folds, then averaged.
from sklearn.model_selection import cross_val_score

score = cross_val_score(
    modelForest,
    X,
    y,
    scoring="neg_mean_squared_error",
    cv=6,
)
score.mean()
Out[34]:
-5976.1764531801564
In [35]:
# Importance weights of the fitted forest, one value per feature; presumably
# in the column order of X (kmpl, bhp, type) — confirm against X.columns.
modelForest.feature_importances_
Out[35]:
array([ 0.25064836,  0.59346933,  0.15588231])