Practical Machine Learning - Day 2 - Boston Housing

Airwatch Bangalore
May 7-9, 2018

Notes of this workshop are available online at:
https://bit.ly/airwatch-ml

Home | Day 1 | Day 2 - iris | Day 2 - Boston Housing

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("ggplot")

Frame the Problem

Predict house prices for Boston

Acquire the Data

In [218]:
from sklearn.datasets import load_boston
boston = load_boston()
In [219]:
X = boston.data
y = boston.target
In [220]:
#print(boston.DESCR)

Explore the Data

In [221]:
df = pd.DataFrame(X)
df.columns = boston.feature_names
In [222]:
df["MEDV"] = y
In [223]:
df.head()
Out[223]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 36.2
In [224]:
df.describe()
Out[224]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000
mean 3.593761 11.363636 11.136779 0.069170 0.554695 6.284634 68.574901 3.795043 9.549407 408.237154 18.455534 356.674032 12.653063 22.532806
std 8.596783 23.322453 6.860353 0.253994 0.115878 0.702617 28.148861 2.105710 8.707259 168.537116 2.164946 91.294864 7.141062 9.197104
min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 1.730000 5.000000
25% 0.082045 0.000000 5.190000 0.000000 0.449000 5.885500 45.025000 2.100175 4.000000 279.000000 17.400000 375.377500 6.950000 17.025000
50% 0.256510 0.000000 9.690000 0.000000 0.538000 6.208500 77.500000 3.207450 5.000000 330.000000 19.050000 391.440000 11.360000 21.200000
75% 3.647423 12.500000 18.100000 0.000000 0.624000 6.623500 94.075000 5.188425 24.000000 666.000000 20.200000 396.225000 16.955000 25.000000
max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 37.970000 50.000000

Transform the Data

In [225]:
from sklearn.preprocessing import MinMaxScaler
In [226]:
# Instantiate the transformation
MM = MinMaxScaler()
In [227]:
# Fit the transformation
MM.fit(X)
Out[227]:
MinMaxScaler(copy=True, feature_range=(0, 1))
In [228]:
X_scaled = MM.transform(X)
df_scaled = pd.DataFrame(X_scaled)
df_scaled.columns = boston.feature_names
df_scaled['MEDV'] = y

One Dimensional Plotting

In [229]:
df_scaled.hist(figsize=(12, 10), bins = 20);

Two Dimension Plotting

In [230]:
from pandas.plotting import scatter_matrix
In [231]:
scatter_matrix(df_scaled, figsize =(16,16));

Model the data

In [232]:
from sklearn.model_selection import train_test_split
In [233]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2, random_state=42)
In [234]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape
Out[234]:
((404, 13), (404,), (102, 13), (102,))
In [235]:
from sklearn.linear_model import LinearRegression
In [236]:
# Instantiate the Model
model_lr = LinearRegression()
In [237]:
# Fit the model
model_lr.fit(X_train, y_train)
Out[237]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [238]:
model_lr.coef_
Out[238]:
array([-10.00586239,   3.00810168,   1.11114146,   2.78676719,
        -8.37894847,  23.13315401,  -0.60590223, -15.92884514,
         6.02861725,  -5.57488726,  -8.61414758,   4.93807413,
       -18.45881209])
In [239]:
model_lr.intercept_
Out[239]:
23.647903692379955
In [240]:
model_lr.score(X_train, y_train)
Out[240]:
0.7508837786732915
In [241]:
model_lr.score(X_test, y_test)
Out[241]:
0.6684825753971606

Ridge & Lasso Regression

In [242]:
from sklearn.linear_model import Ridge, Lasso
In [244]:
model_ridge = Ridge()
model_ridge.fit(X_train, y_train)
model_ridge.score(X_train, y_train), model_ridge.score(X_test, y_test)
Out[244]:
(0.746737248539369, 0.6778821616179469)
In [246]:
model_ridge.coef_
Out[246]:
array([ -6.96270806,   2.04087419,  -0.03711462,   3.08262005,
        -6.65592555,  20.84585014,  -0.32401097, -12.52135334,
         4.5067104 ,  -4.34387875,  -8.46991847,   4.76065757,
       -18.06337222])
In [247]:
model_lasso = Lasso()
model_lasso.fit(X_train, y_train)
model_lasso.score(X_train, y_train), model_lasso.score(X_test, y_test)
Out[247]:
(0.24595011626275587, 0.2573921283746299)
In [250]:
boston.feature_names
Out[250]:
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')
In [248]:
model_lasso.coef_
Out[248]:
array([-0.        ,  0.        , -0.        ,  0.        , -0.        ,
        0.        , -0.        ,  0.        , -0.        , -0.85254317,
       -0.        ,  0.        , -8.25432751])

Building a Complex Feature

In [139]:
# Interaction Feature
from sklearn.datasets import make_circles
In [196]:
X, y = make_circles(n_samples=1000, factor=0.2, noise=0.2)
In [197]:
df = pd.DataFrame(np.c_[y,X])
df.columns = ["y", "x1", "x2"]
df.head()
Out[197]:
y x1 x2
0 0.0 0.641605 0.677097
1 1.0 -0.314740 -0.149596
2 0.0 0.458356 0.131196
3 0.0 -0.832329 -0.914584
4 0.0 -1.031308 -0.020552
In [198]:
df.plot(kind="scatter", x="x1", y="x2", c="y", cmap="viridis")
Out[198]:
<matplotlib.axes._subplots.AxesSubplot at 0x118a18748>
In [199]:
df["x1x2"] = df.x1 * df.x2
df["x1_sqr+x2_sqr"] = df.x1**2 + df.x2**2
df["x1_sqr"] = df.x1**2
df["x2_sqr"] = df.x2**2
In [200]:
df.head()
Out[200]:
y x1 x2 x1x2 x1_sqr+x2_sqr x1_sqr x2_sqr
0 0.0 0.641605 0.677097 0.434429 0.870118 0.411657 0.458460
1 1.0 -0.314740 -0.149596 0.047084 0.121440 0.099061 0.022379
2 0.0 0.458356 0.131196 0.060134 0.227303 0.210090 0.017212
3 0.0 -0.832329 -0.914584 0.761235 1.529236 0.692771 0.836465
4 0.0 -1.031308 -0.020552 0.021195 1.064019 1.063596 0.000422
In [212]:
df.plot(kind="scatter", x="x2", y="x1_sqr+x2_sqr", c="y", cmap="viridis")
Out[212]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a5c5668>
In [202]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
In [203]:
X = df.iloc[:,1:]
y = df.iloc[:,0]
In [204]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.2)
In [208]:
model_log = LogisticRegression(penalty="l1")
model_log.fit(X_train,y_train)
Out[208]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
In [211]:
X.columns
Out[211]:
Index(['x1', 'x2', 'x1x2', 'x1_sqr+x2_sqr', 'x1_sqr', 'x2_sqr'], dtype='object')
In [209]:
model_log.coef_
Out[209]:
array([[  0.        ,   0.61996298,   0.        , -11.85251425,
          0.        ,   0.        ]])
In [210]:
model_log.score(X_train,y_train), model_log.score(X_test, y_test)
Out[210]:
(0.9625, 0.975)