Airwatch Bangalore
May 7-9, 2018
Notes from this workshop are available online at:
https://bit.ly/airwatch-ml
Home | Day 1 | Day 2 - iris | Day 2 - Boston Housing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Apply matplotlib's "ggplot" style sheet to the figures drawn after this point.
plt.style.use("ggplot")
Predict house prices for Boston
from sklearn.datasets import load_boston

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed version still ships it before running this cell.
boston = load_boston()

# Feature matrix and regression target, unpacked in one step.
X, y = boston.data, boston.target
# Uncomment to read the data set's own description:
# print(boston.DESCR)

# Tabular view of the features, labelled with the data set's feature names,
# with the target appended as the last column under the name "MEDV".
df = pd.DataFrame(X, columns=boston.feature_names)
df["MEDV"] = y

# Quick look: first rows, then per-column summary statistics.
df.head()
df.describe()
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Hold out the test rows BEFORE fitting any transformation. Fitting the scaler
# on the full data set would let the test rows' min/max influence the training
# features (data leakage) and make the test score optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Instantiate the transformation
MM = MinMaxScaler()
# Fit the transformation on the training rows only
MM.fit(X_train_raw)
# Apply the training-derived scaling to both splits
X_train = MM.transform(X_train_raw)
X_test = MM.transform(X_test_raw)

# Scaled copy of the full data set, used only for the exploratory plots below.
# Because the scaler saw training rows only, a few values may fall slightly
# outside the [0, 1] range.
X_scaled = MM.transform(X)
df_scaled = pd.DataFrame(X_scaled)
df_scaled.columns = boston.feature_names
df_scaled['MEDV'] = y

# Per-column histograms and pairwise scatter plots; the trailing semicolon
# suppresses the notebook's text output of the returned axes arrays.
df_scaled.hist(figsize=(12, 10), bins=20);
scatter_matrix(df_scaled, figsize=(16, 16));

# Sanity check of the split sizes
X_train.shape, y_train.shape, X_test.shape, y_test.shape
from sklearn.linear_model import LinearRegression

# Create the linear model and fit it on the training split in one step
# (fit() returns the estimator itself, so the calls can be chained).
model_lr = LinearRegression().fit(X_train, y_train)

# Learned parameters: the per-feature coefficients, then the intercept.
model_lr.coef_
model_lr.intercept_

# score() reports R^2: first on the data the model was fitted on,
# then on the held-out test split.
model_lr.score(X_train, y_train)
model_lr.score(X_test, y_test)
from sklearn.linear_model import Ridge, Lasso

# Ridge regression with default settings, fitted on the training split.
model_ridge = Ridge().fit(X_train, y_train)
# (train R^2, test R^2)
(model_ridge.score(X_train, y_train), model_ridge.score(X_test, y_test))
model_ridge.coef_

# Lasso regression with default settings, fitted on the same split.
model_lasso = Lasso().fit(X_train, y_train)
# (train R^2, test R^2)
(model_lasso.score(X_train, y_train), model_lasso.score(X_test, y_test))

# Show the feature names next to the Lasso coefficients so each coefficient
# can be matched to its feature.
boston.feature_names
model_lasso.coef_
# Interaction Feature
from sklearn.datasets import make_circles

# Two noisy concentric circles; y holds the circle label of each sample.
# NOTE: this rebinds X, y and df, replacing the Boston data used above.
X, y = make_circles(n_samples=1000, factor=0.2, noise=0.2)

# One frame with the label first, followed by the two coordinates.
df = pd.DataFrame(np.column_stack((y, X)), columns=["y", "x1", "x2"])
df.head()

# Raw coordinates, coloured by label.
df.plot.scatter(x="x1", y="x2", c="y", cmap="viridis")

# Derived features built from the two coordinates. Column order matters:
# later cells select features by position.
df["x1x2"] = df["x1"] * df["x2"]
df["x1_sqr+x2_sqr"] = df["x1"].pow(2) + df["x2"].pow(2)
df["x1_sqr"] = df["x1"].pow(2)
df["x2_sqr"] = df["x2"].pow(2)
df.head()

# Same samples plotted against the sum-of-squares feature.
df.plot.scatter(x="x2", y="x1_sqr+x2_sqr", c="y", cmap="viridis")
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Features are every column after the first; the label is the first column.
X = df.iloc[:, 1:]
y = df.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# L1-penalised logistic regression. The solver is named explicitly because
# the default solver since scikit-learn 0.22 ("lbfgs") does not support the
# L1 penalty and raises ValueError; "liblinear" does, and was the default in
# older releases, so results there are unchanged.
model_log = LogisticRegression(penalty="l1", solver="liblinear")
model_log.fit(X_train, y_train)

# Feature names alongside the fitted coefficients, to see which features the
# L1 penalty kept.
X.columns
model_log.coef_

# (train accuracy, test accuracy)
model_log.score(X_train, y_train), model_log.score(X_test, y_test)