VMware Bangalore
June 18-20, 2018
Amit Kapoor • Anand Chitipothu • Bargava Subramanian
Notes of this workshop are available online at: https://bit.ly/vmware-ml
Home | Day 1 | Day 2 - Iris | Day 2 - Housing | Day 3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.subplot.bottom'] = 0.15
%matplotlib inline
Find out whether a retail listing is going to generate interest from potential customers.
# Location of the zipped JSON dataset used in this notebook.
url = "https://notes.pipal.in/2018/vmware-ml/rent-data.json.zip"
# Alternative URL without the .zip suffix (presumably the uncompressed copy).
# url = "https://notes.pipal.in/2018/vmware-ml/rent-data.json"
# Read the JSON data into a DataFrame.
df = pd.read_json(url)
# First look at the data: column names, shape, per-column dtypes, first rows.
df.columns
df.shape
df.dtypes
df.head()
Prediction Variable: interest_level
# Count how many listings fall into each interest_level value.
df.interest_level.value_counts()
What are my Y feature set
# List the available columns.
df.columns
# What are the unique values in the column
df.bathrooms.unique()
# Is the type of bathrooms correct?
df.bathrooms.dtypes
# Are there any null values
df.bathrooms.isnull().sum()
# Are there any outliers in this data
df.bathrooms.hist()
# Frequency of each bathrooms value.
df.bathrooms.value_counts(sort=True)
# Are there any null values
df.bedrooms.isnull().sum()
# Frequency of each bedrooms value.
df.bedrooms.value_counts(sort=True)
# Cross Tabulation
pd.crosstab(df.bedrooms, df.bathrooms)
Identification
Redressal system for Outliers & Null Values
# Look at the listings that report exactly 10 bathrooms.
df.loc[df["bathrooms"] == 10]
## Check the dtypes of all columns
df.dtypes
# Fix the dtypes: parse the "created" column as datetimes.
df["created"] = pd.to_datetime(df["created"])
If you want to visualise missing values, you can also use the missingno library.
# Number of null values in each column.
df.isnull().sum()
## Outlier for Price
df.price.hist()
# Inspect the listings priced above 200000.
df[df.price > 200000]
df.drop?
# Need to run this !
# Remove, in place, every listing priced above 200000 ...
df.drop(index=df.index[df["price"] > 200000], inplace=True)
# ... and then every listing with more than 7 bathrooms.
df.drop(index=df.index[df["bathrooms"] > 7], inplace=True)
# Confirm how many rows remain.
df.shape
Let's look at the possible input features
# Price
df.price.hist()
# Same distribution with a logarithmic x-axis and finer bins.
df.price.plot(kind="hist", logx=True, bins=100)
# Location: scatter of the listings by longitude / latitude.
plt.scatter(df.longitude, df.latitude, alpha=0.1)
# Axis limits are given in ascending order; passing them in descending
# order reverses the axis and mirrors the map east-west.
plt.xlim(-74, -70)
plt.ylim(38, 45)
df.head()
from plotnine import *
# One panel per interest level, zoomed in on a narrower coordinate window.
(
    ggplot(df)
    + aes('longitude', 'latitude', color='interest_level')
    + geom_point(alpha=0.01)
    + xlim(-74.5, -73.5)
    + ylim(40.6, 40.8)
    + facet_wrap("interest_level")
)
# Convert price to log numbers
df["priceLog"] = np.log(df.price)
df.columns
# Input features used by the models below.
X = df[["bathrooms", "bedrooms", "latitude", "longitude", "priceLog"]]
X.head()
# Prediction target.
y = df['interest_level']
y.head()
type(y)
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the target labels; fit() returns the encoder itself.
le_y = LabelEncoder().fit(y)
# Labels the encoder learned.
le_y.classes_
# Integer-encoded version of the target.
y_encoded = le_y.transform(y)
y_encoded
# Show the first five original labels next to their encoded values.
print(
    "Original Values: \n",
    y.head(5),
    "\n\n Encoded values: \n",
    y_encoded[:5],
)
from sklearn.tree import DecisionTreeClassifier
# Baseline model: a decision tree with its depth capped at 2.
model_dt = DecisionTreeClassifier(max_depth=2)
model_dt.fit(X, y_encoded)
#from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
# Accuracy of the model under 5-fold cross-validation (one score per fold).
score = cross_val_score(model_dt, X, y_encoded, scoring="accuracy", cv=5, n_jobs=-1)
# Average the per-fold scores.
np.mean(score)
Problem: Try a different error metric (precision, recall)
Problem: Change model depth and see how the model performance changes
def depthDT(depth):
    """Return the mean 5-fold cross-validated accuracy of a depth-limited tree.

    Reads the notebook-level feature matrix ``X`` and the encoded target
    ``y_encoded``.

    Parameters
    ----------
    depth : int
        Value passed to ``DecisionTreeClassifier(max_depth=...)``.

    Returns
    -------
    float
        Mean of the per-fold accuracy scores.
    """
    model = DecisionTreeClassifier(max_depth=depth)
    # cross_val_score fits its own clone of the estimator on every fold, so
    # a separate fit on the full data set is not needed to obtain the score.
    fold_scores = cross_val_score(
        model, X, y_encoded, scoring="accuracy", cv=5, n_jobs=-1
    )
    return np.mean(fold_scores)
# Evaluate tree depths 1..19 and collect the mean CV accuracy for each.
depths = list(range(1, 20))
scores = [depthDT(depth) for depth in depths]
# Plot against the actual depths; without explicit x-values the axis would
# show 0-based positions, shifting every point by one depth.
plt.plot(depths, scores)