Notes: https://notes.pipal.in/2018/vmware-ml2/
1 - Introduction | Freight Optimization
import numpy as np
import pandas as pd
url = "https://notes.pipal.in/2018/vmware-ml2/10-Secondary-Freight-Data.csv"
matrix_url = "https://notes.pipal.in/2018/vmware-ml2/13-location-distance-matrix.csv"
freight = pd.read_csv(url)
freight.head()
freight.dtypes
freight.columns
matrix = pd.read_csv(matrix_url)
matrix.head()
Basic Cleaning:
columns = {
'Sales Order No. New': 'orderno',
'Date': 'date',
'Supplying DC Location': 'source',
'Customer Code New': 'custcode',
'Customer Town': 'dest',
'Qty (in cases)': 'qty',
'Freight/Cartage': 'cartage',
'Remarks': 'remarks',
'Amount': 'amount'
}
freight.rename(columns=columns, inplace=True)
freight.head()
freight.drop(["orderno", "custcode"], axis=1, inplace=True)
freight.head()
freight.amount.dtype
freight.amount.unique()[:20]
freight.dtypes
# remove commas in the numbers
freight.amount.head()
freight.amount.head().str.replace(",", "")
freight.amount = freight.amount.str.replace(",", "")
freight.amount.head()
Problem: Replace commas from cartage column.
freight.cartage = freight.cartage.str.replace(",", "")
# fix column types
freight.amount = pd.to_numeric(freight.amount)
freight.cartage = pd.to_numeric(freight.cartage)
freight.date = pd.to_datetime(freight.date)
freight.head()
# dtypes
freight.dtypes
Q: Is the remarks column required?
Lets see.
freight.remarks.unique()
freight[freight.remarks=='Charges - Per case'].head()
freight[freight.remarks=='Charges - Per Consignment'].head()
len(freight)
freight.isnull().head()
freight.isnull().sum()
freight[freight.source.isnull()]
# remove NaN values
freight.dropna(inplace=True)
shortunits = {
"Charges - Per case": "case",
"Charges - Per Consignment": "consignment"
}
freight.remarks = freight.remarks.replace(shortunits)
freight.remarks.value_counts()
freight.dest.value_counts()
def fix_names(c):
c = c.str.strip()
c = c.str.title()
return c
freight.source = fix_names(freight.source)
freight.dest = fix_names(freight.dest)
freight.head()
Q: Which source has the maximum quantity of transfer?
freight.groupby('source').sum().head()
freight.groupby('source').sum().qty.sort_values(ascending=False).head()
(freight.groupby('source')
.sum()
.qty
.sort_values(ascending=False)
.head()
)
Q: What is the source/destination pair that has the maximum quantity of transfer?
(freight.groupby(['source', 'dest'])
.sum()
.qty
.sort_values(ascending=False)
.head()
)
Q: Show data for Bangalore -> Bangalore.
freight[(freight.source == 'Bangalore') & (freight.dest == 'Bangalore')].head()
freight['computed_amount'] = freight.qty * freight.cartage
freight.head()
Problem: Refine the location-distance-matrix data and merge it with the freight dataframe.
Hint: pd.merge?
matrix = pd.read_csv(matrix_url)
matrix.head()
# fix column names
columns = {
"Source": "source",
"S. Longitude": "slon",
"S. Latitude": "slat",
"Destination": "dest",
"D. Longitude": "dlon",
"D. Latitude": "dlat",
"Lane": "lane",
"Distance": "distance"
}
matrix.rename(columns=columns, inplace=True)
matrix.head()
# missing values
matrix.isnull().sum()
# standadize names
matrix.source = fix_names(matrix.source)
matrix.dest = fix_names(matrix.dest)
matrix.head()
(matrix.source + "--" + matrix.dest).nunique(), len(matrix)
# drop the duplicates
matrix.drop_duplicates(['source', 'dest'], inplace=True)
len(matrix)
df = pd.merge(freight, matrix, how='left', on=['source', 'dest'])
len(freight), len(matrix), len(df)
df.head()
df.isnull().sum()
len(df)
df[df.isnull().sum(axis=1)>0].head()
Q: What is the total amount where distance is missing?
df[df.isnull().sum(axis=1)>0].amount.sum()
df[df.isnull().sum(axis=1)>0].amount.sum() / df.amount.sum()
freight.source.nunique(), matrix.source.nunique(), df.source.nunique()
freight.source.unique()
matrix.source.unique()
set(freight.source.unique()) - set(matrix.source.unique())
set(freight.dest.unique()) - set(matrix.dest.unique())
cities = {
'Ahemadabad': 'Ahmedabad',
'Bbsr': 'Bhubaneshwar',
'Gauwhati': 'Guwahati',
'Zirakhpur': 'Zirakpur'
}
# freight.replace({"source": cities}).head()
freight.replace({"source": cities}, inplace=True)
freight.head()
df = pd.merge(freight, matrix, how='left', on=['source', 'dest'])
df.isnull().sum()
df[df.isnull().sum(axis=1) > 0].head()
df[df.isnull().sum(axis=1) > 0].amount.sum() / df.amount.sum()
data = df.dropna()
data = data[data.remarks == "case"].copy()
#!pip install altair
import altair as alt
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("ggplot")
data.distance.hist(bins=40)
data.cartage.hist(bins=40);
data[data.cartage > 100]
data.plot(kind="scatter", x="distance", y="cartage", alpha=0.05);
data.plot(kind="scatter", x="distance", y="cartage", alpha=0.05, logy=True);
data.loc[data.distance==0, "distance"] = 5
data.plot(kind="scatter", x="distance", y="cartage", alpha=0.05, logy=True, logx=True);
data.plot(kind="scatter",
x="distance",
y="cartage",
c="amount",
alpha=0.05,
logy=True,
logx=True,
cmap='viridis');
data.to_csv("data.csv")
data.head()
Do we need to take date column in modeling?
Let us see if date has any effect on the cartage.
data[(data.source=='Bangalore') & (data.dest == 'Hubli')].describe()
data.groupby(['source', 'dest']).std().cartage.value_counts()
data.groupby(['source', 'dest']).std().cartage.hist();
# roll up
agg_func = {
"cartage": ["mean"],
"distance": ["mean"],
"qty": ["sum"],
"amount": ["sum", "count"]
}
newdata = (data
.groupby(["source", "dest", "slat", "slon", "dlat", "dlon"])
.agg(agg_func)
.reset_index())
newdata.columns = newdata.columns.droplevel(1)
newdata.columns = list(newdata.columns)[:-1] + ['count']
newdata.head()
newdata.plot(kind='scatter', x='distance', y='cartage', alpha=0.3);
newdata.plot(kind='scatter', x='distance', y='cartage', alpha=0.3, logy=True, logx=True);
newdata['distance_log'] = np.log10(newdata.distance)
newdata['cartage_log'] = np.log10(newdata.cartage)
X = newdata[['distance_log']]
y = newdata.cartage_log
X.head()
y.head()
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X, y)
model.intercept_, model.coef_
def show_predicitons(X, y, y_pred):
plt.scatter(X, y)
plt.plot(X, y_pred, color='b')
y_pred = model.predict(X)
show_predicitons(X, y, y_pred)
from sklearn.model_selection import cross_val_score
model = LinearRegression()
score = cross_val_score(model,
X, y,
scoring='neg_mean_squared_error',
cv=5,
n_jobs=-1,)
np.mean(score)
score
Lets build another model with 2 features.
X2 = newdata[['distance_log', 'count']]
model = LinearRegression()
score = cross_val_score(model,
X2, y,
scoring='neg_mean_squared_error',
cv=5,
n_jobs=-1,)
np.mean(score)
newdata['cartage_per_distance'] = newdata['cartage'] / newdata['distance']
newdata['cartage_per_distance_log'] = np.log10(newdata['cartage_per_distance'])
newdata.plot(kind='scatter',
x='distance',
y='cartage_per_distance',
logx=True,
logy=True);
model = LinearRegression()
X = newdata[['distance_log']]
y = newdata['cartage_per_distance_log']
score = cross_val_score(model,
X, y,
scoring='neg_mean_squared_error',
cv=5,
n_jobs=-1,)
np.mean(score)
model.fit(X, y)
y_pred = model.predict(X)
show_predicitons(X, y, y_pred)
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(max_depth=2)
X = newdata[['distance_log']]
y = newdata['cartage_per_distance_log']
tree.fit(X, y)
def show_predicitons(X, y, y_pred):
plt.scatter(X, y, color='r')
plt.scatter(X, y_pred, color='b')
y_pred = tree.predict(X)
show_predicitons(X, y, y_pred)
#!pip install modelvis
import modelvis
modelvis.render_tree(tree, feature_names=['distance_log'])
model.predict([[1.4]])
from sklearn.model_selection import GridSearchCV
tree = DecisionTreeRegressor()
param_grid = {
"max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10]
}
grid = GridSearchCV(tree,
param_grid=param_grid,
cv=10,
scoring="neg_mean_squared_error")
grid.fit(X, y)
grid.best_estimator_
tree = grid.best_estimator_
df2 = newdata.sort_values(by='distance_log')
X = df2[['distance_log']]
y = df2.cartage_per_distance_log
def show_predicitons2(model, X, y):
X = X.as_matrix()
X1 = np.arange(X[0].min(), X[0].max(), 1000)
y1 = model.predict(X1)
def show_predicitons(X, y, y_pred):
plt.scatter(X, y, color='r')
plt.plot(X, y_pred, color='b')
y_pred = tree.predict(X)
show_predicitons(X, y, y_pred)
Download the cars dataset and build a model to predict the price of a car.
features: brand, kmpl, bhp, type
target: price