import numpy as np
np.array([1, 2, 3, 4])
array([1, 2, 3, 4])
a = np.array([1, 2, 3, 4])
a[0]
1
a[-1]
4
a
array([1, 2, 3, 4])
a + a
array([2, 4, 6, 8])
a*a
array([ 1, 4, 9, 16])
a*5
array([ 5, 10, 15, 20])
zeros = np.zeros(100) # this creates a NumPy array of size 100 filled with zeros
zeros
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
zeros.shape
(100,)
matrix = zeros.reshape(10, 10)
matrix
array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
matrix.shape
(10, 10)
zeros.reshape(20, 5)
array([[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.]])
np.linspace(0, 100, 50)
array([ 0. , 2.04081633, 4.08163265, 6.12244898,
8.16326531, 10.20408163, 12.24489796, 14.28571429,
16.32653061, 18.36734694, 20.40816327, 22.44897959,
24.48979592, 26.53061224, 28.57142857, 30.6122449 ,
32.65306122, 34.69387755, 36.73469388, 38.7755102 ,
40.81632653, 42.85714286, 44.89795918, 46.93877551,
48.97959184, 51.02040816, 53.06122449, 55.10204082,
57.14285714, 59.18367347, 61.2244898 , 63.26530612,
65.30612245, 67.34693878, 69.3877551 , 71.42857143,
73.46938776, 75.51020408, 77.55102041, 79.59183673,
81.63265306, 83.67346939, 85.71428571, 87.75510204,
89.79591837, 91.83673469, 93.87755102, 95.91836735,
97.95918367, 100. ])
import matplotlib.pyplot as plt
%matplotlib inline
# this is required in a Jupyter notebook if you want to see graphs
x = np.linspace(-10, 10, 100) # array of size 100 between -10, 10 equally spaced
y = np.sin(x)
plt.plot(x, y, marker='x')
[<matplotlib.lines.Line2D at 0x7f5068e74790>]
Pandas is often called the spreadsheet of Python. It is used to manipulate tabular data.
import pandas as pd
# Column-oriented input: each keyword becomes one DataFrame column, and
# position i across the three lists makes up row i of the table.
data = dict(
    Name=["Indraraj", "Renuka", "Pushkar", "Gunjan", "Samiksha", "Nisha"],
    Location=["Pune", "Pune", "Mumbai", "Aakurdi", "Nashik", "Pune"],
    Score=[95, 94, 96, 95, 93, 97],
)
# No index is passed, so the rows get the default integer labels 0..5.
df = pd.DataFrame(data)
df
| Name | Location | Score | |
|---|---|---|---|
| 0 | Indraraj | Pune | 95 |
| 1 | Renuka | Pune | 94 |
| 2 | Pushkar | Mumbai | 96 |
| 3 | Gunjan | Aakurdi | 95 |
| 4 | Samiksha | Nashik | 93 |
| 5 | Nisha | Pune | 97 |
df.columns # column names
Index(['Name', 'Location', 'Score'], dtype='object')
df.Name
0 Indraraj 1 Renuka 2 Pushkar 3 Gunjan 4 Samiksha 5 Nisha Name: Name, dtype: object
df['Name']
0 Indraraj 1 Renuka 2 Pushkar 3 Gunjan 4 Samiksha 5 Nisha Name: Name, dtype: object
df.Score # this is series
0 95 1 94 2 96 3 95 4 93 5 97 Name: Score, dtype: int64
# Row-oriented construction: one [value, high] pair per ticker symbol,
# with the ticker symbols used as the row labels instead of 0..n-1.
stock = pd.DataFrame(
    [[123, 125], [335, 340], [334, 350], [124, 150]],
    columns=["value", "high"],
    index=["IBM", "APPLE", "M&M", "INFY"],
)
stock
| value | high | |
|---|---|---|
| IBM | 123 | 125 |
| APPLE | 335 | 340 |
| M&M | 334 | 350 |
| INFY | 124 | 150 |
stock.value
IBM 123 APPLE 335 M&M 334 INFY 124 Name: value, dtype: int64
stock.value['IBM']
123
df
| Name | Location | Score | |
|---|---|---|---|
| 0 | Indraraj | Pune | 95 |
| 1 | Renuka | Pune | 94 |
| 2 | Pushkar | Mumbai | 96 |
| 3 | Gunjan | Aakurdi | 95 |
| 4 | Samiksha | Nashik | 93 |
| 5 | Nisha | Pune | 97 |
df.Score > 95
0 False 1 False 2 True 3 False 4 False 5 True Name: Score, dtype: bool
df[df.Score > 95]
| Name | Location | Score | |
|---|---|---|---|
| 2 | Pushkar | Mumbai | 96 |
| 5 | Nisha | Pune | 97 |
df[df.Location=='Pune']
| Name | Location | Score | |
|---|---|---|---|
| 0 | Indraraj | Pune | 95 |
| 1 | Renuka | Pune | 94 |
| 5 | Nisha | Pune | 97 |
A hobby botanist has collected some data on iris flowers. She has recorded some measurements of the sepals and petals of the flowers (features). Alongside this observed data, the botanist has also recorded the known species names (target). Now the question is: using this data, can we build an ML model which can predict the species name when some new data is given?
from IPython.display import Image
sepal_petal_url = "https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Ftse1.mm.bing.net%2Fth%3Fid%3DOIP.5iVkkI_CdoAcvj6L_mxR7QHaGI%26pid%3DApi&f=1"
Image(url=sepal_petal_url, width=400, height=400)
from sklearn.datasets import load_iris
iris_dataset = load_iris()
iris_dataset.keys()
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
iris_dataset.data[:5]
array([[5.1, 3.5, 1.4, 0.2],
[4.9, 3. , 1.4, 0.2],
[4.7, 3.2, 1.3, 0.2],
[4.6, 3.1, 1.5, 0.2],
[5. , 3.6, 1.4, 0.2]])
iris_dataset.feature_names
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
iris_dataset.target_names
array(['setosa', 'versicolor', 'virginica'], dtype='<U10')
iris_dataset.target
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris_dataset['data'], iris_dataset['target'], random_state=0)
iris_dataset['data'].shape # it has 150 rows and 4 columns
(150, 4)
X_train.shape # 75% of data
(112, 4)
X_test.shape # 25% of data
(38, 4)
y_train.shape
(112,)
y_test.shape
(38,)
Convention for data naming: data is usually denoted by an uppercase X, while labels are denoted by a lowercase y. This is inspired by the standard mathematical notation f(x) = y. X is capitalized because it is a matrix; y is lowercase because it is simply a vector.
!pip install mglearn
Requirement already satisfied: mglearn in /home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages (0.1.9)
Requirement already satisfied: numpy in /home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages (from mglearn) (1.21.2)
Requirement already satisfied: matplotlib in /home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages (from mglearn) (3.4.3)
Requirement already satisfied: scikit-learn in /home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages (from mglearn) (1.0)
Requirement already satisfied: pandas in /home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages (from mglearn) (1.3.4)
Requirement already satisfied: pillow in /home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages (from mglearn) (8.4.0)
Requirement already satisfied: cycler in /home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages (from mglearn) (0.10.0)
Requirement already satisfied: imageio in /home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages (from mglearn) (2.9.0)
Requirement already satisfied: joblib in /home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages (from mglearn) (1.1.0)
Requirement already satisfied: six in /home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages (from cycler->mglearn) (1.16.0)
Requirement already satisfied: python-dateutil>=2.7 in /home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages (from matplotlib->mglearn) (2.8.2)
Requirement already satisfied: pyparsing>=2.2.1 in /home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages (from matplotlib->mglearn) (2.4.7)
Requirement already satisfied: kiwisolver>=1.0.1 in /home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages (from matplotlib->mglearn) (1.3.2)
Requirement already satisfied: pytz>=2017.3 in /home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages (from pandas->mglearn) (2021.3)
Requirement already satisfied: threadpoolctl>=2.0.0 in /home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages (from scikit-learn->mglearn) (3.0.0)
Requirement already satisfied: scipy>=1.1.0 in /home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages (from scikit-learn->mglearn) (1.7.1)
WARNING: You are using pip version 21.3; however, version 21.3.1 is available.
You should consider upgrading via the '/home/vikrant/programming/work/github/python-ml-course/notebooks/venv/bin/python -m pip install --upgrade pip' command.
import mglearn
iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)
iris_dataframe
| sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | |
|---|---|---|---|---|
| 0 | 5.9 | 3.0 | 4.2 | 1.5 |
| 1 | 5.8 | 2.6 | 4.0 | 1.2 |
| 2 | 6.8 | 3.0 | 5.5 | 2.1 |
| 3 | 4.7 | 3.2 | 1.3 | 0.2 |
| 4 | 6.9 | 3.1 | 5.1 | 2.3 |
| ... | ... | ... | ... | ... |
| 107 | 4.9 | 3.1 | 1.5 | 0.1 |
| 108 | 6.3 | 2.9 | 5.6 | 1.8 |
| 109 | 5.8 | 2.7 | 4.1 | 1.0 |
| 110 | 7.7 | 3.8 | 6.7 | 2.2 |
| 111 | 4.6 | 3.2 | 1.4 | 0.2 |
112 rows × 4 columns
graph = pd.plotting.scatter_matrix(iris_dataframe, c=y_train, figsize=(15, 15), marker='o',
hist_kwds={'bins':20}, s=60, alpha=0.8, cmap=mglearn.cm3)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
KNeighborsClassifier(n_neighbors=1)
X_new = np.array([[5, 2.9, 1, 2.0]])
X_new.shape
(1, 4)
prediction = knn.predict(X_new)
prediction
array([0])
iris_dataset['target_names'][prediction]
array(['setosa'], dtype='<U10')
y_pred = knn.predict(X_test)
print(y_pred)
[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0 2]
y_test
array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1])
np.mean(y_pred==y_test)
0.9736842105263158
# The whole iris workflow in one cell: split, fit, evaluate.
# random_state=0 is the same seed used for the earlier split of this dataset.
X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset.data, iris_dataset.target, random_state=0
)
# Give the known inputs and outputs to the model for learning; fit() hands
# back the estimator itself, so construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
# score() on the held-out split reports the fraction of correct predictions.
print(knn.score(X_test, y_test))
0.9736842105263158
Supervised learning is an approach in which an algorithm learns from known inputs and outputs and then predicts outputs for new inputs!
It can be further divided into two subcategories.
sample datasets
X, y = mglearn.datasets.make_forge()
mglearn.discrete_scatter(X[:, 0], X[:,1], y)
plt.legend(["Class 0", "Class 1"], loc=4)
plt.xlabel("First Feature")
plt.ylabel("Second Feature")
/home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function make_blobs is deprecated; Please import make_blobs directly from scikit-learn warnings.warn(msg, category=FutureWarning)
Text(0, 0.5, 'Second Feature')
X.shape
(26, 2)
X,y = mglearn.datasets.make_wave(n_samples=40)
plt.plot(X,y, 'o')
plt.ylim(-3, 3)
plt.xlabel("Feature")
plt.ylabel('Target')
Text(0, 0.5, 'Target')
mglearn.plots.plot_knn_classification(n_neighbors=1)
/home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function make_blobs is deprecated; Please import make_blobs directly from scikit-learn warnings.warn(msg, category=FutureWarning)
mglearn.plots.plot_knn_classification(n_neighbors=3)
/home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function make_blobs is deprecated; Please import make_blobs directly from scikit-learn warnings.warn(msg, category=FutureWarning)
from sklearn.model_selection import train_test_split
X,y = mglearn.datasets.make_forge()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
/home/vikrant/programming/work/github/python-ml-course/notebooks/venv/lib/python3.8/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function make_blobs is deprecated; Please import make_blobs directly from scikit-learn warnings.warn(msg, category=FutureWarning)
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)
KNeighborsClassifier(n_neighbors=3)
clf.predict(X_test)
array([1, 0, 1, 0, 1, 0, 0])
clf.score(X_test, y_test)
0.8571428571428571
# Draw the k-NN decision boundary for k = 1, 3 and 9 side by side,
# one subplot per value of n_neighbors.
fig, axes = plt.subplots(1, 3, figsize=(10, 3))
for n_neighbors, ax in zip([1, 3, 9], axes):
    # Fit on all of X, y: this cell only visualises the boundary, it does
    # not score the model, so no train/test split is used here.
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y)
    mglearn.plots.plot_2d_separator(clf, X, fill=True, eps=0.5, ax=ax, alpha=0.4)
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set_title(f"{n_neighbors} neighbors")
    ax.set_xlabel("feature 0")
    # X[:, 1] is plotted on the vertical axis, so "feature 1" labels y;
    # setting it via set_xlabel would overwrite the "feature 0" x label.
    ax.set_ylabel("feature 1")
# A single legend on the first subplot is enough; the classes are the same
# in all three panels.
axes[0].legend(loc=3)
<matplotlib.legend.Legend at 0x7f505714d0d0>
In general, we would like to choose the parameters of a model such that the model is neither too simple nor too complex.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data,
cancer.target,
stratify=cancer.target,
random_state=66)
# Record accuracy on both splits while sweeping n_neighbors from 1 to 10.
training_accuracy, test_accuracy = [], []
n_range = range(1, 11)
for n in n_range:
    # fit() returns the estimator, so build and train in one expression.
    clf = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
    # Accuracy on the data the model was fitted on ...
    training_accuracy.append(clf.score(X_train, y_train))
    # ... versus accuracy on the held-out data.
    test_accuracy.append(clf.score(X_test, y_test))

# Plot both curves against n_neighbors on the same axes for comparison.
plt.plot(n_range, training_accuracy, label='training accuracy')
plt.plot(n_range, test_accuracy, label='test_accuracy')
plt.xlabel("n_neighbors")
plt.ylabel("Accuracy")
plt.legend()
<matplotlib.legend.Legend at 0x7f503a0f6a60>