from IPython.display import Image
%matplotlib inline
19 A Tour of Machine Learning Classifiers Using Scikit-Learn
19.1 Choosing a classification algorithm
19.1.1 First steps with scikit-learn
Loading the Iris dataset from scikit-learn. Here, the third column represents the petal length, and the fourth column the petal width of the flower examples. The classes are already converted to integer labels where 0=Iris-Setosa, 1=Iris-Versicolor, 2=Iris-Virginica.
from sklearn import datasets
import numpy as np
# Load Iris and keep only columns 2 and 3 (petal length, petal width).
iris = datasets.load_iris()
X = iris.data[:, [2, 3]]
y = iris.target

print('Class labels:', np.unique(y))
Class labels: [0 1 2]
Splitting data into 70% training and 30% test data:
from sklearn.model_selection import train_test_split
# Hold out 30% of the examples; stratify=y keeps the class proportions
# identical in the training and test subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)
print('Labels counts in y:', np.bincount(y))
print('Labels counts in y_train:', np.bincount(y_train))
print('Labels counts in y_test:', np.bincount(y_test))
Labels counts in y: [50 50 50]
Labels counts in y_train: [35 35 35]
Labels counts in y_test: [15 15 15]
Standardizing the features:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training data only, then apply the same
# transformation to both the training and the test features.
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
19.1.2 Training a perceptron via scikit-learn
from sklearn.linear_model import Perceptron
# Train a perceptron on the standardized training features.
ppn = Perceptron(eta0=0.1, random_state=1)
ppn.fit(X_train_std, y_train)
Perceptron(eta0=0.1, random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Perceptron(eta0=0.1, random_state=1)
# Predict the test labels and count how many disagree with the ground truth.
y_pred = ppn.predict(X_test_std)
print('Misclassified examples: %d' % (y_test != y_pred).sum())
Misclassified examples: 1
from sklearn.metrics import accuracy_score
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
Accuracy: 0.978
print('Accuracy: %.3f' % ppn.score(X_test_std, y_test))
Accuracy: 0.978
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
# To check recent matplotlib compatibility
import matplotlib
from distutils.version import LooseVersion
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Plot the decision regions of a fitted classifier over two features.

    Parameters
    ----------
    X : array, indexed as X[:, 0] and X[:, 1]
        Feature matrix; column 0 is drawn on the x axis, column 1 on the y axis.
    y : array
        Class labels, one per row of X.
    classifier : object
        Any object with a ``predict`` method accepting an (n, 2) array.
    test_idx : sequence of indices, optional
        Rows of X to highlight with hollow circles labelled 'Test set'.
        ``None`` or an empty sequence disables the highlighting.
    resolution : float
        Step size of the grid on which the classifier is evaluated.
    """
    # setup marker generator and color map
    markers = ('o', 's', '^', 'v', '<')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # plot the decision surface: predict on a dense grid that extends one
    # unit beyond the data range in each direction
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    lab = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    lab = lab.reshape(xx1.shape)
    plt.contourf(xx1, xx2, lab, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot class examples
    for idx, cl in enumerate(classes):
        plt.scatter(x=X[y == cl, 0],
                    y=X[y == cl, 1],
                    alpha=0.8,
                    c=colors[idx],
                    marker=markers[idx],
                    label=f'Class {cl}',
                    edgecolor='black')

    # highlight test examples
    # Explicit None/length check: plain truthiness raises ValueError when
    # test_idx is a NumPy index array with more than one element.
    if test_idx is not None and len(test_idx) > 0:
        X_test = X[test_idx, :]

        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    c='none',
                    edgecolor='black',
                    alpha=1.0,
                    linewidth=1,
                    marker='o',
                    s=100,
                    label='Test set')
Training a perceptron model using the standardized training data:
# Stack training rows first, then test rows, so the test examples occupy
# indices 105..149 of the combined arrays.
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))

plot_decision_regions(X=X_combined_std, y=y_combined,
                      classifier=ppn, test_idx=range(105, 150))
plt.xlabel('Petal length [standardized]')
plt.ylabel('Petal width [standardized]')
plt.legend(loc='upper left')

plt.tight_layout()
#plt.savefig('./ch03_files/figures/03_01.png', dpi=300)
plt.show()
19.2 Modeling class probabilities via logistic regression
…
19.2.1 Logistic regression intuition and conditional probabilities
import matplotlib.pyplot as plt
import numpy as np
def sigmoid(z):
    """Return the logistic sigmoid 1 / (1 + exp(-z)) of a scalar or array z."""
    return 1.0 / (1.0 + np.exp(-z))
z = np.arange(-7, 7, 0.1)
sigma_z = sigmoid(z)

plt.plot(z, sigma_z)
plt.axvline(0.0, color='k')
plt.ylim(-0.1, 1.1)
plt.xlabel('z')
# Raw string: '\s' is not a valid escape sequence in a normal string literal.
plt.ylabel(r'$\sigma (z)$')

# y axis ticks and gridline
plt.yticks([0.0, 0.5, 1.0])
ax = plt.gca()
ax.yaxis.grid(True)

plt.tight_layout()
#plt.savefig('./ch03_files/figures/03_02.png', dpi=300)
plt.show()
Image(filename='./ch03_files/figures/03_03.png', width=500)
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[13], line 1 ----> 1 Image(filename='./ch03_files/figures/03_03.png', width=500) File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:970, in Image.__init__(self, data, url, filename, format, embed, width, height, retina, unconfined, metadata, alt) 968 self.unconfined = unconfined 969 self.alt = alt --> 970 super(Image, self).__init__(data=data, url=url, filename=filename, 971 metadata=metadata) 973 if self.width is None and self.metadata.get('width', {}): 974 self.width = metadata['width'] File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:327, in DisplayObject.__init__(self, data, url, filename, metadata) 324 elif self.metadata is None: 325 self.metadata = {} --> 327 self.reload() 328 self._check_data() File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:1005, in Image.reload(self) 1003 """Reload the raw data from file or URL.""" 1004 if self.embed: -> 1005 super(Image,self).reload() 1006 if self.retina: 1007 self._retina_shape() File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:353, in DisplayObject.reload(self) 351 if self.filename is not None: 352 encoding = None if "b" in self._read_flags else "utf-8" --> 353 with open(self.filename, self._read_flags, encoding=encoding) as f: 354 self.data = f.read() 355 elif self.url is not None: 356 # Deferred import FileNotFoundError: [Errno 2] No such file or directory: './ch03_files/figures/03_03.png'
Image(filename='./ch03_files/figures/03_25.png', width=500)
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[14], line 1 ----> 1 Image(filename='./ch03_files/figures/03_25.png', width=500) File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:970, in Image.__init__(self, data, url, filename, format, embed, width, height, retina, unconfined, metadata, alt) 968 self.unconfined = unconfined 969 self.alt = alt --> 970 super(Image, self).__init__(data=data, url=url, filename=filename, 971 metadata=metadata) 973 if self.width is None and self.metadata.get('width', {}): 974 self.width = metadata['width'] File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:327, in DisplayObject.__init__(self, data, url, filename, metadata) 324 elif self.metadata is None: 325 self.metadata = {} --> 327 self.reload() 328 self._check_data() File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:1005, in Image.reload(self) 1003 """Reload the raw data from file or URL.""" 1004 if self.embed: -> 1005 super(Image,self).reload() 1006 if self.retina: 1007 self._retina_shape() File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:353, in DisplayObject.reload(self) 351 if self.filename is not None: 352 encoding = None if "b" in self._read_flags else "utf-8" --> 353 with open(self.filename, self._read_flags, encoding=encoding) as f: 354 self.data = f.read() 355 elif self.url is not None: 356 # Deferred import FileNotFoundError: [Errno 2] No such file or directory: './ch03_files/figures/03_25.png'
19.2.2 Learning the weights of the logistic loss function
def loss_1(z):
    """Return the logistic loss for a positive example (y=1) at net input z.

    Mathematically this is -log(sigmoid(z)). It is evaluated as
    log(1 + exp(-z)) via ``np.logaddexp`` so that very negative z gives
    the finite value ~|z| instead of -log(0) = inf.
    """
    return np.logaddexp(0.0, -z)
def loss_0(z):
    """Return the logistic loss for a negative example (y=0) at net input z.

    Mathematically this is -log(1 - sigmoid(z)). It is evaluated as
    log(1 + exp(z)) via ``np.logaddexp`` so that large positive z gives
    the finite value ~z instead of -log(0) = inf.
    """
    return np.logaddexp(0.0, z)
z = np.arange(-10, 10, 0.1)
sigma_z = sigmoid(z)

# Loss as a function of the predicted probability when the true label is 1.
c1 = [loss_1(x) for x in z]
plt.plot(sigma_z, c1, label='L(w, b) if y=1')

# Loss as a function of the predicted probability when the true label is 0.
c0 = [loss_0(x) for x in z]
plt.plot(sigma_z, c0, linestyle='--', label='L(w, b) if y=0')

plt.ylim(0.0, 5.1)
plt.xlim([0, 1])
# Raw string: '\s' is not a valid escape sequence in a normal string literal.
plt.xlabel(r'$\sigma(z)$')
plt.ylabel('L(w, b)')
plt.legend(loc='best')

plt.tight_layout()
#plt.savefig('./ch03_files/figures/03_04.png', dpi=300)
plt.show()
class LogisticRegressionGD:
    """Gradient descent-based logistic regression classifier.

    Parameters
    ------------
    eta : float
      Learning rate (between 0.0 and 1.0)
    n_iter : int
      Passes over the training dataset.
    random_state : int
      Random number generator seed for random weight
      initialization.

    Attributes
    -----------
    w_ : 1d-array
      Weights after training.
    b_ : Scalar
      Bias unit after fitting.
    losses_ : list
      Mean logistic (cross-entropy) loss values in each epoch.

    """
    def __init__(self, eta=0.01, n_iter=50, random_state=1):
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = random_state

    def fit(self, X, y):
        """ Fit training data.

        Parameters
        ----------
        X : {array-like}, shape = [n_examples, n_features]
          Training vectors, where n_examples is the number of examples and
          n_features is the number of features.
        y : array-like, shape = [n_examples]
          Target values.

        Returns
        -------
        self : Instance of LogisticRegressionGD

        """
        rgen = np.random.RandomState(self.random_state)
        self.w_ = rgen.normal(loc=0.0, scale=0.01, size=X.shape[1])
        # np.float64 instead of the np.float_ alias removed in NumPy 2.0.
        self.b_ = np.float64(0.)
        self.losses_ = []

        for _ in range(self.n_iter):
            net_input = self.net_input(X)
            output = self.activation(net_input)
            errors = (y - output)
            self.w_ += self.eta * X.T.dot(errors) / X.shape[0]
            self.b_ += self.eta * errors.mean()
            # Mean cross-entropy of the pre-update predictions. Written as
            # log(1 + exp(z)) - y*z, which equals
            # -y*log(sigma(z)) - (1-y)*log(1-sigma(z)) but cannot yield
            # 0 * log(0) = nan, and averages BOTH terms over the examples.
            loss = np.mean(np.logaddexp(0.0, net_input) - y * net_input)
            self.losses_.append(loss)
        return self

    def net_input(self, X):
        """Calculate net input"""
        return np.dot(X, self.w_) + self.b_

    def activation(self, z):
        """Compute logistic sigmoid activation"""
        # Clip z so np.exp cannot overflow for extreme net inputs.
        return 1. / (1. + np.exp(-np.clip(z, -250, 250)))

    def predict(self, X):
        """Return class label after unit step"""
        return np.where(self.activation(self.net_input(X)) >= 0.5, 1, 0)
# LogisticRegressionGD is a binary classifier, so keep only classes 0 and 1.
mask_01 = (y_train == 0) | (y_train == 1)
X_train_01_subset = X_train_std[mask_01]
y_train_01_subset = y_train[mask_01]

lrgd = LogisticRegressionGD(eta=0.3, n_iter=1000, random_state=1)
lrgd.fit(X_train_01_subset,
         y_train_01_subset)

plot_decision_regions(X=X_train_01_subset,
                      y=y_train_01_subset,
                      classifier=lrgd)

plt.xlabel('Petal length [standardized]')
plt.ylabel('Petal width [standardized]')
plt.legend(loc='upper left')

plt.tight_layout()
#plt.savefig('./ch03_files/figures/03_05.png', dpi=300)
plt.show()
19.2.3 Training a logistic regression model with scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest logistic regression. The 'multi_class' argument of
# LogisticRegression is deprecated (scikit-learn 1.5) and scheduled for
# removal, so the OvR scheme is expressed with OneVsRestClassifier instead.
lr = OneVsRestClassifier(LogisticRegression(C=100.0, solver='lbfgs'))
lr.fit(X_train_std, y_train)

plot_decision_regions(X_combined_std, y_combined,
                      classifier=lr, test_idx=range(105, 150))
plt.xlabel('Petal length [standardized]')
plt.ylabel('Petal width [standardized]')
plt.legend(loc='upper left')

plt.tight_layout()
#plt.savefig('./ch03_files/figures/03_06.png', dpi=300)
plt.show()
/home/jjung/anaconda3/envs/islp/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1256: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. Use OneVsRestClassifier(LogisticRegression(..)) instead. Leave it to its default value to avoid this warning.
warnings.warn(
lr.predict_proba(X_test_std[:3, :])
array([[6.63770505e-09, 1.44747233e-01, 8.55252760e-01],
[8.34031210e-01, 1.65968790e-01, 3.20815954e-13],
[8.48822884e-01, 1.51177116e-01, 2.57998350e-14]])
lr.predict_proba(X_test_std[:3, :]).sum(axis=1)
array([1., 1., 1.])
lr.predict_proba(X_test_std[:3, :]).argmax(axis=1)
array([2, 0, 0])
lr.predict(X_test_std[:3, :])
array([2, 0, 0])
# predict expects a 2-D array, so a single row is reshaped to (1, n_features).
lr.predict(X_test_std[0, :].reshape(1, -1))
array([2])
19.2.4 Tackling overfitting via regularization
Image(filename='./ch03_files/figures/03_07.png', width=700)
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[24], line 1 ----> 1 Image(filename='./ch03_files/figures/03_07.png', width=700) File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:970, in Image.__init__(self, data, url, filename, format, embed, width, height, retina, unconfined, metadata, alt) 968 self.unconfined = unconfined 969 self.alt = alt --> 970 super(Image, self).__init__(data=data, url=url, filename=filename, 971 metadata=metadata) 973 if self.width is None and self.metadata.get('width', {}): 974 self.width = metadata['width'] File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:327, in DisplayObject.__init__(self, data, url, filename, metadata) 324 elif self.metadata is None: 325 self.metadata = {} --> 327 self.reload() 328 self._check_data() File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:1005, in Image.reload(self) 1003 """Reload the raw data from file or URL.""" 1004 if self.embed: -> 1005 super(Image,self).reload() 1006 if self.retina: 1007 self._retina_shape() File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:353, in DisplayObject.reload(self) 351 if self.filename is not None: 352 encoding = None if "b" in self._read_flags else "utf-8" --> 353 with open(self.filename, self._read_flags, encoding=encoding) as f: 354 self.data = f.read() 355 elif self.url is not None: 356 # Deferred import FileNotFoundError: [Errno 2] No such file or directory: './ch03_files/figures/03_07.png'
from sklearn.multiclass import OneVsRestClassifier

# Fit one-vs-rest models for C = 10^-5 ... 10^4 and record the two weights
# of the class-1-vs-rest model. OneVsRestClassifier replaces the deprecated
# multi_class='ovr' argument; estimators_[1] is the binary model for class 1.
weights, params = [], []
for c in np.arange(-5, 5):
    lr = OneVsRestClassifier(LogisticRegression(C=10.**c))
    lr.fit(X_train_std, y_train)
    weights.append(lr.estimators_[1].coef_[0])
    params.append(10.**c)

weights = np.array(weights)
plt.plot(params, weights[:, 0],
         label='Petal length')
plt.plot(params, weights[:, 1], linestyle='--',
         label='Petal width')
plt.ylabel('Weight coefficient')
plt.xlabel('C')
plt.legend(loc='upper left')
plt.xscale('log')
#plt.savefig('./ch03_files/figures/03_08.png', dpi=300)
plt.show()
/home/jjung/anaconda3/envs/islp/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1256: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. Use OneVsRestClassifier(LogisticRegression(..)) instead. Leave it to its default value to avoid this warning.
warnings.warn(
/home/jjung/anaconda3/envs/islp/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1256: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. Use OneVsRestClassifier(LogisticRegression(..)) instead. Leave it to its default value to avoid this warning.
warnings.warn(
/home/jjung/anaconda3/envs/islp/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1256: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. Use OneVsRestClassifier(LogisticRegression(..)) instead. Leave it to its default value to avoid this warning.
warnings.warn(
/home/jjung/anaconda3/envs/islp/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1256: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. Use OneVsRestClassifier(LogisticRegression(..)) instead. Leave it to its default value to avoid this warning.
warnings.warn(
/home/jjung/anaconda3/envs/islp/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1256: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. Use OneVsRestClassifier(LogisticRegression(..)) instead. Leave it to its default value to avoid this warning.
warnings.warn(
/home/jjung/anaconda3/envs/islp/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1256: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. Use OneVsRestClassifier(LogisticRegression(..)) instead. Leave it to its default value to avoid this warning.
warnings.warn(
/home/jjung/anaconda3/envs/islp/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1256: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. Use OneVsRestClassifier(LogisticRegression(..)) instead. Leave it to its default value to avoid this warning.
warnings.warn(
/home/jjung/anaconda3/envs/islp/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1256: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. Use OneVsRestClassifier(LogisticRegression(..)) instead. Leave it to its default value to avoid this warning.
warnings.warn(
/home/jjung/anaconda3/envs/islp/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1256: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. Use OneVsRestClassifier(LogisticRegression(..)) instead. Leave it to its default value to avoid this warning.
warnings.warn(
/home/jjung/anaconda3/envs/islp/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1256: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. Use OneVsRestClassifier(LogisticRegression(..)) instead. Leave it to its default value to avoid this warning.
warnings.warn(
19.3 Maximum margin classification with support vector machines
Image(filename='./ch03_files/figures/03_09.png', width=700)
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[26], line 1 ----> 1 Image(filename='./ch03_files/figures/03_09.png', width=700) File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:970, in Image.__init__(self, data, url, filename, format, embed, width, height, retina, unconfined, metadata, alt) 968 self.unconfined = unconfined 969 self.alt = alt --> 970 super(Image, self).__init__(data=data, url=url, filename=filename, 971 metadata=metadata) 973 if self.width is None and self.metadata.get('width', {}): 974 self.width = metadata['width'] File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:327, in DisplayObject.__init__(self, data, url, filename, metadata) 324 elif self.metadata is None: 325 self.metadata = {} --> 327 self.reload() 328 self._check_data() File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:1005, in Image.reload(self) 1003 """Reload the raw data from file or URL.""" 1004 if self.embed: -> 1005 super(Image,self).reload() 1006 if self.retina: 1007 self._retina_shape() File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:353, in DisplayObject.reload(self) 351 if self.filename is not None: 352 encoding = None if "b" in self._read_flags else "utf-8" --> 353 with open(self.filename, self._read_flags, encoding=encoding) as f: 354 self.data = f.read() 355 elif self.url is not None: 356 # Deferred import FileNotFoundError: [Errno 2] No such file or directory: './ch03_files/figures/03_09.png'
19.3.1 Maximum margin intuition
19.3.2 Dealing with the nonlinearly separable case using slack variables
Image(filename='./ch03_files/figures/03_10.png', width=600)
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[27], line 1 ----> 1 Image(filename='./ch03_files/figures/03_10.png', width=600) File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:970, in Image.__init__(self, data, url, filename, format, embed, width, height, retina, unconfined, metadata, alt) 968 self.unconfined = unconfined 969 self.alt = alt --> 970 super(Image, self).__init__(data=data, url=url, filename=filename, 971 metadata=metadata) 973 if self.width is None and self.metadata.get('width', {}): 974 self.width = metadata['width'] File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:327, in DisplayObject.__init__(self, data, url, filename, metadata) 324 elif self.metadata is None: 325 self.metadata = {} --> 327 self.reload() 328 self._check_data() File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:1005, in Image.reload(self) 1003 """Reload the raw data from file or URL.""" 1004 if self.embed: -> 1005 super(Image,self).reload() 1006 if self.retina: 1007 self._retina_shape() File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:353, in DisplayObject.reload(self) 351 if self.filename is not None: 352 encoding = None if "b" in self._read_flags else "utf-8" --> 353 with open(self.filename, self._read_flags, encoding=encoding) as f: 354 self.data = f.read() 355 elif self.url is not None: 356 # Deferred import FileNotFoundError: [Errno 2] No such file or directory: './ch03_files/figures/03_10.png'
from sklearn.svm import SVC
# Linear-kernel SVM on the standardized Iris features.
svm = SVC(kernel='linear', C=1.0, random_state=1)
svm.fit(X_train_std, y_train)

plot_decision_regions(X_combined_std,
                      y_combined,
                      classifier=svm,
                      test_idx=range(105, 150))
plt.xlabel('Petal length [standardized]')
plt.ylabel('Petal width [standardized]')
plt.legend(loc='upper left')

plt.tight_layout()
#plt.savefig('./ch03_files/figures/03_11.png', dpi=300)
plt.show()
19.3.3 Alternative implementations in scikit-learn
from sklearn.linear_model import SGDClassifier
# SGD-based counterparts of the perceptron, logistic regression and SVM.
# The logistic loss is named 'log_loss'; the old alias 'log' is no longer
# accepted by current scikit-learn releases.
ppn = SGDClassifier(loss='perceptron')
lr = SGDClassifier(loss='log_loss')
svm = SGDClassifier(loss='hinge')
19.4 Solving non-linear problems using a kernel SVM
import matplotlib.pyplot as plt
import numpy as np
# Build a 200-point XOR dataset: the label is 1 when exactly one of the two
# features is positive.
np.random.seed(1)
X_xor = np.random.randn(200, 2)
y_xor = np.logical_xor(X_xor[:, 0] > 0,
                       X_xor[:, 1] > 0)
y_xor = np.where(y_xor, 1, 0)

plt.scatter(X_xor[y_xor == 1, 0],
            X_xor[y_xor == 1, 1],
            c='royalblue',
            marker='s',
            label='Class 1')
plt.scatter(X_xor[y_xor == 0, 0],
            X_xor[y_xor == 0, 1],
            c='tomato',
            marker='o',
            label='Class 0')

plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

plt.legend(loc='best')
plt.tight_layout()
#plt.savefig('./ch03_files/figures/03_12.png', dpi=300)
plt.show()
Image(filename='./ch03_files/figures/03_13.png', width=700)
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[31], line 1 ----> 1 Image(filename='./ch03_files/figures/03_13.png', width=700) File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:970, in Image.__init__(self, data, url, filename, format, embed, width, height, retina, unconfined, metadata, alt) 968 self.unconfined = unconfined 969 self.alt = alt --> 970 super(Image, self).__init__(data=data, url=url, filename=filename, 971 metadata=metadata) 973 if self.width is None and self.metadata.get('width', {}): 974 self.width = metadata['width'] File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:327, in DisplayObject.__init__(self, data, url, filename, metadata) 324 elif self.metadata is None: 325 self.metadata = {} --> 327 self.reload() 328 self._check_data() File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:1005, in Image.reload(self) 1003 """Reload the raw data from file or URL.""" 1004 if self.embed: -> 1005 super(Image,self).reload() 1006 if self.retina: 1007 self._retina_shape() File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:353, in DisplayObject.reload(self) 351 if self.filename is not None: 352 encoding = None if "b" in self._read_flags else "utf-8" --> 353 with open(self.filename, self._read_flags, encoding=encoding) as f: 354 self.data = f.read() 355 elif self.url is not None: 356 # Deferred import FileNotFoundError: [Errno 2] No such file or directory: './ch03_files/figures/03_13.png'
19.4.1 Using the kernel trick to find separating hyperplanes in higher dimensional space
# RBF-kernel SVM on the XOR data, which is not linearly separable.
svm = SVC(kernel='rbf', random_state=1, gamma=0.10, C=10.0)
svm.fit(X_xor, y_xor)
plot_decision_regions(X_xor, y_xor,
                      classifier=svm)

plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('./ch03_files/figures/03_14.png', dpi=300)
plt.show()
from sklearn.svm import SVC
# RBF-kernel SVM on Iris with a small gamma (0.2).
svm = SVC(kernel='rbf', random_state=1, gamma=0.2, C=1.0)
svm.fit(X_train_std, y_train)

plot_decision_regions(X_combined_std, y_combined,
                      classifier=svm, test_idx=range(105, 150))
plt.xlabel('Petal length [standardized]')
plt.ylabel('Petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('./ch03_files/figures/03_15.png', dpi=300)
plt.show()
# Same model with a much larger gamma (100.0) for comparison.
svm = SVC(kernel='rbf', random_state=1, gamma=100.0, C=1.0)
svm.fit(X_train_std, y_train)

plot_decision_regions(X_combined_std, y_combined,
                      classifier=svm, test_idx=range(105, 150))
plt.xlabel('Petal length [standardized]')
plt.ylabel('Petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('./ch03_files/figures/03_16.png', dpi=300)
plt.show()
19.5 Decision tree learning
Image(filename='./ch03_files/figures/03_17.png', width=500)
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[35], line 1 ----> 1 Image(filename='./ch03_files/figures/03_17.png', width=500) File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:970, in Image.__init__(self, data, url, filename, format, embed, width, height, retina, unconfined, metadata, alt) 968 self.unconfined = unconfined 969 self.alt = alt --> 970 super(Image, self).__init__(data=data, url=url, filename=filename, 971 metadata=metadata) 973 if self.width is None and self.metadata.get('width', {}): 974 self.width = metadata['width'] File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:327, in DisplayObject.__init__(self, data, url, filename, metadata) 324 elif self.metadata is None: 325 self.metadata = {} --> 327 self.reload() 328 self._check_data() File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:1005, in Image.reload(self) 1003 """Reload the raw data from file or URL.""" 1004 if self.embed: -> 1005 super(Image,self).reload() 1006 if self.retina: 1007 self._retina_shape() File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:353, in DisplayObject.reload(self) 351 if self.filename is not None: 352 encoding = None if "b" in self._read_flags else "utf-8" --> 353 with open(self.filename, self._read_flags, encoding=encoding) as f: 354 self.data = f.read() 355 elif self.url is not None: 356 # Deferred import FileNotFoundError: [Errno 2] No such file or directory: './ch03_files/figures/03_17.png'
def entropy(p):
    """Return the binary entropy (in bits) for class-1 probability p.

    The expression evaluates to nan at p == 0 and p == 1 (0 * log2(0)),
    so callers must handle those endpoints themselves.
    """
    return - p * np.log2(p) - (1 - p) * np.log2((1 - p))
x = np.arange(0.0, 1.0, 0.01)
# entropy(0) is nan, so p == 0 is replaced by None.
ent = [entropy(p) if p != 0 else None
       for p in x]

plt.ylabel('Entropy')
plt.xlabel('Class-membership probability p(i=1)')
plt.plot(x, ent)
#plt.savefig('./ch03_files/figures/03_26.png', dpi=300)
plt.show()
Image(filename='./ch03_files/figures/03_18.png', width=500)
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[37], line 1 ----> 1 Image(filename='./ch03_files/figures/03_18.png', width=500) File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:970, in Image.__init__(self, data, url, filename, format, embed, width, height, retina, unconfined, metadata, alt) 968 self.unconfined = unconfined 969 self.alt = alt --> 970 super(Image, self).__init__(data=data, url=url, filename=filename, 971 metadata=metadata) 973 if self.width is None and self.metadata.get('width', {}): 974 self.width = metadata['width'] File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:327, in DisplayObject.__init__(self, data, url, filename, metadata) 324 elif self.metadata is None: 325 self.metadata = {} --> 327 self.reload() 328 self._check_data() File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:1005, in Image.reload(self) 1003 """Reload the raw data from file or URL.""" 1004 if self.embed: -> 1005 super(Image,self).reload() 1006 if self.retina: 1007 self._retina_shape() File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:353, in DisplayObject.reload(self) 351 if self.filename is not None: 352 encoding = None if "b" in self._read_flags else "utf-8" --> 353 with open(self.filename, self._read_flags, encoding=encoding) as f: 354 self.data = f.read() 355 elif self.url is not None: 356 # Deferred import FileNotFoundError: [Errno 2] No such file or directory: './ch03_files/figures/03_18.png'
19.5.1 Maximizing information gain - getting the most bang for the buck
import matplotlib.pyplot as plt
import numpy as np
def gini(p):
    """Return the Gini impurity of a two-class node with class-1 probability p.

    Works element-wise when p is a NumPy array.
    """
    return p * (1 - p) + (1 - p) * (1 - (1 - p))
def entropy(p):
    """Return the binary entropy (in bits) for class-1 probability p.

    The expression evaluates to nan at p == 0 and p == 1 (0 * log2(0)),
    so callers must handle those endpoints themselves.
    """
    return - p * np.log2(p) - (1 - p) * np.log2((1 - p))
def error(p):
    """Return the misclassification error 1 - max(p, 1 - p) for a scalar p."""
    return 1 - np.max([p, 1 - p])
x = np.arange(0.0, 1.0, 0.01)

# entropy(0) is nan, so p == 0 is replaced by None.
ent = [entropy(p) if p != 0 else None for p in x]
# Explicit None check rather than truthiness, so a legitimate 0.0 entropy
# value would still be scaled instead of being dropped.
sc_ent = [e * 0.5 if e is not None else None for e in ent]
err = [error(i) for i in x]

ax = plt.subplot(111)
for i, lab, ls, c in zip([ent, sc_ent, gini(x), err],
                         ['Entropy', 'Entropy (scaled)',
                          'Gini impurity', 'Misclassification error'],
                         ['-', '-', '--', '-.'],
                         ['black', 'lightgray', 'red', 'green']):
    ax.plot(x, i, label=lab, linestyle=ls, lw=2, color=c)

ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15),
          ncol=5, fancybox=True, shadow=False)

ax.axhline(y=0.5, linewidth=1, color='k', linestyle='--')
ax.axhline(y=1.0, linewidth=1, color='k', linestyle='--')
plt.ylim([0, 1.1])
plt.xlabel('p(i=1)')
plt.ylabel('Impurity index')
#plt.savefig('./ch03_files/figures/03_19.png', dpi=300, bbox_inches='tight')
plt.show()
19.5.2 Building a decision tree
from sklearn.tree import DecisionTreeClassifier
# The tree is trained on the unscaled features (axes are labelled in cm).
tree_model = DecisionTreeClassifier(criterion='gini',
                                    max_depth=4,
                                    random_state=1)
tree_model.fit(X_train, y_train)

# Training rows first, then test rows (indices 105..149).
X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))
plot_decision_regions(X_combined, y_combined,
                      classifier=tree_model,
                      test_idx=range(105, 150))

plt.xlabel('Petal length [cm]')
plt.ylabel('Petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('./ch03_files/figures/03_20.png', dpi=300)
plt.show()
19.5.3 Combining weak to strong learners via random forests
from sklearn.ensemble import RandomForestClassifier
# Random forest of 25 trees on the unscaled features, fitted with 2 jobs.
forest = RandomForestClassifier(n_estimators=25,
                                random_state=1,
                                n_jobs=2)
forest.fit(X_train, y_train)

plot_decision_regions(X_combined, y_combined,
                      classifier=forest, test_idx=range(105, 150))

plt.xlabel('Petal length [cm]')
plt.ylabel('Petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('./ch03_files/figures/03_2.png', dpi=300)
plt.show()
19.6 K-nearest neighbors - a lazy learning algorithm
Image(filename='./ch03_files/figures/03_23.png', width=400)
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[42], line 1 ----> 1 Image(filename='./ch03_files/figures/03_23.png', width=400) File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:970, in Image.__init__(self, data, url, filename, format, embed, width, height, retina, unconfined, metadata, alt) 968 self.unconfined = unconfined 969 self.alt = alt --> 970 super(Image, self).__init__(data=data, url=url, filename=filename, 971 metadata=metadata) 973 if self.width is None and self.metadata.get('width', {}): 974 self.width = metadata['width'] File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:327, in DisplayObject.__init__(self, data, url, filename, metadata) 324 elif self.metadata is None: 325 self.metadata = {} --> 327 self.reload() 328 self._check_data() File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:1005, in Image.reload(self) 1003 """Reload the raw data from file or URL.""" 1004 if self.embed: -> 1005 super(Image,self).reload() 1006 if self.retina: 1007 self._retina_shape() File ~/anaconda3/envs/islp/lib/python3.11/site-packages/IPython/core/display.py:353, in DisplayObject.reload(self) 351 if self.filename is not None: 352 encoding = None if "b" in self._read_flags else "utf-8" --> 353 with open(self.filename, self._read_flags, encoding=encoding) as f: 354 self.data = f.read() 355 elif self.url is not None: 356 # Deferred import FileNotFoundError: [Errno 2] No such file or directory: './ch03_files/figures/03_23.png'
from sklearn.neighbors import KNeighborsClassifier
# 5-nearest-neighbours with the Minkowski metric and p=2, fitted on the
# standardized features.
knn = KNeighborsClassifier(n_neighbors=5,
                           p=2,
                           metric='minkowski')
knn.fit(X_train_std, y_train)

plot_decision_regions(X_combined_std, y_combined,
                      classifier=knn, test_idx=range(105, 150))

plt.xlabel('Petal length [standardized]')
plt.ylabel('Petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('./ch03_files/figures/03_24_./figures.png', dpi=300)
plt.show()
19.7 Summary
…