import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
mnist = pd.read_csv('mnist.zip')
mnist.head()
| | label | pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | ... | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |

5 rows × 785 columns
mnist.shape
(42000, 785)
Split the data into train and test sets, and investigate the distribution of digits in each partition.
y = mnist['label'].values
X = mnist[[c for c in mnist.columns if c != 'label']].values

idx = np.arange(len(y))
np.random.shuffle(idx)
train_idx = idx[:10000]
test_idx = idx[10000:15000]

X_train = X[train_idx]
y_train = y[train_idx]
X_test = X[test_idx]
y_test = y[test_idx]
fig, axs = plt.subplots(2, figsize=(10, 8))
pd.DataFrame({'digit': y_train}).groupby('digit').size().plot.barh(ax=axs[0])
pd.DataFrame({'digit': y_test}).groupby('digit').size().plot.barh(ax=axs[1])
axs[0].set_title('Train')
axs[1].set_title('Test')
plt.show()
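For a quick numeric view of the same distributions (a small sketch equivalent to the bar plots above):

```python
# Per-digit proportions in each partition; with a uniform shuffle these
# should be close to the proportions in the full dataset.
print(pd.Series(y_train).value_counts(normalize=True).sort_index())
print(pd.Series(y_test).value_counts(normalize=True).sort_index())
```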
Plot a sample of the images.
fig, axs = plt.subplots(2, 5, figsize=(15, 5))
axs = axs.flatten()
for i in range(10):
    mask = (y_train == i)
    axs[i].imshow(X_train[mask][0].reshape(28, 28), cmap='gray_r')
plt.suptitle("A Sample of images")
plt.show()
As a baseline we will use the logistic regression model from scikit-learn.
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=30)
lr.fit(X_train, y_train)
/Users/troelslaegsgaard/Git/laegsgaardTroels/laegsgaardTroels.github.io/src/posts/2023-09-30-reinforcement-learning-cartpole-v0/envs/reinforcement-learning/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
LogisticRegression(max_iter=30)
y_hat_train = lr.predict(X_train)
y_hat_test = lr.predict(X_test)
print("Train Accuracy", np.mean(y_hat_train == y_train))
print("Test Accuracy", np.mean(y_hat_test == y_test))
Train Accuracy 0.9374
Test Accuracy 0.9008
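The ConvergenceWarning suggests increasing `max_iter` or scaling the data. A minimal sketch of the latter (assuming raw pixel intensities in 0-255, as in this dataset; `lr_scaled` is an illustrative name, not from the post):

```python
# Rescaling pixels to [0, 1] improves conditioning, so lbfgs typically
# needs far fewer iterations than on raw 0-255 intensities.
lr_scaled = LogisticRegression(max_iter=100)
lr_scaled.fit(X_train / 255.0, y_train)
print("Test Accuracy", np.mean(lr_scaled.predict(X_test / 255.0) == y_test))
```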
Next, a multiclass perceptron without an intercept: for each training example, if the predicted class differs from the true class, the input is added to the true class's weight column and subtracted from the predicted class's weight column.
class Perceptron:

    def fit(self, X, y):
        self.n_classes = len(np.unique(y))
        self.W = np.zeros(shape=(X.shape[1], self.n_classes))
        # Single pass over the data: on each misclassification, move the
        # true class's weights towards x and the predicted class's away.
        for x, y_ in zip(X, y):
            y_hat = self.predict(x)
            if y_hat != y_:
                self.W[:, y_] += x
                self.W[:, y_hat[0]] -= x

    def predict(self, X):
        if X.ndim == 1:
            X = X.reshape(1, -1)
        # Linear scores per class; predict the highest-scoring class.
        scores = X.dot(self.W)
        return np.argmax(scores, axis=1)
pt = Perceptron()
pt.fit(X_train, y_train)

y_hat_train = pt.predict(X_train)
y_hat_test = pt.predict(X_test)
print("Train Accuracy", np.mean(y_hat_train == y_train))
print("Test Accuracy", np.mean(y_hat_test == y_test))
Train Accuracy 0.8766
Test Accuracy 0.859
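To see which digits the perceptron confuses most (a quick sketch using scikit-learn, not part of the original post):

```python
from sklearn.metrics import confusion_matrix

# Rows are true digits, columns are predicted digits; off-diagonal
# entries show which pairs the perceptron mixes up.
print(confusion_matrix(y_test, y_hat_test))
```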
Feel free to comment below. A GitHub account is required.