import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# pandas reads the zipped CSV directly
mnist = pd.read_csv('mnist.zip')
mnist.head()
label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 785 columns

mnist.shape
(42000, 785)

Each row is one image: the label followed by 784 = 28 × 28 pixel intensities.

Resampling

Split the data into train and test sets, and investigate the distribution of digits in each partition.

y = mnist['label'].values
X = mnist[[c for c in mnist.columns if c != 'label']].values

# shuffle the row indices, then carve out disjoint train and test partitions
idx = np.arange(len(y))
np.random.shuffle(idx)

train_idx = idx[:10000]
test_idx = idx[10000:15000]

X_train = X[train_idx]
y_train = y[train_idx]
X_test = X[test_idx]
y_test = y[test_idx]
fig, axs = plt.subplots(2, figsize=(10, 8))
pd.DataFrame({'digit': y_train}).groupby('digit').size().plot.barh(ax=axs[0])
pd.DataFrame({'digit': y_test}).groupby('digit').size().plot.barh(ax=axs[1])
axs[0].set_title('Train')
axs[1].set_title('Test')
plt.show()
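
Because the split is random, the two distributions should look roughly alike but not identical. If you want the digit proportions preserved exactly, scikit-learn's train_test_split can stratify on the label. A minimal sketch, not used in the rest of this post; the suffixed names and the random_state are choices made here:

from sklearn.model_selection import train_test_split

# stratify=y forces both partitions to have the same digit proportions
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, train_size=10000, test_size=5000, stratify=y, random_state=0)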

Plot a sample of the images.

fig, axs = plt.subplots(2, 5, figsize=(15, 5))
axs = axs.flatten()
for i in range(10):
    # show the first training image of each digit
    mask = (y_train == i)
    axs[i].imshow(X_train[mask][0].reshape(28, 28), cmap='gray_r')
plt.suptitle("A sample of images")
plt.show()

Baseline: Logistic Regression

As a baseline we will use the logistic regression model from scikit-learn. The pixels are left unscaled and max_iter is kept small, so the solver warns that it did not fully converge.

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=30)
lr.fit(X_train, y_train)
ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
LogisticRegression(max_iter=30)
y_hat_train = lr.predict(X_train)
y_hat_test = lr.predict(X_test)

print("Train Accuracy", np.mean(y_hat_train == y_train))
print("Test Accuracy", np.mean(y_hat_test == y_test))
Train Accuracy 0.9374
Test Accuracy 0.9008
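
The warning itself points at a fix: scale the data. A minimal sketch of the same baseline on pixels divided by 255 (assuming the usual 0–255 intensity range); lr_scaled is just a fresh name, and the resulting accuracy will depend on the random split:

from sklearn.linear_model import LogisticRegression

# rescale pixel intensities from [0, 255] to [0, 1] so lbfgs converges faster
lr_scaled = LogisticRegression(max_iter=30)
lr_scaled.fit(X_train / 255.0, y_train)
print("Test Accuracy", np.mean(lr_scaled.predict(X_test / 255.0) == y_test))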

Multiclass Perceptron

A multiclass perceptron without an intercept: the model keeps one weight column per class and predicts the class whose column has the largest dot product with the input. On each misclassified example, the input is added to the true class's column and subtracted from the predicted class's column.

class Perceptron:

    def fit(self, X, y):
        self.n_classes = len(np.unique(y))
        # one weight column per class
        self.W = np.zeros(shape=(X.shape[1], self.n_classes))
        # a single pass over the training data
        for x, y_ in zip(X, y):
            y_hat = self.predict(x)[0]
            if y_hat != y_:
                # reinforce the true class, penalize the wrongly predicted one
                self.W[:, y_] += x
                self.W[:, y_hat] -= x

    def predict(self, X):
        if X.ndim == 1:
            X = X.reshape(1, -1)
        scores = X.dot(self.W)
        return np.argmax(scores, axis=1)
pt = Perceptron()
pt.fit(X_train, y_train)
y_hat_train = pt.predict(X_train)
y_hat_test = pt.predict(X_test)
print("Train Accuracy", np.mean(y_hat_train == y_train))
print("Test Accuracy", np.mean(y_hat_test == y_test))
Train Accuracy 0.8766
Test Accuracy 0.859
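
fit above makes only a single pass over the training data. Running several reshuffled passes (epochs) is a common extension. A minimal sketch reusing the class above; EpochPerceptron and the default of 5 epochs are choices made here, not part of the original model:

class EpochPerceptron(Perceptron):

    def fit(self, X, y, epochs=5):
        self.n_classes = len(np.unique(y))
        self.W = np.zeros(shape=(X.shape[1], self.n_classes))
        for _ in range(epochs):
            # reshuffle so each pass sees the examples in a new order
            order = np.random.permutation(len(y))
            for x, y_ in zip(X[order], y[order]):
                y_hat = self.predict(x)[0]
                if y_hat != y_:
                    # same update rule: reinforce true class, penalize prediction
                    self.W[:, y_] += x
                    self.W[:, y_hat] -= x

ep = EpochPerceptron()
ep.fit(X_train, y_train)
print("Test Accuracy", np.mean(ep.predict(X_test) == y_test))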

Comments

Feel free to comment below. A GitHub account is required.