Purpose: Demonstrate the unsupervised learning trick on the MNIST data using a Random Forest.
The trick turns an unsupervised problem into a classification problem by adding a second class of simulated data.
The data is simulated by sampling each feature independently from its univariate empirical distribution (not from the joint, multivariate empirical distribution). For this new class the features are therefore independent with respect to the empirical distribution:
\[\mathbb{P}(X_1, \dots, X_n) = \mathbb{P}(X_1) \cdots \mathbb{P}(X_n)\]
As explained by Adele Cutler in [1]:
The Unsupervised Learning Trick. Label the “real” data as class 1. Construct cases from a synthetic second class as follows:
The Synthetic Second Class. The synthetic second class has the same marginal distributions as the “real” data, but we have destroyed all the dependencies between the variables. Now we have a 2-class classification problem.
Run random forests!
[1] Leo Breiman and Adele Cutler, slides 57-58, state-of-the-art-data-mining-using-random-forest-leo-breiman-adele-cutler.pdf
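To make the idea concrete before the MNIST run, here is a minimal sketch (toy data of my own, not part of the original notebook) showing that per-feature sampling preserves the marginals but destroys the dependence between two correlated variables:

import numpy as np

rng = np.random.default_rng(0)
x1 = rng.normal(size=1000)
x2 = x1 + 0.1 * rng.normal(size=1000)  # x2 depends strongly on x1
X = np.column_stack([x1, x2])

# Sample each column independently from its own observed values.
X_synth = np.column_stack(
    [rng.choice(X[:, j], size=1000) for j in range(X.shape[1])]
)

print(np.corrcoef(X, rowvar=False)[0, 1])        # near 1: dependent
print(np.corrcoef(X_synth, rowvar=False)[0, 1])  # near 0: dependence destroyed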
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import dataclasses
import matplotlib.pyplot as plt
from sklearn import metrics
import json
@dataclasses.dataclass
class Config:
    train_images_path: str = 'data/processed/train-images-idx3-ubyte'
    train_labels_path: str = 'data/processed/train-labels-idx1-ubyte'
    train_sample_size: int = 60000
    test_images_path: str = 'data/processed/t10k-images-idx3-ubyte'
    test_labels_path: str = 'data/processed/t10k-labels-idx1-ubyte'
    test_sample_size: int = 10000
    with_unsupervised_learning_trick: bool = True
    seed: int = 42
    train_n_unknowns: int = 6000
    test_n_unknowns: int = 1000
    random_forest_kwargs: dict = dataclasses.field(
        default_factory=lambda: {
            'n_estimators': 500,
            'max_depth': 13,
            'random_state': 42,
            'n_jobs': -1,
        }
    )

config = Config()
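The defaults reproduce the full run below; for a quick smoke test one could override them (hypothetical values, not used in this post):

config_smoke_test = Config(
    train_sample_size=5000,
    test_sample_size=1000,
    train_n_unknowns=500,
    test_n_unknowns=100,
)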
def load_images(images_path):
    with open(images_path, 'rb') as f:
        magic_number = int.from_bytes(f.read(4), 'big')
        n_images = int.from_bytes(f.read(4), 'big')
        # The IDX3 header stores the row count before the column count.
        n_rows = int.from_bytes(f.read(4), 'big')
        n_columns = int.from_bytes(f.read(4), 'big')
        images = np.frombuffer(f.read(), dtype=np.uint8)
        images = images.reshape(n_images, n_rows * n_columns)
    return magic_number, n_images, n_rows, n_columns, images
def load_labels(labels_path):
    with open(labels_path, 'rb') as f:
        magic_number = int.from_bytes(f.read(4), 'big')
        n_items = int.from_bytes(f.read(4), 'big')
        labels = np.frombuffer(f.read(), dtype=np.uint8)
    return magic_number, n_items, labels
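As a quick sanity check (a sketch on my part, assuming the files sit at the configured paths), the IDX headers can be verified: image files start with the magic number 2051 and label files with 2049.

# Verify the IDX magic numbers: 2051 for image files, 2049 for label files.
magic, n, rows, cols, imgs = load_images(config.train_images_path)
assert magic == 2051 and imgs.shape == (n, rows * cols)

magic, n, lbls = load_labels(config.train_labels_path)
assert magic == 2049 and lbls.shape == (n,)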
def load_train(config):
    _, _, _, _, images = load_images(
        images_path=config.train_images_path
    )
    _, _, labels = load_labels(
        labels_path=config.train_labels_path,
    )
    images = images[0:config.train_sample_size, :]
    labels = labels[0:config.train_sample_size]
    return images, labels
def load_test(config):
    _, _, _, _, images = load_images(
        images_path=config.test_images_path
    )
    _, _, labels = load_labels(
        labels_path=config.test_labels_path,
    )
    images = images[0:config.test_sample_size, :]
    labels = labels[0:config.test_sample_size]
    return images, labels
def unsupervised_learning_trick(X, y, n_unknowns):
    # Create the simulated observations.
    features = []
    for feature_idx in range(X.shape[1]):
        # Randomly select the value for this variable from all of its observed values.
        random_idxs = np.random.choice(X.shape[0], size=n_unknowns)
        feature = X[random_idxs, feature_idx]

        # Save.
        features.append(feature)

    # Combine. Unknowns are assigned label -1.
    X_unknown = np.column_stack(features)
    y_unknown = np.repeat(-1, n_unknowns)

    # Concatenate with the original data.
    X = np.r_[X, X_unknown]
    y = np.r_[y, y_unknown]

    return X, y
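The per-feature loop above is easy to follow; an equivalent vectorized variant (my own sketch, not the original code) draws one row index per cell, so every column is still sampled independently from its own marginal:

def unsupervised_learning_trick_vectorized(X, y, n_unknowns, rng=None):
    rng = np.random.default_rng() if rng is None else rng
    # One independent row index per (row, column) cell of the synthetic block.
    idxs = rng.integers(0, X.shape[0], size=(n_unknowns, X.shape[1]))
    X_unknown = np.take_along_axis(X, idxs, axis=0)
    y_unknown = np.full(n_unknowns, -1)
    return np.r_[X, X_unknown], np.r_[y, y_unknown]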
def evaluate(y_true, y_pred):
    print(metrics.classification_report(y_true, y_pred))
    fig, ax = plt.subplots(figsize=(12, 12))
    metrics.ConfusionMatrixDisplay.from_predictions(y_true, y_pred, ax=ax, normalize='true')
    plt.show()
def eyeball(X, y, y_hat, y_prob, n_max_eyeballs=10):
    for row_idx in range(min(X.shape[0], n_max_eyeballs)):
        print('True Label', y[row_idx])
        print('Predicted Label', y_hat[row_idx])
        print('Predicted Probability', np.max(y_prob[row_idx]))
        plt.figure()
        plt.imshow(X[row_idx, :].reshape(28, 28))
        plt.show()
# Set a seed for reproducibility.
np.random.seed(config.seed)
X_train, y_train = load_train(config)
if config.with_unsupervised_learning_trick:
    X_train, y_train = unsupervised_learning_trick(
        X_train, y_train, n_unknowns=config.train_n_unknowns
    )

rf = RandomForestClassifier(**config.random_forest_kwargs)
rf.fit(X_train, y_train)
y_hat_train = rf.predict(X_train)
evaluate(y_train, y_hat_train)
precision recall f1-score support
-1 1.00 1.00 1.00 6000
0 1.00 1.00 1.00 5923
1 0.99 0.99 0.99 6742
2 0.99 0.99 0.99 5958
3 0.99 0.98 0.99 6131
4 0.99 0.98 0.99 5842
5 1.00 0.99 1.00 5421
6 1.00 1.00 1.00 5918
7 0.98 0.98 0.98 6265
8 0.99 0.99 0.99 5851
9 0.97 0.98 0.97 5949
accuracy 0.99 66000
macro avg 0.99 0.99 0.99 66000
weighted avg 0.99 0.99 0.99 66000
X_test, y_test = load_test(config)
if config.with_unsupervised_learning_trick:
    X_test, y_test = unsupervised_learning_trick(
        X_test, y_test, n_unknowns=config.test_n_unknowns
    )

y_hat_test = rf.predict(X_test)
y_prob_test = rf.predict_proba(X_test)
evaluate(y_test, y_hat_test)
precision recall f1-score support
-1 0.98 1.00 0.99 1000
0 0.97 0.99 0.98 980
1 0.98 0.99 0.99 1135
2 0.95 0.96 0.95 1032
3 0.95 0.95 0.95 1010
4 0.97 0.95 0.96 982
5 0.97 0.95 0.96 892
6 0.97 0.98 0.97 958
7 0.97 0.94 0.95 1028
8 0.96 0.94 0.95 974
9 0.93 0.95 0.94 1009
accuracy 0.96 11000
macro avg 0.96 0.96 0.96 11000
weighted avg 0.96 0.96 0.96 11000
not_unknown_mask = y_test != -1
print("Total", sum(not_unknown_mask))
print()
eyeball(
    X=X_test[not_unknown_mask],
    y=y_test[not_unknown_mask],
    y_hat=y_hat_test[not_unknown_mask],
    y_prob=y_prob_test[not_unknown_mask]
)
Total 10000
True Label 7
Predicted Label 7
Predicted Probability 0.9793516538192172
True Label 2
Predicted Label 2
Predicted Probability 0.7504016701535301
True Label 1
Predicted Label 1
Predicted Probability 0.9738755382964321
True Label 0
Predicted Label 0
Predicted Probability 0.9352415713509062
True Label 4
Predicted Label 4
Predicted Probability 0.8931474951134427
True Label 1
Predicted Label 1
Predicted Probability 0.9789041021483729
True Label 4
Predicted Label 4
Predicted Probability 0.6839885479117203
True Label 9
Predicted Label 9
Predicted Probability 0.5554514860202037
True Label 5
Predicted Label 5
Predicted Probability 0.2591955406086213
True Label 9
Predicted Label 9
Predicted Probability 0.7983953847065036
unknown_mask = y_hat_test == -1
print("Total", sum(unknown_mask))
print()
eyeball(
    X=X_test[unknown_mask],
    y=y_test[unknown_mask],
    y_hat=y_hat_test[unknown_mask],
    y_prob=y_prob_test[unknown_mask],
)
Total 1022
True Label 2
Predicted Label -1
Predicted Probability 0.2222329669340371
True Label 8
Predicted Label -1
Predicted Probability 0.1951142756865323
True Label 8
Predicted Label -1
Predicted Probability 0.17248933928188173
True Label 7
Predicted Label -1
Predicted Probability 0.22828327150130867
True Label 8
Predicted Label -1
Predicted Probability 0.30708866966945164
True Label 4
Predicted Label -1
Predicted Probability 0.19787391342365956
True Label 8
Predicted Label -1
Predicted Probability 0.225606998174814
True Label 8
Predicted Label -1
Predicted Probability 0.22178239104303824
True Label 0
Predicted Label -1
Predicted Probability 0.20292505069056974
True Label 1
Predicted Label -1
Predicted Probability 0.16867325565278937
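1022 cases are predicted as unknown while only 1000 synthetic cases were added, so a couple of dozen real digits land in the synthetic class (consistent with the test report above: recall 1.00 but precision 0.98 for class -1). A short follow-up sketch, assuming the variables above are still in scope, breaks the -1 predictions down by true label:

# Which true labels hide behind the predictions of class -1?
labels, counts = np.unique(y_test[unknown_mask], return_counts=True)
for label, count in zip(labels, counts):
    print('True label', label, 'count', count)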