Purpose: Show an example of how to cluster numerical features using their correlation.
The number \(\rho(X, Y)^2\) is called the coefficient of determination. It measures how much of the variation in \(Y\) can be explained by a linear relationship with \(X\), see [1]. Conversely, \(1 - \rho(X, Y)^2\) is the proportion of the variation in \(Y\) that is left unexplained by a linear relationship with \(X\).
[1] Page 212-214. Peter Olofsson. Probability, Statistics, and Stochastic Processes.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from scipy.cluster import hierarchy as hc
# Default size (in inches) for every figure created below.
matplotlib.rcParams['figure.figsize'] = (12, 12)
# Load the apartment data set and show the pairwise correlations of its columns.
aarhus_apartments = pd.read_csv('aarhus_apartments.csv')
aarhus_apartments.corr()
| | zip_code | price | rooms | size | build_year | latitude | longitude |
---|---|---|---|---|---|---|---|
zip_code | 1.000000 | -0.145322 | -0.129532 | -0.146553 | 0.010671 | 0.622659 | 0.244359 |
price | -0.145322 | 1.000000 | 0.416454 | 0.567506 | -0.075140 | 0.004019 | 0.181975 |
rooms | -0.129532 | 0.416454 | 1.000000 | 0.795884 | -0.127959 | -0.050889 | 0.081551 |
size | -0.146553 | 0.567506 | 0.795884 | 1.000000 | -0.064468 | 0.003794 | 0.225137 |
build_year | 0.010671 | -0.075140 | -0.127959 | -0.064468 | 1.000000 | 0.061176 | 0.062275 |
latitude | 0.622659 | 0.004019 | -0.050889 | 0.003794 | 0.061176 | 1.000000 | 0.534906 |
longitude | 0.244359 | 0.181975 | 0.081551 | 0.225137 | 0.062275 | 0.534906 | 1.000000 |
def correlation_matrix(df):
    """Plots the correlation matrix.

    Computes ``df.corr()`` and draws it as a colour-coded matrix with the
    column names as tick labels on both axes, a colour bar and a title.
    The figure is displayed with ``plt.show()``; nothing is returned.

    Args:
        df (pd.DataFrame): A pandas DataFrame.
    """
    corr = df.corr()

    # Draw into an explicitly created figure so matshow does not open its own.
    f = plt.figure()
    plt.matshow(corr, fignum=f.number)

    # One tick per column; x labels are rotated so they are less likely to overlap.
    tick_positions = range(len(corr.columns))
    plt.xticks(tick_positions,
               corr.columns,
               fontsize=14,
               rotation=45,
               )
    plt.yticks(tick_positions,
               corr.columns,
               fontsize=14,
               )

    cb = plt.colorbar()
    cb.ax.tick_params(labelsize=14)
    plt.title('Correlation Matrix', fontsize=16)
    plt.show()
# Plot the correlation matrix of the apartment data.
correlation_matrix(aarhus_apartments)
def correlation_dendogram(df, method='single'):
    """Used to plot the dendrogram from a correlation matrix.

    1 - corr ** 2 can be interpreted as the unexplained variance
    from a linear model between a bivariate distribution (X, Y).
    This can be interpreted as a distance matrix, which is then fed to
    agglomerative (hierarchical) clustering.

    The figure is displayed with ``plt.show()``; nothing is returned.

    Args:
        df (pd.DataFrame): A pandas DataFrame whose column correlations
            are clustered.
        method (str): Linkage method forwarded to
            ``scipy.cluster.hierarchy.linkage``. Defaults to 'single'.
    """
    plt.figure()

    corr = df.corr()
    # NOTE(review): the rounding presumably removes floating point noise so
    # that the diagonal of the distance matrix is exactly zero, which
    # squareform checks for - confirm before changing the precision.
    corr = np.round(corr, 2)
    distance_matrix = 1 - corr ** 2

    # linkage expects the condensed (upper-triangular, flattened) form.
    corr_condensed = hc.distance.squareform(distance_matrix)
    z = hc.linkage(corr_condensed, method=method)

    hc.dendrogram(
        z,
        labels=corr.columns,
        orientation='left',
        leaf_font_size=16,
    )
    # Report the linkage actually used rather than always claiming "single".
    plt.title(("Agglomerative Clustering, {} linkage with "
               "distance = $1 - \\rho^2$").format(method)
              )
    plt.xlabel("Unexplained Variance")
    plt.show()
# Cluster the apartment features using the default linkage method ('single').
correlation_dendogram(aarhus_apartments)
Feel free to comment here below. A Github account is required.