#!/usr/bin/env python # coding: utf-8 # ### Homeworks 1 and 2 # # David Allemang. STOR 881.001.FA25. # # Since the dataset and visualization content are the same, I've done both homeworks in the same notebook and will submit them together. # Basic data loading. Each cell re-loads local variables from one of these datasets so they can be executed in any order. # In[ ]: import matplotlib.pyplot as plt import numpy as np import pandas as pd from scipy.stats import gaussian_kde np.set_printoptions(suppress=True, precision=3) set_i = pd.read_excel( "data/AppleBananaPearSetI.xlsx", sheet_name=["MainData", "ClassLabels"], index_col=None, header=None, ) set_ii = pd.read_excel( "data/AppleBananaPearSetII.xlsx", sheet_name=["MainData", "ClassLabels"], index_col=None, header=None, ) # The workhorse scatterplot matrix implementation. Uses a variated layout from discussed in class, but I think it is easier to read. # In[ ]: def scatterplot_matrix( data: np.ndarray, c=None, alpha=0.10, axislabels: list[str] | None = None, bw_method: str | float | None = None, ): """ Builds a scatterplot matrix on D-dimensional data. The diagonal plots show the coordinates on the line x=y so that correspondent points are unconditionally aligned within each row or column. The diagonal plots are not available for KDE, then, so the KDE is shown along the top and left edges instead. :param data: Shape (D, N) data matrix. :param c: Point labels (color-mapped). :param alpha: Point marker alpha to mitigate overdraw. :param axislabels: Coordinate axis labels. """ D = len(data) if axislabels is None: axislabels = (f"$x_{i + 1}$ coordinate" for i in range(D)) figure, axes = plt.subplots( D + 1, D + 1, figsize=(10, 10), sharex="col", sharey="row", width_ratios=[1] + [3] * D, height_ratios=[1] + [3] * D, ) axes = np.array(axes) # for easier iterating space = np.linspace(-4, 4, 100) ticks = np.linspace(-4, 4, 5) # Populate KDE charts. for row_idx, (coord, label) in enumerate(zip(data, axislabels)): kde = gaussian_kde(coord, bw_method=bw_method)(space) row: plt.Axes = axes[row_idx + 1, 0] row.plot(kde, space, c="k", linewidth=2) row.spines[["left", "bottom", "top"]].set_visible(False) row.tick_params( left=False, right=True, bottom=False, top=False, labelleft=False, labelbottom=False, ) row.set_ylabel(label) row.xaxis.set_inverted(True) row.set_yticks(ticks) row.set_aspect(1 / 2) col: plt.Axes = axes[0, row_idx + 1] col.plot(space, kde, c="k", linewidth=2) col.spines[["left", "right", "top"]].set_visible(False) col.tick_params( left=False, right=False, bottom=True, top=False, labelleft=False, labelbottom=False, ) col.set_xlabel(label) col.xaxis.set_label_position("top") col.set_xticks(ticks) col.set_aspect(2) axes[row_idx + 1, 1].tick_params(labelleft=True) axes[row_idx + 1, -1].tick_params(labelright=True) axes[1, row_idx + 1].tick_params(labeltop=True) axes[-1, row_idx + 1].tick_params(labelbottom=True) # Populate primary matrix. for row_idx, col_idx in np.ndindex((D, D)): ax: plt.Axes = axes[row_idx + 1, col_idx + 1] ax.set_axisbelow(True) ax.grid(True) ax.tick_params( left=True, right=True, bottom=True, top=True, ) ax.set_aspect(1) if row_idx == col_idx: ax.plot( [0, 1], [0, 1], transform=ax.transAxes, c="k", linewidth=1, zorder=0, ) ax.scatter(data[col_idx], data[row_idx], c=c, alpha=alpha) # Remove the plot in the KDE diagonal entry. axes[0, 0].remove() figure.tight_layout() # # Assignment I - Apple Banana Pear # # The dataset is so named because it contains three point clusters vaguely shaped like those fruit (and the red, green, and yellow color labels helpfully match). The apple (spheroid distribution) and pear (oblong distribution) are clearly visible in the $x_{12}$ and $x_{23}$ plots, while the banana (crescent distribution) is clearly visible in the $x_{13}$ plot. # In[ ]: data = set_i["MainData"].to_numpy() labels = set_i["ClassLabels"].to_numpy().squeeze() residual = data - data.mean(axis=1, keepdims=True) scatterplot_matrix(residual, c=labels, bw_method=0.2) plt.suptitle("Apple-Banana-Pear I Mean Residuals.") plt.tight_layout() plt.show() # # Assignment II - Rotated Apple Banana Pear # # The same plot with the rotated data. The fruit are still visible to the eye, but the clusters overlap horribly in all three views. # In[ ]: data = set_ii["MainData"].to_numpy() labels = set_ii["ClassLabels"].to_numpy().squeeze() residual = data - data.mean(axis=1, keepdims=True) scatterplot_matrix(residual, c=labels, bw_method=0.2) plt.suptitle("Apple-Banana-Pear II Mean Residuals.") plt.tight_layout() plt.show() # # Assignment II - PCA on Apple Banana Pear # # Extract the principal modes of variation (eigenvectors of the covariance matrix), and show the scatterplot matrix of the data transformed into that eigenbasis. Additionally output the covariance, eigenbasis, and transformation matrices. # In[ ]: data = set_ii["MainData"].to_numpy() labels = set_ii["ClassLabels"].to_numpy().squeeze() residual = data - data.mean(axis=1, keepdims=True) covariance = residual @ residual.T / (data.shape[1] - 1) # or np.cov(data) print("covariance:", covariance, sep="\n", end="\n\n") res = np.linalg.eig(covariance) index = np.argsort(-res.eigenvalues) eigenvalues = res.eigenvalues[index] print("eigenvalues:", eigenvalues, sep="\n", end="\n\n") eigenvectors = res.eigenvectors[:, index] print("eigenvectors:", eigenvectors, sep="\n", end="\n\n") to_eigenbasis = np.linalg.inv(eigenvectors) print("transform to eigenbasis:", to_eigenbasis, sep="\n", end="\n\n") scatterplot_matrix( to_eigenbasis @ residual, c=labels, axislabels=["PC 1", "PC 2", "PC 3"], bw_method=0.2, ) plt.suptitle("Apple-Banana-Pear II Principal Components") plt.tight_layout() plt.show() # Out of curiosity and as a sanity check, I apply PCA on the non-rotated dataset. Indeed the principal components are the same. Some notes: # # - The covariance matrix is approximately diagonal. # - The eigenvalues are identical. # - The eigenbasis is approximately a permutation which sorts the variances. # In[ ]: data = set_i["MainData"].to_numpy() labels = set_i["ClassLabels"].to_numpy().squeeze() residual = data - data.mean(axis=1, keepdims=True) covariance = residual @ residual.T / (data.shape[1] - 1) # or np.cov(data) print("covariance:", covariance, sep="\n", end="\n\n") res = np.linalg.eig(covariance) index = np.argsort(-res.eigenvalues) eigenvalues = res.eigenvalues[index] print("eigenvalues:", eigenvalues, sep="\n", end="\n\n") eigenvectors = res.eigenvectors[:, index] print("eigenvectors:", eigenvectors, sep="\n", end="\n\n") to_eigenbasis = np.linalg.inv(eigenvectors) print("transform to eigenbasis:", to_eigenbasis, sep="\n", end="\n\n") scatterplot_matrix( to_eigenbasis @ residual, c=labels, axislabels=["PC 1", "PC 2", "PC 3"], bw_method=0.2, ) plt.suptitle("Apple-Banana-Pear I Principal Components") plt.tight_layout() plt.show()