Fall 2025

2026-05-25 11:32:47 -04:00
commit 2e1103d295
5 changed files with 362 additions and 0 deletions
--- a/2.py
+++ b/2.py
@@ -0,0 +1,253 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# ### Homeworks 1 and 2
+# 
+# David Allemang. STOR 881.001.FA25.
+# 
+# Since the dataset and visualization content are the same, I've done both homeworks in the same notebook and will submit them together.
+
+# Basic data loading. Each cell re-loads local variables from one of these datasets so they can be executed in any order.
+
+# In[ ]:
+
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from scipy.stats import gaussian_kde
+
+np.set_printoptions(suppress=True, precision=3)
+
+set_i = pd.read_excel(
+    "data/AppleBananaPearSetI.xlsx",
+    sheet_name=["MainData", "ClassLabels"],
+    index_col=None,
+    header=None,
+)
+
+set_ii = pd.read_excel(
+    "data/AppleBananaPearSetII.xlsx",
+    sheet_name=["MainData", "ClassLabels"],
+    index_col=None,
+    header=None,
+)
+
+# The workhorse scatterplot matrix implementation. Uses a variated layout from discussed in class, but I think it is easier to read.
+
+# In[ ]:
+
+
+def scatterplot_matrix(
+        data: np.ndarray,
+        c=None,
+        alpha=0.10,
+        axislabels: list[str] | None = None,
+        bw_method: str | float | None = None,
+):
+    """
+    Builds a scatterplot matrix on D-dimensional data. The diagonal plots
+    show the coordinates on the line x=y so that correspondent points are
+    unconditionally aligned within each row or column. The diagonal plots are
+    not available for KDE, then, so the KDE is shown along the top and left
+    edges instead.
+
+    :param data: Shape (D, N) data matrix.
+    :param c: Point labels (color-mapped).
+    :param alpha: Point marker alpha to mitigate overdraw.
+    :param axislabels: Coordinate axis labels.
+    """
+
+    D = len(data)
+
+    if axislabels is None:
+        axislabels = (f"$x_{i + 1}$ coordinate" for i in range(D))
+
+    figure, axes = plt.subplots(
+        D + 1,
+        D + 1,
+        figsize=(10, 10),
+        sharex="col",
+        sharey="row",
+        width_ratios=[1] + [3] * D,
+        height_ratios=[1] + [3] * D,
+    )
+    axes = np.array(axes)  # for easier iterating
+
+    space = np.linspace(-4, 4, 100)
+    ticks = np.linspace(-4, 4, 5)
+
+    # Populate KDE charts.
+    for row_idx, (coord, label) in enumerate(zip(data, axislabels)):
+        kde = gaussian_kde(coord, bw_method=bw_method)(space)
+
+        row: plt.Axes = axes[row_idx + 1, 0]
+        row.plot(kde, space, c="k", linewidth=2)
+        row.spines[["left", "bottom", "top"]].set_visible(False)
+        row.tick_params(
+            left=False,
+            right=True,
+            bottom=False,
+            top=False,
+            labelleft=False,
+            labelbottom=False,
+        )
+        row.set_ylabel(label)
+        row.xaxis.set_inverted(True)
+        row.set_yticks(ticks)
+        row.set_aspect(1 / 2)
+
+        col: plt.Axes = axes[0, row_idx + 1]
+        col.plot(space, kde, c="k", linewidth=2)
+        col.spines[["left", "right", "top"]].set_visible(False)
+        col.tick_params(
+            left=False,
+            right=False,
+            bottom=True,
+            top=False,
+            labelleft=False,
+            labelbottom=False,
+        )
+        col.set_xlabel(label)
+        col.xaxis.set_label_position("top")
+        col.set_xticks(ticks)
+        col.set_aspect(2)
+
+        axes[row_idx + 1, 1].tick_params(labelleft=True)
+        axes[row_idx + 1, -1].tick_params(labelright=True)
+        axes[1, row_idx + 1].tick_params(labeltop=True)
+        axes[-1, row_idx + 1].tick_params(labelbottom=True)
+
+    # Populate primary matrix.
+    for row_idx, col_idx in np.ndindex((D, D)):
+        ax: plt.Axes = axes[row_idx + 1, col_idx + 1]
+        ax.set_axisbelow(True)
+        ax.grid(True)
+        ax.tick_params(
+            left=True,
+            right=True,
+            bottom=True,
+            top=True,
+        )
+        ax.set_aspect(1)
+
+        if row_idx == col_idx:
+            ax.plot(
+                [0, 1],
+                [0, 1],
+                transform=ax.transAxes,
+                c="k",
+                linewidth=1,
+                zorder=0,
+            )
+
+        ax.scatter(data[col_idx], data[row_idx], c=c, alpha=alpha)
+
+    # Remove the plot in the KDE diagonal entry.
+    axes[0, 0].remove()
+
+    figure.tight_layout()
+
+# # Assignment I - Apple Banana Pear
+# 
+# The dataset is so named because it contains three point clusters vaguely shaped like those fruit (and the red, green, and yellow color labels helpfully match). The apple (spheroid distribution) and pear (oblong distribution) are clearly visible in the $x_{12}$ and $x_{23}$ plots, while the banana (crescent distribution) is clearly visible in the $x_{13}$ plot.
+
+# In[ ]:
+
+
+data = set_i["MainData"].to_numpy()
+labels = set_i["ClassLabels"].to_numpy().squeeze()
+residual = data - data.mean(axis=1, keepdims=True)
+
+scatterplot_matrix(residual, c=labels, bw_method=0.2)
+plt.suptitle("Apple-Banana-Pear I Mean Residuals.")
+plt.tight_layout()
+plt.show()
+
+# # Assignment II - Rotated Apple Banana Pear
+# 
+# The same plot with the rotated data. The fruit are still visible to the eye, but the clusters overlap horribly in all three views.
+
+# In[ ]:
+
+
+data = set_ii["MainData"].to_numpy()
+labels = set_ii["ClassLabels"].to_numpy().squeeze()
+
+residual = data - data.mean(axis=1, keepdims=True)
+scatterplot_matrix(residual, c=labels, bw_method=0.2)
+plt.suptitle("Apple-Banana-Pear II Mean Residuals.")
+plt.tight_layout()
+plt.show()
+
+# # Assignment II - PCA on Apple Banana Pear
+# 
+# Extract the principal modes of variation (eigenvectors of the covariance matrix), and show the scatterplot matrix of the data transformed into that eigenbasis. Additionally output the covariance, eigenbasis, and transformation matrices.
+
+# In[ ]:
+
+
+data = set_ii["MainData"].to_numpy()
+labels = set_ii["ClassLabels"].to_numpy().squeeze()
+residual = data - data.mean(axis=1, keepdims=True)
+
+covariance = residual @ residual.T / (data.shape[1] - 1)  # or np.cov(data)
+print("covariance:", covariance, sep="\n", end="\n\n")
+
+res = np.linalg.eig(covariance)
+index = np.argsort(-res.eigenvalues)
+eigenvalues = res.eigenvalues[index]
+print("eigenvalues:", eigenvalues, sep="\n", end="\n\n")
+
+eigenvectors = res.eigenvectors[:, index]
+print("eigenvectors:", eigenvectors, sep="\n", end="\n\n")
+
+to_eigenbasis = np.linalg.inv(eigenvectors)
+print("transform to eigenbasis:", to_eigenbasis, sep="\n", end="\n\n")
+
+scatterplot_matrix(
+    to_eigenbasis @ residual,
+    c=labels,
+    axislabels=["PC 1", "PC 2", "PC 3"],
+    bw_method=0.2,
+)
+plt.suptitle("Apple-Banana-Pear II Principal Components")
+plt.tight_layout()
+plt.show()
+
+# Out of curiosity and as a sanity check, I apply PCA on the non-rotated dataset. Indeed the principal components are the same. Some notes:
+# 
+# - The covariance matrix is approximately diagonal.
+# - The eigenvalues are identical.
+# - The eigenbasis is approximately a permutation which sorts the variances.
+
+# In[ ]:
+
+
+data = set_i["MainData"].to_numpy()
+labels = set_i["ClassLabels"].to_numpy().squeeze()
+residual = data - data.mean(axis=1, keepdims=True)
+
+covariance = residual @ residual.T / (data.shape[1] - 1)  # or np.cov(data)
+print("covariance:", covariance, sep="\n", end="\n\n")
+
+res = np.linalg.eig(covariance)
+index = np.argsort(-res.eigenvalues)
+eigenvalues = res.eigenvalues[index]
+print("eigenvalues:", eigenvalues, sep="\n", end="\n\n")
+
+eigenvectors = res.eigenvectors[:, index]
+print("eigenvectors:", eigenvectors, sep="\n", end="\n\n")
+
+to_eigenbasis = np.linalg.inv(eigenvectors)
+print("transform to eigenbasis:", to_eigenbasis, sep="\n", end="\n\n")
+
+scatterplot_matrix(
+    to_eigenbasis @ residual,
+    c=labels,
+    axislabels=["PC 1", "PC 2", "PC 3"],
+    bw_method=0.2,
+)
+plt.suptitle("Apple-Banana-Pear I Principal Components")
+plt.tight_layout()
+plt.show()
--- a/3.py
+++ b/3.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# ### Homework 3
+# 
+# David Allemang. STOR 881.001.FA25.
+# 
+# Verify that for all $x > 0$ and $\lambda \in \mathbb R$, $$\lim_{\lambda \to 0} \frac{x^\lambda - 1}{\lambda} = \ln x$$
+
+# Proof. Apply L'Hôpital's rule: $$\lim_{\lambda \to 0} \frac f g = \lim_{\lambda \to 0} \frac {f_\lambda}{g_\lambda}$$ where $f = x^\lambda - 1$ and $g = \lambda$.
+# 
+# $$\begin{align*}
+# \lim_{\lambda \to 0} \frac{x^\lambda - 1}{\lambda} &= 
+# \lim_{\lambda \to 0} \frac{x^\lambda \ln x - 0}{1}  \\
+# &= x^0 \ln x \\
+# &= \ln x
+# \end{align*}$$
+
+# Just for fun, let's plot the thing.
+
+# In[1]:
+
+
+import matplotlib.pyplot as plt
+from matplotlib.colors import Normalize
+import numpy as np
+
+x = 2 ** np.linspace(-2, 2, 50)
+l = np.linspace(-1, 1, 21)
+l = l[l != 0]
+X, L = np.meshgrid(x, l)
+Z = (X ** L - 1) / L
+
+cmap = plt.get_cmap('bwr')
+lnorm = Normalize(l.min(), l.max())
+
+# In[2]:
+
+
+plt.figure(figsize=(9, 6))
+plt.title("Box-Cox transform (linear scale)")
+
+lines = plt.plot(x, Z.T, lw=1.5, ls='-')
+lines[0].set_label(rf'$\lambda = {l.min()}$')
+lines[-1].set_label(rf'$\lambda = {l.max()}$')
+for line, color in zip(lines, cmap(lnorm(l))):
+    line.set_color(color)
+
+line, = plt.plot(x, np.log(x), lw=3, ls=':', c='k')
+line.set_label(r'$\ln x$')
+
+plt.legend()
+plt.show();
+
+# In[3]:
+
+
+plt.figure(figsize=(9, 6))
+plt.title("Box-Cox transform (log scale)")
+plt.gca().set_xscale('log', base=2)
+
+lines = plt.plot(x, Z.T, lw=1.5, ls='-')
+lines[0].set_label(rf'$\lambda = {l.min()}$')
+lines[-1].set_label(rf'$\lambda = {l.max()}$')
+for line, color in zip(lines, cmap(lnorm(l))):
+    line.set_color(color)
+
+line, = plt.plot(x, np.log(x), lw=3, ls=':', c='k')
+line.set_label(r'$\ln x$')
+
+plt.legend()
+plt.show();
--- a/4.py
+++ b/4.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# ### Homework 4
+# 
+# David Allemang. STOR 881.001.FA25.
+
+# #### 1. Flat subspace
+# 
+# Show the set of flat vectors $F = \{ \alpha \vec 1 : \alpha \in \mathbb R\}$ is a subspace of $\mathbb R^d$. (Where $\vec 1$ is the vector where all coefficients are $1$).
+# 
+# Proof.
+# 
+# Show that $F$ is closed under linear combination. Given flat vectors $\vec u = \alpha \vec 1$ and $\vec v = \beta \vec 1$, and scalars $a$, $b$, show that $a \vec u + b \vec v \in F$.
+# 
+# $$\begin{align*}
+# a \vec u + b \vec v &= a \alpha \vec 1 + b \beta \vec 1 \\
+# &= (a \alpha + b \beta) \vec 1 \\
+# &= \gamma \vec 1 \\
+# \end{align*}$$
+
+# #### 2. Projection onto flat subspace.
+# 
+# Show that the projection of a $d$-dimensional vector onto the space of flat vectors is the flat vector whose common entry is the mean of entries of the original vector.
+# 
+# Define the projection operator which projects $\vec x$ onto the line spanned by $\vec y$: $$P(\vec x, \vec y) = \frac{\langle \vec x, \vec y\rangle}{\langle \vec y, \vec y\rangle} \vec y$$
+# 
+# Proof.
+# 
+# The flat vector subspace is rank 1, spanned by any of its vectors. Pick $\vec 1$ as the basis element for easy arithmetic and consider the projection of an arbitrary vector $\vec x$ onto the line spanned by $\vec 1$:
+# 
+# $$\begin{align*}
+# P(\vec x, \vec 1) &= \frac{\langle \vec x, \vec 1\rangle}{\langle \vec 1, \vec 1\rangle} \vec 1 \\
+# &= \frac{\sum_{i=1}^d x_i \cdot 1}{\sum_{i=1}^d 1 \cdot 1} \vec 1 \\
+# &= \left (\frac 1 d \sum_{i=1}^d x_i \right) \vec 1 \\
+# &= \mu \vec 1
+# \end{align*}$$
--- a/data/AppleBananaPearSetI.xlsx
+++ b/data/AppleBananaPearSetI.xlsx
--- a/data/AppleBananaPearSetII.xlsx
+++ b/data/AppleBananaPearSetII.xlsx