Fall 2025

This commit is contained in:
David Allemang
2026-05-25 11:32:47 -04:00
commit 2e1103d295
5 changed files with 362 additions and 0 deletions

253
HW 1 HW 2.py Normal file
View File

@@ -0,0 +1,253 @@
#!/usr/bin/env python
# coding: utf-8
# ### Homeworks 1 and 2
#
# David Allemang. STOR 881.001.FA25.
#
# Since the dataset and visualization content are the same, I've done both homeworks in the same notebook and will submit them together.
# Basic data loading. Each cell re-loads local variables from one of these datasets so they can be executed in any order.
# In[ ]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde
np.set_printoptions(suppress=True, precision=3)
set_i = pd.read_excel(
"data/AppleBananaPearSetI.xlsx",
sheet_name=["MainData", "ClassLabels"],
index_col=None,
header=None,
)
set_ii = pd.read_excel(
"data/AppleBananaPearSetII.xlsx",
sheet_name=["MainData", "ClassLabels"],
index_col=None,
header=None,
)
# The workhorse scatterplot matrix implementation. Uses a variated layout from discussed in class, but I think it is easier to read.
# In[ ]:
def scatterplot_matrix(
data: np.ndarray,
c=None,
alpha=0.10,
axislabels: list[str] | None = None,
bw_method: str | float | None = None,
):
"""
Builds a scatterplot matrix on D-dimensional data. The diagonal plots
show the coordinates on the line x=y so that correspondent points are
unconditionally aligned within each row or column. The diagonal plots are
not available for KDE, then, so the KDE is shown along the top and left
edges instead.
:param data: Shape (D, N) data matrix.
:param c: Point labels (color-mapped).
:param alpha: Point marker alpha to mitigate overdraw.
:param axislabels: Coordinate axis labels.
"""
D = len(data)
if axislabels is None:
axislabels = (f"$x_{i + 1}$ coordinate" for i in range(D))
figure, axes = plt.subplots(
D + 1,
D + 1,
figsize=(10, 10),
sharex="col",
sharey="row",
width_ratios=[1] + [3] * D,
height_ratios=[1] + [3] * D,
)
axes = np.array(axes) # for easier iterating
space = np.linspace(-4, 4, 100)
ticks = np.linspace(-4, 4, 5)
# Populate KDE charts.
for row_idx, (coord, label) in enumerate(zip(data, axislabels)):
kde = gaussian_kde(coord, bw_method=bw_method)(space)
row: plt.Axes = axes[row_idx + 1, 0]
row.plot(kde, space, c="k", linewidth=2)
row.spines[["left", "bottom", "top"]].set_visible(False)
row.tick_params(
left=False,
right=True,
bottom=False,
top=False,
labelleft=False,
labelbottom=False,
)
row.set_ylabel(label)
row.xaxis.set_inverted(True)
row.set_yticks(ticks)
row.set_aspect(1 / 2)
col: plt.Axes = axes[0, row_idx + 1]
col.plot(space, kde, c="k", linewidth=2)
col.spines[["left", "right", "top"]].set_visible(False)
col.tick_params(
left=False,
right=False,
bottom=True,
top=False,
labelleft=False,
labelbottom=False,
)
col.set_xlabel(label)
col.xaxis.set_label_position("top")
col.set_xticks(ticks)
col.set_aspect(2)
axes[row_idx + 1, 1].tick_params(labelleft=True)
axes[row_idx + 1, -1].tick_params(labelright=True)
axes[1, row_idx + 1].tick_params(labeltop=True)
axes[-1, row_idx + 1].tick_params(labelbottom=True)
# Populate primary matrix.
for row_idx, col_idx in np.ndindex((D, D)):
ax: plt.Axes = axes[row_idx + 1, col_idx + 1]
ax.set_axisbelow(True)
ax.grid(True)
ax.tick_params(
left=True,
right=True,
bottom=True,
top=True,
)
ax.set_aspect(1)
if row_idx == col_idx:
ax.plot(
[0, 1],
[0, 1],
transform=ax.transAxes,
c="k",
linewidth=1,
zorder=0,
)
ax.scatter(data[col_idx], data[row_idx], c=c, alpha=alpha)
# Remove the plot in the KDE diagonal entry.
axes[0, 0].remove()
figure.tight_layout()
# # Assignment I - Apple Banana Pear
#
# The dataset is so named because it contains three point clusters vaguely shaped like those fruit (and the red, green, and yellow color labels helpfully match). The apple (spheroid distribution) and pear (oblong distribution) are clearly visible in the $x_{12}$ and $x_{23}$ plots, while the banana (crescent distribution) is clearly visible in the $x_{13}$ plot.
# In[ ]:
data = set_i["MainData"].to_numpy()
labels = set_i["ClassLabels"].to_numpy().squeeze()
residual = data - data.mean(axis=1, keepdims=True)
scatterplot_matrix(residual, c=labels, bw_method=0.2)
plt.suptitle("Apple-Banana-Pear I Mean Residuals.")
plt.tight_layout()
plt.show()
# # Assignment II - Rotated Apple Banana Pear
#
# The same plot with the rotated data. The fruit are still visible to the eye, but the clusters overlap horribly in all three views.
# In[ ]:
data = set_ii["MainData"].to_numpy()
labels = set_ii["ClassLabels"].to_numpy().squeeze()
residual = data - data.mean(axis=1, keepdims=True)
scatterplot_matrix(residual, c=labels, bw_method=0.2)
plt.suptitle("Apple-Banana-Pear II Mean Residuals.")
plt.tight_layout()
plt.show()
# # Assignment II - PCA on Apple Banana Pear
#
# Extract the principal modes of variation (eigenvectors of the covariance matrix), and show the scatterplot matrix of the data transformed into that eigenbasis. Additionally output the covariance, eigenbasis, and transformation matrices.
# In[ ]:
data = set_ii["MainData"].to_numpy()
labels = set_ii["ClassLabels"].to_numpy().squeeze()
residual = data - data.mean(axis=1, keepdims=True)
covariance = residual @ residual.T / (data.shape[1] - 1) # or np.cov(data)
print("covariance:", covariance, sep="\n", end="\n\n")
res = np.linalg.eig(covariance)
index = np.argsort(-res.eigenvalues)
eigenvalues = res.eigenvalues[index]
print("eigenvalues:", eigenvalues, sep="\n", end="\n\n")
eigenvectors = res.eigenvectors[:, index]
print("eigenvectors:", eigenvectors, sep="\n", end="\n\n")
to_eigenbasis = np.linalg.inv(eigenvectors)
print("transform to eigenbasis:", to_eigenbasis, sep="\n", end="\n\n")
scatterplot_matrix(
to_eigenbasis @ residual,
c=labels,
axislabels=["PC 1", "PC 2", "PC 3"],
bw_method=0.2,
)
plt.suptitle("Apple-Banana-Pear II Principal Components")
plt.tight_layout()
plt.show()
# Out of curiosity and as a sanity check, I apply PCA on the non-rotated dataset. Indeed the principal components are the same. Some notes:
#
# - The covariance matrix is approximately diagonal.
# - The eigenvalues are identical.
# - The eigenbasis is approximately a permutation which sorts the variances.
# In[ ]:
data = set_i["MainData"].to_numpy()
labels = set_i["ClassLabels"].to_numpy().squeeze()
residual = data - data.mean(axis=1, keepdims=True)
covariance = residual @ residual.T / (data.shape[1] - 1) # or np.cov(data)
print("covariance:", covariance, sep="\n", end="\n\n")
res = np.linalg.eig(covariance)
index = np.argsort(-res.eigenvalues)
eigenvalues = res.eigenvalues[index]
print("eigenvalues:", eigenvalues, sep="\n", end="\n\n")
eigenvectors = res.eigenvectors[:, index]
print("eigenvectors:", eigenvectors, sep="\n", end="\n\n")
to_eigenbasis = np.linalg.inv(eigenvectors)
print("transform to eigenbasis:", to_eigenbasis, sep="\n", end="\n\n")
scatterplot_matrix(
to_eigenbasis @ residual,
c=labels,
axislabels=["PC 1", "PC 2", "PC 3"],
bw_method=0.2,
)
plt.suptitle("Apple-Banana-Pear I Principal Components")
plt.tight_layout()
plt.show()

72
HW 3.py Normal file
View File

@@ -0,0 +1,72 @@
#!/usr/bin/env python
# coding: utf-8
# ### Homework 3
#
# David Allemang. STOR 881.001.FA25.
#
# Verify that for all $x > 0$ and $\lambda \in \mathbb R$, $$\lim_{\lambda \to 0} \frac{x^\lambda - 1}{\lambda} = \ln x$$
# Proof. Apply L'Hôpital's rule: $$\lim_{\lambda \to 0} \frac f g = \lim_{\lambda \to 0} \frac {f_\lambda}{g_\lambda}$$ where $f = x^\lambda - 1$ and $g = \lambda$.
#
# $$\begin{align*}
# \lim_{\lambda \to 0} \frac{x^\lambda - 1}{\lambda} &=
# \lim_{\lambda \to 0} \frac{x^\lambda \ln x - 0}{1} \\
# &= x^0 \ln x \\
# &= \ln x
# \end{align*}$$
# Just for fun, let's plot the thing.
# In[1]:
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
import numpy as np
x = 2 ** np.linspace(-2, 2, 50)
l = np.linspace(-1, 1, 21)
l = l[l != 0]
X, L = np.meshgrid(x, l)
Z = (X ** L - 1) / L
cmap = plt.get_cmap('bwr')
lnorm = Normalize(l.min(), l.max())
# In[2]:
plt.figure(figsize=(9, 6))
plt.title("Box-Cox transform (linear scale)")
lines = plt.plot(x, Z.T, lw=1.5, ls='-')
lines[0].set_label(rf'$\lambda = {l.min()}$')
lines[-1].set_label(rf'$\lambda = {l.max()}$')
for line, color in zip(lines, cmap(lnorm(l))):
line.set_color(color)
line, = plt.plot(x, np.log(x), lw=3, ls=':', c='k')
line.set_label(r'$\ln x$')
plt.legend()
plt.show();
# In[3]:
plt.figure(figsize=(9, 6))
plt.title("Box-Cox transform (log scale)")
plt.gca().set_xscale('log', base=2)
lines = plt.plot(x, Z.T, lw=1.5, ls='-')
lines[0].set_label(rf'$\lambda = {l.min()}$')
lines[-1].set_label(rf'$\lambda = {l.max()}$')
for line, color in zip(lines, cmap(lnorm(l))):
line.set_color(color)
line, = plt.plot(x, np.log(x), lw=3, ls=':', c='k')
line.set_label(r'$\ln x$')
plt.legend()
plt.show();

37
HW 4.py Normal file
View File

@@ -0,0 +1,37 @@
#!/usr/bin/env python
# coding: utf-8
# ### Homework 4
#
# David Allemang. STOR 881.001.FA25.
# #### 1. Flat subspace
#
# Show the set of flat vectors $F = \{ \alpha \vec 1 : \alpha \in \mathbb R\}$ is a subspace of $\mathbb R^d$. (Where $\vec 1$ is the vector where all coefficients are $1$).
#
# Proof.
#
# Show that $F$ is closed under linear combination. Given flat vectors $\vec u = \alpha \vec 1$ and $\vec v = \beta \vec 1$, and scalars $a$, $b$, show that $a \vec u + b \vec v \in F$.
#
# $$\begin{align*}
# a \vec u + b \vec v &= a \alpha \vec 1 + b \beta \vec 1 \\
# &= (a \alpha + b \beta) \vec 1 \\
# &= \gamma \vec 1 \\
# \end{align*}$$
# #### 2. Projection onto flat subspace.
#
# Show that the projection of a $d$-dimensional vector onto the space of flat vectors is the flat vector whose common entry is the mean of entries of the original vector.
#
# Define the projection operator which projects $\vec x$ onto the line spanned by $\vec y$: $$P(\vec x, \vec y) = \frac{\langle \vec x, \vec y\rangle}{\langle \vec y, \vec y\rangle} \vec y$$
#
# Proof.
#
# The flat vector subspace is rank 1, spanned by any of its vectors. Pick $\vec 1$ as the basis element for easy arithmetic and consider the projection of an arbitrary vector $\vec x$ onto the line spanned by $\vec 1$:
#
# $$\begin{align*}
# P(\vec x, \vec 1) &= \frac{\langle \vec x, \vec 1\rangle}{\langle \vec 1, \vec 1\rangle} \vec 1 \\
# &= \frac{\sum_{i=1}^d x_i \cdot 1}{\sum_{i=1}^d 1 \cdot 1} \vec 1 \\
# &= \left (\frac 1 d \sum_{i=1}^d x_i \right) \vec 1 \\
# &= \mu \vec 1
# \end{align*}$$

Binary file not shown.

Binary file not shown.