Module quantfin.pca
Principal component analysis.
Source code
"""Principal component analysis."""
import numpy as np
def pca(data, norm=False, epsilon=1e-5):
"""Principal component analysis.
Performs PCA using the NIPALS algorithm.
Args:
data (numpy.array): A matrix of data on which to perform PCA.
norm (bool): Whether or not the input data has been standardised.
epsilon (float, optional): Required accuracy for convergence.
Returns:
A tuple `(T, P)`, where T and P are matrices. The columns of T are the
principal components, and P is the matrix of weights.
"""
if not norm:
# Standardise data
mu = data.mean(axis=0)
sigma = data.std(axis=0)
X = (data - mu) / sigma
else:
X = data.copy()
# Create empty T and P matrices
N, M = X.shape
T = np.zeros((N, M))
P = np.zeros((M, M))
# NIPALS algorithm
for j in range(M):
t_old = X[:, j] # Initial guess
while True:
p = X.T @ t_old
p /= np.linalg.norm(p)
t_new = X @ p
delta = np.linalg.norm(t_old - t_new) ** 2
if delta < epsilon: # Required accuracy reached
break
t_old = t_new # Update guess for t
# Vectors t and p are columns of T and P
T[:, j] = t_new
P[:, j] = p
# Need to convert arrays of shape (N,) -> (N,1) for matmul
X = X - (np.expand_dims(t_new, 1) @ np.expand_dims(p, 1).T)
return T, P
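A minimal usage sketch (the synthetic data below is an illustrative assumption; only the module path quantfin.pca comes from this page):

import numpy as np
from quantfin.pca import pca

# Illustrative data: 100 observations of 4 variables, with some correlation
rng = np.random.default_rng(0)
data = rng.normal(size=(100, 4))
data[:, 1] += 0.5 * data[:, 0]  # correlate two columns so the components differ

T, P = pca(data)           # norm=False, so data is standardised internally
print(T.shape, P.shape)    # (100, 4) and (4, 4)
# Column j of T is the j-th principal component (scores);
# column j of P holds the corresponding weights (loadings).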
Functions

def pca(data, norm=False, epsilon=1e-05)

    Principal component analysis.

    Performs PCA using the NIPALS algorithm.

    Args
        data : numpy.array
            A matrix of data on which to perform PCA.
        norm : bool
            Whether or not the input data has been standardised.
        epsilon : float, optional
            Required accuracy for convergence.

    Returns
        A tuple (T, P), where T and P are matrices. The columns of T are the
        principal components, and P is the matrix of weights.
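Because each NIPALS iteration deflates the working matrix with X = X - t p^T, extracting all M components should leave the standardised data approximately equal to T @ P.T, with (near-)orthonormal loading columns. A short sanity-check sketch (the synthetic data and tolerances are assumptions on my part, not guarantees made by the docstring):

import numpy as np
from quantfin.pca import pca

rng = np.random.default_rng(1)
data = rng.normal(size=(200, 3))
T, P = pca(data)

# Reconstruction: the standardised data should be recovered by the scores and loadings
X_std = (data - data.mean(axis=0)) / data.std(axis=0)
print(np.allclose(X_std, T @ P.T, atol=1e-6))              # expected to print True

# Loadings should be close to orthonormal
print(np.allclose(P.T @ P, np.eye(P.shape[1]), atol=1e-6))  # expected to print True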