Module quantfin.stats.stats
Tools for fitting distributions to data.
Source code
"""Tools for fitting distributions to data."""
import numpy as np
from scipy.stats import norm
from .kernels import gaussian
def standardise(x):
    """Transforms data so that it has mean 0 and standard deviation 1.

    The transform is given by:

        z_i = (x_i - mu) / sigma

    where `mu` is the mean and `sigma` is the standard deviation of the data.

    Args:
        x (numpy.array): Data to transform.
    """
    # Centre on the sample mean, then rescale by the sample standard deviation.
    return (x - x.mean()) / x.std()
def quantile_quantile(x, dist=norm):
    """Calculates theoretical and actual quantiles of a data set for a QQ plot.

    Args:
        x (numpy.array): The data to analyse.
        dist (scipy.stats.rv_continuous, optional): The distribution to use
            when calculating theoretical quantiles. Must have a `ppf` method.
            Defaults to `scipy.stats.norm`.

    Returns:
        A tuple `(z_theoretical, z_ordered)`, where `z_theoretical` are the
        quantile values calculated from the theoretical distribution, and
        `z_ordered` are the quantiles calculated from the ordered data.
    """
    n = len(x)
    # Standardise the data (mean 0, std 1) and put it in ascending order.
    z_ordered = np.sort((x - x.mean()) / x.std())
    # Midpoint plotting positions p_i = (i - 0.5) / n for i = 1, ..., n.
    probs = (np.arange(1, n + 1) - 0.5) / n
    # Theoretical quantiles via the distribution's inverse CDF.
    z_theoretical = dist.ppf(probs)
    return (z_theoretical, z_ordered)
def kde(x, y, delta, k=gaussian):
    """Kernel density estimation.

    Estimates a continuous probability distribution from discrete data using
    kernel density estimation:

        f(x) = 1 / (N * delta) * sum_{n=1}^{N} k((x - y_n) / delta)

    Args:
        x (numpy.array): Values at which to evaluate the estimated PDF.
        y (numpy.array): Data to estimate the PDF from.
        delta (float): Smoothing parameter (bandwidth).
        k (optional): The kernel function to use. Should be positive semi-definite
            and normalised (i.e. a valid PDF), and must accept a numpy array
            (it is applied elementwise). Defaults to a Gaussian distribution.

    Returns:
        numpy.array: The estimated PDF evaluated at each point of `x`.
    """
    N = len(y)
    M = len(x)
    out = np.zeros(M)
    # Sum the kernel contribution of every data point y_n at each
    # evaluation point x_i.
    for i in range(M):
        out[i] = k((x[i] - y) / delta).sum()
    # Normalise so the estimate integrates to (approximately) one.
    out *= (1.0 / (N * delta))
    return out
def fit_mixed_gaussian(x, K, epsilon=1e-4, maxiter=500):
    """Fits a mixed Gaussian distribution to the data.

    Fits a weighted sum of Gaussian distributions to data using the
    expectation-maximisation (EM) method.

    Args:
        x (numpy.array): Data to fit to.
        K (int): Number of Gaussians to use in the mixture.
        epsilon (float, optional): Accuracy to solve to. Iteration stops when
            the log-likelihood changes by less than this between iterations.
        maxiter (int, optional): Max number of iterations.

    Returns:
        A tuple of numpy arrays `(mu, sigma, r)`. The kth entry of each array
        contains the mean/standard deviation/weight of the kth Gaussian
        distribution in the mixture.

    Raises:
        RuntimeError: If the iteration fails to converge within `maxiter`
            iterations.
    """
    x = np.asarray(x)
    N_x = len(x)  # Number of data points
    r = np.ones(K) / K  # Weights of each Gaussian (uniform to start)
    # Initialise the means at evenly spaced quantiles of the data so the
    # components start out distinct: with identical starting parameters the
    # EM updates preserve the symmetry and every component stays the same.
    mu = np.quantile(x, (np.arange(K) + 0.5) / K)
    sigma = np.full(K, x.std())  # Standard deviation of each Gaussian

    def weighted_phi():
        # (N, K) matrix whose (n, k) entry is r_k * phi(x_n; mu_k, sigma_k).
        # `gaussian` is the standard normal kernel, so the change of variables
        # phi(x; mu, s) = gaussian((x - mu) / s) / s supplies the 1/s factor.
        return r * gaussian((x[:, None] - mu) / sigma) / sigma

    def log_L(w):
        # ln(L) = sum_{n=1}^{N} ln(sum_{k=1}^{K} r_k phi(x_n; mu_k, sigma_k))
        return np.log(w.sum(axis=1)).sum()

    prev_L = log_L(weighted_phi())
    for _ in range(maxiter):
        # E-step: responsibilities g[n, k] = P(component k | x_n).
        w = weighted_phi()
        g = w / w.sum(axis=1, keepdims=True)
        # M-step: re-estimate each component from its responsibilities.
        N_k = g.sum(axis=0)  # Effective number of points per component
        mu = (g * x[:, None]).sum(axis=0) / N_k
        sigma = np.sqrt((g * (x[:, None] - mu) ** 2).sum(axis=0) / N_k)
        r = N_k / N_x
        # Check convergence of the log-likelihood.
        new_L = log_L(weighted_phi())
        if abs(new_L - prev_L) < epsilon:
            return (mu, sigma, r)
        prev_L = new_L
    # The loop above ran maxiter iterations without converging.
    raise RuntimeError("Max iterations exceeded.")
Functions
def fit_mixed_gaussian(x, K, epsilon=0.0001, maxiter=500)
-
Fits a mixed Gaussian distribution to the data.
Fits a weighted sum of Gaussian distributions to data using the expectation–maximisation (EM) method.
Args
x
:numpy.array
- Data to fit to.
K
:int
- Number of Gaussians to use in the mixture.
epsilon
:float
, optional- Accuracy to solve to.
maxiter
:int
, optional- Max number of iterations.
Returns
A tuple of numpy arrays
(mu, sigma, r)
. The kth entry of each array contains the mean/standard deviation/weight of the kth Gaussian distribution in the mixture.Raises
RuntimeError
- If the iteration fails to converge.
Source code
def fit_mixed_gaussian(x, K, epsilon=1e-4, maxiter=500): """Fits a mixed Gaussian distribution to the data. Fits a weighted sum of Gaussian distributions to data using the maximum expectation method. Args: x (numpy.array): Data to fit to. K (int): Number of Gaussians to use in the mixture. epsilon (float, optional): Accuracy to solve to. maxiter (int, optional): Max number of iterations. Returns: A tuple of numpy arrays `(mu, sigma, r)`. The kth entry of each array contains the mean/standard deviation/weight of the kth Gaussian distribution in the mixture. Raises: RuntimeError: If the iteration fails to converge. """ N_x = len(x) # Number of data points r = np.ones(K) / N_x # Weights of each Gaussian mu = np.zeros(K) # Means of each Gaussian sigma = np.ones(K) # Variance of each Gaussian def g(y_n, k): # g(x_n; mu_k, sigma_k) = # r_k * phi(x_n; mu_k sigma_k) / sum_{j=1}^{K} r_j phi(x_n; mu_k, sigma_j) denom = (r * gaussian((y_n - mu) / sigma)).sum() return (r[k] * gaussian((y_n - mu[k]) / sigma[k])) / denom def N(k): # N_k = sum_{n=1}^{N} g(x_n; mu_k, sigma_k) g_vals = np.array([g(x_n, k) for x_n in x]) return g_vals.sum() def update_mu(k): # mu_k = (1/N_k) * sum_{n=1}^{N} g(x_n; mu_k, sigma_k) * x_n summand = np.array([g(x_n, k) * x_n for x_n in x]) return summand.sum() / N(k) def update_sigma(k): # sigma^2_k = (1/N_k) * sum_{n=1}^{N} g(x_n; mu_k, sigma_k) * (x_n - mu_k)^2 summand = np.array([g(x_n, k) * (x_n - mu[k])**2 for x_n in x]) return np.sqrt(summand.sum() / N(k)) def update_r(k): # r_k = N_k / N return N(k) / N_x def log_L(): # ln(L) = sum_{n=1}^{N} ln(sum_{k=1}^{K} r_k phi(x_n; mu_k, sigma_k)) out = 0.0 for n in range(N_x): out += np.log((r * gaussian((x[n] - mu)/sigma)).sum()) return out n_iter = 0 prev_L = log_L() while True: if n_iter > maxiter: # Raise exception when max iterations reached raise RuntimeError("Max iterations exceeded.") # Update our guess for each parameter for k in range(K): mu[k] = update_mu(k) sigma[k] = update_sigma(k) r[k] = 
update_r(k) # Calculate new value of ln(L) new_L = log_L() if abs(new_L - prev_L) < epsilon: # Iteration has converged break # Update value of ln(L) prev_L = new_L sigma = sigma return (mu, sigma, r)
def kde(x, y, delta, k=gaussian) -
Kernel density estimation.
Estimates a continuous probability distribution from discrete data using kernel density estimation.
Args
x
:numpy.array
- Values at which to evaluate the estimated PDF.
y
:numpy.array
- Data to estimate the PDF from.
delta
:float
- Smoothing parameter.
k
: optional- The kernel function to use. Should be positive semi-definite and normalised (i.e. a valid PDF). Defaults to a Gaussian distribution.
Source code
def kde(x, y, delta, k=gaussian): """Kernel density estimation. Estimates a continuous probability distribution from discrete data using kernel density estimation. Args: x (numpy.array): Values at which to evaluate the estimated PDF. y (numpy.array): Data to estimate the PDE from. delta (float): Smoothing parameter. f (optional): The kernel function to use. Should be positive semi-definite and normalised (i.e. a valid PDF). Defaults to a Gaussian distribution. """ N = len(y) M = len(x) out = np.zeros(M) for i in range(M): out[i] = k((x[i] - y) / delta).sum() out *= (1.0 / (N * delta)) return out
def quantile_quantile(x, dist=norm) -
Calculates theoretical and actual quantiles of a data set for a QQ plot.
Args
x
:numpy.array
- The data to analyse.
dist
:scipy.stats.rv_continuous
, optional- The distribution to use
when calculating theoretical quantiles. Must have a
ppf
method. Defaults toscipy.stats.norm
.
Returns
A tuple
(z_theoretical, z_ordered)
, wherez_theoretical
are the quantile values calculated from the theoretical distribution, andz_ordered
are the quantiles calculated from the ordered data.Source code
def quantile_quantile(x, dist=norm): """Calculates theoretical and actual quantiles of a data set for a QQ plot. Args: x (numpy.array): The data to analyse. dist (scipy.stats.rv_continuous, optional): The distribution to use when calculating theoretical quantiles. Must have a `ppf` method. Defaults to `scipy.stats.norm`. Returns: A tuple `(z_theoretical, z_ordered)`, where `z_theoretical` are the quantile values calculated from the theoretical distribution, and `z_ordered` are the quantiles calculated from the ordered data. """ N = len(x) z = standardise(x) z.sort() p = np.array([(i - 0.5)/N for i in range(1, N+1)]) z_pred = dist.ppf(p) return (z_pred, z)
def standardise(x)
-
Transforms data so that it has mean 0 and standard deviation 1.
The transform is given by:
z_i = (x_i - mu) / sigma
where
mu
is the mean andsigma
is the standard deviation of the data.Args
x
:numpy.array
- Data to transform.
Source code
def standardise(x): """Transforms data so that it has mean 0 and standard deviation 1. The transform is given by: z_i = (x_i - mu) / sigma where `mu` is the mean and `sigma` is the standard deviation of the data. Args: x (numpy.array): Data to transform. """ mu = x.mean() sigma = x.std() return (x - mu) / sigma