Source code for medil.independence_testing

"""Independence testing on samples of random variables."""

from typing import NamedTuple, Optional

from dcor.independence import distance_correlation_t_test
from multiprocessing import Pool, cpu_count
import numpy as np
import numpy.typing as npt
from scipy.spatial.distance import pdist, squareform
from scipy.stats import chi2, norm, rankdata


def dcov(samples):
    r"""Compute sample distance covariance matrix.

    Parameters
    ----------
    samples : 2d numpy array of floats
        A :math:`N \times M` matrix with :math:`N` samples of :math:`M`
        random variables.

    Returns
    -------
    2d numpy array
        A square matrix :math:`C`, where :math:`C_{i,j}` is the sample
        distance covariance between random variables :math:`R_i` and
        :math:`R_j`.
    1d numpy array
        The mean pairwise distance :math:`\bar{d}` of each random variable,
        used by :func:`estimate_UDG` to normalize the test statistic.
    """
    num_samps, num_feats = samples.shape
    num_pairs = num_samps * (num_samps - 1) // 2
    dists = np.zeros((num_feats, num_pairs))
    d_bars = np.zeros(num_feats)
    # compute doubly centered distance matrix for every feature:
    for feat_idx in range(num_feats):
        n = num_samps
        t = np.tile
        # raw distance matrix:
        d = squareform(pdist(samples[:, feat_idx].reshape(-1, 1), "cityblock"))
        # doubly centered:
        d_bar = d.mean()
        d -= t(d.mean(0), (n, 1)) + t(d.mean(1), (n, 1)).T - t(d_bar, (n, n))
        d = squareform(d, checks=False)  # ignore asymmetry due to numerical error
        dists[feat_idx] = d
        d_bars[feat_idx] = d_bar
    return dists @ dists.T / num_samps**2, d_bars

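# A minimal usage sketch (illustrative only, not part of the original module):
# `dcov` returns the matrix of pairwise sample distance covariances together
# with each variable's mean pairwise distance. The variable names below are
# hypothetical.
#
#     >>> rng = np.random.default_rng(0)
#     >>> samples = rng.normal(size=(100, 3))
#     >>> cov, d_bars = dcov(samples)
#     >>> cov.shape, d_bars.shape
#     ((3, 3), (3,))
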
def estimate_UDG(sample, method="dcov_fast", significance_level=0.05):
    r"""Estimate the undirected dependency graph (UDG) over the variables.

    Parameters
    ----------
    sample : 2d numpy array of floats
        A :math:`N \times M` matrix with :math:`N` samples of :math:`M`
        random variables.
    method : str or 2d numpy array of floats, optional
        Pairwise independence test to use: ``"dcov_fast"`` (default),
        ``"dcov_big"``, or ``"xicor"``; alternatively, a precomputed
        :math:`M \times M` matrix of p-values.
    significance_level : float, optional
        Significance level for rejecting pairwise independence.

    Returns
    -------
    udg : 2d numpy array of bools
        Adjacency matrix of the estimated undirected dependency graph.
    p_vals : 2d numpy array of floats or None
        Matrix of pairwise p-values, or ``None`` when ``"dcov_fast"`` is
        used, since that method thresholds a test statistic directly.
    """
    samp_size, num_feats = sample.shape
    if isinstance(method, np.ndarray):
        p_vals = method
        udg = p_vals < significance_level
    elif method == "dcov_fast":
        cov, d_bars = dcov(sample)
        crit_val = chi2(1).ppf(1 - significance_level)
        test_val = samp_size * cov / np.outer(d_bars, d_bars)
        udg = test_val >= crit_val
        p_vals = None
    elif method == "g-test":
        raise NotImplementedError("The G-test is not yet implemented.")
    else:
        p_vals = np.zeros((num_feats, num_feats), float)
        idxs, jdxs = np.triu_indices(num_feats, 1)
        zipped = zip(idxs, jdxs)
        sample_iter = (sample[:, i_j].T for i_j in zipped)
        if method == "dcov_big":
            test = dcor_test
        elif method == "xicor":
            test = xicor_test
        with Pool(max(1, int(0.75 * cpu_count()))) as p:
            p_vals[idxs, jdxs] = p_vals[jdxs, idxs] = np.fromiter(
                p.imap(test, sample_iter, 100), float
            )
        udg = p_vals < significance_level
    np.fill_diagonal(udg, False)
    return udg, p_vals

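# A minimal usage sketch (illustrative only, not part of the original module):
# estimating the undirected dependency graph with the default fast
# distance-covariance test. The variable names below are hypothetical.
#
#     >>> rng = np.random.default_rng(0)
#     >>> sample = rng.normal(size=(200, 4))
#     >>> udg, p_vals = estimate_UDG(sample)
#     >>> udg.shape
#     (4, 4)
#     >>> p_vals is None  # "dcov_fast" thresholds the test statistic directly
#     True
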
def dcor_test(x_y):
    """Return the p-value of the distance correlation t-test for a pair."""
    x, y = x_y
    return distance_correlation_t_test(x, y).pvalue

def xicor_test(x_y):
    """Return the p-value of the xi correlation test for a pair."""
    x, y = x_y
    xi, pvalue = xicorr(x, y)
    return pvalue

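# Note: `dcor_test` and `xicor_test` each take a single `(x, y)` pair packed
# into one argument so that they can be mapped over column pairs with
# `Pool.imap` in `estimate_UDG`.
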
# The following is a modification of the xicorrelation source code,
# Copyright 2021 Nikolay Novik (https://github.com/jettify),
# for compatibility with numpy 2.0+


class _XiCorr(NamedTuple):
    xi: float
    fr: npt.NDArray[np.float64]
    cu: float

class XiCorrResult(NamedTuple):
    correlation: float
    pvalue: Optional[float]

def _xicorr(x: npt.NDArray[np.float64], y: npt.NDArray[np.float64]) -> _XiCorr:
    # Ported from original R implementation.
    # https://github.com/cran/XICOR/blob/master/R/calculateXI.R
    n = x.size
    PI = rankdata(x, method="average")
    fr = rankdata(y, method="average") / n
    gr = rankdata(-y, method="average") / n
    cu = np.mean(gr * (1 - gr))
    A1 = np.abs(np.diff(fr[np.argsort(PI, kind="quicksort")])).sum() / (2 * n)
    xi = 1.0 - A1 / cu
    return _XiCorr(xi, fr, cu)

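# Sketch of the underlying statistic (Chatterjee 2020, ties-robust form):
# with r_i the rank of y_i, l_i the rank of -y_i, and the r_i visited in the
# order given by sorting on x,
#
#     xi_n ~ 1 - n * sum_i |r_(i+1) - r_(i)| / (2 * sum_i l_i * (n - l_i)).
#
# `_xicorr` evaluates this through the normalized ranks `fr = r/n` and
# `gr = l/n`, the mean absolute rank increment `A1`, and the normalizer
# `cu = mean(gr * (1 - gr))`; note that this port resolves ties with
# average ranks.
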
def xicorr(x: npt.ArrayLike, y: npt.ArrayLike, ties: bool = True) -> XiCorrResult:
    """Compute the cross rank increment correlation coefficient xi [1].

    Parameters
    ----------
    x, y : array_like
        Arrays of observations of the same size. If the arrays are not 1-D,
        they will be flattened to 1-D.
    ties : bool, optional
        If ties is True, the algorithm assumes that the data has ties and
        employs the more elaborate theory for calculating the s.d. and
        p-value. Otherwise, it uses the simpler theory. There is no harm in
        setting ties=True even if there are no ties.

    Returns
    -------
    correlation : float
        The xi statistic.
    pvalue : float
        The p-value computed from the asymptotic theory.

    See Also
    --------
    spearmanr : Calculates a Spearman rank-order correlation coefficient.

    References
    ----------
    .. [1] Chatterjee, S., "A new coefficient of correlation",
       https://arxiv.org/abs/1909.10140, 2020.

    Examples
    --------
    >>> x = [1, 2, 3, 4, 5]
    >>> y = [1, 4, 9, 16, 25]
    >>> xi, pvalue = xicorr(x, y)
    """
    # https://git.io/JSIlN
    x = np.asarray(x).ravel()
    y = np.asarray(y).ravel()

    if x.size != y.size:
        raise ValueError(
            "All inputs to `xicorr` must be of the same "
            f"size, found x-size {x.size} and y-size {y.size}"
        )
    elif not x.size or not y.size:
        # Return NaN if arrays are empty
        return XiCorrResult(np.nan, np.nan)

    r = _xicorr(x, y)
    xi = r.xi
    fr = r.fr
    CU = r.cu

    pvalue: Optional[float] = None
    # https://git.io/JSIlM
    n = x.size
    if not ties:
        # sd = np.sqrt(2.0 / (5.0 * n))
        pvalue = 1.0 - norm.cdf(np.sqrt(n) * xi / np.sqrt(2.0 / 5.0))
    else:
        qfr = np.sort(fr)
        ind = np.arange(1, n + 1)
        ind2 = 2 * n - 2 * ind + 1
        ai = np.mean(ind2 * qfr * qfr) / n
        ci = np.mean(ind2 * qfr) / n
        cq = np.cumsum(qfr)
        m = (cq + (n - ind) * qfr) / n
        b = np.mean(m**2)
        v = (ai - 2.0 * b + ci**2) / (CU**2)
        # sd = np.sqrt(v / n)
        pvalue = 1.0 - norm.cdf(np.sqrt(n) * xi / np.sqrt(v))
    return XiCorrResult(xi, pvalue)