Source code for rsatoolbox.rdm.compare

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Comparison methods for comparing two RDMs objects
"""
import numpy as np
import scipy.stats
from scipy import linalg
from scipy.optimize import minimize
from scipy.stats._stats import _kendall_dis
from scipy.spatial.distance import squareform
from rsatoolbox.util.matrix import pairwise_contrast_sparse
from rsatoolbox.util.matrix import pairwise_contrast
from rsatoolbox.util.rdm_utils import _get_n_from_reduced_vectors
from rsatoolbox.util.rdm_utils import _get_n_from_length
from rsatoolbox.util.matrix import row_col_indicator_g


[docs]def compare(rdm1, rdm2, method='cosine', sigma_k=None):
    """calculates the similarity between two RDMs objects using a chosen method

    Args:
        rdm1 (rsatoolbox.rdm.RDMs):
            first set of RDMs

        rdm2 (rsatoolbox.rdm.RDMs):
            second set of RDMs

        method (string): which method to use, options are:

            'cosine' = cosine similarity

            'spearman' = spearman rank correlation

            'corr' = pearson correlation

            'kendall' = kendall-tau b

            'tau-a' = kendall-tau a

            'rho-a' = spearman correlation without tie correction

            'corr_cov' = pearson correlation after whitening

            'cosine_cov' = unbiased distance correlation
            which is equivalent to the cosine dinstance after whitening

            'neg_riem_dist' = negative riemannian distance

        sigma_k (numpy.ndarray):
            covariance matrix of the pattern estimates.
            Used only for methods 'corr_cov' and 'cosine_cov'.

    Returns:
        numpy.ndarray: dist:
            pariwise similarities between the RDMs from the RDMs objects

    """
    if method == 'cosine':
        sim = compare_cosine(rdm1, rdm2)
    elif method == 'spearman':
        sim = compare_spearman(rdm1, rdm2)
    elif method == 'corr':
        sim = compare_correlation(rdm1, rdm2)
    elif method in ('kendall', 'tau-b'):
        sim = compare_kendall_tau(rdm1, rdm2)
    elif method == 'tau-a':
        sim = compare_kendall_tau_a(rdm1, rdm2)
    elif method == 'rho-a':
        sim = compare_rho_a(rdm1, rdm2)
    elif method == 'corr_cov':
        sim = compare_correlation_cov_weighted(rdm1, rdm2, sigma_k=sigma_k)
    elif method == 'cosine_cov':
        sim = compare_cosine_cov_weighted(rdm1, rdm2, sigma_k=sigma_k)
    elif method == 'neg_riem_dist':
        sim = compare_neg_riemannian_distance(rdm1, rdm2, sigma_k=sigma_k)
    else:
        raise ValueError('Unknown RDM comparison method requested!')
    return sim


[docs]def compare_cosine(rdm1, rdm2):
    """calculates the cosine similarities between two RDMs objects

    Args:
        rdm1 (rsatoolbox.rdm.RDMs):
            first set of RDMs
        rdm2 (rsatoolbox.rdm.RDMs):
            second set of RDMs
    Returns:
        numpy.ndarray: dist
            cosine similarity between the two RDMs

    """
    vector1, vector2, _ = _parse_input_rdms(rdm1, rdm2)
    sim = _cosine(vector1, vector2)
    return sim


[docs]def compare_correlation(rdm1, rdm2):
    """calculates the correlations between two RDMs objects

    Args:
        rdm1 (rsatoolbox.rdm.RDMs):
            first set of RDMs
        rdm2 (rsatoolbox.rdm.RDMs):
            second set of RDMs
    Returns:
        numpy.ndarray: dist:
            correlations between the two RDMs

    """
    vector1, vector2, _ = _parse_input_rdms(rdm1, rdm2)
    # compute by subtracting the mean and then calculating cosine similarity
    vector1 = vector1 - np.mean(vector1, 1, keepdims=True)
    vector2 = vector2 - np.mean(vector2, 1, keepdims=True)
    sim = _cosine(vector1, vector2)
    return sim


[docs]def compare_cosine_cov_weighted(rdm1, rdm2, sigma_k=None):
    """calculates the cosine similarities between two RDMs objects

    Args:
        rdm1 (rsatoolbox.rdm.RDMs):
            first set of RDMs
        rdm2 (rsatoolbox.rdm.RDMs):
            second set of RDMs
    Returns:
        numpy.ndarray: dist:
            cosine similarities between the two RDMs

    """
    vector1, vector2, nan_idx = _parse_input_rdms(rdm1, rdm2)
    sim = _cosine_cov_weighted(vector1, vector2, sigma_k, nan_idx)
    return sim


[docs]def compare_correlation_cov_weighted(rdm1, rdm2, sigma_k=None):
    """calculates the correlations between two RDMs objects after whitening
    with the covariance of the entries

    Args:
        rdm1 (rsatoolbox.rdm.RDMs):
            first set of RDMs
        rdm2 (rsatoolbox.rdm.RDMs):
            second set of RDMs

    Returns:
        numpy.ndarray: dist:
            correlations between the two RDMs

    """
    vector1, vector2, nan_idx = _parse_input_rdms(rdm1, rdm2)
    # compute by subtracting the mean and then calculating cosine similarity
    vector1 = vector1 - np.mean(vector1, 1, keepdims=True)
    vector2 = vector2 - np.mean(vector2, 1, keepdims=True)
    sim = _cosine_cov_weighted(vector1, vector2, sigma_k, nan_idx)
    return sim


[docs]def compare_spearman(rdm1, rdm2):
    """calculates the spearman rank correlations between
    two RDMs objects

    Args:
        rdm1 (rsatoolbox.rdm.RDMs):
            first set of RDMs
        rdm2 (rsatoolbox.rdm.RDMs):
            second set of RDMs
    Returns:
        numpy.ndarray: dist:
            rank correlations between the two RDMs

    """
    vector1, vector2, _ = _parse_input_rdms(rdm1, rdm2)
    vector1 = np.apply_along_axis(scipy.stats.rankdata, 1, vector1)
    vector2 = np.apply_along_axis(scipy.stats.rankdata, 1, vector2)
    vector1 = vector1 - np.mean(vector1, 1, keepdims=True)
    vector2 = vector2 - np.mean(vector2, 1, keepdims=True)
    sim = _cosine(vector1, vector2)
    return sim


[docs]def compare_rho_a(rdm1, rdm2):
    """calculates the spearman rank correlations between
    two RDMs objects without tie correction

    Args:
        rdm1 (rsatoolbox.rdm.RDMs):
            first set of RDMs
        rdm2 (rsatoolbox.rdm.RDMs):
            second set of RDMs
    Returns:
        numpy.ndarray: dist:
            rank correlations between the two RDMs

    """
    vector1, vector2, _ = _parse_input_rdms(rdm1, rdm2)
    vector1 = np.apply_along_axis(scipy.stats.rankdata, 1, vector1)
    vector2 = np.apply_along_axis(scipy.stats.rankdata, 1, vector2)
    vector1 = vector1 - np.mean(vector1, 1, keepdims=True)
    vector2 = vector2 - np.mean(vector2, 1, keepdims=True)
    n = vector1.shape[1]
    sim = np.einsum('ij,kj->ik', vector1, vector2) / (n ** 3 - n) * 12
    return sim


[docs]def compare_kendall_tau(rdm1, rdm2):
    """calculates the Kendall-tau bs between two RDMs objects.
    Kendall-tau b is the version, which corrects for ties.
    We here use the implementation from scipy.

    Args:
        rdm1 (rsatoolbox.rdm.RDMs):
            first set of RDMs
        rdm2 (rsatoolbox.rdm.RDMs):
            second set of RDMs
    Returns:
        numpy.ndarray: dist:
            kendall-tau correlation between the two RDMs
    """
    vector1, vector2, _ = _parse_input_rdms(rdm1, rdm2)
    sim = _all_combinations(vector1, vector2, _kendall_tau)
    return sim


[docs]def compare_kendall_tau_a(rdm1, rdm2):
    """calculates the Kendall-tau a based distance between two RDMs objects.
    adequate when some models predict ties

    Args:
        rdm1 (rsatoolbox.rdm.RDMs):
            first set of RDMs
        rdm2 (rsatoolbox.rdm.RDMs):
            second set of RDMs
    Returns:
        numpy.ndarray: dist:
            kendall-tau a between the two RDMs
    """
    vector1, vector2, _ = _parse_input_rdms(rdm1, rdm2)
    sim = _all_combinations(vector1, vector2, _tau_a)
    return sim


[docs]def compare_neg_riemannian_distance(rdm1, rdm2, sigma_k=None):
    """calculates the negative Riemannian distance between two RDMs objects.

    Args:
        rdm1 (rsatoolbox.rdm.RDMs):
            first set of RDMs
        rdm2 (rsatoolbox.rdm.RDMs):
            second set of RDMs
    Returns:
        numpy.ndarray: dist:
            negative Riemannian distance between the two RDMs
    """
    vector1, vector2, _ = _parse_input_rdms(rdm1, rdm2)
    n_cond = _get_n_from_length(vector1.shape[1])
    if sigma_k is None:
        sigma_k = np.eye(n_cond)
    P = np.block([-1*np.ones((n_cond - 1, 1)), np.eye(n_cond - 1)])
    sigma_k_hat = P@sigma_k@P.T
    # construct RDM to 2nd-moment (G) transformation
    pairs = pairwise_contrast(np.arange(n_cond-1))
    pairs[pairs == -1] = 1
    T = np.block([
        [np.eye(n_cond - 1), np.zeros((n_cond-1, vector1.shape[1] - n_cond + 1))],
        [0.5 * pairs, np.diag(-0.5 * np.ones(vector1.shape[1] - n_cond + 1))]])
    vec_G1 = vector1@np.transpose(T)
    vec_G2 = vector2@np.transpose(T)

    sim = _all_combinations(vec_G1, vec_G2, _riemannian_distance, sigma_k_hat)
    return sim


def _all_combinations(vectors1, vectors2, func, *args, **kwargs):
    """runs a function func on all combinations of v1 in vectors1
    and v2 in vectors2 and puts the results into an array

    Args:
        vectors1 (numpy.ndarray):
            first set of values
        vectors1 (numpy.ndarray):
            second set of values
        func (function):
            function to be applied, should take two input vectors
            and return one scalar
    Returns:
        numpy.ndarray: value: function result over all pairs

    """
    value = np.empty((len(vectors1), len(vectors2)))
    k1 = 0
    for v1 in vectors1:
        k2 = 0
        for v2 in vectors2:
            value[k1, k2] = func(v1, v2, *args, **kwargs)
            k2 += 1
        k1 += 1
    return value


def _cosine_cov_weighted_slow(vector1, vector2, sigma_k=None, nan_idx=None):
    """computes the cosine similarities between two sets of vectors
    after whitening by their covariance.

    Args:
        vector1 (numpy.ndarray):
            first vectors (2D)
        vector1 (numpy.ndarray):
            second vectors (2D)
        sigma_k (Matrix):
            optional, covariance between pattern estimates
        nan_idx (numpy.ndarray):
            vector of non-nan entries from input parsing

    Returns:
        cos (float):
            cosine of the angle between vectors

    """
    if nan_idx is not None:
        n_cond = _get_n_from_reduced_vectors(nan_idx.reshape(1, -1))
        v = _get_v(n_cond, sigma_k)
        v = v[nan_idx][:, nan_idx]
    else:
        n_cond = _get_n_from_reduced_vectors(vector1)
        v = _get_v(n_cond, sigma_k)
    # compute V^-1 vector1/2 for all vectors by solving Vx = vector1/2
    vector1_m = np.array([scipy.sparse.linalg.cg(v, vector1[i], atol=0)[0]
                          for i in range(vector1.shape[0])])
    vector2_m = np.array([scipy.sparse.linalg.cg(v, vector2[i], atol=0)[0]
                          for i in range(vector2.shape[0])])
    # compute the inner products v1^T (V^-1 v2) for all combinations
    cos = np.einsum('ij,kj->ik', vector1, vector2_m)
    # divide by sqrt(v1^T (V^-1 v1))
    cos /= np.sqrt(np.einsum('ij,ij->i', vector1,
                             vector1_m)).reshape((-1, 1))
    # divide by sqrt(v2^T (V^-1 v2))
    cos /= np.sqrt(np.einsum('ij,ij->i', vector2,
                             vector2_m)).reshape((1, -1))
    return cos


def _cosine_cov_weighted(vector1, vector2, sigma_k=None, nan_idx=None):
    """computes the cosine angles between two sets of vectors
    weighted by the covariance
    If no covariance is given this is computed using the linear CKA,
    which is equivalent in this case and faster to compute.
    Otherwise reverts to _cosine_cov_weighted_slow.

    Args:
        vector1 (numpy.ndarray):
            first vectors (2D)
        vector1 (numpy.ndarray):
            second vectors (2D)
        sigma_k (Matrix):
            optional, covariance between pattern estimates

    Returns:
        cos (float):
            cosine angle between vectors

    """
    if (sigma_k is not None) and (sigma_k.ndim >= 2):
        cos = _cosine_cov_weighted_slow(
            vector1, vector2, sigma_k=sigma_k, nan_idx=nan_idx)
    else:
        if nan_idx is None:
            nan_idx = np.ones(vector1[0].shape, bool)
        # Compute the extended version of RDM vectors in whitened space
        vector1_m = _cov_weighting(vector1, nan_idx, sigma_k)
        vector2_m = _cov_weighting(vector2, nan_idx, sigma_k)
        cos = _cosine(vector1_m, vector2_m)
    return cos


def _cov_weighting(vector, nan_idx, sigma_k=None):
    """Transforms an array of RDM vectors in to representation
    in which the elements are isotropic. This is a stretched-out
    second moment matrix, with the diagonal elements appended.
    To account for the fact that the off-diagonal elements are
    only there once, they are multipled by 2

    Args:
        vector (numpy.ndarray):
            RDM vectors (2D) N x n_dist

    Returns:
        vector_w:
            weighted vectors (M x n_dist + n_cond)

    """
    N, n_dist = vector.shape
    n_cond = _get_n_from_length(nan_idx.shape[0])
    vector_w = -0.5 * np.c_[vector, np.zeros((N, n_cond))]
    rowI, colI = row_col_indicator_g(n_cond)
    sumI = rowI + colI
    if np.all(nan_idx):
        # column and row means
        m = vector_w @ sumI / n_cond
        # Overall mean
        mm = np.sum(vector_w * 2, axis=1, keepdims=True) / (n_cond * n_cond)
        # subtract the column and row means and add overall mean
        vector_w = vector_w - m @ sumI.T + mm
        if sigma_k is not None:
            if sigma_k.ndim == 1:
                sigma_k_sqrt = np.sqrt(sigma_k)
                vector_w /= rowI @ sigma_k_sqrt
                vector_w /= colI @ sigma_k_sqrt
            elif sigma_k.ndim == 2:
                l_sigma_k = np.linalg.inv(np.linalg.cholesky(sigma_k))
                Gs = np.empty((vector.shape[0], n_cond, n_cond))
                for i_vec in range(vector.shape[0]):
                    G = scipy.spatial.distance.squareform(
                        vector_w[i_vec, :n_dist])
                    np.fill_diagonal(G, vector_w[i_vec, n_dist:])
                    Gs[i_vec] = G
                # These two are the slow lines for this whitening
                Gs = np.einsum('ij,mjk,lk->mil', l_sigma_k, Gs, l_sigma_k)
                vector_w = np.einsum('ij,mjk,ik->mi', rowI, Gs, colI)
    else:
        nan_idx_ext = np.concatenate((nan_idx, np.ones(n_cond, bool)))
        sumI = sumI[nan_idx_ext]
        # get matrix for double centering with missing values:
        sumI[n_dist:, :] /= 2
        diag = np.concatenate((np.ones((n_dist, 1)) / 2, np.ones((n_cond, 1))))
        # one line version much faster here!
        vector_w = vector_w - (
            vector_w
            @ sumI @ np.linalg.inv(sumI.T @ (diag * sumI)) @ (diag * sumI).T)
        if sigma_k is not None:
            if sigma_k.ndim == 1:
                sigma_k_sqrt = np.sqrt(sigma_k)
                vector_w /= rowI[nan_idx_ext] @ sigma_k_sqrt
                vector_w /= colI[nan_idx_ext] @ sigma_k_sqrt
            elif sigma_k.ndim == 2:
                raise ValueError('cannot handle sigma_k and nans')
    # Weight the off-diagnoal terms double
    vector_w[:, :n_dist] = vector_w[:, :n_dist] * np.sqrt(2)
    return vector_w


def _cosine(vector1, vector2):
    """computes the cosine angles between two sets of vectors

    Args:
        vector1 (numpy.ndarray):
            first vectors (2D)
        vector1 (numpy.ndarray):
            second vectors (2D)
    Returns:
        cos (float):
            cosine angle between vectors

    """
    norm_1 = np.sqrt(np.einsum('ij,ij->i', vector1, vector1))
    norm_2 = np.sqrt(np.einsum('ij,ij->i', vector2, vector2))
    sel_1 = norm_1 > 0
    sel_2 = norm_2 > 0
    # without more indexing if all vectors are nonzero length
    if np.all(sel_1) and np.all(sel_2):
        # compute all inner products
        cos_ok = np.einsum('ij,kj->ik', vector1, vector2)
        # divide by sqrt of the inner products with themselves
        cos_ok /= norm_1.reshape((-1, 1))
        cos_ok /= norm_2.reshape((1, -1))
        return cos_ok
    # keep track of indexing if some vectors are 0
    # compute all inner products
    cos_ok = np.einsum('ij,kj->ik', vector1[sel_1], vector2[sel_2])
    # divide by sqrt of the inner products with themselves
    cos_ok /= norm_1[sel_1].reshape((-1, 1))
    cos_ok /= norm_2[sel_2].reshape((1, -1))
    cos = np.zeros((vector1.shape[0], vector2.shape[0]))
    np.putmask(cos, np.outer(norm_1 > 0, norm_2 > 0), cos_ok)
    return cos


def _riemannian_distance(vec_G1, vec_G2, sigma_k):
    """computes the Riemannian distance between two vectorized second moments

    Args:
        vec_G1 (numpy.ndarray):
            first vectorized second-moment
        vec_G2 (numpy.ndarray):
            second vectorized second-moment

        Returns:
            neg_riem (float):
                negative riemannian distance
    """
    n_cond = _get_n_from_length(len(vec_G1))
    G1 = np.diag(vec_G1[0:(n_cond-1)])+squareform(vec_G1[(n_cond-1):len(vec_G1)])
    G2 = np.diag(vec_G2[0:(n_cond-1)])+squareform(vec_G2[(n_cond-1):len(vec_G2)])

    def fun(theta):
        return np.sqrt((np.log(linalg.eigvalsh(
            np.exp(theta[0]) * G1 + np.exp(theta[1]) * sigma_k, G2))**2).sum())
    theta = minimize(fun, (0, 0), method='Nelder-Mead')
    neg_riem = -1 * theta.fun
    return neg_riem


def _kendall_tau(vector1, vector2):
    """computes the kendall-tau between two vectors

    Args:
        vector1 (numpy.ndarray):
            first vector
        vector1 (numpy.ndarray):
            second vector
    Returns:
        tau (float):
            kendall-tau

    """
    tau = scipy.stats.kendalltau(vector1, vector2).correlation
    return tau


def _tau_a(vector1, vector2):
    """computes kendall-tau a between two vectors
    based on modifying scipy.stats.kendalltau

    Args:
        vector1 (numpy.ndarray):
            first vector
        vector1 (numpy.ndarray):
            second vector
    Returns:
        tau (float):
            kendall-tau a

    """
    size = vector1.size
    vector1, vector2 = _sort_and_rank(vector1, vector2)
    vector2, vector1 = _sort_and_rank(vector2, vector1)
    dis = _kendall_dis(vector1, vector2)  # discordant pairs
    obs = np.r_[True, (vector1[1:] != vector1[:-1]) |
                      (vector2[1:] != vector2[:-1]), True]
    cnt = np.diff(np.nonzero(obs)[0]).astype('int64', copy=False)
    ntie = (cnt * (cnt - 1) // 2).sum()  # joint ties
    xtie, _, _ = _count_rank_tie(vector1)     # ties in x, stats
    ytie, _, _ = _count_rank_tie(vector2)     # ties in y, stats
    tot = (size * (size - 1)) // 2
    # Note that tot = con + dis + (xtie - ntie) + (ytie - ntie) + ntie
    #               = con + dis + xtie + ytie - ntie
    con_minus_dis = tot - xtie - ytie + ntie - 2 * dis
    tau = con_minus_dis / tot
    # Limit range to fix computational errors
    tau = min(1., max(-1., tau))
    return tau


def _sort_and_rank(vector1, vector2):
    """does the sort and rank step of the _tau calculation"""
    perm = np.argsort(vector2, kind='mergesort')
    vector1 = vector1[perm]
    vector2 = vector2[perm]
    vector2 = np.r_[True, vector2[1:] != vector2[:-1]].cumsum(dtype=np.intp)
    return vector1, vector2


def _count_rank_tie(ranks):
    """ counts tied ranks for kendall-tau calculation"""
    cnt = np.bincount(ranks).astype('int64', copy=False)
    cnt = cnt[cnt > 1]
    return ((cnt * (cnt - 1) // 2).sum(),
            (cnt * (cnt - 1.) * (cnt - 2)).sum(),
            (cnt * (cnt - 1.) * (2*cnt + 5)).sum())


def _get_v(n_cond, sigma_k):
    """ get the rdm covariance from sigma_k """
    # calculate Xi
    c_mat = pairwise_contrast_sparse(np.arange(n_cond))
    if sigma_k is None:
        xi = c_mat @ c_mat.transpose()
    elif sigma_k.ndim == 1:
        sigma_k = scipy.sparse.diags(sigma_k)
        xi = c_mat @ sigma_k @ c_mat.transpose()
    else:
        sigma_k = scipy.sparse.csr_matrix(sigma_k)
        xi = c_mat @ sigma_k @ c_mat.transpose()
    # calculate V
    v = xi.multiply(xi).tocsc()
    return v


def _parse_input_rdms(rdm1, rdm2):
    """Gets the vector representation of input RDMs, raises an error if
    the two RDMs objects have different dimensions

    Args:
        rdm1 (rsatoolbox.rdm.RDMs):
            first set of RDMs
        rdm2 (rsatoolbox.rdm.RDMs):
            second set of RDMs

    """
    if not isinstance(rdm1, np.ndarray):
        vector1 = rdm1.get_vectors()
    else:
        if len(rdm1.shape) == 1:
            vector1 = rdm1.reshape(1, -1)
        else:
            vector1 = rdm1
    if not isinstance(rdm2, np.ndarray):
        vector2 = rdm2.get_vectors()
    else:
        if len(rdm2.shape) == 1:
            vector2 = rdm2.reshape(1, -1)
        else:
            vector2 = rdm2
    if not vector1.shape[1] == vector2.shape[1]:
        raise ValueError('rdm1 and rdm2 must be RDMs of equal shape')
    nan_idx = ~np.isnan(vector1)
    vector1_no_nan = vector1[nan_idx].reshape(vector1.shape[0], -1)
    vector2_no_nan = vector2[~np.isnan(vector2)].reshape(vector2.shape[0], -1)
    if not vector1_no_nan.shape[1] == vector2_no_nan.shape[1]:
        raise ValueError('rdm1 and rdm2 have different nan positions')
    return vector1_no_nan, vector2_no_nan, nan_idx[0]
Source code for rsatoolbox.rdm.compare

rsatoolbox

Navigation

Related Topics