#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Comparison methods for comparing two RDMs objects
"""
import numpy as np
import scipy.stats
from scipy import linalg
from scipy.optimize import minimize
from scipy.stats._stats import _kendall_dis
from scipy.spatial.distance import squareform
from rsatoolbox.util.matrix import pairwise_contrast_sparse
from rsatoolbox.util.matrix import pairwise_contrast
from rsatoolbox.util.rdm_utils import _get_n_from_reduced_vectors
from rsatoolbox.util.rdm_utils import _get_n_from_length
from rsatoolbox.util.matrix import row_col_indicator_g
[docs]def compare(rdm1, rdm2, method='cosine', sigma_k=None):
"""calculates the similarity between two RDMs objects using a chosen method
Args:
rdm1 (rsatoolbox.rdm.RDMs):
first set of RDMs
rdm2 (rsatoolbox.rdm.RDMs):
second set of RDMs
method (string): which method to use, options are:
'cosine' = cosine similarity
'spearman' = spearman rank correlation
'corr' = pearson correlation
'kendall' = kendall-tau b
'tau-a' = kendall-tau a
'rho-a' = spearman correlation without tie correction
'corr_cov' = pearson correlation after whitening
'cosine_cov' = unbiased distance correlation
which is equivalent to the cosine dinstance after whitening
'neg_riem_dist' = negative riemannian distance
sigma_k (numpy.ndarray):
covariance matrix of the pattern estimates.
Used only for methods 'corr_cov' and 'cosine_cov'.
Returns:
numpy.ndarray: dist:
pariwise similarities between the RDMs from the RDMs objects
"""
if method == 'cosine':
sim = compare_cosine(rdm1, rdm2)
elif method == 'spearman':
sim = compare_spearman(rdm1, rdm2)
elif method == 'corr':
sim = compare_correlation(rdm1, rdm2)
elif method in ('kendall', 'tau-b'):
sim = compare_kendall_tau(rdm1, rdm2)
elif method == 'tau-a':
sim = compare_kendall_tau_a(rdm1, rdm2)
elif method == 'rho-a':
sim = compare_rho_a(rdm1, rdm2)
elif method == 'corr_cov':
sim = compare_correlation_cov_weighted(rdm1, rdm2, sigma_k=sigma_k)
elif method == 'cosine_cov':
sim = compare_cosine_cov_weighted(rdm1, rdm2, sigma_k=sigma_k)
elif method == 'neg_riem_dist':
sim = compare_neg_riemannian_distance(rdm1, rdm2, sigma_k=sigma_k)
else:
raise ValueError('Unknown RDM comparison method requested!')
return sim
[docs]def compare_cosine(rdm1, rdm2):
"""calculates the cosine similarities between two RDMs objects
Args:
rdm1 (rsatoolbox.rdm.RDMs):
first set of RDMs
rdm2 (rsatoolbox.rdm.RDMs):
second set of RDMs
Returns:
numpy.ndarray: dist
cosine similarity between the two RDMs
"""
vector1, vector2, _ = _parse_input_rdms(rdm1, rdm2)
sim = _cosine(vector1, vector2)
return sim
[docs]def compare_correlation(rdm1, rdm2):
"""calculates the correlations between two RDMs objects
Args:
rdm1 (rsatoolbox.rdm.RDMs):
first set of RDMs
rdm2 (rsatoolbox.rdm.RDMs):
second set of RDMs
Returns:
numpy.ndarray: dist:
correlations between the two RDMs
"""
vector1, vector2, _ = _parse_input_rdms(rdm1, rdm2)
# compute by subtracting the mean and then calculating cosine similarity
vector1 = vector1 - np.mean(vector1, 1, keepdims=True)
vector2 = vector2 - np.mean(vector2, 1, keepdims=True)
sim = _cosine(vector1, vector2)
return sim
[docs]def compare_cosine_cov_weighted(rdm1, rdm2, sigma_k=None):
"""calculates the cosine similarities between two RDMs objects
Args:
rdm1 (rsatoolbox.rdm.RDMs):
first set of RDMs
rdm2 (rsatoolbox.rdm.RDMs):
second set of RDMs
Returns:
numpy.ndarray: dist:
cosine similarities between the two RDMs
"""
vector1, vector2, nan_idx = _parse_input_rdms(rdm1, rdm2)
sim = _cosine_cov_weighted(vector1, vector2, sigma_k, nan_idx)
return sim
[docs]def compare_correlation_cov_weighted(rdm1, rdm2, sigma_k=None):
"""calculates the correlations between two RDMs objects after whitening
with the covariance of the entries
Args:
rdm1 (rsatoolbox.rdm.RDMs):
first set of RDMs
rdm2 (rsatoolbox.rdm.RDMs):
second set of RDMs
Returns:
numpy.ndarray: dist:
correlations between the two RDMs
"""
vector1, vector2, nan_idx = _parse_input_rdms(rdm1, rdm2)
# compute by subtracting the mean and then calculating cosine similarity
vector1 = vector1 - np.mean(vector1, 1, keepdims=True)
vector2 = vector2 - np.mean(vector2, 1, keepdims=True)
sim = _cosine_cov_weighted(vector1, vector2, sigma_k, nan_idx)
return sim
[docs]def compare_spearman(rdm1, rdm2):
"""calculates the spearman rank correlations between
two RDMs objects
Args:
rdm1 (rsatoolbox.rdm.RDMs):
first set of RDMs
rdm2 (rsatoolbox.rdm.RDMs):
second set of RDMs
Returns:
numpy.ndarray: dist:
rank correlations between the two RDMs
"""
vector1, vector2, _ = _parse_input_rdms(rdm1, rdm2)
vector1 = np.apply_along_axis(scipy.stats.rankdata, 1, vector1)
vector2 = np.apply_along_axis(scipy.stats.rankdata, 1, vector2)
vector1 = vector1 - np.mean(vector1, 1, keepdims=True)
vector2 = vector2 - np.mean(vector2, 1, keepdims=True)
sim = _cosine(vector1, vector2)
return sim
[docs]def compare_rho_a(rdm1, rdm2):
"""calculates the spearman rank correlations between
two RDMs objects without tie correction
Args:
rdm1 (rsatoolbox.rdm.RDMs):
first set of RDMs
rdm2 (rsatoolbox.rdm.RDMs):
second set of RDMs
Returns:
numpy.ndarray: dist:
rank correlations between the two RDMs
"""
vector1, vector2, _ = _parse_input_rdms(rdm1, rdm2)
vector1 = np.apply_along_axis(scipy.stats.rankdata, 1, vector1)
vector2 = np.apply_along_axis(scipy.stats.rankdata, 1, vector2)
vector1 = vector1 - np.mean(vector1, 1, keepdims=True)
vector2 = vector2 - np.mean(vector2, 1, keepdims=True)
n = vector1.shape[1]
sim = np.einsum('ij,kj->ik', vector1, vector2) / (n ** 3 - n) * 12
return sim
[docs]def compare_kendall_tau(rdm1, rdm2):
"""calculates the Kendall-tau bs between two RDMs objects.
Kendall-tau b is the version, which corrects for ties.
We here use the implementation from scipy.
Args:
rdm1 (rsatoolbox.rdm.RDMs):
first set of RDMs
rdm2 (rsatoolbox.rdm.RDMs):
second set of RDMs
Returns:
numpy.ndarray: dist:
kendall-tau correlation between the two RDMs
"""
vector1, vector2, _ = _parse_input_rdms(rdm1, rdm2)
sim = _all_combinations(vector1, vector2, _kendall_tau)
return sim
[docs]def compare_kendall_tau_a(rdm1, rdm2):
"""calculates the Kendall-tau a based distance between two RDMs objects.
adequate when some models predict ties
Args:
rdm1 (rsatoolbox.rdm.RDMs):
first set of RDMs
rdm2 (rsatoolbox.rdm.RDMs):
second set of RDMs
Returns:
numpy.ndarray: dist:
kendall-tau a between the two RDMs
"""
vector1, vector2, _ = _parse_input_rdms(rdm1, rdm2)
sim = _all_combinations(vector1, vector2, _tau_a)
return sim
[docs]def compare_neg_riemannian_distance(rdm1, rdm2, sigma_k=None):
"""calculates the negative Riemannian distance between two RDMs objects.
Args:
rdm1 (rsatoolbox.rdm.RDMs):
first set of RDMs
rdm2 (rsatoolbox.rdm.RDMs):
second set of RDMs
Returns:
numpy.ndarray: dist:
negative Riemannian distance between the two RDMs
"""
vector1, vector2, _ = _parse_input_rdms(rdm1, rdm2)
n_cond = _get_n_from_length(vector1.shape[1])
if sigma_k is None:
sigma_k = np.eye(n_cond)
P = np.block([-1*np.ones((n_cond - 1, 1)), np.eye(n_cond - 1)])
sigma_k_hat = P@sigma_k@P.T
# construct RDM to 2nd-moment (G) transformation
pairs = pairwise_contrast(np.arange(n_cond-1))
pairs[pairs == -1] = 1
T = np.block([
[np.eye(n_cond - 1), np.zeros((n_cond-1, vector1.shape[1] - n_cond + 1))],
[0.5 * pairs, np.diag(-0.5 * np.ones(vector1.shape[1] - n_cond + 1))]])
vec_G1 = vector1@np.transpose(T)
vec_G2 = vector2@np.transpose(T)
sim = _all_combinations(vec_G1, vec_G2, _riemannian_distance, sigma_k_hat)
return sim
def _all_combinations(vectors1, vectors2, func, *args, **kwargs):
"""runs a function func on all combinations of v1 in vectors1
and v2 in vectors2 and puts the results into an array
Args:
vectors1 (numpy.ndarray):
first set of values
vectors1 (numpy.ndarray):
second set of values
func (function):
function to be applied, should take two input vectors
and return one scalar
Returns:
numpy.ndarray: value: function result over all pairs
"""
value = np.empty((len(vectors1), len(vectors2)))
k1 = 0
for v1 in vectors1:
k2 = 0
for v2 in vectors2:
value[k1, k2] = func(v1, v2, *args, **kwargs)
k2 += 1
k1 += 1
return value
def _cosine_cov_weighted_slow(vector1, vector2, sigma_k=None, nan_idx=None):
"""computes the cosine similarities between two sets of vectors
after whitening by their covariance.
Args:
vector1 (numpy.ndarray):
first vectors (2D)
vector1 (numpy.ndarray):
second vectors (2D)
sigma_k (Matrix):
optional, covariance between pattern estimates
nan_idx (numpy.ndarray):
vector of non-nan entries from input parsing
Returns:
cos (float):
cosine of the angle between vectors
"""
if nan_idx is not None:
n_cond = _get_n_from_reduced_vectors(nan_idx.reshape(1, -1))
v = _get_v(n_cond, sigma_k)
v = v[nan_idx][:, nan_idx]
else:
n_cond = _get_n_from_reduced_vectors(vector1)
v = _get_v(n_cond, sigma_k)
# compute V^-1 vector1/2 for all vectors by solving Vx = vector1/2
vector1_m = np.array([scipy.sparse.linalg.cg(v, vector1[i], atol=0)[0]
for i in range(vector1.shape[0])])
vector2_m = np.array([scipy.sparse.linalg.cg(v, vector2[i], atol=0)[0]
for i in range(vector2.shape[0])])
# compute the inner products v1^T (V^-1 v2) for all combinations
cos = np.einsum('ij,kj->ik', vector1, vector2_m)
# divide by sqrt(v1^T (V^-1 v1))
cos /= np.sqrt(np.einsum('ij,ij->i', vector1,
vector1_m)).reshape((-1, 1))
# divide by sqrt(v2^T (V^-1 v2))
cos /= np.sqrt(np.einsum('ij,ij->i', vector2,
vector2_m)).reshape((1, -1))
return cos
def _cosine_cov_weighted(vector1, vector2, sigma_k=None, nan_idx=None):
"""computes the cosine angles between two sets of vectors
weighted by the covariance
If no covariance is given this is computed using the linear CKA,
which is equivalent in this case and faster to compute.
Otherwise reverts to _cosine_cov_weighted_slow.
Args:
vector1 (numpy.ndarray):
first vectors (2D)
vector1 (numpy.ndarray):
second vectors (2D)
sigma_k (Matrix):
optional, covariance between pattern estimates
Returns:
cos (float):
cosine angle between vectors
"""
if (sigma_k is not None) and (sigma_k.ndim >= 2):
cos = _cosine_cov_weighted_slow(
vector1, vector2, sigma_k=sigma_k, nan_idx=nan_idx)
else:
if nan_idx is None:
nan_idx = np.ones(vector1[0].shape, bool)
# Compute the extended version of RDM vectors in whitened space
vector1_m = _cov_weighting(vector1, nan_idx, sigma_k)
vector2_m = _cov_weighting(vector2, nan_idx, sigma_k)
cos = _cosine(vector1_m, vector2_m)
return cos
def _cov_weighting(vector, nan_idx, sigma_k=None):
"""Transforms an array of RDM vectors in to representation
in which the elements are isotropic. This is a stretched-out
second moment matrix, with the diagonal elements appended.
To account for the fact that the off-diagonal elements are
only there once, they are multipled by 2
Args:
vector (numpy.ndarray):
RDM vectors (2D) N x n_dist
Returns:
vector_w:
weighted vectors (M x n_dist + n_cond)
"""
N, n_dist = vector.shape
n_cond = _get_n_from_length(nan_idx.shape[0])
vector_w = -0.5 * np.c_[vector, np.zeros((N, n_cond))]
rowI, colI = row_col_indicator_g(n_cond)
sumI = rowI + colI
if np.all(nan_idx):
# column and row means
m = vector_w @ sumI / n_cond
# Overall mean
mm = np.sum(vector_w * 2, axis=1, keepdims=True) / (n_cond * n_cond)
# subtract the column and row means and add overall mean
vector_w = vector_w - m @ sumI.T + mm
if sigma_k is not None:
if sigma_k.ndim == 1:
sigma_k_sqrt = np.sqrt(sigma_k)
vector_w /= rowI @ sigma_k_sqrt
vector_w /= colI @ sigma_k_sqrt
elif sigma_k.ndim == 2:
l_sigma_k = np.linalg.inv(np.linalg.cholesky(sigma_k))
Gs = np.empty((vector.shape[0], n_cond, n_cond))
for i_vec in range(vector.shape[0]):
G = scipy.spatial.distance.squareform(
vector_w[i_vec, :n_dist])
np.fill_diagonal(G, vector_w[i_vec, n_dist:])
Gs[i_vec] = G
# These two are the slow lines for this whitening
Gs = np.einsum('ij,mjk,lk->mil', l_sigma_k, Gs, l_sigma_k)
vector_w = np.einsum('ij,mjk,ik->mi', rowI, Gs, colI)
else:
nan_idx_ext = np.concatenate((nan_idx, np.ones(n_cond, bool)))
sumI = sumI[nan_idx_ext]
# get matrix for double centering with missing values:
sumI[n_dist:, :] /= 2
diag = np.concatenate((np.ones((n_dist, 1)) / 2, np.ones((n_cond, 1))))
# one line version much faster here!
vector_w = vector_w - (
vector_w
@ sumI @ np.linalg.inv(sumI.T @ (diag * sumI)) @ (diag * sumI).T)
if sigma_k is not None:
if sigma_k.ndim == 1:
sigma_k_sqrt = np.sqrt(sigma_k)
vector_w /= rowI[nan_idx_ext] @ sigma_k_sqrt
vector_w /= colI[nan_idx_ext] @ sigma_k_sqrt
elif sigma_k.ndim == 2:
raise ValueError('cannot handle sigma_k and nans')
# Weight the off-diagnoal terms double
vector_w[:, :n_dist] = vector_w[:, :n_dist] * np.sqrt(2)
return vector_w
def _cosine(vector1, vector2):
"""computes the cosine angles between two sets of vectors
Args:
vector1 (numpy.ndarray):
first vectors (2D)
vector1 (numpy.ndarray):
second vectors (2D)
Returns:
cos (float):
cosine angle between vectors
"""
norm_1 = np.sqrt(np.einsum('ij,ij->i', vector1, vector1))
norm_2 = np.sqrt(np.einsum('ij,ij->i', vector2, vector2))
sel_1 = norm_1 > 0
sel_2 = norm_2 > 0
# without more indexing if all vectors are nonzero length
if np.all(sel_1) and np.all(sel_2):
# compute all inner products
cos_ok = np.einsum('ij,kj->ik', vector1, vector2)
# divide by sqrt of the inner products with themselves
cos_ok /= norm_1.reshape((-1, 1))
cos_ok /= norm_2.reshape((1, -1))
return cos_ok
# keep track of indexing if some vectors are 0
# compute all inner products
cos_ok = np.einsum('ij,kj->ik', vector1[sel_1], vector2[sel_2])
# divide by sqrt of the inner products with themselves
cos_ok /= norm_1[sel_1].reshape((-1, 1))
cos_ok /= norm_2[sel_2].reshape((1, -1))
cos = np.zeros((vector1.shape[0], vector2.shape[0]))
np.putmask(cos, np.outer(norm_1 > 0, norm_2 > 0), cos_ok)
return cos
def _riemannian_distance(vec_G1, vec_G2, sigma_k):
"""computes the Riemannian distance between two vectorized second moments
Args:
vec_G1 (numpy.ndarray):
first vectorized second-moment
vec_G2 (numpy.ndarray):
second vectorized second-moment
Returns:
neg_riem (float):
negative riemannian distance
"""
n_cond = _get_n_from_length(len(vec_G1))
G1 = np.diag(vec_G1[0:(n_cond-1)])+squareform(vec_G1[(n_cond-1):len(vec_G1)])
G2 = np.diag(vec_G2[0:(n_cond-1)])+squareform(vec_G2[(n_cond-1):len(vec_G2)])
def fun(theta):
return np.sqrt((np.log(linalg.eigvalsh(
np.exp(theta[0]) * G1 + np.exp(theta[1]) * sigma_k, G2))**2).sum())
theta = minimize(fun, (0, 0), method='Nelder-Mead')
neg_riem = -1 * theta.fun
return neg_riem
def _kendall_tau(vector1, vector2):
"""computes the kendall-tau between two vectors
Args:
vector1 (numpy.ndarray):
first vector
vector1 (numpy.ndarray):
second vector
Returns:
tau (float):
kendall-tau
"""
tau = scipy.stats.kendalltau(vector1, vector2).correlation
return tau
def _tau_a(vector1, vector2):
"""computes kendall-tau a between two vectors
based on modifying scipy.stats.kendalltau
Args:
vector1 (numpy.ndarray):
first vector
vector1 (numpy.ndarray):
second vector
Returns:
tau (float):
kendall-tau a
"""
size = vector1.size
vector1, vector2 = _sort_and_rank(vector1, vector2)
vector2, vector1 = _sort_and_rank(vector2, vector1)
dis = _kendall_dis(vector1, vector2) # discordant pairs
obs = np.r_[True, (vector1[1:] != vector1[:-1]) |
(vector2[1:] != vector2[:-1]), True]
cnt = np.diff(np.nonzero(obs)[0]).astype('int64', copy=False)
ntie = (cnt * (cnt - 1) // 2).sum() # joint ties
xtie, _, _ = _count_rank_tie(vector1) # ties in x, stats
ytie, _, _ = _count_rank_tie(vector2) # ties in y, stats
tot = (size * (size - 1)) // 2
# Note that tot = con + dis + (xtie - ntie) + (ytie - ntie) + ntie
# = con + dis + xtie + ytie - ntie
con_minus_dis = tot - xtie - ytie + ntie - 2 * dis
tau = con_minus_dis / tot
# Limit range to fix computational errors
tau = min(1., max(-1., tau))
return tau
def _sort_and_rank(vector1, vector2):
"""does the sort and rank step of the _tau calculation"""
perm = np.argsort(vector2, kind='mergesort')
vector1 = vector1[perm]
vector2 = vector2[perm]
vector2 = np.r_[True, vector2[1:] != vector2[:-1]].cumsum(dtype=np.intp)
return vector1, vector2
def _count_rank_tie(ranks):
""" counts tied ranks for kendall-tau calculation"""
cnt = np.bincount(ranks).astype('int64', copy=False)
cnt = cnt[cnt > 1]
return ((cnt * (cnt - 1) // 2).sum(),
(cnt * (cnt - 1.) * (cnt - 2)).sum(),
(cnt * (cnt - 1.) * (2*cnt + 5)).sum())
def _get_v(n_cond, sigma_k):
""" get the rdm covariance from sigma_k """
# calculate Xi
c_mat = pairwise_contrast_sparse(np.arange(n_cond))
if sigma_k is None:
xi = c_mat @ c_mat.transpose()
elif sigma_k.ndim == 1:
sigma_k = scipy.sparse.diags(sigma_k)
xi = c_mat @ sigma_k @ c_mat.transpose()
else:
sigma_k = scipy.sparse.csr_matrix(sigma_k)
xi = c_mat @ sigma_k @ c_mat.transpose()
# calculate V
v = xi.multiply(xi).tocsc()
return v
def _parse_input_rdms(rdm1, rdm2):
"""Gets the vector representation of input RDMs, raises an error if
the two RDMs objects have different dimensions
Args:
rdm1 (rsatoolbox.rdm.RDMs):
first set of RDMs
rdm2 (rsatoolbox.rdm.RDMs):
second set of RDMs
"""
if not isinstance(rdm1, np.ndarray):
vector1 = rdm1.get_vectors()
else:
if len(rdm1.shape) == 1:
vector1 = rdm1.reshape(1, -1)
else:
vector1 = rdm1
if not isinstance(rdm2, np.ndarray):
vector2 = rdm2.get_vectors()
else:
if len(rdm2.shape) == 1:
vector2 = rdm2.reshape(1, -1)
else:
vector2 = rdm2
if not vector1.shape[1] == vector2.shape[1]:
raise ValueError('rdm1 and rdm2 must be RDMs of equal shape')
nan_idx = ~np.isnan(vector1)
vector1_no_nan = vector1[nan_idx].reshape(vector1.shape[0], -1)
vector2_no_nan = vector2[~np.isnan(vector2)].reshape(vector2.shape[0], -1)
if not vector1_no_nan.shape[1] == vector2_no_nan.shape[1]:
raise ValueError('rdm1 and rdm2 have different nan positions')
return vector1_no_nan, vector2_no_nan, nan_idx[0]