#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
generation of crossvalidation splits
"""
from copy import deepcopy
import numpy as np
from rsatoolbox.util.rdm_utils import add_pattern_index
from rsatoolbox.util.inference_util import default_k_pattern, default_k_rdm
[docs]def sets_leave_one_out_pattern(rdms, pattern_descriptor):
""" generates training and test set combinations by leaving one level
of pattern_descriptor out as a test set.
This is only sensible if pattern_descriptor already defines larger groups!
the ceil_train_set contains the rdms for the test-patterns from the
training-rdms. This is required for computing the noise-ceiling
Args:
rdms(rsatoolbox.rdm.RDMs): rdms to use
pattern_descriptor(String): descriptor to select groups
Returns:
train_set(list): list of tuples (rdms, pattern_idx)
test_set(list): list of tuples (rdms, pattern_idx)
ceil_set(list): list of tuples (rdms, pattern_idx)
"""
pattern_descriptor, pattern_select = \
add_pattern_index(rdms, pattern_descriptor)
train_set = []
test_set = []
ceil_set = []
for i_pattern in pattern_select:
pattern_idx_train = np.setdiff1d(pattern_select, i_pattern)
rdms_train = rdms.subset_pattern(pattern_descriptor,
pattern_idx_train)
pattern_idx_test = [i_pattern]
rdms_test = rdms.subset_pattern(pattern_descriptor,
pattern_idx_test)
rdms_ceil = rdms.subset_pattern(pattern_descriptor,
pattern_idx_test)
train_set.append((rdms_train, pattern_idx_train))
test_set.append((rdms_test, pattern_idx_test))
ceil_set.append((rdms_ceil, pattern_idx_test))
return train_set, test_set, ceil_set
[docs]def sets_leave_one_out_rdm(rdms, rdm_descriptor='index'):
""" generates training and test set combinations by leaving one level
of rdm_descriptor out as a test set.\
Args:
rdms(rsatoolbox.rdm.RDMs): rdms to use
rdm_descriptor(String): descriptor to select groups
Returns:
train_set(list): list of tuples (rdms, pattern_idx)
test_set(list): list of tuples (rdms, pattern_idx)
ceil_set(list): list of tuples (rdms, pattern_idx)
"""
rdm_select = rdms.rdm_descriptors[rdm_descriptor]
rdm_select = np.unique(rdm_select)
if len(rdm_select) > 1:
train_set = []
test_set = []
for i_pattern in rdm_select:
rdm_idx_train = np.setdiff1d(rdm_select, i_pattern)
rdms_train = rdms.subset(rdm_descriptor,
rdm_idx_train)
rdm_idx_test = [i_pattern]
rdms_test = rdms.subset(rdm_descriptor,
rdm_idx_test)
train_set.append((rdms_train, np.arange(rdms.n_cond)))
test_set.append((rdms_test, np.arange(rdms.n_cond)))
ceil_set = train_set
else:
Warning('leave one out called with only one group')
train_set = [(rdms, np.arange(rdms.n_cond))]
test_set = [(rdms, np.arange(rdms.n_cond))]
ceil_set = [(rdms, np.arange(rdms.n_cond))]
return train_set, test_set, ceil_set
[docs]def sets_k_fold(rdms, k_rdm=None, k_pattern=None, random=True,
pattern_descriptor='index', rdm_descriptor='index'):
""" generates training and test set combinations by splitting into k
similar sized groups. This version splits both over rdms and over patterns
resulting in k_rdm * k_pattern (training, test) pairs.
If a k is set to 1 the corresponding dimension is not crossvalidated.
Args:
rdms(rsatoolbox.rdm.RDMs): rdms to use
pattern_descriptor(String): descriptor to select pattern groups
rdm_descriptor(String): descriptor to select rdm groups
k_rdm(int): number of rdm groups
k_pattern(int): number of pattern groups
random(bool): whether the assignment shall be randomized
Returns:
train_set(list): list of tuples (rdms, pattern_idx)
test_set(list): list of tuples (rdms, pattern_idx)
ceil_set(list): list of tuples (rdms, pattern_idx)
"""
rdm_select = rdms.rdm_descriptors[rdm_descriptor]
rdm_select = np.unique(rdm_select)
if k_rdm is None:
k_rdm = default_k_rdm(len(rdm_select))
pattern_descriptor, pattern_select = \
add_pattern_index(rdms, pattern_descriptor)
if k_pattern is None:
k_pattern = default_k_pattern(len(pattern_select))
assert k_rdm <= len(rdm_select), \
'Can make at most as many groups as rdms'
if random:
np.random.shuffle(rdm_select)
group_size_rdm = np.floor(len(rdm_select) / k_rdm)
additional_rdms = len(rdm_select) % k_rdm
train_set = []
test_set = []
ceil_set = []
for i_group in range(k_rdm):
test_idx = np.arange(i_group * group_size_rdm,
(i_group + 1) * group_size_rdm)
if i_group < additional_rdms:
test_idx = np.concatenate((test_idx, [len(rdm_select)-(i_group+1)]))
if k_rdm <= 1:
train_idx = test_idx
else:
train_idx = np.setdiff1d(np.arange(len(rdm_select)),
test_idx)
rdm_idx_test = [rdm_select[int(idx)] for idx in test_idx]
rdm_idx_train = [rdm_select[int(idx)] for idx in train_idx]
rdms_test = rdms.subsample(rdm_descriptor,
rdm_idx_test)
rdms_train = rdms.subsample(rdm_descriptor,
rdm_idx_train)
train_new, test_new, _ = sets_k_fold_pattern(
rdms_train, k=k_pattern,
pattern_descriptor=pattern_descriptor, random=random)
ceil_new = deepcopy(test_new)
for i_pattern in range(k_pattern):
test_new[i_pattern][0] = rdms_test.subset_pattern(
by=pattern_descriptor,
value=test_new[i_pattern][1])
train_set += train_new
test_set += test_new
ceil_set += ceil_new
return train_set, test_set, ceil_set
[docs]def sets_k_fold_rdm(rdms, k_rdm=None, random=True, rdm_descriptor='index'):
""" generates training and test set combinations by splitting into k
similar sized groups. This version splits both over rdms and over patterns
resulting in k_rdm * k_pattern (training, test) pairs.
Args:
rdms(rsatoolbox.rdm.RDMs): rdms to use
rdm_descriptor(String): descriptor to select rdm groups
k_rdm(int): number of rdm groups
random(bool): whether the assignment shall be randomized
Returns:
train_set(list): list of tuples (rdms, pattern_idx)
test_set(list): list of tuples (rdms, pattern_idx)
"""
rdm_select = rdms.rdm_descriptors[rdm_descriptor]
rdm_select = np.unique(rdm_select)
if k_rdm is None:
k_rdm = default_k_rdm(len(rdm_select))
assert k_rdm <= len(rdm_select), \
'Can make at most as many groups as rdms'
if random:
np.random.shuffle(rdm_select)
group_size_rdm = np.floor(len(rdm_select) / k_rdm)
additional_rdms = len(rdm_select) % k_rdm
train_set = []
test_set = []
for i_group in range(k_rdm):
test_idx = np.arange(i_group * group_size_rdm,
(i_group + 1) * group_size_rdm)
if i_group < additional_rdms:
test_idx = np.concatenate((test_idx, [len(rdm_select)-(i_group+1)]))
train_idx = np.setdiff1d(np.arange(len(rdm_select)),
test_idx)
rdm_idx_test = [rdm_select[int(idx)] for idx in test_idx]
rdm_idx_train = [rdm_select[int(idx)] for idx in train_idx]
rdms_test = rdms.subsample(rdm_descriptor,
rdm_idx_test)
rdms_train = rdms.subsample(rdm_descriptor,
rdm_idx_train)
train_set.append([rdms_train, np.arange(rdms_train.n_cond)])
test_set.append([rdms_test, np.arange(rdms_train.n_cond)])
ceil_set = train_set
return train_set, test_set, ceil_set
[docs]def sets_k_fold_pattern(rdms, pattern_descriptor='index',
k=None, random=False):
""" generates training and test set combinations by splitting into k
similar sized groups. This version splits in the given order or
randomizes the order. For k=1 training and test_set are whole dataset,
i.e. no crossvalidation is performed.
For only crossvalidating over patterns there is no independent training
set for calculating a noise ceiling for the patterns.
To express this we set ceil_set to None, which makes the crossvalidation
function calculate a leave one rdm out noise ceiling for the right
patterns instead.
Args:
rdms(rsatoolbox.rdm.RDMs): rdms to use
pattern_descriptor(String): descriptor to select groups
k(int): number of groups
random(bool): whether the assignment shall be randomized
Returns:
train_set(list): list of tuples (rdms, pattern_idx)
test_set(list): list of tuples (rdms, pattern_idx)
ceil_set = None
"""
pattern_descriptor, pattern_select = \
add_pattern_index(rdms, pattern_descriptor)
if k is None:
k = default_k_pattern(len(pattern_select))
assert k <= len(pattern_select), \
'Can make at most as many groups as conditions'
if random:
np.random.shuffle(pattern_select)
group_size = np.floor(len(pattern_select) / k)
additional_patterns = len(pattern_select) % k
train_set = []
test_set = []
for i_group in range(k):
test_idx = np.arange(i_group * group_size,
(i_group + 1) * group_size)
if i_group < additional_patterns:
test_idx = np.concatenate((test_idx, [len(pattern_select)-(i_group+1)]))
if k <= 1:
train_idx = test_idx
else:
train_idx = np.setdiff1d(np.arange(len(pattern_select)),
test_idx)
pattern_idx_test = [pattern_select[int(idx)] for idx in test_idx]
pattern_idx_train = [pattern_select[int(idx)] for idx in train_idx]
rdms_test = rdms.subset_pattern(pattern_descriptor,
pattern_idx_test)
rdms_train = rdms.subset_pattern(pattern_descriptor,
pattern_idx_train)
test_set.append([rdms_test, pattern_idx_test])
train_set.append([rdms_train, pattern_idx_train])
ceil_set = None
return train_set, test_set, ceil_set
[docs]def sets_of_k_rdm(rdms, rdm_descriptor='index', k=5, random=False):
""" generates training and test set combinations by splitting into
groups of k. This version splits in the given order or
randomizes the order. If the number of patterns is not divisible by k
patterns are added to the first groups such that those have k+1 patterns
Args:
rdms(rsatoolbox.rdm.RDMs): rdms to use
pattern_descriptor(String): descriptor to select groups
k(int): number of groups
random(bool): whether the assignment shall be randomized
Returns:
train_set(list): list of tuples (rdms, pattern_idx)
test_set(list): list of tuples (rdms, pattern_idx)
ceil_set(list): list of tuples (rdms, pattern_idx)
"""
rdm_select = rdms.rdm_descriptors[rdm_descriptor]
rdm_select = np.unique(rdm_select)
assert k <= len(rdm_select) / 2, \
'to form groups we can use at most half the patterns per group'
n_groups = int(len(rdm_select) / k)
return sets_k_fold_rdm(rdms, rdm_descriptor=rdm_descriptor,
k=n_groups, random=random)
[docs]def sets_of_k_pattern(rdms, pattern_descriptor=None, k=5, random=False):
""" generates training and test set combinations by splitting into
groups of k. This version splits in the given order or
randomizes the order. If the number of patterns is not divisible by k
patterns are added to the first groups such that those have k+1 patterns
Args:
rdms(rsatoolbox.rdm.RDMs): rdms to use
pattern_descriptor(String): descriptor to select groups
k(int): number of groups
random(bool): whether the assignment shall be randomized
Returns:
train_set(list): list of tuples (rdms, pattern_idx)
test_set(list): list of tuples (rdms, pattern_idx)
"""
pattern_descriptor, pattern_select = \
add_pattern_index(rdms, pattern_descriptor)
assert k <= len(pattern_select) / 2, \
'to form groups we can use at most half the patterns per group'
n_groups = int(len(pattern_select) / k)
return sets_k_fold_pattern(rdms, pattern_descriptor=pattern_descriptor,
k=n_groups, random=random)
[docs]def sets_random(rdms, n_rdm=None, n_pattern=None, n_cv=2,
pattern_descriptor='index', rdm_descriptor='index'):
""" generates training and test set combinations by selecting random
test sets of n_rdm RDMs and n_pattern patterns and using the rest of
the data as the training set.
If a n is set to 0 the corresponding dimension is not crossvalidated.
Args:
rdms(rsatoolbox.rdm.RDMs): rdms to split
pattern_descriptor(String): descriptor to select pattern groups
rdm_descriptor(String): descriptor to select rdm groups
n_rdm(int): number of rdms per test set
n_pattern(int): number of patterns per test set
Returns:
train_set(list): list of tuples (rdms, pattern_idx)
test_set(list): list of tuples (rdms, pattern_idx)
ceil_set(list): list of tuples (rdms, pattern_idx)
"""
rdm_select = rdms.rdm_descriptors[rdm_descriptor]
rdm_select = np.unique(rdm_select)
if n_rdm is None:
k_rdm = default_k_rdm(len(rdm_select))
n_rdm = int(np.floor(len(rdm_select) / k_rdm))
pattern_descriptor, pattern_select = \
add_pattern_index(rdms, pattern_descriptor)
if n_pattern is None:
k_pattern = default_k_pattern(len(pattern_select))
n_pattern = int(np.floor(len(pattern_select) / k_pattern))
train_set = []
test_set = []
ceil_set = []
for _i_group in range(n_cv):
# shuffle
np.random.shuffle(rdm_select)
np.random.shuffle(pattern_select)
# choose indices based on n_rdm
if n_rdm == 0:
train_idx = np.arange(len(rdm_select))
test_idx = np.arange(len(rdm_select))
else:
test_idx = np.arange(n_rdm)
train_idx = np.arange(n_rdm, len(rdm_select))
# take subset of rdms
rdm_idx_test = [rdm_select[int(idx)] for idx in test_idx]
rdm_idx_train = [rdm_select[int(idx)] for idx in train_idx]
rdms_test = rdms.subsample(rdm_descriptor,
rdm_idx_test)
rdms_train = rdms.subsample(rdm_descriptor,
rdm_idx_train)
# choose indices based on n_pattern
if n_pattern == 0:
train_idx = np.arange(len(pattern_select))
test_idx = np.arange(len(pattern_select))
else:
test_idx = np.arange(n_pattern)
train_idx = np.arange(n_pattern, len(pattern_select))
pattern_idx_test = [pattern_select[int(idx)] for idx in test_idx]
pattern_idx_train = [pattern_select[int(idx)] for idx in train_idx]
rdms_test = rdms_test.subset_pattern(pattern_descriptor,
pattern_idx_test)
rdms_ceil = rdms_train.subset_pattern(pattern_descriptor,
pattern_idx_test)
rdms_train = rdms_train.subset_pattern(pattern_descriptor,
pattern_idx_train)
test_set.append([rdms_test, pattern_idx_test])
train_set.append([rdms_train, pattern_idx_train])
ceil_set.append([rdms_ceil, pattern_idx_test])
return train_set, test_set, ceil_set