diff --git a/mapca/mapca.py b/mapca/mapca.py
index 4ea59d7..0349c5d 100644
--- a/mapca/mapca.py
+++ b/mapca/mapca.py
@@ -128,7 +128,7 @@ def __init__(self, criterion="mdl", normalize=True):
         self.criterion = criterion
         self.normalize = normalize
 
-    def _fit(self, img, mask):
+    def _fit(self, img, mask, subsample_depth=None):
         LGR.info(
             "Performing dimensionality reduction based on GIFT "
             "(https://trendscenter.org/software/gift/) and Li, Y. O., Adali, T., "
@@ -210,9 +210,9 @@ def _fit(self, img, mask):
 
         sub_iid_sp_median = int(np.round(np.median(sub_iid_sp)))
 
-        # Calculating and logging the mean value to check if the differences in median
-        # within a dataset represent very small changes in the mean. It seems like this
-        # is the closest to a non-discrete value to store to compare across runs.
+        # Log the mean value to check whether differences in the median within a
+        # dataset reflect only small changes in the mean. The mean is the closest
+        # thing to a non-discrete value to store for comparisons across runs.
         sub_iid_sp_mean = np.round(np.mean(sub_iid_sp), 3)
 
         if np.floor(np.power(n_samples / n_timepoints, 1 / dim_n)) < sub_iid_sp_median:
@@ -224,6 +224,34 @@ def _fit(self, img, mask):
 
         LGR.info("Estimated subsampling depth for effective i.i.d samples: %d" % sub_iid_sp_median)
 
+        # Always save the calculated IID subsample value but, if there is a
+        # user-provided value, assign that to sub_iid_sp_median and use it instead.
+        calculated_sub_iid_sp_median = sub_iid_sp_median
+        if subsample_depth is not None:
+            if (
+                (
+                    isinstance(subsample_depth, int)
+                    or (
+                        isinstance(subsample_depth, float)
+                        and subsample_depth == int(subsample_depth)
+                    )
+                )
+                and (subsample_depth >= 1)
+                and ((n_samples / (subsample_depth**3)) >= 100)
+            ):
+                sub_iid_sp_median = int(subsample_depth)
+
+            else:
+                # The logic of the upper bound is that subsample_depth^3 is the factor
+                # by which the number of samples is reduced, and at least 100 samples
+                # should remain for a useful analysis. Given that a masked volume
+                # leaves fewer samples in 3D space, this is likely a very liberal
+                # upper bound, but it is good to at least include one.
+                raise ValueError(
+                    "subsample_depth must be an integer >= 1 that retains at least "
+                    "100 samples after subsampling. Received: %s" % subsample_depth
+                )
+
         N = np.round(n_samples / np.power(sub_iid_sp_median, dim_n))
 
         if sub_iid_sp_median != 1:
@@ -358,8 +386,10 @@ def _fit(self, img, mask):
             "explained_variance_total": cumsum_varexp,
         }
         self.subsampling_ = {
-            "calculated_IID_subsample_depth": sub_iid_sp_median,
+            "calculated_IID_subsample_depth": calculated_sub_iid_sp_median,
             "calculated_IID_subsample_mean": sub_iid_sp_mean,
+            "IID_subsample_input": sub_iid_sp,
+            "used_IID_subsample_depth": sub_iid_sp_median,
             "effective_num_IID_samples": N,
             "total_num_samples": n_samples,
         }
@@ -384,7 +414,7 @@ def _fit(self, img, mask):
         self.u_ = component_maps
         self.u_nii_ = nib.Nifti1Image(component_maps_3d, img.affine, img.header)
 
-    def fit(self, img, mask):
+    def fit(self, img, mask, subsample_depth=None):
         """Fit the model with X.
 
         Parameters
@@ -393,16 +423,21 @@ def fit(self, img, mask):
             Data on which to apply PCA.
         mask : 3D niimg_like
             Mask to apply on ``img``.
+        subsample_depth : int, optional
+            Dimensionality reduction is calculated on a subset of voxels defined by
+            this depth. 2 would mean using every other voxel in 3D space and 3 would
+            mean every 3rd voxel. Default=None (estimated depth to make remaining
+            voxels independent and identically distributed (IID)).
 
         Returns
         -------
         self : object
             Returns the instance itself.
         """
-        self._fit(img, mask)
+        self._fit(img, mask, subsample_depth=subsample_depth)
         return self
 
-    def fit_transform(self, img, mask):
+    def fit_transform(self, img, mask, subsample_depth=None):
         """Fit the model with X and apply the dimensionality reduction on X.
 
         Parameters
@@ -411,6 +446,11 @@ def fit_transform(self, img, mask):
             Data on which to apply PCA.
         mask : 3D niimg_like
             Mask to apply on ``img``.
+        subsample_depth : int, optional
+            Dimensionality reduction is calculated on a subset of voxels defined by
+            this depth. 2 would mean using every other voxel in 3D space and 3 would
+            mean every 3rd voxel. Default=None (estimated depth to make remaining
+            voxels independent and identically distributed (IID)).
 
         Returns
         -------
@@ -421,8 +461,18 @@ def fit_transform(self, img, mask):
         -----
         The transformation step is different from scikit-learn's approach, which ignores
         explained variance.
+
+        subsample_depth is calculated automatically and should be consistent across a
+        dataset acquired with the same parameters, since the spatial dependence of
+        voxels should be similar. In practice, the estimate sometimes differs between
+        runs and causes problems. For a dataset with 100 runs, it might be 2 for most
+        runs, but where it is 3 substantially fewer components are estimated, and where
+        it is 1 there is almost no dimensionality reduction. subsample_depth is
+        therefore also exposed as an optional user-provided parameter. If mapca seems
+        to be producing periodic mis-estimates, this parameter makes it possible to
+        hold the IID subsample depth consistent across a dataset.
         """
-        self._fit(img, mask)
+        self._fit(img, mask, subsample_depth=subsample_depth)
         return self.transform(img)
 
     def transform(self, img):
@@ -490,7 +540,7 @@ def inverse_transform(self, img, mask):
         return img_orig
 
 
-def ma_pca(img, mask, criterion="mdl", normalize=False):
+def ma_pca(img, mask, criterion="mdl", normalize=False, subsample_depth=None):
     """Perform moving average-based PCA on imaging data.
 
     Run Singular Value Decomposition (SVD) on input data,
@@ -512,6 +562,11 @@ def ma_pca(img, mask, criterion="mdl", normalize=False):
         ``kic`` refers to the Kullback-Leibler Information Criterion, which is the middle option.
     normalize : bool, optional
         Whether to normalize (zero mean and unit standard deviation) or not. Default is False.
+    subsample_depth : int, optional
+        Dimensionality reduction is calculated on a subset of voxels defined by
+        this depth. 2 would mean using every other voxel in 3D space and 3 would
+        mean every 3rd voxel. Default=None (estimated depth to make remaining
+        voxels independent and identically distributed (IID)).
 
     Returns
     -------
@@ -525,7 +580,7 @@ def ma_pca(img, mask, criterion="mdl", normalize=False):
         Component timeseries.
""" pca = MovingAveragePCA(criterion=criterion, normalize=normalize) - _ = pca.fit_transform(img, mask) + _ = pca.fit_transform(img, mask, subsample_depth=subsample_depth) u = pca.u_ s = pca.explained_variance_ varex_norm = pca.explained_variance_ratio_ diff --git a/mapca/tests/conftest.py b/mapca/tests/conftest.py index 8ddfe21..21f6819 100644 --- a/mapca/tests/conftest.py +++ b/mapca/tests/conftest.py @@ -53,7 +53,7 @@ def test_mask(testpath): @pytest.fixture def test_ts(testpath): return fetch_file('gz2hb', testpath, - 'compt_ts.npy') + 'comp_ts.npy') @pytest.fixture diff --git a/mapca/tests/test_mapca.py b/mapca/tests/test_mapca.py index 38bb499..aefdf01 100644 --- a/mapca/tests/test_mapca.py +++ b/mapca/tests/test_mapca.py @@ -95,3 +95,9 @@ def test_MovingAveragePCA(): test_data_est = pca2.inverse_transform(u2, test_mask_img) assert test_data_est.shape == test_img.shape + + # Testing setting inputting a pre-defined subsampling depth + pca3 = MovingAveragePCA(criterion="mdl", normalize=False) + pca3.fit(test_img, test_mask_img, subsample_depth=2) + assert pca3.subsampling_['calculated_IID_subsample_depth'] == 1 + assert pca3.subsampling_['used_IID_subsample_depth'] == 2