Merge pull request #20 from rmarkello/postprocessing
[ENH] Adds `abagen.correct` for postprocessing
rmarkello authored Sep 21, 2018
2 parents c29abbf + 8a97a35 commit 3a276cb
Showing 6 changed files with 233 additions and 10 deletions.
16 changes: 10 additions & 6 deletions abagen/allen.py
@@ -200,11 +200,11 @@ def group_by_label(microarray, sample_labels, labels=None, metric='mean'):
Microarray expression data, where `S` is samples and `G` is genes
sample_labels : (S, 1) pandas.DataFrame
Parcel labels for `S` samples, as returned by e.g., `label_samples()`
labels : (L,) array_like, optional
All possible labels for parcellation scheme (to account for possibility
that some parcels have NO expression data). Default: None
labels : (L,) array_like, optional
All possible labels for parcellation (to account for possibility that
some parcels have NO expression data). Default: None
metric : str or func, optional
Mechanism by which to collapse across samples within a parcel. If a
Mechanism by which to collapse across samples within a parcel. If a
str, should be in ['mean', 'median']; if a function, should be able to
accept an `N`-dimensional input and the `axis` keyword argument and
return an `N-1`-dimensional output. Default: 'mean'
@@ -240,7 +240,7 @@ def group_by_label(microarray, sample_labels, labels=None, metric='mean'):
def get_expression_data(files, atlas, atlas_info=None, *,
exact=True, tolerance=2, metric='mean',
ibf_threshold=0.5, corrected_mni=True,
return_counts=False):
return_counts=False, return_donors=False):
"""
Assigns microarray expression data in `files` to ROIs defined in `atlas`
@@ -332,6 +332,9 @@ def get_expression_data(files, atlas, atlas_info=None, *,
return_counts : bool, optional
Whether to return how many samples were assigned to each parcel in
`atlas` for each donor. Default: False
return_donors : bool, optional
Whether to return donor-level expression arrays instead of aggregating
expression across donors with provided `metric`. Default: False
Returns
-------
@@ -422,7 +425,8 @@ def get_expression_data(files, atlas, atlas_info=None, *,

# normalize data with SRS and aggregate across donors
expression = [process.normalize_expression(e) for e in expression]
expression = pd.concat(expression).groupby('label').aggregate(metric)
if not return_donors:
expression = pd.concat(expression).groupby('label').aggregate(metric)

if return_counts:
return expression, counts[1:]
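
A minimal sketch of how the new `return_donors` flag is meant to be used. The `fetch_microarray` call and donor IDs below are illustrative assumptions rather than part of this diff; only the `get_expression_data()` keywords come from the changes above.

import abagen
from abagen.datasets import fetch_desikan_killiany

# illustrative inputs; the fetcher name/signature is assumed, not from this diff
atlas = fetch_desikan_killiany()
files = abagen.fetch_microarray(donors=['12876', '15496'])

# default: expression aggregated across donors into one (region x gene) frame
expression = abagen.allen.get_expression_data(files, atlas.image, atlas.info)

# new: keep one (region x gene) frame per donor for later post-processing
donor_expression = abagen.allen.get_expression_data(files, atlas.image,
                                                    atlas.info,
                                                    return_donors=True)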
172 changes: 172 additions & 0 deletions abagen/correct.py
@@ -0,0 +1,172 @@
# -*- coding: utf-8 -*-
"""
Functions for post-processing region x gene expression data
"""

import itertools
from nilearn._utils import check_niimg_3d
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from abagen import utils


def remove_distance(expression, atlas, atlas_info=None):
"""
Corrects for distance-dependent correlation effects in `expression`
Regresses Euclidean distance between regions in `atlas` from correlated
gene expression array generated from `expression`. If `atlas_info` is
provided, different connection types (e.g., cortex-cortex, cortex-subcortex,
subcortex-subcortex) will be residualized independently.
Parameters
----------
expression : (R x G) :class:`pandas.DataFrame`
Microarray expression for `R` regions in `atlas` for `G` genes,
aggregated across donors.
atlas : niimg-like object
A parcellation image in MNI space, where each parcel is identified by a
unique integer ID
atlas_info : str or :class:`pandas.DataFrame`, optional
Filepath to or pre-loaded dataframe containing information about
`atlas`. Must have at least columns 'id', 'hemisphere', and 'structure'
containing information mapping atlas IDs to hemisphere (i.e., "L", "R")
and broad structural class (i.e., "cortex", "subcortex", "cerebellum").
Default: None
Returns
-------
corrgene : (R x R) :class:`numpy.ndarray`
Correlated gene `expression` data for `R` regions in `atlas`,
residualized against the spatial distances between region pairs
"""

# load atlas_info, if provided
atlas = check_niimg_3d(atlas)
if atlas_info is not None:
atlas_info = utils.check_atlas_info(atlas, atlas_info)

# check expression data and make correlation matrix
if not isinstance(expression, pd.DataFrame):
raise TypeError('Provided `expression` data must be type pd.DataFrame '
'not {}'.format(type(expression)))
genecorr = np.corrcoef(expression.get_values())

# we'll do basic Euclidean distance correction for now
# TODO: implement gray matter volume / cortical surface path distance
centroids = utils.get_centroids(atlas, labels_of_interest=expression.index)
dist = cdist(centroids, centroids, metric='euclidean')

corr_resid = np.zeros_like(genecorr)
triu_inds = np.triu_indices_from(genecorr, k=1)
# if no atlas_info, just residualize all correlations against distance
if atlas_info is None:
corr_resid[triu_inds] = _resid_dist(genecorr[triu_inds],
dist[triu_inds])
# otherwise, we can residualize the different connection types separately
else:
triu_inds = np.ravel_multi_index(triu_inds, corr_resid.shape)
genecorr, dist = genecorr.ravel(), dist.ravel()
types = ['cortex', 'subcortex']
for src, tar in itertools.combinations_with_replacement(types, 2):
# get indices of sources and targets
sources = np.where(atlas_info.structure == src)[0]
targets = np.where(atlas_info.structure == tar)[0]
inds = np.ravel_multi_index(np.ix_(sources, targets),
corr_resid.shape)
if src != tar:
rev = np.ravel_multi_index(np.ix_(targets, sources),
corr_resid.shape)
inds = np.append(inds.ravel(), rev.ravel())
# find intersection of source / target indices + upper triangle
inds = np.intersect1d(triu_inds, inds)
back = np.unravel_index(inds, corr_resid.shape)
corr_resid[back] = _resid_dist(genecorr[inds], dist[inds])

corr_resid = (corr_resid + corr_resid.T + np.eye(len(corr_resid)))

return corr_resid


def _resid_dist(dv, iv):
"""
Calculates residuals of `dv` after controlling for `iv`
Parameters
----------
dv : array_like
Dependent variable
iv : array_like
Independent variable; removed from `dv`
Returns
-------
residuals : array_like
Residuals of `dv` after controlling for `iv`
"""
dv = dv.squeeze()
distance = np.column_stack((iv, np.ones_like(iv)))
betas, *rest = np.linalg.lstsq(distance, dv, rcond=None)
residuals = dv - (distance @ betas)

return residuals.squeeze()


def keep_stable_genes(expression, threshold=0.9, percentile=True, rank=True):
"""
Removes genes in `expression` with differential stability < `threshold`
Calculates the similarity of gene expression across brain regions for every
pair of donors in `expression`. Similarity is averaged across donor pairs
and genes whose mean similarity falls below `threshold` are removed.
Parameters
----------
expression : list of (R x G) :class:`pandas.DataFrame`
Where each entry is the microarray expression of `R` regions across `G`
genes for a given donor
threshold : [0, 1] float, optional
Minimum required average similarity (e.g., correlation) across donors
for a gene to be retained. Default: 0.9
percentile : bool, optional
Whether to treat `threshold` as a percentile instead of an absolute
cutoff. For example, `threshold=0.9` and `percentile=True` would
retain only those genes with a differential stability in the top 10% of
all genes, whereas `percentile=False` would retain only those genes
with differential stability > 0.9. Default: True
rank : bool, optional
Whether to calculate similarity as Spearman correlation instead of
Pearson correlation. Default: True
Returns
-------
expression : list of (R x Gr) :class:`pandas.DataFrame`
Microarray expression for `R` regions across `Gr` genes, where `Gr` is
the number of retained genes
"""

# get number of donors and number of genes
num_subj = len(expression)
num_gene = expression[0].shape[-1]

# rank data, if necessary
for_corr = expression if not rank else [e.rank() for e in expression]

# get correlation of gene expression across regions for all donor pairs
gene_corrs = np.zeros((num_gene, sum(range(num_subj))))
for n, (s1, s2) in enumerate(itertools.combinations(range(num_subj), 2)):
regions = np.intersect1d(for_corr[s1].dropna(axis=0, how='all').index,
for_corr[s2].dropna(axis=0, how='all').index)
gene_corrs[:, n] = utils.efficient_corr(for_corr[s1].loc[regions],
for_corr[s2].loc[regions])

# average similarity across donors
gene_corrs = gene_corrs.mean(axis=1)
# calculate absolute threshold if percentile is desired
if percentile:
threshold = np.percentile(gene_corrs, threshold * 100)
keep_genes = gene_corrs > threshold
expression = [e.iloc[:, keep_genes] for e in expression]

return expression
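
Taken together with the `return_donors` flag above, a hedged end-to-end sketch of the intended post-processing workflow, mirroring `abagen/tests/test_correct.py` below; the `files` input is a placeholder for donor microarray files.

import numpy as np
import pandas as pd
from abagen import allen, correct
from abagen.datasets import fetch_desikan_killiany

atlas = fetch_desikan_killiany()
files = ...  # placeholder: donor files as accepted by get_expression_data()

# donor-level expression matrices (one DataFrame per donor)
donor_expression = allen.get_expression_data(files, atlas.image, atlas.info,
                                              exact=False, return_donors=True)

# keep only genes whose regional pattern is consistent across donors
stable = correct.keep_stable_genes(donor_expression, threshold=0.9,
                                   percentile=True, rank=True)

# aggregate across donors, then residualize co-expression against distance
expression = pd.concat(stable).groupby('label').aggregate(np.mean)
corr_resid = correct.remove_distance(expression, atlas.image,
                                     atlas_info=atlas.info)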
2 changes: 1 addition & 1 deletion abagen/info.py
@@ -19,7 +19,7 @@
INSTALL_REQUIRES = [
'nibabel',
'nilearn',
'numpy',
'numpy>=1.14.0',
'pandas',
'requests',
'scikit-learn',
5 changes: 3 additions & 2 deletions abagen/process.py
@@ -75,8 +75,9 @@ def get_stable_probes(microarray, annotation, probes):
If there are multiple probes with expression data for the same gene, this
function will calculate the similarity of each probe's expression across
donors and select the probe with the most consistent pattern of regional
variation (i.e., "differential stability" or DS). Regions are
operationalized by the "structure_id" column in `annotation`.
variation (i.e., "differential stability" or DS). Regions are defined by
the "structure_id" column in `annotation`; similarity is calculated by the
Spearman correlation coefficient.
Parameters
----------
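
For context, a toy illustration of the differential stability calculation the revised docstring describes; this standalone helper is only a sketch of the idea, not abagen code.

import itertools
import numpy as np
from scipy.stats import spearmanr

def differential_stability(donor_profiles):
    # donor_profiles: one regional expression vector per donor for a probe;
    # DS is the mean Spearman correlation over all donor pairs
    pairs = itertools.combinations(donor_profiles, 2)
    return np.mean([spearmanr(a, b)[0] for a, b in pairs])

# e.g., three donors measured over the same five regions
profiles = [np.array([1., 2., 3., 4., 5.]),
            np.array([2., 3., 3., 5., 6.]),
            np.array([1., 1., 2., 4., 4.])]
print(differential_stability(profiles))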
46 changes: 46 additions & 0 deletions abagen/tests/test_correct.py
@@ -0,0 +1,46 @@
import itertools
import numpy as np
import pandas as pd
import pytest
from abagen import allen, correct
from abagen.datasets import fetch_desikan_killiany

ATLAS = fetch_desikan_killiany()


@pytest.fixture(scope='module')
def donor_expression(testfiles):
return allen.get_expression_data(testfiles, ATLAS.image, ATLAS.info,
exact=False, return_donors=True)


def test_remove_distance(donor_expression):
expr = pd.concat(donor_expression).groupby('label').aggregate(np.mean)
for atlas_info in [None, ATLAS.info]:
out = correct.remove_distance(expr, ATLAS.image, atlas_info=atlas_info)
assert np.allclose(out, out.T)
assert isinstance(out, np.ndarray)


def test_resid_dist():
dv = np.array([1, 2, 3, 4, 5])
# residualizing against self should yield 0
assert np.allclose(correct._resid_dist(dv, iv=dv), 0)
# residualizing against perfectly anticorrelated should also yield 0
assert np.allclose(correct._resid_dist(dv, iv=dv[::-1]), 0)
# residualizing against scaled self should also yield 0 (intercept incl)
assert np.allclose(correct._resid_dist(dv, iv=(dv + 10)), 0)
# residualizing against constant should yield de-meaned input
assert np.allclose(correct._resid_dist(dv, iv=np.ones_like(dv)),
dv - dv.mean())


def test_keep_stable_genes(donor_expression):
for thr, per, rank in itertools.product(np.arange(0, 1, 0.1),
[True, False],
[True, False]):
out = correct.keep_stable_genes(donor_expression, threshold=thr,
percentile=per, rank=rank)
assert all([isinstance(f, pd.DataFrame) for f in out])
for df1, df2 in itertools.combinations(out, 2):
assert df1.shape == df2.shape
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,6 +1,6 @@
nibabel
nilearn
numpy
numpy>=1.14.0
pandas
requests
scipy
