
[ENH,REF] Adds new region_agg parameter #131

Merged 14 commits on Nov 25, 2019
259 changes: 97 additions & 162 deletions abagen/allen.py

Large diffs are not rendered by default.

66 changes: 46 additions & 20 deletions abagen/cli/run.py
@@ -45,6 +45,8 @@ def get_parser():
"""

from .. import __version__
from ..correct import NORMALIZATION_METHODS
from ..probes import SELECTION_METHODS

verstr = 'abagen {}'.format(__version__)
parser = argparse.ArgumentParser(
@@ -82,14 +84,14 @@ def get_parser():

If at any step a sample can be assigned to a parcel, the matching process is
terminated. If multiple samples are assigned to the same parcel they are
aggregated with the metric specified via the `metric` parameter. More control
over the sample matching can be obtained by setting the `inexact` parameter;
see the parameter description for more information.
aggregated with the metric specified via the `agg_metric` parameter. More
control over the sample matching can be obtained by setting the `inexact`
parameter; see the parameter description for more information.

Once all samples have been matched to parcels for all supplied donors, the
microarray expression data are optionally normalized within-donor via the
provided `donor_norm` function before being combined across donors via the
supplied `metric`.
microarray expression data are optionally normalized via the provided
`sample_norm` and `gene_norm` functions before being combined within parcels
and across donors via the supplied `agg_metric`.
"""
)

@@ -175,8 +177,23 @@ def get_parser():
'samples, across all supplied donors, for which '
'a probe must have signal above background noise '
'in order to be retained. Default: 0.5')
w_data.add_argument('--metric', action='store', default='mean',
metavar='METHOD', choices=['mean', 'median'],
w_data.add_argument('--region_agg', '--region-agg', action='store',
default='donors', metavar='METHOD',
choices=['donors', 'samples'],
help='When multiple samples are identified as '
'belonging to a region in `atlas` this '
'determines how they are aggregated. If '
'\'samples\', expression data from all samples '
'for all donors assigned to a given region are '
'combined. If \'donors\', expression values for '
'all samples assigned to a given region are '
'combined independently for each donor before '
'being combined across donors. See `agg_metric` '
'for the mechanism by which samples are combined. '
'Default: \'donors\'')
w_data.add_argument('--agg_metric', '--agg-metric', action='store',
default='mean', metavar='METHOD',
choices=['mean', 'median'],
help='Mechanism by which to (1) reduce expression '
'data of multiple samples in the same `atlas` '
'region, and (2) reduce donor-level expression '
@@ -185,10 +202,7 @@
'Default: "mean"')
w_data.add_argument('--probe_selection', '--probe-selection',
action='store', default='diff_stability',
metavar='METHOD',
choices=['average', 'mean', 'max_intensity',
'max_variance', 'pc_loading', 'corr_variance',
'corr_intensity', 'diff_stability'],
metavar='METHOD', choices=sorted(SELECTION_METHODS),
help='Selection method for subsetting (or collapsing '
'across) probes that index the same gene. Must '
'be one of {"average", "mean", "max_intensity", '
@@ -202,15 +216,25 @@
'both hemispheres (i.e., L->R and R->L), '
'approximately doubling the number of available '
'samples. Default: False (i.e., no mirroring)')
w_data.add_argument('--donor_norm', '--donor-norm', action='store',
w_data.add_argument('--gene_norm', '--gene-norm', action='store',
default='srs', metavar='METHOD', type=_resolve_none,
choices=['srs', 'zscore', 'batch', None],
choices=sorted(NORMALIZATION_METHODS) + ['None'],
help='Method by which to normalize microarray '
'expression values for each donor prior to '
'collapsing across donors. Expression values are '
'normalized separately for each gene for each '
'donor across all regions in `atlas`. Must be '
'one of {"srs", "zscore", "batch", None}. '
'donor across all expression samples. If None is '
'specified then no normalization is performed. '
'Default: "srs"')
w_data.add_argument('--sample_norm', '--sample-norm', action='store',
default='srs', metavar='METHOD', type=_resolve_none,
choices=sorted(NORMALIZATION_METHODS) + ['None'],
help='Method by which to normalize microarray '
'expression values for each sample prior to '
'collapsing into regions in `atlas`. Expression '
'values are normalized separately for each '
'sample and donor across genes. If None is '
'specified then no normalization is performed. '
'Default: "srs"')

p_data = parser.add_argument_group('Options to modify the AHBA data used')
@@ -233,7 +257,7 @@ def get_parser():
'to anatomical regions. Default: False (i.e., '
'use corrected coordinates)')

o_data = parser.add_argument_group('Options to modify how data is output')
o_data = parser.add_argument_group('Options to modify how data are output')
o_data.add_argument('--stdout', action='store_true',
help='Generated region x gene dataframes will be '
'printed to stdout for piping to other things. '
@@ -258,7 +282,7 @@
o_data.add_argument('--save-donors', '--save_donors', action='store_true',
help='Whether to save donor-level expression '
'dataframes instead of aggregating expression '
'across donosr with provided `metric`. If '
'across donors with provided `agg_metric`. If '
'specified, dataframes will be saved to path '
'specified by `output-file`, appending donor IDs '
'to the end of the filename. Default: False')
@@ -289,11 +313,13 @@ def main(args=None):
atlas_info=opts.atlas_info,
exact=opts.exact,
tolerance=opts.tolerance,
metric=opts.metric,
region_agg=opts.region_agg,
agg_metric=opts.agg_metric,
ibf_threshold=opts.ibf_threshold,
probe_selection=opts.probe_selection,
lr_mirror=opts.lr_mirror,
donor_norm=opts.donor_norm,
gene_norm=opts.gene_norm,
sample_norm=opts.sample_norm,
corrected_mni=opts.corrected_mni,
reannotated=opts.reannotated,
return_counts=opts.save_counts,
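For intuition on what `region_agg` controls, here is a minimal pandas sketch of the two strategies; the dataframe layout and column names are hypothetical, not abagen's internal representation:

    import pandas as pd

    # one row per sample, tagged with its donor and matched atlas region
    df = pd.DataFrame({'donor':  ['d1', 'd1', 'd2'],
                       'region': ['A',  'A',  'A'],
                       'gene1':  [1.0,  3.0,  5.0]})

    # region_agg='samples': pool every sample in a region across donors
    pooled = df.groupby('region').mean(numeric_only=True)   # gene1 == 3.0

    # region_agg='donors': aggregate within each donor first, then across donors
    nested = (df.groupby(['donor', 'region']).mean()
                .groupby('region').mean())                  # gene1 == 3.5

    # the results diverge whenever donors contribute unequal sample counts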
32 changes: 17 additions & 15 deletions abagen/correct.py
@@ -3,6 +3,7 @@
Functions for processing and correcting gene expression data
"""

import functools
import itertools
import warnings

@@ -156,6 +157,15 @@ def _srs(data, low=0, high=1, axis=0):
return normed


NORMALIZATION_METHODS = dict(
rs=functools.partial(_rs, axis=0),
srs=functools.partial(_srs, low=0, high=1, axis=0),
center=lambda x: x - x.mean(axis=0, keepdims=True),
zscore=functools.partial(sstats.zscore, axis=0, ddof=1),
minmax=functools.partial(_rescale, low=0, high=1, axis=0)
)


def normalize_expression(expression, norm='srs'):
"""
Performs normalization on `expression` data
@@ -218,10 +228,12 @@ def normalize_expression(expression, norm='srs'):
20130048
"""

norms = ['rs', 'srs', 'center', 'zscore', 'batch', 'minmax']
if norm not in norms:
try:
normfunc = NORMALIZATION_METHODS[norm]
except KeyError:
raise ValueError('Provided value for `norm` not recognized. Must be '
'one of {}. Received: {}'.format(norms, norm))
'one of {}. Received: {}'
.format(list(NORMALIZATION_METHODS), norm))

# FIXME: I hate having to do this...
if isinstance(expression, pd.DataFrame):
@@ -236,18 +248,8 @@
notna = np.logical_not(exp.isna().all(axis=1))
data = np.asarray(exp)[notna]

if norm == 'rs':
normed = _rs(data, axis=0)
if norm == 'srs':
normed = _srs(data, low=0, high=1, axis=0)
elif norm == 'center':
normed = data - data.mean(axis=0, keepdims=True)
elif norm == 'zscore':
normed = sstats.zscore(data, axis=0, ddof=1)
elif norm == 'minmax':
normed = _rescale(data, low=0, high=1, axis=0)
elif norm == 'batch':
normed = corrected[n]
# normalize the data (however it was specified)
normed = normfunc(data) if norm != 'batch' else corrected[n]

# recreate dataframe and fill non-NaN values
normalized = pd.DataFrame(np.nan, columns=exp.columns, index=exp.index)
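For reference, the 'srs' entry in NORMALIZATION_METHODS is a scaled robust sigmoid. A sketch of the general idea, assuming a median-centered, IQR-scaled sigmoid rescaled to the unit interval (abagen's `_srs` may differ in its details):

    import numpy as np

    def srs_sketch(x, low=0, high=1, axis=0):
        # robust sigmoid: center on the median, scale by the interquartile range
        med = np.median(x, axis=axis, keepdims=True)
        q25, q75 = np.percentile(x, [25, 75], axis=axis, keepdims=True)
        sig = 1 / (1 + np.exp(-(x - med) / (q75 - q25)))
        # rescale the sigmoid output to [low, high]
        smin = sig.min(axis=axis, keepdims=True)
        smax = sig.max(axis=axis, keepdims=True)
        return low + (high - low) * (sig - smin) / (smax - smin)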
8 changes: 4 additions & 4 deletions abagen/io.py
@@ -138,7 +138,7 @@ def read_ontology(fname, copy=False):
data = pd.read_csv(fname)
except ValueError:
if not isinstance(fname, pd.DataFrame):
raise TypeError('Provided fname must be filepath to Ontology.csv'
raise TypeError('Provided fname must be filepath to Ontology.csv '
'file from Allen Human Brain Atlas.')
data = fname.copy() if copy else fname

@@ -197,7 +197,7 @@ def read_pacall(fname, copy=False, parquet=True):
data.columns = pd.Series(range(len(data.columns)), name='sample_id')
except (AttributeError, ValueError):
if not isinstance(fname, pd.DataFrame):
raise TypeError('Provided fname must be filepath to PACall.csv'
raise TypeError('Provided fname must be filepath to PACall.csv '
'file from Allen Human Brain Atlas.')
data = fname.copy() if copy else fname

@@ -236,7 +236,7 @@ def read_probes(fname, copy=False):
data = pd.read_csv(fname, index_col=0)
except ValueError:
if not isinstance(fname, pd.DataFrame):
raise TypeError('Provided fname must be filepath to Probes.csv'
raise TypeError('Provided fname must be filepath to Probes.csv '
'file from Allen Human Brain Atlas.')
data = fname.copy() if copy else fname

@@ -278,7 +278,7 @@ def read_annotation(fname, copy=False):
data.index.name = 'sample_id'
except ValueError:
if not isinstance(fname, pd.DataFrame):
raise TypeError('Provided fname must be filepath to Annotation.'
raise TypeError('Provided fname must be filepath to Annotation'
'.csv file from Allen Human Brain Atlas.')
data = fname.copy() if copy else fname

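All four io.py fixes address the same pitfall: Python implicitly concatenates adjacent string literals with no separator, so a missing trailing space silently glues words together in the error message. For example:

    msg = ('Provided fname must be filepath to Ontology.csv'
           'file from Allen Human Brain Atlas.')
    # msg reads '...to Ontology.csvfile from...' -- hence the added spaces above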
29 changes: 21 additions & 8 deletions abagen/probes.py
@@ -7,13 +7,16 @@
import gzip
from io import StringIO
import itertools
import logging
from pkg_resources import resource_filename

import numpy as np
import pandas as pd

from . import io, utils

lgr = logging.getLogger('abagen')


def reannotate_probes(probes):
"""
@@ -41,6 +44,9 @@ def reannotate_probes(probes):
data. NeuroImage, 189, 353-367.
"""

lgr.info('Reannotating probes with information from Arnatkevic̆iūtė '
'et al., 2019, NeuroImage')

# load in reannotated probes
reannot = resource_filename('abagen', 'data/reannotated.csv.gz')
with gzip.open(reannot, 'r') as src:
@@ -91,6 +97,9 @@ def filter_probes(pacall, samples, probes, threshold=0.5):

threshold = np.clip(threshold, 0.0, 1.0)

lgr.info('Filtering probes with intensity-based threshold of {}'
.format(threshold))

probes = io.read_probes(probes)
signal, n_samp = np.zeros(len(probes), dtype=int), 0
for pa, annot in zip(pacall, samples):
@@ -103,6 +112,8 @@ def filter_probes(pacall, samples, probes, threshold=0.5):
# calculate proportion of signal to noise for given probe across samples
keep = (signal / n_samp) >= threshold

lgr.info('{} probes survive intensity-based filtering'.format(keep.sum()))

return probes[keep]


@@ -429,8 +440,7 @@ def _collapse(expression, probes, *args, method='variance', **kwargs):
)


def collapse_probes(microarray, annotation, probes, method='diff_stability',
inplace=False):
def collapse_probes(microarray, annotation, probes, method='diff_stability'):
"""
Reduces `microarray` to a sample x gene expression dataframe

@@ -460,9 +470,6 @@ def collapse_probes(microarray, annotation, probes, method='diff_stability',
same gene. Must be one of 'average', 'intensity', 'variance',
'pc_loading', 'corr_variance', 'corr_intensity', or 'diff_stability';
see Notes for more information. Default: 'diff_stability'
inplace : bool, optional
Whether to conserve memory by editing dataframes in-place instead of
returning edited copies. Default: False

Returns
-------
@@ -608,12 +615,18 @@ def collapse_probes(microarray, annotation, probes, method='diff_stability',
raise ValueError('Provided method must be one of {}, not: {}'
.format(list(SELECTION_METHODS), method))

lgr.info('Reducing probes indexing the same gene with method: "{}"'
.format(method))

# read in microarray data for all subjects; this can be quite slow...
probes = io.read_probes(probes)
exp = [
(io.read_microarray(m, copy=not inplace)
.loc[probes.index, io.read_annotation(a, copy=not inplace).index])
io.read_microarray(m).loc[probes.index, io.read_annotation(a).index]
for m, a in zip(microarray, annotation)
]
exp = [e.T for e in collfunc(exp, probes, annotation)]

lgr.info('{} genes remain after probe filtering and selection'
.format(exp[0].shape[-1]))

return [e.T for e in collfunc(exp, probes, annotation)]
return exp
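
Because the new messages go through the 'abagen' logger at INFO level, they are silent under default logging settings; a minimal way to surface them, using only the standard library:

    import logging

    # show INFO-level progress messages emitted by the 'abagen' logger
    logging.basicConfig(format='%(levelname)s: %(message)s')
    logging.getLogger('abagen').setLevel(logging.INFO)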