Merge pull request #131 from rmarkello/sampcombo
[ENH,REF] Adds new `region_agg` parameter
rmarkello authored Nov 25, 2019
2 parents 253b691 + 8de9e38 commit fc91343
Showing 15 changed files with 652 additions and 447 deletions.
259 changes: 97 additions & 162 deletions abagen/allen.py

Large diffs are not rendered by default.

66 changes: 46 additions & 20 deletions abagen/cli/run.py
@@ -45,6 +45,8 @@ def get_parser():
     """

     from .. import __version__
+    from ..correct import NORMALIZATION_METHODS
+    from ..probes import SELECTION_METHODS

     verstr = 'abagen {}'.format(__version__)
     parser = argparse.ArgumentParser(
@@ -82,14 +84,14 @@ def get_parser():
 If at any step a sample can be assigned to a parcel the matching process is
 terminated. If multiple samples are assigned to the same parcel they are
-aggregated with the metric specified via the `metric` parameter. More control
-over the sample matching can be obtained by setting the `inexact` parameter;
-see the parameter description for more information.
+aggregated with the metric specified via the `agg_metric` parameter. More
+control over the sample matching can be obtained by setting the `inexact`
+parameter; see the parameter description for more information.

 Once all samples have been matched to parcels for all supplied donors, the
-microarray expression data are optionally normalized within-donor via the
-provided `donor_norm` function before being combined across donors via the
-supplied `metric`.
+microarray expression data are optionally normalized via the provided
+`sample_norm` and `gene_norm` functions before being combined within parcels
+and across donors via the supplied `agg_metric`.
 """
 )

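The renamed parameters map directly onto the Python API that this CLI wraps. A minimal sketch of a call using the new names (hedged: assumes the keyword arguments introduced in this PR; `fetch_desikan_killiany` is abagen's bundled atlas fetcher):

    import abagen

    # bundled Desikan-Killiany atlas (parcellation image + info CSV)
    atlas = abagen.fetch_desikan_killiany()

    # pool samples within regions separately for each donor ('donors'),
    # normalize with the scaled robust sigmoid, then average across donors
    expression = abagen.get_expression_data(atlas['image'], atlas['info'],
                                            region_agg='donors',
                                            agg_metric='mean',
                                            sample_norm='srs',
                                            gene_norm='srs')
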
@@ -175,8 +177,23 @@ def get_parser():
                              'samples, across all supplied donors, for which '
                              'a probe must have signal above background noise '
                              'in order to be retained. Default: 0.5')
-    w_data.add_argument('--metric', action='store', default='mean',
-                        metavar='METHOD', choices=['mean', 'median'],
+    w_data.add_argument('--region_agg', '--region-agg', action='store',
+                        default='donors', metavar='METHOD',
+                        choices=['donors', 'samples'],
+                        help='When multiple samples are identified as '
+                             'belonging to a region in `atlas` this '
+                             'determines how they are aggregated. If '
+                             '\'samples\', expression data from all samples '
+                             'for all donors assigned to a given region are '
+                             'combined. If \'donors\', expression values for '
+                             'all samples assigned to a given region are '
+                             'combined independently for each donor before '
+                             'being combined across donors. See `agg_metric` '
+                             'for mechanism by which samples are combined. '
+                             'Default: \'donors\'')
+    w_data.add_argument('--agg_metric', '--agg-metric', action='store',
+                        default='mean', metavar='METHOD',
+                        choices=['mean', 'median'],
                         help='Mechanism by which to (1) reduce expression '
                              'data of multiple samples in the same `atlas` '
                              'region, and (2) reduce donor-level expression '
@@ -185,10 +202,7 @@ def get_parser():
                              'Default: "mean"')
     w_data.add_argument('--probe_selection', '--probe-selection',
                         action='store', default='diff_stability',
-                        metavar='METHOD',
-                        choices=['average', 'mean', 'max_intensity',
-                                 'max_variance', 'pc_loading', 'corr_variance',
-                                 'corr_intensity', 'diff_stability'],
+                        metavar='METHOD', choices=sorted(SELECTION_METHODS),
                         help='Selection method for subsetting (or collapsing '
                              'across) probes that index the same gene. Must '
                              'be one of {"average", "mean", "max_intensity", '
@@ -202,15 +216,25 @@ def get_parser():
                              'both hemispheres (i.e., L->R and R->L), '
                              'approximately doubling the number of available '
                              'samples. Default: False (i.e., no mirroring)')
-    w_data.add_argument('--donor_norm', '--donor-norm', action='store',
+    w_data.add_argument('--gene_norm', '--gene-norm', action='store',
                         default='srs', metavar='METHOD', type=_resolve_none,
-                        choices=['srs', 'zscore', 'batch', None],
+                        choices=sorted(NORMALIZATION_METHODS) + ['None'],
                         help='Method by which to normalize microarray '
                              'expression values for each donor prior to '
                              'collapsing across donors. Expression values are '
                              'normalized separately for each gene for each '
-                             'donor across all regions in `atlas`. Must be '
-                             'one of {"srs", "zscore", "batch", None}. '
+                             'donor across all expression samples. If None is '
+                             'specified then no normalization is performed. '
                              'Default: "srs"')
+    w_data.add_argument('--sample_norm', '--sample-norm', action='store',
+                        default='srs', metavar='METHOD', type=_resolve_none,
+                        choices=sorted(NORMALIZATION_METHODS) + ['None'],
+                        help='Method by which to normalize microarray '
+                             'expression values for each sample prior to '
+                             'collapsing into regions in `atlas`. Expression '
+                             'values are normalized separately for each '
+                             'sample and donor across genes. If None is '
+                             'specified then no normalization is performed. '
+                             'Default: "srs"')

     p_data = parser.add_argument_group('Options to modify the AHBA data used')
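To make the `--region_agg` choices concrete, here is a toy illustration of the two aggregation orders in plain pandas (hypothetical numbers, not abagen internals):

    import pandas as pd

    # hypothetical expression values for one gene in one region
    samples = pd.DataFrame({'donor': ['d1', 'd1', 'd1', 'd2'],
                            'expression': [1.0, 2.0, 3.0, 10.0]})

    # region_agg='samples': pool every sample in the region across donors
    pooled = samples['expression'].mean()                      # 4.0

    # region_agg='donors': average within each donor, then across donors
    per_donor = samples.groupby('donor')['expression'].mean()  # d1=2.0, d2=10.0
    nested = per_donor.mean()                                  # 6.0

With 'donors' (the default), a donor contributing many samples to a region cannot dominate the regional estimate the way it does under pooling.
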
@@ -233,7 +257,7 @@ def get_parser():
                              'to anatomical regions. Default: False (i.e., '
                              'use corrected coordinates)')

-    o_data = parser.add_argument_group('Options to modify how data is output')
+    o_data = parser.add_argument_group('Options to modify how data are output')
     o_data.add_argument('--stdout', action='store_true',
                         help='Generated region x gene dataframes will be '
                              'printed to stdout for piping to other things. '
@@ -258,7 +282,7 @@ def get_parser():
     o_data.add_argument('--save-donors', '--save_donors', action='store_true',
                         help='Whether to save donor-level expression '
                              'dataframes instead of aggregating expression '
-                             'across donosr with provided `metric`. If '
+                             'across donors with provided `agg_metric`. If '
                              'specified, dataframes will be saved to path '
                              'specified by `output-file`, appending donor IDs '
                              'to the end of the filename. Default: False')
@@ -289,11 +313,13 @@ def main(args=None):
         atlas_info=opts.atlas_info,
         exact=opts.exact,
         tolerance=opts.tolerance,
-        metric=opts.metric,
+        region_agg=opts.region_agg,
+        agg_metric=opts.agg_metric,
         ibf_threshold=opts.ibf_threshold,
         probe_selection=opts.probe_selection,
         lr_mirror=opts.lr_mirror,
-        donor_norm=opts.donor_norm,
+        gene_norm=opts.gene_norm,
+        sample_norm=opts.sample_norm,
         corrected_mni=opts.corrected_mni,
         reannotated=opts.reannotated,
         return_counts=opts.save_counts,
32 changes: 17 additions & 15 deletions abagen/correct.py
@@ -3,6 +3,7 @@
 Functions for processing and correcting gene expression data
 """

+import functools
 import itertools
 import warnings
@@ -156,6 +157,15 @@ def _srs(data, low=0, high=1, axis=0):
     return normed


+NORMALIZATION_METHODS = dict(
+    rs=functools.partial(_rs, axis=0),
+    srs=functools.partial(_srs, low=0, high=1, axis=0),
+    center=lambda x: x - x.mean(axis=0, keepdims=True),
+    zscore=functools.partial(sstats.zscore, axis=0, ddof=1),
+    minmax=functools.partial(_rescale, low=0, high=1, axis=0)
+)
+
+
 def normalize_expression(expression, norm='srs'):
     """
     Performs normalization on `expression` data
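The new `NORMALIZATION_METHODS` table uses `functools.partial` to freeze keyword arguments so every entry can be called uniformly as `func(data)`. A self-contained illustration of the pattern (the `rescale` helper below is a hypothetical stand-in for abagen's `_rescale`):

    import functools

    import numpy as np

    def rescale(data, low=0, high=1, axis=0):
        # hypothetical stand-in for abagen's _rescale helper
        dmin = data.min(axis=axis, keepdims=True)
        dmax = data.max(axis=axis, keepdims=True)
        return low + (high - low) * (data - dmin) / (dmax - dmin)

    # partial() bakes in the keyword arguments, so the dispatch table
    # can invoke every method with the same one-argument call
    minmax = functools.partial(rescale, low=0, high=1, axis=0)
    print(minmax(np.array([[1.0], [2.0], [3.0]])).ravel())  # [0.  0.5 1. ]
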
@@ -218,10 +228,12 @@ def normalize_expression(expression, norm='srs'):
     20130048
     """

-    norms = ['rs', 'srs', 'center', 'zscore', 'batch', 'minmax']
-    if norm not in norms:
+    try:
+        normfunc = NORMALIZATION_METHODS[norm]
+    except KeyError:
         raise ValueError('Provided value for `norm` not recognized. Must be '
-                         'one of {}. Received: {}'.format(norms, norm))
+                         'one of {}. Received: {}'
+                         .format(list(NORMALIZATION_METHODS), norm))

     # FIXME: I hate having to do this...
     if isinstance(expression, pd.DataFrame):
@@ -236,18 +248,8 @@ def normalize_expression(expression, norm='srs'):
         notna = np.logical_not(exp.isna().all(axis=1))
         data = np.asarray(exp)[notna]

-        if norm == 'rs':
-            normed = _rs(data, axis=0)
-        if norm == 'srs':
-            normed = _srs(data, low=0, high=1, axis=0)
-        elif norm == 'center':
-            normed = data - data.mean(axis=0, keepdims=True)
-        elif norm == 'zscore':
-            normed = sstats.zscore(data, axis=0, ddof=1)
-        elif norm == 'minmax':
-            normed = _rescale(data, low=0, high=1, axis=0)
-        elif norm == 'batch':
-            normed = corrected[n]
+        # normalize the data (however was specified)
+        normed = normfunc(data) if norm != 'batch' else corrected[n]

         # recreate dataframe and fill non-NaN values
         normalized = pd.DataFrame(np.nan, columns=exp.columns, index=exp.index)
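From the caller's side the refactor is behavior-preserving. A quick sketch of the expected usage (toy DataFrame; assumes `normalize_expression` accepts a single samples x genes DataFrame, as the `isinstance` shim above suggests):

    import numpy as np
    import pandas as pd

    from abagen.correct import normalize_expression

    # toy samples x genes matrix
    expression = pd.DataFrame(np.random.default_rng(0).random((10, 3)),
                              columns=['GABRA1', 'GABRA2', 'GABRA3'])

    # each gene (column) is rescaled via the scaled robust sigmoid
    normed = normalize_expression(expression, norm='srs')

    # unrecognized methods raise a ValueError listing the valid options
    try:
        normalize_expression(expression, norm='quantile')
    except ValueError as err:
        print(err)
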
8 changes: 4 additions & 4 deletions abagen/io.py
@@ -138,7 +138,7 @@ def read_ontology(fname, copy=False):
         data = pd.read_csv(fname)
     except ValueError:
         if not isinstance(fname, pd.DataFrame):
-            raise TypeError('Provided fname must be filepath to Ontology.csv'
+            raise TypeError('Provided fname must be filepath to Ontology.csv '
                             'file from Allen Human Brain Atlas.')
         data = fname.copy() if copy else fname
@@ -197,7 +197,7 @@ def read_pacall(fname, copy=False, parquet=True):
         data.columns = pd.Series(range(len(data.columns)), name='sample_id')
     except (AttributeError, ValueError):
         if not isinstance(fname, pd.DataFrame):
-            raise TypeError('Provided fname must be filepath to PACall.csv'
+            raise TypeError('Provided fname must be filepath to PACall.csv '
                             'file from Allen Human Brain Atlas.')
         data = fname.copy() if copy else fname
@@ -236,7 +236,7 @@ def read_probes(fname, copy=False):
         data = pd.read_csv(fname, index_col=0)
     except ValueError:
         if not isinstance(fname, pd.DataFrame):
-            raise TypeError('Provided fname must be filepath to Probes.csv'
+            raise TypeError('Provided fname must be filepath to Probes.csv '
                             'file from Allen Human Brain Atlas.')
         data = fname.copy() if copy else fname
@@ -278,7 +278,7 @@ def read_annotation(fname, copy=False):
         data.index.name = 'sample_id'
     except ValueError:
         if not isinstance(fname, pd.DataFrame):
-            raise TypeError('Provided fname must be filepath to Annotation.'
+            raise TypeError('Provided fname must be filepath to Annotation'
                             '.csv file from Allen Human Brain Atlas.')
         data = fname.copy() if copy else fname
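All four io.py changes fix the same pitfall: Python concatenates adjacent string literals at compile time, so a missing space at the wrap point fuses two words in the error message. A standalone illustration:

    # adjacent string literals are joined with no separator, so a missing
    # trailing space produces a fused word in the final message
    broken = ('Provided fname must be filepath to Ontology.csv'
              'file from Allen Human Brain Atlas.')
    print(broken)  # '...filepath to Ontology.csvfile from...'

    fixed = ('Provided fname must be filepath to Ontology.csv '
             'file from Allen Human Brain Atlas.')
    print(fixed)   # '...filepath to Ontology.csv file from...'
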
29 changes: 21 additions & 8 deletions abagen/probes.py
@@ -7,13 +7,16 @@
 import gzip
 from io import StringIO
 import itertools
+import logging
 from pkg_resources import resource_filename

 import numpy as np
 import pandas as pd

 from . import io, utils

+lgr = logging.getLogger('abagen')
+

 def reannotate_probes(probes):
     """
@@ -41,6 +44,9 @@ def reannotate_probes(probes):
            data. NeuroImage, 189, 353-367.
     """

+    lgr.info('Reannotating probes with information from Arnatkevic̆iūtė '
+             'et al., 2019, NeuroImage')
+
     # load in reannotated probes
     reannot = resource_filename('abagen', 'data/reannotated.csv.gz')
     with gzip.open(reannot, 'r') as src:
@@ -91,6 +97,9 @@ def filter_probes(pacall, samples, probes, threshold=0.5):

     threshold = np.clip(threshold, 0.0, 1.0)

+    lgr.info('Filtering probes with intensity-based threshold of {}'
+             .format(threshold))
+
     probes = io.read_probes(probes)
     signal, n_samp = np.zeros(len(probes), dtype=int), 0
     for pa, annot in zip(pacall, samples):
@@ -103,6 +112,8 @@ def filter_probes(pacall, samples, probes, threshold=0.5):
     # calculate proportion of signal to noise for given probe across samples
     keep = (signal / n_samp) >= threshold

+    lgr.info('{} probes survive intensity-based filtering'.format(keep.sum()))
+
     return probes[keep]


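The criterion being logged is straightforward: a probe survives when the fraction of samples in which it is called present meets `threshold`. A toy numpy sketch of the rule (hypothetical PA call matrix):

    import numpy as np

    # hypothetical PACall matrix: probes x samples, 1 = above background noise
    pacall = np.array([[1, 1, 1, 0],    # present in 3/4 samples -> keep
                       [1, 0, 0, 0],    # present in 1/4 samples -> drop
                       [1, 1, 0, 0]])   # present in 2/4 samples -> keep at 0.5

    signal = pacall.sum(axis=1)          # samples above noise, per probe
    keep = (signal / pacall.shape[1]) >= 0.5
    print(keep)                          # [ True False  True]
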
@@ -429,8 +440,7 @@ def _collapse(expression, probes, *args, method='variance', **kwargs):
     )


-def collapse_probes(microarray, annotation, probes, method='diff_stability',
-                    inplace=False):
+def collapse_probes(microarray, annotation, probes, method='diff_stability'):
     """
     Reduces `microarray` to a sample x gene expression dataframe
@@ -460,9 +470,6 @@ def collapse_probes(microarray, annotation, probes, method='diff_stability',
         same gene. Must be one of 'average', 'intensity', 'variance',
         'pc_loading', 'corr_variance', 'corr_intensity', or 'diff_stability';
         see Notes for more information. Default: 'diff_stability'
-    inplace : bool, optional
-        Whether to conserve memory by editing dataframes in-place instead of
-        returning edited copies. Default: False

     Returns
     -------
@@ -608,12 +615,18 @@ def collapse_probes(microarray, annotation, probes, method='diff_stability',
         raise ValueError('Provided method must be one of {}, not: {}'
                          .format(list(SELECTION_METHODS), method))

+    lgr.info('Reducing probes indexing same gene with method: "{}"'
+             .format(method))
+
     # read in microarray data for all subjects; this can be quite slow...
     probes = io.read_probes(probes)
     exp = [
-        (io.read_microarray(m, copy=not inplace)
-         .loc[probes.index, io.read_annotation(a, copy=not inplace).index])
+        io.read_microarray(m).loc[probes.index, io.read_annotation(a).index]
         for m, a in zip(microarray, annotation)
     ]
+    exp = [e.T for e in collfunc(exp, probes, annotation)]
+
+    lgr.info('{} genes remain after probe filtering and selection'
+             .format(exp[0].shape[-1]))

-    return [e.T for e in collfunc(exp, probes, annotation)]
+    return exp
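Dropping `inplace` means the dataframes are always copied on read, and collapsing now happens inside the function so the surviving gene count can be logged before returning. A hedged usage sketch (hypothetical file paths; one donor shown):

    from abagen import probes

    # hypothetical paths to a single donor's AHBA files
    microarray = ['normalized_microarray_donor9861/MicroarrayExpression.csv']
    annotation = ['normalized_microarray_donor9861/SampleAnnot.csv']
    probe_file = 'normalized_microarray_donor9861/Probes.csv'

    # returns one samples x genes dataframe per donor
    expression = probes.collapse_probes(microarray, annotation, probe_file,
                                        method='diff_stability')
    print(expression[0].shape)  # (n_samples, n_genes)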