Merge pull request #131 from rmarkello/sampcombo
[ENH,REF] Adds new `region_agg` parameter
rmarkello authored Nov 25, 2019
2 parents 253b691 + 8de9e38 commit fc91343
Showing 15 changed files with 652 additions and 447 deletions.
259 changes: 97 additions & 162 deletions abagen/allen.py

Large diffs are not rendered by default.

66 changes: 46 additions & 20 deletions abagen/cli/run.py
@@ -45,6 +45,8 @@ def get_parser():
     """

     from .. import __version__
+    from ..correct import NORMALIZATION_METHODS
+    from ..probes import SELECTION_METHODS

     verstr = 'abagen {}'.format(__version__)
     parser = argparse.ArgumentParser(
@@ -82,14 +84,14 @@ def get_parser():
 If at any step a sample can be assigned to a parcel the matching process is
 terminated. If multiple samples are assigned to the same parcel they are
-aggregated with the metric specified via the `metric` parameter. More control
-over the sample matching can be obtained by setting the `inexact` parameter;
-see the parameter description for more information.
+aggregated with the metric specified via the `agg_metric` parameter. More
+control over the sample matching can be obtained by setting the `inexact`
+parameter; see the parameter description for more information.

 Once all samples have been matched to parcels for all supplied donors, the
-microarray expression data are optionally normalized within-donor via the
-provided `donor_norm` function before being combined across donors via the
-supplied `metric`.
+microarray expression data are optionally normalized via the provided
+`sample_norm` and `gene_norm` functions before being combined within parcels
+and across donors via the supplied `agg_metric`.
 """
 )

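The renamed parameters map directly onto the Python API that this CLI wraps. A minimal sketch of a call using the new names (hedged: assumes the keyword arguments introduced in this PR; `fetch_desikan_killiany` is abagen's bundled atlas fetcher):

    import abagen

    # bundled Desikan-Killiany atlas (parcellation image + info CSV)
    atlas = abagen.fetch_desikan_killiany()

    # pool samples within regions separately for each donor ('donors'),
    # normalize with the scaled robust sigmoid, then average across donors
    expression = abagen.get_expression_data(atlas['image'], atlas['info'],
                                            region_agg='donors',
                                            agg_metric='mean',
                                            sample_norm='srs',
                                            gene_norm='srs')
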
@@ -175,8 +177,23 @@ def get_parser():
                              'samples, across all supplied donors, for which '
                              'a probe must have signal above background noise '
                              'in order to be retained. Default: 0.5')
-    w_data.add_argument('--metric', action='store', default='mean',
-                        metavar='METHOD', choices=['mean', 'median'],
+    w_data.add_argument('--region_agg', '--region-agg', action='store',
+                        default='donors', metavar='METHOD',
+                        choices=['donors', 'samples'],
+                        help='When multiple samples are identified as '
+                             'belonging to a region in `atlas` this '
+                             'determines how they are aggregated. If '
+                             '\'samples\', expression data from all samples '
+                             'for all donors assigned to a given region are '
+                             'combined. If \'donors\', expression values for '
+                             'all samples assigned to a given region are '
+                             'combined independently for each donor before '
+                             'being combined across donors. See `agg_metric` '
+                             'for mechanism by which samples are combined. '
+                             'Default: \'donors\'')
+    w_data.add_argument('--agg_metric', '--agg-metric', action='store',
+                        default='mean', metavar='METHOD',
+                        choices=['mean', 'median'],
                         help='Mechanism by which to (1) reduce expression '
                              'data of multiple samples in the same `atlas` '
                              'region, and (2) reduce donor-level expression '
@@ -185,10 +202,7 @@ def get_parser():
                              'Default: "mean"')
     w_data.add_argument('--probe_selection', '--probe-selection',
                         action='store', default='diff_stability',
-                        metavar='METHOD',
-                        choices=['average', 'mean', 'max_intensity',
-                                 'max_variance', 'pc_loading', 'corr_variance',
-                                 'corr_intensity', 'diff_stability'],
+                        metavar='METHOD', choices=sorted(SELECTION_METHODS),
                         help='Selection method for subsetting (or collapsing '
                              'across) probes that index the same gene. Must '
                              'be one of {"average", "mean", "max_intensity", '
@@ -202,15 +216,25 @@ def get_parser():
                              'both hemispheres (i.e., L->R and R->L), '
                              'approximately doubling the number of available '
                              'samples. Default: False (i.e., no mirroring)')
-    w_data.add_argument('--donor_norm', '--donor-norm', action='store',
+    w_data.add_argument('--gene_norm', '--gene-norm', action='store',
                         default='srs', metavar='METHOD', type=_resolve_none,
-                        choices=['srs', 'zscore', 'batch', None],
+                        choices=sorted(NORMALIZATION_METHODS) + ['None'],
                         help='Method by which to normalize microarray '
                              'expression values for each donor prior to '
                              'collapsing across donors. Expression values are '
                              'normalized separately for each gene for each '
-                             'donor across all regions in `atlas`. Must be '
-                             'one of {"srs", "zscore", "batch", None}. '
+                             'donor across all expression samples. If None is '
+                             'specified then no normalization is performed. '
                              'Default: "srs"')
+    w_data.add_argument('--sample_norm', '--sample-norm', action='store',
+                        default='srs', metavar='METHOD', type=_resolve_none,
+                        choices=sorted(NORMALIZATION_METHODS) + ['None'],
+                        help='Method by which to normalize microarray '
+                             'expression values for each sample prior to '
+                             'collapsing into regions in `atlas`. Expression '
+                             'values are normalized separately for each '
+                             'sample and donor across genes. If None is '
+                             'specified then no normalization is performed. '
+                             'Default: "srs"')

     p_data = parser.add_argument_group('Options to modify the AHBA data used')
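To make the `--region_agg` choices concrete, here is a toy illustration of the two aggregation orders in plain pandas (hypothetical numbers, not abagen internals):

    import pandas as pd

    # hypothetical expression values for one gene in one region
    samples = pd.DataFrame({'donor': ['d1', 'd1', 'd1', 'd2'],
                            'expression': [1.0, 2.0, 3.0, 10.0]})

    # region_agg='samples': pool every sample in the region across donors
    pooled = samples['expression'].mean()                      # 4.0

    # region_agg='donors': average within each donor, then across donors
    per_donor = samples.groupby('donor')['expression'].mean()  # d1=2.0, d2=10.0
    nested = per_donor.mean()                                  # 6.0

With 'donors' (the default), a donor contributing many samples to a region cannot dominate the regional estimate the way it does under pooling.
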
@@ -233,7 +257,7 @@ def get_parser():
                              'to anatomical regions. Default: False (i.e., '
                              'use corrected coordinates)')

-    o_data = parser.add_argument_group('Options to modify how data is output')
+    o_data = parser.add_argument_group('Options to modify how data are output')
     o_data.add_argument('--stdout', action='store_true',
                         help='Generated region x gene dataframes will be '
                              'printed to stdout for piping to other things. '
@@ -258,7 +282,7 @@ def get_parser():
     o_data.add_argument('--save-donors', '--save_donors', action='store_true',
                         help='Whether to save donor-level expression '
                              'dataframes instead of aggregating expression '
-                             'across donosr with provided `metric`. If '
+                             'across donors with provided `agg_metric`. If '
                              'specified, dataframes will be saved to path '
                              'specified by `output-file`, appending donor IDs '
                              'to the end of the filename. Default: False')
@@ -289,11 +313,13 @@ def main(args=None):
         atlas_info=opts.atlas_info,
         exact=opts.exact,
         tolerance=opts.tolerance,
-        metric=opts.metric,
+        region_agg=opts.region_agg,
+        agg_metric=opts.agg_metric,
         ibf_threshold=opts.ibf_threshold,
         probe_selection=opts.probe_selection,
         lr_mirror=opts.lr_mirror,
-        donor_norm=opts.donor_norm,
+        gene_norm=opts.gene_norm,
+        sample_norm=opts.sample_norm,
         corrected_mni=opts.corrected_mni,
         reannotated=opts.reannotated,
         return_counts=opts.save_counts,
32 changes: 17 additions & 15 deletions abagen/correct.py
@@ -3,6 +3,7 @@
 Functions for processing and correcting gene expression data
 """

+import functools
 import itertools
 import warnings
@@ -156,6 +157,15 @@ def _srs(data, low=0, high=1, axis=0):
     return normed


+NORMALIZATION_METHODS = dict(
+    rs=functools.partial(_rs, axis=0),
+    srs=functools.partial(_srs, low=0, high=1, axis=0),
+    center=lambda x: x - x.mean(axis=0, keepdims=True),
+    zscore=functools.partial(sstats.zscore, axis=0, ddof=1),
+    minmax=functools.partial(_rescale, low=0, high=1, axis=0)
+)
+
+
 def normalize_expression(expression, norm='srs'):
     """
     Performs normalization on `expression` data
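The new `NORMALIZATION_METHODS` table uses `functools.partial` to freeze keyword arguments so every entry can be called uniformly as `func(data)`. A self-contained illustration of the pattern (the `rescale` helper below is a hypothetical stand-in for abagen's `_rescale`):

    import functools

    import numpy as np

    def rescale(data, low=0, high=1, axis=0):
        # hypothetical stand-in for abagen's _rescale helper
        dmin = data.min(axis=axis, keepdims=True)
        dmax = data.max(axis=axis, keepdims=True)
        return low + (high - low) * (data - dmin) / (dmax - dmin)

    # partial() bakes in the keyword arguments, so the dispatch table
    # can invoke every method with the same one-argument call
    minmax = functools.partial(rescale, low=0, high=1, axis=0)
    print(minmax(np.array([[1.0], [2.0], [3.0]])).ravel())  # [0.  0.5 1. ]
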
@@ -218,10 +228,12 @@ def normalize_expression(expression, norm='srs'):
     20130048
     """

-    norms = ['rs', 'srs', 'center', 'zscore', 'batch', 'minmax']
-    if norm not in norms:
+    try:
+        normfunc = NORMALIZATION_METHODS[norm]
+    except KeyError:
         raise ValueError('Provided value for `norm` not recognized. Must be '
-                         'one of {}. Received: {}'.format(norms, norm))
+                         'one of {}. Received: {}'
+                         .format(list(NORMALIZATION_METHODS), norm))

     # FIXME: I hate having to do this...
     if isinstance(expression, pd.DataFrame):
@@ -236,18 +248,8 @@ def normalize_expression(expression, norm='srs'):
         notna = np.logical_not(exp.isna().all(axis=1))
         data = np.asarray(exp)[notna]

-        if norm == 'rs':
-            normed = _rs(data, axis=0)
-        if norm == 'srs':
-            normed = _srs(data, low=0, high=1, axis=0)
-        elif norm == 'center':
-            normed = data - data.mean(axis=0, keepdims=True)
-        elif norm == 'zscore':
-            normed = sstats.zscore(data, axis=0, ddof=1)
-        elif norm == 'minmax':
-            normed = _rescale(data, low=0, high=1, axis=0)
-        elif norm == 'batch':
-            normed = corrected[n]
+        # normalize the data (however was specified)
+        normed = normfunc(data) if norm != 'batch' else corrected[n]

         # recreate dataframe and fill non-NaN values
         normalized = pd.DataFrame(np.nan, columns=exp.columns, index=exp.index)
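From the caller's side the refactor is behavior-preserving. A quick sketch of the expected usage (toy DataFrame; assumes `normalize_expression` accepts a single samples x genes DataFrame, as the `isinstance` shim above suggests):

    import numpy as np
    import pandas as pd

    from abagen.correct import normalize_expression

    # toy samples x genes matrix
    expression = pd.DataFrame(np.random.default_rng(0).random((10, 3)),
                              columns=['GABRA1', 'GABRA2', 'GABRA3'])

    # each gene (column) is rescaled via the scaled robust sigmoid
    normed = normalize_expression(expression, norm='srs')

    # unrecognized methods raise a ValueError listing the valid options
    try:
        normalize_expression(expression, norm='quantile')
    except ValueError as err:
        print(err)
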
8 changes: 4 additions & 4 deletions abagen/io.py
@@ -138,7 +138,7 @@ def read_ontology(fname, copy=False):
         data = pd.read_csv(fname)
     except ValueError:
         if not isinstance(fname, pd.DataFrame):
-            raise TypeError('Provided fname must be filepath to Ontology.csv'
+            raise TypeError('Provided fname must be filepath to Ontology.csv '
                             'file from Allen Human Brain Atlas.')
         data = fname.copy() if copy else fname
@@ -197,7 +197,7 @@ def read_pacall(fname, copy=False, parquet=True):
         data.columns = pd.Series(range(len(data.columns)), name='sample_id')
     except (AttributeError, ValueError):
         if not isinstance(fname, pd.DataFrame):
-            raise TypeError('Provided fname must be filepath to PACall.csv'
+            raise TypeError('Provided fname must be filepath to PACall.csv '
                             'file from Allen Human Brain Atlas.')
         data = fname.copy() if copy else fname
@@ -236,7 +236,7 @@ def read_probes(fname, copy=False):
         data = pd.read_csv(fname, index_col=0)
     except ValueError:
         if not isinstance(fname, pd.DataFrame):
-            raise TypeError('Provided fname must be filepath to Probes.csv'
+            raise TypeError('Provided fname must be filepath to Probes.csv '
                             'file from Allen Human Brain Atlas.')
         data = fname.copy() if copy else fname
@@ -278,7 +278,7 @@ def read_annotation(fname, copy=False):
         data.index.name = 'sample_id'
     except ValueError:
         if not isinstance(fname, pd.DataFrame):
-            raise TypeError('Provided fname must be filepath to Annotation.'
+            raise TypeError('Provided fname must be filepath to Annotation'
                             '.csv file from Allen Human Brain Atlas.')
         data = fname.copy() if copy else fname
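All four io.py changes fix the same pitfall: Python concatenates adjacent string literals at compile time, so a missing space at the wrap point fuses two words in the error message. A standalone illustration:

    # adjacent string literals are joined with no separator, so a missing
    # trailing space produces a fused word in the final message
    broken = ('Provided fname must be filepath to Ontology.csv'
              'file from Allen Human Brain Atlas.')
    print(broken)  # '...filepath to Ontology.csvfile from...'

    fixed = ('Provided fname must be filepath to Ontology.csv '
             'file from Allen Human Brain Atlas.')
    print(fixed)   # '...filepath to Ontology.csv file from...'
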
29 changes: 21 additions & 8 deletions abagen/probes.py
@@ -7,13 +7,16 @@
 import gzip
 from io import StringIO
 import itertools
+import logging
 from pkg_resources import resource_filename

 import numpy as np
 import pandas as pd

 from . import io, utils

+lgr = logging.getLogger('abagen')
+

 def reannotate_probes(probes):
     """
@@ -41,6 +44,9 @@ def reannotate_probes(probes):
            data. NeuroImage, 189, 353-367.
     """

+    lgr.info('Reannotating probes with information from Arnatkevic̆iūtė '
+             'et al., 2019, NeuroImage')
+
     # load in reannotated probes
     reannot = resource_filename('abagen', 'data/reannotated.csv.gz')
     with gzip.open(reannot, 'r') as src:
@@ -91,6 +97,9 @@ def filter_probes(pacall, samples, probes, threshold=0.5):

     threshold = np.clip(threshold, 0.0, 1.0)

+    lgr.info('Filtering probes with intensity-based threshold of {}'
+             .format(threshold))
+
     probes = io.read_probes(probes)
     signal, n_samp = np.zeros(len(probes), dtype=int), 0
     for pa, annot in zip(pacall, samples):
@@ -103,6 +112,8 @@ def filter_probes(pacall, samples, probes, threshold=0.5):
     # calculate proportion of signal to noise for given probe across samples
     keep = (signal / n_samp) >= threshold

+    lgr.info('{} probes survive intensity-based filtering'.format(keep.sum()))
+
     return probes[keep]


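The criterion being logged is straightforward: a probe survives when the fraction of samples in which it is called present meets `threshold`. A toy numpy sketch of the rule (hypothetical PA call matrix):

    import numpy as np

    # hypothetical PACall matrix: probes x samples, 1 = above background noise
    pacall = np.array([[1, 1, 1, 0],    # present in 3/4 samples -> keep
                       [1, 0, 0, 0],    # present in 1/4 samples -> drop
                       [1, 1, 0, 0]])   # present in 2/4 samples -> keep at 0.5

    signal = pacall.sum(axis=1)          # samples above noise, per probe
    keep = (signal / pacall.shape[1]) >= 0.5
    print(keep)                          # [ True False  True]
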
@@ -429,8 +440,7 @@ def _collapse(expression, probes, *args, method='variance', **kwargs):
     )


-def collapse_probes(microarray, annotation, probes, method='diff_stability',
-                    inplace=False):
+def collapse_probes(microarray, annotation, probes, method='diff_stability'):
     """
     Reduces `microarray` to a sample x gene expression dataframe
@@ -460,9 +470,6 @@ def collapse_probes(microarray, annotation, probes, method='diff_stability',
         same gene. Must be one of 'average', 'intensity', 'variance',
         'pc_loading', 'corr_variance', 'corr_intensity', or 'diff_stability';
         see Notes for more information. Default: 'diff_stability'
-    inplace : bool, optional
-        Whether to conserve memory by editing dataframes in-place instead of
-        returning edited copies. Default: False

     Returns
     -------
@@ -608,12 +615,18 @@ def collapse_probes(microarray, annotation, probes, method='diff_stability',
         raise ValueError('Provided method must be one of {}, not: {}'
                          .format(list(SELECTION_METHODS), method))

+    lgr.info('Reducing probes indexing same gene with method: "{}"'
+             .format(method))
+
     # read in microarray data for all subjects; this can be quite slow...
     probes = io.read_probes(probes)
     exp = [
-        (io.read_microarray(m, copy=not inplace)
-         .loc[probes.index, io.read_annotation(a, copy=not inplace).index])
+        io.read_microarray(m).loc[probes.index, io.read_annotation(a).index]
         for m, a in zip(microarray, annotation)
     ]
+    exp = [e.T for e in collfunc(exp, probes, annotation)]
+
+    lgr.info('{} genes remain after probe filtering and selection'
+             .format(exp[0].shape[-1]))

-    return [e.T for e in collfunc(exp, probes, annotation)]
+    return exp
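Dropping `inplace` means the dataframes are always copied on read, and collapsing now happens inside the function so the surviving gene count can be logged before returning. A hedged usage sketch (hypothetical file paths; one donor shown):

    from abagen import probes

    # hypothetical paths to a single donor's AHBA files
    microarray = ['normalized_microarray_donor9861/MicroarrayExpression.csv']
    annotation = ['normalized_microarray_donor9861/SampleAnnot.csv']
    probe_file = 'normalized_microarray_donor9861/Probes.csv'

    # returns one samples x genes dataframe per donor
    expression = probes.collapse_probes(microarray, annotation, probe_file,
                                        method='diff_stability')
    print(expression[0].shape)  # (n_samples, n_genes)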