Merge pull request #90 from rmarkello/donornorm

[ENH] Adds parameter for normalizing donor microarray expression values
rmarkello · Sep 4, 2019 · 649afc2 · 649afc2
2 parents 5120ccd + a8acfdf
commit 649afc2
Show file tree

Hide file tree

Showing 7 changed files with 349 additions and 123 deletions.
diff --git a/abagen/__init__.py b/abagen/__init__.py
@@ -1,7 +1,7 @@
 __all__ = [
     '__version__', '__doc__',
     'io', 'mouse', 'get_expression_data', 'keep_stable_genes',
-    'remove_distance', 'aggregate_donors', 'fetch_desikan_killiany',
+    'normalize_expression', 'remove_distance', 'fetch_desikan_killiany',
     'fetch_gene_group', 'fetch_microarray', 'fetch_raw_mri'
 ]
 
@@ -13,7 +13,6 @@
 
 from . import io, mouse
 from .allen import get_expression_data
-from .correct import keep_stable_genes, remove_distance
+from .correct import keep_stable_genes, normalize_expression, remove_distance
 from .datasets import (fetch_desikan_killiany, fetch_gene_group,
                        fetch_microarray, fetch_raw_mri)
-from .process import aggregate_donors
diff --git a/abagen/allen.py b/abagen/allen.py
@@ -1,15 +1,14 @@
 # -*- coding: utf-8 -*-
 """
 Functions for mapping AHBA microarray dataset to atlases and and parcellations
-in MNI space
 """
 
 from functools import reduce
 
 import numpy as np
 import pandas as pd
 
-from . import datasets, io, probes, process, samples, utils
+from . import correct, datasets, io, probes, samples, utils
 
 import logging
 logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
@@ -67,7 +66,8 @@ def groupby_label(microarray, sample_labels, labels=None, metric='mean'):
 def get_expression_data(atlas, atlas_info=None, *, exact=True,
                         tolerance=2, metric='mean', ibf_threshold=0.5,
                         probe_selection='diff_stability',
-                        lr_mirror=False, corrected_mni=True, reannotated=True,
+                        lr_mirror=False, donor_norm='srs',
+                        corrected_mni=True, reannotated=True,
                         return_counts=False, return_donors=False,
                         donors='all', data_dir=None, verbose=1):
     """
@@ -110,8 +110,8 @@ def get_expression_data(atlas, atlas_info=None, *, exact=True,
     parameter; see the parameter description for more information.
 
     Once all samples have been matched to parcels for all supplied donors, the
-    microarray expression data are normalized within-donor via a scaled robust
-    sigmoid (SRS) procedure before being combined across donors via the
+    microarray expression data are optionally normalized within-donor via the
+    provided `donor_norm` function before being combined across donors via the
     supplied `metric`.
 
     Parameters
@@ -160,6 +160,12 @@ def get_expression_data(atlas, atlas_info=None, *, exact=True,
         increase spatial coverage. This will duplicate samples across both
         hemispheres (i.e., L->R and R->L), approximately doubling the number of
         available samples. Default: False
+    donor_norm : {'srs', 'zscore', 'batch', None}, optional
+        Method by which to normalize microarray expression values for each
+        donor. Expression values are normalized separately for each gene for
+        each donor across all regions in `atlas`; see Notes for more
+        information on different methods. If not specified no normalization
+        is performed. Default: 'srs'
     corrected_mni : bool, optional
         Whether to use the "corrected" MNI coordinates shipped with the
         `alleninf` package instead of the coordinates provided with the AHBA
@@ -206,43 +212,67 @@ def get_expression_data(atlas, atlas_info=None, *, exact=True,
     The following methods can be used for collapsing across probes when
     multiple probes are available for the same gene.
 
-    1. ``probe_selection='average'``
+        1. ``probe_selection='average'``
 
-    Takes the average of expression data across all probes indexing the same
-    gene. Providing 'mean' as the input method will return the same thing.
+        Takes the average of expression data across all probes indexing the
+        same gene. Providing 'mean' as the input method will return the same
+        thing.
 
-    2. ``probe_selection='max_intensity'``
+        2. ``probe_selection='max_intensity'``
 
-    Selects the probe with the maximum average expression across samples from
-    all donors.
+        Selects the probe with the maximum average expression across samples
+        from all donors.
 
-    3. ``probe_selection='max_variance'``
+        3. ``probe_selection='max_variance'``
 
-    Selects the probe with the maximum variance in expression across samples
-    from all donors.
+        Selects the probe with the maximum variance in expression across
+        samples from all donors.
 
-    4. ``probe_selection='pc_loading'``
+        4. ``probe_selection='pc_loading'``
 
-    Selects the probe with the maximum loading along the first principal
-    component of a decomposition performed across samples from all donors.
+        Selects the probe with the maximum loading along the first principal
+        component of a decomposition performed across samples from all donors.
 
-    5. ``probe_selection='corr_intensity'``
+        5. ``probe_selection='corr_intensity'``
 
-    Selects the probe with the maximum correlation to other probes from the
-    same gene when >2 probes exist; otherwise, uses the same procedure as
-    `max_intensity`.
+        Selects the probe with the maximum correlation to other probes from the
+        same gene when >2 probes exist; otherwise, uses the same procedure as
+        `max_intensity`.
 
-    6. ``probe_selection='corr_variance'``
+        6. ``probe_selection='corr_variance'``
 
-    Selects the probe with the maximum correlation to other probes from the
-    same gene when >2 probes exist; otherwise, uses the same procedure as
-    `max_varance`.
+        Selects the probe with the maximum correlation to other probes from the
+        same gene when >2 probes exist; otherwise, uses the same procedure as
+        `max_varance`.
 
-    7. ``probe_selection='diff_stability'``
+        7. ``probe_selection='diff_stability'``
 
-    Selects the probe with the most consistent pattern of regional variation
-    across donors (i.e., the highest average correlation across brain regions
-    between all pairs of donors).
+        Selects the probe with the most consistent pattern of regional
+        variation across donors (i.e., the highest average correlation across
+        brain regions between all pairs of donors).
+
+    The following methods can be used for normalizing microarray expression
+    values for each donor prior to aggregating:
+
+        1. ``donor_norm='srs'``
+
+        Uses a scaled robust sigmoid function as in [A3]_ to normalize
+        expression values for each gene across regions to within the unit
+        normal (i.e., in the range 0-1).
+
+        2. ``donor_norm='zscore'``
+
+        Applies a basic z-score (subtract mean, divide by standard deviation)
+        to expression values for each gene across regions. Uses degrees of
+        freedom equal to one for standard deviation calculation.
+
+        3. ``donor_norm='batch'``
+
+        Uses a linear model to remove donor effects from expression values.
+        Differs from other methods in that all donors are simultaneously fit
+        to the same model and expression values are residualized based on
+        estimated betas. Linear model includes the intercept but the
+        residualization does not remove it.
 
     References
     ----------
@@ -251,6 +281,9 @@ def get_expression_data(atlas, atlas_info=None, *, exact=True,
        data. NeuroImage, 189, 353-367.
     .. [A2] Hawrylycz, M.J. et al. (2012) An anatomically comprehensive atlas
        of the adult human transcriptome. Nature, 489, 391-399.
+    .. [A3] Fulcher, B. D., & Fornito, A. (2016). A transcriptional signature
+       of hub connectivity in the mouse connectome. Proceedings of the National
+       Academy of Sciences, 113(5), 1435-1440.
     """
 
     # set logging verbosity level
@@ -389,11 +422,12 @@ def get_expression_data(atlas, atlas_info=None, *, exact=True,
             counts.loc[roi, ind] += 1
 
     # normalize data with SRS
-    expression = [process.normalize_expression(e) for e in expression]
+    if donor_norm is not None:
+        expression = correct.normalize_expression(expression, norm=donor_norm)
 
     # aggregate across donors if individual donor dataframes not requested
     if not return_donors:
-        expression = process.aggregate_donors(expression, metric)
+        expression = pd.concat(expression).groupby('label').aggregate(metric)
 
     # drop the "zero" label from the counts dataframe (this is background)
     if return_counts:

diff --git a/abagen/cli/run.py b/abagen/cli/run.py
@@ -20,6 +20,15 @@ def _resolve_path(path):
             return os.path.abspath(os.path.expanduser(path))
 
 
+def _resolve_none(inp):
+    """ Helper function to allow 'None' as input from argparse
+    """
+
+    if inp == "None":
+        return
+    return inp
+
+
 class CheckExists(argparse.Action):
     """ Helper class to check that provided paths exist
     """
@@ -78,9 +87,9 @@ def get_parser():
 see the parameter description for more information.
 
 Once all samples have been matched to parcels for all supplied donors, the
-microarray expression data are normalized within-donor via a scaled robust
-sigmoid (SRS) procedure before being combined across donors via the supplied
-`metric`.
+microarray expression data are optionally normalized within-donor via the
+provided `donor_norm` function before being combined across donors via the
+supplied `metric`.
 """
     )
 
@@ -193,6 +202,16 @@ def get_parser():
                              'both hemispheres (i.e., L->R and R->L), '
                              'approximately doubling the number of available '
                              'samples. Default: False (i.e., no mirroring)')
+    w_data.add_argument('--donor_norm', '--donor-norm', action='store',
+                        default='srs', metavar='METHOD', type=_resolve_none,
+                        choices=['srs', 'zscore', 'batch', None],
+                        help='Method by which to normalize microarray '
+                             'expression values for each donor prior to '
+                             'collapsing across donors. Expression values are '
+                             'normalized separately for each gene for each '
+                             'donor across all regions in `atlas`. Must be '
+                             'one of {"srs", "zscore", "batch", None}. '
+                             'Default: "srs"')
 
     p_data = parser.add_argument_group('Options to modify the AHBA data used')
     p_data.add_argument('--no-reannotated', '--no_reannotated',
@@ -274,6 +293,7 @@ def main(args=None):
                                      ibf_threshold=opts.ibf_threshold,
                                      probe_selection=opts.probe_selection,
                                      lr_mirror=opts.lr_mirror,
+                                     donor_norm=opts.donor_norm,
                                      corrected_mni=opts.corrected_mni,
                                      reannotated=opts.reannotated,
                                      return_counts=opts.save_counts,