Skip to content

Commit

Permalink
Merge pull request #90 from rmarkello/donornorm
Browse files Browse the repository at this point in the history
[ENH] Adds parameter for normalizing donor microarray expression values
  • Loading branch information
rmarkello authored Sep 4, 2019
2 parents 5120ccd + a8acfdf commit 649afc2
Show file tree
Hide file tree
Showing 7 changed files with 349 additions and 123 deletions.
5 changes: 2 additions & 3 deletions abagen/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
__all__ = [
'__version__', '__doc__',
'io', 'mouse', 'get_expression_data', 'keep_stable_genes',
'remove_distance', 'aggregate_donors', 'fetch_desikan_killiany',
'normalize_expression', 'remove_distance', 'fetch_desikan_killiany',
'fetch_gene_group', 'fetch_microarray', 'fetch_raw_mri'
]

Expand All @@ -13,7 +13,6 @@

from . import io, mouse
from .allen import get_expression_data
from .correct import keep_stable_genes, remove_distance
from .correct import keep_stable_genes, normalize_expression, remove_distance
from .datasets import (fetch_desikan_killiany, fetch_gene_group,
fetch_microarray, fetch_raw_mri)
from .process import aggregate_donors
96 changes: 65 additions & 31 deletions abagen/allen.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
# -*- coding: utf-8 -*-
"""
Functions for mapping AHBA microarray dataset to atlases and and parcellations
in MNI space
"""

from functools import reduce

import numpy as np
import pandas as pd

from . import datasets, io, probes, process, samples, utils
from . import correct, datasets, io, probes, samples, utils

import logging
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
Expand Down Expand Up @@ -67,7 +66,8 @@ def groupby_label(microarray, sample_labels, labels=None, metric='mean'):
def get_expression_data(atlas, atlas_info=None, *, exact=True,
tolerance=2, metric='mean', ibf_threshold=0.5,
probe_selection='diff_stability',
lr_mirror=False, corrected_mni=True, reannotated=True,
lr_mirror=False, donor_norm='srs',
corrected_mni=True, reannotated=True,
return_counts=False, return_donors=False,
donors='all', data_dir=None, verbose=1):
"""
Expand Down Expand Up @@ -110,8 +110,8 @@ def get_expression_data(atlas, atlas_info=None, *, exact=True,
parameter; see the parameter description for more information.
Once all samples have been matched to parcels for all supplied donors, the
microarray expression data are normalized within-donor via a scaled robust
sigmoid (SRS) procedure before being combined across donors via the
microarray expression data are optionally normalized within-donor via the
provided `donor_norm` function before being combined across donors via the
supplied `metric`.
Parameters
Expand Down Expand Up @@ -160,6 +160,12 @@ def get_expression_data(atlas, atlas_info=None, *, exact=True,
increase spatial coverage. This will duplicate samples across both
hemispheres (i.e., L->R and R->L), approximately doubling the number of
available samples. Default: False
donor_norm : {'srs', 'zscore', 'batch', None}, optional
Method by which to normalize microarray expression values for each
donor. Expression values are normalized separately for each gene for
each donor across all regions in `atlas`; see Notes for more
information on different methods. If not specified no normalization
is performed. Default: 'srs'
corrected_mni : bool, optional
Whether to use the "corrected" MNI coordinates shipped with the
`alleninf` package instead of the coordinates provided with the AHBA
Expand Down Expand Up @@ -206,43 +212,67 @@ def get_expression_data(atlas, atlas_info=None, *, exact=True,
The following methods can be used for collapsing across probes when
multiple probes are available for the same gene.
1. ``probe_selection='average'``
1. ``probe_selection='average'``
Takes the average of expression data across all probes indexing the same
gene. Providing 'mean' as the input method will return the same thing.
Takes the average of expression data across all probes indexing the
same gene. Providing 'mean' as the input method will return the same
thing.
2. ``probe_selection='max_intensity'``
2. ``probe_selection='max_intensity'``
Selects the probe with the maximum average expression across samples from
all donors.
Selects the probe with the maximum average expression across samples
from all donors.
3. ``probe_selection='max_variance'``
3. ``probe_selection='max_variance'``
Selects the probe with the maximum variance in expression across samples
from all donors.
Selects the probe with the maximum variance in expression across
samples from all donors.
4. ``probe_selection='pc_loading'``
4. ``probe_selection='pc_loading'``
Selects the probe with the maximum loading along the first principal
component of a decomposition performed across samples from all donors.
Selects the probe with the maximum loading along the first principal
component of a decomposition performed across samples from all donors.
5. ``probe_selection='corr_intensity'``
5. ``probe_selection='corr_intensity'``
Selects the probe with the maximum correlation to other probes from the
same gene when >2 probes exist; otherwise, uses the same procedure as
`max_intensity`.
Selects the probe with the maximum correlation to other probes from the
same gene when >2 probes exist; otherwise, uses the same procedure as
`max_intensity`.
6. ``probe_selection='corr_variance'``
6. ``probe_selection='corr_variance'``
Selects the probe with the maximum correlation to other probes from the
same gene when >2 probes exist; otherwise, uses the same procedure as
`max_varance`.
Selects the probe with the maximum correlation to other probes from the
same gene when >2 probes exist; otherwise, uses the same procedure as
`max_varance`.
7. ``probe_selection='diff_stability'``
7. ``probe_selection='diff_stability'``
Selects the probe with the most consistent pattern of regional variation
across donors (i.e., the highest average correlation across brain regions
between all pairs of donors).
Selects the probe with the most consistent pattern of regional
variation across donors (i.e., the highest average correlation across
brain regions between all pairs of donors).
The following methods can be used for normalizing microarray expression
values for each donor prior to aggregating:
1. ``donor_norm='srs'``
Uses a scaled robust sigmoid function as in [A3]_ to normalize
expression values for each gene across regions to within the unit
normal (i.e., in the range 0-1).
2. ``donor_norm='zscore'``
Applies a basic z-score (subtract mean, divide by standard deviation)
to expression values for each gene across regions. Uses degrees of
freedom equal to one for standard deviation calculation.
3. ``donor_norm='batch'``
Uses a linear model to remove donor effects from expression values.
Differs from other methods in that all donors are simultaneously fit
to the same model and expression values are residualized based on
estimated betas. Linear model includes the intercept but the
residualization does not remove it.
References
----------
Expand All @@ -251,6 +281,9 @@ def get_expression_data(atlas, atlas_info=None, *, exact=True,
data. NeuroImage, 189, 353-367.
.. [A2] Hawrylycz, M.J. et al. (2012) An anatomically comprehensive atlas
of the adult human transcriptome. Nature, 489, 391-399.
.. [A3] Fulcher, B. D., & Fornito, A. (2016). A transcriptional signature
of hub connectivity in the mouse connectome. Proceedings of the National
Academy of Sciences, 113(5), 1435-1440.
"""

# set logging verbosity level
Expand Down Expand Up @@ -389,11 +422,12 @@ def get_expression_data(atlas, atlas_info=None, *, exact=True,
counts.loc[roi, ind] += 1

# normalize data with SRS
expression = [process.normalize_expression(e) for e in expression]
if donor_norm is not None:
expression = correct.normalize_expression(expression, norm=donor_norm)

# aggregate across donors if individual donor dataframes not requested
if not return_donors:
expression = process.aggregate_donors(expression, metric)
expression = pd.concat(expression).groupby('label').aggregate(metric)

# drop the "zero" label from the counts dataframe (this is background)
if return_counts:
Expand Down
26 changes: 23 additions & 3 deletions abagen/cli/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@ def _resolve_path(path):
return os.path.abspath(os.path.expanduser(path))


def _resolve_none(inp):
""" Helper function to allow 'None' as input from argparse
"""

if inp == "None":
return
return inp


class CheckExists(argparse.Action):
""" Helper class to check that provided paths exist
"""
Expand Down Expand Up @@ -78,9 +87,9 @@ def get_parser():
see the parameter description for more information.
Once all samples have been matched to parcels for all supplied donors, the
microarray expression data are normalized within-donor via a scaled robust
sigmoid (SRS) procedure before being combined across donors via the supplied
`metric`.
microarray expression data are optionally normalized within-donor via the
provided `donor_norm` function before being combined across donors via the
supplied `metric`.
"""
)

Expand Down Expand Up @@ -193,6 +202,16 @@ def get_parser():
'both hemispheres (i.e., L->R and R->L), '
'approximately doubling the number of available '
'samples. Default: False (i.e., no mirroring)')
w_data.add_argument('--donor_norm', '--donor-norm', action='store',
default='srs', metavar='METHOD', type=_resolve_none,
choices=['srs', 'zscore', 'batch', None],
help='Method by which to normalize microarray '
'expression values for each donor prior to '
'collapsing across donors. Expression values are '
'normalized separately for each gene for each '
'donor across all regions in `atlas`. Must be '
'one of {"srs", "zscore", "batch", None}. '
'Default: "srs"')

p_data = parser.add_argument_group('Options to modify the AHBA data used')
p_data.add_argument('--no-reannotated', '--no_reannotated',
Expand Down Expand Up @@ -274,6 +293,7 @@ def main(args=None):
ibf_threshold=opts.ibf_threshold,
probe_selection=opts.probe_selection,
lr_mirror=opts.lr_mirror,
donor_norm=opts.donor_norm,
corrected_mni=opts.corrected_mni,
reannotated=opts.reannotated,
return_counts=opts.save_counts,
Expand Down
Loading

0 comments on commit 649afc2

Please sign in to comment.