[REF] abagen.io functions don't copy dataframes by default #94

Merged: 4 commits (Sep 5, 2019)
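After this change, the abagen.io readers hand a pre-loaded DataFrame straight back unless copy=True is passed. A minimal sketch of the new semantics (the probe values below are made up for illustration, assuming a post-merge abagen):

import pandas as pd
from abagen import io

# hypothetical pre-loaded probe information
probes = pd.DataFrame({'probe_name': ['A_23_P1']},
                      index=pd.Index([1058685], name='probe_id'))

same = io.read_probes(probes)                # copy=False by default
assert same is probes                        # caller's frame comes back as-is

copied = io.read_probes(probes, copy=True)   # opt in to a defensive copy
assert copied is not probes and copied.equals(probes)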
2 changes: 1 addition & 1 deletion abagen/allen.py
@@ -344,7 +344,7 @@ def get_expression_data(atlas, atlas_info=None, *, exact=True,
files.update(dict(microarray=micro, pacall=micro, annotation=annot))

# get dataframe of probe information (reannotated or otherwise)
probe_info = io.read_probes(files['probes'][0])
probe_info = io.read_probes(files['probes'][0], copy=True)
if reannotated:
lgr.info('Reannotating microarray probes with information from '
'Arnatkevic̆iūtė et al., 2018, NeuroImage')
137 changes: 76 additions & 61 deletions abagen/io.py
@@ -36,6 +36,7 @@ def _make_parquet(fname, convert_only=False):
data : pandas.DataFrame
Data loaded from `fname`
"""

# get ideal parquet filename
parqname = fname.rpartition('.csv')[0] + '.parq'

@@ -55,7 +56,7 @@
return data


def read_microarray(fname, parquet=True):
def read_microarray(fname, copy=False, parquet=True):
"""
Loads MicroarrayExpression.csv file found at `fname`

@@ -66,6 +67,9 @@
----------
fname : str
Path to MicroarrayExpression.csv file
copy : bool, optional
Whether to return a copy if `fname` is a pre-loaded pandas.DataFrame.
Default: False
parquet : bool, optional
Whether to load data from parquet file instead of CSV. If a parquet
file does not already exist then one will be created for faster loading
@@ -79,28 +83,29 @@
and `S` is samples. The row index is the unique probe ID assigned
during processing, which can be used to match data to the information
obtained with :func:`read_probes`. The column index is the unique
sample ID (integer, beginning at 1) which can be used to match data to
sample ID (integer, beginning at 0) which can be used to match data to
the information obtained with :func:`read_annotation`.
"""
if not isinstance(fname, str):
if isinstance(fname, pd.DataFrame):
return fname.copy()
else:
raise TypeError('Provided fname {} must be a filepath.'
.format(fname))

if use_parq and parquet:
data = _make_parquet(fname, convert_only=False).set_index('0')
else:
data = pd.read_csv(fname, header=None, index_col=0)

data.index.name = 'probe_id'
data.columns = pd.Series(range(len(data.columns)), name='sample_id')
try:
if use_parq and parquet:
data = _make_parquet(fname, convert_only=False)
data = data.set_index('0')
else:
data = pd.read_csv(fname, header=None, index_col=0)
data.index.name = 'probe_id'
data.columns = pd.Series(range(len(data.columns)), name='sample_id')
except (AttributeError, ValueError):
if not isinstance(fname, pd.DataFrame):
raise TypeError('Provided fname must be filepath to Microarray'
'Expression.csv file from Allen Human Brain '
'Atlas.')
data = fname.copy() if copy else fname

return data


def read_ontology(fname, parquet=True):
def read_ontology(fname, copy=False):
"""
Loads Ontology.csv file found at `fname`

@@ -116,8 +121,9 @@
----------
fname : str
Path to Ontology.csv file
parquet : bool, optional
Does nothing; for compatibility with other :mod:`abagen.io` functions
copy : bool, optional
Whether to return a copy if `fname` is a pre-loaded pandas.DataFrame.
Default: False

Returns
-------
@@ -127,17 +133,19 @@
'parent_structure_id', 'hemisphere', 'graph_order',
'structure_id_path', and 'color_hex_triplet'.
"""
if not isinstance(fname, str):
if isinstance(fname, pd.DataFrame):
return fname.copy()
else:
raise TypeError('Provided fname {} must be a filepath.'
.format(fname))

return pd.read_csv(fname)
try:
data = pd.read_csv(fname)
except ValueError:
if not isinstance(fname, pd.DataFrame):
raise TypeError('Provided fname must be filepath to Ontology.csv '
'file from Allen Human Brain Atlas.')
data = fname.copy() if copy else fname

return data

def read_pacall(fname, parquet=True):

def read_pacall(fname, copy=False, parquet=True):
"""
Loads PACall.csv file found at `fname`

@@ -158,6 +166,9 @@
----------
fname : str
Path to PACall.csv file
copy : bool, optional
Whether to return a copy if `fname` is a pre-loaded pandas.DataFrame.
Default: False
parquet : bool, optional
Whether to load data from parquet file instead of CSV. If a parquet
file does not already exist then one will be created for faster loading
@@ -175,25 +186,25 @@
the unique sample ID (integer, beginning at 0) which can be used to
match data to the information obtained with :func:`read_annotation`.
"""
if not isinstance(fname, str):
if isinstance(fname, pd.DataFrame):
return fname.copy()
else:
raise TypeError('Provided fname {} must be a filepath.'
.format(fname))

if use_parq and parquet:
data = _make_parquet(fname, convert_only=False).set_index('0')
else:
data = pd.read_csv(fname, header=None, index_col=0)

data.index.name = 'probe_id'
data.columns = pd.Series(range(len(data.columns)), name='sample_id')
try:
if use_parq and parquet:
data = _make_parquet(fname, convert_only=False)
data = data.set_index('0')
else:
data = pd.read_csv(fname, header=None, index_col=0)
data.index.name = 'probe_id'
data.columns = pd.Series(range(len(data.columns)), name='sample_id')
except (AttributeError, ValueError):
if not isinstance(fname, pd.DataFrame):
raise TypeError('Provided fname must be filepath to PACall.csv '
'file from Allen Human Brain Atlas.')
data = fname.copy() if copy else fname

return data


def read_probes(fname, parquet=True):
def read_probes(fname, copy=False):
"""
Loads Probes.csv file found at `fname`

@@ -207,8 +218,9 @@
----------
fname : str
Path to Probes.csv file
parquet : bool, optional
Does nothing; for compatibility with other :mod:`abagen.io` functions
copy : bool, optional
Whether to return a copy if `fname` is a pre-loaded pandas.DataFrame.
Default: False

Returns
-------
@@ -219,17 +231,19 @@
:func:`read_pacall`. Columns include 'probe_name', 'gene_id',
'gene_symbol', 'gene_name', 'entrez_id', and 'chromosome'.
"""
if not isinstance(fname, str):
if isinstance(fname, pd.DataFrame):
return fname.copy()
else:
raise TypeError('Provided fname {} must be a filepath.'
.format(fname))

return pd.read_csv(fname, index_col=0)
try:
data = pd.read_csv(fname, index_col=0)
except ValueError:
if not isinstance(fname, pd.DataFrame):
raise TypeError('Provided fname must be filepath to Probes.csv '
'file from Allen Human Brain Atlas.')
data = fname.copy() if copy else fname

return data


def read_annotation(fname, parquet=True):
def read_annotation(fname, copy=False):
"""
Loads SampleAnnot.csv file found at `fname`

@@ -243,8 +257,9 @@
----------
fname : str
Path to SampleAnnot.csv file
parquet : bool, optional
Does nothing; for compatibility with other :mod:`abagen.io` functions
copy : bool, optional
Whether to return a copy if `fname` is a pre-loaded pandas.DataFrame.
Default: False

Returns
-------
@@ -257,14 +272,14 @@
'structure_acronym', 'structure_name', 'polygon_id', 'mri_voxel_x',
'mri_voxel_y', 'mri_voxel_z', 'mni_x', 'mni_y', 'mni_z'.
"""
if not isinstance(fname, str):
if isinstance(fname, pd.DataFrame):
return fname.copy()
else:
raise TypeError('Provided fname {} must be a filepath.'
.format(fname))

annotation = pd.read_csv(fname)
annotation.index.name = 'sample_id'
try:
data = pd.read_csv(fname)
data.index.name = 'sample_id'
except ValueError:
if not isinstance(fname, pd.DataFrame):
raise TypeError('Provided fname must be filepath to SampleAnnot'
'.csv file from Allen Human Brain Atlas.')
data = fname.copy() if copy else fname

return annotation
return data
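All five readers now share the same EAFP shape: try to parse `fname` as a CSV path and, on failure, fall back to treating it as an already-loaded DataFrame. A hypothetical distillation of that pattern (not a function that exists in abagen) might read:

import pandas as pd

def _read_or_passthrough(fname, copy=False, **read_kwargs):
    # Common case first: `fname` is a path to a CSV file
    try:
        return pd.read_csv(fname, **read_kwargs)
    except ValueError:
        # pd.read_csv() raises ValueError for non-path, non-buffer input;
        # only a DataFrame is an acceptable fallback here
        if not isinstance(fname, pd.DataFrame):
            raise TypeError('Provided fname must be a filepath or a '
                            'pandas.DataFrame.')
        return fname.copy() if copy else fname

read_microarray and read_pacall additionally catch AttributeError because their parquet branch calls string methods on `fname` before pandas ever sees it.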
13 changes: 8 additions & 5 deletions abagen/probes.py
@@ -45,11 +45,12 @@ def reannotate_probes(probes):
reannot = resource_filename('abagen', 'data/reannotated.csv.gz')
with gzip.open(reannot, 'r') as src:
reannot = pd.read_csv(StringIO(src.read().decode('utf-8')))
reannot = reannot[['probe_name', 'gene_symbol', 'entrez_id']]

# merge reannotated with original, keeping only reannotated
probes = io.read_probes(probes).reset_index()[['probe_name', 'probe_id']]
merged = pd.merge(reannot, probes, on='probe_name', how='left')
probes = io.read_probes(probes).reset_index()
merged = pd.merge(reannot[['probe_name', 'gene_symbol', 'entrez_id']],
probes[['probe_name', 'probe_id']],
on='probe_name', how='left')

# reset index as probe_id and sort
reannotated = merged.set_index('probe_id').sort_index()
@@ -88,7 +89,7 @@ def filter_probes(pacall, probes, threshold=0.5):

threshold = np.clip(threshold, 0.0, 1.0)

probes = io.read_probes(probes)
probes = io.read_probes(probes, copy=True)
signal, samples = [], 0
for fname in pacall:
data = io.read_pacall(fname).loc[probes.index]
@@ -581,6 +582,8 @@ def collapse_probes(microarray, annotation, probes, method='diff_stability'):

# read in microarray data for all subjects; this can be quite slow...
probes = io.read_probes(probes)
exp = [io.read_microarray(micro).loc[probes.index] for micro in microarray]
exp = [
io.read_microarray(m, copy=True).loc[probes.index] for m in microarray
]

return [e.T for e in collfunc(exp, probes, annotation)]
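Note that the call sites above opt into copy=True only where the returned frame may be modified downstream, so a caller's pre-loaded DataFrame is never changed behind their back. A toy illustration of the aliasing hazard that the copy=False default leaves open:

import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3]})

alias = df                     # no copy: both names share one frame
alias.loc[0, 'x'] = 99
assert df.loc[0, 'x'] == 99    # the caller's "original" changed too

safe = df.copy()               # what copy=True buys
safe.loc[0, 'x'] = 0
assert df.loc[0, 'x'] == 99    # original left untouched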
10 changes: 5 additions & 5 deletions abagen/samples.py
@@ -85,7 +85,7 @@ def update_mni_coords(annotation):
axis=1)
coords = coords.set_index('well_id')

annotation = io.read_annotation(annotation)
annotation = io.read_annotation(annotation, copy=True)

# basic check that all well_ids in annotation are present in coords
# a future pandas update may cause this to raise a KeyError but we want
@@ -436,9 +436,9 @@ def _mirror_samples(microarray, pacall, annotation, ontology):
Loaded input data with all samples duplicated across hemispheres
"""

microarray = io.read_microarray(microarray)
pacall = io.read_pacall(pacall)
annotation = io.read_annotation(annotation)
microarray = io.read_microarray(microarray, copy=True)
pacall = io.read_pacall(pacall, copy=True)
annotation = io.read_annotation(annotation, copy=True)
ontology = io.read_ontology(ontology)

# take all lh and rh samples and flip x-coordinate
@@ -480,7 +480,7 @@ def _mirror_ontology(annotation, ontology):

HEMI_SWAP = dict(L='R', R='L')

annotation = io.read_annotation(annotation)
annotation = io.read_annotation(annotation, copy=True)
ontology = io.read_ontology(ontology)

# structure IDs are specific to structure + hemisphere, so we can use this