Generate TileDB from collection of BigWigs (#2)
jkanche authored Oct 31, 2024
1 parent 50bf594 commit a075621
Showing 11 changed files with 766 additions and 28 deletions.
68 changes: 65 additions & 3 deletions README.md
@@ -12,11 +12,73 @@

[![Project generated with PyScaffold](https://img.shields.io/badge/-PyScaffold-005CA0?logo=pyscaffold)](https://pyscaffold.org/)

# Genomic Arrays based on TileDB

GenomicArrays is a Python package for converting genomic data from BigWig format to TileDB arrays.

## Installation

Install the package from [PyPI](https://pypi.org/project/genomicarrays/):

```sh
pip install genomicarrays
```

## Quick Start

### Build a `GenomicArray`

Building a `GenomicArray` generates 3 TileDB files in the specified output directory:

- `feature_annotation`: A TileDB file containing the input feature intervals.
- `sample_metadata`: A TileDB file containing sample metadata; each BigWig file is treated as a sample.
- A matrix TileDB file named by the `layer_matrix_name` parameter. This allows the package
to store multiple matrices, e.g. 'coverage', 'some_computed_statistic', for the same set of
intervals and samples.

The organization is inspired by the [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) data structure. The TileDB matrix file is stored in a **features X samples** orientation.

![`GenomicArray` structure](./assets/genarr.png "GenomicArray")

To build a `GenomicArray` from a collection of `BigWig` files:

```python
import os
import tempfile

import numpy as np
import pandas as pd

import genomicarrays as garr

# Create a temporary directory; this is where the
# output files are created. Pick your own location here.
tempdir = tempfile.mkdtemp()

# List BigWig paths
bw_dir = "your/bigwig/dir"
files = os.listdir(bw_dir)
bw_files = [f"{bw_dir}/{f}" for f in files]

# Define the input feature intervals
features = pd.DataFrame({
    "chrom": ["chr1", "chr1"],
    "start": [1000, 2000],
    "end": [1500, 2500]
})

# Build the GenomicArray
garr.build_genomicarray(
    files=bw_files,
    output_path=tempdir,
    features=features,
    # Aggregate function to summarize multiple values
    # from a BigWig within an input feature interval.
    feature_annotation_options=garr.FeatureAnnotationOptions(
        aggregate_function=np.nanmean
    ),
    # Process multiple BigWig files in parallel.
    num_threads=4,
)
```
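
The generated stores are regular TileDB arrays and can be inspected directly with the `tiledb` package. A minimal sketch, assuming the build above; the matrix store name below (`coverage`) is an assumption for illustration, since the actual name is set by the `layer_matrix_name`/matrix options:

```python
import tiledb

# Feature annotations: one row per input interval.
with tiledb.open(f"{tempdir}/feature_annotation", "r") as farr:
    print(farr.df[:])

# Dense matrix in features x samples orientation;
# 'coverage' is an assumed store name, check your matrix options.
with tiledb.open(f"{tempdir}/coverage", "r") as marr:
    print(marr.schema)
    chunk = marr[0:2, 0:1]  # first two intervals, first sample
```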

The build process stores missing intervals from a BigWig file as `np.nan`. Choose an
aggregate function that works with `np.nan`, such as `np.nanmean`.
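
Any callable that reduces an interval's values to a single number should work as the aggregate. A short sketch of a custom `np.nan`-aware aggregate; the helper below is a hypothetical example, not part of the package:

```python
import numpy as np
import genomicarrays as garr

def nanmax_or_zero(values: np.ndarray) -> float:
    """Max over the interval, ignoring missing positions; 0.0 if all are missing."""
    if np.all(np.isnan(values)):
        return 0.0
    return float(np.nanmax(values))

# Plug into the same option shown in the build example above.
opts = garr.FeatureAnnotationOptions(aggregate_function=nanmax_or_zero)
```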


<!-- pyscaffold-notes -->
Binary file added assets/genarr.jpg
Binary file added assets/genarr.png
1 change: 1 addition & 0 deletions assets/genarr.svg
2 changes: 2 additions & 0 deletions src/genomicarrays/__init__.py
@@ -14,3 +14,5 @@
    __version__ = "unknown"
finally:
    del version, PackageNotFoundError

from .build_genomicarray import build_genomicarray
286 changes: 286 additions & 0 deletions src/genomicarrays/build_genomicarray.py
@@ -0,0 +1,286 @@
"""Build the `GenomicArrayDatset`.
This modules provides tools for converting genomic data from BigWig format to TileDB.
It supports parallel processing for handling large collection of genomic datasets.
Example:
.. code-block:: python
import pyBigWig as bw
import numpy as np
import tempfile
from genomicarrays import build_genomicarray, MatrixOptions
# Create a temporary directory
tempdir = tempfile.mkdtemp()
# Read BigWig objects
bw1 = bw.open("path/to/object1.bw", "r")
# or just provide the path
bw2 = "path/to/object2.bw"
# Build GenomicArray
dataset = build_genomicarray(
output_path=tempdir,
files=[bw1, bw2],
matrix_options=MatrixOptions(dtype=np.float32),
)
"""

import itertools
import os
import warnings
from multiprocessing import Pool
from typing import Dict, Union

import numpy as np
import pandas as pd
import pyBigWig as bw
import tiledb
from cellarr import buildutils_tiledb_frame as utf

from . import build_options as bopt
from . import buildutils_tiledb_array as uta
from . import utils_bw as ubw

# from .GenomicArrayDataset import GenomicArrayDataset

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"


# TODO: Accept files as a dictionary with names to each dataset.
def build_genomicarray(
    files: list,
    output_path: str,
    features: Union[str, pd.DataFrame],
    genome: Union[str, pd.DataFrame] = "hg38",
    sample_metadata: Union[pd.DataFrame, str] = None,
    sample_metadata_options: bopt.SampleMetadataOptions = bopt.SampleMetadataOptions(),
    matrix_options: bopt.MatrixOptions = bopt.MatrixOptions(),
    feature_annotation_options: bopt.FeatureAnnotationOptions = bopt.FeatureAnnotationOptions(),
    optimize_tiledb: bool = True,
    num_threads: int = 1,
):
"""Generate the `GenomicArrayDatset`.
All files are expected to be consistent and any modifications
to make them consistent is outside the scope of this function
and package.
Args:
files:
List of file paths to `BigWig` files.
output_path:
Path to where the output TileDB files should be stored.
features:
A :py:class:`~pandas.DataFrame` containing the input genomic
intervals..
Alternatively, may provide path to the file containing a
list of intervals. In this case,
the first row is expected to contain the column names,
"chrom", "start" and "end".
genome:
A string specifying the genome to automatically download the
chromosome sizes from ucsc.
Alternatively, may provide a :py:class:`~pandas.DataFrame`
containing columns 'chrom' and 'lengths'.
Note: This parameter is currently not used. Ideally this will
be used to truncate the regions.
sample_metadata:
A :py:class:`~pandas.DataFrame` containing the sample
metadata for each file in ``files``. Hences the number of rows
in the dataframe must match the number of ``files``.
Alternatively, may provide path to the file containing a
concatenated sample metadata across all BigWig files. In this case,
the first row is expected to contain the column names.
Additionally, the order of rows is expected to be in the same
order as the input list of ``files``.
Defaults to `None`, in which case, we create a simple sample
metadata dataframe containing the list of datasets, aka
each BigWig files. Each dataset is named as ``sample_{i}``
where `i` refers to the index position of the object in ``files``.
sample_metadata_options:
Optional parameters when generating ``sample_metadata`` store.
matrix_options:
Optional parameters when generating ``matrix`` store.
feature_annotation_options:
Optional parameters when generating ``feature_annotation`` store.
optimize_tiledb:
Whether to run TileDB's vaccum and consolidation (may take long).
num_threads:
Number of threads.
Defaults to 1.
"""
    if not os.path.isdir(output_path):
        raise ValueError("'output_path' must be a directory.")

    ####
    ## Process genome information
    ####
    # if isinstance(genome, str):
    #     chrom_sizes = pd.read_csv(
    #         "https://hgdownload.soe.ucsc.edu/goldenpath/hg38/bigZips/hg38.chrom.sizes",
    #         sep="\t",
    #         header=None,
    #         names=["chrom", "length"],
    #     )
    # elif isinstance(genome, pd.DataFrame):
    #     chrom_sizes = genome
    #     if "chrom" not in chrom_sizes:
    #         raise ValueError("genome does not contain column: 'chrom'.")
    #
    #     if "length" not in chrom_sizes:
    #         raise ValueError("genome does not contain column: 'length'.")
    #
    # else:
    #     raise TypeError(
    #         "'genome' is not an expected type (either 'str' or 'DataFrame')."
    #     )

    ####
    ## Writing the features aka interval regions
    ####
    if isinstance(features, str):
        input_intervals = pd.read_csv(features, header=0)
    elif isinstance(features, pd.DataFrame):
        input_intervals = features.copy()

        required_cols = {"chrom", "start", "end"}
        if not required_cols.issubset(input_intervals.columns):
            missing = required_cols - set(input_intervals.columns)
            raise ValueError(f"Missing required columns: {missing}")
    else:
        raise TypeError(
            "'features' is not an expected type (either 'str' or 'DataFrame')."
        )

    if not feature_annotation_options.skip:
        _col_types = utf.infer_column_types(
            input_intervals, feature_annotation_options.column_types
        )

        _feature_output_uri = (
            f"{output_path}/{feature_annotation_options.tiledb_store_name}"
        )
        utf.create_tiledb_frame_from_dataframe(
            _feature_output_uri, input_intervals, column_types=_col_types
        )

        if optimize_tiledb:
            uta.optimize_tiledb_array(_feature_output_uri)

    ####
    ## Writing the sample metadata file
    ####
    _samples = []
    for idx, _ in enumerate(files):
        _samples.append(f"sample_{idx + 1}")

    if sample_metadata is None:
        warnings.warn(
            "Sample metadata is not provided, each dataset in 'files' is considered a sample",
            UserWarning,
        )

        sample_metadata = pd.DataFrame({"genarr_sample": _samples})
    elif isinstance(sample_metadata, str):
        sample_metadata = pd.read_csv(sample_metadata, header=0)
        if "genarr_sample" not in sample_metadata.columns:
            sample_metadata["genarr_sample"] = _samples
    elif isinstance(sample_metadata, pd.DataFrame):
        if "genarr_sample" not in sample_metadata.columns:
            sample_metadata["genarr_sample"] = _samples
    else:
        raise TypeError("'sample_metadata' is not an expected type.")

    if not sample_metadata_options.skip:
        _col_types = utf.infer_column_types(
            sample_metadata, sample_metadata_options.column_types
        )

        _sample_output_uri = (
            f"{output_path}/{sample_metadata_options.tiledb_store_name}"
        )
        utf.create_tiledb_frame_from_dataframe(
            _sample_output_uri, sample_metadata, column_types=_col_types
        )

        if optimize_tiledb:
            uta.optimize_tiledb_array(_sample_output_uri)

    ####
    ## Writing the matrix store (e.g. coverage)
    ####
    if not matrix_options.skip:
        _cov_uri = f"{output_path}/{matrix_options.tiledb_store_name}"
        uta.create_tiledb_array(
            _cov_uri,
            matrix_attr_name=matrix_options.matrix_attr_name,
            x_dim_dtype=feature_annotation_options.dtype,
            y_dim_dtype=sample_metadata_options.dtype,
            matrix_dim_dtype=matrix_options.dtype,
            x_dim_length=len(input_intervals),
            y_dim_length=len(files),
            is_sparse=False,
        )

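        # One task per BigWig file: each worker extracts values for every
        # input interval and writes them into the dense matrix at sample
        # index `idx`.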
        all_bws_options = [
            (
                _cov_uri,
                input_intervals,
                bwpath,
                idx,
                feature_annotation_options.aggregate_function,
            )
            for idx, bwpath in enumerate(files)
        ]
        with Pool(num_threads) as p:
            p.map(_wrapper_extract_bwinfo, all_bws_options)

        if optimize_tiledb:
            uta.optimize_tiledb_array(_cov_uri)

    # return GenomicArrayDataset(
    #     dataset_path=output_path,
    #     sample_metadata_uri=sample_metadata_options.tiledb_store_name,
    #     cell_metadata_uri=cell_metadata_options.tiledb_store_name,
    #     gene_annotation_uri=gene_annotation_options.tiledb_store_name,
    #     matrix_tdb_uri=matrix_options.tiledb_store_name,
    # )


def _write_intervals_to_tiledb(outpath, intervals, bwpath, bwidx, agg_func):
    """Extract data for the given intervals from a BigWig file and write
    the output to the TileDB array."""
    data = ubw.extract_bw_intervals_as_vec(bwpath, intervals, agg_func)

    if data is not None and len(data) > 0:
        uta.write_frame_intervals_to_tiledb(outpath, data=data, y_idx=bwidx)


def _wrapper_extract_bwinfo(args):
    """Wrapper for multiprocessing over multiple files and intervals."""
    counts_uri, input_intervals, bwpath, idx, agg_func = args
    return _write_intervals_to_tiledb(counts_uri, input_intervals, bwpath, idx, agg_func)
