Commit
Generate TileDB from collection of BigWigs (#2)
Showing 11 changed files with 766 additions and 28 deletions.
@@ -0,0 +1,286 @@
"""Build the `GenomicArrayDatset`. | ||
This modules provides tools for converting genomic data from BigWig format to TileDB. | ||
It supports parallel processing for handling large collection of genomic datasets. | ||
Example: | ||
.. code-block:: python | ||
import pyBigWig as bw | ||
import numpy as np | ||
import tempfile | ||
from genomicarrays import build_genomicarray, MatrixOptions | ||
# Create a temporary directory | ||
tempdir = tempfile.mkdtemp() | ||
# Read BigWig objects | ||
bw1 = bw.open("path/to/object1.bw", "r") | ||
# or just provide the path | ||
bw2 = "path/to/object2.bw" | ||
# Build GenomicArray | ||
dataset = build_genomicarray( | ||
output_path=tempdir, | ||
files=[bw1, bw2], | ||
matrix_options=MatrixOptions(dtype=np.float32), | ||
) | ||
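
        # Alternatively, 'features' can be an in-memory frame and passed
        # as ``features=features`` above. A minimal sketch; the values
        # below are illustrative only:
        import pandas as pd

        features = pd.DataFrame(
            {
                "chrom": ["chr1", "chr1"],
                "start": [0, 1000],
                "end": [1000, 2000],
            }
        )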
""" | ||
|
||
import itertools
import os
import warnings
from multiprocessing import Pool
from typing import Dict, Union

import numpy as np
import pandas as pd
import pyBigWig as bw
import tiledb
from cellarr import buildutils_tiledb_frame as utf

from . import build_options as bopt
from . import buildutils_tiledb_array as uta
from . import utils_bw as ubw

# from .GenomicArrayDataset import GenomicArrayDataset

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"

# TODO: Accept files as a dictionary with names to each dataset.
def build_genomicarray(
    files: list,
    output_path: str,
    features: Union[str, pd.DataFrame],
    genome: Union[str, pd.DataFrame] = "hg38",
    sample_metadata: Union[pd.DataFrame, str] = None,
    sample_metadata_options: bopt.SampleMetadataOptions = bopt.SampleMetadataOptions(),
    matrix_options: bopt.MatrixOptions = bopt.MatrixOptions(),
    feature_annotation_options: bopt.FeatureAnnotationOptions = bopt.FeatureAnnotationOptions(),
    optimize_tiledb: bool = True,
    num_threads: int = 1,
):
"""Generate the `GenomicArrayDatset`. | ||
All files are expected to be consistent and any modifications | ||
to make them consistent is outside the scope of this function | ||
and package. | ||
Args: | ||
files: | ||
List of file paths to `BigWig` files. | ||
output_path: | ||
Path to where the output TileDB files should be stored. | ||
features: | ||
A :py:class:`~pandas.DataFrame` containing the input genomic | ||
intervals.. | ||
Alternatively, may provide path to the file containing a | ||
list of intervals. In this case, | ||
the first row is expected to contain the column names, | ||
"chrom", "start" and "end". | ||
genome: | ||
A string specifying the genome to automatically download the | ||
chromosome sizes from ucsc. | ||
Alternatively, may provide a :py:class:`~pandas.DataFrame` | ||
containing columns 'chrom' and 'lengths'. | ||
Note: This parameter is currently not used. Ideally this will | ||
be used to truncate the regions. | ||
sample_metadata: | ||
A :py:class:`~pandas.DataFrame` containing the sample | ||
metadata for each file in ``files``. Hences the number of rows | ||
in the dataframe must match the number of ``files``. | ||
Alternatively, may provide path to the file containing a | ||
concatenated sample metadata across all BigWig files. In this case, | ||
the first row is expected to contain the column names. | ||
Additionally, the order of rows is expected to be in the same | ||
order as the input list of ``files``. | ||
Defaults to `None`, in which case, we create a simple sample | ||
metadata dataframe containing the list of datasets, aka | ||
each BigWig files. Each dataset is named as ``sample_{i}`` | ||
where `i` refers to the index position of the object in ``files``. | ||
sample_metadata_options: | ||
Optional parameters when generating ``sample_metadata`` store. | ||
matrix_options: | ||
Optional parameters when generating ``matrix`` store. | ||
feature_annotation_options: | ||
Optional parameters when generating ``feature_annotation`` store. | ||
optimize_tiledb: | ||
Whether to run TileDB's vaccum and consolidation (may take long). | ||
num_threads: | ||
Number of threads. | ||
Defaults to 1. | ||
""" | ||
    if not os.path.isdir(output_path):
        raise ValueError("'output_path' must be a directory.")

    ####
    ## Process genome information
    ####
    # if isinstance(genome, str):
    #     chrom_sizes = pd.read_csv(
    #         "https://hgdownload.soe.ucsc.edu/goldenpath/hg38/bigZips/hg38.chrom.sizes",
    #         sep="\t",
    #         header=None,
    #         names=["chrom", "length"],
    #     )
    # elif isinstance(genome, pd.DataFrame):
    #     chrom_sizes = genome
    #     if "chrom" not in chrom_sizes:
    #         raise ValueError("genome does not contain column: 'chrom'.")
    #
    #     if "length" not in chrom_sizes:
    #         raise ValueError("genome does not contain column: 'length'.")
    #
    # else:
    #     raise TypeError(
    #         "'genome' is not an expected type (either 'str' or 'DataFrame')."
    #     )

    ####
    ## Writing the features aka interval regions
    ####
    if isinstance(features, str):
        input_intervals = pd.read_csv(features, header=0)
    elif isinstance(features, pd.DataFrame):
        input_intervals = features.copy()
    else:
        raise TypeError(
            "'features' is not an expected type (either 'str' or 'DataFrame')."
        )

    # Validate the columns regardless of how the intervals were provided.
    required_cols = {"chrom", "start", "end"}
    if not required_cols.issubset(input_intervals.columns):
        missing = required_cols - set(input_intervals.columns)
        raise ValueError(f"'features' is missing required columns: {missing}")

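    # A sketch of what 'input_intervals' now holds, one row per interval
    # (values are illustrative only):
    #
    #     chrom  start   end
    #      chr1      0  1000
    #      chr1   1000  2000
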
    if not feature_annotation_options.skip:
        _col_types = utf.infer_column_types(
            input_intervals, feature_annotation_options.column_types
        )

        _feature_output_uri = (
            f"{output_path}/{feature_annotation_options.tiledb_store_name}"
        )
        utf.create_tiledb_frame_from_dataframe(
            _feature_output_uri, input_intervals, column_types=_col_types
        )

        if optimize_tiledb:
            uta.optimize_tiledb_array(_feature_output_uri)

    ####
    ## Writing the sample metadata file
    ####
    _samples = []
    for idx, _ in enumerate(files):
        _samples.append(f"sample_{idx + 1}")

    if sample_metadata is None:
        warnings.warn(
            "Sample metadata is not provided; each dataset in 'files' is considered a sample.",
            UserWarning,
        )

        sample_metadata = pd.DataFrame({"genarr_sample": _samples})
    elif isinstance(sample_metadata, str):
        sample_metadata = pd.read_csv(sample_metadata, header=0)
        if "genarr_sample" not in sample_metadata.columns:
            sample_metadata["genarr_sample"] = _samples
    elif isinstance(sample_metadata, pd.DataFrame):
        if "genarr_sample" not in sample_metadata.columns:
            sample_metadata["genarr_sample"] = _samples
    else:
        raise TypeError("'sample_metadata' is not an expected type.")

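    # When no metadata is supplied, 'sample_metadata' defaults to a single
    # 'genarr_sample' column, e.g. for two input files (illustrative):
    #
    #     genarr_sample
    #          sample_1
    #          sample_2
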
    if not sample_metadata_options.skip:
        _col_types = utf.infer_column_types(
            sample_metadata, sample_metadata_options.column_types
        )

        _sample_output_uri = (
            f"{output_path}/{sample_metadata_options.tiledb_store_name}"
        )
        utf.create_tiledb_frame_from_dataframe(
            _sample_output_uri, sample_metadata, column_types=_col_types
        )

        if optimize_tiledb:
            uta.optimize_tiledb_array(_sample_output_uri)

    ####
    ## Writing the coverage matrix
    ####
    if not matrix_options.skip:
        _cov_uri = f"{output_path}/{matrix_options.tiledb_store_name}"
        uta.create_tiledb_array(
            _cov_uri,
            matrix_attr_name=matrix_options.matrix_attr_name,
            x_dim_dtype=feature_annotation_options.dtype,
            y_dim_dtype=sample_metadata_options.dtype,
            matrix_dim_dtype=matrix_options.dtype,
            x_dim_length=len(input_intervals),
            y_dim_length=len(files),
            is_sparse=False,
        )
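
        # The dense array is laid out as intervals x samples: the x dimension
        # has len(input_intervals) entries and the y dimension one entry per
        # input file. Each worker below writes only its own sample's slice
        # (via 'y_idx'), so the parallel writes target disjoint regions.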

        all_bws_options = [
            (
                _cov_uri,
                input_intervals,
                bwpath,
                idx,
                feature_annotation_options.aggregate_function,
            )
            for idx, bwpath in enumerate(files)
        ]
        with Pool(num_threads) as p:
            p.map(_wrapper_extract_bwinfo, all_bws_options)

        if optimize_tiledb:
            uta.optimize_tiledb_array(_cov_uri)

    # return GenomicArrayDataset(
    #     dataset_path=output_path,
    #     sample_metadata_uri=sample_metadata_options.tiledb_store_name,
    #     cell_metadata_uri=cell_metadata_options.tiledb_store_name,
    #     gene_annotation_uri=gene_annotation_options.tiledb_store_name,
    #     matrix_tdb_uri=matrix_options.tiledb_store_name,
    # )

def _write_intervals_to_tiledb(outpath, intervals, bwpath, bwidx, agg_func):
    """Wrapper to extract the data for the given intervals from the
    BigWig file and write the output to the TileDB file."""
    data = ubw.extract_bw_intervals_as_vec(bwpath, intervals, agg_func)

    if data is not None and len(data) > 0:
        uta.write_frame_intervals_to_tiledb(outpath, data=data, y_idx=bwidx)

def _wrapper_extract_bwinfo(args):
    """Wrapper for multiprocessing multiple files and intervals."""
    counts_uri, input_intervals, bwpath, idx, agg_func = args
    return _write_intervals_to_tiledb(
        counts_uri, input_intervals, bwpath, idx, agg_func
    )
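
# Note: 'build_genomicarray' uses 'multiprocessing.Pool' when 'num_threads' > 1;
# on platforms that spawn worker processes (e.g. macOS and Windows), invoke it
# from within an ``if __name__ == "__main__":`` guard.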