Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bp level tiledb construction #1

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/genomicarrays/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@
__version__ = "unknown"
finally:
del version, PackageNotFoundError

from .build_genomicarray import build_genomicarray
227 changes: 227 additions & 0 deletions src/genomicarrays/build_genomicarray.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
"""Build the `GenomicArrayDatset`.

The `GenomicArrayDatset` method is designed to store genomic range-based
datasets from BigWig, BigBed or other similar files.

Example:

.. code-block:: python

import pyBigWig as bw
import numpy as np
import tempfile
from genomicarrays import build_genomicarray, MatrixOptions

# Create a temporary directory
tempdir = tempfile.mkdtemp()

# Read BigWig objects
bw1 = bw.open("path/to/object1.bw", "r")
# or just provide the path
bw2 = "path/to/object2.bw"

# Build GenomicArray
dataset = build_genomicarray(
output_path=tempdir,
files=[bw1, bw2],
matrix_options=MatrixOptions(dtype=np.float32),
)
"""

import itertools
import os
import warnings
from multiprocessing import Pool
from typing import Dict, Union

import numpy as np
import pandas as pd
import pyBigWig as bw
import tiledb
from cellarr import buildutils_tiledb_frame as utf

from . import build_options as bopt
from . import buildutils_tiledb_array as uta
from . import utils_bw as ubw

# from .GenomicArrayDataset import GenomicArrayDataset

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"


# TODO: Accept files as a dictionary with names to each dataset.
# TODO: options to only extract specific regions.
def build_genomicarray(
    files: list,
    output_path: str,
    filter_to_regions: Union[str, pd.DataFrame],
    genome: Union[str, pd.DataFrame] = "hg38",
    sample_metadata: Union[pd.DataFrame, str] = None,
    sample_metadata_options: bopt.SampleMetadataOptions = bopt.SampleMetadataOptions(),
    matrix_options: bopt.MatrixOptions = bopt.MatrixOptions(),
    optimize_tiledb: bool = True,
    num_threads: int = 1,
):
    """Generate the `GenomicArrayDataset`.

    All files are expected to be consistent and any modifications
    to make them consistent is outside the scope of this function
    and package.

    Args:
        files:
            List of file paths to `BigWig` files.

        output_path:
            Path to where the output TileDB files should be stored.
            Must be an existing directory.

        filter_to_regions:
            Regions to restrict extraction to.

            NOTE(review): currently unused; see the module-level TODO
            about extracting specific regions.

        genome:
            UCSC assembly name (e.g. ``"hg38"``), in which case the
            chromosome sizes are downloaded from the UCSC goldenPath
            server, or a :py:class:`~pandas.DataFrame` containing
            ``chrom`` and ``length`` columns.

            Defaults to ``"hg38"``.

        sample_metadata:
            A :py:class:`~pandas.DataFrame` containing the sample
            metadata for each file in ``files``. Hence the number of rows
            in the dataframe must match the number of ``files``.

            Alternatively, may provide path to the file containing a
            concatenated sample metadata across all BigWig files. In this case,
            the first row is expected to contain the column names.

            Additionally, the order of rows is expected to be in the same
            order as the input list of ``files``.

            Defaults to `None`, in which case, we create a simple sample
            metadata dataframe containing the list of datasets, aka
            each BigWig files. Each dataset is named as ``sample_{i}``
            where `i` refers to the index position of the object in ``files``.

        sample_metadata_options:
            Optional parameters when generating ``sample_metadata`` store.

        matrix_options:
            Optional parameters when generating ``matrix`` store.

        optimize_tiledb:
            Whether to run TileDB's vaccum and consolidation (may take long).

        num_threads:
            Number of threads.
            Defaults to 1.

    Raises:
        ValueError: If ``output_path`` is not an existing directory, or a
            ``genome`` dataframe lacks required columns.
        TypeError: If ``genome`` or ``sample_metadata`` has an unexpected type.
    """
    if not os.path.isdir(output_path):
        raise ValueError("'output_path' must be a directory.")

    ####
    ## Resolving chromosome sizes for the genome
    ####
    if isinstance(genome, str):
        # Build the goldenPath URL from the assembly name instead of
        # hard-coding hg38, so any UCSC genome identifier works as the
        # parameter documentation implies.
        chrom_sizes = pd.read_csv(
            f"https://hgdownload.soe.ucsc.edu/goldenpath/{genome}/bigZips/{genome}.chrom.sizes",
            sep="\t",
            header=None,
            names=["chrom", "length"],
        )
    elif isinstance(genome, pd.DataFrame):
        chrom_sizes = genome
        if "chrom" not in chrom_sizes:
            raise ValueError("genome does not contain column: 'chrom'.")

        if "length" not in chrom_sizes:
            raise ValueError("genome does not contain column: 'length'.")

    else:
        raise TypeError(
            "'genome' is not an expected type (either 'str' or 'Dataframe')."
        )

    ####
    ## Writing the sample metadata file
    ####
    _samples = [f"sample_{idx + 1}" for idx in range(len(files))]

    if sample_metadata is None:
        warnings.warn(
            "Sample metadata is not provided, each dataset in 'files' is considered a sample",
            UserWarning,
        )

        sample_metadata = pd.DataFrame({"genarr_sample": _samples})
    elif isinstance(sample_metadata, str):
        sample_metadata = pd.read_csv(sample_metadata, header=0)
        if "genarr_sample" not in sample_metadata.columns:
            sample_metadata["genarr_sample"] = _samples
    elif isinstance(sample_metadata, pd.DataFrame):
        if "genarr_sample" not in sample_metadata.columns:
            sample_metadata["genarr_sample"] = _samples
    else:
        raise TypeError("'sample_metadata' is not an expected type.")

    if not sample_metadata_options.skip:
        _col_types = utf.infer_column_types(
            sample_metadata, sample_metadata_options.column_types
        )

        _sample_output_uri = (
            f"{output_path}/{sample_metadata_options.tiledb_store_name}"
        )
        utf.create_tiledb_frame_from_dataframe(
            _sample_output_uri, sample_metadata, column_types=_col_types
        )

        if optimize_tiledb:
            uta.optimize_tiledb_array(_sample_output_uri)

    ####
    ## Writing the genomic ranges file
    ####
    if not matrix_options.skip:
        _chrm_group_base = f"{output_path}/ranges"
        tiledb.group_create(_chrm_group_base)

        # One 2-D TileDB array per chromosome: base positions x samples.
        for _, seq in chrom_sizes.iterrows():
            uta.create_tiledb_array_chrm(
                f"{_chrm_group_base}/{seq['chrom']}",
                x_dim_length=seq["length"],
                y_dim_length=len(files),
                matrix_attr_name=matrix_options.matrix_attr_name,
                matrix_dim_dtype=matrix_options.dtype,
            )

        # Every (file, chromosome) pair is an independent unit of work
        # for the worker pool.
        all_bws = [(_chrm_group_base, bwpath, idx) for idx, bwpath in enumerate(files)]
        input_options = list(itertools.product(all_bws, chrom_sizes["chrom"]))

        _write_bws_to_tiledb(input_options, num_threads=num_threads)

    # TODO: optimize the per-chromosome arrays and return a
    # `GenomicArrayDataset` handle once that class is available.


def _range_writer(outpath, bwpath, bwidx, chrm):
    """Extract intervals for one chromosome of one BigWig file and
    write them into the corresponding per-chromosome TileDB array.

    Args:
        outpath:
            Base URI of the per-chromosome TileDB group.

        bwpath:
            Path to the BigWig file to read.

        bwidx:
            Row (sample) index of this file in the output arrays.

        chrm:
            Chromosome name to extract.
    """
    # Removed leftover debug prints; they flooded stdout from every
    # worker process.
    data = ubw.extract_bw_intervals(bwpath, chrm)

    # `extract_bw_intervals` may return None (e.g. chromosome absent
    # from the file — TODO confirm); skip the write in that case.
    if data is not None:
        uta.write_frame_intervals_to_tiledb(f"{outpath}/{chrm}", data=data, y_idx=bwidx)


def _wrapper_extract_info(args):
    """Adapter for :func:`multiprocessing.Pool.map`, which passes a
    single argument: unpack ``((base_uri, bw_path, bw_index), chrom)``
    and delegate to :func:`_range_writer`.
    """
    (base_uri, bw_path, bw_index), chrom = args
    return _range_writer(base_uri, bw_path, bw_index, chrom)


def _write_bws_to_tiledb(
    input_options: list,
    num_threads: int = 1,
):
    """Fan the (BigWig, chromosome) work items out across a process pool.

    Args:
        input_options:
            List of ``((base_uri, bw_path, bw_index), chrom)`` tuples,
            one per unit of work.

        num_threads:
            Number of worker processes.
            Defaults to 1.

    Returns:
        List of per-item results from :func:`_wrapper_extract_info`.
    """
    with Pool(num_threads) as workers:
        results = workers.map(_wrapper_extract_info, input_options)
    return results
70 changes: 70 additions & 0 deletions src/genomicarrays/build_options.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from dataclasses import dataclass
from typing import Dict, Optional

import numpy as np

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"


@dataclass
class MatrixOptions:
    """Optional arguments controlling the ``matrix`` store built by
    :py:func:`~genomicarrays.build_genomicarray.build_genomicarray`.

    Attributes:
        skip:
            Skip generating the matrix TileDB entirely.
            Defaults to False.

        matrix_attr_name:
            Attribute name under which the matrix is stored in the
            TileDB file.
            Defaults to "data".

        dtype:
            NumPy dtype of the stored matrix values.
            Defaults to np.uint16.

            Note: make sure the matrix values fit
            within the range limits of chosen-dtype.
    """

    skip: bool = False
    matrix_attr_name: str = "data"
    dtype: np.dtype = np.uint16


@dataclass
class SampleMetadataOptions:
    """Optional arguments for the ``sample`` store for
    :py:func:`~genomicarrays.build_genomicarray.build_genomicarray`.

    Attributes:
        skip:
            Whether to skip generating sample TileDB.
            Defaults to False.

        dtype:
            NumPy dtype for the sample dimension.
            Defaults to np.uint32.

            Note: make sure the number of samples fit
            within the integer limits of chosen dtype.

        tiledb_store_name:
            Name of the TileDB file.
            Defaults to "sample_metadata".

        column_types:
            A dictionary containing column names as keys
            and the value representing the type to use in
            the TileDB.

            If `None`, all columns are cast as 'ascii'.
    """

    skip: bool = False
    dtype: np.dtype = np.uint32
    tiledb_store_name: str = "sample_metadata"
    # `Optional` makes the annotation honest about the `None` default
    # (PEP 484 disallows implicit Optional).
    column_types: Optional[Dict[str, np.dtype]] = None
Loading
Loading