From e121820e849cdbbc8dc79272d56461a329209542 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Tue, 29 Oct 2024 21:45:10 -0700 Subject: [PATCH 1/3] EOD --- src/genomicarrays/build_genomicarray.py | 207 +++++++++++++++++++ src/genomicarrays/build_options.py | 70 +++++++ src/genomicarrays/buildutils_tiledb_array.py | 138 +++++++++++++ src/genomicarrays/utils_bw.py | 21 ++ 4 files changed, 436 insertions(+) create mode 100644 src/genomicarrays/build_genomicarray.py create mode 100644 src/genomicarrays/build_options.py create mode 100644 src/genomicarrays/buildutils_tiledb_array.py create mode 100644 src/genomicarrays/utils_bw.py diff --git a/src/genomicarrays/build_genomicarray.py b/src/genomicarrays/build_genomicarray.py new file mode 100644 index 0000000..c996ee9 --- /dev/null +++ b/src/genomicarrays/build_genomicarray.py @@ -0,0 +1,207 @@ +"""Build the `GenomicArrayDatset`. + +The `GenomicArrayDatset` method is designed to store genomic range-based +datasets from BigWig, BigBed or other similar files. + +Example: + + .. code-block:: python + + import pyBigWig as bw + import numpy as np + import tempfile + from genomicarrays import build_genomicarray, MatrixOptions + + # Create a temporary directory + tempdir = tempfile.mkdtemp() + + # Read BigWig objects + bw1 = bw.open("path/to/object1.bw", "r") + # or just provide the path + bw2 = "path/to/object2.bw" + + # Build GenomicArray + dataset = build_genomicarray( + output_path=tempdir, + files=[bw1, bw2], + matrix_options=MatrixOptions(dtype=np.float32), + ) +""" + +import os +import warnings +from typing import Dict, Union + +import pyBigWig as bw +import numpy as np +import pandas as pd +import tiledb +from multiprocessing import Pool + +from . import build_options as bopt +from . import buildutils_tiledb_array as uta +from cellarr import buildutils_tiledb_frame as utf +from . import utils_bw as ubw +# from .GenomicArrayDataset import GenomicArrayDataset + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +# TODO: Accept files as a dictionary with names to each dataset. +# TODO: options to only extract specific regions. +def build_genomicarray( + files: list, + output_path: str, + genome: Union[str, pd.DataFrame] = "hg38", + # filter_to_regions: Union[str, pd.DataFrame], + sample_metadata: Union[pd.DataFrame, str] = None, + sample_metadata_options: bopt.SampleMetadataOptions = bopt.SampleMetadataOptions(), + matrix_options: bopt.MatrixOptions = bopt.MatrixOptions(), + optimize_tiledb: bool = True, + num_threads: int = 1, +): + """Generate the `GenomicArrayDatset`. + + All files are expected to be consistent and any modifications + to make them consistent is outside the scope of this function + and package. + + Args: + files: + List of file paths to `BigWig` files. + + output_path: + Path to where the output TileDB files should be stored. + + sample_metadata: + A :py:class:`~pandas.DataFrame` containing the sample + metadata for each file in ``files``. Hences the number of rows + in the dataframe must match the number of ``files``. + + Alternatively, may provide path to the file containing a + concatenated sample metadata across all BigWig files. In this case, + the first row is expected to contain the column names. + + Additionally, the order of rows is expected to be in the same + order as the input list of ``files``. + + Defaults to `None`, in which case, we create a simple sample + metadata dataframe containing the list of datasets, aka + each BigWig files. Each dataset is named as ``sample_{i}`` + where `i` refers to the index position of the object in ``files``. + + sample_metadata_options: + Optional parameters when generating ``sample_metadata`` store. + + matrix_options: + Optional parameters when generating ``matrix`` store. + + optimize_tiledb: + Whether to run TileDB's vaccum and consolidation (may take long). + + num_threads: + Number of threads. + Defaults to 1. + """ + if not os.path.isdir(output_path): + raise ValueError("'output_path' must be a directory.") + + #### + ## Writing the sample metadata file + #### + + if isinstance(genome, str): + chrom_sizes = pd.read_csv("https://hgdownload.soe.ucsc.edu/goldenpath/hg38/bigZips/hg38.chrom.sizes", sep="\t", header=None, names=["chrom", "length"]) + elif isinstance(genome, pd.DataFrame): + chrom_sizes = genome + else: + raise TypeError("'genome' is not an expected type (either 'str' or 'Dataframe').") + + #### + ## Writing the sample metadata file + #### + _samples = [] + for idx, _ in enumerate(files): + _samples.append(f"sample_{idx + 1}") + + if sample_metadata is None: + warnings.warn( + "Sample metadata is not provided, each dataset in 'files' is considered a sample", + UserWarning, + ) + + sample_metadata = pd.DataFrame({"genarr_sample": _samples}) + elif isinstance(sample_metadata, str): + sample_metadata = pd.read_csv(sample_metadata, header=0) + if "genarr_sample" not in sample_metadata.columns: + sample_metadata["genarr_sample"] = _samples + elif isinstance(sample_metadata, pd.DataFrame): + if "genarr_sample" not in sample_metadata.columns: + sample_metadata["genarr_sample"] = _samples + else: + raise TypeError("'sample_metadata' is not an expected type.") + + if not sample_metadata_options.skip: + _col_types = utf.infer_column_types( + sample_metadata, sample_metadata_options.column_types + ) + + _sample_output_uri = ( + f"{output_path}/{sample_metadata_options.tiledb_store_name}" + ) + utf.create_tiledb_frame_from_dataframe( + _sample_output_uri, sample_metadata, column_types=_col_types + ) + + if optimize_tiledb: + uta.optimize_tiledb_array(_sample_output_uri) + + #### + ## Writing the genomic ranges file + #### + if not matrix_options.skip: + tiledb.group_create(f"{output_path}/ranges") + + _chrm_group_base = f"{output_path}/ranges" + for idx, seq in chrom_sizes.iterrows(): + uta.create_tiledb_array( + f"{_chrm_group_base}/{seq['chrom']}", + x_dim_length = seq['length'], + y_dim_length = len(files), + matrix_attr_name = matrix_options.matrix_attr_name, + matrix_dim_dtype = matrix_options.dtype, + ) + + if optimize_tiledb: + uta.optimize_tiledb_array(_counts_uri) + + # return GenomicArrayDataset( + # dataset_path=output_path, + # sample_metadata_uri=sample_metadata_options.tiledb_store_name, + # cell_metadata_uri=cell_metadata_options.tiledb_store_name, + # gene_annotation_uri=gene_annotation_options.tiledb_store_name, + # matrix_tdb_uri=matrix_options.tiledb_store_name, + # ) + +def _range_writer(outpath, bwpath, bwidx, chrm): + + +def _wrapper_extract_info(args): + outpath, bwpath, bwidx, chrm = args + return _range_writer(outpath, bwpath, bwidx, chrm) + + +def extract_anndata_info( + h5ad_or_adata: List[Union[str, anndata.AnnData]], + var_feature_column: str = "index", + obs_subset_columns: dict = None, + num_threads: int = 1, +): + with Pool(num_threads) as p: + _args = [ + (file_info, var_feature_column, obs_subset_columns) + for file_info in h5ad_or_adata + ] + return p.map(_wrapper_extract_info, _args) \ No newline at end of file diff --git a/src/genomicarrays/build_options.py b/src/genomicarrays/build_options.py new file mode 100644 index 0000000..6745a52 --- /dev/null +++ b/src/genomicarrays/build_options.py @@ -0,0 +1,70 @@ +from dataclasses import dataclass +from typing import Dict + +import numpy as np + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +@dataclass +class MatrixOptions: + """Optional arguments for the ``matrix`` store for + :py:func:`~genomicarrays.build_genomicarray.build_genomicarray`. + + Attributes: + matrix_attr_name: + Name of the matrix to be stored in the TileDB file. + Defaults to "data". + + skip: + Whether to skip generating matrix TileDB. + Defaults to False. + + dtype: + NumPy dtype for the values in the matrix. + Defaults to np.uint16. + + Note: make sure the matrix values fit + within the range limits of chosen-dtype. + """ + + skip: bool = False + matrix_attr_name: str = "data" + dtype: np.dtype = np.uint16 + + +@dataclass +class SampleMetadataOptions: + """Optional arguments for the ``sample`` store for + :py:func:`~genomicarrays.build_genomicarray.build_genomicarray`. + + Attributes: + skip: + Whether to skip generating sample TileDB. + Defaults to False. + + dtype: + NumPy dtype for the sample dimension. + Defaults to np.uint32. + + Note: make sure the number of samples fit + within the integer limits of chosen dtype. + + tiledb_store_name: + Name of the TileDB file. + Defaults to "sample_metadata". + + column_types: + A dictionary containing column names as keys + and the value representing the type to in + the TileDB. + + If `None`, all columns are cast as 'ascii'. + """ + + skip: bool = False + dtype: np.dtype = np.uint32 + tiledb_store_name: str = "sample_metadata" + column_types: Dict[str, np.dtype] = None diff --git a/src/genomicarrays/buildutils_tiledb_array.py b/src/genomicarrays/buildutils_tiledb_array.py new file mode 100644 index 0000000..d06a7cc --- /dev/null +++ b/src/genomicarrays/buildutils_tiledb_array.py @@ -0,0 +1,138 @@ +import os +import shutil +from typing import Union + +import numpy as np +import tiledb + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +def create_tiledb_array_chrm( + tiledb_uri_path: str, + x_dim_length: int = None, + y_dim_length: int = None, + x_dim_name: str = "base", + y_dim_name: str = "sample_index", + matrix_attr_name: str = "data", + x_dim_dtype: np.dtype = np.uint32, + y_dim_dtype: np.dtype = np.uint32, + matrix_dim_dtype: np.dtype = np.uint32, + is_sparse: bool = False, +): + """Create a TileDB file with the provided attributes to persistent storage. + + This will materialize the array directory and all + related schema files. + + Args: + tiledb_uri_path: + Path to create the array TileDB file. + + x_dim_length: + Number of entries along the x/fastest-changing dimension. + e.g. Number of cells. + Defaults to None, in which case, the max integer value of + ``x_dim_dtype`` is used. + + y_dim_length: + Number of entries along the y dimension. + e.g. Number of genes. + Defaults to None, in which case, the max integer value of + ``y_dim_dtype`` is used. + + x_dim_name: + Name for the x-dimension. + Defaults to "cell_index". + + y_dim_name: + Name for the y-dimension. + Defaults to "gene_index". + + matrix_attr_name: + Name for the attribute in the array. + Defaults to "data". + + x_dim_dtype: + NumPy dtype for the x-dimension. + Defaults to np.uint32. + + y_dim_dtype: + NumPy dtype for the y-dimension. + Defaults to np.uint32. + + matrix_dim_dtype: + NumPy dtype for the values in the matrix. + Defaults to np.uint32. + + is_sparse: + Whether the matrix is sparse. + Defaults to False. + """ + + if x_dim_length is None: + x_dim_length = np.iinfo(x_dim_dtype).max + + if y_dim_length is None: + y_dim_length = np.iinfo(y_dim_dtype).max + + xdim = tiledb.Dim(name=x_dim_name, domain=(0, x_dim_length - 1), dtype=x_dim_dtype) + ydim = tiledb.Dim(name=y_dim_name, domain=(0, y_dim_length - 1), dtype=y_dim_dtype) + + dom = tiledb.Domain(xdim, ydim) + + # expecting counts + tdb_attr = tiledb.Attr( + name=matrix_attr_name, + dtype=matrix_dim_dtype, + filters=tiledb.FilterList([tiledb.GzipFilter()]), + ) + + schema = tiledb.ArraySchema(domain=dom, sparse=is_sparse, attrs=[tdb_attr]) + + if os.path.exists(tiledb_uri_path): + shutil.rmtree(tiledb_uri_path) + + tiledb.Array.create(tiledb_uri_path, schema) + + tdbfile = tiledb.open(tiledb_uri_path, "w") + tdbfile.close() + + +def write_array_to_tiledb( + tiledb_array_uri: Union[str, tiledb.SparseArray], + data: np.ndarray, + x_idx: np.ndarray, + y_idx: int, + value_dtype: np.dtype = np.uint32, +): + tiledb_fp = tiledb_array_uri + if isinstance(tiledb_array_uri, str): + tiledb_fp = tiledb.open(tiledb_array_uri, "w") + + if not isinstance(data, (np.ndarray)): + raise TypeError("'data' is not an `ndarray`.") + + tiledb_fp[:x_idx, y_idx] = data.astype(value_dtype) + + +def optimize_tiledb_array(tiledb_array_uri: str, verbose: bool = True): + """Consolidate TileDB fragments.""" + if verbose: + print(f"Optimizing {tiledb_array_uri}") + + frags = tiledb.array_fragments(tiledb_array_uri) + if verbose: + print("Fragments before consolidation: {}".format(len(frags))) + + cfg = tiledb.Config() + cfg["sm.consolidation.step_min_frags"] = 1 + cfg["sm.consolidation.step_max_frags"] = 200 + tiledb.consolidate(tiledb_array_uri, config=cfg) + tiledb.vacuum(tiledb_array_uri) + + frags = tiledb.array_fragments(tiledb_array_uri) + if verbose: + print("Fragments after consolidation: {}".format(len(frags))) diff --git a/src/genomicarrays/utils_bw.py b/src/genomicarrays/utils_bw.py new file mode 100644 index 0000000..08ae362 --- /dev/null +++ b/src/genomicarrays/utils_bw.py @@ -0,0 +1,21 @@ +from typing import Tuple + +import numpy as np +import pyBigWig as bw + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +def extract_bw( + bw_path: str, + chrom: str, +) -> Tuple[np.ndarray, int]: + bwfile = bw.open(bw_path) + if chrom not in bwfile.chroms(): + return None, None + + chrom_length = bwfile.chroms(chrom) + data = bwfile.values(chrom, 0, chrom_length) + return data, chrom_length From 6fec68876c60576b8723577d4525073f43105fd0 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Tue, 29 Oct 2024 22:06:38 -0700 Subject: [PATCH 2/3] the rest --- src/genomicarrays/build_genomicarray.py | 71 +++++++++++++------- src/genomicarrays/buildutils_tiledb_array.py | 2 +- 2 files changed, 47 insertions(+), 26 deletions(-) diff --git a/src/genomicarrays/build_genomicarray.py b/src/genomicarrays/build_genomicarray.py index c996ee9..5bd31e8 100644 --- a/src/genomicarrays/build_genomicarray.py +++ b/src/genomicarrays/build_genomicarray.py @@ -1,6 +1,6 @@ """Build the `GenomicArrayDatset`. -The `GenomicArrayDatset` method is designed to store genomic range-based +The `GenomicArrayDatset` method is designed to store genomic range-based datasets from BigWig, BigBed or other similar files. Example: @@ -28,20 +28,22 @@ ) """ +import itertools import os import warnings +from multiprocessing import Pool from typing import Dict, Union -import pyBigWig as bw import numpy as np import pandas as pd +import pyBigWig as bw import tiledb -from multiprocessing import Pool +from cellarr import buildutils_tiledb_frame as utf from . import build_options as bopt from . import buildutils_tiledb_array as uta -from cellarr import buildutils_tiledb_frame as utf from . import utils_bw as ubw + # from .GenomicArrayDataset import GenomicArrayDataset __author__ = "Jayaram Kancherla" @@ -88,7 +90,7 @@ def build_genomicarray( order as the input list of ``files``. Defaults to `None`, in which case, we create a simple sample - metadata dataframe containing the list of datasets, aka + metadata dataframe containing the list of datasets, aka each BigWig files. Each dataset is named as ``sample_{i}`` where `i` refers to the index position of the object in ``files``. @@ -107,17 +109,30 @@ def build_genomicarray( """ if not os.path.isdir(output_path): raise ValueError("'output_path' must be a directory.") - + #### ## Writing the sample metadata file #### if isinstance(genome, str): - chrom_sizes = pd.read_csv("https://hgdownload.soe.ucsc.edu/goldenpath/hg38/bigZips/hg38.chrom.sizes", sep="\t", header=None, names=["chrom", "length"]) + chrom_sizes = pd.read_csv( + "https://hgdownload.soe.ucsc.edu/goldenpath/hg38/bigZips/hg38.chrom.sizes", + sep="\t", + header=None, + names=["chrom", "length"], + ) elif isinstance(genome, pd.DataFrame): chrom_sizes = genome + if "chrom" not in chrom_sizes: + raise ValueError(f"genome does not contain column: 'chrom'.") + + if "length" not in chrom_sizes: + raise ValueError(f"genome does not contain column: 'length'.") + else: - raise TypeError("'genome' is not an expected type (either 'str' or 'Dataframe').") + raise TypeError( + "'genome' is not an expected type (either 'str' or 'Dataframe')." + ) #### ## Writing the sample metadata file @@ -168,14 +183,21 @@ def build_genomicarray( for idx, seq in chrom_sizes.iterrows(): uta.create_tiledb_array( f"{_chrm_group_base}/{seq['chrom']}", - x_dim_length = seq['length'], - y_dim_length = len(files), - matrix_attr_name = matrix_options.matrix_attr_name, - matrix_dim_dtype = matrix_options.dtype, + x_dim_length=seq["length"], + y_dim_length=len(files), + matrix_attr_name=matrix_options.matrix_attr_name, + matrix_dim_dtype=matrix_options.dtype, ) - if optimize_tiledb: - uta.optimize_tiledb_array(_counts_uri) + all_bws = [(_chrm_group_base, bwpath, idx) for idx, bwpath in enumerate(files)] + input_options = list(itertools.product(all_bws, chrom_sizes["chrom"])) + + _write_bws_to_tiledb( + input_options, outpath=output_path, num_threads=num_threads + ) + + # if optimize_tiledb: + # uta.optimize_tiledb_array(_counts_uri) # return GenomicArrayDataset( # dataset_path=output_path, @@ -185,23 +207,22 @@ def build_genomicarray( # matrix_tdb_uri=matrix_options.tiledb_store_name, # ) + def _range_writer(outpath, bwpath, bwidx, chrm): + data, chrom_length = ubw.extract_bw(bwpath, chrm) + + if data is not None: + uta.write_array_to_tiledb(f"{outpath}/{chrm}", data, chrom_length, bwidx) def _wrapper_extract_info(args): - outpath, bwpath, bwidx, chrm = args - return _range_writer(outpath, bwpath, bwidx, chrm) + iopt, chrm = args + return _range_writer(iopt[0], iopt[1], iopt[2], chrm) -def extract_anndata_info( - h5ad_or_adata: List[Union[str, anndata.AnnData]], - var_feature_column: str = "index", - obs_subset_columns: dict = None, +def _write_bws_to_tiledb( + input_options: list, num_threads: int = 1, ): with Pool(num_threads) as p: - _args = [ - (file_info, var_feature_column, obs_subset_columns) - for file_info in h5ad_or_adata - ] - return p.map(_wrapper_extract_info, _args) \ No newline at end of file + return p.map(_wrapper_extract_info, input_options) diff --git a/src/genomicarrays/buildutils_tiledb_array.py b/src/genomicarrays/buildutils_tiledb_array.py index d06a7cc..755c5f7 100644 --- a/src/genomicarrays/buildutils_tiledb_array.py +++ b/src/genomicarrays/buildutils_tiledb_array.py @@ -115,7 +115,7 @@ def write_array_to_tiledb( if not isinstance(data, (np.ndarray)): raise TypeError("'data' is not an `ndarray`.") - tiledb_fp[:x_idx, y_idx] = data.astype(value_dtype) + tiledb_fp[0:x_idx, y_idx] = data.astype(value_dtype) def optimize_tiledb_array(tiledb_array_uri: str, verbose: bool = True): From ee27e45a3742b05ca8f4e2418de2756e5a2e4ecb Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 30 Oct 2024 10:46:55 -0700 Subject: [PATCH 3/3] works for full loading of bigwig files --- src/genomicarrays/__init__.py | 2 + src/genomicarrays/build_genomicarray.py | 15 +++---- src/genomicarrays/buildutils_tiledb_array.py | 45 +++++++++++++++++++- src/genomicarrays/utils_bw.py | 26 +++++++++-- 4 files changed, 75 insertions(+), 13 deletions(-) diff --git a/src/genomicarrays/__init__.py b/src/genomicarrays/__init__.py index f9f3ba5..185b801 100644 --- a/src/genomicarrays/__init__.py +++ b/src/genomicarrays/__init__.py @@ -14,3 +14,5 @@ __version__ = "unknown" finally: del version, PackageNotFoundError + +from .build_genomicarray import build_genomicarray \ No newline at end of file diff --git a/src/genomicarrays/build_genomicarray.py b/src/genomicarrays/build_genomicarray.py index 5bd31e8..a019074 100644 --- a/src/genomicarrays/build_genomicarray.py +++ b/src/genomicarrays/build_genomicarray.py @@ -56,8 +56,8 @@ def build_genomicarray( files: list, output_path: str, + filter_to_regions: Union[str, pd.DataFrame], genome: Union[str, pd.DataFrame] = "hg38", - # filter_to_regions: Union[str, pd.DataFrame], sample_metadata: Union[pd.DataFrame, str] = None, sample_metadata_options: bopt.SampleMetadataOptions = bopt.SampleMetadataOptions(), matrix_options: bopt.MatrixOptions = bopt.MatrixOptions(), @@ -113,7 +113,6 @@ def build_genomicarray( #### ## Writing the sample metadata file #### - if isinstance(genome, str): chrom_sizes = pd.read_csv( "https://hgdownload.soe.ucsc.edu/goldenpath/hg38/bigZips/hg38.chrom.sizes", @@ -181,7 +180,7 @@ def build_genomicarray( _chrm_group_base = f"{output_path}/ranges" for idx, seq in chrom_sizes.iterrows(): - uta.create_tiledb_array( + uta.create_tiledb_array_chrm( f"{_chrm_group_base}/{seq['chrom']}", x_dim_length=seq["length"], y_dim_length=len(files), @@ -192,9 +191,7 @@ def build_genomicarray( all_bws = [(_chrm_group_base, bwpath, idx) for idx, bwpath in enumerate(files)] input_options = list(itertools.product(all_bws, chrom_sizes["chrom"])) - _write_bws_to_tiledb( - input_options, outpath=output_path, num_threads=num_threads - ) + _write_bws_to_tiledb(input_options, num_threads=num_threads) # if optimize_tiledb: # uta.optimize_tiledb_array(_counts_uri) @@ -209,10 +206,12 @@ def build_genomicarray( def _range_writer(outpath, bwpath, bwidx, chrm): - data, chrom_length = ubw.extract_bw(bwpath, chrm) + print("query params: ", outpath, bwpath, bwidx, chrm) + data = ubw.extract_bw_intervals(bwpath, chrm) if data is not None: - uta.write_array_to_tiledb(f"{outpath}/{chrm}", data, chrom_length, bwidx) + print("data is not None:::", type(data)) + uta.write_frame_intervals_to_tiledb(f"{outpath}/{chrm}", data=data, y_idx=bwidx) def _wrapper_extract_info(args): diff --git a/src/genomicarrays/buildutils_tiledb_array.py b/src/genomicarrays/buildutils_tiledb_array.py index 755c5f7..f1dfd33 100644 --- a/src/genomicarrays/buildutils_tiledb_array.py +++ b/src/genomicarrays/buildutils_tiledb_array.py @@ -3,6 +3,7 @@ from typing import Union import numpy as np +import pandas as pd import tiledb __author__ = "Jayaram Kancherla" @@ -20,7 +21,7 @@ def create_tiledb_array_chrm( x_dim_dtype: np.dtype = np.uint32, y_dim_dtype: np.dtype = np.uint32, matrix_dim_dtype: np.dtype = np.uint32, - is_sparse: bool = False, + is_sparse: bool = True, ): """Create a TileDB file with the provided attributes to persistent storage. @@ -69,7 +70,7 @@ def create_tiledb_array_chrm( is_sparse: Whether the matrix is sparse. - Defaults to False. + Defaults to True. """ if x_dim_length is None: @@ -101,6 +102,45 @@ def create_tiledb_array_chrm( tdbfile.close() +def write_frame_intervals_to_tiledb( + tiledb_array_uri: Union[str, tiledb.SparseArray], + data: pd.DataFrame, + y_idx: int, + value_dtype: np.dtype = np.uint32, +): + """Append and save intervals to TileDB. + + Args: + tiledb_array_uri: + TileDB array object or path to a TileDB object. + + data: + Input dataframe to write to TileDB, must contain + columns, "start", "end" and "value". + + value_dtype: + NumPy dtype to reformat the matrix values. + Defaults to ``uint32``. + """ + tiledb_fp = tiledb_array_uri + if isinstance(tiledb_array_uri, str): + tiledb_fp = tiledb.open(tiledb_array_uri, "w") + + if not isinstance(data, (pd.DataFrame)): + raise TypeError("Intervals not provided as pandas DataFrame.") + + if data is None or len(data) == 0: + return + + for _, row in data.iterrows(): + _len = row["end"] - row["start"] + tiledb_fp[np.arange(row["start"], row["end"]), np.repeat(y_idx, _len)] = ( + np.repeat(row["value"], _len) + ) + + tiledb_fp.close() + + def write_array_to_tiledb( tiledb_array_uri: Union[str, tiledb.SparseArray], data: np.ndarray, @@ -116,6 +156,7 @@ def write_array_to_tiledb( raise TypeError("'data' is not an `ndarray`.") tiledb_fp[0:x_idx, y_idx] = data.astype(value_dtype) + tiledb_fp.close() def optimize_tiledb_array(tiledb_array_uri: str, verbose: bool = True): diff --git a/src/genomicarrays/utils_bw.py b/src/genomicarrays/utils_bw.py index 08ae362..29ae58c 100644 --- a/src/genomicarrays/utils_bw.py +++ b/src/genomicarrays/utils_bw.py @@ -1,6 +1,7 @@ -from typing import Tuple +from typing import Optional, Tuple import numpy as np +import pandas as pd import pyBigWig as bw __author__ = "Jayaram Kancherla" @@ -8,7 +9,7 @@ __license__ = "MIT" -def extract_bw( +def extract_bw_values( bw_path: str, chrom: str, ) -> Tuple[np.ndarray, int]: @@ -18,4 +19,23 @@ def extract_bw( chrom_length = bwfile.chroms(chrom) data = bwfile.values(chrom, 0, chrom_length) - return data, chrom_length + return np.array(data), chrom_length + + +def extract_bw_intervals( + bw_path: str, + chrom: str, +) -> Optional[pd.DataFrame]: + bwfile = bw.open(bw_path) + if chrom not in bwfile.chroms(): + return None + + data = pd.DataFrame(bwfile.intervals(chrom), columns=["start", "end", "value"]) + data["chrom"] = chrom + data_nnz = data[data["value"] > 0] + + data_nnz["tmp_group"] = (data_nnz["value"] != data_nnz["value"].shift()).cumsum() + result = data_nnz.groupby("tmp_group").agg( + {"start": "first", "end": "last", "value": "first"} + ) + return result