From b1420e02748da60d83bfabe1529223f2ac9c2972 Mon Sep 17 00:00:00 2001 From: lukasmahieu Date: Sat, 15 Jun 2024 20:57:11 +0200 Subject: [PATCH] bigwigs importing --- pyproject.toml | 3 ++- src/crested/__init__.py | 4 ++-- src/crested/_io.py | 39 ++++++++++++++++++++++----------------- 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 61c89de2..4178cda1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,8 @@ dependencies = [ "scikit-learn", "tqdm", "loguru", - "logomaker" + "logomaker", + "pybigtools", ] [project.optional-dependencies] diff --git a/src/crested/__init__.py b/src/crested/__init__.py index d5ad7b7d..b983fce9 100644 --- a/src/crested/__init__.py +++ b/src/crested/__init__.py @@ -3,10 +3,10 @@ from importlib.metadata import version from . import pl, pp, tl -from ._io import import_peaks, import_topics +from ._io import import_bigwigs, import_topics from ._logging import setup_logging -__all__ = ["pl", "pp", "tl", "import_topics", "import_peaks", "setup_logging"] +__all__ = ["pl", "pp", "tl", "import_topics", "import_bigwigs", "setup_logging"] __version__ = version("crested") diff --git a/src/crested/_io.py b/src/crested/_io.py index 8b7045f0..9ce7f6bd 100644 --- a/src/crested/_io.py +++ b/src/crested/_io.py @@ -1,3 +1,5 @@ +"""I/O functions for importing topics and bigWigs into AnnData objects.""" + from __future__ import annotations import os @@ -76,7 +78,7 @@ def _extract_values_from_bigwig(bw_file, bed_file, target, target_region_width): def _read_consensus_regions( - regions_file: PathLike, chromsizes_file: PathLike + regions_file: PathLike, chromsizes_file: PathLike | None = None ) -> pd.DataFrame: """Read consensus regions BED file and filter out regions not within chromosomes.""" consensus_peaks = pd.read_csv( @@ -90,26 +92,29 @@ def _read_consensus_regions( + consensus_peaks[2].astype(str) ) - chromsizes_dict = _read_chromsizes(chromsizes_file) - valid_mask = consensus_peaks.apply( - lambda row: row[0] in chromsizes_dict - and row[1] >= 0 - and row[2] <= chromsizes_dict[row[0]], - axis=1, - ) - consensus_peaks_filtered = consensus_peaks[valid_mask] - - if len(consensus_peaks) != len(consensus_peaks_filtered): - logger.warning( - f"Filtered {len(consensus_peaks) - len(consensus_peaks_filtered)} consensus regions (not within chromosomes)", + if chromsizes_file: + chromsizes_dict = _read_chromsizes(chromsizes_file) + valid_mask = consensus_peaks.apply( + lambda row: row[0] in chromsizes_dict + and row[1] >= 0 + and row[2] <= chromsizes_dict[row[0]], + axis=1, ) - return consensus_peaks_filtered + consensus_peaks_filtered = consensus_peaks[valid_mask] + + if len(consensus_peaks) != len(consensus_peaks_filtered): + logger.warning( + f"Filtered {len(consensus_peaks) - len(consensus_peaks_filtered)} consensus regions (not within chromosomes)", + ) + return consensus_peaks_filtered + + return consensus_peaks def _create_temp_bed_file( consensus_peaks: pd.DataFrame, target_region_width: int ) -> str: - # Adjust regions based on target_region_width + """Adjust consensus regions to a target width and create a temporary BED file.""" adjusted_peaks = consensus_peaks.copy() adjusted_peaks[1] = adjusted_peaks.apply( lambda row: max(0, row[1] - (target_region_width - (row[2] - row[1])) // 2), @@ -265,10 +270,10 @@ def import_topics( def import_bigwigs( bigwigs_folder: PathLike, regions_file: PathLike, - chromsizes_file: PathLike, + chromsizes_file: PathLike | None = None, target: str = "mean", target_region_width: int | None = None, - compress: bool = True, + compress: bool = False, ) -> AnnData: """ Import bigWig files and consensus regions BED file into AnnData format.