From f2296b7cb9e7425ac453c41a6d4028d6848035bc Mon Sep 17 00:00:00 2001 From: js2264 Date: Mon, 21 Oct 2024 18:11:55 +0200 Subject: [PATCH] feat: add pileup() method to MomicsQuery --- src/momics/momics.py | 2 + src/momics/momicsquery.py | 87 ++++++++++++++++++- tests/conftest.py | 13 +++ ...multirangequery.py => test_momicsquery.py} | 45 ++++++++-- 4 files changed, 135 insertions(+), 12 deletions(-) rename tests/{test_multirangequery.py => test_momicsquery.py} (76%) diff --git a/src/momics/momics.py b/src/momics/momics.py index 5e4cf97..e6e18a9 100644 --- a/src/momics/momics.py +++ b/src/momics/momics.py @@ -591,6 +591,8 @@ def bins(self, width, stride, cut_last_bin_out=False) -> pr.PyRanges: """ bins = [] chroms = self.chroms().set_index("chrom")["length"].to_dict() + if chroms == {}: + raise ValueError("Please fill out `chroms` table first.") for chrom, length in chroms.items(): start = 0 diff --git a/src/momics/momicsquery.py b/src/momics/momicsquery.py index 58c6efd..cf8b1d5 100644 --- a/src/momics/momicsquery.py +++ b/src/momics/momicsquery.py @@ -3,7 +3,7 @@ import pickle import time from pathlib import Path -from typing import List, Optional, Dict +from typing import List, Literal, Optional, Dict, Union import numpy as np import pandas as pd @@ -16,6 +16,7 @@ from .logging import logger from .momics import Momics from .utils import parse_ucsc_coordinates +from .utils import dict_to_bigwig class MomicsQuery: @@ -23,7 +24,7 @@ class MomicsQuery: Attributes ---------- - momics (Momics): a local `.momics` repository. + momics (Momics): a local `.momics` repositoryasdcasdc. queries (dict): Dict. of pr.PyRanges object. coverage (dict): Dictionary of coverage scores extracted from the \ `.momics` repository, populated after calling `q.query_tracks()` @@ -35,8 +36,8 @@ def __init__(self, momics: Momics, bed: pr.PyRanges): """Initialize the MomicsQuery object. Args: - momics (Momics): a Momics object - bed (pr.PyRanges): pr.PyRanges object + momics (Momics): a `Momics` object + bed (pr.PyRanges): `pr.PyRanges` object """ if not isinstance(momics, Momics): raise ValueError("momics must be a `Momics` object.") @@ -263,6 +264,84 @@ def query_sequence(self, threads: Optional[int] = None) -> "MomicsQuery": logger.info(f"Query completed in {round(t,4)}s.") return self + # For this function, the `type` argument can be either "mean" or "sum" + def pileup(self, type: Literal["mean", "sum"] = "mean", prefix: Optional[str] = None) -> Union[dict, None]: + """ + Aggregate query coverages into genome-wide dictionary(ies). + If the `coverage` attribute has not been populated yet, the :func:`query_tracks()` method will be called. + The coverage over each range is aggregated across all tracks. In the case of + overlapping ranges, the coverage is averaged. + + Each value of the output dictionary is a dictionary itself, with the keys being the chromosome names + and the values being the coverage score, averaged for overlapping ranges. + + Args: + prefix (str, optional): Prefix to the output `.bw` files to create. + If provided, queried coverage will be saved for each track in a file + named `_.bw`. + + Returns: + A dictionary of genome-wide coverage scores, for each track. If + the queried ranges overlap, the coverage is averaged. + Note that if the output argument is provided, the results will be + saved to a `.bw` file and this function will return the Path to the file. + + See Also: + :func:`MomicsQuery.query_tracks()` + + Examples: + >>> mom = momics.momics.Momics('path/to/momics') + >>> windows = pr.PyRanges( + ... chromosomes = ["I", "I", "I", "I"], + ... starts = [0, 5, 10, 20], + ... ends = [30, 30, 30, 30], + ... ) + >>> q = MomicsQuery(mom, windows) + >>> q.query_tracks() + >>> q.pileup() + """ + cov = self.coverage + if cov is None: + self.query_tracks() + cov = self.coverage + + attrs = cov.keys() + ranges = self.ranges + chroms = self.momics.chroms() + chrom_sizes = {chrom: size for chrom, size in zip(chroms["chrom"], chroms["length"])} + tracks = {attr: dict() for attr in attrs} + + for attr in iter(attrs): + attr_cov = cov[attr] + track = {chrom: np.zeros(size) for chrom, size in chrom_sizes.items()} + overlap_count = {chrom: np.zeros(size) for chrom, size in chrom_sizes.items()} + + for (_, row), (_, row_cov) in zip(ranges.df.iterrows(), attr_cov.items()): + chrom = row["Chromosome"] + start = row["Start"] + end = row["End"] + track[chrom][start:end] += row_cov + overlap_count[chrom][start:end] += 1 + + if type == "mean": + for chrom in track: + non_zero_mask = overlap_count[chrom] > 0 + track[chrom][non_zero_mask] /= overlap_count[chrom][non_zero_mask] + + tracks[attr] = track + + if prefix is not None: + bw_paths = [] + for attr in attrs: + f = Path(f"{prefix}_{attr}.bw") + p = dict_to_bigwig(tracks[attr], f) + logger.info(f"Saved coverage for {attr} to {p.name}") + bw_paths.append(p) + return bw_paths + + else: + return tracks + def to_df(self) -> pd.DataFrame: """Parse self.coverage attribute to a pd.DataFrame diff --git a/tests/conftest.py b/tests/conftest.py index 2096c00..0f03810 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ import os import random +from typing import Final import numpy as np import pyBigWig @@ -8,6 +9,18 @@ from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord +NO_SKIP_OPTION: Final[str] = "--no-skip" + + +def pytest_addoption(parser): + parser.addoption(NO_SKIP_OPTION, action="store_true", default=False, help="also run skipped tests") + + +def pytest_collection_modifyitems(config, items: list): + if config.getoption(NO_SKIP_OPTION): + for test in items: + test.own_markers = [marker for marker in test.own_markers if marker.name not in ("skip", "skipif")] + @pytest.fixture(scope="session") def momics_path(tmp_path_factory): diff --git a/tests/test_multirangequery.py b/tests/test_momicsquery.py similarity index 76% rename from tests/test_multirangequery.py rename to tests/test_momicsquery.py index 3beacdb..3c4a83e 100644 --- a/tests/test_multirangequery.py +++ b/tests/test_momicsquery.py @@ -1,5 +1,6 @@ import json import multiprocessing +import os import pickle import tempfile from pathlib import Path @@ -9,7 +10,7 @@ import pyranges as pr import pytest -from momics import momics +from momics.momics import Momics from momics.momicsquery import MomicsQuery multiprocessing.set_start_method("spawn", force=True) @@ -17,7 +18,7 @@ @pytest.mark.order(2) def test_momicsquery_tracks(momics_path: str, bed1: str): - mom = momics.Momics(momics_path) + mom = Momics(momics_path) q = MomicsQuery(mom, "I:990-1010").query_tracks() assert len(q.coverage) == 4 assert len(q.coverage["bw2"]["I:990-1010"]) == 20 @@ -45,7 +46,7 @@ def test_momicsquery_tracks(momics_path: str, bed1: str): @pytest.mark.order(2) def test_momicsquery_seq(momics_path: str): - mom = momics.Momics(momics_path) + mom = Momics(momics_path) q = MomicsQuery(mom, "I:0-10").query_sequence() assert len(q.seq) == 1 assert q.seq["nucleotide"]["I:0-10"] == "ATCGATCGAT" @@ -57,7 +58,7 @@ def test_momicsquery_seq(momics_path: str): @pytest.mark.order(2) def test_momicsquery_seq2(momics_path: str, bed1: str): - mom = momics.Momics(momics_path) + mom = Momics(momics_path) q = MomicsQuery(mom, "I:0-10").query_sequence() bed = pr.read_bed(bed1) q = MomicsQuery(mom, bed).query_sequence() @@ -69,7 +70,7 @@ def test_momicsquery_seq2(momics_path: str, bed1: str): @pytest.mark.order(2) def test_momicsquery_seq3(momics_path: str): - mom = momics.Momics(momics_path) + mom = Momics(momics_path) q = MomicsQuery(mom, "I:0-10").query_sequence() q = MomicsQuery(mom, "I").query_sequence() assert len(q.seq["nucleotide"]["I:0-10000"]) == 10000 @@ -77,7 +78,7 @@ def test_momicsquery_seq3(momics_path: str): @pytest.mark.order(2) def test_momicsquery_seq4(momics_path: str, bed1: str): - mom = momics.Momics(momics_path) + mom = Momics(momics_path) q = MomicsQuery(mom, "I:0-10").query_sequence() bed = pr.read_bed(bed1) q = MomicsQuery(mom, bed).query_sequence() @@ -86,7 +87,7 @@ def test_momicsquery_seq4(momics_path: str, bed1: str): @pytest.mark.order(2) def test_momicsquery_seq5(momics_path: str, bed1: str): - mom = momics.Momics(momics_path) + mom = Momics(momics_path) bed = pr.read_bed(bed1) print(bed) q = MomicsQuery(mom, "I:0-10").query_sequence() @@ -115,7 +116,7 @@ def temp_npz_file(): @pytest.mark.order(2) def test_to_json_npz(momics_path: str, temp_json_file: Path, temp_npz_file: Path): - mom = momics.Momics(momics_path) + mom = Momics(momics_path) q = MomicsQuery(mom, "I:0-10").query_sequence().query_tracks() q.to_json(temp_json_file) @@ -133,3 +134,31 @@ def test_to_json_npz(momics_path: str, temp_json_file: Path, temp_npz_file: Path assert seq["nucleotide"]["I:0-10"] == "ATCGATCGAT" assert list(cov.keys()) == ["bw2", "custom", "bw3", "bw4", "nucleotide"] assert list(cov["bw2"].keys()) == ["I:0-10"] + + +@pytest.mark.order(2) +def test_pileup(momics_path: str): + mom = Momics(momics_path) + b = mom.bins(3, 1)["I", 6990:7010] + q = MomicsQuery(mom, b).query_tracks() + x = q.pileup() + print(x.keys()) + assert list(x.keys()) == ["bw2", "custom", "bw3", "bw4"] + assert [len(y) for y in x["bw2"].values()] == [10000, 20000, 30000] + assert x["bw2"]["I"].shape == (10000,) + + b = mom.bins(3, 1)["I", 6990:7010] + x = q.pileup(type="mean") + assert x["bw2"]["I"].shape == (10000,) + vec = x["bw2"]["I"][6985:7015] - np.array([0] * 3 + [0.06] * 12 + [0.07] * 12 + [0] * 3) < 1e-6 + assert sum(vec) == 30 + x = q.pileup(type="sum") + assert x["bw2"]["I"].shape == (10000,) + res = np.array([0] * 3 + [0.06, 0.12] + [0.18] * 10 + [0.21] * 10 + [0.14, 0.07] + [0] * 3) + vec = x["bw2"]["I"][6985:7015] - res < 1e-6 + assert sum(vec) == 30 + + x = q.pileup(prefix="tmp__") + print(x) + assert len(x) == 4 + assert [os.path.exists(idx) for idx in iter(x)].count(True) == 4