feat: add pileup() method to MomicsQuery

js2264 · Oct 21, 2024 · f2296b7 · f2296b7
1 parent 4579b36
commit f2296b7
Show file tree

Hide file tree

Showing 4 changed files with 135 additions and 12 deletions.
diff --git a/src/momics/momics.py b/src/momics/momics.py
@@ -591,6 +591,8 @@ def bins(self, width, stride, cut_last_bin_out=False) -> pr.PyRanges:
         """
         bins = []
         chroms = self.chroms().set_index("chrom")["length"].to_dict()
+        if chroms == {}:
+            raise ValueError("Please fill out `chroms` table first.")
 
         for chrom, length in chroms.items():
             start = 0

diff --git a/src/momics/momicsquery.py b/src/momics/momicsquery.py
@@ -3,7 +3,7 @@
 import pickle
 import time
 from pathlib import Path
-from typing import List, Optional, Dict
+from typing import List, Literal, Optional, Dict, Union
 
 import numpy as np
 import pandas as pd
@@ -16,14 +16,15 @@
 from .logging import logger
 from .momics import Momics
 from .utils import parse_ucsc_coordinates
+from .utils import dict_to_bigwig
 
 
 class MomicsQuery:
     """A class to query `.momics` repositories.
 
     Attributes
     ----------
-    momics (Momics): a local `.momics` repository.
+    momics (Momics): a local `.momics` repositoryasdcasdc.
     queries (dict): Dict. of pr.PyRanges object.
     coverage (dict): Dictionary of coverage scores extracted from the \
         `.momics` repository, populated after calling `q.query_tracks()`
@@ -35,8 +36,8 @@ def __init__(self, momics: Momics, bed: pr.PyRanges):
         """Initialize the MomicsQuery object.
 
         Args:
-            momics (Momics): a Momics object
-            bed (pr.PyRanges): pr.PyRanges object
+            momics (Momics): a `Momics` object
+            bed (pr.PyRanges): `pr.PyRanges` object
         """
         if not isinstance(momics, Momics):
             raise ValueError("momics must be a `Momics` object.")
@@ -263,6 +264,84 @@ def query_sequence(self, threads: Optional[int] = None) -> "MomicsQuery":
         logger.info(f"Query completed in {round(t,4)}s.")
         return self
 
+    # For this function, the `type` argument can be either "mean" or "sum"
+    def pileup(self, type: Literal["mean", "sum"] = "mean", prefix: Optional[str] = None) -> Union[dict, None]:
+        """
+        Aggregate query coverages into genome-wide dictionary(ies).
+        If the `coverage` attribute has not been populated yet, the :func:`query_tracks()` method will be called.
+        The coverage over each range is aggregated across all tracks. In the case of
+        overlapping ranges, the coverage is averaged.
+
+        Each value of the output dictionary is a dictionary itself, with the keys being the chromosome names
+        and the values being the coverage score, averaged for overlapping ranges.
+
+        Args:
+            prefix (str, optional): Prefix to the output `.bw` files to create.
+                If provided, queried coverage will be saved for each track in a file
+                named `<prefix>_<track_label>.bw`.
+
+        Returns:
+            A dictionary of genome-wide coverage scores, for each track. If
+            the queried ranges overlap, the coverage is averaged.
+            Note that if the output argument is provided, the results will be
+            saved to a `.bw` file and this function will return the Path to the file.
+
+        See Also:
+            :func:`MomicsQuery.query_tracks()`
+
+        Examples:
+            >>> mom = momics.momics.Momics('path/to/momics')
+            >>> windows = pr.PyRanges(
+            ...     chromosomes = ["I", "I", "I", "I"],
+            ...     starts = [0, 5, 10, 20],
+            ...     ends = [30, 30, 30, 30],
+            ... )
+            >>> q = MomicsQuery(mom, windows)
+            >>> q.query_tracks()
+            >>> q.pileup()
+        """
+        cov = self.coverage
+        if cov is None:
+            self.query_tracks()
+            cov = self.coverage
+
+        attrs = cov.keys()
+        ranges = self.ranges
+        chroms = self.momics.chroms()
+        chrom_sizes = {chrom: size for chrom, size in zip(chroms["chrom"], chroms["length"])}
+        tracks = {attr: dict() for attr in attrs}
+
+        for attr in iter(attrs):
+            attr_cov = cov[attr]
+            track = {chrom: np.zeros(size) for chrom, size in chrom_sizes.items()}
+            overlap_count = {chrom: np.zeros(size) for chrom, size in chrom_sizes.items()}
+
+            for (_, row), (_, row_cov) in zip(ranges.df.iterrows(), attr_cov.items()):
+                chrom = row["Chromosome"]
+                start = row["Start"]
+                end = row["End"]
+                track[chrom][start:end] += row_cov
+                overlap_count[chrom][start:end] += 1
+
+            if type == "mean":
+                for chrom in track:
+                    non_zero_mask = overlap_count[chrom] > 0
+                    track[chrom][non_zero_mask] /= overlap_count[chrom][non_zero_mask]
+
+            tracks[attr] = track
+
+        if prefix is not None:
+            bw_paths = []
+            for attr in attrs:
+                f = Path(f"{prefix}_{attr}.bw")
+                p = dict_to_bigwig(tracks[attr], f)
+                logger.info(f"Saved coverage for {attr} to {p.name}")
+                bw_paths.append(p)
+            return bw_paths
+
+        else:
+            return tracks
+
     def to_df(self) -> pd.DataFrame:
         """Parse self.coverage attribute to a pd.DataFrame
 

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,5 +1,6 @@
 import os
 import random
+from typing import Final
 
 import numpy as np
 import pyBigWig
@@ -8,6 +9,18 @@
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
 
+NO_SKIP_OPTION: Final[str] = "--no-skip"
+
+
+def pytest_addoption(parser):
+    parser.addoption(NO_SKIP_OPTION, action="store_true", default=False, help="also run skipped tests")
+
+
+def pytest_collection_modifyitems(config, items: list):
+    if config.getoption(NO_SKIP_OPTION):
+        for test in items:
+            test.own_markers = [marker for marker in test.own_markers if marker.name not in ("skip", "skipif")]
+
 
 @pytest.fixture(scope="session")
 def momics_path(tmp_path_factory):

diff --git a/tests/test_multirangequery.py → tests/test_momicsquery.py b/tests/test_multirangequery.py → tests/test_momicsquery.py
@@ -1,5 +1,6 @@
 import json
 import multiprocessing
+import os
 import pickle
 import tempfile
 from pathlib import Path
@@ -9,15 +10,15 @@
 import pyranges as pr
 import pytest
 
-from momics import momics
+from momics.momics import Momics
 from momics.momicsquery import MomicsQuery
 
 multiprocessing.set_start_method("spawn", force=True)
 
 
 @pytest.mark.order(2)
 def test_momicsquery_tracks(momics_path: str, bed1: str):
-    mom = momics.Momics(momics_path)
+    mom = Momics(momics_path)
     q = MomicsQuery(mom, "I:990-1010").query_tracks()
     assert len(q.coverage) == 4
     assert len(q.coverage["bw2"]["I:990-1010"]) == 20
@@ -45,7 +46,7 @@ def test_momicsquery_tracks(momics_path: str, bed1: str):
 
 @pytest.mark.order(2)
 def test_momicsquery_seq(momics_path: str):
-    mom = momics.Momics(momics_path)
+    mom = Momics(momics_path)
     q = MomicsQuery(mom, "I:0-10").query_sequence()
     assert len(q.seq) == 1
     assert q.seq["nucleotide"]["I:0-10"] == "ATCGATCGAT"
@@ -57,7 +58,7 @@ def test_momicsquery_seq(momics_path: str):
 
 @pytest.mark.order(2)
 def test_momicsquery_seq2(momics_path: str, bed1: str):
-    mom = momics.Momics(momics_path)
+    mom = Momics(momics_path)
     q = MomicsQuery(mom, "I:0-10").query_sequence()
     bed = pr.read_bed(bed1)
     q = MomicsQuery(mom, bed).query_sequence()
@@ -69,15 +70,15 @@ def test_momicsquery_seq2(momics_path: str, bed1: str):
 
 @pytest.mark.order(2)
 def test_momicsquery_seq3(momics_path: str):
-    mom = momics.Momics(momics_path)
+    mom = Momics(momics_path)
     q = MomicsQuery(mom, "I:0-10").query_sequence()
     q = MomicsQuery(mom, "I").query_sequence()
     assert len(q.seq["nucleotide"]["I:0-10000"]) == 10000
 
 
 @pytest.mark.order(2)
 def test_momicsquery_seq4(momics_path: str, bed1: str):
-    mom = momics.Momics(momics_path)
+    mom = Momics(momics_path)
     q = MomicsQuery(mom, "I:0-10").query_sequence()
     bed = pr.read_bed(bed1)
     q = MomicsQuery(mom, bed).query_sequence()
@@ -86,7 +87,7 @@ def test_momicsquery_seq4(momics_path: str, bed1: str):
 
 @pytest.mark.order(2)
 def test_momicsquery_seq5(momics_path: str, bed1: str):
-    mom = momics.Momics(momics_path)
+    mom = Momics(momics_path)
     bed = pr.read_bed(bed1)
     print(bed)
     q = MomicsQuery(mom, "I:0-10").query_sequence()
@@ -115,7 +116,7 @@ def temp_npz_file():
 
 @pytest.mark.order(2)
 def test_to_json_npz(momics_path: str, temp_json_file: Path, temp_npz_file: Path):
-    mom = momics.Momics(momics_path)
+    mom = Momics(momics_path)
     q = MomicsQuery(mom, "I:0-10").query_sequence().query_tracks()
 
     q.to_json(temp_json_file)
@@ -133,3 +134,31 @@ def test_to_json_npz(momics_path: str, temp_json_file: Path, temp_npz_file: Path
     assert seq["nucleotide"]["I:0-10"] == "ATCGATCGAT"
     assert list(cov.keys()) == ["bw2", "custom", "bw3", "bw4", "nucleotide"]
     assert list(cov["bw2"].keys()) == ["I:0-10"]
+
+
+@pytest.mark.order(2)
+def test_pileup(momics_path: str):
+    mom = Momics(momics_path)
+    b = mom.bins(3, 1)["I", 6990:7010]
+    q = MomicsQuery(mom, b).query_tracks()
+    x = q.pileup()
+    print(x.keys())
+    assert list(x.keys()) == ["bw2", "custom", "bw3", "bw4"]
+    assert [len(y) for y in x["bw2"].values()] == [10000, 20000, 30000]
+    assert x["bw2"]["I"].shape == (10000,)
+
+    b = mom.bins(3, 1)["I", 6990:7010]
+    x = q.pileup(type="mean")
+    assert x["bw2"]["I"].shape == (10000,)
+    vec = x["bw2"]["I"][6985:7015] - np.array([0] * 3 + [0.06] * 12 + [0.07] * 12 + [0] * 3) < 1e-6
+    assert sum(vec) == 30
+    x = q.pileup(type="sum")
+    assert x["bw2"]["I"].shape == (10000,)
+    res = np.array([0] * 3 + [0.06, 0.12] + [0.18] * 10 + [0.21] * 10 + [0.14, 0.07] + [0] * 3)
+    vec = x["bw2"]["I"][6985:7015] - res < 1e-6
+    assert sum(vec) == 30
+
+    x = q.pileup(prefix="tmp__")
+    print(x)
+    assert len(x) == 4
+    assert [os.path.exists(idx) for idx in iter(x)].count(True) == 4