feat: API change, pileup() method -> aggregate() separate function
js2264 committed Oct 25, 2024
1 parent 16b5f4f commit 2e3e2f3
Showing 3 changed files with 96 additions and 89 deletions.
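
In practice, the change means aggregation is no longer a method of the query object but a standalone function that takes the query's coverage, the queried ranges, and the chromosome sizes. A minimal before/after sketch, assembled from the updated tests and docstrings in this commit (the repository path and window coordinates are illustrative):

import pyranges as pr
from momics.momics import Momics
from momics.query import MomicsQuery
from momics.aggregate import aggregate

mom = Momics("path/to/momics")
windows = pr.PyRanges(
    chromosomes=["I", "I", "I", "I"],
    starts=[0, 5, 10, 20],
    ends=[30, 30, 30, 30],
)
q = MomicsQuery(mom, windows).query_tracks()

# Before this commit: aggregation was a method of MomicsQuery
# x = q.pileup(type="mean", prefix="tmp__")

# After this commit: pass the coverage, ranges and chromosome sizes explicitly
chrom_sizes = dict(zip(mom.chroms().chrom, mom.chroms().length))
x = aggregate(q.coverage, windows, chrom_sizes, type="mean", prefix="tmp__")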
80 changes: 80 additions & 0 deletions src/momics/aggregate.py
@@ -0,0 +1,80 @@
from typing import Literal, Optional
from pathlib import Path
import numpy as np

from .logging import logger
from .utils import dict_to_bigwig


# For this function, the `type` argument can be either "mean" or "sum"
def aggregate(cov, ranges, chrom_sizes, type: Literal["mean", "sum"] = "mean", prefix: Optional[str] = None) -> dict:
    """
    Aggregate query coverage outputs into genome-wide dictionary(ies).

    The coverage over each range is aggregated across all tracks. In the case of
    overlapping ranges, the coverage is averaged.

    Each value of the output dictionary is a dictionary itself, with the keys being the
    chromosome names and the values being the coverage scores, averaged for overlapping ranges.

    Args:
        cov (dict): A dictionary of coverage scores, one per track. This is generally the output of
            :func:`MomicsQuery.query_tracks().coverage`.
        ranges (PyRanges): A PyRanges object containing the queried ranges.
        chrom_sizes (dict): A dictionary of chromosome sizes.
        type: The type of aggregation to perform. Can be either "mean" or "sum".
        prefix (str, optional): Prefix of the output `.bw` files to create.
            If provided, the queried coverage of each track will be saved in a file
            named `<prefix>_<track_label>.bw`.

    Returns:
        A dictionary of genome-wide coverage scores, one per track. Where
        the queried ranges overlap, the coverage is averaged or summed.
        Note that if the `prefix` argument is provided, the results for each track will also be
        saved to a `<prefix>_<track_label>.bw` file.

    See Also:
        :func:`MomicsQuery.query_tracks()`

    Examples:
        >>> mom = momics.momics.Momics('path/to/momics')
        >>> windows = pr.PyRanges(
        ...     chromosomes=["I", "I", "I", "I"],
        ...     starts=[0, 5, 10, 20],
        ...     ends=[30, 30, 30, 30],
        ... )
        >>> cov = MomicsQuery(mom, windows).query_tracks().coverage
        >>> aggregate(cov, windows, {"I": 30})
    """
    attrs = cov.keys()
    tracks = {attr: dict() for attr in attrs}

    for attr in attrs:
        attr_cov = cov[attr]
        track = {chrom: np.zeros(size) for chrom, size in chrom_sizes.items()}
        overlap_count = {chrom: np.zeros(size) for chrom, size in chrom_sizes.items()}

        # Accumulate per-range coverage and count how many ranges cover each position
        for (_, row), (_, row_cov) in zip(ranges.df.iterrows(), attr_cov.items()):
            chrom = row["Chromosome"]
            start = row["Start"]
            end = row["End"]
            track[chrom][start:end] += row_cov
            overlap_count[chrom][start:end] += 1

        # With type="mean", divide the summed coverage by the number of overlapping ranges
        if type == "mean":
            for chrom in track:
                non_zero_mask = overlap_count[chrom] > 0
                track[chrom][non_zero_mask] /= overlap_count[chrom][non_zero_mask]

        tracks[attr] = track

    if prefix is not None:
        for attr in attrs:
            f = Path(f"{prefix}_{attr}.bw")
            p = dict_to_bigwig(tracks[attr], f)
            logger.info(f"Saved coverage for {attr} to {p.name}")

    return tracks
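
The averaging of overlapping ranges described in the docstring can be checked with a small, self-contained sketch of the same accumulation logic (plain NumPy; the 10-bp chromosome and coverage values are made up for illustration):

import numpy as np

# Two overlapping 6-bp ranges on a 10-bp chromosome, each contributing coverage 1.0
chrom_size = 10
track = np.zeros(chrom_size)
overlap_count = np.zeros(chrom_size)

for start, end, value in [(0, 6, 1.0), (4, 10, 1.0)]:
    track[start:end] += value
    overlap_count[start:end] += 1

# type="sum" leaves the accumulated track as is: positions 4-5 hold 2.0
print(track)  # [1. 1. 1. 1. 2. 2. 1. 1. 1. 1.]

# type="mean" divides by the number of ranges covering each position
mask = overlap_count > 0
track[mask] /= overlap_count[mask]
print(track)  # [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]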
81 changes: 1 addition & 80 deletions src/momics/query.py
@@ -3,7 +3,7 @@
import pickle
import time
from pathlib import Path
from typing import List, Literal, Optional, Dict, Union
from typing import List, Optional, Dict

import numpy as np
import pandas as pd
@@ -16,7 +16,6 @@
from .logging import logger
from .momics import Momics
from .utils import parse_ucsc_coordinates
from .utils import dict_to_bigwig


class MomicsQuery:
@@ -264,84 +263,6 @@ def query_sequence(self, threads: Optional[int] = None) -> "MomicsQuery":
logger.info(f"Query completed in {round(t,4)}s.")
return self

# For this function, the `type` argument can be either "mean" or "sum"
def pileup(self, type: Literal["mean", "sum"] = "mean", prefix: Optional[str] = None) -> Union[dict, None]:
"""
Aggregate query coverages into genome-wide dictionary(ies).
If the `coverage` attribute has not been populated yet, the :func:`query_tracks()` method will be called.
The coverage over each range is aggregated across all tracks. In the case of
overlapping ranges, the coverage is averaged.
Each value of the output dictionary is a dictionary itself, with the keys being the chromosome names
and the values being the coverage score, averaged for overlapping ranges.
Args:
prefix (str, optional): Prefix to the output `.bw` files to create.
If provided, queried coverage will be saved for each track in a file
named `<prefix>_<track_label>.bw`.
Returns:
A dictionary of genome-wide coverage scores, for each track. If
the queried ranges overlap, the coverage is averaged.
Note that if the output argument is provided, the results will be
saved to a `.bw` file and this function will return the Path to the file.
See Also:
:func:`MomicsQuery.query_tracks()`
Examples:
>>> mom = momics.momics.Momics('path/to/momics')
>>> windows = pr.PyRanges(
... chromosomes = ["I", "I", "I", "I"],
... starts = [0, 5, 10, 20],
... ends = [30, 30, 30, 30],
... )
>>> q = MomicsQuery(mom, windows)
>>> q.query_tracks()
>>> q.pileup()
"""
cov = self.coverage
if cov is None:
self.query_tracks()
cov = self.coverage

attrs = cov.keys()
ranges = self.ranges
chroms = self.momics.chroms()
chrom_sizes = {chrom: size for chrom, size in zip(chroms["chrom"], chroms["length"])}
tracks = {attr: dict() for attr in attrs}

for attr in iter(attrs):
attr_cov = cov[attr]
track = {chrom: np.zeros(size) for chrom, size in chrom_sizes.items()}
overlap_count = {chrom: np.zeros(size) for chrom, size in chrom_sizes.items()}

for (_, row), (_, row_cov) in zip(ranges.df.iterrows(), attr_cov.items()):
chrom = row["Chromosome"]
start = row["Start"]
end = row["End"]
track[chrom][start:end] += row_cov
overlap_count[chrom][start:end] += 1

if type == "mean":
for chrom in track:
non_zero_mask = overlap_count[chrom] > 0
track[chrom][non_zero_mask] /= overlap_count[chrom][non_zero_mask]

tracks[attr] = track

if prefix is not None:
bw_paths = []
for attr in attrs:
f = Path(f"{prefix}_{attr}.bw")
p = dict_to_bigwig(tracks[attr], f)
logger.info(f"Saved coverage for {attr} to {p.name}")
bw_paths.append(p)
return bw_paths

else:
return tracks

def to_df(self) -> pd.DataFrame:
"""Parse self.coverage attribute to a pd.DataFrame
24 changes: 15 additions & 9 deletions tests/test_momicsquery.py
@@ -12,6 +12,7 @@

from momics.momics import Momics
from momics.query import MomicsQuery
from momics.aggregate import aggregate

multiprocessing.set_start_method("spawn", force=True)

@@ -137,28 +138,33 @@ def test_to_json_npz(momics_path: str, temp_json_file: Path, temp_npz_file: Path


@pytest.mark.order(2)
def test_pileup(momics_path: str):
def test_aggregate(momics_path: str):
mom = Momics(momics_path)
b = mom.bins(3, 1)["I", 6990:7010]
chrom_sizes = {chrom: size for chrom, size in zip(mom.chroms().chrom, mom.chroms().length)}
q = MomicsQuery(mom, b).query_tracks()
x = q.pileup()
cov = q.coverage
x = aggregate(cov, b, chrom_sizes)
print(x.keys())
assert list(x.keys()) == ["bw2", "custom", "bw3", "bw4"]
assert [len(y) for y in x["bw2"].values()] == [10000, 20000, 30000]
assert x["bw2"]["I"].shape == (10000,)

b = mom.bins(3, 1)["I", 6990:7010]
x = q.pileup(type="mean")
x = aggregate(cov, b, chrom_sizes, type="mean")
assert x["bw2"]["I"].shape == (10000,)
vec = x["bw2"]["I"][6985:7015] - np.array([0] * 3 + [0.06] * 12 + [0.07] * 12 + [0] * 3) < 1e-6
assert sum(vec) == 30
x = q.pileup(type="sum")

x = aggregate(cov, b, chrom_sizes, type="sum")
assert x["bw2"]["I"].shape == (10000,)
res = np.array([0] * 3 + [0.06, 0.12] + [0.18] * 10 + [0.21] * 10 + [0.14, 0.07] + [0] * 3)
vec = x["bw2"]["I"][6985:7015] - res < 1e-6
assert sum(vec) == 30

x = q.pileup(prefix="tmp__")
print(x)
assert len(x) == 4
assert [os.path.exists(idx) for idx in iter(x)].count(True) == 4
x = aggregate(cov, b, chrom_sizes, prefix="tmp__")
assert x["bw2"]["I"].shape == (10000,)
vec = x["bw2"]["I"][6985:7015] - np.array([0] * 3 + [0.06] * 12 + [0.07] * 12 + [0] * 3) < 1e-6
assert sum(vec) == 30
tracks = mom.tracks().label
print(tracks)
assert [os.path.exists("tmp___" + track + ".bw") for track in tracks].count(True) == 4
