feat: API change, pileup() method -> aggregate() separate function
js2264 committed Oct 25, 2024
1 parent 16b5f4f commit 2e3e2f3
Showing 3 changed files with 96 additions and 89 deletions.
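
In practice, the change means aggregation is no longer a method of the query object but a standalone function that takes the query's coverage, the queried ranges, and the chromosome sizes. A minimal before/after sketch, assembled from the updated tests and docstrings in this commit (the repository path and window coordinates are illustrative):

import pyranges as pr
from momics.momics import Momics
from momics.query import MomicsQuery
from momics.aggregate import aggregate

mom = Momics("path/to/momics")
windows = pr.PyRanges(
    chromosomes=["I", "I", "I", "I"],
    starts=[0, 5, 10, 20],
    ends=[30, 30, 30, 30],
)
q = MomicsQuery(mom, windows).query_tracks()

# Before this commit: aggregation was a method of MomicsQuery
# x = q.pileup(type="mean", prefix="tmp__")

# After this commit: pass the coverage, ranges and chromosome sizes explicitly
chrom_sizes = dict(zip(mom.chroms().chrom, mom.chroms().length))
x = aggregate(q.coverage, windows, chrom_sizes, type="mean", prefix="tmp__")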
80 changes: 80 additions & 0 deletions src/momics/aggregate.py
@@ -0,0 +1,80 @@
from typing import Literal, Optional
from pathlib import Path
import numpy as np

from .logging import logger
from .utils import dict_to_bigwig


# For this function, the `type` argument can be either "mean" or "sum"
def aggregate(cov, ranges, chrom_sizes, type: Literal["mean", "sum"] = "mean", prefix: Optional[str] = None) -> dict:
    """
    Aggregate query coverage outputs into genome-wide dictionary(ies).

    The coverage over each range is aggregated across all tracks. In the case of
    overlapping ranges, the coverage is averaged.

    Each value of the output dictionary is a dictionary itself, with the keys being the
    chromosome names and the values being the coverage scores, averaged for overlapping ranges.

    Args:
        cov (dict): A dictionary of coverage scores, one per track. This is generally the output of
            :func:`MomicsQuery.query_tracks().coverage`.
        ranges (PyRanges): A PyRanges object containing the queried ranges.
        chrom_sizes (dict): A dictionary of chromosome sizes.
        type: The type of aggregation to perform. Can be either "mean" or "sum".
        prefix (str, optional): Prefix of the output `.bw` files to create.
            If provided, the queried coverage of each track will be saved in a file
            named `<prefix>_<track_label>.bw`.

    Returns:
        A dictionary of genome-wide coverage scores, one per track. Where
        the queried ranges overlap, the coverage is averaged or summed.
        Note that if the `prefix` argument is provided, the results for each track will also be
        saved to a `<prefix>_<track_label>.bw` file.

    See Also:
        :func:`MomicsQuery.query_tracks()`

    Examples:
        >>> mom = momics.momics.Momics('path/to/momics')
        >>> windows = pr.PyRanges(
        ...     chromosomes=["I", "I", "I", "I"],
        ...     starts=[0, 5, 10, 20],
        ...     ends=[30, 30, 30, 30],
        ... )
        >>> cov = MomicsQuery(mom, windows).query_tracks().coverage
        >>> aggregate(cov, windows, {"I": 30})
    """
    attrs = cov.keys()
    tracks = {attr: dict() for attr in attrs}

    for attr in attrs:
        attr_cov = cov[attr]
        track = {chrom: np.zeros(size) for chrom, size in chrom_sizes.items()}
        overlap_count = {chrom: np.zeros(size) for chrom, size in chrom_sizes.items()}

        # Accumulate per-range coverage and count how many ranges cover each position
        for (_, row), (_, row_cov) in zip(ranges.df.iterrows(), attr_cov.items()):
            chrom = row["Chromosome"]
            start = row["Start"]
            end = row["End"]
            track[chrom][start:end] += row_cov
            overlap_count[chrom][start:end] += 1

        # With type="mean", divide the summed coverage by the number of overlapping ranges
        if type == "mean":
            for chrom in track:
                non_zero_mask = overlap_count[chrom] > 0
                track[chrom][non_zero_mask] /= overlap_count[chrom][non_zero_mask]

        tracks[attr] = track

    if prefix is not None:
        for attr in attrs:
            f = Path(f"{prefix}_{attr}.bw")
            p = dict_to_bigwig(tracks[attr], f)
            logger.info(f"Saved coverage for {attr} to {p.name}")

    return tracks
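
The averaging of overlapping ranges described in the docstring can be checked with a small, self-contained sketch of the same accumulation logic (plain NumPy; the 10-bp chromosome and coverage values are made up for illustration):

import numpy as np

# Two overlapping 6-bp ranges on a 10-bp chromosome, each contributing coverage 1.0
chrom_size = 10
track = np.zeros(chrom_size)
overlap_count = np.zeros(chrom_size)

for start, end, value in [(0, 6, 1.0), (4, 10, 1.0)]:
    track[start:end] += value
    overlap_count[start:end] += 1

# type="sum" leaves the accumulated track as is: positions 4-5 hold 2.0
print(track)  # [1. 1. 1. 1. 2. 2. 1. 1. 1. 1.]

# type="mean" divides by the number of ranges covering each position
mask = overlap_count > 0
track[mask] /= overlap_count[mask]
print(track)  # [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]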
81 changes: 1 addition & 80 deletions src/momics/query.py
@@ -3,7 +3,7 @@
import pickle
import time
from pathlib import Path
from typing import List, Literal, Optional, Dict, Union
from typing import List, Optional, Dict

import numpy as np
import pandas as pd
@@ -16,7 +16,6 @@
from .logging import logger
from .momics import Momics
from .utils import parse_ucsc_coordinates
from .utils import dict_to_bigwig


class MomicsQuery:
@@ -264,84 +263,6 @@ def query_sequence(self, threads: Optional[int] = None) -> "MomicsQuery":
logger.info(f"Query completed in {round(t,4)}s.")
return self

# For this function, the `type` argument can be either "mean" or "sum"
def pileup(self, type: Literal["mean", "sum"] = "mean", prefix: Optional[str] = None) -> Union[dict, None]:
"""
Aggregate query coverages into genome-wide dictionary(ies).
If the `coverage` attribute has not been populated yet, the :func:`query_tracks()` method will be called.
The coverage over each range is aggregated across all tracks. In the case of
overlapping ranges, the coverage is averaged.
Each value of the output dictionary is a dictionary itself, with the keys being the chromosome names
and the values being the coverage score, averaged for overlapping ranges.
Args:
prefix (str, optional): Prefix to the output `.bw` files to create.
If provided, queried coverage will be saved for each track in a file
named `<prefix>_<track_label>.bw`.
Returns:
A dictionary of genome-wide coverage scores, for each track. If
the queried ranges overlap, the coverage is averaged.
Note that if the output argument is provided, the results will be
saved to a `.bw` file and this function will return the Path to the file.
See Also:
:func:`MomicsQuery.query_tracks()`
Examples:
>>> mom = momics.momics.Momics('path/to/momics')
>>> windows = pr.PyRanges(
... chromosomes = ["I", "I", "I", "I"],
... starts = [0, 5, 10, 20],
... ends = [30, 30, 30, 30],
... )
>>> q = MomicsQuery(mom, windows)
>>> q.query_tracks()
>>> q.pileup()
"""
cov = self.coverage
if cov is None:
self.query_tracks()
cov = self.coverage

attrs = cov.keys()
ranges = self.ranges
chroms = self.momics.chroms()
chrom_sizes = {chrom: size for chrom, size in zip(chroms["chrom"], chroms["length"])}
tracks = {attr: dict() for attr in attrs}

for attr in iter(attrs):
attr_cov = cov[attr]
track = {chrom: np.zeros(size) for chrom, size in chrom_sizes.items()}
overlap_count = {chrom: np.zeros(size) for chrom, size in chrom_sizes.items()}

for (_, row), (_, row_cov) in zip(ranges.df.iterrows(), attr_cov.items()):
chrom = row["Chromosome"]
start = row["Start"]
end = row["End"]
track[chrom][start:end] += row_cov
overlap_count[chrom][start:end] += 1

if type == "mean":
for chrom in track:
non_zero_mask = overlap_count[chrom] > 0
track[chrom][non_zero_mask] /= overlap_count[chrom][non_zero_mask]

tracks[attr] = track

if prefix is not None:
bw_paths = []
for attr in attrs:
f = Path(f"{prefix}_{attr}.bw")
p = dict_to_bigwig(tracks[attr], f)
logger.info(f"Saved coverage for {attr} to {p.name}")
bw_paths.append(p)
return bw_paths

else:
return tracks

def to_df(self) -> pd.DataFrame:
"""Parse self.coverage attribute to a pd.DataFrame
24 changes: 15 additions & 9 deletions tests/test_momicsquery.py
@@ -12,6 +12,7 @@

from momics.momics import Momics
from momics.query import MomicsQuery
from momics.aggregate import aggregate

multiprocessing.set_start_method("spawn", force=True)

@@ -137,28 +138,33 @@ def test_to_json_npz(momics_path: str, temp_json_file: Path, temp_npz_file: Path


@pytest.mark.order(2)
def test_pileup(momics_path: str):
def test_aggregate(momics_path: str):
mom = Momics(momics_path)
b = mom.bins(3, 1)["I", 6990:7010]
chrom_sizes = {chrom: size for chrom, size in zip(mom.chroms().chrom, mom.chroms().length)}
q = MomicsQuery(mom, b).query_tracks()
x = q.pileup()
cov = q.coverage
x = aggregate(cov, b, chrom_sizes)
print(x.keys())
assert list(x.keys()) == ["bw2", "custom", "bw3", "bw4"]
assert [len(y) for y in x["bw2"].values()] == [10000, 20000, 30000]
assert x["bw2"]["I"].shape == (10000,)

b = mom.bins(3, 1)["I", 6990:7010]
x = q.pileup(type="mean")
x = aggregate(cov, b, chrom_sizes, type="mean")
assert x["bw2"]["I"].shape == (10000,)
vec = x["bw2"]["I"][6985:7015] - np.array([0] * 3 + [0.06] * 12 + [0.07] * 12 + [0] * 3) < 1e-6
assert sum(vec) == 30
x = q.pileup(type="sum")

x = aggregate(cov, b, chrom_sizes, type="sum")
assert x["bw2"]["I"].shape == (10000,)
res = np.array([0] * 3 + [0.06, 0.12] + [0.18] * 10 + [0.21] * 10 + [0.14, 0.07] + [0] * 3)
vec = x["bw2"]["I"][6985:7015] - res < 1e-6
assert sum(vec) == 30

x = q.pileup(prefix="tmp__")
print(x)
assert len(x) == 4
assert [os.path.exists(idx) for idx in iter(x)].count(True) == 4
x = aggregate(cov, b, chrom_sizes, prefix="tmp__")
assert x["bw2"]["I"].shape == (10000,)
vec = x["bw2"]["I"][6985:7015] - np.array([0] * 3 + [0.06] * 12 + [0.07] * 12 + [0] * 3) < 1e-6
assert sum(vec) == 30
tracks = mom.tracks().label
print(tracks)
assert [os.path.exists("tmp___" + track + ".bw") for track in tracks].count(True) == 4
