Skip to content

Commit

Permalink
feat: add pileup() method to MomicsQuery
Browse files Browse the repository at this point in the history
  • Loading branch information
js2264 committed Oct 21, 2024
1 parent 4579b36 commit f2296b7
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 12 deletions.
2 changes: 2 additions & 0 deletions src/momics/momics.py
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,8 @@ def bins(self, width, stride, cut_last_bin_out=False) -> pr.PyRanges:
"""
bins = []
chroms = self.chroms().set_index("chrom")["length"].to_dict()
if chroms == {}:
raise ValueError("Please fill out `chroms` table first.")

for chrom, length in chroms.items():
start = 0
Expand Down
87 changes: 83 additions & 4 deletions src/momics/momicsquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pickle
import time
from pathlib import Path
from typing import List, Optional, Dict
from typing import List, Literal, Optional, Dict, Union

import numpy as np
import pandas as pd
Expand All @@ -16,14 +16,15 @@
from .logging import logger
from .momics import Momics
from .utils import parse_ucsc_coordinates
from .utils import dict_to_bigwig


class MomicsQuery:
"""A class to query `.momics` repositories.
Attributes
----------
momics (Momics): a local `.momics` repository.
momics (Momics): a local `.momics` repositoryasdcasdc.
queries (dict): Dict. of pr.PyRanges object.
coverage (dict): Dictionary of coverage scores extracted from the \
`.momics` repository, populated after calling `q.query_tracks()`
Expand All @@ -35,8 +36,8 @@ def __init__(self, momics: Momics, bed: pr.PyRanges):
"""Initialize the MomicsQuery object.
Args:
momics (Momics): a Momics object
bed (pr.PyRanges): pr.PyRanges object
momics (Momics): a `Momics` object
bed (pr.PyRanges): `pr.PyRanges` object
"""
if not isinstance(momics, Momics):
raise ValueError("momics must be a `Momics` object.")
Expand Down Expand Up @@ -263,6 +264,84 @@ def query_sequence(self, threads: Optional[int] = None) -> "MomicsQuery":
logger.info(f"Query completed in {round(t,4)}s.")
return self

# For this function, the `type` argument can be either "mean" or "sum"
def pileup(self, type: Literal["mean", "sum"] = "mean", prefix: Optional[str] = None) -> Union[dict, None]:
"""
Aggregate query coverages into genome-wide dictionary(ies).
If the `coverage` attribute has not been populated yet, the :func:`query_tracks()` method will be called.
The coverage over each range is aggregated across all tracks. In the case of
overlapping ranges, the coverage is averaged.
Each value of the output dictionary is a dictionary itself, with the keys being the chromosome names
and the values being the coverage score, averaged for overlapping ranges.
Args:
prefix (str, optional): Prefix to the output `.bw` files to create.
If provided, queried coverage will be saved for each track in a file
named `<prefix>_<track_label>.bw`.
Returns:
A dictionary of genome-wide coverage scores, for each track. If
the queried ranges overlap, the coverage is averaged.
Note that if the output argument is provided, the results will be
saved to a `.bw` file and this function will return the Path to the file.
See Also:
:func:`MomicsQuery.query_tracks()`
Examples:
>>> mom = momics.momics.Momics('path/to/momics')
>>> windows = pr.PyRanges(
... chromosomes = ["I", "I", "I", "I"],
... starts = [0, 5, 10, 20],
... ends = [30, 30, 30, 30],
... )
>>> q = MomicsQuery(mom, windows)
>>> q.query_tracks()
>>> q.pileup()
"""
cov = self.coverage
if cov is None:
self.query_tracks()
cov = self.coverage

attrs = cov.keys()
ranges = self.ranges
chroms = self.momics.chroms()
chrom_sizes = {chrom: size for chrom, size in zip(chroms["chrom"], chroms["length"])}
tracks = {attr: dict() for attr in attrs}

for attr in iter(attrs):
attr_cov = cov[attr]
track = {chrom: np.zeros(size) for chrom, size in chrom_sizes.items()}
overlap_count = {chrom: np.zeros(size) for chrom, size in chrom_sizes.items()}

for (_, row), (_, row_cov) in zip(ranges.df.iterrows(), attr_cov.items()):
chrom = row["Chromosome"]
start = row["Start"]
end = row["End"]
track[chrom][start:end] += row_cov
overlap_count[chrom][start:end] += 1

if type == "mean":
for chrom in track:
non_zero_mask = overlap_count[chrom] > 0
track[chrom][non_zero_mask] /= overlap_count[chrom][non_zero_mask]

tracks[attr] = track

if prefix is not None:
bw_paths = []
for attr in attrs:
f = Path(f"{prefix}_{attr}.bw")
p = dict_to_bigwig(tracks[attr], f)
logger.info(f"Saved coverage for {attr} to {p.name}")
bw_paths.append(p)
return bw_paths

else:
return tracks

def to_df(self) -> pd.DataFrame:
"""Parse self.coverage attribute to a pd.DataFrame
Expand Down
13 changes: 13 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import random
from typing import Final

import numpy as np
import pyBigWig
Expand All @@ -8,6 +9,18 @@
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

NO_SKIP_OPTION: Final[str] = "--no-skip"


def pytest_addoption(parser):
parser.addoption(NO_SKIP_OPTION, action="store_true", default=False, help="also run skipped tests")


def pytest_collection_modifyitems(config, items: list):
if config.getoption(NO_SKIP_OPTION):
for test in items:
test.own_markers = [marker for marker in test.own_markers if marker.name not in ("skip", "skipif")]


@pytest.fixture(scope="session")
def momics_path(tmp_path_factory):
Expand Down
45 changes: 37 additions & 8 deletions tests/test_multirangequery.py → tests/test_momicsquery.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import multiprocessing
import os
import pickle
import tempfile
from pathlib import Path
Expand All @@ -9,15 +10,15 @@
import pyranges as pr
import pytest

from momics import momics
from momics.momics import Momics
from momics.momicsquery import MomicsQuery

multiprocessing.set_start_method("spawn", force=True)


@pytest.mark.order(2)
def test_momicsquery_tracks(momics_path: str, bed1: str):
mom = momics.Momics(momics_path)
mom = Momics(momics_path)
q = MomicsQuery(mom, "I:990-1010").query_tracks()
assert len(q.coverage) == 4
assert len(q.coverage["bw2"]["I:990-1010"]) == 20
Expand Down Expand Up @@ -45,7 +46,7 @@ def test_momicsquery_tracks(momics_path: str, bed1: str):

@pytest.mark.order(2)
def test_momicsquery_seq(momics_path: str):
mom = momics.Momics(momics_path)
mom = Momics(momics_path)
q = MomicsQuery(mom, "I:0-10").query_sequence()
assert len(q.seq) == 1
assert q.seq["nucleotide"]["I:0-10"] == "ATCGATCGAT"
Expand All @@ -57,7 +58,7 @@ def test_momicsquery_seq(momics_path: str):

@pytest.mark.order(2)
def test_momicsquery_seq2(momics_path: str, bed1: str):
mom = momics.Momics(momics_path)
mom = Momics(momics_path)
q = MomicsQuery(mom, "I:0-10").query_sequence()
bed = pr.read_bed(bed1)
q = MomicsQuery(mom, bed).query_sequence()
Expand All @@ -69,15 +70,15 @@ def test_momicsquery_seq2(momics_path: str, bed1: str):

@pytest.mark.order(2)
def test_momicsquery_seq3(momics_path: str):
mom = momics.Momics(momics_path)
mom = Momics(momics_path)
q = MomicsQuery(mom, "I:0-10").query_sequence()
q = MomicsQuery(mom, "I").query_sequence()
assert len(q.seq["nucleotide"]["I:0-10000"]) == 10000


@pytest.mark.order(2)
def test_momicsquery_seq4(momics_path: str, bed1: str):
mom = momics.Momics(momics_path)
mom = Momics(momics_path)
q = MomicsQuery(mom, "I:0-10").query_sequence()
bed = pr.read_bed(bed1)
q = MomicsQuery(mom, bed).query_sequence()
Expand All @@ -86,7 +87,7 @@ def test_momicsquery_seq4(momics_path: str, bed1: str):

@pytest.mark.order(2)
def test_momicsquery_seq5(momics_path: str, bed1: str):
mom = momics.Momics(momics_path)
mom = Momics(momics_path)
bed = pr.read_bed(bed1)
print(bed)
q = MomicsQuery(mom, "I:0-10").query_sequence()
Expand Down Expand Up @@ -115,7 +116,7 @@ def temp_npz_file():

@pytest.mark.order(2)
def test_to_json_npz(momics_path: str, temp_json_file: Path, temp_npz_file: Path):
mom = momics.Momics(momics_path)
mom = Momics(momics_path)
q = MomicsQuery(mom, "I:0-10").query_sequence().query_tracks()

q.to_json(temp_json_file)
Expand All @@ -133,3 +134,31 @@ def test_to_json_npz(momics_path: str, temp_json_file: Path, temp_npz_file: Path
assert seq["nucleotide"]["I:0-10"] == "ATCGATCGAT"
assert list(cov.keys()) == ["bw2", "custom", "bw3", "bw4", "nucleotide"]
assert list(cov["bw2"].keys()) == ["I:0-10"]


@pytest.mark.order(2)
def test_pileup(momics_path: str):
mom = Momics(momics_path)
b = mom.bins(3, 1)["I", 6990:7010]
q = MomicsQuery(mom, b).query_tracks()
x = q.pileup()
print(x.keys())
assert list(x.keys()) == ["bw2", "custom", "bw3", "bw4"]
assert [len(y) for y in x["bw2"].values()] == [10000, 20000, 30000]
assert x["bw2"]["I"].shape == (10000,)

b = mom.bins(3, 1)["I", 6990:7010]
x = q.pileup(type="mean")
assert x["bw2"]["I"].shape == (10000,)
vec = x["bw2"]["I"][6985:7015] - np.array([0] * 3 + [0.06] * 12 + [0.07] * 12 + [0] * 3) < 1e-6
assert sum(vec) == 30
x = q.pileup(type="sum")
assert x["bw2"]["I"].shape == (10000,)
res = np.array([0] * 3 + [0.06, 0.12] + [0.18] * 10 + [0.21] * 10 + [0.14, 0.07] + [0] * 3)
vec = x["bw2"]["I"][6985:7015] - res < 1e-6
assert sum(vec) == 30

x = q.pileup(prefix="tmp__")
print(x)
assert len(x) == 4
assert [os.path.exists(idx) for idx in iter(x)].count(True) == 4

0 comments on commit f2296b7

Please sign in to comment.