Skip to content

Commit

Permalink
fix: some bugs
Browse files (browse the repository at this point in the history)
  • Loading branch information
js2264 committed Oct 18, 2024
1 parent 1893acb commit b5d2a2e
Show file tree
Hide file tree
Showing 7 changed files with 31 additions and 12 deletions.
2 changes: 1 addition & 1 deletion src/momics/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def __init__(
self.features = features
self.target = target

if target_size is not None and target_size >= int(widths[0]):
if target_size is not None and target_size > int(widths[0]):
raise ValueError("Target size must be smaller than the features width.")
self.target_size = target_size

Expand Down
6 changes: 4 additions & 2 deletions src/momics/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from pathlib import Path
import numpy as np
import pyBigWig
import tiledb
import Bio
from Bio import SeqIO

Expand Down Expand Up @@ -32,11 +33,12 @@ def export_track(momics: Momics, track: str, output: Path) -> Momics:
chrom_sizes = momics.chroms()[["chrom", "length"]].apply(tuple, axis=1).tolist()
bw.addHeader(chrom_sizes)
for chrom, chrom_length in chrom_sizes:
q = MultiRangeQuery(momics, chrom).query_tracks(tracks=[track])
tdb = momics._build_uri("coverage", f"{chrom}.tdb")
with tiledb.open(tdb, "r", ctx=momics.cfg.ctx) as A:
values0 = A.query(attrs=[track])[:][track][1:]
chroms = np.array([chrom] * chrom_length)
starts = np.array(range(chrom_length))
ends = starts + 1
values0 = q.coverage[track][next(iter(q.coverage[track].keys()))] # type: ignore
bw.addEntries(chroms, starts=starts, ends=ends, values=values0)
bw.close()

Expand Down
1 change: 1 addition & 0 deletions src/momics/momics.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,7 @@ def features(self, label: Optional[str] = None) -> Union[pd.DataFrame, pr.PyRang
idx = ft[ft["label"] == label]["idx"].iloc[0]
with tiledb.open(tdb, "r", ctx=self.cfg.ctx) as A:
x = A.query(cond=f"idx=={idx}").df[:]
x.iloc[:, 0] = x.iloc[:, 0].astype(str)
x.iloc[:, 0] = chrom
x.iloc[:, 1] = x.iloc[:, 1] - 1
ranges.append(x)
Expand Down
17 changes: 11 additions & 6 deletions src/momics/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,10 +142,9 @@ def split_ranges(pyranges, ratio=0.8, shuffle=True) -> Tuple[pr.PyRanges, pr.PyR
Returns:
Tuple[pr.PyRanges, pr.PyRanges]: A tuple of two PyRanges objects.
"""
df = pyranges.df
if shuffle:
df = pyranges.df.sample(frac=1, random_state=42).reset_index(drop=True)
else:
df = pyranges.df
split_idx = int(len(df) * ratio)
train_df = df.iloc[:split_idx]
test_df = df.iloc[split_idx:]
Expand All @@ -168,28 +167,34 @@ def pyranges_to_bw(pyranges: pr.PyRanges, scores: np.ndarray, output: str) -> No
Returns:
None
"""
# Abort if output file already exists
if Path(output).exists():
raise FileExistsError(f"Output file '{output}' already exists")

# Check that pyranges length is the same as scores dim 0
if len(pyranges) != scores.shape[0]:
raise ValueError("Length of PyRanges object must be the same as scores dimension 0")

# Check that all pyranges widths are equal to the scores dim 1
widths = pyranges.End - pyranges.Start
widths = pyranges.End - pyranges.Start + 1
if len(set(widths)) != 1:
raise ValueError("All ranges must have the same width")
if next(iter(widths)) != scores.shape[1]:
raise ValueError("All ranges must have the same width as the second dimension of scores")

# Save chrom sizes in header
bw = pyBigWig.open(output, "w")
chrom_sizes = pyranges.df.groupby("Chromosome")["End"].max().to_dict()
chrom_sizes = pyranges.df.groupby("Chromosome", observed=False)["End"].max().to_dict()
chroms = list(chrom_sizes.keys())
sizes = list(chrom_sizes.values())
bw.addHeader(list(zip(chroms, sizes)))

# Iterate over the PyRanges and write corresponding scores
for i, (chrom, start, end) in enumerate(zip(pyranges.Chromosome, pyranges.Start, pyranges.End)):
df = pyranges.df
df.Start = df.Start
for i, (chrom, start, end) in enumerate(zip(df.Chromosome, df.Start, df.End)):
score = scores[i]
positions = list(range(start, end))
positions = list(range(start, end + 1))
bw.addEntries([chrom] * len(positions), positions, ends=[p + 1 for p in positions], values=score)

# Step 4: Close the BigWig file
Expand Down
14 changes: 12 additions & 2 deletions tests/test_momics.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from pathlib import Path
import numpy as np
import pyranges as pr
import pandas as pd
Expand Down Expand Up @@ -93,6 +94,8 @@ def test_Momics_ingest_track(momics_path: str, bw1: str, bw2: str):
@pytest.mark.order(1)
def test_Momics_recover_track(momics_path: str):
mom = momics.Momics(momics_path)
print(mom.path)

with pytest.raises(ValueError, match=r".*not found"):
mom.tracks("bw1323")

Expand All @@ -102,12 +105,12 @@ def test_Momics_recover_track(momics_path: str):
act = {chrom: [0] * length for chrom, length in chrom_sizes.items()}
for chrom, size in chrom_sizes.items():
intervals = [(i, i + 1000, i / 100000) for i in range(0, size, 1000)]
x = [[v] * n for (_, n, v) in intervals]
x = [[v] * 1000 for (_, _, v) in intervals]
arr = np.array([item for sublist in x for item in sublist], dtype=np.float32)
act[chrom] = arr # type: ignore

for chrom in chrom_sizes.keys():
assert cov[chrom].__eq__(act[chrom])
assert cov[chrom].__eq__(act[chrom]).all()


@pytest.mark.order(1)
Expand Down Expand Up @@ -186,3 +189,10 @@ def test_Momics_features(momics_path: str):

ft1 = mom.bins(1000, 2000, cut_last_bin_out=True).df
mom.features("ft1").df[["Chromosome", "Start", "End"]].__eq__(ft1)


# NOTE(review): the very large order index presumably schedules this test
# after every other test that uses the same repository path — confirm.
@pytest.mark.order(99999999)
def test_Momics_remove(momics_path: str):
    """Call ``remove()`` on the repository and check its path is gone.

    Args:
        momics_path: Location of the repository opened by this test.
    """
    repo = momics.Momics(momics_path)
    repo.remove()

    # The path is read from the object after removal, then checked on disk.
    still_on_disk = Path(repo.path).exists()
    assert not still_on_disk
1 change: 1 addition & 0 deletions tests/test_multirangequery.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ def temp_npz_file():
temp_npz_file.unlink()


@pytest.mark.order(2)
def test_to_json_npz(momics_path: str, temp_json_file: Path, temp_npz_file: Path):
mom = momics.Momics(momics_path)
q = MultiRangeQuery(mom, "I:1-10").query_sequence().query_tracks()
Expand Down
2 changes: 1 addition & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,6 @@ def test_pyranges_to_bw():
with pytest.raises(ValueError, match=r"All ranges must have the same width as.*"):
utils.pyranges_to_bw(rg, np.array([[1, 2], [2, 2], [3, 2]]), "out.bw")

utils.pyranges_to_bw(rg, np.array([[0.1] * 9, [0.2] * 9, [0.3] * 9]), "out.bw")
utils.pyranges_to_bw(rg, np.array([[0.1] * 10, [0.2] * 10, [0.3] * 10]), "out.bw")
assert os.path.exists("out.bw")
os.remove("out.bw")

0 comments on commit b5d2a2e

Please sign in to comment.