From b5d2a2e886e55018dc465ce0e229f30bd895139a Mon Sep 17 00:00:00 2001 From: js2264 Date: Fri, 18 Oct 2024 17:00:52 +0200 Subject: [PATCH] fix: some bugs --- src/momics/dataloader.py | 2 +- src/momics/export.py | 6 ++++-- src/momics/momics.py | 1 + src/momics/utils.py | 17 +++++++++++------ tests/test_momics.py | 14 ++++++++++++-- tests/test_multirangequery.py | 1 + tests/test_utils.py | 2 +- 7 files changed, 31 insertions(+), 12 deletions(-) diff --git a/src/momics/dataloader.py b/src/momics/dataloader.py index 97d2a7a..d8bb0ca 100644 --- a/src/momics/dataloader.py +++ b/src/momics/dataloader.py @@ -75,7 +75,7 @@ def __init__( self.features = features self.target = target - if target_size is not None and target_size >= int(widths[0]): + if target_size is not None and target_size > int(widths[0]): raise ValueError("Target size must be smaller than the features width.") self.target_size = target_size diff --git a/src/momics/export.py b/src/momics/export.py index 73fef6f..e974374 100644 --- a/src/momics/export.py +++ b/src/momics/export.py @@ -3,6 +3,7 @@ from pathlib import Path import numpy as np import pyBigWig +import tiledb import Bio from Bio import SeqIO @@ -32,11 +33,12 @@ def export_track(momics: Momics, track: str, output: Path) -> Momics: chrom_sizes = momics.chroms()[["chrom", "length"]].apply(tuple, axis=1).tolist() bw.addHeader(chrom_sizes) for chrom, chrom_length in chrom_sizes: - q = MultiRangeQuery(momics, chrom).query_tracks(tracks=[track]) + tdb = momics._build_uri("coverage", f"{chrom}.tdb") + with tiledb.open(tdb, "r", ctx=momics.cfg.ctx) as A: + values0 = A.query(attrs=[track])[:][track][1:] chroms = np.array([chrom] * chrom_length) starts = np.array(range(chrom_length)) ends = starts + 1 - values0 = q.coverage[track][next(iter(q.coverage[track].keys()))] # type: ignore bw.addEntries(chroms, starts=starts, ends=ends, values=values0) bw.close() diff --git a/src/momics/momics.py b/src/momics/momics.py index 3fb5d7c..e03a413 100644 --- a/src/momics/momics.py +++ b/src/momics/momics.py @@ -533,6 +533,7 @@ def features(self, label: Optional[str] = None) -> Union[pd.DataFrame, pr.PyRang idx = ft[ft["label"] == label]["idx"].iloc[0] with tiledb.open(tdb, "r", ctx=self.cfg.ctx) as A: x = A.query(cond=f"idx=={idx}").df[:] + x.iloc[:, 0] = x.iloc[:, 0].astype(str) x.iloc[:, 0] = chrom x.iloc[:, 1] = x.iloc[:, 1] - 1 ranges.append(x) diff --git a/src/momics/utils.py b/src/momics/utils.py index 35434f6..9d8df3f 100644 --- a/src/momics/utils.py +++ b/src/momics/utils.py @@ -142,10 +142,9 @@ def split_ranges(pyranges, ratio=0.8, shuffle=True) -> Tuple[pr.PyRanges, pr.PyR Returns: Tuple[pr.PyRanges, pr.PyRanges]: A tuple of two PyRanges objects. """ + df = pyranges.df if shuffle: df = pyranges.df.sample(frac=1, random_state=42).reset_index(drop=True) - else: - df = pyranges.df split_idx = int(len(df) * ratio) train_df = df.iloc[:split_idx] test_df = df.iloc[split_idx:] @@ -168,12 +167,16 @@ def pyranges_to_bw(pyranges: pr.PyRanges, scores: np.ndarray, output: str) -> No Returns: None """ + # Abort if output file already exists + if Path(output).exists(): + raise FileExistsError(f"Output file '{output}' already exists") + # Check that pyranges length is the same as scores dim 0 if len(pyranges) != scores.shape[0]: raise ValueError("Length of PyRanges object must be the same as scores dimension 0") # Check that all pyranges widths are equal to the scores dim 1 - widths = pyranges.End - pyranges.Start + widths = pyranges.End - pyranges.Start + 1 if len(set(widths)) != 1: raise ValueError("All ranges must have the same width") if next(iter(widths)) != scores.shape[1]: @@ -181,15 +184,17 @@ def pyranges_to_bw(pyranges: pr.PyRanges, scores: np.ndarray, output: str) -> No # Save chrom sizes in header bw = pyBigWig.open(output, "w") - chrom_sizes = pyranges.df.groupby("Chromosome")["End"].max().to_dict() + chrom_sizes = pyranges.df.groupby("Chromosome", observed=False)["End"].max().to_dict() chroms = list(chrom_sizes.keys()) sizes = list(chrom_sizes.values()) bw.addHeader(list(zip(chroms, sizes))) # Iterate over the PyRanges and write corresponding scores - for i, (chrom, start, end) in enumerate(zip(pyranges.Chromosome, pyranges.Start, pyranges.End)): + df = pyranges.df + df.Start = df.Start + for i, (chrom, start, end) in enumerate(zip(df.Chromosome, df.Start, df.End)): score = scores[i] - positions = list(range(start, end)) + positions = list(range(start, end + 1)) bw.addEntries([chrom] * len(positions), positions, ends=[p + 1 for p in positions], values=score) # Step 4: Close the BigWig file diff --git a/tests/test_momics.py b/tests/test_momics.py index 4e52ad3..fed66fe 100644 --- a/tests/test_momics.py +++ b/tests/test_momics.py @@ -1,3 +1,4 @@ +from pathlib import Path import numpy as np import pyranges as pr import pandas as pd @@ -93,6 +94,8 @@ def test_Momics_ingest_track(momics_path: str, bw1: str, bw2: str): @pytest.mark.order(1) def test_Momics_recover_track(momics_path: str): mom = momics.Momics(momics_path) + print(mom.path) + with pytest.raises(ValueError, match=r".*not found"): mom.tracks("bw1323") @@ -102,12 +105,12 @@ def test_Momics_recover_track(momics_path: str): act = {chrom: [0] * length for chrom, length in chrom_sizes.items()} for chrom, size in chrom_sizes.items(): intervals = [(i, i + 1000, i / 100000) for i in range(0, size, 1000)] - x = [[v] * n for (_, n, v) in intervals] + x = [[v] * 1000 for (_, _, v) in intervals] arr = np.array([item for sublist in x for item in sublist], dtype=np.float32) act[chrom] = arr # type: ignore for chrom in chrom_sizes.keys(): - assert cov[chrom].__eq__(act[chrom]) + assert cov[chrom].__eq__(act[chrom]).all() @pytest.mark.order(1) @@ -186,3 +189,10 @@ def test_Momics_features(momics_path: str): ft1 = mom.bins(1000, 2000, cut_last_bin_out=True).df mom.features("ft1").df[["Chromosome", "Start", "End"]].__eq__(ft1) + + +@pytest.mark.order(99999999) +def test_Momics_remove(momics_path: str): + mom = momics.Momics(momics_path) + mom.remove() + assert not Path(mom.path).exists() diff --git a/tests/test_multirangequery.py b/tests/test_multirangequery.py index 3b5c630..167e563 100644 --- a/tests/test_multirangequery.py +++ b/tests/test_multirangequery.py @@ -111,6 +111,7 @@ def temp_npz_file(): temp_npz_file.unlink() +@pytest.mark.order(2) def test_to_json_npz(momics_path: str, temp_json_file: Path, temp_npz_file: Path): mom = momics.Momics(momics_path) q = MultiRangeQuery(mom, "I:1-10").query_sequence().query_tracks() diff --git a/tests/test_utils.py b/tests/test_utils.py index ed28b19..f50c053 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -77,6 +77,6 @@ def test_pyranges_to_bw(): with pytest.raises(ValueError, match=r"All ranges must have the same width as.*"): utils.pyranges_to_bw(rg, np.array([[1, 2], [2, 2], [3, 2]]), "out.bw") - utils.pyranges_to_bw(rg, np.array([[0.1] * 9, [0.2] * 9, [0.3] * 9]), "out.bw") + utils.pyranges_to_bw(rg, np.array([[0.1] * 10, [0.2] * 10, [0.3] * 10]), "out.bw") assert os.path.exists("out.bw") os.remove("out.bw")