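fix: off-by-one range widths in pyranges_to_bw and dataloader, direct TileDB read in export_track, and test assertions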

js2264 · Oct 18, 2024 · b5d2a2e
1 parent 1893acb
commit b5d2a2e
Show file tree

Hide file tree

Showing 7 changed files with 31 additions and 12 deletions.
diff --git a/src/momics/dataloader.py b/src/momics/dataloader.py
@@ -75,7 +75,7 @@ def __init__(
         self.features = features
         self.target = target
 
-        if target_size is not None and target_size >= int(widths[0]):
+        if target_size is not None and target_size > int(widths[0]):
             raise ValueError("Target size must be smaller than the features width.")
         self.target_size = target_size
 

diff --git a/src/momics/export.py b/src/momics/export.py
@@ -3,6 +3,7 @@
 from pathlib import Path
 import numpy as np
 import pyBigWig
+import tiledb
 import Bio
 from Bio import SeqIO
 
@@ -32,11 +33,12 @@ def export_track(momics: Momics, track: str, output: Path) -> Momics:
     chrom_sizes = momics.chroms()[["chrom", "length"]].apply(tuple, axis=1).tolist()
     bw.addHeader(chrom_sizes)
     for chrom, chrom_length in chrom_sizes:
-        q = MultiRangeQuery(momics, chrom).query_tracks(tracks=[track])
+        tdb = momics._build_uri("coverage", f"{chrom}.tdb")
+        with tiledb.open(tdb, "r", ctx=momics.cfg.ctx) as A:
+            values0 = A.query(attrs=[track])[:][track][1:]
         chroms = np.array([chrom] * chrom_length)
         starts = np.array(range(chrom_length))
         ends = starts + 1
-        values0 = q.coverage[track][next(iter(q.coverage[track].keys()))]  # type: ignore
         bw.addEntries(chroms, starts=starts, ends=ends, values=values0)
     bw.close()
 

diff --git a/src/momics/momics.py b/src/momics/momics.py
@@ -533,6 +533,7 @@ def features(self, label: Optional[str] = None) -> Union[pd.DataFrame, pr.PyRang
                 idx = ft[ft["label"] == label]["idx"].iloc[0]
                 with tiledb.open(tdb, "r", ctx=self.cfg.ctx) as A:
                     x = A.query(cond=f"idx=={idx}").df[:]
+                    x.iloc[:, 0] = x.iloc[:, 0].astype(str)
                     x.iloc[:, 0] = chrom
                     x.iloc[:, 1] = x.iloc[:, 1] - 1
                     ranges.append(x)

diff --git a/src/momics/utils.py b/src/momics/utils.py
@@ -142,10 +142,9 @@ def split_ranges(pyranges, ratio=0.8, shuffle=True) -> Tuple[pr.PyRanges, pr.PyR
     Returns:
         Tuple[pr.PyRanges, pr.PyRanges]: A tuple of two PyRanges objects.
     """
+    df = pyranges.df
     if shuffle:
         df = pyranges.df.sample(frac=1, random_state=42).reset_index(drop=True)
-    else:
-        df = pyranges.df
     split_idx = int(len(df) * ratio)
     train_df = df.iloc[:split_idx]
     test_df = df.iloc[split_idx:]
@@ -168,28 +167,34 @@ def pyranges_to_bw(pyranges: pr.PyRanges, scores: np.ndarray, output: str) -> No
     Returns:
         None
     """
+    # Abort if output file already exists
+    if Path(output).exists():
+        raise FileExistsError(f"Output file '{output}' already exists")
+
     # Check that pyranges length is the same as scores dim 0
     if len(pyranges) != scores.shape[0]:
         raise ValueError("Length of PyRanges object must be the same as scores dimension 0")
 
     # Check that all pyranges widths are equal to the scores dim 1
-    widths = pyranges.End - pyranges.Start
+    widths = pyranges.End - pyranges.Start + 1
     if len(set(widths)) != 1:
         raise ValueError("All ranges must have the same width")
     if next(iter(widths)) != scores.shape[1]:
         raise ValueError("All ranges must have the same width as the second dimension of scores")
 
     # Save chrom sizes in header
     bw = pyBigWig.open(output, "w")
-    chrom_sizes = pyranges.df.groupby("Chromosome")["End"].max().to_dict()
+    chrom_sizes = pyranges.df.groupby("Chromosome", observed=False)["End"].max().to_dict()
     chroms = list(chrom_sizes.keys())
     sizes = list(chrom_sizes.values())
     bw.addHeader(list(zip(chroms, sizes)))
 
     # Iterate over the PyRanges and write corresponding scores
-    for i, (chrom, start, end) in enumerate(zip(pyranges.Chromosome, pyranges.Start, pyranges.End)):
+    df = pyranges.df
+    df.Start = df.Start
+    for i, (chrom, start, end) in enumerate(zip(df.Chromosome, df.Start, df.End)):
         score = scores[i]
-        positions = list(range(start, end))
+        positions = list(range(start, end + 1))
         bw.addEntries([chrom] * len(positions), positions, ends=[p + 1 for p in positions], values=score)
 
     # Step 4: Close the BigWig file

diff --git a/tests/test_momics.py b/tests/test_momics.py
@@ -1,3 +1,4 @@
+from pathlib import Path
 import numpy as np
 import pyranges as pr
 import pandas as pd
@@ -93,6 +94,8 @@ def test_Momics_ingest_track(momics_path: str, bw1: str, bw2: str):
 @pytest.mark.order(1)
 def test_Momics_recover_track(momics_path: str):
     mom = momics.Momics(momics_path)
+    print(mom.path)
+
     with pytest.raises(ValueError, match=r".*not found"):
         mom.tracks("bw1323")
 
@@ -102,12 +105,12 @@ def test_Momics_recover_track(momics_path: str):
     act = {chrom: [0] * length for chrom, length in chrom_sizes.items()}
     for chrom, size in chrom_sizes.items():
         intervals = [(i, i + 1000, i / 100000) for i in range(0, size, 1000)]
-        x = [[v] * n for (_, n, v) in intervals]
+        x = [[v] * 1000 for (_, _, v) in intervals]
         arr = np.array([item for sublist in x for item in sublist], dtype=np.float32)
         act[chrom] = arr  # type: ignore
 
     for chrom in chrom_sizes.keys():
-        assert cov[chrom].__eq__(act[chrom])
+        assert cov[chrom].__eq__(act[chrom]).all()
 
 
 @pytest.mark.order(1)
@@ -186,3 +189,10 @@ def test_Momics_features(momics_path: str):
 
     ft1 = mom.bins(1000, 2000, cut_last_bin_out=True).df
     mom.features("ft1").df[["Chromosome", "Start", "End"]].__eq__(ft1)
+
+
+@pytest.mark.order(99999999)
+def test_Momics_remove(momics_path: str):
+    mom = momics.Momics(momics_path)
+    mom.remove()
+    assert not Path(mom.path).exists()
diff --git a/tests/test_multirangequery.py b/tests/test_multirangequery.py
@@ -111,6 +111,7 @@ def temp_npz_file():
     temp_npz_file.unlink()
 
 
+@pytest.mark.order(2)
 def test_to_json_npz(momics_path: str, temp_json_file: Path, temp_npz_file: Path):
     mom = momics.Momics(momics_path)
     q = MultiRangeQuery(mom, "I:1-10").query_sequence().query_tracks()

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -77,6 +77,6 @@ def test_pyranges_to_bw():
     with pytest.raises(ValueError, match=r"All ranges must have the same width as.*"):
         utils.pyranges_to_bw(rg, np.array([[1, 2], [2, 2], [3, 2]]), "out.bw")
 
-    utils.pyranges_to_bw(rg, np.array([[0.1] * 9, [0.2] * 9, [0.3] * 9]), "out.bw")
+    utils.pyranges_to_bw(rg, np.array([[0.1] * 10, [0.2] * 10, [0.3] * 10]), "out.bw")
     assert os.path.exists("out.bw")
     os.remove("out.bw")