feat: add sequence support (CLI and API)

js2264 · Sep 18, 2024 · f29c8df · f29c8df
1 parent e8f8762
commit f29c8df
Show file tree

Hide file tree

Showing 6 changed files with 233 additions and 13 deletions.
diff --git a/docs/source/user_guide/get-started.md b/docs/source/user_guide/get-started.md
@@ -30,11 +30,17 @@ In the latter case, you can install `momics` and all its dependencies as follows
 
 ```
 momics create hg19.momics 
+
 momics add chroms -f hg19.chrom.sizes hg19.momics
-momics ls --table chroms hg19.momics
+momics add seq -f hg19.fa hg19.momics 
 momics add tracks -f bw_a=sample1.bw -f bw_b=sample2.bw -f bw_c=sample3.bw hg19.momics 
-momics ls hg19.momics
-momics query --coordinates "I:10-1000" hg19.momics
+
+momics ls --table chroms hg19.momics
+momics ls --table tracks hg19.momics
+
+momics query seq --coordinates "I:10-1000" hg19.momics
+momics query tracks --coordinates "I:10-1000" hg19.momics
+
 momics export --track bw_b --prefix b_exported.bw hg19.momics
 momics remove --track bw_c hg19.momics
 ```
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,6 +29,7 @@ classifiers = [
 dependencies = [
     "tiledb",
     "pyBigWig",
+    "pyfaidx", 
     "numpy", 
     "pandas", 
     "pyarrow>=1.0",

diff --git a/src/momics/api.py b/src/momics/api.py
@@ -5,6 +5,7 @@
 import numpy as np
 import pandas as pd
 import pyBigWig
+import pyfaidx
 import tiledb
 
 from . import utils
@@ -65,6 +66,45 @@ def _get_table(self, tdb: str) -> Optional[pd.DataFrame]:
 
         return a
 
+    def _create_sequence_schema(self, tile: int, compression: int):
+        # Create every path/genome/sequence/{chrom}.tdb
+        chroms = self.chroms()
+        for chrom in chroms["chr"]:
+            chrom_length = np.array(chroms[chroms["chr"] == chrom]["length"])[0]
+            tdb = os.path.join(self.path, "genome", "sequence", f"{chrom}.tdb")
+            dom = tiledb.Domain(
+                tiledb.Dim(
+                    name="position",
+                    domain=(0, chrom_length - 1),
+                    dtype=np.int64,
+                    tile=tile,
+                )
+            )
+            attr = tiledb.Attr(
+                name="nucleotide",
+                dtype="ascii",
+                filters=tiledb.FilterList(
+                    [
+                        tiledb.LZ4Filter(),
+                        tiledb.ZstdFilter(level=compression),
+                    ],
+                    chunksize=1000,
+                ),
+            )
+            schema = tiledb.ArraySchema(
+                domain=dom,
+                attrs=[attr],
+                sparse=False,
+                coords_filters=tiledb.FilterList(
+                    [
+                        tiledb.LZ4Filter(),
+                        tiledb.ZstdFilter(level=compression),
+                    ],
+                    chunksize=1000,
+                ),
+            )
+            tiledb.DenseArray.create(tdb, schema)
+
     def _create_track_schema(self, max_bws: int, tile: int, compression: int):
         # Create path/coverage/tracks.tdb
         tdb = os.path.join(self.path, "coverage", "tracks.tdb")
@@ -146,6 +186,16 @@ def _populate_chroms_table(self, bws: Dict[str, str]):
                         coord2 = np.repeat(idx + n, len(coord1))
                         A[coord1, coord2] = {"scores": arr}
 
+    def _populate_sequence_table(self, fasta: str):
+        chroms = self.chroms()
+        for chrom in chroms["chr"]:
+            tdb = os.path.join(self.path, "genome", "sequence", f"{chrom}.tdb")
+            chrom_length = np.array(chroms[chroms["chr"] == chrom]["length"])[0]
+            with pyfaidx.Fasta(fasta) as fa:
+                chrom_seq = fa.get_seq(chrom, 1, chrom_length)
+            with tiledb.DenseArray(tdb, mode="w") as A:
+                A[:] = {"nucleotide": np.array(list(chrom_seq.seq), dtype="S1")}
+
     def _purge_track(self, track: str):
         idx = self.tracks()["idx"][self.tracks()["label"] == track].values[0]
         qc = f"idx == {idx}"
@@ -174,6 +224,30 @@ def chroms(self) -> pd.DataFrame:
             else pd.DataFrame(columns=["chrom_index", "chr", "length"])
         )
 
+    def sequence(self) -> pd.DataFrame:
+        """
+        Extract sequence table from a `.momics` repository.
+
+        Returns
+        -------
+        pd.DataFrame
+            A data frame listing one chromosome per row, with first/last 10 nts.
+        """
+        try:
+            self.query_sequence(f"{self.chroms()['chr'][0]}:1-2")
+        except tiledb.cc.TileDBError:
+            raise ValueError("Genomic sequence not added yet to the repository.")
+
+        chroms = self.chroms()
+        chroms["seq"] = pd.Series()
+        for chrom in chroms["chr"]:
+            chrom_len = chroms[chroms["chr"] == chrom]["length"].iloc[0]
+            start_nt = "".join(self.query_sequence(f"{chrom}:1-10"))
+            end_nt = "".join(self.query_sequence(f"{chrom}:{chrom_len-10}-{chrom_len}"))
+            chroms.loc[chroms["chr"] == chrom, "seq"] = start_nt + "..." + end_nt
+
+        return chroms
+
     def tracks(self) -> pd.DataFrame:
         """
         Extract table of ingested bigwigs.
@@ -191,6 +265,40 @@ def tracks(self) -> pd.DataFrame:
             else pd.DataFrame(columns=["idx", "label", "path"])
         )
 
+    def add_sequence(self, fasta: str, tile: int = 10000, compression: int = 3):
+        """
+        Ingest multi-sequence fasta file to the `.momics` repository.
+
+        Parameters
+        ----------
+        fasta : str
+            Path to fasta file
+        tile : int
+            Tile size.
+        compression : int
+            Compression level.
+        """
+
+        # Abort if `chroms` have not been filled
+        if self.chroms().empty:
+            raise ValueError("Please fill out `chroms` table first.")
+
+        # Abort if sequence table already exists
+        try:
+            self.query_sequence(f"{self.chroms()['chr'][0]}:1-2")
+            raise ValueError("Sequence already added to the repository.")
+        except tiledb.cc.TileDBError:
+            pass
+
+        # Abort if chr lengths in provided fasta do not match those in `chroms`
+        utils._check_fasta_lengths(fasta, self.chroms())
+
+        # Create sequence tables schema
+        self._create_sequence_schema(tile, compression)
+
+        # Populate each `path/genome/sequence/{chrom}.tdb`
+        self._populate_sequence_table(fasta)
+
     def add_tracks(
         self, bws: dict, max_bws: int = 9999, tile: int = 10000, compression: int = 3
     ):
@@ -269,35 +377,72 @@ def add_chroms(self, chr_lengths: dict, genome_version: str = ""):
             array.meta["genome_assembly_version"] = genome_version
             array.meta["timestamp"] = datetime.now().isoformat()
 
-    def query(
+    def query_tracks(
         self,
         query: str,
+        with_seq: bool = False,
     ):
         """
         Query bigwig coverage tracks from a `.momics` repository.
 
         Parameters
         ----------
-        chr_lengths : str
+        query : str
             UCSC-style chromosome interval (e.g. "II:12001-15000")
+        with_seq : bool
+            Append sequence to the resulting DataFrame
         """
         tr = self.tracks().drop("path", axis=1)
         if ":" in query:
             chrom, range_part = query.split(":")
             utils._check_chr_name(chrom, self.chroms())
-            start = range_part.split("-")[0]
-            end = range_part.split("-")[1]
+            start = int(range_part.split("-")[0]) - 1
+            end = int(range_part.split("-")[1]) - 1
             with tiledb.open(f"{self.path}/coverage/{chrom}.tdb", "r") as array:
-                data = array.df[int(start) : int(end), :]
+                data = array.df[start:end, :]
         else:
             chrom = query
             utils._check_chr_name(chrom, self.chroms())
             with tiledb.open(f"{self.path}/coverage/{chrom}.tdb", "r") as array:
                 data = array.df[:]
 
         mdata = pd.merge(tr, data, on="idx").drop("idx", axis=1)
+
+        if with_seq:
+            seq = self.query_sequence(query)
+            p = pd.DataFrame({"seq": seq})
+            p = pd.concat([p] * len(tr), ignore_index=True)
+            mdata["seq"] = p["seq"]
+
         return mdata
 
+    def query_sequence(
+        self,
+        query: str,
+    ):
+        """
+        Query chromosome sequence from a `.momics` repository.
+
+        Parameters
+        ----------
+        chr_lengths : str
+            UCSC-style chromosome interval (e.g. "II:12001-15000")
+        """
+        if ":" in query:
+            chrom, range_part = query.split(":")
+            utils._check_chr_name(chrom, self.chroms())
+            start = int(range_part.split("-")[0]) - 1
+            end = int(range_part.split("-")[1]) - 1
+            with tiledb.open(f"{self.path}/genome/sequence/{chrom}.tdb", "r") as A:
+                seq = A.df[start:end]["nucleotide"]
+        else:
+            chrom = query
+            utils._check_chr_name(chrom, self.chroms())
+            with tiledb.open(f"{self.path}/genome/sequence/{chrom}.tdb", "r") as A:
+                seq = A.df[:]["nucleotide"]
+
+        return seq
+
     def remove_track(self, track: str):
         """
         Remove a track from a `.momics` repository.
@@ -336,7 +481,7 @@ def export_track(self, track: str, prefix: str):
         bw.addHeader(chrom_sizes)
         for chrom, _ in chrom_sizes:
             print(chrom)
-            q = self.query(chrom)
+            q = self.query_tracks(chrom)
             q = q[q["label"] == track].dropna(subset=["scores"])
             # starts = q["position"].to_numpy(dtype=np.int64)
             # ends = starts + 1

diff --git a/src/momics/cli/add.py b/src/momics/cli/add.py
@@ -58,3 +58,20 @@ def tracks(ctx, file, path):
     m = api.Momics(path, create=False)
     m.add_tracks(fs)
     print(m.tracks().iloc[np.where(m.tracks()["label"] != "None")].iloc[:, 0:2])
+
+
+@add.command()
+@click.option(
+    "--file",
+    "-f",
+    help="Fasta file",
+    type=click.Path(exists=True),
+    required=True,
+)
+@click.argument("path", metavar="MOMICS_REPO", required=True)
+@click.pass_context
+def seq(ctx, file, path):
+    """Add genomic sequence to Momics."""
+    m = api.Momics(path, create=False)
+    m.add_sequence(file)
+    print(m.sequence())
diff --git a/src/momics/cli/query.py b/src/momics/cli/query.py
@@ -5,7 +5,13 @@
 from . import cli
 
 
-@cli.command()
+@cli.group()
+@click.pass_context
+def query(ctx):
+    """Query a Momics table"""
+
+
+@query.command()
 @click.option(
     "--coordinates",
     "-c",
@@ -23,10 +29,44 @@
     show_default=True,
 )
 @click.argument("path", metavar="MOMICS_REPO", type=click.Path(exists=True))
-def query(path, coordinates, output: str):
-    """Extract coverages over a chromosome interval."""
-    res = api.Momics(path, create=False).query(coordinates)
+@click.pass_context
+def tracks(ctx, path, coordinates, output: str):
+    """Extract track coverages over a chromosome interval."""
+    res = api.Momics(path, create=False).query_tracks(coordinates)
     if output is not None:
         res.to_csv(path_or_buf=output, sep="\t", index=False)
     else:
         print(res)
+
+
+@query.command()
+@click.option(
+    "--coordinates",
+    "-c",
+    help="UCSC-style coordinates",
+    type=str,
+    required=True,
+)
+@click.option(
+    "--output",
+    "-o",
+    help="Output file to save data in (data will be exported as fasta)",
+    type=click.Path(),
+    required=False,
+    default=None,
+    show_default=True,
+)
+@click.argument("path", metavar="MOMICS_REPO", type=click.Path(exists=True))
+@click.pass_context
+def seq(ctx, path, coordinates, output: str):
+    """Extract chromosomal sequence over a chromosome interval."""
+    seq = api.Momics(path, create=False).query_sequence(coordinates)
+    seq = "".join(seq)
+    if output is not None:
+        with open(output, "w") as file:
+            file.write(f">{coordinates}\n")
+            # Split the sequence into lines of 60 characters
+            for i in range(0, len(seq), 60):
+                file.write(seq[i : i + 60] + "\n")
+    else:
+        print(seq)
diff --git a/src/momics/utils.py b/src/momics/utils.py
@@ -1,4 +1,5 @@
 import pyBigWig
+import pyfaidx
 
 
 def get_chr_lengths(bw):
@@ -8,6 +9,16 @@ def get_chr_lengths(bw):
     return a
 
 
+def _check_fasta_lengths(fasta, chroms):
+    reference_lengths = dict(zip(chroms["chr"], chroms["length"]))
+    with pyfaidx.Fasta(fasta) as fa:
+        lengths = {name: len(seq) for name, seq in fa.items()}
+    if lengths != reference_lengths:
+        raise Exception(
+            f"{fa} file do not have identical chromomosome lengths."
+        )
+
+
 def _check_chr_lengths(bw_files, chroms):
     reference_lengths = dict(zip(chroms["chr"], chroms["length"]))
     for file in list(bw_files.values()):