Skip to content

Commit

Permalink
feat: add sequence support (CLI and API)
Browse files Browse the repository at this point in the history
  • Loading branch information
js2264 committed Sep 18, 2024
1 parent e8f8762 commit f29c8df
Show file tree
Hide file tree
Showing 6 changed files with 233 additions and 13 deletions.
12 changes: 9 additions & 3 deletions docs/source/user_guide/get-started.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,17 @@ In the latter case, you can install `momics` and all its dependencies as follows

```
momics create hg19.momics
momics add chroms -f hg19.chrom.sizes hg19.momics
momics ls --table chroms hg19.momics
momics add seq -f hg19.fa hg19.momics
momics add tracks -f bw_a=sample1.bw -f bw_b=sample2.bw -f bw_c=sample3.bw hg19.momics
momics ls hg19.momics
momics query --coordinates "I:10-1000" hg19.momics
momics ls --table chroms hg19.momics
momics ls --table tracks hg19.momics
momics query seq --coordinates "I:10-1000" hg19.momics
momics query tracks --coordinates "I:10-1000" hg19.momics
momics export --track bw_b --prefix b_exported.bw hg19.momics
momics remove --track bw_c hg19.momics
```
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ classifiers = [
dependencies = [
"tiledb",
"pyBigWig",
"pyfaidx",
"numpy",
"pandas",
"pyarrow>=1.0",
Expand Down
157 changes: 151 additions & 6 deletions src/momics/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
import pandas as pd
import pyBigWig
import pyfaidx
import tiledb

from . import utils
Expand Down Expand Up @@ -65,6 +66,45 @@ def _get_table(self, tdb: str) -> Optional[pd.DataFrame]:

return a

def _create_sequence_schema(self, tile: int, compression: int):
# Create every path/genome/sequence/{chrom}.tdb
chroms = self.chroms()
for chrom in chroms["chr"]:
chrom_length = np.array(chroms[chroms["chr"] == chrom]["length"])[0]
tdb = os.path.join(self.path, "genome", "sequence", f"{chrom}.tdb")
dom = tiledb.Domain(
tiledb.Dim(
name="position",
domain=(0, chrom_length - 1),
dtype=np.int64,
tile=tile,
)
)
attr = tiledb.Attr(
name="nucleotide",
dtype="ascii",
filters=tiledb.FilterList(
[
tiledb.LZ4Filter(),
tiledb.ZstdFilter(level=compression),
],
chunksize=1000,
),
)
schema = tiledb.ArraySchema(
domain=dom,
attrs=[attr],
sparse=False,
coords_filters=tiledb.FilterList(
[
tiledb.LZ4Filter(),
tiledb.ZstdFilter(level=compression),
],
chunksize=1000,
),
)
tiledb.DenseArray.create(tdb, schema)

def _create_track_schema(self, max_bws: int, tile: int, compression: int):
# Create path/coverage/tracks.tdb
tdb = os.path.join(self.path, "coverage", "tracks.tdb")
Expand Down Expand Up @@ -146,6 +186,16 @@ def _populate_chroms_table(self, bws: Dict[str, str]):
coord2 = np.repeat(idx + n, len(coord1))
A[coord1, coord2] = {"scores": arr}

def _populate_sequence_table(self, fasta: str):
chroms = self.chroms()
for chrom in chroms["chr"]:
tdb = os.path.join(self.path, "genome", "sequence", f"{chrom}.tdb")
chrom_length = np.array(chroms[chroms["chr"] == chrom]["length"])[0]
with pyfaidx.Fasta(fasta) as fa:
chrom_seq = fa.get_seq(chrom, 1, chrom_length)
with tiledb.DenseArray(tdb, mode="w") as A:
A[:] = {"nucleotide": np.array(list(chrom_seq.seq), dtype="S1")}

def _purge_track(self, track: str):
idx = self.tracks()["idx"][self.tracks()["label"] == track].values[0]
qc = f"idx == {idx}"
Expand Down Expand Up @@ -174,6 +224,30 @@ def chroms(self) -> pd.DataFrame:
else pd.DataFrame(columns=["chrom_index", "chr", "length"])
)

def sequence(self) -> pd.DataFrame:
"""
Extract sequence table from a `.momics` repository.
Returns
-------
pd.DataFrame
A data frame listing one chromosome per row, with first/last 10 nts.
"""
try:
self.query_sequence(f"{self.chroms()['chr'][0]}:1-2")
except tiledb.cc.TileDBError:
raise ValueError("Genomic sequence not added yet to the repository.")

chroms = self.chroms()
chroms["seq"] = pd.Series()
for chrom in chroms["chr"]:
chrom_len = chroms[chroms["chr"] == chrom]["length"].iloc[0]
start_nt = "".join(self.query_sequence(f"{chrom}:1-10"))
end_nt = "".join(self.query_sequence(f"{chrom}:{chrom_len-10}-{chrom_len}"))
chroms.loc[chroms["chr"] == chrom, "seq"] = start_nt + "..." + end_nt

return chroms

def tracks(self) -> pd.DataFrame:
"""
Extract table of ingested bigwigs.
Expand All @@ -191,6 +265,40 @@ def tracks(self) -> pd.DataFrame:
else pd.DataFrame(columns=["idx", "label", "path"])
)

def add_sequence(self, fasta: str, tile: int = 10000, compression: int = 3):
"""
Ingest multi-sequence fasta file to the `.momics` repository.
Parameters
----------
fasta : str
Path to fasta file
tile : int
Tile size.
compression : int
Compression level.
"""

# Abort if `chroms` have not been filled
if self.chroms().empty:
raise ValueError("Please fill out `chroms` table first.")

# Abort if sequence table already exists
try:
self.query_sequence(f"{self.chroms()['chr'][0]}:1-2")
raise ValueError("Sequence already added to the repository.")
except tiledb.cc.TileDBError:
pass

# Abort if chr lengths in provided fasta do not match those in `chroms`
utils._check_fasta_lengths(fasta, self.chroms())

# Create sequence tables schema
self._create_sequence_schema(tile, compression)

# Populate each `path/genome/sequence/{chrom}.tdb`
self._populate_sequence_table(fasta)

def add_tracks(
self, bws: dict, max_bws: int = 9999, tile: int = 10000, compression: int = 3
):
Expand Down Expand Up @@ -269,35 +377,72 @@ def add_chroms(self, chr_lengths: dict, genome_version: str = ""):
array.meta["genome_assembly_version"] = genome_version
array.meta["timestamp"] = datetime.now().isoformat()

def query(
def query_tracks(
self,
query: str,
with_seq: bool = False,
):
"""
Query bigwig coverage tracks from a `.momics` repository.
Parameters
----------
chr_lengths : str
query : str
UCSC-style chromosome interval (e.g. "II:12001-15000")
with_seq : bool
Append sequence to the resulting DataFrame
"""
tr = self.tracks().drop("path", axis=1)
if ":" in query:
chrom, range_part = query.split(":")
utils._check_chr_name(chrom, self.chroms())
start = range_part.split("-")[0]
end = range_part.split("-")[1]
start = int(range_part.split("-")[0]) - 1
end = int(range_part.split("-")[1]) - 1
with tiledb.open(f"{self.path}/coverage/{chrom}.tdb", "r") as array:
data = array.df[int(start) : int(end), :]
data = array.df[start:end, :]
else:
chrom = query
utils._check_chr_name(chrom, self.chroms())
with tiledb.open(f"{self.path}/coverage/{chrom}.tdb", "r") as array:
data = array.df[:]

mdata = pd.merge(tr, data, on="idx").drop("idx", axis=1)

if with_seq:
seq = self.query_sequence(query)
p = pd.DataFrame({"seq": seq})
p = pd.concat([p] * len(tr), ignore_index=True)
mdata["seq"] = p["seq"]

return mdata

def query_sequence(
self,
query: str,
):
"""
Query chromosome sequence from a `.momics` repository.
Parameters
----------
chr_lengths : str
UCSC-style chromosome interval (e.g. "II:12001-15000")
"""
if ":" in query:
chrom, range_part = query.split(":")
utils._check_chr_name(chrom, self.chroms())
start = int(range_part.split("-")[0]) - 1
end = int(range_part.split("-")[1]) - 1
with tiledb.open(f"{self.path}/genome/sequence/{chrom}.tdb", "r") as A:
seq = A.df[start:end]["nucleotide"]
else:
chrom = query
utils._check_chr_name(chrom, self.chroms())
with tiledb.open(f"{self.path}/genome/sequence/{chrom}.tdb", "r") as A:
seq = A.df[:]["nucleotide"]

return seq

def remove_track(self, track: str):
"""
Remove a track from a `.momics` repository.
Expand Down Expand Up @@ -336,7 +481,7 @@ def export_track(self, track: str, prefix: str):
bw.addHeader(chrom_sizes)
for chrom, _ in chrom_sizes:
print(chrom)
q = self.query(chrom)
q = self.query_tracks(chrom)
q = q[q["label"] == track].dropna(subset=["scores"])
# starts = q["position"].to_numpy(dtype=np.int64)
# ends = starts + 1
Expand Down
17 changes: 17 additions & 0 deletions src/momics/cli/add.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,20 @@ def tracks(ctx, file, path):
m = api.Momics(path, create=False)
m.add_tracks(fs)
print(m.tracks().iloc[np.where(m.tracks()["label"] != "None")].iloc[:, 0:2])


@add.command()
@click.option(
"--file",
"-f",
help="Fasta file",
type=click.Path(exists=True),
required=True,
)
@click.argument("path", metavar="MOMICS_REPO", required=True)
@click.pass_context
def seq(ctx, file, path):
"""Add genomic sequence to Momics."""
m = api.Momics(path, create=False)
m.add_sequence(file)
print(m.sequence())
48 changes: 44 additions & 4 deletions src/momics/cli/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,13 @@
from . import cli


@cli.command()
@cli.group()
@click.pass_context
def query(ctx):
"""Query a Momics table"""


@query.command()
@click.option(
"--coordinates",
"-c",
Expand All @@ -23,10 +29,44 @@
show_default=True,
)
@click.argument("path", metavar="MOMICS_REPO", type=click.Path(exists=True))
def query(path, coordinates, output: str):
"""Extract coverages over a chromosome interval."""
res = api.Momics(path, create=False).query(coordinates)
@click.pass_context
def tracks(ctx, path, coordinates, output: str):
"""Extract track coverages over a chromosome interval."""
res = api.Momics(path, create=False).query_tracks(coordinates)
if output is not None:
res.to_csv(path_or_buf=output, sep="\t", index=False)
else:
print(res)


@query.command()
@click.option(
"--coordinates",
"-c",
help="UCSC-style coordinates",
type=str,
required=True,
)
@click.option(
"--output",
"-o",
help="Output file to save data in (data will be exported as fasta)",
type=click.Path(),
required=False,
default=None,
show_default=True,
)
@click.argument("path", metavar="MOMICS_REPO", type=click.Path(exists=True))
@click.pass_context
def seq(ctx, path, coordinates, output: str):
"""Extract chromosomal sequence over a chromosome interval."""
seq = api.Momics(path, create=False).query_sequence(coordinates)
seq = "".join(seq)
if output is not None:
with open(output, "w") as file:
file.write(f">{coordinates}\n")
# Split the sequence into lines of 60 characters
for i in range(0, len(seq), 60):
file.write(seq[i : i + 60] + "\n")
else:
print(seq)
11 changes: 11 additions & 0 deletions src/momics/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pyBigWig
import pyfaidx


def get_chr_lengths(bw):
Expand All @@ -8,6 +9,16 @@ def get_chr_lengths(bw):
return a


def _check_fasta_lengths(fasta, chroms):
reference_lengths = dict(zip(chroms["chr"], chroms["length"]))
with pyfaidx.Fasta(fasta) as fa:
lengths = {name: len(seq) for name, seq in fa.items()}
if lengths != reference_lengths:
raise Exception(
f"{fa} file do not have identical chromomosome lengths."
)


def _check_chr_lengths(bw_files, chroms):
reference_lengths = dict(zip(chroms["chr"], chroms["length"]))
for file in list(bw_files.values()):
Expand Down

0 comments on commit f29c8df

Please sign in to comment.