Skip to content

Commit

Permalink
Merge pull request #29 from GavinHuttley/develop
Browse files Browse the repository at this point in the history
ENH: new feature and code tidy
  • Loading branch information
GavinHuttley authored Nov 13, 2023
2 parents 9fefce9 + e43353b commit 0be9ae3
Show file tree
Hide file tree
Showing 9 changed files with 199 additions and 37 deletions.
5 changes: 3 additions & 2 deletions src/ensembl_lite/_aligndb.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import numpy

from ensembl_lite._db_base import SqliteDbMixin
from ensembl_lite._db_base import SqliteDbMixin, _compressed_array_proxy


class AlignRecordType(typing.TypedDict):
Expand Down Expand Up @@ -30,7 +30,7 @@ class AlignDb(SqliteDbMixin):
"start": "INTEGER",
"end": "INTEGER",
"strand": "TEXT",
"gap_spans": "array",
"gap_spans": "compressed_array",
}

def __init__(self, *, source=":memory:"):
Expand All @@ -54,6 +54,7 @@ def add_records(self, records: typing.Sequence[AlignRecordType]):
).fetchall()
]
for i in range(len(records)):
records[i]["gap_spans"] = _compressed_array_proxy(records[i]["gap_spans"])
records[i] = [records[i][c] for c in col_order]

val_placeholder = ", ".join("?" * len(col_order))
Expand Down
36 changes: 25 additions & 11 deletions src/ensembl_lite/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
INSTALLED_CONFIG_NAME = "installed.cfg"
DOWNLOADED_CONFIG_NAME = "downloaded.cfg"

_COMPARA_NAME = "compara"
_ALIGNS_NAME = "aligns"
_HOMOLOGIES_NAME = "homologies"
_GENOMES_NAME = "genomes"


@dataclass
class Config:
Expand Down Expand Up @@ -38,27 +43,27 @@ def db_names(self) -> Iterable[str]:

@property
def staging_genomes(self):
return self.staging_path / "genomes"
return self.staging_path / _GENOMES_NAME

@property
def install_genomes(self):
return self.install_path / "genomes"
return self.install_path / _GENOMES_NAME

@property
def staging_homologies(self):
return self.staging_path / "compara" / "homologies"
return self.staging_path / _COMPARA_NAME / _HOMOLOGIES_NAME

@property
def install_homologies(self):
return self.install_path / "compara" / "homologies"
return self.install_path / _COMPARA_NAME / _HOMOLOGIES_NAME

@property
def staging_aligns(self):
return self.staging_path / "compara" / "aligns"
return self.staging_path / _COMPARA_NAME / _ALIGNS_NAME

@property
def install_aligns(self):
return self.install_path / "compara" / "aligns"
return self.install_path / _COMPARA_NAME / _ALIGNS_NAME

def to_dict(self):
"""returns cfg as a dict"""
Expand Down Expand Up @@ -109,23 +114,32 @@ def __post_init__(self):
self.install_path = pathlib.Path(self.install_path)

@property
def install_homologies(self):
return self.install_path / "compara" / "homologies"
def compara_path(self):
return self.install_path / _COMPARA_NAME

@property
def install_aligns(self):
return self.install_path / "compara" / "aligns"
def homologies_path(self):
return self.compara_path / _HOMOLOGIES_NAME

@property
def aligns_path(self):
return self.compara_path / _ALIGNS_NAME

@property
def genomes_path(self):
return self.install_path / _GENOMES_NAME

def installed_genome(self, species: str) -> os.PathLike:
db_name = Species.get_ensembl_db_prefix(species)
return self.install_path / "genomes" / db_name
return self.genomes_path / db_name


def write_installed_cfg(config: Config) -> os.PathLike:
"""writes an ini file under config.installed_path"""
parser = configparser.ConfigParser()
parser.add_section("release")
parser.set("release", "release", config.release)
# create all the genome
outpath = config.install_path / INSTALLED_CONFIG_NAME
outpath.parent.mkdir(parents=True, exist_ok=True)
with outpath.open(mode="w") as out:
Expand Down
23 changes: 17 additions & 6 deletions src/ensembl_lite/_db_base.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import dataclasses
import gzip
import inspect
import sqlite3
Expand All @@ -6,6 +7,13 @@
import numpy


@dataclasses.dataclass
class _compressed_array_proxy:
    """Wrapper marking a numpy array for storage via a customised sqlite type.

    The class adds no behaviour of its own. It exists only so that the
    sqlite3 conversion can be tied to a type private to this library,
    rather than to ``numpy.ndarray`` itself.
    """

    # the wrapped array
    array: numpy.ndarray


class AlignRecordType(typing.TypedDict):
source: str
block_id: str
Expand All @@ -20,19 +28,22 @@ class AlignRecordType(typing.TypedDict):
ReturnType = typing.Tuple[str, tuple] # the sql statement and corresponding values


def array_to_sqlite(data):
return gzip.compress(data.tobytes())
def compressed_array_to_sqlite(data):
    """Return the gzip-compressed bytes of ``data.array`` cast to int32.

    ``data`` is expected to expose the array to be stored as its ``array``
    attribute.
    """
    as_int32 = data.array.astype(numpy.int32)
    raw_bytes = as_int32.tobytes()
    return gzip.compress(raw_bytes)


def sqlite_to_array(data):
result = numpy.frombuffer(gzip.decompress(data), dtype=int)
def decompressed_sqlite_to_array(data):
    """Turn a gzip-compressed blob of int32 values into an (n, 2) array."""
    flat = numpy.frombuffer(gzip.decompress(data), dtype=numpy.int32)
    # values are stored row-wise as pairs, so n is half the element count
    num_pairs = len(flat) // 2
    return numpy.reshape(flat, (num_pairs, 2))


# registering the conversion functions with sqlite
sqlite3.register_adapter(numpy.ndarray, array_to_sqlite)
sqlite3.register_converter("array", sqlite_to_array)
# These conversion functions are registered against a type, so that type
# must be unique to this tool. The best way to ensure this is to name the
# sqlite type <libname_type> and to wrap the fundamental type in a proxy.
sqlite3.register_adapter(_compressed_array_proxy, compressed_array_to_sqlite)
sqlite3.register_converter("compressed_array", decompressed_sqlite_to_array)


def _make_table_sql(
Expand Down
5 changes: 5 additions & 0 deletions src/ensembl_lite/_genomedb.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from cogent3 import get_app, make_seq
from cogent3.app.composable import define_app
from cogent3.core.annotation_db import GffAnnotationDb
from cogent3.util.table import Table

from ensembl_lite._db_base import SqliteDbMixin

Expand Down Expand Up @@ -176,3 +177,7 @@ def get_features(
kwargs = {k: v for k, v in locals().items() if k not in ("self", "seqid")}
seq = self.get_seq(seqid=seqid, start=start, stop=stop)
yield from seq.get_features(**kwargs)


def get_feature_table(genome: Genome) -> Table:
    """Placeholder, not implemented yet.

    The body is a bare ``...``, so calling this currently returns ``None``
    rather than the annotated ``Table``.
    """
    ...
124 changes: 112 additions & 12 deletions src/ensembl_lite/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@
import click
import wakepy.keep

from rich.progress import track
from trogon import tui

from ensembl_lite import __version__
from ensembl_lite._config import (
DOWNLOADED_CONFIG_NAME,
INSTALLED_CONFIG_NAME,
read_config,
read_installed_cfg,
write_installed_cfg,
Expand Down Expand Up @@ -136,6 +138,9 @@ def install(download, force_overwrite, verbose):

configpath = download / DOWNLOADED_CONFIG_NAME
config = read_config(configpath)
if verbose:
print(f"{config.install_path=}")

if force_overwrite:
shutil.rmtree(config.install_path, ignore_errors=True)

Expand Down Expand Up @@ -171,10 +176,30 @@ def exportrc(outpath):
click.secho(f"Contents written to {outpath}", fg="green")


@main.command(no_args_is_help=True)
@click.option(
"-i", "--installed", required=True, help="string pointing to installation"
def _get_installed_config_path(ctx, param, path):
    """click callback returning the path to an existing installed.cfg

    Parameters
    ----------
    ctx, param
        provided by click, not used
    path
        either the config file itself, or the directory containing it

    Raises
    ------
    SystemExit with code 1, after printing a message, if the config file
    does not exist.
    """
    path = pathlib.Path(path)
    if path.name != INSTALLED_CONFIG_NAME:
        path = path / INSTALLED_CONFIG_NAME

    # validate in both cases, so a wrong file path fails here with a clear
    # message instead of later when the config is read
    if not path.exists():
        click.secho(f"{str(path)} missing", fg="red")
        # the exit() builtin is injected by the site module for interactive
        # use and is not guaranteed to exist, so raise directly
        raise SystemExit(1)
    return path


_installed = click.option(
"-i",
"--installed",
required=True,
callback=_get_installed_config_path,
help="string pointing to installation",
)


@main.command(no_args_is_help=True)
@_installed
@click.option(
"-o", "--outpath", required=True, type=pathlib.Path, help="path to write json file"
)
Expand Down Expand Up @@ -202,21 +227,17 @@ def homologs(installed, outpath, relationship):


@main.command(no_args_is_help=True)
@_installation
def installed(installation):
@_installed
def installed(installed):
"""show what is installed"""
from cogent3 import make_table

from ensembl_lite.species import Species
from ensembl_lite.util import rich_display

if not installation.exists():
click.secho(f"{str(installation)!r} does not exist!")
exit(1)
config = read_installed_cfg(installed)

# TODO install structure should be provided by some type of
# hook function
genome_dir = installation / "genomes"
genome_dir = config.genomes_path
if genome_dir.exists():
species = [fn.name for fn in genome_dir.glob("*")]
data = {"species": [], "common name": []}
Expand All @@ -231,7 +252,7 @@ def installed(installation):
rich_display(table)

# TODO as above
compara_aligns = installation / "compara" / "aligns"
compara_aligns = config.aligns_path
if compara_aligns.exists():
align_names = [
fn.stem for fn in compara_aligns.glob("*") if not fn.name.startswith(".")
Expand All @@ -242,5 +263,84 @@ def installed(installation):
rich_display(table)


def _species_names_from_csv(ctx, param, species):
"""returns species names"""
if species is not None:
species = [s.strip().lower() for s in species.split(",")]
return species


_species = click.option(
"--species",
required=True,
callback=_species_names_from_csv,
help="Single species name, or multiple (comma separated).",
)
# The value is used as a directory (it is created with mkdir and files are
# written inside it), so the default and help text describe a directory.
# required=True was dropped because it has no effect once a default is set.
_outdir = click.option(
    "--outdir",
    type=pathlib.Path,
    default=".",
    show_default=True,
    help="Output directory name.",
)
_limit = click.option(
"--limit",
type=int,
default=None,
help="Limit to this number of genes.",
show_default=True,
)


@main.command(no_args_is_help=True)
@_installed
@_species
@_outdir
@_limit
def dump_genes(installed, species, outdir, limit):
    """Dump meta data table for genes from one species to <species>-<release>-gene_metadata.tsv"""
    # NOTE the docstring doubles as the command line help text, so it is
    # kept to a single line
    from cogent3 import make_table
    from cogent3.core.annotation_db import GffAnnotationDb

    config = read_installed_cfg(installed)
    # only the first species name is used, any others are ignored
    species = species[0]
    path = config.installed_genome(species=species)
    if not path.exists():
        click.secho(f"{species!r} not in {str(installed.parent)!r}", fg="red")
        raise SystemExit(1)

    # TODO: this filename should be defined in one place
    path = path / "features.gff3db"
    if not path.exists():
        click.secho(f"{path.name!r} is missing", fg="red")
        raise SystemExit(1)

    annot_db = GffAnnotationDb(source=path)
    columns = [
        "name",
        "seqid",
        "source",
        "biotype",
        "start",
        "end",
        "score",
        "strand",
        "phase",
    ]
    rows = []
    for record in track(annot_db.get_records_matching(biotype="gene")):
        # test before appending so that at most `limit` rows are kept;
        # limit=None means no limit, limit <= 0 gives an empty table
        if limit is not None and len(rows) >= limit:
            break
        rows.append([record[c] for c in columns])

    table = make_table(header=columns, data=rows)
    outdir.mkdir(parents=True, exist_ok=True)
    # the output file is named after the genome directory and the release
    outpath = outdir / f"{path.parent.stem}-{config.release}-gene_metadata.tsv"
    table.write(outpath)
    click.secho(f"Finished wrote {str(outpath)!r}!", fg="green")


if __name__ == "__main__":
main()
6 changes: 4 additions & 2 deletions src/ensembl_lite/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from ensembl_lite import maf
from ensembl_lite._aligndb import AlignDb
from ensembl_lite._config import Config
from ensembl_lite._config import _COMPARA_NAME, Config
from ensembl_lite._genomedb import CompressedGenomeSeqsDb, compress_it
from ensembl_lite._homologydb import HomologyDb
from ensembl_lite.convert import seq_to_gap_coords
Expand Down Expand Up @@ -93,6 +93,8 @@ def local_install_genomes(config: Config, force_overwrite: bool):
for t in track(tasks, description="Installing annotations...", transient=True)
]

db.close()

return


Expand All @@ -118,7 +120,7 @@ def _load_one_align(path: os.PathLike) -> typing.Iterable[dict]:

def local_install_compara(config: Config, force_overwrite: bool):
if force_overwrite:
shutil.rmtree(config.install_path / "compara", ignore_errors=True)
shutil.rmtree(config.install_path / _COMPARA_NAME, ignore_errors=True)

for align_name in config.align_names:
src_dir = config.staging_aligns / align_name
Expand Down
4 changes: 2 additions & 2 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@ def test_installed_genome():

def test_installed_aligns():
cfg = InstalledConfig(release=110, install_path="abcd")
assert cfg.install_aligns == pathlib.Path("abcd/compara/aligns")
assert cfg.aligns_path == pathlib.Path("abcd/compara/aligns")


def test_installed_homologies():
cfg = InstalledConfig(release=110, install_path="abcd")
assert cfg.install_homologies == pathlib.Path("abcd/compara/homologies")
assert cfg.homologies_path == pathlib.Path("abcd/compara/homologies")


def test_read_installed(tmp_config, tmp_path):
Expand Down
Loading

0 comments on commit 0be9ae3

Please sign in to comment.