Skip to content

Commit

Permalink
Merge pull request #31 from GavinHuttley/develop
Browse files Browse the repository at this point in the history
MAINT: adding tests and consistent use of config properties
  • Loading branch information
GavinHuttley authored Nov 15, 2023
2 parents 0aec536 + 454c2bc commit 1830d1a
Show file tree
Hide file tree
Showing 6 changed files with 135 additions and 18 deletions.
23 changes: 23 additions & 0 deletions src/ensembl_lite/_homologydb.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,18 @@

from typing import Iterable, Sized

from cogent3.core.alignment import SequenceCollection
from rich.progress import track

from ensembl_lite._config import InstalledConfig
from ensembl_lite._db_base import SqliteDbMixin


OptionalStr = typing.Optional[str]

_HOMOLOGYDB_NAME = "homologies.sqlitedb"


class HomologyRecordType(typing.TypedDict):
source: str
species_1: str
Expand Down Expand Up @@ -122,3 +129,19 @@ def get_related_groups(
def get_distinct(self, column: str) -> set[str]:
    """Return the distinct values stored in ``column`` of this db's table.

    The column and table names are interpolated directly into the SQL
    text, so ``column`` must be a trusted identifier.
    """
    query = f"SELECT DISTINCT {column} from {self.table_name}"
    cursor = self._execute_sql(query)
    # rows are indexed by column name
    return {row[column] for row in cursor.fetchall()}


def load_homology_db(
    *,
    cfg: InstalledConfig,
) -> HomologyDb:
    """Open the homology database of an installation.

    Parameters
    ----------
    cfg
        installed configuration; the database file is looked up as
        ``cfg.homologies_path / _HOMOLOGYDB_NAME``

    Returns
    -------
    A HomologyDb instance whose source is that path.
    """
    db_path = cfg.homologies_path / _HOMOLOGYDB_NAME
    return HomologyDb(source=db_path)


def get_homologous_seqs(
    *,
    cfg: InstalledConfig,
    names: list[str],
) -> typing.Iterable[SequenceCollection]:
    """Work in progress: obtain sequence collections of homologous genes.

    Parameters
    ----------
    cfg
        installed configuration used to locate the homology database
    names
        not used yet by the current implementation

    Notes
    -----
    The implementation is unfinished. At present it only opens the
    homology database and implicitly returns None, despite the declared
    return type.
    """
    # todo support ensuring species set present
    # load_homology_db() declares cfg as keyword-only, so passing it
    # positionally raises TypeError
    hdb = load_homology_db(cfg=cfg)
5 changes: 1 addition & 4 deletions src/ensembl_lite/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,8 +219,7 @@ def homologs(installed, outpath, relationship):
from ensembl_lite._homologydb import HomologyDb

config = read_installed_cfg(installed)
db_path = config.install_homologies / "homologies.sqlitedb"
db = HomologyDb(source=db_path)
db = HomologyDb(source=config.homologies_path)
related = list(db.get_related_groups(relationship))
with open_(outpath, mode="wt") as out:
json.dump(related, out)
Expand Down Expand Up @@ -302,8 +301,6 @@ def dump_genes(installed, species, outdir, limit):
from cogent3 import make_table
from cogent3.core.annotation_db import GffAnnotationDb

from ensembl_lite.species import Species

config = read_installed_cfg(installed)
species = species[0]
path = config.installed_genome(species=species)
Expand Down
20 changes: 8 additions & 12 deletions src/ensembl_lite/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,9 +181,9 @@ def __init__(self, allowed_species: set):
def _matching_species(self, row):
return {row[1], row[4]} <= self._allowed_species

def __call__(self, dirpath: os.PathLike) -> list:
final = None
for path in dirpath.glob("*.tsv.gz"):
def __call__(self, paths: typing.Iterable[os.PathLike]) -> list:
final = []
for path in paths:
with open_(path) as infile:
# we bulk load because it's faster than the default line-by-line
# iteration on a file
Expand All @@ -193,13 +193,9 @@ def __call__(self, dirpath: os.PathLike) -> list:
header = rows.pop(0)
assert list(header) == list(self.src_cols), (header, self.src_cols)
rows = [r + [path.name] for r in rows]
if final is None:
final = rows
continue
final.extend(rows)

final += rows

return final or []
return final


def local_install_homology(config: Config, force_overwrite: bool):
Expand All @@ -216,12 +212,12 @@ def local_install_homology(config: Config, force_overwrite: bool):
# On test cases, only 30% speedup from running in parallel due to overhead
# of pickling the data, but considerable increase in memory. So, run
# in serial to avoid memory issues since it's reasonably fast anyway.
for rows in track(
map(loader, dirnames),
for dirname in track(
dirnames,
transient=True,
description="Installing homologies...",
total=len(dirnames),
):
rows = loader(dirname.glob("*.tsv.gz"))
db.add_records(records=rows, col_order=loader.dest_col)
del rows

Expand Down
37 changes: 37 additions & 0 deletions tests/data/one2one_homologies.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
False gene_stable_id protein_stable_id species identity homology_type homology_gene_stable_id homology_protein_stable_id homology_species homology_identity dn ds goc_score wga_coverage is_high_confidence homology_id
ENSPTRG00000042628 ENSPTRP00000061401 pan_troglodytes 91.3043 ortholog_one2one ENSGGOG00000026757 ENSGGOP00000021306 gorilla_gorilla 91.3043 NULL NULL 100 100.00 1 56628198
ENSPTRG00000042639 ENSPTRP00000061398 pan_troglodytes 93.0283 ortholog_one2one ENSGGOG00000025053 ENSGGOP00000024209 gorilla_gorilla 93.0283 NULL NULL 100 100.00 1 56625462
ENSPTRG00000042651 ENSPTRP00000061403 pan_troglodytes 90.5473 ortholog_one2one ENSGGOG00000022688 ENSGGOP00000022538 gorilla_gorilla 90.5473 NULL NULL 100 100.00 1 56644048
ENSPTRG00000042630 ENSPTRP00000061410 pan_troglodytes 95.977 ortholog_one2one ENSGGOG00000026221 ENSGGOP00000028201 gorilla_gorilla 95.977 NULL NULL 75 100.00 1 56640132
ENSPTRG00000042637 ENSPTRP00000061405 pan_troglodytes 93.1579 ortholog_one2one ENSGGOG00000024015 ENSGGOP00000022734 gorilla_gorilla 93.1579 NULL NULL 50 100.00 1 56631478
ENSPPYG00000020964 ENSPPYP00000023446 pongo_abelii 84.3478 ortholog_one2one ENSGGOG00000026757 ENSGGOP00000021306 gorilla_gorilla 84.3478 NULL NULL 100 0.00 1 56401690
ENSPPYG00000020967 ENSPPYP00000023448 pongo_abelii 89.1068 ortholog_one2one ENSGGOG00000025053 ENSGGOP00000024209 gorilla_gorilla 89.1068 NULL NULL 100 0.00 1 56399416
ENSPPYG00000020971 ENSPPYP00000023449 pongo_abelii 82.3627 ortholog_one2one ENSGGOG00000022688 ENSGGOP00000022538 gorilla_gorilla 82.0896 NULL NULL 100 0.00 1 56415811
ENSPPYG00000020974 ENSPPYP00000023451 pongo_abelii 90.2632 ortholog_one2one ENSGGOG00000024015 ENSGGOP00000022734 gorilla_gorilla 90.2632 NULL NULL 50 0.00 0 56404611
ENSG00000198695 ENSP00000354665 homo_sapiens 97.1264 ortholog_one2one ENSGGOG00000026221 ENSGGOP00000028201 gorilla_gorilla 97.1264 NULL NULL 75 100.00 1 56619377
ENSG00000198727 ENSP00000354554 homo_sapiens 92.6316 ortholog_one2one ENSGGOG00000024015 ENSGGOP00000022734 gorilla_gorilla 92.6316 NULL NULL 50 100.00 1 56611748
ENSG00000198786 ENSP00000354813 homo_sapiens 90.0498 ortholog_one2one ENSGGOG00000022688 ENSGGOP00000022538 gorilla_gorilla 90.0498 NULL NULL 100 100.00 1 56622868
ENSG00000198840 ENSP00000355206 homo_sapiens 93.0435 ortholog_one2one ENSGGOG00000026757 ENSGGOP00000021306 gorilla_gorilla 93.0435 NULL NULL 100 100.00 1 56608838
ENSG00000198886 ENSP00000354961 homo_sapiens 94.9891 ortholog_one2one ENSGGOG00000025053 ENSGGOP00000024209 gorilla_gorilla 94.9891 NULL NULL 100 100.00 1 56606592
ENSMMUG00000065353 ENSMMUP00000081256 macaca_mulatta 78.2609 ortholog_one2one ENSGGOG00000026757 ENSGGOP00000021306 gorilla_gorilla 78.2609 NULL NULL 100 100.00 1 76881074
ENSMMUG00000065387 ENSMMUP00000081261 macaca_mulatta 76.0349 ortholog_one2one ENSGGOG00000025053 ENSGGOP00000024209 gorilla_gorilla 76.0349 NULL NULL 100 100.00 1 76878860
ENSMMUG00000065354 ENSMMUP00000081255 macaca_mulatta 75.2902 ortholog_one2one ENSGGOG00000022688 ENSGGOP00000022538 gorilla_gorilla 75.2902 NULL NULL 100 100.00 1 76894845
ENSMMUG00000065382 ENSMMUP00000081249 macaca_mulatta 80.2632 ortholog_one2one ENSGGOG00000024015 ENSGGOP00000022734 gorilla_gorilla 80.2632 NULL NULL 50 100.00 1 76883952
ENSMICG00000027348 ENSMICP00000016336 microcebus_murinus 66.9565 ortholog_one2one ENSGGOG00000026757 ENSGGOP00000021306 gorilla_gorilla 66.9565 NULL NULL 100 100.00 1 83625751
ENSMICG00000032407 ENSMICP00000040348 microcebus_murinus 71.4597 ortholog_one2one ENSGGOG00000025053 ENSGGOP00000024209 gorilla_gorilla 71.4597 NULL NULL 100 100.00 1 83623377
ENSMICG00000026565 ENSMICP00000027021 microcebus_murinus 69.8176 ortholog_one2one ENSGGOG00000022688 ENSGGOP00000022538 gorilla_gorilla 69.8176 NULL NULL 100 100.00 1 83640006
ENSMICG00000028801 ENSMICP00000026522 microcebus_murinus 50 ortholog_one2one ENSGGOG00000026221 ENSGGOP00000028201 gorilla_gorilla 50 NULL NULL 75 100.00 1 83636541
ENSMICG00000037851 ENSMICP00000022746 microcebus_murinus 78.3641 ortholog_one2one ENSGGOG00000024015 ENSGGOP00000022734 gorilla_gorilla 78.1579 NULL NULL 50 100.00 1 83628828
ENSMFAG00000065538 ENSMFAP00000058497 macaca_fascicularis 80 ortholog_one2one ENSGGOG00000026757 ENSGGOP00000021306 gorilla_gorilla 80 NULL NULL 100 0.00 1 75714938
ENSMFAG00000057489 ENSMFAP00000063536 macaca_fascicularis 75.5991 ortholog_one2one ENSGGOG00000025053 ENSGGOP00000024209 gorilla_gorilla 75.5991 NULL NULL 100 0.00 1 75712710
ENSMFAG00000053671 ENSMFAP00000051998 macaca_fascicularis 75.4561 ortholog_one2one ENSGGOG00000022688 ENSGGOP00000022538 gorilla_gorilla 75.4561 NULL NULL 100 0.00 1 75728456
ENSMFAG00000062540 ENSMFAP00000058831 macaca_fascicularis 80.7895 ortholog_one2one ENSGGOG00000024015 ENSGGOP00000022734 gorilla_gorilla 80.7895 NULL NULL 50 0.00 0 75717779
ENSGGOG00000025053 ENSGGOP00000024209 gorilla_gorilla 76.0349 ortholog_one2one ENSCSAG00000000028 ENSCSAP00000000010 chlorocebus_sabaeus 76.0349 NULL NULL 100 100.00 1 84611826
ENSGGOG00000022688 ENSGGOP00000022538 gorilla_gorilla 74.6269 ortholog_one2one ENSCSAG00000000032 ENSCSAP00000000011 chlorocebus_sabaeus 74.7508 NULL NULL 100 100.00 1 84627614
ENSGGOG00000026221 ENSGGOP00000028201 gorilla_gorilla 81.0345 ortholog_one2one ENSCSAG00000000033 ENSCSAP00000000012 chlorocebus_sabaeus 81.5029 NULL NULL 75 100.00 1 84624249
ENSGGOG00000024015 ENSGGOP00000022734 gorilla_gorilla 82.6316 ortholog_one2one ENSCSAG00000000035 ENSCSAP00000000013 chlorocebus_sabaeus 82.6316 NULL NULL 50 100.00 1 84616943
ENSPPAG00000000025 ENSPPAP00000000008 pan_paniscus 93.913 ortholog_one2one ENSGGOG00000026757 ENSGGOP00000021306 gorilla_gorilla 93.913 NULL NULL 100 100.00 1 56439973
ENSPPAG00000000028 ENSPPAP00000000010 pan_paniscus 94.3355 ortholog_one2one ENSGGOG00000025053 ENSGGOP00000024209 gorilla_gorilla 94.3355 NULL NULL 100 100.00 1 56437537
ENSPPAG00000000032 ENSPPAP00000000011 pan_paniscus 90.8789 ortholog_one2one ENSGGOG00000022688 ENSGGOP00000022538 gorilla_gorilla 90.8789 NULL NULL 100 100.00 1 56454514
ENSPPAG00000000033 ENSPPAP00000000012 pan_paniscus 97.1264 ortholog_one2one ENSGGOG00000026221 ENSGGOP00000028201 gorilla_gorilla 97.1264 NULL NULL 75 100.00 1 56450900
ENSPPAG00000000035 ENSPPAP00000000013 pan_paniscus 92.3684 ortholog_one2one ENSGGOG00000024015 ENSGGOP00000022734 gorilla_gorilla 92.3684 NULL NULL 50 100.00 1 56442998
4 changes: 2 additions & 2 deletions tests/test_dbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def test_extract_homology_data(hom_dir):
loader = LoadHomologies(
{"gorilla_gorilla", "nomascus_leucogenys", "notamacropus_eugenii"}
)
got = loader(hom_dir)
got = loader(hom_dir.glob("*.tsv.gz"))
assert len(got) == 2
# loader dest cols matches the db schema
assert set(loader.dest_col) == HomologyDb._homology_schema.keys()
Expand All @@ -80,7 +80,7 @@ def test_homology_db(hom_dir):
loader = LoadHomologies(
{"gorilla_gorilla", "nomascus_leucogenys", "notamacropus_eugenii"}
)
got = loader(hom_dir)
got = loader(hom_dir.glob("*.tsv.gz"))
outpath = hom_dir / "species.sqlitedb"
db = HomologyDb(source=outpath)
db.add_records(records=got, col_order=loader.dest_col)
Expand Down
64 changes: 64 additions & 0 deletions tests/test_homologydb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import pytest

from cogent3 import load_table

from ensembl_lite._homologydb import _HOMOLOGYDB_NAME, HomologyDb
from ensembl_lite.install import LoadHomologies


def _make_expected_o2o(table):
"""return dict with keys stable ID's values list[tuple[str,str]]"""
data = table.to_list(["species_1", "gene_id_1", "species_2", "gene_id_2"])

result = {}
for sp1, g1, sp2, g2 in data:
value = {(sp1, g1), (sp2, g2)}
result[g1] = result.get(g1, set()) | value
result[g2] = result.get(g2, set()) | value

return result


@pytest.fixture
def o2o_db(DATA_DIR, tmp_dir):
    """Return a HomologyDb built from the one-to-one homology sample file,
    plus the expected related-gene sets derived from the same file."""
    tsv_path = DATA_DIR / "one2one_homologies.tsv"

    # the species present in the sample file
    allowed = {
        "gorilla_gorilla",
        "macaca_mulatta",
        "microcebus_murinus",
        "homo_sapiens",
        "pongo_abelii",
        "pan_troglodytes",
        "macaca_fascicularis",
        "chlorocebus_sabaeus",
        "pan_paniscus",
    }
    loader = LoadHomologies(allowed)

    # expected values come from reading the source columns directly and
    # renaming them to the db column names; the final dest column is dropped
    # as it has no counterpart among the source columns
    src_table = load_table(tsv_path).get_columns(loader.src_cols)
    renamed = src_table.with_new_header(loader.src_cols, loader.dest_col[:-1])
    expect = _make_expected_o2o(renamed)

    records = loader([tsv_path])
    homdb = HomologyDb(tmp_dir / _HOMOLOGYDB_NAME)
    homdb.add_records(records=records, col_order=loader.dest_col)
    return homdb, expect


@pytest.mark.parametrize(
    "gene_id",
    [
        "ENSGGOG00000026757",
        "ENSGGOG00000025053",
        "ENSGGOG00000022688",
        "ENSGGOG00000026221",
        "ENSGGOG00000024015",
    ],
)
def test_hdb(o2o_db, gene_id):
    """one-to-one orthologs reported by the db match those derived
    directly from the source table"""
    homdb, expect = o2o_db

    related = homdb.get_related_to(
        gene_id=gene_id,
        relationship_type="ortholog_one2one",
    )
    assert related == expect[gene_id]

0 comments on commit 1830d1a

Please sign in to comment.