From 5f331beef98e05ddbbaaf4ee2cc08c25e67a18ac Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Wed, 15 Nov 2023 16:26:31 +1100 Subject: [PATCH 1/3] API: modify interface to LoadHomologies [CHANGED] pass in an iterable of the paths to be traversed, which makes it easier to use this class in tests --- src/ensembl_lite/install.py | 20 ++++++++------------ tests/test_dbs.py | 4 ++-- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/src/ensembl_lite/install.py b/src/ensembl_lite/install.py index 81f9894..d8d74f1 100644 --- a/src/ensembl_lite/install.py +++ b/src/ensembl_lite/install.py @@ -181,9 +181,9 @@ def __init__(self, allowed_species: set): def _matching_species(self, row): return {row[1], row[4]} <= self._allowed_species - def __call__(self, dirpath: os.PathLike) -> list: - final = None - for path in dirpath.glob("*.tsv.gz"): + def __call__(self, paths: typing.Iterable[os.PathLike]) -> list: + final = [] + for path in paths: with open_(path) as infile: # we bulk load because it's faster than the default line-by-line # iteration on a file @@ -193,13 +193,9 @@ def __call__(self, dirpath: os.PathLike) -> list: header = rows.pop(0) assert list(header) == list(self.src_cols), (header, self.src_cols) rows = [r + [path.name] for r in rows] - if final is None: - final = rows - continue + final.extend(rows) - final += rows - - return final or [] + return final def local_install_homology(config: Config, force_overwrite: bool): @@ -216,12 +212,12 @@ def local_install_homology(config: Config, force_overwrite: bool): # On test cases, only 30% speedup from running in parallel due to overhead # of pickling the data, but considerable increase in memory. So, run # in serial to avoid memory issues since it's reasonably fast anyway. 
- for rows in track( - map(loader, dirnames), + for dirname in track( + dirnames, transient=True, description="Installing homologies...", - total=len(dirnames), ): + rows = loader(dirname.glob("*.tsv.gz")) db.add_records(records=rows, col_order=loader.dest_col) del rows diff --git a/tests/test_dbs.py b/tests/test_dbs.py index b2d17b4..47a6b6a 100644 --- a/tests/test_dbs.py +++ b/tests/test_dbs.py @@ -70,7 +70,7 @@ def test_extract_homology_data(hom_dir): loader = LoadHomologies( {"gorilla_gorilla", "nomascus_leucogenys", "notamacropus_eugenii"} ) - got = loader(hom_dir) + got = loader(hom_dir.glob("*.tsv.gz")) assert len(got) == 2 # loader dest cols matches the db schema assert set(loader.dest_col) == HomologyDb._homology_schema.keys() @@ -80,7 +80,7 @@ def test_homology_db(hom_dir): loader = LoadHomologies( {"gorilla_gorilla", "nomascus_leucogenys", "notamacropus_eugenii"} ) - got = loader(hom_dir) + got = loader(hom_dir.glob("*.tsv.gz")) outpath = hom_dir / "species.sqlitedb" db = HomologyDb(source=outpath) db.add_records(records=got, col_order=loader.dest_col) From e5218593da726d3efa3cd0ac12cf8818fb405f0a Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Wed, 15 Nov 2023 16:28:03 +1100 Subject: [PATCH 2/3] MAINT: use installed config homologies_path and name constant --- src/ensembl_lite/_homologydb.py | 23 +++++++++++++++++++++++ src/ensembl_lite/cli.py | 5 +---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/ensembl_lite/_homologydb.py b/src/ensembl_lite/_homologydb.py index ff6f062..b4d126c 100644 --- a/src/ensembl_lite/_homologydb.py +++ b/src/ensembl_lite/_homologydb.py @@ -4,11 +4,18 @@ from typing import Iterable, Sized +from cogent3.core.alignment import SequenceCollection from rich.progress import track +from ensembl_lite._config import InstalledConfig from ensembl_lite._db_base import SqliteDbMixin +OptionalStr = typing.Optional[str] + +_HOMOLOGYDB_NAME = "homologies.sqlitedb" + + class HomologyRecordType(typing.TypedDict): source: 
str species_1: str @@ -122,3 +129,19 @@ def get_related_groups( def get_distinct(self, column: str) -> set[str]: sql = f"SELECT DISTINCT {column} from {self.table_name}" return {r[column] for r in self._execute_sql(sql).fetchall()} + + +def load_homology_db( + *, + cfg: InstalledConfig, +) -> HomologyDb: + return HomologyDb(source=cfg.homologies_path / _HOMOLOGYDB_NAME) + + +def get_homologous_seqs( + *, + cfg: InstalledConfig, + names: list[str], +) -> typing.Iterable[SequenceCollection]: + # todo support ensuring species set present + hdb = load_homology_db(cfg) diff --git a/src/ensembl_lite/cli.py b/src/ensembl_lite/cli.py index d59a66e..2f68334 100644 --- a/src/ensembl_lite/cli.py +++ b/src/ensembl_lite/cli.py @@ -219,8 +219,7 @@ def homologs(installed, outpath, relationship): from ensembl_lite._homologydb import HomologyDb config = read_installed_cfg(installed) - db_path = config.install_homologies / "homologies.sqlitedb" - db = HomologyDb(source=db_path) + db = HomologyDb(source=config.homologies_path) related = list(db.get_related_groups(relationship)) with open_(outpath, mode="wt") as out: json.dump(related, out) @@ -302,8 +301,6 @@ def dump_genes(installed, species, outdir, limit): from cogent3 import make_table from cogent3.core.annotation_db import GffAnnotationDb - from ensembl_lite.species import Species - config = read_installed_cfg(installed) species = species[0] path = config.installed_genome(species=species) From 454c2bc67b526e44baaec3cb96d840e758a791ed Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Wed, 15 Nov 2023 16:28:21 +1100 Subject: [PATCH 3/3] TST: sample data and test of HomologyDb --- tests/data/one2one_homologies.tsv | 37 ++++++++++++++++++ tests/test_homologydb.py | 64 +++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 tests/data/one2one_homologies.tsv create mode 100644 tests/test_homologydb.py diff --git a/tests/data/one2one_homologies.tsv b/tests/data/one2one_homologies.tsv new file mode 100644 
index 0000000..84160fd --- /dev/null +++ b/tests/data/one2one_homologies.tsv @@ -0,0 +1,37 @@ +False gene_stable_id protein_stable_id species identity homology_type homology_gene_stable_id homology_protein_stable_id homology_species homology_identity dn ds goc_score wga_coverage is_high_confidence homology_id + ENSPTRG00000042628 ENSPTRP00000061401 pan_troglodytes 91.3043 ortholog_one2one ENSGGOG00000026757 ENSGGOP00000021306 gorilla_gorilla 91.3043 NULL NULL 100 100.00 1 56628198 + ENSPTRG00000042639 ENSPTRP00000061398 pan_troglodytes 93.0283 ortholog_one2one ENSGGOG00000025053 ENSGGOP00000024209 gorilla_gorilla 93.0283 NULL NULL 100 100.00 1 56625462 + ENSPTRG00000042651 ENSPTRP00000061403 pan_troglodytes 90.5473 ortholog_one2one ENSGGOG00000022688 ENSGGOP00000022538 gorilla_gorilla 90.5473 NULL NULL 100 100.00 1 56644048 + ENSPTRG00000042630 ENSPTRP00000061410 pan_troglodytes 95.977 ortholog_one2one ENSGGOG00000026221 ENSGGOP00000028201 gorilla_gorilla 95.977 NULL NULL 75 100.00 1 56640132 + ENSPTRG00000042637 ENSPTRP00000061405 pan_troglodytes 93.1579 ortholog_one2one ENSGGOG00000024015 ENSGGOP00000022734 gorilla_gorilla 93.1579 NULL NULL 50 100.00 1 56631478 + ENSPPYG00000020964 ENSPPYP00000023446 pongo_abelii 84.3478 ortholog_one2one ENSGGOG00000026757 ENSGGOP00000021306 gorilla_gorilla 84.3478 NULL NULL 100 0.00 1 56401690 + ENSPPYG00000020967 ENSPPYP00000023448 pongo_abelii 89.1068 ortholog_one2one ENSGGOG00000025053 ENSGGOP00000024209 gorilla_gorilla 89.1068 NULL NULL 100 0.00 1 56399416 + ENSPPYG00000020971 ENSPPYP00000023449 pongo_abelii 82.3627 ortholog_one2one ENSGGOG00000022688 ENSGGOP00000022538 gorilla_gorilla 82.0896 NULL NULL 100 0.00 1 56415811 + ENSPPYG00000020974 ENSPPYP00000023451 pongo_abelii 90.2632 ortholog_one2one ENSGGOG00000024015 ENSGGOP00000022734 gorilla_gorilla 90.2632 NULL NULL 50 0.00 0 56404611 + ENSG00000198695 ENSP00000354665 homo_sapiens 97.1264 ortholog_one2one ENSGGOG00000026221 ENSGGOP00000028201 gorilla_gorilla 97.1264 NULL 
NULL 75 100.00 1 56619377 + ENSG00000198727 ENSP00000354554 homo_sapiens 92.6316 ortholog_one2one ENSGGOG00000024015 ENSGGOP00000022734 gorilla_gorilla 92.6316 NULL NULL 50 100.00 1 56611748 + ENSG00000198786 ENSP00000354813 homo_sapiens 90.0498 ortholog_one2one ENSGGOG00000022688 ENSGGOP00000022538 gorilla_gorilla 90.0498 NULL NULL 100 100.00 1 56622868 + ENSG00000198840 ENSP00000355206 homo_sapiens 93.0435 ortholog_one2one ENSGGOG00000026757 ENSGGOP00000021306 gorilla_gorilla 93.0435 NULL NULL 100 100.00 1 56608838 + ENSG00000198886 ENSP00000354961 homo_sapiens 94.9891 ortholog_one2one ENSGGOG00000025053 ENSGGOP00000024209 gorilla_gorilla 94.9891 NULL NULL 100 100.00 1 56606592 + ENSMMUG00000065353 ENSMMUP00000081256 macaca_mulatta 78.2609 ortholog_one2one ENSGGOG00000026757 ENSGGOP00000021306 gorilla_gorilla 78.2609 NULL NULL 100 100.00 1 76881074 + ENSMMUG00000065387 ENSMMUP00000081261 macaca_mulatta 76.0349 ortholog_one2one ENSGGOG00000025053 ENSGGOP00000024209 gorilla_gorilla 76.0349 NULL NULL 100 100.00 1 76878860 + ENSMMUG00000065354 ENSMMUP00000081255 macaca_mulatta 75.2902 ortholog_one2one ENSGGOG00000022688 ENSGGOP00000022538 gorilla_gorilla 75.2902 NULL NULL 100 100.00 1 76894845 + ENSMMUG00000065382 ENSMMUP00000081249 macaca_mulatta 80.2632 ortholog_one2one ENSGGOG00000024015 ENSGGOP00000022734 gorilla_gorilla 80.2632 NULL NULL 50 100.00 1 76883952 + ENSMICG00000027348 ENSMICP00000016336 microcebus_murinus 66.9565 ortholog_one2one ENSGGOG00000026757 ENSGGOP00000021306 gorilla_gorilla 66.9565 NULL NULL 100 100.00 1 83625751 + ENSMICG00000032407 ENSMICP00000040348 microcebus_murinus 71.4597 ortholog_one2one ENSGGOG00000025053 ENSGGOP00000024209 gorilla_gorilla 71.4597 NULL NULL 100 100.00 1 83623377 + ENSMICG00000026565 ENSMICP00000027021 microcebus_murinus 69.8176 ortholog_one2one ENSGGOG00000022688 ENSGGOP00000022538 gorilla_gorilla 69.8176 NULL NULL 100 100.00 1 83640006 + ENSMICG00000028801 ENSMICP00000026522 microcebus_murinus 50 ortholog_one2one 
ENSGGOG00000026221 ENSGGOP00000028201 gorilla_gorilla 50 NULL NULL 75 100.00 1 83636541 + ENSMICG00000037851 ENSMICP00000022746 microcebus_murinus 78.3641 ortholog_one2one ENSGGOG00000024015 ENSGGOP00000022734 gorilla_gorilla 78.1579 NULL NULL 50 100.00 1 83628828 + ENSMFAG00000065538 ENSMFAP00000058497 macaca_fascicularis 80 ortholog_one2one ENSGGOG00000026757 ENSGGOP00000021306 gorilla_gorilla 80 NULL NULL 100 0.00 1 75714938 + ENSMFAG00000057489 ENSMFAP00000063536 macaca_fascicularis 75.5991 ortholog_one2one ENSGGOG00000025053 ENSGGOP00000024209 gorilla_gorilla 75.5991 NULL NULL 100 0.00 1 75712710 + ENSMFAG00000053671 ENSMFAP00000051998 macaca_fascicularis 75.4561 ortholog_one2one ENSGGOG00000022688 ENSGGOP00000022538 gorilla_gorilla 75.4561 NULL NULL 100 0.00 1 75728456 + ENSMFAG00000062540 ENSMFAP00000058831 macaca_fascicularis 80.7895 ortholog_one2one ENSGGOG00000024015 ENSGGOP00000022734 gorilla_gorilla 80.7895 NULL NULL 50 0.00 0 75717779 + ENSGGOG00000025053 ENSGGOP00000024209 gorilla_gorilla 76.0349 ortholog_one2one ENSCSAG00000000028 ENSCSAP00000000010 chlorocebus_sabaeus 76.0349 NULL NULL 100 100.00 1 84611826 + ENSGGOG00000022688 ENSGGOP00000022538 gorilla_gorilla 74.6269 ortholog_one2one ENSCSAG00000000032 ENSCSAP00000000011 chlorocebus_sabaeus 74.7508 NULL NULL 100 100.00 1 84627614 + ENSGGOG00000026221 ENSGGOP00000028201 gorilla_gorilla 81.0345 ortholog_one2one ENSCSAG00000000033 ENSCSAP00000000012 chlorocebus_sabaeus 81.5029 NULL NULL 75 100.00 1 84624249 + ENSGGOG00000024015 ENSGGOP00000022734 gorilla_gorilla 82.6316 ortholog_one2one ENSCSAG00000000035 ENSCSAP00000000013 chlorocebus_sabaeus 82.6316 NULL NULL 50 100.00 1 84616943 + ENSPPAG00000000025 ENSPPAP00000000008 pan_paniscus 93.913 ortholog_one2one ENSGGOG00000026757 ENSGGOP00000021306 gorilla_gorilla 93.913 NULL NULL 100 100.00 1 56439973 + ENSPPAG00000000028 ENSPPAP00000000010 pan_paniscus 94.3355 ortholog_one2one ENSGGOG00000025053 ENSGGOP00000024209 gorilla_gorilla 94.3355 NULL NULL 100 
100.00 1 56437537 + ENSPPAG00000000032 ENSPPAP00000000011 pan_paniscus 90.8789 ortholog_one2one ENSGGOG00000022688 ENSGGOP00000022538 gorilla_gorilla 90.8789 NULL NULL 100 100.00 1 56454514 + ENSPPAG00000000033 ENSPPAP00000000012 pan_paniscus 97.1264 ortholog_one2one ENSGGOG00000026221 ENSGGOP00000028201 gorilla_gorilla 97.1264 NULL NULL 75 100.00 1 56450900 + ENSPPAG00000000035 ENSPPAP00000000013 pan_paniscus 92.3684 ortholog_one2one ENSGGOG00000024015 ENSGGOP00000022734 gorilla_gorilla 92.3684 NULL NULL 50 100.00 1 56442998 diff --git a/tests/test_homologydb.py b/tests/test_homologydb.py new file mode 100644 index 0000000..52492b3 --- /dev/null +++ b/tests/test_homologydb.py @@ -0,0 +1,64 @@ +import pytest + +from cogent3 import load_table + +from ensembl_lite._homologydb import _HOMOLOGYDB_NAME, HomologyDb +from ensembl_lite.install import LoadHomologies + + +def _make_expected_o2o(table): + """return dict with keys stable ID's values list[tuple[str,str]]""" + data = table.to_list(["species_1", "gene_id_1", "species_2", "gene_id_2"]) + + result = {} + for sp1, g1, sp2, g2 in data: + value = {(sp1, g1), (sp2, g2)} + result[g1] = result.get(g1, set()) | value + result[g2] = result.get(g2, set()) | value + + return result + + +@pytest.fixture +def o2o_db(DATA_DIR, tmp_dir): + raw = DATA_DIR / "one2one_homologies.tsv" + + species = { + "gorilla_gorilla", + "macaca_mulatta", + "microcebus_murinus", + "homo_sapiens", + "pongo_abelii", + "pan_troglodytes", + "macaca_fascicularis", + "chlorocebus_sabaeus", + "pan_paniscus", + } + loader = LoadHomologies(species) + + table = load_table(raw).get_columns(loader.src_cols) + + table = table.with_new_header(loader.src_cols, loader.dest_col[:-1]) + expect = _make_expected_o2o(table) + + data = loader([raw]) + homdb = HomologyDb(tmp_dir / _HOMOLOGYDB_NAME) + homdb.add_records(records=data, col_order=loader.dest_col) + return homdb, expect + + +@pytest.mark.parametrize( + "gene_id", + ( + "ENSGGOG00000026757", + 
"ENSGGOG00000025053", + "ENSGGOG00000022688", + "ENSGGOG00000026221", + "ENSGGOG00000024015", + ), +) +def test_hdb(o2o_db, gene_id): + homdb, expect = o2o_db + + got = homdb.get_related_to(gene_id=gene_id, relationship_type="ortholog_one2one") + assert got == expect[gene_id]