popsim-consortium · jeromekelleher · Apr 9, 2021 · Apr 9, 2021 · Apr 9, 2021 · Apr 9, 2021
diff --git a/maintenance/__init__.py b/maintenance/__init__.py
@@ -2,3 +2,4 @@
 Code used for internal maintenance tasks.
 """
 from .ensembl import *  # noqa
+from .ncbi import *  # noqa
diff --git a/maintenance/main.py b/maintenance/main.py
@@ -15,6 +15,7 @@
 
 import stdpopsim
 from . import ensembl
+from . import ncbi
 
 logger = logging.getLogger("maint")
 
@@ -163,6 +164,14 @@ def ensembl_stdpopsim_id(ensembl_id):
     return sps_id
 
 
+def ncbi_stdpopsim_id(ncbi_id):
+    tmp = ncbi_id.split(" ")[:2]
+    sps_id = "".join([x[0:3].capitalize() for x in tmp])
+    if len(sps_id) != 6:
+        raise ValueError(f"Cannot extract six character id from {ncbi_id}")
+    return sps_id
+
+
 def catalog_path(sps_id):
     return pathlib.Path(f"stdpopsim/catalog/{sps_id}")
 
@@ -239,6 +248,27 @@ def add_species(self, ensembl_id, force=False):
             genome_data=genome_data,
         )
 
+    def add_species_ncbi(self, ncbi_id, force=False):
+        species_name = ncbi.get_species_name(ncbi_id)
+        tmp = species_name.split(" ")[:2]
+        sps_id = "".join([x[0:3].capitalize() for x in tmp])
+        if len(sps_id) != 6:
+            raise ValueError(f"Cannot extract six character id from {species_name}")
+        logger.info(f"Adding new species {sps_id} for NCBI ID {ncbi_id}")
+        root = catalog_path(sps_id)
+        if force:
+            shutil.rmtree(root, ignore_errors=True)
+        root.mkdir()
+        genome_data = self.write_genome_data_ncbi(ncbi_id, sps_id)
+        species_data = ncbi.get_species_data(ncbi_id)
+        write_catalog_stub(
+            path=root,
+            sps_id=sps_id,
+            ensembl_id=ncbi_id,
+            species_data=species_data,
+            genome_data=genome_data,
+        )
+
     def write_genome_data(self, ensembl_id):
         sps_id = ensembl_stdpopsim_id(ensembl_id)
         path = catalog_path(sps_id)
@@ -256,6 +286,22 @@ def write_genome_data(self, ensembl_id):
             f.write(black_format(code))
         return data
 
+    def write_genome_data_ncbi(self, ncbi_id, sps_id):
+        path = catalog_path(sps_id)
+        if not path.exists():
+            raise ValueError(
+                f"Directory {id} corresponding to {ncbi_id} does" + "not exist"
+            )
+        logger.info(f"Writing genome data for {sps_id} {ncbi_id}")
+        path = path / "genome_data.py"
+        data = ncbi.get_genome_data(ncbi_id)
+        code = f"data = {data}"
+
+        # Format the code with Black so we don't get noisy diffs
+        with self.write(path) as f:
+            f.write(black_format(code))
+        return data
+
     def write_ensembl_release(self):
         release = self.ensembl_client.get_release()
         logger.info(f"Using Ensembl release {release}")
@@ -305,6 +351,7 @@ def update_genome_data(species):
     will update the genome data for humans. Multiple species can
     be specified. By default all species are updated.
     """
+    # TODO make this work for NCBI as well
     if len(species) == 0:
         embl_ids = [s.ensembl_id for s in stdpopsim.all_species()]
     else:
@@ -327,5 +374,20 @@ def add_species(ensembl_id, force):
     writer.write_ensembl_release()
 
 
+# TODO refactor this so that it's an option to add-species. By default
+# we assume the repository is Ensembl.
+@cli.command()
+@click.argument("NCBI-id")
+@click.option("--force", is_flag=True)
+def add_species_ncbi(ncbi_id, force):
+    """
+    Add a new species to the catalog using its NCBI UID. UIDs can be
+    found by searching NCBI for the species in question and looking
+    at the assembly page.
+    """
+    writer = DataWriter()
+    writer.add_species_ncbi(ncbi_id, force=force)
+
+
 def main():
     cli()
diff --git a/maintenance/ncbi.py b/maintenance/ncbi.py
@@ -0,0 +1,74 @@
+"""
+Utilites for working with NCBI
+
+"""
+import logging
+import urllib
+
+from Bio import Entrez
+
+logger = logging.getLogger("ncbi")
+
+Entrez.email = "[email protected]"
+
+# TODO This is very badly factored, we're making the same call 3 or 4 times.
+# Fix up!
+
+
+def get_species_name(uid):
+    esummary_handle = Entrez.esummary(db="assembly", id=uid, report="full")
+    esummary_record = Entrez.read(esummary_handle)
+    summary = esummary_record["DocumentSummarySet"]["DocumentSummary"][0]
+    return summary["SpeciesName"]
+
+
+def get_species_data(uid):
+    esummary_handle = Entrez.esummary(db="assembly", id=uid, report="full")
+    esummary_record = Entrez.read(esummary_handle)
+    summary = esummary_record["DocumentSummarySet"]["DocumentSummary"][0]
+    species_data = {
+        "scientific_name": summary["SpeciesName"],
+        "display_name": summary["Organism"],
+    }
+    return species_data
+
+
+def get_assembly_summary(id):
+    """Get esummary for an entrez id"""
+    esummary_handle = Entrez.esummary(db="assembly", id=id, report="full")
+    esummary_record = Entrez.read(esummary_handle)
+    return esummary_record
+
+
+def get_genome_data(uid):
+    """
+    Get chromosome data for the specified NCBI UID.
+    """
+    logger.info(f"Getting genome data for id: {uid}")
+    summary = get_assembly_summary(uid)
+    sum_dict = summary["DocumentSummarySet"]["DocumentSummary"][0]
+    data = {
+        "assembly_accession": sum_dict["AssemblyAccession"],
+        "assembly_name": sum_dict["AssemblyName"],
+    }
+    # get ftp link
+    url = summary["DocumentSummarySet"]["DocumentSummary"][0]["FtpPath_Stats_rpt"]
+    chromosomes = {}
+    chrom_list = []
+    with urllib.request.urlopen(url) as f:
+        stats = f.read().decode("utf-8")
+        for line in stats.split("\n"):
+            if line and line[0] != "#":
+                tokens = line.strip().split("\t")
+                if (
+                    tokens[2] == "Chromosome"
+                    and tokens[3] == "all"
+                    and tokens[4] == "total-length"
+                ):
+                    chromosomes[tokens[1]] = {
+                        "length": tokens[5],
+                        "synonyms": [],  # not listed in NCBI
+                    }
+                    chrom_list.append(tokens[1])
+    data["chromosomes"] = {name: chromosomes[name] for name in chrom_list}
+    return data
diff --git a/requirements/CI/requirements.txt b/requirements/CI/requirements.txt
@@ -19,3 +19,4 @@ pandas==1.2.3
 numpy==1.20.2
 scikit-allel==1.3.3
 zarr==2.7.0
+biopython==1.78
diff --git a/requirements/development.txt b/requirements/development.txt
@@ -23,3 +23,4 @@ pandas
 numpy
 scikit-allel
 zarr>=2.4
+biopython