Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

maintenance extension to stub out from NCBI #875

Merged
merged 3 commits into from
Apr 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions maintenance/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
Code used for internal maintenance tasks.
"""
from .ensembl import * # noqa
from .ncbi import * # noqa
62 changes: 62 additions & 0 deletions maintenance/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import stdpopsim
from . import ensembl
from . import ncbi

logger = logging.getLogger("maint")

Expand Down Expand Up @@ -163,6 +164,14 @@ def ensembl_stdpopsim_id(ensembl_id):
return sps_id


def ncbi_stdpopsim_id(ncbi_id):
tmp = ncbi_id.split(" ")[:2]
sps_id = "".join([x[0:3].capitalize() for x in tmp])
if len(sps_id) != 6:
raise ValueError(f"Cannot extract six character id from {ncbi_id}")
return sps_id


def catalog_path(sps_id):
return pathlib.Path(f"stdpopsim/catalog/{sps_id}")

Expand Down Expand Up @@ -239,6 +248,27 @@ def add_species(self, ensembl_id, force=False):
genome_data=genome_data,
)

def add_species_ncbi(self, ncbi_id, force=False):
species_name = ncbi.get_species_name(ncbi_id)
tmp = species_name.split(" ")[:2]
sps_id = "".join([x[0:3].capitalize() for x in tmp])
if len(sps_id) != 6:
raise ValueError(f"Cannot extract six character id from {species_name}")
logger.info(f"Adding new species {sps_id} for NCBI ID {ncbi_id}")
root = catalog_path(sps_id)
if force:
shutil.rmtree(root, ignore_errors=True)
root.mkdir()
genome_data = self.write_genome_data_ncbi(ncbi_id, sps_id)
species_data = ncbi.get_species_data(ncbi_id)
write_catalog_stub(
path=root,
sps_id=sps_id,
ensembl_id=ncbi_id,
species_data=species_data,
genome_data=genome_data,
)

def write_genome_data(self, ensembl_id):
sps_id = ensembl_stdpopsim_id(ensembl_id)
path = catalog_path(sps_id)
Expand All @@ -256,6 +286,22 @@ def write_genome_data(self, ensembl_id):
f.write(black_format(code))
return data

def write_genome_data_ncbi(self, ncbi_id, sps_id):
path = catalog_path(sps_id)
if not path.exists():
raise ValueError(
f"Directory {id} corresponding to {ncbi_id} does" + "not exist"
)
logger.info(f"Writing genome data for {sps_id} {ncbi_id}")
path = path / "genome_data.py"
data = ncbi.get_genome_data(ncbi_id)
code = f"data = {data}"

# Format the code with Black so we don't get noisy diffs
with self.write(path) as f:
f.write(black_format(code))
return data

def write_ensembl_release(self):
release = self.ensembl_client.get_release()
logger.info(f"Using Ensembl release {release}")
Expand Down Expand Up @@ -305,6 +351,7 @@ def update_genome_data(species):
will update the genome data for humans. Multiple species can
be specified. By default all species are updated.
"""
# TODO make this work for NCBI as well
if len(species) == 0:
embl_ids = [s.ensembl_id for s in stdpopsim.all_species()]
else:
Expand All @@ -327,5 +374,20 @@ def add_species(ensembl_id, force):
writer.write_ensembl_release()


# TODO refactor this so that it's an option to add-species. By default
# we assume the repository is Ensembl.
@cli.command()
@click.argument("NCBI-id")
@click.option("--force", is_flag=True)
def add_species_ncbi(ncbi_id, force):
"""
Add a new species to the catalog using its NCBI UID. UIDs can be
found by searching NCBI for the species in question and looking
at the assembly page.
"""
writer = DataWriter()
writer.add_species_ncbi(ncbi_id, force=force)


def main():
cli()
74 changes: 74 additions & 0 deletions maintenance/ncbi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""
Utilites for working with NCBI

"""
import logging
import urllib

from Bio import Entrez

logger = logging.getLogger("ncbi")

Entrez.email = "[email protected]"

# TODO This is very badly factored, we're making the same call 3 or 4 times.
# Fix up!


def get_species_name(uid):
esummary_handle = Entrez.esummary(db="assembly", id=uid, report="full")
esummary_record = Entrez.read(esummary_handle)
summary = esummary_record["DocumentSummarySet"]["DocumentSummary"][0]
return summary["SpeciesName"]


def get_species_data(uid):
esummary_handle = Entrez.esummary(db="assembly", id=uid, report="full")
esummary_record = Entrez.read(esummary_handle)
summary = esummary_record["DocumentSummarySet"]["DocumentSummary"][0]
species_data = {
"scientific_name": summary["SpeciesName"],
"display_name": summary["Organism"],
}
return species_data


def get_assembly_summary(id):
"""Get esummary for an entrez id"""
esummary_handle = Entrez.esummary(db="assembly", id=id, report="full")
esummary_record = Entrez.read(esummary_handle)
return esummary_record


def get_genome_data(uid):
"""
Get chromosome data for the specified NCBI UID.
"""
logger.info(f"Getting genome data for id: {uid}")
summary = get_assembly_summary(uid)
sum_dict = summary["DocumentSummarySet"]["DocumentSummary"][0]
data = {
"assembly_accession": sum_dict["AssemblyAccession"],
"assembly_name": sum_dict["AssemblyName"],
}
# get ftp link
url = summary["DocumentSummarySet"]["DocumentSummary"][0]["FtpPath_Stats_rpt"]
chromosomes = {}
chrom_list = []
with urllib.request.urlopen(url) as f:
stats = f.read().decode("utf-8")
for line in stats.split("\n"):
if line and line[0] != "#":
tokens = line.strip().split("\t")
if (
tokens[2] == "Chromosome"
and tokens[3] == "all"
and tokens[4] == "total-length"
):
chromosomes[tokens[1]] = {
"length": tokens[5],
"synonyms": [], # not listed in NCBI
}
chrom_list.append(tokens[1])
data["chromosomes"] = {name: chromosomes[name] for name in chrom_list}
return data
1 change: 1 addition & 0 deletions requirements/CI/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ pandas==1.2.3
numpy==1.20.2
scikit-allel==1.3.3
zarr==2.7.0
biopython==1.78
1 change: 1 addition & 0 deletions requirements/development.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ pandas
numpy
scikit-allel
zarr>=2.4
biopython