From 5bfb8f80d6114a6d7c2556d061f310c94a9a695d Mon Sep 17 00:00:00 2001 From: siebrenf Date: Tue, 21 Jun 2022 17:33:10 +0200 Subject: [PATCH 1/9] update release_checklist --- docs/release_checklist.md | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/docs/release_checklist.md b/docs/release_checklist.md index 95546217..6378b729 100644 --- a/docs/release_checklist.md +++ b/docs/release_checklist.md @@ -63,37 +63,30 @@ 7. Push everything to github, including tags: ```shell - git push --follow-tags origin develop + git push --follow-tags origin develop master ``` -8. Pull into master - - ```shell - git checkout master - git push origin master - ``` - -9. Upload to pypi: +8. Upload to pypi: ```shell python setup.py sdist bdist_wheel twine upload dist/genomepy-${new_version}* ``` -10. Create release on github (if it not already exists) +9. Create release on github (if it not already exists) * Update release with CHANGELOG information from the latest version * Download the tarball from the github release (`.tar.gz`). * Attach downloaded tarball to release as binary (this way the download count get tracked). -11a. Update bioconda package +10a. Update bioconda package * wait for the bioconda bot to create a PR * update dependencies in the bioconda recipe.yaml if needed * approve the PR * comment: @bioconda-bot please merge -11b. Update bioconda package +10b. 
Update bioconda package * fork bioconda/bioconda-recipes * follow the steps in the [docs](https://bioconda.github.io/contributor/workflow.html) From b016d8af4f44f2cf8251d03e8b4e76db6c1dbdf3 Mon Sep 17 00:00:00 2001 From: Brice Letcher Date: Wed, 22 Jun 2022 09:48:51 +0100 Subject: [PATCH 2/9] Bugfix: CLI help menu not showing with no arguments (#210) --- genomepy/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genomepy/cli.py b/genomepy/cli.py index e6a83ebe..3e062d30 100755 --- a/genomepy/cli.py +++ b/genomepy/cli.py @@ -188,7 +188,7 @@ def get_install_options(): Add the provider name in front of the options to prevent overlap. """ - if sys.argv[1] == "install": + if len(sys.argv) > 1 and sys.argv[1] == "install": install_options = INSTALL_OPTIONS # extend install options with provider specific options From 02e23be52545b1697ba4513fd660d07775225530 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Siebren=20Fr=C3=B6lich?= <48289046+siebrenf@users.noreply.github.com> Date: Thu, 21 Jul 2022 14:08:21 +0200 Subject: [PATCH 3/9] filelock 4 import + writing + deleting the cache (#212) --- CHANGELOG.md | 3 +++ environment.yml | 1 + genomepy/annotation/mygene.py | 3 ++- genomepy/caching.py | 23 ++++++++++++++--------- genomepy/providers/ensembl.py | 4 +++- genomepy/providers/gencode.py | 3 ++- genomepy/providers/ncbi.py | 3 ++- genomepy/providers/ucsc.py | 4 +++- setup.py | 1 + 9 files changed, 31 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 161040df..076872f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
## [Unreleased] +### Added +- now using `filelock` for improved thread safety + ## [0.13.1] - 2022-06-21 ### Changed diff --git a/environment.yml b/environment.yml index 50f0fae0..98ce0e7e 100644 --- a/environment.yml +++ b/environment.yml @@ -10,6 +10,7 @@ dependencies: - diskcache - click - colorama + - filelock - loguru - mygene - mysql-connector-python diff --git a/genomepy/annotation/mygene.py b/genomepy/annotation/mygene.py index ae9f90c5..2e7695f1 100644 --- a/genomepy/annotation/mygene.py +++ b/genomepy/annotation/mygene.py @@ -7,7 +7,7 @@ from tqdm import tqdm from genomepy.annotation.utils import _parse_annot -from genomepy.caching import cache_exp_short, disk_cache +from genomepy.caching import cache_exp_short, disk_cache, lock def _map_genes( @@ -78,6 +78,7 @@ def _map_genes( return df +@lock @disk_cache.memoize(expire=cache_exp_short, tag="query_mygene") def query_mygene( query: Iterable[str], diff --git a/genomepy/caching.py b/genomepy/caching.py index 2a051df9..d7911a74 100644 --- a/genomepy/caching.py +++ b/genomepy/caching.py @@ -1,12 +1,13 @@ import os from shutil import rmtree -from sqlite3 import DatabaseError -from time import sleep +from time import time from appdirs import user_cache_dir from diskcache import Cache +from filelock import FileLock from genomepy.__about__ import __version__ +from genomepy.utils import rm_rf # Set short/long cache expiration times (in seconds) cache_exp_short = 3.6e3 @@ -16,16 +17,20 @@ genomepy_cache_dir = os.path.join(user_cache_dir("genomepy"), __version__) os.makedirs(genomepy_cache_dir, exist_ok=True) -# Store the output of slow commands (marked with @disk_cache.memoize) for fast reuse -# DiskCache uses the LRU (least-recently-stored) eviction policy by default -try: - disk_cache = Cache(directory=genomepy_cache_dir) -except DatabaseError: - # another process was writing to the cache at the same time - sleep(3) +# create a lock, so only one thread can access the cache at once +lock_file = 
os.path.join(genomepy_cache_dir, "cache.lock") +if os.path.exists(lock_file) and time() - os.stat(lock_file).st_mtime > 60: + # remove abandoned lock + rm_rf(lock_file) +lock = FileLock(lock_file) + +with lock: + # Store the output of slow commands (marked with @disk_cache.memoize) for fast reuse + # DiskCache uses the LRU (least-recently-stored) eviction policy by default disk_cache = Cache(directory=genomepy_cache_dir) +@lock def clean(): """Remove cached data on providers.""" rmtree(genomepy_cache_dir, ignore_errors=True) diff --git a/genomepy/providers/ensembl.py b/genomepy/providers/ensembl.py index 4083a50a..58aaaaf0 100644 --- a/genomepy/providers/ensembl.py +++ b/genomepy/providers/ensembl.py @@ -3,7 +3,7 @@ import requests from loguru import logger -from genomepy.caching import cache_exp_long, cache_exp_short, disk_cache +from genomepy.caching import cache_exp_long, cache_exp_short, disk_cache, lock from genomepy.exceptions import GenomeDownloadError from genomepy.online import check_url, retry from genomepy.providers.base import BaseProvider @@ -65,6 +65,7 @@ def _genome_info_tuple(self, name, size=False): return name, accession, taxid, annotations, species, other @staticmethod + @lock @disk_cache.memoize(expire=cache_exp_short, tag="get_version-ensembl") def get_version(vertebrates=False, set_version=None, url=_url): """Retrieve current version from Ensembl FTP.""" @@ -237,6 +238,7 @@ def add_grch37(genomes): return genomes +@lock @disk_cache.memoize(expire=cache_exp_long, tag="get_genomes-ensembl") def get_genomes(rest_url): logger.info("Downloading assembly summaries from Ensembl") diff --git a/genomepy/providers/gencode.py b/genomepy/providers/gencode.py index 0cf8f6b9..e88e681c 100644 --- a/genomepy/providers/gencode.py +++ b/genomepy/providers/gencode.py @@ -3,7 +3,7 @@ from loguru import logger -from genomepy.caching import cache_exp_long, disk_cache +from genomepy.caching import cache_exp_long, disk_cache, lock from genomepy.exceptions import 
GenomeDownloadError from genomepy.files import update_readme from genomepy.online import check_url, connect_ftp_link @@ -255,6 +255,7 @@ def _get_genomes(ftp_link): return genomes +@lock @disk_cache.memoize(expire=cache_exp_long, tag="get_genomes-gencode") def get_genomes(ftp_link): """genomes dict of the latest gencode release of each major assembly.""" diff --git a/genomepy/providers/ncbi.py b/genomepy/providers/ncbi.py index 9a26c41a..57cda613 100644 --- a/genomepy/providers/ncbi.py +++ b/genomepy/providers/ncbi.py @@ -8,7 +8,7 @@ from loguru import logger from tqdm.auto import tqdm -from genomepy.caching import cache_exp_long, disk_cache +from genomepy.caching import cache_exp_long, disk_cache, lock from genomepy.exceptions import GenomeDownloadError from genomepy.online import check_url, read_url from genomepy.providers.base import BaseProvider @@ -200,6 +200,7 @@ def _ftp_or_html_link(self, name, file_suffix, skip_check=False): return link +@lock @disk_cache.memoize(expire=cache_exp_long, tag="get_genomes-ncbi") def get_genomes(assembly_url): """Parse genomes from assembly summary txt files.""" diff --git a/genomepy/providers/ucsc.py b/genomepy/providers/ucsc.py index 32bd7e6b..0fd5eda8 100644 --- a/genomepy/providers/ucsc.py +++ b/genomepy/providers/ucsc.py @@ -10,7 +10,7 @@ import requests from loguru import logger -from genomepy.caching import cache_exp_long, disk_cache +from genomepy.caching import cache_exp_long, disk_cache, lock from genomepy.exceptions import GenomeDownloadError from genomepy.files import update_readme from genomepy.online import check_url, read_url @@ -390,6 +390,7 @@ def head_annotation(self, name, genomes_dir=None, n: int = 5, **kwargs): break +@lock @disk_cache.memoize(expire=cache_exp_long, tag="get_genomes-ucsc") def get_genomes(rest_url): logger.info("Downloading assembly summaries from UCSC") @@ -587,6 +588,7 @@ def download_annotation(name, annot, genomes_dir, localname, n=None): rm_rf(tmp_dir) +@lock 
@disk_cache.memoize(expire=cache_exp_long, tag="scrape_accession-ucsc") def scrape_accession(htmlpath: str) -> str or None: """ diff --git a/setup.py b/setup.py index fd337897..8c80172c 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ "diskcache", "click", "colorama", + "filelock", "loguru", "mygene", "mysql-connector-python", From d8bb778248eaafeeba49e80a3f88169d86bd3ab5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Siebren=20Fr=C3=B6lich?= <48289046+siebrenf@users.noreply.github.com> Date: Thu, 28 Jul 2022 19:53:34 +0200 Subject: [PATCH 4/9] switch to pyproject.toml (#211) * pyproject.toml * update pkg build system & info * check all URLs per provider (#214) --- .flake8 | 3 - .gitignore | 23 +++---- .pre-commit-config.yaml | 35 ---------- .travis.yml | 10 +-- CHANGELOG.md | 4 ++ LICENSE | 2 + docs/release_checklist.md | 8 +-- docs/requirements.yaml | 3 +- environment.yml | 24 +++---- genomepy/annotation/__init__.py | 4 +- genomepy/argparse_support.py | 2 +- genomepy/cli.py | 7 +- genomepy/providers/__init__.py | 3 +- genomepy/providers/ensembl.py | 9 ++- genomepy/providers/gencode.py | 7 +- genomepy/providers/local.py | 2 +- genomepy/providers/ncbi.py | 3 +- genomepy/providers/ucsc.py | 4 +- genomepy/providers/url.py | 2 +- pyproject.toml | 106 ++++++++++++++++++++++++++++++ setup.py | 76 --------------------- tests/format.sh | 9 +-- tests/test_07_provider_ncbi.py | 9 ++- tests/test_09_provider_ensembl.py | 9 ++- 24 files changed, 193 insertions(+), 171 deletions(-) delete mode 100644 .flake8 delete mode 100644 .pre-commit-config.yaml create mode 100644 pyproject.toml delete mode 100644 setup.py diff --git a/.flake8 b/.flake8 deleted file mode 100644 index e38efb34..00000000 --- a/.flake8 +++ /dev/null @@ -1,3 +0,0 @@ -[flake8] -extend-ignore = E203,E402,E501 -select = C,E,F,W,B,B950 diff --git a/.gitignore b/.gitignore index 2704bdad..7c7d1f8f 100644 --- a/.gitignore +++ b/.gitignore @@ -11,14 +11,15 @@ Log.out genomepy/config/default.yaml # Directories 
-docs/_autosummary -docs/_build -tests/data -.pytest_cache -build -genomepy.egg-info -dist -eggs -tmp -.vscode -.idea +__pycache__/ +.idea/ +.pytest_cache/ +.vscode/ +build/ +dist/ +docs/_autosummary/ +docs/_build/ +eggs/ +genomepy.egg-info/ +tests/data/ +tmp/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index c4ecb43a..00000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,35 +0,0 @@ -files: 'setup\.py|genomepy/.*|tests/.*' -repos: - - repo: https://github.com/myint/autoflake - rev: v1.4 - hooks: - - id: autoflake - args: - - --check - - --recursive - - --remove-all-unused-imports - - --remove-duplicate-keys - - --remove-unused-variables - - - repo: https://github.com/pycqa/isort - rev: 5.9.1 - hooks: - - id: isort - args: - - --check - - --profile=black - - --conda-env=environment.yml - - - repo: https://github.com/ambv/black - rev: 21.5b2 - hooks: - - id: black - args: - - --check - - - repo: https://gitlab.com/pycqa/flake8 - rev: 3.9.2 - hooks: - - id: flake8 - additional_dependencies: - - flake8-bugbear==21.4.3 diff --git a/.travis.yml b/.travis.yml index ee22f059..30060638 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ branches: # use the minimal travis environment since we test in conda language: minimal -#dist +# dist dist: focal os: @@ -18,7 +18,7 @@ env: global: - CC_TEST_REPORTER_ID=951f438ac8a0fa93801ff0bf69922df59fe03800bf7ea8ab77a3c26cda444979 jobs: - - PYTHON_VERSION=3.6 + - PYTHON_VERSION=3.7 cache: directories: @@ -43,14 +43,14 @@ before_install: export PATH=$HOME/miniconda/bin:$PATH; conda config --set always_yes yes; - conda install conda-forge::mamba; + conda update -c defaults conda; + conda install -c conda-forge mamba; mamba env create -n genomepy python=$PYTHON_VERSION -f environment.yml; fi install: - source activate genomepy - - python setup.py develop - - python setup.py build + - pip install -e . 
before_script: # install codeclimate test coverage diff --git a/CHANGELOG.md b/CHANGELOG.md index 076872f6..4a02476a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added - now using `filelock` for improved thread safety +- now checking if every API/FTP/HTTP(S) is accessible before proceeding + +### Changed +- switched to `pyproject.toml` + `hatchling` for packaging ## [0.13.1] - 2022-06-21 diff --git a/LICENSE b/LICENSE index 945de80b..cb212e6e 100644 --- a/LICENSE +++ b/LICENSE @@ -1,3 +1,5 @@ +MIT License + Copyright (c) 2016 Simon van Heeringen Permission is hereby granted, free of charge, to any person diff --git a/docs/release_checklist.md b/docs/release_checklist.md index 6378b729..72e86343 100644 --- a/docs/release_checklist.md +++ b/docs/release_checklist.md @@ -28,13 +28,13 @@ 5. Check if release works on pypi: ```shell - python setup.py sdist bdist_wheel + python -m build # twine must be up to date (3.3.0 works). System installed twine can interfere. twine upload --repository-url https://test.pypi.org/legacy/ dist/genomepy-${new_version}* - python setup.py develop --uninstall - + pip uninstall genomepy + # the \ is to escape the ==, so the variable ${new_version} can be called pip install --extra-index-url https://test.pypi.org/simple/ genomepy\==${new_version} @@ -69,7 +69,7 @@ 8. 
Upload to pypi: ```shell - python setup.py sdist bdist_wheel + python -m build twine upload dist/genomepy-${new_version}* ``` diff --git a/docs/requirements.yaml b/docs/requirements.yaml index e66afb06..23de0e35 100644 --- a/docs/requirements.yaml +++ b/docs/requirements.yaml @@ -5,5 +5,4 @@ channels: dependencies: - sphinx - sphinx_rtd_theme # read the docs theme - - m2r2 # include markdown files - - mistune==0.8.4 # tmp until m2r2 0.3.2 is on conda + - m2r2 >=0.3.2 # include markdown files diff --git a/environment.yml b/environment.yml index 98ce0e7e..c2878459 100644 --- a/environment.yml +++ b/environment.yml @@ -4,9 +4,9 @@ channels: - conda-forge - defaults dependencies: - # Also found in setup.py + # Also found in pyproject.toml - appdirs - - biopython>=1.73 # for pyfaidx + - biopython >=1.73 # for pyfaidx - diskcache - click - colorama @@ -14,12 +14,12 @@ dependencies: - loguru - mygene - mysql-connector-python - - norns>=0.1.5 + - norns >=0.1.5 - numpy - pandas - - pyfaidx>=0.5.7 + - pyfaidx >=0.5.7 - requests - - tqdm>=4.51 + - tqdm >=4.51 # Bgzip - tabix @@ -40,15 +40,15 @@ dependencies: - star # Distribution - - setuptools + - python-build # Testing - - autoflake=1.4 # versions match .pre-commit-config.yaml - - black=21.5b2 - - flake8=3.9.2 - - flake8-bugbear=21.4.3 - - isort=5.9.1 - - pre-commit + - autoflake ==1.4 + - black ==22.6.0 + - flake8 ==4.0.1 + - flake8-bugbear ==22.7.1 + - flakeheaven ==2.0.0 + - isort ==5.10.1 - pytest - pytest-rerunfailures - pytest-cov diff --git a/genomepy/annotation/__init__.py b/genomepy/annotation/__init__.py index a62a0d26..a5514d95 100644 --- a/genomepy/annotation/__init__.py +++ b/genomepy/annotation/__init__.py @@ -167,7 +167,7 @@ def from_attributes( raise ValueError(f"{field} not in GTF attributes!") # extract the text between the quotes - series = df["attribute"].str.extract(fr'{field} "(.*?)"', expand=False) + series = df["attribute"].str.extract(rf'{field} "(.*?)"', expand=False) series.name = field return 
series @@ -444,7 +444,7 @@ def _get_name_and_dir(name, genomes_dir=None): name = safe(os.path.basename(fname)) # remove suffices any_ext = "(" + ")|(".join(exts) + ")" - name = re.sub(fr"(\.annotation)?\.({any_ext})(\.gz)?$", "", name) + name = re.sub(rf"(\.annotation)?\.({any_ext})(\.gz)?$", "", name) elif os.path.isdir(fname): genome_dir = fname name = safe(os.path.basename(fname)) diff --git a/genomepy/argparse_support.py b/genomepy/argparse_support.py index 329ccbcd..4c2524f0 100644 --- a/genomepy/argparse_support.py +++ b/genomepy/argparse_support.py @@ -15,7 +15,7 @@ def parse_genome(auto_install=False, genomes_dir=None): auto_install : bool, optional Install a genome if it's not found locally. genomes_dir : str, optional - Directory to look for and/or insall genomes. + Directory to look for and/or install genomes. """ class GenomeAction(argparse.Action): diff --git a/genomepy/cli.py b/genomepy/cli.py index 3e062d30..2c62e172 100755 --- a/genomepy/cli.py +++ b/genomepy/cli.py @@ -314,7 +314,9 @@ def providers(): "other_info": "<40", } FULL_SEARCH_STRING = " ".join([f"{{:{size}}}" for size in SEARCH_FORMAT.values()]) -SEARCH_STRING = " ".join([f"{{:{size}}}" for size in SEARCH_FORMAT.values() if size != "<13"]) +SEARCH_STRING = " ".join( + [f"{{:{size}}}" for size in SEARCH_FORMAT.values() if size != "<13"] +) if sys.stdout.isatty(): def bool_to_unicode(boolean: bool) -> str: @@ -342,7 +344,7 @@ def terminal_formatting(row: list): row[n] = "na" if len(row) == 8: # genome_size - row[6] = f'{int(row[6]):,}' + row[6] = f"{int(row[6]):,}" row = FULL_SEARCH_STRING.format(*row) else: row = SEARCH_STRING.format(*row) @@ -364,6 +366,7 @@ def terminal_subheader(size): print(FULL_SEARCH_STRING.format(*subheader)) else: print(SEARCH_STRING.format(*subheader[:-1])) + else: def terminal_formatting(row: list): diff --git a/genomepy/providers/__init__.py b/genomepy/providers/__init__.py index f9ad5b00..2267e23d 100644 --- a/genomepy/providers/__init__.py +++ 
b/genomepy/providers/__init__.py @@ -98,7 +98,8 @@ def online_providers(provider: str = None): provider Provider instances """ - for provider in [provider] if provider else list_providers(): + providers = [provider] if provider else list_providers() + for provider in providers: try: yield create(provider) except ConnectionError as e: diff --git a/genomepy/providers/ensembl.py b/genomepy/providers/ensembl.py index 58aaaaf0..54f62bb5 100644 --- a/genomepy/providers/ensembl.py +++ b/genomepy/providers/ensembl.py @@ -50,7 +50,10 @@ def __init__(self): @staticmethod def ping(): """Can the provider be reached?""" - return bool(check_url("https://rest.ensembl.org/info/ping?")) + api_online = bool(check_url("https://rest.ensembl.org/info/ping?")) + vertebrate_url_online = bool(check_url("http://ftp.ensembl.org")) + other_url_online = bool(check_url("http://ftp.ensemblgenomes.org")) + return api_online and vertebrate_url_online and other_url_online def _genome_info_tuple(self, name, size=False): """tuple with assembly metadata""" @@ -111,7 +114,9 @@ def get_genome_download_link(self, name, mask="soft", **kwargs): # - EnsemblMetazoa: caenorhabditis_elegans if not check_url(ftp_directory, 2): lwr_name = genome["name"] - ftp_directory = f"{ftp}/pub/release-{version}{div_path}/fasta/{lwr_name}/dna" + ftp_directory = ( + f"{ftp}/pub/release-{version}{div_path}/fasta/{lwr_name}/dna" + ) # this assembly has its own directory if name == "GRCh37": diff --git a/genomepy/providers/gencode.py b/genomepy/providers/gencode.py index e88e681c..307229c7 100644 --- a/genomepy/providers/gencode.py +++ b/genomepy/providers/gencode.py @@ -33,12 +33,12 @@ class GencodeProvider(BaseProvider): "text_search", ] _cli_install_options = {} - _ftp_link = "ftp://ftp.ebi.ac.uk/pub/databases/gencode" + _url = "ftp://ftp.ebi.ac.uk/pub/databases/gencode" def __init__(self): self._provider_status() # Populate on init, so that methods can be cached - self.genomes = _get_genomes(self._ftp_link) + self.genomes 
= _get_genomes(self._url) self.ucsc = UcscProvider() self.gencode2ucsc = get_gencode2ucsc(self.genomes) self._update_genomes() @@ -46,7 +46,8 @@ def __init__(self): @staticmethod def ping(): """Can the provider be reached?""" - return bool(check_url("ftp.ebi.ac.uk/pub/databases/gencode")) + ftp_online = bool(check_url("ftp.ebi.ac.uk/pub/databases/gencode")) + return ftp_online def _genome_info_tuple(self, name, size=False): """tuple with assembly metadata""" diff --git a/genomepy/providers/local.py b/genomepy/providers/local.py index 5e877fb8..fd2c668a 100644 --- a/genomepy/providers/local.py +++ b/genomepy/providers/local.py @@ -107,7 +107,7 @@ def get_annotation_download_links(self, name, **kwargs): hits = [] for ext in ["gtf", "gff", "gff3"]: # .*? = non greedy filler. (\.gz)? = optional .gz - expr = fr"{search_name}.*?\.{ext}(\.gz)?" # noqa: W605 + expr = rf"{search_name}.*?\.{ext}(\.gz)?" # noqa: W605 for line in search_list: hit = re.search(expr, line, flags=re.IGNORECASE) if hit: diff --git a/genomepy/providers/ncbi.py b/genomepy/providers/ncbi.py index 57cda613..ddd0113c 100644 --- a/genomepy/providers/ncbi.py +++ b/genomepy/providers/ncbi.py @@ -56,7 +56,8 @@ def __init__(self): @staticmethod def ping(): """Can the provider be reached?""" - return bool(check_url("https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/")) + url_online = bool(check_url("https://ftp.ncbi.nlm.nih.gov/genomes/")) + return url_online def _genome_info_tuple(self, name, size=False): """tuple with assembly metadata""" diff --git a/genomepy/providers/ucsc.py b/genomepy/providers/ucsc.py index 0fd5eda8..7df89a5f 100644 --- a/genomepy/providers/ucsc.py +++ b/genomepy/providers/ucsc.py @@ -53,7 +53,9 @@ def __init__(self): @staticmethod def ping(): """Can the provider be reached?""" - return bool(check_url("http://hgdownload.soe.ucsc.edu/goldenPath")) + url_online = bool(check_url("http://hgdownload.soe.ucsc.edu/goldenPath")) + api_online = 
bool(check_url("http://api.genome.ucsc.edu/list/ucscGenomes")) + return url_online and api_online def _search_accession(self, term: str) -> Iterator[str]: """ diff --git a/genomepy/providers/url.py b/genomepy/providers/url.py index 617f840a..ede7e0fb 100644 --- a/genomepy/providers/url.py +++ b/genomepy/providers/url.py @@ -141,7 +141,7 @@ def fuzzy_annotation_search(search_name, search_list): hits = [] for ext in ["gtf", "gff"]: # .*? = non greedy filler. 3? = optional 3 (for gff3). (\.gz)? = optional .gz - expr = fr"{search_name}.*?\.{ext}3?(\.gz)?" # noqa: W605 + expr = rf"{search_name}.*?\.{ext}3?(\.gz)?" # noqa: W605 for line in search_list: hit = re.search(expr, line, flags=re.IGNORECASE) if hit: diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..87f47aa9 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,106 @@ +[project] +name = "genomepy" +description = "Genes and genomes at your fingertips" +authors = [ + {name = "Siebren Frölich", email = "siebrenf@gmail.com"}, + {name = "Maarten van der Sande", email = "m.vandersande@science.ru.nl"}, + {name = "Tilman Schäfers", email = "tilman.schaefers@ru.nl"}, + {name = "Simon van Heeringen", email = "simon.vanheeringen@gmail.com"}, +] +maintainers = [ + {name = "Siebren Frölich", email = "siebrenf@gmail.com"}, + {name = "Simon van Heeringen", email = "simon.vanheeringen@gmail.com"}, +] + +requires-python = ">=3.7" +dependencies = [ + "appdirs", + "biopython >=1.73", + "diskcache", + "click", + "colorama", + "filelock", + "loguru", + "mygene", + "mysql-connector-python", + "norns >=0.1.5", + "numpy", + "pandas", + "pyfaidx >=0.5.7", + "requests", + "tqdm >=4.51", +] + +dynamic = ["version"] +readme = "README.md" +license = {file = "LICENSE"} +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Operating System :: MacOS :: MacOS X", + "Operating System :: POSIX :: 
Linux", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +keywords = [ + "genome", + "assembly", + "gene", + "annotation", + "Ensembl", + "UCSC", + "NCBI", + "GENCODE", +] + +[project.urls] +Homepage = "https://github.com/vanheeringen-lab/genomepy" +Documentation = "https://vanheeringen-lab.github.io/genomepy/" +News = "https://github.com/vanheeringen-lab/genomepy/blob/master/CHANGELOG.md" +Readme = "https://github.com/vanheeringen-lab/genomepy/blob/master/README.md" + +[project.scripts] +genomepy = "genomepy.cli:cli" + +# pyproject.toml documentation: +# https://peps.python.org/pep-0621/ +# https://hatch.pypa.io/latest/config/metadata/ +# https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html +# https://python-poetry.org/docs/pyproject/ + +[build-system] +requires = ["hatchling>=1.5.0"] +build-backend = "hatchling.build" + +[tool.hatch.version] +path = "genomepy/__about__.py" + +# include files specified in .gitignore +[tool.hatch.build] +artifacts = [ + "genomepy/config/default.yaml", +] + +# # include other files +# [tool.hatch.build.force-include] +# "LICENSE" = "genomepy/LICENSE" +# "README.md" = "genomepy/README.md" +# "CHANGELOG.md" = "genomepy/CHANGELOG.md" + +[tool.isort] +profile = "black" +conda_env = "environment.yml" +overwrite_in_place = true + +[tool.flakeheaven] +max_line_length = 88 +show_source = true + +[tool.flakeheaven.plugins] +flake8-bugbear = ["+*"] +pycodestyle = ["+*", "-E501", "-W503"] +pyflakes = ["+*"] +pylint = ["+*"] diff --git a/setup.py b/setup.py deleted file mode 100644 index 8c80172c..00000000 --- a/setup.py +++ /dev/null @@ -1,76 +0,0 @@ -import os -import sys - -from setuptools import setup - -# https://packaging.python.org/single_source_version/ -exec(open("genomepy/__about__.py").read()) - -if sys.argv[-1] == "publish": - os.system("python setup.py sdist upload") - sys.exit() - -with open("README.md", encoding="utf-8") as f: - 
long_description = f.read() - -# List of files and directories to include when packaging for release -packages = [ - "genomepy", - "genomepy.annotation", - "genomepy.config", - "genomepy.genome", - "genomepy.plugins", - "genomepy.providers", -] -package_data = {"genomepy": ["config/default.yaml"]} -data_files = [("", ["LICENSE", "README.md", "CHANGELOG.md"])] - -entry_points = {"console_scripts": ["genomepy=genomepy.cli:cli"]} - -requires = [ - "appdirs", - "biopython>=1.73", - "diskcache", - "click", - "colorama", - "filelock", - "loguru", - "mygene", - "mysql-connector-python", - "norns>=0.1.5", - "numpy", - "pandas", - "pyfaidx>=0.5.7", - "requests", - "tqdm>=4.51", -] - -classifiers = [ - "Development Status :: 4 - Beta", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", - "Operating System :: MacOS :: MacOS X", - "Operating System :: POSIX :: Linux", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Topic :: Scientific/Engineering :: Bio-Informatics", -] - -setup( - name="genomepy", - version=__version__, # noqa: F821 - description="Automatic downloading and processing of genomes and metadata in command line and Python", - long_description=long_description, - long_description_content_type="text/markdown", - packages=packages, - package_data=package_data, - data_files=data_files, - entry_points=entry_points, - install_requires=requires, - author=__author__, # noqa: F821 - author_email="simon.vanheeringen@gmail.com", - url="https://github.com/vanheeringen-lab/genomepy", - license="MIT", - classifiers=classifiers, -) diff --git a/tests/format.sh b/tests/format.sh index eb9c9a9e..b832705a 100755 --- a/tests/format.sh +++ b/tests/format.sh @@ -7,13 +7,10 @@ Run from root directory to format: bash tests/format.sh Run from root directory to lint: bash tests/format.sh lint - -Tool versions must match the pre-commit hook to prevent later issues. 
-This is automatically fixes if installed with `conda env create -f environment.yml` ' # files & directories to format/lint -targets=("setup.py genomepy/ tests/") +targets=("genomepy/ tests/") # check current directory if [[ $(pwd) != *genomepy ]] || [[ $(ls) != *genomepy* ]]; then @@ -55,8 +52,8 @@ black \ | grep 'would reformat' if $lint; then - flake8 \ - $targets + flakeheaven lint \ + $targets echo "" echo "Done" diff --git a/tests/test_07_provider_ncbi.py b/tests/test_07_provider_ncbi.py index e3fc17e1..b229f385 100644 --- a/tests/test_07_provider_ncbi.py +++ b/tests/test_07_provider_ncbi.py @@ -15,7 +15,14 @@ def test_ncbiprovider(ncbi): def test_genome_info_tuple(ncbi): t = ncbi._genome_info_tuple("ASM2732v1", size=True) assert isinstance(t, tuple) - assert t[:-1] == ("ASM2732v1", "GCF_000027325.1", 2097, True, 'Mycoplasma genitalium G37', 580076) + assert t[:-1] == ( + "ASM2732v1", + "GCF_000027325.1", + 2097, + True, + "Mycoplasma genitalium G37", + 580076, + ) def test_get_genome_download_link(ncbi): diff --git a/tests/test_09_provider_ensembl.py b/tests/test_09_provider_ensembl.py index 51882990..fb61df8f 100644 --- a/tests/test_09_provider_ensembl.py +++ b/tests/test_09_provider_ensembl.py @@ -12,7 +12,14 @@ def test_ensemblprovider(ensembl): def test_genome_info_tuple(ensembl): t = ensembl._genome_info_tuple("KH", size=True) assert isinstance(t, tuple) - assert t[:-1] == ("KH", "GCA_000224145.1", 7719, True, 'Ciona intestinalis', 115227500) + assert t[:-1] == ( + "KH", + "GCA_000224145.1", + 7719, + True, + "Ciona intestinalis", + 115227500, + ) def test_get_version(ensembl): From cfc389fdb7390c66d6545ce36d1b59c30a9882f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Siebren=20Fr=C3=B6lich?= <48289046+siebrenf@users.noreply.github.com> Date: Mon, 1 Aug 2022 13:43:53 +0200 Subject: [PATCH 5/9] improve search + update docs (#216) --- CHANGELOG.md | 6 ++ README.md | 146 ++++++++++++++++++--------------- docs/content/about.rst | 2 +- 
docs/content/command_line.rst | 4 +- docs/content/help_faq.rst | 4 +- docs/content/installation.rst | 4 +- genomepy/cli.py | 9 +- genomepy/providers/__init__.py | 11 ++- genomepy/providers/base.py | 36 ++++---- tests/test_06_provider_base.py | 2 +- 10 files changed, 125 insertions(+), 99 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a02476a..c680fbe6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,10 +9,16 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added - now using `filelock` for improved thread safety - now checking if every API/FTP/HTTP(S) is accessible before proceeding +- genomepy search improvements: + - text search now accepts regex, and multiple substrings (space separated) are unordered. + - taxonomy search now returns all hits that start with the given number. ### Changed - switched to `pyproject.toml` + `hatchling` for packaging +### Fixed +- updated the README and CLI documentation to mention the `Local` provider + ## [0.13.1] - 2022-06-21 ### Changed diff --git a/README.md b/README.md index 5474b6b9..e0f49e76 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ -# genomepy +# genomepy: genes and genomes at your fingertips [![bioconda-badge](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io) [![Anaconda-Server Badge](https://anaconda.org/bioconda/genomepy/badges/downloads.svg)](https://anaconda.org/bioconda/genomepy) [![PyPI version](https://badge.fury.io/py/genomepy.svg)](https://badge.fury.io/py/genomepy) -[![star this repo](https://githubbadges.com/star.svg?user=vanheeringen-lab&repo=genomepy&style=flat)](https://github.com/vanheeringen-lab/genomepy) + [![Build Status](https://app.travis-ci.com/vanheeringen-lab/genomepy.svg?branch=master)](https://app.travis-ci.com/github/vanheeringen-lab/genomepy/branches) 
[![Maintainability](https://api.codeclimate.com/v1/badges/c4476820f1d21a3e0569/maintainability)](https://codeclimate.com/github/vanheeringen-lab/genomepy/maintainability) @@ -12,14 +12,13 @@ [![status](http://joss.theoj.org/papers/df434a15edd00c8c2f4076668575d1cd/status.svg)](http://joss.theoj.org/papers/df434a15edd00c8c2f4076668575d1cd) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1010458.svg)](https://doi.org/10.5281/zenodo.1010458) -Install and use genomes & gene annotations the easy way! - genomepy is designed to provide a _simple_ and _straightforward_ way to download and use genomic data. -This includes (1) searching available data, (2) showing the available metadata, +This includes (1) searching available data, +(2) showing the available metadata, (3) automatically downloading, preprocessing and matching data and (4) generating optional aligner indexes. All with sensible, yet controllable defaults. -Currently, genomepy supports UCSC, Ensembl and NCBI. +Currently, genomepy supports Ensembl, UCSC, NCBI and GENCODE. [![asciicast](https://asciinema.org/a/eZttBuf5ly0AnjFVBiEIybbjS.png)](https://asciinema.org/a/eZttBuf5ly0AnjFVBiEIybbjS) @@ -43,27 +42,31 @@ Don't be shy and [let us know](https://github.com/vanheeringen-lab/genomepy/issu ## Installation -genomepy requires Python 3.6+ +genomepy requires Python 3.7+ You can install genomepy via [bioconda](https://bioconda.github.io/): -``` -$ conda install genomepy +```bash +$ conda install -c bioconda genomepy ``` Or via pip: -``` +```bash $ pip install genomepy ``` Or via git: -``` +```bash $ git clone https://github.com/vanheeringen-lab/genomepy.git $ cd genomepy -$ conda env update -f environment.yml -$ python setup.py install + +# optional: +$ conda env create -n genomepy -f environment.yml +$ conda activate genomepy + +$ pip install -e . ``` With Pip installation, you will have to install additional dependencies, and make them available in your PATH. 
@@ -85,7 +88,7 @@ You can find the binaries [here](http://hgdownload.cse.ucsc.edu/admin/exe/). 1. Find your genome: `$ genomepy search zebrafish` Console output: - ``` + ```bash name provider accession tax_id annotation species other_info GRCz11 Ensembl GCA_000002035.4 7955 ✓ Danio rerio 2017-08-Ensembl/2018-04 ^ @@ -94,18 +97,18 @@ You can find the binaries [here](http://hgdownload.cse.ucsc.edu/admin/exe/). 2. Install your genome (with annotation): `$ genomepy install --annotation GRCz11 --provider ensembl ` - Default genome directory: `~/.local/share/genomes/` +The default genomes directory: `~/.local/share/genomes/` ## Plugins and indexing -By default genomepy generates support files, including a genome index, +By default, genomepy generates support files including a genome index, chromosome sizes and gap locations (Ns in the sequence). -For some genomes genomepy can download blacklist files (generated by the Kundaje lab). -This will only work when installing these genomes from UCSC. -Enable this plugin to use it. +For some model organisms, genomepy can download a genome blacklist (generated by the Kundaje lab). +Blacklists are only available for these model organisms when downloaded from UCSC, and for the human and mouse genomes. +Enable the blacklist plugin to use it. -``` +```bash $ genomepy plugin enable blacklist ``` @@ -122,11 +125,12 @@ Currently, genomepy supports: Note 1: these programs are not installed by genomepy and need to be installed separately for the indexing to work. -Note 2: splice-aware indexing is performed by Hisat2 and STAR. +Note 2: splice-aware indexing (required for e.g. RNA-seq) is performed by STAR and Hisat2. Splice-aware indexing requires the annotation to be downloaded as well. You will receive a warning if indexing is performed without annotation for these aligners. -Note 3: STAR can further improve mapping to (novel) splice junctions by indexing again (2-pass mapping mode). 
+Note 3: STAR can further improve mapping to (novel) splice junctions by indexing again +(see 2-pass mapping mode in the STAR manual). The second pass is not supported by genomepy. You can configure the index creation using the `genomepy plugin` command (see below) @@ -135,25 +139,25 @@ You can configure the index creation using the `genomepy plugin` command (see be To change the default configuration, generate a personal config file: -``` +```bash $ genomepy config generate Created config file /home/simon/.config/genomepy/genomepy.yaml ``` ### Genome location -By default genomes will be saved in `~/.local/share/genomes`. +By default, genomes will be saved in `~/.local/share/genomes`. To set the default genome directory, to `/data/genomes` for instance, edit `~/.config/genomepy/genomepy.yaml` and change the following line: -``` +```bash genomes_dir: ~/.local/share/genomes/ ``` to: -``` +```bash genomes_dir: /data/genomes ``` @@ -165,7 +169,7 @@ Optionally genome FASTA files can be saved using bgzip compression. This means that the FASTA files will take up less space on disk. To enable this use the flag `--bgzip` on the command line, or add the following line to your config file: -``` +```bash bgzip: True ``` @@ -180,7 +184,7 @@ which comes installed with genomepy. All functions come with a short explanation when appended with `--help`. -``` +```bash $ genomepy --help Usage: genomepy [OPTIONS] COMMAND [ARGS]... @@ -203,8 +207,8 @@ Commands: Find the name of your desired genome: -``` -$ genomepy search xenopus tropicalis +```bash +$ genomepy search xenopus tro name provider accession tax_id annotation species other_info n r e k Xenopus_tropicalis_v9.1 Ensembl GCA_000004195.3 8364 ✓ Xenopus tropicalis 2019-04-Ensembl/2019-12 @@ -224,7 +228,7 @@ ASM1336827v1 NCBI GCA_013368275.1 8364 ✗ You can search by genome name (case-insensitive), taxonomy ID or assembly accession ID. Additionally, you can limit the search result to one provider with `-p`/`--provider`. 
-``` +```bash $ genomepy search 8364 -p ucsc name provider accession tax_id annotation species other_info n r e k @@ -237,16 +241,16 @@ xenTro9 UCSC GCA_000004195.3 8364 ✓ ✓ ✓ Use name for genomepy install ``` -Lets say we want to download the latest *Xenopus tropicalis* genome from UCSC. +Let's say we want to download a *Xenopus tropicalis* genome from UCSC. If you are interested in the gene annotation as well, you might want to check which gene annotation suits your needs. -Because we're looking at UCSC there are several options for us to choose from. +Because we're looking at UCSC, there are several options for us to choose from. In the search results, `n r e k ` denotes which UCSC annotations are available. These stand for **n**cbiRefSeq, **r**efGene, **e**nsGene and **k**nownGene, respectively. We can quickly inspect these with the `genomepy annotation` command: -``` +```bash $ genomepy annotation xenTro9 -p ucsc 12:04:41 | INFO | UCSC ncbiRefSeq chr1 genomepy transcript 133270 152620 . - . gene_id "LOC100490505"; transcript_id "XM_012956089.1"; gene_name "LOC100490505"; @@ -260,11 +264,12 @@ chr1 genomepy exon 133270 134186 . - . gene_id ``` Here we can see that the `refGene` annotation has actual HGNC gene names, so lets go with this annotation. +This differs between genomes, so be sure to check! Copy the name returned by the search function to install. For UCSC we can also select the annotation type. -``` +```bash $ genomepy install xenTro9 --UCSC-annotation refGene ``` @@ -272,7 +277,7 @@ Since we did not specify the provider here, genomepy will use the first provider Since we learned in `genomepy search` that only UCSC uses this name, it will be UCSC. We can also specify genomepy to use UCSC by giving it the provider name with `-p`/`--provider`: -``` +```bash $ genomepy install xenTro9 -p UCSC Downloading genome from http://hgdownload.soe.ucsc.edu/goldenPath/xenTro9/bigZips/xenTro9.fa.gz... Genome download successful, starting post processing... 
@@ -285,7 +290,7 @@ fasta: /data/genomes/xenTro9/xenTro9.fa Next, the genome is downloaded to the directory specified in the config file. To choose a different directory, use the `-g`/`--genomes_dir` option: -``` +```bash $ genomepy install sacCer3 -p UCSC -g /path/to/my/genomes Downloading genome from http://hgdownload.soe.ucsc.edu/goldenPath/sacCer3/bigZips/chromFa.tar.gz... Genome download successful, starting post processing... @@ -299,7 +304,7 @@ You can use a regular expression to filter for matching sequences (or non-matching sequences by using the `--no-match` option). For instance, the following command downloads hg38 and saves only the major chromosomes: -``` +```bash $ genomepy install hg38 -p UCSC -r 'chr[0-9XY]+$' downloading from http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz... done... @@ -336,34 +341,41 @@ $ grep ">" /data/genomes/hg38/hg38.fa By default, sequences are soft-masked. Use `-m hard` for hard masking, or `-m none` for no masking. -The chromosome sizes are saved in file called `.fa.sizes`. - You can choose to download gene annotation files with the `--annotation` option. These will be saved in (gzipped) BED and GTF format. -``` +```bash $ genomepy install hg38 -p UCSC --annotation ``` +All selected options are stored in a `README.txt`. +This includes the original name, download location and other genomepy operations (such as regex filtering and time). 
+ +###### Additional providers + To facilitate the downloading of genomes not supported by either NCBI, UCSC, or Ensembl, genomes -can also be downloaded directly from an url: +can also be downloaded directly from an url, by specifying `URL` as the provider: -``` +```bash $ genomepy install -p url https://research.nhgri.nih.gov/hydra/download/assembly/\Hm105_Dovetail_Assembly_1.0.fa.gz ``` -This installs the genome under the filename of the link, but can be changed with the `--localname` option -If you add the `--annotation` flag, genomepy will search the remote directory for an annotation file as well. -Should this fail, you can also add a url to the annotation with `--URL-to-annotation`. +This installs the genome under the filename of the link, but can be changed with the `--localname` option. +Similarly, if you have a local fasta file, you can install this using the filepath and specifying `Local` as the provider. -Finally, in the spirit of reproducibility all selected options are stored in a `README.txt`. -This includes the original name, download location and other genomepy operations (such as regex filtering and time). +If you add the `--annotation` flag, genomepy will search the (remote) directory for an annotation file as well. +Should this fail, you can also add a URL to the annotation with `--URL-to-annotation` with the `URL` provider, +or a filepath with `--Local-path-to-annotation` with the `Local` provider: + +```bash +$ genomepy install -p local /path/to/genome.fa --Local-path-to-annotation /path/to/gene.annotation.gtf +``` #### Manage plugins. Use `genomepy plugin list` to view the available plugins. 
-``` +```bash $ genomepy plugin list plugin enabled bowtie2 @@ -377,14 +389,14 @@ blacklist Enable plugins as follows: -``` +```bash $ genomepy plugin enable bwa hisat2 Enabled plugins: bwa, hisat2 ``` And disable like this: -``` +```bash $ genomepy plugin disable bwa Enabled plugins: hisat2 ``` @@ -394,7 +406,7 @@ Enabled plugins: hisat2 You can search by genome name (case-insensitive), taxonomy ID or assembly accession ID. Additionally, you can limit the search result to one provider with `-p`/`--provider`. -``` +```bash $ genomepy search xenopus tropicalis name provider accession tax_id annotation species other_info n r e k @@ -414,7 +426,7 @@ ASM1336827v1 NCBI GCA_013368275.1 8364 ✗ Only search a specific provider: -``` +```bash $ genomepy search tropicalis -p ucsc name provider accession tax_id annotation species other_info n r e k @@ -431,20 +443,22 @@ Note that searching doesn't work flawlessly, so try a few variations if you don' #### List available providers -``` +```bash $ genomepy providers +GENCODE Ensembl UCSC NCBI +Local URL ``` #### List available genomes -You can constrain the genome list by using the `-p` option to search only a -specific provider. +You can constrain the genome list by using the `-p`/`--provider` option to search only a specific provider. +Additionally, you can get the absolute `--size` of each genome (this option slows down the search). 
-``` +```bash $ genomepy genomes -p UCSC name provider accession tax_id annotation species other_info n r e k @@ -457,14 +471,14 @@ anoCar1 UCSC na 28377 ✗ ✗ ✓ List the current configuration file that genomepy uses: -``` +```bash $ genomepy config file /home/simon/.config/genomepy/genomepy.yaml ``` To show the contents of the config file: -``` +```bash $ genomepy config show # Directory were downloaded genomes will be stored genomes_dir: ~/.local/share/genomes/ @@ -475,7 +489,7 @@ plugin: To generate a personal configuration file (existing file will be overwritten): -``` +```bash $ genomepy config generate Created config file /home/simon/.config/genomepy/genomepy.yaml ``` @@ -528,8 +542,8 @@ fasta: ./data/genomes/hg38/hg38.fa tgtatggtccctagaggggccagagtcacagagatggaaagtggatggcgggtgccgggggctggggagctactgtgcagggggacagagctttagttct ``` -The `genomepy.Genome()` method returns a Genome object. This has all the -functionality of a `pyfaidx.Fasta` object, +The `genomepy.Genome()` method returns a Genome object. +This has all the functionality of a `pyfaidx.Fasta` object, see the [documentation](https://github.com/mdshw5/pyfaidx) for more examples on how to use this. ## Known issues @@ -593,15 +607,13 @@ When contributing a PR, please use the [develop](https://github.com/vanheeringen 1. Fork & download this repo. 2. `cd` into your local repo. 3. `git checkout develop` -4. `conda env create python=3.6 -f environment.yaml` +4. `conda env create -f environment.yaml` 5. `conda activate genomepy` -6. `python setup.py develop` -7. `python setup.py build` +6. `pip install -e .` 8. `git checkout -b` your_develop_branch The command line and python imports will now use the code in your local repo. 
-To test your changes locally, run the following command: -`pytest -vv --disable-pytest-warnings` +To test your changes locally, run the following command: `pytest -vvv` ## Contributors diff --git a/docs/content/about.rst b/docs/content/about.rst index 17151e26..d73494df 100644 --- a/docs/content/about.rst +++ b/docs/content/about.rst @@ -7,7 +7,7 @@ About :end-line: 14 .. mdinclude:: ../../README.md - :start-line: 601 + :start-line: 617 .. note: start- and end-line are 0-indexed. diff --git a/docs/content/command_line.rst b/docs/content/command_line.rst index b55383c8..3cbbf340 100644 --- a/docs/content/command_line.rst +++ b/docs/content/command_line.rst @@ -3,8 +3,8 @@ Command line documentation ========================== .. mdinclude:: ../../README.md - :start-line: 82 - :end-line: 486 + :start-line: 86 + :end-line: 504 .. note: start- and end-line are 0-indexed. diff --git a/docs/content/help_faq.rst b/docs/content/help_faq.rst index 9d1451e4..a2563f0c 100644 --- a/docs/content/help_faq.rst +++ b/docs/content/help_faq.rst @@ -3,5 +3,5 @@ Frequently Asked Questions ========================== .. mdinclude:: ../../README.md - :start-line: 531 - :end-line: 571 + :start-line: 550 + :end-line: 595 diff --git a/docs/content/installation.rst b/docs/content/installation.rst index d7655bff..354856b5 100644 --- a/docs/content/installation.rst +++ b/docs/content/installation.rst @@ -3,8 +3,8 @@ Installation ============ .. mdinclude:: ../../README.md - :start-line: 45 - :end-line: 81 + :start-line: 43 + :end-line: 85 .. note: start- and end-line are 0-indexed. 
diff --git a/genomepy/cli.py b/genomepy/cli.py index 2c62e172..80bc16a0 100755 --- a/genomepy/cli.py +++ b/genomepy/cli.py @@ -396,16 +396,17 @@ def terminal_subheader(_): @click.option("-s", "--size", is_flag=True, help="show absolute genome size") def search(term, provider=None, size=False): """ - Search for genomes that contain TERM in their name, description - accession (must start with GCA_ or GCF_) or (matching) taxonomy. - Search is case-insensitive. + Search for genomes that contain TERM in their name, description, + accession (must start with GCA_ or GCF_) or taxonomy (start). + + Search is case-insensitive, name/description search accepts multiple terms and regex. Returns the metadata of each found genome, including the availability of a gene annotation. For UCSC, up to 4 gene annotation styles are available: "ncbiRefSeq", "refGene", "ensGene", "knownGene" (respectively). Each with different naming schemes. """ - term = "_".join(term) + term = " ".join(term) no_genomes = True for row in genomepy.search(term, provider, size): if no_genomes: diff --git a/genomepy/providers/__init__.py b/genomepy/providers/__init__.py index 2267e23d..509c94e1 100644 --- a/genomepy/providers/__init__.py +++ b/genomepy/providers/__init__.py @@ -12,7 +12,7 @@ from genomepy.providers.ncbi import NcbiProvider, download_assembly_report from genomepy.providers.ucsc import UcscProvider from genomepy.providers.url import UrlProvider -from genomepy.utils import get_genomes_dir, safe +from genomepy.utils import get_genomes_dir __all__ = [ "Provider", @@ -106,18 +106,18 @@ def online_providers(provider: str = None): logger.warning(str(e)) -def search(term, provider: str = None, size=False): +def search(term: str or int, provider: str = None, size=False): """ Search for a genome. If provider is specified, search only that specific provider, else search all providers. Both the name and description are used for the - search. Search term is case-insensitive. + search. 
Search term is case-insensitive and can contain regex. Parameters ---------- - term : str - Search term, case-insensitive. + term : str, int + Search term, case-insensitive, allows regex. provider : str , optional Only search the specified provider (faster). size : bool, optional @@ -128,7 +128,6 @@ def search(term, provider: str = None, size=False): list genome name, provider and metadata """ - term = safe(str(term)) for p in online_providers(provider): for row in p.search(term, size): ret = list(row[:1]) + [p.name] + list(row[1:]) diff --git a/genomepy/providers/base.py b/genomepy/providers/base.py index b0dd4493..5baf09f3 100644 --- a/genomepy/providers/base.py +++ b/genomepy/providers/base.py @@ -1,11 +1,12 @@ """BaseProvider class, the parent of the provider classes""" import gzip import os +import re import shutil import subprocess as sp import time from tempfile import TemporaryDirectory, mkdtemp -from typing import Iterator, List, Union +from typing import Iterator, List from urllib.request import urlopen import pandas as pd @@ -16,7 +17,7 @@ from genomepy.exceptions import GenomeDownloadError from genomepy.files import extract_archive, get_file_info, update_readme from genomepy.online import download_file -from genomepy.utils import get_genomes_dir, get_localname, lower, mkdir_p, rm_rf, safe +from genomepy.utils import get_genomes_dir, get_localname, mkdir_p, rm_rf, safe class BaseProvider: @@ -311,10 +312,14 @@ def download_annotation(self, name, genomes_dir=None, localname=None, **kwargs): def _search_text(self, term: str) -> Iterator[str]: """check if search term is found in the provider's genome name or description field(s)""" + # multiple search terms: order doesn't matter + if " " in term: + term = "".join([f"(?=.*{t})" for t in term.split()]) + + pattern = re.compile(term, re.I) # case insensitive for name, metadata in self.genomes.items(): - if term in lower(name) or any( - [term in lower(metadata[f]) for f in self.description_fields] - ): + texts 
= [name] + [str(metadata[f]) for f in self.description_fields] + if any(pattern.search(t) for t in texts): yield name def _search_accession(self, term: str) -> Iterator[str]: @@ -322,16 +327,18 @@ def _search_accession(self, term: str) -> Iterator[str]: # cut off prefix (GCA_/GCF_) and suffix (version numbers, e.g. '.3') term = term[4:].split(".")[0] for name, metadata in self.genomes.items(): - if any([term in str(metadata[f]) for f in self.accession_fields]): + if any(term in str(metadata[f]) for f in self.accession_fields): yield name def _search_taxonomy(self, term: str) -> Iterator[str]: - """check if search term matches to any of the provider's taxonomy field(s)""" + """check if search term is the start of the provider's taxonomy field(s)""" for name, metadata in self.genomes.items(): - if any([term == lower(metadata[f]) for f in self.taxid_fields]): + if any( + str(metadata[f]).strip().startswith(term) for f in self.taxid_fields + ): yield name - def search(self, term: Union[str, int], size=False): + def search(self, term: str or int, size=False): """ Search for term in genome names, descriptions and taxonomy ID. @@ -344,7 +351,7 @@ def search(self, term: Union[str, int], size=False): Can be (part of) an assembly name (e.g. hg38), scientific name (Danio rerio) or assembly accession (`GCA_000146045`/`GCF_`), - or an exact taxonomy id (7227). + or a taxonomy id (7227). size : bool, optional Show absolute genome size. 
@@ -352,13 +359,14 @@ def search(self, term: Union[str, int], size=False): ------ tuples with name and metadata """ - term = lower(term) + term = str(term).strip() - search_function = self._search_text - if term.startswith(("gca_", "gcf_")): + if term.lower().startswith(("gca_", "gcf_")): search_function = self._search_accession - if term.isdigit(): + elif term.isdigit(): search_function = self._search_taxonomy + else: + search_function = self._search_text for name in search_function(term): yield self._genome_info_tuple(name, size) diff --git a/tests/test_06_provider_base.py b/tests/test_06_provider_base.py index 997ed455..084b6dc3 100644 --- a/tests/test_06_provider_base.py +++ b/tests/test_06_provider_base.py @@ -162,7 +162,7 @@ def test_head_annotation(ncbi, caplog, capsys): def test__search_text(ucsc): - term = genomepy.utils.lower("Ailuropoda melanoleuca") + term = "Ailuropoda melanoleuca" assert list(ucsc._search_text("not_in_description")) == [] assert next(ucsc._search_text(term)) == "ailMel1" From 338b7189db668cd03dda89e2f668e767176146a7 Mon Sep 17 00:00:00 2001 From: siebrenf Date: Mon, 1 Aug 2022 14:15:12 +0200 Subject: [PATCH 6/9] publication authors & order everywhere --- README.md | 5 +++-- docs/conf.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e0f49e76..b7ceb1db 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![bioconda-badge](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io) [![Anaconda-Server Badge](https://anaconda.org/bioconda/genomepy/badges/downloads.svg)](https://anaconda.org/bioconda/genomepy) [![PyPI version](https://badge.fury.io/py/genomepy.svg)](https://badge.fury.io/py/genomepy) - +[![GitHub stars](https://badgen.net/github/stars/vanheeringen-lab/genomepy)](https://GitHub.com/vanheeringen-lab/genomepy/stargazers/) [![Build 
Status](https://app.travis-ci.com/vanheeringen-lab/genomepy.svg?branch=master)](https://app.travis-ci.com/github/vanheeringen-lab/genomepy/branches) [![Maintainability](https://api.codeclimate.com/v1/badges/c4476820f1d21a3e0569/maintainability)](https://codeclimate.com/github/vanheeringen-lab/genomepy/maintainability) @@ -618,8 +618,9 @@ To test your changes locally, run the following command: `pytest -vvv` ## Contributors - Siebren Frölich - [@siebrenf](https://github.com/siebrenf) -- Simon van Heeringen - [@simonvh](https://github.com/simonvh) - Maarten van der Sande - [@Maarten-vd-Sande](https://github.com/Maarten-vd-Sande) +- Tilman Schäfers - [@tilschaef](https://github.com/tilschaef) +- Simon van Heeringen - [@simonvh](https://github.com/simonvh) - Dohoon Lee - [@dohlee](https://github.com/dohlee) - Jie Zhu - [@alienzj](https://github.com/alienzj) diff --git a/docs/conf.py b/docs/conf.py index 84c502f7..9acbeec7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,8 +18,8 @@ # -- Project information ----------------------------------------------------- project = 'genomepy' -copyright = 'Simon van Heeringen, Siebren Frölich, Maarten van der Sande' -author = 'Simon van Heeringen, Siebren Frölich, Maarten van der Sande' +copyright = 'Siebren Frölich, Maarten van der Sande, Tilman Schäfers and Simon van Heeringen' +author = 'Siebren Frölich, Maarten van der Sande, Tilman Schäfers and Simon van Heeringen' # -- General configuration --------------------------------------------------- From 252c1a4b0e25d50a66a2bbd8ce9bc8452251a578 Mon Sep 17 00:00:00 2001 From: siebrenf Date: Mon, 1 Aug 2022 14:25:02 +0200 Subject: [PATCH 7/9] text flow --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b7ceb1db..426c92f0 100644 --- a/README.md +++ b/README.md @@ -351,17 +351,17 @@ $ genomepy install hg38 -p UCSC --annotation All selected options are stored in a `README.txt`. 
This includes the original name, download location and other genomepy operations (such as regex filtering and time). -###### Additional providers +#### Other providers (any URL/local files) To facilitate the downloading of genomes not supported by either NCBI, UCSC, or Ensembl, genomes -can also be downloaded directly from an url, by specifying `URL` as the provider: +can also be downloaded directly from any URL, by specifying `URL` as the provider: ```bash $ genomepy install -p url https://research.nhgri.nih.gov/hydra/download/assembly/\Hm105_Dovetail_Assembly_1.0.fa.gz ``` -This installs the genome under the filename of the link, but can be changed with the `--localname` option. Similarly, if you have a local fasta file, you can install this using the filepath and specifying `Local` as the provider. +This installs the genome under the filename of the URL/filepath, but can be changed with the `--localname` option. If you add the `--annotation` flag, genomepy will search the (remote) directory for an annotation file as well. Should this fail, you can also add a URL to the annotation with `--URL-to-annotation` with the `URL` provider, From 7cae2270e00049b8ad5ec4007e16e7dc0efb9841 Mon Sep 17 00:00:00 2001 From: siebrenf Date: Mon, 1 Aug 2022 14:49:43 +0200 Subject: [PATCH 8/9] release 0.14 --- CHANGELOG.md | 3 +++ docs/release_checklist.md | 1 + genomepy/__about__.py | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c680fbe6..5223164e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [Unreleased] +## [0.14.0] - 2022-08-01 + ### Added - now using `filelock` for improved thread safety - now checking if every API/FTP/HTTP(S) is accessible before proceeding @@ -429,6 +431,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added `-r` and `--match/--no-match` option to select sequences by regex. [Unreleased]: https://github.com/vanheeringen-lab/genomepy/compare/master...develop +[0.14.0]: https://github.com/vanheeringen-lab/genomepy/compare/0.13.1...0.14.0 [0.13.1]: https://github.com/vanheeringen-lab/genomepy/compare/0.13.0...0.13.1 [0.13.0]: https://github.com/vanheeringen-lab/genomepy/compare/0.12.0...0.13.0 [0.12.0]: https://github.com/vanheeringen-lab/genomepy/compare/0.11.1...0.12.0 diff --git a/docs/release_checklist.md b/docs/release_checklist.md index 72e86343..77f311d0 100644 --- a/docs/release_checklist.md +++ b/docs/release_checklist.md @@ -5,6 +5,7 @@ 1. Make sure all tests pass. ```shell + mamba env update -f environment.yml pytest -vvv ``` diff --git a/genomepy/__about__.py b/genomepy/__about__.py index 407f3727..092f21bb 100644 --- a/genomepy/__about__.py +++ b/genomepy/__about__.py @@ -1,3 +1,3 @@ """Metadata""" -__version__ = "0.13.1" -__author__ = "Simon van Heeringen, Siebren Frölich, Maarten van der Sande" +__version__ = "0.14.0" +__author__ = "Siebren Frölich, Maarten van der Sande, Tilman Schäfers and Simon van Heeringen" From df35379ad4954ea889407b9ce6bffbc3dc3de6b3 Mon Sep 17 00:00:00 2001 From: siebrenf Date: Mon, 1 Aug 2022 15:27:34 +0200 Subject: [PATCH 9/9] fix current publishing issues --- docs/release_checklist.md | 1 + environment.yml | 2 ++ genomepy/__about__.py | 4 +++- genomepy/functions.py | 5 ++++- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/release_checklist.md b/docs/release_checklist.md index 77f311d0..774d1c42 100644 --- a/docs/release_checklist.md +++ b/docs/release_checklist.md @@ -53,6 +53,7 @@ genomepy install -af -p ncbi ASM2732v1 genomepy install -af -p url -l url_test https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/027/325/GCF_000027325.1_ASM2732v1/GCF_000027325.1_ASM2732v1_genomic.fna.gz --URL-to-annotation 
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/027/325/GCF_000027325.1_ASM2732v1/GCF_000027325.1_ASM2732v1_genomic.gff.gz genomepy install -af -p local -l local_test ~/.local/share/genomes/TAIR10/TAIR10.fa --Local-path-to-annotation ~/.local/share/genomes/TAIR10/TAIR10.annotation.gtf + ``` 6. Finish the release: diff --git a/environment.yml b/environment.yml index c2878459..9207f32a 100644 --- a/environment.yml +++ b/environment.yml @@ -41,6 +41,8 @@ dependencies: # Distribution - python-build + - twine ==3.3.0 + - keyring ==22.3.0 # Testing - autoflake ==1.4 diff --git a/genomepy/__about__.py b/genomepy/__about__.py index 092f21bb..1ca9df95 100644 --- a/genomepy/__about__.py +++ b/genomepy/__about__.py @@ -1,3 +1,5 @@ """Metadata""" __version__ = "0.14.0" -__author__ = "Siebren Frölich, Maarten van der Sande, Tilman Schäfers and Simon van Heeringen" +__author__ = ( + "Siebren Frölich, Maarten van der Sande, Tilman Schäfers and Simon van Heeringen" +) diff --git a/genomepy/functions.py b/genomepy/functions.py index 48785cbd..ab123c11 100644 --- a/genomepy/functions.py +++ b/genomepy/functions.py @@ -1,6 +1,7 @@ """Module-level functions""" import os import re +import sys from tempfile import mkdtemp from typing import Optional @@ -335,7 +336,9 @@ def _lazy_provider_selection(name, provider=None): if p.name == "Local" and os.path.exists(cleanpath(name)): return p - raise GenomeDownloadError(f"{name} not found on {', '.join(providers)}.") + if len(providers): + raise GenomeDownloadError(f"{name} not found on {', '.join(providers)}.") + sys.exit(0) def _provider_selection(name, localname, genomes_dir, provider=None):