Skip to content

Commit

Permalink
Merge branch 'release/0.16.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
siebrenf committed May 31, 2023
2 parents d641c7a + 0bf0814 commit d38e477
Show file tree
Hide file tree
Showing 29 changed files with 369 additions and 202 deletions.
26 changes: 26 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,31 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

## [Unreleased]

## [0.16.0] - 2023-05-31

### Added
- `genomepy search` now accepts the `--exact` flag
- `genomepy.Annotation.attributes()` returns a list of all attributes from the GTF attributes column.
- e.g. gene_name, gene_version
- nice to use with `genomepy.Annotation.from_attributes()` or `genomepy.Annotation.gtf_dict()`
- When installing assemblies from older Ensembl release versions, a clearer error message is given if the assembly cannot be found:
- if the release does not exist, the available options will be given
- if the assembly does not exist on the release version, all available options are given
- if the URL to the genome or annotation files is incorrect, the error message stays the same
- New config option: `ucsc_mirror`, with options `eu` or `us`.
- the choice of mirror should only affect download speed
- switching mirrors can be useful if the other mirror is down!

### Changed
- function `get_division` is now a class method of EnsemblProvider
- EnsemblProvider class methods `get_division` and `get_version` now require an assembly name.
- UCSC data is now downloaded over HTTPS instead of HTTP

### Fixed
- `genomepy.install()` now returns a `Genome` instance with updated annotation attributes.
- now ignoring ~1600 assemblies from the Ensembl database with incorrect metadata
- no easy way to retrieve this data

## [0.15.0] - 2023-02-28

### Added
Expand Down Expand Up @@ -448,6 +473,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added `-r` and `--match/--no-match` option to select sequences by regex.

[Unreleased]: https://github.com/vanheeringen-lab/genomepy/compare/master...develop
[0.16.0]: https://github.com/vanheeringen-lab/genomepy/compare/0.15.0...0.16.0
[0.15.0]: https://github.com/vanheeringen-lab/genomepy/compare/0.14.0...0.15.0
[0.14.0]: https://github.com/vanheeringen-lab/genomepy/compare/0.13.1...0.14.0
[0.13.1]: https://github.com/vanheeringen-lab/genomepy/compare/0.13.0...0.13.1
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ dependencies:
# Plugins
- bowtie2
- bwa
- gmap
- gmap <=2021.08.25
- hisat2
- minimap2
- star
Expand Down
2 changes: 1 addition & 1 deletion genomepy/__about__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Metadata"""
__version__ = "0.15.0"
__version__ = "0.16.0"
__author__ = (
"Siebren Frölich, Maarten van der Sande, Tilman Schäfers and Simon van Heeringen"
)
22 changes: 22 additions & 0 deletions genomepy/annotation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,28 @@ def __setattr__(self, name, value):
self.genome_contigs = None # noqa
super(Annotation, self).__setattr__(name, value)

def attributes(self, annot: Union[str, pd.DataFrame] = "gtf"):
    """
    Collect the attribute keys used in the GTF attribute column.

    For every distinct value in the "feature" column, only the first row
    with that feature is inspected.

    Parameters
    ----------
    annot : str or pd.DataFrame, optional
        any GTF in dataframe format, or the default GTF.

    Returns
    -------
    list
        sorted, de-duplicated attribute names
    """
    gtf = _parse_annot(self, annot)
    # captures each key that is followed by a double-quoted value and a semicolon
    key_pattern = re.compile(r'\s*(.+?)\s*".+?"\s*;')
    found = set()
    for feature_type in gtf["feature"].unique():
        rows = gtf[gtf["feature"] == feature_type]
        first_attribute = rows["attribute"].head(1).values[0]
        found.update(key_pattern.findall(first_attribute))
    return sorted(found)

def from_attributes(
self, field, annot: Union[str, pd.DataFrame] = "gtf", check=True
):
Expand Down
3 changes: 2 additions & 1 deletion genomepy/caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from appdirs import user_cache_dir
from diskcache import Cache
from filelock import FileLock
from loguru import logger

from genomepy.__about__ import __version__
from genomepy.config import config
Expand Down Expand Up @@ -45,4 +46,4 @@ def clean():
"""Remove cached data on providers."""
rmtree(genomepy_cache_dir, ignore_errors=True)
os.makedirs(genomepy_cache_dir, exist_ok=True)
print("All clean!")
logger.info("All clean!")
5 changes: 3 additions & 2 deletions genomepy/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,8 +393,9 @@ def terminal_subheader(_):
@click.command(short_help="search for genomes")
@click.argument("term", nargs=-1)
@click.option("-p", "--provider", help="only search this provider")
@click.option("-e", "--exact", is_flag=True, help="exact matches only")
@click.option("-s", "--size", is_flag=True, help="show absolute genome size")
def search(term, provider=None, size=False):
def search(term, provider=None, exact=False, size=False):
"""
Search for genomes that contain TERM in their name, description,
accession (must start with GCA_ or GCF_) or taxonomy (start).
Expand All @@ -408,7 +409,7 @@ def search(term, provider=None, size=False):
"""
term = " ".join(term)
no_genomes = True
for row in genomepy.search(term, provider, size):
for row in genomepy.search(term, provider, exact, size):
if no_genomes:
no_genomes = False
terminal_header(size)
Expand Down
3 changes: 2 additions & 1 deletion genomepy/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from shutil import copyfile

from appdirs import user_config_dir
from loguru import logger
from norns import config as cfg

__all__ = ["config", "manage_config"]
Expand All @@ -21,7 +22,7 @@ def generate_config():
default_config = cfg("genomepy", default="config/default.yaml").config_file
copyfile(default_config, new_config)
config.config_file = new_config
print(f"Created config file {new_config}")
logger.info(f"Created config file {new_config}")


def manage_config(command):
Expand Down
1 change: 1 addition & 0 deletions genomepy/config/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ bgzip: false
genomes_dir: ~/.local/share/genomes/
cache_exp_genomes: 6.048e5 # cache expiration time in seconds (None = infinite)
cache_exp_other: 3.6e3
ucsc_mirror: us # options: eu us
plugin: []
2 changes: 1 addition & 1 deletion genomepy/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ def bgzip_and_name(fname, bgzip_file=True) -> str:
up to date filename
"""
if bgzip_file:
ret = sp.check_call(["bgzip", fname])
ret = sp.check_call(f"bgzip {fname}", shell=True)
fname += ".gz"
if ret != 0:
raise Exception(f"Error bgzipping genome {fname}. Is pysam installed?")
Expand Down
7 changes: 5 additions & 2 deletions genomepy/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,8 +268,11 @@ def install_genome(

if annotation_downloaded:
annotation = Annotation(localname, genomes_dir=genomes_dir)
if genome_found and not (skip_matching and skip_filter):
annotation.sanitize(not skip_matching, not skip_filter, True)
if genome_found:
# update references to annotation files
genome = Genome(localname, genomes_dir=genomes_dir)
if not (skip_matching and skip_filter):
annotation.sanitize(not skip_matching, not skip_filter, True)

# Run active plugins (also if the genome was downloaded earlier)
if genome_found:
Expand Down
2 changes: 0 additions & 2 deletions genomepy/plugins/hisat2.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ def after_genome_download(self, genome, threads=1, force=False):

# if an annotation is present, generate a splice-aware index
gtf_file = genome.annotation_gtf_file
print(gtf_file)
if gtf_file:
with extracted_file(gtf_file) as _gtf_file:
# generate splice and exon site files to enhance indexing
Expand Down Expand Up @@ -60,7 +59,6 @@ def after_genome_download(self, genome, threads=1, force=False):

# Create index
run_index_cmd("hisat2", cmd)
print(gtf_file)

def get_properties(self, genome):
props = {
Expand Down
16 changes: 10 additions & 6 deletions genomepy/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,20 +106,24 @@ def online_providers(provider: str = None):
logger.warning(str(e))


def search(term: str or int, provider: str = None, size=False):
def search(term: str or int, provider: str = None, exact=False, size=False):
"""
Search for a genome.
Search for term in genome names and descriptions (if term contains text. Case-insensitive),
assembly accession IDs (if term starts with GCA_ or GCF_),
or taxonomy IDs (if term is a number).
If provider is specified, search only that specific provider, else
search all providers. Both the name and description are used for the
search. Search term is case-insensitive and can contain regex.
If provider is specified, search only that specific provider, else search all providers.
Note: exact accession ID search on UCSC may return different patch levels.
Parameters
----------
term : str, int
Search term, case-insensitive, allows regex.
provider : str , optional
Only search the specified provider (faster).
exact : bool, optional
term must be an exact match
size : bool, optional
Show absolute genome size.
Expand All @@ -129,7 +133,7 @@ def search(term: str or int, provider: str = None, size=False):
genome name, provider and metadata
"""
for p in online_providers(provider):
for row in p.search(term, size):
for row in p.search(term, exact, size):
ret = list(row[:1]) + [p.name] + list(row[1:])
yield ret

Expand Down
58 changes: 37 additions & 21 deletions genomepy/providers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,20 +71,23 @@ def _check_name(self, name):
f" genomepy search {name} -p {self.name}"
)

def _genome_info_tuple(self, name, size=False):
def _genome_info_tuple(self, name: str, size: bool = False):
"""tuple with assembly metadata"""
raise NotImplementedError()

def list_available_genomes(self, size=False):
"""
List all available genomes.
Parameters
----------
size : bool, optional
Show absolute genome size.
Yields
------
genomes : list of tuples
tuples with assembly name, accession, scientific_name, taxonomy id and description
size : bool, optional
Show absolute genome size.
"""
for name in self.genomes.keys():
yield self._genome_info_tuple(name, size)
Expand Down Expand Up @@ -317,10 +320,13 @@ def download_annotation(self, name, genomes_dir=None, localname=None, **kwargs):
readme = os.path.join(genomes_dir, localname, "README.txt")
update_readme(readme, updated_metadata={"annotation url": link})

def _search_text(self, term: str) -> Iterator[str]:
def _search_text(self, term: str, exact=False) -> Iterator[str]:
"""check if search term is found in the provider's genome name or description field(s)"""
# multiple search terms: order doesn't matter
if " " in term:
if exact:
# allow several commonly used spacers inside the term
term = re.sub(r"[ _.-]", r"[ _.-]", rf"^{term}$")
elif " " in term:
# multiple search terms: order doesn't matter
term = "".join([f"(?=.*{t})" for t in term.split()])

pattern = re.compile(term, re.I) # case insensitive
Expand All @@ -329,36 +335,46 @@ def _search_text(self, term: str) -> Iterator[str]:
if any(pattern.search(t) for t in texts):
yield name

def _search_accession(self, term: str) -> Iterator[str]:
def _search_accession(self, term: str, exact=False) -> Iterator[str]:
"""check if search term is found in the provider's accession field(s)"""
# cut off prefix (GCA_/GCF_) and suffix (version numbers, e.g. '.3')
term = term[4:].split(".")[0]
# cut off prefix (GCA/GCF) and suffix (version numbers, e.g. '.3')
term = term.upper() if exact else term[3:].split(".")[0]
for name, metadata in self.genomes.items():
if any(term in str(metadata[f]) for f in self.accession_fields):
yield name

def _search_taxonomy(self, term: str) -> Iterator[str]:
def _search_taxonomy(self, term: str, exact=False) -> Iterator[str]:
"""check if search term is the start of the provider's taxonomy field(s)"""

def fuzzy_match(md, t):
return str(md).strip().startswith(t)

def exact_match(md, t):
return str(md).strip() == t

func = exact_match if exact else fuzzy_match
for name, metadata in self.genomes.items():
if any(
str(metadata[f]).strip().startswith(term) for f in self.taxid_fields
):
if any(func(metadata[f], term) for f in self.taxid_fields):
yield name

def search(self, term: str or int, size=False):
def search(self, term: str or int, exact=False, size=False):
"""
Search for term in genome names, descriptions and taxonomy ID.
Search for term in genome names and descriptions (if term contains text. Case-insensitive),
assembly accession IDs (if term starts with GCA_ or GCF_),
or taxonomy IDs (if term is a number).
The search is case-insensitive.
Note: exact accession ID search on UCSC may return different patch levels.
Parameters
----------
term : str, int
Search term, case-insensitive.
Can be (part of) an assembly name (e.g. hg38),
scientific name (Danio rerio) or assembly
accession (`GCA_000146045`/`GCF_`),
or a taxonomy id (7227).
Can be an assembly name (e.g. hg38),
scientific name (Danio rerio),
assembly accession ID (GCA_000146045),
or taxonomy ID (7227).
exact : bool, optional
term must be an exact match
size : bool, optional
Show absolute genome size.
Expand All @@ -375,7 +391,7 @@ def search(self, term: str or int, size=False):
else:
search_function = self._search_text

for name in search_function(term):
for name in search_function(term, exact):
yield self._genome_info_tuple(name, size)

def head_annotation(self, name: str, genomes_dir=None, n: int = 5, **kwargs):
Expand Down
Loading

0 comments on commit d38e477

Please sign in to comment.