Merge branch 'release/0.16.0'

vanheeringen-lab · May 31, 2023 · d38e477 · d38e477
2 parents d641c7a + 0bf0814
commit d38e477
Show file tree

Hide file tree

Showing 29 changed files with 369 additions and 202 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,31 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ## [Unreleased]
 
+## [0.16.0] - 2023-05-31
+
+### Added
+- `genomepy search` now accepts the `--exact` flag
+- `genomepy.Annotation.attributes()` returns a list of all attributes from the GTF attributes column. 
+  - e.g. gene_name, gene_version
+  - nice to use with `genomepy.Annotation.from_attributes()` or `genomepy.Annotation.gtf_dict()`
+- When installing assemblies from older Ensembl release versions, a clearer error message is given if the assembly cannot be found:
+  - if the release does not exist, options will be given
+  - if the assembly does not exist on the release version, all available options are given
+  - if the URL to the genome or annotation files is incorrect, the error message stays the same
+- new config option: `ucsc_mirror`, options: `eu` or `us`.
+  - the mirror should only affect download speed
+  - can be nice if the other mirror is down!
+
+### Changed
+- function `get_division` is now a class method of EnsemblProvider
+- EnsemblProvider class methods `get_division` and `get_version` now require an assembly name.
+- UCSC data is now downloaded over HTTPS instead of HTTP
+
+### Fixed
+- `genomepy.install()` now returns a `Genome` instance with updated annotation attributes.
+- now ignoring ~1600 assemblies from the Ensembl database with incorrect metadata
+  - no easy way to retrieve this data
+
 ## [0.15.0] - 2023-02-28
 
 ### Added
@@ -448,6 +473,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added `-r` and `--match/--no-match` option to select sequences by regex.
 
 [Unreleased]: https://github.com/vanheeringen-lab/genomepy/compare/master...develop
+[0.16.0]: https://github.com/vanheeringen-lab/genomepy/compare/0.15.0...0.16.0
 [0.15.0]: https://github.com/vanheeringen-lab/genomepy/compare/0.14.0...0.15.0
 [0.14.0]: https://github.com/vanheeringen-lab/genomepy/compare/0.13.1...0.14.0
 [0.13.1]: https://github.com/vanheeringen-lab/genomepy/compare/0.13.0...0.13.1

diff --git a/environment.yml b/environment.yml
@@ -34,7 +34,7 @@ dependencies:
   # Plugins
   - bowtie2
   - bwa
-  - gmap
+  - gmap <=2021.08.25
   - hisat2
   - minimap2
   - star

diff --git a/genomepy/__about__.py b/genomepy/__about__.py
@@ -1,5 +1,5 @@
 """Metadata"""
-__version__ = "0.15.0"
+__version__ = "0.16.0"
 __author__ = (
     "Siebren Frölich, Maarten van der Sande, Tilman Schäfers and Simon van Heeringen"
 )
diff --git a/genomepy/annotation/__init__.py b/genomepy/annotation/__init__.py
@@ -140,6 +140,28 @@ def __setattr__(self, name, value):
             self.genome_contigs = None  # noqa
         super(Annotation, self).__setattr__(name, value)
 
+    def attributes(self, annot: Union[str, pd.DataFrame] = "gtf"):
+        """
+        list all attributes present in the GTF attribute field.
+
+        Parameters
+        ----------
+        annot : str or pd.DataFrame, optional
+            any GTF in dataframe format, or the default GTF.
+
+        Returns
+        -------
+        list
+            with attributes
+        """
+        df = _parse_annot(self, annot)
+        attributes = set()
+        for feature in df["feature"].unique():
+            f_attributes = df[df["feature"] == feature]["attribute"].head(1).values[0]
+            f_attributes = re.findall(r'\s*(.+?)\s*".+?"\s*;', f_attributes)
+            attributes.update(f_attributes)
+        return sorted(attributes)
+
     def from_attributes(
         self, field, annot: Union[str, pd.DataFrame] = "gtf", check=True
     ):

diff --git a/genomepy/caching.py b/genomepy/caching.py
@@ -5,6 +5,7 @@
 from appdirs import user_cache_dir
 from diskcache import Cache
 from filelock import FileLock
+from loguru import logger
 
 from genomepy.__about__ import __version__
 from genomepy.config import config
@@ -45,4 +46,4 @@ def clean():
     """Remove cached data on providers."""
     rmtree(genomepy_cache_dir, ignore_errors=True)
     os.makedirs(genomepy_cache_dir, exist_ok=True)
-    print("All clean!")
+    logger.info("All clean!")
diff --git a/genomepy/cli.py b/genomepy/cli.py
@@ -393,8 +393,9 @@ def terminal_subheader(_):
 @click.command(short_help="search for genomes")
 @click.argument("term", nargs=-1)
 @click.option("-p", "--provider", help="only search this provider")
+@click.option("-e", "--exact", is_flag=True, help="exact matches only")
 @click.option("-s", "--size", is_flag=True, help="show absolute genome size")
-def search(term, provider=None, size=False):
+def search(term, provider=None, exact=False, size=False):
     """
     Search for genomes that contain TERM in their name, description,
     accession (must start with GCA_ or GCF_) or taxonomy (start).
@@ -408,7 +409,7 @@ def search(term, provider=None, size=False):
     """
     term = " ".join(term)
     no_genomes = True
-    for row in genomepy.search(term, provider, size):
+    for row in genomepy.search(term, provider, exact, size):
         if no_genomes:
             no_genomes = False
             terminal_header(size)

diff --git a/genomepy/config/__init__.py b/genomepy/config/__init__.py
@@ -2,6 +2,7 @@
 from shutil import copyfile
 
 from appdirs import user_config_dir
+from loguru import logger
 from norns import config as cfg
 
 __all__ = ["config", "manage_config"]
@@ -21,7 +22,7 @@ def generate_config():
     default_config = cfg("genomepy", default="config/default.yaml").config_file
     copyfile(default_config, new_config)
     config.config_file = new_config
-    print(f"Created config file {new_config}")
+    logger.info(f"Created config file {new_config}")
 
 
 def manage_config(command):

diff --git a/genomepy/config/default.yaml b/genomepy/config/default.yaml
@@ -2,4 +2,5 @@ bgzip: false
 genomes_dir: ~/.local/share/genomes/
 cache_exp_genomes: 6.048e5  # cache expiration time in seconds (None = infinite)
 cache_exp_other: 3.6e3
+ucsc_mirror: us  # options: eu us
 plugin: []
diff --git a/genomepy/files.py b/genomepy/files.py
@@ -323,7 +323,7 @@ def bgzip_and_name(fname, bgzip_file=True) -> str:
         up to date filename
     """
     if bgzip_file:
-        ret = sp.check_call(["bgzip", fname])
+        ret = sp.check_call(f"bgzip {fname}", shell=True)
         fname += ".gz"
         if ret != 0:
             raise Exception(f"Error bgzipping genome {fname}. Is pysam installed?")

diff --git a/genomepy/functions.py b/genomepy/functions.py
@@ -268,8 +268,11 @@ def install_genome(
 
     if annotation_downloaded:
         annotation = Annotation(localname, genomes_dir=genomes_dir)
-        if genome_found and not (skip_matching and skip_filter):
-            annotation.sanitize(not skip_matching, not skip_filter, True)
+        if genome_found:
+            # update references to annotation files
+            genome = Genome(localname, genomes_dir=genomes_dir)
+            if not (skip_matching and skip_filter):
+                annotation.sanitize(not skip_matching, not skip_filter, True)
 
     # Run active plugins (also if the genome was downloaded earlier)
     if genome_found:

diff --git a/genomepy/plugins/hisat2.py b/genomepy/plugins/hisat2.py
@@ -28,7 +28,6 @@ def after_genome_download(self, genome, threads=1, force=False):
 
             # if an annotation is present, generate a splice-aware index
             gtf_file = genome.annotation_gtf_file
-            print(gtf_file)
             if gtf_file:
                 with extracted_file(gtf_file) as _gtf_file:
                     # generate splice and exon site files to enhance indexing
@@ -60,7 +59,6 @@ def after_genome_download(self, genome, threads=1, force=False):
 
                 # Create index
                 run_index_cmd("hisat2", cmd)
-            print(gtf_file)
 
     def get_properties(self, genome):
         props = {

diff --git a/genomepy/providers/__init__.py b/genomepy/providers/__init__.py
@@ -106,20 +106,24 @@ def online_providers(provider: str = None):
             logger.warning(str(e))
 
 
-def search(term: str or int, provider: str = None, size=False):
+def search(term: str or int, provider: str = None, exact=False, size=False):
     """
-    Search for a genome.
+    Search for term in genome names and descriptions (if term contains text; case-insensitive),
+    assembly accession IDs (if term starts with GCA_ or GCF_),
+    or taxonomy IDs (if term is a number).
 
-    If provider is specified, search only that specific provider, else
-    search all providers. Both the name and description are used for the
-    search. Search term is case-insensitive and can contain regex.
+    If provider is specified, search only that specific provider, else search all providers.
+
+    Note: exact accession ID search on UCSC may return different patch levels.
 
     Parameters
     ----------
     term : str, int
         Search term, case-insensitive, allows regex.
     provider : str , optional
         Only search the specified provider (faster).
+    exact : bool, optional
+        term must be an exact match
     size : bool, optional
         Show absolute genome size.
 
@@ -129,7 +133,7 @@ def search(term: str or int, provider: str = None, size=False):
         genome name, provider and metadata
     """
     for p in online_providers(provider):
-        for row in p.search(term, size):
+        for row in p.search(term, exact, size):
             ret = list(row[:1]) + [p.name] + list(row[1:])
             yield ret
 

diff --git a/genomepy/providers/base.py b/genomepy/providers/base.py
@@ -71,20 +71,23 @@ def _check_name(self, name):
             f"  genomepy search {name} -p {self.name}"
         )
 
-    def _genome_info_tuple(self, name, size=False):
+    def _genome_info_tuple(self, name: str, size: bool = False):
         """tuple with assembly metadata"""
         raise NotImplementedError()
 
     def list_available_genomes(self, size=False):
         """
         List all available genomes.
 
+        Parameters
+        ----------
+        size : bool, optional
+            Show absolute genome size.
+
         Yields
         ------
         genomes : list of tuples
             tuples with assembly name, accession, scientific_name, taxonomy id and description
-        size : bool, optional
-            Show absolute genome size.
         """
         for name in self.genomes.keys():
             yield self._genome_info_tuple(name, size)
@@ -317,10 +320,13 @@ def download_annotation(self, name, genomes_dir=None, localname=None, **kwargs):
         readme = os.path.join(genomes_dir, localname, "README.txt")
         update_readme(readme, updated_metadata={"annotation url": link})
 
-    def _search_text(self, term: str) -> Iterator[str]:
+    def _search_text(self, term: str, exact=False) -> Iterator[str]:
         """check if search term is found in the provider's genome name or description field(s)"""
-        # multiple search terms: order doesn't matter
-        if " " in term:
+        if exact:
+            # allow several commonly used spacers inside the term
+            term = re.sub(r"[ _.-]", r"[ _.-]", rf"^{term}$")
+        elif " " in term:
+            # multiple search terms: order doesn't matter
             term = "".join([f"(?=.*{t})" for t in term.split()])
 
         pattern = re.compile(term, re.I)  # case insensitive
@@ -329,36 +335,46 @@ def _search_text(self, term: str) -> Iterator[str]:
             if any(pattern.search(t) for t in texts):
                 yield name
 
-    def _search_accession(self, term: str) -> Iterator[str]:
+    def _search_accession(self, term: str, exact=False) -> Iterator[str]:
         """check if search term is found in the provider's accession field(s)"""
-        # cut off prefix (GCA_/GCF_) and suffix (version numbers, e.g. '.3')
-        term = term[4:].split(".")[0]
+        # exact: match the full, uppercased accession; otherwise cut off prefix (GCA/GCF) and suffix (version numbers, e.g. '.3')
+        term = term.upper() if exact else term[3:].split(".")[0]
         for name, metadata in self.genomes.items():
             if any(term in str(metadata[f]) for f in self.accession_fields):
                 yield name
 
-    def _search_taxonomy(self, term: str) -> Iterator[str]:
+    def _search_taxonomy(self, term: str, exact=False) -> Iterator[str]:
         """check if search term is the start of the provider's taxonomy field(s)"""
+
+        def fuzzy_match(md, t):
+            return str(md).strip().startswith(t)
+
+        def exact_match(md, t):
+            return str(md).strip() == t
+
+        func = exact_match if exact else fuzzy_match
         for name, metadata in self.genomes.items():
-            if any(
-                str(metadata[f]).strip().startswith(term) for f in self.taxid_fields
-            ):
+            if any(func(metadata[f], term) for f in self.taxid_fields):
                 yield name
 
-    def search(self, term: str or int, size=False):
+    def search(self, term: str or int, exact=False, size=False):
         """
-        Search for term in genome names, descriptions and taxonomy ID.
+        Search for term in genome names and descriptions (if term contains text; case-insensitive),
+        assembly accession IDs (if term starts with GCA_ or GCF_),
+        or taxonomy IDs (if term is a number).
 
-        The search is case-insensitive.
+        Note: exact accession ID search on UCSC may return different patch levels.
 
         Parameters
         ----------
         term : str, int
             Search term, case-insensitive.
-            Can be (part of) an assembly name (e.g. hg38),
-            scientific name (Danio rerio) or assembly
-            accession (`GCA_000146045`/`GCF_`),
-            or a taxonomy id (7227).
+            Can be an assembly name (e.g. hg38),
+            scientific name (Danio rerio),
+            assembly accession ID (GCA_000146045),
+            or taxonomy ID (7227).
+        exact : bool, optional
+            term must be an exact match
         size : bool, optional
             Show absolute genome size.
 
@@ -375,7 +391,7 @@ def search(self, term: str or int, size=False):
         else:
             search_function = self._search_text
 
-        for name in search_function(term):
+        for name in search_function(term, exact):
             yield self._genome_info_tuple(name, size)
 
     def head_annotation(self, name: str, genomes_dir=None, n: int = 5, **kwargs):
-Original file line number
+Diff line change
@@ Expand Up / @@ -34,7 +34,7 @@ dependencies: @@
       # Plugins
       - bowtie2
       - bwa
-      - gmap
+      - gmap <=2021.08.25
       - hisat2
       - minimap2
       - star
@@ Expand Down @@