Merge pull request #52 from ncbo/restructure_2

Restructuring - get all ontologies
ncbo · Aug 16, 2024 · b323dd5 · b323dd5
2 parents f99f783 + 136c23a
commit b323dd5
Show file tree

Hide file tree

Showing 3 changed files with 85 additions and 19 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/src/kg_bioportal/cli.py b/src/kg_bioportal/cli.py
@@ -32,6 +32,37 @@ def main(verbose: int, quiet: bool):
     logger.info(f"Logger {logger.name} set to level {logger.level}")
 
 
+@main.command()
+@click.option(
+    "output_dir", "-o", default="data/raw", show_default=True,
+    help="Directory to save the ontology list in",
+)
+@click.option(
+    "api_key",
+    "-k",
+    required=False,
+    type=str,
+    help="API key for BioPortal",
+)
+def get_ontology_list(output_dir, api_key) -> None:
+    """Downloads the list of all BioPortal ontologies and saves to a file in the data directory (default: data/raw).
+
+    Args:
+
+        output_dir: A string pointing to the directory to download data to.
+        Defaults to data/raw.
+
+        api_key: BioPortal / NCBO API key.
+
+    Returns:
+        None.
+
+    """
+    # Fetching and file writing are both delegated to the Downloader.
+    dl = Downloader(output_dir=output_dir, api_key=api_key)
+    dl.get_ontology_list()
+
+
 @main.command()
 @click.option(
     "ontologies",
@@ -67,7 +98,9 @@ def main(verbose: int, quiet: bool):
     type=str,
     help="API key for BioPortal",
 )
-def download(ontologies, ontology_file, output_dir, snippet_only, ignore_cache, api_key) -> None:
+def download(
+    ontologies, ontology_file, output_dir, snippet_only, ignore_cache, api_key
+) -> None:
     """Downloads specified ontologies into data directory (default: data/raw).
 
     Args:
@@ -81,9 +114,11 @@ def download(ontologies, ontology_file, output_dir, snippet_only, ignore_cache,
         output_dir: A string pointing to the directory to download data to.
         Defaults to data/raw.
 
-        snippet_only: Downloads only the first 5 kB of the source, for testing and file checks.
+        snippet_only: (Not yet implemented) Downloads only the first 5 kB of the source, for testing and file checks.
+
+        ignore_cache: (Not yet implemented) If specified, will ignore existing files and download again.
 
-        ignore_cache: If specified, will ignore existing files and download again.
+        api_key: BioPortal / NCBO API key.
 
     Returns:
         None.
@@ -105,7 +140,12 @@ def download(ontologies, ontology_file, output_dir, snippet_only, ignore_cache,
 
     logging.info(f"{len(onto_list)} ontologies to retrieve.")
 
-    dl = Downloader(output_dir, snippet_only, ignore_cache, api_key)
+    dl = Downloader(
+        output_dir=output_dir,
+        snippet_only=snippet_only,
+        ignore_cache=ignore_cache,
+        api_key=api_key,
+    )
 
     dl.download(onto_list)
 

diff --git a/src/kg_bioportal/downloader.py b/src/kg_bioportal/downloader.py
@@ -4,9 +4,12 @@
 import os
 import requests
 
+ONTOLOGY_LIST_NAME = "ontologylist.tsv"  # name of the list file written inside output_dir
 
 class Downloader:
 
+    # TODO: implement ignore_cache and snippet_only
+
     # Directory to save the downloaded files
     output_dir: str = "data/raw"
 
@@ -87,3 +90,26 @@ def download(self, onto_list: list = []) -> None:
 
 
         return None
+
+    def get_ontology_list(self) -> None:
+        """Fetch the BioPortal analytics listing and save one name per line.
+
+        Iterates the decoded JSON reply (presumably a mapping keyed by ontology
+        acronym - TODO confirm) and writes each item on its own line to
+        ONTOLOGY_LIST_NAME inside self.output_dir, creating that directory
+        when it is missing.
+
+        Raises:
+            requests.HTTPError: if BioPortal answers with an error status.
+        """
+        logging.info("Getting set of all ontologies...")
+        headers = {"Authorization": f"apikey token={self.api_key}"}
+        analytics_url = "https://data.bioontology.org/analytics"
+        # The timeout keeps an unresponsive server from hanging the command.
+        response = requests.get(analytics_url, headers=headers, allow_redirects=True, timeout=60)
+        response.raise_for_status()  # never write an error payload as names
+        os.makedirs(self.output_dir, exist_ok=True)
+        list_path = f"{self.output_dir}/{ONTOLOGY_LIST_NAME}"
+        with open(list_path, "w", encoding="utf-8") as outfile:
+            outfile.writelines(f"{name}\n" for name in response.json())
+        logging.info(f"Wrote to {list_path}")