From ea0b7e4ff63c452b4ce3c1d8aac34dc34d7de15b Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Tue, 17 Sep 2024 12:17:49 -0400 Subject: [PATCH 1/2] Compress nodes and edges by default --- src/kg_bioportal/cli.py | 11 +++++++++-- src/kg_bioportal/transformer.py | 25 +++++++++++++++++++------ 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/src/kg_bioportal/cli.py b/src/kg_bioportal/cli.py index a9dad1f..9763660 100644 --- a/src/kg_bioportal/cli.py +++ b/src/kg_bioportal/cli.py @@ -172,7 +172,14 @@ def download( @main.command() @click.option("--input_dir", "-i", default="data/raw", type=click.Path(exists=True)) @click.option("--output_dir", "-o", default="data/transformed") -def transform(input_dir, output_dir) -> None: +@click.option( + "--compress", + "-c", + is_flag=True, + default=True, + help="If true, compresses the output nodes and edges to tar.gz. Defaults to True.", +) +def transform(input_dir, output_dir, compress) -> None: """Transforms all ontologies in the input directory to KGX nodes and edges. Yields two log files: total_stats.yaml and onto_stats.yaml. @@ -190,7 +197,7 @@ def transform(input_dir, output_dir) -> None: tx = Transformer(input_dir=input_dir, output_dir=output_dir) - tx.transform_all() + tx.transform_all(compress=compress) return None diff --git a/src/kg_bioportal/transformer.py b/src/kg_bioportal/transformer.py index 354b515..e96b59d 100644 --- a/src/kg_bioportal/transformer.py +++ b/src/kg_bioportal/transformer.py @@ -3,13 +3,15 @@ import logging import os import sys +import tarfile from typing import Tuple import yaml from kgx.transformer import Transformer as KGXTransformer from kg_bioportal.downloader import ONTOLOGY_LIST_NAME -from kg_bioportal.robot_utils import initialize_robot, robot_convert, robot_relax +from kg_bioportal.robot_utils import (initialize_robot, robot_convert, + robot_relax) # TODO: Don't repeat steps if the products already exist # TODO: Fix KGX hijacking logging @@ -55,7 +57,7 @@ def __init__( return None - def transform_all(self) -> None: + def transform_all(self, compress: bool) -> None: """Transforms all ontologies in the input directory to KGX nodes and edges. Yields two log files: total_stats.yaml and onto_stats.yaml. @@ -63,7 +65,7 @@ def transform_all(self) -> None: The second contains the counts of nodes and edges for each ontology. Args: - None. + compress: If True, compresses the output nodes and edges to tar.gz. Returns: None. @@ -95,7 +97,7 @@ def transform_all(self) -> None: for filepath in filepaths: ontology_name = (os.path.relpath(filepath, self.input_dir)).split(os.sep)[0] - success, nodecount, edgecount = self.transform(filepath) + success, nodecount, edgecount = self.transform(filepath, compress) if not success: logging.error(f"Error transforming {filepath}.") status = False @@ -142,11 +144,12 @@ def transform_all(self) -> None: return None - def transform(self, ontology_path: str) -> Tuple[bool, int, int]: + def transform(self, ontology_path: str, compress: bool) -> Tuple[bool, int, int]: """Transforms a single ontology to KGX nodes and edges. Args: - ontology: A string of the path to the ontology file to transform. + ontology_path: A string of the path to the ontology file to transform. + compress: If True, compresses the output nodes and edges to tar.gz. Returns: Tuple of: @@ -239,6 +242,16 @@ def transform(self, ontology_path: str) -> Tuple[bool, int, int]: with open(edgefilename, "r") as f: edgecount = len(f.readlines()) - 1 + # Compress if requested + if compress: + logging.info("Compressing nodes and edges.") + with tarfile.open(f"{outfilename}.tar.gz", "w:gz") as tar: + tar.add(nodefilename, arcname=f"{ontology_name}_nodes.tsv") + tar.add(edgefilename, arcname=f"{ontology_name}_edges.tsv") + + os.remove(nodefilename) + os.remove(edgefilename) + except Exception as e: logging.error( f"Error transforming {ontology_name} to KGX nodes and edges: {e}" From 78884d71ea90956dda614f5514a6e14fe91de75a Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Tue, 17 Sep 2024 12:21:47 -0400 Subject: [PATCH 2/2] Remove interstitial products --- src/kg_bioportal/transformer.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/kg_bioportal/transformer.py b/src/kg_bioportal/transformer.py index e96b59d..21377d9 100644 --- a/src/kg_bioportal/transformer.py +++ b/src/kg_bioportal/transformer.py @@ -10,8 +10,7 @@ from kgx.transformer import Transformer as KGXTransformer from kg_bioportal.downloader import ONTOLOGY_LIST_NAME -from kg_bioportal.robot_utils import (initialize_robot, robot_convert, - robot_relax) +from kg_bioportal.robot_utils import initialize_robot, robot_convert, robot_relax # TODO: Don't repeat steps if the products already exist # TODO: Fix KGX hijacking logging @@ -248,10 +247,21 @@ def transform(self, ontology_path: str, compress: bool) -> Tuple[bool, int, int] with tarfile.open(f"{outfilename}.tar.gz", "w:gz") as tar: tar.add(nodefilename, arcname=f"{ontology_name}_nodes.tsv") tar.add(edgefilename, arcname=f"{ontology_name}_edges.tsv") - + os.remove(nodefilename) os.remove(edgefilename) + # Remove the owl files + # They may not exist if the transform failed + try: + os.remove(owl_output_path) + except OSError: + pass + try: + os.remove(relaxed_outpath) + except OSError: + pass + except Exception as e: logging.error( f"Error transforming {ontology_name} to KGX nodes and edges: {e}"