Skip to content

Commit

Permalink
Compress nodes and edges by default
Browse files (browse the repository at this point in the history)
  • Loading branch information
caufieldjh committed Sep 17, 2024
1 parent 8392336 commit ea0b7e4
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 8 deletions.
11 changes: 9 additions & 2 deletions src/kg_bioportal/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,14 @@ def download(
@main.command()
@click.option("--input_dir", "-i", default="data/raw", type=click.Path(exists=True))
@click.option("--output_dir", "-o", default="data/transformed")
def transform(input_dir, output_dir) -> None:
@click.option(
"--compress",
"-c",
is_flag=True,
default=True,
help="If true, compresses the output nodes and edges to tar.gz. Defaults to True.",
)
def transform(input_dir, output_dir, compress) -> None:
"""Transforms all ontologies in the input directory to KGX nodes and edges.
Yields two log files: total_stats.yaml and onto_stats.yaml.
Expand All @@ -190,7 +197,7 @@ def transform(input_dir, output_dir) -> None:

tx = Transformer(input_dir=input_dir, output_dir=output_dir)

tx.transform_all()
tx.transform_all(compress=compress)

return None

Expand Down
25 changes: 19 additions & 6 deletions src/kg_bioportal/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
import logging
import os
import sys
import tarfile
from typing import Tuple

import yaml
from kgx.transformer import Transformer as KGXTransformer

from kg_bioportal.downloader import ONTOLOGY_LIST_NAME
from kg_bioportal.robot_utils import initialize_robot, robot_convert, robot_relax
from kg_bioportal.robot_utils import (initialize_robot, robot_convert,
robot_relax)

# TODO: Don't repeat steps if the products already exist
# TODO: Fix KGX hijacking logging
Expand Down Expand Up @@ -55,15 +57,15 @@ def __init__(

return None

def transform_all(self) -> None:
def transform_all(self, compress: bool) -> None:
"""Transforms all ontologies in the input directory to KGX nodes and edges.
Yields two log files: total_stats.yaml and onto_stats.yaml.
The first contains the total counts of Bioportal ontologies and transforms.
The second contains the counts of nodes and edges for each ontology.
Args:
None.
compress: If True, compresses the output nodes and edges to tar.gz.
Returns:
None.
Expand Down Expand Up @@ -95,7 +97,7 @@ def transform_all(self) -> None:

for filepath in filepaths:
ontology_name = (os.path.relpath(filepath, self.input_dir)).split(os.sep)[0]
success, nodecount, edgecount = self.transform(filepath)
success, nodecount, edgecount = self.transform(filepath, compress)
if not success:
logging.error(f"Error transforming {filepath}.")
status = False
Expand Down Expand Up @@ -142,11 +144,12 @@ def transform_all(self) -> None:

return None

def transform(self, ontology_path: str) -> Tuple[bool, int, int]:
def transform(self, ontology_path: str, compress: bool) -> Tuple[bool, int, int]:
"""Transforms a single ontology to KGX nodes and edges.
Args:
ontology: A string of the path to the ontology file to transform.
ontology_path: A string of the path to the ontology file to transform.
compress: If True, compresses the output nodes and edges to tar.gz.
Returns:
Tuple of:
Expand Down Expand Up @@ -239,6 +242,16 @@ def transform(self, ontology_path: str) -> Tuple[bool, int, int]:
with open(edgefilename, "r") as f:
edgecount = len(f.readlines()) - 1

# Compress if requested
if compress:
logging.info("Compressing nodes and edges.")
with tarfile.open(f"{outfilename}.tar.gz", "w:gz") as tar:
tar.add(nodefilename, arcname=f"{ontology_name}_nodes.tsv")
tar.add(edgefilename, arcname=f"{ontology_name}_edges.tsv")

os.remove(nodefilename)
os.remove(edgefilename)

except Exception as e:
logging.error(
f"Error transforming {ontology_name} to KGX nodes and edges: {e}"
Expand Down

0 comments on commit ea0b7e4

Please sign in to comment.