Skip to content

Commit

Permalink
Merge pull request #88 from ncbo/85-compress-nodes-and-edges-before-upload
Browse files Browse the repository at this point in the history

Compress nodes and edges before upload; remove interstitial files
  • Loading branch information
caufieldjh authored Sep 17, 2024
2 parents 8392336 + 78884d7 commit c1fe4cd
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 7 deletions.
11 changes: 9 additions & 2 deletions src/kg_bioportal/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,14 @@ def download(
@main.command()
@click.option("--input_dir", "-i", default="data/raw", type=click.Path(exists=True))
@click.option("--output_dir", "-o", default="data/transformed")
def transform(input_dir, output_dir) -> None:
@click.option(
"--compress",
"-c",
is_flag=True,
default=True,
help="If true, compresses the output nodes and edges to tar.gz. Defaults to True.",
)
def transform(input_dir, output_dir, compress) -> None:
"""Transforms all ontologies in the input directory to KGX nodes and edges.
Yields two log files: total_stats.yaml and onto_stats.yaml.
Expand All @@ -190,7 +197,7 @@ def transform(input_dir, output_dir) -> None:

tx = Transformer(input_dir=input_dir, output_dir=output_dir)

tx.transform_all()
tx.transform_all(compress=compress)

return None

Expand Down
33 changes: 28 additions & 5 deletions src/kg_bioportal/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
import os
import sys
import tarfile
from typing import Tuple

import yaml
Expand Down Expand Up @@ -55,15 +56,15 @@ def __init__(

return None

def transform_all(self) -> None:
def transform_all(self, compress: bool) -> None:
"""Transforms all ontologies in the input directory to KGX nodes and edges.
Yields two log files: total_stats.yaml and onto_stats.yaml.
The first contains the total counts of Bioportal ontologies and transforms.
The second contains the counts of nodes and edges for each ontology.
Args:
None.
compress: If True, compresses the output nodes and edges to tar.gz.
Returns:
None.
Expand Down Expand Up @@ -95,7 +96,7 @@ def transform_all(self) -> None:

for filepath in filepaths:
ontology_name = (os.path.relpath(filepath, self.input_dir)).split(os.sep)[0]
success, nodecount, edgecount = self.transform(filepath)
success, nodecount, edgecount = self.transform(filepath, compress)
if not success:
logging.error(f"Error transforming {filepath}.")
status = False
Expand Down Expand Up @@ -142,11 +143,12 @@ def transform_all(self) -> None:

return None

def transform(self, ontology_path: str) -> Tuple[bool, int, int]:
def transform(self, ontology_path: str, compress: bool) -> Tuple[bool, int, int]:
"""Transforms a single ontology to KGX nodes and edges.
Args:
ontology: A string of the path to the ontology file to transform.
ontology_path: A string of the path to the ontology file to transform.
compress: If True, compresses the output nodes and edges to tar.gz.
Returns:
Tuple of:
Expand Down Expand Up @@ -239,6 +241,27 @@ def transform(self, ontology_path: str) -> Tuple[bool, int, int]:
with open(edgefilename, "r") as f:
edgecount = len(f.readlines()) - 1

# Compress if requested
if compress:
logging.info("Compressing nodes and edges.")
with tarfile.open(f"{outfilename}.tar.gz", "w:gz") as tar:
tar.add(nodefilename, arcname=f"{ontology_name}_nodes.tsv")
tar.add(edgefilename, arcname=f"{ontology_name}_edges.tsv")

os.remove(nodefilename)
os.remove(edgefilename)

# Remove the owl files
# They may not exist if the transform failed
try:
os.remove(owl_output_path)
except OSError:
pass
try:
os.remove(relaxed_outpath)
except OSError:
pass

except Exception as e:
logging.error(
f"Error transforming {ontology_name} to KGX nodes and edges: {e}"
Expand Down

0 comments on commit c1fe4cd

Please sign in to comment.