Skip to content

Commit

Permalink
#332 centralize location of filenames for post-etl scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
ecwood committed Jul 25, 2023
1 parent de4d28c commit 8a3c8ec
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 15 deletions.
6 changes: 3 additions & 3 deletions Snakefile-finish
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ rule Finish:
simplified_report_file = config['SIMPLIFIED_REPORT_FILE'],
slim_output_nodes_file = config['SLIM_OUTPUT_NODES_FILE'],
slim_output_edges_file = config['SLIM_OUTPUT_EDGES_FILE'],
placeholder = config['BUILD_DIR'] + "/tsv_placeholder.empty"
run:
shell("bash -x " + config['CODE_DIR'] + "/finish-snakemake.sh {input.final_output_nodes_file} {input.final_output_edges_file} {input.output_file_orphan_edges} {input.report_file} {input.simplified_output_nodes_file} {input.simplified_output_edges_file} {input.simplified_report_file} {input.slim_output_nodes_file} {input.slim_output_edges_file} " + config['KG2_TSV_DIR'] + " \"" + config['S3_CP_CMD'] + "\" " + config['KG2_TSV_TARBALL'] + " " + config['S3_BUCKET'] + " " + config['S3_BUCKET_PUBLIC'] + " " + config['CODE_DIR'] + " " + config['S3_BUCKET_VERSIONED'] + " " + config['BUILD_DIR'] + " " + config['SIMPLIFIED_REPORT_FILE_BASE'] + " " + config['VENV_DIR'])
placeholder = config['TSV_PLACEHOLDER']
shell:
"bash -x " + config['CODE_DIR'] + "/finish-snakemake.sh {input.final_output_nodes_file} {input.final_output_edges_file} {input.output_file_orphan_edges} {input.report_file} {input.simplified_output_nodes_file} {input.simplified_output_edges_file} {input.simplified_report_file} {input.slim_output_nodes_file} {input.slim_output_edges_file} " + config['KG2_TSV_DIR'] + " \"" + config['S3_CP_CMD'] + "\" " + config['KG2_TSV_TARBALL'] + " " + config['S3_BUCKET'] + " " + config['S3_BUCKET_PUBLIC'] + " " + config['CODE_DIR'] + " " + config['S3_BUCKET_VERSIONED'] + " " + config['BUILD_DIR'] + " " + config['SIMPLIFIED_REPORT_FILE_BASE'] + " " + config['VENV_DIR']


34 changes: 22 additions & 12 deletions Snakefile-post-etl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
rule Merge:
input:
code = config['MERGE_SCRIPT'],
ont_nodes = config['ONT_OUTPUT_NODES_FILE'],
ont_edges = config['ONT_OUTPUT_EDGES_FILE'],
uniprot_nodes = config['UNIPROTKB_OUTPUT_NODES_FILE'],
Expand Down Expand Up @@ -44,8 +45,10 @@ rule Merge:
nodes = config['FINAL_OUTPUT_NODES_FILE'],
edges = config['FINAL_OUTPUT_EDGES_FILE'],
orph = config['OUTPUT_FILE_ORPHAN_EDGES']
log:
config['MERGE_LOG']
shell:
config['VENV_DIR'] + "/bin/python3 -u " + config['CODE_DIR'] + "/merge_graphs.py " + config['TEST_ARG'] + \
config['PYTHON_COMMAND'] + " {input.code} " + config['TEST_ARG'] + \
" --kgFileOrphanEdges {output.orph}" + \
" --outputNodesFile {output.nodes} " + \
" --outputEdgesFile {output.edges} " + \
Expand Down Expand Up @@ -90,62 +93,69 @@ rule Merge:
"{input.drugcentral_edges} " + \
"{input.intact_edges} " + \
"{input.disgenet_edges} " + \
"{input.kegg_edges}"
"{input.kegg_edges} > {log} 2>&1"

# Rule Stats: runs the report script (input.code) on the final nodes and edges
# files, writes the report to config['REPORT_FILE'], and redirects stdout and
# stderr to the rule's log file.
# NOTE(review): this is a rendered diff without +/- markers; where two
# consecutive lines carry the same directive (log path, shell command), the
# first appears to be the pre-commit line and the second its replacement.
rule Stats:
input:
code = config['REPORT_SCRIPT'],
nodes = config['FINAL_OUTPUT_NODES_FILE'],
edges = config['FINAL_OUTPUT_EDGES_FILE']
output:
config['REPORT_FILE']
log:
config['BUILD_DIR'] + "/report_stats_on_json_kg" + config['TEST_SUFFIX'] + ".log"
config['REPORT_LOG']
shell:
config['VENV_DIR'] + "/bin/python3 -u " + config['CODE_DIR'] + "/report_stats_on_json_kg.py {input.nodes} {input.edges} {output} > {log} 2>&1"
config['PYTHON_COMMAND'] + " {input.code} {input.nodes} {input.edges} {output} > {log} 2>&1"

# Rule Simplify: runs the simplify script (input.code) under `bash -x`, passing
# the final nodes/edges files as inputs and the simplified nodes/edges files as
# outputs, followed by config['VERSION_FILE'] and config['TEST_FLAG']; stdout
# and stderr go to the rule's log file.
# NOTE(review): this is a rendered diff without +/- markers; where two
# consecutive lines carry the same directive (log path, shell command), the
# first appears to be the pre-commit line and the second its replacement.
rule Simplify:
input:
code = config['SIMPLIFY_SCRIPT'],
nodes = config['FINAL_OUTPUT_NODES_FILE'],
edges = config['FINAL_OUTPUT_EDGES_FILE']
output:
nodes = config['SIMPLIFIED_OUTPUT_NODES_FILE'],
edges = config['SIMPLIFIED_OUTPUT_EDGES_FILE']
log:
config['BUILD_DIR'] + "/filter_kg_and_remap_predicates" + config['TEST_SUFFIX'] + ".log"
config['SIMPLIFY_LOG']
shell:
"bash -x " + config['CODE_DIR'] + "/run-simplify.sh {input.nodes} {input.edges} {output.nodes} {output.edges} " + config['VERSION_FILE'] + " " + config['TEST_FLAG'] + " > {log} 2>&1"
"bash -x {input.code} {input.nodes} {input.edges} {output.nodes} {output.edges} " + config['VERSION_FILE'] + " " + config['TEST_FLAG'] + " > {log} 2>&1"

# Rule Slim: runs the slim script (input.code) with config['TEST_ARG'], reading
# the simplified nodes/edges files and writing the slim nodes/edges files;
# stdout and stderr go to the rule's log file.
# NOTE(review): this is a rendered diff without +/- markers; where two
# consecutive lines carry the same directive (log path, shell command), the
# first appears to be the pre-commit line and the second its replacement.
rule Slim:
input:
code = config['SLIM_SCRIPT'],
nodes = config['SIMPLIFIED_OUTPUT_NODES_FILE'],
edges = config['SIMPLIFIED_OUTPUT_EDGES_FILE']
output:
nodes = config['SLIM_OUTPUT_NODES_FILE'],
edges = config['SLIM_OUTPUT_EDGES_FILE']
log:
config['BUILD_DIR'] + "/slim_kg2" + config['TEST_SUFFIX'] + ".log"
config['SLIM_LOG']
shell:
config['VENV_DIR'] + "/bin/python3 -u " + config['CODE_DIR'] + "/slim_kg2.py " + config['TEST_ARG'] + " {input.nodes} {input.edges} {output.nodes} {output.edges} > {log} 2>&1"
config['PYTHON_COMMAND'] + " {input.code} " + config['TEST_ARG'] + " {input.nodes} {input.edges} {output.nodes} {output.edges} > {log} 2>&1"

# Rule Simplify_Stats: runs the same report script as rule Stats
# (config['REPORT_SCRIPT']) with --useSimplifiedPredicates on the simplified
# nodes/edges files, writing config['SIMPLIFIED_REPORT_FILE']; stdout and
# stderr go to the rule's log file.
# NOTE(review): this is a rendered diff without +/- markers; where two
# consecutive lines carry the same directive (log path, shell command), the
# first appears to be the pre-commit line and the second its replacement.
rule Simplify_Stats:
input:
code = config['REPORT_SCRIPT'],
nodes = config['SIMPLIFIED_OUTPUT_NODES_FILE'],
edges = config['SIMPLIFIED_OUTPUT_EDGES_FILE']
output:
config['SIMPLIFIED_REPORT_FILE']
log:
config['BUILD_DIR'] + "/report_stats_on_json_kg_simplified" + config['TEST_SUFFIX'] + ".log"
config['SIMPLIFIED_REPORT_LOG']
shell:
config['VENV_DIR'] + "/bin/python3 -u " + config['CODE_DIR'] + "/report_stats_on_json_kg.py --useSimplifiedPredicates {input.nodes} {input.edges} {output} > {log} 2>&1"
config['PYTHON_COMMAND'] + " {input.code} --useSimplifiedPredicates {input.nodes} {input.edges} {output} > {log} 2>&1"

# Rule TSV: deletes and recreates config['KG2_TSV_DIR'], runs the TSV
# conversion script (input.code) on the simplified nodes/edges files with that
# directory as its last argument, then touches the placeholder file that is the
# rule's declared output.
# NOTE(review): this is a rendered diff without +/- markers; where two
# consecutive lines carry the same directive (placeholder path, conversion
# command), the first appears to be the pre-commit line and the second its
# replacement.
rule TSV:
input:
code = config['TSV_SCRIPT'],
nodes = config['SIMPLIFIED_OUTPUT_NODES_FILE'],
edges = config['SIMPLIFIED_OUTPUT_EDGES_FILE']
output:
placeholder = config['BUILD_DIR'] + "/tsv_placeholder.empty"
placeholder = config['TSV_PLACEHOLDER']
log:
config['TSV_LOG']
run:
shell("rm -rf " + config['KG2_TSV_DIR'])
shell("mkdir -p " + config['KG2_TSV_DIR'])
shell(config['VENV_DIR'] + "/bin/python3 -u " + config['CODE_DIR'] + "/kg_json_to_tsv.py {input.nodes} {input.edges} " + config['KG2_TSV_DIR'])
shell(config['PYTHON_COMMAND'] + " {input.code} {input.nodes} {input.edges} " + config['KG2_TSV_DIR'] + " > {log} 2>&1")
shell("touch {output.placeholder}")
12 changes: 12 additions & 0 deletions snakemake-config-var.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -162,22 +162,34 @@ kegg_conversion_log: ${BUILD_DIR}/kegg_json_to_kg_json${test_suffix}.log
kegg_output_nodes_file: ${BUILD_DIR}/kg2-kegg-nodes${test_suffix}.jsonl
kegg_output_edges_file: ${BUILD_DIR}/kg2-kegg-edges${test_suffix}.jsonl

merge_script: ${CODE_DIR}/merge_graphs.py
merge_log: ${BUILD_DIR}/merge_graphs${test_suffix}.log
final_output_nodes_file: ${BUILD_DIR}/kg2-nodes${test_suffix}.jsonl
final_output_edges_file: ${BUILD_DIR}/kg2-edges${test_suffix}.jsonl
output_file_orphan_edges: ${BUILD_DIR}/kg2-orphan-edges${test_suffix}.jsonl

simplify_script: ${CODE_DIR}/run-simplify.sh
simplify_log: ${BUILD_DIR}/run-simplify${test_suffix}.log
simplified_output_nodes_file: ${BUILD_DIR}/kg2-simplified-nodes${test_suffix}.jsonl
simplified_output_edges_file: ${BUILD_DIR}/kg2-simplified-edges${test_suffix}.jsonl

report_script: ${CODE_DIR}/report_stats_on_kg_jsonl.py
report_log: ${BUILD_DIR}/report_stats_on_kg_jsonl${test_suffix}.log
report_file: ${BUILD_DIR}/kg2-report${test_suffix}.json

simplified_report_log: ${BUILD_DIR}/report_stats_on_kg_jsonl-simplified${test_suffix}.log
simplified_report_file: ${BUILD_DIR}/kg2-simplified-report${test_suffix}.json

slim_script: ${CODE_DIR}/slim_kg2.py
slim_log: ${BUILD_DIR}/slim_kg2${test_suffix}.log
slim_output_nodes_file: ${BUILD_DIR}/kg2-slim-nodes${test_suffix}.jsonl
slim_output_edges_file: ${BUILD_DIR}/kg2-slim-edges${test_suffix}.jsonl

tsv_script: ${CODE_DIR}/kg_json_to_tsv.py
tsv_log: ${BUILD_DIR}/kg_json_to_tsv${test_suffix}.log
kg2_tsv_dir: ${BUILD_DIR}/TSV
kg2_tsv_tarball: ${BUILD_DIR}/kg2-tsv-for-neo4j${test_suffix}.tar.gz
tsv_placeholder: ${BUILD_DIR}/tsv_placeholder.empty

version_file: ${BUILD_DIR}/kg2-version.txt

Expand Down

0 comments on commit 8a3c8ec

Please sign in to comment.