From d76afc0cc9f20b3a1bc0467493c1e2815b88f599 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 24 Jul 2023 23:16:04 -0700 Subject: [PATCH] #321 updates for new SemMedDB flow --- Snakefile-extraction | 11 +++++++++++ Snakefile-semmeddb-extraction | 11 ----------- build-kg2-snakemake.sh | 11 +++-------- extract-semmeddb.sh | 9 --------- semmeddb_mysql_to_tuple_list_json.py | 8 +------- 5 files changed, 15 insertions(+), 35 deletions(-) delete mode 100644 Snakefile-semmeddb-extraction diff --git a/Snakefile-extraction b/Snakefile-extraction index c4d79689..9530d2f6 100644 --- a/Snakefile-extraction +++ b/Snakefile-extraction @@ -8,6 +8,17 @@ rule UMLS: shell: "bash -x " + config['CODE_DIR'] + "/extract-umls.sh " + config['BUILD_DIR'] + " {output} > {log} 2>&1" +rule SemMedDB: + input: + config['VALIDATION_PLACEHOLDER'] + output: + tuplelist = config['SEMMED_TUPLELIST_FILE'], + exclusion_list = config['SEMMED_EXCLUSION_FILE'] + log: + config['BUILD_DIR'] + "/extract-semmeddb" + config['TEST_SUFFIX'] + ".log" + shell: + "bash -x " + config['CODE_DIR'] + "/extract-semmeddb.sh {output.tuplelist} {output.exclusion_list} > {log} 2>&1" + rule UniProtKB: input: config['VALIDATION_PLACEHOLDER'] diff --git a/Snakefile-semmeddb-extraction b/Snakefile-semmeddb-extraction deleted file mode 100644 index baea2298..00000000 --- a/Snakefile-semmeddb-extraction +++ /dev/null @@ -1,11 +0,0 @@ -rule SemMedDB: - input: - config['VALIDATION_PLACEHOLDER'] - output: - tuplelist = config['SEMMED_TUPLELIST_FILE'], - exclusion_list = config['SEMMED_EXCLUSION_FILE'] - log: - config['BUILD_DIR'] + "/extract-semmeddb" + config['TEST_SUFFIX'] + ".log" - shell: - "bash -x " + config['CODE_DIR'] + "/extract-semmeddb.sh {output.tuplelist} {output.exclusion_list} " + config['TEST_FLAG'] + " > {log} 2>&1" - diff --git a/build-kg2-snakemake.sh b/build-kg2-snakemake.sh index 8b4cf33a..f0ef30b6 100755 --- a/build-kg2-snakemake.sh +++ b/build-kg2-snakemake.sh @@ -6,12 +6,12 @@ set -o nounset -o pipefail -o errexit if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then - echo Usage: "$0 [test|alltest|all|-n|nodes|graphic|-R_*|-F] [-n|nodes|graphic|-R_*|-F] " + echo Usage: "$0 [test|all|-n|nodes|graphic|-R_*|-F] [-n|nodes|graphic|-R_*|-F] " echo "[-n|nodes|graphic|-R_*|-F|ci] [nodes|ci|-n] [ci]" exit 2 fi -# Usage: build-kg2-snakemake.sh [test|alltest|all|-n|nodes|graphic|-R_*|-F] [-n|nodes|graphic|-R_*|-F] +# Usage: build-kg2-snakemake.sh [test|all|-n|nodes|graphic|-R_*|-F] [-n|nodes|graphic|-R_*|-F] # [-n|nodes|graphic|-R_*|-F|ci] [nodes|ci|-n] [ci] config_dir=`dirname "$0"` @@ -29,7 +29,7 @@ then ci_flag="ci" fi -if [[ "${build_flag}" == "test" || "${build_flag}" == "alltest" ]] +if [[ "${build_flag}" == "test" ]] then # The test argument for bash scripts (ex. extract-semmeddb.sh test) test_flag="test" @@ -128,11 +128,6 @@ echo 'include: "Snakefile-conversion"' >> ${snakefile} echo 'include: "Snakefile-post-etl"' >> ${snakefile} -if [[ "${build_flag}" == "all" || "${build_flag}" == "alltest" ]] -then - echo 'include: "Snakefile-semmeddb-extraction"' >> ${snakefile} -fi - if [[ "${build_flag}" == "all" ]] then echo 'include: "Snakefile-extraction"' >> ${snakefile} diff --git a/extract-semmeddb.sh b/extract-semmeddb.sh index ff55aaf3..d0850ba3 100755 --- a/extract-semmeddb.sh +++ b/extract-semmeddb.sh @@ -76,16 +76,7 @@ domain_range_exclusion_file=${2:-"${BUILD_DIR}/${domain_range_exclusion_filename ${curl_get} ${domain_range_exclusion_link} -o ${domain_range_exclusion_file} -if [[ "${build_flag}" == "test" || "${build_flag}" == 'alltest' ]] -then - test_arg=" --test" -else - test_arg="" -fi - - ${VENV_DIR}/bin/python3 ${CODE_DIR}/semmeddb_mysql_to_tuple_list_json.py \ - ${test_arg} \ ${mysql_conf} \ ${mysql_dbname} \ ${semmed_ver} \ diff --git a/semmeddb_mysql_to_tuple_list_json.py b/semmeddb_mysql_to_tuple_list_json.py index 6f863576..90500e00 100755 --- a/semmeddb_mysql_to_tuple_list_json.py +++ b/semmeddb_mysql_to_tuple_list_json.py @@ -26,7 +26,6 @@ def make_arg_parser(): arg_parser = argparse.ArgumentParser(description='semmeddb_mysql_to_tuple_list_json.py: extracts all the predicate triples from SemMedDB, ' + 'as a list of tuples') - arg_parser.add_argument('--test', dest='test', action="store_true", default=False) arg_parser.add_argument('mysqlConfigFile', type=str) arg_parser.add_argument('mysqlDBName', type=str) arg_parser.add_argument('versionNumber', type=str) @@ -44,11 +43,10 @@ def make_arg_parser(): version_date = args.versionDate output_file_name = args.outputFile version_number = version_number.strip('VER') - test_mode = args.test connection = pymysql.connect(read_default_file=mysql_config_file, db=mysql_db_name) preds_dict = dict() - output_info = kg2_util.create_single_jsonlines(test_mode) + output_info = kg2_util.create_single_jsonlines(False) output = output_info[0] # https://stackoverflow.com/questions/7208773/mysql-row-30153-was-cut-by-group-concat-error @@ -60,10 +58,6 @@ def make_arg_parser(): "FROM ((PREDICATION NATURAL JOIN CITATIONS) NATURAL JOIN SENTENCE) NATURAL JOIN PREDICATION_AUX " "GROUP BY SUBJECT_CUI, PREDICATE, OBJECT_CUI") - - if test_mode: - sql_statement += " LIMIT 10000" - with connection.cursor() as cursor: cursor.execute(max_len_sql_statement) cursor.fetchall()