Skip to content

Commit

Permalink
#321 revive SemMedDB versioning
Browse files Browse the repository at this point in the history
  • Loading branch information
ecwood committed Jul 26, 2023
1 parent 7f1ccb0 commit e5a66d4
Show file tree
Hide file tree
Showing 6 changed files with 25 additions and 24 deletions.
3 changes: 2 additions & 1 deletion Snakefile-conversion
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,15 @@ rule SemMedDB_Conversion:
real = config['SEMMEDDB_TUPLELIST_FILE'],
mrcui_req = config['UMLS_CUI_FILE'],
exclusion_list = config['SEMMEDDB_EXCLUSION_FILE'],
version_file = config['SEMMEDDB_VERSION_FILE'],
validation = config['VALIDATION_PLACEHOLDER']
output:
nodes = config['SEMMEDDB_OUTPUT_NODES_FILE'],
edges = config['SEMMEDDB_OUTPUT_EDGES_FILE']
log:
config['SEMMEDDB_CONVERSION_LOG']
shell:
config['PYTHON_COMMAND'] + " {input.code} --mrcuiFile ~/kg2-build/umls/META/MRCUI.RRF {input.real} {input.exclusion_list} {output.nodes} {output.edges} " + config['TEST_ARG'] + " > {log} 2>&1"
config['PYTHON_COMMAND'] + " {input.code} --mrcuiFile ~/kg2-build/umls/META/MRCUI.RRF {input.real} {input.exclusion_list} {input.version_file} {output.nodes} {output.edges} " + config['TEST_ARG'] + " > {log} 2>&1"

rule UniProtKB_Conversion:
input:
Expand Down
5 changes: 3 additions & 2 deletions Snakefile-extraction
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@ rule SemMedDB:
validation = config['VALIDATION_PLACEHOLDER']
output:
tuplelist = config['SEMMEDDB_TUPLELIST_FILE'],
exclusion_list = config['SEMMEDDB_EXCLUSION_FILE']
exclusion_list = config['SEMMEDDB_EXCLUSION_FILE'],
version_file = config['SEMMEDDB_VERSION_FILE']
log:
config['SEMMEDDB_EXTRACTION_LOG']
shell:
"bash -x {input.code} {output.tuplelist} {output.exclusion_list} > {log} 2>&1"
"bash -x {input.code} {output.tuplelist} {output.exclusion_list} {output.version_file} > {log} 2>&1"

rule UniProtKB:
input:
Expand Down
20 changes: 9 additions & 11 deletions extract-semmeddb.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
set -o nounset -o pipefail -o errexit

if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
echo Usage: "$0 <output_file.json> [test]"
echo Usage: "$0 <output_file.json> <output_exclude_list.yaml> <output_versioning.txt>"
exit 2
fi

# Usage: extract-semmeddb.sh <output_file.json>
# Usage: extract-semmeddb.sh <output_file.json> <output_exclude_list.yaml> <output_versioning.txt>

echo "================= starting extract-semmeddb.sh ================="
date
Expand All @@ -18,15 +18,16 @@ config_dir=`dirname "$0"`
source ${config_dir}/master-config.shinc

semmed_output_file=${1:-"${BUILD_DIR}/kg2-semmeddb-tuplelist.json"}

## supply a default value for the build_flag string
build_flag=${3:-""}
domain_range_exclusion_file=${2:-"${BUILD_DIR}/${domain_range_exclusion_filename}"}
semmeddb_version_file=${3:-"${BUILD_DIR}/semmeddb-version.txt"}

semmed_ver=VER43
semmed_year=2023
semmed_dir=${BUILD_DIR}/semmeddb
semmed_output_dir=`dirname "${semmed_output_file}"`

echo -e "Version: ${semmed_ver}\nYear: ${semmed_year}" > ${semmeddb_version_file}

## SQL files
base_filename=semmed${semmed_ver}_${semmed_year}_R_

Expand Down Expand Up @@ -72,16 +73,13 @@ biolink_base_url_no_version=https://raw.githubusercontent.com/biolink/biolink-mo
biolink_raw_base_url=${biolink_base_url_no_version}v${biolink_model_version}/
domain_range_exclusion_filename=semmed-exclude-list.yaml
domain_range_exclusion_link=${biolink_raw_base_url}${domain_range_exclusion_filename}
domain_range_exclusion_file=${2:-"${BUILD_DIR}/${domain_range_exclusion_filename}"}

${curl_get} ${domain_range_exclusion_link} -o ${domain_range_exclusion_file}

${python_command} ${CODE_DIR}/semmeddb_mysql_to_tuplelist_jsonl.py \
${mysql_conf} \
${mysql_dbname} \
${semmed_ver} \
${semmed_year} \
${semmed_output_file}
${mysql_conf} \
${mysql_dbname} \
${semmed_output_file}

date
echo "================= finished extract-semmeddb.sh ================="
Expand Down
4 changes: 0 additions & 4 deletions semmeddb_mysql_to_tuplelist_jsonl.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ def make_arg_parser():
'as a list of tuples')
arg_parser.add_argument('mysqlConfigFile', type=str)
arg_parser.add_argument('mysqlDBName', type=str)
arg_parser.add_argument('versionNumber', type=str)
arg_parser.add_argument('versionDate', type=str)
arg_parser.add_argument('outputFile', type=str)
return arg_parser

Expand All @@ -39,8 +37,6 @@ def make_arg_parser():
args = make_arg_parser().parse_args()
mysql_config_file = args.mysqlConfigFile
mysql_db_name = args.mysqlDBName
version_number = args.versionNumber
version_date = args.versionDate
output_file_name = args.outputFile
version_number = version_number.strip('VER')
connection = pymysql.connect(read_default_file=mysql_config_file, db=mysql_db_name)
Expand Down
16 changes: 10 additions & 6 deletions semmeddb_tuplelist_json_to_kg_jsonl.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def make_arg_parser():
arg_parser.add_argument('--mrcuiFile', dest='mrcui_file_name', type=str, default='/home/ubuntu/kg2-build/umls/META/MRCUI.RRF')
arg_parser.add_argument('inputFile', type=str)
arg_parser.add_argument('semmedExcludeList', type=str)
arg_parser.add_argument('versionFile', type=str)
arg_parser.add_argument('outputNodesFile', type=str)
arg_parser.add_argument('outputEdgesFile', type=str)
return arg_parser
Expand Down Expand Up @@ -232,6 +233,7 @@ def create_semmed_exclude_list(semmed_exclude_list_name):
semmed_exclude_list_name = args.semmedExcludeList
exclusions = create_semmed_exclude_list(semmed_exclude_list_name)
input_file_name = args.inputFile
version_file = args.versionFile
output_nodes_file_name = args.outputNodesFile
output_edges_file_name = args.outputEdgesFile
test_mode = args.test
Expand All @@ -249,12 +251,14 @@ def create_semmed_exclude_list(semmed_exclude_list_name):

row_ctr = 0

# versioning = input_data['versioning']
# version_number = versioning['version_number']
# version_date = versioning['version_date']

version_number = "TEMP"
version_date = "TEMP"
# Read the SemMedDB version metadata from the version file. The file is
# expected to hold two lines, in this order:
#     Version: VER<number>
#     Year: <year>
# Each line is stripped because iterating a text file yields lines with their
# trailing newline attached; without stripping, the newline would end up
# inside version_number / version_date.
with open(version_file, 'r') as versioning:
    version_lines = [line.strip() for line in versioning]

# Fail with a clear message instead of leaving version_number / version_date
# unbound (which would only surface later as an unrelated NameError).
if len(version_lines) < 2:
    raise ValueError("Version file " + str(version_file) +
                     " must contain a 'Version: VER<number>' line followed by a 'Year: <year>' line")

# Only the first two lines are meaningful; any further lines are ignored.
version_number = version_lines[0].replace('Version: VER', '')
version_date = version_lines[1].replace('Year: ', '')

update_date_dt = datetime.datetime.fromisoformat('2018-01-01 00:00:00') # picking an arbitrary time in the past

Expand Down
1 change: 1 addition & 0 deletions snakemake-config-var.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ semmeddb_extraction_script: ${CODE_DIR}/${semmeddb_extraction_base}.sh
semmeddb_extraction_log: ${BUILD_DIR}/${semmeddb_extraction_base}${test_suffix}.log
semmeddb_tuplelist_file: ${BUILD_DIR}/semmeddb/semmeddb-tuplelist.jsonl
semmeddb_exclusion_file: ${BUILD_DIR}/semmed-exclude-list.yaml
semmeddb_version_file: ${BUILD_DIR}/semmeddb-version.txt
semmeddb_conversion_script: ${CODE_DIR}/${semmeddb_conversion_base}.py
semmeddb_conversion_log: ${BUILD_DIR}/${semmeddb_conversion_base}${test_suffix}.log
semmeddb_output_nodes_file: ${BUILD_DIR}/${semmeddb_output_base}${nodes_suffix}${test_suffix}.jsonl
Expand Down

0 comments on commit e5a66d4

Please sign in to comment.