From 58d1bddc4ce6bf4f1d88f0cdc0be8c6cf4a9d549 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 01:31:50 -0700
Subject: [PATCH] #387 cleaning up the formatting of the new files

---
 convert/ontologies_jsonl_to_kg_jsonl.py | 748 +++++++++----------
 extract/owlparser.py                    | 924 ++++++++++++------------
 2 files changed, 854 insertions(+), 818 deletions(-)

diff --git a/convert/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py
index 4dfb9992..a27561f4 100644
--- a/convert/ontologies_jsonl_to_kg_jsonl.py
+++ b/convert/ontologies_jsonl_to_kg_jsonl.py
@@ -1,8 +1,25 @@
+#!/usr/bin/env python3
+''' ontologies_jsonl_to_kg_jsonl.py: Converts JSON Lines representation of ontologies into KG JSON Lines format
+
+    Usage: ontologies_jsonl_to_kg_jsonl.py [--test] <inputFile> <curiesToCategoriesYAML> <curiesToURLsYAML> <outputNodesFile> <outputEdgesFile>
+'''
+
+
 import argparse
 import kg2_util
 import json
 import datetime
 
+__author__ = 'Erica Wood'
+__copyright__ = 'Oregon State University'
+__credits__ = ['Stephen Ramsey', 'Erica Wood']
+__license__ = 'MIT'
+__version__ = '0.1.0'
+__maintainer__ = ''
+__email__ = ''
+__status__ = 'Prototype'
+
+
 ID_TAG = "rdf:about"
 NAME_TAG = "rdfs:label"
 
@@ -18,26 +35,26 @@ DESCRIPTION_DELIM = " // "
 BASE_EDGE_TYPES = {"mondo-base:exactMatch": RESOURCE_KEY,
-		"mondo-base:closeMatch": RESOURCE_KEY,
-		"mondo-base:relatedMatch": RESOURCE_KEY,
-		"mondo-base:broadMatch": RESOURCE_KEY,
-		"mondo-base:narrowMatch": RESOURCE_KEY,
-		"skos:exactMatch": RESOURCE_KEY,
-		"skos:closeMatch": RESOURCE_KEY,
-		"skos:broadMatch": RESOURCE_KEY,
-		"skos:relatedMatch": RESOURCE_KEY,
-		"skos:narrowMatch": RESOURCE_KEY,
-		"obo:IAO_0100001": RESOURCE_KEY,
-		"obo:RO_0002175": RESOURCE_KEY,
-		"obo:RO_0002161": RESOURCE_KEY,
-		"obo:RO_0002604": RESOURCE_KEY,
-		"obo:RO_0002171": RESOURCE_KEY,
-		"obo:RO_0002174": RESOURCE_KEY,
-		"obo:RO_0002475": RESOURCE_KEY,
-		"obo:RO_0001900": RESOURCE_KEY,
-		"oboInOwl:hasAlternativeId": TEXT_KEY,
-		"oboInOwl:hasDbXref": TEXT_KEY,
-		"oboInOwl:xref": TEXT_KEY}
+                   "mondo-base:closeMatch": RESOURCE_KEY,
+                   "mondo-base:relatedMatch": RESOURCE_KEY,
+                   "mondo-base:broadMatch": RESOURCE_KEY,
+                   "mondo-base:narrowMatch": RESOURCE_KEY,
+                   "skos:exactMatch": RESOURCE_KEY,
+                   "skos:closeMatch": RESOURCE_KEY,
+                   "skos:broadMatch": RESOURCE_KEY,
+                   "skos:relatedMatch": RESOURCE_KEY,
+                   "skos:narrowMatch": RESOURCE_KEY,
+                   "obo:IAO_0100001": RESOURCE_KEY,
+                   "obo:RO_0002175": RESOURCE_KEY,
+                   "obo:RO_0002161": RESOURCE_KEY,
+                   "obo:RO_0002604": RESOURCE_KEY,
+                   "obo:RO_0002171": RESOURCE_KEY,
+                   "obo:RO_0002174": RESOURCE_KEY,
+                   "obo:RO_0002475": RESOURCE_KEY,
+                   "obo:RO_0001900": RESOURCE_KEY,
+                   "oboInOwl:hasAlternativeId": TEXT_KEY,
+                   "oboInOwl:hasDbXref": TEXT_KEY,
+                   "oboInOwl:xref": TEXT_KEY}
 
 CLASS_TO_SUPERCLASSES = dict()
 SAVED_NODE_INFO = dict()
@@ -73,383 +90,386 @@ VERSION_KEY = "version"
 def get_args():
-	arg_parser = argparse.ArgumentParser()
-	arg_parser.add_argument('--test', dest='test',
-	                        action="store_true", default=False)
-	arg_parser.add_argument('inputFile', type=str)
-	arg_parser.add_argument('curiesToCategoriesYAML', type=str)
-	arg_parser.add_argument('curiesToURLsYAML', type=str)
-	arg_parser.add_argument('outputNodesFile', type=str)
-	arg_parser.add_argument('outputEdgesFile', type=str)
-	return arg_parser.parse_args()
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument('--test', dest='test',
+                            action="store_true", default=False)
+    arg_parser.add_argument('inputFile', type=str)
+    arg_parser.add_argument('curiesToCategoriesYAML', type=str)
+    arg_parser.add_argument('curiesToURLsYAML', type=str)
+    arg_parser.add_argument('outputNodesFile', type=str)
+    arg_parser.add_argument('outputEdgesFile', type=str)
+    return arg_parser.parse_args()
 
 
 def categorize_node(node_id, recursion_depth=0):
-	node_prefix = node_id.split(':')[0]
-
-	if node_id in NODE_CATEGORY_MAPPINGS and NODE_CATEGORY_MAPPINGS[node_id][1] == FILE_MAPPING:
-		return NODE_CATEGORY_MAPPINGS[node_id][0]
-
-	if node_prefix in PREFIX_MAPPINGS:
-		node_category = PREFIX_MAPPINGS[node_prefix]
-		NODE_CATEGORY_MAPPINGS[node_id] = (node_category, PREFIX_MAPPING)
-		return PREFIX_MAPPINGS[node_prefix]
-
-	# Get try to get the most common superclass categorization
-	superclass_categorizations = dict()
-	highest_value = 0
-	highest_category = kg2_util.BIOLINK_CATEGORY_NAMED_THING
-	if recursion_depth == 10:
-		return kg2_util.BIOLINK_CATEGORY_NAMED_THING
-
-	for superclass in CLASS_TO_SUPERCLASSES.get(node_id, list()):
-		superclass_category = categorize_node(superclass, recursion_depth + 1)
-		if superclass_category not in superclass_categorizations:
-			superclass_categorizations[superclass_category] = 0
-		superclass_categorizations[superclass_category] += 1
-		if superclass_categorizations[superclass_category] > highest_value:
-			highest_value = superclass_categorizations[superclass_category]
-			highest_category = superclass_category
-
-	NODE_CATEGORY_MAPPINGS[node_id] = (highest_category, RECURSE_MAPPING)
-	return highest_category
+    node_prefix = node_id.split(':')[0]
+
+    if node_id in NODE_CATEGORY_MAPPINGS and NODE_CATEGORY_MAPPINGS[node_id][1] == FILE_MAPPING:
+        return NODE_CATEGORY_MAPPINGS[node_id][0]
+
+    if node_prefix in PREFIX_MAPPINGS:
+        node_category = PREFIX_MAPPINGS[node_prefix]
+        NODE_CATEGORY_MAPPINGS[node_id] = (node_category, PREFIX_MAPPING)
+        return PREFIX_MAPPINGS[node_prefix]
+
+    # Try to get the most common superclass categorization
+    superclass_categorizations = dict()
+    highest_value = 0
+    highest_category = kg2_util.BIOLINK_CATEGORY_NAMED_THING
+    if recursion_depth == 10:
+        return kg2_util.BIOLINK_CATEGORY_NAMED_THING
+
+    for superclass in CLASS_TO_SUPERCLASSES.get(node_id, list()):
+        superclass_category = categorize_node(superclass, recursion_depth + 1)
+        if superclass_category not in superclass_categorizations:
+            superclass_categorizations[superclass_category] = 0
+        superclass_categorizations[superclass_category] += 1
+        if superclass_categorizations[superclass_category] > highest_value:
+            highest_value = superclass_categorizations[superclass_category]
+            highest_category = superclass_category
+
+    NODE_CATEGORY_MAPPINGS[node_id] = (highest_category, RECURSE_MAPPING)
+    return highest_category
 
 
 def reformat_obo_date(date_str):
-	if date_str is None:
-		return None
-
-	if '-' in date_str:
-		delim = 'T'
-		if ' ' in date_str:
-			delim = ' '
-		date_spl = date_str.strip('Z').split(delim)
-		date_fh = date_spl[0].split('-')
-		year = int(date_fh[0])
-		month = int(date_fh[1])
-		day = int(date_fh[2])
-
-		if month < 1 or month > 12 or day < 1 or day > 31:
-			return None
-
-		if len(date_spl) > 1:
-			date_sh = date_spl[1].split(':')
-			hour = int(date_sh[0])
-			minute = int(date_sh[1])
-			second = int(date_sh[2][0:1])
-
-			return datetime.datetime(year, month, day, hour, minute, second)
-		else:
-			return datetime.datetime(year, month, day)
-	else:
-		date_spl = date_str.split(' ')
-		date_fh = date_spl[0].split(':')
-		year = int(date_fh[2])
-		month = int(date_fh[1])
-		day = int(date_fh[0])
-
-		if month < 1 or month > 12 or day < 1 or day > 31:
-			return None
-
-		return datetime.datetime(year, month, day)
+    if date_str is None:
+        return None
+
+    if '-' in date_str:
+        delim = 'T'
+        if ' ' in date_str:
+            delim = ' '
+        date_spl = date_str.strip('Z').split(delim)
+        date_fh = date_spl[0].split('-')
+        year = int(date_fh[0])
+        month = int(date_fh[1])
+        day = int(date_fh[2])
+
+        if month < 1 or month > 12 or day < 1 or day > 31:
+            return None
+
+        if len(date_spl) > 1:
+            date_sh = date_spl[1].split(':')
+            hour = int(date_sh[0])
+            minute = int(date_sh[1])
+            second = int(date_sh[2][0:2])
+
+            return datetime.datetime(year, month, day, hour, minute, second)
+        else:
+            return datetime.datetime(year, month, day)
+    else:
+        date_spl = date_str.split(' ')
+        date_fh = date_spl[0].split(':')
+        year = int(date_fh[2])
+        month = int(date_fh[1])
+        day = int(date_fh[0])
+
+        if month < 1 or month > 12 or day < 1 or day > 31:
+            return None
+
+        return datetime.datetime(year, month, day)
 
 
 def pick_most_recent_date(dates, alternate_date=None):
-	latest_date = None
-	for date in dates:
-		if date == None:
-			continue
-		if latest_date == None or date > latest_date:
-			latest_date = date
-
-	if latest_date == None:
-		if alternate_date is not None:
-			latest_date = alternate_date
-		else:
-			return None
-
-	return latest_date.isoformat(sep=' ')
+    latest_date = None
+    for date in dates:
+        if date == None:
+            continue
+        if latest_date == None or date > latest_date:
+            latest_date = date
+
+    if latest_date == None:
+        if alternate_date is not None:
+            latest_date = alternate_date
+        else:
+            return None
+
+    return latest_date.isoformat(sep=' ')
 
 
 def process_ontology_term(ontology_node, source, ontology_name, owl_source=True):
-	owl_prefix = ""
-	if owl_source:
-		owl_prefix = "owl:"
-	ontology_version = None
-	ontology_versions = [version.get(TEXT_KEY, str()) for version in ontology_node.get(owl_prefix + "versionInfo", list()) if TEXT_KEY in version]
-	ontology_version_iri = [version.get(RESOURCE_KEY, str()) for version in ontology_node.get(owl_prefix + "versionIRI", list()) if RESOURCE_KEY in version]
-	ontology_dates =
[reformat_obo_date(version.get(TEXT_KEY, str())) for date_type in ["oboInOwl:date", "dcterms:date", "dc:date"] for version in ontology_node.get(date_type, list()) if TEXT_KEY in version] + ontology_iri = ontology_node.get("rdf:about", str()) + if len(ontology_versions) == 1: + ontology_version = ontology_versions[0] + elif len(ontology_version_iri) == 1: + ontology_version = ontology_version_iri[0] + version_replacements = [ontology_iri.replace('.owl', '') + '/', '/' + source, 'releases/'] + for replacement in version_replacements: + ontology_version = ontology_version.replace(replacement, "") + ontology_version = ontology_version.split('/')[0] + elif len(ontology_dates) >= 1: + ontology_version = pick_most_recent_date(ontology_dates) + + if ontology_version is None: + print("Warning: source", source, "lacks any versioning information.") + + ontology_date = reformat_obo_date(pick_most_recent_date(ontology_dates)) + source_id = kg2_util.CURIE_PREFIX_OBO + ':' + source + + if source not in SOURCE_INFO: + SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: ontology_iri, NAME_KEY: ontology_name, UPDATE_DATE_KEY: ontology_date, VERSION_KEY: ontology_version} def process_ontology_class(owl_class, source, ontology_name, owl_source=True): - owl_prefix = "" - if owl_source: - owl_prefix = "owl:" - # Typically genid classes which don't neatly map onto the KG2 schema - if ID_TAG not in owl_class: - return - node_id = match_prefix(owl_class.get(ID_TAG, str())) - if node_id is None: - return - node_prefix = node_id.split(':')[0] - node_iri = PREFIX_TO_IRI_MAP[node_prefix] + node_id.replace(node_prefix + ':', '') - - # Configure the name - name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name] - if len(name_list) == 0: - return - - # Configure the description - description_list = list() - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:IAO_0000115", list()) if (TEXT_KEY in description)] - description_list += [COMMENT_PREFIX + description.get(TEXT_KEY, str()) for description in owl_class.get("rdfs:comment", list()) if (TEXT_KEY in description)] - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000001", list()) if (TEXT_KEY in description)] - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000005", list()) if (TEXT_KEY in description)] - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("efo1:source_description", list()) if (TEXT_KEY in description)] - - deprecated = "true" in owl_class.get(owl_prefix + "deprecated", list()) - for name in name_list: - search_name = name.lower() - if search_name.startswith("obsolete") or search_name.startswith("(obsolete") or search_name.endswith("obsolete"): - deprecated = True - - # Configure the synonyms - synonym_list = list() - synonym_keys = ["oboInOwl:hasExactSynonym", "oboInOwl:hasRelatedSynonym", "oboInOwl:hasNarrowSynonym", "oboInOwl:hasBroadSynonym", "go:hasExactSynonym", - "go:hasSynonym", "go:hasNarrowSynonym", "go:hasBroadSynonym", "obo:IAO_0000118", "obo:IAO_0000589", "go:hasRelatedSynonym", "obo:IAO_0000111", - "obo:IAO_0000028", "skos:prefLabel"] - synonym_list += [synonym.get(TEXT_KEY, None) for synonym_key in synonym_keys for synonym in owl_class.get(synonym_key, list()) if (TEXT_KEY in synonym)] - - update_date_list = list() - update_date_keys = ["dc:date", "dcterms:date", "terms:date"] - update_date_list += 
[reformat_obo_date(update_date.get(TEXT_KEY, None)) for update_date_key in update_date_keys for update_date in owl_class.get(update_date_key, list()) if (TEXT_KEY in update_date)] - - creation_date_list = list() - creation_date_keys = ["oboInOwl:creation_date", "go:creation_date"] - creation_date_list += [reformat_obo_date(creation_date.get(TEXT_KEY, None)) for creation_date_key in creation_date_keys for creation_date in owl_class.get(creation_date_key, list()) if (TEXT_KEY in creation_date)] - - # Configure the biological sequence - has_biological_sequence = dict() - has_biological_sequence['formula'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:formula", list()) if TEXT_KEY in biological_sequence] - has_biological_sequence['smiles'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:smiles", list()) if TEXT_KEY in biological_sequence] - has_biological_sequence['inchi'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchi", list()) if TEXT_KEY in biological_sequence] - has_biological_sequence['inchikey'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchikey", list()) if TEXT_KEY in biological_sequence] - - # Extract edge triples - edges_list = list() - - for edge_type in BASE_EDGE_TYPES: - for edge in owl_class.get(edge_type, list()): - if BASE_EDGE_TYPES[edge_type] in edge: - edges_list.append((edge_type, edge.get(BASE_EDGE_TYPES[edge_type], None))) - - - restriction_edges = list() - restriction_edges += [(edge, "rdfs:subClassOf") for edge in owl_class.get("rdfs:subClassOf", list())] - for equiv in owl_class.get(owl_prefix + "equivalentClass", list()): - for mini_class in equiv.get(owl_prefix + "Class", list()): - for edge in mini_class.get(owl_prefix + "intersectionOf", list()): - restriction_edges.append((edge, owl_prefix + "equivalentClass")) - - for (edge, general_edge_type) in restriction_edges: - for restriction in edge.get(owl_prefix + "Restriction", list()): - edge_type = restriction.get(owl_prefix + "onProperty", list()) - edge_object = restriction.get(owl_prefix + "someValuesFrom", list()) - if len(edge_type) != 1: - assert len(edge_type) <= 1, edge - continue - if len(edge_object) != 1: - assert len(edge_object) <= 1, edge - continue - edge_type = edge_type[0].get(RESOURCE_KEY, None) - edge_object = edge_object[0].get(RESOURCE_KEY, None) - - if edge_type != None and edge_object != None: - edges_list.append((edge_type, edge_object)) - - if RESOURCE_KEY in edge: - edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None))) - - superclasses = set() - final_edges_list = list() - for (edge_relation, edge_object) in edges_list: - edge_object = match_prefix(edge_object) - if edge_object is None: - continue - edge_relation = match_prefix(edge_relation) - if edge_relation is None: - continue - if edge_relation in ["rdfs:subClassOf"]: - superclasses.add(edge_object) - final_edges_list.append((edge_relation, edge_object)) - - # Imperfect way to make it deterministic - superclasses = sorted(list(superclasses)) - if node_id not in CLASS_TO_SUPERCLASSES: - CLASS_TO_SUPERCLASSES[node_id] = list() - CLASS_TO_SUPERCLASSES[node_id] += superclasses - CLASS_TO_SUPERCLASSES[node_id] = sorted(list(set(CLASS_TO_SUPERCLASSES[node_id]))) - - if node_id not in SAVED_NODE_INFO: - SAVED_NODE_INFO[node_id] = list() - SAVED_NODE_INFO[node_id].append({ID_KEY: node_id, - DEPRECATED_KEY: deprecated, - UPDATE_DATE_KEY: 
update_date_list, - CREATION_DATE_KEY: creation_date_list, - SYNONYM_KEY: synonym_list, - DESCRIPTION_KEY: description_list, - NAME_KEY: name_list, - SOURCE_KEY: source, - BIOLOGICAL_SEQUENCE_KEY: has_biological_sequence, - IRI_KEY: node_iri, - EDGES_KEY: final_edges_list}) + owl_prefix = "" + if owl_source: + owl_prefix = "owl:" + # Typically genid classes which don't neatly map onto the KG2 schema + if ID_TAG not in owl_class: + return + node_id = match_prefix(owl_class.get(ID_TAG, str())) + if node_id is None: + return + node_prefix = node_id.split(':')[0] + node_iri = PREFIX_TO_IRI_MAP[node_prefix] + node_id.replace(node_prefix + ':', '') + + # Configure the name + name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name] + if len(name_list) == 0: + return + + # Configure the description + description_list = list() + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:IAO_0000115", list()) if (TEXT_KEY in description)] + description_list += [COMMENT_PREFIX + description.get(TEXT_KEY, str()) for description in owl_class.get("rdfs:comment", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000001", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000005", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("efo1:source_description", list()) if (TEXT_KEY in description)] + + deprecated = "true" in owl_class.get(owl_prefix + "deprecated", list()) + for name in name_list: + search_name = name.lower() + if search_name.startswith("obsolete") or search_name.startswith("(obsolete") or search_name.endswith("obsolete"): + deprecated = True + + # Configure the synonyms + synonym_list = list() + synonym_keys = ["oboInOwl:hasExactSynonym", "oboInOwl:hasRelatedSynonym", "oboInOwl:hasNarrowSynonym", "oboInOwl:hasBroadSynonym", "go:hasExactSynonym", + "go:hasSynonym", "go:hasNarrowSynonym", "go:hasBroadSynonym", "obo:IAO_0000118", "obo:IAO_0000589", "go:hasRelatedSynonym", "obo:IAO_0000111", + "obo:IAO_0000028", "skos:prefLabel"] + synonym_list += [synonym.get(TEXT_KEY, None) for synonym_key in synonym_keys for synonym in owl_class.get(synonym_key, list()) if (TEXT_KEY in synonym)] + + update_date_list = list() + update_date_keys = ["dc:date", "dcterms:date", "terms:date"] + update_date_list += [reformat_obo_date(update_date.get(TEXT_KEY, None)) for update_date_key in update_date_keys for update_date in owl_class.get(update_date_key, list()) if (TEXT_KEY in update_date)] + + creation_date_list = list() + creation_date_keys = ["oboInOwl:creation_date", "go:creation_date"] + creation_date_list += [reformat_obo_date(creation_date.get(TEXT_KEY, None)) for creation_date_key in creation_date_keys for creation_date in owl_class.get(creation_date_key, list()) if (TEXT_KEY in creation_date)] + + # Configure the biological sequence + has_biological_sequence = dict() + has_biological_sequence['formula'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:formula", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['smiles'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:smiles", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['inchi'] = 
[biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchi", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['inchikey'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchikey", list()) if TEXT_KEY in biological_sequence] + + # Extract edge triples + edges_list = list() + + for edge_type in BASE_EDGE_TYPES: + for edge in owl_class.get(edge_type, list()): + if BASE_EDGE_TYPES[edge_type] in edge: + edges_list.append((edge_type, edge.get(BASE_EDGE_TYPES[edge_type], None))) + + + restriction_edges = list() + restriction_edges += [(edge, "rdfs:subClassOf") for edge in owl_class.get("rdfs:subClassOf", list())] + for equiv in owl_class.get(owl_prefix + "equivalentClass", list()): + for mini_class in equiv.get(owl_prefix + "Class", list()): + for edge in mini_class.get(owl_prefix + "intersectionOf", list()): + restriction_edges.append((edge, owl_prefix + "equivalentClass")) + + for (edge, general_edge_type) in restriction_edges: + for restriction in edge.get(owl_prefix + "Restriction", list()): + edge_type = restriction.get(owl_prefix + "onProperty", list()) + edge_object = restriction.get(owl_prefix + "someValuesFrom", list()) + if len(edge_type) != 1: + assert len(edge_type) <= 1, edge + continue + if len(edge_object) != 1: + assert len(edge_object) <= 1, edge + continue + edge_type = edge_type[0].get(RESOURCE_KEY, None) + edge_object = edge_object[0].get(RESOURCE_KEY, None) + + if edge_type != None and edge_object != None: + edges_list.append((edge_type, edge_object)) + + if RESOURCE_KEY in edge: + edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None))) + + superclasses = set() + final_edges_list = list() + for (edge_relation, edge_object) in edges_list: + edge_object = match_prefix(edge_object) + if edge_object is None: + continue + edge_relation = match_prefix(edge_relation) + if edge_relation is None: + continue + if edge_relation in ["rdfs:subClassOf"]: + superclasses.add(edge_object) + final_edges_list.append((edge_relation, edge_object)) + + # Imperfect way to make it deterministic + superclasses = sorted(list(superclasses)) + if node_id not in CLASS_TO_SUPERCLASSES: + CLASS_TO_SUPERCLASSES[node_id] = list() + CLASS_TO_SUPERCLASSES[node_id] += superclasses + CLASS_TO_SUPERCLASSES[node_id] = sorted(list(set(CLASS_TO_SUPERCLASSES[node_id]))) + + if node_id not in SAVED_NODE_INFO: + SAVED_NODE_INFO[node_id] = list() + SAVED_NODE_INFO[node_id].append({ID_KEY: node_id, + DEPRECATED_KEY: deprecated, + UPDATE_DATE_KEY: update_date_list, + CREATION_DATE_KEY: creation_date_list, + SYNONYM_KEY: synonym_list, + DESCRIPTION_KEY: description_list, + NAME_KEY: name_list, + SOURCE_KEY: source, + BIOLOGICAL_SEQUENCE_KEY: has_biological_sequence, + IRI_KEY: node_iri, + EDGES_KEY: final_edges_list}) def process_ontology_item(ontology_item): - source = ontology_item.get(OWL_SOURCE_KEY, str()) - ontology_name = ontology_item.get(OWL_SOURCE_NAME_KEY, str()) + source = ontology_item.get(OWL_SOURCE_KEY, str()) + ontology_name = ontology_item.get(OWL_SOURCE_NAME_KEY, str()) - for owl_class in ontology_item.get("owl:Class", list()): - process_ontology_class(owl_class, source, ontology_name) + for owl_class in ontology_item.get("owl:Class", list()): + process_ontology_class(owl_class, source, ontology_name) - for owl_class in ontology_item.get("Class", list()): - process_ontology_class(owl_class, source, ontology_name, False) + for owl_class in ontology_item.get("Class", list()): + 
process_ontology_class(owl_class, source, ontology_name, False) - for ontology_node in ontology_item.get("owl:Ontology", list()): - process_ontology_term(ontology_node, source, ontology_name) + for ontology_node in ontology_item.get("owl:Ontology", list()): + process_ontology_term(ontology_node, source, ontology_name) - # Because of ORDO - for ontology_node in ontology_item.get("Ontology", list()): - process_ontology_term(ontology_node, source, ontology_name, False) + # Because of ORDO + for ontology_node in ontology_item.get("Ontology", list()): + process_ontology_term(ontology_node, source, ontology_name, False) def generate_uri_map(curies_to_urls_file_name): - uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_file_name)) - bidirectional_map = uri_input_map['use_for_bidirectional_mapping'] - contraction_map = uri_input_map['use_for_contraction_only'] - - for curie_prefix_dict in bidirectional_map: - for curie_prefix in curie_prefix_dict: - curie_url = curie_prefix_dict[curie_prefix] - URI_MAP[curie_url] = curie_prefix - PREFIX_TO_IRI_MAP[curie_prefix] = curie_url - - for curie_prefix_dict in contraction_map: - for curie_prefix in curie_prefix_dict: - curie_url = curie_prefix_dict[curie_prefix] - URI_MAP[curie_url] = curie_prefix - - # So that you get the most accurate match, you want to match to the longest url (in case one is a substring of another) - # Apparently have to use global key word to write to a module wide list (https://stackoverflow.com/questions/4630543/defining-lists-as-global-variables-in-python) - global URI_MAP_KEYS - URI_MAP_KEYS = sorted(URI_MAP.keys(), key=len, reverse=True) + uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_file_name)) + bidirectional_map = uri_input_map['use_for_bidirectional_mapping'] + contraction_map = uri_input_map['use_for_contraction_only'] + + for curie_prefix_dict in bidirectional_map: + for curie_prefix in curie_prefix_dict: + curie_url = curie_prefix_dict[curie_prefix] + URI_MAP[curie_url] = curie_prefix + PREFIX_TO_IRI_MAP[curie_prefix] = curie_url + + for curie_prefix_dict in contraction_map: + for curie_prefix in curie_prefix_dict: + curie_url = curie_prefix_dict[curie_prefix] + URI_MAP[curie_url] = curie_prefix + + # So that you get the most accurate match, you want to match to the longest url (in case one is a substring of another) + # Apparently have to use global key word to write to a module wide list (https://stackoverflow.com/questions/4630543/defining-lists-as-global-variables-in-python) + global URI_MAP_KEYS + URI_MAP_KEYS = sorted(URI_MAP.keys(), key=len, reverse=True) def match_prefix(node_id): - for curie_url in URI_MAP_KEYS: - if node_id.startswith(curie_url): - return node_id.replace(curie_url, URI_MAP[curie_url] + ":") - - if "http" in node_id: - MISSING_ID_PREFIXES.add('/'.join(node_id.split('/')[0:-1]) + "/") - elif ':' in node_id: - MISSING_ID_PREFIXES.add(node_id.split(':')[0] + ":") - elif '_' in node_id: - MISSING_ID_PREFIXES.add(node_id.split('_')[0] + "_") - else: - MISSING_ID_PREFIXES.add(node_id) + for curie_url in URI_MAP_KEYS: + if node_id.startswith(curie_url): + return node_id.replace(curie_url, URI_MAP[curie_url] + ":") + + if "http" in node_id: + MISSING_ID_PREFIXES.add('/'.join(node_id.split('/')[0:-1]) + "/") + elif ':' in node_id: + MISSING_ID_PREFIXES.add(node_id.split(':')[0] + ":") + elif '_' in node_id: + MISSING_ID_PREFIXES.add(node_id.split('_')[0] + "_") + else: + MISSING_ID_PREFIXES.add(node_id) def 
construct_nodes_and_edges(nodes_output, edges_output): - for source in SOURCE_INFO: - source_date = pick_most_recent_date([SOURCE_INFO[source][UPDATE_DATE_KEY]]) - source_name = SOURCE_INFO[source][NAME_KEY] + " v" + SOURCE_INFO[source][VERSION_KEY] - source_id = SOURCE_INFO[source][SOURCE_KEY] - source_iri = SOURCE_INFO[source][IRI_KEY] - node = kg2_util.make_node(source_id, source_iri, source_name, kg2_util.BIOLINK_CATEGORY_INFORMATION_CONTENT_ENTITY, source_date, source_id) + for source in SOURCE_INFO: + source_date = pick_most_recent_date([SOURCE_INFO[source][UPDATE_DATE_KEY]]) + source_name = SOURCE_INFO[source][NAME_KEY] + " v" + SOURCE_INFO[source][VERSION_KEY] + source_id = SOURCE_INFO[source][SOURCE_KEY] + source_iri = SOURCE_INFO[source][IRI_KEY] + node = kg2_util.make_node(source_id, source_iri, source_name, kg2_util.BIOLINK_CATEGORY_INFORMATION_CONTENT_ENTITY, source_date, source_id) - nodes_output.write(node) + nodes_output.write(node) - for node_id in SAVED_NODE_INFO: - for source_node_index in range(len(SAVED_NODE_INFO[node_id])): - if SAVED_NODE_INFO[node_id][source_node_index][DEPRECATED_KEY]: - continue - name = SAVED_NODE_INFO[node_id][source_node_index][NAME_KEY][0] # Imperfect way of choosing the name - node_iri = SAVED_NODE_INFO[node_id][source_node_index][IRI_KEY] - description = DESCRIPTION_DELIM.join(SAVED_NODE_INFO[node_id][source_node_index][DESCRIPTION_KEY]) - has_biological_sequence = SAVED_NODE_INFO[node_id][source_node_index][BIOLOGICAL_SEQUENCE_KEY].get("smiles", None) - synonyms = SAVED_NODE_INFO[node_id][source_node_index][SYNONYM_KEY] - category = SAVED_NODE_INFO[node_id][source_node_index][CATEGORY_KEY] + for node_id in SAVED_NODE_INFO: + for source_node_index in range(len(SAVED_NODE_INFO[node_id])): + if SAVED_NODE_INFO[node_id][source_node_index][DEPRECATED_KEY]: + continue + name = SAVED_NODE_INFO[node_id][source_node_index][NAME_KEY][0] # Imperfect way of choosing the name + node_iri = SAVED_NODE_INFO[node_id][source_node_index][IRI_KEY] + description = DESCRIPTION_DELIM.join(SAVED_NODE_INFO[node_id][source_node_index][DESCRIPTION_KEY]) + has_biological_sequence = SAVED_NODE_INFO[node_id][source_node_index][BIOLOGICAL_SEQUENCE_KEY].get("smiles", None) + synonyms = SAVED_NODE_INFO[node_id][source_node_index][SYNONYM_KEY] + category = SAVED_NODE_INFO[node_id][source_node_index][CATEGORY_KEY] - source = SAVED_NODE_INFO[node_id][source_node_index][SOURCE_KEY] - provided_by = kg2_util.CURIE_PREFIX_OBO + ':' + source - source_date = SOURCE_INFO[source][UPDATE_DATE_KEY] + source = SAVED_NODE_INFO[node_id][source_node_index][SOURCE_KEY] + provided_by = kg2_util.CURIE_PREFIX_OBO + ':' + source + source_date = SOURCE_INFO[source][UPDATE_DATE_KEY] - update_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][UPDATE_DATE_KEY], source_date) - creation_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][CREATION_DATE_KEY], source_date) + update_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][UPDATE_DATE_KEY], source_date) + creation_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][CREATION_DATE_KEY], source_date) - node = kg2_util.make_node(node_id, node_iri, name, category, update_date, provided_by) - node["description"] = description - node["has_biological_sequence"] = has_biological_sequence - node["creation_date"] = creation_date - node["synonym"] = synonyms + node = kg2_util.make_node(node_id, node_iri, name, category, update_date, provided_by) + node["description"] = 
description + node["has_biological_sequence"] = has_biological_sequence + node["creation_date"] = creation_date + node["synonym"] = synonyms - nodes_output.write(node) + nodes_output.write(node) - for (edge_relation, edge_object) in SAVED_NODE_INFO[node_id][source_node_index][EDGES_KEY]: - relation_label = edge_relation.split(':')[1] - edge = kg2_util.make_edge(node_id, edge_object, edge_relation, relation_label, provided_by, update_date) + for (edge_relation, edge_object) in SAVED_NODE_INFO[node_id][source_node_index][EDGES_KEY]: + relation_label = edge_relation.split(':')[1] + edge = kg2_util.make_edge(node_id, edge_object, edge_relation, relation_label, provided_by, update_date) - edges_output.write(edge) + edges_output.write(edge) if __name__ == '__main__': - args = get_args() - input_file_name = args.inputFile - curies_to_categories_file_name = args.curiesToCategoriesYAML - curies_to_urls_file_name = args.curiesToURLsYAML - output_nodes_file_name = args.outputNodesFile - output_edges_file_name = args.outputEdgesFile - test_mode = args.test - - nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode) - nodes_output = nodes_info[0] - edges_output = edges_info[0] - - curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name)) - for mapping_node in curies_to_categories_data["term-mappings"]: - NODE_CATEGORY_MAPPINGS[mapping_node] = (curies_to_categories_data["term-mappings"][mapping_node], FILE_MAPPING) - for prefix in curies_to_categories_data["prefix-mappings"]: - PREFIX_MAPPINGS[prefix] = curies_to_categories_data["prefix-mappings"][prefix] - - input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name) - input_data = input_read_jsonlines_info[0] - - ontology_prefixes = set() - generate_uri_map(curies_to_urls_file_name) - for ontology_item in input_data: - process_ontology_item(ontology_item) - - for node_id in SAVED_NODE_INFO: - categorize_node(node_id) - node_category = NODE_CATEGORY_MAPPINGS[node_id][0] - for index in range(len(SAVED_NODE_INFO[node_id])): - SAVED_NODE_INFO[node_id][index][CATEGORY_KEY] = node_category - - construct_nodes_and_edges(nodes_output, edges_output) - - kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) \ No newline at end of file + print("Start time: ", kg2_util.date()) + args = get_args() + input_file_name = args.inputFile + curies_to_categories_file_name = args.curiesToCategoriesYAML + curies_to_urls_file_name = args.curiesToURLsYAML + output_nodes_file_name = args.outputNodesFile + output_edges_file_name = args.outputEdgesFile + test_mode = args.test + + nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode) + nodes_output = nodes_info[0] + edges_output = edges_info[0] + + curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name)) + for mapping_node in curies_to_categories_data["term-mappings"]: + NODE_CATEGORY_MAPPINGS[mapping_node] = (curies_to_categories_data["term-mappings"][mapping_node], FILE_MAPPING) + for prefix in curies_to_categories_data["prefix-mappings"]: + PREFIX_MAPPINGS[prefix] = curies_to_categories_data["prefix-mappings"][prefix] + + input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name) + input_data = input_read_jsonlines_info[0] + + ontology_prefixes = set() + generate_uri_map(curies_to_urls_file_name) + for ontology_item in input_data: + process_ontology_item(ontology_item) + + for node_id in 
SAVED_NODE_INFO: + categorize_node(node_id) + node_category = NODE_CATEGORY_MAPPINGS[node_id][0] + for index in range(len(SAVED_NODE_INFO[node_id])): + SAVED_NODE_INFO[node_id][index][CATEGORY_KEY] = node_category + + construct_nodes_and_edges(nodes_output, edges_output) + + kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) + + print("Finish time: ", kg2_util.date()) diff --git a/extract/owlparser.py b/extract/owlparser.py index 34e99fe3..fe540f3b 100644 --- a/extract/owlparser.py +++ b/extract/owlparser.py @@ -1,524 +1,540 @@ +#!/usr/bin/env python3 +''' owlparser.py: Converts OWL (XML) Files into JSON Lines Representations + + Usage: owlparser.py [--test] +''' + import json import argparse import datetime import kg2_util +__author__ = 'Erica Wood' +__copyright__ = 'Oregon State University' +__credits__ = ['Stephen Ramsey', 'Erica Wood'] +__license__ = 'MIT' +__version__ = '0.1.0' +__maintainer__ = '' +__email__ = '' +__status__ = 'Prototype' + + def get_args(): - arg_parser = argparse.ArgumentParser() - arg_parser.add_argument('--test', dest='test', - action="store_true", default=False) - arg_parser.add_argument('inputFile', type=str) - arg_parser.add_argument('owlFilePath', type=str) - arg_parser.add_argument('outputFile', type=str) - return arg_parser.parse_args() + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument('--test', dest='test', + action="store_true", default=False) + arg_parser.add_argument('inputFile', type=str) + arg_parser.add_argument('owlFilePath', type=str) + arg_parser.add_argument('outputFile', type=str) + return arg_parser.parse_args() def date(): - return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") class LineElementRead(): - NONE = 0 - TAG = 1 - ATTRIBUTE_TAG = 2 - ATTRIBUTE_TEXT = 3 - MAIN = 4 - END_TAG = 5 + NONE = 0 + TAG = 1 + ATTRIBUTE_TAG = 2 + ATTRIBUTE_TEXT = 3 + MAIN = 4 + END_TAG = 5 class XMLParser(): - def __init__(self, skip_tags, ignored_attributes, processing_func): - self.COMMENT = "!--" - self.OUTMOST_TAGS_SKIP = skip_tags - self.IGNORED_ATTRIBUTES = ignored_attributes - self.processing_func = processing_func - - self.LINE_TYPE_IGNORE = "ignore" - self.LINE_TYPE_START_NEST = "start nest" - self.LINE_TYPE_START_NEST_WITH_ATTR = "start nest with attributes" - self.LINE_TYPE_ENTRY = "entry" - self.LINE_TYPE_ENTRY_WITH_ATTR = "entry with attributes" - self.LINE_TYPE_ENTRY_ONLY_ATTR = "entry with only attributes" - self.LINE_TYPE_END_NEST = "end nest" - - self.KEY_TAG = "tag" - self.KEY_ATTRIBUTES = "attributes" - self.KEY_TEXT = "ENTRY_TEXT" - self.KEY_TYPE = "type" - - # Variables for line reading - self.tag = "" - self.attributes = dict() - self.attribute_tag = "" - self.attribute_text = "" - self.main_text = "" - self.end_tag = "" - self.only_tag = False - self.start_brackets = 0 - self.line = "" - self.letter = "" - self.next_letter = "" - self.prev_letter = "" - self.type_to_read = LineElementRead.NONE - - def categorize_line(self): - # Categorize the type of line - line_type = str() - out = dict() - - # Putting "only_tag" here isn't necessarily the best idea, but I don't know what else to do with it - if self.tag == self.COMMENT or self.tag in self.OUTMOST_TAGS_SKIP or self.end_tag in self.OUTMOST_TAGS_SKIP or self.only_tag: - line_type = self.LINE_TYPE_IGNORE - else: - start_tag_exists = (self.tag != str()) - attributes_exist = (self.attributes != dict()) - text_exists = (self.main_text != str()) - 
end_tag_exists = (self.end_tag != str()) - - if start_tag_exists: - if attributes_exist: - if text_exists: - line_type = self.LINE_TYPE_ENTRY_WITH_ATTR - out[self.KEY_TAG] = self.tag - out[self.KEY_ATTRIBUTES] = self.attributes - out[self.KEY_TEXT] = self.main_text - elif end_tag_exists: - line_type = self.LINE_TYPE_ENTRY_ONLY_ATTR - out[self.KEY_TAG] = self.tag - out[self.KEY_ATTRIBUTES] = self.attributes - else: - line_type = self.LINE_TYPE_START_NEST_WITH_ATTR - out[self.KEY_TAG] = self.tag - out[self.KEY_ATTRIBUTES] = self.attributes - elif text_exists: - line_type = self.LINE_TYPE_ENTRY - out[self.KEY_TAG] = self.tag - out[self.KEY_TEXT] = self.main_text - else: - line_type = self.LINE_TYPE_START_NEST - out[self.KEY_TAG] = self.tag - elif end_tag_exists: - line_type = self.LINE_TYPE_END_NEST - out[self.KEY_TAG] = self.end_tag - - out[self.KEY_TYPE] = line_type - - return out - - def get_letters(self, letter_index): - self.letter = self.line[letter_index] - self.next_letter = "" - self.prev_letter = "" - if letter_index + 1 < len(self.line): - self.next_letter = self.line[letter_index + 1] - if letter_index - 1 >= 0: - self.prev_letter = self.line[letter_index - 1] - - if self.letter == '<': - self.start_brackets += 1 - if self.letter == '>': - self.start_brackets -= 1 - - - def identify_tag_type(self, letter_index): - changed = True - - if self.letter == '<' and letter_index == 0: - if self.next_letter != '/': - self.type_to_read = LineElementRead.TAG - elif self.letter == '/' and self.prev_letter == '<': - self.type_to_read = LineElementRead.END_TAG - else: - changed = False - - return changed - - - def read_tag(self): - changed = True - - if self.letter == ' ' and self.type_to_read == LineElementRead.TAG: - self.type_to_read = LineElementRead.ATTRIBUTE_TAG - elif self.letter == '>' and self.type_to_read == LineElementRead.TAG and self.start_brackets == 0: - self.type_to_read = LineElementRead.MAIN - - if self.prev_letter == '/': - print("Warning - strange tag, ignoring", self.line) - self.only_tag = True - elif self.type_to_read == LineElementRead.TAG: - self.tag += self.letter - else: - changed = False - - return changed - - - def store_attribute(self): - if self.attribute_tag not in self.IGNORED_ATTRIBUTES: - self.attributes[self.attribute_tag] = self.attribute_text.strip('/').strip('"') - self.attribute_tag = "" - self.attribute_text = "" - - - def read_attributes(self): - changed = True - start_reading_attributes = (self.type_to_read == LineElementRead.ATTRIBUTE_TAG or self.type_to_read == LineElementRead.ATTRIBUTE_TEXT) - - if self.letter == '>' and start_reading_attributes and self.start_brackets == 0: - self.type_to_read = LineElementRead.MAIN - - self.store_attribute() - - if self.prev_letter == '/': - self.end_tag = self.tag - elif start_reading_attributes: - if self.letter == '=' and self.type_to_read == LineElementRead.ATTRIBUTE_TAG: - self.type_to_read = LineElementRead.ATTRIBUTE_TEXT - elif self.type_to_read == LineElementRead.ATTRIBUTE_TAG: - self.attribute_tag += self.letter - elif self.letter == ' ' and self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: - self.type_to_read = LineElementRead.ATTRIBUTE_TAG - self.store_attribute() - elif self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: - self.attribute_text += self.letter - else: - changed = False + def __init__(self, skip_tags, ignored_attributes, processing_func): + self.COMMENT = "!--" + self.OUTMOST_TAGS_SKIP = skip_tags + self.IGNORED_ATTRIBUTES = ignored_attributes + self.processing_func = processing_func + 
+ self.LINE_TYPE_IGNORE = "ignore" + self.LINE_TYPE_START_NEST = "start nest" + self.LINE_TYPE_START_NEST_WITH_ATTR = "start nest with attributes" + self.LINE_TYPE_ENTRY = "entry" + self.LINE_TYPE_ENTRY_WITH_ATTR = "entry with attributes" + self.LINE_TYPE_ENTRY_ONLY_ATTR = "entry with only attributes" + self.LINE_TYPE_END_NEST = "end nest" + + self.KEY_TAG = "tag" + self.KEY_ATTRIBUTES = "attributes" + self.KEY_TEXT = "ENTRY_TEXT" + self.KEY_TYPE = "type" + + # Variables for line reading + self.tag = "" + self.attributes = dict() + self.attribute_tag = "" + self.attribute_text = "" + self.main_text = "" + self.end_tag = "" + self.only_tag = False + self.start_brackets = 0 + self.line = "" + self.letter = "" + self.next_letter = "" + self.prev_letter = "" + self.type_to_read = LineElementRead.NONE + + def categorize_line(self): + # Categorize the type of line + line_type = str() + out = dict() + + # Putting "only_tag" here isn't necessarily the best idea, but I don't know what else to do with it + if self.tag == self.COMMENT or self.tag in self.OUTMOST_TAGS_SKIP or self.end_tag in self.OUTMOST_TAGS_SKIP or self.only_tag: + line_type = self.LINE_TYPE_IGNORE + else: + start_tag_exists = (self.tag != str()) + attributes_exist = (self.attributes != dict()) + text_exists = (self.main_text != str()) + end_tag_exists = (self.end_tag != str()) + + if start_tag_exists: + if attributes_exist: + if text_exists: + line_type = self.LINE_TYPE_ENTRY_WITH_ATTR + out[self.KEY_TAG] = self.tag + out[self.KEY_ATTRIBUTES] = self.attributes + out[self.KEY_TEXT] = self.main_text + elif end_tag_exists: + line_type = self.LINE_TYPE_ENTRY_ONLY_ATTR + out[self.KEY_TAG] = self.tag + out[self.KEY_ATTRIBUTES] = self.attributes + else: + line_type = self.LINE_TYPE_START_NEST_WITH_ATTR + out[self.KEY_TAG] = self.tag + out[self.KEY_ATTRIBUTES] = self.attributes + elif text_exists: + line_type = self.LINE_TYPE_ENTRY + out[self.KEY_TAG] = self.tag + out[self.KEY_TEXT] = self.main_text + else: + line_type = self.LINE_TYPE_START_NEST + out[self.KEY_TAG] = self.tag + elif end_tag_exists: + line_type = self.LINE_TYPE_END_NEST + out[self.KEY_TAG] = self.end_tag + + out[self.KEY_TYPE] = line_type + + return out + + def get_letters(self, letter_index): + self.letter = self.line[letter_index] + self.next_letter = "" + self.prev_letter = "" + if letter_index + 1 < len(self.line): + self.next_letter = self.line[letter_index + 1] + if letter_index - 1 >= 0: + self.prev_letter = self.line[letter_index - 1] + + if self.letter == '<': + self.start_brackets += 1 + if self.letter == '>': + self.start_brackets -= 1 + + + def identify_tag_type(self, letter_index): + changed = True + + if self.letter == '<' and letter_index == 0: + if self.next_letter != '/': + self.type_to_read = LineElementRead.TAG + elif self.letter == '/' and self.prev_letter == '<': + self.type_to_read = LineElementRead.END_TAG + else: + changed = False + + return changed + + + def read_tag(self): + changed = True + + if self.letter == ' ' and self.type_to_read == LineElementRead.TAG: + self.type_to_read = LineElementRead.ATTRIBUTE_TAG + elif self.letter == '>' and self.type_to_read == LineElementRead.TAG and self.start_brackets == 0: + self.type_to_read = LineElementRead.MAIN + + if self.prev_letter == '/': + print("Warning - strange tag, ignoring", self.line) + self.only_tag = True + elif self.type_to_read == LineElementRead.TAG: + self.tag += self.letter + else: + changed = False + + return changed + + + def store_attribute(self): + if self.attribute_tag not in 
self.IGNORED_ATTRIBUTES: + self.attributes[self.attribute_tag] = self.attribute_text.strip('/').strip('"') + self.attribute_tag = "" + self.attribute_text = "" + + + def read_attributes(self): + changed = True + start_reading_attributes = (self.type_to_read == LineElementRead.ATTRIBUTE_TAG or self.type_to_read == LineElementRead.ATTRIBUTE_TEXT) + + if self.letter == '>' and start_reading_attributes and self.start_brackets == 0: + self.type_to_read = LineElementRead.MAIN + + self.store_attribute() + + if self.prev_letter == '/': + self.end_tag = self.tag + elif start_reading_attributes: + if self.letter == '=' and self.type_to_read == LineElementRead.ATTRIBUTE_TAG: + self.type_to_read = LineElementRead.ATTRIBUTE_TEXT + elif self.type_to_read == LineElementRead.ATTRIBUTE_TAG: + self.attribute_tag += self.letter + elif self.letter == ' ' and self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: + self.type_to_read = LineElementRead.ATTRIBUTE_TAG + self.store_attribute() + elif self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: + self.attribute_text += self.letter + else: + changed = False - return changed + return changed - def read_main(self): - changed = True - if self.letter == '<' and self.type_to_read == LineElementRead.MAIN: - self.type_to_read = LineElementRead.END_TAG - elif self.type_to_read == LineElementRead.MAIN: - self.main_text += self.letter - else: - changed = False + def read_main(self): + changed = True + if self.letter == '<' and self.type_to_read == LineElementRead.MAIN: + self.type_to_read = LineElementRead.END_TAG + elif self.type_to_read == LineElementRead.MAIN: + self.main_text += self.letter + else: + changed = False - return changed + return changed - def read_end_tag(self): - changed = True - if self.letter == '>' and self.type_to_read == LineElementRead.END_TAG and self.start_brackets == 0: - pass - elif self.type_to_read == LineElementRead.END_TAG: - self.end_tag += self.letter - else: - changed = False + def read_end_tag(self): + changed = True + if self.letter == '>' and self.type_to_read == LineElementRead.END_TAG and self.start_brackets == 0: + pass + elif self.type_to_read == LineElementRead.END_TAG: + self.end_tag += self.letter + else: + changed = False - return changed + return changed - def convert_line(self): - self.tag = "" - self.attributes = dict() - self.attribute_tag = "" - self.attribute_text = "" - self.main_text = "" - self.end_tag = "" + def convert_line(self): + self.tag = "" + self.attributes = dict() + self.attribute_tag = "" + self.attribute_text = "" + self.main_text = "" + self.end_tag = "" - self.type_to_read = LineElementRead.NONE + self.type_to_read = LineElementRead.NONE - self.only_tag = False + self.only_tag = False - self.start_brackets = 0 + self.start_brackets = 0 - for letter_index in range(len(self.line)): - self.get_letters(letter_index) + for letter_index in range(len(self.line)): + self.get_letters(letter_index) - # First < - if self.identify_tag_type(letter_index): - continue + # First < + if self.identify_tag_type(letter_index): + continue - if self.read_tag(): - continue + if self.read_tag(): + continue - if self.read_attributes(): - continue + if self.read_attributes(): + continue - if self.read_main(): - continue + if self.read_main(): + continue - if self.read_end_tag(): - continue + if self.read_end_tag(): + continue - return self.categorize_line() + return self.categorize_line() - def convert_nest(self, nest, start_index): - nest_dict = dict() - curr_index = start_index + def convert_nest(self, nest, start_index): + 
nest_dict = dict() + curr_index = start_index - while curr_index < len(nest): - element = nest[curr_index] - line_type = element[self.KEY_TYPE] - line_tag = element[self.KEY_TAG] - line_text = element.get(self.KEY_TEXT, None) - line_attributes = element.get(self.KEY_ATTRIBUTES, None) + while curr_index < len(nest): + element = nest[curr_index] + line_type = element[self.KEY_TYPE] + line_tag = element[self.KEY_TAG] + line_text = element.get(self.KEY_TEXT, None) + line_attributes = element.get(self.KEY_ATTRIBUTES, None) - if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: - if line_tag not in nest_dict: - nest_dict[line_tag] = list() + if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: + if line_tag not in nest_dict: + nest_dict[line_tag] = list() - converted_nest, ret_index = self.convert_nest(nest, curr_index + 1) + converted_nest, ret_index = self.convert_nest(nest, curr_index + 1) - if line_attributes is not None: - for attribute in line_attributes: - converted_nest[attribute] = line_attributes[attribute] + if line_attributes is not None: + for attribute in line_attributes: + converted_nest[attribute] = line_attributes[attribute] - nest_dict[line_tag].append(converted_nest) + nest_dict[line_tag].append(converted_nest) - curr_index = ret_index + 1 - continue + curr_index = ret_index + 1 + continue - if line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR]: - if line_tag not in nest_dict: - nest_dict[line_tag] = list() + if line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR]: + if line_tag not in nest_dict: + nest_dict[line_tag] = list() - curr_dict = dict() + curr_dict = dict() - if line_text is not None: - curr_dict[self.KEY_TEXT] = line_text + if line_text is not None: + curr_dict[self.KEY_TEXT] = line_text - if line_attributes is not None: - for attribute in line_attributes: - curr_dict[attribute] = line_attributes[attribute] + if line_attributes is not None: + for attribute in line_attributes: + curr_dict[attribute] = line_attributes[attribute] - nest_dict[line_tag].append(curr_dict) + nest_dict[line_tag].append(curr_dict) - curr_index += 1 - continue + curr_index += 1 + continue - if line_type in [self.LINE_TYPE_END_NEST]: - return nest_dict, curr_index + if line_type in [self.LINE_TYPE_END_NEST]: + return nest_dict, curr_index - return nest_dict, curr_index + return nest_dict, curr_index - def divide_into_lines(self, input_file_name): - curr_str = "" - curr_nest = list() - curr_nest_tags = list() # Treating it as a stack - start_brackets = 0 + def divide_into_lines(self, input_file_name): + curr_str = "" + curr_nest = list() + curr_nest_tags = list() # Treating it as a stack + start_brackets = 0 - with open(input_file_name) as input_file: - for line in input_file: - line_str = line.strip() + with open(input_file_name) as input_file: + for line in input_file: + line_str = line.strip() - for letter_index in range(len(line_str)): - letter = line_str[letter_index] - if letter == '<': - start_brackets += 1 - if letter == '>': - start_brackets -= 1 + for letter_index in range(len(line_str)): + letter = line_str[letter_index] + if letter == '<': + start_brackets += 1 + if letter == '>': + start_brackets -= 1 - next_letter = "" - if letter_index + 1 < len(line_str): - next_letter = line_str[letter_index + 1] + next_letter = "" + if letter_index + 1 < len(line_str): + next_letter = line_str[letter_index + 1] - curr_str += letter + 
curr_str += letter - if letter == '>' and (next_letter == '<' or next_letter == "") and start_brackets == 0: - # Only return if nesting - self.line = curr_str - line_parsed = self.convert_line() + if letter == '>' and (next_letter == '<' or next_letter == "") and start_brackets == 0: + # Only return if nesting + self.line = curr_str + line_parsed = self.convert_line() - tag = line_parsed.get(self.KEY_TAG, None) - assert tag != self.KEY_TEXT # This could cause a massive conflict, but it is unlikely - line_type = line_parsed.get(self.KEY_TYPE, None) - attribute_keys = line_parsed.get(self.KEY_ATTRIBUTES, dict()).keys() + tag = line_parsed.get(self.KEY_TAG, None) + assert tag != self.KEY_TEXT # This could cause a massive conflict, but it is unlikely + line_type = line_parsed.get(self.KEY_TYPE, None) + attribute_keys = line_parsed.get(self.KEY_ATTRIBUTES, dict()).keys() - if line_type != self.LINE_TYPE_IGNORE: - curr_nest.append(line_parsed) + if line_type != self.LINE_TYPE_IGNORE: + curr_nest.append(line_parsed) - output_nest = (line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR] and len(curr_nest_tags) == 0) + output_nest = (line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR] and len(curr_nest_tags) == 0) - if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: - curr_nest_tags.append(tag) - elif line_type == self.LINE_TYPE_END_NEST: - popped_curr_nest_tag = curr_nest_tags.pop() - assert popped_curr_nest_tag == tag, curr_nest - if len(curr_nest_tags) == 0: - output_nest = True - if output_nest: - nest_dict, _ = self.convert_nest(curr_nest, 0) + if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: + curr_nest_tags.append(tag) + elif line_type == self.LINE_TYPE_END_NEST: + popped_curr_nest_tag = curr_nest_tags.pop() + assert popped_curr_nest_tag == tag, curr_nest + if len(curr_nest_tags) == 0: + output_nest = True + if output_nest: + nest_dict, _ = self.convert_nest(curr_nest, 0) - self.processing_func(nest_dict) + self.processing_func(nest_dict) - curr_nest = list() - curr_nest_tag = str() + curr_nest = list() + curr_nest_tag = str() - curr_str = "" + curr_str = "" - if curr_str != "": - # divide lines by a space - curr_str += ' ' + if curr_str != "": + # divide lines by a space + curr_str += ' ' class OWLParser(): - def __init__(self, input_files, input_file_names, owl_file_path, output_file_name): - self.XML_TAG = "?xml" - self.RDF_TAG = "rdf:RDF" - self.DOCTYPE_TAG = "!DOCTYPE" - self.CLASS_TAG = "owl:Class" - self.RESTRICTION_TAG = "owl:Restriction" - self.SUBCLASS_TAG = "rdfs:subClassOf" - self.NODEID_TAG = "rdf:nodeID" - self.RDF_ABOUT_TAG = "rdf:about" - self.GENID_PREFIX = "genid" - - self.OWL_SOURCE_KEY = "owl_source" - self.OWL_SOURCE_NAME_KEY = "owl_source_name" - - self.skip_tags = [self.XML_TAG, self.RDF_TAG, self.DOCTYPE_TAG] - - self.ignored_attributes = ["xml:lang"] - - self.xml_parser = XMLParser(self.skip_tags, self.ignored_attributes, self.triage_nest_dict) - - self.GENID_REMAINING_NESTS = dict() - self.GENID_TO_ID = dict() - self.ID_TO_GENIDS = dict() - - self.input_files = input_files - self.input_file_names = input_file_names - self.owl_file_path = owl_file_path - self.output_file_name = output_file_name - - self.output_info = kg2_util.create_single_jsonlines() - self.output = self.output_info[0] - - def check_for_class_genids(self, nest_dict): - genids = list() - - nest_dict_classes = nest_dict.get(self.CLASS_TAG, 
list()) - for nest_class_index in range(len(nest_dict_classes)): - nest_class = nest_dict_classes[nest_class_index] - nest_subclasses = nest_class.get(self.SUBCLASS_TAG, list()) - for nest_subclass_index in range(len(nest_subclasses)): - nest_subclass = nest_subclasses[nest_subclass_index] - potential_genid = nest_subclass.get(self.NODEID_TAG, str()) - if potential_genid.startswith(self.GENID_PREFIX): - genids.append(potential_genid) - - return genids - - - def check_for_restriction_genids(self, nest_dict): - for nest_restriction in nest_dict.get(self.RESTRICTION_TAG, dict()): - potential_genid = nest_restriction.get(self.NODEID_TAG, str()) - if potential_genid.startswith(self.GENID_PREFIX): - return potential_genid - return None - - def extract_class_id(self, nest_dict): - nest_dict_classes = nest_dict.get(self.CLASS_TAG, list()) - # Can't have competing class_ids - assert len(nest_dict_classes) <= 1 - - for nest_class_index in range(len(nest_dict_classes)): - nest_class = nest_dict_classes[nest_class_index] - return nest_class.get(self.RDF_ABOUT_TAG, str()) - - def store_genid_nest_in_class_nest(self, genid, genid_nest, class_nest): - output_class_nest = class_nest - - nest_dict_classes = class_nest.get(self.CLASS_TAG, list()) - for nest_class_index in range(len(nest_dict_classes)): - nest_class = nest_dict_classes[nest_class_index] - nest_subclasses = nest_class.get(self.SUBCLASS_TAG, list()) - for nest_subclass_index in range(len(nest_subclasses)): - nest_subclass = nest_subclasses[nest_subclass_index] - potential_genid = nest_subclass.get(self.NODEID_TAG, str()) - if potential_genid == genid: - output_class_nest[self.CLASS_TAG][nest_class_index][self.SUBCLASS_TAG][nest_subclass_index][self.RESTRICTION_TAG] = genid_nest[self.RESTRICTION_TAG] - - return output_class_nest - - - def write_to_output(self, output_dict, source_file): - output_dict[self.OWL_SOURCE_KEY] = source_file - output_dict[self.OWL_SOURCE_NAME_KEY] = self.input_file_names[source_file] - self.output.write(output_dict) - - return - - - def triage_nest_dict(self, nest_dict): - genids = self.check_for_class_genids(nest_dict) - restriction_genid = self.check_for_restriction_genids(nest_dict) - class_id = self.extract_class_id(nest_dict) - - if len(genids) > 0: - for genid in genids: - self.GENID_TO_ID[genid] = class_id - self.ID_TO_GENIDS[class_id] = genids - self.GENID_REMAINING_NESTS[class_id] = nest_dict - elif restriction_genid is not None: - class_id = self.GENID_TO_ID.get(restriction_genid, str()) - if len(class_id) == 0: - print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND") - - # Save to output despite not matching with an existing class - self.write_to_output(nest_dict, self.input_file) - return - class_nest = self.GENID_REMAINING_NESTS[class_id] - self.ID_TO_GENIDS[class_id].remove(restriction_genid) - updated_class_nest = self.store_genid_nest_in_class_nest(restriction_genid, nest_dict, class_nest) - - if len(self.ID_TO_GENIDS[class_id]) > 0: - self.GENID_REMAINING_NESTS[class_id] = updated_class_nest - else: - # Since all of the genids used in this class have been matched, output - self.write_to_output(nest_dict, self.input_file) - self.GENID_REMAINING_NESTS[class_id] = None - else: - # There are no genids that need to be worked with, so just output - self.write_to_output(nest_dict, self.input_file) - - - def parse_OWL_file(self): - for input_file in self.input_files: - self.input_file = input_file - print("Reading:", input_file, "starting at", date()) - 
+    def triage_nest_dict(self, nest_dict):
+        genids = self.check_for_class_genids(nest_dict)
+        restriction_genid = self.check_for_restriction_genids(nest_dict)
+        class_id = self.extract_class_id(nest_dict)
+
+        if len(genids) > 0:
+            for genid in genids:
+                self.GENID_TO_ID[genid] = class_id
+            self.ID_TO_GENIDS[class_id] = genids
+            self.GENID_REMAINING_NESTS[class_id] = nest_dict
+        elif restriction_genid is not None:
+            class_id = self.GENID_TO_ID.get(restriction_genid, str())
+            if len(class_id) == 0:
+                print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND")
+
+                # Save to output despite not matching with an existing class
+                self.write_to_output(nest_dict, self.input_file)
+                return
+            class_nest = self.GENID_REMAINING_NESTS[class_id]
+            self.ID_TO_GENIDS[class_id].remove(restriction_genid)
+            updated_class_nest = self.store_genid_nest_in_class_nest(restriction_genid, nest_dict, class_nest)
+
+            if len(self.ID_TO_GENIDS[class_id]) > 0:
+                self.GENID_REMAINING_NESTS[class_id] = updated_class_nest
+            else:
+                # All of the genids used in this class have been matched, so output the completed class nest
+                self.write_to_output(updated_class_nest, self.input_file)
+                self.GENID_REMAINING_NESTS[class_id] = None
+        else:
+            # There are no genids to resolve, so just output
+            self.write_to_output(nest_dict, self.input_file)
+
+    def parse_OWL_file(self):
+        for input_file in self.input_files:
+            self.input_file = input_file
+            print("Reading:", input_file, "starting at", date())
+            self.xml_parser.divide_into_lines(self.owl_file_path + input_file)
+
+            # Some classes may still have unmatched genids; include them in the output anyway
+            for item in self.GENID_REMAINING_NESTS:
+                if self.GENID_REMAINING_NESTS[item] is not None:
+                    self.write_to_output(self.GENID_REMAINING_NESTS[item], self.input_file)
+
+            # Refresh everything for the next file
+            self.GENID_REMAINING_NESTS = dict()
+            self.GENID_TO_ID = dict()
+            self.ID_TO_GENIDS = dict()
+
+        kg2_util.close_single_jsonlines(self.output_info, self.output_file_name)
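
+# Minimal usage sketch (hypothetical file names, for illustration only):
+#
+#   parser = OWLParser(["mondo.owl"], {"mondo.owl": "MONDO"},
+#                      "/path/to/owl/files/", "ontologies.jsonl")
+#   parser.parse_OWL_file()
+#
+# Each completed nest is written as one JSON line, tagged with the owl_source
+# and owl_source_name keys by write_to_output() above.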


 def identify_and_download_input_files(ont_load_inventory, path_to_owl_files):
-	input_files = list()
-	input_file_names = dict()
-	owl_file_path = path_to_owl_files.rstrip('/') + "/"
-	for item in ont_load_inventory:
-		input_files.append(item['file'])
-		input_file_names[item['file']] = item['title']
-		print("Downloading:", item['file'], "starting at", date())
-		kg2_util.download_file_if_not_exist_locally(item['url'], owl_file_path + item['file'])
-		print("Download of:", item['file'], "finished at", date())
-
-	return input_files, input_file_names, owl_file_path
+    input_files = list()
+    input_file_names = dict()
+    owl_file_path = path_to_owl_files.rstrip('/') + "/"
+    for item in ont_load_inventory:
+        input_files.append(item['file'])
+        input_file_names[item['file']] = item['title']
+        print("Downloading:", item['file'], "starting at", date())
+        kg2_util.download_file_if_not_exist_locally(item['url'], owl_file_path + item['file'])
+        print("Download of:", item['file'], "finished at", date())
+
+    return input_files, input_file_names, owl_file_path
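
+# The ontology load inventory YAML is expected to supply at least the 'file',
+# 'url', and 'title' keys used above; a made-up entry for illustration:
+#
+#   - file: mondo.owl
+#     url: http://purl.obolibrary.org/obo/mondo.owl
+#     title: MONDO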
print("Files:", input_files) - print("Start Time:", date()) - owl_parser = OWLParser(input_files, input_file_names, owl_file_path, output_file_name) - owl_parser.parse_OWL_file() - print("End Time:", date()) \ No newline at end of file + args = get_args() + input_file_name = args.inputFile + owl_path = args.owlFilePath + output_file_name = args.outputFile + + ont_load_inventory = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(input_file_name)) + input_files, input_file_names, owl_file_path = identify_and_download_input_files(ont_load_inventory, owl_path) + + print("Files:", input_files) + print("Start Time:", date()) + owl_parser = OWLParser(input_files, input_file_names, owl_file_path, output_file_name) + owl_parser.parse_OWL_file() + print("End Time:", date()) \ No newline at end of file