From 58d1bddc4ce6bf4f1d88f0cdc0be8c6cf4a9d549 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 01:31:50 -0700
Subject: [PATCH] #387 cleaning up the formatting of the new files

---
 convert/ontologies_jsonl_to_kg_jsonl.py | 748 +++++++++----------
 extract/owlparser.py                    | 924 ++++++++++++------------
 2 files changed, 854 insertions(+), 818 deletions(-)

diff --git a/convert/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py
index 4dfb9992..a27561f4 100644
--- a/convert/ontologies_jsonl_to_kg_jsonl.py
+++ b/convert/ontologies_jsonl_to_kg_jsonl.py
@@ -1,8 +1,25 @@
+#!/usr/bin/env python3
+''' ontologies_jsonl_to_kg_jsonl.py: Converts JSON Lines representation of ontologies into KG JSON Lines format
+
+    Usage: ontologies_jsonl_to_kg_jsonl.py [--test] <inputFile> <curiesToCategoriesYAML> <curiesToURLsYAML> <outputNodesFile> <outputEdgesFile>
+'''
+
+
 import argparse
 import kg2_util
 import json
 import datetime
 
+__author__ = 'Erica Wood'
+__copyright__ = 'Oregon State University'
+__credits__ = ['Stephen Ramsey', 'Erica Wood']
+__license__ = 'MIT'
+__version__ = '0.1.0'
+__maintainer__ = ''
+__email__ = ''
+__status__ = 'Prototype'
+
+
 ID_TAG = "rdf:about"
 NAME_TAG = "rdfs:label"
 
@@ -18,26 +35,26 @@ DESCRIPTION_DELIM = " // "
 BASE_EDGE_TYPES = {"mondo-base:exactMatch": RESOURCE_KEY,
-		"mondo-base:closeMatch": RESOURCE_KEY,
-		"mondo-base:relatedMatch": RESOURCE_KEY,
-		"mondo-base:broadMatch": RESOURCE_KEY,
-		"mondo-base:narrowMatch": RESOURCE_KEY,
-		"skos:exactMatch": RESOURCE_KEY,
-		"skos:closeMatch": RESOURCE_KEY,
-		"skos:broadMatch": RESOURCE_KEY,
-		"skos:relatedMatch": RESOURCE_KEY,
-		"skos:narrowMatch": RESOURCE_KEY,
-		"obo:IAO_0100001": RESOURCE_KEY,
-		"obo:RO_0002175": RESOURCE_KEY,
-		"obo:RO_0002161": RESOURCE_KEY,
-		"obo:RO_0002604": RESOURCE_KEY,
-		"obo:RO_0002171": RESOURCE_KEY,
-		"obo:RO_0002174": RESOURCE_KEY,
-		"obo:RO_0002475": RESOURCE_KEY,
-		"obo:RO_0001900": RESOURCE_KEY,
-		"oboInOwl:hasAlternativeId": TEXT_KEY,
-		"oboInOwl:hasDbXref": TEXT_KEY,
-		"oboInOwl:xref": TEXT_KEY}
+                   "mondo-base:closeMatch": RESOURCE_KEY,
+                   "mondo-base:relatedMatch": RESOURCE_KEY,
+                   "mondo-base:broadMatch": RESOURCE_KEY,
+                   "mondo-base:narrowMatch": RESOURCE_KEY,
+                   "skos:exactMatch": RESOURCE_KEY,
+                   "skos:closeMatch": RESOURCE_KEY,
+                   "skos:broadMatch": RESOURCE_KEY,
+                   "skos:relatedMatch": RESOURCE_KEY,
+                   "skos:narrowMatch": RESOURCE_KEY,
+                   "obo:IAO_0100001": RESOURCE_KEY,
+                   "obo:RO_0002175": RESOURCE_KEY,
+                   "obo:RO_0002161": RESOURCE_KEY,
+                   "obo:RO_0002604": RESOURCE_KEY,
+                   "obo:RO_0002171": RESOURCE_KEY,
+                   "obo:RO_0002174": RESOURCE_KEY,
+                   "obo:RO_0002475": RESOURCE_KEY,
+                   "obo:RO_0001900": RESOURCE_KEY,
+                   "oboInOwl:hasAlternativeId": TEXT_KEY,
+                   "oboInOwl:hasDbXref": TEXT_KEY,
+                   "oboInOwl:xref": TEXT_KEY}
 
 CLASS_TO_SUPERCLASSES = dict()
 SAVED_NODE_INFO = dict()
@@ -73,383 +90,386 @@ VERSION_KEY = "version"
 def get_args():
-	arg_parser = argparse.ArgumentParser()
-	arg_parser.add_argument('--test', dest='test',
-	                        action="store_true", default=False)
-	arg_parser.add_argument('inputFile', type=str)
-	arg_parser.add_argument('curiesToCategoriesYAML', type=str)
-	arg_parser.add_argument('curiesToURLsYAML', type=str)
-	arg_parser.add_argument('outputNodesFile', type=str)
-	arg_parser.add_argument('outputEdgesFile', type=str)
-	return arg_parser.parse_args()
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument('--test', dest='test',
+                            action="store_true", default=False)
+    arg_parser.add_argument('inputFile', type=str)
+    arg_parser.add_argument('curiesToCategoriesYAML', type=str)
+    arg_parser.add_argument('curiesToURLsYAML', type=str)
+    arg_parser.add_argument('outputNodesFile', type=str)
+    arg_parser.add_argument('outputEdgesFile', type=str)
+    return arg_parser.parse_args()
 
 
 def categorize_node(node_id, recursion_depth=0):
-	node_prefix = node_id.split(':')[0]
-
-	if node_id in NODE_CATEGORY_MAPPINGS and NODE_CATEGORY_MAPPINGS[node_id][1] == FILE_MAPPING:
-		return NODE_CATEGORY_MAPPINGS[node_id][0]
-
-	if node_prefix in PREFIX_MAPPINGS:
-		node_category = PREFIX_MAPPINGS[node_prefix]
-		NODE_CATEGORY_MAPPINGS[node_id] = (node_category, PREFIX_MAPPING)
-		return PREFIX_MAPPINGS[node_prefix]
-
-	# Get try to get the most common superclass categorization
-	superclass_categorizations = dict()
-	highest_value = 0
-	highest_category = kg2_util.BIOLINK_CATEGORY_NAMED_THING
-	if recursion_depth == 10:
-		return kg2_util.BIOLINK_CATEGORY_NAMED_THING
-
-	for superclass in CLASS_TO_SUPERCLASSES.get(node_id, list()):
-		superclass_category = categorize_node(superclass, recursion_depth + 1)
-		if superclass_category not in superclass_categorizations:
-			superclass_categorizations[superclass_category] = 0
-		superclass_categorizations[superclass_category] += 1
-		if superclass_categorizations[superclass_category] > highest_value:
-			highest_value = superclass_categorizations[superclass_category]
-			highest_category = superclass_category
-
-	NODE_CATEGORY_MAPPINGS[node_id] = (highest_category, RECURSE_MAPPING)
-	return highest_category
+    node_prefix = node_id.split(':')[0]
+
+    if node_id in NODE_CATEGORY_MAPPINGS and NODE_CATEGORY_MAPPINGS[node_id][1] == FILE_MAPPING:
+        return NODE_CATEGORY_MAPPINGS[node_id][0]
+
+    if node_prefix in PREFIX_MAPPINGS:
+        node_category = PREFIX_MAPPINGS[node_prefix]
+        NODE_CATEGORY_MAPPINGS[node_id] = (node_category, PREFIX_MAPPING)
+        return PREFIX_MAPPINGS[node_prefix]
+
+    # Try to get the most common superclass categorization
+    superclass_categorizations = dict()
+    highest_value = 0
+    highest_category = kg2_util.BIOLINK_CATEGORY_NAMED_THING
+    if recursion_depth == 10:
+        return kg2_util.BIOLINK_CATEGORY_NAMED_THING
+
+    for superclass in CLASS_TO_SUPERCLASSES.get(node_id, list()):
+        superclass_category = categorize_node(superclass, recursion_depth + 1)
+        if superclass_category not in superclass_categorizations:
+            superclass_categorizations[superclass_category] = 0
+        superclass_categorizations[superclass_category] += 1
+        if superclass_categorizations[superclass_category] > highest_value:
+            highest_value = superclass_categorizations[superclass_category]
+            highest_category = superclass_category
+
+    NODE_CATEGORY_MAPPINGS[node_id] = (highest_category, RECURSE_MAPPING)
+    return highest_category
 
 
 def reformat_obo_date(date_str):
-	if date_str is None:
-		return None
-
-	if '-' in date_str:
-		delim = 'T'
-		if ' ' in date_str:
-			delim = ' '
-		date_spl = date_str.strip('Z').split(delim)
-		date_fh = date_spl[0].split('-')
-		year = int(date_fh[0])
-		month = int(date_fh[1])
-		day = int(date_fh[2])
-
-		if month < 1 or month > 12 or day < 1 or day > 31:
-			return None
-
-		if len(date_spl) > 1:
-			date_sh = date_spl[1].split(':')
-			hour = int(date_sh[0])
-			minute = int(date_sh[1])
-			second = int(date_sh[2][0:1])
-
-			return datetime.datetime(year, month, day, hour, minute, second)
-		else:
-			return datetime.datetime(year, month, day)
-	else:
-		date_spl = date_str.split(' ')
-		date_fh = date_spl[0].split(':')
-		year = int(date_fh[2])
-		month = int(date_fh[1])
-		day = int(date_fh[0])
-
-		if month < 1 or month > 12 or day < 1 or day > 31:
-			return None
-
-		return datetime.datetime(year, month, day)
+    if date_str is None:
+        return None
+
+    if '-' in date_str:
+        delim = 'T'
+        if ' ' in date_str:
+            delim = ' '
+        date_spl = date_str.strip('Z').split(delim)
+        date_fh = date_spl[0].split('-')
+        year = int(date_fh[0])
+        month = int(date_fh[1])
+        day = int(date_fh[2])
+
+        if month < 1 or month > 12 or day < 1 or day > 31:
+            return None
+
+        if len(date_spl) > 1:
+            date_sh = date_spl[1].split(':')
+            hour = int(date_sh[0])
+            minute = int(date_sh[1])
+            second = int(date_sh[2][0:2])
+
+            return datetime.datetime(year, month, day, hour, minute, second)
+        else:
+            return datetime.datetime(year, month, day)
+    else:
+        date_spl = date_str.split(' ')
+        date_fh = date_spl[0].split(':')
+        year = int(date_fh[2])
+        month = int(date_fh[1])
+        day = int(date_fh[0])
+
+        if month < 1 or month > 12 or day < 1 or day > 31:
+            return None
+
+        return datetime.datetime(year, month, day)
 
 
 def pick_most_recent_date(dates, alternate_date=None):
-	latest_date = None
-	for date in dates:
-		if date == None:
-			continue
-		if latest_date == None or date > latest_date:
-			latest_date = date
-
-	if latest_date == None:
-		if alternate_date is not None:
-			latest_date = alternate_date
-		else:
-			return None
-
-	return latest_date.isoformat(sep=' ')
+    latest_date = None
+    for date in dates:
+        if date == None:
+            continue
+        if latest_date == None or date > latest_date:
+            latest_date = date
+
+    if latest_date == None:
+        if alternate_date is not None:
+            latest_date = alternate_date
+        else:
+            return None
+
+    return latest_date.isoformat(sep=' ')
 
 
 def process_ontology_term(ontology_node, source, ontology_name, owl_source=True):
-	owl_prefix = ""
-	if owl_source:
-		owl_prefix = "owl:"
-	ontology_version = None
-	ontology_versions = [version.get(TEXT_KEY, str()) for version in ontology_node.get(owl_prefix + "versionInfo", list()) if TEXT_KEY in version]
-	ontology_version_iri = [version.get(RESOURCE_KEY, str()) for version in ontology_node.get(owl_prefix + "versionIRI", list()) if RESOURCE_KEY in version]
-	ontology_dates =
[reformat_obo_date(version.get(TEXT_KEY, str())) for date_type in ["oboInOwl:date", "dcterms:date", "dc:date"] for version in ontology_node.get(date_type, list()) if TEXT_KEY in version] + ontology_iri = ontology_node.get("rdf:about", str()) + if len(ontology_versions) == 1: + ontology_version = ontology_versions[0] + elif len(ontology_version_iri) == 1: + ontology_version = ontology_version_iri[0] + version_replacements = [ontology_iri.replace('.owl', '') + '/', '/' + source, 'releases/'] + for replacement in version_replacements: + ontology_version = ontology_version.replace(replacement, "") + ontology_version = ontology_version.split('/')[0] + elif len(ontology_dates) >= 1: + ontology_version = pick_most_recent_date(ontology_dates) + + if ontology_version is None: + print("Warning: source", source, "lacks any versioning information.") + + ontology_date = reformat_obo_date(pick_most_recent_date(ontology_dates)) + source_id = kg2_util.CURIE_PREFIX_OBO + ':' + source + + if source not in SOURCE_INFO: + SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: ontology_iri, NAME_KEY: ontology_name, UPDATE_DATE_KEY: ontology_date, VERSION_KEY: ontology_version} def process_ontology_class(owl_class, source, ontology_name, owl_source=True): - owl_prefix = "" - if owl_source: - owl_prefix = "owl:" - # Typically genid classes which don't neatly map onto the KG2 schema - if ID_TAG not in owl_class: - return - node_id = match_prefix(owl_class.get(ID_TAG, str())) - if node_id is None: - return - node_prefix = node_id.split(':')[0] - node_iri = PREFIX_TO_IRI_MAP[node_prefix] + node_id.replace(node_prefix + ':', '') - - # Configure the name - name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name] - if len(name_list) == 0: - return - - # Configure the description - description_list = list() - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:IAO_0000115", list()) if (TEXT_KEY in description)] - description_list += [COMMENT_PREFIX + description.get(TEXT_KEY, str()) for description in owl_class.get("rdfs:comment", list()) if (TEXT_KEY in description)] - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000001", list()) if (TEXT_KEY in description)] - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000005", list()) if (TEXT_KEY in description)] - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("efo1:source_description", list()) if (TEXT_KEY in description)] - - deprecated = "true" in owl_class.get(owl_prefix + "deprecated", list()) - for name in name_list: - search_name = name.lower() - if search_name.startswith("obsolete") or search_name.startswith("(obsolete") or search_name.endswith("obsolete"): - deprecated = True - - # Configure the synonyms - synonym_list = list() - synonym_keys = ["oboInOwl:hasExactSynonym", "oboInOwl:hasRelatedSynonym", "oboInOwl:hasNarrowSynonym", "oboInOwl:hasBroadSynonym", "go:hasExactSynonym", - "go:hasSynonym", "go:hasNarrowSynonym", "go:hasBroadSynonym", "obo:IAO_0000118", "obo:IAO_0000589", "go:hasRelatedSynonym", "obo:IAO_0000111", - "obo:IAO_0000028", "skos:prefLabel"] - synonym_list += [synonym.get(TEXT_KEY, None) for synonym_key in synonym_keys for synonym in owl_class.get(synonym_key, list()) if (TEXT_KEY in synonym)] - - update_date_list = list() - update_date_keys = ["dc:date", "dcterms:date", "terms:date"] - update_date_list += 
[reformat_obo_date(update_date.get(TEXT_KEY, None)) for update_date_key in update_date_keys for update_date in owl_class.get(update_date_key, list()) if (TEXT_KEY in update_date)] - - creation_date_list = list() - creation_date_keys = ["oboInOwl:creation_date", "go:creation_date"] - creation_date_list += [reformat_obo_date(creation_date.get(TEXT_KEY, None)) for creation_date_key in creation_date_keys for creation_date in owl_class.get(creation_date_key, list()) if (TEXT_KEY in creation_date)] - - # Configure the biological sequence - has_biological_sequence = dict() - has_biological_sequence['formula'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:formula", list()) if TEXT_KEY in biological_sequence] - has_biological_sequence['smiles'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:smiles", list()) if TEXT_KEY in biological_sequence] - has_biological_sequence['inchi'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchi", list()) if TEXT_KEY in biological_sequence] - has_biological_sequence['inchikey'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchikey", list()) if TEXT_KEY in biological_sequence] - - # Extract edge triples - edges_list = list() - - for edge_type in BASE_EDGE_TYPES: - for edge in owl_class.get(edge_type, list()): - if BASE_EDGE_TYPES[edge_type] in edge: - edges_list.append((edge_type, edge.get(BASE_EDGE_TYPES[edge_type], None))) - - - restriction_edges = list() - restriction_edges += [(edge, "rdfs:subClassOf") for edge in owl_class.get("rdfs:subClassOf", list())] - for equiv in owl_class.get(owl_prefix + "equivalentClass", list()): - for mini_class in equiv.get(owl_prefix + "Class", list()): - for edge in mini_class.get(owl_prefix + "intersectionOf", list()): - restriction_edges.append((edge, owl_prefix + "equivalentClass")) - - for (edge, general_edge_type) in restriction_edges: - for restriction in edge.get(owl_prefix + "Restriction", list()): - edge_type = restriction.get(owl_prefix + "onProperty", list()) - edge_object = restriction.get(owl_prefix + "someValuesFrom", list()) - if len(edge_type) != 1: - assert len(edge_type) <= 1, edge - continue - if len(edge_object) != 1: - assert len(edge_object) <= 1, edge - continue - edge_type = edge_type[0].get(RESOURCE_KEY, None) - edge_object = edge_object[0].get(RESOURCE_KEY, None) - - if edge_type != None and edge_object != None: - edges_list.append((edge_type, edge_object)) - - if RESOURCE_KEY in edge: - edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None))) - - superclasses = set() - final_edges_list = list() - for (edge_relation, edge_object) in edges_list: - edge_object = match_prefix(edge_object) - if edge_object is None: - continue - edge_relation = match_prefix(edge_relation) - if edge_relation is None: - continue - if edge_relation in ["rdfs:subClassOf"]: - superclasses.add(edge_object) - final_edges_list.append((edge_relation, edge_object)) - - # Imperfect way to make it deterministic - superclasses = sorted(list(superclasses)) - if node_id not in CLASS_TO_SUPERCLASSES: - CLASS_TO_SUPERCLASSES[node_id] = list() - CLASS_TO_SUPERCLASSES[node_id] += superclasses - CLASS_TO_SUPERCLASSES[node_id] = sorted(list(set(CLASS_TO_SUPERCLASSES[node_id]))) - - if node_id not in SAVED_NODE_INFO: - SAVED_NODE_INFO[node_id] = list() - SAVED_NODE_INFO[node_id].append({ID_KEY: node_id, - DEPRECATED_KEY: deprecated, - UPDATE_DATE_KEY: 
update_date_list, - CREATION_DATE_KEY: creation_date_list, - SYNONYM_KEY: synonym_list, - DESCRIPTION_KEY: description_list, - NAME_KEY: name_list, - SOURCE_KEY: source, - BIOLOGICAL_SEQUENCE_KEY: has_biological_sequence, - IRI_KEY: node_iri, - EDGES_KEY: final_edges_list}) + owl_prefix = "" + if owl_source: + owl_prefix = "owl:" + # Typically genid classes which don't neatly map onto the KG2 schema + if ID_TAG not in owl_class: + return + node_id = match_prefix(owl_class.get(ID_TAG, str())) + if node_id is None: + return + node_prefix = node_id.split(':')[0] + node_iri = PREFIX_TO_IRI_MAP[node_prefix] + node_id.replace(node_prefix + ':', '') + + # Configure the name + name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name] + if len(name_list) == 0: + return + + # Configure the description + description_list = list() + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:IAO_0000115", list()) if (TEXT_KEY in description)] + description_list += [COMMENT_PREFIX + description.get(TEXT_KEY, str()) for description in owl_class.get("rdfs:comment", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000001", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000005", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("efo1:source_description", list()) if (TEXT_KEY in description)] + + deprecated = "true" in owl_class.get(owl_prefix + "deprecated", list()) + for name in name_list: + search_name = name.lower() + if search_name.startswith("obsolete") or search_name.startswith("(obsolete") or search_name.endswith("obsolete"): + deprecated = True + + # Configure the synonyms + synonym_list = list() + synonym_keys = ["oboInOwl:hasExactSynonym", "oboInOwl:hasRelatedSynonym", "oboInOwl:hasNarrowSynonym", "oboInOwl:hasBroadSynonym", "go:hasExactSynonym", + "go:hasSynonym", "go:hasNarrowSynonym", "go:hasBroadSynonym", "obo:IAO_0000118", "obo:IAO_0000589", "go:hasRelatedSynonym", "obo:IAO_0000111", + "obo:IAO_0000028", "skos:prefLabel"] + synonym_list += [synonym.get(TEXT_KEY, None) for synonym_key in synonym_keys for synonym in owl_class.get(synonym_key, list()) if (TEXT_KEY in synonym)] + + update_date_list = list() + update_date_keys = ["dc:date", "dcterms:date", "terms:date"] + update_date_list += [reformat_obo_date(update_date.get(TEXT_KEY, None)) for update_date_key in update_date_keys for update_date in owl_class.get(update_date_key, list()) if (TEXT_KEY in update_date)] + + creation_date_list = list() + creation_date_keys = ["oboInOwl:creation_date", "go:creation_date"] + creation_date_list += [reformat_obo_date(creation_date.get(TEXT_KEY, None)) for creation_date_key in creation_date_keys for creation_date in owl_class.get(creation_date_key, list()) if (TEXT_KEY in creation_date)] + + # Configure the biological sequence + has_biological_sequence = dict() + has_biological_sequence['formula'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:formula", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['smiles'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:smiles", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['inchi'] = 
[biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchi", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['inchikey'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchikey", list()) if TEXT_KEY in biological_sequence] + + # Extract edge triples + edges_list = list() + + for edge_type in BASE_EDGE_TYPES: + for edge in owl_class.get(edge_type, list()): + if BASE_EDGE_TYPES[edge_type] in edge: + edges_list.append((edge_type, edge.get(BASE_EDGE_TYPES[edge_type], None))) + + + restriction_edges = list() + restriction_edges += [(edge, "rdfs:subClassOf") for edge in owl_class.get("rdfs:subClassOf", list())] + for equiv in owl_class.get(owl_prefix + "equivalentClass", list()): + for mini_class in equiv.get(owl_prefix + "Class", list()): + for edge in mini_class.get(owl_prefix + "intersectionOf", list()): + restriction_edges.append((edge, owl_prefix + "equivalentClass")) + + for (edge, general_edge_type) in restriction_edges: + for restriction in edge.get(owl_prefix + "Restriction", list()): + edge_type = restriction.get(owl_prefix + "onProperty", list()) + edge_object = restriction.get(owl_prefix + "someValuesFrom", list()) + if len(edge_type) != 1: + assert len(edge_type) <= 1, edge + continue + if len(edge_object) != 1: + assert len(edge_object) <= 1, edge + continue + edge_type = edge_type[0].get(RESOURCE_KEY, None) + edge_object = edge_object[0].get(RESOURCE_KEY, None) + + if edge_type != None and edge_object != None: + edges_list.append((edge_type, edge_object)) + + if RESOURCE_KEY in edge: + edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None))) + + superclasses = set() + final_edges_list = list() + for (edge_relation, edge_object) in edges_list: + edge_object = match_prefix(edge_object) + if edge_object is None: + continue + edge_relation = match_prefix(edge_relation) + if edge_relation is None: + continue + if edge_relation in ["rdfs:subClassOf"]: + superclasses.add(edge_object) + final_edges_list.append((edge_relation, edge_object)) + + # Imperfect way to make it deterministic + superclasses = sorted(list(superclasses)) + if node_id not in CLASS_TO_SUPERCLASSES: + CLASS_TO_SUPERCLASSES[node_id] = list() + CLASS_TO_SUPERCLASSES[node_id] += superclasses + CLASS_TO_SUPERCLASSES[node_id] = sorted(list(set(CLASS_TO_SUPERCLASSES[node_id]))) + + if node_id not in SAVED_NODE_INFO: + SAVED_NODE_INFO[node_id] = list() + SAVED_NODE_INFO[node_id].append({ID_KEY: node_id, + DEPRECATED_KEY: deprecated, + UPDATE_DATE_KEY: update_date_list, + CREATION_DATE_KEY: creation_date_list, + SYNONYM_KEY: synonym_list, + DESCRIPTION_KEY: description_list, + NAME_KEY: name_list, + SOURCE_KEY: source, + BIOLOGICAL_SEQUENCE_KEY: has_biological_sequence, + IRI_KEY: node_iri, + EDGES_KEY: final_edges_list}) def process_ontology_item(ontology_item): - source = ontology_item.get(OWL_SOURCE_KEY, str()) - ontology_name = ontology_item.get(OWL_SOURCE_NAME_KEY, str()) + source = ontology_item.get(OWL_SOURCE_KEY, str()) + ontology_name = ontology_item.get(OWL_SOURCE_NAME_KEY, str()) - for owl_class in ontology_item.get("owl:Class", list()): - process_ontology_class(owl_class, source, ontology_name) + for owl_class in ontology_item.get("owl:Class", list()): + process_ontology_class(owl_class, source, ontology_name) - for owl_class in ontology_item.get("Class", list()): - process_ontology_class(owl_class, source, ontology_name, False) + for owl_class in ontology_item.get("Class", list()): + 
process_ontology_class(owl_class, source, ontology_name, False) - for ontology_node in ontology_item.get("owl:Ontology", list()): - process_ontology_term(ontology_node, source, ontology_name) + for ontology_node in ontology_item.get("owl:Ontology", list()): + process_ontology_term(ontology_node, source, ontology_name) - # Because of ORDO - for ontology_node in ontology_item.get("Ontology", list()): - process_ontology_term(ontology_node, source, ontology_name, False) + # Because of ORDO + for ontology_node in ontology_item.get("Ontology", list()): + process_ontology_term(ontology_node, source, ontology_name, False) def generate_uri_map(curies_to_urls_file_name): - uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_file_name)) - bidirectional_map = uri_input_map['use_for_bidirectional_mapping'] - contraction_map = uri_input_map['use_for_contraction_only'] - - for curie_prefix_dict in bidirectional_map: - for curie_prefix in curie_prefix_dict: - curie_url = curie_prefix_dict[curie_prefix] - URI_MAP[curie_url] = curie_prefix - PREFIX_TO_IRI_MAP[curie_prefix] = curie_url - - for curie_prefix_dict in contraction_map: - for curie_prefix in curie_prefix_dict: - curie_url = curie_prefix_dict[curie_prefix] - URI_MAP[curie_url] = curie_prefix - - # So that you get the most accurate match, you want to match to the longest url (in case one is a substring of another) - # Apparently have to use global key word to write to a module wide list (https://stackoverflow.com/questions/4630543/defining-lists-as-global-variables-in-python) - global URI_MAP_KEYS - URI_MAP_KEYS = sorted(URI_MAP.keys(), key=len, reverse=True) + uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_file_name)) + bidirectional_map = uri_input_map['use_for_bidirectional_mapping'] + contraction_map = uri_input_map['use_for_contraction_only'] + + for curie_prefix_dict in bidirectional_map: + for curie_prefix in curie_prefix_dict: + curie_url = curie_prefix_dict[curie_prefix] + URI_MAP[curie_url] = curie_prefix + PREFIX_TO_IRI_MAP[curie_prefix] = curie_url + + for curie_prefix_dict in contraction_map: + for curie_prefix in curie_prefix_dict: + curie_url = curie_prefix_dict[curie_prefix] + URI_MAP[curie_url] = curie_prefix + + # So that you get the most accurate match, you want to match to the longest url (in case one is a substring of another) + # Apparently have to use global key word to write to a module wide list (https://stackoverflow.com/questions/4630543/defining-lists-as-global-variables-in-python) + global URI_MAP_KEYS + URI_MAP_KEYS = sorted(URI_MAP.keys(), key=len, reverse=True) def match_prefix(node_id): - for curie_url in URI_MAP_KEYS: - if node_id.startswith(curie_url): - return node_id.replace(curie_url, URI_MAP[curie_url] + ":") - - if "http" in node_id: - MISSING_ID_PREFIXES.add('/'.join(node_id.split('/')[0:-1]) + "/") - elif ':' in node_id: - MISSING_ID_PREFIXES.add(node_id.split(':')[0] + ":") - elif '_' in node_id: - MISSING_ID_PREFIXES.add(node_id.split('_')[0] + "_") - else: - MISSING_ID_PREFIXES.add(node_id) + for curie_url in URI_MAP_KEYS: + if node_id.startswith(curie_url): + return node_id.replace(curie_url, URI_MAP[curie_url] + ":") + + if "http" in node_id: + MISSING_ID_PREFIXES.add('/'.join(node_id.split('/')[0:-1]) + "/") + elif ':' in node_id: + MISSING_ID_PREFIXES.add(node_id.split(':')[0] + ":") + elif '_' in node_id: + MISSING_ID_PREFIXES.add(node_id.split('_')[0] + "_") + else: + MISSING_ID_PREFIXES.add(node_id) def 
construct_nodes_and_edges(nodes_output, edges_output): - for source in SOURCE_INFO: - source_date = pick_most_recent_date([SOURCE_INFO[source][UPDATE_DATE_KEY]]) - source_name = SOURCE_INFO[source][NAME_KEY] + " v" + SOURCE_INFO[source][VERSION_KEY] - source_id = SOURCE_INFO[source][SOURCE_KEY] - source_iri = SOURCE_INFO[source][IRI_KEY] - node = kg2_util.make_node(source_id, source_iri, source_name, kg2_util.BIOLINK_CATEGORY_INFORMATION_CONTENT_ENTITY, source_date, source_id) + for source in SOURCE_INFO: + source_date = pick_most_recent_date([SOURCE_INFO[source][UPDATE_DATE_KEY]]) + source_name = SOURCE_INFO[source][NAME_KEY] + " v" + SOURCE_INFO[source][VERSION_KEY] + source_id = SOURCE_INFO[source][SOURCE_KEY] + source_iri = SOURCE_INFO[source][IRI_KEY] + node = kg2_util.make_node(source_id, source_iri, source_name, kg2_util.BIOLINK_CATEGORY_INFORMATION_CONTENT_ENTITY, source_date, source_id) - nodes_output.write(node) + nodes_output.write(node) - for node_id in SAVED_NODE_INFO: - for source_node_index in range(len(SAVED_NODE_INFO[node_id])): - if SAVED_NODE_INFO[node_id][source_node_index][DEPRECATED_KEY]: - continue - name = SAVED_NODE_INFO[node_id][source_node_index][NAME_KEY][0] # Imperfect way of choosing the name - node_iri = SAVED_NODE_INFO[node_id][source_node_index][IRI_KEY] - description = DESCRIPTION_DELIM.join(SAVED_NODE_INFO[node_id][source_node_index][DESCRIPTION_KEY]) - has_biological_sequence = SAVED_NODE_INFO[node_id][source_node_index][BIOLOGICAL_SEQUENCE_KEY].get("smiles", None) - synonyms = SAVED_NODE_INFO[node_id][source_node_index][SYNONYM_KEY] - category = SAVED_NODE_INFO[node_id][source_node_index][CATEGORY_KEY] + for node_id in SAVED_NODE_INFO: + for source_node_index in range(len(SAVED_NODE_INFO[node_id])): + if SAVED_NODE_INFO[node_id][source_node_index][DEPRECATED_KEY]: + continue + name = SAVED_NODE_INFO[node_id][source_node_index][NAME_KEY][0] # Imperfect way of choosing the name + node_iri = SAVED_NODE_INFO[node_id][source_node_index][IRI_KEY] + description = DESCRIPTION_DELIM.join(SAVED_NODE_INFO[node_id][source_node_index][DESCRIPTION_KEY]) + has_biological_sequence = SAVED_NODE_INFO[node_id][source_node_index][BIOLOGICAL_SEQUENCE_KEY].get("smiles", None) + synonyms = SAVED_NODE_INFO[node_id][source_node_index][SYNONYM_KEY] + category = SAVED_NODE_INFO[node_id][source_node_index][CATEGORY_KEY] - source = SAVED_NODE_INFO[node_id][source_node_index][SOURCE_KEY] - provided_by = kg2_util.CURIE_PREFIX_OBO + ':' + source - source_date = SOURCE_INFO[source][UPDATE_DATE_KEY] + source = SAVED_NODE_INFO[node_id][source_node_index][SOURCE_KEY] + provided_by = kg2_util.CURIE_PREFIX_OBO + ':' + source + source_date = SOURCE_INFO[source][UPDATE_DATE_KEY] - update_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][UPDATE_DATE_KEY], source_date) - creation_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][CREATION_DATE_KEY], source_date) + update_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][UPDATE_DATE_KEY], source_date) + creation_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][CREATION_DATE_KEY], source_date) - node = kg2_util.make_node(node_id, node_iri, name, category, update_date, provided_by) - node["description"] = description - node["has_biological_sequence"] = has_biological_sequence - node["creation_date"] = creation_date - node["synonym"] = synonyms + node = kg2_util.make_node(node_id, node_iri, name, category, update_date, provided_by) + node["description"] = 
description + node["has_biological_sequence"] = has_biological_sequence + node["creation_date"] = creation_date + node["synonym"] = synonyms - nodes_output.write(node) + nodes_output.write(node) - for (edge_relation, edge_object) in SAVED_NODE_INFO[node_id][source_node_index][EDGES_KEY]: - relation_label = edge_relation.split(':')[1] - edge = kg2_util.make_edge(node_id, edge_object, edge_relation, relation_label, provided_by, update_date) + for (edge_relation, edge_object) in SAVED_NODE_INFO[node_id][source_node_index][EDGES_KEY]: + relation_label = edge_relation.split(':')[1] + edge = kg2_util.make_edge(node_id, edge_object, edge_relation, relation_label, provided_by, update_date) - edges_output.write(edge) + edges_output.write(edge) if __name__ == '__main__': - args = get_args() - input_file_name = args.inputFile - curies_to_categories_file_name = args.curiesToCategoriesYAML - curies_to_urls_file_name = args.curiesToURLsYAML - output_nodes_file_name = args.outputNodesFile - output_edges_file_name = args.outputEdgesFile - test_mode = args.test - - nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode) - nodes_output = nodes_info[0] - edges_output = edges_info[0] - - curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name)) - for mapping_node in curies_to_categories_data["term-mappings"]: - NODE_CATEGORY_MAPPINGS[mapping_node] = (curies_to_categories_data["term-mappings"][mapping_node], FILE_MAPPING) - for prefix in curies_to_categories_data["prefix-mappings"]: - PREFIX_MAPPINGS[prefix] = curies_to_categories_data["prefix-mappings"][prefix] - - input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name) - input_data = input_read_jsonlines_info[0] - - ontology_prefixes = set() - generate_uri_map(curies_to_urls_file_name) - for ontology_item in input_data: - process_ontology_item(ontology_item) - - for node_id in SAVED_NODE_INFO: - categorize_node(node_id) - node_category = NODE_CATEGORY_MAPPINGS[node_id][0] - for index in range(len(SAVED_NODE_INFO[node_id])): - SAVED_NODE_INFO[node_id][index][CATEGORY_KEY] = node_category - - construct_nodes_and_edges(nodes_output, edges_output) - - kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) \ No newline at end of file + print("Start time: ", kg2_util.date()) + args = get_args() + input_file_name = args.inputFile + curies_to_categories_file_name = args.curiesToCategoriesYAML + curies_to_urls_file_name = args.curiesToURLsYAML + output_nodes_file_name = args.outputNodesFile + output_edges_file_name = args.outputEdgesFile + test_mode = args.test + + nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode) + nodes_output = nodes_info[0] + edges_output = edges_info[0] + + curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name)) + for mapping_node in curies_to_categories_data["term-mappings"]: + NODE_CATEGORY_MAPPINGS[mapping_node] = (curies_to_categories_data["term-mappings"][mapping_node], FILE_MAPPING) + for prefix in curies_to_categories_data["prefix-mappings"]: + PREFIX_MAPPINGS[prefix] = curies_to_categories_data["prefix-mappings"][prefix] + + input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name) + input_data = input_read_jsonlines_info[0] + + ontology_prefixes = set() + generate_uri_map(curies_to_urls_file_name) + for ontology_item in input_data: + process_ontology_item(ontology_item) + + for node_id in 
SAVED_NODE_INFO: + categorize_node(node_id) + node_category = NODE_CATEGORY_MAPPINGS[node_id][0] + for index in range(len(SAVED_NODE_INFO[node_id])): + SAVED_NODE_INFO[node_id][index][CATEGORY_KEY] = node_category + + construct_nodes_and_edges(nodes_output, edges_output) + + kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) + + print("Finish time: ", kg2_util.date()) diff --git a/extract/owlparser.py b/extract/owlparser.py index 34e99fe3..fe540f3b 100644 --- a/extract/owlparser.py +++ b/extract/owlparser.py @@ -1,524 +1,540 @@ +#!/usr/bin/env python3 +''' owlparser.py: Converts OWL (XML) Files into JSON Lines Representations + + Usage: owlparser.py [--test] +''' + import json import argparse import datetime import kg2_util +__author__ = 'Erica Wood' +__copyright__ = 'Oregon State University' +__credits__ = ['Stephen Ramsey', 'Erica Wood'] +__license__ = 'MIT' +__version__ = '0.1.0' +__maintainer__ = '' +__email__ = '' +__status__ = 'Prototype' + + def get_args(): - arg_parser = argparse.ArgumentParser() - arg_parser.add_argument('--test', dest='test', - action="store_true", default=False) - arg_parser.add_argument('inputFile', type=str) - arg_parser.add_argument('owlFilePath', type=str) - arg_parser.add_argument('outputFile', type=str) - return arg_parser.parse_args() + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument('--test', dest='test', + action="store_true", default=False) + arg_parser.add_argument('inputFile', type=str) + arg_parser.add_argument('owlFilePath', type=str) + arg_parser.add_argument('outputFile', type=str) + return arg_parser.parse_args() def date(): - return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") class LineElementRead(): - NONE = 0 - TAG = 1 - ATTRIBUTE_TAG = 2 - ATTRIBUTE_TEXT = 3 - MAIN = 4 - END_TAG = 5 + NONE = 0 + TAG = 1 + ATTRIBUTE_TAG = 2 + ATTRIBUTE_TEXT = 3 + MAIN = 4 + END_TAG = 5 class XMLParser(): - def __init__(self, skip_tags, ignored_attributes, processing_func): - self.COMMENT = "!--" - self.OUTMOST_TAGS_SKIP = skip_tags - self.IGNORED_ATTRIBUTES = ignored_attributes - self.processing_func = processing_func - - self.LINE_TYPE_IGNORE = "ignore" - self.LINE_TYPE_START_NEST = "start nest" - self.LINE_TYPE_START_NEST_WITH_ATTR = "start nest with attributes" - self.LINE_TYPE_ENTRY = "entry" - self.LINE_TYPE_ENTRY_WITH_ATTR = "entry with attributes" - self.LINE_TYPE_ENTRY_ONLY_ATTR = "entry with only attributes" - self.LINE_TYPE_END_NEST = "end nest" - - self.KEY_TAG = "tag" - self.KEY_ATTRIBUTES = "attributes" - self.KEY_TEXT = "ENTRY_TEXT" - self.KEY_TYPE = "type" - - # Variables for line reading - self.tag = "" - self.attributes = dict() - self.attribute_tag = "" - self.attribute_text = "" - self.main_text = "" - self.end_tag = "" - self.only_tag = False - self.start_brackets = 0 - self.line = "" - self.letter = "" - self.next_letter = "" - self.prev_letter = "" - self.type_to_read = LineElementRead.NONE - - def categorize_line(self): - # Categorize the type of line - line_type = str() - out = dict() - - # Putting "only_tag" here isn't necessarily the best idea, but I don't know what else to do with it - if self.tag == self.COMMENT or self.tag in self.OUTMOST_TAGS_SKIP or self.end_tag in self.OUTMOST_TAGS_SKIP or self.only_tag: - line_type = self.LINE_TYPE_IGNORE - else: - start_tag_exists = (self.tag != str()) - attributes_exist = (self.attributes != dict()) - text_exists = (self.main_text != str()) - 
end_tag_exists = (self.end_tag != str()) - - if start_tag_exists: - if attributes_exist: - if text_exists: - line_type = self.LINE_TYPE_ENTRY_WITH_ATTR - out[self.KEY_TAG] = self.tag - out[self.KEY_ATTRIBUTES] = self.attributes - out[self.KEY_TEXT] = self.main_text - elif end_tag_exists: - line_type = self.LINE_TYPE_ENTRY_ONLY_ATTR - out[self.KEY_TAG] = self.tag - out[self.KEY_ATTRIBUTES] = self.attributes - else: - line_type = self.LINE_TYPE_START_NEST_WITH_ATTR - out[self.KEY_TAG] = self.tag - out[self.KEY_ATTRIBUTES] = self.attributes - elif text_exists: - line_type = self.LINE_TYPE_ENTRY - out[self.KEY_TAG] = self.tag - out[self.KEY_TEXT] = self.main_text - else: - line_type = self.LINE_TYPE_START_NEST - out[self.KEY_TAG] = self.tag - elif end_tag_exists: - line_type = self.LINE_TYPE_END_NEST - out[self.KEY_TAG] = self.end_tag - - out[self.KEY_TYPE] = line_type - - return out - - def get_letters(self, letter_index): - self.letter = self.line[letter_index] - self.next_letter = "" - self.prev_letter = "" - if letter_index + 1 < len(self.line): - self.next_letter = self.line[letter_index + 1] - if letter_index - 1 >= 0: - self.prev_letter = self.line[letter_index - 1] - - if self.letter == '<': - self.start_brackets += 1 - if self.letter == '>': - self.start_brackets -= 1 - - - def identify_tag_type(self, letter_index): - changed = True - - if self.letter == '<' and letter_index == 0: - if self.next_letter != '/': - self.type_to_read = LineElementRead.TAG - elif self.letter == '/' and self.prev_letter == '<': - self.type_to_read = LineElementRead.END_TAG - else: - changed = False - - return changed - - - def read_tag(self): - changed = True - - if self.letter == ' ' and self.type_to_read == LineElementRead.TAG: - self.type_to_read = LineElementRead.ATTRIBUTE_TAG - elif self.letter == '>' and self.type_to_read == LineElementRead.TAG and self.start_brackets == 0: - self.type_to_read = LineElementRead.MAIN - - if self.prev_letter == '/': - print("Warning - strange tag, ignoring", self.line) - self.only_tag = True - elif self.type_to_read == LineElementRead.TAG: - self.tag += self.letter - else: - changed = False - - return changed - - - def store_attribute(self): - if self.attribute_tag not in self.IGNORED_ATTRIBUTES: - self.attributes[self.attribute_tag] = self.attribute_text.strip('/').strip('"') - self.attribute_tag = "" - self.attribute_text = "" - - - def read_attributes(self): - changed = True - start_reading_attributes = (self.type_to_read == LineElementRead.ATTRIBUTE_TAG or self.type_to_read == LineElementRead.ATTRIBUTE_TEXT) - - if self.letter == '>' and start_reading_attributes and self.start_brackets == 0: - self.type_to_read = LineElementRead.MAIN - - self.store_attribute() - - if self.prev_letter == '/': - self.end_tag = self.tag - elif start_reading_attributes: - if self.letter == '=' and self.type_to_read == LineElementRead.ATTRIBUTE_TAG: - self.type_to_read = LineElementRead.ATTRIBUTE_TEXT - elif self.type_to_read == LineElementRead.ATTRIBUTE_TAG: - self.attribute_tag += self.letter - elif self.letter == ' ' and self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: - self.type_to_read = LineElementRead.ATTRIBUTE_TAG - self.store_attribute() - elif self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: - self.attribute_text += self.letter - else: - changed = False + def __init__(self, skip_tags, ignored_attributes, processing_func): + self.COMMENT = "!--" + self.OUTMOST_TAGS_SKIP = skip_tags + self.IGNORED_ATTRIBUTES = ignored_attributes + self.processing_func = processing_func + 
+ self.LINE_TYPE_IGNORE = "ignore" + self.LINE_TYPE_START_NEST = "start nest" + self.LINE_TYPE_START_NEST_WITH_ATTR = "start nest with attributes" + self.LINE_TYPE_ENTRY = "entry" + self.LINE_TYPE_ENTRY_WITH_ATTR = "entry with attributes" + self.LINE_TYPE_ENTRY_ONLY_ATTR = "entry with only attributes" + self.LINE_TYPE_END_NEST = "end nest" + + self.KEY_TAG = "tag" + self.KEY_ATTRIBUTES = "attributes" + self.KEY_TEXT = "ENTRY_TEXT" + self.KEY_TYPE = "type" + + # Variables for line reading + self.tag = "" + self.attributes = dict() + self.attribute_tag = "" + self.attribute_text = "" + self.main_text = "" + self.end_tag = "" + self.only_tag = False + self.start_brackets = 0 + self.line = "" + self.letter = "" + self.next_letter = "" + self.prev_letter = "" + self.type_to_read = LineElementRead.NONE + + def categorize_line(self): + # Categorize the type of line + line_type = str() + out = dict() + + # Putting "only_tag" here isn't necessarily the best idea, but I don't know what else to do with it + if self.tag == self.COMMENT or self.tag in self.OUTMOST_TAGS_SKIP or self.end_tag in self.OUTMOST_TAGS_SKIP or self.only_tag: + line_type = self.LINE_TYPE_IGNORE + else: + start_tag_exists = (self.tag != str()) + attributes_exist = (self.attributes != dict()) + text_exists = (self.main_text != str()) + end_tag_exists = (self.end_tag != str()) + + if start_tag_exists: + if attributes_exist: + if text_exists: + line_type = self.LINE_TYPE_ENTRY_WITH_ATTR + out[self.KEY_TAG] = self.tag + out[self.KEY_ATTRIBUTES] = self.attributes + out[self.KEY_TEXT] = self.main_text + elif end_tag_exists: + line_type = self.LINE_TYPE_ENTRY_ONLY_ATTR + out[self.KEY_TAG] = self.tag + out[self.KEY_ATTRIBUTES] = self.attributes + else: + line_type = self.LINE_TYPE_START_NEST_WITH_ATTR + out[self.KEY_TAG] = self.tag + out[self.KEY_ATTRIBUTES] = self.attributes + elif text_exists: + line_type = self.LINE_TYPE_ENTRY + out[self.KEY_TAG] = self.tag + out[self.KEY_TEXT] = self.main_text + else: + line_type = self.LINE_TYPE_START_NEST + out[self.KEY_TAG] = self.tag + elif end_tag_exists: + line_type = self.LINE_TYPE_END_NEST + out[self.KEY_TAG] = self.end_tag + + out[self.KEY_TYPE] = line_type + + return out + + def get_letters(self, letter_index): + self.letter = self.line[letter_index] + self.next_letter = "" + self.prev_letter = "" + if letter_index + 1 < len(self.line): + self.next_letter = self.line[letter_index + 1] + if letter_index - 1 >= 0: + self.prev_letter = self.line[letter_index - 1] + + if self.letter == '<': + self.start_brackets += 1 + if self.letter == '>': + self.start_brackets -= 1 + + + def identify_tag_type(self, letter_index): + changed = True + + if self.letter == '<' and letter_index == 0: + if self.next_letter != '/': + self.type_to_read = LineElementRead.TAG + elif self.letter == '/' and self.prev_letter == '<': + self.type_to_read = LineElementRead.END_TAG + else: + changed = False + + return changed + + + def read_tag(self): + changed = True + + if self.letter == ' ' and self.type_to_read == LineElementRead.TAG: + self.type_to_read = LineElementRead.ATTRIBUTE_TAG + elif self.letter == '>' and self.type_to_read == LineElementRead.TAG and self.start_brackets == 0: + self.type_to_read = LineElementRead.MAIN + + if self.prev_letter == '/': + print("Warning - strange tag, ignoring", self.line) + self.only_tag = True + elif self.type_to_read == LineElementRead.TAG: + self.tag += self.letter + else: + changed = False + + return changed + + + def store_attribute(self): + if self.attribute_tag not in 
self.IGNORED_ATTRIBUTES: + self.attributes[self.attribute_tag] = self.attribute_text.strip('/').strip('"') + self.attribute_tag = "" + self.attribute_text = "" + + + def read_attributes(self): + changed = True + start_reading_attributes = (self.type_to_read == LineElementRead.ATTRIBUTE_TAG or self.type_to_read == LineElementRead.ATTRIBUTE_TEXT) + + if self.letter == '>' and start_reading_attributes and self.start_brackets == 0: + self.type_to_read = LineElementRead.MAIN + + self.store_attribute() + + if self.prev_letter == '/': + self.end_tag = self.tag + elif start_reading_attributes: + if self.letter == '=' and self.type_to_read == LineElementRead.ATTRIBUTE_TAG: + self.type_to_read = LineElementRead.ATTRIBUTE_TEXT + elif self.type_to_read == LineElementRead.ATTRIBUTE_TAG: + self.attribute_tag += self.letter + elif self.letter == ' ' and self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: + self.type_to_read = LineElementRead.ATTRIBUTE_TAG + self.store_attribute() + elif self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: + self.attribute_text += self.letter + else: + changed = False - return changed + return changed - def read_main(self): - changed = True - if self.letter == '<' and self.type_to_read == LineElementRead.MAIN: - self.type_to_read = LineElementRead.END_TAG - elif self.type_to_read == LineElementRead.MAIN: - self.main_text += self.letter - else: - changed = False + def read_main(self): + changed = True + if self.letter == '<' and self.type_to_read == LineElementRead.MAIN: + self.type_to_read = LineElementRead.END_TAG + elif self.type_to_read == LineElementRead.MAIN: + self.main_text += self.letter + else: + changed = False - return changed + return changed - def read_end_tag(self): - changed = True - if self.letter == '>' and self.type_to_read == LineElementRead.END_TAG and self.start_brackets == 0: - pass - elif self.type_to_read == LineElementRead.END_TAG: - self.end_tag += self.letter - else: - changed = False + def read_end_tag(self): + changed = True + if self.letter == '>' and self.type_to_read == LineElementRead.END_TAG and self.start_brackets == 0: + pass + elif self.type_to_read == LineElementRead.END_TAG: + self.end_tag += self.letter + else: + changed = False - return changed + return changed - def convert_line(self): - self.tag = "" - self.attributes = dict() - self.attribute_tag = "" - self.attribute_text = "" - self.main_text = "" - self.end_tag = "" + def convert_line(self): + self.tag = "" + self.attributes = dict() + self.attribute_tag = "" + self.attribute_text = "" + self.main_text = "" + self.end_tag = "" - self.type_to_read = LineElementRead.NONE + self.type_to_read = LineElementRead.NONE - self.only_tag = False + self.only_tag = False - self.start_brackets = 0 + self.start_brackets = 0 - for letter_index in range(len(self.line)): - self.get_letters(letter_index) + for letter_index in range(len(self.line)): + self.get_letters(letter_index) - # First < - if self.identify_tag_type(letter_index): - continue + # First < + if self.identify_tag_type(letter_index): + continue - if self.read_tag(): - continue + if self.read_tag(): + continue - if self.read_attributes(): - continue + if self.read_attributes(): + continue - if self.read_main(): - continue + if self.read_main(): + continue - if self.read_end_tag(): - continue + if self.read_end_tag(): + continue - return self.categorize_line() + return self.categorize_line() - def convert_nest(self, nest, start_index): - nest_dict = dict() - curr_index = start_index + def convert_nest(self, nest, start_index): + 
nest_dict = dict() + curr_index = start_index - while curr_index < len(nest): - element = nest[curr_index] - line_type = element[self.KEY_TYPE] - line_tag = element[self.KEY_TAG] - line_text = element.get(self.KEY_TEXT, None) - line_attributes = element.get(self.KEY_ATTRIBUTES, None) + while curr_index < len(nest): + element = nest[curr_index] + line_type = element[self.KEY_TYPE] + line_tag = element[self.KEY_TAG] + line_text = element.get(self.KEY_TEXT, None) + line_attributes = element.get(self.KEY_ATTRIBUTES, None) - if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: - if line_tag not in nest_dict: - nest_dict[line_tag] = list() + if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: + if line_tag not in nest_dict: + nest_dict[line_tag] = list() - converted_nest, ret_index = self.convert_nest(nest, curr_index + 1) + converted_nest, ret_index = self.convert_nest(nest, curr_index + 1) - if line_attributes is not None: - for attribute in line_attributes: - converted_nest[attribute] = line_attributes[attribute] + if line_attributes is not None: + for attribute in line_attributes: + converted_nest[attribute] = line_attributes[attribute] - nest_dict[line_tag].append(converted_nest) + nest_dict[line_tag].append(converted_nest) - curr_index = ret_index + 1 - continue + curr_index = ret_index + 1 + continue - if line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR]: - if line_tag not in nest_dict: - nest_dict[line_tag] = list() + if line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR]: + if line_tag not in nest_dict: + nest_dict[line_tag] = list() - curr_dict = dict() + curr_dict = dict() - if line_text is not None: - curr_dict[self.KEY_TEXT] = line_text + if line_text is not None: + curr_dict[self.KEY_TEXT] = line_text - if line_attributes is not None: - for attribute in line_attributes: - curr_dict[attribute] = line_attributes[attribute] + if line_attributes is not None: + for attribute in line_attributes: + curr_dict[attribute] = line_attributes[attribute] - nest_dict[line_tag].append(curr_dict) + nest_dict[line_tag].append(curr_dict) - curr_index += 1 - continue + curr_index += 1 + continue - if line_type in [self.LINE_TYPE_END_NEST]: - return nest_dict, curr_index + if line_type in [self.LINE_TYPE_END_NEST]: + return nest_dict, curr_index - return nest_dict, curr_index + return nest_dict, curr_index - def divide_into_lines(self, input_file_name): - curr_str = "" - curr_nest = list() - curr_nest_tags = list() # Treating it as a stack - start_brackets = 0 + def divide_into_lines(self, input_file_name): + curr_str = "" + curr_nest = list() + curr_nest_tags = list() # Treating it as a stack + start_brackets = 0 - with open(input_file_name) as input_file: - for line in input_file: - line_str = line.strip() + with open(input_file_name) as input_file: + for line in input_file: + line_str = line.strip() - for letter_index in range(len(line_str)): - letter = line_str[letter_index] - if letter == '<': - start_brackets += 1 - if letter == '>': - start_brackets -= 1 + for letter_index in range(len(line_str)): + letter = line_str[letter_index] + if letter == '<': + start_brackets += 1 + if letter == '>': + start_brackets -= 1 - next_letter = "" - if letter_index + 1 < len(line_str): - next_letter = line_str[letter_index + 1] + next_letter = "" + if letter_index + 1 < len(line_str): + next_letter = line_str[letter_index + 1] - curr_str += letter + 
curr_str += letter - if letter == '>' and (next_letter == '<' or next_letter == "") and start_brackets == 0: - # Only return if nesting - self.line = curr_str - line_parsed = self.convert_line() + if letter == '>' and (next_letter == '<' or next_letter == "") and start_brackets == 0: + # Only return if nesting + self.line = curr_str + line_parsed = self.convert_line() - tag = line_parsed.get(self.KEY_TAG, None) - assert tag != self.KEY_TEXT # This could cause a massive conflict, but it is unlikely - line_type = line_parsed.get(self.KEY_TYPE, None) - attribute_keys = line_parsed.get(self.KEY_ATTRIBUTES, dict()).keys() + tag = line_parsed.get(self.KEY_TAG, None) + assert tag != self.KEY_TEXT # This could cause a massive conflict, but it is unlikely + line_type = line_parsed.get(self.KEY_TYPE, None) + attribute_keys = line_parsed.get(self.KEY_ATTRIBUTES, dict()).keys() - if line_type != self.LINE_TYPE_IGNORE: - curr_nest.append(line_parsed) + if line_type != self.LINE_TYPE_IGNORE: + curr_nest.append(line_parsed) - output_nest = (line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR] and len(curr_nest_tags) == 0) + output_nest = (line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR] and len(curr_nest_tags) == 0) - if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: - curr_nest_tags.append(tag) - elif line_type == self.LINE_TYPE_END_NEST: - popped_curr_nest_tag = curr_nest_tags.pop() - assert popped_curr_nest_tag == tag, curr_nest - if len(curr_nest_tags) == 0: - output_nest = True - if output_nest: - nest_dict, _ = self.convert_nest(curr_nest, 0) + if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: + curr_nest_tags.append(tag) + elif line_type == self.LINE_TYPE_END_NEST: + popped_curr_nest_tag = curr_nest_tags.pop() + assert popped_curr_nest_tag == tag, curr_nest + if len(curr_nest_tags) == 0: + output_nest = True + if output_nest: + nest_dict, _ = self.convert_nest(curr_nest, 0) - self.processing_func(nest_dict) + self.processing_func(nest_dict) - curr_nest = list() - curr_nest_tag = str() + curr_nest = list() + curr_nest_tag = str() - curr_str = "" + curr_str = "" - if curr_str != "": - # divide lines by a space - curr_str += ' ' + if curr_str != "": + # divide lines by a space + curr_str += ' ' class OWLParser(): - def __init__(self, input_files, input_file_names, owl_file_path, output_file_name): - self.XML_TAG = "?xml" - self.RDF_TAG = "rdf:RDF" - self.DOCTYPE_TAG = "!DOCTYPE" - self.CLASS_TAG = "owl:Class" - self.RESTRICTION_TAG = "owl:Restriction" - self.SUBCLASS_TAG = "rdfs:subClassOf" - self.NODEID_TAG = "rdf:nodeID" - self.RDF_ABOUT_TAG = "rdf:about" - self.GENID_PREFIX = "genid" - - self.OWL_SOURCE_KEY = "owl_source" - self.OWL_SOURCE_NAME_KEY = "owl_source_name" - - self.skip_tags = [self.XML_TAG, self.RDF_TAG, self.DOCTYPE_TAG] - - self.ignored_attributes = ["xml:lang"] - - self.xml_parser = XMLParser(self.skip_tags, self.ignored_attributes, self.triage_nest_dict) - - self.GENID_REMAINING_NESTS = dict() - self.GENID_TO_ID = dict() - self.ID_TO_GENIDS = dict() - - self.input_files = input_files - self.input_file_names = input_file_names - self.owl_file_path = owl_file_path - self.output_file_name = output_file_name - - self.output_info = kg2_util.create_single_jsonlines() - self.output = self.output_info[0] - - def check_for_class_genids(self, nest_dict): - genids = list() - - nest_dict_classes = nest_dict.get(self.CLASS_TAG, 
list()) - for nest_class_index in range(len(nest_dict_classes)): - nest_class = nest_dict_classes[nest_class_index] - nest_subclasses = nest_class.get(self.SUBCLASS_TAG, list()) - for nest_subclass_index in range(len(nest_subclasses)): - nest_subclass = nest_subclasses[nest_subclass_index] - potential_genid = nest_subclass.get(self.NODEID_TAG, str()) - if potential_genid.startswith(self.GENID_PREFIX): - genids.append(potential_genid) - - return genids - - - def check_for_restriction_genids(self, nest_dict): - for nest_restriction in nest_dict.get(self.RESTRICTION_TAG, dict()): - potential_genid = nest_restriction.get(self.NODEID_TAG, str()) - if potential_genid.startswith(self.GENID_PREFIX): - return potential_genid - return None - - def extract_class_id(self, nest_dict): - nest_dict_classes = nest_dict.get(self.CLASS_TAG, list()) - # Can't have competing class_ids - assert len(nest_dict_classes) <= 1 - - for nest_class_index in range(len(nest_dict_classes)): - nest_class = nest_dict_classes[nest_class_index] - return nest_class.get(self.RDF_ABOUT_TAG, str()) - - def store_genid_nest_in_class_nest(self, genid, genid_nest, class_nest): - output_class_nest = class_nest - - nest_dict_classes = class_nest.get(self.CLASS_TAG, list()) - for nest_class_index in range(len(nest_dict_classes)): - nest_class = nest_dict_classes[nest_class_index] - nest_subclasses = nest_class.get(self.SUBCLASS_TAG, list()) - for nest_subclass_index in range(len(nest_subclasses)): - nest_subclass = nest_subclasses[nest_subclass_index] - potential_genid = nest_subclass.get(self.NODEID_TAG, str()) - if potential_genid == genid: - output_class_nest[self.CLASS_TAG][nest_class_index][self.SUBCLASS_TAG][nest_subclass_index][self.RESTRICTION_TAG] = genid_nest[self.RESTRICTION_TAG] - - return output_class_nest - - - def write_to_output(self, output_dict, source_file): - output_dict[self.OWL_SOURCE_KEY] = source_file - output_dict[self.OWL_SOURCE_NAME_KEY] = self.input_file_names[source_file] - self.output.write(output_dict) - - return - - - def triage_nest_dict(self, nest_dict): - genids = self.check_for_class_genids(nest_dict) - restriction_genid = self.check_for_restriction_genids(nest_dict) - class_id = self.extract_class_id(nest_dict) - - if len(genids) > 0: - for genid in genids: - self.GENID_TO_ID[genid] = class_id - self.ID_TO_GENIDS[class_id] = genids - self.GENID_REMAINING_NESTS[class_id] = nest_dict - elif restriction_genid is not None: - class_id = self.GENID_TO_ID.get(restriction_genid, str()) - if len(class_id) == 0: - print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND") - - # Save to output despite not matching with an existing class - self.write_to_output(nest_dict, self.input_file) - return - class_nest = self.GENID_REMAINING_NESTS[class_id] - self.ID_TO_GENIDS[class_id].remove(restriction_genid) - updated_class_nest = self.store_genid_nest_in_class_nest(restriction_genid, nest_dict, class_nest) - - if len(self.ID_TO_GENIDS[class_id]) > 0: - self.GENID_REMAINING_NESTS[class_id] = updated_class_nest - else: - # Since all of the genids used in this class have been matched, output - self.write_to_output(nest_dict, self.input_file) - self.GENID_REMAINING_NESTS[class_id] = None - else: - # There are no genids that need to be worked with, so just output - self.write_to_output(nest_dict, self.input_file) - - - def parse_OWL_file(self): - for input_file in self.input_files: - self.input_file = input_file - print("Reading:", input_file, "starting at", date()) - 
+    def triage_nest_dict(self, nest_dict):
+        genids = self.check_for_class_genids(nest_dict)
+        restriction_genid = self.check_for_restriction_genids(nest_dict)
+        class_id = self.extract_class_id(nest_dict)
+
+        if len(genids) > 0:
+            for genid in genids:
+                self.GENID_TO_ID[genid] = class_id
+            self.ID_TO_GENIDS[class_id] = genids
+            self.GENID_REMAINING_NESTS[class_id] = nest_dict
+        elif restriction_genid is not None:
+            class_id = self.GENID_TO_ID.get(restriction_genid, str())
+            if len(class_id) == 0:
+                print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND")
+
+                # Save to output despite not matching with an existing class
+                self.write_to_output(nest_dict, self.input_file)
+                return
+            class_nest = self.GENID_REMAINING_NESTS[class_id]
+            self.ID_TO_GENIDS[class_id].remove(restriction_genid)
+            updated_class_nest = self.store_genid_nest_in_class_nest(restriction_genid, nest_dict, class_nest)
+
+            if len(self.ID_TO_GENIDS[class_id]) > 0:
+                self.GENID_REMAINING_NESTS[class_id] = updated_class_nest
+            else:
+                # All of the genids used in this class have been matched, so output the completed class nest
+                self.write_to_output(updated_class_nest, self.input_file)
+                self.GENID_REMAINING_NESTS[class_id] = None
+        else:
+            # There are no genids to resolve, so just output
+            self.write_to_output(nest_dict, self.input_file)
+
+    def parse_OWL_file(self):
+        for input_file in self.input_files:
+            self.input_file = input_file
+            print("Reading:", input_file, "starting at", date())
+            self.xml_parser.divide_into_lines(self.owl_file_path + input_file)
+
+            # Some classes may still have unmatched genids; include them in the output anyway
+            for item in self.GENID_REMAINING_NESTS:
+                if self.GENID_REMAINING_NESTS[item] is not None:
+                    self.write_to_output(self.GENID_REMAINING_NESTS[item], self.input_file)
+
+            # Refresh everything for the next file
+            self.GENID_REMAINING_NESTS = dict()
+            self.GENID_TO_ID = dict()
+            self.ID_TO_GENIDS = dict()
+
+        kg2_util.close_single_jsonlines(self.output_info, self.output_file_name)
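
+# Minimal usage sketch (hypothetical file names, for illustration only):
+#
+#   parser = OWLParser(["mondo.owl"], {"mondo.owl": "MONDO"},
+#                      "/path/to/owl/files/", "ontologies.jsonl")
+#   parser.parse_OWL_file()
+#
+# Each completed nest is written as one JSON line, tagged with the owl_source
+# and owl_source_name keys by write_to_output() above.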


 def identify_and_download_input_files(ont_load_inventory, path_to_owl_files):
-	input_files = list()
-	input_file_names = dict()
-	owl_file_path = path_to_owl_files.rstrip('/') + "/"
-	for item in ont_load_inventory:
-		input_files.append(item['file'])
-		input_file_names[item['file']] = item['title']
-		print("Downloading:", item['file'], "starting at", date())
-		kg2_util.download_file_if_not_exist_locally(item['url'], owl_file_path + item['file'])
-		print("Download of:", item['file'], "finished at", date())
-
-	return input_files, input_file_names, owl_file_path
+    input_files = list()
+    input_file_names = dict()
+    owl_file_path = path_to_owl_files.rstrip('/') + "/"
+    for item in ont_load_inventory:
+        input_files.append(item['file'])
+        input_file_names[item['file']] = item['title']
+        print("Downloading:", item['file'], "starting at", date())
+        kg2_util.download_file_if_not_exist_locally(item['url'], owl_file_path + item['file'])
+        print("Download of:", item['file'], "finished at", date())
+
+    return input_files, input_file_names, owl_file_path
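
+# The ontology load inventory YAML is expected to supply at least the 'file',
+# 'url', and 'title' keys used above; a made-up entry for illustration:
+#
+#   - file: mondo.owl
+#     url: http://purl.obolibrary.org/obo/mondo.owl
+#     title: MONDO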
print("Files:", input_files) - print("Start Time:", date()) - owl_parser = OWLParser(input_files, input_file_names, owl_file_path, output_file_name) - owl_parser.parse_OWL_file() - print("End Time:", date()) \ No newline at end of file + args = get_args() + input_file_name = args.inputFile + owl_path = args.owlFilePath + output_file_name = args.outputFile + + ont_load_inventory = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(input_file_name)) + input_files, input_file_names, owl_file_path = identify_and_download_input_files(ont_load_inventory, owl_path) + + print("Files:", input_files) + print("Start Time:", date()) + owl_parser = OWLParser(input_files, input_file_names, owl_file_path, output_file_name) + owl_parser.parse_OWL_file() + print("End Time:", date()) \ No newline at end of file