From 900f0d00bb7030e1833a194688505de3d1f08ad6 Mon Sep 17 00:00:00 2001
From: Evan Morris <evandietzmorris@gmail.com>
Date: Tue, 1 Oct 2024 03:26:52 -0400
Subject: [PATCH] fixing issue with attributes and adding tests

Attributes on edges were being included inside the attributes section and in their original form on the top level, this fixes that issue and checks for that situation for edges and nodes. Also cleans up the edge conversion part of cypher.py and adds better comments.
---
 reasoner_transpiler/cypher.py       | 55 +++++++++++++++++++----------
 tests/test_props.py                 | 17 ++++-----
 tests/test_qualifier_constraints.py |  2 ++
 tests/test_sources.py               |  1 +
 4 files changed, 46 insertions(+), 29 deletions(-)

diff --git a/reasoner_transpiler/cypher.py b/reasoner_transpiler/cypher.py
index 72242c0..787427c 100644
--- a/reasoner_transpiler/cypher.py
+++ b/reasoner_transpiler/cypher.py
@@ -3,7 +3,6 @@
 import json
 
 from collections import defaultdict
-from neo4j import AsyncResult, Result
 
 from .attributes import transform_attributes, EDGE_SOURCE_PROPS
 from .matching import match_query
@@ -310,6 +309,8 @@ def transform_nodes_list(nodes):
 
 
 def transform_edges_list(edges):
+    # See convert_bolt_edge_to_dict() for details on the contents of edges,
+    # it is a list of lists (which can also be lists), representing unique edges from the graph
     kg_edges = {}
     element_id_to_edge_id = {}
     for edge_index, cypher_edge_result in enumerate(edges):
@@ -324,19 +325,19 @@ def transform_edges_list(edges):
             cypher_edges = list()  # this looks weird, but it's necessary to make a list of one list (I think?)
             cypher_edges.append(cypher_edge_result)
 
-        # transform the edge into TRAPI
+        # transform the edge(s) into TRAPI
         for cypher_edge in cypher_edges:
-            edge_element_id, edge_dict = convert_bolt_edge_to_dict(cypher_edge)
-            edge_id = edge_dict.get('id', f'edge_{edge_index}')
+            # transform the edge into TRAPI and return:
+            # edge_element_id - neo4j element id,
+            # edge_id - the edge id that will be used for edges in the TRAPI knowledge graph and edge bindings
+            # trapi_edge - a dictionary that represents an edge in the knowledge_graph part of the TRAPI response
+            edge_element_id, edge_id, trapi_edge = convert_bolt_edge_to_trapi(cypher_edge)
+            if not edge_id:
+                edge_id = f'e_{edge_index}'
+            # make a mapping that will be used to look up the edge id by element id later
             element_id_to_edge_id[edge_element_id] = edge_id
-            # get properties matching EDGE_SOURCE_PROPS keys, remove biolink: if needed,
-            # then pass (key, value) tuples to construct_sources_tree for formatting, constructing the 'sources' section
-            edge_dict['sources'] = construct_sources_tree([
-                    (edge_source_prop.removeprefix('biolink:'), edge_dict.get(edge_source_prop))
-                    for edge_source_prop in EDGE_SOURCE_PROPS if edge_dict.get(edge_source_prop, None)])
-            # convert all remaining attributes to TRAPI format
-            edge_dict.update(transform_attributes(edge_dict, node=False))
-            kg_edges[edge_id] = edge_dict
+            # add it to the knowledge graph
+            kg_edges[edge_id] = trapi_edge
     return kg_edges, element_id_to_edge_id
 
 
@@ -412,23 +413,39 @@ def convert_jolt_node_to_dict(jolt_node):
     return node
 
 
-def convert_bolt_edge_to_dict(bolt_edge):
+# Convert a list representing an edge from cypher results into a dictionary.
+# This is not any standard object from the bolt driver, it's a list generated by a specific cypher return clause, like:
+# [elementId(edge_1), startNode(edge_1).id, type(edge_1), endNode(edge_1).id, properties(edge_1)]
+# This is done to prevent including often redundant node and edge properties on nodes and edges in pathway results.
+# See the cypher generated in the edges_assemble clause in assemble_results() for more details.
+def convert_bolt_edge_to_trapi(bolt_edge):
     if not bolt_edge:
         print(f'Tried to convert a missing edge: {bolt_edge}')
         return None, None
 
-    # Convert a list representing an edge from cypher results into a dictionary.
-    # This is not a standard Edge object from the bolt driver, it's a list product of a specific cypher return format.
-    # This is done to prevent including redundant node information on every edge.
-    # See the cypher generated in the edges_assemble clause in assemble_results() for more details.
     element_id = bolt_edge[0]
     converted_edge = {
         'subject': bolt_edge[1],
         'predicate': bolt_edge[2],
         'object': bolt_edge[3],
-        **bolt_edge[4]
     }
-    return element_id, converted_edge
+    # edge_props - any other properties from the edge
+    edge_props = {**bolt_edge[4]}
+
+    # get the id if there is one on the edge
+    edge_id = edge_props.pop('id', None)
+
+    # get properties matching EDGE_SOURCE_PROPS keys, remove biolink: if needed,
+    # then pass (key, value) tuples to construct_sources_tree for formatting, constructing the sources section
+    converted_edge['sources'] = construct_sources_tree([
+        (edge_source_prop.removeprefix('biolink:'), edge_props.pop(edge_source_prop))
+        for edge_source_prop in EDGE_SOURCE_PROPS if edge_source_prop in edge_props])
+
+    # convert all remaining attributes to TRAPI format, constructing the attributes section
+    converted_edge.update(transform_attributes(edge_props, node=False))
+
+    # return element id, a dict with the core edge properties, and a dict with any other properties
+    return element_id, edge_id, converted_edge,
 
 
 def convert_jolt_edge_to_dict(jolt_edges, jolt_element_id_lookup):
diff --git a/tests/test_props.py b/tests/test_props.py
index 8efd567..64c2bde 100644
--- a/tests/test_props.py
+++ b/tests/test_props.py
@@ -23,15 +23,10 @@ def test_numeric(neo4j_driver):
     }
     output = neo4j_driver.run(get_query(qgraph), convert_to_trapi=True, qgraph=qgraph)
     assert len(output["results"]) == 1
-    results = sorted(
-        output["knowledge_graph"]["nodes"].values(),
-        key=lambda node: node["name"],
-    )
-    expected_nodes = [
-        "CASP3",
-    ]
-    for ind, result in enumerate(results):
-        assert result["name"] == expected_nodes[ind]
+
+    node_1 = list(output["knowledge_graph"]["nodes"].values())[0]
+    assert node_1["name"] == "CASP3"
+    assert "length" not in node_1
 
 
 def test_string(neo4j_driver):
@@ -176,7 +171,9 @@ def test_valid_biolink_attribute_without_mapping(neo4j_driver):
     output = neo4j_driver.run(get_query(qgraph), convert_to_trapi=True, qgraph=qgraph)
     edges = output["knowledge_graph"]["edges"]
     assert len(edges) == 1
-    attributes = list(edges.values())[0]["attributes"]
+    edge = list(edges.values())[0]
+    assert "p_value" not in edge
+    attributes = edge["attributes"]
     assert len(attributes) == 1
     assert attributes[0] == {
         "original_attribute_name": "p_value",
diff --git a/tests/test_qualifier_constraints.py b/tests/test_qualifier_constraints.py
index fe9f8ba..8fbe81c 100644
--- a/tests/test_qualifier_constraints.py
+++ b/tests/test_qualifier_constraints.py
@@ -40,6 +40,8 @@ def test_single_qualifier(neo4j_driver):
     qualified_predicate_output = {'qualifier_type_id': 'biolink:qualified_predicate', 'qualifier_value': 'biolink:causes'}
     qualifier_output = output["knowledge_graph"]["edges"]["qualified_edge_multiple_qualifier"]["qualifiers"]
     assert qualified_predicate_output in qualifier_output
+    assert "qualified_predicate" not in output["knowledge_graph"]["edges"]["qualified_edge_multiple_qualifier"]
+    assert "biolink:qualified_predicate" not in output["knowledge_graph"]["edges"]["qualified_edge_multiple_qualifier"]
 
 
 def test_multi_qualifier(neo4j_driver):
diff --git a/tests/test_sources.py b/tests/test_sources.py
index 935f4dc..ae8ea9b 100644
--- a/tests/test_sources.py
+++ b/tests/test_sources.py
@@ -31,6 +31,7 @@ def test_primary_source(neo4j_driver):
     edge_sources = {
         e: edge["sources"] for e, edge in output["knowledge_graph"]["edges"].items()
     }
+    assert not any(['primary_knowledge_source' in edge for edge in output["knowledge_graph"]["edges"].values()])
     assert edge_sources["metformin_treats_t2d"] == [
         {'resource_id': 'infores:test', 'resource_role': 'primary_knowledge_source'},
         {'resource_id': 'ctd', 'resource_role': 'aggregator_knowledge_source',