Don't store redundant columns in PropertyGraph Dataframes

Closes rapidsai#2400 I still need to update MG tests. I'll also remove the in-code assertions, since they won't always be True, because a column name could have previously been used as a property. Nevertheless, seeing these assertions pass should give us warm-fuzzies :)
eriknw · Jul 27, 2022 · 1c280e0 · 1c280e0
1 parent 2c50989
commit 1c280e0
Show file tree

Hide file tree

Showing 3 changed files with 58 additions and 16 deletions.
diff --git a/python/cugraph/cugraph/dask/structure/mg_property_graph.py b/python/cugraph/cugraph/dask/structure/mg_property_graph.py
@@ -314,7 +314,9 @@ def add_vertex_data(self,
             # remove the ones to keep
             column_names_to_drop.difference_update(property_columns +
                                                    default_vertex_columns)
-            tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
+        else:
+            column_names_to_drop = {vertex_col_name}
+        tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
 
         # Save the original dtypes for each new column so they can be restored
         # prior to constructing subgraphs (since column dtypes may get altered
@@ -330,6 +332,10 @@ def add_vertex_data(self,
         latest = dict([(n, self.__vertex_prop_dataframe[n])
                        for n in self.__vertex_prop_dataframe.columns])
         self.__vertex_prop_eval_dict.update(latest)
+        # TODO: delete asserts before merging
+        assert vertex_col_name not in self.__vertex_prop_dataframe.columns
+        assert vertex_col_name not in self.__vertex_prop_dtypes
+        assert vertex_col_name not in self.__vertex_prop_eval_dict
 
     def add_edge_data(self,
                       dataframe,
@@ -433,7 +439,9 @@ def add_edge_data(self,
             # remove the ones to keep
             column_names_to_drop.difference_update(property_columns +
                                                    default_edge_columns)
-            tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
+        else:
+            column_names_to_drop = {vertex_col_names[0], vertex_col_names[1]}
+        tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
 
         # Save the original dtypes for each new column so they can be restored
         # prior to constructing subgraphs (since column dtypes may get altered
@@ -447,6 +455,15 @@ def add_edge_data(self,
         latest = dict([(n, self.__edge_prop_dataframe[n])
                        for n in self.__edge_prop_dataframe.columns])
         self.__edge_prop_eval_dict.update(latest)
+        # TODO: delete asserts before merging
+        src = vertex_col_names[0]
+        dst = vertex_col_names[1]
+        assert src not in self.__edge_prop_dataframe.columns
+        assert src not in self.__edge_prop_eval_dict
+        assert src not in self.__edge_prop_dtypes
+        assert dst not in self.__edge_prop_dataframe.columns
+        assert dst not in self.__edge_prop_eval_dict
+        assert dst not in self.__edge_prop_dtypes
 
     def select_vertices(self, expr, from_previous_selection=None):
         raise NotImplementedError

diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py
@@ -329,7 +329,9 @@ def add_vertex_data(self,
             # remove the ones to keep
             column_names_to_drop.difference_update(property_columns +
                                                    default_vertex_columns)
-            tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
+        else:
+            column_names_to_drop = {vertex_col_name}
+        tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
 
         # Save the original dtypes for each new column so they can be restored
         # prior to constructing subgraphs (since column dtypes may get altered
@@ -345,6 +347,10 @@ def add_vertex_data(self,
         latest = dict([(n, self.__vertex_prop_dataframe[n])
                        for n in self.__vertex_prop_dataframe.columns])
         self.__vertex_prop_eval_dict.update(latest)
+        # TODO: delete asserts before merging
+        assert vertex_col_name not in self.__vertex_prop_dataframe.columns
+        assert vertex_col_name not in self.__vertex_prop_dtypes
+        assert vertex_col_name not in self.__vertex_prop_eval_dict
 
     def add_edge_data(self,
                       dataframe,
@@ -451,7 +457,9 @@ def add_edge_data(self,
             # remove the ones to keep
             column_names_to_drop.difference_update(property_columns +
                                                    default_edge_columns)
-            tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
+        else:
+            column_names_to_drop = {vertex_col_names[0], vertex_col_names[1]}
+        tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
 
         # Save the original dtypes for each new column so they can be restored
         # prior to constructing subgraphs (since column dtypes may get altered
@@ -469,6 +477,15 @@ def add_edge_data(self,
         latest = dict([(n, self.__edge_prop_dataframe[n])
                        for n in self.__edge_prop_dataframe.columns])
         self.__edge_prop_eval_dict.update(latest)
+        # TODO: delete asserts before merging
+        src = vertex_col_names[0]
+        dst = vertex_col_names[1]
+        assert src not in self.__edge_prop_dataframe.columns
+        assert src not in self.__edge_prop_eval_dict
+        assert src not in self.__edge_prop_dtypes
+        assert dst not in self.__edge_prop_dataframe.columns
+        assert dst not in self.__edge_prop_eval_dict
+        assert dst not in self.__edge_prop_dtypes
 
     def select_vertices(self, expr, from_previous_selection=None):
         """

diff --git a/python/cugraph/cugraph/tests/test_property_graph.py b/python/cugraph/cugraph/tests/test_property_graph.py
@@ -290,7 +290,7 @@ def test_add_vertex_data(df_type):
 
     assert pG.num_vertices == 5
     assert pG.num_edges == 0
-    expected_props = merchants[0].copy()
+    expected_props = set(merchants[0].copy()) - {'merchant_id'}
     assert sorted(pG.vertex_property_names) == sorted(expected_props)
 
 
@@ -473,8 +473,8 @@ def test_add_edge_data(df_type):
 
     assert pG.num_vertices == 7
     assert pG.num_edges == 4
-    expected_props = ["merchant_id", "user_id",
-                      "volume", "time", "card_num", "card_type"]
+    # Original SRC and DST columns no longer include "merchant_id", "user_id"
+    expected_props = ["volume", "time", "card_num", "card_type"]
     assert sorted(pG.edge_property_names) == sorted(expected_props)
 
 
@@ -643,8 +643,9 @@ def test_extract_subgraph_specific_query(dataset1_PropertyGraph):
     pG = dataset1_PropertyGraph
     tcn = PropertyGraph.type_col_name
 
+    # _DST_ below used to be referred to as merchant_id
     selection = pG.select_edges(f"({tcn}=='transactions') & "
-                                "(merchant_id==4) & "
+                                "(_DST_==4) & "
                                 "(time>1639085000)")
     G = pG.extract_subgraph(selection=selection,
                             create_using=DiGraph_inst,
@@ -738,7 +739,13 @@ def test_extract_subgraph_no_edges(dataset1_PropertyGraph):
     """
     pG = dataset1_PropertyGraph
 
-    selection = pG.select_vertices("(_TYPE_=='merchants') & (merchant_id==86)")
+    # "merchant_id" column is no longer saved; use as "_VERTEX_"
+    with pytest.raises(NameError, match="merchant_id"):
+        selection = pG.select_vertices(
+            "(_TYPE_=='merchants') & (merchant_id==86)"
+        )
+
+    selection = pG.select_vertices("(_TYPE_=='merchants') & (_VERTEX_==86)")
     G = pG.extract_subgraph(selection=selection)
 
     assert len(G.edgelist.edgelist_df) == 0
@@ -1066,13 +1073,14 @@ def test_property_names_attrs(dataset1_PropertyGraph):
     """
     pG = dataset1_PropertyGraph
 
-    expected_vert_prop_names = ["merchant_id", "merchant_location",
-                                "merchant_size", "merchant_sales",
-                                "merchant_num_employees", "merchant_name",
-                                "user_id", "user_location", "vertical"]
-    expected_edge_prop_names = ["user_id", "merchant_id", "volume", "time",
-                                "card_num", "card_type", "user_id_1",
-                                "user_id_2", "relationship_type", "stars"]
+    # _VERTEX_ columns: "merchant_id", "user_id"
+    expected_vert_prop_names = ["merchant_location", "merchant_size",
+                                "merchant_sales", "merchant_num_employees",
+                                "user_location", "merchant_name", "vertical"]
+    # _SRC_ and _DST_ columns: "user_id", "user_id_1", "user_id_2"
+    # Note that "merchant_id" is a property in for type "transactions"
+    expected_edge_prop_names = ["merchant_id", "volume", "time", "card_num",
+                                "card_type", "relationship_type", "stars"]
 
     # Extracting a subgraph with weights has/had a side-effect of adding a
     # weight column, so call extract_subgraph() to ensure the internal weight