Don't store redundant columns in PropertyGraph Dataframes (#2449)

The main purpose of this change is to reduce memory usage. Closes #2400. I still need to update the MG tests. I'll also remove the in-code assertions, since they won't always be true: a column name could have previously been used as a property. Nevertheless, seeing these assertions pass should give us warm fuzzies :) Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Rick Ratzel (https://github.com/rlratzel) - Alex Barghi (https://github.com/alexbarghi-nv) - Vibhu Jawa (https://github.com/VibhuJawa) - Brad Rees (https://github.com/BradReesWork) URL: #2449
rapidsai · Aug 2, 2022 · ac42e0b · ac42e0b
1 parent b74e22a
commit ac42e0b
Show file tree

Hide file tree

Showing 3 changed files with 34 additions and 17 deletions.
diff --git a/python/cugraph/cugraph/dask/structure/mg_property_graph.py b/python/cugraph/cugraph/dask/structure/mg_property_graph.py
@@ -412,7 +412,9 @@ def add_vertex_data(self,
             # remove the ones to keep
             column_names_to_drop.difference_update(property_columns +
                                                    default_vertex_columns)
-            tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
+        else:
+            column_names_to_drop = {vertex_col_name}
+        tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
 
         # Save the original dtypes for each new column so they can be restored
         # prior to constructing subgraphs (since column dtypes may get altered
@@ -566,7 +568,9 @@ def add_edge_data(self,
             # remove the ones to keep
             column_names_to_drop.difference_update(property_columns +
                                                    default_edge_columns)
-            tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
+        else:
+            column_names_to_drop = {vertex_col_names[0], vertex_col_names[1]}
+        tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
 
         # Save the original dtypes for each new column so they can be restored
         # prior to constructing subgraphs (since column dtypes may get altered

diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py
@@ -424,7 +424,9 @@ def add_vertex_data(self,
             # remove the ones to keep
             column_names_to_drop.difference_update(property_columns +
                                                    default_vertex_columns)
-            tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
+        else:
+            column_names_to_drop = {vertex_col_name}
+        tmp_df.drop(labels=column_names_to_drop, axis=1, inplace=True)
 
         # Save the original dtypes for each new column so they can be restored
         # prior to constructing subgraphs (since column dtypes may get altered
@@ -591,7 +593,9 @@ def add_edge_data(self,
             # remove the ones to keep
             column_names_to_drop.difference_update(property_columns +
                                                    default_edge_columns)
-            tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
+        else:
+            column_names_to_drop = {vertex_col_names[0], vertex_col_names[1]}
+        tmp_df.drop(labels=column_names_to_drop, axis=1, inplace=True)
 
         # Save the original dtypes for each new column so they can be restored
         # prior to constructing subgraphs (since column dtypes may get altered

diff --git a/python/cugraph/cugraph/tests/test_property_graph.py b/python/cugraph/cugraph/tests/test_property_graph.py
@@ -333,11 +333,10 @@ def test_add_vertex_data(df_type):
                        type_name="merchants",
                        vertex_col_name="merchant_id",
                        property_columns=None)
-
     assert pG.get_num_vertices() == 5
     assert pG.get_num_vertices('merchants') == 5
     assert pG.get_num_edges() == 0
-    expected_props = merchants[0].copy()
+    expected_props = set(merchants[0].copy()) - {'merchant_id'}
     assert sorted(pG.vertex_property_names) == sorted(expected_props)
 
 
@@ -564,6 +563,7 @@ def test_get_vertex_data(dataset1_PropertyGraph):
     for d in ["merchants", "users"]:
         for name in data[d][0]:
             expected_columns.add(name)
+    expected_columns -= {'merchant_id', 'user_id'}
     actual_columns = set(some_vertex_data.columns)
     assert actual_columns == expected_columns
 
@@ -620,6 +620,7 @@ def test_get_edge_data(dataset1_PropertyGraph):
     for d in ["transactions", "relationships", "referrals"]:
         for name in data[d][0]:
             expected_columns.add(name)
+    expected_columns -= {'user_id', 'user_id_1', 'user_id_2'}
 
     actual_columns = set(some_edge_data.columns)
 
@@ -755,8 +756,8 @@ def test_add_edge_data(df_type):
     assert pG.get_num_vertices('transactions') == 0
     assert pG.get_num_edges() == 4
     assert pG.get_num_edges('transactions') == 4
-    expected_props = ["merchant_id", "user_id",
-                      "volume", "time", "card_num", "card_type"]
+    # Original SRC and DST columns no longer include "merchant_id", "user_id"
+    expected_props = ["volume", "time", "card_num", "card_type"]
     assert sorted(pG.edge_property_names) == sorted(expected_props)
 
 
@@ -928,8 +929,9 @@ def test_extract_subgraph_specific_query(dataset1_PropertyGraph):
     (pG, data) = dataset1_PropertyGraph
     tcn = PropertyGraph.type_col_name
 
+    # _DST_ below used to be referred to as merchant_id
     selection = pG.select_edges(f"({tcn}=='transactions') & "
-                                "(merchant_id==4) & "
+                                "(_DST_==4) & "
                                 "(time>1639085000)")
     G = pG.extract_subgraph(selection=selection,
                             create_using=DiGraph_inst,
@@ -1023,7 +1025,13 @@ def test_extract_subgraph_no_edges(dataset1_PropertyGraph):
     """
     (pG, data) = dataset1_PropertyGraph
 
-    selection = pG.select_vertices("(_TYPE_=='merchants') & (merchant_id==86)")
+    # "merchant_id" column is no longer saved; use "_VERTEX_" instead
+    with pytest.raises(NameError, match="merchant_id"):
+        selection = pG.select_vertices(
+            "(_TYPE_=='merchants') & (merchant_id==86)"
+        )
+
+    selection = pG.select_vertices("(_TYPE_=='merchants') & (_VERTEX_==86)")
     G = pG.extract_subgraph(selection=selection)
     assert G.is_directed()
 
@@ -1360,13 +1368,14 @@ def test_property_names_attrs(dataset1_PropertyGraph):
     """
     (pG, data) = dataset1_PropertyGraph
 
-    expected_vert_prop_names = ["merchant_id", "merchant_location",
-                                "merchant_size", "merchant_sales",
-                                "merchant_num_employees", "merchant_name",
-                                "user_id", "user_location", "vertical"]
-    expected_edge_prop_names = ["user_id", "merchant_id", "volume", "time",
-                                "card_num", "card_type", "user_id_1",
-                                "user_id_2", "relationship_type", "stars"]
+    # _VERTEX_ columns: "merchant_id", "user_id"
+    expected_vert_prop_names = ["merchant_location", "merchant_size",
+                                "merchant_sales", "merchant_num_employees",
+                                "user_location", "merchant_name", "vertical"]
+    # _SRC_ and _DST_ columns: "user_id", "user_id_1", "user_id_2"
+    # Note that "merchant_id" is a property for type "transactions"
+    expected_edge_prop_names = ["merchant_id", "volume", "time", "card_num",
+                                "card_type", "relationship_type", "stars"]
 
     # Extracting a subgraph with weights has/had a side-effect of adding a
     # weight column, so call extract_subgraph() to ensure the internal weight