Skip to content

Commit

Permalink
Don't store redundant columns in PropertyGraph Dataframes
Browse files Browse the repository at this point in the history
Closes rapidsai#2400

I still need to update MG tests.

I'll also remove the in-code assertions, since they won't always be True,
because a column name could have previously been used as a property.
Nevertheless, seeing these assertions pass should give us warm-fuzzies :)
  • Loading branch information
eriknw committed Jul 27, 2022
1 parent 2c50989 commit 1c280e0
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 16 deletions.
21 changes: 19 additions & 2 deletions python/cugraph/cugraph/dask/structure/mg_property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,9 @@ def add_vertex_data(self,
# remove the ones to keep
column_names_to_drop.difference_update(property_columns +
default_vertex_columns)
tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
else:
column_names_to_drop = {vertex_col_name}
tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)

# Save the original dtypes for each new column so they can be restored
# prior to constructing subgraphs (since column dtypes may get altered
Expand All @@ -330,6 +332,10 @@ def add_vertex_data(self,
latest = dict([(n, self.__vertex_prop_dataframe[n])
for n in self.__vertex_prop_dataframe.columns])
self.__vertex_prop_eval_dict.update(latest)
# TODO: delete asserts before merging
assert vertex_col_name not in self.__vertex_prop_dataframe.columns
assert vertex_col_name not in self.__vertex_prop_dtypes
assert vertex_col_name not in self.__vertex_prop_eval_dict

def add_edge_data(self,
dataframe,
Expand Down Expand Up @@ -433,7 +439,9 @@ def add_edge_data(self,
# remove the ones to keep
column_names_to_drop.difference_update(property_columns +
default_edge_columns)
tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
else:
column_names_to_drop = {vertex_col_names[0], vertex_col_names[1]}
tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)

# Save the original dtypes for each new column so they can be restored
# prior to constructing subgraphs (since column dtypes may get altered
Expand All @@ -447,6 +455,15 @@ def add_edge_data(self,
latest = dict([(n, self.__edge_prop_dataframe[n])
for n in self.__edge_prop_dataframe.columns])
self.__edge_prop_eval_dict.update(latest)
# TODO: delete asserts before merging
src = vertex_col_names[0]
dst = vertex_col_names[1]
assert src not in self.__edge_prop_dataframe.columns
assert src not in self.__edge_prop_eval_dict
assert src not in self.__edge_prop_dtypes
assert dst not in self.__edge_prop_dataframe.columns
assert dst not in self.__edge_prop_eval_dict
assert dst not in self.__edge_prop_dtypes

def select_vertices(self, expr, from_previous_selection=None):
raise NotImplementedError
Expand Down
21 changes: 19 additions & 2 deletions python/cugraph/cugraph/structure/property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,9 @@ def add_vertex_data(self,
# remove the ones to keep
column_names_to_drop.difference_update(property_columns +
default_vertex_columns)
tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
else:
column_names_to_drop = {vertex_col_name}
tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)

# Save the original dtypes for each new column so they can be restored
# prior to constructing subgraphs (since column dtypes may get altered
Expand All @@ -345,6 +347,10 @@ def add_vertex_data(self,
latest = dict([(n, self.__vertex_prop_dataframe[n])
for n in self.__vertex_prop_dataframe.columns])
self.__vertex_prop_eval_dict.update(latest)
# TODO: delete asserts before merging
assert vertex_col_name not in self.__vertex_prop_dataframe.columns
assert vertex_col_name not in self.__vertex_prop_dtypes
assert vertex_col_name not in self.__vertex_prop_eval_dict

def add_edge_data(self,
dataframe,
Expand Down Expand Up @@ -451,7 +457,9 @@ def add_edge_data(self,
# remove the ones to keep
column_names_to_drop.difference_update(property_columns +
default_edge_columns)
tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
else:
column_names_to_drop = {vertex_col_names[0], vertex_col_names[1]}
tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)

# Save the original dtypes for each new column so they can be restored
# prior to constructing subgraphs (since column dtypes may get altered
Expand All @@ -469,6 +477,15 @@ def add_edge_data(self,
latest = dict([(n, self.__edge_prop_dataframe[n])
for n in self.__edge_prop_dataframe.columns])
self.__edge_prop_eval_dict.update(latest)
# TODO: delete asserts before merging
src = vertex_col_names[0]
dst = vertex_col_names[1]
assert src not in self.__edge_prop_dataframe.columns
assert src not in self.__edge_prop_eval_dict
assert src not in self.__edge_prop_dtypes
assert dst not in self.__edge_prop_dataframe.columns
assert dst not in self.__edge_prop_eval_dict
assert dst not in self.__edge_prop_dtypes

def select_vertices(self, expr, from_previous_selection=None):
"""
Expand Down
32 changes: 20 additions & 12 deletions python/cugraph/cugraph/tests/test_property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def test_add_vertex_data(df_type):

assert pG.num_vertices == 5
assert pG.num_edges == 0
expected_props = merchants[0].copy()
expected_props = set(merchants[0].copy()) - {'merchant_id'}
assert sorted(pG.vertex_property_names) == sorted(expected_props)


Expand Down Expand Up @@ -473,8 +473,8 @@ def test_add_edge_data(df_type):

assert pG.num_vertices == 7
assert pG.num_edges == 4
expected_props = ["merchant_id", "user_id",
"volume", "time", "card_num", "card_type"]
# Original SRC and DST columns no longer include "merchant_id", "user_id"
expected_props = ["volume", "time", "card_num", "card_type"]
assert sorted(pG.edge_property_names) == sorted(expected_props)


Expand Down Expand Up @@ -643,8 +643,9 @@ def test_extract_subgraph_specific_query(dataset1_PropertyGraph):
pG = dataset1_PropertyGraph
tcn = PropertyGraph.type_col_name

# _DST_ below used to be referred to as merchant_id
selection = pG.select_edges(f"({tcn}=='transactions') & "
"(merchant_id==4) & "
"(_DST_==4) & "
"(time>1639085000)")
G = pG.extract_subgraph(selection=selection,
create_using=DiGraph_inst,
Expand Down Expand Up @@ -738,7 +739,13 @@ def test_extract_subgraph_no_edges(dataset1_PropertyGraph):
"""
pG = dataset1_PropertyGraph

selection = pG.select_vertices("(_TYPE_=='merchants') & (merchant_id==86)")
# "merchant_id" column is no longer saved; use as "_VERTEX_"
with pytest.raises(NameError, match="merchant_id"):
selection = pG.select_vertices(
"(_TYPE_=='merchants') & (merchant_id==86)"
)

selection = pG.select_vertices("(_TYPE_=='merchants') & (_VERTEX_==86)")
G = pG.extract_subgraph(selection=selection)

assert len(G.edgelist.edgelist_df) == 0
Expand Down Expand Up @@ -1066,13 +1073,14 @@ def test_property_names_attrs(dataset1_PropertyGraph):
"""
pG = dataset1_PropertyGraph

expected_vert_prop_names = ["merchant_id", "merchant_location",
"merchant_size", "merchant_sales",
"merchant_num_employees", "merchant_name",
"user_id", "user_location", "vertical"]
expected_edge_prop_names = ["user_id", "merchant_id", "volume", "time",
"card_num", "card_type", "user_id_1",
"user_id_2", "relationship_type", "stars"]
# _VERTEX_ columns: "merchant_id", "user_id"
expected_vert_prop_names = ["merchant_location", "merchant_size",
"merchant_sales", "merchant_num_employees",
"user_location", "merchant_name", "vertical"]
# _SRC_ and _DST_ columns: "user_id", "user_id_1", "user_id_2"
# Note that "merchant_id" is a property in for type "transactions"
expected_edge_prop_names = ["merchant_id", "volume", "time", "card_num",
"card_type", "relationship_type", "stars"]

# Extracting a subgraph with weights has/had a side-effect of adding a
# weight column, so call extract_subgraph() to ensure the internal weight
Expand Down

0 comments on commit 1c280e0

Please sign in to comment.