Skip to content

Commit

Permalink
Don't store redundant columns in PropertyGraph Dataframes (#2449)
Browse files Browse the repository at this point in the history
The main purpose of this is to reduce memory usage.

Closes #2400

I still need to update MG tests.

I'll also remove the in-code assertions, since they won't always be True, because a column name could have previously been used as a property. Nevertheless, seeing these assertions pass should give us warm-fuzzies :)

Authors:
  - Erik Welch (https://github.com/eriknw)

Approvers:
  - Rick Ratzel (https://github.com/rlratzel)
  - Alex Barghi (https://github.com/alexbarghi-nv)
  - Vibhu Jawa (https://github.com/VibhuJawa)
  - Brad Rees (https://github.com/BradReesWork)

URL: #2449
  • Loading branch information
eriknw authored Aug 2, 2022
1 parent b74e22a commit ac42e0b
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 17 deletions.
8 changes: 6 additions & 2 deletions python/cugraph/cugraph/dask/structure/mg_property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,9 @@ def add_vertex_data(self,
# remove the ones to keep
column_names_to_drop.difference_update(property_columns +
default_vertex_columns)
tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
else:
column_names_to_drop = {vertex_col_name}
tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)

# Save the original dtypes for each new column so they can be restored
# prior to constructing subgraphs (since column dtypes may get altered
Expand Down Expand Up @@ -566,7 +568,9 @@ def add_edge_data(self,
# remove the ones to keep
column_names_to_drop.difference_update(property_columns +
default_edge_columns)
tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
else:
column_names_to_drop = {vertex_col_names[0], vertex_col_names[1]}
tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)

# Save the original dtypes for each new column so they can be restored
# prior to constructing subgraphs (since column dtypes may get altered
Expand Down
8 changes: 6 additions & 2 deletions python/cugraph/cugraph/structure/property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,9 @@ def add_vertex_data(self,
# remove the ones to keep
column_names_to_drop.difference_update(property_columns +
default_vertex_columns)
tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
else:
column_names_to_drop = {vertex_col_name}
tmp_df.drop(labels=column_names_to_drop, axis=1, inplace=True)

# Save the original dtypes for each new column so they can be restored
# prior to constructing subgraphs (since column dtypes may get altered
Expand Down Expand Up @@ -591,7 +593,9 @@ def add_edge_data(self,
# remove the ones to keep
column_names_to_drop.difference_update(property_columns +
default_edge_columns)
tmp_df = tmp_df.drop(labels=column_names_to_drop, axis=1)
else:
column_names_to_drop = {vertex_col_names[0], vertex_col_names[1]}
tmp_df.drop(labels=column_names_to_drop, axis=1, inplace=True)

# Save the original dtypes for each new column so they can be restored
# prior to constructing subgraphs (since column dtypes may get altered
Expand Down
35 changes: 22 additions & 13 deletions python/cugraph/cugraph/tests/test_property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,11 +333,10 @@ def test_add_vertex_data(df_type):
type_name="merchants",
vertex_col_name="merchant_id",
property_columns=None)

assert pG.get_num_vertices() == 5
assert pG.get_num_vertices('merchants') == 5
assert pG.get_num_edges() == 0
expected_props = merchants[0].copy()
expected_props = set(merchants[0].copy()) - {'merchant_id'}
assert sorted(pG.vertex_property_names) == sorted(expected_props)


Expand Down Expand Up @@ -564,6 +563,7 @@ def test_get_vertex_data(dataset1_PropertyGraph):
for d in ["merchants", "users"]:
for name in data[d][0]:
expected_columns.add(name)
expected_columns -= {'merchant_id', 'user_id'}
actual_columns = set(some_vertex_data.columns)
assert actual_columns == expected_columns

Expand Down Expand Up @@ -620,6 +620,7 @@ def test_get_edge_data(dataset1_PropertyGraph):
for d in ["transactions", "relationships", "referrals"]:
for name in data[d][0]:
expected_columns.add(name)
expected_columns -= {'user_id', 'user_id_1', 'user_id_2'}

actual_columns = set(some_edge_data.columns)

Expand Down Expand Up @@ -755,8 +756,8 @@ def test_add_edge_data(df_type):
assert pG.get_num_vertices('transactions') == 0
assert pG.get_num_edges() == 4
assert pG.get_num_edges('transactions') == 4
expected_props = ["merchant_id", "user_id",
"volume", "time", "card_num", "card_type"]
# Original SRC/DST columns "merchant_id", "user_id" are no longer stored
expected_props = ["volume", "time", "card_num", "card_type"]
assert sorted(pG.edge_property_names) == sorted(expected_props)


Expand Down Expand Up @@ -928,8 +929,9 @@ def test_extract_subgraph_specific_query(dataset1_PropertyGraph):
(pG, data) = dataset1_PropertyGraph
tcn = PropertyGraph.type_col_name

# _DST_ below used to be referred to as merchant_id
selection = pG.select_edges(f"({tcn}=='transactions') & "
"(merchant_id==4) & "
"(_DST_==4) & "
"(time>1639085000)")
G = pG.extract_subgraph(selection=selection,
create_using=DiGraph_inst,
Expand Down Expand Up @@ -1023,7 +1025,13 @@ def test_extract_subgraph_no_edges(dataset1_PropertyGraph):
"""
(pG, data) = dataset1_PropertyGraph

selection = pG.select_vertices("(_TYPE_=='merchants') & (merchant_id==86)")
# "merchant_id" column is no longer saved; use "_VERTEX_" instead
with pytest.raises(NameError, match="merchant_id"):
selection = pG.select_vertices(
"(_TYPE_=='merchants') & (merchant_id==86)"
)

selection = pG.select_vertices("(_TYPE_=='merchants') & (_VERTEX_==86)")
G = pG.extract_subgraph(selection=selection)
assert G.is_directed()

Expand Down Expand Up @@ -1360,13 +1368,14 @@ def test_property_names_attrs(dataset1_PropertyGraph):
"""
(pG, data) = dataset1_PropertyGraph

expected_vert_prop_names = ["merchant_id", "merchant_location",
"merchant_size", "merchant_sales",
"merchant_num_employees", "merchant_name",
"user_id", "user_location", "vertical"]
expected_edge_prop_names = ["user_id", "merchant_id", "volume", "time",
"card_num", "card_type", "user_id_1",
"user_id_2", "relationship_type", "stars"]
# _VERTEX_ columns: "merchant_id", "user_id"
expected_vert_prop_names = ["merchant_location", "merchant_size",
"merchant_sales", "merchant_num_employees",
"user_location", "merchant_name", "vertical"]
# _SRC_ and _DST_ columns: "user_id", "user_id_1", "user_id_2"
# Note that "merchant_id" is a property for type "transactions"
expected_edge_prop_names = ["merchant_id", "volume", "time", "card_num",
"card_type", "relationship_type", "stars"]

# Extracting a subgraph with weights has/had a side-effect of adding a
# weight column, so call extract_subgraph() to ensure the internal weight
Expand Down

0 comments on commit ac42e0b

Please sign in to comment.