Skip to content

Commit

Permalink
Fix PropertyGraph.renumber_*_by_type with only default types (#3352)
Browse files Browse the repository at this point in the history
Fixes #3058

Authors:
  - Erik Welch (https://github.com/eriknw)
  - Alex Barghi (https://github.com/alexbarghi-nv)

Approvers:
  - Alex Barghi (https://github.com/alexbarghi-nv)

URL: #3352
  • Loading branch information
eriknw authored Apr 5, 2023
1 parent e76406d commit e1c44b7
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 9 deletions.
17 changes: 11 additions & 6 deletions python/cugraph/cugraph/dask/structure/mg_property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -1514,7 +1514,11 @@ def renumber_vertices_by_type(self, prev_id_column=None):
# Include self.vertex_col_name when sorting by values to ensure we can
# evenly distribute the data across workers.
df = df.reset_index().persist()
df = df.sort_values(by=[TCN, self.vertex_col_name], ignore_index=True).persist()
if len(cat_dtype.categories) > 1 and len(self.vertex_types) > 1:
# `self.vertex_types` is currently not cheap because it looks at the edge dataframe
df = df.sort_values(
by=[TCN, self.vertex_col_name], ignore_index=True
).persist()
if self.__edge_prop_dataframe is not None:
new_name = f"new_{self.vertex_col_name}"
df[new_name] = 1
Expand Down Expand Up @@ -1605,9 +1609,10 @@ def renumber_edges_by_type(self, prev_id_column=None):
# Include self.edge_id_col_name when sorting by values to ensure we can
# evenly distribute the data across workers.
df = df.reset_index().persist()
df = df.sort_values(
by=[self.type_col_name, self.edge_id_col_name], ignore_index=True
).persist()
if len(cat_dtype.categories) > 1 and len(self.edge_types) > 1:
df = df.sort_values(
by=[self.type_col_name, self.edge_id_col_name], ignore_index=True
).persist()
if prev_id_column is not None:
df[prev_id_column] = df[self.edge_id_col_name]

Expand All @@ -1623,8 +1628,8 @@ def renumber_edges_by_type(self, prev_id_column=None):

# FIXME DASK_CUDF: https://github.com/rapidsai/cudf/issues/11795
df = self._edge_type_value_counts
assert df.index.dtype == cat_dtype
df.index = df.index.astype(str)
if df.index.dtype == cat_dtype:
df.index = df.index.astype(str)

# self._edge_type_value_counts
rv = df.sort_index().cumsum().to_frame("stop")
Expand Down
16 changes: 13 additions & 3 deletions python/cugraph/cugraph/structure/property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -2074,7 +2074,11 @@ def renumber_vertices_by_type(self, prev_id_column=None):
].astype(cat_dtype)

index_dtype = self.__vertex_prop_dataframe.index.dtype
df = self.__vertex_prop_dataframe.reset_index().sort_values(by=TCN)
df = self.__vertex_prop_dataframe.reset_index()
if len(df.dtypes[TCN].categories) > 1 and len(self.vertex_types) > 1:
# Avoid `sort_values` if we know there is only one type
# `self.vertex_types` is currently not cheap because it looks at the edge dataframe
df = df.sort_values(by=TCN, ignore_index=True)
df.index = df.index.astype(index_dtype)
if self.__edge_prop_dataframe is not None:
mapper = self.__series_type(df.index, index=df[self.vertex_col_name])
Expand Down Expand Up @@ -2164,9 +2168,15 @@ def renumber_edges_by_type(self, prev_id_column=None):
df = self.__edge_prop_dataframe
index_dtype = df.index.dtype
if prev_id_column is None:
df = df.sort_values(by=TCN, ignore_index=True)
if len(df.dtypes[TCN].categories) > 1 and len(self.edge_types) > 1:
# Avoid `sort_values` if we know there is only one type
df = df.sort_values(by=TCN, ignore_index=True)
else:
df.reset_index(drop=True, inplace=True)
else:
df = df.sort_values(by=TCN)
if len(df.dtypes[TCN].categories) > 1 and len(self.edge_types) > 1:
# Avoid `sort_values` if we know there is only one type
df = df.sort_values(by=TCN)
df.index.name = prev_id_column
df.reset_index(inplace=True)
df.index = df.index.astype(index_dtype)
Expand Down
31 changes: 31 additions & 0 deletions python/cugraph/cugraph/tests/data_store/test_property_graph_mg.py
Original file line number Diff line number Diff line change
Expand Up @@ -1477,6 +1477,37 @@ def test_types_from_numerals(dask_client):
]


@pytest.mark.mg
def test_renumber_by_type_only_default_type(dask_client):
    """Exercise renumber-by-type when no explicit type names were given.

    Edge and vertex data are both added without a type name. After each
    renumbering call, the corresponding id column is expected to read
    0, 1, ..., N-1 in row order.
    """
    from cugraph.experimental import MGPropertyGraph

    graph = MGPropertyGraph()

    # Six edges, added without a type name, spread over two partitions.
    edge_frame = cudf.DataFrame(
        {
            "src": cp.array([0, 0, 1, 2, 2, 3], dtype="int32"),
            "dst": cp.array([1, 2, 4, 3, 4, 1], dtype="int32"),
        }
    )
    edge_ddf = dask_cudf.from_cudf(edge_frame, npartitions=2)
    graph.add_edge_data(edge_ddf, vertex_col_names=["src", "dst"])

    # Five vertices carrying two properties, also added without a type name.
    vertex_frame = cudf.DataFrame(
        {
            "prop1": [100, 200, 300, 400, 500],
            "prop2": [5, 4, 3, 2, 1],
            "id": cp.array([0, 1, 2, 3, 4], dtype="int32"),
        }
    )
    vertex_ddf = dask_cudf.from_cudf(vertex_frame, npartitions=2)
    graph.add_vertex_data(vertex_ddf, vertex_col_name="id")

    # Vertex ids should come back as a contiguous 0..N-1 sequence.
    graph.renumber_vertices_by_type()
    vertex_result = graph.get_vertex_data().compute()
    vertex_ids = vertex_result[graph.vertex_col_name].to_arrow().to_pylist()
    assert vertex_ids == list(range(len(vertex_result)))

    # Same expectation for the edge ids.
    graph.renumber_edges_by_type()
    edge_result = graph.get_edge_data().compute()
    edge_ids = edge_result[graph.edge_id_col_name].to_arrow().to_pylist()
    assert edge_ids == list(range(len(edge_result)))


# =============================================================================
# Benchmarks
# =============================================================================
Expand Down

0 comments on commit e1c44b7

Please sign in to comment.