Skip to content

Commit

Permalink
Updates SG PropertyGraph and cugraph-service to apply `DataFrame.…
Browse files Browse the repository at this point in the history
…fillna()` based on latest cuDF changes (#4408)

This handles a [recent cuDF change](rapidsai/cudf#15683): when a `fillna()` call on a `PropertyGraph` instance is given a value that is neither a dict nor a Series, that value is now applied only to the user-defined columns. The assumption is that savvy users who intend to update the "internal" columns, or who are aware of their own categorical dtype columns, will pass a dict or Series value to properly apply dtypes as needed.

This also updates code in `cugraph-service` that serializes dataframes to numpy bytes to properly convert NA values when categoricals are present.

Notes:
* This is only applied to the SG `PropertyGraph` class.  The MG class needs further review as to how to best apply the same policy (and because there are other failing MG tests that need to be addressed).  Since this is blocking CI for the SG case only, this PR is being submitted now and MG will be addressed later, which should be okay since `PropertyGraph` is experimental.
* This could be considered a breaking change if `PropertyGraph` were not experimental.

Authors:
  - Rick Ratzel (https://github.com/rlratzel)

Approvers:
  - Alex Barghi (https://github.com/alexbarghi-nv)

URL: #4408
  • Loading branch information
rlratzel authored May 10, 2024
1 parent af749c3 commit 0639797
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 6 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -1370,7 +1370,23 @@ def __get_graph_data_as_numpy_bytes(self, dataframe, null_replacement_value):
# FIXME: should something other than a numpy type be serialized to
# prevent a copy? (note: any other type required to be de-serialized
# on the client end could add dependencies on the client)
df_numpy = dataframe.to_numpy(na_value=n)
df_copy = dataframe.copy()
for col_name in df_copy.columns:
if df_copy[col_name].dtype == "category":
cat_dt = df_copy.dtypes[col_name].categories.dtype
if cat_dt == "object":
new_cat = str(n)
else:
new_cat = n
if new_cat not in df_copy.dtypes[col_name].categories:
df_copy[col_name] = df_copy[col_name].cat.add_categories(
new_cat
)
df_copy[col_name].fillna(new_cat, inplace=True)
else:
df_copy[col_name].fillna(n, inplace=True)

df_numpy = df_copy.to_numpy()
return df_numpy.dumps()

except Exception:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -25,12 +25,12 @@
def create_graph_from_builtin_dataset(dataset_name, mg=False, server=None):
dataset_obj = getattr(datasets, dataset_name)
# FIXME: create an MG graph if server is mg?
return dataset_obj.get_graph(fetch=True)
return dataset_obj.get_graph(download=True)


def create_property_graph_from_builtin_dataset(dataset_name, mg=False, server=None):
dataset_obj = getattr(datasets, dataset_name)
edgelist_df = dataset_obj.get_edgelist(fetch=True)
edgelist_df = dataset_obj.get_edgelist(download=True)

if mg and (server is not None) and server.is_multi_gpu:
G = MGPropertyGraph()
Expand Down
30 changes: 29 additions & 1 deletion python/cugraph/cugraph/structure/property_graph.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
# Copyright (c) 2021-2024, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand Down Expand Up @@ -123,6 +123,17 @@ class EXPERIMENTAL__PropertyGraph:

_default_type_name = ""

_internal_col_names = set(
(
vertex_col_name,
src_col_name,
dst_col_name,
type_col_name,
edge_id_col_name,
weight_col_name,
)
)

def __init__(self):
# The dataframe containing the properties for each vertex.
# Each vertex occupies a row, and individual properties are maintained
Expand Down Expand Up @@ -1380,6 +1391,15 @@ def fillna_vertices(self, val=0):
Series is passed, the index or keys are the columns to fill
and the values are the fill value for the corresponding column.
"""
# Omit internal columns if an object is passed in to be applied to the
# entire DataFrame and assume the intent is for users to fillna only on
# their data.
if type(val) not in [dict, self.__series_type]:
user_col_names = (
set(self.__vertex_prop_dataframe.columns) - self._internal_col_names
)
val = dict((k, val) for k in user_col_names)

self.__vertex_prop_dataframe.fillna(val, inplace=True)

def fillna_edges(self, val=0):
Expand All @@ -1394,6 +1414,14 @@ def fillna_edges(self, val=0):
Series is passed, the index or keys are the columns to fill
and the values are the fill value for the corresponding column.
"""
# Omit internal columns if an object is passed in to be applied to the
# entire DataFrame and assume the intent is for users to fillna only on
# their data.
if type(val) not in [dict, self.__series_type]:
user_col_names = (
set(self.__edge_prop_dataframe.columns) - self._internal_col_names
)
val = dict((k, val) for k in user_col_names)

self.__edge_prop_dataframe.fillna(val, inplace=True)

Expand Down

0 comments on commit 0639797

Please sign in to comment.