From 06397973d04b14d18a2805c9c5e16150099ffd21 Mon Sep 17 00:00:00 2001 From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> Date: Fri, 10 May 2024 17:58:49 -0500 Subject: [PATCH] Updates SG `PropertyGraph` and `cugraph-service` to apply `DataFrame.fillna()` based on latest cuDF changes (#4408) This handles a [recent cuDF change](https://github.com/rapidsai/cudf/pull/15683) by applying non-dict and non-Series values for a `fillna()` call on `PropertyGraph` instances only to the user-defined columns, with the assumption that savvy users who intend to update the "internal" columns, or users who are aware of their own categorical dtype columns, will use a dict or Series value to properly apply dtypes as needed. This also updates code in `cugraph-service` that serializes dataframes to numpy bytes to properly convert NA values when categoricals are present. Notes: * This is only applied to the SG `PropertyGraph` class. The MG class needs further review as to how to best apply the same policy (and because there are other failing MG tests that need to be addressed). Since this is blocking CI for the SG case only, this PR is being submitted now and MG will be addressed later, which should be okay since `PropertyGraph` is experimental. * This could be considered a breaking change if `PropertyGraph` were not experimental. 
Authors: - Rick Ratzel (https://github.com/rlratzel) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/4408 --- .../cugraph_service_server/cugraph_handler.py | 20 +++++++++++-- .../testing/benchmark_server_extension.py | 6 ++-- .../cugraph/structure/property_graph.py | 30 ++++++++++++++++++- 3 files changed, 50 insertions(+), 6 deletions(-) diff --git a/python/cugraph-service/server/cugraph_service_server/cugraph_handler.py b/python/cugraph-service/server/cugraph_service_server/cugraph_handler.py index 6cdf0d793d4..f60f597cfae 100644 --- a/python/cugraph-service/server/cugraph_service_server/cugraph_handler.py +++ b/python/cugraph-service/server/cugraph_service_server/cugraph_handler.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -1370,7 +1370,23 @@ def __get_graph_data_as_numpy_bytes(self, dataframe, null_replacement_value): # FIXME: should something other than a numpy type be serialized to # prevent a copy? 
(note: any other type required to be de-serialzed # on the client end could add dependencies on the client) - df_numpy = dataframe.to_numpy(na_value=n) + df_copy = dataframe.copy() + for col_name in df_copy.columns: + if df_copy[col_name].dtype == "category": + cat_dt = df_copy.dtypes[col_name].categories.dtype + if cat_dt == "object": + new_cat = str(n) + else: + new_cat = n + if new_cat not in df_copy.dtypes[col_name].categories: + df_copy[col_name] = df_copy[col_name].cat.add_categories( + new_cat + ) + df_copy[col_name].fillna(new_cat, inplace=True) + else: + df_copy[col_name].fillna(n, inplace=True) + + df_numpy = df_copy.to_numpy() return df_numpy.dumps() except Exception: diff --git a/python/cugraph-service/server/cugraph_service_server/testing/benchmark_server_extension.py b/python/cugraph-service/server/cugraph_service_server/testing/benchmark_server_extension.py index 361226c8071..dbd75e6abd9 100644 --- a/python/cugraph-service/server/cugraph_service_server/testing/benchmark_server_extension.py +++ b/python/cugraph-service/server/cugraph_service_server/testing/benchmark_server_extension.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -25,12 +25,12 @@ def create_graph_from_builtin_dataset(dataset_name, mg=False, server=None): dataset_obj = getattr(datasets, dataset_name) # FIXME: create an MG graph if server is mg? 
- return dataset_obj.get_graph(fetch=True) + return dataset_obj.get_graph(download=True) def create_property_graph_from_builtin_dataset(dataset_name, mg=False, server=None): dataset_obj = getattr(datasets, dataset_name) - edgelist_df = dataset_obj.get_edgelist(fetch=True) + edgelist_df = dataset_obj.get_edgelist(download=True) if mg and (server is not None) and server.is_multi_gpu: G = MGPropertyGraph() diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py index 513798f35f9..53c1bf778c7 100644 --- a/python/cugraph/cugraph/structure/property_graph.py +++ b/python/cugraph/cugraph/structure/property_graph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -123,6 +123,17 @@ class EXPERIMENTAL__PropertyGraph: _default_type_name = "" + _internal_col_names = set( + ( + vertex_col_name, + src_col_name, + dst_col_name, + type_col_name, + edge_id_col_name, + weight_col_name, + ) + ) + def __init__(self): # The dataframe containing the properties for each vertex. # Each vertex occupies a row, and individual properties are maintained @@ -1380,6 +1391,15 @@ def fillna_vertices(self, val=0): Series is passed, the index or keys are the columns to fill and the values are the fill value for the corresponding column. """ + # Omit internal columns if an object is passed in to be applied to the + # entire DataFrame and assume the intent is for users to fillna only on + # their data. 
+ if type(val) not in [dict, self.__series_type]: + user_col_names = ( + set(self.__vertex_prop_dataframe.columns) - self._internal_col_names + ) + val = dict((k, val) for k in user_col_names) + self.__vertex_prop_dataframe.fillna(val, inplace=True) def fillna_edges(self, val=0): @@ -1394,6 +1414,14 @@ def fillna_edges(self, val=0): Series is passed, the index or keys are the columns to fill and the values are the fill value for the corresponding column. """ + # Omit internal columns if an object is passed in to be applied to the + # entire DataFrame and assume the intent is for users to fillna only on + # their data. + if type(val) not in [dict, self.__series_type]: + user_col_names = ( + set(self.__edge_prop_dataframe.columns) - self._internal_col_names + ) + val = dict((k, val) for k in user_col_names) self.__edge_prop_dataframe.fillna(val, inplace=True)