From d925926f5b4d0af927131c099482548daa1c6753 Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Wed, 27 Jul 2022 10:05:43 -0500 Subject: [PATCH] Fix pandas SettingWithCopyWarning, which really shouldn't be ignored. (#2447) See: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy This implementation requires column names to be strings, which is hopefully fine (but could be changed). The downside is that this sometimes copies data when unnecessary. It's pretty hard to know when objects are views or not when using public APIS. Pandas objects have a `._is_view` attribute that tell us this. I don't know about cugraph, or if this is even an issue in cugraph. CC @rlratzel Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/2447 --- .../cugraph/structure/property_graph.py | 23 ++++++++++++------- .../cugraph/tests/test_property_graph.py | 15 ++++++++++++ 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py index 815192ef7b4..59f66f44f93 100644 --- a/python/cugraph/cugraph/structure/property_graph.py +++ b/python/cugraph/cugraph/structure/property_graph.py @@ -308,7 +308,7 @@ def add_vertex_data(self, # column in the incoming dataframe, since the initial merge may not # result in the same dtype. (see # https://github.com/rapidsai/cudf/issues/9981) - self.__update_dataframe_dtypes( + self.__vertex_prop_dataframe = self.__update_dataframe_dtypes( self.__vertex_prop_dataframe, {self.vertex_col_name: dataframe[vertex_col_name].dtype}) @@ -430,7 +430,7 @@ def add_edge_data(self, # column in the incoming dataframe, since the initial merge may not # result in the same dtype. (see # https://github.com/rapidsai/cudf/issues/9981) - self.__update_dataframe_dtypes( + self.__edge_prop_dataframe = self.__update_dataframe_dtypes( self.__edge_prop_dataframe, {self.src_col_name: dataframe[vertex_col_names[0]].dtype, self.dst_col_name: dataframe[vertex_col_names[1]].dtype, @@ -659,7 +659,7 @@ def extract_subgraph(self, # possibly had their dtypes converted in order to accommodate NaN # values. Restore the original dtypes in the resulting edges df prior # to creating a Graph. - self.__update_dataframe_dtypes(edges, self.__edge_prop_dtypes) + edges = self.__update_dataframe_dtypes(edges, self.__edge_prop_dtypes) return self.edge_props_to_graph( edges, @@ -727,7 +727,9 @@ def annotate_dataframe(self, df, G, edge_vertex_col_names): inplace=True) # restore the original dtypes - self.__update_dataframe_dtypes(new_df, self.__edge_prop_dtypes) + new_df = self.__update_dataframe_dtypes( + new_df, self.__edge_prop_dtypes + ) for col in df.columns: new_df[col] = new_df[col].astype(df[col].dtype) @@ -906,6 +908,7 @@ def __update_dataframe_dtypes(df, column_dtype_dict): This also handles converting standard integer dtypes to nullable integer dtypes, needed to accommodate NA values in columns. """ + update_cols = {} for (col, dtype) in column_dtype_dict.items(): # If the DataFrame is Pandas and the dtype is an integer type, # ensure a nullable integer array is used by specifying the correct @@ -919,7 +922,11 @@ def __update_dataframe_dtypes(df, column_dtype_dict): # Assigning to df[col] produces a (false?) warning with Pandas, # but assigning to df.loc[:,col] does not update the df in # cudf, so do one or the other based on type. - if type(df) is cudf.DataFrame: - df[col] = df[col].astype(dtype_str) - else: - df.loc[:, col] = df[col].astype(dtype_str) + update_cols[col] = df[col].astype(dtype_str) + if not update_cols: + return df + # Use df.assign to avoid assignment into df in case df is a view: + # https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html + # #returning-a-view-versus-a-copy + # Note that this requires all column names to be strings. + return df.assign(**update_cols) diff --git a/python/cugraph/cugraph/tests/test_property_graph.py b/python/cugraph/cugraph/tests/test_property_graph.py index d4d185dac18..5a91d9d577f 100644 --- a/python/cugraph/cugraph/tests/test_property_graph.py +++ b/python/cugraph/cugraph/tests/test_property_graph.py @@ -118,6 +118,21 @@ def setup_function(): # ============================================================================= # Pytest fixtures # ============================================================================= +@pytest.fixture(scope="function", autouse=True) +def raise_on_pandas_warning(): + """Raise when pandas gives SettingWithCopyWarning warning""" + # Perhaps we should put this in pytest.ini, pyproject.toml, or conftest.py + import warnings + + filters = list(warnings.filters) + warnings.filterwarnings( + "error", + category=pd.core.common.SettingWithCopyWarning + ) + yield + warnings.filters = filters + + df_types = [cudf.DataFrame, pd.DataFrame]