From 56ea75f11ffc3763b7f768063328c6cb7270e625 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Tue, 14 Nov 2023 09:51:02 -0500 Subject: [PATCH 1/5] cudf.pandas: cuDF subpath checking in module `__getattr__` (#14388) Closes https://github.com/rapidsai/cudf/issues/14384. `x.startswith(y)` is not a good enough check for if `x` is a subdirectory of `y`. It causes `pandasai` to be reported as a sub-package of `pandas`. Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/14388 --- python/cudf/cudf/pandas/module_accelerator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index eb35c4adaaf..180d75d96e8 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -10,6 +10,7 @@ import importlib.abc import importlib.machinery import os +import pathlib import sys import threading import warnings @@ -554,9 +555,10 @@ def getattr_real_or_wrapped( frame = sys._getframe() # We cannot possibly be at the top level. 
assert frame.f_back - calling_module = frame.f_back.f_code.co_filename + calling_module = pathlib.PurePath(frame.f_back.f_code.co_filename) use_real = any( - calling_module.startswith(path) for path in loader._denylist + calling_module.is_relative_to(path) + for path in loader._denylist ) try: if use_real: From 7e1f3aae1af90b7880d07d5f4abe393564599036 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 22 Nov 2023 11:17:56 -0500 Subject: [PATCH 2/5] Add tests --- python/cudf/cudf_pandas_tests/test_cudf_pandas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 2500ba07bd9..1de01e1c6a0 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1076,6 +1076,8 @@ def test_np_array_of_timestamps(): xpd.Index(["a", 2, 3]), # Other types xpd.tseries.offsets.BDay(5), + xpd.Timestamp("2001-01-01"), + xpd.Timestamp("2001-01-01", freq="D"), ], ) def test_pickle(obj): From 6835e097de8bcde1e378535e4c8b96703374bc5e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 22 Nov 2023 11:40:43 -0500 Subject: [PATCH 3/5] Make Timestamps picklable --- python/cudf/cudf/pandas/_wrappers/pandas.py | 35 ++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 71daf1e6f0d..810ada471ce 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -1,7 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 - +import copyreg +import pickle import sys import pandas as pd @@ -1304,3 +1305,35 @@ def _df_query_method(self, *args, local_dict=None, global_dict=None, **kwargs): _Unusable, typ, ) + +# timestamps are not proxied and unproxied types are currently not +# picklable when the module accelerator is enabled. Workaround: + + +def _unpickle_timestamp(args): + # vendored from pd._libs.tslibs.timestamps._unpickle_timestamp + from cudf.pandas.module_accelerator import disable_module_accelerator + + with disable_module_accelerator(): + value, freq, tz, reso = pickle.loads(args) + ts = pd.Timestamp._from_value_and_reso(value, reso, tz) + ts._set_freq(freq) + return ts + + +def _reduce_timestamp(ts): + from cudf.pandas.module_accelerator import disable_module_accelerator + + _, args = ts.__reduce__() + with disable_module_accelerator(): + # args can contain objects that are unpicklable + # when the module accelerator is disabled + # (freq is of a proxy type): + args = pickle.dumps(args) + return _unpickle_timestamp, (args,) + + +# register the custom reducer with copyreg: +copyreg.dispatch_table[ + pd._libs.tslibs.timestamps.Timestamp +] = _reduce_timestamp From 3e36192c5e7ee64464547b5a7eecc25aae2f8217 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 22 Nov 2023 12:25:24 -0500 Subject: [PATCH 4/5] Timedeltas too --- python/cudf/cudf/pandas/_wrappers/pandas.py | 45 ++++++++++++------- .../cudf_pandas_tests/test_cudf_pandas.py | 2 + 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 810ada471ce..5f0e7b62369 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -1306,34 +1306,49 @@ def _df_query_method(self, *args, local_dict=None, global_dict=None, **kwargs): typ, ) -# timestamps are not proxied and unproxied types are currently not -# picklable when the module 
accelerator is enabled. Workaround: +# timestamps and timedeltas are not proxied and "real" pandas types +# are currently not picklable when the module accelerator is enabled. +def _unpickle_timestamp(pickled_args): + from cudf.pandas.module_accelerator import disable_module_accelerator + + with disable_module_accelerator(): + unpickler, args = pickle.loads(pickled_args) + ts = unpickler(*args) + return ts + + +def _reduce_timestamp(ts): + from cudf.pandas.module_accelerator import disable_module_accelerator + with disable_module_accelerator(): + # args can contain objects that are unpicklable + # when the module accelerator is disabled + # (freq is of a proxy type): + pickled_args = pickle.dumps(ts.__reduce__()) -def _unpickle_timestamp(args): - # vendored from pd._libs.tslibs.timestamps._unpickle_timestamp + return _unpickle_timestamp, (pickled_args,) + + +def _unpickle_timedelta(pickled_args): from cudf.pandas.module_accelerator import disable_module_accelerator with disable_module_accelerator(): - value, freq, tz, reso = pickle.loads(args) - ts = pd.Timestamp._from_value_and_reso(value, reso, tz) - ts._set_freq(freq) + unpickler, args = pickle.loads(pickled_args) + ts = unpickler(*args) return ts -def _reduce_timestamp(ts): +def _reduce_timedelta(ts): from cudf.pandas.module_accelerator import disable_module_accelerator - _, args = ts.__reduce__() with disable_module_accelerator(): # args can contain objects that are unpicklable # when the module accelerator is disabled # (freq is of a proxy type): - args = pickle.dumps(args) - return _unpickle_timestamp, (args,) + pickled_args = pickle.dumps(ts.__reduce__()) + + return _unpickle_timedelta, (pickled_args,) -# register the custom reducer with copyreg: -copyreg.dispatch_table[ - pd._libs.tslibs.timestamps.Timestamp -] = _reduce_timestamp +copyreg.dispatch_table[pd.Timestamp] = _reduce_timestamp +copyreg.dispatch_table[pd.Timedelta] = _reduce_timedelta diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py 
b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 1de01e1c6a0..0dbf2c305e5 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1078,6 +1078,8 @@ def test_np_array_of_timestamps(): xpd.tseries.offsets.BDay(5), xpd.Timestamp("2001-01-01"), xpd.Timestamp("2001-01-01", freq="D"), + xpd.Timedelta("1 days"), + xpd.Timedelta(1, "D"), ], ) def test_pickle(obj): From d8b6b15764f66ce1fe91752fd46d482693129209 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 27 Nov 2023 10:21:44 -0500 Subject: [PATCH 5/5] Consolidate timestamp/timedelta functions --- python/cudf/cudf/pandas/_wrappers/pandas.py | 43 ++++++--------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 5f0e7b62369..193ef404a8c 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -1306,49 +1306,30 @@ def _df_query_method(self, *args, local_dict=None, global_dict=None, **kwargs): typ, ) -# timestamps and timedeltas are not proxied and "real" pandas types -# are currently not picklable when the module accelerator is enabled. -def _unpickle_timestamp(pickled_args): - from cudf.pandas.module_accelerator import disable_module_accelerator - - with disable_module_accelerator(): - unpickler, args = pickle.loads(pickled_args) - ts = unpickler(*args) - return ts - - -def _reduce_timestamp(ts): +# timestamps and timedeltas are not proxied, but non-proxied +# pandas types are currently not picklable. 
Thus, we define +# custom reducer/unpickler functions for these types: +def _reduce_obj(obj): from cudf.pandas.module_accelerator import disable_module_accelerator with disable_module_accelerator(): # args can contain objects that are unpicklable # when the module accelerator is disabled # (freq is of a proxy type): - pickled_args = pickle.dumps(ts.__reduce__()) + pickled_args = pickle.dumps(obj.__reduce__()) - return _unpickle_timestamp, (pickled_args,) + return _unpickle_obj, (pickled_args,) -def _unpickle_timedelta(pickled_args): +def _unpickle_obj(pickled_args): from cudf.pandas.module_accelerator import disable_module_accelerator with disable_module_accelerator(): unpickler, args = pickle.loads(pickled_args) - ts = unpickler(*args) - return ts - - -def _reduce_timedelta(ts): - from cudf.pandas.module_accelerator import disable_module_accelerator - - with disable_module_accelerator(): - # args can contain objects that are unpicklable - # when the module accelerator is disabled - # (freq is of a proxy type): - pickled_args = pickle.dumps(ts.__reduce__()) - - return _unpickle_timedelta, (pickled_args,) + obj = unpickler(*args) + return obj -copyreg.dispatch_table[pd.Timestamp] = _reduce_timestamp -copyreg.dispatch_table[pd.Timedelta] = _reduce_timedelta +copyreg.dispatch_table[pd.Timestamp] = _reduce_obj +# same reducer/unpickler can be used for Timedelta: +copyreg.dispatch_table[pd.Timedelta] = _reduce_obj