From b4795c9461aa5eab0cde2721e01790dec4695706 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 1 Jul 2024 12:28:55 -0700 Subject: [PATCH 1/5] Add environment variable to log fallback calls --- python/cudf/cudf/pandas/fast_slow_proxy.py | 32 ++++++++++++++++++- .../cudf/pandas/scripts/run-pandas-tests.sh | 5 +++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index dfb729cae6b..4ceba61433c 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -6,6 +6,7 @@ import functools import inspect +import logging import operator import pickle import types @@ -930,13 +931,42 @@ def _fast_slow_function_call( "Pandas debugging mode failed. " f"The exception was {e}." ) - except Exception: + except Exception as err: with nvtx.annotate( "EXECUTE_SLOW", color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"], domain="cudf_pandas", ): slow_args, slow_kwargs = _slow_arg(args), _slow_arg(kwargs) + if _env_get_bool("CUDF_PANDAS_SLOW_LOG", False): + + def reprify(arg) -> str: + try: + return repr(arg) + except Exception: + return "" + + logging.basicConfig( + filename="pandas-tests.log", level=logging.INFO + ) + module = getattr(slow_args[0], "__module__", "") + obj_name = getattr( + slow_args[0], "__name__", type(slow_args[0]).__name__ + ) + called = f"{module}.{obj_name}" + fmt_args = ",".join((reprify(val) for val in slow_args[1])) + if len(slow_args) == 3: + fmt_kwargs = ",".join( + f"{kwarg}={value!r}" + for kwarg, value in slow_args[2].items() + ) + fmt_args = ",".join((fmt_args, fmt_kwargs)) + logging.info( + f"The following fast call was made: {called}({fmt_args})" + ) + logging.info(f"Exception Type: {type(err).__name__}") + logging.info(f"Called: {called}") + logging.info(f"Arguments: {fmt_args}") with disable_module_accelerator(): result = func(*slow_args, **slow_kwargs) return _maybe_wrap_result(result, func, *args, **kwargs), fast diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index a66f63c09b3..493654bc62c 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -63,6 +63,11 @@ markers = [ "arm_slow: mark a test as slow for arm64 architecture", "skip_ubsan: Tests known to fail UBSAN check", ] +log_format = %(asctime)s %(levelname)s %(message)s +log_date_format = %Y-%m-%d %H:%M:%S +log_file = pandas-tests.log +log_file_level = INFO +log_cli = true EOF # append the contents of patch-confest.py to conftest.py cat ../python/cudf/cudf/pandas/scripts/conftest-patch.py >> pandas-tests/conftest.py From a3e9c59babe97474707917d53a971a6433b2f69f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 1 Jul 2024 19:18:30 -0700 Subject: [PATCH 2/5] Use structured logging --- python/cudf/cudf/pandas/_logger.py | 26 +++++++++++++++ python/cudf/cudf/pandas/fast_slow_proxy.py | 32 +++++++++++-------- .../cudf/pandas/scripts/run-pandas-tests.sh | 2 +- 3 files changed, 46 insertions(+), 14 deletions(-) create mode 100644 python/cudf/cudf/pandas/_logger.py diff --git a/python/cudf/cudf/pandas/_logger.py b/python/cudf/cudf/pandas/_logger.py new file mode 100644 index 00000000000..7b3e42ece7e --- /dev/null +++ b/python/cudf/cudf/pandas/_logger.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +import logging + +# https://docs.python.org/3/howto/logging-cookbook.html#implementing-structured-logging + + +class StructuredMessage: + def __init__(self, debug_type: str, /, **kwargs): + self.debug_type = debug_type + self.kwargs = kwargs + + def __str__(self): + log = {"debug_type": self.debug_type} + return json.dumps({**log, **self.kwargs}) + + +logging.basicConfig( + filename="cudf_pandas_unit_tests_debug.log", level=logging.INFO +) +logger = logging.getLogger() diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 4ceba61433c..ed8883e14d2 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -6,7 +6,6 @@ import functools import inspect -import logging import operator import pickle import types @@ -939,6 +938,7 @@ def _fast_slow_function_call( ): slow_args, slow_kwargs = _slow_arg(args), _slow_arg(kwargs) if _env_get_bool("CUDF_PANDAS_SLOW_LOG", False): + from ._logger import StructuredMessage, logger def reprify(arg) -> str: try: @@ -946,27 +946,33 @@ def reprify(arg) -> str: except Exception: return "" - logging.basicConfig( - filename="pandas-tests.log", level=logging.INFO - ) module = getattr(slow_args[0], "__module__", "") obj_name = getattr( slow_args[0], "__name__", type(slow_args[0]).__name__ ) - called = f"{module}.{obj_name}" - fmt_args = ",".join((reprify(val) for val in slow_args[1])) + slow_object = f"{module}.{obj_name}" + # TODO: maybe use inspect.signature to map called args and kwargs + # to their keyword names + called_args = ",".join((reprify(val) for val in slow_args[1])) if len(slow_args) == 3: fmt_kwargs = ",".join( - f"{kwarg}={value!r}" + f"{kwarg}={reprify(value)}" for kwarg, value in slow_args[2].items() ) - fmt_args = ",".join((fmt_args, fmt_kwargs)) - logging.info( - f"The following fast call was made: {called}({fmt_args})" + called_args = ",".join((called_args, fmt_kwargs)) + passed_kwargs = dict(slow_args[2].items()) + else: + passed_kwargs = None + message = StructuredMessage( + "CUDF_PANDAS_SLOW_LOG", + failed_call=f"{slow_object}({called_args})", + exception=type(err).__name__, + exception_message=str(err), + pandas_object=slow_object, + passed_args=called_args, + passed_kwargs=passed_kwargs, ) - logging.info(f"Exception Type: {type(err).__name__}") - logging.info(f"Called: {called}") - logging.info(f"Arguments: {fmt_args}") + logger.info(message) with disable_module_accelerator(): result = func(*slow_args, **slow_kwargs) return _maybe_wrap_result(result, func, *args, **kwargs), fast diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index 493654bc62c..1e6df615604 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -65,7 +65,7 @@ markers = [ ] log_format = %(asctime)s %(levelname)s %(message)s log_date_format = %Y-%m-%d %H:%M:%S -log_file = pandas-tests.log +log_file = cudf_pandas_unit_tests_debug.log log_file_level = INFO log_cli = true EOF From 6ca5ca03334aaa8b986e7af028bf76d263e22252 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 2 Jul 2024 17:40:00 -0700 Subject: [PATCH 3/5] ensure kwargs are serializeable --- python/cudf/cudf/pandas/fast_slow_proxy.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index ed8883e14d2..aafcb2e1137 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -960,9 +960,12 @@ def reprify(arg) -> str: for kwarg, value in slow_args[2].items() ) called_args = ",".join((called_args, fmt_kwargs)) - passed_kwargs = dict(slow_args[2].items()) + passed_kwargs = { + kwarg: reprify(value) + for kwarg, value in slow_args[2].items() + } else: - passed_kwargs = None + passed_kwargs = {} message = StructuredMessage( "CUDF_PANDAS_SLOW_LOG", failed_call=f"{slow_object}({called_args})", From 9542ec263cead68a8c7b3a08e61266828d61d4c8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 3 Jul 2024 14:32:53 -0700 Subject: [PATCH 4/5] dont need the pytest configs afterall --- python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index 1e6df615604..a66f63c09b3 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -63,11 +63,6 @@ markers = [ "arm_slow: mark a test as slow for arm64 architecture", "skip_ubsan: Tests known to fail UBSAN check", ] -log_format = %(asctime)s %(levelname)s %(message)s -log_date_format = %Y-%m-%d %H:%M:%S -log_file = cudf_pandas_unit_tests_debug.log -log_file_level = INFO -log_cli = true EOF # append the contents of patch-confest.py to conftest.py cat ../python/cudf/cudf/pandas/scripts/conftest-patch.py >> pandas-tests/conftest.py From 53b737c8712e45a7755aa9aa67c3612295fb9cdd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 17 Jul 2024 13:20:15 -0700 Subject: [PATCH 5/5] Address review comments --- python/cudf/cudf/pandas/_logger.py | 70 +++++++++++++++++++--- python/cudf/cudf/pandas/fast_slow_proxy.py | 41 +------------ 2 files changed, 65 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/pandas/_logger.py b/python/cudf/cudf/pandas/_logger.py index 7b3e42ece7e..68923c3e35c 100644 --- a/python/cudf/cudf/pandas/_logger.py +++ b/python/cudf/cudf/pandas/_logger.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -7,20 +7,74 @@ import json import logging -# https://docs.python.org/3/howto/logging-cookbook.html#implementing-structured-logging +logging.basicConfig( + filename="cudf_pandas_unit_tests_debug.log", level=logging.INFO +) +logger = logging.getLogger() class StructuredMessage: - def __init__(self, debug_type: str, /, **kwargs): + # https://docs.python.org/3/howto/logging-cookbook.html#implementing-structured-logging + def __init__(self, debug_type: str, /, **kwargs) -> None: self.debug_type = debug_type self.kwargs = kwargs - def __str__(self): + def __str__(self) -> str: log = {"debug_type": self.debug_type} return json.dumps({**log, **self.kwargs}) -logging.basicConfig( - filename="cudf_pandas_unit_tests_debug.log", level=logging.INFO -) -logger = logging.getLogger() +def reprify(arg) -> str: + """Attempt to return arg's repr for logging.""" + try: + return repr(arg) + except Exception: + return "" + + +def log_fallback( + slow_args: tuple, slow_kwargs: dict, exception: Exception +) -> None: + """Log when a fast call falls back to the slow path.""" + caller = slow_args[0] + module = getattr(caller, "__module__", "") + obj_name = getattr(caller, "__qualname__", type(caller).__qualname__) + if module: + slow_object = f"{module}.{obj_name}" + else: + slow_object = obj_name + # TODO: Maybe use inspect.signature to map called args and kwargs + # to their keyword names, but a user calling an API incorrectly would + # break this. + caller_args = slow_args[1] + args_passed = ", ".join((reprify(arg) for arg in caller_args)) + args_types_passed = ", ".join((type(arg).__name__ for arg in caller_args)) + kwargs_passed = {} + kwargs_types_passed = "" + if len(slow_args) == 3: + caller_kwargs = slow_args[2] + if caller_kwargs: + fmt_kwargs = ", ".join( + f"{kwarg}={reprify(value)}" + for kwarg, value in caller_kwargs.items() + ) + kwargs_types_passed = ", ".join( + f"{kwarg}={type(value).__name__}" + for kwarg, value in caller_kwargs.items() + ) + args_passed = f"{args_passed}, {fmt_kwargs}" + kwargs_passed = { + kwarg: reprify(value) for kwarg, value in caller_kwargs.items() + } + message = StructuredMessage( + "LOG_FAST_FALLBACK", + failed_call=f"{slow_object}({args_passed})", + exception=type(exception).__name__, + exception_message=str(exception), + slow_object=slow_object, + args_passed=args_passed, + kwargs_passed=kwargs_passed, + args_types_passed=args_types_passed, + kwargs_types_passed=kwargs_types_passed, + ) + logger.info(message) diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index aafcb2e1137..bb678fd1efe 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -937,45 +937,10 @@ def _fast_slow_function_call( domain="cudf_pandas", ): slow_args, slow_kwargs = _slow_arg(args), _slow_arg(kwargs) - if _env_get_bool("CUDF_PANDAS_SLOW_LOG", False): - from ._logger import StructuredMessage, logger + if _env_get_bool("LOG_FAST_FALLBACK", False): + from ._logger import log_fallback - def reprify(arg) -> str: - try: - return repr(arg) - except Exception: - return "" - - module = getattr(slow_args[0], "__module__", "") - obj_name = getattr( - slow_args[0], "__name__", type(slow_args[0]).__name__ - ) - slow_object = f"{module}.{obj_name}" - # TODO: maybe use inspect.signature to map called args and kwargs - # to their keyword names - called_args = ",".join((reprify(val) for val in slow_args[1])) - if len(slow_args) == 3: - fmt_kwargs = ",".join( - f"{kwarg}={reprify(value)}" - for kwarg, value in slow_args[2].items() - ) - called_args = ",".join((called_args, fmt_kwargs)) - passed_kwargs = { - kwarg: reprify(value) - for kwarg, value in slow_args[2].items() - } - else: - passed_kwargs = {} - message = StructuredMessage( - "CUDF_PANDAS_SLOW_LOG", - failed_call=f"{slow_object}({called_args})", - exception=type(err).__name__, - exception_message=str(err), - pandas_object=slow_object, - passed_args=called_args, - passed_kwargs=passed_kwargs, - ) - logger.info(message) + log_fallback(slow_args, slow_kwargs, err) with disable_module_accelerator(): result = func(*slow_args, **slow_kwargs) return _maybe_wrap_result(result, func, *args, **kwargs), fast