From 895b0070f6c353d6900ff8a298c97ce8dc759fb1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 16 Feb 2022 15:35:33 -0800 Subject: [PATCH] Prevent internal usage of expensive APIs (#10263) This PR introduces a decorator that can be used to mark APIs that are unnecessarily expensive and should be avoided in internal usage. The decorator does nothing by default, but when the corresponding environment variable is set the decorator wraps decorated functions in a check of the current call stack. If the caller of that API is within the cudf source tree, it will raise an exception that may optionally also indicate an alternate API for developers to use instead. This implementation is completely transparent to users (including having no performance impact aside from some negligible additional import time from applying the decorator) but can be an invaluable tool to developers seeking to reduce Python overheads in cuDF Python. This decorator has been tested and shown to work (both locally and on CI) for `DataFrame.columns`, but those changes are not included in this PR in order to keep the changeset clean. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/10263 --- ci/gpu/build.sh | 8 ++--- python/cudf/cudf/tests/conftest.py | 31 +++++++++++++++++ python/cudf/cudf/utils/utils.py | 56 ++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+), 4 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 53ad948b61c..4acdc372817 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. ############################################## # cuDF GPU build and test script for CI # ############################################## @@ -249,15 +249,15 @@ fi cd "$WORKSPACE/python/cudf" gpuci_logger "Python py.test for cuDF" -py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope cudf cd "$WORKSPACE/python/dask_cudf" gpuci_logger "Python py.test for dask-cudf" -py.test -n 8 --cache-clear --basetemp="$WORKSPACE/dask-cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-dask-cudf.xml" -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:"$WORKSPACE/python/dask_cudf/dask-cudf-coverage.xml" --cov-report term +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/dask-cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-dask-cudf.xml" -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:"$WORKSPACE/python/dask_cudf/dask-cudf-coverage.xml" --cov-report term dask_cudf cd "$WORKSPACE/python/custreamz" gpuci_logger "Python py.test for cuStreamz" -py.test -n 8 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml="$WORKSPACE/junit-custreamz.xml" -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:"$WORKSPACE/python/custreamz/custreamz-coverage.xml" --cov-report term +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml="$WORKSPACE/junit-custreamz.xml" -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:"$WORKSPACE/python/custreamz/custreamz-coverage.xml" --cov-report term custreamz gpuci_logger "Test notebooks" "$WORKSPACE/ci/gpu/test-notebooks.sh" 2>&1 | tee nbtest.log diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 041bd055f0a..4d5b5926d6e 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -1,10 +1,41 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + +import os import pathlib import pytest import rmm # noqa: F401 +_CURRENT_DIRECTORY = str(pathlib.Path(__file__).resolve().parent) + @pytest.fixture(scope="session") def datadir(): return pathlib.Path(__file__).parent / "data" + + +# To set and remove the NO_EXTERNAL_ONLY_APIS environment variable we must use +# the sessionstart and sessionfinish hooks rather than a simple autouse, +# session-scope fixture because we need to set these variable before collection +# occurs because the environment variable will be checked as soon as cudf is +# imported anywhere. +def pytest_sessionstart(session): + """ + Called after the Session object has been created and + before performing collection and entering the run test loop. + """ + os.environ["NO_EXTERNAL_ONLY_APIS"] = "1" + os.environ["_CUDF_TEST_ROOT"] = _CURRENT_DIRECTORY + + +def pytest_sessionfinish(session, exitstatus): + """ + Called after whole test run finished, right before + returning the exit status to the system. + """ + try: + del os.environ["NO_EXTERNAL_ONLY_APIS"] + del os.environ["_CUDF_TEST_ROOT"] + except KeyError: + pass diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 8adca07b252..cf845a5d525 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -2,6 +2,8 @@ import decimal import functools +import os +import traceback from collections.abc import Sequence from typing import FrozenSet, Set, Union @@ -37,6 +39,60 @@ } +# The test root is set by pytest to support situations where tests are run from +# a source tree on a built version of cudf. +NO_EXTERNAL_ONLY_APIS = os.getenv("NO_EXTERNAL_ONLY_APIS") + +_cudf_root = os.path.dirname(cudf.__file__) +# If the environment variable for the test root is not set, we default to +# using the path relative to the cudf root directory. +_tests_root = os.getenv("_CUDF_TEST_ROOT") or os.path.join(_cudf_root, "tests") + + +def _external_only_api(func, alternative=""): + """Decorator to indicate that a function should not be used internally. + + cudf contains many APIs that exist for pandas compatibility but are + intrinsically inefficient. For some of these cudf has internal + equivalents that are much faster. Usage of the slow public APIs inside + our implementation can lead to unnecessary performance bottlenecks. + Applying this decorator to such functions and setting the environment + variable NO_EXTERNAL_ONLY_APIS will cause such functions to raise + exceptions if they are called from anywhere inside cudf, making it easy + to identify and excise such usage. + + The `alternative` should be a complete phrase or sentence since it will + be used verbatim in error messages. + """ + + # If the first arg is a string then an alternative function to use in + # place of this API was provided, so we pass that to a subsequent call. + # It would be cleaner to implement this pattern by using a class + # decorator with a factory method, but there is no way to generically + # wrap docstrings on a class (we would need the docstring to be on the + # class itself, not instances, because that's what `help` looks at) and + # there is also no way to make mypy happy with that approach. + if isinstance(func, str): + return lambda actual_func: _external_only_api(actual_func, func) + + if not NO_EXTERNAL_ONLY_APIS: + return func + + @functools.wraps(func) + def wrapper(*args, **kwargs): + # Check the immediately preceding frame to see if it's in cudf. + frame, lineno = next(traceback.walk_stack(None)) + fn = frame.f_code.co_filename + if _cudf_root in fn and _tests_root not in fn: + raise RuntimeError( + f"External-only API called in {fn} at line {lineno}. " + f"{alternative}" + ) + return func(*args, **kwargs) + + return wrapper + + def scalar_broadcast_to(scalar, size, dtype=None): if isinstance(size, (tuple, list)):