Skip to content

Commit

Permalink
Prevent internal usage of expensive APIs (rapidsai#10263)
Browse files Browse the repository at this point in the history
This PR introduces a decorator that can be used to mark APIs that are unnecessarily expensive and should be avoided in internal usage. The decorator does nothing by default, but when the corresponding environment variable is set the decorator wraps decorated functions in a check of the current call stack. If the caller of that API is within the cudf source tree, it will raise an exception that may optionally also indicate an alternate API for developers to use instead. This implementation is completely transparent to users (including having no performance impact aside from some negligible additional import time from applying the decorator) but can be an invaluable tool to developers seeking to reduce Python overheads in cuDF Python. This decorator has been tested and shown to work (both locally and on CI) for `DataFrame.columns`, but those changes are not included in this PR in order to keep the changeset clean.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - AJ Schmidt (https://github.com/ajschmidt8)

URL: rapidsai#10263
  • Loading branch information
vyasr authored Feb 16, 2022
1 parent 183eec3 commit 895b007
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 4 deletions.
8 changes: 4 additions & 4 deletions ci/gpu/build.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
# Copyright (c) 2018-2022, NVIDIA CORPORATION.
##############################################
# cuDF GPU build and test script for CI #
##############################################
Expand Down Expand Up @@ -249,15 +249,15 @@ fi

cd "$WORKSPACE/python/cudf"
gpuci_logger "Python py.test for cuDF"
py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope
py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope cudf

cd "$WORKSPACE/python/dask_cudf"
gpuci_logger "Python py.test for dask-cudf"
py.test -n 8 --cache-clear --basetemp="$WORKSPACE/dask-cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-dask-cudf.xml" -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:"$WORKSPACE/python/dask_cudf/dask-cudf-coverage.xml" --cov-report term
py.test -n 8 --cache-clear --basetemp="$WORKSPACE/dask-cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-dask-cudf.xml" -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:"$WORKSPACE/python/dask_cudf/dask-cudf-coverage.xml" --cov-report term dask_cudf

cd "$WORKSPACE/python/custreamz"
gpuci_logger "Python py.test for cuStreamz"
py.test -n 8 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml="$WORKSPACE/junit-custreamz.xml" -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:"$WORKSPACE/python/custreamz/custreamz-coverage.xml" --cov-report term
py.test -n 8 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml="$WORKSPACE/junit-custreamz.xml" -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:"$WORKSPACE/python/custreamz/custreamz-coverage.xml" --cov-report term custreamz

gpuci_logger "Test notebooks"
"$WORKSPACE/ci/gpu/test-notebooks.sh" 2>&1 | tee nbtest.log
Expand Down
31 changes: 31 additions & 0 deletions python/cudf/cudf/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,41 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION.

import os
import pathlib

import pytest

import rmm # noqa: F401

_CURRENT_DIRECTORY = str(pathlib.Path(__file__).resolve().parent)


@pytest.fixture(scope="session")
def datadir():
return pathlib.Path(__file__).parent / "data"


# To set and remove the NO_EXTERNAL_ONLY_APIS environment variable we must use
# the sessionstart and sessionfinish hooks rather than a simple autouse,
# session-scope fixture because we need to set these variable before collection
# occurs because the environment variable will be checked as soon as cudf is
# imported anywhere.
def pytest_sessionstart(session):
"""
Called after the Session object has been created and
before performing collection and entering the run test loop.
"""
os.environ["NO_EXTERNAL_ONLY_APIS"] = "1"
os.environ["_CUDF_TEST_ROOT"] = _CURRENT_DIRECTORY


def pytest_sessionfinish(session, exitstatus):
"""
Called after whole test run finished, right before
returning the exit status to the system.
"""
try:
del os.environ["NO_EXTERNAL_ONLY_APIS"]
del os.environ["_CUDF_TEST_ROOT"]
except KeyError:
pass
56 changes: 56 additions & 0 deletions python/cudf/cudf/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import decimal
import functools
import os
import traceback
from collections.abc import Sequence
from typing import FrozenSet, Set, Union

Expand Down Expand Up @@ -37,6 +39,60 @@
}


# The test root is set by pytest to support situations where tests are run from
# a source tree on a built version of cudf.
NO_EXTERNAL_ONLY_APIS = os.getenv("NO_EXTERNAL_ONLY_APIS")

_cudf_root = os.path.dirname(cudf.__file__)
# If the environment variable for the test root is not set, we default to
# using the path relative to the cudf root directory.
_tests_root = os.getenv("_CUDF_TEST_ROOT") or os.path.join(_cudf_root, "tests")


def _external_only_api(func, alternative=""):
"""Decorator to indicate that a function should not be used internally.
cudf contains many APIs that exist for pandas compatibility but are
intrinsically inefficient. For some of these cudf has internal
equivalents that are much faster. Usage of the slow public APIs inside
our implementation can lead to unnecessary performance bottlenecks.
Applying this decorator to such functions and setting the environment
variable NO_EXTERNAL_ONLY_APIS will cause such functions to raise
exceptions if they are called from anywhere inside cudf, making it easy
to identify and excise such usage.
The `alternative` should be a complete phrase or sentence since it will
be used verbatim in error messages.
"""

# If the first arg is a string then an alternative function to use in
# place of this API was provided, so we pass that to a subsequent call.
# It would be cleaner to implement this pattern by using a class
# decorator with a factory method, but there is no way to generically
# wrap docstrings on a class (we would need the docstring to be on the
# class itself, not instances, because that's what `help` looks at) and
# there is also no way to make mypy happy with that approach.
if isinstance(func, str):
return lambda actual_func: _external_only_api(actual_func, func)

if not NO_EXTERNAL_ONLY_APIS:
return func

@functools.wraps(func)
def wrapper(*args, **kwargs):
# Check the immediately preceding frame to see if it's in cudf.
frame, lineno = next(traceback.walk_stack(None))
fn = frame.f_code.co_filename
if _cudf_root in fn and _tests_root not in fn:
raise RuntimeError(
f"External-only API called in {fn} at line {lineno}. "
f"{alternative}"
)
return func(*args, **kwargs)

return wrapper


def scalar_broadcast_to(scalar, size, dtype=None):

if isinstance(size, (tuple, list)):
Expand Down

0 comments on commit 895b007

Please sign in to comment.