Skip to content

Commit

Permalink
compile_udf: Cache PTX for similar functions (#7371)
Browse files Browse the repository at this point in the history
Compiling a UDF defined inside a loop results in a separate compilation on every loop iteration: each new definition of the UDF does not compare equal to any previous definition, so the `lru_cache` wrapping `compile_udf` misses and a new compilation occurs. Furthermore, each new compilation returns PTX that differs only in a trivial way (the generated code is the same but the function names are different), so JITify's cache also misses.

For example:

```python
for data_size in range(3):
    data = Series([3] * (2 ** data_size), dtype=np.float64)
    for i in range(3):
        data.applymap(lambda x: x + 1)
```

results in nine compilations when one would have sufficed.

This commit adds an additional cache to `compile_udf` keyed on the signature, code, and closure variables of the function. This can hit for distinct definitions of the same function. The existing `lru_cache` wrapping `compile_udf` is left in place as it is expected to be able to hash the function much more quickly, though I don't know if this has a noticeable impact on performance - perhaps it would be worth removing it for simplicity, so that there is only one level of caching.

Authors:
  - Graham Markall (@gmarkall)

Approvers:
  - Keith Kraus (@kkraus14)
  - AJ Schmidt (@ajschmidt8)

URL: #7371
  • Loading branch information
gmarkall authored Feb 26, 2021
1 parent 862559f commit 7526be7
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 4 deletions.
1 change: 1 addition & 0 deletions conda/environments/cudf_dev_cuda10.1.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ dependencies:
- packaging
- protobuf
- nvtx>=0.2.1
- cachetools
- pip:
- git+https://github.com/dask/dask.git@master
- git+https://github.com/dask/distributed.git@master
Expand Down
1 change: 1 addition & 0 deletions conda/environments/cudf_dev_cuda10.2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ dependencies:
- packaging
- protobuf
- nvtx>=0.2.1
- cachetools
- pip:
- git+https://github.com/dask/dask.git@master
- git+https://github.com/dask/distributed.git@master
Expand Down
1 change: 1 addition & 0 deletions conda/environments/cudf_dev_cuda11.0.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ dependencies:
- packaging
- protobuf
- nvtx>=0.2.1
- cachetools
- pip:
- git+https://github.com/dask/dask.git@master
- git+https://github.com/dask/distributed.git@master
Expand Down
1 change: 1 addition & 0 deletions conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ requirements:
- fsspec>=0.6.0
- {{ pin_compatible('cudatoolkit', max_pin='x.x') }}
- nvtx >=0.2.1
- cachetools

test:
requires:
Expand Down
66 changes: 66 additions & 0 deletions python/cudf/cudf/tests/test_compile_udf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from cudf.utils import cudautils
from numba import types


def setup_function():
    """Empty the UDF code cache so each test starts from a known state."""
    code_cache = cudautils._udf_code_cache
    code_cache.clear()


def assert_cache_size(size):
    """Assert that the UDF code cache currently reports ``size`` entries."""
    current_size = cudautils._udf_code_cache.currsize
    assert current_size == size


def test_first_compile_sets_cache_entry():
    """Compiling once with an empty cache leaves exactly one entry."""
    signature = (types.float32,)
    cudautils.compile_udf(lambda x: x + 1, signature)
    assert_cache_size(1)


def test_code_cache_same_code_different_function_hit():
    """Two distinct functions with equal code and signature share an entry."""
    signature = (types.float32,)

    # First definition: populates the cache.
    cudautils.compile_udf(lambda x: x + 1, signature)
    assert_cache_size(1)

    # Second, separately defined but identical function: must reuse the
    # existing entry rather than add a new one.
    cudautils.compile_udf(lambda x: x + 1, signature)
    assert_cache_size(1)


def test_code_cache_different_types_miss():
    """The same code compiled for a different signature adds a new entry."""
    single_precision = (types.float32,)
    double_precision = (types.float64,)

    cudautils.compile_udf(lambda x: x + 1, single_precision)
    assert_cache_size(1)

    # Identical code, different argument type: expected to miss.
    cudautils.compile_udf(lambda x: x + 1, double_precision)
    assert_cache_size(2)


def test_code_cache_different_cvars_miss():
    """Equal code and types but different closure values add a new entry."""
    signature = (types.float32,)

    def make_adder(y):
        # ``y`` is captured as a closure variable of the returned lambda.
        return lambda x: x + y

    add_one = make_adder(1)
    cudautils.compile_udf(add_one, signature)
    assert_cache_size(1)

    # Same code object, but the captured value differs: expected to miss.
    add_two = make_adder(2)
    cudautils.compile_udf(add_two, signature)
    assert_cache_size(2)


def test_lambda_in_loop_code_cached():
    """A lambda redefined on every loop iteration is compiled only once.

    Each iteration creates a new function object; the code cache should be
    hit from the second iteration onwards, which is checked by requiring a
    single cache entry once the loop has finished.
    """
    signature = (types.float32,)

    for _ in range(3):
        cudautils.compile_udf(lambda x: x + 1, signature)

    assert_cache_size(1)
36 changes: 32 additions & 4 deletions python/cudf/cudf/utils/cudautils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
from functools import lru_cache

import cachetools
import cupy
import numpy as np
from numba import cuda
from pickle import dumps

import cudf
from cudf.utils.utils import check_equals_float, check_equals_int
Expand Down Expand Up @@ -235,7 +235,13 @@ def grouped_window_sizes_from_offset(arr, group_starts, offset):
return window_sizes


# This cache is keyed on the (signature, code, constants, names, closure
# variables) of UDFs, so it can hit for distinct functions that are
# equivalent. The lru_cache wrapping compile_udf misses for such functions,
# but doesn't need to serialize closure variables to check for a hit.
_udf_code_cache = cachetools.LRUCache(maxsize=32)


def _udf_cache_key(udf, type_signature):
    """Build the ``_udf_code_cache`` key for ``udf`` and ``type_signature``."""
    code = udf.__code__

    # ``co_code`` only holds *indices* into ``co_consts`` and ``co_names``, so
    # ``lambda x: x + 1`` and ``lambda x: x + 2`` have identical bytecode. The
    # constants and names must be part of the key, otherwise one function
    # would be handed the PTX compiled for the other. Each constant is paired
    # with its type because ``1 == 1.0 == True`` compare (and hash) equal.
    constants = tuple((type(const), const) for const in code.co_consts)

    if udf.__closure__ is None:
        cvarbytes = b""
    else:
        # Closure values are serialized so that they are compared by value
        # and need not be hashable themselves.
        cvars = tuple(cell.cell_contents for cell in udf.__closure__)
        cvarbytes = dumps(cvars)

    # NOTE: the *values* of globals referenced by ``udf`` are not part of
    # the key; only their names (via ``co_names``) are.
    return (type_signature, code.co_code, constants, code.co_names, cvarbytes)


@lru_cache(maxsize=32)
def compile_udf(udf, type_signature):
    """Compile ``udf`` with `numba`

    Compile the Python callable ``udf`` as a device function with
    ``numba.cuda.compile_ptx_for_current_device`` for the argument types in
    ``type_signature``.

    Results are cached at two levels: the ``lru_cache`` on this function
    hits when the very same function object is passed again, and
    ``_udf_code_cache`` hits for distinct function objects whose bytecode,
    constants, referenced names, closure values and signature all match.

    Parameters
    ----------
    udf:
      a python callable function

    type_signature:
      a hashable sequence (e.g. a tuple) with the type of each input
      parameter of ``udf``, as accepted by `numba`

    Returns
    -------
    ptx_code:
      The compiled CUDA PTX

    output_type:
      An numpy type
    """

    # Check if we've already compiled a similar (but possibly distinct)
    # function before
    key = _udf_cache_key(udf, type_signature)
    res = _udf_code_cache.get(key)
    if res is not None:
        return res

    # We haven't compiled a function like this before, so need to fall back to
    # compilation with Numba
    ptx_code, return_type = cuda.compile_ptx_for_current_device(
        udf, type_signature, device=True
    )
    output_type = numpy_support.as_dtype(return_type)

    # Populate the cache for this function
    res = (ptx_code, output_type.type)
    _udf_code_cache[key] = res

    return res

0 comments on commit 7526be7

Please sign in to comment.