From 7526be71bdcafd4cfa73d2f0c9e77234d197cf37 Mon Sep 17 00:00:00 2001
From: Graham Markall <535640+gmarkall@users.noreply.github.com>
Date: Fri, 26 Feb 2021 18:51:36 +0000
Subject: [PATCH] compile_udf: Cache PTX for similar functions (#7371)

Compiling a UDF generated in a loop results in a distinct compilation for
each loop iteration, because each new definition of the UDF does not
compare equal to any previous definition. Furthermore, each new
compilation returns PTX that differs only trivially (the generated code
is the same, but the function names differ), so JITify's cache also
misses. For example:

```python
for data_size in range(3):
    data = Series([3] * (2 ** data_size), dtype=np.float64)
    for i in range(3):
        data.applymap(lambda x: x + 1)
```

results in nine compilations when one would have sufficed.

This commit adds a cache to `compile_udf` keyed on the signature, code,
and closure variables of the function, so it can hit for distinct
definitions of the same function. The `lru_cache` that previously
wrapped `compile_udf` could hash the function much more quickly, but
always missed for these distinct-but-identical definitions; it is
removed for simplicity, so that there is only one level of caching.
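
The root cause is easy to demonstrate in plain Python, independent of
cudf: distinct definitions of an identical lambda never compare equal,
so any cache keyed on the function object misses, even though their
bytecode is byte-for-byte identical. A minimal illustration:

```python
# Each evaluation of a lambda expression creates a new function object.
f = lambda x: x + 1
g = lambda x: x + 1

# Distinct objects: a cache keyed on the function (e.g. lru_cache) misses.
assert f != g

# Identical bytecode: a cache keyed on the code can hit.
assert f.__code__.co_code == g.__code__.co_code
```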

Authors:
  - Graham Markall (@gmarkall)

Approvers:
  - Keith Kraus (@kkraus14)
  - AJ Schmidt (@ajschmidt8)

URL: https://github.com/rapidsai/cudf/pull/7371
---
 conda/environments/cudf_dev_cuda10.1.yml   |  1 +
 conda/environments/cudf_dev_cuda10.2.yml   |  1 +
 conda/environments/cudf_dev_cuda11.0.yml   |  1 +
 conda/recipes/cudf/meta.yaml               |  1 +
 python/cudf/cudf/tests/test_compile_udf.py | 66 ++++++++++++++++++++++
 python/cudf/cudf/utils/cudautils.py        | 36 ++++++++++--
 6 files changed, 102 insertions(+), 4 deletions(-)
 create mode 100644 python/cudf/cudf/tests/test_compile_udf.py

diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml
index 69d729aea0c..76a9f8fd01d 100644
--- a/conda/environments/cudf_dev_cuda10.1.yml
+++ b/conda/environments/cudf_dev_cuda10.1.yml
@@ -60,6 +60,7 @@ dependencies:
   - packaging
   - protobuf
   - nvtx>=0.2.1
+  - cachetools
   - pip:
       - git+https://github.com/dask/dask.git@master
       - git+https://github.com/dask/distributed.git@master
diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml
index 68c2ffc6aee..a6a39ecdcba 100644
--- a/conda/environments/cudf_dev_cuda10.2.yml
+++ b/conda/environments/cudf_dev_cuda10.2.yml
@@ -60,6 +60,7 @@ dependencies:
   - packaging
   - protobuf
   - nvtx>=0.2.1
+  - cachetools
   - pip:
       - git+https://github.com/dask/dask.git@master
       - git+https://github.com/dask/distributed.git@master
diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml
index 4070802e8a8..0afa36721c5 100644
--- a/conda/environments/cudf_dev_cuda11.0.yml
+++ b/conda/environments/cudf_dev_cuda11.0.yml
@@ -60,6 +60,7 @@ dependencies:
   - packaging
   - protobuf
   - nvtx>=0.2.1
+  - cachetools
   - pip:
       - git+https://github.com/dask/dask.git@master
       - git+https://github.com/dask/distributed.git@master
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index c5f7bd34c25..85280181711 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -45,6 +45,7 @@ requirements:
     - fsspec>=0.6.0
     - {{ pin_compatible('cudatoolkit', max_pin='x.x') }}
     - nvtx >=0.2.1
+    - cachetools

 test:
   requires:
diff --git a/python/cudf/cudf/tests/test_compile_udf.py b/python/cudf/cudf/tests/test_compile_udf.py
new file mode 100644
index 00000000000..96c0e91d8d7
--- /dev/null
+++ b/python/cudf/cudf/tests/test_compile_udf.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from cudf.utils import cudautils
+from numba import types
+
+
+def setup_function():
+    cudautils._udf_code_cache.clear()
+
+
+def assert_cache_size(size):
+    assert cudautils._udf_code_cache.currsize == size
+
+
+def test_first_compile_sets_cache_entry():
+    # The first compilation should put an entry in the cache
+    cudautils.compile_udf(lambda x: x + 1, (types.float32,))
+    assert_cache_size(1)
+
+
+def test_code_cache_same_code_different_function_hit():
+    # Compilation of a distinct function with the same code and signature
+    # should reuse the cached entry
+
+    cudautils.compile_udf(lambda x: x + 1, (types.float32,))
+    assert_cache_size(1)
+
+    cudautils.compile_udf(lambda x: x + 1, (types.float32,))
+    assert_cache_size(1)
+
+
+def test_code_cache_different_types_miss():
+    # Compilation of a distinct function with the same code but different types
+    # should create an additional cache entry
+
+    cudautils.compile_udf(lambda x: x + 1, (types.float32,))
+    assert_cache_size(1)
+
+    cudautils.compile_udf(lambda x: x + 1, (types.float64,))
+    assert_cache_size(2)
+
+
+def test_code_cache_different_cvars_miss():
+    # Compilation of a distinct function with the same types and code as an
+    # existing entry but different closure variables should create an
+    # additional cache entry
+
+    def gen_closure(y):
+        return lambda x: x + y
+
+    cudautils.compile_udf(gen_closure(1), (types.float32,))
+    assert_cache_size(1)
+
+    cudautils.compile_udf(gen_closure(2), (types.float32,))
+    assert_cache_size(2)
+
+
+def test_lambda_in_loop_code_cached():
+    # Compiling a UDF defined in a loop should result in the code cache being
+    # reused for each loop iteration after the first. We check for this by
+    # ensuring that there is only one entry in the code cache after the loop.
+
+    for i in range(3):
+        cudautils.compile_udf(lambda x: x + 1, (types.float32,))
+
+    assert_cache_size(1)
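
To make the keying these tests exercise concrete, here is a standalone
sketch of the key construction used by `compile_udf` below; `make_key`
is an illustrative helper (not part of the patch), and a string stands
in for the Numba type signature:

```python
from pickle import dumps


def make_key(udf, type_signature):
    # Mirror compile_udf's key: the signature, the UDF's bytecode, and the
    # pickled contents of its closure cells.
    codebytes = udf.__code__.co_code
    if udf.__closure__ is not None:
        cvarbytes = dumps(tuple(c.cell_contents for c in udf.__closure__))
    else:
        cvarbytes = b""
    return (type_signature, codebytes, cvarbytes)


def gen_closure(y):
    return lambda x: x + y


k1 = make_key(gen_closure(1), ("float32",))
k2 = make_key(gen_closure(2), ("float32",))
assert k1[1] == k2[1]  # both closures share the same bytecode
assert k1 != k2        # but different closure values give different keys
```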
diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py
index fbf6d008284..f62ca862091 100755
--- a/python/cudf/cudf/utils/cudautils.py
+++ b/python/cudf/cudf/utils/cudautils.py
@@ -1,9 +1,9 @@
 # Copyright (c) 2018-2021, NVIDIA CORPORATION.
-from functools import lru_cache
-
+import cachetools
 import cupy
 import numpy as np
 from numba import cuda
+from pickle import dumps

 import cudf
 from cudf.utils.utils import check_equals_float, check_equals_int
@@ -235,7 +235,13 @@ def grouped_window_sizes_from_offset(arr, group_starts, offset):
     return window_sizes


-@lru_cache(maxsize=32)
+# This cache is keyed on the (signature, code, closure variables) of UDFs, so
+# it can hit for distinct functions that are similar. An lru_cache keyed on
+# the function object would not need to serialize closure variables to check
+# for a hit, but always misses for these similar functions.
+_udf_code_cache = cachetools.LRUCache(maxsize=32)
+
+
 def compile_udf(udf, type_signature):
     """Compile ``udf`` with `numba`

@@ -266,8 +272,30 @@
         An numpy type
     """
+
+    # Check if we've already compiled a similar (but possibly distinct)
+    # function before
+    codebytes = udf.__code__.co_code
+    if udf.__closure__ is not None:
+        cvars = tuple([x.cell_contents for x in udf.__closure__])
+        cvarbytes = dumps(cvars)
+    else:
+        cvarbytes = b""
+
+    key = (type_signature, codebytes, cvarbytes)
+    res = _udf_code_cache.get(key)
+    if res:
+        return res
+
+    # We haven't compiled a function like this before, so we need to fall back
+    # to compilation with Numba
     ptx_code, return_type = cuda.compile_ptx_for_current_device(
         udf, type_signature, device=True
     )
     output_type = numpy_support.as_dtype(return_type)
-    return (ptx_code, output_type.type)
+
+    # Populate the cache for this function
+    res = (ptx_code, output_type.type)
+    _udf_code_cache[key] = res
+
+    return res
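
Assuming a CUDA-capable environment and an initially empty cache, the
end-to-end effect of the patch mirrors `test_lambda_in_loop_code_cached`
above: only the first of several distinct-but-identical lambdas reaches
Numba, and every call returns the same `(ptx_code, output_type)` pair.

```python
from numba import types

from cudf.utils import cudautils

# Three distinct-but-identical lambdas, as a loop would produce. Only the
# first call compiles; the other two hit _udf_code_cache.
results = [
    cudautils.compile_udf(lambda x: x + 1, (types.float32,)) for _ in range(3)
]
assert all(r == results[0] for r in results)
assert cudautils._udf_code_cache.currsize == 1
```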