From 7526be71bdcafd4cfa73d2f0c9e77234d197cf37 Mon Sep 17 00:00:00 2001
From: Graham Markall <535640+gmarkall@users.noreply.github.com>
Date: Fri, 26 Feb 2021 18:51:36 +0000
Subject: [PATCH] compile_udf: Cache PTX for similar functions (#7371)

Compiling a UDF generated in a loop results in a distinct compilation for
each loop iteration, because each new definition of the UDF does not
compare equal to any previous definition. Furthermore, each new
compilation returns PTX that differs only trivially (the generated code
is the same, but the function names differ), so JITify's cache also
misses. For example:

```python
for data_size in range(3):
    data = Series([3] * (2 ** data_size), dtype=np.float64)
    for i in range(3):
        data.applymap(lambda x: x + 1)
```

results in nine compilations when one would have sufficed.

This commit adds a cache to `compile_udf` keyed on the signature, code,
and closure variables of the function, so it can hit for distinct
definitions of the same function. The `lru_cache` that previously
wrapped `compile_udf` could hash the function much more quickly, but
always missed for these distinct-but-identical definitions; it is
removed for simplicity, so that there is only one level of caching.
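
The root cause is easy to demonstrate in plain Python, independent of
cudf: distinct definitions of an identical lambda never compare equal,
so any cache keyed on the function object misses, even though their
bytecode is byte-for-byte identical. A minimal illustration:

```python
# Each evaluation of a lambda expression creates a new function object.
f = lambda x: x + 1
g = lambda x: x + 1

# Distinct objects: a cache keyed on the function (e.g. lru_cache) misses.
assert f != g

# Identical bytecode: a cache keyed on the code can hit.
assert f.__code__.co_code == g.__code__.co_code
```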

Authors:
  - Graham Markall (@gmarkall)

Approvers:
  - Keith Kraus (@kkraus14)
  - AJ Schmidt (@ajschmidt8)

URL: https://github.com/rapidsai/cudf/pull/7371
---
 conda/environments/cudf_dev_cuda10.1.yml   |  1 +
 conda/environments/cudf_dev_cuda10.2.yml   |  1 +
 conda/environments/cudf_dev_cuda11.0.yml   |  1 +
 conda/recipes/cudf/meta.yaml               |  1 +
 python/cudf/cudf/tests/test_compile_udf.py | 66 ++++++++++++++++++++++
 python/cudf/cudf/utils/cudautils.py        | 36 ++++++++++--
 6 files changed, 102 insertions(+), 4 deletions(-)
 create mode 100644 python/cudf/cudf/tests/test_compile_udf.py

diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml
index 69d729aea0c..76a9f8fd01d 100644
--- a/conda/environments/cudf_dev_cuda10.1.yml
+++ b/conda/environments/cudf_dev_cuda10.1.yml
@@ -60,6 +60,7 @@ dependencies:
   - packaging
   - protobuf
   - nvtx>=0.2.1
+  - cachetools
   - pip:
       - git+https://github.com/dask/dask.git@master
       - git+https://github.com/dask/distributed.git@master
diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml
index 68c2ffc6aee..a6a39ecdcba 100644
--- a/conda/environments/cudf_dev_cuda10.2.yml
+++ b/conda/environments/cudf_dev_cuda10.2.yml
@@ -60,6 +60,7 @@ dependencies:
   - packaging
   - protobuf
   - nvtx>=0.2.1
+  - cachetools
   - pip:
       - git+https://github.com/dask/dask.git@master
       - git+https://github.com/dask/distributed.git@master
diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml
index 4070802e8a8..0afa36721c5 100644
--- a/conda/environments/cudf_dev_cuda11.0.yml
+++ b/conda/environments/cudf_dev_cuda11.0.yml
@@ -60,6 +60,7 @@ dependencies:
   - packaging
   - protobuf
   - nvtx>=0.2.1
+  - cachetools
   - pip:
       - git+https://github.com/dask/dask.git@master
       - git+https://github.com/dask/distributed.git@master
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index c5f7bd34c25..85280181711 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -45,6 +45,7 @@ requirements:
     - fsspec>=0.6.0
     - {{ pin_compatible('cudatoolkit', max_pin='x.x') }}
     - nvtx >=0.2.1
+    - cachetools

 test:
   requires:
diff --git a/python/cudf/cudf/tests/test_compile_udf.py b/python/cudf/cudf/tests/test_compile_udf.py
new file mode 100644
index 00000000000..96c0e91d8d7
--- /dev/null
+++ b/python/cudf/cudf/tests/test_compile_udf.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from cudf.utils import cudautils
+from numba import types
+
+
+def setup_function():
+    cudautils._udf_code_cache.clear()
+
+
+def assert_cache_size(size):
+    assert cudautils._udf_code_cache.currsize == size
+
+
+def test_first_compile_sets_cache_entry():
+    # The first compilation should put an entry in the cache
+    cudautils.compile_udf(lambda x: x + 1, (types.float32,))
+    assert_cache_size(1)
+
+
+def test_code_cache_same_code_different_function_hit():
+    # Compilation of a distinct function with the same code and signature
+    # should reuse the cached entry
+
+    cudautils.compile_udf(lambda x: x + 1, (types.float32,))
+    assert_cache_size(1)
+
+    cudautils.compile_udf(lambda x: x + 1, (types.float32,))
+    assert_cache_size(1)
+
+
+def test_code_cache_different_types_miss():
+    # Compilation of a distinct function with the same code but different types
+    # should create an additional cache entry
+
+    cudautils.compile_udf(lambda x: x + 1, (types.float32,))
+    assert_cache_size(1)
+
+    cudautils.compile_udf(lambda x: x + 1, (types.float64,))
+    assert_cache_size(2)
+
+
+def test_code_cache_different_cvars_miss():
+    # Compilation of a distinct function with the same types and code as an
+    # existing entry but different closure variables should create an
+    # additional cache entry
+
+    def gen_closure(y):
+        return lambda x: x + y
+
+    cudautils.compile_udf(gen_closure(1), (types.float32,))
+    assert_cache_size(1)
+
+    cudautils.compile_udf(gen_closure(2), (types.float32,))
+    assert_cache_size(2)
+
+
+def test_lambda_in_loop_code_cached():
+    # Compiling a UDF defined in a loop should result in the code cache being
+    # reused for each loop iteration after the first. We check for this by
+    # ensuring that there is only one entry in the code cache after the loop.
+
+    for i in range(3):
+        cudautils.compile_udf(lambda x: x + 1, (types.float32,))
+
+    assert_cache_size(1)
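
To make the keying these tests exercise concrete, here is a standalone
sketch of the key construction used by `compile_udf` below; `make_key`
is an illustrative helper (not part of the patch), and a string stands
in for the Numba type signature:

```python
from pickle import dumps


def make_key(udf, type_signature):
    # Mirror compile_udf's key: the signature, the UDF's bytecode, and the
    # pickled contents of its closure cells.
    codebytes = udf.__code__.co_code
    if udf.__closure__ is not None:
        cvarbytes = dumps(tuple(c.cell_contents for c in udf.__closure__))
    else:
        cvarbytes = b""
    return (type_signature, codebytes, cvarbytes)


def gen_closure(y):
    return lambda x: x + y


k1 = make_key(gen_closure(1), ("float32",))
k2 = make_key(gen_closure(2), ("float32",))
assert k1[1] == k2[1]  # both closures share the same bytecode
assert k1 != k2        # but different closure values give different keys
```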
diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py
index fbf6d008284..f62ca862091 100755
--- a/python/cudf/cudf/utils/cudautils.py
+++ b/python/cudf/cudf/utils/cudautils.py
@@ -1,9 +1,9 @@
 # Copyright (c) 2018-2021, NVIDIA CORPORATION.
-from functools import lru_cache
-
+import cachetools
 import cupy
 import numpy as np
 from numba import cuda
+from pickle import dumps

 import cudf
 from cudf.utils.utils import check_equals_float, check_equals_int
@@ -235,7 +235,13 @@ def grouped_window_sizes_from_offset(arr, group_starts, offset):
     return window_sizes


-@lru_cache(maxsize=32)
+# This cache is keyed on the (signature, code, closure variables) of UDFs, so
+# it can hit for distinct functions that are similar. An lru_cache keyed on
+# the function object would not need to serialize closure variables to check
+# for a hit, but always misses for these similar functions.
+_udf_code_cache = cachetools.LRUCache(maxsize=32)
+
+
 def compile_udf(udf, type_signature):
     """Compile ``udf`` with `numba`

@@ -266,8 +272,30 @@
         An numpy type
     """
+
+    # Check if we've already compiled a similar (but possibly distinct)
+    # function before
+    codebytes = udf.__code__.co_code
+    if udf.__closure__ is not None:
+        cvars = tuple([x.cell_contents for x in udf.__closure__])
+        cvarbytes = dumps(cvars)
+    else:
+        cvarbytes = b""
+
+    key = (type_signature, codebytes, cvarbytes)
+    res = _udf_code_cache.get(key)
+    if res:
+        return res
+
+    # We haven't compiled a function like this before, so we need to fall back
+    # to compilation with Numba
     ptx_code, return_type = cuda.compile_ptx_for_current_device(
         udf, type_signature, device=True
     )
     output_type = numpy_support.as_dtype(return_type)
-    return (ptx_code, output_type.type)
+
+    # Populate the cache for this function
+    res = (ptx_code, output_type.type)
+    _udf_code_cache[key] = res
+
+    return res
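
Assuming a CUDA-capable environment and an initially empty cache, the
end-to-end effect of the patch mirrors `test_lambda_in_loop_code_cached`
above: only the first of several distinct-but-identical lambdas reaches
Numba, and every call returns the same `(ptx_code, output_type)` pair.

```python
from numba import types

from cudf.utils import cudautils

# Three distinct-but-identical lambdas, as a loop would produce. Only the
# first call compiles; the other two hit _udf_code_cache.
results = [
    cudautils.compile_udf(lambda x: x + 1, (types.float32,)) for _ in range(3)
]
assert all(r == results[0] for r in results)
assert cudautils._udf_code_cache.currsize == 1
```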