First pass at adding testing for pylibcudf (#15300)

This PR adds tests of the `pylibcudf.copying` module along with establishing the infrastructure and best practices for writing pylibcudf tests going forward (and adding associated documentation). Resolves #15133 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Ashwin Srinath (https://github.com/shwina) - Jake Awe (https://github.com/AyodeAwe) - https://github.com/brandon-b-miller URL: #15300
rapidsai · Apr 1, 2024 · aab6137 · aab6137
1 parent 7c69e66
commit aab6137
Show file tree

Hide file tree

Showing 21 changed files with 1,254 additions and 54 deletions.
diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh
@@ -14,6 +14,14 @@ EXITCODE=0
 trap "EXITCODE=1" ERR
 set +e
 
+rapids-logger "pytest pylibcudf"
+pushd python/cudf/cudf/pylibcudf_tests
+python -m pytest \
+  --cache-clear \
+  --dist=worksteal \
+  .
+popd
+
 rapids-logger "pytest cudf"
 ./ci/run_cudf_pytests.sh \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \

diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh
@@ -18,6 +18,14 @@ if [[ "$(arch)" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then
     rapids-logger "Run smoke tests for cudf"
     python ./ci/wheel_smoke_test_cudf.py
 else
+    rapids-logger "pytest pylibcudf"
+    pushd python/cudf/cudf/pylibcudf_tests
+    python -m pytest \
+      --cache-clear \
+      --dist=worksteal \
+      .
+    popd
+
     rapids-logger "pytest cudf"
     pushd python/cudf/cudf/tests
     python -m pytest \

diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp
@@ -253,6 +253,8 @@ std::unique_ptr<column> empty_like(scalar const& input);
  * If the `mask_alloc` allocates a validity mask that mask is also uninitialized
  * and the validity bits and the null count should be set by the caller.
  *
+ * @throws cudf::data_type_error if input type is not of fixed width.
+ *
  * @param input Immutable view of input column to emulate
  * @param mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN
  * @param mr Device memory resource used to allocate the returned column's device memory
@@ -360,6 +362,7 @@ void copy_range_in_place(column_view const& source,
  *
  * @throws std::out_of_range for any invalid range.
  * @throws cudf::data_type_error if @p target and @p source have different types.
+ * @throws cudf::data_type_error if the data type is not fixed width, string, or dictionary
  *
  * @param source The column to copy from inside the range
  * @param target The column to copy from outside the range

diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -122,7 +122,8 @@ std::unique_ptr<column> allocate_like(column_view const& input,
                                       rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  CUDF_EXPECTS(is_fixed_width(input.type()), "Expects only fixed-width type column");
+  CUDF_EXPECTS(
+    is_fixed_width(input.type()), "Expects only fixed-width type column", cudf::data_type_error);
   mask_state allocate_mask = should_allocate_mask(mask_alloc, input.nullable());
 
   return std::make_unique<column>(input.type(),

diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu
@@ -119,7 +119,7 @@ struct out_of_place_copy_range_dispatch {
   std::enable_if_t<not cudf::is_rep_layout_compatible<T>(), std::unique_ptr<cudf::column>>
   operator()(Args...)
   {
-    CUDF_FAIL("Unsupported type for out of place copy.");
+    CUDF_FAIL("Unsupported type for out of place copy.", cudf::data_type_error);
   }
 };
 

diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
@@ -144,7 +144,9 @@ struct column_scalar_scatterer_impl<string_view, MapIterator> {
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr) const
   {
-    CUDF_EXPECTS(source.get().type() == target.type(), "scalar and column types must match");
+    CUDF_EXPECTS(source.get().type() == target.type(),
+                 "scalar and column types must match",
+                 cudf::data_type_error);
 
     auto const scalar_impl = static_cast<string_scalar const*>(&source.get());
     auto const source_view = string_view(scalar_impl->data(), scalar_impl->size());
@@ -166,6 +168,9 @@ struct column_scalar_scatterer_impl<list_view, MapIterator> {
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr) const
   {
+    CUDF_EXPECTS(source.get().type() == target.type(),
+                 "scalar and column types must match",
+                 cudf::data_type_error);
     auto result =
       lists::detail::scatter(source, scatter_iter, scatter_iter + scatter_rows, target, stream, mr);
 
@@ -249,6 +254,10 @@ struct column_scalar_scatterer_impl<struct_view, MapIterator> {
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr) const
   {
+    CUDF_EXPECTS(source.get().type() == target.type(),
+                 "scalar and column types must match",
+                 cudf::data_type_error);
+
     // For each field of `source`, copy construct a scalar from the field
     // and dispatch to the corresponding scalar scatterer
 

diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md
@@ -96,6 +96,72 @@ There are a couple of notable points from the snippet above:
 - The object returned from libcudf is immediately converted to a pylibcudf type.
 - `cudf::gather` accepts a `cudf::out_of_bounds_policy` enum parameter. `OutOfBoundsPolicy` is an alias for this type in pylibcudf that matches our Python naming conventions (CapsCase instead of snake\_case).
 
+## Testing
+
+When writing pylibcudf tests, it is important to remember that all the APIs should be tested in the C++ layer in libcudf already.
+The primary purpose of pylibcudf tests is to ensure the correctness of the _bindings_; the correctness of the underlying implementation should generally be validated in libcudf.
+If pylibcudf tests uncover a libcudf bug, a suitable libcudf test should be added to cover this case rather than relying solely on pylibcudf testing.
+
+pylibcudf's ``conftest.py`` contains some standard parametrized dtype fixture lists that may in turn be used to parametrize other fixtures.
+Fixtures allocating data should leverage these dtype lists wherever possible to simplify testing across the matrix of important types.
+Where appropriate, new fixture lists may be added.
+
+To run tests as efficiently as possible, the test suite should make generous use of fixtures.
+The simplest general structure to follow is for pyarrow array/table/scalar fixtures to be parametrized by one of the dtype list.
+Then, a corresponding pylibcudf fixture may be created using a simple `from_arrow` call.
+This approach ensures consistent global coverage across types for various tests.
+
+In general, pylibcudf tests should prefer validating against a corresponding pyarrow implementation rather than hardcoding data.
+This approach is more resilient to changes to input data, particularly given the fixture strategy outlined above.
+Standard tools for comparing between pylibcudf and pyarrow types are provided in the utils module.
+
+Here is an example demonstrating the above points:
+
+```python
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+from cudf._lib import pylibcudf as plc
+from utils import assert_column_eq
+
+# The pa_dtype fixture is defined in conftest.py.
+@pytest.fixture(scope="module")
+def pa_column(pa_dtype):
+    pa.array([1, 2, 3])
+
+
+@pytest.fixture(scope="module")
+def column(pa_column):
+    return plc.interop.from_arrow(pa_column)
+
+
+def test_foo(pa_column, column):
+    index = 1
+    result = plc.foo(column)
+    expected = pa.foo(pa_column)
+
+    assert_column_eq(result, expected)
+```
+
+Some guidelines on what should be tested:
+- Tests SHOULD comprehensively cover the API, including all possible combinations of arguments required to ensure good test coverage.
+- pylibcudf SHOULD NOT attempt to stress test large data sizes, and SHOULD instead defer to libcudf tests.
+  - Exception: In special cases where constructing suitable large tests is difficult in C++ (such as creating suitable input data for I/O testing), tests may be added to pylibcudf instead.
+- Nullable data should always be tested.
+- Expected exceptions should be tested. Tests should be written from the user's perspective in mind, and if the API is not currently throwing the appropriate exception it should be updated.
+  - Important note: If the exception should be produced by libcudf, the underlying libcudf API should be updated to throw the desired exception in C++. Such changes may require consultation with libcudf devs in nontrivial cases. [This issue](https://github.com/rapidsai/cudf/issues/12885) provides an overview and an indication of acceptable exception types that should cover most use cases. In rare cases a new C++ exception may need to be introduced in [`error.hpp`](https://github.com/rapidsai/cudf/blob/branch-24.04/cpp/include/cudf/utilities/error.hpp). If so, this exception will also need to be mapped to a suitable Python exception in [`exception_handler.pxd`](https://github.com/rapidsai/cudf/blob/branch-24.04/python/cudf/cudf/_lib/exception_handler.pxd).
+
+Some guidelines on how best to use pytests.
+- By default, fixtures producing device data containers should be of module scope and treated as immutable by tests. Allocating data on the GPU is expensive and slows tests. Almost all pylibcudf operations are out of place operations, so module-scoped fixtures should not typically be problematic to work with. Session-scoped fixtures would also work, but they are harder to reason about since they live in a different module, and if they need to change for any reason they could affect an arbitrarily large number of tests. Module scope is a good balance.
+- Where necessary, mutable fixtures should be named as such (e.g. `mutable_col`) and be of function scope. If possible, they can be implemented as simply making a copy of a corresponding module-scope immutable fixture to avoid duplicating the generation logic.
+
+Tests should be organized corresponding to pylibcudf modules, i.e. one test module for each pylibcudf module.
+
+The following sections of the cuDF Python testing guide also generally apply to pylibcudf unless superseded by any statements above:
+- [](#test_parametrization)
+- [](#xfailing_tests)
+- [](#testing_warnings)
+
 ## Miscellaneous Notes
 
 ### Cython Scoped Enums

diff --git a/docs/cudf/source/developer_guide/testing.md b/docs/cudf/source/developer_guide/testing.md
@@ -55,6 +55,8 @@ Typically, exception cases require specific assertions or other special logic, s
 The main exception to this rule is tests based on comparison to pandas.
 Such tests may test exceptional cases alongside more typical cases since the logic is generally identical.
 
+(test_parametrization)=
+
 ### Parametrization: custom fixtures and `pytest.mark.parametrize`
 
 When it comes to parametrizing tests written with `pytest`,
@@ -140,6 +142,8 @@ def test_odds():
 
 Other approaches are also possible, and the best solution should be discussed on a case-by-case basis during PR review.
 
+(xfailing_tests)=
+
 ### Tests with expected failures (`xfail`s)
 
 In some circumstances it makes sense to mark a test as _expected_ to
@@ -218,6 +222,8 @@ This way, when the bug is fixed, the test suite will fail at this
 point (and we will remember to update the test).
 
 
+(testing_warnings)=
+
 ### Testing code that throws warnings
 
 Some code may be expected to throw warnings.

diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t, int64_t, uint8_t
 from libcpp cimport bool
@@ -33,19 +33,19 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil:
         const column_view& input,
         size_type offset,
         const scalar& fill_values
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef unique_ptr[table] scatter (
         const table_view& source_table,
         const column_view& scatter_map,
         const table_view& target_table,
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef unique_ptr[table] scatter (
         const vector[reference_wrapper[constscalar]]& source_scalars,
         const column_view& indices,
         const table_view& target,
-    ) except +
+    ) except +cudf_exception_handler
 
     cpdef enum class mask_allocation_policy(int32_t):
         NEVER
@@ -54,99 +54,99 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil:
 
     cdef unique_ptr[column] empty_like (
         const column_view& input_column
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef unique_ptr[column] allocate_like (
         const column_view& input_column,
         mask_allocation_policy policy
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef unique_ptr[column] allocate_like (
         const column_view& input_column,
         size_type size,
         mask_allocation_policy policy
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef unique_ptr[table] empty_like (
         const table_view& input_table
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef void copy_range_in_place (
         const column_view& input_column,
         mutable_column_view& target_column,
         size_type input_begin,
         size_type input_end,
         size_type target_begin
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef unique_ptr[column] copy_range (
         const column_view& input_column,
         const column_view& target_column,
         size_type input_begin,
         size_type input_end,
         size_type target_begin
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef vector[column_view] slice (
         const column_view& input_column,
         vector[size_type] indices
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef vector[table_view] slice (
         const table_view& input_table,
         vector[size_type] indices
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef vector[column_view] split (
         const column_view& input_column,
         vector[size_type] splits
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef vector[table_view] split (
         const table_view& input_table,
         vector[size_type] splits
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef unique_ptr[column] copy_if_else (
         const column_view& lhs,
         const column_view& rhs,
         const column_view& boolean_mask
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef unique_ptr[column] copy_if_else (
         const scalar& lhs,
         const column_view& rhs,
         const column_view& boolean_mask
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef unique_ptr[column] copy_if_else (
         const column_view& lhs,
         const scalar& rhs,
         const column_view boolean_mask
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef unique_ptr[column] copy_if_else (
         const scalar& lhs,
         const scalar& rhs,
         const column_view boolean_mask
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef unique_ptr[table] boolean_mask_scatter (
         const table_view& input,
         const table_view& target,
         const column_view& boolean_mask
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef unique_ptr[table] boolean_mask_scatter (
         const vector[reference_wrapper[constscalar]]& input,
         const table_view& target,
         const column_view& boolean_mask
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef unique_ptr[scalar] get_element (
         const column_view& input,
         size_type index
-    ) except +
+    ) except +cudf_exception_handler
 
     cpdef enum class sample_with_replacement(bool):
         FALSE

diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd
@@ -43,6 +43,7 @@ cdef class Column:
     cpdef gpumemoryview data(self)
     cpdef gpumemoryview null_mask(self)
     cpdef list children(self)
+    cpdef Column copy(self)
 
     cpdef ListColumnView list_view(self)
 

diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx
@@ -1,7 +1,7 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from cython.operator cimport dereference
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.utility cimport move
 
 from rmm._lib.device_buffer cimport DeviceBuffer
@@ -274,6 +274,13 @@ cdef class Column:
         """The children of the column."""
         return self._children
 
+    cpdef Column copy(self):
+        """Create a copy of the column."""
+        cdef unique_ptr[column] c_result
+        with nogil:
+            c_result = move(make_unique[column](self.view()))
+        return Column.from_libcudf(move(c_result))
+
 
 cdef class ListColumnView:
     """Accessor for methods of a Column that are specific to lists."""

diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pxd b/python/cudf/cudf/_lib/pylibcudf/copying.pxd
@@ -58,12 +58,12 @@ cpdef Column copy_range(
     size_type target_begin,
 )
 
-cpdef Column shift(Column input, size_type offset, Scalar fill_values)
-
-cpdef list split(ColumnOrTable input, list splits)
+cpdef Column shift(Column input, size_type offset, Scalar fill_value)
 
 cpdef list slice(ColumnOrTable input, list indices)
 
+cpdef list split(ColumnOrTable input, list splits)
+
 cpdef Column copy_if_else(
     LeftCopyIfElseOperand lhs,
     RightCopyIfElseOperand rhs,