From b8ae0e4b41c541c5b2b27417af30fa1b9afcbdce Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 7 Feb 2023 23:33:56 -0600 Subject: [PATCH 1/6] Support conversion to/from cudf in dask.dataframe.core.to_backend (#12380) This PR corresponds to the `cudf` component of https://github.com/dask/dask/pull/9758 Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/12380 --- python/dask_cudf/dask_cudf/backends.py | 194 +++++++++++------- python/dask_cudf/dask_cudf/tests/test_core.py | 55 ++++- 2 files changed, 170 insertions(+), 79 deletions(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index b6be5ade6ba..821ec103204 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -11,6 +11,10 @@ import dask.dataframe as dd from dask import config +from dask.dataframe.backends import ( + DataFrameBackendEntrypoint, + PandasBackendEntrypoint, +) from dask.dataframe.core import get_parallel_type, meta_nonempty from dask.dataframe.dispatch import ( categorical_dtype_dispatch, @@ -30,7 +34,7 @@ make_meta_obj, ) from dask.sizeof import sizeof as sizeof_dispatch -from dask.utils import is_arraylike +from dask.utils import Dispatch, is_arraylike import cudf from cudf.api.types import is_string_dtype @@ -446,91 +450,127 @@ def _default_backend(func, *args, **kwargs): return func(*args, **kwargs) -try: +def _unsupported_kwargs(old, new, kwargs): + # Utility to raise a meaningful error when + # unsupported kwargs are encountered within + # ``to_backend_dispatch`` + if kwargs: + raise ValueError( + f"Unsupported key-word arguments used in `to_backend` " + f"for {old}-to-{new} conversion: {kwargs}" + ) - # Define "cudf" backend engine to be registered with Dask - from dask.dataframe.backends import DataFrameBackendEntrypoint - - class CudfBackendEntrypoint(DataFrameBackendEntrypoint): - """Backend-entrypoint class for Dask-DataFrame - - This class is registered under the name "cudf" for the - ``dask.dataframe.backends`` entrypoint in ``setup.cfg``. - Dask-DataFrame will use the methods defined in this class - in place of ``dask.dataframe.`` when the - "dataframe.backend" configuration is set to "cudf": - - Examples - -------- - >>> import dask - >>> import dask.dataframe as dd - >>> with dask.config.set({"dataframe.backend": "cudf"}): - ... ddf = dd.from_dict({"a": range(10)}) - >>> type(ddf) - - """ - - @staticmethod - def from_dict( - data, - npartitions, - orient="columns", - dtype=None, - columns=None, - constructor=cudf.DataFrame, - ): - - return _default_backend( - dd.from_dict, - data, - npartitions=npartitions, - orient=orient, - dtype=dtype, - columns=columns, - constructor=constructor, - ) - @staticmethod - def read_parquet(*args, engine=None, **kwargs): - from dask_cudf.io.parquet import CudfEngine +# Register cudf->pandas +to_pandas_dispatch = PandasBackendEntrypoint.to_backend_dispatch() - return _default_backend( - dd.read_parquet, - *args, - engine=CudfEngine, - **kwargs, - ) - @staticmethod - def read_json(*args, **kwargs): - from dask_cudf.io.json import read_json +@to_pandas_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index)) +def to_pandas_dispatch_from_cudf(data, nullable=False, **kwargs): + _unsupported_kwargs("cudf", "pandas", kwargs) + return data.to_pandas(nullable=nullable) - return read_json(*args, **kwargs) - @staticmethod - def read_orc(*args, **kwargs): - from dask_cudf.io import read_orc +# Register pandas->cudf +to_cudf_dispatch = Dispatch("to_cudf_dispatch") - return read_orc(*args, **kwargs) - @staticmethod - def read_csv(*args, **kwargs): - from dask_cudf.io import read_csv +@to_cudf_dispatch.register((pd.DataFrame, pd.Series, pd.Index)) +def to_cudf_dispatch_from_pandas(data, nan_as_null=None, **kwargs): + _unsupported_kwargs("pandas", "cudf", kwargs) + return cudf.from_pandas(data, nan_as_null=nan_as_null) - return read_csv(*args, **kwargs) - @staticmethod - def read_hdf(*args, **kwargs): - from dask_cudf import from_dask_dataframe +# Define "cudf" backend engine to be registered with Dask +class CudfBackendEntrypoint(DataFrameBackendEntrypoint): + """Backend-entrypoint class for Dask-DataFrame - # HDF5 reader not yet implemented in cudf - warnings.warn( - "read_hdf is not yet implemented in cudf/dask_cudf. " - "Moving to cudf from pandas. Expect poor performance!" - ) - return from_dask_dataframe( - _default_backend(dd.read_hdf, *args, **kwargs) - ) + This class is registered under the name "cudf" for the + ``dask.dataframe.backends`` entrypoint in ``setup.cfg``. + Dask-DataFrame will use the methods defined in this class + in place of ``dask.dataframe.`` when the + "dataframe.backend" configuration is set to "cudf": -except ImportError: - pass + Examples + -------- + >>> import dask + >>> import dask.dataframe as dd + >>> with dask.config.set({"dataframe.backend": "cudf"}): + ... ddf = dd.from_dict({"a": range(10)}) + >>> type(ddf) + + """ + + @classmethod + def to_backend_dispatch(cls): + return to_cudf_dispatch + + @classmethod + def to_backend(cls, data: dd.core._Frame, **kwargs): + if isinstance(data._meta, (cudf.DataFrame, cudf.Series, cudf.Index)): + # Already a cudf-backed collection + _unsupported_kwargs("cudf", "cudf", kwargs) + return data + return data.map_partitions(cls.to_backend_dispatch(), **kwargs) + + @staticmethod + def from_dict( + data, + npartitions, + orient="columns", + dtype=None, + columns=None, + constructor=cudf.DataFrame, + ): + + return _default_backend( + dd.from_dict, + data, + npartitions=npartitions, + orient=orient, + dtype=dtype, + columns=columns, + constructor=constructor, + ) + + @staticmethod + def read_parquet(*args, engine=None, **kwargs): + from dask_cudf.io.parquet import CudfEngine + + return _default_backend( + dd.read_parquet, + *args, + engine=CudfEngine, + **kwargs, + ) + + @staticmethod + def read_json(*args, **kwargs): + from dask_cudf.io.json import read_json + + return read_json(*args, **kwargs) + + @staticmethod + def read_orc(*args, **kwargs): + from dask_cudf.io import read_orc + + return read_orc(*args, **kwargs) + + @staticmethod + def read_csv(*args, **kwargs): + from dask_cudf.io import read_csv + + return read_csv(*args, **kwargs) + + @staticmethod + def read_hdf(*args, **kwargs): + from dask_cudf import from_dask_dataframe + + # HDF5 reader not yet implemented in cudf + warnings.warn( + "read_hdf is not yet implemented in cudf/dask_cudf. " + "Moving to cudf from pandas. Expect poor performance!" + ) + return from_dask_dataframe( + _default_backend(dd.read_hdf, *args, **kwargs) + ) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index ee8229bc7e8..7f8876c8564 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd import pytest +from packaging import version import dask from dask import dataframe as dd @@ -31,6 +32,58 @@ def test_from_dict_backend_dispatch(): dd.assert_eq(expect, ddf) +def test_to_backend(): + np.random.seed(0) + data = { + "x": np.random.randint(0, 5, size=10000), + "y": np.random.normal(size=10000), + } + with dask.config.set({"dataframe.backend": "pandas"}): + ddf = dd.from_dict(data, npartitions=2) + assert isinstance(ddf._meta, pd.DataFrame) + + gdf = ddf.to_backend("cudf") + assert isinstance(gdf, dgd.DataFrame) + dd.assert_eq(cudf.DataFrame(data), ddf) + + assert isinstance(gdf.to_backend()._meta, pd.DataFrame) + + +def test_to_backend_kwargs(): + data = {"x": [0, 2, np.nan, 3, 4, 5]} + with dask.config.set({"dataframe.backend": "pandas"}): + dser = dd.from_dict(data, npartitions=2)["x"] + assert isinstance(dser._meta, pd.Series) + + # Using `nan_as_null=False` will result in a cudf-backed + # Series with a NaN element (ranther than ) + gser_nan = dser.to_backend("cudf", nan_as_null=False) + assert isinstance(gser_nan, dgd.Series) + assert np.isnan(gser_nan.compute()).sum() == 1 + + # Using `nan_as_null=True` will result in a cudf-backed + # Series with a element (ranther than NaN) + gser_null = dser.to_backend("cudf", nan_as_null=True) + assert isinstance(gser_null, dgd.Series) + assert np.isnan(gser_null.compute()).sum() == 0 + + # Check `nullable` argument for `cudf.Series.to_pandas` + dser_null = gser_null.to_backend("pandas", nullable=False) + assert dser_null.compute().dtype == "float" + dser_null = gser_null.to_backend("pandas", nullable=True) + assert isinstance(dser_null.compute().dtype, pd.Float64Dtype) + + # Check unsupported arguments + with pytest.raises(ValueError, match="pandas-to-cudf"): + dser.to_backend("cudf", bad_arg=True) + + with pytest.raises(ValueError, match="cudf-to-cudf"): + gser_null.to_backend("cudf", bad_arg=True) + + with pytest.raises(ValueError, match="cudf-to-pandas"): + gser_null.to_backend("pandas", bad_arg=True) + + def test_from_cudf(): np.random.seed(0) @@ -547,8 +600,6 @@ def test_unary_ops(func, gdf, gddf): # Fixed in https://github.com/dask/dask/pull/4657 if isinstance(p, cudf.Index): - from packaging import version - if version.parse(dask.__version__) < version.parse("1.1.6"): pytest.skip( "dask.dataframe assert_eq index check hardcoded to " From 8ad4166c7026482a53a60f47b56dd5e1dec1a463 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 8 Feb 2023 10:39:42 -0500 Subject: [PATCH 2/6] Remove cudf::strings::repeat_strings_output_sizes and optional parameter from cudf::strings::repeat_strings (#12609) Removes `cudf::strings::repeat_strings_output_sizes` and the optional sizes parameter from `cudf::strings::repeat_strings`. This function (and corresponding optional parameter) is no longer needed now that the internal utilities will throw an error if the column output size exceeds the maximum. Closes #12542 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12609 --- cpp/benchmarks/string/repeat_strings.cpp | 56 +----- cpp/include/cudf/strings/repeat_strings.hpp | 94 +++------- cpp/src/strings/repeat_strings.cu | 194 +++----------------- cpp/tests/strings/repeat_strings_tests.cpp | 121 +----------- 4 files changed, 56 insertions(+), 409 deletions(-) diff --git a/cpp/benchmarks/string/repeat_strings.cpp b/cpp/benchmarks/string/repeat_strings.cpp index 1844e93bc53..fe015b27f13 100644 --- a/cpp/benchmarks/string/repeat_strings.cpp +++ b/cpp/benchmarks/string/repeat_strings.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -79,42 +79,6 @@ static void BM_repeat_strings_column_times(benchmark::State& state) (strings_col.chars_size() + repeat_times_col.size() * sizeof(int32_t))); } -static void BM_compute_output_strings_sizes(benchmark::State& state) -{ - auto const n_rows = static_cast(state.range(0)); - auto const max_str_length = static_cast(state.range(1)); - auto const table = create_data_table(2, n_rows, max_str_length); - auto const strings_col = cudf::strings_column_view(table->view().column(0)); - auto const repeat_times_col = table->view().column(1); - - for ([[maybe_unused]] auto _ : state) { - [[maybe_unused]] cuda_event_timer raii(state, true, cudf::get_default_stream()); - cudf::strings::repeat_strings_output_sizes(strings_col, repeat_times_col); - } - - state.SetBytesProcessed(state.iterations() * - (strings_col.chars_size() + repeat_times_col.size() * sizeof(int32_t))); -} - -static void BM_repeat_strings_column_times_precomputed_sizes(benchmark::State& state) -{ - auto const n_rows = static_cast(state.range(0)); - auto const max_str_length = static_cast(state.range(1)); - auto const table = create_data_table(2, n_rows, max_str_length); - auto const strings_col = cudf::strings_column_view(table->view().column(0)); - auto const repeat_times_col = table->view().column(1); - [[maybe_unused]] auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(strings_col, repeat_times_col); - - for ([[maybe_unused]] auto _ : state) { - [[maybe_unused]] cuda_event_timer raii(state, true, cudf::get_default_stream()); - cudf::strings::repeat_strings(strings_col, repeat_times_col, *sizes); - } - - state.SetBytesProcessed(state.iterations() * - (strings_col.chars_size() + repeat_times_col.size() * sizeof(int32_t))); -} - static void generate_bench_args(benchmark::internal::Benchmark* b) { int const min_rows = 1 << 8; @@ -145,23 +109,5 @@ class RepeatStrings : public cudf::benchmark { ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -#define COMPUTE_OUTPUT_STRINGS_SIZES_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(RepeatStrings, name) \ - (::benchmark::State & st) { BM_compute_output_strings_sizes(st); } \ - BENCHMARK_REGISTER_F(RepeatStrings, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -#define REPEAT_STRINGS_COLUMN_TIMES_PRECOMPUTED_SIZES_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(RepeatStrings, name) \ - (::benchmark::State & st) { BM_repeat_strings_column_times_precomputed_sizes(st); } \ - BENCHMARK_REGISTER_F(RepeatStrings, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - REPEAT_STRINGS_SCALAR_TIMES_BENCHMARK_DEFINE(scalar_times) REPEAT_STRINGS_COLUMN_TIMES_BENCHMARK_DEFINE(column_times) -COMPUTE_OUTPUT_STRINGS_SIZES_BENCHMARK_DEFINE(compute_output_strings_sizes) -REPEAT_STRINGS_COLUMN_TIMES_PRECOMPUTED_SIZES_BENCHMARK_DEFINE(precomputed_sizes) diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp index 0e6ee2126d3..26fe5f95983 100644 --- a/cpp/include/cudf/strings/repeat_strings.hpp +++ b/cpp/include/cudf/strings/repeat_strings.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,15 +32,15 @@ namespace strings { */ /** - * @brief Repeat the given string scalar by a given number of times. + * @brief Repeat the given string scalar a given number of times * * An output string scalar is generated by repeating the input string by a number of times given by - * the @p `repeat_times` parameter. + * the `repeat_times` parameter. * * In special cases: - * - If @p `repeat_times` is not a positive value, an empty (valid) string scalar will be returned. + * - If `repeat_times` is not a positive value, an empty (valid) string scalar will be returned. * - An invalid input scalar will always result in an invalid output scalar regardless of the - * value of @p `repeat_times` parameter. + * value of `repeat_times` parameter. * * @code{.pseudo} * Example: @@ -50,13 +50,13 @@ namespace strings { * @endcode * * @throw cudf::logic_error if the size of the output string scalar exceeds the maximum value that - * can be stored by the index type - * (i.e., @code input.size() * repeat_times > numeric_limits::max() @endcode). + * can be stored by the index type: + * `input.size() * repeat_times > max of size_type` * - * @param input The scalar containing the string to repeat. - * @param repeat_times The number of times the input string is repeated. - * @param mr Device memory resource used to allocate the returned string scalar. - * @return New string scalar in which the input string is repeated. + * @param input The scalar containing the string to repeat + * @param repeat_times The number of times the input string is repeated + * @param mr Device memory resource used to allocate the returned string scalar + * @return New string scalar in which the input string is repeated */ std::unique_ptr repeat_string( string_scalar const& input, @@ -64,19 +64,16 @@ std::unique_ptr repeat_string( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Repeat each string in the given strings column by a given number of times. + * @brief Repeat each string in the given strings column a given number of times * - * An output strings column is generated by repeating each string from the input strings column by a - * number of times given by the @p `repeat_times` parameter. + * An output strings column is generated by repeating each string from the input strings column by + * the number of times given by the `repeat_times` parameter. * * In special cases: - * - If @p `repeat_times` is not a positive number, a non-null input string will always result in + * - If `repeat_times` is not a positive number, a non-null input string will always result in * an empty output string. * - A null input string will always result in a null output string regardless of the value of the - * @p `repeat_times` parameter. - * - * The caller is responsible for checking the output column size will not exceed the maximum size of - * a strings column (number of total characters is less than the max size_type value). + * `repeat_times` parameter. * * @code{.pseudo} * Example: @@ -85,10 +82,10 @@ std::unique_ptr repeat_string( * out is ['aaaaaa', null, '', 'bbcbbcbbc'] * @endcode * - * @param input The column containing strings to repeat. - * @param repeat_times The number of times each input string is repeated. - * @param mr Device memory resource used to allocate the returned strings column. - * @return New column containing the repeated strings. + * @param input The column containing strings to repeat + * @param repeat_times The number of times each input string is repeated + * @param mr Device memory resource used to allocate the returned strings column + * @return New column containing the repeated strings */ std::unique_ptr repeat_strings( strings_column_view const& input, @@ -97,11 +94,10 @@ std::unique_ptr repeat_strings( /** * @brief Repeat each string in the given strings column by the numbers of times given in another - * numeric column. + * numeric column * * An output strings column is generated by repeating each of the input string by a number of times - * given by the corresponding row in a @p `repeat_times` numeric column. The computational time can - * be reduced if sizes of the output strings are known and provided. + * given by the corresponding row in a `repeat_times` numeric column. * * In special cases: * - Any null row (from either the input strings column or the `repeat_times` column) will always @@ -109,9 +105,6 @@ std::unique_ptr repeat_strings( * - If any value in the `repeat_times` column is not a positive number and its corresponding input * string is not null, the output string will be an empty string. * - * The caller is responsible for checking the output column size will not exceed the maximum size of - * a strings column (number of total characters is less than the max size_type value). - * * @code{.pseudo} * Example: * strs = ['aa', null, '', 'bbc-'] @@ -120,51 +113,16 @@ std::unique_ptr repeat_strings( * out is ['aa', null, '', 'bbc-bbc-bbc-bbc-'] * @endcode * - * @throw cudf::logic_error if the input `repeat_times` column has data type other than integer. + * @throw cudf::logic_error if the input `repeat_times` is not an integer type * @throw cudf::logic_error if the input columns have different sizes. * - * @param input The column containing strings to repeat. + * @param input The column containing strings to repeat * @param repeat_times The column containing numbers of times that the corresponding input strings - * are repeated. - * @param output_strings_sizes The optional column containing pre-computed sizes of the output - * strings. - * @param mr Device memory resource used to allocate the returned strings column. + * are repeated + * @param mr Device memory resource used to allocate the returned strings column * @return New column containing the repeated strings. */ std::unique_ptr repeat_strings( - strings_column_view const& input, - column_view const& repeat_times, - std::optional output_strings_sizes = std::nullopt, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Compute sizes of the output strings if each string in the input strings column - * is repeated by the numbers of times given in another numeric column. - * - * The output column storing string output sizes is not nullable. These string sizes are - * also summed up and returned (in an `int64_t` value), which can be used to detect if the input - * strings column can be safely repeated without data corruption due to overflow in string indexing. - * - * @code{.pseudo} - * Example: - * strs = ['aa', null, '', 'bbc-'] - * repeat_times = [ 1, 2, 3, 4 ] - * [output_sizes, total_size] = repeat_strings_output_sizes(strs, repeat_times) - * out is [2, 0, 0, 16], and total_size = 18 - * @endcode - * - * @throw cudf::logic_error if the input `repeat_times` column has data type other than integer. - * @throw cudf::logic_error if the input columns have different sizes. - * - * @param input The column containing strings to repeat. - * @param repeat_times The column containing numbers of times that the corresponding input strings - * are repeated. - * @param mr Device memory resource used to allocate the returned strings column. - * @return A pair with the first item is an int32_t column containing sizes of the output strings, - * and the second item is an int64_t number containing the total sizes (in bytes) of the - * output strings column. - */ -std::pair, int64_t> repeat_strings_output_sizes( strings_column_view const& input, column_view const& repeat_times, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index cc283fbcee2..3784b535a5b 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -176,7 +176,7 @@ namespace { * separate number of times. */ template -struct compute_size_and_repeat_separately_fn { +struct compute_sizes_and_repeat_fn { column_device_view const strings_dv; column_device_view const repeat_times_dv; Iterator const repeat_times_iter; @@ -189,146 +189,63 @@ struct compute_size_and_repeat_separately_fn { // If d_chars != nullptr: only repeat strings. char* d_chars{nullptr}; - __device__ int64_t operator()(size_type const idx) const noexcept + __device__ void operator()(size_type const idx) const noexcept { auto const string_is_valid = !strings_has_nulls || strings_dv.is_valid_nocheck(idx); auto const rtimes_is_valid = !rtimes_has_nulls || repeat_times_dv.is_valid_nocheck(idx); // Any null input (either string or repeat_times value) will result in a null output. auto const is_valid = string_is_valid && rtimes_is_valid; + if (!is_valid) { + if (!d_chars) { d_offsets[idx] = 0; } + return; + } - // When the input string is null, `repeat_times` and `string_size` are also set to 0. - // This makes sure that if `repeat_times > 0` then we will always have a valid input string, - // and if `repeat_times <= 0` we will never copy anything to the output. - auto const repeat_times = is_valid ? repeat_times_iter[idx] : size_type{0}; - auto const string_size = - is_valid ? strings_dv.element(idx).size_bytes() : size_type{0}; - - // The output_size is returned, and it needs to be an int64_t number to prevent overflow. - auto const output_size = - repeat_times > 0 ? static_cast(repeat_times) * static_cast(string_size) - : int64_t{0}; + auto repeat_times = repeat_times_iter[idx]; + auto const d_str = strings_dv.element(idx); if (!d_chars) { - // If overflow happen, the stored value of output string size will be incorrect due to - // downcasting. In such cases, the entire output string size array should be discarded. - d_offsets[idx] = static_cast(output_size); - } else if (repeat_times > 0 && string_size > 0) { - auto const d_str = strings_dv.element(idx); - auto const input_ptr = d_str.data(); - auto output_ptr = d_chars + d_offsets[idx]; - for (size_type repeat_idx = 0; repeat_idx < repeat_times; ++repeat_idx) { - output_ptr = copy_and_increment(output_ptr, input_ptr, string_size); + // repeat_times could be negative + d_offsets[idx] = (repeat_times > 0) ? (repeat_times * d_str.size_bytes()) : 0; + } else { + auto output_ptr = d_chars + d_offsets[idx]; + while (repeat_times-- > 0) { + output_ptr = copy_and_increment(output_ptr, d_str.data(), d_str.size_bytes()); } } - - // The output_size value may be used to sum up to detect overflow at the caller site. - // The caller can detect overflow easily by checking `SUM(output_size) > INT_MAX`. - return output_size; } }; -/** - * @brief Creates child offsets and chars columns by applying the template function that - * can be used for computing the output size of each string as well as create the output. - * - * This function is similar to `strings::detail::make_strings_children`, except that it accepts an - * optional input `std::optional` that can contain the precomputed sizes of the output - * strings. - * - * @deprecated This will be removed with issue 12542 - */ -template -auto make_strings_children(Func fn, - size_type exec_size, - size_type strings_count, - std::optional output_strings_sizes, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto offsets_column = make_numeric_column( - data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); - - auto offsets_view = offsets_column->mutable_view(); - auto d_offsets = offsets_view.template data(); - fn.d_offsets = d_offsets; - - // This may be called twice -- once for offsets and once for chars. - auto for_each_fn = [exec_size, stream](Func& fn) { - thrust::for_each_n( - rmm::exec_policy(stream), thrust::make_counting_iterator(0), exec_size, fn); - }; - - if (!output_strings_sizes.has_value()) { - // Compute the output sizes only if they are not given. - for_each_fn(fn); - - // Compute the offsets values. - auto const bytes = - cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream); - CUDF_EXPECTS(bytes <= static_cast(std::numeric_limits::max()), - "Size of output exceeds column size limit"); - } else { - // Compute the offsets values from the provided output string sizes. - auto const string_sizes = output_strings_sizes.value(); - CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(offset_type), stream.value())); - thrust::inclusive_scan(rmm::exec_policy(stream), - string_sizes.template begin(), - string_sizes.template end(), - d_offsets + 1); - } - - // Now build the chars column - auto const bytes = cudf::detail::get_value(offsets_view, strings_count, stream); - auto chars_column = create_chars_child_column(bytes, stream, mr); - - // Execute the function fn again to fill the chars column. - // Note that if the output chars column has zero size, the function fn should not be called to - // avoid accidentally overwriting the offsets. - if (bytes > 0) { - fn.d_chars = chars_column->mutable_view().template data(); - for_each_fn(fn); - } - - return std::pair(std::move(offsets_column), std::move(chars_column)); -} - } // namespace std::unique_ptr repeat_strings(strings_column_view const& input, column_view const& repeat_times, - std::optional output_strings_sizes, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.size() == repeat_times.size(), "The input columns must have the same size."); CUDF_EXPECTS(cudf::is_index_type(repeat_times.type()), "repeat_strings expects an integer type for the `repeat_times` input column."); - if (output_strings_sizes.has_value()) { - auto const output_sizes = output_strings_sizes.value(); - CUDF_EXPECTS(input.size() == output_sizes.size() && - (!output_sizes.nullable() || !output_sizes.has_nulls()), - "The given column of output string sizes is invalid."); - } auto const strings_count = input.size(); if (strings_count == 0) { return make_empty_column(type_id::STRING); } auto const strings_dv_ptr = column_device_view::create(input.parent(), stream); auto const repeat_times_dv_ptr = column_device_view::create(repeat_times, stream); - auto const strings_has_nulls = input.has_nulls(); - auto const rtimes_has_nulls = repeat_times.has_nulls(); auto const repeat_times_iter = cudf::detail::indexalator_factory::make_input_iterator(repeat_times); - auto const fn = compute_size_and_repeat_separately_fn{ - *strings_dv_ptr, *repeat_times_dv_ptr, repeat_times_iter, strings_has_nulls, rtimes_has_nulls}; - - auto [offsets_column, chars_column] = - make_strings_children(fn, strings_count, strings_count, output_strings_sizes, stream, mr); - - // We generate new bitmask by AND of the input columns' bitmasks. - // Note that if the input columns are nullable, the output column will also be nullable (which may - // not have nulls). + auto const fn = + compute_sizes_and_repeat_fn{*strings_dv_ptr, + *repeat_times_dv_ptr, + repeat_times_iter, + input.has_nulls(), + repeat_times.has_nulls()}; + + auto [offsets_column, chars_column] = make_strings_children(fn, strings_count, stream, mr); + + // We generate new bitmask by AND of the two input columns' bitmasks. + // Note that if either of the input columns are nullable, the output column will also be nullable + // but may not have nulls. auto [null_mask, null_count] = cudf::detail::bitmask_and(table_view{{input.parent(), repeat_times}}, stream, mr); @@ -338,52 +255,6 @@ std::unique_ptr repeat_strings(strings_column_view const& input, null_count, std::move(null_mask)); } - -std::pair, int64_t> repeat_strings_output_sizes( - strings_column_view const& input, - column_view const& repeat_times, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(input.size() == repeat_times.size(), "The input columns must have the same size."); - CUDF_EXPECTS( - cudf::is_index_type(repeat_times.type()), - "repeat_strings_output_sizes expects an integer type for the `repeat_times` input column."); - - auto const strings_count = input.size(); - if (strings_count == 0) { - return std::pair(make_empty_column(type_to_id()), int64_t{0}); - } - - auto output_sizes = make_numeric_column( - data_type{type_to_id()}, strings_count, mask_state::UNALLOCATED, stream, mr); - - auto const strings_dv_ptr = column_device_view::create(input.parent(), stream); - auto const repeat_times_dv_ptr = column_device_view::create(repeat_times, stream); - auto const strings_has_nulls = input.has_nulls(); - auto const rtimes_has_nulls = repeat_times.has_nulls(); - auto const repeat_times_iter = - cudf::detail::indexalator_factory::make_input_iterator(repeat_times); - - auto const fn = compute_size_and_repeat_separately_fn{ - *strings_dv_ptr, - *repeat_times_dv_ptr, - repeat_times_iter, - strings_has_nulls, - rtimes_has_nulls, - output_sizes->mutable_view().template begin()}; - - auto const total_bytes = - thrust::transform_reduce(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - fn, - int64_t{0}, - thrust::plus{}); - - return std::pair(std::move(output_sizes), total_bytes); -} - } // namespace detail std::unique_ptr repeat_string(string_scalar const& input, @@ -404,21 +275,10 @@ std::unique_ptr repeat_strings(strings_column_view const& input, std::unique_ptr repeat_strings(strings_column_view const& input, column_view const& repeat_times, - std::optional output_strings_sizes, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::repeat_strings( - input, repeat_times, output_strings_sizes, cudf::get_default_stream(), mr); -} - -std::pair, int64_t> repeat_strings_output_sizes( - strings_column_view const& input, - column_view const& repeat_times, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::repeat_strings_output_sizes(input, repeat_times, cudf::get_default_stream(), mr); + return detail::repeat_strings(input, repeat_times, cudf::get_default_stream(), mr); } } // namespace strings diff --git a/cpp/tests/strings/repeat_strings_tests.cpp b/cpp/tests/strings/repeat_strings_tests.cpp index 69d0494c253..e75409d9f39 100644 --- a/cpp/tests/strings/repeat_strings_tests.cpp +++ b/cpp/tests/strings/repeat_strings_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -207,20 +207,6 @@ TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesInvalidInput) EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times), cudf::logic_error); } - // Sizes mismatched between strings column and output_strings_sizes column. - { - auto const repeat_times = int32s_col{1, 2}; - auto const sizes = int32s_col{1, 2, 3, 4, 5}; - EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times, sizes), cudf::logic_error); - } - - // output_strings_sizes column has nulls. - { - auto const repeat_times = int32s_col{1, 2}; - auto const sizes = int32s_col{{null, 2}, null_at(0)}; - EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times, sizes), cudf::logic_error); - } - // Invalid data type for repeat_times column. { auto const repeat_times = cudf::test::fixed_width_column_wrapper{1, 2, 3, 4, 5, 6}; @@ -243,11 +229,7 @@ TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesOverflowOutput) auto const repeat_times = int32s_col{half_max, half_max, half_max, half_max, half_max, half_max, half_max}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times); - (void)sizes; - auto const expected_bytes = static_cast(half_max) * int64_t{1 + 2 + 3 + 4 + 5 + 6 + 7}; - EXPECT_EQ(expected_bytes, total_bytes); + EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times), cudf::logic_error); } TYPED_TEST(RepeatStringsTypedTest, StringsColumnNoNullWithScalarRepeatTimes) @@ -301,15 +283,6 @@ TYPED_TEST(RepeatStringsTypedTest, StringsColumnNoNullWithColumnRepeatTimes) auto results = cudf::strings::repeat_strings(strs_cv, repeat_times); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{6, 12, 27, 0, 0}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(45, total_bytes); - - results = cudf::strings::repeat_strings(strs_cv, repeat_times, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // repeat_times column has nulls. @@ -320,15 +293,6 @@ TYPED_TEST(RepeatStringsTypedTest, StringsColumnNoNullWithColumnRepeatTimes) auto results = cudf::strings::repeat_strings(strs_cv, repeat_times); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{6, 0, 27, 12, 0}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(45, total_bytes); - - results = cudf::strings::repeat_strings(strs_cv, repeat_times, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } } @@ -377,15 +341,6 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnNoNullWithColumnRepeatTime auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{6, 12, 27}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(45, total_bytes); - - results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // Sliced the middle of the column. @@ -397,15 +352,6 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnNoNullWithColumnRepeatTime auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{12, 27}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(39, total_bytes); - - results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // Sliced the second half of the column. @@ -417,15 +363,6 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnNoNullWithColumnRepeatTime auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{27, 12, 12}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(51, total_bytes); - - results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } } @@ -520,15 +457,6 @@ TYPED_TEST(RepeatStringsTypedTest, StringsColumnWithNullsWithColumnRepeatTimes) auto results = cudf::strings::repeat_strings(strs_cv, repeat_times); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{6, 0, 18, 0, 0, 0, 12, 12, 0, 0}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(48, total_bytes); - - results = cudf::strings::repeat_strings(strs_cv, repeat_times, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // repeat_times column has nulls. @@ -549,15 +477,6 @@ TYPED_TEST(RepeatStringsTypedTest, StringsColumnWithNullsWithColumnRepeatTimes) auto results = cudf::strings::repeat_strings(strs_cv, repeat_times); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{6, 0, 0, 0, 0, 0, 12, 0, 0, 0}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(18, total_bytes); - - results = cudf::strings::repeat_strings(strs_cv, repeat_times, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } } @@ -631,15 +550,6 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnWithNullsWithColumnRepeatT auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{6, 0, 0}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(6, total_bytes); - - results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // Sliced the middle of the column. @@ -652,15 +562,6 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnWithNullsWithColumnRepeatT auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{0, 0, 0, 0, 12}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(12, total_bytes); - - results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // Sliced the second half of the column, output has nulls. @@ -672,15 +573,6 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnWithNullsWithColumnRepeatT auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{12, 0, 0, 0}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(12, total_bytes); - - results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // Sliced the second half of the column, output does not have null. @@ -693,14 +585,5 @@ TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnWithNullsWithColumnRepeatT auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_strs, *results, verbosity); - - auto const expected_sizes = int32s_col{0, 0}; - auto const [sizes, total_bytes] = - cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); - EXPECT_EQ(0, total_bytes); - - results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_strs, *results, verbosity); } } From 476d5bbf9cfdbcef024bdccc29f30cd1c6fdbc94 Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Wed, 8 Feb 2023 10:02:08 -0600 Subject: [PATCH 3/6] Handle parquet list data corner case (#12698) Fixes an issue with a particular arrangement of page data related to lists. Specifically, it is possible for page `N` to contain "0" rows because the values for the row it is a part of start on page `N-1` and end on page `N+1`. This was defeating logic in the decode kernel that would erroneously cause these values to be skipped. Similar to https://github.com/rapidsai/cudf/pull/12488 this is only reproducible with data out in the wild. In this case, we have a file that we could in theory check in to create a test with, but it is 16 MB so it's fairly large. Looking for feedback on whether this is too big. Authors: - https://github.com/nvdbaranec - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/12698 --- cpp/src/io/parquet/page_data.cu | 50 +++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 23d130e1585..ee115e7432a 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -104,20 +104,41 @@ struct page_state_s { * specified row bounds * * @param s The page to be checked - * @param min_row The starting row index + * @param start_row The starting row index * @param num_rows The number of rows * * @return True if the page spans the beginning or the end of the row bounds */ -inline __device__ bool is_bounds_page(page_state_s* const s, size_t min_row, size_t num_rows) +inline __device__ bool is_bounds_page(page_state_s* const s, size_t start_row, size_t num_rows) { size_t const page_begin = s->col.start_row + s->page.chunk_row; size_t const page_end = page_begin + s->page.num_rows; - size_t const begin = min_row; - size_t const end = min_row + num_rows; + size_t const begin = start_row; + size_t const end = start_row + num_rows; + return ((page_begin <= begin && page_end >= begin) || (page_begin <= end && page_end >= end)); } +/** + * @brief Returns whether or not a page is completely contained within the specified + * row bounds + * + * @param s The page to be checked + * @param start_row The starting row index + * @param num_rows The number of rows + * + * @return True if the page is completely contained within the row bounds + */ +inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row, size_t num_rows) +{ + size_t const page_begin = s->col.start_row + s->page.chunk_row; + size_t const page_end = page_begin + s->page.num_rows; + size_t const begin = start_row; + size_t const end = start_row + num_rows; + + return page_begin >= begin && page_end <= end; +} + /** * @brief Read a 32-bit varint integer * @@ -1728,10 +1749,11 @@ __global__ void __launch_bounds__(block_size) auto const thread_depth = depth + t; if (thread_depth < s->page.num_output_nesting_levels) { // if we are not a bounding page (as checked above) then we are either - // returning 0 rows from the page (completely outside the bounds) or all - // rows in the page (completely within the bounds) + // returning all rows/values from this page, or 0 of them pp->nesting[thread_depth].batch_size = - s->num_rows == 0 ? 0 : pp->nesting[thread_depth].size; + (s->num_rows == 0 && !is_page_contained(s, min_row, num_rows)) + ? 0 + : pp->nesting[thread_depth].size; } depth += blockDim.x; } @@ -1838,7 +1860,19 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. - if (s->num_rows == 0 && !(has_repetition && is_bounds_page(s, min_row, num_rows))) { return; } + // + // corner case: in the case of lists, we can have pages that contain "0" rows if the current row + // starts before this page and ends after this page: + // P0 P1 P2 + // |---------|---------|----------| + // ^------------------^ + // row start row end + // P1 will contain 0 rows + // + if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) || + is_page_contained(s, min_row, num_rows)))) { + return; + } if (s->dict_base) { out_thread0 = (s->dict_bits > 0) ? 64 : 32; From 89ec635dceacde2b6715af253029ef317905df4e Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Wed, 8 Feb 2023 12:17:50 -0500 Subject: [PATCH 4/6] Update shared workflow branches (#12733) This PR updates the branch reference used for our shared workflows. Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/12733 --- .github/workflows/build.yaml | 14 +++++++------- .github/workflows/pr.yaml | 26 +++++++++++++------------- .github/workflows/test.yaml | 14 +++++++------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 3366554db30..26d07515f70 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: skip_upload_pkgs: libcudf-example wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -77,7 +77,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@branch-23.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-publish.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-publish.yml@branch-23.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index cf20b0006a2..f33fc15c52f 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,32 +25,32 @@ jobs: - wheel-build-dask-cudf - wheel-tests-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.04 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.04 conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.04 with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.04 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.04 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04 with: build_type: pull-request test_script: "ci/test_python_cudf.sh" @@ -58,14 +58,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04 with: build_type: pull-request test_script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04 with: build_type: pull-request node_type: "gpu-latest-1" @@ -75,7 +75,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04 with: build_type: pull-request node_type: "gpu-latest-1" @@ -85,7 +85,7 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.04 with: build_type: pull-request package-name: cudf @@ -94,7 +94,7 @@ jobs: wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.04 with: build_type: pull-request package-name: cudf @@ -106,7 +106,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-tests-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@branch-23.04 with: build_type: pull-request package-name: dask_cudf @@ -115,7 +115,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@branch-23.04 with: build_type: pull-request package-name: dask_cudf diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 1b117bb2f4f..ff19d51f8ef 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -43,7 +43,7 @@ jobs: test_script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -78,7 +78,7 @@ jobs: test-unittest: "pytest -v -n 8 ./python/cudf/cudf/tests" wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@branch-23.04 with: build_type: nightly branch: ${{ inputs.branch }} From d3f9dafa49c973c5e5d8b8a9336bbc92555ea0c3 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 8 Feb 2023 11:41:10 -0600 Subject: [PATCH 5/6] Fix faulty conditional logic in JIT `GroupBy.apply` (#12706) Closes https://github.com/rapidsai/cudf/issues/12686 Authors: - https://github.com/brandon-b-miller Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/12706 --- python/cudf/cudf/tests/test_groupby.py | 17 +++++++++++++++++ python/cudf/udf_cpp/groupby/function.cu | 6 +++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index c5b330fd89c..1fea3c7a37e 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -519,6 +519,23 @@ def test_groupby_apply_jit_args(func, args, groupby_jit_data): run_groupby_apply_jit_test(groupby_jit_data, func, ["key1", "key2"], *args) +def test_groupby_apply_jit_block_divergence(): + # https://github.com/rapidsai/cudf/issues/12686 + df = cudf.DataFrame( + { + "a": [0, 0, 0, 1, 1, 1], + "b": [1, 1, 1, 2, 3, 4], + } + ) + + def diverging_block(grp_df): + if grp_df["a"].mean() > 0: + return grp_df["b"].mean() + return 0 + + run_groupby_apply_jit_test(df, diverging_block, ["a"]) + + @pytest.mark.parametrize("nelem", [2, 3, 100, 500, 1000]) @pytest.mark.parametrize( "func", diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index f94f99c4b49..782371b8a44 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -284,7 +284,7 @@ extern "C" { __device__ int name##_##cname(return_type* numba_return_value, type* const data, int64_t size) \ { \ return_type const res = name(data, size); \ - if (threadIdx.x == 0) { *numba_return_value = res; } \ + *numba_return_value = res; \ __syncthreads(); \ return 0; \ } @@ -309,8 +309,8 @@ extern "C" { __device__ int name##_##cname( \ int64_t* numba_return_value, type* const data, int64_t* index, int64_t size) \ { \ - auto const res = name(data, index, size); \ - if (threadIdx.x == 0) { *numba_return_value = res; } \ + auto const res = name(data, index, size); \ + *numba_return_value = res; \ __syncthreads(); \ return 0; \ } From 0161ba896a1d70ba3e049bdbb3d649cedba2aeb0 Mon Sep 17 00:00:00 2001 From: Cindy Jiang <47068112+cindyyuanjiang@users.noreply.github.com> Date: Wed, 8 Feb 2023 10:48:38 -0800 Subject: [PATCH 6/6] Add `regex_program` strings replacing java APIs and tests (#12701) This PR adds [replace_re, replace_with_backrefs](https://docs.rapids.ai/api/libcudf/nightly/replace__re_8hpp.html) related `regex_program` java APIs and unit tests. Part of work for https://github.com/NVIDIA/spark-rapids/issues/7295. Authors: - Cindy Jiang (https://github.com/cindyyuanjiang) Approvers: - Jason Lowe (https://github.com/jlowe) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/12701 --- .../main/java/ai/rapids/cudf/ColumnView.java | 71 +++++++++++--- java/src/main/native/src/ColumnViewJni.cpp | 43 ++++---- .../java/ai/rapids/cudf/ColumnVectorTest.java | 98 ++++++++++++------- 3 files changed, 149 insertions(+), 63 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 2d0bf28225f..0cb9ed37d9f 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2922,8 +2922,21 @@ public final ColumnVector stringReplace(Scalar target, Scalar replace) { * @param repl The string scalar to replace for each pattern match. * @return A new column vector containing the string results. */ + @Deprecated public final ColumnVector replaceRegex(String pattern, Scalar repl) { - return replaceRegex(pattern, repl, -1); + return replaceRegex(new RegexProgram(pattern, CaptureGroups.NON_CAPTURE), repl); + } + + /** + * For each string, replaces any character sequence matching the given regex program pattern + * using the replacement string scalar. + * + * @param regexProg The regex program with pattern to search within each string. + * @param repl The string scalar to replace for each pattern match. + * @return A new column vector containing the string results. + */ + public final ColumnVector replaceRegex(RegexProgram regexProg, Scalar repl) { + return replaceRegex(regexProg, repl, -1); } /** @@ -2935,12 +2948,27 @@ public final ColumnVector replaceRegex(String pattern, Scalar repl) { * @param maxRepl The maximum number of times a replacement should occur within each string. * @return A new column vector containing the string results. */ + @Deprecated public final ColumnVector replaceRegex(String pattern, Scalar repl, int maxRepl) { + return replaceRegex(new RegexProgram(pattern, CaptureGroups.NON_CAPTURE), repl, maxRepl); + } + + /** + * For each string, replaces any character sequence matching the given regex program pattern + * using the replacement string scalar. + * + * @param regexProg The regex program with pattern to search within each string. + * @param repl The string scalar to replace for each pattern match. + * @param maxRepl The maximum number of times a replacement should occur within each string. + * @return A new column vector containing the string results. + */ + public final ColumnVector replaceRegex(RegexProgram regexProg, Scalar repl, int maxRepl) { if (!repl.getType().equals(DType.STRING)) { throw new IllegalArgumentException("Replacement must be a string scalar"); } - return new ColumnVector(replaceRegex(getNativeView(), pattern, repl.getScalarHandle(), - maxRepl)); + assert regexProg != null : "regex program may not be null"; + return new ColumnVector(replaceRegex(getNativeView(), regexProg.pattern(), regexProg.combinedFlags(), + regexProg.capture().nativeId, repl.getScalarHandle(), maxRepl)); } /** @@ -2966,9 +2994,26 @@ public final ColumnVector replaceMultiRegex(String[] patterns, ColumnView repls) * @param replace The replacement template for creating the output string. * @return A new java column vector containing the string results. */ + @Deprecated public final ColumnVector stringReplaceWithBackrefs(String pattern, String replace) { - return new ColumnVector(stringReplaceWithBackrefs(getNativeView(), pattern, - replace)); + return stringReplaceWithBackrefs(new RegexProgram(pattern), replace); + } + + /** + * For each string, replaces any character sequence matching the given regex program + * pattern using the replace template for back-references. + * + * Any null string entries return corresponding null output column entries. + * + * @param regexProg The regex program with pattern to search within each string. + * @param replace The replacement template for creating the output string. + * @return A new java column vector containing the string results. + */ + public final ColumnVector stringReplaceWithBackrefs(RegexProgram regexProg, String replace) { + assert regexProg != null : "regex program may not be null"; + return new ColumnVector( + stringReplaceWithBackrefs(getNativeView(), regexProg.pattern(), regexProg.combinedFlags(), + regexProg.capture().nativeId, replace)); } /** @@ -4129,12 +4174,14 @@ private static native long substringColumn(long columnView, long startColumn, lo * Native method for replacing each regular expression pattern match with the specified * replacement string. * @param columnView native handle of the cudf::column_view being operated on. - * @param pattern The regular expression pattern to search within each string. + * @param pattern regular expression pattern to search within each string. + * @param flags regex flags setting. + * @param capture capture groups setting. * @param repl native handle of the cudf::scalar containing the replacement string. * @param maxRepl maximum number of times to replace the pattern within a string * @return native handle of the resulting cudf column containing the string results. */ - private static native long replaceRegex(long columnView, String pattern, + private static native long replaceRegex(long columnView, String pattern, int flags, int capture, long repl, long maxRepl) throws CudfException; /** @@ -4148,15 +4195,17 @@ private static native long replaceMultiRegex(long columnView, String[] patterns, long repls) throws CudfException; /** - * Native method for replacing any character sequence matching the given pattern - * using the replace template for back-references. + * Native method for replacing any character sequence matching the given regex program + * pattern using the replace template for back-references. * @param columnView native handle of the cudf::column_view being operated on. * @param pattern The regular expression patterns to search within each string. + * @param flags Regex flags setting. + * @param capture Capture groups setting. * @param replace The replacement template for creating the output string. * @return native handle of the resulting cudf column containing the string results. */ - private static native long stringReplaceWithBackrefs(long columnView, String pattern, - String replace) throws CudfException; + private static native long stringReplaceWithBackrefs(long columnView, String pattern, int flags, + int capture, String replace) throws CudfException; /** * Native method for checking if strings in a column starts with a specified comparison string. diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 958efd364ed..c42cc430560 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1606,21 +1606,24 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_mapContains(JNIEnv *env, CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceRegex(JNIEnv *env, jclass, - jlong j_column_view, - jstring j_pattern, jlong j_repl, - jlong j_maxrepl) { +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceRegex( + JNIEnv *env, jclass, jlong j_column_view, jstring j_pattern, jint regex_flags, + jint capture_groups, jlong j_repl, jlong j_maxrepl) { JNI_NULL_CHECK(env, j_column_view, "column is null", 0); JNI_NULL_CHECK(env, j_pattern, "pattern string is null", 0); JNI_NULL_CHECK(env, j_repl, "replace scalar is null", 0); try { cudf::jni::auto_set_device(env); - auto cv = reinterpret_cast(j_column_view); - cudf::strings_column_view scv(*cv); - cudf::jni::native_jstring pattern(env, j_pattern); - auto repl = reinterpret_cast(j_repl); - return release_as_jlong(cudf::strings::replace_re(scv, pattern.get(), *repl, j_maxrepl)); + auto const cv = reinterpret_cast(j_column_view); + auto const strings_column = cudf::strings_column_view{*cv}; + auto const pattern = cudf::jni::native_jstring(env, j_pattern); + auto const flags = static_cast(regex_flags); + auto const groups = static_cast(capture_groups); + auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups); + auto const repl = reinterpret_cast(j_repl); + return release_as_jlong( + cudf::strings::replace_re(strings_column, *regex_prog, *repl, j_maxrepl)); } CATCH_STD(env, 0); } @@ -1646,19 +1649,23 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceMultiRegex(JNIEnv } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringReplaceWithBackrefs( - JNIEnv *env, jclass, jlong column_view, jstring patternObj, jstring replaceObj) { + JNIEnv *env, jclass, jlong j_column_view, jstring pattern_obj, jint regex_flags, + jint capture_groups, jstring replace_obj) { - JNI_NULL_CHECK(env, column_view, "column is null", 0); - JNI_NULL_CHECK(env, patternObj, "pattern string is null", 0); - JNI_NULL_CHECK(env, replaceObj, "replace string is null", 0); + JNI_NULL_CHECK(env, j_column_view, "column is null", 0); + JNI_NULL_CHECK(env, pattern_obj, "pattern string is null", 0); + JNI_NULL_CHECK(env, replace_obj, "replace string is null", 0); try { cudf::jni::auto_set_device(env); - cudf::column_view *cv = reinterpret_cast(column_view); - cudf::strings_column_view scv(*cv); - cudf::jni::native_jstring ss_pattern(env, patternObj); - cudf::jni::native_jstring ss_replace(env, replaceObj); + auto const cv = reinterpret_cast(j_column_view); + auto const strings_column = cudf::strings_column_view{*cv}; + auto const pattern = cudf::jni::native_jstring(env, pattern_obj); + auto const flags = static_cast(regex_flags); + auto const groups = static_cast(capture_groups); + auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups); + cudf::jni::native_jstring ss_replace(env, replace_obj); return release_as_jlong( - cudf::strings::replace_with_backrefs(scv, ss_pattern.get(), ss_replace.get())); + cudf::strings::replace_with_backrefs(strings_column, *regex_prog, ss_replace.get())); } CATCH_STD(env, 0); } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index ab4baf74277..db64dcb08c7 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -5147,29 +5147,42 @@ void teststringReplaceThrowsException() { @Test void testReplaceRegex() { - try (ColumnVector v = - ColumnVector.fromStrings("title and Title with title", "nothing", null, "Title"); - Scalar repl = Scalar.fromString("Repl"); - ColumnVector actual = v.replaceRegex("[tT]itle", repl); - ColumnVector expected = - ColumnVector.fromStrings("Repl and Repl with Repl", "nothing", null, "Repl")) { - assertColumnsAreEqual(expected, actual); - } + try (ColumnVector v = ColumnVector.fromStrings("title and Title with title", "nothing", null, "Title"); + Scalar repl = Scalar.fromString("Repl")) { + String pattern = "[tT]itle"; + RegexProgram regexProg = new RegexProgram(pattern, CaptureGroups.NON_CAPTURE); - try (ColumnVector v = - ColumnVector.fromStrings("title and Title with title", "nothing", null, "Title"); - Scalar repl = Scalar.fromString("Repl"); - ColumnVector actual = v.replaceRegex("[tT]itle", repl, 0)) { - assertColumnsAreEqual(v, actual); - } + try (ColumnVector actual = v.replaceRegex(pattern, repl); + ColumnVector expected = + ColumnVector.fromStrings("Repl and Repl with Repl", "nothing", null, "Repl")) { + assertColumnsAreEqual(expected, actual); + } - try (ColumnVector v = - ColumnVector.fromStrings("title and Title with title", "nothing", null, "Title"); - Scalar repl = Scalar.fromString("Repl"); - ColumnVector actual = v.replaceRegex("[tT]itle", repl, 1); - ColumnVector expected = - ColumnVector.fromStrings("Repl and Title with title", "nothing", null, "Repl")) { - assertColumnsAreEqual(expected, actual); + try (ColumnVector actual = v.replaceRegex(pattern, repl, 0)) { + assertColumnsAreEqual(v, actual); + } + + try (ColumnVector actual = v.replaceRegex(pattern, repl, 1); + ColumnVector expected = + ColumnVector.fromStrings("Repl and Title with title", "nothing", null, "Repl")) { + assertColumnsAreEqual(expected, actual); + } + + try (ColumnVector actual = v.replaceRegex(regexProg, repl); + ColumnVector expected = + ColumnVector.fromStrings("Repl and Repl with Repl", "nothing", null, "Repl")) { + assertColumnsAreEqual(expected, actual); + } + + try (ColumnVector actual = v.replaceRegex(regexProg, repl, 0)) { + assertColumnsAreEqual(v, actual); + } + + try (ColumnVector actual = v.replaceRegex(regexProg, repl, 1); + ColumnVector expected = + ColumnVector.fromStrings("Repl and Title with title", "nothing", null, "Repl")) { + assertColumnsAreEqual(expected, actual); + } } } @@ -5188,45 +5201,55 @@ void testReplaceMultiRegex() { @Test void testStringReplaceWithBackrefs() { - try (ColumnVector v = ColumnVector.fromStrings("

title

", "

another title

", - null); + try (ColumnVector v = ColumnVector.fromStrings("

title

", "

another title

", null); ColumnVector expected = ColumnVector.fromStrings("

title

", "

another title

", null); - ColumnVector actual = v.stringReplaceWithBackrefs("

(.*)

", "

\\1

")) { + ColumnVector actual = v.stringReplaceWithBackrefs("

(.*)

", "

\\1

"); + ColumnVector actualRe = + v.stringReplaceWithBackrefs(new RegexProgram("

(.*)

"), "

\\1

")) { assertColumnsAreEqual(expected, actual); + assertColumnsAreEqual(expected, actualRe); } try (ColumnVector v = ColumnVector.fromStrings("2020-1-01", "2020-2-02", null); ColumnVector expected = ColumnVector.fromStrings("2020-01-01", "2020-02-02", null); - ColumnVector actual = v.stringReplaceWithBackrefs("-([0-9])-", "-0\\1-")) { + ColumnVector actual = v.stringReplaceWithBackrefs("-([0-9])-", "-0\\1-"); + ColumnVector actualRe = + v.stringReplaceWithBackrefs(new RegexProgram("-([0-9])-"), "-0\\1-")) { assertColumnsAreEqual(expected, actual); + assertColumnsAreEqual(expected, actualRe); } - try (ColumnVector v = ColumnVector.fromStrings("2020-01-1", "2020-02-2", - "2020-03-3invalid", null); + try (ColumnVector v = ColumnVector.fromStrings("2020-01-1", "2020-02-2", "2020-03-3invalid", null); ColumnVector expected = ColumnVector.fromStrings("2020-01-01", "2020-02-02", "2020-03-3invalid", null); - ColumnVector actual = v.stringReplaceWithBackrefs( - "-([0-9])$", "-0\\1")) { + ColumnVector actual = v.stringReplaceWithBackrefs("-([0-9])$", "-0\\1"); + ColumnVector actualRe = + v.stringReplaceWithBackrefs(new RegexProgram("-([0-9])$"), "-0\\1")) { assertColumnsAreEqual(expected, actual); + assertColumnsAreEqual(expected, actualRe); } try (ColumnVector v = ColumnVector.fromStrings("2020-01-1 random_text", "2020-02-2T12:34:56", - "2020-03-3invalid", null); + "2020-03-3invalid", null); ColumnVector expected = ColumnVector.fromStrings("2020-01-01 random_text", "2020-02-02T12:34:56", "2020-03-3invalid", null); - ColumnVector actual = v.stringReplaceWithBackrefs( - "-([0-9])([ T])", "-0\\1\\2")) { + ColumnVector actual = v.stringReplaceWithBackrefs("-([0-9])([ T])", "-0\\1\\2"); + ColumnVector actualRe = + v.stringReplaceWithBackrefs(new RegexProgram("-([0-9])([ T])"), "-0\\1\\2")) { assertColumnsAreEqual(expected, actual); + assertColumnsAreEqual(expected, actualRe); } // test zero as group index try (ColumnVector v = ColumnVector.fromStrings("aa-11 b2b-345", "aa-11a 1c-2b2 b2-c3", "11-aa", null); ColumnVector expected = ColumnVector.fromStrings("aa-11:aa:11; b2b-345:b:345;", "aa-11:aa:11;a 1c-2:c:2;b2 b2-c3", "11-aa", null); - ColumnVector actual = v.stringReplaceWithBackrefs( - "([a-z]+)-([0-9]+)", "${0}:${1}:${2};")) { + ColumnVector actual = v.stringReplaceWithBackrefs("([a-z]+)-([0-9]+)", "${0}:${1}:${2};"); + ColumnVector actualRe = + v.stringReplaceWithBackrefs(new RegexProgram("([a-z]+)-([0-9]+)"), "${0}:${1}:${2};")) { assertColumnsAreEqual(expected, actual); + assertColumnsAreEqual(expected, actualRe); } // group index exceeds group count @@ -5236,6 +5259,13 @@ void testStringReplaceWithBackrefs() { } }); + // group index exceeds group count + assertThrows(CudfException.class, () -> { + try (ColumnVector v = ColumnVector.fromStrings("ABC123defgh"); + ColumnVector r = + v.stringReplaceWithBackrefs(new RegexProgram("([A-Z]+)([0-9]+)([a-z]+)"), "\\4")) { + } + }); } @Test