From daea0dd6e112a4ccf7fe320fdf8fdc2631a293ef Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 29 Mar 2022 15:02:28 -0700 Subject: [PATCH 01/11] Pin click version to last support by black<22.3.0. --- .pre-commit-config.yaml | 2 ++ conda/environments/cudf_dev_cuda11.5.yml | 1 + 2 files changed, 3 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9e72c0119f3..65b61aa769f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,6 +32,8 @@ repos: hooks: - id: black files: python/.* + additional_dependencies: + - click==8.0.4 - repo: https://github.com/PyCQA/flake8 rev: 3.8.3 hooks: diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index 0970612c970..1fecda80da6 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -10,6 +10,7 @@ channels: dependencies: - clang=11.1.0 - clang-tools=11.1.0 + - click=8.0.4 - cupy>=9.5.0,<11.0.0a0 - rmm=22.04.* - cmake>=3.20.1 From e8dc00cb18720d43c7e1c080efb6b8bf865d46c9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 29 Mar 2022 15:08:39 -0700 Subject: [PATCH 02/11] Fix formatting. --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 65b61aa769f..dbe02c85341 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,8 +32,8 @@ repos: hooks: - id: black files: python/.* - additional_dependencies: - - click==8.0.4 + additional_dependencies: + - click==8.0.4 - repo: https://github.com/PyCQA/flake8 rev: 3.8.3 hooks: From bc6239b2738b17ba5f4e59a966ea3b41f0fca791 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 29 Mar 2022 15:11:13 -0700 Subject: [PATCH 03/11] Remove deprecated `decimal_cols_as_float` in the ORC reader (#10515) Closes #10129 Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/10515 --- cpp/include/cudf/io/orc.hpp | 38 ------------------- cpp/src/io/orc/reader_impl.cu | 49 ++++++++++-------------- cpp/src/io/orc/reader_impl.hpp | 3 +- cpp/src/io/orc/stripe_data.cu | 56 +++++++++------------------- cpp/tests/io/orc_test.cpp | 12 +----- python/cudf/cudf/_lib/cpp/io/orc.pxd | 6 +-- python/cudf/cudf/_lib/orc.pyx | 10 +---- python/cudf/cudf/io/orc.py | 8 ---- python/cudf/cudf/tests/test_orc.py | 27 +------------- python/cudf/cudf/utils/ioutils.py | 3 -- 10 files changed, 44 insertions(+), 168 deletions(-) diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index c2187f056cf..9e8fd1244d0 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -67,9 +67,6 @@ class orc_reader_options { // Cast timestamp columns to a specific type data_type _timestamp_type{type_id::EMPTY}; - // Columns that should be converted from Decimal to Float64 - std::vector _decimal_cols_as_float; - // Columns that should be read as Decimal128 std::vector _decimal128_columns; @@ -138,14 +135,6 @@ class orc_reader_options { */ data_type get_timestamp_type() const { return _timestamp_type; } - /** - * @brief Fully qualified names of columns that should be converted from Decimal to Float64. - */ - std::vector const& get_decimal_cols_as_float() const - { - return _decimal_cols_as_float; - } - /** * @brief Fully qualified names of columns that should be read as 128-bit Decimal. */ @@ -215,18 +204,6 @@ class orc_reader_options { */ void set_timestamp_type(data_type type) { _timestamp_type = type; } - /** - * @brief Set columns that should be converted from Decimal to Float64 - * - * @param val Vector of fully qualified column names. - */ - [[deprecated( - "Decimal to float conversion is deprecated and will be remove in future release")]] void - set_decimal_cols_as_float(std::vector val) - { - _decimal_cols_as_float = std::move(val); - } - /** * @brief Set columns that should be read as 128-bit Decimal * @@ -340,21 +317,6 @@ class orc_reader_options_builder { return *this; } - /** - * @brief Columns that should be converted from decimals to float64. - * - * @param val Vector of column names. - * @return this for chaining. - */ - [[deprecated( - "Decimal to float conversion is deprecated and will be remove in future " - "release")]] orc_reader_options_builder& - decimal_cols_as_float(std::vector val) - { - options._decimal_cols_as_float = std::move(val); - return *this; - } - /** * @brief Columns that should be read as 128-bit Decimal * diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 081ae69e48f..7f9badad9a9 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -231,30 +231,22 @@ size_t gather_stream_info(const size_t stripe_index, /** * @brief Determines cuDF type of an ORC Decimal column. */ -auto decimal_column_type(std::vector const& float64_columns, - std::vector const& decimal128_columns, +auto decimal_column_type(std::vector const& decimal128_columns, cudf::io::orc::detail::aggregate_orc_metadata const& metadata, int column_index) { - if (metadata.get_col_type(column_index).kind != DECIMAL) return type_id::EMPTY; + if (metadata.get_col_type(column_index).kind != DECIMAL) { return type_id::EMPTY; } - auto const& column_path = metadata.column_path(0, column_index); - auto is_column_in = [&](const std::vector& cols) { - return std::find(cols.cbegin(), cols.cend(), column_path) != cols.end(); - }; - - auto const user_selected_float64 = is_column_in(float64_columns); - auto const user_selected_decimal128 = is_column_in(decimal128_columns); - CUDF_EXPECTS(not user_selected_float64 or not user_selected_decimal128, - "Both decimal128 and float64 types selected for column " + column_path); - - if (user_selected_float64) return type_id::FLOAT64; - if (user_selected_decimal128) return type_id::DECIMAL128; + if (std::find(decimal128_columns.cbegin(), + decimal128_columns.cend(), + metadata.column_path(0, column_index)) != decimal128_columns.end()) { + return type_id::DECIMAL128; + } auto const precision = metadata.get_col_type(column_index) .precision.value_or(cuda::std::numeric_limits::digits10); - if (precision <= cuda::std::numeric_limits::digits10) return type_id::DECIMAL32; - if (precision <= cuda::std::numeric_limits::digits10) return type_id::DECIMAL64; + if (precision <= cuda::std::numeric_limits::digits10) { return type_id::DECIMAL32; } + if (precision <= cuda::std::numeric_limits::digits10) { return type_id::DECIMAL64; } return type_id::DECIMAL128; } @@ -786,12 +778,11 @@ std::unique_ptr reader::impl::create_empty_column(const size_type orc_co rmm::cuda_stream_view stream) { schema_info.name = _metadata.column_name(0, orc_col_id); - auto const type = to_type_id( - _metadata.get_schema(orc_col_id), - _use_np_dtypes, - _timestamp_type.id(), - decimal_column_type(_decimal_cols_as_float, decimal128_columns, _metadata, orc_col_id)); - int32_t scale = 0; + auto const type = to_type_id(_metadata.get_schema(orc_col_id), + _use_np_dtypes, + _timestamp_type.id(), + decimal_column_type(decimal128_columns, _metadata, orc_col_id)); + int32_t scale = 0; std::vector> child_columns; std::unique_ptr out_col = nullptr; auto kind = _metadata.get_col_type(orc_col_id).kind; @@ -933,8 +924,7 @@ reader::impl::impl(std::vector>&& sources, _use_np_dtypes = options.is_enabled_use_np_dtypes(); // Control decimals conversion - _decimal_cols_as_float = options.get_decimal_cols_as_float(); - decimal128_columns = options.get_decimal128_columns(); + decimal128_columns = options.get_decimal128_columns(); } timezone_table reader::impl::compute_timezone_table( @@ -994,11 +984,10 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Get a list of column data types std::vector column_types; for (auto& col : columns_level) { - auto col_type = to_type_id( - _metadata.get_col_type(col.id), - _use_np_dtypes, - _timestamp_type.id(), - decimal_column_type(_decimal_cols_as_float, decimal128_columns, _metadata, col.id)); + auto col_type = to_type_id(_metadata.get_col_type(col.id), + _use_np_dtypes, + _timestamp_type.id(), + decimal_column_type(decimal128_columns, _metadata, col.id)); CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or col_type == type_id::DECIMAL128) { diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 1e586bcde00..103093f055f 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -221,7 +221,6 @@ class reader::impl { bool _use_index{true}; bool _use_np_dtypes{true}; - std::vector _decimal_cols_as_float; std::vector decimal128_columns; data_type _timestamp_type{type_id::EMPTY}; reader_column_meta _col_meta{}; diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index dc09b3e7dd8..b4cbb5d9037 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -962,15 +962,6 @@ static __device__ uint32_t Byte_RLE(orc_bytestream_s* bs, return rle->num_vals; } -/** - * @brief Powers of 10 - */ -static const __device__ __constant__ double kPow10[40] = { - 1.0, 1.e1, 1.e2, 1.e3, 1.e4, 1.e5, 1.e6, 1.e7, 1.e8, 1.e9, 1.e10, 1.e11, 1.e12, 1.e13, - 1.e14, 1.e15, 1.e16, 1.e17, 1.e18, 1.e19, 1.e20, 1.e21, 1.e22, 1.e23, 1.e24, 1.e25, 1.e26, 1.e27, - 1.e28, 1.e29, 1.e30, 1.e31, 1.e32, 1.e33, 1.e34, 1.e35, 1.e36, 1.e37, 1.e38, 1.e39, -}; - static const __device__ __constant__ int64_t kPow5i[28] = {1, 5, 25, @@ -1045,34 +1036,24 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs, auto const pos = static_cast(vals.i64[2 * t]); __int128_t v = decode_varint128(bs, pos); - if (dtype_id == type_id::FLOAT64) { - double f = v; - int32_t scale = (t < numvals) ? val_scale : 0; - if (scale >= 0) - vals.f64[t] = f / kPow10[min(scale, 39)]; - else - vals.f64[t] = f * kPow10[min(-scale, 39)]; - } else { - auto const scaled_value = [&]() { - // Since cuDF column stores just one scale, value needs to be adjusted to col_scale from - // val_scale. So the difference of them will be used to add 0s or remove digits. - int32_t scale = (t < numvals) ? col_scale - val_scale : 0; - if (scale >= 0) { - scale = min(scale, 27); - return (v * kPow5i[scale]) << scale; - } else // if (scale < 0) - { - scale = min(-scale, 27); - return (v / kPow5i[scale]) >> scale; - } - }(); - if (dtype_id == type_id::DECIMAL32) { - vals.i32[t] = scaled_value; - } else if (dtype_id == type_id::DECIMAL64) { - vals.i64[t] = scaled_value; + auto const scaled_value = [&]() { + // Since cuDF column stores just one scale, value needs to be adjusted to col_scale from + // val_scale. So the difference of them will be used to add 0s or remove digits. + int32_t const scale = (t < numvals) ? col_scale - val_scale : 0; + if (scale >= 0) { + auto const abs_scale = min(scale, 27); + return (v * kPow5i[abs_scale]) << abs_scale; } else { - vals.i128[t] = scaled_value; + auto const abs_scale = min(-scale, 27); + return (v / kPow5i[abs_scale]) >> abs_scale; } + }(); + if (dtype_id == type_id::DECIMAL32) { + vals.i32[t] = scaled_value; + } else if (dtype_id == type_id::DECIMAL64) { + vals.i64[t] = scaled_value; + } else { + vals.i128[t] = scaled_value; } } // There is nothing to read, so break @@ -1711,8 +1692,7 @@ __global__ void __launch_bounds__(block_size) case DECIMAL: if (s->chunk.dtype_id == type_id::DECIMAL32) { static_cast(data_out)[row] = s->vals.u32[t + vals_skipped]; - } else if (s->chunk.dtype_id == type_id::FLOAT64 or - s->chunk.dtype_id == type_id::DECIMAL64) { + } else if (s->chunk.dtype_id == type_id::DECIMAL64) { static_cast(data_out)[row] = s->vals.u64[t + vals_skipped]; } else { // decimal128 diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index bac5bf1f55b..5823a859f7b 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1421,17 +1421,9 @@ TEST_F(OrcReaderTest, DecimalOptions) cudf_io::orc_reader_options valid_opts = cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}) - .decimal128_columns({"dec", "fake_name"}) - .decimal_cols_as_float({"decc", "fake_name"}); - // Should not throw, even with "fake name" in both options + .decimal128_columns({"dec", "fake_name"}); + // Should not throw, even with "fake name" EXPECT_NO_THROW(cudf_io::read_orc(valid_opts)); - - cudf_io::orc_reader_options invalid_opts = - cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}) - .decimal128_columns({"dec", "fake_name"}) - .decimal_cols_as_float({"dec", "fake_name"}); - // Should throw, options overlap - EXPECT_THROW(cudf_io::read_orc(invalid_opts), cudf::logic_error); } TEST_F(OrcWriterTest, DecimalOptionsNested) diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index 0c2f971a26c..62ff5eb4f53 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport uint8_t from libcpp cimport bool @@ -36,7 +36,6 @@ cdef extern from "cudf/io/orc.hpp" \ void enable_use_index(bool val) except+ void enable_use_np_dtypes(bool val) except+ void set_timestamp_type(data_type type) except+ - void set_decimal_cols_as_float(vector[string] val) except+ @staticmethod orc_reader_options_builder builder( @@ -55,9 +54,6 @@ cdef extern from "cudf/io/orc.hpp" \ orc_reader_options_builder& use_index(bool val) except+ orc_reader_options_builder& use_np_dtypes(bool val) except+ orc_reader_options_builder& timestamp_type(data_type type) except+ - orc_reader_options_builder& decimal_cols_as_float( - vector[string] val - ) except+ orc_reader_options build() except+ diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 127e3a612dc..8331f9c3d17 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -93,7 +93,6 @@ cpdef read_orc(object filepaths_or_buffers, object skip_rows=None, object num_rows=None, bool use_index=True, - object decimal_cols_as_float=None, object timestamp_type=None): """ Cython function to call into libcudf API, see `read_orc`. @@ -120,7 +119,6 @@ cpdef read_orc(object filepaths_or_buffers, ) ), use_index, - decimal_cols_as_float or [], ) cdef table_with_metadata c_result @@ -319,8 +317,7 @@ cdef orc_reader_options make_orc_reader_options( size_type skip_rows, size_type num_rows, type_id timestamp_type, - bool use_index, - object decimal_cols_as_float + bool use_index ) except*: for i, datasource in enumerate(filepaths_or_buffers): @@ -333,10 +330,6 @@ cdef orc_reader_options make_orc_reader_options( c_column_names.push_back(str(col).encode()) cdef orc_reader_options opts cdef source_info src = make_source_info(filepaths_or_buffers) - cdef vector[string] c_decimal_cols_as_float - c_decimal_cols_as_float.reserve(len(decimal_cols_as_float)) - for decimal_col in decimal_cols_as_float: - c_decimal_cols_as_float.push_back(str(decimal_col).encode()) opts = move( orc_reader_options.builder(src) .columns(c_column_names) @@ -345,7 +338,6 @@ cdef orc_reader_options make_orc_reader_options( .num_rows(num_rows) .timestamp_type(data_type(timestamp_type)) .use_index(use_index) - .decimal_cols_as_float(c_decimal_cols_as_float) .build() ) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 0ac0e02e4d1..6a2ffef52db 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -287,18 +287,11 @@ def read_orc( skiprows=None, num_rows=None, use_index=True, - decimal_cols_as_float=None, timestamp_type=None, use_python_file_object=True, **kwargs, ): """{docstring}""" - if decimal_cols_as_float is not None: - warnings.warn( - "`decimal_cols_as_float` is deprecated and will be removed in " - "the future", - FutureWarning, - ) from cudf import DataFrame # Multiple sources are passed as a list. If a single source is passed, @@ -365,7 +358,6 @@ def read_orc( skiprows, num_rows, use_index, - decimal_cols_as_float, timestamp_type, ) ) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 62715ad7580..5082fb08b92 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1266,10 +1266,7 @@ def test_map_type_read(columns, num_rows, use_index): assert_eq(expected_tbl.to_pandas(), gdf) -@pytest.mark.parametrize( - "data", [["_col0"], ["FakeName", "_col0", "TerriblyFakeColumnName"]] -) -def test_orc_reader_decimal(datadir, data): +def test_orc_reader_decimal(datadir): path = datadir / "TestOrcFile.decimal.orc" try: orcfile = pa.orc.ORCFile(path) @@ -1277,28 +1274,8 @@ def test_orc_reader_decimal(datadir, data): pytest.skip(".orc file is not found: %s" % e) pdf = orcfile.read().to_pandas() - gdf = cudf.read_orc(path, decimal_cols_as_float=data).to_pandas() - - # Convert the decimal dtype from PyArrow to float64 for comparison to cuDF - # This is because cuDF returns as float64 - pdf = pdf.apply(pd.to_numeric) - - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize("data", [["InvalidColumnName"]]) -def test_orc_reader_decimal_invalid_column(datadir, data): - path = datadir / "TestOrcFile.decimal.orc" - try: - orcfile = pa.orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - - pdf = orcfile.read().to_pandas() - gdf = cudf.read_orc(path, decimal_cols_as_float=data).to_pandas() + gdf = cudf.read_orc(path).to_pandas() - # Since the `decimal_cols_as_float` column name - # is invalid, this should be a decimal assert_eq(pdf, gdf) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index cfe1957dfd6..5f348563243 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -392,9 +392,6 @@ If not None, the total number of rows to read. use_index : bool, default True If True, use row index if available for faster seeking. -decimal_cols_as_float: list, default None - If specified, names of the columns that should be converted from - Decimal to Float64 in the resulting dataframe. use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec AbstractBufferedFile objects at IO time. This option is likely to improve From 25e5e7367392bcf79687e68384609c1bb936467f Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 29 Mar 2022 20:29:38 -0500 Subject: [PATCH 04/11] Update build.sh --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 3534f92b8c3..1c0b07da1d6 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -31,7 +31,7 @@ export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` # Dask & Distributed option to install main(nightly) or `conda-forge` packages. -export INSTALL_DASK_MAIN=1 +export INSTALL_DASK_MAIN=0 # ucx-py version export UCX_PY_VERSION='0.25.*' From d30b4f64524391ba707f6174207fb3d84fdc7746 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 29 Mar 2022 20:30:14 -0500 Subject: [PATCH 05/11] Update build.sh --- ci/benchmark/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index 6bad2342012..31d080e95d7 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -37,7 +37,7 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/" export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" # Dask & Distributed option to install main(nightly) or `conda-forge` packages. -export INSTALL_DASK_MAIN=1 +export INSTALL_DASK_MAIN=0 function remove_libcudf_kernel_cache_dir { EXITCODE=$? From 1cccc29ec8b0a7e17e0ed4d17021a7ad2fbd43c6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 30 Mar 2022 10:35:53 -0700 Subject: [PATCH 06/11] Pin CMake to prevent 3.23 bugs. --- conda/environments/cudf_dev_cuda11.5.yml | 2 +- conda/recipes/libcudf/meta.yaml | 2 +- conda/recipes/libcudf_example/meta.yaml | 4 ++-- conda/recipes/libcudf_kafka/meta.yaml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index 1fecda80da6..694bea0350b 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -13,7 +13,7 @@ dependencies: - click=8.0.4 - cupy>=9.5.0,<11.0.0a0 - rmm=22.04.* - - cmake>=3.20.1 + - cmake>=3.20.1,<3.23 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 - numba>=0.54 diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 4ea4ace11da..93225267ad6 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -36,7 +36,7 @@ build: requirements: build: - - cmake >=3.20.1 + - cmake >=3.20.1,<3.23 host: - librmm {{ minor_version }}.* - cudatoolkit {{ cuda_version }}.* diff --git a/conda/recipes/libcudf_example/meta.yaml b/conda/recipes/libcudf_example/meta.yaml index c20a62c44c7..f258fd0cc74 100644 --- a/conda/recipes/libcudf_example/meta.yaml +++ b/conda/recipes/libcudf_example/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -23,7 +23,7 @@ build: requirements: build: - - cmake >=3.20.1 + - cmake >=3.20.1,<3.23 host: - libcudf {{ version }} diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index d5864a7d68c..0aa099a8382 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -26,7 +26,7 @@ build: requirements: build: - - cmake >=3.20.1 + - cmake >=3.20.1,<3.23 host: - libcudf {{version}} - librdkafka >=1.7.0,<1.8.0a0 From 4770599ec18d61388bf9f96fc4025b48ef22381c Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 30 Mar 2022 15:39:51 -0500 Subject: [PATCH 07/11] Adds launch bounds hints to mixed join kernels to address regression seen in NDS q72 in Spark (#10534) The following change addresses a performance degradation we noticed in the `mixed_join` and `compute_mixed_join_output_size` that looks to be tied to the theoretical occupancy of these kernels, as limited by the number of registers used. The regression is triggered by this patch: https://github.com/rapidsai/cudf/pull/9727, which improves handling of unreachable code paths. That said, somehow, this change is altering the number of registers these kernels need. Both `mixed_join` and `compute_mixed_join_output_size` are very sensitive to the register count, per NSight compute. With the patch, the register required changed from 92 to 102, and 118 to 141 respectively. The fix here hints the compiler what our block size is (128 threads). This, from our testing, allows the compiler to reduce the number of registers required to 128 for `compute_mixed_join_output_size` and 96 for `mixed_join`. This lead to better occupancy (I think @nvdbaranec measured it going from 30% to 50%) and I saw the wall clock time of q72 (which started all this) to go from 133s to 121s, which is within the ballpark I'd expect. Authors: - Alessandro Bellina (https://github.com/abellina) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) --- cpp/src/join/mixed_join_kernels.cu | 25 ++++++++++---------- cpp/src/join/mixed_join_kernels_semi.cu | 23 +++++++++--------- cpp/src/join/mixed_join_size_kernels.cu | 2 +- cpp/src/join/mixed_join_size_kernels_semi.cu | 2 +- 4 files changed, 27 insertions(+), 25 deletions(-) diff --git a/cpp/src/join/mixed_join_kernels.cu b/cpp/src/join/mixed_join_kernels.cu index 5638f0ddd38..efaea841e45 100644 --- a/cpp/src/join/mixed_join_kernels.cu +++ b/cpp/src/join/mixed_join_kernels.cu @@ -35,18 +35,19 @@ namespace detail { namespace cg = cooperative_groups; template -__global__ void mixed_join(table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - size_type* join_output_l, - size_type* join_output_r, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables) +__launch_bounds__(block_size) __global__ + void mixed_join(table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + size_type* join_output_l, + size_type* join_output_r, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables) { // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index c8cfc9998f0..63a69554245 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -32,17 +32,18 @@ namespace detail { namespace cg = cooperative_groups; template -__global__ void mixed_join_semi(table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::semi_map_type::device_view hash_table_view, - size_type* join_output_l, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables) +__launch_bounds__(block_size) __global__ + void mixed_join_semi(table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::semi_map_type::device_view hash_table_view, + size_type* join_output_l, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables) { // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is diff --git a/cpp/src/join/mixed_join_size_kernels.cu b/cpp/src/join/mixed_join_size_kernels.cu index 1a08b8792c2..22c71bfc33a 100644 --- a/cpp/src/join/mixed_join_size_kernels.cu +++ b/cpp/src/join/mixed_join_size_kernels.cu @@ -35,7 +35,7 @@ namespace detail { namespace cg = cooperative_groups; template -__global__ void compute_mixed_join_output_size( +__launch_bounds__(block_size) __global__ void compute_mixed_join_output_size( table_device_view left_table, table_device_view right_table, table_device_view probe, diff --git a/cpp/src/join/mixed_join_size_kernels_semi.cu b/cpp/src/join/mixed_join_size_kernels_semi.cu index 2c077a698f8..f6b9fb85bbb 100644 --- a/cpp/src/join/mixed_join_size_kernels_semi.cu +++ b/cpp/src/join/mixed_join_size_kernels_semi.cu @@ -32,7 +32,7 @@ namespace detail { namespace cg = cooperative_groups; template -__global__ void compute_mixed_join_output_size_semi( +__launch_bounds__(block_size) __global__ void compute_mixed_join_output_size_semi( table_device_view left_table, table_device_view right_table, table_device_view probe, From 1f0967ecade501b592e348bab3fde4808d6ed3a9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 30 Mar 2022 14:10:50 -0700 Subject: [PATCH 08/11] Remove Click pinnings that are unnecessary after upgrading black. (#10541) This PR undoes #10535 (which was just a patch for cudf 22.04) on cudf 22.06 since we have implemented the longer term solution in #10523. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/10541 --- .pre-commit-config.yaml | 2 -- conda/environments/cudf_dev_cuda11.5.yml | 1 - 2 files changed, 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1220b211019..21f15ade458 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,8 +32,6 @@ repos: hooks: - id: black files: python/.* - additional_dependencies: - - click==8.0.4 - repo: https://github.com/PyCQA/flake8 rev: 3.8.3 hooks: diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index ec1492894cd..e9d018a2d18 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -10,7 +10,6 @@ channels: dependencies: - clang=11.1.0 - clang-tools=11.1.0 - - click=8.0.4 - cupy>=9.5.0,<11.0.0a0 - rmm=22.06.* - cmake>=3.20.1 From 4f3ab29619522c2eec55a63c1dbfdda2c4fe64b4 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 30 Mar 2022 16:22:33 -0700 Subject: [PATCH 09/11] Remove pip requirements files. (#10543) This PR removes some pip requirements files that are no longer used. These were previously introduced to support #7647 but that Dockerfile is no longer maintained in this repository. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - https://github.com/brandon-b-miller - GALI PREM SAGAR (https://github.com/galipremsagar) - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/10543 --- .../cuda-11.0/dev_requirements.txt | 41 ------------------- .../cuda-11.2/dev_requirements.txt | 41 ------------------- python/cudf_kafka/dev_requirements.txt | 11 ----- python/custreamz/dev_requirements.txt | 12 ------ python/dask_cudf/dev_requirements.txt | 14 ------- 5 files changed, 119 deletions(-) delete mode 100644 python/cudf/requirements/cuda-11.0/dev_requirements.txt delete mode 100644 python/cudf/requirements/cuda-11.2/dev_requirements.txt delete mode 100644 python/cudf_kafka/dev_requirements.txt delete mode 100644 python/custreamz/dev_requirements.txt delete mode 100644 python/dask_cudf/dev_requirements.txt diff --git a/python/cudf/requirements/cuda-11.0/dev_requirements.txt b/python/cudf/requirements/cuda-11.0/dev_requirements.txt deleted file mode 100644 index d8dce276820..00000000000 --- a/python/cudf/requirements/cuda-11.0/dev_requirements.txt +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -# pyarrow gpu package will have to be built from source : -# https://arrow.apache.org/docs/python/install.html#installing-from-source - -cupy-cuda110 -cachetools -cmake -cmake-setuptools>=0.1.3 -cython>=0.29,<0.30 -dlpack -fastavro>=0.22.9 -python-snappy>=0.6.0 -fsspec>=0.6.0 -hypothesis -mimesis<4.1 -mypy==0.782 -nbsphinx -numba>=0.53.1 -numpy -numpydoc -nvtx>=0.2.1 -packaging -pandas>=1.0,<1.4.0dev0 -pandoc==2.0a4 -protobuf -pydata-sphinx-theme -pyorc -pytest -pytest-benchmark -pytest-xdist -rapidjson -recommonmark -setuptools -sphinx -sphinx-copybutton -sphinx-markdown-tables -sphinxcontrib-websupport -transformers<=4.10.3 -typing_extensions -wheel diff --git a/python/cudf/requirements/cuda-11.2/dev_requirements.txt b/python/cudf/requirements/cuda-11.2/dev_requirements.txt deleted file mode 100644 index c11d108360d..00000000000 --- a/python/cudf/requirements/cuda-11.2/dev_requirements.txt +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -# pyarrow gpu package will have to be built from source : -# https://arrow.apache.org/docs/python/install.html#installing-from-source - -cupy-cuda112 -cachetools -cmake -cmake-setuptools>=0.1.3 -cython>=0.29,<0.30 -dlpack -fastavro>=0.22.9 -python-snappy>=0.6.0 -fsspec>=0.6.0 -hypothesis -mimesis<4.1 -mypy==0.782 -nbsphinx -numba>=0.53.1 -numpy -numpydoc -nvtx>=0.2.1 -packaging -pandas>=1.0,<1.4.0dev0 -pandoc==2.0a4 -protobuf -pydata-sphinx-theme -pyorc -pytest -pytest-benchmark -pytest-xdist -rapidjson -recommonmark -setuptools -sphinx -sphinx-copybutton -sphinx-markdown-tables -sphinxcontrib-websupport -transformers<=4.10.3 -typing_extensions -wheel diff --git a/python/cudf_kafka/dev_requirements.txt b/python/cudf_kafka/dev_requirements.txt deleted file mode 100644 index af52659e08e..00000000000 --- a/python/cudf_kafka/dev_requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -flake8==3.8.3 -black==19.10b0 -isort==5.6.4 -python-confluent-kafka -pytest -setuptools -wheel -cython>=0.29,<0.30 -python-confluent-kafka diff --git a/python/custreamz/dev_requirements.txt b/python/custreamz/dev_requirements.txt deleted file mode 100644 index a6b44c640f6..00000000000 --- a/python/custreamz/dev_requirements.txt +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -flake8==3.8.3 -black==19.10b0 -isort==5.6.4 -dask==2022.03.0 -distributed==2022.03.0 -streamz -python-confluent-kafka -pytest -setuptools -wheel diff --git a/python/dask_cudf/dev_requirements.txt b/python/dask_cudf/dev_requirements.txt deleted file mode 100644 index 438317adf87..00000000000 --- a/python/dask_cudf/dev_requirements.txt +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -dask==2022.03.0 -distributed==2022.03.0 -fsspec>=0.6.0 -numba>=0.53.1 -numpy -pandas>=1.0,<1.4.0dev0 -pytest -setuptools -wheel -flake8==3.8.3 -black==19.10b0 -isort==5.6.4 From 13551916d874c42ad02a8b3090bc97e02b35a5fa Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 30 Mar 2022 19:46:49 -0500 Subject: [PATCH 10/11] Refactor `memory_usage` to improve performance (#10537) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactored `Frame.memory_usage` to return a tuple of lists: (column_names, memory_usages). Motivation for this change is to remove redundent steps i.e., `dict`(`Frame.memory_usage`)->`unpack & dict` (`DataFrame.memory_usage`)->`unpack dict`(in `Series.init`). Choosing to remove `dict` being returned by `Frame.memory_usage` will now result in a 10% faster execution of external API. Not a huge speedup but it quickly adds up when `dask_cudf` is used. ```python In [1]: import cudf In [2]: df = cudf.DataFrame({'a':[1, 2, 3], 'b':['a', 'b', 'c'], 'd':[111, 123, 123]}) # THIS PR In [3]: %timeit df.memory_usage() 198 µs ± 3.44 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each) # branch-22.06 In [3]: %timeit df.memory_usage() 219 µs ± 726 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each) # branch-22.06 In [4]: %timeit dask_cudf.backends.sizeof_cudf_dataframe(df) 377 µs ± 5.67 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each) # this PR In [6]: %timeit dask_cudf.backends.sizeof_cudf_dataframe(df) 1.8 µs ± 14 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each) ``` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/10537 --- python/cudf/cudf/core/dataframe.py | 10 ++++++++-- python/cudf/cudf/core/frame.py | 7 +------ python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/indexed_frame.py | 5 +---- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/core/series.py | 4 +++- python/dask_cudf/dask_cudf/backends.py | 5 ++++- 7 files changed, 19 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 17cac3593a3..08a30729e7c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1339,8 +1339,14 @@ def _slice(self: T, arg: slice) -> T: @_cudf_nvtx_annotate def memory_usage(self, index=True, deep=False): - return Series( - {str(k): v for k, v in super().memory_usage(index, deep).items()} + mem_usage = [col.memory_usage for col in self._data.columns] + names = [str(name) for name in self._data.names] + if index: + mem_usage.append(self._index.memory_usage()) + names.append("Index") + return Series._from_data( + data={None: as_column(mem_usage)}, + index=as_index(names), ) @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index a84606b0953..75c6e4d0964 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -339,12 +339,7 @@ def memory_usage(self, deep=False): ------- The total bytes used. """ - if deep: - warnings.warn( - "The deep parameter is ignored and is only included " - "for pandas compatibility." - ) - return {name: col.memory_usage for name, col in self._data.items()} + raise NotImplementedError def __len__(self): return self._num_rows diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 7df5be3f692..a31fe4c3b99 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -914,7 +914,7 @@ def _concat(cls, objs): @_cudf_nvtx_annotate def memory_usage(self, deep=False): - return sum(super().memory_usage(deep=deep).values()) + return self._column.memory_usage @_cudf_nvtx_annotate def equals(self, other, **kwargs): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index c5c2322d95a..458fc16c511 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -704,10 +704,7 @@ def memory_usage(self, index=True, deep=False): >>> s.memory_usage(index=False) 24 """ - usage = super().memory_usage(deep=deep) - if index: - usage["Index"] = self.index.memory_usage() - return usage + raise NotImplementedError def hash_values(self, method="murmur3"): """Compute the hash of values in this column. diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 39228f034d4..d80fb00942b 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1474,7 +1474,7 @@ def _clean_nulls_from_index(self): @_cudf_nvtx_annotate def memory_usage(self, deep=False): - usage = sum(super().memory_usage(deep=deep).values()) + usage = sum(col.memory_usage for col in self._data.columns) if self.levels: for level in self.levels: usage += level.memory_usage(deep=deep) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 0ea02edb924..8748b9775be 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -856,7 +856,9 @@ def to_frame(self, name=None): @_cudf_nvtx_annotate def memory_usage(self, index=True, deep=False): - return sum(super().memory_usage(index, deep).values()) + return self._column.memory_usage + ( + self._index.memory_usage() if index else 0 + ) @_cudf_nvtx_annotate def __array_function__(self, func, types, args, kwargs): diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index d1edfb071a2..36e3416c8a3 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -398,7 +398,10 @@ def group_split_cudf(df, c, k, ignore_index=False): @sizeof_dispatch.register(cudf.DataFrame) @_dask_cudf_nvtx_annotate def sizeof_cudf_dataframe(df): - return int(df.memory_usage().sum()) + return int( + sum(col.memory_usage for col in df._data.columns) + + df._index.memory_usage() + ) @sizeof_dispatch.register((cudf.Series, cudf.BaseIndex)) From bc8f57843d2427ed07101faa86a40b162883d032 Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Thu, 31 Mar 2022 09:49:20 +0800 Subject: [PATCH 11/11] Adjust the valid range of group index for replace_with_backrefs (#10530) Current PR is to adjust to the valid range of group index for cuDF API `cudf::strings::replace_with_backrefs`. 1. enable 0 as group index For now, the range of group index starts with 1, which doesn't include the special value 0. Zero-value as backref index usually refers the entire matching pattern. So does cuDF regexp system. Therefore, what we only need to do is lifting the restrictions to allow zero-value passed as the group index of back references. Example of zero-value index: input: `aa-11 b2b-345` pattern: `([a-z]+)-([0-9]+)` replacement: `${0}:${1}:${2};` output: ```aa-11:aa:11; b2b-345:b:345;``` 2. group index should not exceed group count For now, group indices can exceed group count. The exceeding ones will end up to be empty string. IMHO, it is better to throw an exception under this circumstance instead of ignoring these overflow indices. Authors: - Alfred Xu (https://github.com/sperlingxx) Approvers: - Jason Lowe (https://github.com/jlowe) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/10530 --- cpp/include/cudf/strings/replace_re.hpp | 5 ++-- cpp/src/strings/replace/backref_re.cu | 9 +++++--- cpp/tests/strings/replace_regex_tests.cpp | 23 +++++++++++++++++-- .../java/ai/rapids/cudf/ColumnVectorTest.java | 16 +++++++++++++ 4 files changed, 46 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp index 0e904958d15..0ab3953470d 100644 --- a/cpp/include/cudf/strings/replace_re.hpp +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -86,7 +86,8 @@ std::unique_ptr replace_re( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @throw cudf::logic_error if capture index values in `replacement` are not in range 1-99 + * @throw cudf::logic_error if capture index values in `replacement` are not in range 0-99, and also + * if the index exceeds the group count specified in the pattern * * @param strings Strings instance for this operation. * @param pattern The regular expression patterns to search within each string. diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 27e0bd4fac9..384813d6e3d 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -68,7 +68,8 @@ std::string get_backref_pattern(std::string const& repl) * For example, for input string 'hello \2 and \1' the returned `backref_type` vector * contains `[(2,6),(1,11)]` and the returned string is 'hello and '. */ -std::pair> parse_backrefs(std::string const& repl) +std::pair> parse_backrefs(std::string const& repl, + int const group_count) { std::vector backrefs; std::string str = repl; // make a modifiable copy @@ -79,7 +80,8 @@ std::pair> parse_backrefs(std::string con while (std::regex_search(str, m, ex) && !m.empty()) { // parse the back-ref index number size_type const index = static_cast(std::atoi(std::string{m[1]}.c_str())); - CUDF_EXPECTS(index > 0 && index < 100, "Group index numbers must be in the range 1-99"); + CUDF_EXPECTS(index >= 0 && index <= group_count, + "Group index numbers must be in the range 0 to group count"); // store the new byte offset and index value size_type const position = static_cast(m.position(0)); @@ -146,7 +148,8 @@ std::unique_ptr replace_with_backrefs( reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); // parse the repl string for back-ref indicators - auto const parse_result = parse_backrefs(replacement); + auto group_count = std::min(99, d_prog->group_counts()); // group count should NOT exceed 99 + auto const parse_result = parse_backrefs(replacement, group_count); rmm::device_uvector backrefs = cudf::detail::make_device_uvector_async(parse_result.second, stream); string_scalar repl_scalar(parse_result.first, true, stream); diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index ddbd9f5b3d6..aac99c79721 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -279,13 +279,32 @@ TEST_F(StringsReplaceRegexTest, BackrefWithGreedyQuantifier) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexZeroIndexTest) +{ + cudf::test::strings_column_wrapper strings( + {"TEST123", "TEST1TEST2", "TEST2-TEST1122", "TEST1-TEST-T", "TES3"}); + auto strings_view = cudf::strings_column_view(strings); + std::string pattern = "(TEST)(\\d+)"; + std::string repl_template = "${0}: ${1}, ${2}; "; + auto results = cudf::strings::replace_with_backrefs(strings_view, pattern, repl_template); + + cudf::test::strings_column_wrapper expected({ + "TEST123: TEST, 123; ", + "TEST1: TEST, 1; TEST2: TEST, 2; ", + "TEST2: TEST, 2; -TEST1122: TEST, 1122; ", + "TEST1: TEST, 1; -TEST-T", + "TES3", + }); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexErrorTest) { cudf::test::strings_column_wrapper strings({"this string left intentionally blank"}); auto view = cudf::strings_column_view(strings); - EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", "\\0"), cudf::logic_error); - EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", "\\123"), cudf::logic_error); + // group index(3) exceeds the group count(2) + EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w).(\\w)", "\\3"), cudf::logic_error); EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "", "\\1"), cudf::logic_error); EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", ""), cudf::logic_error); } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index d1509f14c6e..58901d5743b 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4987,6 +4987,22 @@ void testStringReplaceWithBackrefs() { assertColumnsAreEqual(expected, actual); } + // test zero as group index + try (ColumnVector v = ColumnVector.fromStrings("aa-11 b2b-345", "aa-11a 1c-2b2 b2-c3", "11-aa", null); + ColumnVector expected = ColumnVector.fromStrings("aa-11:aa:11; b2b-345:b:345;", + "aa-11:aa:11;a 1c-2:c:2;b2 b2-c3", "11-aa", null); + ColumnVector actual = v.stringReplaceWithBackrefs( + "([a-z]+)-([0-9]+)", "${0}:${1}:${2};")) { + assertColumnsAreEqual(expected, actual); + } + + // group index exceeds group count + assertThrows(CudfException.class, () -> { + try (ColumnVector v = ColumnVector.fromStrings("ABC123defgh"); + ColumnVector r = v.stringReplaceWithBackrefs("([A-Z]+)([0-9]+)([a-z]+)", "\\4")) { + } + }); + } @Test