diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index a557a2ef066..4ac2fe79bf6 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -89,7 +89,7 @@ gpuci_mamba_retry install -y \ "ucx-py=${UCX_PY_VERSION}" # https://docs.rapids.ai/maintainers/depmgmt/ -# gpuci_mamba_retry remove --force rapids-build-env rapids-notebook-env +# gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env # gpuci_mamba_retry install -y "your-pkg=1.0.0" diff --git a/conda/recipes/cudf_kafka/build.sh b/conda/recipes/cudf_kafka/build.sh index 3db559c144d..5d8720f1c98 100644 --- a/conda/recipes/cudf_kafka/build.sh +++ b/conda/recipes/cudf_kafka/build.sh @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # This assumes the script is executed from the root of the repo directory ./build.sh -v cudf_kafka diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index e450d306cbe..d434e53c9b1 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -1,9 +1,9 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} -{% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} +{% set cuda_version = '.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} +{% set py_version = environ.get('python', '3.8') %} package: name: cudf_kafka @@ -14,7 +14,7 @@ source: build: number: {{ GIT_DESCRIBE_NUMBER }} - string: py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: py{{ py_version.replace('.', '') }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - CC - CXX @@ -26,14 +26,15 @@ requirements: build: - cmake >=3.20.1 host: - - python + - python {{ py_version }} - cython >=0.29,<0.30 - - setuptools - cudf {{ version }} - libcudf_kafka {{ version }} + - setuptools run: + - python {{ py_version }} - libcudf_kafka {{ version }} - - python-confluent-kafka + - python-confluent-kafka >=1.7.0,<1.8.0a0=py{{ py_version.replace('.', '') }}* - cudf {{ version }} test: # [linux64] diff --git a/conda/recipes/custreamz/build.sh b/conda/recipes/custreamz/build.sh index 6ce9e4f21a9..88fccf90c69 100644 --- a/conda/recipes/custreamz/build.sh +++ b/conda/recipes/custreamz/build.sh @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # This assumes the script is executed from the root of the repo directory ./build.sh -v custreamz diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index a8b096d4892..73f4727b70b 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -1,9 +1,9 @@ -# Copyright (c) 2018-2019, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} -{% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} +{% set cuda_version = '.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} +{% set py_version = environ.get('python', '3.8') %} package: name: custreamz @@ -14,7 +14,7 @@ source: build: number: {{ GIT_DESCRIBE_NUMBER }} - string: py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: py{{ py_version.replace('.', '') }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - VERSION_SUFFIX - PARALLEL_LEVEL @@ -24,16 +24,16 @@ build: requirements: host: - - python - - python-confluent-kafka + - python {{ py_version }} + - python-confluent-kafka >=1.7.0,<1.8.0a0=py{{ py_version.replace('.', '') }}* - cudf_kafka {{ version }} run: - - python - - streamz + - python {{ py_version }} + - streamz - cudf {{ version }} - dask>=2021.11.1,<=2021.11.2 - distributed>=2021.11.1,<=2021.11.2 - - python-confluent-kafka + - python-confluent-kafka >=1.7.0,<1.8.0a0=py{{ py_version.replace('.', '') }}* - cudf_kafka {{ version }} test: # [linux64] diff --git a/conda/recipes/libcudf_kafka/build.sh b/conda/recipes/libcudf_kafka/build.sh index cbe4584cb63..b656f55a64e 100644 --- a/conda/recipes/libcudf_kafka/build.sh +++ b/conda/recipes/libcudf_kafka/build.sh @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then # This assumes the script is executed from the root of the repo directory diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index 6b15890e7c7..0b274f3a41d 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -26,7 +26,7 @@ requirements: - cmake >=3.20.1 host: - libcudf {{version}} - - librdkafka >=1.6.0,<1.7.0a0 + - librdkafka >=1.7.0,<1.8.0a0 run: - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 86ec24c1b7b..84e486c7e18 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -343,6 +343,7 @@ add_library( src/lists/lists_column_factories.cu src/lists/lists_column_view.cu src/lists/segmented_sort.cu + src/lists/sequences.cu src/merge/merge.cu src/partitioning/partitioning.cu src/partitioning/round_robin.cu @@ -416,7 +417,8 @@ add_library( src/strings/copying/concatenate.cu src/strings/copying/copying.cu src/strings/copying/shift.cu - src/strings/extract.cu + src/strings/extract/extract.cu + src/strings/extract/extract_all.cu src/strings/filling/fill.cu src/strings/filter_chars.cu src/strings/findall.cu diff --git a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp index a68ce2bd1a1..888102c03be 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp @@ -89,14 +89,14 @@ void BM_parq_read_varying_options(benchmark::State& state) auto const use_pandas_metadata = (flags & 2) != 0; auto const ts_type = cudf::data_type{static_cast(state.range(state_idx++))}; - auto const data_types = - dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL), - int32_t(type_group_id::FLOATING_POINT), - int32_t(type_group_id::FIXED_POINT), - int32_t(type_group_id::TIMESTAMP), - int32_t(cudf::type_id::STRING), - int32_t(cudf::type_id::LIST)}), - col_sel); + auto const data_types = 
dtypes_for_column_selection( + get_type_or_group({static_cast(type_group_id::INTEGRAL), + static_cast(type_group_id::FLOATING_POINT), + static_cast(type_group_id::FIXED_POINT), + static_cast(type_group_id::TIMESTAMP), + static_cast(cudf::type_id::STRING), + static_cast(cudf::type_id::LIST)}), + col_sel); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); @@ -181,6 +181,9 @@ BENCHMARK_REGISTER_F(ParquetRead, column_selection) ->Unit(benchmark::kMillisecond) ->UseManualTime(); +// Disabled until we add an API to read metadata from a parquet file and determine num row groups. +// https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863 +/* BENCHMARK_DEFINE_F(ParquetRead, row_selection) (::benchmark::State& state) { BM_parq_read_varying_options(state); } BENCHMARK_REGISTER_F(ParquetRead, row_selection) @@ -191,6 +194,7 @@ BENCHMARK_REGISTER_F(ParquetRead, row_selection) {int32_t(cudf::type_id::EMPTY)}}) ->Unit(benchmark::kMillisecond) ->UseManualTime(); +*/ BENCHMARK_DEFINE_F(ParquetRead, misc_options) (::benchmark::State& state) { BM_parq_read_varying_options(state); } diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index b58bdb55de3..16e7a58b020 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -21,7 +21,7 @@ function(find_and_configure_cucollections) cuco 0.0 GLOBAL_TARGETS cuco::cuco CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections - GIT_TAG 6433e8ad7571f14cc5384051b049029c60dd1ce0 + GIT_TAG 193de1aa74f5721717f991ca757dc610c852bb17 OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" ) diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 574bfa26a0c..fcf9f0d73ee 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -80,6 +80,6 @@ 
function(find_and_configure_thrust VERSION) endif() endfunction() -set(CUDF_MIN_VERSION_Thrust 1.12.0) +set(CUDF_MIN_VERSION_Thrust 1.15.0) find_and_configure_thrust(${CUDF_MIN_VERSION_Thrust}) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 17bea935dfd..117119cd40f 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -285,280 +285,66 @@ std::unique_ptr extract_quarter( cudf::column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** @} */ // end of group - -/** - * @brief Round up to the nearest day - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_day( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round up to the nearest hour - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_hour( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round up to the nearest minute - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_minute( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round up to the nearest second - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_second( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round up to the nearest millisecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_millisecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round up to the nearest microsecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_microsecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round up to the nearest nanosecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_nanosecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** - * @brief Round down to the nearest day - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. + * @brief Fixed frequencies supported by datetime rounding functions ceil, floor, round. * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column */ -std::unique_ptr floor_day( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +enum class rounding_frequency : int32_t { + DAY, + HOUR, + MINUTE, + SECOND, + MILLISECOND, + MICROSECOND, + NANOSECOND +}; /** - * @brief Round down to the nearest hour + * @brief Round datetimes up to the nearest multiple of the given frequency. * - * @param column cudf::column_view of the input datetime values + * @param column cudf::column_view of the input datetime values. + * @param freq rounding_frequency indicating the frequency to round up to. * @param mr Device memory resource used to allocate device memory of the returned column. 
* - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column + * @throw cudf::logic_error if input column datatype is not TIMESTAMP. + * @return cudf::column of the same datetime resolution as the input column. */ -std::unique_ptr floor_hour( +std::unique_ptr ceil_datetimes( cudf::column_view const& column, + rounding_frequency freq, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Round down to the nearest minute + * @brief Round datetimes down to the nearest multiple of the given frequency. * - * @param column cudf::column_view of the input datetime values + * @param column cudf::column_view of the input datetime values. + * @param freq rounding_frequency indicating the frequency to round down to. * @param mr Device memory resource used to allocate device memory of the returned column. * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column + * @throw cudf::logic_error if input column datatype is not TIMESTAMP. + * @return cudf::column of the same datetime resolution as the input column. */ -std::unique_ptr floor_minute( +std::unique_ptr floor_datetimes( cudf::column_view const& column, + rounding_frequency freq, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Round down to the nearest second + * @brief Round datetimes to the nearest multiple of the given frequency. * - * @param column cudf::column_view of the input datetime values + * @param column cudf::column_view of the input datetime values. + * @param freq rounding_frequency indicating the frequency to round to. * @param mr Device memory resource used to allocate device memory of the returned column. 
* - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column + * @throw cudf::logic_error if input column datatype is not TIMESTAMP. + * @return cudf::column of the same datetime resolution as the input column. */ -std::unique_ptr floor_second( +std::unique_ptr round_datetimes( cudf::column_view const& column, + rounding_frequency freq, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Round down to the nearest millisecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr floor_millisecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round down to the nearest microsecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr floor_microsecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round down to the nearest nanosecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr floor_nanosecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round to the nearest day - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_day( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round to the nearest hour - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_hour( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round to the nearest minute - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_minute( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round to the nearest second - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_second( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round to the nearest millisecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_millisecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round to the nearest microsecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_microsecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round to the nearest nanosecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_nanosecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** @} */ // end of group } // namespace datetime } // namespace cudf diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp index aff0d20a467..905a897eb40 100644 --- a/cpp/include/cudf/filling.hpp +++ b/cpp/include/cudf/filling.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -169,7 +169,7 @@ std::unique_ptr repeat( * @param init First value in the sequence * @param step Increment value * @param mr Device memory resource used to allocate the returned column's device memory - * @return std::unique_ptr The result table containing the sequence + * @return The result column containing the generated sequence */ std::unique_ptr sequence( size_type size, @@ -195,7 +195,7 @@ std::unique_ptr sequence( * @param size Size of the output column * @param init First value in the sequence * @param mr Device memory resource used to allocate the returned column's device memory - * @return std::unique_ptr The result table containing the sequence + * @return The result column containing the generated sequence */ std::unique_ptr sequence( size_type size, @@ -223,7 +223,7 @@ std::unique_ptr sequence( * @param months Months to increment * @param mr Device memory resource used to allocate the returned column's device memory * - * @returns Timestamps column with sequences of months. + * @return Timestamps column with sequences of months. */ std::unique_ptr calendrical_month_sequence( size_type size, diff --git a/cpp/include/cudf/lists/filling.hpp b/cpp/include/cudf/lists/filling.hpp new file mode 100644 index 00000000000..74a4dac1e10 --- /dev/null +++ b/cpp/include/cudf/lists/filling.hpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +namespace cudf::lists { +/** + * @addtogroup lists_filling + * @{ + * @file + * @brief Column APIs for individual list sequence + */ + +/** + * @brief Create a lists column in which each row contains a sequence of values specified by a tuple + * of (`start`, `size`) parameters. + * + * Create a lists column in which each row is a sequence of values starting from a `start` value, + * incrementing by one, and its cardinality is specified by a `size` value. The `start` and `size` + * values used to generate each list are taken from the corresponding row of the input @p starts and + * @p sizes columns. + * + * - @p sizes must be a column of integer types. + * - All the input columns must not have nulls. + * - If any row of the @p sizes column contains a negative value, the output is undefined. + * + * @code{.pseudo} + * starts = [0, 1, 2, 3, 4] + * sizes = [0, 2, 2, 1, 3] + * + * output = [ [], [1, 2], [2, 3], [3], [4, 5, 6] ] + * @endcode + * + * @throws cudf::logic_error if @p sizes column is not of integer types. + * @throws cudf::logic_error if any input column has nulls. + * @throws cudf::logic_error if @p starts and @p sizes columns do not have the same size. + * + * @param starts First values in the result sequences. + * @param sizes Numbers of values in the result sequences. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return The result column containing generated sequences. + */ +std::unique_ptr sequences( + column_view const& starts, + column_view const& sizes, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create a lists column in which each row contains a sequence of values specified by a tuple + * of (`start`, `step`, `size`) parameters.
+ * + * Create a lists column in which each row is a sequence of values starting from a `start` value, + * incrementing by a `step` value, and its cardinality is specified by a `size` value. The values + * `start`, `step`, and `size` used to generate each list are taken from the corresponding row of the + * input @p starts, @p steps, and @p sizes columns. + * + * - @p sizes must be a column of integer types. + * - @p starts and @p steps columns must have the same type. + * - All the input columns must not have nulls. + * - If any row of the @p sizes column contains a negative value, the output is undefined. + * + * @code{.pseudo} + * starts = [0, 1, 2, 3, 4] + * steps = [2, 1, 1, 1, -3] + * sizes = [0, 2, 2, 1, 3] + * + * output = [ [], [1, 2], [2, 3], [3], [4, 1, -2] ] + * @endcode + * + * @throws cudf::logic_error if @p sizes column is not of integer types. + * @throws cudf::logic_error if any input column has nulls. + * @throws cudf::logic_error if @p starts and @p steps columns have different types. + * @throws cudf::logic_error if @p starts, @p steps, and @p sizes columns do not have the same size. + * + * @param starts First values in the result sequences. + * @param steps Increment values for the result sequences. + * @param sizes Numbers of values in the result sequences. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return The result column containing generated sequences.
+ */ +std::unique_ptr sequences( + column_view const& starts, + column_view const& steps, + column_view const& sizes, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of group +} // namespace cudf::lists diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index b35f5df2903..9da3c6b0e91 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -33,6 +33,12 @@ namespace cudf { namespace strings { namespace detail { +/** + * @brief Basic type expected for iterators passed to `make_strings_column` that represent string + * data in device memory. + */ +using string_index_pair = thrust::pair; + /** * @brief Average string byte-length threshold for deciding character-level * vs. row-level parallel algorithm. @@ -64,8 +70,6 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, size_type strings_count = thrust::distance(begin, end); if (strings_count == 0) return make_empty_column(type_id::STRING); - using string_index_pair = thrust::pair; - // check total size is not too large for cudf column auto size_checker = [] __device__(string_index_pair const& item) { return (item.first != nullptr) ? item.second : 0; diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp index 6f5902266b2..466f71aace0 100644 --- a/cpp/include/cudf/strings/extract.hpp +++ b/cpp/include/cudf/strings/extract.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,20 +27,21 @@ namespace strings { */ /** - * @brief Returns a vector of strings columns for each matching group specified in the given regular - * expression pattern. 
+ * @brief Returns a table of strings columns where each column corresponds to the matching + * group specified in the given regular expression pattern. * * All the strings for the first group will go in the first output column; the second group - * go in the second column and so on. Null entries are added if the string does match. + * go in the second column and so on. Null entries are added to the columns in row `i` if + * the string at row `i` does not match. * * Any null string entries return corresponding null output column entries. * * @code{.pseudo} * Example: - * s = ["a1","b2","c3"] - * r = extract(s,"([ab])(\\d)") - * r is now [["a","b",null], - * ["1","2",null]] + * s = ["a1", "b2", "c3"] + * r = extract(s, "([ab])(\\d)") + * r is now [ ["a", "b", null], + * ["1", "2", null] ] * @endcode * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. @@ -55,6 +56,39 @@ std::unique_ptr
extract( std::string const& pattern, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a lists column of strings where each string column row corresponds to the + * matching group specified in the given regular expression pattern. + * + * All the matching groups for the first row will go in the first row output column; the second + * row results will go into the second row output column and so on. + * + * A null output row will result if the corresponding input string row does not match or + * that input row is null. + * + * @code{.pseudo} + * Example: + * s = ["a1 b4", "b2", "c3 a5", "b", null] + * r = extract_all(s,"([ab])(\\d)") + * r is now [ ["a", "1", "b", "4"], + * ["b", "2"], + * ["a", "5"], + * null, + * null ] + * @endcode + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation. + * @param pattern The regular expression pattern with group indicators. + * @param mr Device memory resource used to allocate any returned device memory. + * @return Lists column containing strings extracted from the input column. + */ +std::unique_ptr extract_all( + strings_column_view const& strings, + std::string const& pattern, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp index 8cc000ff095..2b09ec66203 100644 --- a/cpp/include/nvtext/subword_tokenize.hpp +++ b/cpp/include/nvtext/subword_tokenize.hpp @@ -130,9 +130,7 @@ struct tokenizer_result { * larger than the max value for cudf::size_type * * @param strings The input strings to tokenize. - * @param filename_hashed_vocabulary A path to the preprocessed vocab.txt file. - * Note that this is the file AFTER python/perfect_hash.py has been used - * for preprocessing. 
+ * @param vocabulary_table The vocabulary table pre-loaded into this object. * @param max_sequence_length Limit of the number of token-ids per row in final tensor * for each string. * @param stride Each row in the output token-ids will replicate `max_sequence_length - stride` @@ -150,25 +148,6 @@ struct tokenizer_result { * @param mr Memory resource to allocate any returned objects. * @return token-ids, attention-mask, and metadata */ -tokenizer_result subword_tokenize( - cudf::strings_column_view const& strings, - std::string const& filename_hashed_vocabulary, - uint32_t max_sequence_length, - uint32_t stride, - bool do_lower_case, - bool do_truncate, - uint32_t max_rows_tensor, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @copydoc subword_tokenize() - * - * This function differs from the one above by only the hashed vocabulary parameter. - * The file can be pre-loaded using the @ref load_vocabulary_file API and then - * passed in place of the file name in a call to this API. - * - * @param vocabulary_table The vocabulary table pre-loaded into this object. - */ tokenizer_result subword_tokenize( cudf::strings_column_view const& strings, hashed_vocabulary const& vocabulary_table, diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index d0874b57c2d..e6abba207d9 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -29,6 +29,10 @@ project( # Set a default build type if none was specified rapids_cmake_build_type(Release) +# ################################################################################################## +# * conda environment ----------------------------------------------------------------------------- +rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH) + # ################################################################################################## # * Build options option(BUILD_TESTS "Build tests for libcudf_kafka" ON) @@ -55,7 +59,7 @@ endif() # ################################################################################################## # * library target -------------------------------------------------------------------------------- -add_library(cudf_kafka SHARED src/kafka_consumer.cpp) +add_library(cudf_kafka SHARED src/kafka_consumer.cpp src/kafka_callback.cpp) # ################################################################################################## # * include paths --------------------------------------------------------------------------------- @@ -68,6 +72,11 @@ target_include_directories( # * library paths --------------------------------------------------------------------------------- target_link_libraries(cudf_kafka PUBLIC cudf::cudf RDKAFKA::RDKAFKA) +# Add Conda library, and include paths if specified +if(TARGET conda_env) + target_link_libraries(cudf_kafka PRIVATE conda_env) +endif() + set_target_properties( cudf_kafka PROPERTIES BUILD_RPATH "\$ORIGIN" INSTALL_RPATH "\$ORIGIN" # set target compile options diff --git a/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake index 1e04d40a7d5..aa4c5b60e7a 100644 --- a/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake +++ b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# 
Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/cpp/libcudf_kafka/cmake/thirdparty/get_rdkafka.cmake b/cpp/libcudf_kafka/cmake/thirdparty/get_rdkafka.cmake index 3b3342cb297..5c3c9f01f17 100644 --- a/cpp/libcudf_kafka/cmake/thirdparty/get_rdkafka.cmake +++ b/cpp/libcudf_kafka/cmake/thirdparty/get_rdkafka.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/cpp/libcudf_kafka/include/cudf_kafka/kafka_callback.hpp b/cpp/libcudf_kafka/include/cudf_kafka/kafka_callback.hpp new file mode 100644 index 00000000000..a4ff18054b1 --- /dev/null +++ b/cpp/libcudf_kafka/include/cudf_kafka/kafka_callback.hpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include + +#include +#include +#include + +namespace cudf { +namespace io { +namespace external { +namespace kafka { + +/** + * @brief Python Callback function wrapper type used for Kafka OAuth events + * + * The KafkaConsumer calls the `kafka_oauth_callback_wrapper_type` when the existing + * oauth token is considered expired by the KafkaConsumer. Typically that + * means this will be invoked a single time when the KafkaConsumer is created + * to get the initial token and then intermediately as the token becomes + * expired. + * + * The callback function signature is: + * `std::map kafka_oauth_callback_wrapper_type(void*)` + * + * The callback function returns a std::map, + * where the std::map consists of the Oauth token and its + * linux epoch expiration time. Generally the token and expiration + * time is retrieved from an external service by the callback. + * Ex: [token, token_expiration_in_epoch] + */ +using kafka_oauth_callback_wrapper_type = std::map (*)(void*); +using python_callable_type = void*; + +/** + * @brief Callback to retrieve OAuth token from external source. Invoked when + * token refresh is required. 
+ */ +class python_oauth_refresh_callback : public RdKafka::OAuthBearerTokenRefreshCb { + public: + python_oauth_refresh_callback(kafka_oauth_callback_wrapper_type callback_wrapper, + python_callable_type python_callable); + + void oauthbearer_token_refresh_cb(RdKafka::Handle* handle, const std::string& oauthbearer_config); + + private: + kafka_oauth_callback_wrapper_type callback_wrapper_; + python_callable_type python_callable_; +}; + +} // namespace kafka +} // namespace external +} // namespace io +} // namespace cudf diff --git a/cpp/libcudf_kafka/include/cudf_kafka/kafka_consumer.hpp b/cpp/libcudf_kafka/include/cudf_kafka/kafka_consumer.hpp index 464d1cd71b1..c65774d2e1a 100644 --- a/cpp/libcudf_kafka/include/cudf_kafka/kafka_consumer.hpp +++ b/cpp/libcudf_kafka/include/cudf_kafka/kafka_consumer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,10 +15,14 @@ */ #pragma once -#include -#include +#include "kafka_callback.hpp" + #include + #include + +#include +#include #include #include #include @@ -48,8 +52,15 @@ class kafka_consumer : public cudf::io::datasource { * * @param configs key/value pairs of librdkafka configurations that will be * passed to the librdkafka client + * @param python_callable `python_callable_type` pointer to a Python functools.partial object + * @param callable_wrapper `kafka_oauth_callback_wrapper_type` Cython wrapper that will + * be used to invoke the `python_callable`. This wrapper serves the purpose + * of preventing us from having to link against the Python development library + * in libcudf_kafka. */ - kafka_consumer(std::map const& configs); + kafka_consumer(std::map configs, + python_callable_type python_callable, + kafka_oauth_callback_wrapper_type callable_wrapper); /** * @brief Instantiate a Kafka consumer object. 
Documentation for librdkafka configurations can be @@ -57,6 +68,11 @@ class kafka_consumer : public cudf::io::datasource { * * @param configs key/value pairs of librdkafka configurations that will be * passed to the librdkafka client + * @param python_callable `python_callable_type` pointer to a Python functools.partial object + * @param callable_wrapper `kafka_oauth_callback_wrapper_type` Cython wrapper that will + * be used to invoke the `python_callable`. This wrapper serves the purpose + * of preventing us from having to link against the Python development library + * in libcudf_kafka. * @param topic_name name of the Kafka topic to consume from * @param partition partition index to consume from between `0` and `TOPIC_NUM_PARTITIONS - 1` * inclusive @@ -66,7 +82,9 @@ class kafka_consumer : public cudf::io::datasource { * before batch_timeout, a smaller subset will be returned * @param delimiter optional delimiter to insert into the output between kafka messages, Ex: "\n" */ - kafka_consumer(std::map const& configs, + kafka_consumer(std::map configs, + python_callable_type python_callable, + kafka_oauth_callback_wrapper_type callable_wrapper, std::string const& topic_name, int partition, int64_t start_offset, @@ -178,6 +196,10 @@ class kafka_consumer : public cudf::io::datasource { std::unique_ptr kafka_conf; // RDKafka configuration object std::unique_ptr consumer; + std::map configs; + python_callable_type python_callable_; + kafka_oauth_callback_wrapper_type callable_wrapper_; + std::string topic_name; int partition; int64_t start_offset; diff --git a/cpp/libcudf_kafka/src/kafka_callback.cpp b/cpp/libcudf_kafka/src/kafka_callback.cpp new file mode 100644 index 00000000000..6b98747c145 --- /dev/null +++ b/cpp/libcudf_kafka/src/kafka_callback.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "cudf_kafka/kafka_callback.hpp" + +#include + +namespace cudf { +namespace io { +namespace external { +namespace kafka { + +python_oauth_refresh_callback::python_oauth_refresh_callback( + kafka_oauth_callback_wrapper_type callback_wrapper, python_callable_type python_callable) + : callback_wrapper_(callback_wrapper), python_callable_(python_callable){}; + +void python_oauth_refresh_callback::oauthbearer_token_refresh_cb( + RdKafka::Handle* handle, std::string const& oauthbearer_config) +{ + std::map resp = callback_wrapper_(python_callable_); + + // Build parameters to pass to librdkafka + std::string token = resp["token"]; + int64_t token_lifetime_ms = std::stoll(resp["token_expiration_in_epoch"]); + std::list extensions; // currently not supported + std::string errstr; + CUDF_EXPECTS( + RdKafka::ErrorCode::ERR_NO_ERROR == + handle->oauthbearer_set_token(token, token_lifetime_ms, "kafka", extensions, errstr), + "Error occurred while setting the oauthbearer token"); +} + +} // namespace kafka +} // namespace external +} // namespace io +} // namespace cudf diff --git a/cpp/libcudf_kafka/src/kafka_consumer.cpp b/cpp/libcudf_kafka/src/kafka_consumer.cpp index 4f7cdba632e..49e89a56e60 100644 --- a/cpp/libcudf_kafka/src/kafka_consumer.cpp +++ b/cpp/libcudf_kafka/src/kafka_consumer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -13,10 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #include "cudf_kafka/kafka_consumer.hpp" -#include + #include + +#include #include namespace cudf { @@ -24,8 +25,13 @@ namespace io { namespace external { namespace kafka { -kafka_consumer::kafka_consumer(std::map const& configs) - : kafka_conf(RdKafka::Conf::create(RdKafka::Conf::CONF_GLOBAL)) +kafka_consumer::kafka_consumer(std::map configs, + python_callable_type python_callable, + kafka_oauth_callback_wrapper_type callable_wrapper) + : configs(configs), + python_callable_(python_callable), + callable_wrapper_(callable_wrapper), + kafka_conf(RdKafka::Conf::create(RdKafka::Conf::CONF_GLOBAL)) { for (auto const& key_value : configs) { std::string error_string; @@ -34,6 +40,14 @@ kafka_consumer::kafka_consumer(std::map const& configs "Invalid Kafka configuration"); } + if (python_callable_ != nullptr) { + std::string error_string; + python_oauth_refresh_callback cb(callable_wrapper_, python_callable_); + CUDF_EXPECTS(RdKafka::Conf::ConfResult::CONF_OK == + kafka_conf->set("oauthbearer_token_refresh_cb", &cb, error_string), + "Failed to set Kafka oauth callback"); + } + // Kafka 0.9 > requires group.id in the configuration std::string conf_val; CUDF_EXPECTS(RdKafka::Conf::ConfResult::CONF_OK == kafka_conf->get("group.id", conf_val), @@ -44,22 +58,26 @@ kafka_consumer::kafka_consumer(std::map const& configs RdKafka::KafkaConsumer::create(kafka_conf.get(), errstr)); } -kafka_consumer::kafka_consumer(std::map const& configs, +kafka_consumer::kafka_consumer(std::map configs, + python_callable_type python_callable, + kafka_oauth_callback_wrapper_type callback_wrapper, std::string const& topic_name, int partition, int64_t start_offset, int64_t end_offset, int batch_timeout, std::string const& delimiter) - : topic_name(topic_name), + : configs(configs), + python_callable_(python_callable), + callable_wrapper_(callback_wrapper), + 
topic_name(topic_name), partition(partition), start_offset(start_offset), end_offset(end_offset), batch_timeout(batch_timeout), - delimiter(delimiter) + delimiter(delimiter), + kafka_conf(RdKafka::Conf::create(RdKafka::Conf::CONF_GLOBAL)) { - kafka_conf = std::unique_ptr(RdKafka::Conf::create(RdKafka::Conf::CONF_GLOBAL)); - for (auto const& key_value : configs) { std::string error_string; CUDF_EXPECTS(RdKafka::Conf::ConfResult::CONF_OK == @@ -67,6 +85,14 @@ kafka_consumer::kafka_consumer(std::map const& configs "Invalid Kafka configuration"); } + if (python_callable_ != nullptr) { + std::string error_string; + python_oauth_refresh_callback cb(callable_wrapper_, python_callable_); + CUDF_EXPECTS(RdKafka::Conf::ConfResult::CONF_OK == + kafka_conf->set("oauthbearer_token_refresh_cb", &cb, error_string), + "Failed to set Kafka oauth callback"); + } + // Kafka 0.9 > requires group.id in the configuration std::string conf_val; CUDF_EXPECTS(RdKafka::Conf::ConfResult::CONF_OK == kafka_conf->get("group.id", conf_val), diff --git a/cpp/libcudf_kafka/tests/CMakeLists.txt b/cpp/libcudf_kafka/tests/CMakeLists.txt index 3920758f3f2..db2131ba00c 100644 --- a/cpp/libcudf_kafka/tests/CMakeLists.txt +++ b/cpp/libcudf_kafka/tests/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -23,8 +23,9 @@ function(ConfigureTest test_name) ${test_name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" ) - target_link_libraries(${test_name} PRIVATE GTest::gmock_main GTest::gtest_main cudf_kafka) - + target_link_libraries( + ${test_name} PRIVATE GTest::gmock GTest::gmock_main GTest::gtest_main cudf_kafka + ) add_test(NAME ${test_name} COMMAND ${test_name}) endfunction() diff --git a/cpp/libcudf_kafka/tests/kafka_consumer_tests.cpp b/cpp/libcudf_kafka/tests/kafka_consumer_tests.cpp index ca4b70531db..613c2435f4d 100644 --- a/cpp/libcudf_kafka/tests/kafka_consumer_tests.cpp +++ b/cpp/libcudf_kafka/tests/kafka_consumer_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "cudf_kafka/kafka_consumer.hpp" +#include #include #include #include @@ -32,25 +32,37 @@ TEST_F(KafkaDatasourceTest, MissingGroupID) { // group.id is a required configuration. 
std::map kafka_configs; - kafka_configs.insert({"bootstrap.servers", "localhost:9092"}); + kafka_configs["bootstrap.servers"] = "localhost:9092"; - EXPECT_THROW(kafka::kafka_consumer kc(kafka_configs, "csv-topic", 0, 0, 3, 5000, "\n"), - cudf::logic_error); + kafka::python_callable_type python_callable; + kafka::kafka_oauth_callback_wrapper_type callback_wrapper; + + EXPECT_THROW( + kafka::kafka_consumer kc( + kafka_configs, python_callable, callback_wrapper, "csv-topic", 0, 0, 3, 5000, "\n"), + cudf::logic_error); } TEST_F(KafkaDatasourceTest, InvalidConfigValues) { // Give a made up configuration value std::map kafka_configs; - kafka_configs.insert({"completely_made_up_config", "wrong"}); + kafka_configs["completely_made_up_config"] = "wrong"; - EXPECT_THROW(kafka::kafka_consumer kc(kafka_configs, "csv-topic", 0, 0, 3, 5000, "\n"), - cudf::logic_error); + kafka::python_callable_type python_callable; + kafka::kafka_oauth_callback_wrapper_type callback_wrapper; - kafka_configs.clear(); + EXPECT_THROW( + kafka::kafka_consumer kc( + kafka_configs, python_callable, callback_wrapper, "csv-topic", 0, 0, 3, 5000, "\n"), + cudf::logic_error); // Give a good config property with a bad value - kafka_configs.insert({"message.max.bytes", "this should be a number not text"}); - EXPECT_THROW(kafka::kafka_consumer kc(kafka_configs, "csv-topic", 0, 0, 3, 5000, "\n"), - cudf::logic_error); + kafka_configs.clear(); + kafka_configs["message.max.bytes"] = "this should be a number not text"; + + EXPECT_THROW( + kafka::kafka_consumer kc( + kafka_configs, python_callable, callback_wrapper, "csv-topic", 0, 0, 3, 5000, "\n"), + cudf::logic_error); } diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 85653b4f0be..1e9a39560b8 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -113,9 +113,9 @@ struct RoundFunctor { struct RoundingDispatcher { rounding_function round_kind; - datetime_component component; + 
rounding_frequency component; - RoundingDispatcher(rounding_function round_kind, datetime_component component) + RoundingDispatcher(rounding_function round_kind, rounding_frequency component) : round_kind(round_kind), component(component) { } @@ -124,25 +124,25 @@ struct RoundingDispatcher { CUDA_DEVICE_CALLABLE Timestamp operator()(Timestamp const ts) const { switch (component) { - case datetime_component::DAY: + case rounding_frequency::DAY: return time_point_cast( RoundFunctor{}(round_kind, ts)); - case datetime_component::HOUR: + case rounding_frequency::HOUR: return time_point_cast( RoundFunctor{}(round_kind, ts)); - case datetime_component::MINUTE: + case rounding_frequency::MINUTE: return time_point_cast( RoundFunctor{}(round_kind, ts)); - case datetime_component::SECOND: + case rounding_frequency::SECOND: return time_point_cast( RoundFunctor{}(round_kind, ts)); - case datetime_component::MILLISECOND: + case rounding_frequency::MILLISECOND: return time_point_cast( RoundFunctor{}(round_kind, ts)); - case datetime_component::MICROSECOND: + case rounding_frequency::MICROSECOND: return time_point_cast( RoundFunctor{}(round_kind, ts)); - case datetime_component::NANOSECOND: + case rounding_frequency::NANOSECOND: return time_point_cast( RoundFunctor{}(round_kind, ts)); default: cudf_assert(false && "Unsupported datetime rounding resolution."); @@ -234,7 +234,7 @@ struct dispatch_round { template std::enable_if_t(), std::unique_ptr> operator()( rounding_function round_kind, - datetime_component component, + rounding_frequency component, cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const @@ -420,7 +420,7 @@ std::unique_ptr add_calendrical_months(column_view const& timestamp_colu } std::unique_ptr round_general(rounding_function round_kind, - datetime_component component, + rounding_frequency component, column_view const& column, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -531,223 
+531,31 @@ std::unique_ptr extract_quarter(column_view const& column, } // namespace detail -std::unique_ptr ceil_day(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::CEIL, - detail::datetime_component::DAY, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr ceil_hour(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::CEIL, - detail::datetime_component::HOUR, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr ceil_minute(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::CEIL, - detail::datetime_component::MINUTE, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr ceil_second(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::CEIL, - detail::datetime_component::SECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr ceil_millisecond(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::CEIL, - detail::datetime_component::MILLISECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr ceil_microsecond(column_view const& column, - rmm::mr::device_memory_resource* mr) +std::unique_ptr ceil_datetimes(column_view const& column, + rounding_frequency freq, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::CEIL, - detail::datetime_component::MICROSECOND, - column, - rmm::cuda_stream_default, - mr); + return detail::round_general( + detail::rounding_function::CEIL, freq, column, rmm::cuda_stream_default, mr); } -std::unique_ptr ceil_nanosecond(column_view 
const& column, +std::unique_ptr floor_datetimes(column_view const& column, + rounding_frequency freq, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::CEIL, - detail::datetime_component::NANOSECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr floor_day(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::FLOOR, - detail::datetime_component::DAY, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr floor_hour(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::FLOOR, - detail::datetime_component::HOUR, - column, - rmm::cuda_stream_default, - mr); + return detail::round_general( + detail::rounding_function::FLOOR, freq, column, rmm::cuda_stream_default, mr); } -std::unique_ptr floor_minute(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::FLOOR, - detail::datetime_component::MINUTE, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr floor_second(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::FLOOR, - detail::datetime_component::SECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr floor_millisecond(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::FLOOR, - detail::datetime_component::MILLISECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr floor_microsecond(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::FLOOR, - 
detail::datetime_component::MICROSECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr floor_nanosecond(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::FLOOR, - detail::datetime_component::NANOSECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr round_day(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::ROUND, - detail::datetime_component::DAY, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr round_hour(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::ROUND, - detail::datetime_component::HOUR, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr round_minute(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::ROUND, - detail::datetime_component::MINUTE, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr round_second(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::ROUND, - detail::datetime_component::SECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr round_millisecond(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::ROUND, - detail::datetime_component::MILLISECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr round_microsecond(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::ROUND, - detail::datetime_component::MICROSECOND, - column, - rmm::cuda_stream_default, 
- mr); -} - -std::unique_ptr round_nanosecond(column_view const& column, - rmm::mr::device_memory_resource* mr) +std::unique_ptr round_datetimes(column_view const& column, + rounding_frequency freq, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_function::ROUND, - detail::datetime_component::NANOSECOND, - column, - rmm::cuda_stream_default, - mr); + return detail::round_general( + detail::rounding_function::ROUND, freq, column, rmm::cuda_stream_default, mr); } std::unique_ptr extract_year(column_view const& column, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/groupby/sort/group_tdigest.cu b/cpp/src/groupby/sort/group_tdigest.cu index ecb18c09f9d..b7b45341ad2 100644 --- a/cpp/src/groupby/sort/group_tdigest.cu +++ b/cpp/src/groupby/sort/group_tdigest.cu @@ -101,10 +101,14 @@ struct merge_centroids { * nearest whole number <= it is floor(3.56) == 3. */ struct nearest_value_scalar_weights { - thrust::pair operator() __device__(double next_limit, size_type) + offset_type const* group_offsets; + + thrust::pair operator() __device__(double next_limit, size_type group_index) { - double const f = floor(next_limit); - return {f, max(0, static_cast(next_limit) - 1)}; + double const f = floor(next_limit); + auto const relative_weight_index = max(0, static_cast(next_limit) - 1); + auto const group_size = group_offsets[group_index + 1] - group_offsets[group_index]; + return {f, relative_weight_index < group_size ? relative_weight_index : group_size - 1}; } }; @@ -136,7 +140,8 @@ struct nearest_value_centroid_weights { group_cumulative_weights); return index == 0 ? 
thrust::pair{0, 0} - : thrust::pair{group_cumulative_weights[index - 1], index - 1}; + : thrust::pair{group_cumulative_weights[index - 1], + static_cast(index) - 1}; } }; @@ -187,6 +192,39 @@ struct cumulative_centroid_weight { } }; +// retrieve group info of scalar inputs by group index +struct scalar_group_info { + size_type const* group_valid_counts; + offset_type const* group_offsets; + + __device__ thrust::tuple operator()(size_type group_index) + { + return {static_cast(group_valid_counts[group_index]), + group_offsets[group_index + 1] - group_offsets[group_index], + group_offsets[group_index]}; + } +}; + +// retrieve group info of centroid inputs by group index +struct centroid_group_info { + double const* cumulative_weights; + offset_type const* outer_offsets; + offset_type const* inner_offsets; + + __device__ thrust::tuple operator()(size_type group_index) + { + // if there's no weights in this group of digests at all, return 0. + auto const group_start = inner_offsets[outer_offsets[group_index]]; + auto const group_end = inner_offsets[outer_offsets[group_index + 1]]; + auto const num_weights = group_end - group_start; + auto const last_weight_index = group_end - 1; + return num_weights == 0 + ? thrust::tuple{0, num_weights, group_start} + : thrust::tuple{ + cumulative_weights[last_weight_index], num_weights, group_start}; + } +}; + struct tdigest_min { __device__ double operator()(thrust::tuple const& t) { @@ -231,37 +269,40 @@ __device__ double scale_func_k1(double quantile, double delta_norm) * cluster sizes and total # of clusters, and once to compute the actual * weight limits per cluster. 
* - * @param delta_ tdigest compression level + * @param delta tdigest compression level * @param num_groups The number of input groups - * @param nearest_weight_ A functor which returns the nearest weight in the input + * @param nearest_weight A functor which returns the nearest weight in the input * stream that falls before our current cluster limit - * @param total_weight_ A functor which returns the expected total weight for - * the entire stream of input values for the specified group. + * @param group_info A functor which returns the info for the specified group (total + * weight, size and start offset) * @param group_cluster_wl Output. The set of cluster weight limits for each group. * @param group_num_clusters Output. The number of output clusters for each input group. * @param group_cluster_offsets Offsets per-group to the start of it's clusters * @param has_nulls Whether or not the input contains nulls * */ -template -__global__ void generate_cluster_limits_kernel(int delta_, + +template +__global__ void generate_cluster_limits_kernel(int delta, size_type num_groups, NearestWeightFunc nearest_weight, - TotalWeightIter total_weight_, + GroupInfo group_info, CumulativeWeight cumulative_weight, double* group_cluster_wl, size_type* group_num_clusters, offset_type const* group_cluster_offsets, bool has_nulls) { - int const tid = threadIdx.x + blockIdx.x * blockDim.x; + int const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const group_index = tid; if (group_index >= num_groups) { return; } // we will generate at most delta clusters. - double const delta = static_cast(delta_); - double const delta_norm = delta / (2.0 * M_PI); - double const total_weight = total_weight_[group_index]; + double const delta_norm = static_cast(delta) / (2.0 * M_PI); + double total_weight; + size_type group_size, group_start; + thrust::tie(total_weight, group_size, group_start) = group_info(group_index); // start at the correct place based on our cluster offset. 
double* cluster_wl = @@ -281,11 +322,11 @@ __global__ void generate_cluster_limits_kernel(int delta_, double cur_limit = 0.0; double cur_weight = 0.0; double next_limit = -1.0; - int last_inserted_index = -1; + int last_inserted_index = -1; // group-relative index into the input stream // compute the first cluster limit double nearest_w; - int nearest_w_index; + int nearest_w_index; // group-relative index into the input stream while (1) { cur_weight = next_limit < 0 ? 0 : max(cur_weight + 1, nearest_w); if (cur_weight >= total_weight) { break; } @@ -331,12 +372,19 @@ __global__ void generate_cluster_limits_kernel(int delta_, // during the reduction step to be trivial. // double adjusted_next_limit = next_limit; - if (nearest_w_index == last_inserted_index || last_inserted_index < 0) { - nearest_w_index = last_inserted_index + 1; - auto [r, i, adjusted] = cumulative_weight(nearest_w_index); - adjusted_next_limit = max(next_limit, adjusted); - (void)r; - (void)i; + if ((last_inserted_index < 0) || // if we haven't inserted anything yet + (nearest_w_index == + last_inserted_index)) { // if we land in the same bucket as the previous cap + + // force the value into this bucket + nearest_w_index = + (last_inserted_index == group_size - 1) ? last_inserted_index : last_inserted_index + 1; + + // the "adjusted" weight must be high enough so that this value will fall in the bucket. 
+ // NOTE: cumulative_weight expects an absolute index into the input value stream, not a + // group-relative index + [[maybe_unused]] auto [r, i, adjusted] = cumulative_weight(nearest_w_index + group_start); + adjusted_next_limit = max(next_limit, adjusted); } cluster_wl[group_num_clusters[group_index]] = adjusted_next_limit; last_inserted_index = nearest_w_index; @@ -360,8 +408,8 @@ __global__ void generate_cluster_limits_kernel(int delta_, * @param num_groups The number of input groups * @param nearest_weight A functor which returns the nearest weight in the input * stream that falls before our current cluster limit - * @param total_weight A functor which returns the expected total weight for - * the entire stream of input values for the specified group. + * @param group_info A functor which returns the info for the specified group (total weight, + * size and start offset) * @param has_nulls Whether or not the input data contains nulls * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param mr Device memory resource used to allocate the returned column's device memory @@ -369,12 +417,12 @@ __global__ void generate_cluster_limits_kernel(int delta_, * @returns A tuple containing the set of cluster weight limits for each group, a set of * list-style offsets indicating group sizes, and the total number of clusters */ -template +template std::tuple, std::unique_ptr, size_type> generate_group_cluster_info(int delta, size_type num_groups, NearestWeight nearest_weight, - TotalWeightIter total_weight, + GroupInfo group_info, CumulativeWeight cumulative_weight, bool has_nulls, rmm::cuda_stream_view stream, @@ -390,7 +438,7 @@ generate_group_cluster_info(int delta, delta, num_groups, nearest_weight, - total_weight, + group_info, cumulative_weight, nullptr, group_num_clusters.begin(), @@ -420,7 +468,7 @@ generate_group_cluster_info(int delta, delta, num_groups, nearest_weight, - total_weight, + group_info, cumulative_weight, group_cluster_wl.begin(), group_num_clusters.begin(), @@ -583,9 +631,8 @@ std::unique_ptr compute_tdigests(int delta, group_cluster_offsets = group_cluster_offsets->view().begin(), group_cumulative_weight] __device__(size_type value_index) -> size_type { // get group index, relative value index within the group and cumulative weight. 
- auto [group_index, relative_value_index, cumulative_weight] = + [[maybe_unused]] auto [group_index, relative_value_index, cumulative_weight] = group_cumulative_weight(value_index); - (void)relative_value_index; auto const num_clusters = group_cluster_offsets[group_index + 1] - group_cluster_offsets[group_index]; @@ -616,8 +663,9 @@ std::unique_ptr compute_tdigests(int delta, cudf::mutable_column_view weight_col(*centroid_weights); // reduce the centroids into the clusters - auto output = thrust::make_zip_iterator(thrust::make_tuple( + auto output = thrust::make_zip_iterator(thrust::make_tuple( mean_col.begin(), weight_col.begin(), thrust::make_discard_iterator())); + auto const num_values = std::distance(centroids_begin, centroids_end); thrust::reduce_by_key(rmm::exec_policy(stream), keys, @@ -640,12 +688,6 @@ std::unique_ptr compute_tdigests(int delta, mr); } -// retrieve total weight of scalar inputs by group index -struct scalar_total_weight { - size_type const* group_valid_counts; - __device__ double operator()(size_type group_index) { return group_valid_counts[group_index]; } -}; - // return the min/max value of scalar inputs by group index template struct get_scalar_minmax { @@ -678,17 +720,15 @@ struct typed_group_tdigest { rmm::mr::device_memory_resource* mr) { // first, generate cluster weight information for each input group - auto total_weight = cudf::detail::make_counting_transform_iterator( - 0, scalar_total_weight{group_valid_counts.begin()}); - auto [group_cluster_wl, group_cluster_offsets, total_clusters] = - generate_group_cluster_info(delta, - num_groups, - nearest_value_scalar_weights{}, - total_weight, - cumulative_scalar_weight{group_offsets, group_labels}, - col.null_count() > 0, - stream, - mr); + auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info( + delta, + num_groups, + nearest_value_scalar_weights{group_offsets.begin()}, + scalar_group_info{group_valid_counts.begin(), group_offsets.begin()}, + 
cumulative_scalar_weight{group_offsets, group_labels}, + col.null_count() > 0, + stream, + mr); // device column view. handy because the .element() function // automatically handles fixed-point conversions for us @@ -927,25 +967,15 @@ std::unique_ptr group_merge_tdigest(column_view const& input, auto const delta = max_centroids; // generate cluster info - auto total_group_weight = cudf::detail::make_counting_transform_iterator( - 0, - [outer_offsets = group_offsets.data(), - inner_offsets = tdigest_offsets.begin(), - cumulative_weights = - cumulative_weights->view().begin()] __device__(size_type group_index) -> double { - // if there's no weights in this group of digests at all, return 0. - auto const num_weights = - inner_offsets[outer_offsets[group_index + 1]] - inner_offsets[outer_offsets[group_index]]; - auto const last_weight_index = inner_offsets[outer_offsets[group_index + 1]] - 1; - return num_weights == 0 ? 0 : cumulative_weights[last_weight_index]; - }); auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info( delta, num_groups, nearest_value_centroid_weights{cumulative_weights->view().begin(), group_offsets.data(), tdigest_offsets.begin()}, - total_group_weight, + centroid_group_info{cumulative_weights->view().begin(), + group_offsets.data(), + tdigest_offsets.begin()}, cumulative_centroid_weight{ cumulative_weights->view().begin(), group_labels, diff --git a/cpp/src/io/avro/avro_gpu.h b/cpp/src/io/avro/avro_gpu.h index c87ac8afb13..3811132435b 100644 --- a/cpp/src/io/avro/avro_gpu.h +++ b/cpp/src/io/avro/avro_gpu.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -47,17 +47,17 @@ struct schemadesc_s { * @param[in] max_rows Maximum number of rows to load * @param[in] first_row Crop all rows below first_row * @param[in] min_row_size Minimum size in bytes of a row - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void DecodeAvroColumnData(cudf::device_span blocks, schemadesc_s* schema, cudf::device_span global_dictionary, uint8_t const* avro_data, uint32_t schema_len, - size_t max_rows = ~0, - size_t first_row = 0, - uint32_t min_row_size = 0, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); + size_t max_rows, + size_t first_row, + uint32_t min_row_size, + rmm::cuda_stream_view stream); } // namespace gpu } // namespace avro diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index d908e6c8ed5..0fa5680c5d2 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -159,8 +159,8 @@ rmm::device_buffer decompress_data(datasource& source, if (meta.codec == "deflate") { size_t uncompressed_data_size = 0; - auto inflate_in = hostdevice_vector(meta.block_list.size()); - auto inflate_out = hostdevice_vector(meta.block_list.size()); + auto inflate_in = hostdevice_vector(meta.block_list.size(), stream); + auto inflate_out = hostdevice_vector(meta.block_list.size(), stream); // Guess an initial maximum uncompressed block size uint32_t initial_blk_len = (meta.max_block_size * 2 + 0xfff) & ~0xfff; @@ -343,7 +343,7 @@ std::vector decode_data(metadata& meta, } // Build gpu schema - auto schema_desc = hostdevice_vector(meta.schema.size()); + auto schema_desc = hostdevice_vector(meta.schema.size(), stream); uint32_t min_row_data_size = 0; int skip_field_cnt = 0; diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index 338af72e4c9..dab8ce1afa5 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -780,22 +780,19 @@ __device__ void process_symbols(inflate_state_s* s, int t) do { volatile uint32_t* b = &s->x.u.symqueue[batch * batch_size]; - int batch_len, pos; - int32_t symt; - uint32_t lit_mask; - + int batch_len = 0; if (t == 0) { while ((batch_len = s->x.batch_len[batch]) == 0) {} - } else { - batch_len = 0; } batch_len = shuffle(batch_len); if (batch_len < 0) { break; } - symt = (t < batch_len) ? b[t] : 256; - lit_mask = ballot(symt >= 256); - pos = min((__ffs(lit_mask) - 1) & 0xff, 32); + auto const symt = (t < batch_len) ? 
b[t] : 256; + auto const lit_mask = ballot(symt >= 256); + auto pos = min((__ffs(lit_mask) - 1) & 0xff, 32); + if (t == 0) { s->x.batch_len[batch] = 0; } + if (t < pos && out + t < outend) { out[t] = symt; } out += pos; batch_len -= pos; @@ -825,7 +822,7 @@ __device__ void process_symbols(inflate_state_s* s, int t) } } batch = (batch + 1) & (batch_count - 1); - } while (1); + } while (true); if (t == 0) { s->out = out; } } diff --git a/cpp/src/io/comp/gpuinflate.h b/cpp/src/io/comp/gpuinflate.h index a37d282997e..3ca9c9eee10 100644 --- a/cpp/src/io/comp/gpuinflate.h +++ b/cpp/src/io/comp/gpuinflate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,26 +49,26 @@ struct gpu_inflate_status_s { * * @param[in] inputs List of input argument structures * @param[out] outputs List of output status structures - * @param[in] count Number of input/output structures, default 1 - * @param[in] parse_hdr Whether or not to parse GZIP header, default false - * @param[in] stream CUDA stream to use, default 0 + * @param[in] count Number of input/output structures + * @param[in] parse_hdr Whether or not to parse GZIP header + * @param[in] stream CUDA stream to use */ cudaError_t gpuinflate(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs, - int count = 1, - int parse_hdr = 0, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); + int count, + int parse_hdr, + rmm::cuda_stream_view stream); /** * @brief Interface for copying uncompressed byte blocks * * @param[in] inputs List of input argument structures - * @param[in] count Number of input structures, default 1 - * @param[in] stream CUDA stream to use, default 0 + * @param[in] count Number of input structures + * @param[in] stream CUDA stream to use */ cudaError_t gpu_copy_uncompressed_blocks(gpu_inflate_input_s* 
inputs, - int count = 1, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); + int count, + rmm::cuda_stream_view stream); /** * @brief Interface for decompressing Snappy-compressed data @@ -78,13 +78,13 @@ cudaError_t gpu_copy_uncompressed_blocks(gpu_inflate_input_s* inputs, * * @param[in] inputs List of input argument structures * @param[out] outputs List of output status structures - * @param[in] count Number of input/output structures, default 1 - * @param[in] stream CUDA stream to use, default 0 + * @param[in] count Number of input/output structures + * @param[in] stream CUDA stream to use */ cudaError_t gpu_unsnap(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs, - int count = 1, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); + int count, + rmm::cuda_stream_view stream); /** * @brief Computes the size of temporary memory for Brotli decompression @@ -105,15 +105,15 @@ size_t get_gpu_debrotli_scratch_size(int max_num_inputs = 0); * @param[out] outputs List of output status structures * @param[in] scratch Temporary memory for intermediate work * @param[in] scratch_size Size in bytes of the temporary memory - * @param[in] count Number of input/output structures, default 1 - * @param[in] stream CUDA stream to use, default 0 + * @param[in] count Number of input/output structures + * @param[in] stream CUDA stream to use */ cudaError_t gpu_debrotli(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs, void* scratch, size_t scratch_size, - int count = 1, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); + int count, + rmm::cuda_stream_view stream); /** * @brief Interface for compressing data with Snappy @@ -123,13 +123,13 @@ cudaError_t gpu_debrotli(gpu_inflate_input_s* inputs, * * @param[in] inputs List of input argument structures * @param[out] outputs List of output status structures - * @param[in] count Number of input/output structures, default 1 - * @param[in] stream CUDA stream to use, default 0 + * @param[in] 
count Number of input/output structures + * @param[in] stream CUDA stream to use */ cudaError_t gpu_snap(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs, - int count = 1, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); + int count, + rmm::cuda_stream_view stream); } // namespace io } // namespace cudf diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 7f032b6987c..0e50bb46232 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -237,7 +237,7 @@ std::pair, selected_rows_offsets> load_data_and_gather size_t buffer_size = std::min(max_chunk_bytes, data.size()); size_t max_blocks = std::max((buffer_size / cudf::io::csv::gpu::rowofs_block_bytes) + 1, 2); - hostdevice_vector row_ctx(max_blocks); + hostdevice_vector row_ctx(max_blocks, stream); size_t buffer_pos = std::min(range_begin - std::min(range_begin, sizeof(char)), data.size()); size_t pos = std::min(range_begin, data.size()); size_t header_rows = (reader_opts.get_header() >= 0) ? reader_opts.get_header() + 1 : 0; diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index b9b6fc6cf94..1b66df860a3 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -137,10 +137,9 @@ struct column_to_strings_fn { (cudf::is_timestamp()) || (cudf::is_duration())); } - explicit column_to_strings_fn( - csv_writer_options const& options, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + explicit column_to_strings_fn(csv_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) : options_(options), stream_(stream), mr_(mr) { } diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 8f8bb87d9e4..05bc25597c2 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -409,7 +409,7 @@ inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int p if (b > 0x7f) { b = bytestream_readbyte(bs, pos++); v = (v & 0x0fffffff) | (b << 28); - if (sizeof(T) > 4) { + if constexpr (sizeof(T) > 4) { uint32_t lo = v; uint64_t hi; v = b >> 4; @@ -650,13 +650,11 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, int t, bool has_buffered_values = false) { - uint32_t numvals, numruns; - int r, tr; - if (t == 0) { uint32_t maxpos = min(bs->len, bs->pos + (bytestream_buffer_size - 8u)); uint32_t lastpos = bs->pos; - numvals = numruns = 0; + auto numvals = 0; + auto numruns = 0; // Find the length and start location of each run while (numvals < maxvals) { uint32_t pos = lastpos; @@ -713,9 +711,9 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, } __syncthreads(); // Process the runs, 1 warp per run - numruns = rle->num_runs; - r = t >> 5; - tr = t & 0x1f; + auto const numruns = rle->num_runs; + auto const r = t >> 5; + auto const tr = t & 0x1f; for (uint32_t run = r; run < numruns; run += num_warps) { uint32_t base, pos, w, n; int mode; @@ -731,7 +729,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, w = 8 + (byte0 & 0x38); // 8 to 64 bits n = 3 + (byte0 & 7); // 3 to 10 values bytestream_readbe(bs, pos * 8, 
w, baseval); - if (sizeof(T) <= 4) { + if constexpr (sizeof(T) <= 4) { rle->baseval.u32[r] = baseval; } else { rle->baseval.u64[r] = baseval; @@ -746,7 +744,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, uint32_t byte3 = bytestream_readbyte(bs, pos++); uint32_t bw = 1 + (byte2 >> 5); // base value width, 1 to 8 bytes uint32_t pw = kRLEv2_W[byte2 & 0x1f]; // patch width, 1 to 64 bits - if (sizeof(T) <= 4) { + if constexpr (sizeof(T) <= 4) { uint32_t baseval, mask; bytestream_readbe(bs, pos * 8, bw * 8, baseval); mask = (1 << (bw * 8 - 1)) - 1; @@ -766,7 +764,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, int64_t delta; // Delta pos = decode_varint(bs, pos, baseval); - if (sizeof(T) <= 4) { + if constexpr (sizeof(T) <= 4) { rle->baseval.u32[r] = baseval; } else { rle->baseval.u64[r] = baseval; @@ -782,8 +780,9 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, pos = shuffle(pos); n = shuffle(n); w = shuffle(w); + __syncwarp(); // Not required, included to fix the racecheck warning for (uint32_t i = tr; i < n; i += 32) { - if (sizeof(T) <= 4) { + if constexpr (sizeof(T) <= 4) { if (mode == 0) { vals[base + i] = rle->baseval.u32[r]; } else if (mode == 1) { @@ -860,7 +859,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, if (j & i) vals[base + j] += vals[base + ((j & ~i) | (i - 1))]; } } - if (sizeof(T) <= 4) + if constexpr (sizeof(T) <= 4) baseval = rle->baseval.u32[r]; else baseval = rle->baseval.u64[r]; @@ -868,6 +867,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, vals[base + j] += baseval; } } + __syncwarp(); } __syncthreads(); return rle->num_vals; @@ -1679,11 +1679,12 @@ __global__ void __launch_bounds__(block_size) } } } - if (t == 0 && numvals + vals_skipped > 0 && numvals < s->top.data.max_vals) { - if (s->chunk.type_kind == TIMESTAMP) { - s->top.data.buffered_count = s->top.data.max_vals - numvals; + if (t == 0 && numvals + vals_skipped > 0) { + auto const max_vals 
= s->top.data.max_vals; + if (max_vals > numvals) { + if (s->chunk.type_kind == TIMESTAMP) { s->top.data.buffered_count = max_vals - numvals; } + s->top.data.max_vals = numvals; } - s->top.data.max_vals = numvals; } __syncthreads(); // Use the valid bits to compute non-null row positions until we get a full batch of values to diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 829e4877c44..660ec025d00 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -414,7 +414,7 @@ static __device__ uint32_t IntegerRLE( uint32_t mode1_w, mode2_w; typename std::make_unsigned::type vrange_mode1, vrange_mode2; block_vmin = static_cast(vmin); - if (sizeof(T) > 4) { + if constexpr (sizeof(T) > 4) { vrange_mode1 = (is_signed) ? max(zigzag(vmin), zigzag(vmax)) : vmax; vrange_mode2 = vmax - vmin; mode1_w = 8 - min(CountLeadingBytes64(vrange_mode1), 7); @@ -705,10 +705,7 @@ static __device__ void encode_null_mask(orcenc_state_s* s, } // reset shared state - if (t == 0) { - s->nnz = 0; - s->numvals = 0; - } + if (t == 0) { s->nnz = 0; } } /** diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh index 77c2bd4ffa0..e15144f9ea5 100644 --- a/cpp/src/io/orc/timezone.cuh +++ b/cpp/src/io/orc/timezone.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -107,10 +107,13 @@ inline __device__ int32_t get_gmt_offset(cudf::device_span ttimes return get_gmt_offset_impl(ttimes.begin(), offsets.begin(), ttimes.size(), ts); } -struct timezone_table { +class timezone_table { int32_t gmt_offset = 0; rmm::device_uvector ttimes; rmm::device_uvector offsets; + + public: + // Safe to use the default stream, device_uvectors will not change after they are created empty timezone_table() : ttimes{0, rmm::cuda_stream_default}, offsets{0, rmm::cuda_stream_default} {} timezone_table(int32_t gmt_offset, rmm::device_uvector&& ttimes, diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 80c22b09927..d989721334e 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -349,7 +349,7 @@ class writer::impl { private: rmm::mr::device_memory_resource* _mr = nullptr; // Cuda stream to be used - rmm::cuda_stream_view stream = rmm::cuda_stream_default; + rmm::cuda_stream_view stream; stripe_size_limits max_stripe_size; size_type row_index_stride; diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 53bb11c8b70..b77eeac68f5 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -479,7 +479,7 @@ struct dremel_data { dremel_data get_dremel_data(column_view h_col, rmm::device_uvector const& d_nullability, std::vector const& nullability, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); + rmm::cuda_stream_view stream); /** * @brief Launches kernel for initializing encoder page fragments diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 69d480edf85..fc4afe951db 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -301,7 +301,7 @@ struct metadata : public FileMetaData { } }; -class aggregate_metadata { +class aggregate_reader_metadata { std::vector const per_file_metadata; std::map const agg_keyval_map; size_type const num_rows; @@ -357,7 +357,7 @@ class aggregate_metadata { } public: - aggregate_metadata(std::vector> const& sources) + aggregate_reader_metadata(std::vector> const& sources) : per_file_metadata(metadatas_from_sources(sources)), agg_keyval_map(merge_keyval_metadata()), num_rows(calc_num_rows()), @@ -822,7 +822,7 @@ class aggregate_metadata { */ void generate_depth_remappings(std::map, std::vector>>& remap, int src_col_schema, - aggregate_metadata const& md) + aggregate_reader_metadata const& md) { // already generated for this level if (remap.find(src_col_schema) != remap.end()) { return; } @@ -1427,8 +1427,8 @@ void reader::impl::decode_page_data(hostdevice_vector& chu // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector // to store all per-chunk pointers to nested data/nullmask. 
`chunk_offsets[i]` will store the // offset into `chunk_nested_data`/`chunk_nested_valids` for the array of pointers for chunk `i` - auto chunk_nested_valids = hostdevice_vector(sum_max_depths); - auto chunk_nested_data = hostdevice_vector(sum_max_depths); + auto chunk_nested_valids = hostdevice_vector(sum_max_depths, stream); + auto chunk_nested_data = hostdevice_vector(sum_max_depths, stream); auto chunk_offsets = std::vector(); // Update chunks with pointers to column data. @@ -1587,7 +1587,7 @@ reader::impl::impl(std::vector>&& sources, : _mr(mr), _sources(std::move(sources)) { // Open and parse the source dataset metadata - _metadata = std::make_unique(_sources); + _metadata = std::make_unique(_sources); // Override output timestamp resolution if requested if (options.get_timestamp_type().id() != type_id::EMPTY) { diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 6564c4120a8..01fca5a8b50 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -46,7 +46,7 @@ using namespace cudf::io::parquet; using namespace cudf::io; // Forward declarations -class aggregate_metadata; +class aggregate_reader_metadata; /** * @brief Implementation for Parquet reader @@ -199,7 +199,7 @@ class reader::impl { private: rmm::mr::device_memory_resource* _mr = nullptr; std::vector> _sources; - std::unique_ptr _metadata; + std::unique_ptr _metadata; // input columns to be processed std::vector _input_columns; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index aceb3bfbec1..b302516ba39 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -79,12 +79,12 @@ parquet::Compression to_parquet_compression(compression_type compression) } // namespace -struct aggregate_metadata { - aggregate_metadata(std::vector const& partitions, - size_type num_columns, - std::vector schema, - statistics_freq stats_granularity, - std::vector> const& kv_md) +struct 
aggregate_writer_metadata { + aggregate_writer_metadata(std::vector const& partitions, + size_type num_columns, + std::vector schema, + statistics_freq stats_granularity, + std::vector> const& kv_md) : version(1), schema(std::move(schema)), files(partitions.size()) { for (size_t i = 0; i < partitions.size(); ++i) { @@ -447,25 +447,28 @@ struct leaf_schema_fn { std::enable_if_t(), void> operator()() { if (std::is_same_v) { - col_schema.type = Type::INT32; - col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.decimal_precision = 9; } else if (std::is_same_v) { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_decimal64; + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_decimal64; + col_schema.decimal_precision = 18; } else if (std::is_same_v) { - col_schema.type = Type::FIXED_LEN_BYTE_ARRAY; - col_schema.type_length = sizeof(__int128_t); - col_schema.stats_dtype = statistics_dtype::dtype_decimal128; + col_schema.type = Type::FIXED_LEN_BYTE_ARRAY; + col_schema.type_length = sizeof(__int128_t); + col_schema.stats_dtype = statistics_dtype::dtype_decimal128; + col_schema.decimal_precision = 38; } else { CUDF_FAIL("Unsupported fixed point type for parquet writer"); } col_schema.converted_type = ConvertedType::DECIMAL; col_schema.decimal_scale = -col->type().scale(); // parquet and cudf disagree about scale signs - CUDF_EXPECTS(col_meta.is_decimal_precision_set(), - "Precision must be specified for decimal columns"); - CUDF_EXPECTS(col_meta.get_decimal_precision() >= col_schema.decimal_scale, - "Precision must be equal to or greater than scale!"); - col_schema.decimal_precision = col_meta.get_decimal_precision(); + if (col_meta.is_decimal_precision_set()) { + CUDF_EXPECTS(col_meta.get_decimal_precision() >= col_schema.decimal_scale, + "Precision must be equal to or greater than scale!"); + 
col_schema.decimal_precision = col_meta.get_decimal_precision(); + } } template @@ -1226,7 +1229,7 @@ void writer::impl::write(table_view const& table, std::vector co std::vector this_table_schema(schema_tree.begin(), schema_tree.end()); if (!md) { - md = std::make_unique( + md = std::make_unique( partitions, num_columns, std::move(this_table_schema), stats_granularity_, kv_md); } else { // verify the user isn't passing mismatched tables diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 1cefb91c904..405ab0c2880 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,7 +45,7 @@ namespace detail { namespace parquet { // Forward internal classes struct parquet_column_view; -struct aggregate_metadata; +struct aggregate_writer_metadata; using namespace cudf::io::parquet; using namespace cudf::io; @@ -206,7 +206,7 @@ class writer::impl { // TODO : figure out if we want to keep this. It is currently unused. rmm::mr::device_memory_resource* _mr = nullptr; // Cuda stream to be used - rmm::cuda_stream_view stream = rmm::cuda_stream_default; + rmm::cuda_stream_view stream; size_t max_row_group_size = default_row_group_size_bytes; size_type max_row_group_rows = default_row_group_size_rows; @@ -214,7 +214,7 @@ class writer::impl { statistics_freq stats_granularity_ = statistics_freq::STATISTICS_NONE; bool int96_timestamps = false; // Overall file metadata. Filled in during the process and written during write_chunked_end() - std::unique_ptr md; + std::unique_ptr md; // File footer key-value metadata. 
Written during write_chunked_end() std::vector> kv_md; // optional user metadata diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 9300bd0f8b2..17df49009c2 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,11 +45,10 @@ namespace detail { * * @return `rmm::device_buffer` Device buffer allocation */ -inline rmm::device_buffer create_data( - data_type type, - size_type size, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +inline rmm::device_buffer create_data(data_type type, + size_type size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { std::size_t data_size = size_of(type) * size; @@ -75,9 +74,9 @@ struct column_buffer { // construct with a known size. allocates memory column_buffer(data_type _type, size_type _size, - bool _is_nullable = true, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + bool _is_nullable, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) : type(_type), is_nullable(_is_nullable) { create(_size, stream, mr); @@ -93,9 +92,7 @@ struct column_buffer { // instantiate a column of known type with a specified size. 
Allows deferred creation for // preprocessing steps such as in the Parquet reader - void create(size_type _size, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + void create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); auto data() { return _strings ? _strings->data() : _data.data(); } auto data_size() const { return _strings ? _strings->size() : _data.size(); } @@ -134,11 +131,10 @@ struct column_buffer { * * @return `std::unique_ptr` Column from the existing device data */ -std::unique_ptr make_column( - column_buffer& buffer, - column_name_info* schema_info = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr make_column(column_buffer& buffer, + column_name_info* schema_info, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Creates an equivalent empty column from an existing set of device memory buffers. @@ -155,11 +151,10 @@ std::unique_ptr make_column( * * @return `std::unique_ptr` Column from the existing device data */ -std::unique_ptr empty_like( - column_buffer& buffer, - column_name_info* schema_info = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr empty_like(column_buffer& buffer, + column_name_info* schema_info, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace io diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index a7f9aec7bb4..cbf914b8da6 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,15 +45,12 @@ class hostdevice_vector { return *this; } - explicit hostdevice_vector(size_t max_size, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) + explicit hostdevice_vector(size_t max_size, rmm::cuda_stream_view stream) : hostdevice_vector(max_size, max_size, stream) { } - explicit hostdevice_vector(size_t initial_size, - size_t max_size, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) + explicit hostdevice_vector(size_t initial_size, size_t max_size, rmm::cuda_stream_view stream) : num_elements(initial_size), max_elements(max_size) { if (max_elements != 0) { @@ -148,9 +145,7 @@ namespace detail { template class hostdevice_2dvector { public: - hostdevice_2dvector(size_t rows, - size_t columns, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) + hostdevice_2dvector(size_t rows, size_t columns, rmm::cuda_stream_view stream) : _size{rows, columns}, _data{rows * columns, stream} { } diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index c6f842c6c55..c259be2a285 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -36,22 +36,6 @@ namespace detail { namespace { -/** - * @brief Device functor to determine if a row is valid. - */ -class row_is_valid { - public: - row_is_valid(bitmask_type const* row_bitmask) : _row_bitmask{row_bitmask} {} - - __device__ __inline__ bool operator()(const size_type& i) const noexcept - { - return cudf::bit_is_set(_row_bitmask, i); - } - - private: - bitmask_type const* _row_bitmask; -}; - } // anonymous namespace std::pair, std::unique_ptr
> get_empty_joined_table( diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index 39a9f19c0ee..2fd0207a2c0 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -27,6 +27,22 @@ namespace cudf { namespace detail { +/** + * @brief Device functor to determine if a row is valid. + */ +class row_is_valid { + public: + row_is_valid(bitmask_type const* row_bitmask) : _row_bitmask{row_bitmask} {} + + __device__ __inline__ bool operator()(const size_type& i) const noexcept + { + return cudf::bit_is_set(_row_bitmask, i); + } + + private: + bitmask_type const* _row_bitmask; +}; + /** * @brief Device functor to determine if two pairs are identical. */ diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index e781472e025..5eb8ca2452e 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -14,11 +14,12 @@ * limitations under the License. */ -#include +#include #include #include #include +#include #include #include #include @@ -34,10 +35,28 @@ #include #include #include +#include + +#include namespace cudf { namespace detail { +namespace { +/** + * @brief Device functor to create a pair of hash value and index for a given row. + */ +struct make_pair_function { + __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept + { + // The value is irrelevant since we only ever use the hash map to check for + // membership of a particular row index. 
+ return cuco::make_pair(i, 0); + } +}; + +} // namespace + std::unique_ptr> left_semi_anti_join( join_kind const kind, cudf::table_view const& left_keys, @@ -71,67 +90,67 @@ std::unique_ptr> left_semi_anti_join( auto right_flattened_keys = right_flattened_tables.flattened_columns(); auto left_flattened_keys = left_flattened_tables.flattened_columns(); - // Only care about existence, so we'll use an unordered map (other joins need a multimap) - using hash_table_type = concurrent_unordered_map; + // Create hash table. + auto hash_table = cuco:: + static_map{ + compute_hash_table_size(right_num_rows), + std::numeric_limits::max(), + cudf::detail::JoinNoneValue, + hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; // Create hash table containing all keys found in right table - auto right_rows_d = table_device_view::create(right_flattened_keys, stream); - size_t const hash_table_size = compute_hash_table_size(right_num_rows); - auto const right_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(right_flattened_keys)}; - row_hash hash_build{right_nulls, *right_rows_d}; + auto right_rows_d = table_device_view::create(right_flattened_keys, stream); + auto const right_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(right_flattened_keys)}; + row_hash const hash_build{right_nulls, *right_rows_d}; row_equality equality_build{right_nulls, *right_rows_d, *right_rows_d, compare_nulls}; + make_pair_function pair_func_build{}; + + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); - // Going to join it with left table - auto left_rows_d = table_device_view::create(left_flattened_keys, stream); - auto const left_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(left_flattened_keys)}; - row_hash hash_probe{left_nulls, *left_rows_d}; - row_equality equality_probe{left_nulls, *left_rows_d, *right_rows_d, compare_nulls}; - - auto hash_table_ptr = hash_table_type::create(hash_table_size, - stream, - std::numeric_limits::max(), - 
std::numeric_limits::max(), - hash_build, - equality_build); - auto hash_table = *hash_table_ptr; - - // if compare_nulls == UNEQUAL, we can simply ignore any rows that - // contain a NULL in any column as they will never compare to equal. - auto const row_bitmask = (compare_nulls == null_equality::EQUAL) - ? rmm::device_buffer{} - : cudf::detail::bitmask_and(right_flattened_keys, stream).first; // skip rows that are null here. - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - right_num_rows, - [hash_table, row_bitmask = static_cast(row_bitmask.data())] __device__( - size_type idx) mutable { - if (!row_bitmask || cudf::bit_is_set(row_bitmask, idx)) { - hash_table.insert(thrust::make_pair(idx, true)); - } - }); + if ((compare_nulls == null_equality::EQUAL) or (not nullable(right_keys))) { + hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value()); + } else { + thrust::counting_iterator stencil(0); + auto const [row_bitmask, _] = cudf::detail::bitmask_and(right_flattened_keys, stream); + row_is_valid pred{static_cast(row_bitmask.data())}; + + // insert valid rows + hash_table.insert_if( + iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value()); + } - // // Now we have a hash table, we need to iterate over the rows of the left table // and check to see if they are contained in the hash table - // + auto left_rows_d = table_device_view::create(left_flattened_keys, stream); + auto const left_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(left_flattened_keys)}; + row_hash hash_probe{left_nulls, *left_rows_d}; + // Note: This equality comparator violates symmetry of equality and is + // therefore relying on the implementation detail of the order in which its + // operator is invoked. If cuco makes no promises about the order of + // invocation this seems a bit unsafe. 
+ row_equality equality_probe{left_nulls, *right_rows_d, *left_rows_d, compare_nulls}; // For semi join we want contains to be true, for anti join we want contains to be false bool const join_type_boolean = (kind == join_kind::LEFT_SEMI_JOIN); + auto hash_table_view = hash_table.get_device_view(); + auto gather_map = std::make_unique>(left_num_rows, stream, mr); // gather_map_end will be the end of valid data in gather_map auto gather_map_end = thrust::copy_if( rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(left_num_rows), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(left_num_rows), gather_map->begin(), - [hash_table, join_type_boolean, hash_probe, equality_probe] __device__(size_type idx) { - auto pos = hash_table.find(idx, hash_probe, equality_probe); - return (pos != hash_table.end()) == join_type_boolean; + [hash_table_view, join_type_boolean, hash_probe, equality_probe] __device__( + size_type const idx) { + // Look up this row. The hash function used here needs to map a (left) row index to the hash + // of the row, so it's a row hash. The equality check needs to verify + return hash_table_view.contains(idx, hash_probe, equality_probe) == join_type_boolean; }); auto join_size = thrust::distance(gather_map->begin(), gather_map_end); diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu new file mode 100644 index 00000000000..5007918441b --- /dev/null +++ b/cpp/src/lists/sequences.cu @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +namespace cudf::lists { +namespace detail { +namespace { +template +struct tabulator { + size_type const n_lists; + size_type const n_elements; + + T const* const starts; + T const* const steps; + offset_type const* const offsets; + + template + static std::enable_if_t(), T> __device__ multiply(U x, size_type times) + { + return x * static_cast(times); + } + + template + static std::enable_if_t(), T> __device__ multiply(U x, size_type times) + { + return T{x.count() * times}; + } + + auto __device__ operator()(size_type idx) const + { + auto const list_idx_end = thrust::upper_bound(thrust::seq, offsets, offsets + n_lists, idx); + auto const list_idx = thrust::distance(offsets, list_idx_end) - 1; + auto const list_offset = offsets[list_idx]; + auto const list_step = steps ? steps[list_idx] : T{1}; + return starts[list_idx] + multiply(list_step, idx - list_offset); + } +}; + +template +struct sequences_functor { + template + static std::unique_ptr invoke(Args&&...) 
+ { + CUDF_FAIL("Unsupported per-list sequence type-agg combination."); + } +}; + +struct sequences_dispatcher { + template + std::unique_ptr operator()(size_type n_lists, + size_type n_elements, + column_view const& starts, + std::optional const& steps, + offset_type const* offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + return sequences_functor::invoke(n_lists, n_elements, starts, steps, offsets, stream, mr); + } +}; + +template +static constexpr bool is_supported() +{ + return (cudf::is_numeric() && !cudf::is_boolean()) || cudf::is_duration(); +} + +template +struct sequences_functor()>> { + static std::unique_ptr invoke(size_type n_lists, + size_type n_elements, + column_view const& starts, + std::optional const& steps, + offset_type const* offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto result = + make_fixed_width_column(starts.type(), n_elements, mask_state::UNALLOCATED, stream, mr); + if (starts.is_empty()) { return result; } + + auto const result_begin = result->mutable_view().template begin(); + + // Use pointers instead of column_device_view to access start and step values should be enough. + // This is because we don't need to check for nulls and only support numeric and duration types. + auto const starts_begin = starts.template begin(); + auto const steps_begin = steps ? 
steps.value().template begin() : nullptr; + + auto const op = tabulator{n_lists, n_elements, starts_begin, steps_begin, offsets}; + thrust::tabulate(rmm::exec_policy(stream), result_begin, result_begin + n_elements, op); + + return result; + } +}; + +std::unique_ptr make_empty_lists_column(data_type child_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto offsets = make_empty_column(data_type(type_to_id())); + auto child = make_empty_column(child_type); + return make_lists_column( + 0, std::move(offsets), std::move(child), 0, rmm::device_buffer(0, stream, mr), stream, mr); +} + +std::unique_ptr sequences(column_view const& starts, + std::optional const& steps, + column_view const& sizes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(!starts.has_nulls() && !sizes.has_nulls(), + "starts and sizes input columns must not have nulls."); + CUDF_EXPECTS(starts.size() == sizes.size(), + "starts and sizes input columns must have the same number of rows."); + CUDF_EXPECTS(cudf::is_index_type(sizes.type()), "Input sizes column must be of integer types."); + + if (steps) { + auto const& steps_cv = steps.value(); + CUDF_EXPECTS(!steps_cv.has_nulls(), "steps input column must not have nulls."); + CUDF_EXPECTS(starts.size() == steps_cv.size(), + "starts and steps input columns must have the same number of rows."); + CUDF_EXPECTS(starts.type() == steps_cv.type(), + "starts and steps input columns must have the same type."); + } + + auto const n_lists = starts.size(); + if (n_lists == 0) { return make_empty_lists_column(starts.type(), stream, mr); } + + // Generate list offsets for the output. 
+ auto list_offsets = make_numeric_column( + data_type(type_to_id()), n_lists + 1, mask_state::UNALLOCATED, stream, mr); + auto const offsets_begin = list_offsets->mutable_view().template begin(); + auto const sizes_input_it = cudf::detail::indexalator_factory::make_input_iterator(sizes); + + thrust::exclusive_scan( + rmm::exec_policy(stream), sizes_input_it, sizes_input_it + n_lists + 1, offsets_begin); + auto const n_elements = cudf::detail::get_value(list_offsets->view(), n_lists, stream); + + auto child = type_dispatcher(starts.type(), + sequences_dispatcher{}, + n_lists, + n_elements, + starts, + steps, + offsets_begin, + stream, + mr); + + return make_lists_column(n_lists, + std::move(list_offsets), + std::move(child), + 0, + rmm::device_buffer(0, stream, mr), + stream, + mr); +} + +} // anonymous namespace + +std::unique_ptr sequences(column_view const& starts, + column_view const& sizes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return sequences(starts, std::nullopt, sizes, stream, mr); +} + +std::unique_ptr sequences(column_view const& starts, + column_view const& steps, + column_view const& sizes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return sequences(starts, std::optional{steps}, sizes, stream, mr); +} + +} // namespace detail + +std::unique_ptr sequences(column_view const& starts, + column_view const& sizes, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::sequences(starts, sizes, rmm::cuda_stream_default, mr); +} + +std::unique_ptr sequences(column_view const& starts, + column_view const& steps, + column_view const& sizes, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::sequences(starts, steps, sizes, rmm::cuda_stream_default, mr); +} + +} // namespace cudf::lists diff --git a/cpp/src/strings/count_matches.cuh b/cpp/src/strings/count_matches.cuh new file mode 100644 index 00000000000..c14142f4779 --- /dev/null +++ 
b/cpp/src/strings/count_matches.cuh @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { + +/** + * @brief Functor counts the total matches to the given regex in each string. + */ +template +struct count_matches_fn { + column_device_view const d_strings; + reprog_device prog; + + __device__ size_type operator()(size_type idx) + { + if (d_strings.is_null(idx)) { return 0; } + size_type count = 0; + auto const d_str = d_strings.element(idx); + + int32_t begin = 0; + int32_t end = d_str.length(); + while ((begin < end) && (prog.find(idx, d_str, begin, end) > 0)) { + ++count; + begin = end; + end = d_str.length(); + } + return count; + } +}; + +/** + * @brief Returns a column of regex match counts for each string in the given column. + * + * A null entry will result in a zero count for that output row. + * + * @param d_strings Device view of the input strings column. + * @param d_prog Regex instance to evaluate on each string. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ */ +std::unique_ptr count_matches( + column_device_view const& d_strings, + reprog_device const& d_prog, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + // Create output column + auto counts = make_numeric_column( + data_type{type_id::INT32}, d_strings.size() + 1, mask_state::UNALLOCATED, stream, mr); + auto d_counts = counts->mutable_view().data(); + + auto begin = thrust::make_counting_iterator(0); + auto end = thrust::make_counting_iterator(d_strings.size()); + + // Count matches + auto const regex_insts = d_prog.insts_counts(); + if (regex_insts <= RX_SMALL_INSTS) { + count_matches_fn fn{d_strings, d_prog}; + thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + count_matches_fn fn{d_strings, d_prog}; + thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn); + } else if (regex_insts <= RX_LARGE_INSTS) { + count_matches_fn fn{d_strings, d_prog}; + thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn); + } else { + count_matches_fn fn{d_strings, d_prog}; + thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn); + } + + return counts; +} + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/extract.cu b/cpp/src/strings/extract/extract.cu similarity index 100% rename from cpp/src/strings/extract.cu rename to cpp/src/strings/extract/extract.cu diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu new file mode 100644 index 00000000000..584741298c2 --- /dev/null +++ b/cpp/src/strings/extract/extract_all.cu @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace strings { +namespace detail { + +namespace { + +/** + * @brief Functor extracts matched string pointers for each input string. + * + * For regex match within a string, the specified groups are extracted into + * the `d_indices` output vector. + * The `d_offsets` are pre-computed to identify the location of where each + * string's output groups are to be written. + */ +template +struct extract_fn { + column_device_view const d_strings; + reprog_device d_prog; + offset_type const* d_offsets; + string_index_pair* d_indices; + + __device__ void operator()(size_type idx) + { + if (d_strings.is_null(idx)) { return; } + + auto const groups = d_prog.group_counts(); + auto d_output = d_indices + d_offsets[idx]; + size_type output_idx = 0; + + auto const d_str = d_strings.element(idx); + + int32_t begin = 0; + int32_t end = d_str.length(); + // match the regex + while ((begin < end) && d_prog.find(idx, d_str, begin, end) > 0) { + // extract each group into the output + for (auto group_idx = 0; group_idx < groups; ++group_idx) { + // result is an optional containing the bounds of the extracted string at group_idx + auto const extracted = d_prog.extract(idx, d_str, begin, end, group_idx); + + d_output[group_idx + output_idx] = [&] { + if (!extracted) { return string_index_pair{nullptr, 0}; } + auto const start_offset = d_str.byte_offset(extracted->first); + auto const end_offset 
= d_str.byte_offset(extracted->second); + return string_index_pair{d_str.data() + start_offset, end_offset - start_offset}; + }(); + } + // continue to next match + begin = end; + end = d_str.length(); + output_idx += groups; + } + } +}; +} // namespace + +/** + * @copydoc cudf::strings::extract_all + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr extract_all( + strings_column_view const& strings, + std::string const& pattern, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto const strings_count = strings.size(); + auto const d_strings = column_device_view::create(strings.parent(), stream); + + // Compile regex into device object. + auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + // The extract pattern should always include groups. + auto const groups = d_prog->group_counts(); + CUDF_EXPECTS(groups > 0, "extract_all requires group indicators in the regex pattern."); + + // Get the match counts for each string. + // This column will become the output lists child offsets column. + auto offsets = count_matches(*d_strings, *d_prog, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + + // Compute null output rows + auto [null_mask, null_count] = cudf::detail::valid_if( + d_offsets, d_offsets + strings_count, [] __device__(auto v) { return v > 0; }, stream, mr); + + // Return an empty lists column if there are no valid rows + if (strings_count == null_count) { + return make_lists_column(0, + make_empty_column(type_to_id()), + make_empty_column(type_id::STRING), + 0, + rmm::device_buffer{}, + stream, + mr); + } + + // Convert counts into offsets. + // Multiply each count by the number of groups. 
+ thrust::transform_exclusive_scan( + rmm::exec_policy(stream), + d_offsets, + d_offsets + strings_count + 1, + d_offsets, + [groups] __device__(auto v) { return v * groups; }, + offset_type{0}, + thrust::plus{}); + auto const total_groups = + cudf::detail::get_value(offsets->view(), strings_count, stream); + + // Create an indices vector with the total number of groups that will be extracted. + rmm::device_uvector indices(total_groups, stream); + auto d_indices = indices.data(); + auto begin = thrust::make_counting_iterator(0); + + // Call the extract functor to fill in the indices vector. + auto const regex_insts = d_prog->insts_counts(); + if (regex_insts <= RX_SMALL_INSTS) { + extract_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + extract_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } else if (regex_insts <= RX_LARGE_INSTS) { + extract_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } else { + extract_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } + + // Build the child strings column from the indices. + auto strings_output = make_strings_column(indices.begin(), indices.end(), stream, mr); + + // Build the lists column from the offsets and the strings. 
+ return make_lists_column(strings_count, + std::move(offsets), + std::move(strings_output), + null_count, + std::move(null_mask), + stream, + mr); +} + +} // namespace detail + +// external API + +std::unique_ptr extract_all(strings_column_view const& strings, + std::string const& pattern, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_all(strings, pattern, rmm::cuda_stream_default, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index 6de1044b492..193cd80d9a6 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -249,28 +249,6 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, } // namespace detail -tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, - std::string const& filename_hashed_vocabulary, - uint32_t max_sequence_length, - uint32_t stride, - bool do_lower_case, - bool do_truncate, - uint32_t max_rows_tensor, - rmm::mr::device_memory_resource* mr) -{ - auto vocab_table = load_vocabulary_file(filename_hashed_vocabulary, mr); - CUDF_FUNC_RANGE(); - return detail::subword_tokenize(strings, - *vocab_table, - max_sequence_length, - stride, - do_lower_case, - do_truncate, - max_rows_tensor, - rmm::cuda_stream_default, - mr); -} - tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, hashed_vocabulary const& vocabulary_table, uint32_t max_sequence_length, diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index f41ebacce53..5b8f3d1ce9f 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -175,16 +176,27 @@ std::unique_ptr rescale(column_view input, rmm::mr::device_memory_resource* mr) { using namespace numeric; + using RepType = device_storage_type_t; + auto const type = 
cudf::data_type{cudf::type_to_id(), scale}; if (input.type().scale() >= scale) { - auto const scalar = make_fixed_point_scalar(0, scale_type{scale}, rmm::cuda_stream_default); - auto const type = cudf::data_type{cudf::type_to_id(), scale}; + auto const scalar = make_fixed_point_scalar(0, scale_type{scale}, stream); return detail::binary_operation(input, *scalar, binary_operator::ADD, type, stream, mr); } else { auto const diff = input.type().scale() - scale; - auto const scalar = - make_fixed_point_scalar(std::pow(10, -diff), scale_type{diff}, rmm::cuda_stream_default); - auto const type = cudf::data_type{cudf::type_to_id(), scale}; + // The value of fixed point scalar will overflow if the scale difference is larger than the + // max digits of underlying integral type. Under this condition, the output values can be + // nothing other than zero value. Therefore, we simply return a zero column. + if (-diff > cuda::std::numeric_limits::digits10) { + auto const scalar = make_fixed_point_scalar(0, scale_type{scale}, stream); + auto output_column = make_column_from_scalar(*scalar, input.size(), stream, mr); + if (input.nullable()) { + auto const null_mask = copy_bitmask(input, stream, mr); + output_column->set_null_mask(std::move(null_mask)); + } + return output_column; + } + auto const scalar = make_fixed_point_scalar(std::pow(10, -diff), scale_type{diff}, stream); return detail::binary_operation(input, *scalar, binary_operator::DIV, type, stream, mr); } }; diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index c1c209b2413..d90260400a0 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -442,6 +442,7 @@ ConfigureTest( lists/drop_list_duplicates_tests.cpp lists/explode_tests.cpp lists/extract_tests.cpp + lists/sequences_tests.cpp lists/sort_lists_tests.cpp ) diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 62b8425704f..655fbf5679b 100644 --- 
a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -347,78 +347,6 @@ TEST_F(BasicDatetimeOpsTest, TestLastDayOfMonthWithDate) verbosity); } -TYPED_TEST(TypedDatetimeOpsTest, TestCeilDatetime) -{ - using T = TypeParam; - using namespace cudf::test; - using namespace cudf::datetime; - using namespace cuda::std::chrono; - - auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT - auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT - - auto input = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); - - auto host_val = to_host(input); - thrust::host_vector timestamps = host_val.first; - - thrust::host_vector ceiled_day(timestamps.size()); - thrust::transform(timestamps.begin(), timestamps.end(), ceiled_day.begin(), [](auto i) { - return time_point_cast(ceil(i)); - }); - auto expected_day = - fixed_width_column_wrapper(ceiled_day.begin(), ceiled_day.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_day(input), expected_day); - - thrust::host_vector ceiled_hour(timestamps.size()); - thrust::transform(timestamps.begin(), timestamps.end(), ceiled_hour.begin(), [](auto i) { - return time_point_cast(ceil(i)); - }); - auto expected_hour = fixed_width_column_wrapper(ceiled_hour.begin(), - ceiled_hour.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_hour(input), expected_hour); - - std::vector ceiled_minute(timestamps.size()); - std::transform(timestamps.begin(), timestamps.end(), ceiled_minute.begin(), [](auto i) { - return time_point_cast(ceil(i)); - }); - auto expected_minute = fixed_width_column_wrapper( - ceiled_minute.begin(), ceiled_minute.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_minute(input), expected_minute); - - std::vector ceiled_second(timestamps.size()); - std::transform(timestamps.begin(), timestamps.end(), ceiled_second.begin(), [](auto i) { - return time_point_cast(ceil(i)); - }); - auto expected_second = fixed_width_column_wrapper( - 
ceiled_second.begin(), ceiled_second.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_second(input), expected_second); - - std::vector ceiled_millisecond(timestamps.size()); - std::transform(timestamps.begin(), timestamps.end(), ceiled_millisecond.begin(), [](auto i) { - return time_point_cast(ceil(i)); - }); - auto expected_millisecond = fixed_width_column_wrapper( - ceiled_millisecond.begin(), ceiled_millisecond.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_millisecond(input), expected_millisecond); - - std::vector ceiled_microsecond(timestamps.size()); - std::transform(timestamps.begin(), timestamps.end(), ceiled_microsecond.begin(), [](auto i) { - return time_point_cast(ceil(i)); - }); - auto expected_microsecond = fixed_width_column_wrapper( - ceiled_microsecond.begin(), ceiled_microsecond.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_microsecond(input), expected_microsecond); - - std::vector ceiled_nanosecond(timestamps.size()); - std::transform(timestamps.begin(), timestamps.end(), ceiled_nanosecond.begin(), [](auto i) { - return time_point_cast(ceil(i)); - }); - auto expected_nanosecond = fixed_width_column_wrapper( - ceiled_nanosecond.begin(), ceiled_nanosecond.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_nanosecond(input), expected_nanosecond); -} - TEST_F(BasicDatetimeOpsTest, TestDayOfYearWithDate) { using namespace cudf::test; @@ -841,7 +769,7 @@ TEST_F(BasicDatetimeOpsTest, TestQuarter) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_quarter(timestamps_s), quarter); } -TYPED_TEST(TypedDatetimeOpsTest, TestFloorDatetime) +TYPED_TEST(TypedDatetimeOpsTest, TestCeilDatetime) { using T = TypeParam; using namespace cudf::test; @@ -851,10 +779,85 @@ TYPED_TEST(TypedDatetimeOpsTest, TestFloorDatetime) auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT - auto input = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); + auto const input = + 
generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); + auto const timestamps = to_host(input).first; + + std::vector ceiled_day(timestamps.size()); + thrust::transform(timestamps.begin(), timestamps.end(), ceiled_day.begin(), [](auto i) { + return time_point_cast(ceil(i)); + }); + auto expected_day = + fixed_width_column_wrapper(ceiled_day.begin(), ceiled_day.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_datetimes(input, rounding_frequency::DAY), expected_day); + + std::vector ceiled_hour(timestamps.size()); + thrust::transform(timestamps.begin(), timestamps.end(), ceiled_hour.begin(), [](auto i) { + return time_point_cast(ceil(i)); + }); + auto expected_hour = fixed_width_column_wrapper(ceiled_hour.begin(), + ceiled_hour.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_datetimes(input, rounding_frequency::HOUR), expected_hour); + + std::vector ceiled_minute(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), ceiled_minute.begin(), [](auto i) { + return time_point_cast(ceil(i)); + }); + auto expected_minute = fixed_width_column_wrapper( + ceiled_minute.begin(), ceiled_minute.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_datetimes(input, rounding_frequency::MINUTE), + expected_minute); + + std::vector ceiled_second(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), ceiled_second.begin(), [](auto i) { + return time_point_cast(ceil(i)); + }); + auto expected_second = fixed_width_column_wrapper( + ceiled_second.begin(), ceiled_second.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_datetimes(input, rounding_frequency::SECOND), + expected_second); + + std::vector ceiled_millisecond(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), ceiled_millisecond.begin(), [](auto i) { + return time_point_cast(ceil(i)); + }); + auto expected_millisecond = fixed_width_column_wrapper( + ceiled_millisecond.begin(), ceiled_millisecond.end()); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_datetimes(input, rounding_frequency::MILLISECOND), + expected_millisecond); + + std::vector ceiled_microsecond(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), ceiled_microsecond.begin(), [](auto i) { + return time_point_cast(ceil(i)); + }); + auto expected_microsecond = fixed_width_column_wrapper( + ceiled_microsecond.begin(), ceiled_microsecond.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_datetimes(input, rounding_frequency::MICROSECOND), + expected_microsecond); + + std::vector ceiled_nanosecond(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), ceiled_nanosecond.begin(), [](auto i) { + return time_point_cast(ceil(i)); + }); + auto expected_nanosecond = fixed_width_column_wrapper( + ceiled_nanosecond.begin(), ceiled_nanosecond.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_datetimes(input, rounding_frequency::NANOSECOND), + expected_nanosecond); +} + +TYPED_TEST(TypedDatetimeOpsTest, TestFloorDatetime) +{ + using T = TypeParam; + using namespace cudf::test; + using namespace cudf::datetime; + using namespace cuda::std::chrono; - auto host_val = to_host(input); - thrust::host_vector timestamps = host_val.first; + auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT + auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT + + auto const input = + generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); + auto const timestamps = to_host(input).first; std::vector floored_day(timestamps.size()); std::transform(timestamps.begin(), timestamps.end(), floored_day.begin(), [](auto i) { @@ -862,7 +865,7 @@ TYPED_TEST(TypedDatetimeOpsTest, TestFloorDatetime) }); auto expected_day = fixed_width_column_wrapper(floored_day.begin(), floored_day.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_day(input), expected_day); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_datetimes(input, rounding_frequency::DAY), expected_day); std::vector 
floored_hour(timestamps.size()); std::transform(timestamps.begin(), timestamps.end(), floored_hour.begin(), [](auto i) { @@ -870,7 +873,7 @@ TYPED_TEST(TypedDatetimeOpsTest, TestFloorDatetime) }); auto expected_hour = fixed_width_column_wrapper( floored_hour.begin(), floored_hour.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_hour(input), expected_hour); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_datetimes(input, rounding_frequency::HOUR), expected_hour); std::vector floored_minute(timestamps.size()); std::transform(timestamps.begin(), timestamps.end(), floored_minute.begin(), [](auto i) { @@ -878,7 +881,8 @@ TYPED_TEST(TypedDatetimeOpsTest, TestFloorDatetime) }); auto expected_minute = fixed_width_column_wrapper( floored_minute.begin(), floored_minute.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_minute(input), expected_minute); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_datetimes(input, rounding_frequency::MINUTE), + expected_minute); std::vector floored_second(timestamps.size()); std::transform(timestamps.begin(), timestamps.end(), floored_second.begin(), [](auto i) { @@ -886,7 +890,8 @@ TYPED_TEST(TypedDatetimeOpsTest, TestFloorDatetime) }); auto expected_second = fixed_width_column_wrapper( floored_second.begin(), floored_second.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_second(input), expected_second); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_datetimes(input, rounding_frequency::SECOND), + expected_second); std::vector floored_millisecond(timestamps.size()); std::transform(timestamps.begin(), timestamps.end(), floored_millisecond.begin(), [](auto i) { @@ -894,7 +899,8 @@ TYPED_TEST(TypedDatetimeOpsTest, TestFloorDatetime) }); auto expected_millisecond = fixed_width_column_wrapper( floored_millisecond.begin(), floored_millisecond.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_millisecond(input), expected_millisecond); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_datetimes(input, rounding_frequency::MILLISECOND), + expected_millisecond); std::vector 
floored_microsecond(timestamps.size()); std::transform(timestamps.begin(), timestamps.end(), floored_microsecond.begin(), [](auto i) { @@ -902,7 +908,8 @@ TYPED_TEST(TypedDatetimeOpsTest, TestFloorDatetime) }); auto expected_microsecond = fixed_width_column_wrapper( floored_microsecond.begin(), floored_microsecond.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_second(input), expected_microsecond); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_datetimes(input, rounding_frequency::MICROSECOND), + expected_microsecond); std::vector floored_nanosecond(timestamps.size()); std::transform(timestamps.begin(), timestamps.end(), floored_nanosecond.begin(), [](auto i) { @@ -910,7 +917,8 @@ TYPED_TEST(TypedDatetimeOpsTest, TestFloorDatetime) }); auto expected_nanosecond = fixed_width_column_wrapper( floored_nanosecond.begin(), floored_nanosecond.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_millisecond(input), expected_nanosecond); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_datetimes(input, rounding_frequency::NANOSECOND), + expected_nanosecond); } TYPED_TEST(TypedDatetimeOpsTest, TestRoundDatetime) @@ -923,10 +931,9 @@ TYPED_TEST(TypedDatetimeOpsTest, TestRoundDatetime) auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT - auto input = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); - - auto host_val = to_host(input); - auto timestamps = host_val.first; + auto const input = + generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); + auto const timestamps = to_host(input).first; std::vector rounded_day(timestamps.size()); std::transform(timestamps.begin(), timestamps.end(), rounded_day.begin(), [](auto i) { @@ -934,7 +941,7 @@ TYPED_TEST(TypedDatetimeOpsTest, TestRoundDatetime) }); auto expected_day = fixed_width_column_wrapper(rounded_day.begin(), rounded_day.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_day(input), expected_day); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_datetimes(input, rounding_frequency::DAY), expected_day); std::vector rounded_hour(timestamps.size()); std::transform(timestamps.begin(), timestamps.end(), rounded_hour.begin(), [](auto i) { @@ -942,7 +949,7 @@ TYPED_TEST(TypedDatetimeOpsTest, TestRoundDatetime) }); auto expected_hour = fixed_width_column_wrapper( rounded_hour.begin(), rounded_hour.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_hour(input), expected_hour); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_datetimes(input, rounding_frequency::HOUR), expected_hour); std::vector rounded_minute(timestamps.size()); std::transform(timestamps.begin(), timestamps.end(), rounded_minute.begin(), [](auto i) { @@ -950,7 +957,8 @@ TYPED_TEST(TypedDatetimeOpsTest, TestRoundDatetime) }); auto expected_minute = fixed_width_column_wrapper( rounded_minute.begin(), rounded_minute.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_minute(input), expected_minute); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_datetimes(input, rounding_frequency::MINUTE), + expected_minute); std::vector rounded_second(timestamps.size()); std::transform(timestamps.begin(), timestamps.end(), rounded_second.begin(), [](auto i) { @@ -958,7 +966,8 @@ TYPED_TEST(TypedDatetimeOpsTest, TestRoundDatetime) }); auto expected_second = fixed_width_column_wrapper( rounded_second.begin(), rounded_second.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_second(input), expected_second); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_datetimes(input, rounding_frequency::SECOND), + expected_second); std::vector rounded_millisecond(timestamps.size()); std::transform(timestamps.begin(), timestamps.end(), rounded_millisecond.begin(), [](auto i) { @@ -966,7 +975,8 @@ TYPED_TEST(TypedDatetimeOpsTest, TestRoundDatetime) }); auto expected_millisecond = fixed_width_column_wrapper( rounded_millisecond.begin(), rounded_millisecond.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_millisecond(input), expected_millisecond); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_datetimes(input, rounding_frequency::MILLISECOND), + expected_millisecond); std::vector rounded_microsecond(timestamps.size()); std::transform(timestamps.begin(), timestamps.end(), rounded_microsecond.begin(), [](auto i) { @@ -974,7 +984,8 @@ TYPED_TEST(TypedDatetimeOpsTest, TestRoundDatetime) }); auto expected_microsecond = fixed_width_column_wrapper( rounded_microsecond.begin(), rounded_microsecond.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_microsecond(input), expected_microsecond); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_datetimes(input, rounding_frequency::MICROSECOND), + expected_microsecond); std::vector rounded_nanosecond(timestamps.size()); std::transform(timestamps.begin(), timestamps.end(), rounded_nanosecond.begin(), [](auto i) { @@ -982,7 +993,8 @@ TYPED_TEST(TypedDatetimeOpsTest, TestRoundDatetime) }); auto expected_nanosecond = fixed_width_column_wrapper( rounded_nanosecond.begin(), rounded_nanosecond.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_nanosecond(input), expected_nanosecond); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_datetimes(input, rounding_frequency::NANOSECOND), + expected_nanosecond); } CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/comp/decomp_test.cpp b/cpp/tests/io/comp/decomp_test.cpp index 8247ced4629..dd00b201df9 100644 --- a/cpp/tests/io/comp/decomp_test.cpp +++ b/cpp/tests/io/comp/decomp_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -97,7 +97,7 @@ struct GzipDecompressTest : public DecompressTest { cudaError_t dispatch(cudf::io::gpu_inflate_input_s* d_inf_args, cudf::io::gpu_inflate_status_s* d_inf_stat) { - return cudf::io::gpuinflate(d_inf_args, d_inf_stat, 1, 1); + return cudf::io::gpuinflate(d_inf_args, d_inf_stat, 1, 1, rmm::cuda_stream_default); } }; @@ -108,7 +108,7 @@ struct SnappyDecompressTest : public DecompressTest { cudaError_t dispatch(cudf::io::gpu_inflate_input_s* d_inf_args, cudf::io::gpu_inflate_status_s* d_inf_stat) { - return cudf::io::gpu_unsnap(d_inf_args, d_inf_stat, 1); + return cudf::io::gpu_unsnap(d_inf_args, d_inf_stat, 1, rmm::cuda_stream_default); } }; @@ -122,7 +122,8 @@ struct BrotliDecompressTest : public DecompressTest { rmm::device_buffer d_scratch{cudf::io::get_gpu_debrotli_scratch_size(1), rmm::cuda_stream_default}; - return cudf::io::gpu_debrotli(d_inf_args, d_inf_stat, d_scratch.data(), d_scratch.size(), 1); + return cudf::io::gpu_debrotli( + d_inf_args, d_inf_stat, d_scratch.data(), d_scratch.size(), 1, rmm::cuda_stream_default); } }; diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 75ff39cbe70..9c656abb666 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -2021,9 +2021,6 @@ TEST_F(ParquetWriterTest, DecimalWrite) cudf_io::parquet_writer_options args = cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, table); - // verify failure if no decimal precision given - EXPECT_THROW(cudf_io::write_parquet(args), cudf::logic_error); - cudf_io::table_input_metadata expected_metadata(table); // verify failure if too small a precision is given diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp index 5b38bafb122..ff4270058cd 100644 --- a/cpp/tests/join/semi_anti_join_tests.cpp +++ b/cpp/tests/join/semi_anti_join_tests.cpp @@ -39,6 +39,21 @@ using Table = cudf::table; struct JoinTest : public cudf::test::BaseFixture { }; +TEST_F(JoinTest, 
TestSimple) +{ + column_wrapper left_col0{0, 1, 2}; + column_wrapper right_col0{0, 1, 3}; + + auto left = cudf::table_view{{left_col0}}; + auto right = cudf::table_view{{right_col0}}; + + auto result = cudf::left_semi_join(left, right); + auto result_cv = cudf::column_view( + cudf::data_type{cudf::type_to_id()}, result->size(), result->data()); + column_wrapper expected{0, 1}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result_cv); +}; + std::pair, std::unique_ptr> get_saj_tables( std::vector const& left_is_human_nulls, std::vector const& right_is_human_nulls) { diff --git a/cpp/tests/lists/sequences_tests.cpp b/cpp/tests/lists/sequences_tests.cpp new file mode 100644 index 00000000000..2dafeaf5cea --- /dev/null +++ b/cpp/tests/lists/sequences_tests.cpp @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +using namespace cudf::test::iterators; + +namespace { +template +using ListsCol = cudf::test::lists_column_wrapper; + +template +using FWDCol = cudf::test::fixed_width_column_wrapper; + +using IntsCol = cudf::test::fixed_width_column_wrapper; +} // namespace + +/*-----------------------------------------------------------------------------------------------*/ +template +class NumericSequencesTypedTest : public cudf::test::BaseFixture { +}; +using NumericTypes = + cudf::test::Concat; +TYPED_TEST_SUITE(NumericSequencesTypedTest, NumericTypes); + +TYPED_TEST(NumericSequencesTypedTest, SimpleTestNoNull) +{ + using T = TypeParam; + + auto const starts = FWDCol{1, 2, 3}; + auto const sizes = IntsCol{5, 3, 4}; + + // Sequences with step == 1. + { + auto const expected = + ListsCol{ListsCol{1, 2, 3, 4, 5}, ListsCol{2, 3, 4}, ListsCol{3, 4, 5, 6}}; + auto const result = cudf::lists::sequences(starts, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Sequences with various steps. + { + auto const steps = FWDCol{1, 3, 2}; + auto const expected = + ListsCol{ListsCol{1, 2, 3, 4, 5}, ListsCol{2, 5, 8}, ListsCol{3, 5, 7, 9}}; + auto const result = cudf::lists::sequences(starts, steps, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + +TYPED_TEST(NumericSequencesTypedTest, ZeroSizesTest) +{ + using T = TypeParam; + + auto const starts = FWDCol{1, 2, 3}; + auto const sizes = IntsCol{0, 3, 0}; + + // Sequences with step == 1. + { + auto const expected = ListsCol{ListsCol{}, ListsCol{2, 3, 4}, ListsCol{}}; + auto const result = cudf::lists::sequences(starts, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Sequences with various steps. 
+ { + auto const steps = FWDCol{1, 3, 2}; + auto const expected = ListsCol{ListsCol{}, ListsCol{2, 5, 8}, ListsCol{}}; + auto const result = cudf::lists::sequences(starts, steps, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + +TYPED_TEST(NumericSequencesTypedTest, SlicedInputTestNoNulls) +{ + using T = TypeParam; + constexpr int32_t dont_care{123}; + + auto const starts_original = + FWDCol{dont_care, dont_care, dont_care, 1, 2, 3, 4, 5, dont_care, dont_care}; + auto const sizes_original = IntsCol{dont_care, 5, 3, 4, 1, 2, dont_care, dont_care}; + + auto const starts = cudf::slice(starts_original, {3, 8})[0]; + auto const sizes = cudf::slice(sizes_original, {1, 6})[0]; + + // Sequences with step == 1. + { + auto const expected = ListsCol{ListsCol{1, 2, 3, 4, 5}, + ListsCol{2, 3, 4}, + ListsCol{3, 4, 5, 6}, + ListsCol{4}, + ListsCol{5, 6} + + }; + auto const result = cudf::lists::sequences(starts, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Sequences with various steps. + { + auto const steps_original = FWDCol{dont_care, dont_care, 1, 3, 2, 2, 3, dont_care}; + auto const steps = cudf::slice(steps_original, {2, 7})[0]; + + auto const expected = ListsCol{ListsCol{1, 2, 3, 4, 5}, + ListsCol{2, 5, 8}, + ListsCol{3, 5, 7, 9}, + ListsCol{4}, + ListsCol{5, 8} + + }; + auto const result = cudf::lists::sequences(starts, steps, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + +/*-----------------------------------------------------------------------------------------------*/ +// Data generated using https://www.epochconverter.com/ +template +class DurationSequencesTypedTest : public cudf::test::BaseFixture { +}; +TYPED_TEST_SUITE(DurationSequencesTypedTest, cudf::test::DurationTypes); + +// Start time is 1638477473L - Thursday, December 2, 2021 8:37:53 PM. 
+constexpr int64_t start_time = 1638477473L; + +TYPED_TEST(DurationSequencesTypedTest, SequencesNoNull) +{ + using T = TypeParam; + + auto const starts = FWDCol{start_time, start_time, start_time}; + auto const sizes = IntsCol{1, 2, 3}; + + // Sequences with step == 1. + { + auto const expected_h = std::vector{start_time, start_time + 1L, start_time + 2L}; + auto const expected = + ListsCol{ListsCol{expected_h.begin(), expected_h.begin() + 1}, + ListsCol{expected_h.begin(), expected_h.begin() + 2}, + ListsCol{expected_h.begin(), expected_h.begin() + 3}}; + auto const result = cudf::lists::sequences(starts, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Sequences with various steps, including negative. + { + auto const steps = FWDCol{10L, -155L, -13L}; + auto const expected = ListsCol{ + ListsCol{start_time}, + ListsCol{start_time, start_time - 155L}, + ListsCol{start_time, start_time - 13L, start_time - 13L * 2L}}; + auto const result = cudf::lists::sequences(starts, steps, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + +/*-----------------------------------------------------------------------------------------------*/ +class NumericSequencesTest : public cudf::test::BaseFixture { +}; + +TEST_F(NumericSequencesTest, EmptyInput) +{ + auto const starts = IntsCol{}; + auto const sizes = IntsCol{}; + auto const steps = IntsCol{}; + auto const expected = ListsCol{}; + + // Sequences with step == 1. + { + auto const result = cudf::lists::sequences(starts, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Sequences with given steps. 
+ { + auto const result = cudf::lists::sequences(starts, steps, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + +TEST_F(NumericSequencesTest, InvalidSizesInput) +{ + auto const starts = IntsCol{}; + auto const steps = IntsCol{}; + auto const sizes = FWDCol{}; + + EXPECT_THROW(cudf::lists::sequences(starts, sizes), cudf::logic_error); + EXPECT_THROW(cudf::lists::sequences(starts, steps, sizes), cudf::logic_error); +} + +TEST_F(NumericSequencesTest, MismatchedColumnSizesInput) +{ + auto const starts = IntsCol{1, 2, 3}; + auto const steps = IntsCol{1, 2}; + auto const sizes = IntsCol{1, 2, 3, 4}; + + EXPECT_THROW(cudf::lists::sequences(starts, sizes), cudf::logic_error); + EXPECT_THROW(cudf::lists::sequences(starts, steps, sizes), cudf::logic_error); +} + +TEST_F(NumericSequencesTest, MismatchedColumnTypesInput) +{ + auto const starts = IntsCol{1, 2, 3}; + auto const steps = FWDCol{1, 2, 3}; + auto const sizes = IntsCol{1, 2, 3}; + + EXPECT_THROW(cudf::lists::sequences(starts, steps, sizes), cudf::logic_error); +} + +TEST_F(NumericSequencesTest, InputHasNulls) +{ + constexpr int32_t null{0}; + + { + auto const starts = IntsCol{{null, 2, 3}, null_at(0)}; + auto const sizes = IntsCol{1, 2, 3}; + EXPECT_THROW(cudf::lists::sequences(starts, sizes), cudf::logic_error); + } + + { + auto const starts = IntsCol{1, 2, 3}; + auto const sizes = IntsCol{{null, 2, 3}, null_at(0)}; + EXPECT_THROW(cudf::lists::sequences(starts, sizes), cudf::logic_error); + } + + { + auto const starts = IntsCol{1, 2, 3}; + auto const steps = IntsCol{{null, 2, 3}, null_at(0)}; + auto const sizes = IntsCol{1, 2, 3}; + EXPECT_THROW(cudf::lists::sequences(starts, steps, sizes), cudf::logic_error); + } +} diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 824bf7deb34..2bb1c6dac8e 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,17 @@ * limitations under the License. */ -#include -#include -#include +#include + #include #include #include #include -#include + +#include +#include +#include +#include #include @@ -169,6 +172,38 @@ TEST_F(StringsExtractTests, EmptyExtractTest) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, table_expected); } +TEST_F(StringsExtractTests, ExtractAllTest) +{ + std::vector h_input( + {"123 banana 7 eleven", "41 apple", "6 pear 0 pair", nullptr, "", "bees", "4 pare"}); + auto validity = + thrust::make_transform_iterator(h_input.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_input.begin(), h_input.end(), validity); + auto sv = cudf::strings_column_view(input); + + auto results = cudf::strings::extract_all(sv, "(\\d+) (\\w+)"); + + bool valids[] = {1, 1, 1, 0, 0, 0, 1}; + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{"123", "banana", "7", "eleven"}, + LCW{"41", "apple"}, + LCW{"6", "pear", "0", "pair"}, + LCW{}, + LCW{}, + LCW{}, + LCW{"4", "pare"}}, + valids); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + +TEST_F(StringsExtractTests, Errors) +{ + cudf::test::strings_column_wrapper input({"this column intentionally left blank"}); + auto sv = cudf::strings_column_view(input); + EXPECT_THROW(cudf::strings::extract(sv, "\\w+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::extract_all(sv, "\\w+"), cudf::logic_error); +} + TEST_F(StringsExtractTests, MediumRegex) { // This results in 95 regex instructions and falls in the 'medium' range. 
diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp index 65cc466fee7..521a082faa2 100644 --- a/cpp/tests/text/subword_tests.cpp +++ b/cpp/tests/text/subword_tests.cpp @@ -67,12 +67,13 @@ TEST(TextSubwordTest, Tokenize) cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt"); create_hashed_vocab(hash_file); + auto vocab = nvtext::load_vocabulary_file(hash_file); uint32_t max_sequence_length = 16; uint32_t stride = 16; auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings}, - hash_file, + *vocab, max_sequence_length, stride, true, // do_lower_case @@ -119,12 +120,13 @@ TEST(TextSubwordTest, TokenizeMultiRow) cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt"); create_hashed_vocab(hash_file); + auto vocab = nvtext::load_vocabulary_file(hash_file); uint32_t max_sequence_length = 8; uint32_t stride = 6; auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings}, - hash_file, + *vocab, max_sequence_length, stride, true, // do_lower_case @@ -148,12 +150,13 @@ TEST(TextSubwordTest, TokenizeMaxEqualsTokens) cudf::test::strings_column_wrapper strings({"This is a test."}); std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt"); create_hashed_vocab(hash_file); + auto vocab = nvtext::load_vocabulary_file(hash_file); uint32_t max_sequence_length = 5; // five tokens in strings; uint32_t stride = 5; // this should not effect the result auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings}, - hash_file, + *vocab, max_sequence_length, stride, true, // do_lower_case @@ -175,8 +178,10 @@ TEST(TextSubwordTest, ParameterErrors) cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt"); 
create_hashed_vocab(hash_file); + auto vocab = nvtext::load_vocabulary_file(hash_file); + EXPECT_THROW(nvtext::subword_tokenize(cudf::strings_column_view{strings}, - hash_file, + *vocab, 12, // max_sequence_length 13, // stride <= max_sequence_length true, // do_lower_case @@ -185,7 +190,7 @@ TEST(TextSubwordTest, ParameterErrors) cudf::logic_error); EXPECT_THROW(nvtext::subword_tokenize(cudf::strings_column_view{strings}, - hash_file, + *vocab, 5, 5, true, // do_lower_case @@ -199,8 +204,9 @@ TEST(TextSubwordTest, EmptyStrings) cudf::test::strings_column_wrapper strings; std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt"); create_hashed_vocab(hash_file); + auto vocab = nvtext::load_vocabulary_file(hash_file); auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings}, - hash_file, + *vocab, 16, 16, true, // do_lower_case @@ -217,8 +223,9 @@ TEST(TextSubwordTest, AllNullStrings) cudf::test::strings_column_wrapper strings({"", "", ""}, {0, 0, 0}); std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt"); create_hashed_vocab(hash_file); + auto vocab = nvtext::load_vocabulary_file(hash_file); auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings}, - hash_file, + *vocab, 16, 16, true, // do_lower_case diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp index db457623d8d..906380f5e87 100644 --- a/cpp/tests/unary/cast_tests.cpp +++ b/cpp/tests/unary/cast_tests.cpp @@ -1004,6 +1004,22 @@ TYPED_TEST(FixedPointTests, Decimal128ToDecimalXXWithLargerScaleAndNullMask) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } +TYPED_TEST(FixedPointTests, DecimalRescaleOverflowAndNullMask) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + auto const vec = std::vector{1729, 17290, 172900, 1729000}; + auto const scale = cuda::std::numeric_limits::digits10 + 1; + 
auto const input = fp_wrapper{vec.cbegin(), vec.cend(), {1, 0, 0, 1}, scale_type{0}}; + auto const expected = fp_wrapper{{0, 0, 0, 0}, {1, 0, 0, 1}, scale_type{scale}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(scale)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + TEST_F(FixedPointTestSingleType, Int32ToInt64Convert) { using namespace numeric; diff --git a/cpp/tests/utilities_tests/span_tests.cu b/cpp/tests/utilities_tests/span_tests.cu index a9a5151e7c3..044ac3e60f7 100644 --- a/cpp/tests/utilities_tests/span_tests.cu +++ b/cpp/tests/utilities_tests/span_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -248,9 +248,9 @@ class MdSpanTest : public cudf::test::BaseFixture { TEST(MdSpanTest, CanDetermineEmptiness) { - auto const vector = hostdevice_2dvector(1, 2); - auto const no_rows_vector = hostdevice_2dvector(0, 2); - auto const no_columns_vector = hostdevice_2dvector(1, 0); + auto const vector = hostdevice_2dvector(1, 2, rmm::cuda_stream_default); + auto const no_rows_vector = hostdevice_2dvector(0, 2, rmm::cuda_stream_default); + auto const no_columns_vector = hostdevice_2dvector(1, 0, rmm::cuda_stream_default); EXPECT_FALSE(host_2dspan{vector}.is_empty()); EXPECT_FALSE(device_2dspan{vector}.is_empty()); @@ -271,7 +271,7 @@ __global__ void readwrite_kernel(device_2dspan result) TEST(MdSpanTest, DeviceReadWrite) { - auto vector = hostdevice_2dvector(11, 23); + auto vector = hostdevice_2dvector(11, 23, rmm::cuda_stream_default); readwrite_kernel<<<1, 1>>>(vector); readwrite_kernel<<<1, 1>>>(vector); @@ -281,7 +281,7 @@ TEST(MdSpanTest, DeviceReadWrite) TEST(MdSpanTest, HostReadWrite) { - auto vector = hostdevice_2dvector(11, 23); + auto vector = hostdevice_2dvector(11, 23, rmm::cuda_stream_default); auto 
span = host_2dspan{vector}; span[5][6] = 5; if (span[5][6] == 5) { span[5][6] *= 6; } @@ -291,7 +291,7 @@ TEST(MdSpanTest, HostReadWrite) TEST(MdSpanTest, CanGetSize) { - auto const vector = hostdevice_2dvector(1, 2); + auto const vector = hostdevice_2dvector(1, 2, rmm::cuda_stream_default); EXPECT_EQ(host_2dspan{vector}.size(), vector.size()); EXPECT_EQ(device_2dspan{vector}.size(), vector.size()); @@ -299,7 +299,7 @@ TEST(MdSpanTest, CanGetSize) TEST(MdSpanTest, CanGetCount) { - auto const vector = hostdevice_2dvector(11, 23); + auto const vector = hostdevice_2dvector(11, 23, rmm::cuda_stream_default); EXPECT_EQ(host_2dspan{vector}.count(), 11ul * 23); EXPECT_EQ(device_2dspan{vector}.count(), 11ul * 23); diff --git a/docs/cudf/source/api_docs/dataframe.rst b/docs/cudf/source/api_docs/dataframe.rst index 94f88a40ea5..2de55553c3f 100644 --- a/docs/cudf/source/api_docs/dataframe.rst +++ b/docs/cudf/source/api_docs/dataframe.rst @@ -254,7 +254,6 @@ Serialization / IO / conversion DataFrame.from_arrow DataFrame.from_pandas DataFrame.from_records - DataFrame.hash_columns DataFrame.hash_values DataFrame.to_arrow DataFrame.to_dlpack diff --git a/java/ci/Dockerfile.centos7 b/java/ci/Dockerfile.centos7 index 2ee57bfaeab..c1d29468f65 100644 --- a/java/ci/Dockerfile.centos7 +++ b/java/ci/Dockerfile.centos7 @@ -28,7 +28,7 @@ FROM gpuci/cuda:$CUDA_VERSION-devel-centos7 ### Install basic requirements RUN yum install -y centos-release-scl RUN yum install -y devtoolset-9 epel-release -RUN yum install -y git zlib-devel maven tar wget patch +RUN yum install -y git zlib-devel maven tar wget patch ninja-build ## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins RUN mkdir /usr/local/rapids && mkdir /rapids && chmod 777 /usr/local/rapids && chmod 777 /rapids diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index a99b6900830..ac8b2584091 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -19,7 +19,6 @@ set -ex 
gcc --version -PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} SKIP_JAVA_TESTS=${SKIP_JAVA_TESTS:-true} BUILD_CPP_TESTS=${BUILD_CPP_TESTS:-OFF} ENABLE_CUDA_STATIC_RUNTIME=${ENABLE_CUDA_STATIC_RUNTIME:-ON} @@ -28,6 +27,7 @@ RMM_LOGGING_LEVEL=${RMM_LOGGING_LEVEL:-OFF} ENABLE_NVTX=${ENABLE_NVTX:-ON} ENABLE_GDS=${ENABLE_GDS:-OFF} OUT=${OUT:-out} +CMAKE_GENERATOR=${CMAKE_GENERATOR:-Ninja} SIGN_FILE=$1 #Set absolute path for OUT_PATH @@ -54,7 +54,9 @@ export LIBCUDF_KERNEL_CACHE_PATH=/rapids rm -rf "$WORKSPACE/cpp/build" mkdir -p "$WORKSPACE/cpp/build" cd "$WORKSPACE/cpp/build" -cmake .. -DUSE_NVTX=$ENABLE_NVTX \ +cmake .. -G"${CMAKE_GENERATOR}" \ + -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX \ + -DUSE_NVTX=$ENABLE_NVTX \ -DCUDF_USE_ARROW_STATIC=ON \ -DCUDF_ENABLE_ARROW_S3=OFF \ -DBUILD_TESTS=$BUILD_CPP_TESTS \ @@ -62,8 +64,12 @@ cmake .. -DUSE_NVTX=$ENABLE_NVTX \ -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL \ -DBUILD_SHARED_LIBS=OFF -make -j$PARALLEL_LEVEL -make install DESTDIR=$INSTALL_PREFIX +if [[ -z "${PARALLEL_LEVEL}" ]]; then + cmake --build . +else + cmake --build . --parallel $PARALLEL_LEVEL +fi +cmake --install . ###### Build cudf jar ###### BUILD_ARG="-Dmaven.repo.local=\"$WORKSPACE/.m2\"\ diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index c83fe6adca1..61981b34615 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -498,6 +498,42 @@ public static ColumnVector sequence(Scalar initialValue, int rows) { } return new ColumnVector(sequence(initialValue.getScalarHandle(), 0, rows)); } + + /** + * Create a list column in which each row is a sequence of values starting from a `start` value, + * incrementing by one, and its cardinality is specified by a `size` value. The `start` and `size` + * values used to generate each list is taken from the corresponding row of the input start and + * size columns. + * @param start first values in the result sequences + * @param size numbers of values in the result sequences + * @return the new ColumnVector. + */ + public static ColumnVector sequence(ColumnView start, ColumnView size) { + assert start.getNullCount() == 0 || size.getNullCount() == 0 : "starts and sizes input " + + "columns must not have nulls."; + return new ColumnVector(sequences(start.getNativeView(), size.getNativeView(), 0)); + } + + /** + * Create a list column in which each row is a sequence of values starting from a `start` value, + * incrementing by a `step` value, and its cardinality is specified by a `size` value. + * The values `start`, `step`, and `size` used to generate each list is taken from the + * corresponding row of the input starts, steps, and sizes columns. + * @param start first values in the result sequences + * @param size numbers of values in the result sequences + * @param step increment values for the result sequences. + * @return the new ColumnVector. 
+ */ + public static ColumnVector sequence(ColumnView start, ColumnView size, ColumnView step) { + assert start.getNullCount() == 0 || size.getNullCount() == 0 || step.getNullCount() == 0: + "start, size and step must not have nulls."; + assert step.getType() == start.getType() : "start and step input columns must" + + " have the same type."; + + return new ColumnVector(sequences(start.getNativeView(), size.getNativeView(), + step.getNativeView())); + } + /** * Create a new vector by concatenating multiple columns together. * Note that all columns must have the same type. @@ -789,6 +825,9 @@ public ColumnVector castTo(DType type) { private static native long sequence(long initialValue, long step, int rows); + private static native long sequences(long startHandle, long sizeHandle, long stepHandle) + throws CudfException; + private static native long fromArrow(int type, long col_length, long null_count, ByteBuffer data, ByteBuffer validity, ByteBuffer offsets) throws CudfException; @@ -868,24 +907,6 @@ private static native long stringConcatenationSepCol(long[] columnViews, // INTERNAL/NATIVE ACCESS ///////////////////////////////////////////////////////////////////////////// - /** - * Close all non-null buffers. Exceptions that occur during the process will - * be aggregated into a single exception thrown at the end. - */ - static void closeBuffers(AutoCloseable buffer) { - Throwable toThrow = null; - if (buffer != null) { - try { - buffer.close(); - } catch (Throwable t) { - toThrow = t; - } - } - if (toThrow != null) { - throw new RuntimeException(toThrow); - } - } - //////// // Native methods specific to cudf::column. These either take or create a cudf::column // instead of a cudf::column_view so they need to be used with caution. 
These should @@ -1075,13 +1096,17 @@ protected synchronized boolean cleanImpl(boolean logErrorIfNotClean) { if (!toClose.isEmpty()) { try { for (MemoryBuffer toCloseBuff : toClose) { - closeBuffers(toCloseBuff); - } - } catch (Throwable t) { - if (toThrow != null) { - toThrow.addSuppressed(t); - } else { - toThrow = t; + if (toCloseBuff != null) { + try { + toCloseBuff.close(); + } catch (Throwable t) { + if (toThrow != null) { + toThrow.addSuppressed(t); + } else { + toThrow = t; + } + } + } } } finally { toClose.clear(); diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java b/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java index dd07df16553..763ecc763a5 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java @@ -594,9 +594,15 @@ protected synchronized boolean cleanImpl(boolean logErrorIfNotClean) { boolean neededCleanup = false; if (data != null || valid != null || offsets != null) { try { - ColumnVector.closeBuffers(data); - ColumnVector.closeBuffers(offsets); - ColumnVector.closeBuffers(valid); + if (data != null) { + data.close(); + } + if (offsets != null) { + offsets.close(); + } + if (valid != null) { + valid.close(); + } } finally { // Always mark the resource as freed even if an exception is thrown. // We cannot know how far it progressed before the exception, and diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index 4748231d8b4..85a9eb7beb3 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,15 +18,35 @@ package ai.rapids.cudf; +import java.util.Collection; + /** * Options for reading in JSON encoded data. */ -public final class JSONOptions extends ColumnFilterOptions { +public final class JSONOptions extends ColumnFilterOptions { public static JSONOptions DEFAULT = new JSONOptions(builder()); + private final boolean dayFirst; + private final boolean lines; + private JSONOptions(Builder builder) { super(builder); + dayFirst = builder.dayFirst; + lines = builder.lines; + } + + public boolean isDayFirst() { + return dayFirst; + } + + public boolean isLines() { + return lines; + } + + @Override + String[] getIncludeColumnNames() { + throw new UnsupportedOperationException("JSON reader didn't support column prune"); } public static Builder builder() { @@ -34,6 +54,40 @@ public static Builder builder() { } public static final class Builder extends ColumnFilterOptions.Builder { + private boolean dayFirst = false; + private boolean lines = true; + + /** + * Whether to parse dates as DD/MM versus MM/DD + * @param dayFirst true: DD/MM, false, MM/DD + * @return + */ + public Builder withDayFirst(boolean dayFirst) { + this.dayFirst = dayFirst; + return this; + } + + /** + * Whether to read the file as a json object per line + * @param perLine true: per line, false: multi-line + * @return builder for chaining + */ + public Builder withLines(boolean perLine) { + assert perLine == true : "Cudf does not support multi-line"; + this.lines = perLine; + return this; + } + + @Override + public Builder includeColumn(String... 
names) { + throw new UnsupportedOperationException("JSON reader didn't support column prune"); + } + + @Override + public Builder includeColumn(Collection names) { + throw new UnsupportedOperationException("JSON reader didn't support column prune"); + } + public JSONOptions build() { return new JSONOptions(this); } diff --git a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java index 865a668156f..6eee935748e 100644 --- a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java +++ b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,9 @@ import java.util.Comparator; import java.util.Iterator; import java.util.Objects; -import java.util.PriorityQueue; +import java.util.Optional; +import java.util.SortedSet; +import java.util.TreeSet; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; @@ -43,19 +45,21 @@ public final class PinnedMemoryPool implements AutoCloseable { private static Future initFuture = null; private final long pinnedPoolBase; - private final PriorityQueue freeHeap = new PriorityQueue<>(new SortedBySize()); + private final SortedSet freeHeap = new TreeSet<>(new SortedByAddress()); private int numAllocatedSections = 0; private long availableBytes; private static class SortedBySize implements Comparator { @Override public int compare(MemorySection s0, MemorySection s1) { - // We want the largest ones first... 
- int ret = Long.compare(s1.size, s0.size); - if (ret == 0) { - ret = Long.compare(s0.baseAddress, s1.baseAddress); - } - return ret; + return Long.compare(s0.size, s1.size); + } + } + + private static class SortedByAddress implements Comparator { + @Override + public int compare(MemorySection s0, MemorySection s1) { + return Long.compare(s0.baseAddress, s1.baseAddress); } } @@ -162,6 +166,7 @@ private static void freeInternal(MemorySection section) { /** * Initialize the pool. + * * @param poolSize size of the pool to initialize. */ public static synchronized void initialize(long poolSize) { @@ -170,8 +175,9 @@ public static synchronized void initialize(long poolSize) { /** * Initialize the pool. + * * @param poolSize size of the pool to initialize. - * @param gpuId gpu id to set to get memory pool from, -1 means to use default + * @param gpuId gpu id to set to get memory pool from, -1 means to use default */ public static synchronized void initialize(long poolSize, int gpuId) { if (isInitialized()) { @@ -207,11 +213,12 @@ public static synchronized void shutdown() { /** * Factory method to create a pinned host memory buffer. + * * @param bytes size in bytes to allocate * @return newly created buffer or null if insufficient pinned memory */ public static HostMemoryBuffer tryAllocate(long bytes) { - HostMemoryBuffer result = null; + HostMemoryBuffer result = null; PinnedMemoryPool pool = getSingleton(); if (pool != null) { result = pool.tryAllocateInternal(bytes); @@ -222,6 +229,7 @@ public static HostMemoryBuffer tryAllocate(long bytes) { /** * Factory method to create a host buffer but preferably pointing to pinned memory. * It is not guaranteed that the returned buffer will be pointer to pinned memory. + * * @param bytes size in bytes to allocate * @return newly created buffer */ @@ -235,6 +243,7 @@ public static HostMemoryBuffer allocate(long bytes) { /** * Get the number of bytes free in the pinned memory pool. 
+ * * @return amount of free memory in bytes or 0 if the pool is not initialized */ public static long getAvailableBytes() { @@ -246,7 +255,7 @@ public static long getAvailableBytes() { } private PinnedMemoryPool(long poolSize, int gpuId) { - if (gpuId > -1 ) { + if (gpuId > -1) { // set the gpu device to use Cuda.setDevice(gpuId); Cuda.freeZero(); @@ -269,20 +278,28 @@ private synchronized HostMemoryBuffer tryAllocateInternal(long bytes) { } // Align the allocation long alignedBytes = ((bytes + ALIGNMENT - 1) / ALIGNMENT) * ALIGNMENT; - MemorySection largest = freeHeap.peek(); - if (largest.size < alignedBytes) { - log.debug("Insufficient pinned memory. {} needed, {} found", alignedBytes, largest.size); + Optional firstFit = freeHeap.stream() + .filter(section -> section.size >= alignedBytes) + .findFirst(); + if (!firstFit.isPresent()) { + if (log.isDebugEnabled()) { + MemorySection largest = freeHeap.stream() + .max(new SortedBySize()) + .orElse(new MemorySection(0, 0)); + log.debug("Insufficient pinned memory. 
{} needed, {} found", alignedBytes, largest.size); + } return null; } + MemorySection first = firstFit.get(); log.debug("Allocating {}/{} bytes pinned from {} FREE COUNT {} OUTSTANDING COUNT {}", - bytes, alignedBytes, largest, freeHeap.size(), numAllocatedSections); - freeHeap.remove(largest); + bytes, alignedBytes, first, freeHeap.size(), numAllocatedSections); + freeHeap.remove(first); MemorySection allocated; - if (largest.size == alignedBytes) { - allocated = largest; + if (first.size == alignedBytes) { + allocated = first; } else { - allocated = largest.splitOff(alignedBytes); - freeHeap.add(largest); + allocated = first.splitOff(alignedBytes); + freeHeap.add(first); } numAllocatedSections++; availableBytes -= allocated.size; @@ -293,25 +310,15 @@ private synchronized HostMemoryBuffer tryAllocateInternal(long bytes) { private synchronized void free(MemorySection section) { log.debug("Freeing {} with {} outstanding {}", section, freeHeap, numAllocatedSections); - // This looks inefficient, but in reality it will only walk through the heap about 2 times. - // Because we keep entries up to date, each new entry will at most combine with one above it - // and one below it. That will happen in a single pass through the heap. We do a second pass - // simply out of an abundance of caution. - // Adding it in will be a log(N) operation because it is a heap. 
availableBytes += section.size; - boolean anyReplaced; - do { - anyReplaced = false; - Iterator it = freeHeap.iterator(); - while(it.hasNext()) { - MemorySection current = it.next(); - if (section.canCombine(current)) { - it.remove(); - anyReplaced = true; - section.combineWith(current); - } + Iterator it = freeHeap.iterator(); + while(it.hasNext()) { + MemorySection current = it.next(); + if (section.canCombine(current)) { + it.remove(); + section.combineWith(current); } - } while(anyReplaced); + } freeHeap.add(section); numAllocatedSections--; log.debug("After freeing {} outstanding {}", freeHeap, numAllocatedSections); diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 00c98c4fef8..9014e69ee74 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -234,6 +234,11 @@ private static native long[] readCSV(String[] columnNames, byte comment, String[] nullValues, String[] trueValues, String[] falseValues) throws CudfException; + private static native long[] readJSON(String[] columnNames, + int[] dTypeIds, int[] dTypeScales, + String filePath, long address, long length, + boolean dayFirst, boolean lines) throws CudfException; + /** * Read in Parquet formatted data. * @param filterColumnNames name of the columns to read, or an empty array if we want to read @@ -797,6 +802,97 @@ public static Table readCSV(Schema schema, CSVOptions opts, HostMemoryBuffer buf opts.getFalseValues())); } + /** + * Read a JSON file using the default JSONOptions. + * @param schema the schema of the file. You may use Schema.INFERRED to infer the schema. + * @param path the local file to read. 
+ * @return the file parsed as a table on the GPU. + */ + public static Table readJSON(Schema schema, File path) { + return readJSON(schema, JSONOptions.DEFAULT, path); + } + + /** + * Read JSON formatted data using the default JSONOptions. + * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. + * @param buffer raw UTF8 formatted bytes. + * @return the data parsed as a table on the GPU. + */ + public static Table readJSON(Schema schema, byte[] buffer) { + return readJSON(schema, JSONOptions.DEFAULT, buffer, 0, buffer.length); + } + + /** + * Read JSON formatted data. + * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. + * @param opts various JSON parsing options. + * @param buffer raw UTF8 formatted bytes. + * @return the data parsed as a table on the GPU. + */ + public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer) { + return readJSON(schema, opts, buffer, 0, buffer.length); + } + + /** + * Read a JSON file. + * @param schema the schema of the file. You may use Schema.INFERRED to infer the schema. + * @param opts various JSON parsing options. + * @param path the local file to read. + * @return the file parsed as a table on the GPU. + */ + public static Table readJSON(Schema schema, JSONOptions opts, File path) { + return new Table( + readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), + path.getAbsolutePath(), + 0, 0, + opts.isDayFirst(), opts.isLines())); + } + + /** + * Read JSON formatted data. + * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. + * @param opts various JSON parsing options. + * @param buffer raw UTF8 formatted bytes. + * @param offset the starting offset into buffer. + * @param len the number of bytes to parse. + * @return the data parsed as a table on the GPU. 
+ */ + public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, + long len) { + if (len <= 0) { + len = buffer.length - offset; + } + assert len > 0; + assert len <= buffer.length - offset; + assert offset >= 0 && offset < buffer.length; + try (HostMemoryBuffer newBuf = HostMemoryBuffer.allocate(len)) { + newBuf.setBytes(0, buffer, offset, len); + return readJSON(schema, opts, newBuf, 0, len); + } + } + + /** + * Read JSON formatted data. + * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. + * @param opts various JSON parsing options. + * @param buffer raw UTF8 formatted bytes. + * @param offset the starting offset into buffer. + * @param len the number of bytes to parse. + * @return the data parsed as a table on the GPU. + */ + public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer buffer, + long offset, long len) { + if (len <= 0) { + len = buffer.length - offset; + } + assert len > 0; + assert len <= buffer.length - offset; + assert offset >= 0 && offset < buffer.length; + return new Table(readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), + null, buffer.getAddress() + offset, len, + opts.isDayFirst(), opts.isLines())); + } + /** * Read a Parquet file using the default ParquetOptions. * @param path the local file to read. diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index cfad89cb399..e61ab8444d1 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,28 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_sequence(JNIEnv *env, j CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_sequences(JNIEnv *env, jclass, + jlong j_start_handle, + jlong j_size_handle, + jlong j_step_handle) { + JNI_NULL_CHECK(env, j_start_handle, "start is null", 0); + JNI_NULL_CHECK(env, j_size_handle, "size is null", 0); + try { + cudf::jni::auto_set_device(env); + auto start = reinterpret_cast(j_start_handle); + auto size = reinterpret_cast(j_size_handle); + auto step = reinterpret_cast(j_step_handle); + std::unique_ptr col; + if (step) { + col = cudf::lists::sequences(*start, *step, *size); + } else { + col = cudf::lists::sequences(*start, *size); + } + return reinterpret_cast(col.release()); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow( JNIEnv *env, jclass, jint j_type, jlong j_col_length, jlong j_null_count, jobject j_data_obj, jobject j_validity_obj, jobject j_offsets_obj) { diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 0e6425ea7a2..b7bb6880731 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1294,6 +1295,120 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( CATCH_STD(env, NULL); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON( + JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, + jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) { + + bool read_buffer = true; + if (buffer == 0) { + JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL); + read_buffer = false; + } else if (inputfilepath != NULL) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "cannot pass in both a buffer and an inputfilepath", NULL); + } else if (buffer_length <= 0) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "An empty buffer is not supported", + NULL); + } + + try { + cudf::jni::auto_set_device(env); + cudf::jni::native_jstringArray n_col_names(env, col_names); + cudf::jni::native_jintArray n_types(env, j_types); + cudf::jni::native_jintArray n_scales(env, j_scales); + if (n_types.is_null() != n_scales.is_null()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match null", + NULL); + } + std::vector data_types; + if (!n_types.is_null()) { + if (n_types.size() != n_scales.size()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size", + NULL); + } + data_types.reserve(n_types.size()); + for (int index = 0; index < n_types.size(); index++) { + data_types.emplace_back(cudf::jni::make_data_type(n_types[index], n_scales[index])); + } + } + + cudf::jni::native_jstring filename(env, inputfilepath); + if (!read_buffer && filename.is_empty()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "inputfilepath can't be empty", + NULL); + } + + std::unique_ptr source; + if (read_buffer) { + source.reset(new cudf::io::source_info(reinterpret_cast(buffer), 
buffer_length)); + } else { + source.reset(new cudf::io::source_info(filename.get())); + } + + cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(*source) + .dayfirst(static_cast(day_first)) + .lines(static_cast(lines)); + + if (!n_col_names.is_null() && data_types.size() > 0) { + if (n_col_names.size() != n_types.size()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "types and column names must match size", NULL); + } + + std::map map; + + auto col_names_vec = n_col_names.as_cpp_vector(); + std::transform(col_names_vec.begin(), col_names_vec.end(), data_types.begin(), + std::inserter(map, map.end()), + [](std::string a, cudf::data_type b) { return std::make_pair(a, b); }); + opts.dtypes(map); + } else if (data_types.size() > 0) { + opts.dtypes(data_types); + } else { + // should infer the types + } + + cudf::io::table_with_metadata result = cudf::io::read_json(opts.build()); + + // there is no need to re-order columns when inferring schema + if (result.metadata.column_names.empty() || n_col_names.size() <= 0) { + return cudf::jni::convert_table_for_return(env, result.tbl); + } else { + // json reader will not return the correct column order, + // so we need to re-order the column of table according to table meta. + + // turn name and its index in table into map + std::map m; + for (size_t i = 0; i < result.metadata.column_names.size(); i++) { + m.insert(std::make_pair(result.metadata.column_names[i], i)); + } + + auto col_names_vec = n_col_names.as_cpp_vector(); + std::vector indices; + + bool match = true; + for (size_t i = 0; i < col_names_vec.size(); i++) { + if (m.find(col_names_vec[i]) == m.end()) { + match = false; + break; + } else { + indices.push_back(m.at(col_names_vec[i])); + } + } + + if (!match) { + // can't find some input column names in table meta, return what json reader reads. 
+ return cudf::jni::convert_table_for_return(env, result.tbl); + } else { + auto tbv = result.tbl->view().select(std::move(indices)); + auto table = std::make_unique(tbv); + return cudf::jni::convert_table_for_return(env, table); + } + } + } + CATCH_STD(env, NULL); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv *env, jclass, jobjectArray filter_col_names, jstring inputfilepath, diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 0771de9492d..8d4bbff1542 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1216,6 +1216,58 @@ void testSequenceOtherTypes() { }); } + @Test + void testSequencesInt() { + try (ColumnVector start = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5); + ColumnVector size = ColumnVector.fromBoxedInts(2, 3, 2, 0, 1); + ColumnVector step = ColumnVector.fromBoxedInts(2, -1, 1, 1, 0); + ColumnVector cv = ColumnVector.sequence(start, size, step); + ColumnVector cv1 = ColumnVector.sequence(start, size); + ColumnVector expectCv = ColumnVector.fromLists( + new ListType(true, new BasicType(false, DType.INT32)), + Arrays.asList(1, 3), + Arrays.asList(2, 1, 0), + Arrays.asList(3, 4), + Arrays.asList(), + Arrays.asList(5)); + ColumnVector expectCv1 = ColumnVector.fromLists( + new ListType(true, new BasicType(false, DType.INT32)), + Arrays.asList(1, 2), + Arrays.asList(2, 3, 4), + Arrays.asList(3, 4), + Arrays.asList(), + Arrays.asList(5))) { + assertColumnsAreEqual(expectCv, cv); + assertColumnsAreEqual(expectCv1, cv1); + } + } + + @Test + void testSequencesDouble() { + try (ColumnVector start = 
ColumnVector.fromBoxedDoubles(1.2, 2.2, 3.2, 4.2, 5.2); + ColumnVector size = ColumnVector.fromBoxedInts(2, 3, 2, 0, 1); + ColumnVector step = ColumnVector.fromBoxedDoubles(2.1, -1.1, 1.1, 1.1, 0.1); + ColumnVector cv = ColumnVector.sequence(start, size, step); + ColumnVector cv1 = ColumnVector.sequence(start, size); + ColumnVector expectCv = ColumnVector.fromLists( + new ListType(true, new BasicType(false, DType.FLOAT64)), + Arrays.asList(1.2, 3.3), + Arrays.asList(2.2, 1.1, 0.0), + Arrays.asList(3.2, 4.3), + Arrays.asList(), + Arrays.asList(5.2)); + ColumnVector expectCv1 = ColumnVector.fromLists( + new ListType(true, new BasicType(false, DType.FLOAT64)), + Arrays.asList(1.2, 2.2), + Arrays.asList(2.2, 3.2, 4.2), + Arrays.asList(3.2, 4.2), + Arrays.asList(), + Arrays.asList(5.2))) { + assertColumnsAreEqual(expectCv, cv); + assertColumnsAreEqual(expectCv1, cv1); + } + } + @Test void testFromScalarZeroRows() { // magic number to invoke factory method specialized for decimal types diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 86c55e19776..b2b51553217 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -80,6 +80,7 @@ public class TableTest extends CudfTestBase { private static final File TEST_ORC_TIMESTAMP_DATE_FILE = TestUtils.getResourceAsFile("timestamp-date-test.orc"); private static final File TEST_DECIMAL_PARQUET_FILE = TestUtils.getResourceAsFile("decimal.parquet"); private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv"); + private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json"); private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder() .column(DType.INT32, "A") @@ -292,6 +293,115 @@ void testGetNumberOfColumns() { } } + @Test + void testReadJSONFile() { + Schema schema = Schema.builder() + .column(DType.STRING, "name") + .column(DType.INT32, "age") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("Michael", "Andy", "Justin") + .column(null, 30, 19) + .build(); + Table table = Table.readJSON(schema, opts, TEST_SIMPLE_JSON_FILE)) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testReadJSONFileWithDifferentColumnOrder() { + Schema schema = Schema.builder() + .column(DType.INT32, "age") + .column(DType.STRING, "name") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column(null, 30, 19) + .column("Michael", "Andy", "Justin") + .build(); + Table table = Table.readJSON(schema, opts, TEST_SIMPLE_JSON_FILE)) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testReadJSONBufferInferred() { + JSONOptions opts = JSONOptions.builder() + .withDayFirst(true) + .build(); + byte[] data = ("[false,A,1,2,05/03/2001]\n" + + "[true,B,2,3,31/10/2010]'\n" + + "[false,C,3,4,20/10/1994]\n" + + "[true,D,4,5,18/10/1990]").getBytes(StandardCharsets.UTF_8); + try (Table expected = new Table.TestBuilder() + .column(false, true, false, true) + .column("A", "B", "C", "D") + 
.column(1L, 2L, 3L, 4L) + .column(2L, 3L, 4L, 5L) + .timestampMillisecondsColumn(983750400000L, 1288483200000L, 782611200000L, 656208000000L) + .build(); + Table table = Table.readJSON(Schema.INFERRED, opts, data)) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testReadJSONBuffer() { + // JSON reader will set the column according to the iterator if can't infer the name + // So we must set the same name accordingly + Schema schema = Schema.builder() + .column(DType.STRING, "0") + .column(DType.INT32, "1") + .column(DType.INT32, "2") + .build(); + JSONOptions opts = JSONOptions.builder() + .build(); + byte[] data = ("[A,1,2]\n" + + "[B,2,3]'\n" + + "[C,3,4]\n" + + "[D,4,5]").getBytes(StandardCharsets.UTF_8); + try (Table expected = new Table.TestBuilder() + .column("A", "B", "C", "D") + .column(1, 2, 3, 4) + .column(2, 3, 4, 5) + .build(); + Table table = Table.readJSON(schema, opts, data)) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testReadJSONBufferWithOffset() { + // JSON reader will set the column according to the iterator if can't infer the name + // So we must set the same name accordingly + Schema schema = Schema.builder() + .column(DType.STRING, "0") + .column(DType.INT32, "1") + .column(DType.INT32, "2") + .build(); + JSONOptions opts = JSONOptions.builder() + .build(); + int bytesToIgnore = 8; + byte[] data = ("[A,1,2]\n" + + "[B,2,3]'\n" + + "[C,3,4]\n" + + "[D,4,5]").getBytes(StandardCharsets.UTF_8); + try (Table expected = new Table.TestBuilder() + .column("B", "C", "D") + .column(2, 3, 4) + .column(3, 4, 5) + .build(); + Table table = Table.readJSON(schema, opts, data, + bytesToIgnore, data.length - bytesToIgnore)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadCSVPrune() { Schema schema = Schema.builder() diff --git a/java/src/test/resources/people.json b/java/src/test/resources/people.json new file mode 100644 index 00000000000..50a859cbd7e --- /dev/null +++ 
b/java/src/test/resources/people.json @@ -0,0 +1,3 @@ +{"name":"Michael"} +{"name":"Andy", "age":30} +{"name":"Justin", "age":19} diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/cpp/datetime.pxd index f75b39ce6ee..498fc313cf9 100644 --- a/python/cudf/cudf/_lib/cpp/datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/datetime.pxd @@ -13,45 +13,26 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] extract_hour(const column_view& column) except + cdef unique_ptr[column] extract_minute(const column_view& column) except + cdef unique_ptr[column] extract_second(const column_view& column) except + - cdef unique_ptr[column] ceil_day(const column_view& column) except + - cdef unique_ptr[column] ceil_hour(const column_view& column) except + - cdef unique_ptr[column] ceil_minute(const column_view& column) except + - cdef unique_ptr[column] ceil_second(const column_view& column) except + - cdef unique_ptr[column] ceil_millisecond( - const column_view& column - ) except + - cdef unique_ptr[column] ceil_microsecond( - const column_view& column - ) except + - cdef unique_ptr[column] ceil_nanosecond( - const column_view& column - ) except + - cdef unique_ptr[column] floor_day(const column_view& column) except + - cdef unique_ptr[column] floor_hour(const column_view& column) except + - cdef unique_ptr[column] floor_minute(const column_view& column) except + - cdef unique_ptr[column] floor_second(const column_view& column) except + - cdef unique_ptr[column] floor_millisecond( - const column_view& column - ) except + - cdef unique_ptr[column] floor_microsecond( - const column_view& column - ) except + - cdef unique_ptr[column] floor_nanosecond( - const column_view& column - ) except + - cdef unique_ptr[column] round_day(const column_view& column) except + - cdef unique_ptr[column] round_hour(const column_view& column) except + - cdef unique_ptr[column] round_minute(const column_view& column) except + - cdef 
unique_ptr[column] round_second(const column_view& column) except + - cdef unique_ptr[column] round_millisecond( - const column_view& column + + ctypedef enum rounding_frequency "cudf::datetime::rounding_frequency": + DAY "cudf::datetime::rounding_frequency::DAY" + HOUR "cudf::datetime::rounding_frequency::HOUR" + MINUTE "cudf::datetime::rounding_frequency::MINUTE" + SECOND "cudf::datetime::rounding_frequency::SECOND" + MILLISECOND "cudf::datetime::rounding_frequency::MILLISECOND" + MICROSECOND "cudf::datetime::rounding_frequency::MICROSECOND" + NANOSECOND "cudf::datetime::rounding_frequency::NANOSECOND" + + cdef unique_ptr[column] ceil_datetimes( + const column_view& column, rounding_frequency freq ) except + - cdef unique_ptr[column] round_microsecond( - const column_view& column + cdef unique_ptr[column] floor_datetimes( + const column_view& column, rounding_frequency freq ) except + - cdef unique_ptr[column] round_nanosecond( - const column_view& column + cdef unique_ptr[column] round_datetimes( + const column_view& column, rounding_frequency freq ) except + + cdef unique_ptr[column] add_calendrical_months( const column_view& timestamps, const column_view& months diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 3c05a17c268..e41016645cd 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -62,82 +62,63 @@ def extract_datetime_component(Column col, object field): return result -def ceil_datetime(Column col, object field): +cdef libcudf_datetime.rounding_frequency _get_rounding_frequency(object freq): + cdef libcudf_datetime.rounding_frequency freq_val + + # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timedelta.resolution_string.html + if freq == "D": + freq_val = libcudf_datetime.rounding_frequency.DAY + elif freq == "H": + freq_val = libcudf_datetime.rounding_frequency.HOUR + elif freq in ("T", "min"): + freq_val = libcudf_datetime.rounding_frequency.MINUTE + 
elif freq == "S": + freq_val = libcudf_datetime.rounding_frequency.SECOND + elif freq in ("L", "ms"): + freq_val = libcudf_datetime.rounding_frequency.MILLISECOND + elif freq in ("U", "us"): + freq_val = libcudf_datetime.rounding_frequency.MICROSECOND + elif freq == "N": + freq_val = libcudf_datetime.rounding_frequency.NANOSECOND + else: + raise ValueError(f"Invalid resolution: '{freq}'") + return freq_val + + +def ceil_datetime(Column col, object freq): cdef unique_ptr[column] c_result cdef column_view col_view = col.view() + cdef libcudf_datetime.rounding_frequency freq_val = \ + _get_rounding_frequency(freq) with nogil: - # https://pandas.pydata.org/pandas-docs/version/0.25.0/reference/api/pandas.Timedelta.resolution.html - if field == "D": - c_result = move(libcudf_datetime.ceil_day(col_view)) - elif field == "H": - c_result = move(libcudf_datetime.ceil_hour(col_view)) - elif field == "T" or field == "min": - c_result = move(libcudf_datetime.ceil_minute(col_view)) - elif field == "S": - c_result = move(libcudf_datetime.ceil_second(col_view)) - elif field == "L" or field == "ms": - c_result = move(libcudf_datetime.ceil_millisecond(col_view)) - elif field == "U" or field == "us": - c_result = move(libcudf_datetime.ceil_microsecond(col_view)) - elif field == "N": - c_result = move(libcudf_datetime.ceil_nanosecond(col_view)) - else: - raise ValueError(f"Invalid resolution: '{field}'") + c_result = move(libcudf_datetime.ceil_datetimes(col_view, freq_val)) result = Column.from_unique_ptr(move(c_result)) return result -def floor_datetime(Column col, object field): +def floor_datetime(Column col, object freq): cdef unique_ptr[column] c_result cdef column_view col_view = col.view() + cdef libcudf_datetime.rounding_frequency freq_val = \ + _get_rounding_frequency(freq) with nogil: - # https://pandas.pydata.org/docs/reference/api/pandas.Timedelta.resolution_string.html - if field == "D": - c_result = move(libcudf_datetime.floor_day(col_view)) - elif field == "H": - 
c_result = move(libcudf_datetime.floor_hour(col_view)) - elif field == "T" or field == "min": - c_result = move(libcudf_datetime.floor_minute(col_view)) - elif field == "S": - c_result = move(libcudf_datetime.floor_second(col_view)) - elif field == "L" or field == "ms": - c_result = move(libcudf_datetime.floor_millisecond(col_view)) - elif field == "U" or field == "us": - c_result = move(libcudf_datetime.floor_microsecond(col_view)) - elif field == "N": - c_result = move(libcudf_datetime.floor_nanosecond(col_view)) - else: - raise ValueError(f"Invalid resolution: '{field}'") + c_result = move(libcudf_datetime.floor_datetimes(col_view, freq_val)) result = Column.from_unique_ptr(move(c_result)) return result -def round_datetime(Column col, object field): +def round_datetime(Column col, object freq): cdef unique_ptr[column] c_result cdef column_view col_view = col.view() + cdef libcudf_datetime.rounding_frequency freq_val = \ + _get_rounding_frequency(freq) with nogil: - # https://pandas.pydata.org/docs/reference/api/pandas.Timedelta.resolution_string.html - if field == "D": - c_result = move(libcudf_datetime.round_day(col_view)) - elif field == "H": - c_result = move(libcudf_datetime.round_hour(col_view)) - elif field == "T" or field == "min": - c_result = move(libcudf_datetime.round_minute(col_view)) - elif field == "S": - c_result = move(libcudf_datetime.round_second(col_view)) - elif field == "L" or field == "ms": - c_result = move(libcudf_datetime.round_millisecond(col_view)) - elif field == "U" or field == "us": - c_result = move(libcudf_datetime.round_microsecond(col_view)) - elif field == "N": - c_result = move(libcudf_datetime.round_nanosecond(col_view)) - else: - raise ValueError(f"Invalid resolution: '{field}'") + c_result = move(libcudf_datetime.round_datetimes(col_view, freq_val)) result = Column.from_unique_ptr(move(c_result)) return result diff --git a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx 
index 49f24436b88..426744ee46c 100644 --- a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx @@ -58,38 +58,3 @@ def subword_tokenize_inmem_hash( masks = Column.from_unique_ptr(move(c_result.tensor_attention_mask)) metadata = Column.from_unique_ptr(move(c_result.tensor_metadata)) return tokens, masks, metadata - - -def subword_tokenize_vocab_file( - Column strings, - object hash_file, - uint32_t max_sequence_length=64, - uint32_t stride=48, - bool do_lower=True, - bool do_truncate=False, - uint32_t max_rows_tensor=500 -): - """ - Subword tokenizes text series by using the hashed vocabulary - stored on disk - """ - cdef column_view c_strings = strings.view() - cdef cpp_tokenizer_result c_result - cdef string c_hash_file = str(hash_file).encode() - with nogil: - c_result = tr_move( - cpp_subword_tokenize( - c_strings, - c_hash_file, - max_sequence_length, - stride, - do_lower, - do_truncate, - max_rows_tensor - ) - ) - # return the 3 tensor components - tokens = Column.from_unique_ptr(move(c_result.tensor_token_ids)) - masks = Column.from_unique_ptr(move(c_result.tensor_attention_mask)) - metadata = Column.from_unique_ptr(move(c_result.tensor_metadata)) - return tokens, masks, metadata diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index fbc1538cc74..7911d0eff2a 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -12,7 +12,6 @@ is_letter_multi, porter_stemmer_measure, ) -from cudf._lib.nvtext.subword_tokenize import subword_tokenize_vocab_file from cudf._lib.nvtext.tokenize import ( _count_tokens_column, _count_tokens_scalar, diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index b33a3cefba7..931a2702612 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -28,20 +28,19 @@ def transpose(source): return source cats = None - dtype = 
source._columns[0].dtype + columns = source._columns + dtype = columns[0].dtype if is_categorical_dtype(dtype): - if any(not is_categorical_dtype(c.dtype) for c in source._columns): + if any(not is_categorical_dtype(c.dtype) for c in columns): raise ValueError('Columns must all have the same dtype') - cats = list(c.categories for c in source._columns) + cats = list(c.categories for c in columns) cats = cudf.core.column.concat_columns(cats).unique() source = cudf.core.frame.Frame(index=source._index, data=[ (name, col._set_categories(cats, is_unique=True).codes) for name, col in source._data.items() ]) - elif dtype.kind in 'OU': - raise NotImplementedError('Cannot transpose string columns') - elif any(c.dtype != dtype for c in source._columns): + elif any(c.dtype != dtype for c in columns): raise ValueError('Columns must all have the same dtype') cdef pair[unique_ptr[column], table_view] c_result diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index ed1cc74db71..aa89b8f849f 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1414,6 +1414,16 @@ def from_pandas(cls, index, nan_as_null=None): def _constructor_expanddim(self): return cudf.MultiIndex + def _split_columns_by_levels(self, levels): + if isinstance(levels, int) and levels > 0: + raise ValueError(f"Out of bound level: {levels}") + return ( + [self._data[self.name]], + [], + ["index" if self.name is None else self.name], + [], + ) + def _get_result_name(left_name, right_name): if left_name == right_name: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 1c9a013810a..a83110d273c 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4711,119 +4711,6 @@ def filter_tokens( ), ) - def subword_tokenize( - self, - hash_file: str, - max_length: int = 64, - stride: int = 48, - do_lower: bool = True, - do_truncate: bool = False, - 
max_rows_tensor: int = 500, - ) -> Tuple[cupy.ndarray, cupy.ndarray, cupy.ndarray]: - """ - Run CUDA BERT subword tokenizer on cuDF strings column. - Encodes words to token ids using vocabulary from a pretrained - tokenizer. - - This function requires about 21x the number of character bytes - in the input strings column as working memory. - - ``Series.str.subword_tokenize`` is deprecated and will be removed. - Use ``cudf.core.subword_tokenizer.SubwordTokenizer`` instead. - - Parameters - ---------- - hash_file : str - Path to hash file containing vocabulary of words with token-ids. - This can be created from the raw vocabulary - using the ``cudf.utils.hash_vocab_utils.hash_vocab`` function - max_length : int, Default is 64 - Limits the length of the sequence returned. - If tokenized string is shorter than max_length, - output will be padded with 0s. - If the tokenized string is longer than max_length and - do_truncate == False, there will be multiple returned - sequences containing the overflowing token-ids. - stride : int, Default is 48 - If do_truncate == False and the tokenized string is larger - than max_length, the sequences containing the overflowing - token-ids can contain duplicated token-ids from the main - sequence. If max_length is equal to stride there are no - duplicated-id tokens. If stride is 80% of max_length, - 20% of the first sequence will be repeated on the second - sequence and so on until the entire sentence is encoded. - do_lower : bool, Default is True - If set to true, original text will be lowercased before encoding. - do_truncate : bool, Default is False - If set to true, strings will be truncated and padded to - max_length. Each input string will result in exactly one output - sequence. If set to false, there may be multiple output - sequences when the max_length is smaller than generated tokens. - max_rows_tensor : int, Default is 500 - Maximum number of rows for the output token-ids expected - to be generated by the tokenizer. 
- Used for allocating temporary working memory on the GPU device. - If the output generates a larger number of rows, behavior - is undefined. - This will vary based on stride, truncation, and max_length. - For example, for non-overlapping sequences output rows - will be the same as input rows. - - Returns - ------- - token-ids : cupy.ndarray - The token-ids for each string padded with 0s to max_length. - attention-mask : cupy.ndarray - The mask for token-ids result where corresponding positions - identify valid token-id values. - metadata : cupy.ndarray - Each row contains the index id of the original string and the - first and last index of the token-ids that are non-padded and - non-overlapping. - - Examples - -------- - >>> import cudf - >>> from cudf.utils.hash_vocab_utils import hash_vocab - >>> hash_vocab('bert-base-uncased-vocab.txt', 'voc_hash.txt') - >>> ser = cudf.Series(['this is the', 'best book']) - >>> stride, max_length = 8, 8 - >>> max_rows_tensor = len(ser) - >>> tokens, masks, metadata = ser.str.subword_tokenize('voc_hash.txt', - ... max_length=max_length, stride=stride, - ... max_rows_tensor=max_rows_tensor) - >>> tokens.reshape(-1, max_length) - array([[2023, 2003, 1996, 0, 0, 0, 0, 0], - [2190, 2338, 0, 0, 0, 0, 0, 0]], dtype=uint32) - >>> masks.reshape(-1, max_length) - array([[1, 1, 1, 0, 0, 0, 0, 0], - [1, 1, 0, 0, 0, 0, 0, 0]], dtype=uint32) - >>> metadata.reshape(-1, 3) - array([[0, 0, 2], - [1, 0, 1]], dtype=uint32) - """ - warnings.warn( - "`Series.str.subword_tokenize` is deprecated and will be removed " - "in future versions of cudf. 
Use " - "`cudf.core.subword_tokenizer.SubwordTokenizer` instead.", - FutureWarning, - ) - - tokens, masks, metadata = libstrings.subword_tokenize_vocab_file( - self._column, - hash_file, - max_length, - stride, - do_lower, - do_truncate, - max_rows_tensor, - ) - return ( - cupy.asarray(tokens), - cupy.asarray(masks), - cupy.asarray(metadata), - ) - def porter_stemmer_measure(self) -> SeriesOrIndex: """ Compute the Porter Stemmer measure for each string. diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 88c8aaebd9e..fe6ac8e1529 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -58,7 +58,9 @@ _FrameIndexer, _get_label_range_or_mask, _indices_from_labels, + doc_reset_index_template, ) +from cudf.core.multiindex import MultiIndex from cudf.core.resample import DataFrameResampler from cudf.core.series import Series from cudf.utils import applyutils, docutils, ioutils, queryutils, utils @@ -90,8 +92,6 @@ class _DataFrameIndexer(_FrameIndexer): def __getitem__(self, arg): - from cudf import MultiIndex - if isinstance(self._frame.index, MultiIndex) or isinstance( self._frame.columns, MultiIndex ): @@ -118,8 +118,6 @@ def _can_downcast_to_series(self, df, arg): operation should be "downcasted" from a DataFrame to a Series """ - from cudf.core.column import as_column - if isinstance(df, cudf.Series): return False nrows, ncols = df.shape @@ -201,11 +199,6 @@ def _getitem_scalar(self, arg): def _getitem_tuple_arg(self, arg): from uuid import uuid4 - from cudf import MultiIndex - from cudf.core.column import column - from cudf.core.dataframe import DataFrame - from cudf.core.index import as_index - # Step 1: Gather columns if isinstance(arg, tuple): columns_df = self._frame._get_columns_by_label(arg[1]) @@ -245,7 +238,7 @@ def _getitem_tuple_arg(self, arg): tmp_arg = ([tmp_arg[0]], tmp_arg[1]) if len(tmp_arg[0]) == 0: return columns_df._empty_like(keep_index=True) - tmp_arg = 
(column.as_column(tmp_arg[0]), tmp_arg[1]) + tmp_arg = (as_column(tmp_arg[0]), tmp_arg[1]) if is_bool_dtype(tmp_arg[0]): df = columns_df._apply_boolean_mask(tmp_arg[0]) @@ -273,7 +266,7 @@ def _getitem_tuple_arg(self, arg): start = self._frame.index[0] df.index = as_index(start) else: - row_selection = column.as_column(arg[0]) + row_selection = as_column(arg[0]) if is_bool_dtype(row_selection.dtype): df.index = self._frame.index.take(row_selection) else: @@ -285,7 +278,7 @@ def _getitem_tuple_arg(self, arg): @annotate("LOC_SETITEM", color="blue", domain="cudf_python") def _setitem_tuple_arg(self, key, value): - if isinstance(self._frame.index, cudf.MultiIndex) or isinstance( + if isinstance(self._frame.index, MultiIndex) or isinstance( self._frame.columns, pd.MultiIndex ): raise NotImplementedError( @@ -322,7 +315,7 @@ def _setitem_tuple_arg(self, key, value): self._frame._data.insert(key[1], new_col) else: if isinstance(value, (cupy.ndarray, np.ndarray)): - value_df = cudf.DataFrame(value) + value_df = DataFrame(value) if value_df.shape[1] != columns_df.shape[1]: if value_df.shape[1] == 1: value_cols = ( @@ -351,13 +344,9 @@ class _DataFrameIlocIndexer(_DataFrameIndexer): @annotate("ILOC_GETITEM", color="blue", domain="cudf_python") def _getitem_tuple_arg(self, arg): - from cudf import MultiIndex - from cudf.core.column import column - from cudf.core.index import as_index - # Iloc Step 1: # Gather the columns specified by the second tuple arg - columns_df = cudf.DataFrame(self._frame._get_columns_by_index(arg[1])) + columns_df = DataFrame(self._frame._get_columns_by_index(arg[1])) columns_df._index = self._frame._index @@ -385,7 +374,7 @@ def _getitem_tuple_arg(self, arg): index += len(columns_df) df = columns_df._slice(slice(index, index + 1, 1)) else: - arg = (column.as_column(arg[0]), arg[1]) + arg = (as_column(arg[0]), arg[1]) if is_bool_dtype(arg[0]): df = columns_df._apply_boolean_mask(arg[0]) else: @@ -407,7 +396,7 @@ def _getitem_tuple_arg(self, arg): 
@annotate("ILOC_SETITEM", color="blue", domain="cudf_python") def _setitem_tuple_arg(self, key, value): - columns = cudf.DataFrame(self._frame._get_columns_by_index(key[1])) + columns = DataFrame(self._frame._get_columns_by_index(key[1])) for col in columns: self._frame[col].iloc[key[0]] = value @@ -953,6 +942,7 @@ def ndim(self): return 2 def __dir__(self): + # Add the columns of the DataFrame to the dir output. o = set(dir(type(self))) o.update(self.__dict__) o.update( @@ -1169,8 +1159,6 @@ def _slice(self: T, arg: slice) -> T: arg : should always be of type slice """ - from cudf.core.index import RangeIndex - num_rows = len(self) if num_rows == 0: return self @@ -1284,8 +1272,6 @@ def memory_usage(self, index=True, deep=False): return Series(sizes, index=ind) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - import cudf - if method == "__call__" and hasattr(cudf, ufunc.__name__): func = getattr(cudf, ufunc.__name__) return func(self) @@ -1329,6 +1315,7 @@ def __array_function__(self, func, types, args, kwargs): else: return NotImplemented + # The _get_numeric_data method is necessary for dask compatibility. def _get_numeric_data(self): """Return a dataframe with only numeric data types""" columns = [ @@ -1554,9 +1541,9 @@ def _concat( out._index._data, indices[:first_data_column_position], ) - if not isinstance( - out._index, cudf.MultiIndex - ) and is_categorical_dtype(out._index._values.dtype): + if not isinstance(out._index, MultiIndex) and is_categorical_dtype( + out._index._values.dtype + ): out = out.set_index( cudf.core.index.as_index(out.index._values) ) @@ -1672,51 +1659,6 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs): return result - def _repr_pandas025_formatting(self, ncols, nrows, dtype=None): - """ - With Pandas > 0.25 there are some new conditional formatting for some - datatypes and column/row configurations. This fixes most of them in - context to match the expected Pandas repr of the same content. 
- - Examples - -------- - >>> gdf.__repr__() - 0 ... 19 - 0 46 ... 48 - .. .. ... .. - 19 40 ... 29 - - [20 rows x 20 columns] - - >>> nrows, ncols = _repr_pandas025_formatting(2, 2, dtype="category") - >>> pd.options.display.max_rows = nrows - >>> pd.options.display.max_columns = ncols - >>> gdf.__repr__() - 0 ... 19 - 0 46 ... 48 - .. .. ... .. - 19 40 ... 29 - - [20 rows x 20 columns] - """ - ncols = 1 if ncols in [0, 2] and dtype == "datetime64[ns]" else ncols - ncols = ( - 1 - if ncols == 0 - and nrows == 1 - and dtype in ["int8", "str", "category"] - else ncols - ) - ncols = ( - 1 - if nrows == 1 - and dtype in ["int8", "int16", "int64", "str", "category"] - else ncols - ) - ncols = 0 if ncols == 2 else ncols - ncols = 19 if ncols in [20, 21] else ncols - return ncols, nrows - def _clean_renderable_dataframe(self, output): """ This method takes in partial/preprocessed dataframe @@ -1822,7 +1764,7 @@ def _get_renderable_dataframe(self): # adjust right columns for output if multiindex. 
right_cols = ( right_cols - 1 - if isinstance(self.index, cudf.MultiIndex) + if isinstance(self.index, MultiIndex) else right_cols ) left_cols = int(ncols / 2.0) + 1 @@ -2151,20 +2093,6 @@ def columns(self, columns): data, multiindex=is_multiindex, level_names=columns.names, ) - def _rename_columns(self, new_names): - old_cols = iter(self._data.names) - l_old_cols = len(self._data) - l_new_cols = len(new_names) - if l_new_cols != l_old_cols: - msg = ( - f"Length of new column names: {l_new_cols} does not " - "match length of previous column names: {l_old_cols}" - ) - raise ValueError(msg) - - mapper = dict(zip(old_cols, new_names)) - self.rename(mapper=mapper, inplace=True, axis=1) - def _reindex( self, columns, dtypes=None, deep=False, index=None, inplace=False ): @@ -2209,11 +2137,9 @@ def _reindex( columns = ( columns if columns is not None else list(df._column_names) ) - df = cudf.DataFrame() + df = DataFrame() else: - df = cudf.DataFrame(None, index).join( - df, how="left", sort=True - ) + df = DataFrame(None, index).join(df, how="left", sort=True) # double-argsort to map back from sorted to unsorted positions df = df.take(index.argsort(ascending=True).argsort()) @@ -2445,7 +2371,7 @@ def set_index( except TypeError: msg = f"{col} cannot be converted to column-like." 
raise TypeError(msg) - if isinstance(col, (cudf.MultiIndex, pd.MultiIndex)): + if isinstance(col, (MultiIndex, pd.MultiIndex)): col = ( cudf.from_pandas(col) if isinstance(col, pd.MultiIndex) @@ -2473,7 +2399,7 @@ def set_index( if append: idx_cols = [self.index._data[x] for x in self.index._data] - if isinstance(self.index, cudf.MultiIndex): + if isinstance(self.index, MultiIndex): idx_names = self.index.names else: idx_names = [self.index.name] @@ -2485,7 +2411,7 @@ def set_index( elif len(columns_to_add) == 1: idx = cudf.Index(columns_to_add[0], name=names[0]) else: - idx = cudf.MultiIndex._from_data( + idx = MultiIndex._from_data( {i: col for i, col in enumerate(columns_to_add)} ) idx.names = names @@ -2504,29 +2430,13 @@ def set_index( df.index = idx return df if not inplace else None - def reset_index( - self, level=None, drop=False, inplace=False, col_level=0, col_fill="" - ): - """ - Reset the index. - - Reset the index of the DataFrame, and use the default one instead. - - Parameters - ---------- - drop : bool, default False - Do not try to insert index into dataframe columns. This resets - the index to the default integer index. - inplace : bool, default False - Modify the DataFrame in place (do not create a new object). - - Returns - ------- - DataFrame or None - DataFrame with the new index or None if ``inplace=True``. - - Examples - -------- + @docutils.doc_apply( + doc_reset_index_template.format( + klass="DataFrame", + argument="", + return_type="DataFrame or None", + return_doc="", + example=""" >>> df = cudf.DataFrame([('bird', 389.0), ... ('bird', 24.0), ... ('mammal', 80.5), @@ -2551,45 +2461,51 @@ class max_speed 1 bird 24.0 2 mammal 80.5 3 mammal - """ - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - if col_level != 0: - raise NotImplementedError( - "col_level parameter is not supported yet." - ) - - if col_fill != "": - raise NotImplementedError( - "col_fill parameter is not supported yet." 
- ) - result = self if inplace else self.copy() - - if not drop: - if isinstance(self.index, cudf.MultiIndex): - names = tuple( - name if name is not None else f"level_{i}" - for i, name in enumerate(self.index.names) + You can also use ``reset_index`` with MultiIndex. + + >>> index = cudf.MultiIndex.from_tuples([('bird', 'falcon'), + ... ('bird', 'parrot'), + ... ('mammal', 'lion'), + ... ('mammal', 'monkey')], + ... names=['class', 'name']) + >>> df = cudf.DataFrame([(389.0, 'fly'), + ... ( 24.0, 'fly'), + ... ( 80.5, 'run'), + ... (np.nan, 'jump')], + ... index=index, + ... columns=('speed', 'type')) + >>> df + speed type + class name + bird falcon 389.0 fly + parrot 24.0 fly + mammal lion 80.5 run + monkey jump + >>> df.reset_index(level='class') + class speed type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal jump + """, + ) + ) + def reset_index( + self, level=None, drop=False, inplace=False, col_level=0, col_fill="" + ): + return self._mimic_inplace( + DataFrame._from_data( + *self._reset_index( + level=level, + drop=drop, + col_level=col_level, + col_fill=col_fill, ) - else: - if self.index.name is None: - if "index" in self._data.names: - names = ("level_0",) - else: - names = ("index",) - else: - names = (self.index.name,) - - index_columns = self.index._data.columns - for name, index_column in zip( - reversed(names), reversed(index_columns) - ): - result.insert(0, name, index_column) - result.index = RangeIndex(len(self)) - if not inplace: - return result + ), + inplace=inplace, + ) def take(self, indices, axis=0, keep_index=None): axis = self._get_axis_from_axis_arg(axis) @@ -3028,9 +2944,7 @@ def rename( "mixed type is not yet supported." 
) - if level is not None and isinstance( - self.index, cudf.core.multiindex.MultiIndex - ): + if level is not None and isinstance(self.index, MultiIndex): out_index = self.index.copy(deep=copy) out_index.get_level_values(level).to_frame().replace( to_replace=list(index.keys()), @@ -3128,78 +3042,6 @@ def as_matrix(self, columns=None): ) return self.as_gpu_matrix(columns=columns).copy_to_host() - def one_hot_encoding( - self, column, prefix, cats, prefix_sep="_", dtype="float64" - ): - """ - Expand a column with one-hot-encoding. - - Parameters - ---------- - - column : str - the source column with binary encoding for the data. - prefix : str - the new column name prefix. - cats : sequence of ints - the sequence of categories as integers. - prefix_sep : str - the separator between the prefix and the category. - dtype : - the dtype for the outputs; defaults to float64. - - Returns - ------- - - a new dataframe with new columns append for each category. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> pet_owner = [1, 2, 3, 4, 5] - >>> pet_type = ['fish', 'dog', 'fish', 'bird', 'fish'] - >>> df = pd.DataFrame({'pet_owner': pet_owner, 'pet_type': pet_type}) - >>> df.pet_type = df.pet_type.astype('category') - - Create a column with numerically encoded category values - - >>> df['pet_codes'] = df.pet_type.cat.codes - >>> gdf = cudf.from_pandas(df) - - Create the list of category codes to use in the encoding - - >>> codes = gdf.pet_codes.unique() - >>> gdf.one_hot_encoding('pet_codes', 'pet_dummy', codes).head() - pet_owner pet_type pet_codes pet_dummy_0 pet_dummy_1 pet_dummy_2 - 0 1 fish 2 0.0 0.0 1.0 - 1 2 dog 1 0.0 1.0 0.0 - 2 3 fish 2 0.0 0.0 1.0 - 3 4 bird 0 1.0 0.0 0.0 - 4 5 fish 2 0.0 0.0 1.0 - """ - - warnings.warn( - "DataFrame.one_hot_encoding is deprecated and will be removed in " - "future, use `get_dummies` instead.", - FutureWarning, - ) - - if hasattr(cats, "to_arrow"): - cats = cats.to_arrow().to_pylist() - else: - cats = 
pd.Series(cats, dtype="object") - - newnames = [ - prefix_sep.join([prefix, "null" if cat is None else str(cat)]) - for cat in cats - ] - newcols = self[column].one_hot_encoding(cats=cats, dtype=dtype) - outdf = self.copy() - for name, col in zip(newnames, newcols): - outdf.insert(len(outdf._data), name, col) - return outdf - def label_encoding( self, column, prefix, cats, prefix_sep="_", dtype=None, na_sentinel=-1 ): @@ -3307,7 +3149,7 @@ def agg(self, aggs, axis=None): raise NotImplementedError("axis not implemented yet") if isinstance(aggs, Iterable) and not isinstance(aggs, (str, dict)): - result = cudf.DataFrame() + result = DataFrame() # TODO : Allow simultaneous pass for multi-aggregation as # a future optimization for agg in aggs: @@ -3320,7 +3162,7 @@ def agg(self, aggs, axis=None): f"{aggs} is not a valid function for " f"'DataFrame' object" ) - result = cudf.DataFrame() + result = DataFrame() result[aggs] = getattr(df_normalized, aggs)() result = result.iloc[:, 0] result.name = None @@ -3355,7 +3197,7 @@ def agg(self, aggs, axis=None): raise NotImplementedError( "callable parameter is not implemented yet" ) - result = cudf.DataFrame(index=idxs, columns=cols) + result = DataFrame(index=idxs, columns=cols) for key in aggs.keys(): col = df_normalized[key] col_empty = column_empty( @@ -4227,77 +4069,6 @@ def apply_chunks( tpb=tpb, ) - def hash_columns(self, columns=None, method="murmur3"): - """Hash the given *columns* and return a new device array - - This method is deprecated. Replace ``df.hash_columns(columns, method)`` - with ``df[columns].hash_values(method)``. - - Parameters - ---------- - columns : sequence of str; optional - Sequence of column names. If columns is *None* (unspecified), - all columns in the frame are used. - method : {'murmur3', 'md5'}, default 'murmur3' - Hash function to use: - * murmur3: MurmurHash3 hash function. - * md5: MD5 hash function. - - Returns - ------- - Series - Hash values for each row. 
- """ - warnings.warn( - "The `hash_columns` method will be removed in a future cuDF " - "release. Replace `df.hash_columns(columns, method)` with " - "`df[columns].hash_values(method)`.", - FutureWarning, - ) - if columns is None: - # Slice by [:] to keep all columns. - columns = slice(None, None, None) - return self[columns].hash_values(method=method) - - def hash_values(self, method="murmur3"): - """Compute the hash of values in each row. - - Parameters - ---------- - method : {'murmur3', 'md5'}, default 'murmur3' - Hash function to use: - * murmur3: MurmurHash3 hash function. - * md5: MD5 hash function. - - Returns - ------- - Series - A Series with hash values. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({"a": [10, 120, 30], "b": [0.0, 0.25, 0.50]}) - >>> df - a b - 0 10 0.00 - 1 120 0.25 - 2 30 0.50 - >>> df.hash_values(method="murmur3") - 0 -330519225 - 1 -397962448 - 2 -1345834934 - dtype: int32 - >>> df.hash_values(method="md5") - 0 57ce879751b5169c525907d5c563fae1 - 1 948d6221a7c4963d4be411bcead7e32b - 2 fe061786ea286a515b772d91b0dfcd70 - dtype: object - """ - return Series._from_data( - {None: self._hash(method=method)}, index=self.index - ) - def partition_by_hash(self, columns, nparts, keep_index=True): """Partition the dataframe by the hashed value of data in *columns*. 
@@ -4321,7 +4092,13 @@ def partition_by_hash(self, columns, nparts, keep_index=True): else self._index._num_columns ) key_indices = [self._data.names.index(k) + idx for k in columns] - outdf, offsets = self._hash_partition(key_indices, nparts, keep_index) + + output_data, output_index, offsets = libcudf.hash.hash_partition( + self, key_indices, nparts, keep_index + ) + outdf = self.__class__._from_data(output_data, output_index) + outdf._copy_type_metadata(self, include_index=keep_index) + # Slice into partition return [outdf[s:e] for s, e in zip(offsets, offsets[1:] + [None])] @@ -4758,7 +4535,7 @@ def to_pandas(self, nullable=False, **kwargs): if isinstance(self.columns, BaseIndex): out_columns = self.columns.to_pandas() - if isinstance(self.columns, cudf.core.multiindex.MultiIndex): + if isinstance(self.columns, MultiIndex): if self.columns.names is not None: out_columns.names = self.columns.names else: @@ -4934,7 +4711,7 @@ def to_arrow(self, preserve_index=True): "step": 1, } else: - if isinstance(self.index, cudf.MultiIndex): + if isinstance(self.index, MultiIndex): gen_names = tuple( f"level_{i}" for i, _ in enumerate(self.index._data.names) @@ -5462,7 +5239,7 @@ def _prepare_for_rowwise_op(self, method, skipna): warnings.warn(msg) if not skipna and any(col.nullable for col in filtered._columns): - mask = cudf.DataFrame( + mask = DataFrame( { name: filtered._data[name]._get_mask_as_column() if filtered._data[name].nullable @@ -6010,11 +5787,11 @@ def stack(self, level=-1, dropna=True): repeated_index = self.index.repeat(self.shape[1]) name_index = Frame({0: self._column_names}).tile(self.shape[0]) new_index = list(repeated_index._columns) + [name_index._columns[0]] - if isinstance(self._index, cudf.MultiIndex): + if isinstance(self._index, MultiIndex): index_names = self._index.names + [None] else: index_names = [None] * len(new_index) - new_index = cudf.core.multiindex.MultiIndex.from_frame( + new_index = MultiIndex.from_frame( DataFrame(dict(zip(range(0, 
len(new_index)), new_index))), names=index_names, ) @@ -6275,8 +6052,8 @@ def append( elif isinstance(other, list): if not other: pass - elif not isinstance(other[0], cudf.DataFrame): - other = cudf.DataFrame(other) + elif not isinstance(other[0], DataFrame): + other = DataFrame(other) if (self.columns.get_indexer(other.columns) >= 0).all(): other = other.reindex(columns=self.columns) @@ -6574,7 +6351,7 @@ def from_pandas(obj, nan_as_null=None): elif isinstance(obj, pd.Series): return Series.from_pandas(obj, nan_as_null=nan_as_null) elif isinstance(obj, pd.MultiIndex): - return cudf.MultiIndex.from_pandas(obj, nan_as_null=nan_as_null) + return MultiIndex.from_pandas(obj, nan_as_null=nan_as_null) elif isinstance(obj, pd.RangeIndex): return cudf.core.index.RangeIndex( start=obj.start, stop=obj.stop, step=obj.step, name=obj.name @@ -6692,7 +6469,7 @@ def extract_col(df, col): if ( col == "index" and col not in df.index._data - and not isinstance(df.index, cudf.MultiIndex) + and not isinstance(df.index, MultiIndex) ): return df.index._data.columns[0] return df.index._data[col] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c83b06707a4..0345966d6bd 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -83,13 +83,6 @@ def __init__(self, data=None, index=None): def _num_columns(self) -> int: return len(self._data) - @property - def _num_indices(self) -> int: - if self._index is None: - return 0 - else: - return len(self._index_names) - @property def _num_rows(self) -> int: if self._index is not None: @@ -157,8 +150,8 @@ def _from_columns( n_index_columns = 0 if index_names is not None: n_index_columns = len(index_names) - index = cudf.core.index._index_from_data( - dict(zip(range(n_index_columns), columns)) + index = cudf.core.index._index_from_columns( + columns[:n_index_columns] ) if isinstance(index, cudf.MultiIndex): index.names = index_names @@ -269,15 +262,6 @@ def shape(self): """Returns a tuple 
representing the dimensionality of the DataFrame.""" return self._num_rows, self._num_columns - @property - def _is_homogeneous(self): - # make sure that the dataframe has columns - if not self._data.columns: - return True - - first_type = self._data.columns[0].dtype.name - return all(x.dtype.name == first_type for x in self._data.columns) - @property def empty(self): """ @@ -580,19 +564,6 @@ def _gather( result._copy_type_metadata(self) return result - def _hash(self, method): - return libcudf.hash.hash(self, method) - - def _hash_partition( - self, columns_to_hash, num_partitions, keep_index=True - ): - output_data, output_index, offsets = libcudf.hash.hash_partition( - self, columns_to_hash, num_partitions, keep_index - ) - output = self.__class__._from_data(output_data, output_index) - output._copy_type_metadata(self, include_index=keep_index) - return output, offsets - def _as_column(self): """ _as_column : Converts a single columned Frame to Column @@ -1009,30 +980,6 @@ def mask(self, cond, other=None, inplace=False): return self.where(cond=~cond, other=other, inplace=inplace) - def _partition(self, scatter_map, npartitions, keep_index=True): - - data, index, output_offsets = libcudf.partitioning.partition( - self, scatter_map, npartitions, keep_index - ) - partitioned = self.__class__._from_data(data, index) - - # due to the split limitation mentioned - # here: https://github.com/rapidsai/cudf/issues/4607 - # we need to remove first & last elements in offsets. - # TODO: Remove this after the above issue is fixed. - output_offsets = output_offsets[1:-1] - - result = partitioned._split(output_offsets, keep_index=keep_index) - - for frame in result: - frame._copy_type_metadata(self, include_index=keep_index) - - if npartitions: - for _ in range(npartitions - len(result)): - result.append(self._empty_like(keep_index)) - - return result - def pipe(self, func, *args, **kwargs): """ Apply ``func(self, *args, **kwargs)``. 
@@ -1139,9 +1086,29 @@ def scatter_by_map( f"ERROR: map_size must be >= {count} (got {map_size})." ) - tables = self._partition(map_index, map_size, keep_index) + data, index, output_offsets = libcudf.partitioning.partition( + self, map_index, map_size, keep_index + ) + partitioned = self.__class__._from_data(data, index) + + # due to the split limitation mentioned + # here: https://github.com/rapidsai/cudf/issues/4607 + # we need to remove first & last elements in offsets. + # TODO: Remove this after the above issue is fixed. + output_offsets = output_offsets[1:-1] + + result = partitioned._split(output_offsets, keep_index=keep_index) + + for frame in result: + frame._copy_type_metadata(self, include_index=keep_index) - return tables + if map_size: + result += [ + self._empty_like(keep_index) + for _ in range(map_size - len(result)) + ] + + return result def dropna( self, axis=0, how="any", thresh=None, subset=None, inplace=False @@ -1499,8 +1466,6 @@ def _apply_boolean_mask(self, boolean_mask): Applies boolean mask to each row of `self`, rows corresponding to `False` is dropped """ - boolean_mask = as_column(boolean_mask) - result = self.__class__._from_data( *libcudf.stream_compaction.apply_boolean_mask( self, as_column(boolean_mask) @@ -1798,40 +1763,27 @@ def repeat(self, repeats, axis=None): "Only axis=`None` supported at this time." 
) - return self._repeat(repeats) - - def _repeat(self, count): - if not is_scalar(count): - count = as_column(count) + if not is_scalar(repeats): + repeats = as_column(repeats) result = self.__class__._from_data( - *libcudf.filling.repeat(self, count) + *libcudf.filling.repeat(self, repeats) ) result._copy_type_metadata(self) return result - def _fill(self, fill_values, begin, end, inplace): - col_and_fill = zip(self._columns, fill_values) - - if not inplace: - data_columns = (c._fill(v, begin, end) for (c, v) in col_and_fill) - return self.__class__._from_data( - zip(self._column_names, data_columns), self._index - ) - - for (c, v) in col_and_fill: - c.fill(v, begin, end, inplace=True) - - return self - def shift(self, periods=1, freq=None, axis=0, fill_value=None): """Shift values by `periods` positions.""" - assert axis in (None, 0) and freq is None - return self._shift(periods) + axis = self._get_axis_from_axis_arg(axis) + if axis != 0: + raise ValueError("Only axis=0 is supported.") + if freq is not None: + raise ValueError("The freq argument is not yet supported.") - def _shift(self, offset, fill_value=None): - data_columns = (col.shift(offset, fill_value) for col in self._columns) + data_columns = ( + col.shift(periods, fill_value) for col in self._columns + ) return self.__class__._from_data( zip(self._column_names, data_columns), self._index ) @@ -2516,18 +2468,6 @@ def _copy_type_metadata( return self - def _copy_interval_data(self, other, include_index=True): - for name, col, other_col in zip( - self._data.keys(), self._data.values(), other._data.values() - ): - if isinstance(other_col, cudf.core.column.IntervalColumn): - self._data[name] = cudf.core.column.IntervalColumn(col) - - def _postprocess_columns(self, other, include_index=True): - self._copy_categories(other, include_index=include_index) - self._copy_struct_names(other, include_index=include_index) - self._copy_interval_data(other, include_index=include_index) - def isnull(self): """ Identify 
missing values. diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index f1d622362e2..08ef3f07776 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1190,7 +1190,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): result = self.obj.__class__._from_data( *self._groupby.shift( - cudf.core.frame.Frame(value_columns), periods, fill_value + cudf.core.frame.Frame(value_columns._data), periods, fill_value ) ) result = self._mimic_pandas_order(result) @@ -1299,7 +1299,7 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): def __getitem__(self, key): return self.obj[key].groupby( - self.grouping, dropna=self._dropna, sort=self._sort + by=self.grouping.keys, dropna=self._dropna, sort=self._sort ) def nunique(self): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 362c96ebbeb..859a81bc5f4 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -114,6 +114,13 @@ def _index_from_data(data: MutableMapping, name: Any = None): return index_class_type._from_data(data, None, name) +def _index_from_columns( + columns: List[cudf.core.column.ColumnBase], name: Any = None +): + """Construct an index from ``columns``, with levels named 0, 1, 2...""" + return _index_from_data(dict(zip(range(len(columns)), columns)), name=name) + + class RangeIndex(BaseIndex): """ Immutable Index implementing a monotonic integer range. 
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 4be35d960ee..2f4d4a88195 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -5,7 +5,7 @@ import operator import warnings -from collections import abc +from collections import Counter, abc from typing import Callable, Type, TypeVar from uuid import uuid4 @@ -24,11 +24,37 @@ is_list_like, ) from cudf.core.column import arange +from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame -from cudf.core.index import Index +from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.multiindex import MultiIndex from cudf.utils.utils import _gather_map_is_valid, cached_property +doc_reset_index_template = """ + Reset the index of the {klass}, or a level of it. + + Parameters + ---------- + level : int, str, tuple, or list, default None + Only remove the given levels from the index. Removes all levels by + default. + drop : bool, default False + Do not try to insert index into dataframe columns. This resets + the index to the default integer index. +{argument} + inplace : bool, default False + Modify the DataFrame in place (do not create a new object). + + Returns + ------- + {return_type} + {klass} with the new index or None if ``inplace=True``.{return_doc} + + Examples + -------- + {example} +""" + def _indices_from_labels(obj, labels): from cudf.core.column import column @@ -445,6 +471,70 @@ def sort_index( out = out.reset_index(drop=True) return self._mimic_inplace(out, inplace=inplace) + def hash_values(self, method="murmur3"): + """Compute the hash of values in this column. + + Parameters + ---------- + method : {'murmur3', 'md5'}, default 'murmur3' + Hash function to use: + * murmur3: MurmurHash3 hash function. + * md5: MD5 hash function. + + Returns + ------- + Series + A Series with hash values. 
+ + Examples + -------- + **Series** + + >>> import cudf + >>> series = cudf.Series([10, 120, 30]) + >>> series + 0 10 + 1 120 + 2 30 + dtype: int64 + >>> series.hash_values(method="murmur3") + 0 -1930516747 + 1 422619251 + 2 -941520876 + dtype: int32 + >>> series.hash_values(method="md5") + 0 7be4bbacbfdb05fb3044e36c22b41e8b + 1 947ca8d2c5f0f27437f156cfbfab0969 + 2 d0580ef52d27c043c8e341fd5039b166 + dtype: object + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({"a": [10, 120, 30], "b": [0.0, 0.25, 0.50]}) + >>> df + a b + 0 10 0.00 + 1 120 0.25 + 2 30 0.50 + >>> df.hash_values(method="murmur3") + 0 -330519225 + 1 -397962448 + 2 -1345834934 + dtype: int32 + >>> df.hash_values(method="md5") + 0 57ce879751b5169c525907d5c563fae1 + 1 948d6221a7c4963d4be411bcead7e32b + 2 fe061786ea286a515b772d91b0dfcd70 + dtype: object + """ + # Note that both Series and DataFrame return Series objects from this + # calculation, necessitating the unfortunate circular reference to the + # child class here. 
+ return cudf.Series._from_data( + {None: libcudf.hash.hash(self, method)}, index=self.index + ) + def _gather( self, gather_map, keep_index=True, nullify=False, check_bounds=True ): @@ -1107,6 +1197,53 @@ def resample( else cudf.core.resample.DataFrameResampler(self, by=by) ) + def _reset_index(self, level, drop, col_level=0, col_fill=""): + """Shared path for DataFrame.reset_index and Series.reset_index.""" + if level is not None and not isinstance(level, (tuple, list)): + level = (level,) + _check_duplicate_level_names(level, self._index.names) + + # Split the columns in the index into data and index columns + ( + data_columns, + index_columns, + data_names, + index_names, + ) = self._index._split_columns_by_levels(level) + if index_columns: + index = _index_from_columns(index_columns, name=self._index.name,) + if isinstance(index, MultiIndex): + index.names = index_names + else: + index.name = index_names[0] + else: + index = RangeIndex(len(self)) + + if drop: + return self._data, index + + new_column_data = {} + for name, col in zip(data_names, data_columns): + if name == "index" and "index" in self._data: + name = "level_0" + name = ( + tuple( + name if i == col_level else col_fill + for i in range(self._data.nlevels) + ) + if self._data.multiindex + else name + ) + new_column_data[name] = col + # This is to match pandas where the new data columns are always + # inserted to the left of existing data columns. 
+ return ( + ColumnAccessor( + {**new_column_data, **self._data}, self._data.multiindex + ), + index, + ) + def _first_or_last( self, offset, idx: int, op: Callable, side: str, slice_func: Callable ) -> "IndexedFrame": @@ -1228,3 +1365,20 @@ def last(self, offset): side="right", slice_func=lambda i: self.iloc[i:], ) + + +def _check_duplicate_level_names(specified, level_names): + """Raise if any of `specified` has duplicates in `level_names`.""" + if specified is None: + return + if len(set(level_names)) == len(level_names): + return + duplicates = {key for key, val in Counter(level_names).items() if val > 1} + + duplicates_specified = [spec for spec in specified if spec in duplicates] + if not len(duplicates_specified) == 0: + # Note: pandas raises first encountered duplicates, cuDF raises all. + raise ValueError( + f"The names {duplicates_specified} occurs multiple times, use a" + " level number" + ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index c403c697e3d..b333c862f21 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1743,3 +1743,39 @@ def _intersection(self, other, sort=None): if sort is None and len(other): return midx.sort_values() return midx + + def _split_columns_by_levels(self, levels): + # This function assumes that for levels with duplicate names, they are + # specified by indices, not name by ``levels``. E.g. [None, None] can + # only be specified by 0, 1, not "None". 
+ + if levels is None: + return ( + list(self._data.columns), + [], + [ + f"level_{i}" if name is None else name + for i, name in enumerate(self.names) + ], + [], + ) + + # Normalize named levels into indices + level_names = list(self.names) + level_indices = { + lv if isinstance(lv, int) else level_names.index(lv) + for lv in levels + } + + # Split the columns + data_columns, index_columns = [], [] + data_names, index_names = [], [] + for i, (name, col) in enumerate(zip(self.names, self._data.columns)): + if i in level_indices: + name = f"level_{i}" if name is None else name + data_columns.append(col) + data_names.append(name) + else: + index_columns.append(col) + index_names.append(name) + return data_columns, index_columns, data_names, index_names diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index b2fac7a6140..1733a6c0b9a 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -600,18 +600,18 @@ def get_dummies( df : array-like, Series, or DataFrame Data of which to get dummy indicators. prefix : str, dict, or sequence, optional - prefix to append. Either a str (to apply a constant prefix), dict + Prefix to append. Either a str (to apply a constant prefix), dict mapping column names to prefixes, or sequence of prefixes to apply with the same length as the number of columns. If not supplied, defaults to the empty string prefix_sep : str, dict, or sequence, optional, default '_' - separator to use when appending prefixes + Separator to use when appending prefixes dummy_na : boolean, optional Add a column to indicate Nones, if False Nones are ignored. cats : dict, optional - dictionary mapping column names to sequences of integers representing - that column's category. See `cudf.DataFrame.one_hot_encoding` for more - information. if not supplied, it will be computed + Dictionary mapping column names to sequences of values representing + that column's category. 
If not supplied, it is computed as the unique + values of the column. sparse : boolean, optional Right now this is NON-FUNCTIONAL argument in rapids. drop_first : boolean, optional @@ -621,7 +621,7 @@ def get_dummies( columns. Note this is different from pandas default behavior, which encodes all columns with dtype object or categorical dtype : str, optional - output dtype, default 'uint8' + Output dtype, default 'uint8' Examples -------- diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4ec7c3df076..11166320760 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -64,6 +64,7 @@ _FrameIndexer, _get_label_range_or_mask, _indices_from_labels, + doc_reset_index_template, ) from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils import cudautils, docutils @@ -830,30 +831,22 @@ def reindex(self, index=None, copy=True): series.name = self.name return series - def reset_index(self, drop=False, inplace=False): - """ - Reset index to RangeIndex - - Parameters - ---------- - drop : bool, default False - Just reset the index, without inserting it as a column in - the new DataFrame. - inplace : bool, default False - Modify the Series in place (do not create a new object). - - Returns - ------- - Series or DataFrame or None - When `drop` is False (the default), a DataFrame is returned. - The newly created columns will come first in the DataFrame, - followed by the original Series values. - When `drop` is True, a `Series` is returned. - In either case, if ``inplace=True``, no value is returned. - - Examples - -------- - >>> import cudf + @docutils.doc_apply( + doc_reset_index_template.format( + klass="Series", + argument=""" + name : object, optional + The name to use for the column containing the original Series + values. Uses self.name by default. 
This argument is ignored when + ``drop`` is True.""", + return_type="Series or DataFrame or None", + return_doc=""" For Series, When drop is False (the default), a DataFrame + is returned. The newly created columns will come first in the + DataFrame, followed by the original Series values. When `drop` is + True, a `Series` is returned. In either case, if ``inplace=True``, + no value is returned. +""", + example=""" >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[10, 11, 12, 13]) >>> series 10 a @@ -873,19 +866,51 @@ def reset_index(self, drop=False, inplace=False): 2 c 3 d dtype: object - """ + + You can also use ``reset_index`` with MultiIndex. + + >>> s2 = cudf.Series( + ... range(4), name='foo', + ... index=cudf.MultiIndex.from_tuples([ + ... ('bar', 'one'), ('bar', 'two'), + ... ('baz', 'one'), ('baz', 'two')], + ... names=['a', 'b'] + ... )) + >>> s2 + a b + bar one 0 + two 1 + baz one 2 + two 3 + Name: foo, dtype: int64 + >>> s2.reset_index(level='a') + a foo + b + one bar 0 + two bar 1 + one baz 2 + two baz 3 +""", + ) + ) + def reset_index(self, level=None, drop=False, name=None, inplace=False): + if not drop and inplace: + raise TypeError( + "Cannot reset_index inplace on a Series " + "to create a DataFrame" + ) + data, index = self._reset_index(level=level, drop=drop) if not drop: - if inplace is True: - raise TypeError( - "Cannot reset_index inplace on a Series " - "to create a DataFrame" - ) - return self.to_frame().reset_index(drop=drop) - else: - if inplace is True: - self._index = RangeIndex(len(self)) - else: - return self._from_data(self._data, index=RangeIndex(len(self))) + if name is None: + name = 0 if self.name is None else self.name + data[name] = data.pop(self.name) + return cudf.core.dataframe.DataFrame._from_data(data, index) + # For ``name`` behavior, see: + # https://github.com/pandas-dev/pandas/issues/44575 + return self._mimic_inplace( + Series._from_data(data, index, name if inplace else None), + inplace=inplace, + ) def 
set_index(self, index): """Returns a new Series with a different index. @@ -1628,7 +1653,23 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): return self._mimic_inplace(result, inplace=inplace) def fill(self, fill_value, begin=0, end=-1, inplace=False): - return self._fill([fill_value], begin, end, inplace) + warnings.warn( + "The fill method will be removed in a future cuDF release.", + FutureWarning, + ) + fill_values = [fill_value] + col_and_fill = zip(self._columns, fill_values) + + if not inplace: + data_columns = (c._fill(v, begin, end) for (c, v) in col_and_fill) + return self.__class__._from_data( + zip(self._column_names, data_columns), self._index + ) + + for (c, v) in col_and_fill: + c.fill(v, begin, end, inplace=True) + + return self def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None @@ -2248,83 +2289,6 @@ def reverse(self): {self.name: self._column[rinds]}, self.index._values[rinds] ) - def one_hot_encoding(self, cats, dtype="float64"): - """Perform one-hot-encoding - - Parameters - ---------- - cats : sequence of values - values representing each category. - dtype : numpy.dtype - specifies the output dtype. - - Returns - ------- - Sequence - A sequence of new series for each category. Its length is - determined by the length of ``cats``. 
- - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['a', 'b', 'c', 'a']) - >>> s - 0 a - 1 b - 2 c - 3 a - dtype: object - >>> s.one_hot_encoding(['a', 'c', 'b']) - [0 1.0 - 1 0.0 - 2 0.0 - 3 1.0 - dtype: float64, 0 0.0 - 1 0.0 - 2 1.0 - 3 0.0 - dtype: float64, 0 0.0 - 1 1.0 - 2 0.0 - 3 0.0 - dtype: float64] - """ - - warnings.warn( - "Series.one_hot_encoding is deprecated and will be removed in " - "future, use `get_dummies` instead.", - FutureWarning, - ) - - if hasattr(cats, "to_arrow"): - cats = cats.to_pandas() - else: - cats = pd.Series(cats, dtype="object") - dtype = cudf.dtype(dtype) - - try: - cats_col = as_column(cats, nan_as_null=False, dtype=self.dtype) - except TypeError: - raise ValueError("Cannot convert `cats` as cudf column.") - - if self._column.size * cats_col.size >= np.iinfo("int32").max: - raise ValueError( - "Size limitation exceeded: series.size * category.size < " - "np.iinfo('int32').max. Consider reducing size of category" - ) - - res = libcudf.transform.one_hot_encode(self._column, cats_col) - if dtype.type == np.bool_: - return [ - Series._from_data({None: x}, index=self._index) - for x in list(res.values()) - ] - else: - return [ - Series._from_data({None: x.astype(dtype)}, index=self._index) - for x in list(res.values()) - ] - def label_encoding(self, cats, dtype=None, na_sentinel=-1): """Perform label encoding. @@ -3104,45 +3068,6 @@ def value_counts( res = res / float(res._column.sum()) return res - def hash_values(self, method="murmur3"): - """Compute the hash of values in this column. - - Parameters - ---------- - method : {'murmur3', 'md5'}, default 'murmur3' - Hash function to use: - * murmur3: MurmurHash3 hash function. - * md5: MD5 hash function. - - Returns - ------- - Series - A Series with hash values. 
- - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 120, 30]) - >>> series - 0 10 - 1 120 - 2 30 - dtype: int64 - >>> series.hash_values(method="murmur3") - 0 -1930516747 - 1 422619251 - 2 -941520876 - dtype: int32 - >>> series.hash_values(method="md5") - 0 7be4bbacbfdb05fb3044e36c22b41e8b - 1 947ca8d2c5f0f27437f156cfbfab0969 - 2 d0580ef52d27c043c8e341fd5039b166 - dtype: object - """ - return Series._from_data( - {None: self._hash(method=method)}, index=self.index - ) - def quantile( self, q=0.5, interpolation="linear", exact=True, quant_index=True ): diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index 3502fc9acae..782b74ef4a6 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -21,7 +21,7 @@ def _cast_to_appropriate_type(ar, cast_type): from torch.utils.dlpack import from_dlpack elif cast_type == "tf": - from tf.experimental.dlpack import from_dlpack + from tensorflow.experimental.dlpack import from_dlpack return from_dlpack(ar.astype("int32").toDlpack()) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 78560ee6723..e5b298a8448 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1107,34 +1107,6 @@ def test_assign(): np.testing.assert_equal(gdf2.y.to_numpy(), [2, 3, 4]) -@pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) -@pytest.mark.parametrize("method", ["murmur3", "md5"]) -def test_dataframe_hash_columns(nrows, method): - gdf = cudf.DataFrame() - data = np.asarray(range(nrows)) - data[0] = data[-1] # make first and last the same - gdf["a"] = data - gdf["b"] = gdf.a + 100 - with pytest.warns(FutureWarning): - out = gdf.hash_columns(["a", "b"]) - assert isinstance(out, cudf.Series) - assert len(out) == nrows - assert out.dtype == np.int32 - - # Check default - with pytest.warns(FutureWarning): - out_all = gdf.hash_columns() - 
assert_eq(out, out_all) - - # Check single column - with pytest.warns(FutureWarning): - out_one = gdf.hash_columns(["a"], method=method) - # First matches last - assert out_one.iloc[0] == out_one.iloc[-1] - # Equivalent to the cudf.Series.hash_values() - assert_eq(gdf["a"].hash_values(method=method), out_one) - - @pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) @pytest.mark.parametrize("method", ["murmur3", "md5"]) def test_dataframe_hash_values(nrows, method): @@ -1797,7 +1769,7 @@ def test_dataframe_shape_empty(): @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize("num_rows", [1, 2, 20]) -@pytest.mark.parametrize("dtype", dtypes) +@pytest.mark.parametrize("dtype", dtypes + ["object"]) @pytest.mark.parametrize("nulls", ["none", "some", "all"]) def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): # In case of `bool` dtype: pandas <= 1.2.5 type-casts @@ -2541,37 +2513,137 @@ def test_tail_for_string(): assert_eq(gdf.tail(3), gdf.to_pandas().tail(3)) +@pytest.mark.parametrize("level", [None, 0, "l0", 1, ["l0", 1]]) @pytest.mark.parametrize("drop", [True, False]) -def test_reset_index(pdf, gdf, drop): - assert_eq( - pdf.reset_index(drop=drop, inplace=False), - gdf.reset_index(drop=drop, inplace=False), +@pytest.mark.parametrize( + "column_names", + [ + ["v0", "v1"], + ["v0", "index"], + pd.MultiIndex.from_tuples([("x0", "x1"), ("y0", "y1")]), + ], +) +@pytest.mark.parametrize("inplace", [True, False]) +@pytest.mark.parametrize("col_level", [0, 1]) +@pytest.mark.parametrize("col_fill", ["", "some_lv"]) +def test_reset_index(level, drop, column_names, inplace, col_level, col_fill): + midx = pd.MultiIndex.from_tuples( + [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["l0", None] ) - assert_eq( - pdf.x.reset_index(drop=drop, inplace=False), - gdf.x.reset_index(drop=drop, inplace=False), + pdf = pd.DataFrame( + [[1, 2], [3, 4], [5, 6], [7, 8]], index=midx, columns=column_names ) + gdf = cudf.from_pandas(pdf) + + expect = 
pdf.reset_index( + level=level, + drop=drop, + inplace=inplace, + col_level=col_level, + col_fill=col_fill, + ) + got = gdf.reset_index( + level=level, + drop=drop, + inplace=inplace, + col_level=col_level, + col_fill=col_fill, + ) + if inplace: + expect = pdf + got = gdf + + assert_eq(expect, got) + + +@pytest.mark.parametrize("level", [None, 0, 1, [None]]) +@pytest.mark.parametrize("drop", [False, True]) +@pytest.mark.parametrize("inplace", [False, True]) +@pytest.mark.parametrize("col_level", [0, 1]) +@pytest.mark.parametrize("col_fill", ["", "some_lv"]) +def test_reset_index_dup_level_name(level, drop, inplace, col_level, col_fill): + # midx levels are named [None, None] + midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) + pdf = pd.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], index=midx) + gdf = cudf.from_pandas(pdf) + if level == [None]: + assert_exceptions_equal( + lfunc=pdf.reset_index, + rfunc=gdf.reset_index, + lfunc_args_and_kwargs=( + [], + {"level": level, "drop": drop, "inplace": inplace}, + ), + rfunc_args_and_kwargs=( + [], + {"level": level, "drop": drop, "inplace": inplace}, + ), + expected_error_message="occurs multiple times, use a level number", + ) + return + + expect = pdf.reset_index( + level=level, + drop=drop, + inplace=inplace, + col_level=col_level, + col_fill=col_fill, + ) + got = gdf.reset_index( + level=level, + drop=drop, + inplace=inplace, + col_level=col_level, + col_fill=col_fill, + ) + if inplace: + expect = pdf + got = gdf + + assert_eq(expect, got) @pytest.mark.parametrize("drop", [True, False]) -def test_reset_named_index(pdf, gdf, drop): +@pytest.mark.parametrize("inplace", [False, True]) +@pytest.mark.parametrize("col_level", [0, 1]) +@pytest.mark.parametrize("col_fill", ["", "some_lv"]) +def test_reset_index_named(pdf, gdf, drop, inplace, col_level, col_fill): pdf.index.name = "cudf" gdf.index.name = "cudf" - assert_eq( - pdf.reset_index(drop=drop, inplace=False), - gdf.reset_index(drop=drop, 
inplace=False), + + expect = pdf.reset_index( + drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill ) - assert_eq( - pdf.x.reset_index(drop=drop, inplace=False), - gdf.x.reset_index(drop=drop, inplace=False), + got = gdf.reset_index( + drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill ) + if inplace: + expect = pdf + got = gdf + assert_eq(expect, got) @pytest.mark.parametrize("drop", [True, False]) -def test_reset_index_inplace(pdf, gdf, drop): - pdf.reset_index(drop=drop, inplace=True) - gdf.reset_index(drop=drop, inplace=True) - assert_eq(pdf, gdf) +@pytest.mark.parametrize("inplace", [False, True]) +@pytest.mark.parametrize("column_names", [["x", "y"], ["index", "y"]]) +@pytest.mark.parametrize("col_level", [0, 1]) +@pytest.mark.parametrize("col_fill", ["", "some_lv"]) +def test_reset_index_unnamed( + pdf, gdf, drop, inplace, column_names, col_level, col_fill +): + pdf.columns = column_names + gdf.columns = column_names + + expect = pdf.reset_index( + drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill + ) + got = gdf.reset_index( + drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill + ) + if inplace: + expect = pdf + got = gdf + assert_eq(expect, got) @pytest.mark.parametrize( @@ -8724,103 +8796,6 @@ def test_dataframe_init_from_series(data, columns, index): ) -@pytest.mark.parametrize( - "data, expected", - [ - ({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [1.2, 1, 2, 3]}, False), - ({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, True), - ({"a": ["a", "b", "c"], "b": [4, 5, 6], "c": [7, 8, 9]}, False), - ({"a": [True, False, False], "b": [False, False, True]}, True), - ({"a": [True, False, False]}, True), - ({"a": [[1, 2], [3, 4]]}, True), - ({"a": [[1, 2], [3, 4]], "b": ["a", "b"]}, False), - ({"a": [{"c": 5}, {"e": 5}], "b": [{"c": 5}, {"g": 7}]}, True), - ({}, True), - ], -) -def test_is_homogeneous_dataframe(data, expected): - actual = cudf.DataFrame(data)._is_homogeneous - - assert actual == 
expected - - -@pytest.mark.parametrize( - "data, indexes, expected", - [ - ( - {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [1.2, 1, 2, 3]}, - ["a", "b"], - True, - ), - ( - { - "a": [1, 2, 3, 4], - "b": [5, 6, 7, 8], - "c": [1.2, 1, 2, 3], - "d": ["hello", "world", "cudf", "rapids"], - }, - ["a", "b"], - False, - ), - ( - { - "a": ["a", "b", "c"], - "b": [4, 5, 6], - "c": [7, 8, 9], - "d": [1, 2, 3], - }, - ["a", "b"], - True, - ), - ], -) -def test_is_homogeneous_multiIndex_dataframe(data, indexes, expected): - test_dataframe = cudf.DataFrame(data).set_index(indexes) - actual = cudf.DataFrame(test_dataframe)._is_homogeneous - - assert actual == expected - - -@pytest.mark.parametrize( - "data, expected", [([1, 2, 3, 4], True), ([True, False], True)] -) -def test_is_homogeneous_series(data, expected): - actual = cudf.Series(data)._is_homogeneous - - assert actual == expected - - -@pytest.mark.parametrize( - "levels, codes, expected", - [ - ( - [["lama", "cow", "falcon"], ["speed", "weight", "length"]], - [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], - True, - ), - ( - [[1, 2, 3], [True, False, True]], - [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], - False, - ), - ], -) -def test_is_homogeneous_multiIndex(levels, codes, expected): - actual = cudf.MultiIndex(levels=levels, codes=codes)._is_homogeneous - - assert actual == expected - - -@pytest.mark.parametrize( - "data, expected", - [([1, 2, 3], True), (["Hello", "World"], True), ([True, False], True)], -) -def test_is_homogeneous_index(data, expected): - actual = cudf.Index(data)._is_homogeneous - - assert actual == expected - - def test_frame_series_where(): gdf = cudf.DataFrame( {"a": [1.0, 2.0, None, 3.0, None], "b": [None, 10.0, 11.0, None, 23.0]} diff --git a/python/cudf/cudf/tests/test_fill.py b/python/cudf/cudf/tests/test_fill.py index efbe2834486..224db2b39d1 100644 --- a/python/cudf/cudf/tests/test_fill.py +++ b/python/cudf/cudf/tests/test_fill.py @@ -50,7 +50,7 @@ def 
test_fill(data, fill_value, begin, end, inplace): begin = max(0, min(len(gs), begin)) end = max(0, min(len(gs), end)) - actual = gs._fill([fill_value], begin, end, False) + actual = gs.fill(fill_value, begin, end, False) assert actual is not gs ps[begin:end] = fill_value diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 1feaddf74e2..c73e96de470 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -2362,4 +2362,44 @@ def test_groupby_get_group(pdf, group, name, obj): assert_groupby_results_equal(expected, actual) +def test_groupby_select_then_ffill(): + pdf = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2], + "b": [1, None, None, 2, None], + "c": [3, None, None, 4, None], + } + ) + gdf = cudf.from_pandas(pdf) + + expected = pdf.groupby("a")["c"].ffill() + actual = gdf.groupby("a")["c"].ffill() + + assert_groupby_results_equal(expected, actual) + + +def test_groupby_select_then_shift(): + pdf = pd.DataFrame( + {"a": [1, 1, 1, 2, 2], "b": [1, 2, 3, 4, 5], "c": [3, 4, 5, 6, 7]} + ) + gdf = cudf.from_pandas(pdf) + + expected = pdf.groupby("a")["c"].shift(1) + actual = gdf.groupby("a")["c"].shift(1) + + assert_groupby_results_equal(expected, actual) + + +def test_groupby_select_then_diff(): + pdf = pd.DataFrame( + {"a": [1, 1, 1, 2, 2], "b": [1, 2, 3, 4, 5], "c": [3, 4, 5, 6, 7]} + ) + gdf = cudf.from_pandas(pdf) + + expected = pdf.groupby("a")["c"].diff(1) + actual = gdf.groupby("a")["c"].diff(1) + + assert_groupby_results_equal(expected, actual) + + # TODO: Add a test including datetime64[ms] column in input data diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index f2a20a73b63..2b0422ffecb 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -7,108 +7,23 @@ import pytest import cudf -from cudf import DataFrame, Index, Series +from cudf import DataFrame from cudf.testing import _utils as utils 
-def test_onehot_simple(): - np.random.seed(0) - df = DataFrame() - # Populate with data [0, 10) - df["vals"] = np.arange(10, dtype=np.int32) - # One Hot (Series) - for i, col in enumerate(df["vals"].one_hot_encoding(list(range(10)))): - arr = col.to_numpy() - # Verify 1 in the right position - np.testing.assert_equal(arr[i], 1) - # Every other slots are 0s - np.testing.assert_equal(arr[:i], 0) - np.testing.assert_equal(arr[i + 1 :], 0) - # One Hot (DataFrame) - df2 = df.one_hot_encoding( - column="vals", prefix="vals", cats=list(range(10)) - ) - assert df2.columns[0] == "vals" - for i in range(1, len(df2.columns)): - assert df2.columns[i] == "vals_%s" % (i - 1) - got = df2[df2.columns[1:]].values_host - expect = np.identity(got.shape[0]) - np.testing.assert_equal(got, expect) - - -def test_onehot_random(): - df = DataFrame() - low = 10 - high = 17 - size = 10 - df["src"] = src = np.random.randint(low=low, high=high, size=size) - df2 = df.one_hot_encoding( - column="src", prefix="out_", cats=tuple(range(10, 17)) - ) - mat = df2[df2.columns[1:]].values_host - - for val in range(low, high): - colidx = val - low - arr = mat[:, colidx] - mask = src == val - np.testing.assert_equal(arr, mask) - - -def test_onehot_masked(): - np.random.seed(0) - high = 5 - size = 100 - arr = np.random.randint(low=0, high=high, size=size) - bitmask = utils.random_bitmask(size) - bytemask = np.asarray( - utils.expand_bits_to_bytes(bitmask)[:size], dtype=np.bool_ - ) - arr[~bytemask] = -1 - - df = DataFrame() - df["a"] = Series(arr).set_mask(bitmask) - - out = df.one_hot_encoding( - "a", cats=list(range(high)), prefix="a", dtype=np.int32 - ) - - assert tuple(out.columns) == ("a", "a_0", "a_1", "a_2", "a_3", "a_4") - np.testing.assert_array_equal((out["a_0"] == 1).to_numpy(), arr == 0) - np.testing.assert_array_equal((out["a_1"] == 1).to_numpy(), arr == 1) - np.testing.assert_array_equal((out["a_2"] == 1).to_numpy(), arr == 2) - np.testing.assert_array_equal((out["a_3"] == 1).to_numpy(), arr 
== 3) - np.testing.assert_array_equal((out["a_4"] == 1).to_numpy(), arr == 4) - - -def test_onehot_generic_index(): - np.random.seed(0) - size = 33 - indices = np.random.randint(low=0, high=100, size=size) - df = DataFrame() - values = np.random.randint(low=0, high=4, size=size) - df["fo"] = Series(values, index=Index(indices)) - out = df.one_hot_encoding( - "fo", cats=df.fo.unique(), prefix="fo", dtype=np.int32 - ) - assert set(out.columns) == {"fo", "fo_0", "fo_1", "fo_2", "fo_3"} - np.testing.assert_array_equal(values == 0, out.fo_0.to_numpy()) - np.testing.assert_array_equal(values == 1, out.fo_1.to_numpy()) - np.testing.assert_array_equal(values == 2, out.fo_2.to_numpy()) - np.testing.assert_array_equal(values == 3, out.fo_3.to_numpy()) - - @pytest.mark.parametrize( - "data", + "data, index", [ - np.arange(10), - ["abc", "zyx", "pppp"], - [], - pd.Series(["cudf", "hello", "pandas"] * 10, dtype="category"), + (np.arange(10), None), + (["abc", "zyx", "pppp"], None), + ([], None), + (pd.Series(["cudf", "hello", "pandas"] * 10, dtype="category"), None), + (range(10), [1, 2, 3, 4, 5] * 2), ], ) -def test_get_dummies(data): - gdf = DataFrame({"x": data}) - pdf = pd.DataFrame({"x": data}) +def test_get_dummies(data, index): + gdf = DataFrame({"x": data}, index=index) + pdf = pd.DataFrame({"x": data}, index=index) encoded_expected = pd.get_dummies(pdf, prefix="test") encoded_actual = cudf.get_dummies(gdf, prefix="test") diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index fe95b2930df..f8c136b8c2d 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -98,15 +98,9 @@ def test_full_dataframe_20(dtype, nrows, ncols): ).astype(dtype) gdf = cudf.from_pandas(pdf) - ncols, nrows = gdf._repr_pandas025_formatting(ncols, nrows, dtype) - pd.options.display.max_rows = int(nrows) - pd.options.display.max_columns = int(ncols) - assert pdf.__repr__() == gdf.__repr__() assert pdf._repr_html_() == 
gdf._repr_html_() assert pdf._repr_latex_() == gdf._repr_latex_() - pd.reset_option("display.max_rows") - pd.reset_option("display.max_columns") @pytest.mark.parametrize("dtype", repr_categories) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 583d2c7a8dd..ffdd53c58ac 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1345,6 +1345,122 @@ def test_nullable_bool_dtype_series(data, bool_dtype): assert_eq(psr, gsr.to_pandas(nullable=True)) +@pytest.mark.parametrize("level", [None, 0, "l0", 1, ["l0", 1]]) +@pytest.mark.parametrize("drop", [True, False]) +@pytest.mark.parametrize("original_name", [None, "original_ser"]) +@pytest.mark.parametrize("name", [None, "ser"]) +@pytest.mark.parametrize("inplace", [True, False]) +def test_reset_index(level, drop, inplace, original_name, name): + midx = pd.MultiIndex.from_tuples( + [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["l0", None] + ) + ps = pd.Series(range(4), index=midx, name=original_name) + gs = cudf.from_pandas(ps) + + if not drop and inplace: + pytest.skip( + "For exception checks, see " + "test_reset_index_dup_level_name_exceptions" + ) + + expect = ps.reset_index(level=level, drop=drop, name=name, inplace=inplace) + got = gs.reset_index(level=level, drop=drop, name=name, inplace=inplace) + if inplace: + expect = ps + got = gs + + assert_eq(expect, got) + + +@pytest.mark.parametrize("level", [None, 0, 1, [None]]) +@pytest.mark.parametrize("drop", [False, True]) +@pytest.mark.parametrize("inplace", [False, True]) +@pytest.mark.parametrize("original_name", [None, "original_ser"]) +@pytest.mark.parametrize("name", [None, "ser"]) +def test_reset_index_dup_level_name(level, drop, inplace, original_name, name): + # midx levels are named [None, None] + midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) + ps = pd.Series(range(4), index=midx, name=original_name) + gs = cudf.from_pandas(ps) + if level == 
[None] or not drop and inplace: + pytest.skip( + "For exception checks, see " + "test_reset_index_dup_level_name_exceptions" + ) + + expect = ps.reset_index(level=level, drop=drop, inplace=inplace, name=name) + got = gs.reset_index(level=level, drop=drop, inplace=inplace, name=name) + if inplace: + expect = ps + got = gs + + assert_eq(expect, got) + + +@pytest.mark.parametrize("drop", [True, False]) +@pytest.mark.parametrize("inplace", [True, False]) +@pytest.mark.parametrize("original_name", [None, "original_ser"]) +@pytest.mark.parametrize("name", [None, "ser"]) +def test_reset_index_named(drop, inplace, original_name, name): + ps = pd.Series(range(4), index=["x", "y", "z", "w"], name=original_name) + gs = cudf.from_pandas(ps) + + ps.index.name = "cudf" + gs.index.name = "cudf" + + if not drop and inplace: + pytest.skip( + "For exception checks, see " + "test_reset_index_dup_level_name_exceptions" + ) + + expect = ps.reset_index(drop=drop, inplace=inplace, name=name) + got = gs.reset_index(drop=drop, inplace=inplace, name=name) + + if inplace: + expect = ps + got = gs + + assert_eq(expect, got) + + +def test_reset_index_dup_level_name_exceptions(): + midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) + ps = pd.Series(range(4), index=midx) + gs = cudf.from_pandas(ps) + + # Should specify duplicate level names with level number. + assert_exceptions_equal( + lfunc=ps.reset_index, + rfunc=gs.reset_index, + lfunc_args_and_kwargs=([], {"level": [None]},), + rfunc_args_and_kwargs=([], {"level": [None]},), + expected_error_message="occurs multiple times, use a level number", + ) + + # Cannot use drop=False and inplace=True to turn a series into dataframe. + assert_exceptions_equal( + lfunc=ps.reset_index, + rfunc=gs.reset_index, + lfunc_args_and_kwargs=([], {"drop": False, "inplace": True},), + rfunc_args_and_kwargs=([], {"drop": False, "inplace": True},), + ) + + # Pandas raises the above exception should these two inputs crosses. 
+ assert_exceptions_equal( + lfunc=ps.reset_index, + rfunc=gs.reset_index, + lfunc_args_and_kwargs=( + [], + {"level": [None], "drop": False, "inplace": True}, + ), + rfunc_args_and_kwargs=( + [], + {"level": [None], "drop": False, "inplace": True}, + ), + ) + + def test_series_add_prefix(): cd_s = cudf.Series([1, 2, 3, 4]) pd_s = cd_s.to_pandas() diff --git a/python/cudf/cudf/tests/test_subword_tokenizer.py b/python/cudf/cudf/tests/test_subword_tokenizer.py index 717b3de8479..ec6e0b30cb1 100644 --- a/python/cudf/cudf/tests/test_subword_tokenizer.py +++ b/python/cudf/cudf/tests/test_subword_tokenizer.py @@ -1,12 +1,14 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. import os +import cupy import numpy as np import pytest from transformers import BertTokenizer import cudf from cudf.core.subword_tokenizer import SubwordTokenizer +from cudf.testing._utils import assert_eq @pytest.fixture(scope="module") @@ -26,30 +28,6 @@ def assert_equal_tokenization_outputs(hf_output, cudf_output): ) -def test_subword_tokenize_on_disk_vocab_str_api(datadir): - """ - Tests the subword-tokenizer API where - the vocabulary is not pre-loaded - and is accessed via the string accessor - """ - with open( - os.path.join(datadir, "test_sentences.txt"), encoding="utf-8" - ) as file: - input_sentence_ls = [line.strip() for line in file] - - vocab_dir = os.path.join(datadir, "bert_base_cased_sampled") - vocab_hash_path = os.path.join(vocab_dir, "vocab-hash.txt") - - ser = cudf.Series(input_sentence_ls) - tokens, masks, metadata = ser.str.subword_tokenize( - vocab_hash_path, - max_length=32, - stride=32, - do_lower=True, - max_rows_tensor=len(ser), - ) - - @pytest.mark.parametrize("seq_len", [32, 64]) @pytest.mark.parametrize("stride", [0, 15, 30]) @pytest.mark.parametrize("add_special_tokens", [True, False]) @@ -115,3 +93,145 @@ def test_subword_tokenize_with_truncation(datadir): truncation=False, add_special_tokens=True, ) + + +def test_text_subword_tokenize(tmpdir): + sr = cudf.Series( + [ + 
"This is a test", + "A test this is", + "Is test a this", + "Test test", + "this This", + ] + ) + hash_file = tmpdir.mkdir("nvtext").join("tmp_hashed_vocab.txt") + content = "1\n0\n23\n" + coefficients = [65559] * 23 + for c in coefficients: + content = content + str(c) + " 0\n" + # based on values from the bert_hash_table.txt file for the + # test words used here: 'this' 'is' 'a' test' + table = [0] * 23 + table[0] = 3015668 + table[1] = 6205475701751155871 + table[5] = 6358029 + table[16] = 451412625363 + table[20] = 6206321707968235495 + content = content + "23\n" + for v in table: + content = content + str(v) + "\n" + content = content + "100\n101\n102\n\n" + hash_file.write(content) + + cudf_tokenizer = SubwordTokenizer(hash_file) + + token_d = cudf_tokenizer( + sr, 8, 8, add_special_tokens=False, truncation=True + ) + tokens, masks, metadata = ( + token_d["input_ids"], + token_d["attention_mask"], + token_d["metadata"], + ) + expected_tokens = cupy.asarray( + [ + 2023, + 2003, + 1037, + 3231, + 0, + 0, + 0, + 0, + 1037, + 3231, + 2023, + 2003, + 0, + 0, + 0, + 0, + 2003, + 3231, + 1037, + 2023, + 0, + 0, + 0, + 0, + 3231, + 3231, + 0, + 0, + 0, + 0, + 0, + 0, + 2023, + 2023, + 0, + 0, + 0, + 0, + 0, + 0, + ], + dtype=np.uint32, + ) + expected_tokens = expected_tokens.reshape(-1, 8) + assert_eq(expected_tokens, tokens) + + expected_masks = cupy.asarray( + [ + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + ], + dtype=np.uint32, + ) + expected_masks = expected_masks.reshape(-1, 8) + assert_eq(expected_masks, masks) + + expected_metadata = cupy.asarray( + [0, 0, 3, 1, 0, 3, 2, 0, 3, 3, 0, 1, 4, 0, 1], dtype=np.uint32 + ) + expected_metadata = expected_metadata.reshape(-1, 3) + assert_eq(expected_metadata, metadata) diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 
fcae0a21b6a..a447a60c709 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -1,6 +1,5 @@ # Copyright (c) 2019, NVIDIA CORPORATION. -import cupy import numpy as np import pytest @@ -655,136 +654,6 @@ def test_text_filter_tokens_error_cases(): sr.str.filter_tokens(3, delimiter=["a", "b"]) -def test_text_subword_tokenize(tmpdir): - sr = cudf.Series( - [ - "This is a test", - "A test this is", - "Is test a this", - "Test test", - "this This", - ] - ) - hash_file = tmpdir.mkdir("nvtext").join("tmp_hashed_vocab.txt") - content = "1\n0\n23\n" - coefficients = [65559] * 23 - for c in coefficients: - content = content + str(c) + " 0\n" - # based on values from the bert_hash_table.txt file for the - # test words used here: 'this' 'is' 'a' test' - table = [0] * 23 - table[0] = 3015668 - table[1] = 6205475701751155871 - table[5] = 6358029 - table[16] = 451412625363 - table[20] = 6206321707968235495 - content = content + "23\n" - for v in table: - content = content + str(v) + "\n" - content = content + "100\n101\n102\n\n" - hash_file.write(content) - - tokens, masks, metadata = sr.str.subword_tokenize(str(hash_file), 8, 8) - expected_tokens = cupy.asarray( - [ - 2023, - 2003, - 1037, - 3231, - 0, - 0, - 0, - 0, - 1037, - 3231, - 2023, - 2003, - 0, - 0, - 0, - 0, - 2003, - 3231, - 1037, - 2023, - 0, - 0, - 0, - 0, - 3231, - 3231, - 0, - 0, - 0, - 0, - 0, - 0, - 2023, - 2023, - 0, - 0, - 0, - 0, - 0, - 0, - ], - dtype=np.uint32, - ) - assert_eq(expected_tokens, tokens) - - expected_masks = cupy.asarray( - [ - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 1, - 1, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 0, - 0, - 0, - 0, - 0, - 0, - ], - dtype=np.uint32, - ) - assert_eq(expected_masks, masks) - - expected_metadata = cupy.asarray( - [0, 0, 3, 1, 0, 3, 2, 0, 3, 3, 0, 1, 4, 0, 1], dtype=np.uint32 - ) - assert_eq(expected_metadata, metadata) - - def 
test_edit_distance(): sr = cudf.Series(["kitten", "saturday", "address", "book"]) tg = cudf.Series(["sitting", "sunday", "addressee", "back"]) diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index 57ad612846d..7a4a2673f9b 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -68,6 +68,16 @@ def wrapper(func): return wrapper +def doc_apply(doc): + """Set `__doc__` attribute of `func` to `doc`.""" + + def wrapper(func): + func.__doc__ = doc + return func + + return wrapper + + doc_describe = docfmt_partial( docstring=""" Generate descriptive statistics. diff --git a/python/cudf_kafka/cudf_kafka/_lib/.kafka.pxd.swo b/python/cudf_kafka/cudf_kafka/_lib/.kafka.pxd.swo new file mode 100644 index 00000000000..624b60798ae Binary files /dev/null and b/python/cudf_kafka/cudf_kafka/_lib/.kafka.pxd.swo differ diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd index fc985e58b68..e64d8f82739 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from libc.stdint cimport int32_t, int64_t from libcpp cimport bool @@ -11,14 +11,21 @@ from cudf._lib.cpp.io.types cimport datasource from cudf._lib.io.datasource cimport Datasource +cdef extern from "kafka_callback.hpp" \ + namespace "cudf::io::external::kafka" nogil: + ctypedef object (*python_callable_type)() + + cdef extern from "kafka_consumer.hpp" \ namespace "cudf::io::external::kafka" nogil: cpdef cppclass kafka_consumer: - kafka_consumer(map[string, string] configs) except + + kafka_consumer(map[string, string] configs, + python_callable_type python_callable) except + kafka_consumer(map[string, string] configs, + python_callable_type python_callable, string topic_name, int32_t partition, int64_t start_offset, @@ -49,7 +56,6 @@ cdef extern from "kafka_consumer.hpp" \ cdef class KafkaDatasource(Datasource): cdef unique_ptr[datasource] c_datasource - cdef map[string, string] kafka_configs cdef string topic cdef int32_t partition cdef int64_t start_offset diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx index 5588b69938b..24d072c544e 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx @@ -1,7 +1,7 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from libc.stdint cimport int32_t, int64_t -from libcpp cimport bool +from libcpp cimport bool, nullptr from libcpp.map cimport map from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string @@ -11,19 +11,43 @@ from cudf._lib.cpp.io.types cimport datasource from cudf_kafka._lib.kafka cimport kafka_consumer +# To avoid including in libcudf_kafka +# we introduce this wrapper in Cython +cdef map[string, string] oauth_callback_wrapper(void *ctx): + return ((ctx))() + + cdef class KafkaDatasource(Datasource): def __cinit__(self, - map[string, string] kafka_configs, + object kafka_configs, string topic=b"", int32_t partition=-1, int64_t start_offset=0, int64_t end_offset=0, int32_t batch_timeout=10000, string delimiter=b"",): + + cdef map[string, string] configs + cdef void* python_callable = nullptr + cdef map[string, string] (*python_callable_wrapper)(void *) + + for key in kafka_configs: + if key == 'oauth_cb': + if callable(kafka_configs[key]): + python_callable = kafka_configs[key] + python_callable_wrapper = &oauth_callback_wrapper + else: + raise TypeError("'oauth_cb' configuration must \ + be a Python callable object") + else: + configs[key.encode()] = kafka_configs[key].encode() + if topic != b"" and partition != -1: self.c_datasource = \ - make_unique[kafka_consumer](kafka_configs, + make_unique[kafka_consumer](configs, + python_callable, + python_callable_wrapper, topic, partition, start_offset, @@ -32,7 +56,9 @@ cdef class KafkaDatasource(Datasource): delimiter) else: self.c_datasource = \ - make_unique[kafka_consumer](kafka_configs) + make_unique[kafka_consumer](configs, + python_callable, + python_callable_wrapper) cdef datasource* get_datasource(self) nogil: return self.c_datasource.get() diff --git a/python/custreamz/custreamz/kafka.py b/python/custreamz/custreamz/kafka.py index a301660a2e4..f5d5031602f 100644 --- a/python/custreamz/custreamz/kafka.py +++ b/python/custreamz/custreamz/kafka.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, 
NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import confluent_kafka as ck from cudf_kafka._lib.kafka import KafkaDatasource @@ -25,13 +25,7 @@ def __init__(self, kafka_configs): """ self.kafka_configs = kafka_configs - - self.kafka_confs = { - str.encode(key): str.encode(value) - for key, value in self.kafka_configs.items() - } - - self.kafka_meta_client = KafkaDatasource(self.kafka_confs) + self.kafka_meta_client = KafkaDatasource(kafka_configs) def list_topics(self, specific_topic=None): @@ -145,7 +139,7 @@ def read_gdf( ) kafka_datasource = KafkaDatasource( - self.kafka_confs, + self.kafka_configs, topic.encode(), partition, start, @@ -173,7 +167,10 @@ def read_gdf( kafka_datasource.close(batch_timeout) if result is not None: - return cudf.DataFrame._from_table(result) + if isinstance(result, cudf.DataFrame): + return result + else: + return cudf.DataFrame._from_data(result) else: # empty Dataframe return cudf.DataFrame() diff --git a/python/custreamz/custreamz/tests/test_kafka.py b/python/custreamz/custreamz/tests/test_kafka.py index d29ebf8db8b..ad3b829544b 100644 --- a/python/custreamz/custreamz/tests/test_kafka.py +++ b/python/custreamz/custreamz/tests/test_kafka.py @@ -5,11 +5,10 @@ from cudf.testing._utils import assert_eq -@pytest.mark.parametrize("commit_offset", [-1, 0, 1, 1000]) +@pytest.mark.parametrize("commit_offset", [1, 45, 100, 22, 1000, 10]) @pytest.mark.parametrize("topic", ["cudf-kafka-test-topic"]) def test_kafka_offset(kafka_client, topic, commit_offset): - ck_top = ck.TopicPartition(topic, 0, commit_offset) - offsets = [ck_top] + offsets = [ck.TopicPartition(topic, 0, commit_offset)] kafka_client.commit(offsets=offsets) # Get the offsets that were just committed to Kafka