diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 986acd104cc..224e5221a5b 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -69,7 +69,7 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: build_type: pull-request - test_script: "ci/test_python_cudf.sh" + script: "ci/test_python_cudf.sh" conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build @@ -77,7 +77,7 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: build_type: pull-request - test_script: "ci/test_python_other.sh" + script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 075825e852e..6f7aef79881 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -51,7 +51,7 @@ jobs: branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - test_script: "ci/test_python_cudf.sh" + script: "ci/test_python_cudf.sh" conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit @@ -61,7 +61,7 @@ jobs: branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - test_script: "ci/test_python_other.sh" + script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index 9ea4fa1780b..1bbbf73bd5d 100644 --- a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -106,9 +106,9 @@ struct packed_table { * @endcode * * - * @throws cudf::logic_error if `splits` has end index > size of `input`. - * @throws cudf::logic_error When the value in `splits` is not in the range [0, input.size()). - * @throws cudf::logic_error When the values in the `splits` are 'strictly decreasing'. + * @throws std::out_of_range if `splits` has end index > size of `input`. + * @throws std::out_of_range When the value in `splits` is not in the range [0, input.size()). + * @throws std::invalid_argument When the values in the `splits` are 'strictly decreasing'. * * @param input View of a table to split * @param splits A vector of indices where the view will be split diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index b3a8836b193..b2cde82fada 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,7 +66,7 @@ enum class out_of_bounds_policy : bool { * For dictionary columns, the keys column component is copied and not trimmed * if the gather results in abandoned key elements. * - * @throws cudf::logic_error if gather_map contains null values. + * @throws std::invalid_argument if gather_map contains null values. * * @param source_table The input columns whose rows will be gathered * @param gather_map View into a non-nullable column of integral indices that maps the @@ -152,6 +152,13 @@ std::unique_ptr reverse( * A negative value `i` in the `scatter_map` is interpreted as `i+n`, where `n` * is the number of rows in the `target` table. 
* + * @throws std::invalid_argument if the number of columns in source does not match the number of + * columns in target + * @throws std::invalid_argument if the number of elements in scatter_map exceeds the number of + * rows in source + * @throws cudf::data_type_error if the data types of the source and target columns do not match + * @throws std::invalid_argument if scatter_map contains null values + * * @param source The input columns containing values to be scattered into the * target columns * @param scatter_map A non-nullable column of integral indices that maps the @@ -191,6 +198,11 @@ std::unique_ptr scatter( * If any values in `scatter_map` are outside of the interval [-n, n) where `n` * is the number of rows in the `target` table, behavior is undefined. * + * @throws std::invalid_argument if the number of scalars does not match the number of columns in + * target + * @throws std::invalid_argument if indices contains null values + * @throws cudf::data_type_error if the data types of the scalars and target columns do not match + * * @param source The input scalars containing values to be scattered into the * target columns * @param indices A non-nullable column of integral indices that indicate @@ -302,15 +314,15 @@ std::unique_ptr
empty_like(table_view const& input_table); * If @p source and @p target refer to the same elements and the ranges overlap, * the behavior is undefined. * - * @throws cudf::logic_error if memory reallocation is required (e.g. for + * @throws cudf::data_type_error if memory reallocation is required (e.g. for * variable width types). - * @throws cudf::logic_error for invalid range (if + * @throws std::out_of_range for invalid range (if * @p source_begin > @p source_end, @p source_begin < 0, * @p source_begin >= @p source.size(), @p source_end > @p source.size(), * @p target_begin < 0, target_begin >= @p target.size(), or * @p target_begin + (@p source_end - @p source_begin) > @p target.size()). - * @throws cudf::logic_error if @p target and @p source have different types. - * @throws cudf::logic_error if @p source has null values and @p target is not + * @throws cudf::data_type_error if @p target and @p source have different types. + * @throws std::invalid_argument if @p source has null values and @p target is not * nullable. * * @param source The column to copy from @@ -341,12 +353,13 @@ void copy_range_in_place(column_view const& source, * If @p source and @p target refer to the same elements and the ranges overlap, * the behavior is undefined. * - * @throws cudf::logic_error for invalid range (if - * @p source_begin > @p source_end, @p source_begin < 0, - * @p source_begin >= @p source.size(), @p source_end > @p source.size(), - * @p target_begin < 0, target_begin >= @p target.size(), or - * @p target_begin + (@p source_end - @p source_begin) > @p target.size()). - * @throws cudf::logic_error if @p target and @p source have different types. + * A range is considered invalid if: + * - Either the begin or end indices are out of bounds for the corresponding column + * - Begin is greater than end for source or target + * - The size of the source range would overflow the target column starting at target_begin + * + * @throws std::out_of_range for any invalid range. 
+ * @throws cudf::data_type_error if @p target and @p source have different types. * * @param source The column to copy from inside the range * @param target The column to copy from outside the range @@ -399,8 +412,8 @@ std::unique_ptr copy_range( * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned result's device memory * - * @throw cudf::logic_error if @p input dtype is neither fixed-width nor string type - * @throw cudf::logic_error if @p fill_value dtype does not match @p input dtype. + * @throw cudf::data_type_error if @p input dtype is neither fixed-width nor string type + * @throw cudf::data_type_error if @p fill_value dtype does not match @p input dtype. * * @return The shifted column */ @@ -432,9 +445,9 @@ std::unique_ptr shift( * output: {{12, 14}, {20, 22, 24, 26}, {14, 16}, {}} * @endcode * - * @throws cudf::logic_error if `indices` size is not even. - * @throws cudf::logic_error When the values in the pair are strictly decreasing. - * @throws cudf::logic_error When any of the values in the pair don't belong to + * @throws std::invalid_argument if `indices` size is not even. + * @throws std::invalid_argument When the values in the pair are strictly decreasing. + * @throws std::out_of_range When any of the values in the pair don't belong to * the range [0, input.size()). * * @param input View of column to slice @@ -476,9 +489,9 @@ std::vector slice(column_view const& input, * {{52, 54}, {60, 22, 24, 26}, {14, 16}, {}}] * @endcode * - * @throws cudf::logic_error if `indices` size is not even. - * @throws cudf::logic_error When the values in the pair are strictly decreasing. - * @throws cudf::logic_error When any of the values in the pair don't belong to + * @throws std::invalid_argument if `indices` size is not even. + * @throws std::invalid_argument When the values in the pair are strictly decreasing. 
+ * @throws std::out_of_range When any of the values in the pair don't belong to * the range [0, input.size()). * * @param input View of table to slice @@ -521,9 +534,9 @@ std::vector slice(table_view const& input, * output: {{10, 12}, {14, 16, 18}, {20, 22, 24, 26}, {28}} * @endcode * - * @throws cudf::logic_error if `splits` has end index > size of `input`. - * @throws cudf::logic_error When the value in `splits` is not in the range [0, input.size()). - * @throws cudf::logic_error When the values in the `splits` are 'strictly decreasing'. + * @throws std::out_of_range if `splits` has end index > size of `input`. + * @throws std::out_of_range When the value in `splits` is not in the range [0, input.size()). + * @throws std::invalid_argument When the values in the `splits` are 'strictly decreasing'. * * @param input View of column to split * @param splits Indices where the view will be split @@ -567,9 +580,9 @@ std::vector split(column_view const& input, * {{50, 52}, {54, 56, 58}, {60, 62, 64, 66}, {68}}] * @endcode * - * @throws cudf::logic_error if `splits` has end index > size of `input`. - * @throws cudf::logic_error When the value in `splits` is not in the range [0, input.size()). - * @throws cudf::logic_error When the values in the `splits` are 'strictly decreasing'. + * @throws std::out_of_range if `splits` has end index > size of `input`. + * @throws std::out_of_range When the value in `splits` is not in the range [0, input.size()). + * @throws std::invalid_argument When the values in the `splits` are 'strictly decreasing'. * * @param input View of a table to split * @param splits Indices where the view will be split @@ -594,10 +607,10 @@ std::vector split(table_view const& input, * Selects each element i in the output column from either @p rhs or @p lhs using the following * rule: `output[i] = (boolean_mask.valid(i) and boolean_mask[i]) ? 
lhs[i] : rhs[i]` * - * @throws cudf::logic_error if lhs and rhs are not of the same type - * @throws cudf::logic_error if lhs and rhs are not of the same length - * @throws cudf::logic_error if boolean mask is not of type bool - * @throws cudf::logic_error if boolean mask is not of the same length as lhs and rhs + * @throws cudf::data_type_error if lhs and rhs are not of the same type + * @throws std::invalid_argument if lhs and rhs are not of the same length + * @throws cudf::data_type_error if boolean mask is not of type bool + * @throws std::invalid_argument if boolean mask is not of the same length as lhs and rhs * @param lhs left-hand column_view * @param rhs right-hand column_view * @param boolean_mask column of `type_id::BOOL8` representing "left (true) / right (false)" @@ -621,9 +634,9 @@ std::unique_ptr copy_if_else( * Selects each element i in the output column from either @p rhs or @p lhs using the following * rule: `output[i] = (boolean_mask.valid(i) and boolean_mask[i]) ? lhs : rhs[i]` * - * @throws cudf::logic_error if lhs and rhs are not of the same type - * @throws cudf::logic_error if boolean mask is not of type bool - * @throws cudf::logic_error if boolean mask is not of the same length as rhs + * @throws cudf::data_type_error if lhs and rhs are not of the same type + * @throws cudf::data_type_error if boolean mask is not of type bool + * @throws std::invalid_argument if boolean mask is not of the same length as rhs * @param lhs left-hand scalar * @param rhs right-hand column_view * @param boolean_mask column of `type_id::BOOL8` representing "left (true) / right (false)" @@ -647,9 +660,9 @@ std::unique_ptr copy_if_else( * Selects each element i in the output column from either @p rhs or @p lhs using the following * rule: `output[i] = (boolean_mask.valid(i) and boolean_mask[i]) ? 
lhs[i] : rhs` * - * @throws cudf::logic_error if lhs and rhs are not of the same type - * @throws cudf::logic_error if boolean mask is not of type bool - * @throws cudf::logic_error if boolean mask is not of the same length as lhs + * @throws cudf::data_type_error if lhs and rhs are not of the same type + * @throws cudf::data_type_error if boolean mask is not of type bool + * @throws std::invalid_argument if boolean mask is not of the same length as lhs * @param lhs left-hand column_view * @param rhs right-hand scalar * @param boolean_mask column of `type_id::BOOL8` representing "left (true) / right (false)" @@ -713,11 +726,11 @@ std::unique_ptr copy_if_else( * output: {{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}} * @endcode * - * @throw cudf::logic_error if input.num_columns() != target.num_columns() - * @throws cudf::logic_error if any `i`th input_column type != `i`th target_column type - * @throws cudf::logic_error if boolean_mask.type() != bool - * @throws cudf::logic_error if boolean_mask.size() != target.num_rows() - * @throws cudf::logic_error if number of `true` in `boolean_mask` > input.num_rows() + * @throws std::invalid_argument if input.num_columns() != target.num_columns() + * @throws cudf::data_type_error if any `i`th input_column type != `i`th target_column type + * @throws cudf::data_type_error if boolean_mask.type() != bool + * @throws std::invalid_argument if boolean_mask.size() != target.num_rows() + * @throws std::invalid_argument if number of `true` in `boolean_mask` > input.num_rows() * * @param input table_view (set of dense columns) to scatter * @param target table_view to modify with scattered values from `input` @@ -740,8 +753,8 @@ std::unique_ptr
boolean_mask_scatter( * * @ingroup copy_scatter * - * The `i`th scalar in `input` will be written to all columns of the output - * table at the location of the `i`th true value in `boolean_mask`. + * The `i`th scalar in `input` will be written to the ith column of the output + * table at the location of every true value in `boolean_mask`. * All other rows in the output will equal the same row in `target`. * * @code{.pseudo} @@ -753,10 +766,10 @@ std::unique_ptr
boolean_mask_scatter( * output: {{ 11, 2, 3, 4, 11, 11, 7, 11, 11, 10}} * @endcode * - * @throw cudf::logic_error if input.size() != target.num_columns() - * @throws cudf::logic_error if any `i`th input_scalar type != `i`th target_column type - * @throws cudf::logic_error if boolean_mask.type() != bool - * @throws cudf::logic_error if boolean_mask.size() != target.size() + * @throws std::invalid_argument if input.size() != target.num_columns() + * @throws cudf::data_type_error if any `i`th input_scalar type != `i`th target_column type + * @throws cudf::data_type_error if boolean_mask.type() != bool + * @throws std::invalid_argument if boolean_mask.size() != target.num_rows() * * @param input scalars to scatter * @param target table_view to modify with scattered values from `input` @@ -779,7 +792,7 @@ std::unique_ptr
boolean_mask_scatter( * @warning This function is expensive (invokes a kernel launch). So, it is not * recommended to be used in performance sensitive code or inside a loop. * - * @throws cudf::logic_error if `index` is not within the range `[0, input.size())` + * @throws std::out_of_range if `index` is not within the range `[0, input.size())` * * @param input Column view to get the element from * @param index Index into `input` to get the element at diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 3b55a62cec0..db373f47a01 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -368,13 +368,16 @@ template size_type validate_segmented_indices(IndexIterator indices_begin, IndexIterator indices_end) { auto const num_indices = static_cast(std::distance(indices_begin, indices_end)); - CUDF_EXPECTS(num_indices % 2 == 0, "Array of indices needs to have an even number of elements."); + CUDF_EXPECTS(num_indices % 2 == 0, + "Array of indices needs to have an even number of elements.", + std::invalid_argument); size_type const num_segments = num_indices / 2; for (size_type i = 0; i < num_segments; i++) { auto begin = indices_begin[2 * i]; auto end = indices_begin[2 * i + 1]; - CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative."); - CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index."); + CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative.", std::out_of_range); + CUDF_EXPECTS( + end >= begin, "End index cannot be smaller than the starting index.", std::invalid_argument); } return num_segments; } diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index c28237587eb..23224d3225d 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -48,6 +48,7 @@ #include #include +#include namespace cudf { namespace { @@ -1729,13 +1730,15 @@ bool check_inputs(cudf::table_view const& 
input, std::vector const& s if (input.num_columns() == 0) { return true; } if (splits.size() > 0) { CUDF_EXPECTS(splits.back() <= input.column(0).size(), - "splits can't exceed size of input columns"); + "splits can't exceed size of input columns", + std::out_of_range); } size_type begin = 0; for (auto end : splits) { - CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative."); - CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index."); - CUDF_EXPECTS(end <= input.column(0).size(), "Slice range out of bounds."); + CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative.", std::out_of_range); + CUDF_EXPECTS( + end >= begin, "End index cannot be smaller than the starting index.", std::invalid_argument); + CUDF_EXPECTS(end <= input.column(0).size(), "Slice range out of bounds.", std::out_of_range); begin = end; } return input.column(0).size() == 0; diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu index 6b7fae32d48..8299c211fad 100644 --- a/cpp/src/copying/copy.cu +++ b/cpp/src/copying/copy.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -35,6 +35,8 @@ #include #include +#include + namespace cudf { namespace detail { namespace { @@ -319,7 +321,8 @@ std::unique_ptr copy_if_else(Left const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(boolean_mask.type() == data_type(type_id::BOOL8), - "Boolean mask column must be of type type_id::BOOL8"); + "Boolean mask column must be of type type_id::BOOL8", + cudf::data_type_error); if (boolean_mask.is_empty()) { return cudf::empty_like(lhs); } @@ -356,9 +359,11 @@ std::unique_ptr copy_if_else(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(boolean_mask.size() == lhs.size(), - "Boolean mask column must be the same size as lhs and rhs columns"); - CUDF_EXPECTS(lhs.size() == rhs.size(), "Both columns must be of the size"); - CUDF_EXPECTS(lhs.type() == rhs.type(), "Both inputs must be of the same type"); + "Boolean mask column must be the same size as lhs and rhs columns", + std::invalid_argument); + CUDF_EXPECTS(lhs.size() == rhs.size(), "Both columns must be of the size", std::invalid_argument); + CUDF_EXPECTS( + lhs.type() == rhs.type(), "Both inputs must be of the same type", cudf::data_type_error); return copy_if_else(lhs, rhs, lhs.has_nulls(), rhs.has_nulls(), boolean_mask, stream, mr); } @@ -370,11 +375,13 @@ std::unique_ptr copy_if_else(scalar const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(boolean_mask.size() == rhs.size(), - "Boolean mask column must be the same size as rhs column"); + "Boolean mask column must be the same size as rhs column", + std::invalid_argument); auto rhs_type = cudf::is_dictionary(rhs.type()) ? 
cudf::dictionary_column_view(rhs).keys_type() : rhs.type(); - CUDF_EXPECTS(lhs.type() == rhs_type, "Both inputs must be of the same type"); + CUDF_EXPECTS( + lhs.type() == rhs_type, "Both inputs must be of the same type", cudf::data_type_error); return copy_if_else(lhs, rhs, !lhs.is_valid(stream), rhs.has_nulls(), boolean_mask, stream, mr); } @@ -386,11 +393,13 @@ std::unique_ptr copy_if_else(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(boolean_mask.size() == lhs.size(), - "Boolean mask column must be the same size as lhs column"); + "Boolean mask column must be the same size as lhs column", + std::invalid_argument); auto lhs_type = cudf::is_dictionary(lhs.type()) ? cudf::dictionary_column_view(lhs).keys_type() : lhs.type(); - CUDF_EXPECTS(lhs_type == rhs.type(), "Both inputs must be of the same type"); + CUDF_EXPECTS( + lhs_type == rhs.type(), "Both inputs must be of the same type", cudf::data_type_error); return copy_if_else(lhs, rhs, lhs.has_nulls(), !rhs.is_valid(stream), boolean_mask, stream, mr); } @@ -401,7 +410,8 @@ std::unique_ptr copy_if_else(scalar const& lhs, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(lhs.type() == rhs.type(), "Both inputs must be of the same type"); + CUDF_EXPECTS( + lhs.type() == rhs.type(), "Both inputs must be of the same type", cudf::data_type_error); return copy_if_else( lhs, rhs, !lhs.is_valid(stream), !rhs.is_valid(stream), boolean_mask, stream, mr); } diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index 61d51f1d284..038646d8cf4 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -38,6 +38,7 @@ #include #include +#include namespace { template @@ -202,14 +203,17 @@ void copy_range_in_place(column_view const& source, rmm::cuda_stream_view stream) { CUDF_EXPECTS(cudf::is_fixed_width(target.type()), - "In-place copy_range does not support variable-sized types."); + "In-place copy_range does not support 
variable-sized types.", + cudf::data_type_error); CUDF_EXPECTS((source_begin >= 0) && (source_end <= source.size()) && (source_begin <= source_end) && (target_begin >= 0) && (target_begin <= target.size() - (source_end - source_begin)), - "Range is out of bounds."); - CUDF_EXPECTS(target.type() == source.type(), "Data type mismatch."); + "Range is out of bounds.", + std::out_of_range); + CUDF_EXPECTS(target.type() == source.type(), "Data type mismatch.", cudf::data_type_error); CUDF_EXPECTS(target.nullable() || not source.has_nulls(), - "target should be nullable if source has null values."); + "target should be nullable if source has null values.", + std::invalid_argument); if (source_end != source_begin) { // otherwise no-op cudf::type_dispatcher(target.type(), @@ -232,8 +236,9 @@ std::unique_ptr copy_range(column_view const& source, CUDF_EXPECTS((source_begin >= 0) && (source_end <= source.size()) && (source_begin <= source_end) && (target_begin >= 0) && (target_begin <= target.size() - (source_end - source_begin)), - "Range is out of bounds."); - CUDF_EXPECTS(target.type() == source.type(), "Data type mismatch."); + "Range is out of bounds.", + std::out_of_range); + CUDF_EXPECTS(target.type() == source.type(), "Data type mismatch.", cudf::data_type_error); return cudf::type_dispatcher( target.type(), diff --git a/cpp/src/copying/gather.cu b/cpp/src/copying/gather.cu index 921f84b6b50..78748e5a00b 100644 --- a/cpp/src/copying/gather.cu +++ b/cpp/src/copying/gather.cu @@ -29,6 +29,8 @@ #include #include +#include + namespace cudf { namespace detail { @@ -39,7 +41,7 @@ std::unique_ptr
gather(table_view const& source_table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(not gather_map.has_nulls(), "gather_map contains nulls"); + CUDF_EXPECTS(not gather_map.has_nulls(), "gather_map contains nulls", std::invalid_argument); // create index type normalizing iterator for the gather_map auto map_begin = indexalator_factory::make_input_iterator(gather_map); diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu index a3f9be0bc76..2e804415439 100644 --- a/cpp/src/copying/get_element.cu +++ b/cpp/src/copying/get_element.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,6 +30,8 @@ #include +#include + namespace cudf { namespace detail { @@ -193,7 +195,7 @@ std::unique_ptr get_element(column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(index >= 0 and index < input.size(), "Index out of bounds"); + CUDF_EXPECTS(index >= 0 and index < input.size(), "Index out of bounds", std::out_of_range); return type_dispatcher(input.type(), get_element_functor{}, input, index, stream, mr); } diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index baa5d85d4d4..7931df4c9f0 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -44,6 +44,8 @@ #include #include +#include + namespace cudf { namespace detail { namespace { @@ -109,7 +111,9 @@ struct column_scalar_scatterer_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - CUDF_EXPECTS(source.get().type() == target.type(), "scalar and column types must match"); + CUDF_EXPECTS(source.get().type() == target.type(), + "scalar and column types must match", + cudf::data_type_error); // make a copy of data and null mask from source auto 
result = std::make_unique(target, stream, mr); @@ -296,17 +300,20 @@ std::unique_ptr
scatter(table_view const& source, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(source.num_columns() == target.num_columns(), - "Number of columns in source and target not equal"); + "Number of columns in source and target not equal", + std::invalid_argument); CUDF_EXPECTS(scatter_map.size() <= source.num_rows(), - "Size of scatter map must be equal to or less than source rows"); + "Size of scatter map must be equal to or less than source rows", + std::invalid_argument); CUDF_EXPECTS(std::equal(source.begin(), source.end(), target.begin(), [](auto const& col1, auto const& col2) { return col1.type().id() == col2.type().id(); }), - "Column types do not match between source and target"); - CUDF_EXPECTS(not scatter_map.has_nulls(), "Scatter map contains nulls"); + "Column types do not match between source and target", + cudf::data_type_error); + CUDF_EXPECTS(not scatter_map.has_nulls(), "Scatter map contains nulls", std::invalid_argument); if (scatter_map.is_empty()) { return std::make_unique
(target, stream, mr); } @@ -340,8 +347,9 @@ std::unique_ptr
scatter(std::vector> rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(source.size() == static_cast(target.num_columns()), - "Number of columns in source and target not equal"); - CUDF_EXPECTS(not indices.has_nulls(), "indices contains nulls"); + "Number of scalars in source and number of columns in target not equal", + std::invalid_argument); + CUDF_EXPECTS(not indices.has_nulls(), "indices contains nulls", std::invalid_argument); if (indices.is_empty()) { return std::make_unique
(target, stream, mr); } @@ -425,10 +433,14 @@ std::unique_ptr
boolean_mask_scatter(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.num_columns() == target.num_columns(), - "Mismatch in number of input columns and target columns"); + "Mismatch in number of input columns and target columns", + std::invalid_argument); CUDF_EXPECTS(boolean_mask.size() == target.num_rows(), - "Boolean mask size and number of target rows mismatch"); - CUDF_EXPECTS(boolean_mask.type().id() == type_id::BOOL8, "Mask must be of Boolean type"); + "Boolean mask size and number of target rows mismatch", + std::invalid_argument); + CUDF_EXPECTS(boolean_mask.type().id() == type_id::BOOL8, + "Mask must be of Boolean type", + cudf::data_type_error); // Count valid pair of input and columns as per type at each column index i CUDF_EXPECTS( std::all_of(thrust::counting_iterator(0), @@ -436,7 +448,8 @@ std::unique_ptr
boolean_mask_scatter(table_view const& input, [&input, &target](auto index) { return ((input.column(index).type().id()) == (target.column(index).type().id())); }), - "Type mismatch in input column and target column"); + "Type mismatch in input column and target column", + cudf::data_type_error); if (target.num_rows() != 0) { std::vector> out_columns(target.num_columns()); @@ -463,10 +476,14 @@ std::unique_ptr
boolean_mask_scatter( rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(static_cast(input.size()) == target.num_columns(), - "Mismatch in number of scalars and target columns"); + "Mismatch in number of scalars and target columns", + std::invalid_argument); CUDF_EXPECTS(boolean_mask.size() == target.num_rows(), - "Boolean mask size and number of target rows mismatch"); - CUDF_EXPECTS(boolean_mask.type().id() == type_id::BOOL8, "Mask must be of Boolean type"); + "Boolean mask size and number of target rows mismatch", + std::invalid_argument); + CUDF_EXPECTS(boolean_mask.type().id() == type_id::BOOL8, + "Mask must be of Boolean type", + cudf::data_type_error); // Count valid pair of input and columns as per type at each column/scalar index i CUDF_EXPECTS( @@ -475,7 +492,8 @@ std::unique_ptr
boolean_mask_scatter( [&input, &target](auto index) { return (input[index].get().type().id() == target.column(index).type().id()); }), - "Type mismatch in input scalar and target column"); + "Type mismatch in input scalar and target column", + cudf::data_type_error); if (target.num_rows() != 0) { std::vector> out_columns(target.num_columns()); diff --git a/cpp/src/copying/shift.cu b/cpp/src/copying/shift.cu index 89d6551737b..8e013bb1212 100644 --- a/cpp/src/copying/shift.cu +++ b/cpp/src/copying/shift.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,6 +38,7 @@ #include #include #include +#include namespace cudf { namespace { @@ -71,7 +72,7 @@ struct shift_functor { std::unique_ptr> operator()(Args&&...) { - CUDF_FAIL("shift only supports fixed-width or string types."); + CUDF_FAIL("shift only supports fixed-width or string types.", cudf::data_type_error); } template @@ -157,7 +158,8 @@ std::unique_ptr shift(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.type() == fill_value.type(), - "shift requires each fill value type to match the corresponding column type."); + "shift requires each fill value type to match the corresponding column type.", + cudf::data_type_error); if (input.is_empty()) { return empty_like(input); } diff --git a/cpp/src/copying/slice.cu b/cpp/src/copying/slice.cu index 7c524dde3c8..dc37addf4ee 100644 --- a/cpp/src/copying/slice.cu +++ b/cpp/src/copying/slice.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,6 +29,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -39,9 +40,9 @@ ColumnView slice(ColumnView const& input, size_type end, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(begin >= 0, "Invalid beginning of range."); - CUDF_EXPECTS(end >= begin, "Invalid end of range."); - CUDF_EXPECTS(end <= input.size(), "Slice range out of bounds."); + CUDF_EXPECTS(begin >= 0, "Invalid beginning of range.", std::out_of_range); + CUDF_EXPECTS(end >= begin, "Invalid end of range.", std::invalid_argument); + CUDF_EXPECTS(end <= input.size(), "Slice range out of bounds.", std::out_of_range); std::vector children{}; children.reserve(input.num_children()); @@ -72,7 +73,7 @@ std::vector slice(column_view const& input, host_span indices, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(indices.size() % 2 == 0, "indices size must be even"); + CUDF_EXPECTS(indices.size() % 2 == 0, "indices size must be even", std::invalid_argument); if (indices.empty()) return {}; @@ -88,9 +89,10 @@ std::vector slice(column_view const& input, auto op = [&](auto i) { auto begin = indices[2 * i]; auto end = indices[2 * i + 1]; - CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative."); - CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index."); - CUDF_EXPECTS(end <= input.size(), "Slice range out of bounds."); + CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative.", std::out_of_range); + CUDF_EXPECTS( + end >= begin, "End index cannot be smaller than the starting index.", std::invalid_argument); + CUDF_EXPECTS(end <= input.size(), "Slice range out of bounds.", std::out_of_range); return column_view{input.type(), end - begin, input.head(), @@ -107,7 +109,7 @@ std::vector slice(table_view const& input, host_span indices, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(indices.size() % 2 == 0, "indices size must be even"); + CUDF_EXPECTS(indices.size() % 2 == 0, "indices size must be even", std::invalid_argument); if (indices.empty()) { return 
{}; } // 2d arrangement of column_views that represent the outgoing table_views sliced_table[i][j] diff --git a/cpp/src/copying/split.cpp b/cpp/src/copying/split.cpp index 1621bcdb36d..832a72ed5b0 100644 --- a/cpp/src/copying/split.cpp +++ b/cpp/src/copying/split.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -34,7 +35,8 @@ std::vector split(T const& input, rmm::cuda_stream_view stream) { if (splits.empty() or column_size == 0) { return std::vector{input}; } - CUDF_EXPECTS(splits.back() <= column_size, "splits can't exceed size of input columns"); + CUDF_EXPECTS( + splits.back() <= column_size, "splits can't exceed size of input columns", std::out_of_range); // If the size is not zero, the split will always start at `0` std::vector indices{0}; diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 86e4da664a8..b3a029224d7 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -103,8 +103,11 @@ struct TransduceToNormalizedQuotes { // SQS | {'} -> {"} // SQS | {"} -> {\"} // SQS | {\} -> + // DQS | {\} -> // SEC | {'} -> {'} // SEC | Sigma\{'} -> {\*} + // DEC | {'} -> {'} + // DEC | Sigma\{'} -> {\*} // Whether this transition translates to the escape sequence: \" bool const outputs_escape_sequence = @@ -119,20 +122,23 @@ struct TransduceToNormalizedQuotes { return '"'; } // Case when the read symbol is an escape character - the actual translation for \ for some - // symbol is handled by transitions from SEC. 
For now, there is no output for this - // transition - if ((match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)) && - ((state_id == static_cast(dfa_states::TT_SQS)))) { + // symbol is handled by transitions from SEC. The same logic applies for the transition from + // DEC. For now, there is no output for this transition + if (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR) && + (state_id == static_cast(dfa_states::TT_SQS) || + state_id == static_cast(dfa_states::TT_DQS))) { return 0; } - // Case when an escaped single quote in an input single-quoted string needs to be replaced by an - // unescaped single quote - if ((match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) && - ((state_id == static_cast(dfa_states::TT_SEC)))) { + // Case when an escaped single quote in an input single-quoted or double-quoted string needs + // to be replaced by an unescaped single quote + if (match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR) && + (state_id == static_cast(dfa_states::TT_SEC) || + state_id == static_cast(dfa_states::TT_DEC))) { return '\''; } // Case when an escaped symbol that is not a single-quote needs to be replaced with \ - if (state_id == static_cast(dfa_states::TT_SEC)) { + if (state_id == static_cast(dfa_states::TT_SEC) || + state_id == static_cast(dfa_states::TT_DEC)) { return (relative_offset == 0) ? 
'\\' : read_symbol; } // In all other cases we simply output the input symbol @@ -156,18 +162,23 @@ struct TransduceToNormalizedQuotes { (match_id == static_cast(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR)); // Number of characters to output on this transition if (sqs_outputs_escape_sequence) { return 2; } + // Whether this transition translates to the escape sequence \ or unescaped ' - bool const sec_outputs_escape_sequence = - (state_id == static_cast(dfa_states::TT_SEC)) && + bool const sec_dec_outputs_escape_sequence = + (state_id == static_cast(dfa_states::TT_SEC) || + state_id == static_cast(dfa_states::TT_DEC)) && (match_id != static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)); // Number of characters to output on this transition - if (sec_outputs_escape_sequence) { return 2; } + if (sec_dec_outputs_escape_sequence) { return 2; } + // Whether this transition translates to no output - bool const sqs_outputs_nop = - (state_id == static_cast(dfa_states::TT_SQS)) && + bool const sqs_dqs_outputs_nop = + (state_id == static_cast(dfa_states::TT_SQS) || + state_id == static_cast(dfa_states::TT_DQS)) && (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)); // Number of characters to output on this transition - if (sqs_outputs_nop) { return 0; } + if (sqs_dqs_outputs_nop) { return 0; } + return 1; } }; diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index fb17545875a..d881ab6f9b7 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -2896,9 +2896,9 @@ __device__ std::pair get_extremum(statistics_val const* s return {scratch, sizeof(float)}; } case dtype_int64: + case dtype_decimal64: case dtype_timestamp64: case dtype_float64: return {stats_val, sizeof(int64_t)}; - case dtype_decimal64: case dtype_decimal128: byte_reverse128(stats_val->d128_val, scratch); return {scratch, sizeof(__int128_t)}; diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp index 
72ef88e4ed1..4bf648bed5a 100644 --- a/cpp/tests/bitmask/bitmask_tests.cpp +++ b/cpp/tests/bitmask/bitmask_tests.cpp @@ -33,6 +33,8 @@ #include #include +#include + struct BitmaskUtilitiesTest : public cudf::test::BaseFixture {}; TEST_F(BitmaskUtilitiesTest, StateNullCount) @@ -110,10 +112,10 @@ TEST_F(CountBitmaskTest, NegativeStart) std::vector indices = {0, 16, -1, 32}; EXPECT_THROW( cudf::detail::segmented_count_set_bits(mask.data(), indices, cudf::get_default_stream()), - cudf::logic_error); + std::out_of_range); EXPECT_THROW( cudf::detail::segmented_valid_count(mask.data(), indices, cudf::get_default_stream()), - cudf::logic_error); + std::out_of_range); } TEST_F(CountBitmaskTest, StartLargerThanStop) @@ -127,10 +129,10 @@ TEST_F(CountBitmaskTest, StartLargerThanStop) std::vector indices = {0, 16, 31, 30}; EXPECT_THROW( cudf::detail::segmented_count_set_bits(mask.data(), indices, cudf::get_default_stream()), - cudf::logic_error); + std::invalid_argument); EXPECT_THROW( cudf::detail::segmented_valid_count(mask.data(), indices, cudf::get_default_stream()), - cudf::logic_error); + std::invalid_argument); } TEST_F(CountBitmaskTest, EmptyRange) diff --git a/cpp/tests/copying/copy_range_tests.cpp b/cpp/tests/copying/copy_range_tests.cpp index 96fbdcb1eb7..bcc0ac29b3e 100644 --- a/cpp/tests/copying/copy_range_tests.cpp +++ b/cpp/tests/copying/copy_range_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,6 +30,8 @@ #include #include +#include + auto all_valid = [](cudf::size_type row) { return true; }; auto even_valid = [](cudf::size_type row) { return (row % 2 == 0); }; @@ -378,7 +380,7 @@ TEST_F(CopyRangeErrorTestFixture, InvalidInplaceCall) cudf::mutable_column_view target_view{target}; // source has null values but target is not nullable. 
- EXPECT_THROW(cudf::copy_range_in_place(source, target_view, 0, size, 0), cudf::logic_error); + EXPECT_THROW(cudf::copy_range_in_place(source, target_view, 0, size, 0), std::invalid_argument); std::vector strings{"", "this", "is", "a", "column", "of", "strings"}; auto target_string = cudf::test::strings_column_wrapper(strings.begin(), strings.end()); @@ -386,7 +388,7 @@ TEST_F(CopyRangeErrorTestFixture, InvalidInplaceCall) cudf::mutable_column_view target_view_string{target_string}; EXPECT_THROW(cudf::copy_range_in_place(source_string, target_view_string, 0, size, 0), - cudf::logic_error); + cudf::data_type_error); } TEST_F(CopyRangeErrorTestFixture, InvalidRange) @@ -407,32 +409,32 @@ TEST_F(CopyRangeErrorTestFixture, InvalidRange) EXPECT_NO_THROW(auto p_ret = cudf::copy_range(source, target, 0, 0, 0)); // source_begin is negative - EXPECT_THROW(cudf::copy_range_in_place(source, target_view, -1, size, 0), cudf::logic_error); - EXPECT_THROW(auto p_ret = cudf::copy_range(source, target, -1, size, 0), cudf::logic_error); + EXPECT_THROW(cudf::copy_range_in_place(source, target_view, -1, size, 0), std::out_of_range); + EXPECT_THROW(auto p_ret = cudf::copy_range(source, target, -1, size, 0), std::out_of_range); // source_begin > source_end - EXPECT_THROW(cudf::copy_range_in_place(source, target_view, 10, 5, 0), cudf::logic_error); - EXPECT_THROW(auto p_ret = cudf::copy_range(source, target, 10, 5, 0), cudf::logic_error); + EXPECT_THROW(cudf::copy_range_in_place(source, target_view, 10, 5, 0), std::out_of_range); + EXPECT_THROW(auto p_ret = cudf::copy_range(source, target, 10, 5, 0), std::out_of_range); // source_begin >= source.size() - EXPECT_THROW(cudf::copy_range_in_place(source, target_view, 101, 100, 0), cudf::logic_error); - EXPECT_THROW(auto p_ret = cudf::copy_range(source, target, 101, 100, 0), cudf::logic_error); + EXPECT_THROW(cudf::copy_range_in_place(source, target_view, 101, 100, 0), std::out_of_range); + EXPECT_THROW(auto p_ret = cudf::copy_range(source, 
target, 101, 100, 0), std::out_of_range); // source_end > source.size() - EXPECT_THROW(cudf::copy_range_in_place(source, target_view, 99, 101, 0), cudf::logic_error); - EXPECT_THROW(auto p_ret = cudf::copy_range(source, target, 99, 101, 0), cudf::logic_error); + EXPECT_THROW(cudf::copy_range_in_place(source, target_view, 99, 101, 0), std::out_of_range); + EXPECT_THROW(auto p_ret = cudf::copy_range(source, target, 99, 101, 0), std::out_of_range); // target_begin < 0 - EXPECT_THROW(cudf::copy_range_in_place(source, target_view, 50, 100, -5), cudf::logic_error); - EXPECT_THROW(auto p_ret = cudf::copy_range(source, target, 50, 100, -5), cudf::logic_error); + EXPECT_THROW(cudf::copy_range_in_place(source, target_view, 50, 100, -5), std::out_of_range); + EXPECT_THROW(auto p_ret = cudf::copy_range(source, target, 50, 100, -5), std::out_of_range); // target_begin >= target.size() - EXPECT_THROW(cudf::copy_range_in_place(source, target_view, 50, 100, 100), cudf::logic_error); - EXPECT_THROW(auto p_ret = cudf::copy_range(source, target, 50, 100, 100), cudf::logic_error); + EXPECT_THROW(cudf::copy_range_in_place(source, target_view, 50, 100, 100), std::out_of_range); + EXPECT_THROW(auto p_ret = cudf::copy_range(source, target, 50, 100, 100), std::out_of_range); // target_begin + (source_end - source_begin) > target.size() - EXPECT_THROW(cudf::copy_range_in_place(source, target_view, 50, 100, 80), cudf::logic_error); - EXPECT_THROW(auto p_ret = cudf::copy_range(source, target, 50, 100, 80), cudf::logic_error); + EXPECT_THROW(cudf::copy_range_in_place(source, target_view, 50, 100, 80), std::out_of_range); + EXPECT_THROW(auto p_ret = cudf::copy_range(source, target, 50, 100, 80), std::out_of_range); // Empty column target = cudf::test::fixed_width_column_wrapper{}; @@ -457,8 +459,8 @@ TEST_F(CopyRangeErrorTestFixture, DTypeMismatch) cudf::mutable_column_view target_view{target}; - EXPECT_THROW(cudf::copy_range_in_place(source, target_view, 0, 100, 0), cudf::logic_error); - 
EXPECT_THROW(cudf::copy_range(source, target, 0, 100, 0), cudf::logic_error); + EXPECT_THROW(cudf::copy_range_in_place(source, target_view, 0, 100, 0), cudf::data_type_error); + EXPECT_THROW(cudf::copy_range(source, target, 0, 100, 0), cudf::data_type_error); auto dict_target = cudf::dictionary::encode(target); auto dict_source = cudf::dictionary::encode(source); @@ -516,5 +518,5 @@ TYPED_TEST(FixedPointTypesCopyRange, FixedPointScaleMismatch) auto const source = fp_wrapper{{110, 220, 330, 440, 550, 660}, scale_type{-2}}; auto const target = fp_wrapper{{0, 0, 0, 0, 0, 0}, scale_type{-3}}; - EXPECT_THROW(cudf::copy_range(source, target, 1, 4, 1), cudf::logic_error); + EXPECT_THROW(cudf::copy_range(source, target, 1, 4, 1), cudf::data_type_error); } diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp index 737937367d5..138e1935363 100644 --- a/cpp/tests/copying/copy_tests.cpp +++ b/cpp/tests/copying/copy_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -30,6 +30,8 @@ #include #include +#include + template struct CopyTest : public cudf::test::BaseFixture {}; @@ -215,7 +217,7 @@ TYPED_TEST(CopyTest, CopyIfElseBadInputLength) wrapper lhs_w({5, 5, 5, 5}); wrapper rhs_w({6, 6, 6, 6}); - EXPECT_THROW(cudf::copy_if_else(lhs_w, rhs_w, mask_w), cudf::logic_error); + EXPECT_THROW(cudf::copy_if_else(lhs_w, rhs_w, mask_w), std::invalid_argument); } // column length mismatch @@ -225,7 +227,7 @@ TYPED_TEST(CopyTest, CopyIfElseBadInputLength) wrapper lhs_w({5, 5, 5}); wrapper rhs_w({6, 6, 6, 6}); - EXPECT_THROW(cudf::copy_if_else(lhs_w, rhs_w, mask_w), cudf::logic_error); + EXPECT_THROW(cudf::copy_if_else(lhs_w, rhs_w, mask_w), std::invalid_argument); } } @@ -465,7 +467,7 @@ TEST_F(CopyTestUntyped, CopyIfElseTypeMismatch) wrapper lhs_w{5, 5, 5, 5}; wrapper rhs_w{6, 6, 6, 6}; - EXPECT_THROW(cudf::copy_if_else(lhs_w, rhs_w, mask_w), cudf::logic_error); + EXPECT_THROW(cudf::copy_if_else(lhs_w, rhs_w, mask_w), cudf::data_type_error); } struct StringsCopyIfElseTest : public cudf::test::BaseFixture {}; @@ -634,7 +636,7 @@ TYPED_TEST(FixedPointTypes, FixedPointScaleMismatch) auto const a = fp_wrapper{{110, 220, 330, 440, 550, 660}, scale_type{-2}}; auto const b = fp_wrapper{{0, 0, 0, 0, 0, 0}, scale_type{-1}}; - EXPECT_THROW(cudf::copy_if_else(a, b, mask), cudf::logic_error); + EXPECT_THROW(cudf::copy_if_else(a, b, mask), cudf::data_type_error); } struct DictionaryCopyIfElseTest : public cudf::test::BaseFixture {}; @@ -713,7 +715,7 @@ TEST_F(DictionaryCopyIfElseTest, TypeMismatch) EXPECT_THROW(cudf::copy_if_else(input1, input2, mask), cudf::logic_error); cudf::string_scalar input3{"1"}; - EXPECT_THROW(cudf::copy_if_else(input1, input3, mask), cudf::logic_error); - EXPECT_THROW(cudf::copy_if_else(input3, input2, mask), cudf::logic_error); - EXPECT_THROW(cudf::copy_if_else(input2, input3, mask), cudf::logic_error); + EXPECT_THROW(cudf::copy_if_else(input1, input3, mask), cudf::data_type_error); + 
EXPECT_THROW(cudf::copy_if_else(input3, input2, mask), cudf::data_type_error); + EXPECT_THROW(cudf::copy_if_else(input2, input3, mask), cudf::data_type_error); } diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp index d58aeb2ddfc..2be3c26af1d 100644 --- a/cpp/tests/copying/get_value_tests.cpp +++ b/cpp/tests/copying/get_value_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,6 +33,8 @@ #include +#include + using namespace cudf::test::iterators; template @@ -77,8 +79,8 @@ TYPED_TEST(FixedWidthGetValueTest, IndexOutOfBounds) cudf::test::fixed_width_column_wrapper col({9, 8, 7, 6}, {0, 1, 0, 1}); // Test for out of bounds indexes in both directions. - EXPECT_THROW(cudf::get_element(col, -1), cudf::logic_error); - EXPECT_THROW(cudf::get_element(col, 4), cudf::logic_error); + EXPECT_THROW(cudf::get_element(col, -1), std::out_of_range); + EXPECT_THROW(cudf::get_element(col, 4), std::out_of_range); } struct StringGetValueTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/copying/scatter_tests.cpp b/cpp/tests/copying/scatter_tests.cpp index 8194a74c10a..16cbeb7e657 100644 --- a/cpp/tests/copying/scatter_tests.cpp +++ b/cpp/tests/copying/scatter_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,6 +25,8 @@ #include #include +#include + class ScatterUntypedTests : public cudf::test::BaseFixture {}; // Throw logic error if scatter map is longer than source @@ -37,7 +39,7 @@ TEST_F(ScatterUntypedTests, ScatterMapTooLong) auto const source_table = cudf::table_view({source, source}); auto const target_table = cudf::table_view({target, target}); - EXPECT_THROW(cudf::scatter(source_table, scatter_map, target_table), cudf::logic_error); + EXPECT_THROW(cudf::scatter(source_table, scatter_map, target_table), std::invalid_argument); } // Throw logic error if scatter map has nulls @@ -50,7 +52,7 @@ TEST_F(ScatterUntypedTests, ScatterMapNulls) auto const source_table = cudf::table_view({source, source}); auto const target_table = cudf::table_view({target, target}); - EXPECT_THROW(cudf::scatter(source_table, scatter_map, target_table), cudf::logic_error); + EXPECT_THROW(cudf::scatter(source_table, scatter_map, target_table), std::invalid_argument); } // Throw logic error if scatter map has nulls @@ -65,7 +67,7 @@ TEST_F(ScatterUntypedTests, ScatterScalarMapNulls) auto const target_table = cudf::table_view({target}); - EXPECT_THROW(cudf::scatter(source_vector, scatter_map, target_table), cudf::logic_error); + EXPECT_THROW(cudf::scatter(source_vector, scatter_map, target_table), std::invalid_argument); } // Throw logic error if source and target have different number of columns @@ -78,7 +80,7 @@ TEST_F(ScatterUntypedTests, ScatterColumnNumberMismatch) auto const source_table = cudf::table_view({source}); auto const target_table = cudf::table_view({target, target}); - EXPECT_THROW(cudf::scatter(source_table, scatter_map, target_table), cudf::logic_error); + EXPECT_THROW(cudf::scatter(source_table, scatter_map, target_table), std::invalid_argument); } // Throw logic error if number of scalars doesn't match number of columns @@ -93,7 +95,7 @@ TEST_F(ScatterUntypedTests, ScatterScalarColumnNumberMismatch) auto const target_table = cudf::table_view({target, target}); - 
EXPECT_THROW(cudf::scatter(source_vector, scatter_map, target_table), cudf::logic_error); + EXPECT_THROW(cudf::scatter(source_vector, scatter_map, target_table), std::invalid_argument); } // Throw logic error if source and target have different data types @@ -106,7 +108,7 @@ TEST_F(ScatterUntypedTests, ScatterDataTypeMismatch) auto const source_table = cudf::table_view({source}); auto const target_table = cudf::table_view({target}); - EXPECT_THROW(cudf::scatter(source_table, scatter_map, target_table), cudf::logic_error); + EXPECT_THROW(cudf::scatter(source_table, scatter_map, target_table), cudf::data_type_error); } // Throw logic error if source and target have different data types @@ -121,7 +123,7 @@ TEST_F(ScatterUntypedTests, ScatterScalarDataTypeMismatch) auto const target_table = cudf::table_view({target}); - EXPECT_THROW(cudf::scatter(source_vector, scatter_map, target_table), cudf::logic_error); + EXPECT_THROW(cudf::scatter(source_vector, scatter_map, target_table), cudf::data_type_error); } template @@ -589,7 +591,7 @@ TEST_F(BooleanMaskScatterFails, SourceAndTargetTypeMismatch) auto source_table = cudf::table_view({source}); auto target_table = cudf::table_view({target}); - EXPECT_THROW(cudf::boolean_mask_scatter(source_table, target_table, mask), cudf::logic_error); + EXPECT_THROW(cudf::boolean_mask_scatter(source_table, target_table, mask), cudf::data_type_error); } TEST_F(BooleanMaskScatterFails, BooleanMaskTypeMismatch) @@ -601,7 +603,7 @@ TEST_F(BooleanMaskScatterFails, BooleanMaskTypeMismatch) auto source_table = cudf::table_view({source}); auto target_table = cudf::table_view({target}); - EXPECT_THROW(cudf::boolean_mask_scatter(source_table, target_table, mask), cudf::logic_error); + EXPECT_THROW(cudf::boolean_mask_scatter(source_table, target_table, mask), cudf::data_type_error); } TEST_F(BooleanMaskScatterFails, BooleanMaskTargetSizeMismatch) @@ -613,7 +615,7 @@ TEST_F(BooleanMaskScatterFails, BooleanMaskTargetSizeMismatch) auto source_table = 
cudf::table_view({source}); auto target_table = cudf::table_view({target}); - EXPECT_THROW(cudf::boolean_mask_scatter(source_table, target_table, mask), cudf::logic_error); + EXPECT_THROW(cudf::boolean_mask_scatter(source_table, target_table, mask), std::invalid_argument); } TEST_F(BooleanMaskScatterFails, NumberOfColumnMismatch) @@ -625,7 +627,7 @@ TEST_F(BooleanMaskScatterFails, NumberOfColumnMismatch) auto source_table = cudf::table_view({source, source}); auto target_table = cudf::table_view({target}); - EXPECT_THROW(cudf::boolean_mask_scatter(source_table, target_table, mask), cudf::logic_error); + EXPECT_THROW(cudf::boolean_mask_scatter(source_table, target_table, mask), std::invalid_argument); } TEST_F(BooleanMaskScatterFails, MoreTruesInMaskThanSourceSize) @@ -637,7 +639,7 @@ TEST_F(BooleanMaskScatterFails, MoreTruesInMaskThanSourceSize) auto source_table = cudf::table_view({source, source}); auto target_table = cudf::table_view({target}); - EXPECT_THROW(cudf::boolean_mask_scatter(source_table, target_table, mask), cudf::logic_error); + EXPECT_THROW(cudf::boolean_mask_scatter(source_table, target_table, mask), std::invalid_argument); } template @@ -768,7 +770,7 @@ TEST_F(BooleanMaskScatterScalarFails, SourceAndTargetTypeMismatch) {true, false, false, false, true, true, false, true, true, false}); auto target_table = cudf::table_view({target}); - EXPECT_THROW(cudf::boolean_mask_scatter(scalar_vect, target_table, mask), cudf::logic_error); + EXPECT_THROW(cudf::boolean_mask_scatter(scalar_vect, target_table, mask), cudf::data_type_error); } TEST_F(BooleanMaskScatterScalarFails, BooleanMaskTypeMismatch) @@ -782,7 +784,7 @@ TEST_F(BooleanMaskScatterScalarFails, BooleanMaskTypeMismatch) {true, false, false, false, true, true, false, true, true, false}); auto target_table = cudf::table_view({target}); - EXPECT_THROW(cudf::boolean_mask_scatter(scalar_vect, target_table, mask), cudf::logic_error); + EXPECT_THROW(cudf::boolean_mask_scatter(scalar_vect, target_table, 
mask), cudf::data_type_error); } TEST_F(BooleanMaskScatterScalarFails, BooleanMaskTargetSizeMismatch) @@ -796,7 +798,7 @@ TEST_F(BooleanMaskScatterScalarFails, BooleanMaskTargetSizeMismatch) {true, false, false, false, true, true, false, true, true}); auto target_table = cudf::table_view({target}); - EXPECT_THROW(cudf::boolean_mask_scatter(scalar_vect, target_table, mask), cudf::logic_error); + EXPECT_THROW(cudf::boolean_mask_scatter(scalar_vect, target_table, mask), std::invalid_argument); } TEST_F(BooleanMaskScatterScalarFails, NumberOfColumnAndScalarMismatch) @@ -811,7 +813,7 @@ TEST_F(BooleanMaskScatterScalarFails, NumberOfColumnAndScalarMismatch) {true, false, false, false, true, true, false, true, true}); auto target_table = cudf::table_view({target}); - EXPECT_THROW(cudf::boolean_mask_scatter(scalar_vect, target_table, mask), cudf::logic_error); + EXPECT_THROW(cudf::boolean_mask_scatter(scalar_vect, target_table, mask), std::invalid_argument); } template diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp index 17e56ea8ed8..f904696593c 100644 --- a/cpp/tests/copying/shift_tests.cpp +++ b/cpp/tests/copying/shift_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,6 +29,7 @@ #include #include +#include using TestTypes = cudf::test::Types; @@ -192,7 +193,7 @@ TYPED_TEST(ShiftTestsTyped, MismatchFillValueDtypes) auto fill = cudf::string_scalar(""); - EXPECT_THROW(cudf::shift(input, 5, fill), cudf::logic_error); + EXPECT_THROW(cudf::shift(input, 5, fill), cudf::data_type_error); } struct ShiftTests : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/copying/slice_tests.cpp b/cpp/tests/copying/slice_tests.cpp index 29ff3e1cf9b..fffc51eef2c 100644 --- a/cpp/tests/copying/slice_tests.cpp +++ b/cpp/tests/copying/slice_tests.cpp @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -302,7 +303,7 @@ TEST_F(SliceCornerCases, InvalidSetOfIndices) create_fixed_columns(start, size, valids); std::vector indices{11, 12}; - EXPECT_THROW(cudf::slice(col, indices), cudf::logic_error); + EXPECT_THROW(cudf::slice(col, indices), std::out_of_range); } TEST_F(SliceCornerCases, ImproperRange) @@ -316,7 +317,7 @@ TEST_F(SliceCornerCases, ImproperRange) create_fixed_columns(start, size, valids); std::vector indices{5, 4}; - EXPECT_THROW(cudf::slice(col, indices), cudf::logic_error); + EXPECT_THROW(cudf::slice(col, indices), std::invalid_argument); } TEST_F(SliceCornerCases, NegativeOffset) @@ -330,7 +331,7 @@ TEST_F(SliceCornerCases, NegativeOffset) create_fixed_columns(start, size, valids); std::vector indices{-1, 4}; - EXPECT_THROW(cudf::slice(col, indices), cudf::logic_error); + EXPECT_THROW(cudf::slice(col, indices), std::out_of_range); } template @@ -437,7 +438,7 @@ TEST_F(SliceTableCornerCases, InvalidSetOfIndices) std::vector indices{11, 12}; - EXPECT_THROW(cudf::slice(src_table, indices), cudf::logic_error); + EXPECT_THROW(cudf::slice(src_table, indices), std::out_of_range); } TEST_F(SliceTableCornerCases, ImproperRange) @@ -452,7 +453,7 @@ TEST_F(SliceTableCornerCases, ImproperRange) std::vector indices{5, 4}; - EXPECT_THROW(cudf::slice(src_table, indices), cudf::logic_error); + EXPECT_THROW(cudf::slice(src_table, 
indices), std::invalid_argument); } TEST_F(SliceTableCornerCases, NegativeOffset) @@ -467,7 +468,7 @@ TEST_F(SliceTableCornerCases, NegativeOffset) std::vector indices{-1, 4}; - EXPECT_THROW(cudf::slice(src_table, indices), cudf::logic_error); + EXPECT_THROW(cudf::slice(src_table, indices), std::out_of_range); } TEST_F(SliceTableCornerCases, MiscOffset) diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index 7c3beabaedf..077092ca036 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,6 +34,7 @@ #include #include +#include #include #include @@ -368,7 +369,7 @@ TEST_F(SplitCornerCases, InvalidSetOfIndices) create_fixed_columns(start, size, valids); std::vector splits{11, 12}; - EXPECT_THROW(cudf::split(col, splits), cudf::logic_error); + EXPECT_THROW(cudf::split(col, splits), std::out_of_range); } TEST_F(SplitCornerCases, ImproperRange) @@ -382,7 +383,7 @@ TEST_F(SplitCornerCases, ImproperRange) create_fixed_columns(start, size, valids); std::vector splits{5, 4}; - EXPECT_THROW(cudf::split(col, splits), cudf::logic_error); + EXPECT_THROW(cudf::split(col, splits), std::invalid_argument); } TEST_F(SplitCornerCases, NegativeValue) @@ -396,7 +397,7 @@ TEST_F(SplitCornerCases, NegativeValue) create_fixed_columns(start, size, valids); std::vector splits{-1, 4}; - EXPECT_THROW(cudf::split(col, splits), cudf::logic_error); + EXPECT_THROW(cudf::split(col, splits), std::invalid_argument); } // common functions for testing split/contiguous_split @@ -491,7 +492,7 @@ void split_invalid_indices(SplitFunc Split) std::vector splits{11, 12}; - EXPECT_THROW(Split(src_table, splits), cudf::logic_error); + EXPECT_THROW(Split(src_table, splits), std::out_of_range); } 
template @@ -507,7 +508,7 @@ void split_improper_range(SplitFunc Split) std::vector splits{5, 4}; - EXPECT_THROW(Split(src_table, splits), cudf::logic_error); + EXPECT_THROW(Split(src_table, splits), std::invalid_argument); } template @@ -523,7 +524,7 @@ void split_negative_value(SplitFunc Split) std::vector splits{-1, 4}; - EXPECT_THROW(Split(src_table, splits), cudf::logic_error); + EXPECT_THROW(Split(src_table, splits), std::invalid_argument); } template @@ -2296,7 +2297,7 @@ TEST_F(ContiguousSplitTableCornerCases, SplitEmpty) } { - EXPECT_THROW(cudf::contiguous_split(sliced[0], {1}), cudf::logic_error); + EXPECT_THROW(cudf::contiguous_split(sliced[0], {1}), std::out_of_range); } } diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp index b13e5bd4177..593c8136e6a 100644 --- a/cpp/tests/io/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json_quote_normalization_test.cpp @@ -60,28 +60,28 @@ void run_test(const std::string& host_input, const std::string& expected_host_ou preprocessed_host_output, expected_host_output, preprocessed_host_output.size()); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization1) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Single) { - std::string input = R"({"A":'TEST"'})"; - std::string output = R"({"A":"TEST\""})"; + std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])"; + std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization2) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_MoreSingle) { - std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])"; - std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])"; + std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])"; + std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, 
GroundTruth_QuoteNormalization3) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleInSingle) { - std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])"; - std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])"; + std::string input = R"({"A":'TEST"'})"; + std::string output = R"({"A":"TEST\""})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization4) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_MoreDoubleInSingle) { std::string input = R"({"ain't ain't a word and you ain't supposed to say it":'"""""""""""'})"; std::string output = @@ -89,77 +89,84 @@ TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization4) run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization5) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_StillMoreDoubleInSingle) { - std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})"; - std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})"; + std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])"; + std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization6) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleInSingleAndViceVersa) { - std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])"; - std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])"; + std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])"; + std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleAndSingleInSingle) +{ + std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})"; + std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization7) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_EscapedSingleInDouble) { std::string input = 
R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; - std::string output = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; + std::string output = R"(["\t","\\t","\\","\\'\"\\\\","\n","\b"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization8) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_EscapedDoubleInSingle) { - std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])"; - std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])"; + std::string input = R"(["\t","\\t","\\",'\\\'\"\\\\',"\n","\b"])"; + std::string output = R"(["\t","\\t","\\","\\'\"\\\\","\n","\b"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid1) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_MismatchedQuotes) { std::string input = R"(["THIS IS A TEST'])"; std::string output = R"(["THIS IS A TEST'])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid2) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_MismatchedQuotesEscapedOutput) { std::string input = R"(['THIS IS A TEST"])"; std::string output = R"(["THIS IS A TEST\"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid3) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_MoreMismatchedQuotes) { std::string input = R"({"MORE TEST'N":'RESUL})"; std::string output = R"({"MORE TEST'N":"RESUL})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid4) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_NoEndQuote) { std::string input = R"({"NUMBER":100'0,'STRING':'SOMETHING'})"; std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid5) +TEST_F(JsonNormalizationTest, 
GroundTruth_QuoteNormalization_InvalidJSON) { std::string input = R"({'NUMBER':100"0,"STRING":"SOMETHING"})"; std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid6) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBackslash) { std::string input = R"({'a':'\\''})"; std::string output = R"({"a":"\\""})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid7) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBraces) { std::string input = R"(}'a': 'b'{)"; std::string output = R"(}"a": "b"{)"; diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu index 2c992677a65..b1c0ff9b5a8 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cu +++ b/cpp/tests/io/parquet_chunked_reader_test.cu @@ -66,8 +66,6 @@ auto write_file(std::vector>& input_columns, std::size_t max_page_size_bytes = cudf::io::default_max_page_size_bytes, std::size_t max_page_size_rows = cudf::io::default_max_page_size_rows) { - // Just shift nulls of the next column by one position to avoid having all nulls in the same - // table rows. if (nullable) { // Generate deterministic bitmask instead of random bitmask for easy computation of data size. auto const valid_iter = cudf::detail::make_counting_transform_iterator( @@ -83,6 +81,10 @@ auto write_file(std::vector>& input_columns, std::move(col), cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + + // Shift nulls of the next column by one position, to avoid having all nulls + // in the same table rows. 
+ ++offset; } } @@ -988,7 +990,7 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadWithListsOfStructs) { auto const [result, num_chunks] = chunked_read(filepath_with_nulls, 1'500'000); - EXPECT_EQ(num_chunks, 4); + EXPECT_EQ(num_chunks, 5); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 200c58bb9aa..ffa672fb564 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -719,6 +719,64 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) EXPECT_EQ(ph.data_page_header.num_values, num_rows); } +TEST_F(ParquetWriterTest, Decimal32Stats) +{ + // check that decimal32 min and max statistics are written properly + std::vector expected_min{0, 0, 0xb2, 0xa1}; + std::vector expected_max{0xb2, 0xa1, 0, 0}; + + int32_t val0 = 0xa1b2; + int32_t val1 = val0 << 16; + column_wrapper col0{{numeric::decimal32(val0, numeric::scale_type{0}), + numeric::decimal32(val1, numeric::scale_type{0})}}; + + auto expected = table_view{{col0}}; + + auto const filepath = temp_env->get_temp_filepath("Decimal32Stats.parquet"); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + + auto const stats = get_statistics(fmd.row_groups[0].columns[0]); + + EXPECT_EQ(expected_min, stats.min_value); + EXPECT_EQ(expected_max, stats.max_value); +} + +TEST_F(ParquetWriterTest, Decimal64Stats) +{ + // check that decimal64 min and max statistics are written properly + std::vector expected_min{0, 0, 0, 0, 0xd4, 0xc3, 0xb2, 0xa1}; + std::vector expected_max{0xd4, 0xc3, 0xb2, 0xa1, 0, 0, 0, 0}; + + int64_t val0 = 0xa1b2'c3d4UL; + int64_t val1 = val0 << 32; + column_wrapper col0{{numeric::decimal64(val0, 
numeric::scale_type{0}), + numeric::decimal64(val1, numeric::scale_type{0})}}; + + auto expected = table_view{{col0}}; + + auto const filepath = temp_env->get_temp_filepath("Decimal64Stats.parquet"); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + + auto const stats = get_statistics(fmd.row_groups[0].columns[0]); + + EXPECT_EQ(expected_min, stats.min_value); + EXPECT_EQ(expected_max, stats.max_value); +} + TEST_F(ParquetWriterTest, Decimal128Stats) { // check that decimal128 min and max statistics are written in network byte order diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index b67c26f779f..07f334fdc12 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -65,7 +65,8 @@ rapids_cython_create_modules( target_link_libraries(strings_udf PUBLIC cudf_strings_udf) -link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}") +set(targets_using_arrow_headers interop avro csv orc json parquet) +link_to_pyarrow_headers("${targets_using_arrow_headers}") add_subdirectory(cpp) add_subdirectory(io) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 45aa1081b8d..9c48a731cea 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -39,9 +39,14 @@ from cudf._lib.types cimport ( from cudf._lib.null_mask import bitmask_allocation_size_bytes from cudf._lib.types import dtype_from_pylibcudf_column +# TODO: We currently need this for "casting" empty pylibcudf columns in +# from_pylibcudf by instead creating an empty numeric column. We will be able +# to remove this once column factories are exposed to pylibcudf. + cimport cudf._lib.cpp.copying as cpp_copying cimport cudf._lib.cpp.types as libcudf_types cimport cudf._lib.cpp.unary as libcudf_unary +from cudf._lib cimport pylibcudf from cudf._lib.cpp.column.column cimport column, column_contents from cudf._lib.cpp.column.column_factories cimport ( make_column_from_scalar as cpp_make_column_from_scalar, @@ -618,6 +623,24 @@ cdef class Column: pylibcudf.Column A new pylibcudf.Column referencing the same data. """ + cdef libcudf_types.data_type new_dtype + if col.type().id() == pylibcudf.TypeId.TIMESTAMP_DAYS: + col = pylibcudf.unary.cast( + col, pylibcudf.DataType(pylibcudf.TypeId.TIMESTAMP_SECONDS) + ) + elif col.type().id() == pylibcudf.TypeId.EMPTY: + new_dtype = libcudf_types.data_type(libcudf_types.type_id.INT8) + # TODO: This function call is what requires cimporting pylibcudf. + # We can remove the cimport once we can directly do + # pylibcudf.column_factories.make_numeric_column or equivalent. 
+ col = pylibcudf.Column.from_libcudf( + move( + make_numeric_column( + new_dtype, col.size(), libcudf_types.mask_state.ALL_NULL + ) + ) + ) + dtype = dtype_from_pylibcudf_column(col) return cudf.core.column.build_column( diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index c777a3ff766..009a69ea501 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -185,7 +185,7 @@ def date_range(DeviceScalar start, size_type n, offset): + offset.kwds.get("months", 0) ) - cdef const scalar* c_start = start.c_value.get() + cdef const scalar* c_start = start.get_raw_ptr() with nogil: c_result = move(calendrical_month_sequence( n, diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 13c8ce43ea3..0afed1bbd2e 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -1,22 +1,23 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from cpython cimport pycapsule -from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from libcpp.vector cimport vector -from pyarrow.lib cimport CTable, pyarrow_unwrap_table, pyarrow_wrap_table + +from cudf._lib import pylibcudf from cudf._lib.cpp.interop cimport ( DLManagedTensor, - column_metadata, - from_arrow as cpp_from_arrow, from_dlpack as cpp_from_dlpack, - to_arrow as cpp_to_arrow, to_dlpack as cpp_to_dlpack, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns +from cudf._lib.utils cimport ( + columns_from_pylibcudf_table, + columns_from_unique_ptr, + table_view_from_columns, +) from cudf.core.buffer import acquire_spill_lock from cudf.core.dtypes import ListDtype, StructDtype @@ -83,21 +84,19 @@ cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept: dlpack_tensor.deleter(dlpack_tensor) -cdef 
vector[column_metadata] gather_metadata(object cols_dtypes) except *: +def gather_metadata(object cols_dtypes): """ - Generates a column_metadata vector for each column. + Generates a ColumnMetadata vector for each column. Parameters ---------- cols_dtypes : iterable An iterable of ``(column_name, dtype)`` pairs. """ - cdef vector[column_metadata] cpp_metadata - cpp_metadata.reserve(len(cols_dtypes)) - + cpp_metadata = [] if cols_dtypes is not None: for idx, (col_name, col_dtype) in enumerate(cols_dtypes): - cpp_metadata.push_back(column_metadata(col_name.encode())) + cpp_metadata.append(pylibcudf.interop.ColumnMetadata(col_name)) if isinstance(col_dtype, (ListDtype, StructDtype)): _set_col_children_metadata(col_dtype, cpp_metadata[idx]) else: @@ -108,31 +107,22 @@ cdef vector[column_metadata] gather_metadata(object cols_dtypes) except *: return cpp_metadata -cdef _set_col_children_metadata(dtype, - column_metadata& col_meta): - - cdef column_metadata element_metadata - +def _set_col_children_metadata(dtype, col_meta): if isinstance(dtype, StructDtype): for name, value in dtype.fields.items(): - element_metadata = column_metadata(name.encode()) - _set_col_children_metadata( - value, element_metadata - ) - col_meta.children_meta.push_back(element_metadata) + element_metadata = pylibcudf.interop.ColumnMetadata(name) + _set_col_children_metadata(value, element_metadata) + col_meta.children_meta.append(element_metadata) elif isinstance(dtype, ListDtype): - col_meta.children_meta.reserve(2) # Offsets - child 0 - col_meta.children_meta.push_back(column_metadata()) + col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata()) # Element column - child 1 - element_metadata = column_metadata() - _set_col_children_metadata( - dtype.element_type, element_metadata - ) - col_meta.children_meta.push_back(element_metadata) + element_metadata = pylibcudf.interop.ColumnMetadata() + _set_col_children_metadata(dtype.element_type, element_metadata) + 
col_meta.children_meta.append(element_metadata) else: - col_meta.children_meta.push_back(column_metadata()) + col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata()) @acquire_spill_lock() @@ -149,16 +139,11 @@ def to_arrow(list source_columns, object column_dtypes): ------- pyarrow table """ - cdef vector[column_metadata] cpp_metadata = gather_metadata(column_dtypes) - cdef table_view input_table_view = table_view_from_columns(source_columns) - - cdef shared_ptr[CTable] cpp_arrow_table - with nogil: - cpp_arrow_table = cpp_to_arrow( - input_table_view, cpp_metadata - ) - - return pyarrow_wrap_table(cpp_arrow_table) + cpp_metadata = gather_metadata(column_dtypes) + return pylibcudf.interop.to_arrow( + pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]), + cpp_metadata, + ) @acquire_spill_lock() @@ -173,12 +158,6 @@ def from_arrow(object input_table): ------- A list of columns to construct Frame object """ - cdef shared_ptr[CTable] cpp_arrow_table = ( - pyarrow_unwrap_table(input_table) + return columns_from_pylibcudf_table( + pylibcudf.interop.from_arrow(input_table) ) - cdef unique_ptr[table] c_result - - with nogil: - c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0])) - - return columns_from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt index 55301789812..22ec5d472f2 100644 --- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt +++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -22,4 +22,3 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX nvtext_ ASSOCIATED_TARGETS cudf ) -link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}") diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index ada47de5cae..81d15cf95b4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -43,4 +43,4 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf ) -link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}") +link_to_pyarrow_headers(pylibcudf_interop) diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 39b29eace10..48c23a9dd4c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -8,7 +8,6 @@ from . 
cimport ( copying, filling, groupby, - interop, join, lists, merge, @@ -41,7 +40,6 @@ __all__ = [ "filling", "gpumemoryview", "groupby", - "interop", "join", "lists", "merge", diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index 62a83efa3e2..3c5c53f99cf 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -134,6 +134,7 @@ cdef class Column: """ cdef DataType dtype = DataType.from_libcudf(libcudf_col.get().type()) cdef size_type size = libcudf_col.get().size() + cdef size_type null_count = libcudf_col.get().null_count() cdef column_contents contents = move(libcudf_col.get().release()) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/interop.pxd deleted file mode 100644 index 3a79e5425d4..00000000000 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pxd +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. - -from cudf._lib.cpp.interop cimport column_metadata - - -cdef class ColumnMetadata: - cdef public object name - cdef public object children_meta - cdef column_metadata to_libcudf(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index 1ec5eb2e71a..e7471033fc8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -1,34 +1,211 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
-from cudf._lib.cpp.interop cimport column_metadata +from cython.operator cimport dereference +from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pyarrow cimport lib as pa +from dataclasses import dataclass, field +from functools import singledispatch -cdef class ColumnMetadata: +from pyarrow import lib as pa + +from cudf._lib.cpp.interop cimport ( + column_metadata, + from_arrow as cpp_from_arrow, + to_arrow as cpp_to_arrow, +) +from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar +from cudf._lib.cpp.table.table cimport table +from cudf._lib.cpp.wrappers.decimals cimport ( + decimal32, + decimal64, + decimal128, + scale_type, +) + +from .column cimport Column +from .scalar cimport Scalar +from .table cimport Table +from .types cimport DataType, type_id + + +cdef column_metadata _metadata_to_libcudf(metadata): + """Convert a ColumnMetadata object to C++ column_metadata. + + Since this class is mutable and cheap, it is easier to create the C++ + object on the fly rather than have it directly backing the storage for + the Cython class. Additionally, this structure restricts the dependency + on C++ types to just within this module, allowing us to make the module a + pure Python module (from an import sense, i.e. no pxd declarations). + """ + cdef column_metadata c_metadata + c_metadata.name = metadata.name.encode() + for child_meta in metadata.children_meta: + c_metadata.children_meta.push_back(_metadata_to_libcudf(child_meta)) + return c_metadata + + +@dataclass +class ColumnMetadata: """Metadata associated with a column. - This is the Cython representation of :cpp:class:`cudf::column_metadata`. + This is the Python representation of :cpp:class:`cudf::column_metadata`. 
+ """ + name: str = "" + children_meta: list[ColumnMetadata] = field(default_factory=list) + + +@singledispatch +def from_arrow(pyarrow_object, *, DataType data_type=None): + """Create a cudf object from a pyarrow object. + + Parameters + ---------- + pyarrow_object : Union[pyarrow.Array, pyarrow.Table, pyarrow.Scalar] + The PyArrow object to convert. + + Returns + ------- + Union[Column, Table, Scalar] + The converted object of type corresponding to the input type in cudf. + """ + raise TypeError("from_arrow only accepts Table and Scalar objects") + + +@from_arrow.register(pa.Table) +def _from_arrow_table(pyarrow_object, *, DataType data_type=None): + if data_type is not None: + raise ValueError("data_type may not be passed for tables") + cdef shared_ptr[pa.CTable] arrow_table = pa.pyarrow_unwrap_table(pyarrow_object) + + cdef unique_ptr[table] c_result + with nogil: + c_result = move(cpp_from_arrow(dereference(arrow_table))) + + return Table.from_libcudf(move(c_result)) + + +@from_arrow.register(pa.Scalar) +def _from_arrow_scalar(pyarrow_object, *, DataType data_type=None): + cdef shared_ptr[pa.CScalar] arrow_scalar = pa.pyarrow_unwrap_scalar(pyarrow_object) + + cdef unique_ptr[scalar] c_result + with nogil: + c_result = move(cpp_from_arrow(dereference(arrow_scalar))) + + cdef Scalar result = Scalar.from_libcudf(move(c_result)) + + if result.type().id() != type_id.DECIMAL128: + if data_type is not None: + raise ValueError( + "dtype may not be passed for non-decimal types" + ) + return result + + if data_type is None: + raise ValueError( + "Decimal scalars must be constructed with a dtype" + ) + + cdef type_id tid = data_type.id() + + if tid == type_id.DECIMAL32: + result.c_obj.reset( + new fixed_point_scalar[decimal32]( + ( + result.c_obj.get() + ).value(), + scale_type(-pyarrow_object.type.scale), + result.c_obj.get().is_valid() + ) + ) + elif tid == type_id.DECIMAL64: + result.c_obj.reset( + new fixed_point_scalar[decimal64]( + ( + result.c_obj.get() + ).value(), + 
scale_type(-pyarrow_object.type.scale), + result.c_obj.get().is_valid() + ) + ) + elif tid != type_id.DECIMAL128: + raise ValueError( + "Decimal scalars may only be cast to decimals" + ) + + return result + + +@from_arrow.register(pa.Array) +def _from_arrow_column(pyarrow_object, *, DataType data_type=None): + if data_type is not None: + raise ValueError("data_type may not be passed for arrays") + pa_table = pa.table([pyarrow_object], [""]) + return from_arrow(pa_table).columns()[0] + + +@singledispatch +def to_arrow(cudf_object, metadata=None): + """Convert to a PyArrow object. Parameters ---------- - id : TypeId - The type's identifier - scale : int - The scale associated with the data. Only used for decimal data types. + cudf_object : Union[Column, Table, Scalar] + The cudf object to convert. + metadata : list + The metadata to attach to the columns of the table. + + Returns + ------- + Union[pyarrow.Array, pyarrow.Table, pyarrow.Scalar] + The converted object of type corresponding to the input type in PyArrow. """ - def __init__(self, name): - self.name = name - self.children_meta = [] - - cdef column_metadata to_libcudf(self): - """Convert to C++ column_metadata. - - Since this class is mutable and cheap, it is easier to create the C++ - object on the fly rather than have it directly backing the storage for - the Cython class. 
- """ - cdef column_metadata c_metadata - cdef ColumnMetadata child_meta - c_metadata.name = self.name.encode() - for child_meta in self.children_meta: - c_metadata.children_meta.push_back(child_meta.to_libcudf()) - return c_metadata + raise TypeError("to_arrow only accepts Table and Scalar objects") + + +@to_arrow.register(Table) +def _to_arrow_table(cudf_object, metadata=None): + if metadata is None: + metadata = [ColumnMetadata() for _ in range(len(cudf_object.columns()))] + metadata = [ColumnMetadata(m) if isinstance(m, str) else m for m in metadata] + cdef vector[column_metadata] c_table_metadata + cdef shared_ptr[pa.CTable] c_table_result + for meta in metadata: + c_table_metadata.push_back(_metadata_to_libcudf(meta)) + with nogil: + c_table_result = move( + cpp_to_arrow((
cudf_object).view(), c_table_metadata) + ) + + return pa.pyarrow_wrap_table(c_table_result) + + +@to_arrow.register(Scalar) +def _to_arrow_scalar(cudf_object, metadata=None): + # Note that metadata for scalars is primarily important for preserving + # information on nested types since names are otherwise irrelevant. + if metadata is None: + metadata = ColumnMetadata() + metadata = ColumnMetadata(metadata) if isinstance(metadata, str) else metadata + cdef column_metadata c_scalar_metadata = _metadata_to_libcudf(metadata) + cdef shared_ptr[pa.CScalar] c_scalar_result + with nogil: + c_scalar_result = move( + cpp_to_arrow( + dereference(( cudf_object).c_obj), c_scalar_metadata + ) + ) + + return pa.pyarrow_wrap_scalar(c_scalar_result) + + +@to_arrow.register(Column) +def _to_arrow_array(cudf_object, metadata=None): + """Create a PyArrow array from a pylibcudf column.""" + if metadata is None: + metadata = ColumnMetadata() + metadata = ColumnMetadata(metadata) if isinstance(metadata, str) else metadata + return to_arrow(Table([cudf_object]), [metadata])[0] diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd index 0edc934ca22..85744eca902 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd @@ -1,14 +1,12 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.memory cimport unique_ptr -from pyarrow cimport lib as pa from rmm._lib.memory_resource cimport DeviceMemoryResource from cudf._lib.cpp.scalar.scalar cimport scalar -from .interop cimport ColumnMetadata from .types cimport DataType @@ -28,5 +26,3 @@ cdef class Scalar: @staticmethod cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*) - - cpdef pa.Scalar to_arrow(self, ColumnMetadata metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx index a1a347bc924..4a2d8f393bd 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -1,28 +1,13 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. from cython cimport no_gc_clear -from cython.operator cimport dereference -from libcpp.memory cimport shared_ptr, unique_ptr -from libcpp.utility cimport move -from pyarrow cimport lib as pa +from libcpp.memory cimport unique_ptr from rmm._lib.memory_resource cimport get_current_device_resource -from cudf._lib.cpp.interop cimport ( - column_metadata, - from_arrow as cpp_from_arrow, - to_arrow as cpp_to_arrow, -) -from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar -from cudf._lib.cpp.wrappers.decimals cimport ( - decimal32, - decimal64, - decimal128, - scale_type, -) +from cudf._lib.cpp.scalar.scalar cimport scalar -from .interop cimport ColumnMetadata -from .types cimport DataType, type_id +from .types cimport DataType # The DeviceMemoryResource attribute could be released prematurely @@ -44,89 +29,11 @@ cdef class Scalar: def __cinit__(self, *args, **kwargs): self.mr = get_current_device_resource() - def __init__(self, pa.Scalar value=None): + def __init__(self, *args, **kwargs): # TODO: This case is not something we really want to # support, but it here for now to ease the transition of # DeviceScalar. 
- if value is not None: - raise ValueError("Scalar should be constructed with a factory") - - @staticmethod - def from_arrow(pa.Scalar value, DataType data_type=None): - """Create a Scalar from a pyarrow Scalar. - - Parameters - ---------- - value : pyarrow.Scalar - The pyarrow scalar to construct from - data_type : DataType, optional - The data type of the scalar. If not passed, the data type will be - inferred from the pyarrow scalar. - """ - # Allow passing a dtype, but only for the purpose of decimals for now - - cdef shared_ptr[pa.CScalar] cscalar = ( - pa.pyarrow_unwrap_scalar(value) - ) - cdef unique_ptr[scalar] c_result - - with nogil: - c_result = move(cpp_from_arrow(cscalar.get()[0])) - - cdef Scalar s = Scalar.from_libcudf(move(c_result)) - - if s.type().id() != type_id.DECIMAL128: - if data_type is not None: - raise ValueError( - "dtype may not be passed for non-decimal types" - ) - return s - - if data_type is None: - raise ValueError( - "Decimal scalars must be constructed with a dtype" - ) - - cdef type_id tid = data_type.id() - - if tid == type_id.DECIMAL32: - s.c_obj.reset( - new fixed_point_scalar[decimal32]( - ( s.c_obj.get()).value(), - scale_type(-value.type.scale), - s.c_obj.get().is_valid() - ) - ) - elif tid == type_id.DECIMAL64: - s.c_obj.reset( - new fixed_point_scalar[decimal64]( - ( s.c_obj.get()).value(), - scale_type(-value.type.scale), - s.c_obj.get().is_valid() - ) - ) - elif tid != type_id.DECIMAL128: - raise ValueError( - "Decimal scalars may only be cast to decimals" - ) - - return s - - cpdef pa.Scalar to_arrow(self, ColumnMetadata metadata): - """Convert to a pyarrow scalar. - - Parameters - ---------- - metadata : ColumnMetadata - The metadata for the column the scalar is being used in. 
- """ - cdef shared_ptr[pa.CScalar] c_result - cdef column_metadata c_metadata = metadata.to_libcudf() - - with nogil: - c_result = move(cpp_to_arrow(dereference(self.c_obj.get()), c_metadata)) - - return pa.pyarrow_wrap_scalar(c_result) + raise ValueError("Scalar should be constructed with a factory") cdef const scalar* get(self) noexcept nogil: return self.c_obj.get() diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/cudf/cudf/_lib/pylibcudf/table.pxd index 2e76c811717..327f3911489 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/table.pxd @@ -1,7 +1,6 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr -from pyarrow cimport lib as pa from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -20,5 +19,3 @@ cdef class Table: cdef Table from_table_view(const table_view& tv, Table owner) cpdef list columns(self) - - cpdef pa.Table to_arrow(self, list metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx index 0cde346fa9c..793e6330244 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx @@ -1,22 +1,15 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
from cython.operator cimport dereference -from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from pyarrow cimport lib as pa from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.interop cimport ( - column_metadata, - from_arrow as cpp_from_arrow, - to_arrow as cpp_to_arrow, -) from cudf._lib.cpp.table.table cimport table from .column cimport Column -from .interop cimport ColumnMetadata cdef class Table: @@ -87,42 +80,3 @@ cdef class Table: cpdef list columns(self): """The columns in this table.""" return self._columns - - @staticmethod - def from_arrow(pa.Table pyarrow_table): - """Create a Table from a PyArrow Table. - - Parameters - ---------- - pyarrow_table : pyarrow.Table - The PyArrow Table to convert to a Table. - """ - - cdef shared_ptr[pa.CTable] ctable = ( - pa.pyarrow_unwrap_table(pyarrow_table) - ) - cdef unique_ptr[table] c_result - - with nogil: - c_result = move(cpp_from_arrow(ctable.get()[0])) - - return Table.from_libcudf(move(c_result)) - - cpdef pa.Table to_arrow(self, list metadata): - """Convert to a PyArrow Table. - - Parameters - ---------- - metadata : list - The metadata to attach to the columns of the table. 
- """ - cdef shared_ptr[pa.CTable] c_result - cdef vector[column_metadata] c_metadata - cdef ColumnMetadata meta - for meta in metadata: - c_metadata.push_back(meta.to_libcudf()) - - with nogil: - c_result = move(cpp_to_arrow(self.view(), c_metadata)) - - return pa.pyarrow_wrap_table(c_result) diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/types.pxd index e0f6a73fd55..6c53636d332 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/types.pxd @@ -6,6 +6,7 @@ from libcpp cimport bool as cbool from cudf._lib.cpp.types cimport ( data_type, interpolation, + mask_state, nan_equality, nan_policy, null_equality, diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index 49f5c527aa0..154ee22e796 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -5,15 +5,11 @@ from libcpp.memory cimport unique_ptr from rmm._lib.memory_resource cimport DeviceMemoryResource -# TODO: Would like to remove this cimport, but it will require some more work -# to excise all C code in scalar.pyx that relies on using the C API of the -# pylibcudf Scalar underlying the DeviceScalar. 
-from cudf._lib cimport pylibcudf from cudf._lib.cpp.scalar.scalar cimport scalar cdef class DeviceScalar: - cdef public pylibcudf.Scalar c_value + cdef public object c_value cdef object _dtype @@ -23,7 +19,7 @@ cdef class DeviceScalar: cdef DeviceScalar from_unique_ptr(unique_ptr[scalar] ptr, dtype=*) @staticmethod - cdef DeviceScalar from_pylibcudf(pylibcudf.Scalar scalar, dtype=*) + cdef DeviceScalar from_pylibcudf(pscalar, dtype=*) cdef void _set_dtype(self, dtype=*) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index cd9793270e2..7ddf4ff4883 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -18,6 +18,12 @@ from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.missing import NA, NaT cimport cudf._lib.cpp.types as libcudf_types +# We currently need this cimport because some of the implementations here +# access the c_obj of the scalar, and because we need to be able to call +# pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until +# DeviceScalar is phased out entirely from cuDF Cython (at which point +# cudf.Scalar will be directly backed by pylibcudf.Scalar). +from cudf._lib cimport pylibcudf from cudf._lib.cpp.scalar.scalar cimport ( duration_scalar, list_scalar, @@ -92,7 +98,7 @@ cdef class DeviceScalar: # that from_unique_ptr is implemented is probably dereferencing this in an # invalid state. See what the best way to fix that is. 
def __cinit__(self, *args, **kwargs): - self.c_value = pylibcudf.Scalar() + self.c_value = pylibcudf.Scalar.__new__(pylibcudf.Scalar) def __init__(self, value, dtype): """ @@ -138,7 +144,7 @@ cdef class DeviceScalar: pa_array = pa.array([pa.scalar(value, type=pa_type)]) pa_table = pa.Table.from_arrays([pa_array], names=[""]) - table = pylibcudf.Table.from_arrow(pa_table) + table = pylibcudf.interop.from_arrow(pa_table) column = table.columns()[0] if isinstance(dtype, cudf.core.dtypes.DecimalDtype): @@ -161,7 +167,7 @@ cdef class DeviceScalar: null_type = NaT if is_datetime or is_timedelta else NA metadata = gather_metadata({"": self.dtype})[0] - ps = self.c_value.to_arrow(metadata) + ps = pylibcudf.interop.to_arrow(self.c_value, metadata) if not ps.is_valid: return null_type @@ -200,7 +206,7 @@ cdef class DeviceScalar: return self._to_host_scalar() cdef const scalar* get_raw_ptr(self) except *: - return self.c_value.c_obj.get() + return ( self.c_value).c_obj.get() cpdef bool is_valid(self): """ @@ -223,12 +229,13 @@ cdef class DeviceScalar: Construct a Scalar object from a unique_ptr. 
""" cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) + # Note: This line requires pylibcudf to be cimported s.c_value = pylibcudf.Scalar.from_libcudf(move(ptr)) s._set_dtype(dtype) return s @staticmethod - cdef DeviceScalar from_pylibcudf(pylibcudf.Scalar pscalar, dtype=None): + cdef DeviceScalar from_pylibcudf(pscalar, dtype=None): cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) s.c_value = pscalar s._set_dtype(dtype) @@ -360,9 +367,13 @@ def _create_proxy_nat_scalar(dtype): if dtype.char in 'mM': nat = dtype.type('NaT').astype(dtype) if dtype.type == np.datetime64: - _set_datetime64_from_np_scalar(result.c_value.c_obj, nat, dtype, True) + _set_datetime64_from_np_scalar( + ( result.c_value).c_obj, nat, dtype, True + ) elif dtype.type == np.timedelta64: - _set_timedelta64_from_np_scalar(result.c_value.c_obj, nat, dtype, True) + _set_timedelta64_from_np_scalar( + ( result.c_value).c_obj, nat, dtype, True + ) return result else: raise TypeError('NAT only valid for datetime and timedelta') diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt index 081b84db79c..ceeff71683c 100644 --- a/python/cudf/cudf/_lib/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -40,7 +40,6 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf ) -link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}") add_subdirectory(convert) add_subdirectory(split) diff --git a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt index ebd7a793bf4..e8a76b476a8 100644 --- a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -22,4 +22,3 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf ) -link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}") diff --git a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt index 105e73788fe..4ede0a2fac5 100644 --- a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -20,4 +20,3 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf ) -link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}")