diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1e1ad94ab0b..9e72c0119f3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -88,6 +88,13 @@ repos:
         # of dependencies, so we'll have to update this manually.
         additional_dependencies:
           - cmake-format==0.6.11
+      - id: copyright-check
+        name: copyright-check
+        # This hook's use of Git tools appears to conflict with
+        # existing CI invocations, so we don't invoke it during CI runs.
+        stages: [commit]
+        entry: python ./ci/checks/copyright.py --git-modified-only
+        language: python
 
 default_language_version:
   python: python3
diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp
index 745d4e354e7..f8226c7387a 100644
--- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp
+++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,8 +27,8 @@
 class COMPILED_BINARYOP : public cudf::benchmark {
 };
 
-template <typename TypeLhs, typename TypeRhs, typename TypeOut, cudf::binary_operator binop>
-void BM_compiled_binaryop(benchmark::State& state)
+template <typename TypeLhs, typename TypeRhs, typename TypeOut>
+void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
 {
   const cudf::size_type column_size{(cudf::size_type)state.range(0)};
 
@@ -50,21 +50,26 @@ void BM_compiled_binaryop(benchmark::State& state)
 }
 
 // TODO tparam boolean for null.
-#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \
-  TEMPLATED_BENCHMARK_F(COMPILED_BINARYOP,                          \
-                        BM_compiled_binaryop,                       \
-                        TypeLhs,                                    \
-                        TypeRhs,                                    \
-                        TypeOut,                                    \
-                        cudf::binary_operator::binop)               \
-    ->Unit(benchmark::kMicrosecond)                                 \
-    ->UseManualTime()                                               \
-    ->Arg(10000)      /* 10k */                                     \
-    ->Arg(100000)     /* 100k */                                    \
-    ->Arg(1000000)    /* 1M */                                      \
-    ->Arg(10000000)   /* 10M */                                     \
+#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout)           \
+  BENCHMARK_DEFINE_F(COMPILED_BINARYOP, name)                             \
+  (::benchmark::State & st)                                               \
+  {                                                                       \
+    BM_compiled_binaryop<lhs, rhs, tout>(st, cudf::binary_operator::bop); \
+  }                                                                       \
+  BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name)                           \
+    ->Unit(benchmark::kMicrosecond)                                       \
+    ->UseManualTime()                                                     \
+    ->Arg(10000)      /* 10k */                                           \
+    ->Arg(100000)     /* 100k */                                          \
+    ->Arg(1000000)    /* 1M */                                            \
+    ->Arg(10000000)   /* 10M */                                           \
     ->Arg(100000000); /* 100M */
 
+#define build_name(a, b, c, d) a##_##b##_##c##_##d
+
+#define BINARYOP_BENCHMARK_DEFINE(lhs, rhs, bop, tout) \
+  BM_BINARYOP_BENCHMARK_DEFINE(build_name(bop, lhs, rhs, tout), lhs, rhs, bop, tout)
+
 using namespace cudf;
 using namespace numeric;
diff --git a/cpp/benchmarks/string/url_decode.cpp b/cpp/benchmarks/string/url_decode.cpp
index 4dc77cffa1a..6dc79c44437 100644
--- a/cpp/benchmarks/string/url_decode.cpp
+++ b/cpp/benchmarks/string/url_decode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -70,8 +70,7 @@ cudf::test::strings_column_wrapper generate_column(cudf::size_type num_rows,
 class UrlDecode : public cudf::benchmark {
 };
 
-template <int esc_seq_pct>
-void BM_url_decode(benchmark::State& state)
+void BM_url_decode(benchmark::State& state, int esc_seq_pct)
 {
   cudf::size_type const num_rows = state.range(0);
   cudf::size_type const chars_per_row = state.range(1);
@@ -88,12 +87,14 @@ void BM_url_decode(benchmark::State& state)
                           (chars_per_row + sizeof(cudf::size_type)));
 }
 
-#define URLD_BENCHMARK_DEFINE(esc_seq_pct)                      \
-  TEMPLATED_BENCHMARK_F(UrlDecode, BM_url_decode, esc_seq_pct)  \
-    ->Args({100000000, 10})                                     \
-    ->Args({10000000, 100})                                     \
-    ->Args({1000000, 1000})                                     \
-    ->Unit(benchmark::kMillisecond)                             \
+#define URLD_BENCHMARK_DEFINE(esc_seq_pct)                      \
+  BENCHMARK_DEFINE_F(UrlDecode, esc_seq_pct)                    \
+  (::benchmark::State & st) { BM_url_decode(st, esc_seq_pct); } \
+  BENCHMARK_REGISTER_F(UrlDecode, esc_seq_pct)                  \
+    ->Args({100000000, 10})                                     \
+    ->Args({10000000, 100})                                     \
+    ->Args({1000000, 1000})                                     \
+    ->Unit(benchmark::kMillisecond)                             \
     ->UseManualTime();
 
 URLD_BENCHMARK_DEFINE(10)
diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index dde4e00eb4a..8e1463f7964 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -74,12 +74,16 @@ struct element_arg_minmax_fn {
 /**
  * @brief Value accessor for column which supports dictionary column too.
  *
+ * This is similar to `value_accessor` in `column_device_view.cuh` but with support for dictionary
+ * types.
+ *
  * @tparam T Type of the underlying column. For dictionary column, type of the key column.
  */
 template <typename T>
 struct value_accessor {
   column_device_view const col;
   bool const is_dict;
+
   value_accessor(column_device_view const& col) : col(col), is_dict(cudf::is_dictionary(col.type()))
   {
   }
@@ -93,6 +97,7 @@ struct value_accessor {
       return col.element<T>(i);
     }
   }
+
   __device__ auto operator()(size_type i) const { return value(i); }
 };
 
@@ -100,20 +105,28 @@
  * @brief Null replaced value accessor for column which supports dictionary column too.
  * For null values, returns the `init` value
  *
- * @tparam T Type of the underlying column. For dictionary column, type of the key column.
+ * @tparam SourceType Type of the underlying column. For dictionary column, type of the key column.
+ * @tparam TargetType Type that is used for computation.
  */
-template <typename T>
-struct null_replaced_value_accessor : value_accessor<T> {
-  using super_t = value_accessor<T>;
+template <typename SourceType, typename TargetType>
+struct null_replaced_value_accessor : value_accessor<SourceType> {
+  using super_t = value_accessor<SourceType>;
+
+  TargetType const init;
   bool const has_nulls;
-  T const init;
-  null_replaced_value_accessor(column_device_view const& col, T const& init, bool const has_nulls)
+
+  null_replaced_value_accessor(column_device_view const& col,
+                               TargetType const& init,
+                               bool const has_nulls)
     : super_t(col), init(init), has_nulls(has_nulls)
   {
   }
-  __device__ T operator()(size_type i) const
+
+  __device__ TargetType operator()(size_type i) const
   {
-    return has_nulls && super_t::col.is_null_nocheck(i) ? init : super_t::value(i);
+    return has_nulls && super_t::col.is_null_nocheck(i)
+             ? init
+             : static_cast<TargetType>(super_t::value(i));
   }
 };
 
@@ -168,7 +181,7 @@ struct group_reduction_functor<K, T, std::enable_if_t<is_group_reduction_suppor
-    using DeviceType  = device_storage_type_t<T>;
+    using SourceDType = device_storage_type_t<T>;
     using ResultType  = cudf::detail::target_type_t<T, K>;
     using ResultDType = device_storage_type_t<ResultType>;
@@ -203,9 +216,11 @@ struct group_reduction_functor<K, T, std::enable_if_t<is_group_reduction_suppor
       using OpType = cudf::detail::corresponding_operator_t<K>;
-      auto init    = OpType::template identity<DeviceType>();
+      auto init    = OpType::template identity<ResultDType>();
       auto inp_values = cudf::detail::make_counting_transform_iterator(
-        0, null_replaced_value_accessor<DeviceType>{*d_values_ptr, init, values.has_nulls()});
+        0,
+        null_replaced_value_accessor<SourceDType, ResultDType>{
+          *d_values_ptr, init, values.has_nulls()});
 
       do_reduction(inp_values, result_begin, OpType{});
     }
diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp
index dabe992d959..e2893a2e881 100644
--- a/cpp/src/io/utilities/file_io_utilities.cpp
+++ b/cpp/src/io/utilities/file_io_utilities.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -194,20 +194,13 @@ template <typename F, typename DataT>
 std::vector<std::future<size_t>> make_sliced_tasks(
   F function, DataT* ptr, size_t offset, size_t size, cudf::detail::thread_pool& pool)
 {
+  constexpr size_t default_max_slice_size = 4 * 1024 * 1024;
+  static auto const max_slice_size = getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_size);
+  auto const slices = make_file_io_slices(size, max_slice_size);
   std::vector<std::future<size_t>> slice_tasks;
-  constexpr size_t default_max_slice_bytes = 4 * 1024 * 1024;
-  static auto const max_slice_bytes =
-    getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_bytes);
-  size_t const n_slices = util::div_rounding_up_safe(size, max_slice_bytes);
-  size_t slice_offset   = 0;
-  for (size_t t = 0; t < n_slices; ++t) {
-    DataT* ptr_slice = ptr + slice_offset;
-
-    size_t const slice_size = (t == n_slices - 1) ? size % max_slice_bytes : max_slice_bytes;
-    slice_tasks.push_back(pool.submit(function, ptr_slice, slice_size, offset + slice_offset));
-
-    slice_offset += slice_size;
-  }
+  std::transform(slices.cbegin(), slices.cend(), std::back_inserter(slice_tasks), [&](auto& slice) {
+    return pool.submit(function, ptr + slice.offset, slice.size, offset + slice.offset);
+  });
   return slice_tasks;
 }
 
@@ -318,6 +311,21 @@ std::unique_ptr<cufile_output_impl> make_cufile_output(std::string const& filepa
   return nullptr;
 }
 
+std::vector<file_io_slice> make_file_io_slices(size_t size, size_t max_slice_size)
+{
+  max_slice_size      = std::max(1024ul, max_slice_size);
+  auto const n_slices = util::div_rounding_up_safe(size, max_slice_size);
+  std::vector<file_io_slice> slices;
+  slices.reserve(n_slices);
+  std::generate_n(std::back_inserter(slices), n_slices, [&, idx = 0]() mutable {
+    auto const slice_offset = idx++ * max_slice_size;
+    auto const slice_size   = std::min(size - slice_offset, max_slice_size);
+    return file_io_slice{slice_offset, slice_size};
+  });
+
+  return slices;
+}
+
 }  // namespace detail
 }  // namespace io
 }  // namespace cudf
diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp
index fcee4e43a20..be3ecc49ab0 100644
--- a/cpp/src/io/utilities/file_io_utilities.hpp
+++ b/cpp/src/io/utilities/file_io_utilities.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -291,6 +291,21 @@ std::unique_ptr<cufile_input_impl> make_cufile_input(std::string const& filepath
  */
 std::unique_ptr<cufile_output_impl> make_cufile_output(std::string const& filepath);
 
+/**
+ * @brief Byte range to be read/written in a single operation.
+ */
+struct file_io_slice {
+  size_t offset;
+  size_t size;
+};
+
+/**
+ * @brief Split the total number of bytes to read/write into slices to enable parallel IO.
+ *
+ * If `max_slice_size` is below 1024, 1024 will be used instead to prevent potential misuse.
+ */
+std::vector<file_io_slice> make_file_io_slices(size_t size, size_t max_slice_size);
+
 }  // namespace detail
 }  // namespace io
 }  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 913761ecd03..27dd472b3f5 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -199,6 +199,7 @@ ConfigureTest(
 ConfigureTest(DECOMPRESSION_TEST io/comp/decomp_test.cpp)
 
 ConfigureTest(CSV_TEST io/csv_test.cpp)
+ConfigureTest(FILE_IO_TEST io/file_io_test.cpp)
 ConfigureTest(ORC_TEST io/orc_test.cpp)
 ConfigureTest(PARQUET_TEST io/parquet_test.cpp)
 ConfigureTest(JSON_TEST io/json_test.cpp)
diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp
index 5947e309bec..be7da4a784c 100644
--- a/cpp/tests/groupby/sum_tests.cpp
+++ b/cpp/tests/groupby/sum_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -156,6 +156,27 @@ TYPED_TEST(groupby_sum_test, dictionary)
     force_use_sort_impl::YES);
 }
 
+struct overflow_test : public cudf::test::BaseFixture {
+};
+TEST_F(overflow_test, overflow_integer)
+{
+  using int32_col = fixed_width_column_wrapper<int32_t>;
+  using int64_col = fixed_width_column_wrapper<int64_t>;
+
+  auto const keys        = int32_col{0, 0};
+  auto const vals        = int32_col{-2147483648, -2147483648};
+  auto const expect_keys = int32_col{0};
+  auto const expect_vals = int64_col{-4294967296L};
+
+  auto test_sum = [&](auto const use_sort) {
+    auto agg = make_sum_aggregation<groupby_aggregation>();
+    test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), use_sort);
+  };
+
+  test_sum(force_use_sort_impl::NO);
+  test_sum(force_use_sort_impl::YES);
+}
+
 template <typename T>
 struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
diff --git a/cpp/tests/io/file_io_test.cpp b/cpp/tests/io/file_io_test.cpp
new file mode 100644
index 00000000000..b546239fdca
--- /dev/null
+++ b/cpp/tests/io/file_io_test.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/cudf_gtest.hpp>
+
+#include <src/io/utilities/file_io_utilities.hpp>
+
+#include <vector>
+
+// Base test fixture for tests
+struct CuFileIOTest : public cudf::test::BaseFixture {
+};
+
+TEST_F(CuFileIOTest, SliceSize)
+{
+  std::vector<std::pair<size_t, size_t>> test_cases{
+    {1 << 20, 1 << 18}, {1 << 18, 1 << 20}, {1 << 20, 3333}, {0, 1 << 18}, {0, 0}, {1 << 20, 0}};
+  for (auto const& test_case : test_cases) {
+    auto const slices = cudf::io::detail::make_file_io_slices(test_case.first, test_case.second);
+    if (slices.empty()) {
+      ASSERT_EQ(test_case.first, 0);
+    } else {
+      ASSERT_EQ(slices.front().offset, 0);
+      ASSERT_EQ(slices.back().offset + slices.back().size, test_case.first);
+      for (auto i = 1u; i < slices.size(); ++i) {
+        ASSERT_EQ(slices[i].offset, slices[i - 1].offset + slices[i - 1].size);
+      }
+    }
+  }
+}
+
+CUDF_TEST_PROGRAM_MAIN()
diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 3ff2a370e4f..f91ee5535b1 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -826,18 +826,18 @@ public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... co
  /**
   * Creates a deep copy of a column while replacing the validity mask. The validity mask is the
   * device_vector equivalent of the boolean column given as argument.
-   * 
+   *
   * The boolColumn must have the same number of rows as the current column.
-   * The result column will have the same number of rows as the current column. 
+   * The result column will have the same number of rows as the current column.
   * For all indices `i` where the boolColumn is `true`, the result column will have a valid value at index i.
   * For all other values (i.e. `false` or `null`), the result column will have nulls.
-   * 
+   *
   * If the current column has a null at a given index `i`, and the new validity mask is `true` at index `i`,
   * then the row value is undefined.
-   * 
+   *
   * @param boolColumn bool column whose value is to be used as the validity mask.
   * @return Deep copy of the column with replaced validity mask.
-   */ 
+   */
  public final ColumnVector copyWithBooleanColumnAsValidity(ColumnView boolColumn) {
    return new ColumnVector(copyWithBooleanColumnAsValidity(getNativeView(), boolColumn.getNativeView()));
  }
@@ -2345,88 +2345,128 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) {
  }

  /**
-   * Returns a list of columns by splitting each string using the specified delimiter.
-   * The number of rows in the output columns will be the same as the input column.
-   * Null entries are added for a row where split results have been exhausted.
-   * Null string entries return corresponding null output columns.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
-   *                  An empty string indicates split on whitespace.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
-   * @return New table of strings columns.
+   * Returns a list of columns by splitting each string using the specified pattern. The number of
+   * rows in the output columns will be the same as the input column. Null entries are added for a
+   * row where split results have been exhausted. Null input entries result in all nulls in the
+   * corresponding rows of the output columns.
+   *
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
+   *              or -1 for all possible splits.
Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return list of strings columns as a table. */ - public final Table stringSplit(Scalar delimiter, int maxSplit) { + public final Table stringSplit(String pattern, int limit, boolean splitByRegex) { assert type.equals(DType.STRING) : "column type must be a String"; - assert delimiter != null : "delimiter may not be null"; - assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar"; - return new Table(stringSplit(this.getNativeView(), delimiter.getScalarHandle(), maxSplit)); + assert pattern != null : "pattern is null"; + assert pattern.length() > 0 : "empty pattern is not supported"; + assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported"; + return new Table(stringSplit(this.getNativeView(), pattern, limit, splitByRegex)); } - + /** - * Returns a list of columns by splitting each string using the specified delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * An empty string indicates split on whitespace. - * @return New table of strings columns. + * Returns a list of columns by splitting each string using the specified pattern. The number of + * rows in the output columns will be the same as the input column. Null entries are added for a + * row where split results have been exhausted. Null input entries result in all nulls in the + * corresponding rows of the output columns. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return list of strings columns as a table. */ - public final Table stringSplit(Scalar delimiter) { - return stringSplit(delimiter, -1); + public final Table stringSplit(String pattern, boolean splitByRegex) { + return stringSplit(pattern, -1, splitByRegex); } /** - * Returns a list of columns by splitting each string using whitespace as the delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @return New table of strings columns. + * Returns a list of columns by splitting each string using the specified string literal + * delimiter. The number of rows in the output columns will be the same as the input column. + * Null entries are added for a row where split results have been exhausted. Null input entries + * result in all nulls in the corresponding rows of the output columns. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. 
+   * @return list of strings columns as a table.
   */
-  public final Table stringSplit() {
-    try (Scalar emptyString = Scalar.fromString("")) {
-      return stringSplit(emptyString, -1);
-    }
+  public final Table stringSplit(String delimiter, int limit) {
+    return stringSplit(delimiter, limit, false);
  }

  /**
-   * Returns a column of lists of strings by splitting each string using whitespace as the delimiter.
+   * Returns a list of columns by splitting each string using the specified string literal
+   * delimiter. The number of rows in the output columns will be the same as the input column.
+   * Null entries are added for a row where split results have been exhausted. Null input entries
+   * result in all nulls in the corresponding rows of the output columns.
+   *
+   * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string.
+   * @return list of strings columns as a table.
   */
-  public final ColumnVector stringSplitRecord() {
-    return stringSplitRecord(-1);
+  public final Table stringSplit(String delimiter) {
+    return stringSplit(delimiter, -1, false);
  }

  /**
-   * Returns a column of lists of strings by splitting each string using whitespace as the delimiter.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * Returns a column of lists of strings in which each list is made by splitting the
+   * corresponding input string using the specified pattern.
+   *
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
+   * @return a LIST column of string elements.
   */
-  public final ColumnVector stringSplitRecord(int maxSplit) {
-    try (Scalar emptyString = Scalar.fromString("")) {
-      return stringSplitRecord(emptyString, maxSplit);
-    }
+  public final ColumnVector stringSplitRecord(String pattern, int limit, boolean splitByRegex) {
+    assert type.equals(DType.STRING) : "column type must be String";
+    assert pattern != null : "pattern is null";
+    assert pattern.length() > 0 : "empty pattern is not supported";
+    assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported";
+    return new ColumnVector(
+        stringSplitRecord(this.getNativeView(), pattern, limit, splitByRegex));
+  }
+
+  /**
+   * Returns a column of lists of strings in which each list is made by splitting the
+   * corresponding input string using the specified pattern.
+   *
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
+   * @return a LIST column of string elements.
+   */
+  public final ColumnVector stringSplitRecord(String pattern, boolean splitByRegex) {
+    return stringSplitRecord(pattern, -1, splitByRegex);
  }

  /**
-   * Returns a column of lists of strings by splitting each string using the specified delimiter.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
-   *                  An empty string indicates split on whitespace.
+   * Returns a column of lists of strings in which each list is made by splitting the
+   * corresponding input string using the specified string literal delimiter.
+   *
+   * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
+   * @return a LIST column of string elements.
   */
-  public final ColumnVector stringSplitRecord(Scalar delimiter) {
-    return stringSplitRecord(delimiter, -1);
+  public final ColumnVector stringSplitRecord(String delimiter, int limit) {
+    return stringSplitRecord(delimiter, limit, false);
  }

  /**
-   * Returns a column that is a list of strings. Each string list is made by splitting each input
-   * string using the specified delimiter.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
-   *                  An empty string indicates split on whitespace.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
-   * @return New table of strings columns.
+   * Returns a column of lists of strings in which each list is made by splitting the
+   * corresponding input string using the specified string literal delimiter.
+   *
+   * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string.
+   * @return a LIST column of string elements.
   */
-  public final ColumnVector stringSplitRecord(Scalar delimiter, int maxSplit) {
-    assert type.equals(DType.STRING) : "column type must be a String";
-    assert delimiter != null : "delimiter may not be null";
-    assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar";
-    return new ColumnVector(
-        stringSplitRecord(this.getNativeView(), delimiter.getScalarHandle(), maxSplit));
+  public final ColumnVector stringSplitRecord(String delimiter) {
+    return stringSplitRecord(delimiter, -1, false);
  }

  /**
@@ -3248,7 +3288,7 @@ public enum FindOptions {FIND_FIRST, FIND_LAST};
   * Create a column of int32 indices, indicating the position of the scalar search key
   * in each list row.
   * All indices are 0-based. If a search key is not found, the index is set to -1.
-   * The index is set to null if one of the following is true: 
+   * The index is set to null if one of the following is true:
   * 1. The search key is null.
   * 2. The list row is null.
   * @param key The scalar search key
@@ -3265,7 +3305,7 @@ public final ColumnVector listIndexOf(Scalar key, FindOptions findOption) {
   * Create a column of int32 indices, indicating the position of each row in the
   * search key column in the corresponding row of the lists column.
   * All indices are 0-based. If a search key is not found, the index is set to -1.
-   * The index is set to null if one of the following is true: 
+   * The index is set to null if one of the following is true:
   * 1. The search key row is null.
   * 2. The list row is null.
   * @param keys ColumnView of search keys.
@@ -3531,15 +3571,36 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle
  private static native long substringLocate(long columnView, long substringScalar, int start, int end);

  /**
-   * Native method which returns array of columns by splitting each string using the specified
-   * delimiter.
-   * @param columnView native handle of the cudf::column_view being operated on.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * Returns a list of columns by splitting each string using the specified pattern. The number of
+   * rows in the output columns will be the same as the input column. Null entries are added for a
+   * row where split results have been exhausted. Null input entries result in all nulls in the
+   * corresponding rows of the output columns.
+   *
+   * @param nativeHandle native handle of the input strings column that is being operated on.
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
   */
-  private static native long[] stringSplit(long columnView, long delimiter, int maxSplit);
+  private static native long[] stringSplit(long nativeHandle, String pattern, int limit,
+                                           boolean splitByRegex);

-  private static native long stringSplitRecord(long nativeView, long scalarHandle, int maxSplit);
+  /**
+   * Returns a column of lists of strings in which each list is made by splitting the
+   * corresponding input string using the specified pattern.
+   *
+   * @param nativeHandle native handle of the input strings column that is being operated on.
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
+   */
+  private static native long stringSplitRecord(long nativeHandle, String pattern, int limit,
+                                               boolean splitByRegex);

  /**
   * Native method to calculate substring from a given string column. 0 indexing.
@@ -3714,7 +3775,7 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat
  /**
   * Native method to search list rows for null elements.
   * @param nativeView the column view handle of the list
-   * @return column handle of the resultant boolean column 
+   * @return column handle of the resultant boolean column
   */
  private static native long listContainsNulls(long nativeView);

@@ -3896,20 +3957,20 @@ private static native long bitwiseMergeAndSetValidity(long baseHandle, long[] vi
  /**
   * Native method to deep copy a column while replacing the null mask. The null mask is the
   * device_vector equivalent of the boolean column given as argument.
-   * 
+   *
   * The boolColumn must have the same number of rows as the exemplar column.
   * The result column will have the same number of rows as the exemplar.
   * For all indices `i` where the boolean column is `true`, the result column will have a valid value at index i.
   * For all other values (i.e. `false` or `null`), the result column will have nulls.
- * + * * If the exemplar column has a null at a given index `i`, and the new validity mask is `true` at index `i`, * then the resultant row value is undefined. - * + * * @param exemplarViewHandle column view of the column that is deep copied. * @param boolColumnViewHandle bool column whose value is to be used as the null mask. * @return Deep copy of the column with replaced null mask. - */ - private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, + */ + private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, long boolColumnViewHandle) throws CudfException; //////// diff --git a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java index 0e49636fae6..78b3d5d52ec 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,6 +40,9 @@ private ColumnWriterOptions(AbstractStructBuilder builder) { (ColumnWriterOptions[]) builder.children.toArray(new ColumnWriterOptions[0]); } + // The sentinel value of unknown precision (default value) + public static int UNKNOWN_PRECISION = -1; + /** * Constructor used for list */ @@ -103,7 +106,7 @@ protected ColumnWriterOptions withDecimal(String name, int precision, protected ColumnWriterOptions withTimestamp(String name, boolean isInt96, boolean isNullable) { - return new ColumnWriterOptions(name, isInt96, 0, isNullable); + return new ColumnWriterOptions(name, isInt96, UNKNOWN_PRECISION, isNullable); } /** @@ -243,7 +246,7 @@ public ColumnWriterOptions(String columnName, boolean isTimestampTypeInt96, public ColumnWriterOptions(String columnName, boolean isNullable) { this.isTimestampTypeInt96 = false; - this.precision = 0; + this.precision = UNKNOWN_PRECISION; this.isNullable = isNullable; this.columnName = columnName; } diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index eec4a78a457..548844aa0d3 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -561,34 +562,78 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env, } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass, - jlong column_view, - jlong delimiter_ptr, - jint max_split) { - JNI_NULL_CHECK(env, column_view, "column is null", 0); - JNI_NULL_CHECK(env, delimiter_ptr, "string scalar delimiter is null", 0); + jlong input_handle, + jstring pattern_obj, + jint limit, + jboolean split_by_regex) { + JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); + + if (limit == 0 || limit == 1) { + // Cannot achieve the results of splitting with limit == 0 or limit == 1. + // This is because cudf operates on a different parameter (`max_split`) which is converted from + // limit. When limit == 0 or limit == 1, max_split will be non-positive and will result in an + // unlimited split. 
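+    // For example, limit == 3 maps to max_split == 2 below, so splitting "a,b,c,d" on ","
+    // produces at most three items: ["a", "b", "c,d"].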
+    JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
+                  "limit == 0 and limit == 1 are not supported", 0);
+  }
+
  try {
    cudf::jni::auto_set_device(env);
-    cudf::strings_column_view const scv{*reinterpret_cast<cudf::column_view *>(column_view)};
-    auto delimiter = reinterpret_cast<cudf::string_scalar *>(delimiter_ptr);
+    auto const input      = reinterpret_cast<cudf::column_view const *>(input_handle);
+    auto const strs_input = cudf::strings_column_view{*input};

-    return cudf::jni::convert_table_for_return(env,
-                                               cudf::strings::split(scv, *delimiter, max_split));
+    auto const pattern_jstr = cudf::jni::native_jstring(env, pattern_obj);
+    if (pattern_jstr.is_empty()) {
+      // Java's split API produces different behavior than cudf when splitting with an empty
+      // pattern.
+      JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0);
+    }
+
+    auto const pattern   = std::string(pattern_jstr.get(), pattern_jstr.size_bytes());
+    auto const max_split = limit > 1 ? limit - 1 : limit;
+    auto result          = split_by_regex ?
+                             cudf::strings::split_re(strs_input, pattern, max_split) :
+                             cudf::strings::split(strs_input, cudf::string_scalar{pattern}, max_split);
+    return cudf::jni::convert_table_for_return(env, std::move(result));
  }
  CATCH_STD(env, 0);
}

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass,
-                                                                         jlong column_view,
-                                                                         jlong delimiter,
-                                                                         jint max_split) {
-  JNI_NULL_CHECK(env, column_view, "column is null", 0);
-  JNI_NULL_CHECK(env, delimiter, "string scalar delimiter is null", 0);
+                                                                         jlong input_handle,
+                                                                         jstring pattern_obj,
+                                                                         jint limit,
+                                                                         jboolean split_by_regex) {
+  JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
+
+  if (limit == 0 || limit == 1) {
+    // Cannot achieve the results of splitting with limit == 0 or limit == 1.
+    // This is because cudf operates on a different parameter (`max_split`) which is converted from
+    // limit. When limit == 0 or limit == 1, max_split will be non-positive and will result in an
+    // unlimited split.
+    JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
+                  "limit == 0 and limit == 1 are not supported", 0);
+  }
+
  try {
    cudf::jni::auto_set_device(env);
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_view);
-    cudf::strings_column_view scv(*cv);
-    cudf::string_scalar *ss_scalar = reinterpret_cast<cudf::string_scalar *>(delimiter);
-    return release_as_jlong(cudf::strings::split_record(scv, *ss_scalar, max_split));
+    auto const input      = reinterpret_cast<cudf::column_view const *>(input_handle);
+    auto const strs_input = cudf::strings_column_view{*input};
+
+    auto const pattern_jstr = cudf::jni::native_jstring(env, pattern_obj);
+    if (pattern_jstr.is_empty()) {
+      // Java's split API produces different behavior than cudf when splitting with an empty
+      // pattern.
+      JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0);
+    }
+
+    auto const pattern   = std::string(pattern_jstr.get(), pattern_jstr.size_bytes());
+    auto const max_split = limit > 1 ? limit - 1 : limit;
+    auto result =
+      split_by_regex ?
+ cudf::strings::split_record_re(strs_input, pattern, max_split) : + cudf::strings::split_record(strs_input, cudf::string_scalar{pattern}, max_split); + return release_as_jlong(result); } CATCH_STD(env, 0); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index eac76222475..1cf56da35da 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -676,9 +676,10 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata, int write_index = 0; for (int i = 0; i < num_children; i++, write_index++) { cudf::io::column_in_metadata child; - child.set_name(col_names[read_index]) - .set_decimal_precision(precisions[read_index]) - .set_nullability(nullability[read_index]); + child.set_name(col_names[read_index]).set_nullability(nullability[read_index]); + if (precisions[read_index] > -1) { + child.set_decimal_precision(precisions[read_index]); + } if (!is_int96.is_null()) { child.set_int96_timestamps(is_int96[read_index]); } @@ -717,8 +718,10 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam for (int i = read_index, write_index = 0; i < top_level_children; i++, write_index++) { metadata.column_metadata[write_index] .set_name(cpp_names[read_index]) - .set_nullability(col_nullability[read_index]) - .set_decimal_precision(precisions[read_index]); + .set_nullability(col_nullability[read_index]); + if (precisions[read_index] > -1) { + metadata.column_metadata[write_index].set_decimal_precision(precisions[read_index]); + } if (!is_int96.is_null()) { metadata.column_metadata[write_index].set_int96_timestamps(is_int96[read_index]); } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 9c00cdbc084..b759c746735 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4364,10 +4364,10 @@ void testExtractListElements() { ColumnVector expected = ColumnVector.fromStrings("Héllo", "thésé", null, - null, + "", "ARé", "test"); - ColumnVector tmp = v.stringSplitRecord(); + ColumnVector tmp = v.stringSplitRecord(" "); ColumnVector result = tmp.extractListElement(0)) { assertColumnsAreEqual(expected, result); } @@ -4761,28 +4761,12 @@ void testListSortRowsWithStringChild() { } } - @Test - void testStringSplitRecord() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo there", "thésé", "null", "", "ARé some", "test strings"); - ColumnVector expected = ColumnVector.fromLists( - new HostColumnVector.ListType(true, - new HostColumnVector.BasicType(true, DType.STRING)), - Arrays.asList("Héllo", "there"), - Arrays.asList("thésé"), - Arrays.asList("null"), - Arrays.asList(""), - Arrays.asList("ARé", "some"), - Arrays.asList("test", "strings")); - Scalar pattern = Scalar.fromString(" "); - ColumnVector result = v.stringSplitRecord(pattern, -1)) { - assertColumnsAreEqual(expected, result); - } - } - @Test void testStringSplit() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", "ARé some things", "test strings here"); - Table expectedSplitOnce = new Table.TestBuilder() + String pattern = " "; + try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", + "ARé some things", "test strings here"); + Table expectedSplitLimit2 = new Table.TestBuilder() .column("Héllo", "thésé", null, "", "ARé", "test") .column("there all", null, null, null, "some things", "strings here") .build(); @@ 
-4791,41 +4775,92 @@ void testStringSplit() { .column("there", null, null, null, "some", "strings") .column("all", null, null, null, "things", "here") .build(); - Scalar pattern = Scalar.fromString(" "); - Table resultSplitOnce = v.stringSplit(pattern, 1); + Table resultSplitLimit2 = v.stringSplit(pattern, 2); Table resultSplitAll = v.stringSplit(pattern)) { - assertTablesAreEqual(expectedSplitOnce, resultSplitOnce); + assertTablesAreEqual(expectedSplitLimit2, resultSplitLimit2); assertTablesAreEqual(expectedSplitAll, resultSplitAll); } } @Test - void teststringSplitWhiteSpace() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo thesé", null, "are\tsome", "tést\nString", " "); - Table expected = new Table.TestBuilder().column("Héllo", null, "are", "tést", null) - .column("thesé", null, "some", "String", null) - .build(); - Table result = v.stringSplit()) { - assertTablesAreEqual(expected, result); + void testStringSplitByRegularExpression() { + String pattern = "[_ ]"; + try (ColumnVector v = ColumnVector.fromStrings("Héllo_there all", "thésé", null, "", + "ARé some_things", "test_strings_here"); + Table expectedSplitLimit2 = new Table.TestBuilder() + .column("Héllo", "thésé", null, "", "ARé", "test") + .column("there all", null, null, null, "some_things", "strings_here") + .build(); + Table expectedSplitAll = new Table.TestBuilder() + .column("Héllo", "thésé", null, "", "ARé", "test") + .column("there", null, null, null, "some", "strings") + .column("all", null, null, null, "things", "here") + .build(); + Table resultSplitLimit2 = v.stringSplit(pattern, 2, true); + Table resultSplitAll = v.stringSplit(pattern, true)) { + assertTablesAreEqual(expectedSplitLimit2, resultSplitLimit2); + assertTablesAreEqual(expectedSplitAll, resultSplitAll); } } @Test - void teststringSplitThrowsException() { - assertThrows(CudfException.class, () -> { - try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); - Scalar delimiter = Scalar.fromString(null); - Table result = cv.stringSplit(delimiter)) {} - }); - assertThrows(AssertionError.class, () -> { - try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); - Scalar delimiter = Scalar.fromInt(1); - Table result = cv.stringSplit(delimiter)) {} - }); - assertThrows(AssertionError.class, () -> { - try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); - Table result = cv.stringSplit(null)) {} - }); + void testStringSplitRecord() { + String pattern = " "; + try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", + "ARé some things", "test strings here"); + ColumnVector expectedSplitLimit2 = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some things"), + Arrays.asList("test", "strings here")); + ColumnVector expectedSplitAll = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there", "all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some", "things"), + Arrays.asList("test", "strings", "here")); + ColumnVector resultSplitLimit2 = v.stringSplitRecord(pattern, 2); + ColumnVector resultSplitAll = v.stringSplitRecord(pattern)) { + assertColumnsAreEqual(expectedSplitLimit2, resultSplitLimit2); + 
assertColumnsAreEqual(expectedSplitAll, resultSplitAll); + } + } + + @Test + void testStringSplitRecordByRegularExpression() { + String pattern = "[_ ]"; + try (ColumnVector v = ColumnVector.fromStrings("Héllo_there all", "thésé", null, "", + "ARé some_things", "test_strings_here"); + ColumnVector expectedSplitLimit2 = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some_things"), + Arrays.asList("test", "strings_here")); + ColumnVector expectedSplitAll = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there", "all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some", "things"), + Arrays.asList("test", "strings", "here")); + ColumnVector resultSplitLimit2 = v.stringSplitRecord(pattern, 2, true); + ColumnVector resultSplitAll = v.stringSplitRecord(pattern, true)) { + assertColumnsAreEqual(expectedSplitLimit2, resultSplitLimit2); + assertColumnsAreEqual(expectedSplitAll, resultSplitAll); + } } @Test diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index cdb1e9fc86f..c96d940c378 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -3,6 +3,7 @@ from __future__ import annotations import pickle +from functools import cached_property from typing import Any, Set import pandas as pd @@ -31,7 +32,6 @@ is_mixed_with_object_dtype, numeric_normalize_types, ) -from cudf.utils.utils import cached_property class BaseIndex(Serializable): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2535ba5ab8d..393afe4a5b9 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1602,8 +1602,8 @@ def build_struct_column( Parameters ---------- - names : list-like - Field names to map to children dtypes + names : sequence of strings + Field names to map to children dtypes, must be strings. 
children : tuple mask: Buffer diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 67976ac27d4..9cb86ca1cd2 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -4,7 +4,7 @@ import itertools from collections.abc import MutableMapping -from functools import reduce +from functools import cached_property, reduce from typing import ( TYPE_CHECKING, Any, @@ -20,7 +20,6 @@ import cudf from cudf.core import column -from cudf.utils.utils import cached_property if TYPE_CHECKING: from cudf.core.column import ColumnBase @@ -360,9 +359,9 @@ def select_by_index(self, index: Any) -> ColumnAccessor: start, stop, step = index.indices(len(self._data)) keys = self.names[start:stop:step] elif pd.api.types.is_integer(index): - keys = [self.names[index]] + keys = (self.names[index],) else: - keys = (self.names[i] for i in index) + keys = tuple(self.names[i] for i in index) data = {k: self._data[k] for k in keys} return self.__class__( data, multiindex=self.multiindex, level_names=self.level_names, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 371404ca477..fb15f8da8d9 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1271,14 +1271,6 @@ def memory_usage(self, index=True, deep=False): {str(k): v for k, v in super().memory_usage(index, deep).items()} ) - @annotate("DATAFRAME_ARRAY_UFUNC", color="blue", domain="cudf_python") - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - if method == "__call__" and hasattr(cudf, ufunc.__name__): - func = getattr(cudf, ufunc.__name__) - return func(self) - else: - return NotImplemented - @annotate("DATAFRAME_ARRAY_FUNCTION", color="blue", domain="cudf_python") def __array_function__(self, func, types, args, kwargs): @@ -1864,8 +1856,7 @@ def _get_columns_by_label(self, labels, downcast=False): ) return out - @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python") - def _binaryop( + def _prep_for_binop( self, other: Any, fn: str, @@ -1885,6 +1876,7 @@ def _binaryop( # implementation assumes that binary operations between a column and # NULL are always commutative, even for binops (like subtraction) that # are normally anticommutative. + # TODO: We probably should support pandas DataFrame/Series objects. if isinstance(rhs, Sequence): # TODO: Consider validating sequence length (pandas does). operands = { @@ -1948,11 +1940,30 @@ def _binaryop( right = right_dict[col] operands[col] = (left, right, reflect, fill_value) else: + return NotImplemented, None + + return operands, lhs._index + + @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python") + def _binaryop( + self, + other: Any, + fn: str, + fill_value: Any = None, + reflect: bool = False, + can_reindex: bool = False, + *args, + **kwargs, + ): + operands, out_index = self._prep_for_binop( + other, fn, fill_value, reflect, can_reindex + ) + if operands is NotImplemented: return NotImplemented return self._from_data( ColumnAccessor(type(self)._colwise_binop(operands, fn)), - index=lhs._index, + index=out_index, ) @annotate("DATAFRAME_UPDATE", color="blue", domain="cudf_python") @@ -5864,8 +5875,16 @@ def to_struct(self, name=None): ----- Note that a copy of the columns is made. """ + if not all(isinstance(name, str) for name in self._data.names): + warnings.warn( + "DataFrame contains non-string column name(s). Struct column " + "requires field name to be string. 
Non-string column names "
+                "will be cast to string as the field name."
+            )
+        field_names = [str(name) for name in self._data.names]
+
         col = cudf.core.column.build_struct_column(
-            names=self._data.names, children=self._data.columns, size=len(self)
+            names=field_names, children=self._data.columns, size=len(self)
         )
         return cudf.Series._from_data(
             cudf.core.column_accessor.ColumnAccessor(
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 6e46c107d2e..6038bb49bfb 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -90,22 +90,22 @@ def _num_rows(self) -> int:
         return len(self._data.columns[0])
 
     @property
-    def _column_names(self) -> List[Any]:  # TODO: List[str]?
-        return self._data.names
+    def _column_names(self) -> Tuple[Any, ...]:  # TODO: Tuple[str]?
+        return tuple(self._data.names)
 
     @property
-    def _index_names(self) -> List[Any]:  # TODO: List[str]?
+    def _index_names(self) -> Optional[Tuple[Any, ...]]:  # TODO: Tuple[str]?
         # TODO: Temporarily suppressing mypy warnings to avoid introducing bugs
         # by returning an empty list where one is not expected.
         return (
             None  # type: ignore
             if self._index is None
-            else self._index._data.names
+            else tuple(self._index._data.names)
         )
 
     @property
-    def _columns(self) -> List[Any]:  # TODO: List[Column]?
-        return self._data.columns
+    def _columns(self) -> Tuple[Any, ...]:  # TODO: Tuple[Column]?
+        return tuple(self._data.columns)
 
     def serialize(self):
         header = {
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index ff700144bed..4bd14a2c47b 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -4,6 +4,7 @@
 import itertools
 import pickle
 import warnings
+from functools import cached_property
 
 import numpy as np
 import pandas as pd
@@ -16,7 +17,7 @@
 from cudf.core.abc import Serializable
 from cudf.core.column.column import arange, as_column
 from cudf.core.multiindex import MultiIndex
-from cudf.utils.utils import GetAttrGetItemMixin, cached_property
+from cudf.utils.utils import GetAttrGetItemMixin
 
 
 # The three functions below return the quantiles [25%, 50%, 75%]
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index f71f930a21c..5b60e8dbd1c 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1,10 +1,11 @@
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
from __future__ import annotations import math import pickle import warnings +from functools import cached_property from numbers import Number from typing import ( Any, @@ -54,7 +55,7 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import find_common_type -from cudf.utils.utils import cached_property, search_range +from cudf.utils.utils import search_range T = TypeVar("T", bound="Frame") diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 10b9f2396bb..e1ff3984948 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6,6 +6,7 @@ import operator import warnings from collections import Counter, abc +from functools import cached_property from typing import Callable, Type, TypeVar from uuid import uuid4 @@ -29,7 +30,6 @@ from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.multiindex import MultiIndex from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame -from cudf.utils.utils import cached_property doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. @@ -1697,6 +1697,154 @@ def last(self, offset): slice_func=lambda i: self.iloc[i:], ) + # For more detail on this function and how it should work, see + # https://numpy.org/doc/stable/reference/ufuncs.html + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # We don't currently support reduction, accumulation, etc. We also + # don't support any special kwargs or higher arity ufuncs than binary. + if method != "__call__" or kwargs or ufunc.nin > 2: + return NotImplemented + + # Binary operations + binary_operations = { + # Arithmetic binary operations. + "add": "add", + "subtract": "sub", + "multiply": "mul", + "matmul": "matmul", + "divide": "truediv", + "true_divide": "truediv", + "floor_divide": "floordiv", + "power": "pow", + "float_power": "pow", + "remainder": "mod", + "mod": "mod", + "fmod": "mod", + # Bitwise binary operations. + "bitwise_and": "and", + "bitwise_or": "or", + "bitwise_xor": "xor", + # Comparison binary operators + "greater": "gt", + "greater_equal": "ge", + "less": "lt", + "less_equal": "le", + "not_equal": "ne", + "equal": "eq", + } + + # First look for methods of the class. + fname = ufunc.__name__ + if fname in binary_operations: + reflect = self is not inputs[0] + other = inputs[0] if reflect else inputs[1] + + # These operators need to be mapped to their inverses when + # performing a reflected operation because no reflected version of + # the operators themselves exist. + ops_without_reflection = { + "gt": "lt", + "ge": "le", + "lt": "gt", + "le": "ge", + # ne and eq are symmetric, so they are their own inverse op + "ne": "ne", + "eq": "eq", + } + + op = binary_operations[fname] + if reflect and op in ops_without_reflection: + op = ops_without_reflection[op] + reflect = False + op = f"__{'r' if reflect else ''}{op}__" + + # pandas bitwise operations return bools if indexes are misaligned. + if ( + "bitwise" in fname + and isinstance(other, IndexedFrame) + and not self.index.equals(other.index) + ): + return getattr(self, op)(other).astype(bool) + # Float_power returns float irrespective of the input type. + if fname == "float_power": + return getattr(self, op)(other).astype(float) + return getattr(self, op)(other) + + # Special handling for unary operations. 
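+        # For example, np.negative(df) lands here with fname == "negative" and is
+        # handled as df * -1 below.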
+ if fname == "negative": + return self * -1 + if fname == "positive": + return self.copy(deep=True) + if fname == "invert": + return ~self + if fname == "absolute": + return self.abs() + if fname == "fabs": + return self.abs().astype(np.float64) + + # Note: There are some operations that may be supported by libcudf but + # are not supported by pandas APIs. In particular, libcudf binary + # operations support logical and/or operations, but those operations + # are not defined on pd.Series/DataFrame. For now those operations will + # dispatch to cupy, but if ufuncs are ever a bottleneck we could add + # special handling to dispatch those (or any other) functions that we + # could implement without cupy. + + # Attempt to dispatch all other functions to cupy. + cupy_func = getattr(cp, fname) + if cupy_func: + # Indices must be aligned before converting to arrays. + if ufunc.nin == 2: + other = inputs[self is inputs[0]] + inputs, index = self._prep_for_binop(other, fname) + else: + inputs = { + name: (col, None, False, None) + for name, col in self._data.items() + } + index = self._index + + mask = None + data = [{} for _ in range(ufunc.nout)] + for name, (left, right, _, _) in inputs.items(): + cupy_inputs = [] + # TODO: I'm jumping through multiple hoops to get the unary + # behavior to match up with the binary. I should see if there + # are better patterns to employ here. + for inp in (left, right) if ufunc.nin == 2 else (left,): + if ( + isinstance(inp, cudf.core.column.ColumnBase) + and inp.has_nulls() + ): + new_mask = cudf.core.column.as_column(inp.nullmask) + + # TODO: This is a hackish way to perform a bitwise and + # of bitmasks. Once we expose + # cudf::detail::bitwise_and, then we can use that + # instead. + mask = new_mask if mask is None else (mask & new_mask) + + # Arbitrarily fill with zeros. For ufuncs, we assume + # that the end result propagates nulls via a bitwise + # and, so these elements are irrelevant. + inp = inp.fillna(0) + cupy_inputs.append(cp.asarray(inp)) + + cp_output = cupy_func(*cupy_inputs, **kwargs) + if ufunc.nout == 1: + cp_output = (cp_output,) + for i, out in enumerate(cp_output): + data[i][name] = cudf.core.column.as_column(out).set_mask( + mask + ) + + out = tuple( + self.__class__._from_data(out, index=index) for out in data + ) + return out[0] if ufunc.nout == 1 else out + + return NotImplemented + def _check_duplicate_level_names(specified, level_names): """Raise if any of `specified` has duplicates in `level_names`.""" diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 39ff4718550..c7e46cf0165 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,7 +1,7 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from __future__ import annotations -from typing import TYPE_CHECKING, Callable, cast +from typing import TYPE_CHECKING, Any, Callable, List, cast import cudf from cudf import _lib as libcudf @@ -320,7 +320,7 @@ def _sort_result(self, result: Frame) -> Frame: # same order as given in 'on'. If the indices are used as # keys, the index will be sorted. If one index is specified, # the key columns on the other side will be used to sort. 
-        by = []
+        by: List[Any] = []
         if self._using_left_index and self._using_right_index:
             if result._index is not None:
                 by.extend(result._index._data.columns)

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 8581b97c217..5e0cd2ca8cb 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.

 from __future__ import annotations

@@ -6,6 +6,7 @@
 import numbers
 import pickle
 from collections.abc import Sequence
+from functools import cached_property
 from numbers import Integral
 from typing import Any, List, MutableMapping, Optional, Tuple, Union

@@ -22,11 +23,7 @@
 from cudf.core._compat import PANDAS_GE_120
 from cudf.core.frame import Frame
 from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index
-from cudf.utils.utils import (
-    NotIterable,
-    _maybe_indices_to_slice,
-    cached_property,
-)
+from cudf.utils.utils import NotIterable, _maybe_indices_to_slice


 class MultiIndex(Frame, BaseIndex, NotIterable):

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 90ebeba5087..3aef4447a28 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -7,7 +7,6 @@
 import pickle
 import warnings
 from collections import abc as abc
-from itertools import repeat
 from numbers import Number
 from shutil import get_terminal_size
 from typing import Any, MutableMapping, Optional, Set, Union
@@ -959,141 +958,6 @@ def to_frame(self, name=None):
     def memory_usage(self, index=True, deep=False):
         return sum(super().memory_usage(index, deep).values())

-    # For more detail on this function and how it should work, see
-    # https://numpy.org/doc/stable/reference/ufuncs.html
-    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
-        # We don't currently support reduction, accumulation, etc. We also
-        # don't support any special kwargs or higher arity ufuncs than binary.
-        if method != "__call__" or kwargs or ufunc.nin > 2:
-            return NotImplemented
-
-        # Binary operations
-        binary_operations = {
-            # Arithmetic binary operations.
-            "add": "add",
-            "subtract": "sub",
-            "multiply": "mul",
-            "matmul": "matmul",
-            "divide": "truediv",
-            "true_divide": "truediv",
-            "floor_divide": "floordiv",
-            "power": "pow",
-            "float_power": "pow",
-            "remainder": "mod",
-            "mod": "mod",
-            "fmod": "mod",
-            # Bitwise binary operations.
-            "bitwise_and": "and",
-            "bitwise_or": "or",
-            "bitwise_xor": "xor",
-            # Comparison binary operators
-            "greater": "gt",
-            "greater_equal": "ge",
-            "less": "lt",
-            "less_equal": "le",
-            "not_equal": "ne",
-            "equal": "eq",
-        }
-
-        # First look for methods of the class.
-        fname = ufunc.__name__
-        if fname in binary_operations:
-            reflect = self is not inputs[0]
-            other = inputs[0] if reflect else inputs[1]
-
-            # These operators need to be mapped to their inverses when
-            # performing a reflected operation because no reflected version of
-            # the operators themselves exist.
-            ops_without_reflection = {
-                "gt": "lt",
-                "ge": "le",
-                "lt": "gt",
-                "le": "ge",
-                # ne and eq are symmetric, so they are their own inverse op
-                "ne": "ne",
-                "eq": "eq",
-            }
-
-            op = binary_operations[fname]
-            if reflect and op in ops_without_reflection:
-                op = ops_without_reflection[op]
-                reflect = False
-            op = f"__{'r' if reflect else ''}{op}__"
-
-            # pandas bitwise operations return bools if indexes are misaligned.
-            # TODO: Generalize for other types of Frames
-            if (
-                "bitwise" in fname
-                and isinstance(other, Series)
-                and not self.index.equals(other.index)
-            ):
-                return getattr(self, op)(other).astype(bool)
-            # Float_power returns float irrespective of the input type.
-            if fname == "float_power":
-                return getattr(self, op)(other).astype(float)
-            return getattr(self, op)(other)
-
-        # Special handling for unary operations.
-        if fname == "negative":
-            return self * -1
-        if fname == "positive":
-            return self.copy(deep=True)
-        if fname == "invert":
-            return ~self
-        if fname == "absolute":
-            return self.abs()
-        if fname == "fabs":
-            return self.abs().astype(np.float64)
-
-        # Note: There are some operations that may be supported by libcudf but
-        # are not supported by pandas APIs. In particular, libcudf binary
-        # operations support logical and/or operations, but those operations
-        # are not defined on pd.Series/DataFrame. For now those operations will
-        # dispatch to cupy, but if ufuncs are ever a bottleneck we could add
-        # special handling to dispatch those (or any other) functions that we
-        # could implement without cupy.
-
-        # Attempt to dispatch all other functions to cupy.
-        cupy_func = getattr(cupy, fname)
-        if cupy_func:
-            # Indices must be aligned before converting to arrays.
-            if ufunc.nin == 2 and all(map(isinstance, inputs, repeat(Series))):
-                inputs = _align_indices(inputs, allow_non_unique=True)
-                index = inputs[0].index
-            else:
-                index = self.index
-
-            cupy_inputs = []
-            mask = None
-            for inp in inputs:
-                # TODO: Generalize for other types of Frames
-                if isinstance(inp, Series) and inp.has_nulls:
-                    new_mask = as_column(inp.nullmask)
-
-                    # TODO: This is a hackish way to perform a bitwise and of
-                    # bitmasks. Once we expose cudf::detail::bitwise_and, then
-                    # we can use that instead.
-                    mask = new_mask if mask is None else (mask & new_mask)
-
-                    # Arbitrarily fill with zeros. For ufuncs, we assume that
-                    # the end result propagates nulls via a bitwise and, so
-                    # these elements are irrelevant.
-                    inp = inp.fillna(0)
-                cupy_inputs.append(cupy.asarray(inp))
-
-            cp_output = cupy_func(*cupy_inputs, **kwargs)
-
-            def make_frame(arr):
-                return self.__class__._from_data(
-                    {self.name: as_column(arr).set_mask(mask)}, index=index
-                )
-
-            if ufunc.nout > 1:
-                return tuple(make_frame(out) for out in cp_output)
-            return make_frame(cp_output)
-
-        return NotImplemented
-
     def __array_function__(self, func, types, args, kwargs):
         handled_types = [cudf.Series]
         for t in types:
@@ -1342,9 +1206,9 @@ def __repr__(self):
             lines.append(category_memory)
         return "\n".join(lines)

-    def _binaryop(
+    def _prep_for_binop(
         self,
-        other: Frame,
+        other: Any,
         fn: str,
         fill_value: Any = None,
         reflect: bool = False,
@@ -1376,9 +1240,24 @@ def _binaryop(
             lhs = self

         operands = lhs._make_operands_for_binop(other, fill_value, reflect)
+        return operands, lhs._index
+
+    def _binaryop(
+        self,
+        other: Frame,
+        fn: str,
+        fill_value: Any = None,
+        reflect: bool = False,
+        can_reindex: bool = False,
+        *args,
+        **kwargs,
+    ):
+        operands, out_index = self._prep_for_binop(
+            other, fn, fill_value, reflect, can_reindex
+        )
         return (
-            lhs._from_data(
-                data=lhs._colwise_binop(operands, fn), index=lhs._index,
+            self._from_data(
+                data=self._colwise_binop(operands, fn), index=out_index,
             )
             if operands is not NotImplemented
             else NotImplemented

diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index bf867923b57..50b206d3388 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 """Base class for Frame types that only have a single column."""

 from __future__ import annotations
@@ -274,7 +274,7 @@ def factorize(self, na_sentinel=-1):

     def _make_operands_for_binop(
         self,
-        other: T,
+        other: Any,
         fill_value: Any = None,
         reflect: bool = False,
         *args,
@@ -310,7 +310,7 @@
         else:
             result_name = self.name

-        # This needs to be tested correctly
+        # TODO: This needs to be tested correctly
         if isinstance(other, SingleColumnFrame):
             other = other._column
         elif not _is_scalar_or_zero_d_array(other):

diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py
index a384ddecca6..f1aad1af9e6 100644
--- a/python/cudf/cudf/tests/test_array_ufunc.py
+++ b/python/cudf/cudf/tests/test_array_ufunc.py
@@ -109,7 +109,7 @@ def test_ufunc_series(ufunc, has_nulls, indexed):
 @pytest.mark.parametrize("reflect", [True, False])
 def test_binary_ufunc_series_array(ufunc, has_nulls, indexed, type_, reflect):
     fname = ufunc.__name__
-    if fname in ("greater", "greater_equal") and has_nulls:
+    if fname in ("greater", "greater_equal", "logical_and") and has_nulls:
         pytest.xfail(
             "The way cudf casts nans in arrays to nulls during binops with "
             "cudf objects is currently incompatible with pandas."
@@ -181,3 +181,102 @@ def test_ufunc_cudf_series_error_with_out_kwarg(func):
     # this throws a value-error because of presence of out kwarg
     with pytest.raises(TypeError):
         func(x1=cudf_s1, x2=cudf_s2, out=cudf_s3)
+
+
+# Skip matmul since it requires aligned shapes.
+@pytest.mark.parametrize("ufunc", (uf for uf in _UFUNCS if uf != np.matmul))
+@pytest.mark.parametrize("has_nulls", [True, False])
+@pytest.mark.parametrize("indexed", [True, False])
+def test_ufunc_dataframe(ufunc, has_nulls, indexed):
+    # Note: This test assumes that all ufuncs are unary or binary.
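+    # Rough shape of the check below: build ufunc.nin single-column
+    # DataFrames (optionally with nulls and shuffled indexes), apply the
+    # ufunc through cudf and through pandas, and compare the results after
+    # reapplying the combined null mask.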
+    fname = ufunc.__name__
+    # TODO: When pandas starts supporting misaligned indexes properly, remove
+    # this check but enable the one below.
+    if indexed:
+        pytest.xfail(
+            "pandas does not currently support misaligned indexes in "
+            "DataFrames, but we do. Until this is fixed we will skip these "
+            "tests. See the error here: "
+            "https://github.com/pandas-dev/pandas/blob/main/pandas/core/arraylike.py#L212, "  # noqa: E501
+            "called from https://github.com/pandas-dev/pandas/blob/main/pandas/core/arraylike.py#L258"  # noqa: E501
+        )
+    # TODO: Enable the check below when we remove the check above.
+    # if indexed and fname in (
+    #     "greater",
+    #     "greater_equal",
+    #     "less",
+    #     "less_equal",
+    #     "not_equal",
+    #     "equal",
+    # ):
+    #     pytest.skip("Comparison operators do not support misaligned indexes.")  # noqa: E501
+
+    N = 100
+    # Avoid zeros in either array to skip division by 0 errors. Also limit the
+    # scale to avoid issues with overflow, etc. We use ints because some
+    # operations (like bitwise ops) are not defined for floats.
+    # TODO: Add tests of mismatched columns etc.
+    pandas_args = args = [
+        cudf.DataFrame(
+            {"foo": cp.random.randint(low=1, high=10, size=N)},
+            index=cp.random.choice(range(N), N, False) if indexed else None,
+        )
+        for _ in range(ufunc.nin)
+    ]
+
+    if has_nulls:
+        # Converting nullable integer cudf.Series to pandas will produce a
+        # float pd.Series, so instead we replace nulls with an arbitrary
+        # integer value, precompute the mask, and then reapply it afterwards.
+        for arg in args:
+            set_random_null_mask_inplace(arg["foo"])
+        pandas_args = [arg.copy() for arg in args]
+        for arg in pandas_args:
+            arg["foo"] = arg["foo"].fillna(0)
+
+        # Note: Different indexes must be aligned before the mask is computed.
+        # This requires using an internal function (_align_indices), and that
+        # is unlikely to change for the foreseeable future.
+        aligned = (
+            cudf.core.dataframe._align_indices(*args)
+            if indexed and ufunc.nin == 2
+            else args
+        )
+        mask = reduce(
+            operator.or_, (a["foo"].isna() for a in aligned)
+        ).to_pandas()
+
+    try:
+        got = ufunc(*args)
+    except AttributeError as e:
+        # We xfail if we don't have an explicit dispatch and cupy doesn't have
+        # the method so that we can easily identify these methods. As of this
+        # writing, the only missing methods are isnat and heaviside.
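+        # The AttributeError comes from the getattr(cp, fname) lookup in
+        # IndexedFrame.__array_ufunc__ and propagates out through the ufunc
+        # call.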
+ if "module 'cupy' has no attribute" in str(e): + pytest.xfail(reason="Operation not supported by cupy") + raise + + expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) + + try: + if ufunc.nout > 1: + for g, e in zip(got, expect): + if has_nulls: + e[mask] = np.nan + assert_eq(g, e) + else: + if has_nulls: + expect[mask] = np.nan + assert_eq(got, expect) + except AssertionError: + # TODO: This branch can be removed when + # https://github.com/rapidsai/cudf/issues/10178 is resolved + if fname in ("power", "float_power"): + not_equal = cudf.from_pandas(expect) != got + not_equal[got.isna()] = False + diffs = got[not_equal] - cudf.from_pandas( + expect[not_equal.to_pandas()] + ) + if diffs["foo"].abs().max() == 1: + pytest.xfail("https://github.com/rapidsai/cudf/issues/10178") + raise diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index dbff626c363..167f171fa26 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -205,6 +205,14 @@ def test_dataframe_to_struct(): df["a"][0] = 5 assert_eq(got, expect) + # check that a non-string (but convertible to string) named column can be + # converted to struct + df = cudf.DataFrame([[1, 2], [3, 4]], columns=[(1, "b"), 0]) + expect = cudf.Series([{"(1, 'b')": 1, "0": 2}, {"(1, 'b')": 3, "0": 4}]) + with pytest.warns(UserWarning, match="will be casted"): + got = df.to_struct() + assert_eq(got, expect) + @pytest.mark.parametrize( "series, slce", diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 8571d9ffed5..4143cbd1d66 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -144,28 +144,6 @@ def set_allocator( IS_NEP18_ACTIVE = _is_nep18_active() -class cached_property: - """ - Like @property, but only evaluated upon first invocation. - To force re-evaluation of a cached_property, simply delete - it with `del`. - """ - - # TODO: Can be replaced with functools.cached_property when we drop support - # for Python 3.7. - - def __init__(self, func): - self.func = func - - def __get__(self, instance, cls): - if instance is None: - return self - else: - value = self.func(instance) - object.__setattr__(instance, self.func.__name__, value) - return value - - class GetAttrGetItemMixin: """This mixin changes `__getattr__` to attempt a `__getitem__` call.