diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index 6bad2342012..31d080e95d7 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -37,7 +37,7 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/" export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" # Dask & Distributed option to install main(nightly) or `conda-forge` packages. -export INSTALL_DASK_MAIN=1 +export INSTALL_DASK_MAIN=0 function remove_libcudf_kernel_cache_dir { EXITCODE=$? diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index b1d649db8f9..08f9034357a 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -31,7 +31,7 @@ export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` # Dask & Distributed option to install main(nightly) or `conda-forge` packages. -export INSTALL_DASK_MAIN=1 +export INSTALL_DASK_MAIN=0 # ucx-py version export UCX_PY_VERSION='0.26.*' diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index e9d018a2d18..2225cbe0918 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -12,7 +12,7 @@ dependencies: - clang-tools=11.1.0 - cupy>=9.5.0,<11.0.0a0 - rmm=22.06.* - - cmake>=3.20.1 + - cmake>=3.20.1,<3.23 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 - numba>=0.54 diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index c2187f056cf..9e8fd1244d0 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -67,9 +67,6 @@ class orc_reader_options { // Cast timestamp columns to a specific type data_type _timestamp_type{type_id::EMPTY}; - // Columns that should be converted from Decimal to Float64 - std::vector _decimal_cols_as_float; - // Columns that should be read as Decimal128 std::vector _decimal128_columns; @@ -138,14 +135,6 @@ class orc_reader_options { */ data_type get_timestamp_type() const { return _timestamp_type; } - /** - * @brief Fully qualified names of columns that should be converted from Decimal to Float64. - */ - std::vector const& get_decimal_cols_as_float() const - { - return _decimal_cols_as_float; - } - /** * @brief Fully qualified names of columns that should be read as 128-bit Decimal. */ @@ -215,18 +204,6 @@ class orc_reader_options { */ void set_timestamp_type(data_type type) { _timestamp_type = type; } - /** - * @brief Set columns that should be converted from Decimal to Float64 - * - * @param val Vector of fully qualified column names. - */ - [[deprecated( - "Decimal to float conversion is deprecated and will be remove in future release")]] void - set_decimal_cols_as_float(std::vector val) - { - _decimal_cols_as_float = std::move(val); - } - /** * @brief Set columns that should be read as 128-bit Decimal * @@ -340,21 +317,6 @@ class orc_reader_options_builder { return *this; } - /** - * @brief Columns that should be converted from decimals to float64. - * - * @param val Vector of column names. - * @return this for chaining. - */ - [[deprecated( - "Decimal to float conversion is deprecated and will be remove in future " - "release")]] orc_reader_options_builder& - decimal_cols_as_float(std::vector val) - { - options._decimal_cols_as_float = std::move(val); - return *this; - } - /** * @brief Columns that should be read as 128-bit Decimal * diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp index 0e904958d15..0ab3953470d 100644 --- a/cpp/include/cudf/strings/replace_re.hpp +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -86,7 +86,8 @@ std::unique_ptr replace_re( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @throw cudf::logic_error if capture index values in `replacement` are not in range 1-99 + * @throw cudf::logic_error if capture index values in `replacement` are not in range 0-99, and also + * if the index exceeds the group count specified in the pattern * * @param strings Strings instance for this operation. * @param pattern The regular expression patterns to search within each string. diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 5cbc0899196..059df283c94 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -241,30 +241,22 @@ size_t gather_stream_info(const size_t stripe_index, /** * @brief Determines cuDF type of an ORC Decimal column. */ -auto decimal_column_type(std::vector const& float64_columns, - std::vector const& decimal128_columns, +auto decimal_column_type(std::vector const& decimal128_columns, cudf::io::orc::detail::aggregate_orc_metadata const& metadata, int column_index) { - if (metadata.get_col_type(column_index).kind != DECIMAL) return type_id::EMPTY; + if (metadata.get_col_type(column_index).kind != DECIMAL) { return type_id::EMPTY; } - auto const& column_path = metadata.column_path(0, column_index); - auto is_column_in = [&](const std::vector& cols) { - return std::find(cols.cbegin(), cols.cend(), column_path) != cols.end(); - }; - - auto const user_selected_float64 = is_column_in(float64_columns); - auto const user_selected_decimal128 = is_column_in(decimal128_columns); - CUDF_EXPECTS(not user_selected_float64 or not user_selected_decimal128, - "Both decimal128 and float64 types selected for column " + column_path); - - if (user_selected_float64) return type_id::FLOAT64; - if (user_selected_decimal128) return type_id::DECIMAL128; + if (std::find(decimal128_columns.cbegin(), + decimal128_columns.cend(), + metadata.column_path(0, column_index)) != decimal128_columns.end()) { + return type_id::DECIMAL128; + } auto const precision = metadata.get_col_type(column_index) .precision.value_or(cuda::std::numeric_limits::digits10); - if (precision <= cuda::std::numeric_limits::digits10) return type_id::DECIMAL32; - if (precision <= cuda::std::numeric_limits::digits10) return type_id::DECIMAL64; + if (precision <= cuda::std::numeric_limits::digits10) { return type_id::DECIMAL32; } + if (precision <= cuda::std::numeric_limits::digits10) { return type_id::DECIMAL64; } return type_id::DECIMAL128; } @@ -796,12 +788,11 @@ std::unique_ptr reader::impl::create_empty_column(const size_type orc_co rmm::cuda_stream_view stream) { schema_info.name = _metadata.column_name(0, orc_col_id); - auto const type = to_type_id( - _metadata.get_schema(orc_col_id), - _use_np_dtypes, - _timestamp_type.id(), - decimal_column_type(_decimal_cols_as_float, decimal128_columns, _metadata, orc_col_id)); - int32_t scale = 0; + auto const type = to_type_id(_metadata.get_schema(orc_col_id), + _use_np_dtypes, + _timestamp_type.id(), + decimal_column_type(decimal128_columns, _metadata, orc_col_id)); + int32_t scale = 0; std::vector> child_columns; std::unique_ptr out_col = nullptr; auto kind = _metadata.get_col_type(orc_col_id).kind; @@ -943,8 +934,7 @@ reader::impl::impl(std::vector>&& sources, _use_np_dtypes = options.is_enabled_use_np_dtypes(); // Control decimals conversion - _decimal_cols_as_float = options.get_decimal_cols_as_float(); - decimal128_columns = options.get_decimal128_columns(); + decimal128_columns = options.get_decimal128_columns(); } timezone_table reader::impl::compute_timezone_table( @@ -1004,11 +994,10 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Get a list of column data types std::vector column_types; for (auto& col : columns_level) { - auto col_type = to_type_id( - _metadata.get_col_type(col.id), - _use_np_dtypes, - _timestamp_type.id(), - decimal_column_type(_decimal_cols_as_float, decimal128_columns, _metadata, col.id)); + auto col_type = to_type_id(_metadata.get_col_type(col.id), + _use_np_dtypes, + _timestamp_type.id(), + decimal_column_type(decimal128_columns, _metadata, col.id)); CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or col_type == type_id::DECIMAL128) { diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 1e586bcde00..103093f055f 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -221,7 +221,6 @@ class reader::impl { bool _use_index{true}; bool _use_np_dtypes{true}; - std::vector _decimal_cols_as_float; std::vector decimal128_columns; data_type _timestamp_type{type_id::EMPTY}; reader_column_meta _col_meta{}; diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index dc09b3e7dd8..b4cbb5d9037 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -962,15 +962,6 @@ static __device__ uint32_t Byte_RLE(orc_bytestream_s* bs, return rle->num_vals; } -/** - * @brief Powers of 10 - */ -static const __device__ __constant__ double kPow10[40] = { - 1.0, 1.e1, 1.e2, 1.e3, 1.e4, 1.e5, 1.e6, 1.e7, 1.e8, 1.e9, 1.e10, 1.e11, 1.e12, 1.e13, - 1.e14, 1.e15, 1.e16, 1.e17, 1.e18, 1.e19, 1.e20, 1.e21, 1.e22, 1.e23, 1.e24, 1.e25, 1.e26, 1.e27, - 1.e28, 1.e29, 1.e30, 1.e31, 1.e32, 1.e33, 1.e34, 1.e35, 1.e36, 1.e37, 1.e38, 1.e39, -}; - static const __device__ __constant__ int64_t kPow5i[28] = {1, 5, 25, @@ -1045,34 +1036,24 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs, auto const pos = static_cast(vals.i64[2 * t]); __int128_t v = decode_varint128(bs, pos); - if (dtype_id == type_id::FLOAT64) { - double f = v; - int32_t scale = (t < numvals) ? val_scale : 0; - if (scale >= 0) - vals.f64[t] = f / kPow10[min(scale, 39)]; - else - vals.f64[t] = f * kPow10[min(-scale, 39)]; - } else { - auto const scaled_value = [&]() { - // Since cuDF column stores just one scale, value needs to be adjusted to col_scale from - // val_scale. So the difference of them will be used to add 0s or remove digits. - int32_t scale = (t < numvals) ? col_scale - val_scale : 0; - if (scale >= 0) { - scale = min(scale, 27); - return (v * kPow5i[scale]) << scale; - } else // if (scale < 0) - { - scale = min(-scale, 27); - return (v / kPow5i[scale]) >> scale; - } - }(); - if (dtype_id == type_id::DECIMAL32) { - vals.i32[t] = scaled_value; - } else if (dtype_id == type_id::DECIMAL64) { - vals.i64[t] = scaled_value; + auto const scaled_value = [&]() { + // Since cuDF column stores just one scale, value needs to be adjusted to col_scale from + // val_scale. So the difference of them will be used to add 0s or remove digits. + int32_t const scale = (t < numvals) ? col_scale - val_scale : 0; + if (scale >= 0) { + auto const abs_scale = min(scale, 27); + return (v * kPow5i[abs_scale]) << abs_scale; } else { - vals.i128[t] = scaled_value; + auto const abs_scale = min(-scale, 27); + return (v / kPow5i[abs_scale]) >> abs_scale; } + }(); + if (dtype_id == type_id::DECIMAL32) { + vals.i32[t] = scaled_value; + } else if (dtype_id == type_id::DECIMAL64) { + vals.i64[t] = scaled_value; + } else { + vals.i128[t] = scaled_value; } } // There is nothing to read, so break @@ -1711,8 +1692,7 @@ __global__ void __launch_bounds__(block_size) case DECIMAL: if (s->chunk.dtype_id == type_id::DECIMAL32) { static_cast(data_out)[row] = s->vals.u32[t + vals_skipped]; - } else if (s->chunk.dtype_id == type_id::FLOAT64 or - s->chunk.dtype_id == type_id::DECIMAL64) { + } else if (s->chunk.dtype_id == type_id::DECIMAL64) { static_cast(data_out)[row] = s->vals.u64[t + vals_skipped]; } else { // decimal128 diff --git a/cpp/src/join/mixed_join_kernels.cu b/cpp/src/join/mixed_join_kernels.cu index 5638f0ddd38..efaea841e45 100644 --- a/cpp/src/join/mixed_join_kernels.cu +++ b/cpp/src/join/mixed_join_kernels.cu @@ -35,18 +35,19 @@ namespace detail { namespace cg = cooperative_groups; template -__global__ void mixed_join(table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - size_type* join_output_l, - size_type* join_output_r, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables) +__launch_bounds__(block_size) __global__ + void mixed_join(table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + size_type* join_output_l, + size_type* join_output_r, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables) { // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index c8cfc9998f0..63a69554245 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -32,17 +32,18 @@ namespace detail { namespace cg = cooperative_groups; template -__global__ void mixed_join_semi(table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::semi_map_type::device_view hash_table_view, - size_type* join_output_l, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables) +__launch_bounds__(block_size) __global__ + void mixed_join_semi(table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::semi_map_type::device_view hash_table_view, + size_type* join_output_l, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables) { // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is diff --git a/cpp/src/join/mixed_join_size_kernels.cu b/cpp/src/join/mixed_join_size_kernels.cu index 1a08b8792c2..22c71bfc33a 100644 --- a/cpp/src/join/mixed_join_size_kernels.cu +++ b/cpp/src/join/mixed_join_size_kernels.cu @@ -35,7 +35,7 @@ namespace detail { namespace cg = cooperative_groups; template -__global__ void compute_mixed_join_output_size( +__launch_bounds__(block_size) __global__ void compute_mixed_join_output_size( table_device_view left_table, table_device_view right_table, table_device_view probe, diff --git a/cpp/src/join/mixed_join_size_kernels_semi.cu b/cpp/src/join/mixed_join_size_kernels_semi.cu index 2c077a698f8..f6b9fb85bbb 100644 --- a/cpp/src/join/mixed_join_size_kernels_semi.cu +++ b/cpp/src/join/mixed_join_size_kernels_semi.cu @@ -32,7 +32,7 @@ namespace detail { namespace cg = cooperative_groups; template -__global__ void compute_mixed_join_output_size_semi( +__launch_bounds__(block_size) __global__ void compute_mixed_join_output_size_semi( table_device_view left_table, table_device_view right_table, table_device_view probe, diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 27e0bd4fac9..384813d6e3d 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -68,7 +68,8 @@ std::string get_backref_pattern(std::string const& repl) * For example, for input string 'hello \2 and \1' the returned `backref_type` vector * contains `[(2,6),(1,11)]` and the returned string is 'hello and '. */ -std::pair> parse_backrefs(std::string const& repl) +std::pair> parse_backrefs(std::string const& repl, + int const group_count) { std::vector backrefs; std::string str = repl; // make a modifiable copy @@ -79,7 +80,8 @@ std::pair> parse_backrefs(std::string con while (std::regex_search(str, m, ex) && !m.empty()) { // parse the back-ref index number size_type const index = static_cast(std::atoi(std::string{m[1]}.c_str())); - CUDF_EXPECTS(index > 0 && index < 100, "Group index numbers must be in the range 1-99"); + CUDF_EXPECTS(index >= 0 && index <= group_count, + "Group index numbers must be in the range 0 to group count"); // store the new byte offset and index value size_type const position = static_cast(m.position(0)); @@ -146,7 +148,8 @@ std::unique_ptr replace_with_backrefs( reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); // parse the repl string for back-ref indicators - auto const parse_result = parse_backrefs(replacement); + auto group_count = std::min(99, d_prog->group_counts()); // group count should NOT exceed 99 + auto const parse_result = parse_backrefs(replacement, group_count); rmm::device_uvector backrefs = cudf::detail::make_device_uvector_async(parse_result.second, stream); string_scalar repl_scalar(parse_result.first, true, stream); diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index bac5bf1f55b..5823a859f7b 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1421,17 +1421,9 @@ TEST_F(OrcReaderTest, DecimalOptions) cudf_io::orc_reader_options valid_opts = cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}) - .decimal128_columns({"dec", "fake_name"}) - .decimal_cols_as_float({"decc", "fake_name"}); - // Should not throw, even with "fake name" in both options + .decimal128_columns({"dec", "fake_name"}); + // Should not throw, even with "fake name" EXPECT_NO_THROW(cudf_io::read_orc(valid_opts)); - - cudf_io::orc_reader_options invalid_opts = - cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}) - .decimal128_columns({"dec", "fake_name"}) - .decimal_cols_as_float({"dec", "fake_name"}); - // Should throw, options overlap - EXPECT_THROW(cudf_io::read_orc(invalid_opts), cudf::logic_error); } TEST_F(OrcWriterTest, DecimalOptionsNested) diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 7c87d8bf64a..2b9e8b7aae7 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -281,13 +281,32 @@ TEST_F(StringsReplaceRegexTest, BackrefWithGreedyQuantifier) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexZeroIndexTest) +{ + cudf::test::strings_column_wrapper strings( + {"TEST123", "TEST1TEST2", "TEST2-TEST1122", "TEST1-TEST-T", "TES3"}); + auto strings_view = cudf::strings_column_view(strings); + std::string pattern = "(TEST)(\\d+)"; + std::string repl_template = "${0}: ${1}, ${2}; "; + auto results = cudf::strings::replace_with_backrefs(strings_view, pattern, repl_template); + + cudf::test::strings_column_wrapper expected({ + "TEST123: TEST, 123; ", + "TEST1: TEST, 1; TEST2: TEST, 2; ", + "TEST2: TEST, 2; -TEST1122: TEST, 1122; ", + "TEST1: TEST, 1; -TEST-T", + "TES3", + }); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexErrorTest) { cudf::test::strings_column_wrapper strings({"this string left intentionally blank"}); auto view = cudf::strings_column_view(strings); - EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", "\\0"), cudf::logic_error); - EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", "\\123"), cudf::logic_error); + // group index(3) exceeds the group count(2) + EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w).(\\w)", "\\3"), cudf::logic_error); EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "", "\\1"), cudf::logic_error); EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", ""), cudf::logic_error); } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index d1509f14c6e..58901d5743b 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4987,6 +4987,22 @@ void testStringReplaceWithBackrefs() { assertColumnsAreEqual(expected, actual); } + // test zero as group index + try (ColumnVector v = ColumnVector.fromStrings("aa-11 b2b-345", "aa-11a 1c-2b2 b2-c3", "11-aa", null); + ColumnVector expected = ColumnVector.fromStrings("aa-11:aa:11; b2b-345:b:345;", + "aa-11:aa:11;a 1c-2:c:2;b2 b2-c3", "11-aa", null); + ColumnVector actual = v.stringReplaceWithBackrefs( + "([a-z]+)-([0-9]+)", "${0}:${1}:${2};")) { + assertColumnsAreEqual(expected, actual); + } + + // group index exceeds group count + assertThrows(CudfException.class, () -> { + try (ColumnVector v = ColumnVector.fromStrings("ABC123defgh"); + ColumnVector r = v.stringReplaceWithBackrefs("([A-Z]+)([0-9]+)([a-z]+)", "\\4")) { + } + }); + } @Test diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index 0c2f971a26c..62ff5eb4f53 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport uint8_t from libcpp cimport bool @@ -36,7 +36,6 @@ cdef extern from "cudf/io/orc.hpp" \ void enable_use_index(bool val) except+ void enable_use_np_dtypes(bool val) except+ void set_timestamp_type(data_type type) except+ - void set_decimal_cols_as_float(vector[string] val) except+ @staticmethod orc_reader_options_builder builder( @@ -55,9 +54,6 @@ cdef extern from "cudf/io/orc.hpp" \ orc_reader_options_builder& use_index(bool val) except+ orc_reader_options_builder& use_np_dtypes(bool val) except+ orc_reader_options_builder& timestamp_type(data_type type) except+ - orc_reader_options_builder& decimal_cols_as_float( - vector[string] val - ) except+ orc_reader_options build() except+ diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 127e3a612dc..8331f9c3d17 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -93,7 +93,6 @@ cpdef read_orc(object filepaths_or_buffers, object skip_rows=None, object num_rows=None, bool use_index=True, - object decimal_cols_as_float=None, object timestamp_type=None): """ Cython function to call into libcudf API, see `read_orc`. @@ -120,7 +119,6 @@ cpdef read_orc(object filepaths_or_buffers, ) ), use_index, - decimal_cols_as_float or [], ) cdef table_with_metadata c_result @@ -319,8 +317,7 @@ cdef orc_reader_options make_orc_reader_options( size_type skip_rows, size_type num_rows, type_id timestamp_type, - bool use_index, - object decimal_cols_as_float + bool use_index ) except*: for i, datasource in enumerate(filepaths_or_buffers): @@ -333,10 +330,6 @@ cdef orc_reader_options make_orc_reader_options( c_column_names.push_back(str(col).encode()) cdef orc_reader_options opts cdef source_info src = make_source_info(filepaths_or_buffers) - cdef vector[string] c_decimal_cols_as_float - c_decimal_cols_as_float.reserve(len(decimal_cols_as_float)) - for decimal_col in decimal_cols_as_float: - c_decimal_cols_as_float.push_back(str(decimal_col).encode()) opts = move( orc_reader_options.builder(src) .columns(c_column_names) @@ -345,7 +338,6 @@ cdef orc_reader_options make_orc_reader_options( .num_rows(num_rows) .timestamp_type(data_type(timestamp_type)) .use_index(use_index) - .decimal_cols_as_float(c_decimal_cols_as_float) .build() ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 17cac3593a3..08a30729e7c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1339,8 +1339,14 @@ def _slice(self: T, arg: slice) -> T: @_cudf_nvtx_annotate def memory_usage(self, index=True, deep=False): - return Series( - {str(k): v for k, v in super().memory_usage(index, deep).items()} + mem_usage = [col.memory_usage for col in self._data.columns] + names = [str(name) for name in self._data.names] + if index: + mem_usage.append(self._index.memory_usage()) + names.append("Index") + return Series._from_data( + data={None: as_column(mem_usage)}, + index=as_index(names), ) @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index a84606b0953..75c6e4d0964 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -339,12 +339,7 @@ def memory_usage(self, deep=False): ------- The total bytes used. """ - if deep: - warnings.warn( - "The deep parameter is ignored and is only included " - "for pandas compatibility." - ) - return {name: col.memory_usage for name, col in self._data.items()} + raise NotImplementedError def __len__(self): return self._num_rows diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 7df5be3f692..a31fe4c3b99 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -914,7 +914,7 @@ def _concat(cls, objs): @_cudf_nvtx_annotate def memory_usage(self, deep=False): - return sum(super().memory_usage(deep=deep).values()) + return self._column.memory_usage @_cudf_nvtx_annotate def equals(self, other, **kwargs): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index c5c2322d95a..458fc16c511 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -704,10 +704,7 @@ def memory_usage(self, index=True, deep=False): >>> s.memory_usage(index=False) 24 """ - usage = super().memory_usage(deep=deep) - if index: - usage["Index"] = self.index.memory_usage() - return usage + raise NotImplementedError def hash_values(self, method="murmur3"): """Compute the hash of values in this column. diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 39228f034d4..d80fb00942b 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1474,7 +1474,7 @@ def _clean_nulls_from_index(self): @_cudf_nvtx_annotate def memory_usage(self, deep=False): - usage = sum(super().memory_usage(deep=deep).values()) + usage = sum(col.memory_usage for col in self._data.columns) if self.levels: for level in self.levels: usage += level.memory_usage(deep=deep) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 0ea02edb924..8748b9775be 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -856,7 +856,9 @@ def to_frame(self, name=None): @_cudf_nvtx_annotate def memory_usage(self, index=True, deep=False): - return sum(super().memory_usage(index, deep).values()) + return self._column.memory_usage + ( + self._index.memory_usage() if index else 0 + ) @_cudf_nvtx_annotate def __array_function__(self, func, types, args, kwargs): diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 0ac0e02e4d1..6a2ffef52db 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -287,18 +287,11 @@ def read_orc( skiprows=None, num_rows=None, use_index=True, - decimal_cols_as_float=None, timestamp_type=None, use_python_file_object=True, **kwargs, ): """{docstring}""" - if decimal_cols_as_float is not None: - warnings.warn( - "`decimal_cols_as_float` is deprecated and will be removed in " - "the future", - FutureWarning, - ) from cudf import DataFrame # Multiple sources are passed as a list. If a single source is passed, @@ -365,7 +358,6 @@ def read_orc( skiprows, num_rows, use_index, - decimal_cols_as_float, timestamp_type, ) ) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 62715ad7580..5082fb08b92 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1266,10 +1266,7 @@ def test_map_type_read(columns, num_rows, use_index): assert_eq(expected_tbl.to_pandas(), gdf) -@pytest.mark.parametrize( - "data", [["_col0"], ["FakeName", "_col0", "TerriblyFakeColumnName"]] -) -def test_orc_reader_decimal(datadir, data): +def test_orc_reader_decimal(datadir): path = datadir / "TestOrcFile.decimal.orc" try: orcfile = pa.orc.ORCFile(path) @@ -1277,28 +1274,8 @@ def test_orc_reader_decimal(datadir, data): pytest.skip(".orc file is not found: %s" % e) pdf = orcfile.read().to_pandas() - gdf = cudf.read_orc(path, decimal_cols_as_float=data).to_pandas() - - # Convert the decimal dtype from PyArrow to float64 for comparison to cuDF - # This is because cuDF returns as float64 - pdf = pdf.apply(pd.to_numeric) - - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize("data", [["InvalidColumnName"]]) -def test_orc_reader_decimal_invalid_column(datadir, data): - path = datadir / "TestOrcFile.decimal.orc" - try: - orcfile = pa.orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - - pdf = orcfile.read().to_pandas() - gdf = cudf.read_orc(path, decimal_cols_as_float=data).to_pandas() + gdf = cudf.read_orc(path).to_pandas() - # Since the `decimal_cols_as_float` column name - # is invalid, this should be a decimal assert_eq(pdf, gdf) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index cfe1957dfd6..5f348563243 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -392,9 +392,6 @@ If not None, the total number of rows to read. use_index : bool, default True If True, use row index if available for faster seeking. -decimal_cols_as_float: list, default None - If specified, names of the columns that should be converted from - Decimal to Float64 in the resulting dataframe. use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec AbstractBufferedFile objects at IO time. This option is likely to improve diff --git a/python/cudf/requirements/cuda-11.0/dev_requirements.txt b/python/cudf/requirements/cuda-11.0/dev_requirements.txt deleted file mode 100644 index d8dce276820..00000000000 --- a/python/cudf/requirements/cuda-11.0/dev_requirements.txt +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -# pyarrow gpu package will have to be built from source : -# https://arrow.apache.org/docs/python/install.html#installing-from-source - -cupy-cuda110 -cachetools -cmake -cmake-setuptools>=0.1.3 -cython>=0.29,<0.30 -dlpack -fastavro>=0.22.9 -python-snappy>=0.6.0 -fsspec>=0.6.0 -hypothesis -mimesis<4.1 -mypy==0.782 -nbsphinx -numba>=0.53.1 -numpy -numpydoc -nvtx>=0.2.1 -packaging -pandas>=1.0,<1.4.0dev0 -pandoc==2.0a4 -protobuf -pydata-sphinx-theme -pyorc -pytest -pytest-benchmark -pytest-xdist -rapidjson -recommonmark -setuptools -sphinx -sphinx-copybutton -sphinx-markdown-tables -sphinxcontrib-websupport -transformers<=4.10.3 -typing_extensions -wheel diff --git a/python/cudf/requirements/cuda-11.2/dev_requirements.txt b/python/cudf/requirements/cuda-11.2/dev_requirements.txt deleted file mode 100644 index c11d108360d..00000000000 --- a/python/cudf/requirements/cuda-11.2/dev_requirements.txt +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -# pyarrow gpu package will have to be built from source : -# https://arrow.apache.org/docs/python/install.html#installing-from-source - -cupy-cuda112 -cachetools -cmake -cmake-setuptools>=0.1.3 -cython>=0.29,<0.30 -dlpack -fastavro>=0.22.9 -python-snappy>=0.6.0 -fsspec>=0.6.0 -hypothesis -mimesis<4.1 -mypy==0.782 -nbsphinx -numba>=0.53.1 -numpy -numpydoc -nvtx>=0.2.1 -packaging -pandas>=1.0,<1.4.0dev0 -pandoc==2.0a4 -protobuf -pydata-sphinx-theme -pyorc -pytest -pytest-benchmark -pytest-xdist -rapidjson -recommonmark -setuptools -sphinx -sphinx-copybutton -sphinx-markdown-tables -sphinxcontrib-websupport -transformers<=4.10.3 -typing_extensions -wheel diff --git a/python/cudf_kafka/dev_requirements.txt b/python/cudf_kafka/dev_requirements.txt deleted file mode 100644 index af52659e08e..00000000000 --- a/python/cudf_kafka/dev_requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -flake8==3.8.3 -black==19.10b0 -isort==5.6.4 -python-confluent-kafka -pytest -setuptools -wheel -cython>=0.29,<0.30 -python-confluent-kafka diff --git a/python/custreamz/dev_requirements.txt b/python/custreamz/dev_requirements.txt deleted file mode 100644 index a6b44c640f6..00000000000 --- a/python/custreamz/dev_requirements.txt +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -flake8==3.8.3 -black==19.10b0 -isort==5.6.4 -dask==2022.03.0 -distributed==2022.03.0 -streamz -python-confluent-kafka -pytest -setuptools -wheel diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index d1edfb071a2..36e3416c8a3 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -398,7 +398,10 @@ def group_split_cudf(df, c, k, ignore_index=False): @sizeof_dispatch.register(cudf.DataFrame) @_dask_cudf_nvtx_annotate def sizeof_cudf_dataframe(df): - return int(df.memory_usage().sum()) + return int( + sum(col.memory_usage for col in df._data.columns) + + df._index.memory_usage() + ) @sizeof_dispatch.register((cudf.Series, cudf.BaseIndex)) diff --git a/python/dask_cudf/dev_requirements.txt b/python/dask_cudf/dev_requirements.txt deleted file mode 100644 index 438317adf87..00000000000 --- a/python/dask_cudf/dev_requirements.txt +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -dask==2022.03.0 -distributed==2022.03.0 -fsspec>=0.6.0 -numba>=0.53.1 -numpy -pandas>=1.0,<1.4.0dev0 -pytest -setuptools -wheel -flake8==3.8.3 -black==19.10b0 -isort==5.6.4