From 6bbe2e896ebdaa5784b4378f434c03e9523beb93 Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Wed, 19 Jan 2022 21:38:42 -0500
Subject: [PATCH 01/14] Include <optional> in headers that use std::optional
 (#10044)

Detected when compiling with gcc-11, where <optional> wasn't being brought
in by other standard headers.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/10044
---
 cpp/benchmarks/common/generate_benchmark_input.cpp | 3 ++-
 cpp/include/cudf/strings/repeat_strings.hpp        | 2 ++
 cpp/include/cudf/strings/replace_re.hpp            | 2 ++
 cpp/src/io/utilities/trie.cuh                      | 2 ++
 4 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp
index 995cea13c27..dcd8e32fc9d 100644
--- a/cpp/benchmarks/common/generate_benchmark_input.cpp
+++ b/cpp/benchmarks/common/generate_benchmark_input.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,6 +31,7 @@
 #include
 #include
+#include <optional>
 #include
 #include
 #include
diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp
index edba01b174f..f6bf12af967 100644
--- a/cpp/include/cudf/strings/repeat_strings.hpp
+++ b/cpp/include/cudf/strings/repeat_strings.hpp
@@ -18,6 +18,8 @@
 #include
 #include
 
+#include <optional>
+
 namespace cudf {
 namespace strings {
 /**
diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp
index a2c4eba1636..0e904958d15 100644
--- a/cpp/include/cudf/strings/replace_re.hpp
+++ b/cpp/include/cudf/strings/replace_re.hpp
@@ -20,6 +20,8 @@
 #include
 #include
 #include
 
+#include <optional>
+
 namespace cudf {
 namespace strings {
 /**
diff --git a/cpp/src/io/utilities/trie.cuh b/cpp/src/io/utilities/trie.cuh
index 1140a08b76b..85834ad2f0e 100644
--- a/cpp/src/io/utilities/trie.cuh
+++ b/cpp/src/io/utilities/trie.cuh
@@ -23,6 +23,8 @@
 
 #include
 
+#include <optional>
+
 namespace cudf {
 namespace detail {
 static constexpr char trie_terminating_character = '\n';

From ab752d482ee8f8b76bdec7917538c50771b81693 Mon Sep 17 00:00:00 2001
From: Jordan Jacobelli
Date: Thu, 20 Jan 2022 03:41:51 +0100
Subject: [PATCH 02/14] Simplify custreamz and cudf_kafka recipes files
 (#10065)

Adding build string constraints with the Python version is not required for
the `python-confluent-kafka` dependency, as the Python version is fixed
during conda build with the `--python` flag.

Authors:
  - Jordan Jacobelli (https://github.com/Ethyling)

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)

URL: https://github.com/rapidsai/cudf/pull/10065
---
 conda/recipes/cudf_kafka/meta.yaml | 2 +-
 conda/recipes/custreamz/meta.yaml  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml
index acb56c464e4..56f2730db7a 100644
--- a/conda/recipes/cudf_kafka/meta.yaml
+++ b/conda/recipes/cudf_kafka/meta.yaml
@@ -35,7 +35,7 @@ requirements:
   run:
     - python
     - libcudf_kafka {{ version }}
-    - python-confluent-kafka >=1.7.0,<1.8.0a0=py{{ py_version_numeric }}*
+    - python-confluent-kafka >=1.7.0,<1.8.0a0
     - cudf {{ version }}
 
 test:            # [linux64]
diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml
index 8bcdd1ec61e..ddeaa2ccd7b 100644
--- a/conda/recipes/custreamz/meta.yaml
+++ b/conda/recipes/custreamz/meta.yaml
@@ -26,7 +26,7 @@ build:
 requirements:
   host:
     - python
-    - python-confluent-kafka >=1.7.0,<1.8.0a0=py{{ py_version_numeric }}*
+    - python-confluent-kafka >=1.7.0,<1.8.0a0
     - cudf_kafka {{ version }}
   run:
     - python
@@ -34,7 +34,7 @@ requirements:
     - cudf {{ version }}
     - dask>=2021.11.1,<=2021.11.2
     - distributed>=2021.11.1,<=2021.11.2
-    - python-confluent-kafka >=1.7.0,<1.8.0a0=py{{ py_version_numeric }}*
+    - python-confluent-kafka >=1.7.0,<1.8.0a0
     - cudf_kafka {{ version }}
 
 test:            # [linux64]

From c00f42bf93c400055b78a11a0701d7e8c69098d7 Mon Sep 17 00:00:00 2001
From: Ryan Lee
Date: Wed, 19 Jan 2022 20:04:54 -0800
Subject: [PATCH 03/14] Spark Decimal128 hashing (#9919)

cudf work for https://github.com/NVIDIA/spark-rapids/issues/3878

Shortens the hashed data by removing preceding zero values -- ensuring they
leave a sign bit -- and flipping the endianness before hashing the value.

Authors:
  - Ryan Lee (https://github.com/rwlee)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/9919
---
 .../cudf/detail/utilities/hash_functions.cuh  | 76 +++++++++++++++----
 cpp/tests/hashing/hash_test.cpp               | 32 ++++++--
 2 files changed, 86 insertions(+), 22 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh
index c35d24ddeac..8a7f4276d05 100644
--- a/cpp/include/cudf/detail/utilities/hash_functions.cuh
+++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh
@@ -16,12 +16,16 @@
 
 #pragma once
 
+#include
+
 #include
 #include
 #include
 #include
 #include
 
+#include
+
 using hash_value_type = uint32_t;
 
 namespace cudf {
@@ -337,17 +341,21 @@ struct SparkMurmurHash3_32 {
   template <typename TKey>
   result_type __device__ inline compute(TKey const& key) const
   {
-    constexpr int len        = sizeof(TKey);
-    int8_t const* const data = reinterpret_cast<int8_t const*>(&key);
-    constexpr int nblocks    = len / 4;
+    return compute_bytes(reinterpret_cast<std::byte const*>(&key), sizeof(TKey));
+  }
+
+  result_type __device__ compute_bytes(std::byte const* const data, cudf::size_type const len) const
+  {
+    constexpr cudf::size_type block_size = sizeof(uint32_t) / sizeof(std::byte);
+    cudf::size_type const nblocks        = len / block_size;
+    uint32_t h1                          = m_seed;
+    constexpr uint32_t c1                = 0xcc9e2d51;
+    constexpr uint32_t c2                = 0x1b873593;
 
-    uint32_t h1           = m_seed;
-    constexpr uint32_t c1 = 0xcc9e2d51;
-    constexpr uint32_t c2 = 0x1b873593;
     //----------
-    // body
-    uint32_t const* const blocks = reinterpret_cast<uint32_t const*>(data + nblocks * 4);
-    for (int i = -nblocks; i; i++) {
+    // Process all four-byte chunks
+    uint32_t const* const blocks = reinterpret_cast<uint32_t const*>(data);
+    for (cudf::size_type i = 0; i < nblocks; i++) {
       uint32_t k1 = blocks[i];
       k1 *= c1;
       k1 = rotl32(k1, 15);
       k1 *= c2;
       h1 ^= k1;
       h1 = rotl32(h1, 13);
       h1 = h1 * 5 + 0xe6546b64;
     }
     //----------
-    // byte by byte tail processing
-    for (int i = nblocks * 4; i < len; i++) {
-      int32_t k1 = data[i];
+    // Process remaining bytes that do not fill a four-byte chunk using Spark's approach
+    // (does not conform to normal MurmurHash3)
+    for (cudf::size_type i = nblocks * 4; i < len; i++) {
+      // We require a two-step cast to get the k1 value from the byte. First,
+      // we must cast to a signed int8_t. Then, the sign bit is preserved when
+      // casting to uint32_t under 2's complement. Java preserves the
+      // signedness when casting byte-to-int, but C++ does not.
+      uint32_t k1 = static_cast<uint32_t>(std::to_integer<int8_t>(data[i]));
       k1 *= c1;
       k1 = rotl32(k1, 15);
       k1 *= c2;
@@ -427,7 +440,42 @@ template <>
 hash_value_type __device__ inline SparkMurmurHash3_32<numeric::decimal128>::operator()(
   numeric::decimal128 const& key) const
 {
-  return this->compute<__int128_t>(key.value());
+  // Generates the Spark MurmurHash3 hash value, mimicking the conversion:
+  // java.math.BigDecimal.valueOf(unscaled_value, _scale).unscaledValue().toByteArray()
+  // https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala#L381
+  __int128_t const val               = key.value();
+  constexpr cudf::size_type key_size = sizeof(__int128_t);
+  std::byte const* data              = reinterpret_cast<std::byte const*>(&val);
+
+  // Small negative values start with 0xff..., small positive values start with 0x00...
+  bool const is_negative     = val < 0;
+  std::byte const zero_value = is_negative ? std::byte{0xff} : std::byte{0x00};
+
+  // If the value can be represented with a shorter than 16-byte integer, the
+  // leading bytes of the little-endian value are truncated and are not hashed.
+  auto const reverse_begin = thrust::reverse_iterator(data + key_size);
+  auto const reverse_end   = thrust::reverse_iterator(data);
+  auto const first_nonzero_byte =
+    thrust::find_if_not(thrust::seq, reverse_begin, reverse_end, [zero_value](std::byte const& v) {
+      return v == zero_value;
+    }).base();
+  // Max handles special case of 0 and -1 which would shorten to 0 length otherwise
+  cudf::size_type length =
+    std::max(1, static_cast<cudf::size_type>(thrust::distance(data, first_nonzero_byte)));
+
+  // Preserve the 2's complement sign bit by adding a byte back on if necessary.
+  // e.g. 0x0000ff would shorten to 0x00ff. The 0x00 byte is retained to
+  // preserve the sign bit, rather than leaving an "f" at the front which would
+  // change the sign bit. However, 0x00007f would shorten to 0x7f. No extra byte
+  // is needed because the leftmost bit matches the sign bit. Similarly for
+  // negative values: 0xffff00 --> 0xff00 and 0xffff80 --> 0x80.
+  if ((length < key_size) && (is_negative ^ bool(data[length - 1] & std::byte{0x80}))) { ++length; }
+
+  // Convert to big endian by reversing the range of nonzero bytes. Only those bytes are hashed.
+ __int128_t big_endian_value = 0; + auto big_endian_data = reinterpret_cast(&big_endian_value); + thrust::reverse_copy(thrust::seq, data, data + length, big_endian_data); + return this->compute_bytes(big_endian_data, length); } template <> @@ -480,7 +528,7 @@ hash_value_type __device__ inline SparkMurmurHash3_32::operat //---------- // Spark's byte by byte tail processing for (int i = nblocks * 4; i < len; i++) { - int32_t k1 = data[i]; + uint32_t k1 = data[i]; k1 *= c1; k1 = rotl32(k1, 15); k1 *= c2; diff --git a/cpp/tests/hashing/hash_test.cpp b/cpp/tests/hashing/hash_test.cpp index bd6deae9dc4..1a73fb3abc9 100644 --- a/cpp/tests/hashing/hash_test.cpp +++ b/cpp/tests/hashing/hash_test.cpp @@ -298,32 +298,36 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) // The hash values were determined by running the following Scala code in Apache Spark: // import org.apache.spark.sql.catalyst.util.DateTimeUtils // val schema = new StructType().add("structs", new StructType().add("a",IntegerType) - // .add("b",StringType).add("c",new StructType().add("x",FloatType).add("y",LongType))) + // .add("b",StringType).add("c",new StructType().add("x",FloatType).add("y",LongType))) // .add("strings",StringType).add("doubles",DoubleType).add("timestamps",TimestampType) // .add("decimal64", DecimalType(18,7)).add("longs",LongType).add("floats",FloatType) // .add("dates",DateType).add("decimal32", DecimalType(9,3)).add("ints",IntegerType) // .add("shorts",ShortType).add("bytes",ByteType).add("bools",BooleanType) + // .add("decimal128", DecimalType(38,11)) // val data = Seq( // Row(Row(0, "a", Row(0f, 0L)), "", 0.toDouble, DateTimeUtils.toJavaTimestamp(0), BigDecimal(0), // 0.toLong, 0.toFloat, DateTimeUtils.toJavaDate(0), BigDecimal(0), 0, 0.toShort, 0.toByte, - // false), + // false, BigDecimal(0)), // Row(Row(100, "bc", Row(100f, 100L)), "The quick brown fox", -(0.toDouble), // DateTimeUtils.toJavaTimestamp(100), BigDecimal("0.00001"), 100.toLong, -(0.toFloat), - // DateTimeUtils.toJavaDate(100), BigDecimal("0.1"), 100, 100.toShort, 100.toByte, true), + // DateTimeUtils.toJavaDate(100), BigDecimal("0.1"), 100, 100.toShort, 100.toByte, true, + // BigDecimal("0.000000001")), // Row(Row(-100, "def", Row(-100f, -100L)), "jumps over the lazy dog.", -Double.NaN, // DateTimeUtils.toJavaTimestamp(-100), BigDecimal("-0.00001"), -100.toLong, -Float.NaN, // DateTimeUtils.toJavaDate(-100), BigDecimal("-0.1"), -100, -100.toShort, -100.toByte, - // true), + // true, BigDecimal("-0.00000000001")), // Row(Row(0x12345678, "ghij", Row(Float.PositiveInfinity, 0x123456789abcdefL)), // "All work and no play makes Jack a dull boy", Double.MinValue, // DateTimeUtils.toJavaTimestamp(Long.MinValue/1000000), BigDecimal("-99999999999.9999999"), // Long.MinValue, Float.MinValue, DateTimeUtils.toJavaDate(Int.MinValue/100), - // BigDecimal("-999999.999"), Int.MinValue, Short.MinValue, Byte.MinValue, true), + // BigDecimal("-999999.999"), Int.MinValue, Short.MinValue, Byte.MinValue, true, + // BigDecimal("-9999999999999999.99999999999")), // Row(Row(-0x76543210, "klmno", Row(Float.NegativeInfinity, -0x123456789abcdefL)), // "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721", Double.MaxValue, // DateTimeUtils.toJavaTimestamp(Long.MaxValue/1000000), BigDecimal("99999999999.9999999"), // Long.MaxValue, Float.MaxValue, DateTimeUtils.toJavaDate(Int.MaxValue/100), - // BigDecimal("999999.999"), Int.MaxValue, Short.MaxValue, Byte.MaxValue, false)) + // BigDecimal("999999.999"), Int.MaxValue, Short.MaxValue, Byte.MaxValue, false, + // 
BigDecimal("99999999999999999999999999.99999999999"))) // val df = spark.createDataFrame(sc.parallelize(data), schema) // df.columns.foreach(c => println(s"$c => ${df.select(hash(col(c))).collect.mkString(",")}")) // df.select(hash(col("*"))).collect @@ -353,8 +357,10 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) {933211791, 751823303, -1080202046, 1110053733, 1135925485}); fixed_width_column_wrapper const hash_bools_expected( {933211791, -559580957, -559580957, -559580957, 933211791}); + fixed_width_column_wrapper const hash_decimal128_expected( + {-783713497, -295670906, 1398487324, -52622807, -1359749815}); fixed_width_column_wrapper const hash_combined_expected( - {-1172364561, -442972638, 1213234395, 796626751, 214075225}); + {401603227, 588162166, 552160517, 1132537411, -326043017}); using double_limits = std::numeric_limits; using long_limits = std::numeric_limits; @@ -394,6 +400,13 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) fixed_width_column_wrapper const bytes_col({0, 100, -100, -128, 127}); fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0}); fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0}); + fixed_point_column_wrapper<__int128_t> const decimal128_col( + {static_cast<__int128>(0), + static_cast<__int128>(100), + static_cast<__int128>(-1), + (static_cast<__int128>(0xFFFFFFFFFCC4D1C3u) << 64 | 0x602F7FC318000001u), + (static_cast<__int128>(0x0785EE10D5DA46D9u) << 64 | 0x00F4369FFFFFFFFFu)}, + numeric::scale_type{-11}); constexpr auto hasher = cudf::hash_id::HASH_SPARK_MURMUR3; auto const hash_structs = cudf::hash(cudf::table_view({structs_col}), hasher, 42); @@ -410,6 +423,7 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) auto const hash_bytes = cudf::hash(cudf::table_view({bytes_col}), hasher, 42); auto const hash_bools1 = cudf::hash(cudf::table_view({bools_col1}), hasher, 42); auto const hash_bools2 = cudf::hash(cudf::table_view({bools_col2}), hasher, 42); + auto const hash_decimal128 = cudf::hash(cudf::table_view({decimal128_col}), hasher, 42); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_structs, hash_structs_expected, verbosity); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_strings, hash_strings_expected, verbosity); @@ -425,6 +439,7 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bytes, hash_bytes_expected, verbosity); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools1, hash_bools_expected, verbosity); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools2, hash_bools_expected, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_decimal128, hash_decimal128_expected, verbosity); auto const combined_table = cudf::table_view({structs_col, strings_col, @@ -438,7 +453,8 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) ints_col, shorts_col, bytes_col, - bools_col2}); + bools_col2, + decimal128_col}); auto const hash_combined = cudf::hash(combined_table, hasher, 42); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_combined, hash_combined_expected, verbosity); } From d5f1aed8903ed3ad0e49d7852d3fdf0cfb7f376f Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Thu, 20 Jan 2022 06:20:21 -0600 Subject: [PATCH 04/14] Add in support for NULL_LOGICAL_AND and NULL_LOGICAL_OR binops (#10016) These already exist as a part of the AST. Spark's AND/OR implementations follow these requirements and to be able to re-implement it using existing CUDF functionality ended up being very expensive. We found that this one change could cut almost 13% off the total run time on TPC-DS query 28. 
AND/OR are common enough in all queries we expect this to have a major performance impact generally. We tried to use the AST version instead, but depending on the hardware used the overhead of AST does not pay for itself when the input/intermediate outputs are boolean columns. It appears to be because the amount of memory transfers saved is relatively small in most boolean cases and on large GPUs like the a100 the intermediate results might even fit entirely in the L2 cache. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Jason Lowe (https://github.com/jlowe) - Conor Hoekstra (https://github.com/codereport) - Jake Hemstad (https://github.com/jrhemstad) - Jim Brennan (https://github.com/jbrennan333) URL: https://github.com/rapidsai/cudf/pull/10016 --- cpp/CMakeLists.txt | 4 +- cpp/include/cudf/binaryop.hpp | 8 +- cpp/src/binaryop/binaryop.cpp | 5 +- cpp/src/binaryop/compiled/NullLogicalAnd.cu | 26 ++++++ cpp/src/binaryop/compiled/NullLogicalOr.cu | 26 ++++++ cpp/src/binaryop/compiled/binary_ops.cu | 4 +- cpp/src/binaryop/compiled/binary_ops.cuh | 6 +- cpp/src/binaryop/compiled/operation.cuh | 34 +++++++- cpp/src/binaryop/compiled/util.cpp | 9 +- cpp/tests/binaryop/binop-compiled-test.cpp | 86 +++++++++++++------ cpp/tests/binaryop/util/operation.h | 44 +++++++++- .../main/java/ai/rapids/cudf/BinaryOp.java | 6 +- .../java/ai/rapids/cudf/BinaryOpTest.java | 42 ++++++++- 13 files changed, 259 insertions(+), 41 deletions(-) create mode 100644 cpp/src/binaryop/compiled/NullLogicalAnd.cu create mode 100644 cpp/src/binaryop/compiled/NullLogicalOr.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2f51f582e12..4db9f6de4d5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -186,6 +186,8 @@ add_library( src/binaryop/compiled/Mod.cu src/binaryop/compiled/Mul.cu src/binaryop/compiled/NullEquals.cu + src/binaryop/compiled/NullLogicalOr.cu + src/binaryop/compiled/NullLogicalAnd.cu src/binaryop/compiled/NullMax.cu src/binaryop/compiled/NullMin.cu src/binaryop/compiled/PMod.cu diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index a514010c1f0..daf55c0befe 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -72,7 +72,11 @@ enum class binary_operator : int32_t { ///< operand when one is null; or invalid when both are null GENERIC_BINARY, ///< generic binary operator to be generated with input ///< ptx code - INVALID_BINARY ///< invalid operation + NULL_LOGICAL_AND, ///< operator && with Spark rules: (null, null) is null, (null, true) is null, + ///< (null, false) is false, and (valid, valid) == LOGICAL_AND(valid, valid) + NULL_LOGICAL_OR, ///< operator || with Spark rules: (null, null) is null, (null, true) is true, + ///< (null, false) is null, and (valid, valid) == LOGICAL_OR(valid, valid) + INVALID_BINARY ///< invalid operation }; /** * @brief Performs a binary operation between a scalar and a column. diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 7087b71a84e..5f9ff2574e3 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. * Copyright 2018 Christian Noboa Mardini @@ -74,7 +74,8 @@ rmm::device_buffer scalar_col_valid_mask_and(column_view const& col, inline bool is_null_dependent(binary_operator op) { return op == binary_operator::NULL_EQUALS || op == binary_operator::NULL_MIN || - op == binary_operator::NULL_MAX; + op == binary_operator::NULL_MAX || op == binary_operator::NULL_LOGICAL_AND || + op == binary_operator::NULL_LOGICAL_OR; } /** diff --git a/cpp/src/binaryop/compiled/NullLogicalAnd.cu b/cpp/src/binaryop/compiled/NullLogicalAnd.cu new file mode 100644 index 00000000000..48ae125bc93 --- /dev/null +++ b/cpp/src/binaryop/compiled/NullLogicalAnd.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} // namespace cudf::binops::compiled diff --git a/cpp/src/binaryop/compiled/NullLogicalOr.cu b/cpp/src/binaryop/compiled/NullLogicalOr.cu new file mode 100644 index 00000000000..e0ea95ac3ee --- /dev/null +++ b/cpp/src/binaryop/compiled/NullLogicalOr.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} // namespace cudf::binops::compiled diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index efa8cdca2cc..1d12fac1938 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -339,6 +339,8 @@ case binary_operator::PMOD: apply_binary_op(out, lhs, case binary_operator::NULL_EQUALS: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; case binary_operator::NULL_MAX: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; case binary_operator::NULL_MIN: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::NULL_LOGICAL_AND: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::NULL_LOGICAL_OR: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; default:; } // clang-format on diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index dc1cae82796..9b3e33f491e 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -103,6 +103,8 @@ struct ops_wrapper { type_dispatcher(rhs.type(), type_casted_accessor{}, i, rhs, is_rhs_scalar); auto result = [&]() { if constexpr (std::is_same_v or + std::is_same_v or + std::is_same_v or std::is_same_v or std::is_same_v) { bool output_valid = false; @@ -150,6 +152,8 @@ struct ops2_wrapper { TypeRhs y = rhs.element(is_rhs_scalar ? 0 : i); auto result = [&]() { if constexpr (std::is_same_v or + std::is_same_v or + std::is_same_v or std::is_same_v or std::is_same_v) { bool output_valid = false; diff --git a/cpp/src/binaryop/compiled/operation.cuh b/cpp/src/binaryop/compiled/operation.cuh index 75507d055e0..313fc34567d 100644 --- a/cpp/src/binaryop/compiled/operation.cuh +++ b/cpp/src/binaryop/compiled/operation.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -415,6 +415,38 @@ struct NullMin { -> decltype(static_cast(static_cast(x) < static_cast(y) ? 
x : y)); }; +struct NullLogicalAnd { + template + __device__ inline auto operator()( + TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) -> decltype(x && y) + { + bool lhs_false = lhs_valid && !x; + bool rhs_false = rhs_valid && !y; + bool both_valid = lhs_valid && rhs_valid; + output_valid = lhs_false || rhs_false || both_valid; + return both_valid && !lhs_false && !rhs_false; + } + // To allow std::is_invocable_v = true + template + __device__ inline auto operator()(TypeLhs x, TypeRhs y) -> decltype(x && y); +}; + +struct NullLogicalOr { + template + __device__ inline auto operator()( + TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) -> decltype(x || y) + { + bool lhs_true = lhs_valid && x; + bool rhs_true = rhs_valid && y; + bool both_valid = lhs_valid && rhs_valid; + output_valid = lhs_true || rhs_true || both_valid; + return lhs_true || rhs_true; + } + // To allow std::is_invocable_v = true + template + __device__ inline auto operator()(TypeLhs x, TypeRhs y) -> decltype(x || y); +}; + } // namespace ops } // namespace compiled } // namespace binops diff --git a/cpp/src/binaryop/compiled/util.cpp b/cpp/src/binaryop/compiled/util.cpp index f89941a3d68..146e53aae59 100644 --- a/cpp/src/binaryop/compiled/util.cpp +++ b/cpp/src/binaryop/compiled/util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -71,8 +71,9 @@ struct is_binary_operation_supported { if constexpr (has_common_type_v) { using common_t = std::common_type_t; return std::is_invocable_v; - } else + } else { return std::is_invocable_v; + } } else { return false; } @@ -166,6 +167,10 @@ struct is_supported_operation_functor { case binary_operator::LESS_EQUAL: return bool_op(out); case binary_operator::GREATER_EQUAL: return bool_op(out); case binary_operator::NULL_EQUALS: return bool_op(out); + case binary_operator::NULL_LOGICAL_AND: + return bool_op(out); + case binary_operator::NULL_LOGICAL_OR: + return bool_op(out); default: return type_dispatcher(out, nested_support_functor{}, op); } return false; diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp index 0339d52dda9..00408741653 100644 --- a/cpp/tests/binaryop/binop-compiled-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -475,6 +475,64 @@ TYPED_TEST(BinaryOperationCompiledTest_Logical, LogicalOr_Vector_Vector) this->template test(cudf::binary_operator::LOGICAL_OR); } +template +using column_wrapper = std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; + +template +auto NullOp_Result(column_view lhs, column_view rhs) +{ + auto [lhs_data, lhs_mask] = cudf::test::to_host(lhs); + auto [rhs_data, rhs_mask] = cudf::test::to_host(rhs); + std::vector result(lhs.size()); + std::vector result_mask; + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(lhs.size()), + result.begin(), + [&lhs_data, &lhs_mask, &rhs_data, &rhs_mask, &result_mask](auto i) -> TypeOut { + auto lhs_valid = lhs_mask.data() and cudf::bit_is_set(lhs_mask.data(), i); + auto rhs_valid = rhs_mask.data() and cudf::bit_is_set(rhs_mask.data(), i); + bool output_valid = lhs_valid or rhs_valid; + auto result = OP{}(lhs_data[i], rhs_data[i], lhs_valid, rhs_valid, output_valid); + result_mask.push_back(output_valid); + return result; + }); + return column_wrapper(result.cbegin(), result.cend(), result_mask.cbegin()); +} + +TYPED_TEST(BinaryOperationCompiledTest_Logical, NullLogicalAnd_Vector_Vector) +{ + using TypeOut = bool; + using TypeLhs = typename TestFixture::TypeLhs; + using TypeRhs = typename TestFixture::TypeRhs; + using NULL_AND = cudf::library::operation::NullLogicalAnd; + + auto lhs = lhs_random_column(col_size); + auto rhs = rhs_random_column(col_size); + auto const expected = NullOp_Result(lhs, rhs); + + auto const result = cudf::binary_operation( + lhs, rhs, cudf::binary_operator::NULL_LOGICAL_AND, data_type(type_to_id())); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(BinaryOperationCompiledTest_Logical, NullLogicalOr_Vector_Vector) +{ + using TypeOut = bool; + using TypeLhs = typename TestFixture::TypeLhs; + using TypeRhs = typename TestFixture::TypeRhs; + using NULL_OR = cudf::library::operation::NullLogicalOr; + + auto lhs = lhs_random_column(col_size); + auto rhs = rhs_random_column(col_size); + auto const expected = NullOp_Result(lhs, rhs); + + auto const result = cudf::binary_operation( + lhs, rhs, cudf::binary_operator::NULL_LOGICAL_OR, data_type(type_to_id())); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + // Comparison Operations ==, !=, <, >, <=, >= // nn, tt, dd, ss, dcdc using Comparison_types = cudf::test::Types, @@ -554,32 +612,6 @@ struct BinaryOperationCompiledTest_NullOps : public BinaryOperationCompiledTest< }; TYPED_TEST_SUITE(BinaryOperationCompiledTest_NullOps, Null_types); -template -using column_wrapper = std::conditional_t, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>; - -template -auto NullOp_Result(column_view lhs, column_view rhs) -{ - auto [lhs_data, lhs_mask] = cudf::test::to_host(lhs); - auto [rhs_data, rhs_mask] = cudf::test::to_host(rhs); - std::vector result(lhs.size()); - std::vector result_mask; - std::transform(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(lhs.size()), - result.begin(), - [&lhs_data, &lhs_mask, &rhs_data, &rhs_mask, &result_mask](auto i) -> TypeOut { - auto lhs_valid = lhs_mask.data() and cudf::bit_is_set(lhs_mask.data(), i); - auto rhs_valid = rhs_mask.data() and cudf::bit_is_set(rhs_mask.data(), i); - bool output_valid = lhs_valid or rhs_valid; - auto result = OP{}(lhs_data[i], rhs_data[i], lhs_valid, rhs_valid, output_valid); - result_mask.push_back(output_valid); - return result; - }); - return 
column_wrapper(result.cbegin(), result.cend(), result_mask.cbegin()); -} - TYPED_TEST(BinaryOperationCompiledTest_NullOps, NullEquals_Vector_Vector) { using TypeOut = bool; diff --git a/cpp/tests/binaryop/util/operation.h b/cpp/tests/binaryop/util/operation.h index 481e5cfd4a9..22802580cd0 100644 --- a/cpp/tests/binaryop/util/operation.h +++ b/cpp/tests/binaryop/util/operation.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. * Copyright 2018 Christian Noboa Mardini @@ -323,6 +323,48 @@ struct PyMod { } }; +template +struct NullLogicalAnd { + TypeOut operator()(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) const + { + if (lhs_valid && !x) { + output_valid = true; + return false; + } + if (rhs_valid && !y) { + output_valid = true; + return false; + } + if (lhs_valid && rhs_valid) { + output_valid = true; + return true; + } + output_valid = false; + return false; + } +}; + +template +struct NullLogicalOr { + TypeOut operator()(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) const + { + if (lhs_valid && x) { + output_valid = true; + return true; + } + if (rhs_valid && y) { + output_valid = true; + return true; + } + if (lhs_valid && rhs_valid) { + output_valid = true; + return false; + } + output_valid = false; + return false; + } +}; + template struct NullEquals { TypeOut operator()(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) const diff --git a/java/src/main/java/ai/rapids/cudf/BinaryOp.java b/java/src/main/java/ai/rapids/cudf/BinaryOp.java index 8b58d8383b4..15b8d32d6da 100644 --- a/java/src/main/java/ai/rapids/cudf/BinaryOp.java +++ b/java/src/main/java/ai/rapids/cudf/BinaryOp.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2020,2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,8 +49,10 @@ public enum BinaryOp { GREATER_EQUAL(25), // >= NULL_EQUALS(26), // like EQUAL but NULL == NULL is TRUE and NULL == not NULL is FALSE NULL_MAX(27), // MAX but NULL < not NULL - NULL_MIN(28); // MIN but NULL > not NULL + NULL_MIN(28), // MIN but NULL > not NULL //NOT IMPLEMENTED YET GENERIC_BINARY(29); + NULL_LOGICAL_AND(30), + NULL_LOGICAL_OR(31); static final EnumSet COMPARISON = EnumSet.of( diff --git a/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java b/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java index 0ca997d3c80..862f3860d3d 100644 --- a/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java +++ b/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -1363,6 +1363,46 @@ public void testBitXor() { } } + @Test + public void testNullAnd() { + try (ColumnVector icv1 = ColumnVector.fromBoxedBooleans( + true, true, true, + false, false, false, + null, null, null); + ColumnVector icv2 = ColumnVector.fromBoxedBooleans( + true, false, null, + true, false, null, + true, false, null)) { + try (ColumnVector answer = icv1.binaryOp(BinaryOp.NULL_LOGICAL_AND, icv2, DType.BOOL8); + ColumnVector expected = ColumnVector.fromBoxedBooleans( + true, false, null, + false, false, false, + null, false, null)) { + assertColumnsAreEqual(expected, answer, "boolean NULL AND boolean"); + } + } + } + + @Test + public void testNullOr() { + try (ColumnVector icv1 = ColumnVector.fromBoxedBooleans( + true, true, true, + false, false, false, + null, null, null); + ColumnVector icv2 = ColumnVector.fromBoxedBooleans( + true, false, null, + true, false, null, + true, false, null)) { + try (ColumnVector answer = icv1.binaryOp(BinaryOp.NULL_LOGICAL_OR, icv2, DType.BOOL8); + ColumnVector expected = ColumnVector.fromBoxedBooleans( + true, true, true, + true, false, null, + true, null, null)) { + assertColumnsAreEqual(expected, answer, "boolean NULL OR boolean"); + } + } + } + @Test public void testAnd() { try (ColumnVector icv1 = ColumnVector.fromBoxedBooleans(BOOLEANS_1); From 690993cdc937be78849dcbf7a11a93d326d8aecc Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 20 Jan 2022 07:20:47 -0600 Subject: [PATCH 05/14] Add `struct` generation support in datagenerator & fuzz tests (#9180) Resolves: #7618 This PR adds struct dtype support in data-generator for fuzz-testing. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) URL: https://github.com/rapidsai/cudf/pull/9180 --- python/cudf/cudf/_fuzz_testing/io.py | 6 + python/cudf/cudf/_fuzz_testing/orc.py | 5 +- python/cudf/cudf/_fuzz_testing/parquet.py | 2 + python/cudf/cudf/_fuzz_testing/utils.py | 87 +++++++++-- python/cudf/cudf/testing/dataset_generator.py | 144 ++++++++++++++++-- 5 files changed, 223 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/_fuzz_testing/io.py b/python/cudf/cudf/_fuzz_testing/io.py index 1312300f714..193fb4c7f7f 100644 --- a/python/cudf/cudf/_fuzz_testing/io.py +++ b/python/cudf/cudf/_fuzz_testing/io.py @@ -25,6 +25,9 @@ def __init__( max_string_length=None, max_lists_length=None, max_lists_nesting_depth=None, + max_structs_nesting_depth=None, + max_struct_null_frequency=None, + max_struct_types_at_each_level=None, ): dirs = [] if dirs is None else dirs self._inputs = [] @@ -33,6 +36,9 @@ def __init__( self._max_string_length = max_string_length self._max_lists_length = max_lists_length self._max_lists_nesting_depth = max_lists_nesting_depth + self._max_structs_nesting_depth = max_structs_nesting_depth + self._max_struct_null_frequency = max_struct_null_frequency + self._max_struct_types_at_each_level = max_struct_types_at_each_level for i, path in enumerate(dirs): if i == 0 and not os.path.exists(path): diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index 2aa01eb3967..78e01fb76a4 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -83,7 +83,10 @@ def generate_input(self): self._df = df file_obj = io.BytesIO() pandas_to_orc( - df, file_io_obj=file_obj, stripe_size=self._rand(len(df)) + df, + file_io_obj=file_obj, + stripe_size=self._rand(len(df)), + arrow_table_schema=table.schema, ) 
file_obj.seek(0) buf = file_obj.read() diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py index 5b00f96d88d..859d09b407f 100644 --- a/python/cudf/cudf/_fuzz_testing/parquet.py +++ b/python/cudf/cudf/_fuzz_testing/parquet.py @@ -59,6 +59,7 @@ def generate_input(self): - {"uint32"} | {"list", "decimal64"} ) + dtypes_meta, num_rows, num_cols = _generate_rand_meta( self, dtypes_list ) @@ -80,6 +81,7 @@ def generate_input(self): # https://issues.apache.org/jira/browse/ARROW-10123 # file = io.BytesIO() + df.to_parquet("temp_file") # file.seek(0) # self._current_buffer = copy.copy(file.read()) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index ff5870c50be..87a8fc46374 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -6,6 +6,7 @@ import fastavro import numpy as np import pandas as pd +import pyarrow as pa import pyorc import cudf @@ -114,6 +115,26 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): meta["value_type"] = random.choice( list(cudf.utils.dtypes.ALL_TYPES - {"category"}) ) + elif dtype == "struct": + if obj._max_lists_nesting_depth is None: + meta["nesting_max_depth"] = np.random.randint(2, 10) + else: + meta["nesting_max_depth"] = obj._max_lists_nesting_depth + + if obj._max_struct_null_frequency is None: + meta["max_null_frequency"] = random.uniform(0, 1) + else: + meta["max_null_frequency"] = obj._max_struct_null_frequency + + if obj._max_struct_types_at_each_level is None: + meta["max_types_at_each_level"] = np.random.randint( + low=1, high=10 + ) + else: + meta[ + "max_types_at_each_level" + ] = obj._max_struct_types_at_each_level + elif dtype == "decimal64": meta["max_precision"] = cudf.Decimal64Dtype.MAX_PRECISION elif dtype == "decimal32": @@ -161,6 +182,8 @@ def pyarrow_to_pandas(table): df[column._name] = pd.Series( column, dtype=pyarrow_dtypes_to_pandas_dtypes[column.type] ) + elif isinstance(column.type, pa.StructType): + df[column._name] = column.to_pandas(integer_object_nulls=True) else: df[column._name] = column.to_pandas() @@ -196,6 +219,14 @@ def get_orc_dtype_info(dtype): ) +def get_arrow_dtype_info_for_pyorc(dtype): + if isinstance(dtype, pa.StructType): + return get_orc_schema(df=None, arrow_table_schema=dtype) + else: + pd_dtype = cudf.dtype(dtype.to_pandas_dtype()) + return get_orc_dtype_info(pd_dtype) + + def get_avro_schema(df): fields = [ {"name": col_name, "type": get_avro_dtype_info(col_dtype)} @@ -205,11 +236,17 @@ def get_avro_schema(df): return schema -def get_orc_schema(df): - ordered_dict = OrderedDict( - (col_name, get_orc_dtype_info(col_dtype)) - for col_name, col_dtype in df.dtypes.items() - ) +def get_orc_schema(df, arrow_table_schema=None): + if arrow_table_schema is None: + ordered_dict = OrderedDict( + (col_name, get_orc_dtype_info(col_dtype)) + for col_name, col_dtype in df.dtypes.items() + ) + else: + ordered_dict = OrderedDict( + (field.name, get_arrow_dtype_info_for_pyorc(field.type)) + for field in arrow_table_schema + ) schema = pyorc.Struct(**ordered_dict) return schema @@ -255,13 +292,25 @@ def pandas_to_avro(df, file_name=None, file_io_obj=None): fastavro.writer(file_io_obj, avro_schema, records) -def _preprocess_to_orc_tuple(df): +def _preprocess_to_orc_tuple(df, arrow_table_schema): def _null_to_None(value): if value is pd.NA or value is pd.NaT: return None else: return value + def sanitize(value, struct_type): + if value is None: + return None + + values_list = [] 
+ for name, sub_type in struct_type.fields.items(): + if isinstance(sub_type, cudf.StructDtype): + values_list.append(sanitize(value[name], sub_type)) + else: + values_list.append(value[name]) + return tuple(values_list) + has_nulls_or_nullable_dtype = any( [ True @@ -271,19 +320,35 @@ def _null_to_None(value): for col in df.columns ] ) + pdf = df.copy(deep=True) + for field in arrow_table_schema: + if isinstance(field.type, pa.StructType): + pdf[field.name] = pdf[field.name].apply( + sanitize, args=(cudf.StructDtype.from_arrow(field.type),) + ) + else: + pdf[field.name] = pdf[field.name] tuple_list = [ tuple(map(_null_to_None, tup)) if has_nulls_or_nullable_dtype else tup - for tup in df.itertuples(index=False, name=None) + for tup in pdf.itertuples(index=False, name=None) ] - return tuple_list + return tuple_list, pdf, df -def pandas_to_orc(df, file_name=None, file_io_obj=None, stripe_size=67108864): - schema = get_orc_schema(df) +def pandas_to_orc( + df, + file_name=None, + file_io_obj=None, + stripe_size=67108864, + arrow_table_schema=None, +): + schema = get_orc_schema(df, arrow_table_schema=arrow_table_schema) - tuple_list = _preprocess_to_orc_tuple(df) + tuple_list, pdf, df = _preprocess_to_orc_tuple( + df, arrow_table_schema=arrow_table_schema + ) if file_name is not None: with open(file_name, "wb") as data: diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 13be158ed78..e1c7b42c7a3 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -133,7 +133,25 @@ def _generate_column(column_params, num_rows): else: arrow_type = None - if not isinstance(arrow_type, pa.lib.Decimal128Type): + if isinstance(column_params.dtype, cudf.StructDtype): + vals = pa.StructArray.from_arrays( + column_params.generator, + names=column_params.dtype.fields.keys(), + mask=pa.array( + np.random.choice( + [True, False], + size=num_rows, + p=[ + column_params.null_frequency, + 1 - column_params.null_frequency, + ], + ) + ) + if column_params.null_frequency > 0.0 + else None, + ) + return vals + elif not isinstance(arrow_type, pa.lib.Decimal128Type): vals = pa.array( column_params.generator, size=column_params.cardinality, @@ -352,6 +370,30 @@ def rand_dataframe( dtype=dtype, ) ) + elif dtype == "struct": + nesting_max_depth = meta["nesting_max_depth"] + max_types_at_each_level = meta["max_types_at_each_level"] + max_null_frequency = meta["max_null_frequency"] + nesting_depth = np.random.randint(1, nesting_max_depth) + structDtype = create_nested_struct_type( + max_types_at_each_level=max_types_at_each_level, + nesting_level=nesting_depth, + ) + + column_params.append( + ColumnParameters( + cardinality=cardinality, + null_frequency=null_frequency, + generator=struct_generator( + dtype=structDtype, + cardinality=cardinality, + size=rows, + max_null_frequency=max_null_frequency, + ), + is_sorted=False, + dtype=structDtype, + ) + ) elif dtype == "decimal64": max_precision = meta.get( "max_precision", cudf.Decimal64Dtype.MAX_PRECISION @@ -600,11 +642,15 @@ def decimal_generator(dtype, size): ) -def get_values_for_nested_data(dtype, lists_max_length): +def get_values_for_nested_data(dtype, lists_max_length=None, size=None): """ Returns list of values based on dtype. 
""" - cardinality = np.random.randint(0, lists_max_length) + if size is None: + cardinality = np.random.randint(0, lists_max_length) + else: + cardinality = size + dtype = cudf.dtype(dtype) if dtype.kind in ("i", "u"): values = int_generator(dtype=dtype, size=cardinality)() @@ -628,12 +674,7 @@ def get_values_for_nested_data(dtype, lists_max_length): else: raise TypeError(f"Unsupported dtype: {dtype}") - # To ensure numpy arrays are not passed as input to - # list constructor, returning a python list object here. - if isinstance(values, np.ndarray): - return values.tolist() - else: - return values + return values def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): @@ -657,9 +698,40 @@ def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): top_level_list = get_values_for_nested_data( dtype=dtype, lists_max_length=lists_max_length ) + # To ensure numpy arrays are not passed as input to + # list constructor, returning a python list object here. + if isinstance(top_level_list, np.ndarray): + top_level_list = top_level_list.tolist() + return top_level_list +def make_array_for_struct(dtype, cardinality, size, max_null_frequency): + """ + Helper to create a pa.array with `size` and `dtype` + for a `StructArray`. + """ + + null_frequency = np.random.uniform(low=0, high=max_null_frequency) + local_cardinality = max(np.random.randint(low=0, high=cardinality), 1) + data = get_values_for_nested_data( + dtype=dtype.type.to_pandas_dtype(), size=local_cardinality + ) + vals = np.random.choice(data, size=size) + + return pa.array( + vals, + mask=np.random.choice( + [True, False], size=size, p=[null_frequency, 1 - null_frequency], + ) + if null_frequency > 0.0 + else None, + size=size, + safe=False, + type=dtype.type, + ) + + def get_nested_lists(dtype, size, nesting_depth, lists_max_length): """ Returns a list of nested lists with random nesting @@ -680,6 +752,34 @@ def get_nested_lists(dtype, size, nesting_depth, lists_max_length): return list_of_lists +def get_nested_structs(dtype, cardinality, size, max_null_frequency): + """ + Returns a list of arrays with random data + corresponding to the dtype provided. 
+ ``dtype`` here should be a ``cudf.StructDtype`` + """ + list_of_arrays = [] + + for name, col_dtype in dtype.fields.items(): + if isinstance(col_dtype, cudf.StructDtype): + result_arrays = get_nested_structs( + col_dtype, cardinality, size, max_null_frequency + ) + result_arrays = pa.StructArray.from_arrays( + result_arrays, names=col_dtype.fields.keys() + ) + else: + result_arrays = make_array_for_struct( + dtype=dtype._typ[name], + cardinality=cardinality, + size=size, + max_null_frequency=max_null_frequency, + ) + list_of_arrays.append(result_arrays) + + return list_of_arrays + + def list_generator(dtype, size, nesting_depth, lists_max_length): """ Generator for list data @@ -690,3 +790,29 @@ def list_generator(dtype, size, nesting_depth, lists_max_length): nesting_depth=nesting_depth, lists_max_length=lists_max_length, ) + + +def struct_generator(dtype, cardinality, size, max_null_frequency): + """ + Generator for struct data + """ + return lambda: get_nested_structs( + dtype=dtype, + cardinality=cardinality, + size=size, + max_null_frequency=max_null_frequency, + ) + + +def create_nested_struct_type(max_types_at_each_level, nesting_level): + dtypes_list = cudf.utils.dtypes.ALL_TYPES + picked_types = np.random.choice(list(dtypes_list), max_types_at_each_level) + type_dict = {} + for name, type_ in enumerate(picked_types): + if type_ == "struct": + type_dict[str(name)] = create_nested_struct_type( + max_types_at_each_level, nesting_level - 1 + ) + else: + type_dict[str(name)] = cudf.dtype(type_) + return cudf.StructDtype(type_dict) From 2bd7320c0097aa08033a68bbca41632315a5e58c Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 20 Jan 2022 05:21:57 -0800 Subject: [PATCH 06/14] Add `_from_column_like_self` factory (#10022) Follow up to #9558 On a return trip from libcudf, it is a common pattern for cudf frame to apply its own metadata to the columns. This PR generalizes this procedure as a new factory function `_from_colums_like_self` Authors: - Michael Wang (https://github.com/isVoid) Approvers: - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) - Paul Taylor (https://github.com/trxcllnt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10022 --- python/cudf/cudf/core/_base_index.py | 18 +++++------------- python/cudf/cudf/core/frame.py | 16 ++++++++++++++++ python/cudf/cudf/core/indexed_frame.py | 17 ++++------------- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index be5a1e7cc93..b1335c7c076 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1444,7 +1444,8 @@ def drop_duplicates( """ # This utilizes the fact that all `Index` is also a `Frame`. - result = self.__class__._from_columns( + # Except RangeIndex. 
+ return self._from_columns_like_self( drop_duplicates( list(self._columns), keys=range(len(self._data)), @@ -1453,8 +1454,6 @@ def drop_duplicates( ), self._column_names, ) - result._copy_type_metadata(self, include_index=False) - return result def dropna(self, how="any"): """ @@ -1476,12 +1475,10 @@ def dropna(self, how="any"): for col in self._columns ] - result = self.__class__._from_columns( + return self._from_columns_like_self( drop_nulls(data_columns, how=how, keys=range(len(data_columns)),), self._column_names, ) - result._copy_type_metadata(self, include_index=False) - return result def _gather(self, gather_map, nullify=False, check_bounds=True): """Gather rows of index specified by indices in `gather_map`. @@ -1501,14 +1498,11 @@ def _gather(self, gather_map, nullify=False, check_bounds=True): ): raise IndexError("Gather map index is out of bounds.") - result = self.__class__._from_columns( + return self._from_columns_like_self( gather(list(self._columns), gather_map, nullify=nullify), self._column_names, ) - result._copy_type_metadata(self, include_index=False) - return result - def take(self, indices, axis=0, allow_fill=True, fill_value=None): """Return a new index containing the rows specified by *indices* @@ -1561,12 +1555,10 @@ def _apply_boolean_mask(self, boolean_mask): if not is_bool_dtype(boolean_mask.dtype): raise ValueError("boolean_mask is not boolean type.") - result = self.__class__._from_columns( + return self._from_columns_like_self( apply_boolean_mask(list(self._columns), boolean_mask), column_names=self._column_names, ) - result._copy_type_metadata(self) - return result def _split_columns_by_levels(self, levels): if isinstance(levels, int) and levels > 0: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 1d59d9f3b1a..69dc5389e7a 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -163,6 +163,22 @@ def _from_columns( return cls._from_data(data, index) + def _from_columns_like_self( + self, + columns: List[ColumnBase], + column_names: List[str], + index_names: Optional[List[str]] = None, + ): + """Construct a `Frame` from a list of columns with metadata from self. + + If `index_names` is set, the first `len(index_names)` columns are + used to construct the index of the frame. 
+ """ + frame = self.__class__._from_columns( + columns, column_names, index_names + ) + return frame._copy_type_metadata(self, include_index=bool(index_names)) + def _mimic_inplace( self: T, result: Frame, inplace: bool = False ) -> Optional[Frame]: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 9458057894a..e9f2de1cb1c 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -556,7 +556,7 @@ def _gather( ): raise IndexError("Gather map index is out of bounds.") - result = self.__class__._from_columns( + return self._from_columns_like_self( libcudf.copying.gather( list(self._index._columns + self._columns) if keep_index @@ -568,9 +568,6 @@ def _gather( self._index.names if keep_index else None, ) - result._copy_type_metadata(self, include_index=keep_index) - return result - def _positions_from_column_names( self, column_names, offset_by_index_columns=False ): @@ -628,7 +625,7 @@ def drop_duplicates( keys = self._positions_from_column_names( subset, offset_by_index_columns=not ignore_index ) - result = self.__class__._from_columns( + return self._from_columns_like_self( libcudf.stream_compaction.drop_duplicates( list(self._columns) if ignore_index @@ -640,8 +637,6 @@ def drop_duplicates( self._column_names, self._index.names if not ignore_index else None, ) - result._copy_type_metadata(self) - return result def add_prefix(self, prefix): """ @@ -1354,7 +1349,7 @@ def _drop_na_rows( for col in self._columns ] - result = self.__class__._from_columns( + return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( list(self._index._data.columns) + data_columns, how=how, @@ -1366,8 +1361,6 @@ def _drop_na_rows( self._column_names, self._index.names, ) - result._copy_type_metadata(self) - return result def _apply_boolean_mask(self, boolean_mask): """Apply boolean mask to each row of `self`. @@ -1378,15 +1371,13 @@ def _apply_boolean_mask(self, boolean_mask): if not is_bool_dtype(boolean_mask.dtype): raise ValueError("boolean_mask is not boolean type.") - result = self.__class__._from_columns( + return self._from_columns_like_self( libcudf.stream_compaction.apply_boolean_mask( list(self._index._columns + self._columns), boolean_mask ), column_names=self._column_names, index_names=self._index.names, ) - result._copy_type_metadata(self) - return result def take(self, indices, axis=0): """Return a new frame containing the rows specified by *indices*. From e78f47ae6cd19501d0875595b82f8618278ca4eb Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Thu, 20 Jan 2022 08:22:17 -0500 Subject: [PATCH 07/14] Add `groupby.transform` (only support for aggregations) (#10005) Closes https://github.com/rapidsai/cudf/issues/4522 This PR adds support for doing groupby aggregations via the `transform()` API, where the result of the aggregation is broadcasted to the size of the group. Note that more general transformations are not supported at this time. 
Authors:
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - Michael Wang (https://github.com/isVoid)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/10005
---
 docs/cudf/source/api_docs/groupby.rst    |  1 +
 docs/cudf/source/basics/groupby.rst      | 23 +++++++++
 python/cudf/cudf/core/groupby/groupby.py | 64 +++++++++++++++++++++++-
 python/cudf/cudf/tests/test_groupby.py   | 22 ++++++++
 4 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/docs/cudf/source/api_docs/groupby.rst b/docs/cudf/source/api_docs/groupby.rst
index 575d7442cdf..190978a7581 100644
--- a/docs/cudf/source/api_docs/groupby.rst
+++ b/docs/cudf/source/api_docs/groupby.rst
@@ -34,6 +34,7 @@ Function application
    SeriesGroupBy.aggregate
    DataFrameGroupBy.aggregate
    GroupBy.pipe
+   GroupBy.transform
 
 Computations / descriptive stats
 --------------------------------
diff --git a/docs/cudf/source/basics/groupby.rst b/docs/cudf/source/basics/groupby.rst
index f3269768025..cbc8f7e712f 100644
--- a/docs/cudf/source/basics/groupby.rst
+++ b/docs/cudf/source/basics/groupby.rst
@@ -1,3 +1,5 @@
+.. _basics.groupby:
+
 GroupBy
 =======
 
@@ -220,6 +222,27 @@ Limitations
 .. |describe| replace:: ``describe``
 .. _describe: https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#flexible-apply
 
+
+Transform
+---------
+
+The ``.transform()`` method aggregates per group, and broadcasts the
+result to the group size, resulting in a Series/DataFrame that is of
+the same size as the input Series/DataFrame.
+
+.. code:: python
+
+    >>> import cudf
+    >>> df = cudf.DataFrame({'a': [2, 1, 1, 2, 2], 'b': [1, 2, 3, 4, 5]})
+    >>> df.groupby('a').transform('max')
+       b
+    0  5
+    1  3
+    2  3
+    3  5
+    4  5
+
+
 Rolling window calculations
 ---------------------------
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 6da98bf980d..a393d8e9457 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -184,11 +184,25 @@ def agg(self, func):
         Parameters
         ----------
         func : str, callable, list or dict
+            Argument specifying the aggregation(s) to perform on the
+            groups. `func` can be any of the following:
+
+            - string: the name of a supported aggregation
+            - callable: a function that accepts a Series/DataFrame and
+              performs a supported operation on it.
+            - list: a list of strings/callables specifying the
+              aggregations to perform on every column.
+            - dict: a mapping of column names to string/callable
+              specifying the aggregations to perform on those
+              columns.
+
+            See :ref:`the user guide <basics.groupby>` for supported
+            aggregations.
 
         Returns
         -------
         A Series or DataFrame containing the combined results of the
-        aggregation.
+        aggregation(s).
 
         Examples
         --------
@@ -655,6 +669,54 @@ def rolling_avg(val, avg):
         kwargs.update({"chunks": offsets})
         return grouped_values.apply_chunks(function, **kwargs)
 
+    def transform(self, function):
+        """Apply an aggregation, then broadcast the result to the group size.
+
+        Parameters
+        ----------
+        function: str or callable
+            Aggregation to apply to each group. Note that the set of
+            operations currently supported by `transform` is identical
+            to that supported by the `agg` method.
+
+        Returns
+        -------
+        A Series or DataFrame of the same size as the input, with the
+        result of the aggregation per group broadcasted to the group
+        size.
+
+        Examples
+        --------
+        .. code-block:: python
+
+          import cudf
+          df = cudf.DataFrame({'a': [2, 1, 1, 2, 2], 'b': [1, 2, 3, 4, 5]})
+          df.groupby('a').transform('max')
+             b
+          0  5
+          1  3
+          2  3
+          3  5
+          4  5
+
+        See also
+        --------
+        cudf.core.groupby.GroupBy.agg
+        """
+        try:
+            result = self.agg(function)
+        except TypeError as e:
+            raise NotImplementedError(
+                "Currently, `transform()` supports only aggregations."
+            ) from e
+
+        if not result.index.equals(self.grouping.keys):
+            result = result._align_to_index(
+                self.grouping.keys, how="right", allow_non_unique=True
+            )
+            result = result.reset_index(drop=True)
+        return result
+
     def rolling(self, *args, **kwargs):
         """
         Returns a `RollingGroupby` object that enables rolling window
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index c73e96de470..f5decd62ea9 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -2362,6 +2362,28 @@ def test_groupby_get_group(pdf, group, name, obj):
     assert_groupby_results_equal(expected, actual)
 
 
+@pytest.mark.parametrize(
+    "by",
+    [
+        "a",
+        ["a", "b"],
+        pd.Series([2, 1, 1, 2, 2]),
+        pd.Series(["b", "a", "a", "b", "b"]),
+    ],
+)
+@pytest.mark.parametrize("agg", ["sum", "mean", lambda df: df.mean()])
+def test_groupby_transform_aggregation(by, agg):
+    gdf = cudf.DataFrame(
+        {"a": [2, 2, 1, 2, 1], "b": [1, 1, 1, 2, 2], "c": [1, 2, 3, 4, 5]}
+    )
+    pdf = gdf.to_pandas()
+
+    expected = pdf.groupby(by).transform(agg)
+    actual = gdf.groupby(by).transform(agg)
+
+    assert_groupby_results_equal(expected, actual)
+
+
 def test_groupby_select_then_ffill():
     pdf = pd.DataFrame(
         {

From 13429ffd67367ca565380c03edd52e00b3b12495 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 20 Jan 2022 08:25:29 -0500
Subject: [PATCH 08/14] Fix matching regex word-boundary (\b) in strings replace (#9997)

Closes #9950

Fixes matching a single word-boundary (BOW) regex pattern. This pattern
matches word boundaries rather than any actual characters, which means the
`(begin,end)` position values will be equal. The replace code previously
expected a `begin < end` character range to replace; the logic has been
updated to allow for this case.

Additional gtests have been added that include a single `\b` pattern
character.
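For illustration, a rough Python-level sketch of the fixed behavior,
assuming the `Series.str.replace(..., regex=True)` path (which routes to
`replace_re`); the inputs and expected strings are illustrative and
analogous to the new WordBoundary gtest below:

```python
import cudf

s = cudf.Series(["aba bcd", "zéz"])

# A lone word-boundary pattern produces empty matches (begin == end), so the
# replacement is inserted at every boundary instead of replacing characters.
repl = s.str.replace(r"\b", "X", regex=True)
print(repl.to_arrow().to_pylist())
# ['XabaX XbcdX', 'XzézX']
```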
Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/9997
---
 cpp/src/strings/replace/replace_re.cu     | 70 +++++++++++++----
 cpp/tests/strings/replace_regex_tests.cpp | 12 +++-
 2 files changed, 52 insertions(+), 30 deletions(-)

diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu
index 9fd1768453a..2c594bb86a8 100644
--- a/cpp/src/strings/replace/replace_re.cu
+++ b/cpp/src/strings/replace/replace_re.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -62,37 +62,49 @@ struct replace_regex_fn {
       if (!d_chars) d_offsets[idx] = 0;
       return;
     }
-    auto const d_str  = d_strings.element<string_view>(idx);
-    auto const nchars = d_str.length();      // number of characters in input string
-    auto nbytes       = d_str.size_bytes();  // number of bytes in input string
-    auto mxn = maxrepl < 0 ? nchars : maxrepl;  // max possible replaces for this string
-    auto in_ptr  = d_str.data();  // input pointer (i)
-    auto out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr;  // output pointer (o)
-    size_type lpos = 0;
-    int32_t begin  = 0;
-    int32_t end    = static_cast<int32_t>(nchars);
+
+    auto const d_str = d_strings.element<string_view>(idx);
+    auto nbytes      = d_str.size_bytes();  // number of bytes in input string
+    auto mxn = maxrepl < 0 ? d_str.length() + 1 : maxrepl;  // max possible replaces for this string
+    auto in_ptr  = d_str.data();  // input pointer (i)
+    auto out_ptr = d_chars ? d_chars + d_offsets[idx]  // output pointer (o)
+                           : nullptr;
+    size_type last_pos = 0;
+    int32_t begin      = 0;   // these are for calling prog.find
+    int32_t end        = -1;  // matches final word-boundary if at the end of the string
+
     // copy input to output replacing strings as we go
-    while (mxn-- > 0)  // maximum number of replaces
-    {
-      if (prog.is_empty() || prog.find(idx, d_str, begin, end) <= 0)
-        break;                                       // no more matches
-      auto spos = d_str.byte_offset(begin);          // get offset for these
-      auto epos = d_str.byte_offset(end);            // character position values
-      nbytes += d_repl.size_bytes() - (epos - spos); // compute new size
-      if (out_ptr)                                   // replace
-      {                                              // i:bbbbsssseeee
-        out_ptr = copy_and_increment(out_ptr, in_ptr + lpos, spos - lpos);  // o:bbbb
-        out_ptr = copy_string(out_ptr, d_repl);  // o:bbbbrrrrrr
-                                                 // out_ptr ---^
-        lpos = epos;                             // i:bbbbsssseeee
-      }                                          // in_ptr --^
-      begin = end;
-      end   = static_cast<int32_t>(nchars);
+    while (mxn-- > 0) {  // maximum number of replaces
+
+      if (prog.is_empty() || prog.find(idx, d_str, begin, end) <= 0) {
+        break;  // no more matches
+      }
+
+      auto const start_pos = d_str.byte_offset(begin);        // get offset for these
+      auto const end_pos   = d_str.byte_offset(end);          // character position values
+      nbytes += d_repl.size_bytes() - (end_pos - start_pos);  // and compute new size
+
+      if (out_ptr) {  // replace:
+                      // i:bbbbsssseeee
+        out_ptr = copy_and_increment(out_ptr,                //  ^
+                                     in_ptr + last_pos,      // o:bbbb
+                                     start_pos - last_pos);  //  ^
+        out_ptr  = copy_string(out_ptr, d_repl);  // o:bbbbrrrrrr
+                                                  // out_ptr ---^
+        last_pos = end_pos;                       // i:bbbbsssseeee
+      }                                           // in_ptr --^
+
+      begin = end + (begin == end);
+      end   = -1;
     }
-    if (out_ptr)  // copy the remainder
-      memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos);  // o:bbbbrrrrrreeee
-    else
+
+    if (out_ptr) {
+      memcpy(out_ptr,                          // copy the remainder
+             in_ptr + last_pos,                // o:bbbbrrrrrreeee
+             d_str.size_bytes() - last_pos);   // ^           ^
+    } else {
       d_offsets[idx] = static_cast<size_type>(nbytes);
+    }
   }
 };
diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp
index eac06fa4588..ddbd9f5b3d6 100644
--- a/cpp/tests/strings/replace_regex_tests.cpp
+++ b/cpp/tests/strings/replace_regex_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -145,6 +145,16 @@ TEST_F(StringsReplaceRegexTest, MultiReplacement)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, input);
 }
 
+TEST_F(StringsReplaceRegexTest, WordBoundary)
+{
+  cudf::test::strings_column_wrapper input({"aba bcd\naba", "zéz", "A1B2-é3", "e é"});
+  auto results =
+    cudf::strings::replace_re(cudf::strings_column_view(input), "\\b", cudf::string_scalar("X"));
+  cudf::test::strings_column_wrapper expected(
+    {"XabaX XbcdX\nXabaX", "XzézX", "XA1B2X-Xé3X", "XeX XéX"});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+}
+
 TEST_F(StringsReplaceRegexTest, Multiline)
 {
   auto const multiline = cudf::strings::regex_flags::MULTILINE;

From 276bcf4171c82101a12b3d2392802e4746d0d2e3 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <36027403+codereport@users.noreply.github.com>
Date: Thu, 20 Jan 2022 11:05:13 -0500
Subject: [PATCH 09/14] Add `clang-tidy` to libcudf (#9860)

This PR adds clang-tidy to cudf with an initial set of checks. Note that
more checks will be enabled in the future.

Relevant PRs:
* `rmm`: https://github.com/rapidsai/rmm/pull/857
* `cuml`: https://github.com/rapidsai/cuml/pull/1945

To do list:
* [x] Add `.clang-tidy` file
* [x] Add python script
* [x] Apply `modernize-` changes
* [x] Revert `cxxopts` changes
* [x] Fixed Python parquet failures
* [x] Ignore `cxxopts` file
* [x] Ignore the `build/_deps` directories

Splitting out the following into a separate PR so we can get the changes
merged for 22.02 (https://github.com/rapidsai/cudf/pull/10064):
* ~~[ ] Disable `clang-diagnostic-errors/warnings`~~
* ~~[ ] Fix include files being skipped~~
* ~~[ ] Set up CI script~~
* ~~[ ] Clean up python script~~

Authors:
  - Conor Hoekstra (https://github.com/codereport)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)
  - Mark Harris (https://github.com/harrism)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/9860
---
 .clang-tidy | 27 ++
 .../common/generate_benchmark_input.hpp | 4 +-
 .../copying/contiguous_split_benchmark.cu | 6 +-
 cpp/benchmarks/copying/gather_benchmark.cu | 2 +-
 cpp/benchmarks/copying/scatter_benchmark.cu | 2 +-
 cpp/benchmarks/fixture/benchmark_fixture.hpp | 13 +-
 .../io/csv/csv_reader_benchmark.cpp | 4 +-
 .../io/csv/csv_writer_benchmark.cpp | 4 +-
 .../io/orc/orc_reader_benchmark.cpp | 2 +-
 .../io/orc/orc_writer_benchmark.cpp | 2 +-
 .../io/parquet/parquet_reader_benchmark.cpp | 2 +-
 .../io/parquet/parquet_writer_benchmark.cpp | 2 +-
 .../lists/copying/scatter_lists_benchmark.cu | 2 +-
 .../type_dispatcher_benchmark.cu | 6 +-
 cpp/include/cudf/aggregation.hpp | 12 +-
 .../cudf/ast/detail/expression_evaluator.cuh | 8 +-
 .../cudf/ast/detail/expression_parser.hpp | 10 +-
 cpp/include/cudf/ast/expressions.hpp | 49 ++--
 cpp/include/cudf/column/column.hpp | 19 +-
 .../cudf/column/column_device_view.cuh | 48 ++--
 cpp/include/cudf/column/column_view.hpp | 36 +--
 cpp/include/cudf/copying.hpp | 4 +-
 .../cudf/detail/aggregation/aggregation.hpp | 145 ++++++----
 .../cudf/detail/aggregation/result_cache.hpp | 4 +-
 cpp/include/cudf/detail/merge.cuh | 4 +-
 cpp/include/cudf/detail/structs/utilities.hpp | 6 +-
 .../cudf/detail/utilities/device_atomics.cuh | 12 +-
 .../cudf/detail/utilities/hash_functions.cuh | 4 +-
 .../dictionary/dictionary_column_view.hpp | 12 +-
 cpp/include/cudf/fixed_point/fixed_point.hpp | 32 +--
 cpp/include/cudf/io/avro.hpp | 8 +-
 cpp/include/cudf/io/csv.hpp | 94 ++++---
 cpp/include/cudf/io/data_sink.hpp | 7 +-
 cpp/include/cudf/io/datasource.hpp | 38 +--
 cpp/include/cudf/io/json.hpp | 2 +-
 cpp/include/cudf/io/orc.hpp | 40 +--
 cpp/include/cudf/io/parquet.hpp | 35 +--
 .../cudf/io/text/data_chunk_source.hpp | 6 +-
 .../io/text/data_chunk_source_factories.hpp | 14 +-
 .../cudf/io/text/detail/multistate.hpp | 16 +-
 cpp/include/cudf/io/text/detail/trie.hpp | 6 +-
 cpp/include/cudf/io/types.hpp | 50 ++--
 cpp/include/cudf/join.hpp | 14 +-
 .../cudf/lists/detail/scatter_helper.cuh | 8 +-
 cpp/include/cudf/lists/list_device_view.cuh | 21 +-
 .../cudf/lists/lists_column_device_view.cuh | 15 +-
 cpp/include/cudf/lists/lists_column_view.hpp | 15 +-
 .../cudf/rolling/range_window_bounds.hpp | 4 +-
 cpp/include/cudf/scalar/scalar.hpp | 29 +-
 .../cudf/scalar/scalar_device_view.cuh | 16 +-
 cpp/include/cudf/strings/json.hpp | 7 +-
 cpp/include/cudf/strings/string_view.cuh | 8 +-
 cpp/include/cudf/strings/string_view.hpp | 50 ++--
 .../cudf/strings/strings_column_view.hpp | 16 +-
 .../cudf/structs/structs_column_view.hpp | 2 +-
 cpp/include/cudf/table/table.hpp | 10 +-
 cpp/include/cudf/table/table_device_view.cuh | 4 +-
 cpp/include/cudf/table/table_view.hpp | 14 +-
 .../cudf/tdigest/tdigest_column_view.cuh | 14 +-
 cpp/include/cudf/types.hpp | 4 +-
 cpp/include/cudf/utilities/span.hpp | 21 +-
 cpp/include/cudf_test/cudf_gtest.hpp | 2 +-
 cpp/include/cudf_test/file_utilities.hpp | 2 +-
 cpp/include/nvtext/detail/load_hash_file.hpp | 4 +-
 cpp/include/nvtext/subword_tokenize.hpp | 4 +-
 cpp/scripts/run-clang-tidy.py | 254 ++++++++++++++++++
 cpp/src/binaryop/compiled/binary_ops.cu | 4 +-
 cpp/src/binaryop/compiled/operation.cuh | 4 +-
 cpp/src/binaryop/compiled/util.cpp | 4 +-
 cpp/src/copying/concatenate.cu | 2 +-
 cpp/src/copying/contiguous_split.cu | 6 +-
 cpp/src/groupby/sort/functors.hpp | 2 +-
 cpp/src/groupby/sort/group_std.cu | 2 +-
 cpp/src/groupby/sort/group_tdigest.cu | 2 +-
 cpp/src/hash/concurrent_unordered_map.cuh | 2 +-
 cpp/src/hash/concurrent_unordered_multimap.cuh | 2 +-
 cpp/src/hash/hash_allocator.cuh | 4 +-
 cpp/src/hash/managed.cuh | 2 +-
 cpp/src/interop/dlpack.cpp | 2 +-
 cpp/src/io/avro/avro.cpp | 9 +-
 cpp/src/io/avro/avro.h | 12 +-
 cpp/src/io/avro/avro_common.h | 5 +-
 cpp/src/io/avro/avro_gpu.cu | 2 +-
 cpp/src/io/comp/brotli_dict.cpp | 5 +-
 cpp/src/io/comp/brotli_dict.h | 2 +-
 cpp/src/io/comp/brotli_tables.h | 4 +-
 cpp/src/io/comp/cpu_unbz2.cpp | 16 +-
 cpp/src/io/comp/debrotli.cu | 44 +--
 cpp/src/io/comp/gpuinflate.cu | 12 +-
 cpp/src/io/comp/gpuinflate.h | 2 +-
 cpp/src/io/comp/snap.cu | 24 +-
 cpp/src/io/comp/uncomp.cpp | 11 +-
 cpp/src/io/comp/unsnap.cu | 16 +-
 cpp/src/io/csv/csv_gpu.h | 8 +-
 cpp/src/io/csv/writer_impl.cu | 2 +-
 cpp/src/io/orc/aggregate_orc_metadata.hpp | 25 +-
 cpp/src/io/orc/orc.h | 37 +--
 cpp/src/io/orc/reader_impl.cu | 2 +-
 cpp/src/io/orc/stripe_data.cu | 8 +-
 cpp/src/io/orc/stripe_enc.cu | 2 +-
 cpp/src/io/orc/stripe_init.cu | 4 +-
 cpp/src/io/orc/timezone.cpp | 6 +-
 cpp/src/io/orc/timezone.cuh | 4 +-
 cpp/src/io/orc/writer_impl.cu | 32 +--
 cpp/src/io/orc/writer_impl.hpp | 16 +-
 .../io/parquet/compact_protocol_writer.hpp | 4 +-
 cpp/src/io/parquet/page_data.cu | 21 +-
 cpp/src/io/parquet/page_enc.cu | 4 +-
 cpp/src/io/parquet/parquet.hpp | 69 ++---
 cpp/src/io/parquet/reader_impl.cu | 43 +--
 cpp/src/io/parquet/writer_impl.cu | 18 +-
 cpp/src/io/statistics/statistics.cuh | 4 +-
 .../io/statistics/typed_statistics_chunk.cuh | 31 +--
 cpp/src/io/utilities/block_utils.cuh | 26 +-
 cpp/src/io/utilities/data_sink.cpp | 13 +-
 cpp/src/io/utilities/datasource.cpp | 15 +-
 cpp/src/io/utilities/file_io_utilities.hpp | 6 +-
 cpp/src/io/utilities/hostdevice_vector.hpp | 6 +-
 cpp/src/io/utilities/parsing_utils.cuh | 2 +-
 cpp/src/io/utilities/thread_pool.hpp | 10 +-
 cpp/src/join/hash_join.cuh | 12 +-
 cpp/src/lists/copying/gather.cu | 4 +-
 cpp/src/partitioning/partitioning.cu | 2 +-
 cpp/src/quantiles/quantiles_util.hpp | 8 +-
 cpp/src/rolling/rolling_detail.cuh | 4 +-
 cpp/src/strings/capitalize.cu | 2 +-
 cpp/src/strings/combine/join_list_elements.cu | 21 +-
 cpp/src/strings/contains.cu | 2 +-
 cpp/src/strings/convert/convert_datetime.cu | 20 +-
 cpp/src/strings/convert/convert_durations.cu | 2 +-
 cpp/src/strings/findall.cu | 2 +-
 cpp/src/strings/json/json_path.cu | 49 ++--
 cpp/src/strings/padding.cu | 2 +-
 cpp/src/strings/regex/regcomp.cpp | 2 +-
 cpp/src/strings/regex/regcomp.h | 18 +-
 cpp/src/strings/regex/regex.cuh | 18 +-
 cpp/src/strings/regex/regex.inl | 14 +-
 cpp/src/strings/regex/regexec.cu | 3 +-
 cpp/src/strings/split/split.cu | 2 +-
 cpp/src/text/subword/data_normalizer.cu | 5 +-
 .../text/subword/detail/tokenizer_utils.cuh | 2 +-
 cpp/src/text/subword/load_hash_file.cu | 2 +-
 cpp/src/transform/row_bit_count.cu | 8 +-
 cpp/tests/column/column_view_shallow_test.cpp | 4 +-
 cpp/tests/copying/concatenate_tests.cu | 35 +--
 cpp/tests/copying/copy_tests.cpp | 76 +++---
 cpp/tests/groupby/tdigest_tests.cu | 3 +-
 cpp/tests/hash_map/multimap_test.cu | 15 +-
 cpp/tests/hashing/hash_test.cpp | 12 +-
 cpp/tests/io/csv_test.cpp | 2 +-
 cpp/tests/io/orc_test.cpp | 22 +-
 cpp/tests/io/parquet_test.cpp | 21 +-
 cpp/tests/replace/replace_tests.cpp | 2 +-
 cpp/tests/scalar/factories_test.cpp | 2 +-
 cpp/tests/strings/chars_types_tests.cpp | 21 +-
 cpp/tests/strings/extract_tests.cpp | 2 +-
 cpp/tests/strings/factories_test.cu | 4 +-
 cpp/tests/strings/json_tests.cpp | 46 ++--
 cpp/tests/table/table_view_tests.cu | 2 +-
 159 files changed, 1410 insertions(+), 1020 deletions(-)
 create mode 100644 .clang-tidy
 create mode 100644 cpp/scripts/run-clang-tidy.py

diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 00000000000..043a93e6ff9
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,27 @@
+---
+Checks:
+  'modernize-*,
+   -modernize-use-equals-default,
+   -modernize-concat-nested-namespaces,
+   -modernize-use-trailing-return-type'
+
+  # -modernize-use-equals-default       # auto-fix is broken (doesn't insert =default correctly)
+  # -modernize-concat-nested-namespaces # auto-fix is broken (can delete code)
+  # -modernize-use-trailing-return-type # just a preference
+
+WarningsAsErrors: ''
+HeaderFilterRegex: ''
+AnalyzeTemporaryDtors: false
+FormatStyle: none
+CheckOptions:
+  - key: modernize-loop-convert.MaxCopySize
+    value: '16'
+  - key: modernize-loop-convert.MinConfidence
+    value: reasonable
+  - key: modernize-pass-by-value.IncludeStyle
+    value: llvm
+  - key: modernize-replace-auto-ptr.IncludeStyle
+    value: llvm
+  - key: modernize-use-nullptr.NullMacros
+    value: 'NULL'
+...
diff --git a/cpp/benchmarks/common/generate_benchmark_input.hpp b/cpp/benchmarks/common/generate_benchmark_input.hpp index 3dbc6561839..893c8a61543 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.hpp +++ b/cpp/benchmarks/common/generate_benchmark_input.hpp @@ -300,8 +300,8 @@ class data_profile { auto get_bool_probability() const { return bool_probability; } auto get_null_frequency() const { return null_frequency; }; - auto get_cardinality() const { return cardinality; }; - auto get_avg_run_length() const { return avg_run_length; }; + [[nodiscard]] auto get_cardinality() const { return cardinality; }; + [[nodiscard]] auto get_avg_run_length() const { return avg_run_length; }; // Users should pass integral values for bounds when setting the parameters for types that have // discrete distributions (integers, strings, lists). Otherwise the call with have no effect. diff --git a/cpp/benchmarks/copying/contiguous_split_benchmark.cu b/cpp/benchmarks/copying/contiguous_split_benchmark.cu index 55e1360efc8..bb6a9320c4a 100644 --- a/cpp/benchmarks/copying/contiguous_split_benchmark.cu +++ b/cpp/benchmarks/copying/contiguous_split_benchmark.cu @@ -51,10 +51,12 @@ void BM_contiguous_split_common(benchmark::State& state, std::vector> columns(src_cols.size()); std::transform(src_cols.begin(), src_cols.end(), columns.begin(), [](T& in) { auto ret = in.release(); - ret->null_count(); + // computing the null count is not a part of the benchmark's target code path, and we want the + // property to be pre-computed so that we measure the performance of only the intended code path + [[maybe_unused]] auto const nulls = ret->null_count(); return ret; }); - cudf::table src_table(std::move(columns)); + auto const src_table = cudf::table(std::move(columns)); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 diff --git a/cpp/benchmarks/copying/gather_benchmark.cu b/cpp/benchmarks/copying/gather_benchmark.cu index f075e9c486e..eaa201a0678 100644 --- a/cpp/benchmarks/copying/gather_benchmark.cu +++ b/cpp/benchmarks/copying/gather_benchmark.cu @@ -39,7 +39,7 @@ template void BM_gather(benchmark::State& state) { const cudf::size_type source_size{(cudf::size_type)state.range(0)}; - const cudf::size_type n_cols = (cudf::size_type)state.range(1); + const auto n_cols = (cudf::size_type)state.range(1); // Every element is valid auto data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); diff --git a/cpp/benchmarks/copying/scatter_benchmark.cu b/cpp/benchmarks/copying/scatter_benchmark.cu index 0c24dd50a13..a9ab376c8c3 100644 --- a/cpp/benchmarks/copying/scatter_benchmark.cu +++ b/cpp/benchmarks/copying/scatter_benchmark.cu @@ -40,7 +40,7 @@ template void BM_scatter(benchmark::State& state) { const cudf::size_type source_size{(cudf::size_type)state.range(0)}; - const cudf::size_type n_cols = (cudf::size_type)state.range(1); + const auto n_cols = (cudf::size_type)state.range(1); // Every element is valid auto data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index 8476a137c12..83f79bd68c5 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -68,13 +68,13 @@ inline auto make_pool() */ class benchmark : public ::benchmark::Fixture { public: - virtual void SetUp(const ::benchmark::State& state) + void SetUp(const ::benchmark::State& state) override 
{ mr = make_pool(); rmm::mr::set_current_device_resource(mr.get()); // set default resource to pool } - virtual void TearDown(const ::benchmark::State& state) + void TearDown(const ::benchmark::State& state) override { // reset default resource to the initial resource rmm::mr::set_current_device_resource(nullptr); @@ -82,8 +82,8 @@ class benchmark : public ::benchmark::Fixture { } // eliminate partial override warnings (see benchmark/benchmark.h) - virtual void SetUp(::benchmark::State& st) { SetUp(const_cast(st)); } - virtual void TearDown(::benchmark::State& st) + void SetUp(::benchmark::State& st) override { SetUp(const_cast(st)); } + void TearDown(::benchmark::State& st) override { TearDown(const_cast(st)); } @@ -102,7 +102,10 @@ class memory_stats_logger { ~memory_stats_logger() { rmm::mr::set_current_device_resource(existing_mr); } - size_t peak_memory_usage() const noexcept { return statistics_mr.get_bytes_counter().peak; } + [[nodiscard]] size_t peak_memory_usage() const noexcept + { + return statistics_mr.get_bytes_counter().peak; + } private: rmm::mr::device_memory_resource* existing_mr; diff --git a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp b/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp index 77bf4b03a14..7de10f9f4c1 100644 --- a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp +++ b/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp @@ -35,8 +35,8 @@ class CsvRead : public cudf::benchmark { void BM_csv_read_varying_input(benchmark::State& state) { - auto const data_types = get_type_or_group(state.range(0)); - io_type const source_type = static_cast(state.range(1)); + auto const data_types = get_type_or_group(state.range(0)); + auto const source_type = static_cast(state.range(1)); auto const tbl = create_random_table(data_types, num_cols, table_size_bytes{data_size}); auto const view = tbl->view(); diff --git a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp b/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp index 9baab6b2571..1e757da6f33 100644 --- a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp +++ b/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp @@ -35,8 +35,8 @@ class CsvWrite : public cudf::benchmark { void BM_csv_write_varying_inout(benchmark::State& state) { - auto const data_types = get_type_or_group(state.range(0)); - io_type const sink_type = static_cast(state.range(1)); + auto const data_types = get_type_or_group(state.range(0)); + auto const sink_type = static_cast(state.range(1)); auto const tbl = create_random_table(data_types, num_cols, table_size_bytes{data_size}); auto const view = tbl->view(); diff --git a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp index 6ab8d8d09c0..0c54136226a 100644 --- a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp @@ -40,7 +40,7 @@ void BM_orc_read_varying_input(benchmark::State& state) cudf::size_type const run_length = state.range(2); cudf_io::compression_type const compression = state.range(3) ? 
cudf_io::compression_type::SNAPPY : cudf_io::compression_type::NONE; - io_type const source_type = static_cast(state.range(4)); + auto const source_type = static_cast(state.range(4)); data_profile table_data_profile; table_data_profile.set_cardinality(cardinality); diff --git a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp index 933b3d02e08..be1a2073057 100644 --- a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp +++ b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp @@ -40,7 +40,7 @@ void BM_orc_write_varying_inout(benchmark::State& state) cudf::size_type const run_length = state.range(2); cudf_io::compression_type const compression = state.range(3) ? cudf_io::compression_type::SNAPPY : cudf_io::compression_type::NONE; - io_type const sink_type = static_cast(state.range(4)); + auto const sink_type = static_cast(state.range(4)); data_profile table_data_profile; table_data_profile.set_cardinality(cardinality); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp index 888102c03be..d9e37d84036 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp @@ -40,7 +40,7 @@ void BM_parq_read_varying_input(benchmark::State& state) cudf::size_type const run_length = state.range(2); cudf_io::compression_type const compression = state.range(3) ? cudf_io::compression_type::SNAPPY : cudf_io::compression_type::NONE; - io_type const source_type = static_cast(state.range(4)); + auto const source_type = static_cast(state.range(4)); data_profile table_data_profile; table_data_profile.set_cardinality(cardinality); diff --git a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp index 1af7e206692..74289fd414a 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp @@ -40,7 +40,7 @@ void BM_parq_write_varying_inout(benchmark::State& state) cudf::size_type const run_length = state.range(2); cudf_io::compression_type const compression = state.range(3) ? 
cudf_io::compression_type::SNAPPY : cudf_io::compression_type::NONE; - io_type const sink_type = static_cast(state.range(4)); + auto const sink_type = static_cast(state.range(4)); data_profile table_data_profile; table_data_profile.set_cardinality(cardinality); diff --git a/cpp/benchmarks/lists/copying/scatter_lists_benchmark.cu b/cpp/benchmarks/lists/copying/scatter_lists_benchmark.cu index 49007fda7a3..22e4be9ce9d 100644 --- a/cpp/benchmarks/lists/copying/scatter_lists_benchmark.cu +++ b/cpp/benchmarks/lists/copying/scatter_lists_benchmark.cu @@ -45,7 +45,7 @@ void BM_lists_scatter(::benchmark::State& state) const size_type base_size{(size_type)state.range(0)}; const size_type num_elements_per_row{(size_type)state.range(1)}; - const size_type num_rows = (size_type)ceil(double(base_size) / num_elements_per_row); + const auto num_rows = (size_type)ceil(double(base_size) / num_elements_per_row); auto source_base_col = make_fixed_width_column( data_type{type_to_id()}, base_size, mask_state::UNALLOCATED, stream, mr); diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu index 8e51bcca63d..90097889a86 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu @@ -170,11 +170,11 @@ void launch_kernel(cudf::mutable_table_view input, T** d_ptr, int work_per_threa template void type_dispatcher_benchmark(::benchmark::State& state) { - const cudf::size_type source_size = static_cast(state.range(1)); + const auto source_size = static_cast(state.range(1)); - const cudf::size_type n_cols = static_cast(state.range(0)); + const auto n_cols = static_cast(state.range(0)); - const cudf::size_type work_per_thread = static_cast(state.range(2)); + const auto work_per_thread = static_cast(state.range(2)); auto data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 374af536dc5..23587f49334 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -99,9 +99,9 @@ class aggregation { Kind kind; ///< The aggregation to perform virtual ~aggregation() = default; - virtual bool is_equal(aggregation const& other) const { return kind == other.kind; } - virtual size_t do_hash() const { return std::hash{}(kind); } - virtual std::unique_ptr clone() const = 0; + [[nodiscard]] virtual bool is_equal(aggregation const& other) const { return kind == other.kind; } + [[nodiscard]] virtual size_t do_hash() const { return std::hash{}(kind); } + [[nodiscard]] virtual std::unique_ptr clone() const = 0; // override functions for compound aggregations virtual std::vector> get_simple_aggregations( @@ -118,7 +118,7 @@ class aggregation { */ class rolling_aggregation : public virtual aggregation { public: - ~rolling_aggregation() = default; + ~rolling_aggregation() override = default; protected: rolling_aggregation() {} @@ -130,7 +130,7 @@ class rolling_aggregation : public virtual aggregation { */ class groupby_aggregation : public virtual aggregation { public: - ~groupby_aggregation() = default; + ~groupby_aggregation() override = default; protected: groupby_aggregation() {} @@ -141,7 +141,7 @@ class groupby_aggregation : public virtual aggregation { */ class groupby_scan_aggregation : public virtual aggregation { public: - ~groupby_scan_aggregation() = default; + ~groupby_scan_aggregation() override = default; protected: 
groupby_scan_aggregation() {} diff --git a/cpp/include/cudf/ast/detail/expression_evaluator.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh index ecd46ec2c23..2bfe1b03dd3 100644 --- a/cpp/include/cudf/ast/detail/expression_evaluator.cuh +++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh @@ -76,7 +76,7 @@ struct expression_result { subclass().template set_value(index, result); } - __device__ inline bool is_valid() const { return subclass().is_valid(); } + [[nodiscard]] __device__ inline bool is_valid() const { return subclass().is_valid(); } __device__ inline T value() const { return subclass().value(); } }; @@ -110,7 +110,7 @@ struct value_expression_result /** * @brief Returns true if the underlying data is valid and false otherwise. */ - __device__ inline bool is_valid() const + [[nodiscard]] __device__ inline bool is_valid() const { if constexpr (has_nulls) { return _obj.has_value(); } return true; @@ -174,7 +174,7 @@ struct mutable_column_expression_result /** * @brief Not implemented for this specialization. */ - __device__ inline bool is_valid() const + [[nodiscard]] __device__ inline bool is_valid() const { // Not implemented since it would require modifying the API in the parent class to accept an // index. @@ -186,7 +186,7 @@ struct mutable_column_expression_result /** * @brief Not implemented for this specialization. */ - __device__ inline mutable_column_device_view value() const + [[nodiscard]] __device__ inline mutable_column_device_view value() const { // Not implemented since it would require modifying the API in the parent class to accept an // index. diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index 4f73cb1ef6e..0b54dc7e4f0 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -166,7 +166,7 @@ class expression_parser { * * @return cudf::data_type */ - cudf::data_type output_type() const; + [[nodiscard]] cudf::data_type output_type() const; /** * @brief Visit a literal expression. @@ -206,10 +206,10 @@ class expression_parser { */ class intermediate_counter { public: - intermediate_counter() : used_values(), max_used(0) {} + intermediate_counter() : used_values() {} cudf::size_type take(); void give(cudf::size_type value); - cudf::size_type get_max_used() const { return max_used; } + [[nodiscard]] cudf::size_type get_max_used() const { return max_used; } private: /** @@ -221,10 +221,10 @@ class expression_parser { * * @return cudf::size_type Smallest value not already in the container. 
*/ - cudf::size_type find_first_missing() const; + [[nodiscard]] cudf::size_type find_first_missing() const; std::vector used_values; - cudf::size_type max_used; + cudf::size_type max_used{0}; }; expression_device_view device_expression_data; ///< The collection of data required to evaluate diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index 20aaa42fb68..eb98e0e0bee 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -38,14 +38,14 @@ class expression_parser; struct expression { virtual cudf::size_type accept(detail::expression_parser& visitor) const = 0; - bool may_evaluate_null(table_view const& left, rmm::cuda_stream_view stream) const + [[nodiscard]] bool may_evaluate_null(table_view const& left, rmm::cuda_stream_view stream) const { return may_evaluate_null(left, left, stream); } - virtual bool may_evaluate_null(table_view const& left, - table_view const& right, - rmm::cuda_stream_view stream) const = 0; + [[nodiscard]] virtual bool may_evaluate_null(table_view const& left, + table_view const& right, + rmm::cuda_stream_view stream) const = 0; virtual ~expression() {} }; @@ -173,14 +173,17 @@ class literal : public expression { * * @return cudf::data_type */ - cudf::data_type get_data_type() const { return get_value().type(); } + [[nodiscard]] cudf::data_type get_data_type() const { return get_value().type(); } /** * @brief Get the value object. * * @return cudf::detail::fixed_width_scalar_device_view_base */ - cudf::detail::fixed_width_scalar_device_view_base get_value() const { return value; } + [[nodiscard]] cudf::detail::fixed_width_scalar_device_view_base get_value() const + { + return value; + } /** * @brief Accepts a visitor class. @@ -190,9 +193,9 @@ class literal : public expression { */ cudf::size_type accept(detail::expression_parser& visitor) const override; - bool may_evaluate_null(table_view const& left, - table_view const& right, - rmm::cuda_stream_view stream) const override + [[nodiscard]] bool may_evaluate_null(table_view const& left, + table_view const& right, + rmm::cuda_stream_view stream) const override { return !is_valid(stream); } @@ -202,7 +205,10 @@ class literal : public expression { * * @return bool */ - bool is_valid(rmm::cuda_stream_view stream) const { return scalar.is_valid(stream); } + [[nodiscard]] bool is_valid(rmm::cuda_stream_view stream) const + { + return scalar.is_valid(stream); + } private: cudf::scalar const& scalar; @@ -232,14 +238,14 @@ class column_reference : public expression { * * @return cudf::size_type */ - cudf::size_type get_column_index() const { return column_index; } + [[nodiscard]] cudf::size_type get_column_index() const { return column_index; } /** * @brief Get the table source. * * @return table_reference */ - table_reference get_table_source() const { return table_source; } + [[nodiscard]] table_reference get_table_source() const { return table_source; } /** * @brief Get the data type. @@ -247,7 +253,7 @@ class column_reference : public expression { * @param table Table used to determine types. * @return cudf::data_type */ - cudf::data_type get_data_type(table_view const& table) const + [[nodiscard]] cudf::data_type get_data_type(table_view const& table) const { return table.column(get_column_index()).type(); } @@ -259,7 +265,8 @@ class column_reference : public expression { * @param right_table Right table used to determine types. 
* @return cudf::data_type */ - cudf::data_type get_data_type(table_view const& left_table, table_view const& right_table) const + [[nodiscard]] cudf::data_type get_data_type(table_view const& left_table, + table_view const& right_table) const { auto const table = [&] { if (get_table_source() == table_reference::LEFT) { @@ -281,9 +288,9 @@ class column_reference : public expression { */ cudf::size_type accept(detail::expression_parser& visitor) const override; - bool may_evaluate_null(table_view const& left, - table_view const& right, - rmm::cuda_stream_view stream) const override + [[nodiscard]] bool may_evaluate_null(table_view const& left, + table_view const& right, + rmm::cuda_stream_view stream) const override { return (table_source == table_reference::LEFT ? left : right).column(column_index).has_nulls(); } @@ -327,7 +334,7 @@ class operation : public expression { * * @return ast_operator */ - ast_operator get_operator() const { return op; } + [[nodiscard]] ast_operator get_operator() const { return op; } /** * @brief Get the operands. @@ -344,9 +351,9 @@ class operation : public expression { */ cudf::size_type accept(detail::expression_parser& visitor) const override; - bool may_evaluate_null(table_view const& left, - table_view const& right, - rmm::cuda_stream_view stream) const override + [[nodiscard]] bool may_evaluate_null(table_view const& left, + table_view const& right, + rmm::cuda_stream_view stream) const override { return std::any_of(operands.cbegin(), operands.cend(), diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index 8decce7f260..7869f9bd2aa 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -122,12 +122,12 @@ class column { /** * @brief Returns the column's logical element type */ - data_type type() const noexcept { return _type; } + [[nodiscard]] data_type type() const noexcept { return _type; } /** * @brief Returns the number of elements */ - size_type size() const noexcept { return _size; } + [[nodiscard]] size_type size() const noexcept { return _size; } /** * @brief Returns the count of null elements. @@ -137,7 +137,7 @@ class column { * first invocation of `null_count()` will compute and store the count of null * elements indicated by the `null_mask` (if it exists). */ - size_type null_count() const; + [[nodiscard]] size_type null_count() const; /** * @brief Sets the column's null value indicator bitmask to `new_null_mask`. @@ -199,7 +199,7 @@ class column { * @return true The column can hold null values * @return false The column cannot hold null values */ - bool nullable() const noexcept { return (_null_mask.size() > 0); } + [[nodiscard]] bool nullable() const noexcept { return (_null_mask.size() > 0); } /** * @brief Indicates whether the column contains null elements. 
@@ -207,12 +207,12 @@ class column { * @return true One or more elements are null * @return false Zero elements are null */ - bool has_nulls() const noexcept { return (null_count() > 0); } + [[nodiscard]] bool has_nulls() const noexcept { return (null_count() > 0); } /** * @brief Returns the number of child columns */ - size_type num_children() const noexcept { return _children.size(); } + [[nodiscard]] size_type num_children() const noexcept { return _children.size(); } /** * @brief Returns a reference to the specified child @@ -228,7 +228,10 @@ class column { * @param child_index Index of the desired child * @return column const& Const reference to the desired child */ - column const& child(size_type child_index) const noexcept { return *_children[child_index]; }; + [[nodiscard]] column const& child(size_type child_index) const noexcept + { + return *_children[child_index]; + }; /** * @brief Wrapper for the contents of a column. @@ -264,7 +267,7 @@ class column { * * @return column_view The immutable, non-owning view */ - column_view view() const; + [[nodiscard]] column_view view() const; /** * @brief Implicit conversion operator to a `column_view`. diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index b29df1852b2..d2332ef9026 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -139,12 +139,12 @@ class alignas(16) column_device_view_base { /** * @brief Returns the number of elements in the column. */ - __host__ __device__ size_type size() const noexcept { return _size; } + [[nodiscard]] __host__ __device__ size_type size() const noexcept { return _size; } /** * @brief Returns the element type */ - __host__ __device__ data_type type() const noexcept { return _type; } + [[nodiscard]] __host__ __device__ data_type type() const noexcept { return _type; } /** * @brief Indicates whether the column can contain null elements, i.e., if it @@ -155,7 +155,7 @@ class alignas(16) column_device_view_base { * @return true The bitmask is allocated * @return false The bitmask is not allocated */ - __host__ __device__ bool nullable() const noexcept { return nullptr != _null_mask; } + [[nodiscard]] __host__ __device__ bool nullable() const noexcept { return nullptr != _null_mask; } /** * @brief Returns raw pointer to the underlying bitmask allocation. @@ -164,13 +164,16 @@ class alignas(16) column_device_view_base { * * @note If `null_count() == 0`, this may return `nullptr`. */ - __host__ __device__ bitmask_type const* null_mask() const noexcept { return _null_mask; } + [[nodiscard]] __host__ __device__ bitmask_type const* null_mask() const noexcept + { + return _null_mask; + } /** * @brief Returns the index of the first element relative to the base memory * allocation, i.e., what is returned from `head()`. 
*/ - __host__ __device__ size_type offset() const noexcept { return _offset; } + [[nodiscard]] __host__ __device__ size_type offset() const noexcept { return _offset; } /** * @brief Returns whether the specified element holds a valid value (i.e., not @@ -186,7 +189,7 @@ class alignas(16) column_device_view_base { * @return true The element is valid * @return false The element is null */ - __device__ bool is_valid(size_type element_index) const noexcept + [[nodiscard]] __device__ bool is_valid(size_type element_index) const noexcept { return not nullable() or is_valid_nocheck(element_index); } @@ -203,7 +206,7 @@ class alignas(16) column_device_view_base { * @return true The element is valid * @return false The element is null */ - __device__ bool is_valid_nocheck(size_type element_index) const noexcept + [[nodiscard]] __device__ bool is_valid_nocheck(size_type element_index) const noexcept { return bit_is_set(_null_mask, offset() + element_index); } @@ -221,7 +224,7 @@ class alignas(16) column_device_view_base { * @return true The element is null * @return false The element is valid */ - __device__ bool is_null(size_type element_index) const noexcept + [[nodiscard]] __device__ bool is_null(size_type element_index) const noexcept { return not is_valid(element_index); } @@ -237,7 +240,7 @@ class alignas(16) column_device_view_base { * @return true The element is null * @return false The element is valid */ - __device__ bool is_null_nocheck(size_type element_index) const noexcept + [[nodiscard]] __device__ bool is_null_nocheck(size_type element_index) const noexcept { return not is_valid_nocheck(element_index); } @@ -251,7 +254,7 @@ class alignas(16) column_device_view_base { * @param word_index The index of the word to get * @return bitmask word for the given word_index */ - __device__ bitmask_type get_mask_word(size_type word_index) const noexcept + [[nodiscard]] __device__ bitmask_type get_mask_word(size_type word_index) const noexcept { return null_mask()[word_index]; } @@ -476,7 +479,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * For columns with null elements, use `make_null_replacement_iterator`. */ template ())> - const_iterator begin() const + [[nodiscard]] const_iterator begin() const { return const_iterator{count_it{0}, detail::value_accessor{*this}}; } @@ -494,7 +497,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * For columns with null elements, use `make_null_replacement_iterator`. 
*/ template ())> - const_iterator end() const + [[nodiscard]] const_iterator end() const { return const_iterator{count_it{size()}, detail::value_accessor{*this}}; } @@ -602,7 +605,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { template ())> - const_pair_iterator pair_begin() const + [[nodiscard]] const_pair_iterator pair_begin() const { return const_pair_iterator{count_it{0}, detail::pair_accessor{*this}}; @@ -632,7 +635,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { template ())> - const_pair_rep_iterator pair_rep_begin() const + [[nodiscard]] const_pair_rep_iterator pair_rep_begin() const { return const_pair_rep_iterator{count_it{0}, detail::pair_rep_accessor{*this}}; @@ -673,7 +676,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { template ())> - const_pair_iterator pair_end() const + [[nodiscard]] const_pair_iterator pair_end() const { return const_pair_iterator{count_it{size()}, detail::pair_accessor{*this}}; @@ -693,7 +696,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { template ())> - const_pair_rep_iterator pair_rep_end() const + [[nodiscard]] const_pair_rep_iterator pair_rep_end() const { return const_pair_rep_iterator{count_it{size()}, detail::pair_rep_accessor{*this}}; @@ -743,7 +746,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @param child_index The index of the desired child * @return column_view The requested child `column_view` */ - __device__ column_device_view child(size_type child_index) const noexcept + [[nodiscard]] __device__ column_device_view child(size_type child_index) const noexcept { return d_children[child_index]; } @@ -751,7 +754,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { /** * @brief Returns a span containing the children of this column */ - __device__ device_span children() const noexcept + [[nodiscard]] __device__ device_span children() const noexcept { return device_span(d_children, _num_children); } @@ -761,7 +764,10 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * * @return The number of child columns */ - __host__ __device__ size_type num_child_columns() const noexcept { return _num_children; } + [[nodiscard]] __host__ __device__ size_type num_child_columns() const noexcept + { + return _num_children; + } protected: column_device_view* d_children{}; ///< Array of `column_device_view` @@ -907,7 +913,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * * @note If `null_count() == 0`, this may return `nullptr`. 
*/ - __host__ __device__ bitmask_type* null_mask() const noexcept + [[nodiscard]] __host__ __device__ bitmask_type* null_mask() const noexcept { return const_cast(detail::column_device_view_base::null_mask()); } @@ -957,7 +963,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * @param child_index The index of the desired child * @return column_view The requested child `column_view` */ - __device__ mutable_column_device_view child(size_type child_index) const noexcept + [[nodiscard]] __device__ mutable_column_device_view child(size_type child_index) const noexcept { return d_children[child_index]; } diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 3f335509da8..325f023f283 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -123,17 +123,17 @@ class column_view_base { /** * @brief Returns the number of elements in the column */ - size_type size() const noexcept { return _size; } + [[nodiscard]] size_type size() const noexcept { return _size; } /** * @brief Returns true if `size()` returns zero, or false otherwise */ - size_type is_empty() const noexcept { return size() == 0; } + [[nodiscard]] size_type is_empty() const noexcept { return size() == 0; } /** * @brief Returns the element `data_type` */ - data_type type() const noexcept { return _type; } + [[nodiscard]] data_type type() const noexcept { return _type; } /** * @brief Indicates if the column can contain null elements, i.e., if it has @@ -144,7 +144,7 @@ class column_view_base { * @return true The bitmask is allocated * @return false The bitmask is not allocated */ - bool nullable() const noexcept { return nullptr != _null_mask; } + [[nodiscard]] bool nullable() const noexcept { return nullptr != _null_mask; } /** * @brief Returns the count of null elements @@ -154,7 +154,7 @@ class column_view_base { * first invocation of `null_count()` will compute and store the count of null * elements indicated by the `null_mask` (if it exists). */ - size_type null_count() const; + [[nodiscard]] size_type null_count() const; /** * @brief Returns the count of null elements in the range [begin, end) @@ -169,7 +169,7 @@ class column_view_base { * @param[in] begin The starting index of the range (inclusive). * @param[in] end The index of the last element in the range (exclusive). */ - size_type null_count(size_type begin, size_type end) const; + [[nodiscard]] size_type null_count(size_type begin, size_type end) const; /** * @brief Indicates if the column contains null elements, @@ -178,7 +178,7 @@ class column_view_base { * @return true One or more elements are null * @return false All elements are valid */ - bool has_nulls() const { return null_count() > 0; } + [[nodiscard]] bool has_nulls() const { return null_count() > 0; } /** * @brief Indicates if the column contains null elements in the range @@ -192,7 +192,10 @@ class column_view_base { * @return true One or more elements are null in the range [begin, end) * @return false All elements are valid in the range [begin, end) */ - bool has_nulls(size_type begin, size_type end) const { return null_count(begin, end) > 0; } + [[nodiscard]] bool has_nulls(size_type begin, size_type end) const + { + return null_count(begin, end) > 0; + } /** * @brief Returns raw pointer to the underlying bitmask allocation. @@ -201,13 +204,13 @@ class column_view_base { * * @note If `null_count() == 0`, this may return `nullptr`. 
*/ - bitmask_type const* null_mask() const noexcept { return _null_mask; } + [[nodiscard]] bitmask_type const* null_mask() const noexcept { return _null_mask; } /** * @brief Returns the index of the first element relative to the base memory * allocation, i.e., what is returned from `head()`. */ - size_type offset() const noexcept { return _offset; } + [[nodiscard]] size_type offset() const noexcept { return _offset; } protected: data_type _type{type_id::EMPTY}; ///< Element type @@ -352,12 +355,15 @@ class column_view : public detail::column_view_base { * @param child_index The index of the desired child * @return column_view The requested child `column_view` */ - column_view child(size_type child_index) const noexcept { return _children[child_index]; } + [[nodiscard]] column_view child(size_type child_index) const noexcept + { + return _children[child_index]; + } /** * @brief Returns the number of child columns. */ - size_type num_children() const noexcept { return _children.size(); } + [[nodiscard]] size_type num_children() const noexcept { return _children.size(); } /** * @brief Returns iterator to the beginning of the ordered sequence of child column-views. @@ -524,7 +530,7 @@ class mutable_column_view : public detail::column_view_base { * * @note If `null_count() == 0`, this may return `nullptr`. */ - bitmask_type* null_mask() const noexcept + [[nodiscard]] bitmask_type* null_mask() const noexcept { return const_cast(detail::column_view_base::null_mask()); } @@ -544,7 +550,7 @@ class mutable_column_view : public detail::column_view_base { * @param child_index The index of the desired child * @return mutable_column_view The requested child `mutable_column_view` */ - mutable_column_view child(size_type child_index) const noexcept + [[nodiscard]] mutable_column_view child(size_type child_index) const noexcept { return mutable_children[child_index]; } @@ -552,7 +558,7 @@ class mutable_column_view : public detail::column_view_base { /** * @brief Returns the number of child columns. */ - size_type num_children() const noexcept { return mutable_children.size(); } + [[nodiscard]] size_type num_children() const noexcept { return mutable_children.size(); } /** * @brief Returns iterator to the beginning of the ordered sequence of child column-views. 
diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index 81dddbd284a..850a11426af 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -553,8 +553,8 @@ struct packed_columns { struct metadata { metadata() = default; metadata(std::vector&& v) : data_(std::move(v)) {} - uint8_t const* data() const { return data_.data(); } - size_t size() const { return data_.size(); } + [[nodiscard]] uint8_t const* data() const { return data_.data(); } + [[nodiscard]] size_t size() const { return data_.size(); } private: std::vector data_; diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 3674efbcc7b..fbf315776f4 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -148,7 +148,7 @@ class sum_aggregation final : public rolling_aggregation, public: sum_aggregation() : aggregation(SUM) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -167,7 +167,7 @@ class product_aggregation final : public groupby_aggregation { public: product_aggregation() : aggregation(PRODUCT) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -188,7 +188,7 @@ class min_aggregation final : public rolling_aggregation, public: min_aggregation() : aggregation(MIN) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -209,7 +209,7 @@ class max_aggregation final : public rolling_aggregation, public: max_aggregation() : aggregation(MAX) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -230,7 +230,7 @@ class count_aggregation final : public rolling_aggregation, public: count_aggregation(aggregation::Kind kind) : aggregation(kind) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -249,7 +249,7 @@ class any_aggregation final : public aggregation { public: any_aggregation() : aggregation(ANY) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -268,7 +268,7 @@ class all_aggregation final : public aggregation { public: all_aggregation() : aggregation(ALL) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -287,7 +287,7 @@ class sum_of_squares_aggregation final : public groupby_aggregation { public: sum_of_squares_aggregation() : aggregation(SUM_OF_SQUARES) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -306,7 +306,7 @@ class mean_aggregation final : public rolling_aggregation, public groupby_aggreg public: mean_aggregation() : aggregation(MEAN) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -325,7 +325,7 @@ class m2_aggregation : public groupby_aggregation { public: m2_aggregation() : aggregation{M2} {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -344,14 +344,17 @@ class std_var_aggregation 
: public rolling_aggregation, public groupby_aggregati public: size_type _ddof; ///< Delta degrees of freedom - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); return _ddof == other._ddof; } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } protected: std_var_aggregation(aggregation::Kind k, size_type ddof) : rolling_aggregation(k), _ddof{ddof} @@ -359,7 +362,7 @@ class std_var_aggregation : public rolling_aggregation, public groupby_aggregati CUDF_EXPECTS(k == aggregation::STD or k == aggregation::VARIANCE, "std_var_aggregation can accept only STD, VARIANCE"); } - size_type hash_impl() const { return std::hash{}(_ddof); } + [[nodiscard]] size_type hash_impl() const { return std::hash{}(_ddof); } }; /** @@ -372,7 +375,7 @@ class var_aggregation final : public std_var_aggregation { { } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -394,7 +397,7 @@ class std_aggregation final : public std_var_aggregation { { } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -413,7 +416,7 @@ class median_aggregation final : public groupby_aggregation { public: median_aggregation() : aggregation(MEDIAN) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -437,7 +440,7 @@ class quantile_aggregation final : public groupby_aggregation { std::vector _quantiles; ///< Desired quantile(s) interpolation _interpolation; ///< Desired interpolation - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } @@ -447,9 +450,12 @@ class quantile_aggregation final : public groupby_aggregation { std::equal(_quantiles.begin(), _quantiles.end(), other._quantiles.begin()); } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -478,7 +484,7 @@ class argmax_aggregation final : public rolling_aggregation, public groupby_aggr public: argmax_aggregation() : aggregation(ARGMAX) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -497,7 +503,7 @@ class argmin_aggregation final : public rolling_aggregation, public groupby_aggr public: argmin_aggregation() : aggregation(ARGMIN) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -521,16 +527,19 @@ class nunique_aggregation final : public groupby_aggregation { null_policy _null_handling; ///< include or exclude nulls - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = 
dynamic_cast(_other); return _null_handling == other._null_handling; } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -558,16 +567,19 @@ class nth_element_aggregation final : public groupby_aggregation { size_type _n; ///< nth index to return null_policy _null_handling; ///< include or exclude nulls - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); return _n == other._n and _null_handling == other._null_handling; } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -592,7 +604,7 @@ class row_number_aggregation final : public rolling_aggregation { public: row_number_aggregation() : aggregation(ROW_NUMBER) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -611,7 +623,7 @@ class rank_aggregation final : public rolling_aggregation, public groupby_scan_a public: rank_aggregation() : aggregation{RANK} {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -630,7 +642,7 @@ class dense_rank_aggregation final : public rolling_aggregation, public groupby_ public: dense_rank_aggregation() : aggregation{DENSE_RANK} {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -654,16 +666,19 @@ class collect_list_aggregation final : public rolling_aggregation, public groupb null_policy _null_handling; ///< include or exclude nulls - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); return (_null_handling == other._null_handling); } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -698,7 +713,7 @@ class collect_set_aggregation final : public rolling_aggregation, public groupby nan_equality _nans_equal; ///< whether to consider NaNs as equal value (applicable only to ///< floating point types) - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); @@ -706,9 +721,12 @@ class collect_set_aggregation final : public rolling_aggregation, public groupby _nans_equal == other._nans_equal); } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] 
size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -737,16 +755,19 @@ class lead_lag_aggregation final : public rolling_aggregation { { } - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); return (row_offset == other.row_offset); } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -760,7 +781,7 @@ class lead_lag_aggregation final : public rolling_aggregation { size_type row_offset; private: - size_t hash_impl() const { return std::hash()(row_offset); } + [[nodiscard]] size_t hash_impl() const { return std::hash()(row_offset); } }; /** @@ -782,7 +803,7 @@ class udf_aggregation final : public rolling_aggregation { "udf_aggregation can accept only PTX, CUDA"); } - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); @@ -790,9 +811,12 @@ class udf_aggregation final : public rolling_aggregation { _function_name == other._function_name and _output_type == other._output_type); } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -809,7 +833,7 @@ class udf_aggregation final : public rolling_aggregation { data_type _output_type; protected: - size_t hash_impl() const + [[nodiscard]] size_t hash_impl() const { return std::hash{}(_source) ^ std::hash{}(_operator_name) ^ std::hash{}(_function_name) ^ @@ -824,7 +848,7 @@ class merge_lists_aggregation final : public groupby_aggregation { public: explicit merge_lists_aggregation() : aggregation{MERGE_LISTS} {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -850,16 +874,19 @@ class merge_sets_aggregation final : public groupby_aggregation { nan_equality _nans_equal; ///< whether to consider NaNs as equal value (applicable only to ///< floating point types) - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); return (_nulls_equal == other._nulls_equal && _nans_equal == other._nans_equal); } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -884,7 +911,7 @@ class merge_m2_aggregation final : public groupby_aggregation { public: 
explicit merge_m2_aggregation() : aggregation{MERGE_M2} {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -908,9 +935,12 @@ class covariance_aggregation final : public groupby_aggregation { size_type _min_periods; size_type _ddof; - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -940,16 +970,19 @@ class correlation_aggregation final : public groupby_aggregation { correlation_type _type; size_type _min_periods; - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); return (_type == other._type); } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -979,7 +1012,7 @@ class tdigest_aggregation final : public groupby_aggregation { int const max_centroids; - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -1003,7 +1036,7 @@ class merge_tdigest_aggregation final : public groupby_aggregation { int const max_centroids; - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } diff --git a/cpp/include/cudf/detail/aggregation/result_cache.hpp b/cpp/include/cudf/detail/aggregation/result_cache.hpp index 41f5c19f06a..4409d7e0d73 100644 --- a/cpp/include/cudf/detail/aggregation/result_cache.hpp +++ b/cpp/include/cudf/detail/aggregation/result_cache.hpp @@ -49,11 +49,11 @@ class result_cache { result_cache(size_t num_columns) : _cache(num_columns) {} - bool has_result(column_view const& input, aggregation const& agg) const; + [[nodiscard]] bool has_result(column_view const& input, aggregation const& agg) const; void add_result(column_view const& input, aggregation const& agg, std::unique_ptr&& col); - column_view get_result(column_view const& input, aggregation const& agg) const; + [[nodiscard]] column_view get_result(column_view const& input, aggregation const& agg) const; std::unique_ptr release_result(column_view const& input, aggregation const& agg); diff --git a/cpp/include/cudf/detail/merge.cuh b/cpp/include/cudf/detail/merge.cuh index ee5cb5c265d..1debef17db7 100644 --- a/cpp/include/cudf/detail/merge.cuh +++ b/cpp/include/cudf/detail/merge.cuh @@ -77,8 +77,8 @@ struct tagged_element_relational_comparator { { } - __device__ weak_ordering compare(index_type lhs_tagged_index, - index_type rhs_tagged_index) const noexcept + [[nodiscard]] __device__ weak_ordering compare(index_type lhs_tagged_index, + index_type rhs_tagged_index) const noexcept { auto const [l_side, l_indx] = lhs_tagged_index; auto const [r_side, r_indx] = rhs_tagged_index; diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp index 6f32e3190bf..751b7c00e8a 100644 --- 
a/cpp/include/cudf/detail/structs/utilities.hpp +++ b/cpp/include/cudf/detail/structs/utilities.hpp @@ -106,17 +106,17 @@ class flattened_table { /** * @brief Getter for the flattened columns, as a `table_view`. */ - table_view flattened_columns() const { return _flattened_columns; } + [[nodiscard]] table_view flattened_columns() const { return _flattened_columns; } /** * @brief Getter for the cudf::order of the table_view's columns. */ - std::vector orders() const { return _orders; } + [[nodiscard]] std::vector orders() const { return _orders; } /** * @brief Getter for the cudf::null_order of the table_view's columns. */ - std::vector null_orders() const { return _null_orders; } + [[nodiscard]] std::vector null_orders() const { return _null_orders; } /** * @brief Conversion to `table_view`, to fetch flattened columns. diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index 6380e76fdfa..b8ea228383d 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -62,8 +62,8 @@ struct genericAtomicOperationImpl { { using T_int = unsigned int; - T_int* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); - T_int shift = ((reinterpret_cast(addr) & 3) * 8); + auto* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); + T_int shift = ((reinterpret_cast(addr) & 3) * 8); T_int old = *address_uint32; T_int assumed; @@ -87,7 +87,7 @@ struct genericAtomicOperationImpl { { using T_int = unsigned int; bool is_32_align = (reinterpret_cast(addr) & 2) ? false : true; - T_int* address_uint32 = + auto* address_uint32 = reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 0 : 2)); T_int old = *address_uint32; @@ -322,8 +322,8 @@ struct typesAtomicCASImpl { { using T_int = unsigned int; - T_int shift = ((reinterpret_cast(addr) & 3) * 8); - T_int* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); + T_int shift = ((reinterpret_cast(addr) & 3) * 8); + auto* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); // the 'target_value' in `old` can be different from `compare` // because other thread may update the value @@ -355,7 +355,7 @@ struct typesAtomicCASImpl { using T_int = unsigned int; bool is_32_align = (reinterpret_cast(addr) & 2) ? false : true; - T_int* address_uint32 = + auto* address_uint32 = reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 
0 : 2)); T_int old = *address_uint32; diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 8a7f4276d05..b5ca5a3590e 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -90,12 +90,12 @@ struct MurmurHash3_32 { MurmurHash3_32() = default; constexpr MurmurHash3_32(uint32_t seed) : m_seed(seed) {} - __device__ inline uint32_t rotl32(uint32_t x, int8_t r) const + [[nodiscard]] __device__ inline uint32_t rotl32(uint32_t x, int8_t r) const { return (x << r) | (x >> (32 - r)); } - __device__ inline uint32_t fmix32(uint32_t h) const + [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const { h ^= h >> 16; h *= 0x85ebca6b; diff --git a/cpp/include/cudf/dictionary/dictionary_column_view.hpp b/cpp/include/cudf/dictionary/dictionary_column_view.hpp index 42f8310040e..33e29e70304 100644 --- a/cpp/include/cudf/dictionary/dictionary_column_view.hpp +++ b/cpp/include/cudf/dictionary/dictionary_column_view.hpp @@ -59,33 +59,33 @@ class dictionary_column_view : private column_view { /** * @brief Returns the parent column. */ - column_view parent() const noexcept; + [[nodiscard]] column_view parent() const noexcept; /** * @brief Returns the column of indices */ - column_view indices() const noexcept; + [[nodiscard]] column_view indices() const noexcept; /** * @brief Returns a column_view combining the indices data * with offset, size, and nulls from the parent. */ - column_view get_indices_annotated() const noexcept; + [[nodiscard]] column_view get_indices_annotated() const noexcept; /** * @brief Returns the column of keys */ - column_view keys() const noexcept; + [[nodiscard]] column_view keys() const noexcept; /** * @brief Returns the `data_type` of the keys child column. */ - data_type keys_type() const noexcept; + [[nodiscard]] data_type keys_type() const noexcept; /** * @brief Returns the number of rows in the keys column. 
*/ - size_type keys_size() const noexcept; + [[nodiscard]] size_type keys_size() const noexcept; }; /** @} */ // end of group diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index 727dce0db9d..6a85428d8f0 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -82,7 +82,8 @@ template = 0 && "integer exponentiation with negative exponent is not possible."); - if (exponent == 0) return static_cast(1); + if (exponent == 0) { return static_cast(1); } + auto extra = static_cast(1); auto square = static_cast(Base); while (exponent > 1) { @@ -146,12 +147,9 @@ CUDF_HOST_DEVICE inline constexpr T left_shift(T const& val, scale_type const& s template CUDF_HOST_DEVICE inline constexpr T shift(T const& val, scale_type const& scale) { - if (scale == 0) - return val; - else if (scale > 0) - return right_shift(val, scale); - else - return left_shift(val, scale); + if (scale == 0) { return val; } + if (scale > 0) { return right_shift(val, scale); } + return left_shift(val, scale); } } // namespace detail @@ -193,7 +191,7 @@ struct scaled_integer { */ template class fixed_point { - Rep _value; + Rep _value{}; scale_type _scale; public: @@ -258,7 +256,7 @@ class fixed_point { * @brief Default constructor that constructs `fixed_point` number with a * value and scale of zero */ - CUDF_HOST_DEVICE inline fixed_point() : _value{0}, _scale{scale_type{0}} {} + CUDF_HOST_DEVICE inline fixed_point() : _scale{scale_type{0}} {} /** * @brief Explicit conversion operator for casting to floating point types @@ -543,7 +541,7 @@ class fixed_point { */ CUDF_HOST_DEVICE inline fixed_point rescaled(scale_type scale) const { - if (scale == _scale) return *this; + if (scale == _scale) { return *this; } Rep const value = detail::shift(_value, scale_type{scale - _scale}); return fixed_point{scaled_integer{value, scale}}; } @@ -563,10 +561,9 @@ class fixed_point { auto const sign = _value < 0 ? std::string("-") : std::string(); return sign + detail::to_string(av / n) + std::string(".") + zeros + detail::to_string(av % n); - } else { - auto const zeros = std::string(_scale, '0'); - return detail::to_string(_value) + zeros; } + auto const zeros = std::string(_scale, '0'); + return detail::to_string(_value) + zeros; } }; @@ -628,12 +625,9 @@ CUDF_HOST_DEVICE inline auto multiplication_overflow(T lhs, T rhs) { auto const min = cuda::std::numeric_limits::min(); auto const max = cuda::std::numeric_limits::max(); - if (rhs > 0) - return lhs > max / rhs || lhs < min / rhs; - else if (rhs < -1) - return lhs > min / rhs || lhs < max / rhs; - else - return rhs == -1 && lhs == min; + if (rhs > 0) { return lhs > max / rhs || lhs < min / rhs; } + if (rhs < -1) { return lhs > min / rhs || lhs < max / rhs; } + return rhs == -1 && lhs == min; } // PLUS Operation diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp index 4e8bd65672f..0e00d14291d 100644 --- a/cpp/include/cudf/io/avro.hpp +++ b/cpp/include/cudf/io/avro.hpp @@ -74,22 +74,22 @@ class avro_reader_options { /** * @brief Returns source info. */ - source_info const& get_source() const { return _source; } + [[nodiscard]] source_info const& get_source() const { return _source; } /** * @brief Returns names of the columns to be read. */ - std::vector get_columns() const { return _columns; } + [[nodiscard]] std::vector get_columns() const { return _columns; } /** * @brief Returns number of rows to skip from the start. 
*/ - size_type get_skip_rows() const { return _skip_rows; } + [[nodiscard]] size_type get_skip_rows() const { return _skip_rows; } /** * @brief Returns number of rows to read. */ - size_type get_num_rows() const { return _num_rows; } + [[nodiscard]] size_type get_num_rows() const { return _num_rows; } /** * @brief Set names of the column to be read. diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 89719cb7f67..44ede9b0d63 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -159,27 +159,27 @@ class csv_reader_options { /** * @brief Returns source info. */ - source_info const& get_source() const { return _source; } + [[nodiscard]] source_info const& get_source() const { return _source; } /** * @brief Returns compression format of the source. */ - compression_type get_compression() const { return _compression; } + [[nodiscard]] compression_type get_compression() const { return _compression; } /** * @brief Returns number of bytes to skip from source start. */ - std::size_t get_byte_range_offset() const { return _byte_range_offset; } + [[nodiscard]] std::size_t get_byte_range_offset() const { return _byte_range_offset; } /** * @brief Returns number of bytes to read. */ - std::size_t get_byte_range_size() const { return _byte_range_size; } + [[nodiscard]] std::size_t get_byte_range_size() const { return _byte_range_size; } /** * @brief Returns number of bytes to read with padding. */ - std::size_t get_byte_range_size_with_padding() const + [[nodiscard]] std::size_t get_byte_range_size_with_padding() const { if (_byte_range_size == 0) { return 0; @@ -191,7 +191,7 @@ class csv_reader_options { /** * @brief Returns number of bytes to pad when reading. */ - std::size_t get_byte_range_padding() const + [[nodiscard]] std::size_t get_byte_range_padding() const { auto const num_names = _names.size(); auto const num_dtypes = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes); @@ -213,127 +213,139 @@ class csv_reader_options { /** * @brief Returns names of the columns. */ - std::vector const& get_names() const { return _names; } + [[nodiscard]] std::vector const& get_names() const { return _names; } /** * @brief Returns prefix to be used for column ID. */ - std::string get_prefix() const { return _prefix; } + [[nodiscard]] std::string get_prefix() const { return _prefix; } /** * @brief Whether to rename duplicate column names. */ - bool is_enabled_mangle_dupe_cols() const { return _mangle_dupe_cols; } + [[nodiscard]] bool is_enabled_mangle_dupe_cols() const { return _mangle_dupe_cols; } /** * @brief Returns names of the columns to be read. */ - std::vector const& get_use_cols_names() const { return _use_cols_names; } + [[nodiscard]] std::vector const& get_use_cols_names() const + { + return _use_cols_names; + } /** * @brief Returns indexes of columns to read. */ - std::vector const& get_use_cols_indexes() const { return _use_cols_indexes; } + [[nodiscard]] std::vector const& get_use_cols_indexes() const { return _use_cols_indexes; } /** * @brief Returns number of rows to read. */ - size_type get_nrows() const { return _nrows; } + [[nodiscard]] size_type get_nrows() const { return _nrows; } /** * @brief Returns number of rows to skip from start. */ - size_type get_skiprows() const { return _skiprows; } + [[nodiscard]] size_type get_skiprows() const { return _skiprows; } /** * @brief Returns number of rows to skip from end. 
*/ - size_type get_skipfooter() const { return _skipfooter; } + [[nodiscard]] size_type get_skipfooter() const { return _skipfooter; } /** * @brief Returns header row index. */ - size_type get_header() const { return _header; } + [[nodiscard]] size_type get_header() const { return _header; } /** * @brief Returns line terminator. */ - char get_lineterminator() const { return _lineterminator; } + [[nodiscard]] char get_lineterminator() const { return _lineterminator; } /** * @brief Returns field delimiter. */ - char get_delimiter() const { return _delimiter; } + [[nodiscard]] char get_delimiter() const { return _delimiter; } /** * @brief Returns numeric data thousands separator. */ - char get_thousands() const { return _thousands; } + [[nodiscard]] char get_thousands() const { return _thousands; } /** * @brief Returns decimal point character. */ - char get_decimal() const { return _decimal; } + [[nodiscard]] char get_decimal() const { return _decimal; } /** * @brief Returns comment line start character. */ - char get_comment() const { return _comment; } + [[nodiscard]] char get_comment() const { return _comment; } /** * @brief Whether to treat `\r\n` as line terminator. */ - bool is_enabled_windowslinetermination() const { return _windowslinetermination; } + [[nodiscard]] bool is_enabled_windowslinetermination() const { return _windowslinetermination; } /** * @brief Whether to treat whitespace as field delimiter. */ - bool is_enabled_delim_whitespace() const { return _delim_whitespace; } + [[nodiscard]] bool is_enabled_delim_whitespace() const { return _delim_whitespace; } /** * @brief Whether to skip whitespace after the delimiter. */ - bool is_enabled_skipinitialspace() const { return _skipinitialspace; } + [[nodiscard]] bool is_enabled_skipinitialspace() const { return _skipinitialspace; } /** * @brief Whether to ignore empty lines or parse line values as invalid. */ - bool is_enabled_skip_blank_lines() const { return _skip_blank_lines; } + [[nodiscard]] bool is_enabled_skip_blank_lines() const { return _skip_blank_lines; } /** * @brief Returns quoting style. */ - quote_style get_quoting() const { return _quoting; } + [[nodiscard]] quote_style get_quoting() const { return _quoting; } /** * @brief Returns quoting character. */ - char get_quotechar() const { return _quotechar; } + [[nodiscard]] char get_quotechar() const { return _quotechar; } /** * @brief Whether a quote inside a value is double-quoted. */ - bool is_enabled_doublequote() const { return _doublequote; } + [[nodiscard]] bool is_enabled_doublequote() const { return _doublequote; } /** * @brief Returns names of columns to read as datetime. */ - std::vector const& get_parse_dates_names() const { return _parse_dates_names; } + [[nodiscard]] std::vector const& get_parse_dates_names() const + { + return _parse_dates_names; + } /** * @brief Returns indexes of columns to read as datetime. */ - std::vector const& get_parse_dates_indexes() const { return _parse_dates_indexes; } + [[nodiscard]] std::vector const& get_parse_dates_indexes() const + { + return _parse_dates_indexes; + } /** * @brief Returns names of columns to read as hexadecimal. */ - std::vector const& get_parse_hex_names() const { return _parse_hex_names; } + [[nodiscard]] std::vector const& get_parse_hex_names() const + { + return _parse_hex_names; + } /** * @brief Returns indexes of columns to read as hexadecimal. 
*/ - std::vector const& get_parse_hex_indexes() const { return _parse_hex_indexes; } + [[nodiscard]] std::vector const& get_parse_hex_indexes() const { return _parse_hex_indexes; } /** * @brief Returns per-column types. @@ -1277,52 +1289,52 @@ class csv_writer_options { /** * @brief Returns sink used for writer output. */ - sink_info const& get_sink(void) const { return _sink; } + [[nodiscard]] sink_info const& get_sink() const { return _sink; } /** * @brief Returns table that would be written to output. */ - table_view const& get_table(void) const { return _table; } + [[nodiscard]] table_view const& get_table() const { return _table; } /** * @brief Returns optional associated metadata. */ - table_metadata const* get_metadata(void) const { return _metadata; } + [[nodiscard]] table_metadata const* get_metadata() const { return _metadata; } /** * @brief Returns the string used for null entries. */ - std::string get_na_rep(void) const { return _na_rep; } + [[nodiscard]] std::string get_na_rep() const { return _na_rep; } /** * @brief Whether to write headers to csv. */ - bool is_enabled_include_header(void) const { return _include_header; } + [[nodiscard]] bool is_enabled_include_header() const { return _include_header; } /** * @brief Returns maximum number of rows to process for each file write. */ - size_type get_rows_per_chunk(void) const { return _rows_per_chunk; } + [[nodiscard]] size_type get_rows_per_chunk() const { return _rows_per_chunk; } /** * @brief Returns character used for separating lines. */ - std::string get_line_terminator(void) const { return _line_terminator; } + [[nodiscard]] std::string get_line_terminator() const { return _line_terminator; } /** * @brief Returns character used for separating columns. */ - char get_inter_column_delimiter(void) const { return _inter_column_delimiter; } + [[nodiscard]] char get_inter_column_delimiter() const { return _inter_column_delimiter; } /** * @brief Returns string used for values != 0 in INT8 types. */ - std::string get_true_value(void) const { return _true_value; } + [[nodiscard]] std::string get_true_value() const { return _true_value; } /** * @brief Returns string used for values == 0 in INT8 types. */ - std::string get_false_value(void) const { return _false_value; } + [[nodiscard]] std::string get_false_value() const { return _false_value; } // Setter /** diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index 2c1966ee6ba..6d4c8ec9b8c 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ b/cpp/include/cudf/io/data_sink.hpp @@ -120,7 +120,7 @@ class data_sink { * * @return bool If this writer supports device_write() calls. */ - virtual bool supports_device_write() const { return false; } + [[nodiscard]] virtual bool supports_device_write() const { return false; } /** * @brief Estimates whether a direct device write would be more optimal for the given size.
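The csv_writer_options hunk above folds two cleanups into one pass: the C-style `(void)` parameter lists become empty `()` lists, and each const getter gains `[[nodiscard]]` so that a call whose result is ignored draws a compiler diagnostic. A minimal sketch of the combined effect; the `options` class and its getter are illustrative names, not anything from cudf:

#include <string>

class options {
 public:
  // Empty parameter list replaces "(void)"; [[nodiscard]] makes the
  // compiler diagnose a call whose result is discarded.
  [[nodiscard]] std::string get_na_rep() const { return _na_rep; }

 private:
  std::string _na_rep = "NULL";
};

int main()
{
  options const opts;
  opts.get_na_rep();                  // warning: ignoring a [[nodiscard]] value
  auto const na = opts.get_na_rep();  // fine: the result is used
  return na.empty() ? 1 : 0;
}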
@@ -128,7 +128,10 @@ class data_sink { * @param size Number of bytes to write * @return whether the device write is expected to be more performant for the given size */ - virtual bool is_device_write_preferred(size_t size) const { return supports_device_write(); } + [[nodiscard]] virtual bool is_device_write_preferred(size_t size) const + { + return supports_device_write(); + } /** * @brief Append the buffer content to the sink from a gpu address diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 627ec29a496..18ab8aad088 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -52,12 +52,12 @@ class datasource { /** * @brief Returns the buffer size in bytes. */ - virtual size_t size() const = 0; + [[nodiscard]] virtual size_t size() const = 0; /** * @brief Returns the address of the data in the buffer. */ - virtual uint8_t const* data() const = 0; + [[nodiscard]] virtual uint8_t const* data() const = 0; /** * @brief Base class destructor @@ -155,7 +155,7 @@ class datasource { * * @return bool Whether this source supports device_read() calls */ - virtual bool supports_device_read() const { return false; } + [[nodiscard]] virtual bool supports_device_read() const { return false; } /** * @brief Estimates whether a direct device read would be more optimal for the given size. @@ -163,7 +163,10 @@ class datasource { * @param size Number of bytes to read * @return whether the device read is expected to be more performant for the given size */ - virtual bool is_device_read_preferred(size_t size) const { return supports_device_read(); } + [[nodiscard]] virtual bool is_device_read_preferred(size_t size) const + { + return supports_device_read(); + } /** * @brief Returns a device buffer with a subset of data from the source. @@ -243,31 +246,31 @@ class datasource { * * @return size_t The size of the source data in bytes */ - virtual size_t size() const = 0; + [[nodiscard]] virtual size_t size() const = 0; /** * @brief Returns whether the source contains any data. * * @return bool True if there is data, False otherwise */ - virtual bool is_empty() const { return size() == 0; } + [[nodiscard]] virtual bool is_empty() const { return size() == 0; } /** * @brief Implementation for non owning buffer where datasource holds buffer until destruction. 
*/ class non_owning_buffer : public buffer { public: - non_owning_buffer() : _data(0), _size(0) {} + non_owning_buffer() {} non_owning_buffer(uint8_t* data, size_t size) : _data(data), _size(size) {} - size_t size() const override { return _size; } + [[nodiscard]] size_t size() const override { return _size; } - uint8_t const* data() const override { return _data; } + [[nodiscard]] uint8_t const* data() const override { return _data; } private: - uint8_t* const _data; - size_t const _size; + uint8_t* const _data{nullptr}; + size_t const _size{0}; }; /** @@ -297,9 +300,12 @@ class datasource { { } - size_t size() const override { return _size; } + [[nodiscard]] size_t size() const override { return _size; } - uint8_t const* data() const override { return static_cast(_data_ptr); } + [[nodiscard]] uint8_t const* data() const override + { + return static_cast(_data_ptr); + } private: Container _data; @@ -330,8 +336,8 @@ class arrow_io_source : public datasource { : arrow_buffer(arrow_buffer) { } - size_t size() const override { return arrow_buffer->size(); } - uint8_t const* data() const override { return arrow_buffer->data(); } + [[nodiscard]] size_t size() const override { return arrow_buffer->size(); } + [[nodiscard]] uint8_t const* data() const override { return arrow_buffer->data(); } }; public: @@ -393,7 +399,7 @@ class arrow_io_source : public datasource { /** * @brief Returns the size of the data in the `arrow` source. */ - size_t size() const override + [[nodiscard]] size_t size() const override { auto result = arrow_file->GetSize(); CUDF_EXPECTS(result.ok(), "Cannot get file size"); diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 5f34803f28e..727c24a4431 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -111,7 +111,7 @@ class json_reader_options { /** * @brief Returns source info. */ - source_info const& get_source() const { return _source; } + [[nodiscard]] source_info const& get_source() const { return _source; } /** * @brief Returns data types of the columns. diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index b3a2f6bcbbb..51f82bc4061 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -102,12 +102,12 @@ class orc_reader_options { /** * @brief Returns source info. */ - source_info const& get_source() const { return _source; } + [[nodiscard]] source_info const& get_source() const { return _source; } /** * @brief Returns names of the columns to read. */ - std::vector const& get_columns() const { return _columns; } + [[nodiscard]] std::vector const& get_columns() const { return _columns; } /** * @brief Returns vector of vectors, stripes to read for each input source @@ -491,27 +491,27 @@ class orc_writer_options { /** * @brief Returns sink info. */ - sink_info const& get_sink() const { return _sink; } + [[nodiscard]] sink_info const& get_sink() const { return _sink; } /** * @brief Returns compression type. */ - compression_type get_compression() const { return _compression; } + [[nodiscard]] compression_type get_compression() const { return _compression; } /** * @brief Whether writing column statistics is enabled/disabled. */ - bool is_enabled_statistics() const { return _enable_statistics; } + [[nodiscard]] bool is_enabled_statistics() const { return _enable_statistics; } /** * @brief Returns maximum stripe size, in bytes. 
*/ - auto get_stripe_size_bytes() const { return _stripe_size_bytes; } + [[nodiscard]] auto get_stripe_size_bytes() const { return _stripe_size_bytes; } /** * @brief Returns maximum stripe size, in rows. */ - auto get_stripe_size_rows() const { return _stripe_size_rows; } + [[nodiscard]] auto get_stripe_size_rows() const { return _stripe_size_rows; } /** * @brief Returns the row index stride. @@ -525,17 +525,20 @@ class orc_writer_options { /** * @brief Returns table to be written to output. */ - table_view get_table() const { return _table; } + [[nodiscard]] table_view get_table() const { return _table; } /** * @brief Returns associated metadata. */ - table_input_metadata const* get_metadata() const { return _metadata; } + [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. */ - std::map const& get_key_value_metadata() const { return _user_data; } + [[nodiscard]] std::map const& get_key_value_metadata() const + { + return _user_data; + } // Setters @@ -814,27 +817,27 @@ class chunked_orc_writer_options { /** * @brief Returns sink info. */ - sink_info const& get_sink() const { return _sink; } + [[nodiscard]] sink_info const& get_sink() const { return _sink; } /** * @brief Returns compression type. */ - compression_type get_compression() const { return _compression; } + [[nodiscard]] compression_type get_compression() const { return _compression; } /** * @brief Whether writing column statistics is enabled/disabled. */ - bool is_enabled_statistics() const { return _enable_statistics; } + [[nodiscard]] bool is_enabled_statistics() const { return _enable_statistics; } /** * @brief Returns maximum stripe size, in bytes. */ - auto get_stripe_size_bytes() const { return _stripe_size_bytes; } + [[nodiscard]] auto get_stripe_size_bytes() const { return _stripe_size_bytes; } /** * @brief Returns maximum stripe size, in rows. */ - auto get_stripe_size_rows() const { return _stripe_size_rows; } + [[nodiscard]] auto get_stripe_size_rows() const { return _stripe_size_rows; } /** * @brief Returns the row index stride. @@ -848,12 +851,15 @@ class chunked_orc_writer_options { /** * @brief Returns associated metadata. */ - table_input_metadata const* get_metadata() const { return _metadata; } + [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. */ - std::map const& get_key_value_metadata() const { return _user_data; } + [[nodiscard]] std::map const& get_key_value_metadata() const + { + return _user_data; + } // Setters diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 740f7a8b2db..2ceac947c8d 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -96,33 +96,36 @@ class parquet_reader_options { /** * @brief Returns source info. */ - source_info const& get_source() const { return _source; } + [[nodiscard]] source_info const& get_source() const { return _source; } /** * @brief Returns true/false depending on whether strings should be converted to categories or * not. */ - bool is_enabled_convert_strings_to_categories() const { return _convert_strings_to_categories; } + [[nodiscard]] bool is_enabled_convert_strings_to_categories() const + { + return _convert_strings_to_categories; + } /** * @brief Returns true/false depending whether to use pandas metadata or not while reading. 
*/ - bool is_enabled_use_pandas_metadata() const { return _use_pandas_metadata; } + [[nodiscard]] bool is_enabled_use_pandas_metadata() const { return _use_pandas_metadata; } /** * @brief Returns number of rows to skip from the start. */ - size_type get_skip_rows() const { return _skip_rows; } + [[nodiscard]] size_type get_skip_rows() const { return _skip_rows; } /** * @brief Returns number of rows to read. */ - size_type get_num_rows() const { return _num_rows; } + [[nodiscard]] size_type get_num_rows() const { return _num_rows; } /** * @brief Returns names of column to be read. */ - std::vector const& get_columns() const { return _columns; } + [[nodiscard]] std::vector const& get_columns() const { return _columns; } /** * @brief Returns list of individual row groups to be read. @@ -421,32 +424,32 @@ class parquet_writer_options { /** * @brief Returns sink info. */ - sink_info const& get_sink() const { return _sink; } + [[nodiscard]] sink_info const& get_sink() const { return _sink; } /** * @brief Returns compression format used. */ - compression_type get_compression() const { return _compression; } + [[nodiscard]] compression_type get_compression() const { return _compression; } /** * @brief Returns level of statistics requested in output file. */ - statistics_freq get_stats_level() const { return _stats_level; } + [[nodiscard]] statistics_freq get_stats_level() const { return _stats_level; } /** * @brief Returns table_view. */ - table_view get_table() const { return _table; } + [[nodiscard]] table_view get_table() const { return _table; } /** * @brief Returns partitions. */ - std::vector const& get_partitions() const { return _partitions; } + [[nodiscard]] std::vector const& get_partitions() const { return _partitions; } /** * @brief Returns associated metadata. */ - table_input_metadata const* get_metadata() const { return _metadata; } + [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. @@ -801,22 +804,22 @@ class chunked_parquet_writer_options { /** * @brief Returns sink info. */ - sink_info const& get_sink() const { return _sink; } + [[nodiscard]] sink_info const& get_sink() const { return _sink; } /** * @brief Returns compression format used. */ - compression_type get_compression() const { return _compression; } + [[nodiscard]] compression_type get_compression() const { return _compression; } /** * @brief Returns level of statistics requested in output file. */ - statistics_freq get_stats_level() const { return _stats_level; } + [[nodiscard]] statistics_freq get_stats_level() const { return _stats_level; } /** * @brief Returns metadata information. */ - table_input_metadata const* get_metadata() const { return _metadata; } + [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. 
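The same attribute is applied to virtual interfaces in the text-source headers below. One subtlety worth noting: `[[nodiscard]]` is checked against the declaration the call resolves to and is not inherited by overriders, which is presumably why the patch annotates both the pure-virtual bases and the concrete overrides. A small sketch under that assumption, with hypothetical `source`/`buffer_source` types:

#include <cstddef>

struct source {
  [[nodiscard]] virtual std::size_t size() const = 0;
  virtual ~source()                              = default;
};

struct buffer_source final : source {
  // The attribute must be repeated here: a call through a buffer_source
  // reference would otherwise not be diagnosed when its result is dropped.
  [[nodiscard]] std::size_t size() const override { return 0; }
};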
diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp index e65afa04fe5..5e6dda5a514 100644 --- a/cpp/include/cudf/io/text/data_chunk_source.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -36,8 +36,8 @@ namespace text { */ class device_data_chunk { public: - virtual char const* data() const = 0; - virtual std::size_t size() const = 0; + [[nodiscard]] virtual char const* data() const = 0; + [[nodiscard]] virtual std::size_t size() const = 0; virtual operator device_span() const = 0; }; @@ -76,7 +76,7 @@ class data_chunk_reader { */ class data_chunk_source { public: - virtual std::unique_ptr create_reader() const = 0; + [[nodiscard]] virtual std::unique_ptr create_reader() const = 0; }; } // namespace text diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index 6b95de53ee7..aeb4b7fff53 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -41,8 +41,8 @@ class device_span_data_chunk : public device_data_chunk { public: device_span_data_chunk(device_span data) : _data(data) {} - char const* data() const override { return _data.data(); } - std::size_t size() const override { return _data.size(); } + [[nodiscard]] char const* data() const override { return _data.data(); } + [[nodiscard]] std::size_t size() const override { return _data.size(); } operator device_span() const override { return _data; } private: @@ -53,8 +53,8 @@ class device_uvector_data_chunk : public device_data_chunk { public: device_uvector_data_chunk(rmm::device_uvector&& data) : _data(std::move(data)) {} - char const* data() const override { return _data.data(); } - std::size_t size() const override { return _data.size(); } + [[nodiscard]] char const* data() const override { return _data.data(); } + [[nodiscard]] std::size_t size() const override { return _data.size(); } operator device_span() const override { return _data; } private: @@ -171,7 +171,7 @@ class device_span_data_chunk_reader : public data_chunk_reader { class file_data_chunk_source : public data_chunk_source { public: file_data_chunk_source(std::string filename) : _filename(filename) {} - std::unique_ptr create_reader() const override + [[nodiscard]] std::unique_ptr create_reader() const override { return std::make_unique( std::make_unique(_filename, std::ifstream::in)); @@ -187,7 +187,7 @@ class file_data_chunk_source : public data_chunk_source { class string_data_chunk_source : public data_chunk_source { public: string_data_chunk_source(std::string const& data) : _data(data) {} - std::unique_ptr create_reader() const override + [[nodiscard]] std::unique_ptr create_reader() const override { return std::make_unique(std::make_unique(_data)); } @@ -202,7 +202,7 @@ class string_data_chunk_source : public data_chunk_source { class device_span_data_chunk_source : public data_chunk_source { public: device_span_data_chunk_source(device_span data) : _data(data) {} - std::unique_ptr create_reader() const override + [[nodiscard]] std::unique_ptr create_reader() const override { return std::make_unique(_data); } diff --git a/cpp/include/cudf/io/text/detail/multistate.hpp b/cpp/include/cudf/io/text/detail/multistate.hpp index d3c8909ab51..e7136ac69a5 100644 --- a/cpp/include/cudf/io/text/detail/multistate.hpp +++ b/cpp/include/cudf/io/text/detail/multistate.hpp @@ -37,7 +37,7 @@ struct multistate { * @brief Creates a segment which represents (0, 
0] */ - constexpr multistate_segment() : _data(0) {} + constexpr multistate_segment() = default; /** * @brief Creates a segment which represents (head, tail] * * @param head the (head, ____] value. * @param tail the (____, tail] value. */ @@ -52,15 +52,15 @@ struct multistate { /** * @brief Gets the (head, ____] value from the segment. */ - constexpr uint8_t get_head() const { return _data & 0b1111; } + [[nodiscard]] constexpr uint8_t get_head() const { return _data & 0b1111; } /** * @brief Gets the (____, tail] value from the segment. */ - constexpr uint8_t get_tail() const { return _data >> 4; } + [[nodiscard]] constexpr uint8_t get_tail() const { return _data >> 4; } private: - uint8_t _data; + uint8_t _data{0}; }; public: @@ -87,12 +87,12 @@ struct multistate { /** * @brief gets the number of segments this multistate represents */ - constexpr uint8_t size() const { return _size; } + [[nodiscard]] constexpr uint8_t size() const { return _size; } /** * @brief gets the highest (____, tail] value this multistate represents */ - constexpr uint8_t max_tail() const + [[nodiscard]] constexpr uint8_t max_tail() const { uint8_t maximum = 0; @@ -106,12 +106,12 @@ struct multistate { /** * @brief gets the Nth (head, ____] value state this multistate represents */ - constexpr uint8_t get_head(uint8_t idx) const { return _segments[idx].get_head(); } + [[nodiscard]] constexpr uint8_t get_head(uint8_t idx) const { return _segments[idx].get_head(); } /** * @brief gets the Nth (____, tail] value state this multistate represents */ - constexpr uint8_t get_tail(uint8_t idx) const { return _segments[idx].get_tail(); } + [[nodiscard]] constexpr uint8_t get_tail(uint8_t idx) const { return _segments[idx].get_tail(); } private: uint8_t _size = 0; diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp index d14fe15b0a9..06d15276a68 100644 --- a/cpp/include/cudf/io/text/detail/trie.hpp +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -161,13 +161,13 @@ struct trie { /** * @brief Gets the number of nodes contained in this trie. */ - cudf::size_type size() const { return _nodes.size(); } + [[nodiscard]] cudf::size_type size() const { return _nodes.size(); } /** * @brief A pessimistic count of duplicate tokens in the trie. Used to determine the maximum * possible stack size required to compute matches of this trie in parallel. */ - cudf::size_type max_duplicate_tokens() const { return _max_duplicate_tokens; } + [[nodiscard]] cudf::size_type max_duplicate_tokens() const { return _max_duplicate_tokens; } /** * @brief Create a trie which represents the given pattern.
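The multistate_segment change above is the recurring member-initialization cleanup in this patch (also applied earlier to fixed_point and non_owning_buffer): move the zero-initialization into a default member initializer so the default constructor can be defaulted instead of hand-written. A minimal sketch with an illustrative `segment` type:

#include <cstdint>

struct segment {
  // The member initializer below makes a user-written "segment() : _data(0) {}"
  // redundant; the defaulted constructor stays constexpr and zero-initializes.
  constexpr segment() = default;
  constexpr explicit segment(uint8_t head, uint8_t tail)
    : _data(static_cast<uint8_t>((tail << 4) | head))
  {
  }

 private:
  uint8_t _data{0};
};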
@@ -255,7 +255,7 @@ struct trie { cudf::detail::make_device_uvector_sync(trie_nodes, stream, mr)}; } - trie_device_view view() const { return trie_device_view{_nodes}; } + [[nodiscard]] trie_device_view view() const { return trie_device_view{_nodes}; } }; } // namespace detail diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 8f06de99f05..7e4ab5b8d9d 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -155,14 +155,8 @@ struct source_info { source_info() = default; - explicit source_info(std::vector const& file_paths) - : _type(io_type::FILEPATH), _filepaths(file_paths) - { - } - explicit source_info(std::string const& file_path) - : _type(io_type::FILEPATH), _filepaths({file_path}) - { - } + explicit source_info(std::vector const& file_paths) : _filepaths(file_paths) {} + explicit source_info(std::string const& file_path) : _filepaths({file_path}) {} explicit source_info(std::vector const& host_buffers) : _type(io_type::HOST_BUFFER), _buffers(host_buffers) @@ -182,11 +176,11 @@ struct source_info { { } - auto type() const { return _type; } - auto const& filepaths() const { return _filepaths; } - auto const& buffers() const { return _buffers; } - auto const& files() const { return _files; } - auto const& user_sources() const { return _user_sources; } + [[nodiscard]] auto type() const { return _type; } + [[nodiscard]] auto const& filepaths() const { return _filepaths; } + [[nodiscard]] auto const& buffers() const { return _buffers; } + [[nodiscard]] auto const& files() const { return _files; } + [[nodiscard]] auto const& user_sources() const { return _user_sources; } private: io_type _type = io_type::FILEPATH; @@ -200,7 +194,7 @@ struct source_info { */ struct sink_info { sink_info() = default; - sink_info(size_t num_sinks) : _type(io_type::VOID), _num_sinks(num_sinks) {} + sink_info(size_t num_sinks) : _num_sinks(num_sinks) {} explicit sink_info(std::vector const& file_paths) : _type(io_type::FILEPATH), _num_sinks(file_paths.size()), _filepaths(file_paths) @@ -226,11 +220,11 @@ struct sink_info { { } - auto type() const { return _type; } - auto num_sinks() const { return _num_sinks; } - auto const& filepaths() const { return _filepaths; } - auto const& buffers() const { return _buffers; } - auto const& user_sinks() const { return _user_sinks; } + [[nodiscard]] auto type() const { return _type; } + [[nodiscard]] auto num_sinks() const { return _num_sinks; } + [[nodiscard]] auto const& filepaths() const { return _filepaths; } + [[nodiscard]] auto const& buffers() const { return _buffers; } + [[nodiscard]] auto const& user_sinks() const { return _user_sinks; } private: io_type _type = io_type::VOID; @@ -344,51 +338,51 @@ class column_in_metadata { * @param i Index of the child to get * @return this for chaining */ - column_in_metadata const& child(size_type i) const { return children[i]; } + [[nodiscard]] column_in_metadata const& child(size_type i) const { return children[i]; } /** * @brief Get the name of this column */ - std::string get_name() const { return _name; } + [[nodiscard]] std::string get_name() const { return _name; } /** * @brief Get whether nullability has been explicitly set for this column. */ - bool is_nullability_defined() const { return _nullable.has_value(); } + [[nodiscard]] bool is_nullability_defined() const { return _nullable.has_value(); } /** * @brief Gets the explicitly set nullability for this column. * @throws If nullability is not explicitly defined for this column. 
* Check using `is_nullability_defined()` first. */ - bool nullable() const { return _nullable.value(); } + [[nodiscard]] bool nullable() const { return _nullable.value(); } /** * @brief If this is the metadata of a list column, returns whether it is to be encoded as a map. */ - bool is_map() const { return _list_column_is_map; } + [[nodiscard]] bool is_map() const { return _list_column_is_map; } /** * @brief Get whether to encode this timestamp column using deprecated int96 physical type */ - bool is_enabled_int96_timestamps() const { return _use_int96_timestamp; } + [[nodiscard]] bool is_enabled_int96_timestamps() const { return _use_int96_timestamp; } /** * @brief Get whether precision has been set for this decimal column */ - bool is_decimal_precision_set() const { return _decimal_precision.has_value(); } + [[nodiscard]] bool is_decimal_precision_set() const { return _decimal_precision.has_value(); } /** * @brief Get the decimal precision that was set for this column. * @throws If decimal precision was not set for this column. * Check using `is_decimal_precision_set()` first. */ - uint8_t get_decimal_precision() const { return _decimal_precision.value(); } + [[nodiscard]] uint8_t get_decimal_precision() const { return _decimal_precision.value(); } /** * @brief Get the number of children of this column */ - size_type num_children() const { return children.size(); } + [[nodiscard]] size_type num_children() const { return children.size(); } }; class table_input_metadata { diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 30400074c50..8520cb1bb0d 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -607,9 +607,10 @@ class hash_join { * @return The exact number of output rows when performing an inner join between two tables with * `build` and `probe` as the join keys. */ - std::size_t inner_join_size(cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; + [[nodiscard]] std::size_t inner_join_size( cudf::table_view const& probe, null_equality compare_nulls = null_equality::EQUAL, rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * Returns the exact number of matches (rows) when performing a left join with the specified probe * table. * * @param probe The probe table, from which the tuples are probed. * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches * * @return The exact number of output rows when performing a left join between two tables with `build` * and `probe` as the join keys. */ - std::size_t left_join_size(cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; + [[nodiscard]] std::size_t left_join_size( cudf::table_view const& probe, null_equality compare_nulls = null_equality::EQUAL, rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * Returns the exact number of matches (rows) when performing a full join with the specified probe diff --git a/cpp/include/cudf/lists/detail/scatter_helper.cuh b/cpp/include/cudf/lists/detail/scatter_helper.cuh index bdf68037944..34747f4a2c7 100644 --- a/cpp/include/cudf/lists/detail/scatter_helper.cuh +++ b/cpp/include/cudf/lists/detail/scatter_helper.cuh @@ -91,17 +91,17 @@ struct unbound_list_view { /** * @brief Returns number of elements in this list row.
*/ - __device__ inline size_type size() const { return _size; } + [[nodiscard]] __device__ inline size_type size() const { return _size; } /** * @brief Returns whether this row came from the `scatter()` source or target */ - __device__ inline label_type label() const { return _label; } + [[nodiscard]] __device__ inline label_type label() const { return _label; } /** * @brief Returns the index in the source/target column */ - __device__ inline size_type row_index() const { return _row_index; } + [[nodiscard]] __device__ inline size_type row_index() const { return _row_index; } /** * @brief Binds to source/target column (depending on SOURCE/TARGET labels), @@ -111,7 +111,7 @@ struct unbound_list_view { * @param scatter_target Target column for the scatter operation * @return A (bound) list_view for the row that this object represents */ - __device__ inline list_device_view bind_to_column( + [[nodiscard]] __device__ inline list_device_view bind_to_column( lists_column_device_view const& scatter_source, lists_column_device_view const& scatter_target) const { diff --git a/cpp/include/cudf/lists/list_device_view.cuh b/cpp/include/cudf/lists/list_device_view.cuh index 5071f046e0c..e4803f98e68 100644 --- a/cpp/include/cudf/lists/list_device_view.cuh +++ b/cpp/include/cudf/lists/list_device_view.cuh @@ -69,7 +69,7 @@ class list_device_view { * The offset of this element as stored in the child column (i.e. 5) * may be fetched using this method. */ - __device__ inline size_type element_offset(size_type idx) const + [[nodiscard]] __device__ inline size_type element_offset(size_type idx) const { cudf_assert(idx >= 0 && idx < size() && "idx out of bounds"); return begin_offset + idx; @@ -91,7 +91,7 @@ class list_device_view { /** * @brief Checks whether element is null at specified index in the list row. */ - __device__ inline bool is_null(size_type idx) const + [[nodiscard]] __device__ inline bool is_null(size_type idx) const { cudf_assert(idx >= 0 && idx < size() && "Index out of bounds."); auto element_offset = begin_offset + idx; @@ -101,17 +101,20 @@ class list_device_view { /** * @brief Checks whether this list row is null. */ - __device__ inline bool is_null() const { return lists_column.is_null(_row_index); } + [[nodiscard]] __device__ inline bool is_null() const { return lists_column.is_null(_row_index); } /** * @brief Fetches the number of elements in this list row. */ - __device__ inline size_type size() const { return _size; } + [[nodiscard]] __device__ inline size_type size() const { return _size; } /** * @brief Fetches the lists_column_device_view that contains this list. */ - __device__ inline lists_column_device_view const& get_column() const { return lists_column; } + [[nodiscard]] __device__ inline lists_column_device_view const& get_column() const + { + return lists_column; + } template struct pair_accessor; @@ -141,7 +144,7 @@ class list_device_view { * 2. `p.second == false` */ template - __device__ inline const_pair_iterator pair_begin() const + [[nodiscard]] __device__ inline const_pair_iterator pair_begin() const { return const_pair_iterator{thrust::counting_iterator(0), pair_accessor{*this}}; } @@ -151,7 +154,7 @@ class list_device_view { * list_device_view. */ template - __device__ inline const_pair_iterator pair_end() const + [[nodiscard]] __device__ inline const_pair_iterator pair_end() const { return const_pair_iterator{thrust::counting_iterator(size()), pair_accessor{*this}}; @@ -173,7 +176,7 @@ class list_device_view { * 2. 
`p.second == false` */ template - __device__ inline const_pair_rep_iterator pair_rep_begin() const + [[nodiscard]] __device__ inline const_pair_rep_iterator pair_rep_begin() const { return const_pair_rep_iterator{thrust::counting_iterator(0), pair_rep_accessor{*this}}; @@ -184,7 +187,7 @@ class list_device_view { * list_device_view. */ template - __device__ inline const_pair_rep_iterator pair_rep_end() const + [[nodiscard]] __device__ inline const_pair_rep_iterator pair_rep_end() const { return const_pair_rep_iterator{thrust::counting_iterator(size()), pair_rep_accessor{*this}}; diff --git a/cpp/include/cudf/lists/lists_column_device_view.cuh b/cpp/include/cudf/lists/lists_column_device_view.cuh index aff088a7f44..e48707ec298 100644 --- a/cpp/include/cudf/lists/lists_column_device_view.cuh +++ b/cpp/include/cudf/lists/lists_column_device_view.cuh @@ -46,12 +46,12 @@ class lists_column_device_view { /** * @brief Fetches number of rows in the lists column */ - CUDF_HOST_DEVICE inline cudf::size_type size() const { return underlying.size(); } + [[nodiscard]] CUDF_HOST_DEVICE inline cudf::size_type size() const { return underlying.size(); } /** * @brief Fetches the offsets column of the underlying list column. */ - __device__ inline column_device_view offsets() const + [[nodiscard]] __device__ inline column_device_view offsets() const { return underlying.child(lists_column_view::offsets_column_index); } @@ -59,7 +59,7 @@ class lists_column_device_view { /** * @brief Fetches the child column of the underlying list column. */ - __device__ inline column_device_view child() const + [[nodiscard]] __device__ inline column_device_view child() const { return underlying.child(lists_column_view::child_column_index); } @@ -67,19 +67,22 @@ class lists_column_device_view { /** * @brief Indicates whether the list column is nullable. */ - __device__ inline bool nullable() const { return underlying.nullable(); } + [[nodiscard]] __device__ inline bool nullable() const { return underlying.nullable(); } /** * @brief Indicates whether the row (i.e. list) at the specified * index is null. */ - __device__ inline bool is_null(size_type idx) const { return underlying.is_null(idx); } + [[nodiscard]] __device__ inline bool is_null(size_type idx) const + { + return underlying.is_null(idx); + } /** * @brief Fetches the offset of the underlying column_device_view, * in case it is a sliced/offset column. */ - __device__ inline size_type offset() const { return underlying.offset(); } + [[nodiscard]] __device__ inline size_type offset() const { return underlying.offset(); } private: column_device_view underlying; diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp index b055a050bf8..d09bc2c935f 100644 --- a/cpp/include/cudf/lists/lists_column_view.hpp +++ b/cpp/include/cudf/lists/lists_column_view.hpp @@ -63,21 +63,21 @@ class lists_column_view : private column_view { /** * @brief Returns the parent column. */ - column_view parent() const; + [[nodiscard]] column_view parent() const; /** * @brief Returns the internal column of offsets * * @throw cudf::logic error if this is an empty column */ - column_view offsets() const; + [[nodiscard]] column_view offsets() const; /** * @brief Returns the internal child column * * @throw cudf::logic error if this is an empty column */ - column_view child() const; + [[nodiscard]] column_view child() const; /** * @brief Returns the internal child column, applying any offset from the root. 
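Note on the recurring pattern in these hunks: they apply clang-tidy's modernize-use-nodiscard check, so const accessors that merely compute and return a value are marked [[nodiscard]], and a caller that silently drops the result now gets a compile-time diagnostic instead of a latent bug. An illustrative, self-contained sketch of the effect (hypothetical class, not part of the patch):

#include <cstddef>

class view {
 public:
  [[nodiscard]] std::size_t size() const noexcept { return _size; }

 private:
  std::size_t _size{0};
};

int main()
{
  view v;
  v.size();                 // warning: ignoring return value declared 'nodiscard'
  auto const n = v.size();  // fine: the result is consumed
  return static_cast<int>(n);
}
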
@@ -89,14 +89,14 @@ class lists_column_view : private column_view { * * @throw cudf::logic error if this is an empty column */ - column_view get_sliced_child(rmm::cuda_stream_view stream) const; + [[nodiscard]] column_view get_sliced_child(rmm::cuda_stream_view stream) const; /** * @brief Return first offset (accounting for column offset) * * @return int32_t const* Pointer to the first offset */ - offset_iterator offsets_begin() const noexcept + [[nodiscard]] offset_iterator offsets_begin() const noexcept { return offsets().begin() + offset(); } @@ -111,7 +111,10 @@ class lists_column_view : private column_view { * * @return int32_t const* Pointer to one past the last offset */ - offset_iterator offsets_end() const noexcept { return offsets_begin() + size() + 1; } + [[nodiscard]] offset_iterator offsets_end() const noexcept + { + return offsets_begin() + size() + 1; + } }; /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/rolling/range_window_bounds.hpp b/cpp/include/cudf/rolling/range_window_bounds.hpp index a4f0a51eac7..4d31bb98f9c 100644 --- a/cpp/include/cudf/rolling/range_window_bounds.hpp +++ b/cpp/include/cudf/rolling/range_window_bounds.hpp @@ -56,12 +56,12 @@ struct range_window_bounds { * @return true If window is unbounded * @return false If window is of finite bounds */ - bool is_unbounded() const { return _is_unbounded; } + [[nodiscard]] bool is_unbounded() const { return _is_unbounded; } /** * @brief Returns the underlying scalar value for the bounds */ - scalar const& range_scalar() const { return *_range_scalar; } + [[nodiscard]] scalar const& range_scalar() const { return *_range_scalar; } range_window_bounds(range_window_bounds const&) = default; // Required to return (by copy) from functions. diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index dc2df368bae..0db729aec28 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -52,7 +52,7 @@ class scalar { /** * @brief Returns the scalar's logical value type. */ - data_type type() const noexcept; + [[nodiscard]] data_type type() const noexcept; /** * @brief Updates the validity of the value. @@ -72,7 +72,7 @@ class scalar { * @return true Value is valid. * @return false Value is invalid/null. */ - bool is_valid(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; + [[nodiscard]] bool is_valid(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * @brief Returns a raw pointer to the validity bool in device memory. @@ -82,7 +82,7 @@ class scalar { /** * @brief Returns a const raw pointer to the validity bool in device memory. 
*/ - bool const* validity_data() const; + [[nodiscard]] bool const* validity_data() const; protected: data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar @@ -128,7 +128,7 @@ class fixed_width_scalar : public scalar { public: using value_type = T; - ~fixed_width_scalar() = default; + ~fixed_width_scalar() override = default; fixed_width_scalar(fixed_width_scalar&& other) = default; fixed_width_scalar& operator=(fixed_width_scalar const& other) = delete; @@ -278,7 +278,7 @@ class fixed_point_scalar : public scalar { using value_type = T; fixed_point_scalar() = delete; - ~fixed_point_scalar() = default; + ~fixed_point_scalar() override = default; fixed_point_scalar(fixed_point_scalar&& other) = default; fixed_point_scalar& operator=(fixed_point_scalar const& other) = delete; @@ -392,7 +392,7 @@ class string_scalar : public scalar { using value_type = cudf::string_view; string_scalar() = delete; - ~string_scalar() = default; + ~string_scalar() override = default; string_scalar(string_scalar&& other) = default; // string_scalar(string_scalar const& other) = delete; @@ -479,24 +479,25 @@ class string_scalar : public scalar { * * @param stream CUDA stream used for device memory operations. */ - std::string to_string(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; + [[nodiscard]] std::string to_string( + rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * @brief Get the value of the scalar as a string_view. * * @param stream CUDA stream used for device memory operations. */ - value_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; + [[nodiscard]] value_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * @brief Returns the size of the string in bytes. */ - size_type size() const; + [[nodiscard]] size_type size() const; /** * @brief Returns a raw pointer to the string in device memory. */ - const char* data() const; + [[nodiscard]] const char* data() const; protected: rmm::device_buffer _data{}; ///< device memory containing the string @@ -647,7 +648,7 @@ class duration_scalar : public chrono_scalar { class list_scalar : public scalar { public: list_scalar() = delete; - ~list_scalar() = default; + ~list_scalar() override = default; list_scalar(list_scalar&& other) = default; list_scalar& operator=(list_scalar const& other) = delete; @@ -695,7 +696,7 @@ class list_scalar : public scalar { /** * @brief Returns a non-owning, immutable view to underlying device data. */ - column_view view() const; + [[nodiscard]] column_view view() const; private: cudf::column _data; @@ -707,7 +708,7 @@ class list_scalar : public scalar { class struct_scalar : public scalar { public: struct_scalar() = delete; - ~struct_scalar() = default; + ~struct_scalar() override = default; struct_scalar(struct_scalar&& other) = default; struct_scalar& operator=(struct_scalar const& other) = delete; struct_scalar& operator=(struct_scalar&& other) = delete; @@ -765,7 +766,7 @@ class struct_scalar : public scalar { /** * @brief Returns a non-owning, immutable view to underlying device data. 
*/ - table_view view() const; + [[nodiscard]] table_view view() const; private: table _data; diff --git a/cpp/include/cudf/scalar/scalar_device_view.cuh b/cpp/include/cudf/scalar/scalar_device_view.cuh index 56afa150dfc..ae658da9f9b 100644 --- a/cpp/include/cudf/scalar/scalar_device_view.cuh +++ b/cpp/include/cudf/scalar/scalar_device_view.cuh @@ -37,7 +37,7 @@ class scalar_device_view_base { /** * @brief Returns the value type */ - __host__ __device__ data_type type() const noexcept { return _type; } + [[nodiscard]] __host__ __device__ data_type type() const noexcept { return _type; } /** * @brief Returns whether the scalar holds a valid value (i.e., not null). @@ -45,7 +45,7 @@ class scalar_device_view_base { * @return true The element is valid * @return false The element is null */ - __device__ bool is_valid() const noexcept { return *_is_valid; } + [[nodiscard]] __device__ bool is_valid() const noexcept { return *_is_valid; } /** * @brief Updates the validity of the value @@ -260,17 +260,23 @@ class string_scalar_device_view : public detail::scalar_device_view_base { /** * @brief Returns string_view of the value of this scalar. */ - __device__ ValueType value() const noexcept { return ValueType{this->data(), _size}; } + [[nodiscard]] __device__ ValueType value() const noexcept + { + return ValueType{this->data(), _size}; + } /** * @brief Returns a raw pointer to the value in device memory */ - __device__ char const* data() const noexcept { return static_cast(_data); } + [[nodiscard]] __device__ char const* data() const noexcept + { + return static_cast(_data); + } /** * @brief Returns the size of the string in bytes. */ - __device__ size_type size() const noexcept { return _size; } + [[nodiscard]] __device__ size_type size() const noexcept { return _size; } private: const char* _data{}; ///< Pointer to device memory containing the value diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/strings/json.hpp index 8435c47eaf5..f6645f2e029 100644 --- a/cpp/include/cudf/strings/json.hpp +++ b/cpp/include/cudf/strings/json.hpp @@ -48,7 +48,10 @@ class get_json_object_options { * @brief Returns true/false depending on whether single-quotes for representing strings * are allowed. 
*/ - CUDF_HOST_DEVICE inline bool get_allow_single_quotes() const { return allow_single_quotes; } + [[nodiscard]] CUDF_HOST_DEVICE inline bool get_allow_single_quotes() const + { + return allow_single_quotes; + } /** * @brief Returns true/false depending on whether individually returned string values have @@ -72,7 +75,7 @@ class get_json_object_options { * * @endcode */ - CUDF_HOST_DEVICE inline bool get_strip_quotes_from_single_strings() const + [[nodiscard]] CUDF_HOST_DEVICE inline bool get_strip_quotes_from_single_strings() const { return strip_quotes_from_single_strings; } diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 43a90997c86..24c8bfea2be 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -44,7 +44,7 @@ namespace detail { */ __device__ inline size_type characters_in_string(const char* str, size_type bytes) { - if ((str == 0) || (bytes == 0)) return 0; + if ((str == nullptr) || (bytes == 0)) return 0; auto ptr = reinterpret_cast(str); #ifndef CUDF_JIT_UDF return thrust::count_if( @@ -271,9 +271,9 @@ __device__ inline int string_view::compare(const string_view& in) const __device__ inline int string_view::compare(const char* data, size_type bytes) const { - size_type const len1 = size_bytes(); - const unsigned char* ptr1 = reinterpret_cast(this->data()); - const unsigned char* ptr2 = reinterpret_cast(data); + size_type const len1 = size_bytes(); + const auto* ptr1 = reinterpret_cast(this->data()); + const auto* ptr2 = reinterpret_cast(data); if ((ptr1 == ptr2) && (bytes == len1)) return 0; size_type idx = 0; for (; (idx < len1) && (idx < bytes); ++idx) { diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 22409ab3dc7..f88f573ac0c 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -51,20 +51,20 @@ class string_view { /** * @brief Return the number of bytes in this string */ - CUDF_HOST_DEVICE inline size_type size_bytes() const { return _bytes; } + CUDF_HOST_DEVICE [[nodiscard]] inline size_type size_bytes() const { return _bytes; } /** * @brief Return the number of characters in this string */ - __device__ inline size_type length() const; + __device__ [[nodiscard]] inline size_type length() const; /** * @brief Return a pointer to the internal device array */ - CUDF_HOST_DEVICE inline const char* data() const { return _data; } + CUDF_HOST_DEVICE [[nodiscard]] inline const char* data() const { return _data; } /** * @brief Return true if string has no characters */ - CUDF_HOST_DEVICE inline bool empty() const { return size_bytes() == 0; } + CUDF_HOST_DEVICE [[nodiscard]] inline bool empty() const { return size_bytes() == 0; } /** * @brief Handy iterator for navigating through encoded characters. 
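Two further clang-tidy idioms appear in the string_view hunks above: pointer comparisons against the literal 0 become comparisons against nullptr (modernize-use-nullptr), and a declared type that is already spelled out in the initializer's cast is replaced with auto (modernize-use-auto). A small host-side sketch of both, with a hypothetical helper that assumes nothing about the real device code:

#include <cstddef>

static std::size_t count_bytes(const char* str)
{
  if (str == nullptr) { return 0; }  // was: str == 0
  // The cast already names the pointee type, so auto avoids repeating it.
  const auto* bytes = reinterpret_cast<const unsigned char*>(str);
  std::size_t n = 0;
  while (bytes[n] != 0) { ++n; }
  return n;
}

int main() { return count_bytes("") == 0 ? 0 : 1; }
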
@@ -96,8 +96,8 @@ class string_view { __device__ inline bool operator>(const const_iterator&) const; __device__ inline bool operator>=(const const_iterator&) const; __device__ inline char_utf8 operator*() const; - __device__ inline size_type position() const; - __device__ inline size_type byte_offset() const; + [[nodiscard]] __device__ inline size_type position() const; + [[nodiscard]] __device__ inline size_type byte_offset() const; private: const char* p{}; @@ -109,11 +109,11 @@ class string_view { /** * @brief Return new iterator pointing to the beginning of this string */ - __device__ inline const_iterator begin() const; + __device__ [[nodiscard]] inline const_iterator begin() const; /** * @brief Return new iterator pointing past the end of this string */ - __device__ inline const_iterator end() const; + __device__ [[nodiscard]] inline const_iterator end() const; /** * @brief Return single UTF-8 character at the given character position @@ -126,7 +126,7 @@ class string_view { * * @param pos Character position */ - __device__ inline size_type byte_offset(size_type pos) const; + __device__ [[nodiscard]] inline size_type byte_offset(size_type pos) const; /** * @brief Comparing target string with this string. Each character is compared @@ -141,7 +141,7 @@ class string_view { * not match is greater in the arg string, or all compared characters * match but the arg string is longer. */ - __device__ inline int compare(const string_view& str) const; + __device__ [[nodiscard]] inline int compare(const string_view& str) const; /** * @brief Comparing target string with this string. Each character is compared * as a UTF-8 code-point value. @@ -193,9 +193,9 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if str is not found in this string. */ - __device__ inline size_type find(const string_view& str, - size_type pos = 0, - size_type count = -1) const; + __device__ [[nodiscard]] inline size_type find(const string_view& str, + size_type pos = 0, + size_type count = -1) const; /** * @brief Returns the character position of the first occurrence where the * array str is found in this string within the character range [pos,pos+n). @@ -221,9 +221,9 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if arg string is not found in this string. */ - __device__ inline size_type find(char_utf8 character, - size_type pos = 0, - size_type count = -1) const; + __device__ [[nodiscard]] inline size_type find(char_utf8 character, + size_type pos = 0, + size_type count = -1) const; /** * @brief Returns the character position of the last occurrence where the * argument str is found in this string within the character range [pos,pos+n). @@ -234,9 +234,9 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if arg string is not found in this string. */ - __device__ inline size_type rfind(const string_view& str, - size_type pos = 0, - size_type count = -1) const; + __device__ [[nodiscard]] inline size_type rfind(const string_view& str, + size_type pos = 0, + size_type count = -1) const; /** * @brief Returns the character position of the last occurrence where the * array str is found in this string within the character range [pos,pos+n). @@ -262,9 +262,9 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if arg string is not found in this string. 
*/ - __device__ inline size_type rfind(char_utf8 character, - size_type pos = 0, - size_type count = -1) const; + __device__ [[nodiscard]] inline size_type rfind(char_utf8 character, + size_type pos = 0, + size_type count = -1) const; /** * @brief Return a sub-string of this string. The original string and device @@ -274,7 +274,7 @@ class string_view { * @param length Number of characters from start to include in the sub-string. * @return New instance pointing to a subset of the characters within this instance. */ - __device__ inline string_view substr(size_type start, size_type length) const; + __device__ [[nodiscard]] inline string_view substr(size_type start, size_type length) const; /** * @brief Return minimum value associated with the string type @@ -300,7 +300,7 @@ class string_view { /** * @brief Default constructor represents an empty string. */ - CUDF_HOST_DEVICE inline string_view() : _data(""), _bytes(0), _length(0) {} + CUDF_HOST_DEVICE inline string_view() : _data("") {} /** * @brief Create instance from existing device char array. @@ -330,7 +330,7 @@ class string_view { * @param bytepos Byte position from start of _data. * @return The character position for the specified byte. */ - __device__ inline size_type character_offset(size_type bytepos) const; + __device__ [[nodiscard]] inline size_type character_offset(size_type bytepos) const; }; namespace strings { diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index fb3b9387a9b..aab898932de 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -58,14 +58,14 @@ class strings_column_view : private column_view { /** * @brief Returns the parent column. */ - column_view parent() const; + [[nodiscard]] column_view parent() const; /** * @brief Returns the internal column of offsets * * @throw cudf::logic error if this is an empty column */ - column_view offsets() const; + [[nodiscard]] column_view offsets() const; /** * @brief Return an iterator for the offsets child column. @@ -74,7 +74,7 @@ class strings_column_view : private column_view { * * @return Iterator pointing to the first offset value. */ - offset_iterator offsets_begin() const; + [[nodiscard]] offset_iterator offsets_begin() const; /** * @brief Return an end iterator for the offsets child column. @@ -83,14 +83,14 @@ class strings_column_view : private column_view { * * @return Iterator pointing 1 past the last offset value. */ - offset_iterator offsets_end() const; + [[nodiscard]] offset_iterator offsets_end() const; /** * @brief Returns the internal column of chars * * @throw cudf::logic error if this is an empty column */ - column_view chars() const; + [[nodiscard]] column_view chars() const; /** * @brief Returns the number of bytes in the chars child column. @@ -98,7 +98,7 @@ class strings_column_view : private column_view { * This accounts for empty columns but does not reflect a sliced parent column * view (i.e.: non-zero offset or reduced row count). */ - size_type chars_size() const noexcept; + [[nodiscard]] size_type chars_size() const noexcept; /** * @brief Return an iterator for the chars child column. @@ -111,7 +111,7 @@ class strings_column_view : private column_view { * * @return Iterator pointing to the first char byte. */ - chars_iterator chars_begin() const; + [[nodiscard]] chars_iterator chars_begin() const; /** * @brief Return an end iterator for the offsets child column. 
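For context on the offsets_begin()/offsets_end() accessors being annotated here: a strings column stores size() + 1 offsets entries, and row i occupies the byte range [offsets[i], offsets[i + 1]) of the chars child. An illustrative host-side sketch of that invariant, using a plain vector in place of the device column:

#include <cstdint>
#include <vector>

// Byte length of string row i, given the size() + 1 offsets entries.
int32_t row_byte_length(std::vector<int32_t> const& offsets, std::size_t i)
{
  return offsets[i + 1] - offsets[i];
}

int main()
{
  std::vector<int32_t> const offsets{0, 3, 3, 8};  // rows: "abc", "", "defgh"
  return row_byte_length(offsets, 2) == 5 ? 0 : 1;
}
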
@@ -121,7 +121,7 @@ class strings_column_view : private column_view { * * @return Iterator pointing 1 past the last char byte. */ - chars_iterator chars_end() const; + [[nodiscard]] chars_iterator chars_end() const; }; //! Strings column APIs. diff --git a/cpp/include/cudf/structs/structs_column_view.hpp b/cpp/include/cudf/structs/structs_column_view.hpp index a25dce9c278..329c24cfe0a 100644 --- a/cpp/include/cudf/structs/structs_column_view.hpp +++ b/cpp/include/cudf/structs/structs_column_view.hpp @@ -60,7 +60,7 @@ class structs_column_view : public column_view { * * @throw cudf::logic error if this is an empty column */ - column_view get_sliced_child(int index) const; + [[nodiscard]] column_view get_sliced_child(int index) const; }; // class structs_column_view; /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp index 3c4b4dda61e..4a3c31d08e9 100644 --- a/cpp/include/cudf/table/table.hpp +++ b/cpp/include/cudf/table/table.hpp @@ -71,18 +71,18 @@ class table { /** * @brief Returns the number of columns in the table */ - size_type num_columns() const noexcept { return _columns.size(); } + [[nodiscard]] size_type num_columns() const noexcept { return _columns.size(); } /** * @brief Returns the number of rows */ - size_type num_rows() const noexcept { return _num_rows; } + [[nodiscard]] size_type num_rows() const noexcept { return _num_rows; } /** * @brief Returns an immutable, non-owning `table_view` of the contents of *this `table`. */ - table_view view() const; + [[nodiscard]] table_view view() const; /** * @brief Conversion operator to an immutable, non-owning `table_view` of the @@ -141,7 +141,7 @@ class table { * @return A table_view consisting of columns from the original table * specified by the elements of `column_indices` */ - table_view select(std::vector const& column_indices) const + [[nodiscard]] table_view select(std::vector const& column_indices) const { return select(column_indices.begin(), column_indices.end()); }; @@ -166,7 +166,7 @@ class table { * @param i Index of the desired column * @return A const reference to the desired column */ - column const& get_column(cudf::size_type i) const { return *(_columns.at(i)); } + [[nodiscard]] column const& get_column(cudf::size_type i) const { return *(_columns.at(i)); } private: std::vector> _columns{}; diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index 2404fe88a9c..ce61e8853b6 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -61,9 +61,9 @@ class table_device_view_base { return _columns[column_index]; } - __host__ __device__ size_type num_columns() const noexcept { return _num_columns; } + [[nodiscard]] __host__ __device__ size_type num_columns() const noexcept { return _num_columns; } - __host__ __device__ size_type num_rows() const noexcept { return _num_rows; } + [[nodiscard]] __host__ __device__ size_type num_rows() const noexcept { return _num_rows; } void destroy(); diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index 8abd7aed8e9..77b9e539506 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -87,7 +87,7 @@ class table_view_base { /** * @brief Returns an iterator to the first view in the `table`. 
*/ - const_iterator begin() const noexcept { return std::begin(_columns); } + [[nodiscard]] const_iterator begin() const noexcept { return std::begin(_columns); } /** * @brief Returns an iterator one past the last column view in the `table`. @@ -103,7 +103,7 @@ class table_view_base { * `end()` acts as a place holder. Attempting to dereference it results in * undefined behavior. */ - const_iterator end() const noexcept { return std::end(_columns); } + [[nodiscard]] const_iterator end() const noexcept { return std::end(_columns); } /** * @brief Returns a reference to the view of the specified column @@ -119,17 +119,17 @@ class table_view_base { /** * @brief Returns the number of columns */ - size_type num_columns() const noexcept { return _columns.size(); } + [[nodiscard]] size_type num_columns() const noexcept { return _columns.size(); } /** * @brief Returns the number of rows */ - size_type num_rows() const noexcept { return _num_rows; } + [[nodiscard]] size_type num_rows() const noexcept { return _num_rows; } /** * @brief Returns true if `num_columns()` returns zero, or false otherwise */ - size_type is_empty() const noexcept { return num_columns() == 0; } + [[nodiscard]] size_type is_empty() const noexcept { return num_columns() == 0; } table_view_base() = default; @@ -208,7 +208,7 @@ class table_view : public detail::table_view_base { * @return A table_view consisting of columns from the original table * specified by the elements of `column_indices` */ - table_view select(std::vector const& column_indices) const; + [[nodiscard]] table_view select(std::vector const& column_indices) const; }; /** @@ -227,7 +227,7 @@ class mutable_table_view : public detail::table_view_base { mutable_table_view() = default; - mutable_column_view& column(size_type column_index) const + [[nodiscard]] mutable_column_view& column(size_type column_index) const { return const_cast(table_view_base::column(column_index)); } diff --git a/cpp/include/cudf/tdigest/tdigest_column_view.cuh b/cpp/include/cudf/tdigest/tdigest_column_view.cuh index c7513452387..696657191ca 100644 --- a/cpp/include/cudf/tdigest/tdigest_column_view.cuh +++ b/cpp/include/cudf/tdigest/tdigest_column_view.cuh @@ -82,28 +82,28 @@ class tdigest_column_view : private column_view { /** * @brief Returns the parent column. */ - column_view parent() const; + [[nodiscard]] column_view parent() const; /** * @brief Returns the column of centroids */ - lists_column_view centroids() const; + [[nodiscard]] lists_column_view centroids() const; /** * @brief Returns the internal column of mean values */ - column_view means() const; + [[nodiscard]] column_view means() const; /** * @brief Returns the internal column of weight values */ - column_view weights() const; + [[nodiscard]] column_view weights() const; /** * @brief Returns an iterator that returns the size of each tdigest * in the column (each row is 1 digest) */ - auto size_begin() const + [[nodiscard]] auto size_begin() const { return cudf::detail::make_counting_transform_iterator( 0, tdigest_size{centroids().offsets_begin()}); @@ -113,13 +113,13 @@ class tdigest_column_view : private column_view { * @brief Returns the first min value for the column. Each row corresponds * to the minimum value for the accompanying digest. */ - double const* min_begin() const; + [[nodiscard]] double const* min_begin() const; /** * @brief Returns the first max value for the column. Each row corresponds * to the maximum value for the accompanying digest. 
*/ - double const* max_begin() const; + [[nodiscard]] double const* max_begin() const; }; } // namespace tdigest diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 459a4182aa0..6222b2e680e 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -268,12 +268,12 @@ class data_type { /** * @brief Returns the type identifier */ - constexpr type_id id() const noexcept { return _id; } + [[nodiscard]] constexpr type_id id() const noexcept { return _id; } /** * @brief Returns the scale (for fixed_point types) */ - constexpr int32_t scale() const noexcept { return _fixed_point_scale; } + [[nodiscard]] constexpr int32_t scale() const noexcept { return _fixed_point_scale; } private: type_id _id{type_id::EMPTY}; diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 766fe93b9d1..0ac41b2c4a1 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -54,7 +54,7 @@ class span_base { static constexpr std::size_t extent = Extent; - constexpr span_base() noexcept : _data(nullptr), _size(0) {} + constexpr span_base() noexcept {} constexpr span_base(pointer data, size_type size) : _data(data), _size(size) {} // constexpr span_base(pointer begin, pointer end) : _data(begin), _size(end - begin) {} constexpr span_base(span_base const& other) noexcept = default; @@ -71,9 +71,9 @@ class span_base { constexpr iterator end() const noexcept { return _data + _size; } constexpr pointer data() const noexcept { return _data; } - constexpr size_type size() const noexcept { return _size; } - constexpr size_type size_bytes() const noexcept { return sizeof(T) * _size; } - constexpr bool empty() const noexcept { return _size == 0; } + [[nodiscard]] constexpr size_type size() const noexcept { return _size; } + [[nodiscard]] constexpr size_type size_bytes() const noexcept { return sizeof(T) * _size; } + [[nodiscard]] constexpr bool empty() const noexcept { return _size == 0; } /** * @brief Obtains a subspan consisting of the first N elements of the sequence @@ -98,8 +98,8 @@ class span_base { } private: - pointer _data; - size_type _size; + pointer _data{nullptr}; + size_type _size{0}; }; } // namespace detail @@ -251,7 +251,7 @@ class base_2dspan { constexpr auto data() const noexcept { return _data; } constexpr auto size() const noexcept { return _size; } constexpr auto count() const noexcept { return size().first * size().second; } - constexpr bool is_empty() const noexcept { return count() == 0; } + [[nodiscard]] constexpr bool is_empty() const noexcept { return count() == 0; } static constexpr size_t flatten_index(size_t row, size_t column, size_type size) noexcept { @@ -263,8 +263,11 @@ class base_2dspan { return {this->data() + flatten_index(row, 0, this->size()), this->size().second}; } - constexpr RowType front() const { return (*this)[0]; } - constexpr RowType back() const { return (*this)[size().first - 1]; } + [[nodiscard]] constexpr RowType front() const { return (*this)[0]; } + [[nodiscard]] constexpr RowType back() const + { + return (*this)[size().first - 1]; + } constexpr base_2dspan subspan(size_t first_row, size_t num_rows) const noexcept { diff --git a/cpp/include/cudf_test/cudf_gtest.hpp b/cpp/include/cudf_test/cudf_gtest.hpp index 87e4c94070b..d078bf90a8a 100644 --- a/cpp/include/cudf_test/cudf_gtest.hpp +++ b/cpp/include/cudf_test/cudf_gtest.hpp @@ -79,7 +79,7 @@ using Templates0 = Templates<>; template struct TypeList { - typedef Types type; + using type = Types; }; template diff --git 
a/cpp/include/cudf_test/file_utilities.hpp b/cpp/include/cudf_test/file_utilities.hpp index 8e242e5a4f3..6c21d8dfad2 100644 --- a/cpp/include/cudf_test/file_utilities.hpp +++ b/cpp/include/cudf_test/file_utilities.hpp @@ -58,5 +58,5 @@ class temp_directory { * * @return string path of the temporary directory */ - const std::string& path() const { return _path; } + [[nodiscard]] const std::string& path() const { return _path; } }; diff --git a/cpp/include/nvtext/detail/load_hash_file.hpp b/cpp/include/nvtext/detail/load_hash_file.hpp index b105c5c280e..9f4640f1daf 100644 --- a/cpp/include/nvtext/detail/load_hash_file.hpp +++ b/cpp/include/nvtext/detail/load_hash_file.hpp @@ -21,8 +21,8 @@ #include -#include -#include +#include +#include namespace nvtext { namespace detail { diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp index 2b09ec66203..43cc059eddd 100644 --- a/cpp/include/nvtext/subword_tokenize.hpp +++ b/cpp/include/nvtext/subword_tokenize.hpp @@ -19,8 +19,8 @@ #include #include -#include -#include +#include +#include namespace nvtext { diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py new file mode 100644 index 00000000000..3a1a663e231 --- /dev/null +++ b/cpp/scripts/run-clang-tidy.py @@ -0,0 +1,254 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function +import re +import os +import subprocess +import argparse +import json +import multiprocessing as mp +import shutil + + +EXPECTED_VERSION = "11.1.0" +VERSION_REGEX = re.compile(r" LLVM version ([0-9.]+)") +GPU_ARCH_REGEX = re.compile(r"sm_(\d+)") +SPACES = re.compile(r"\s+") +SEPARATOR = "-" * 16 + + +def parse_args(): + argparser = argparse.ArgumentParser("Runs clang-tidy on a project") + argparser.add_argument("-cdb", type=str, + # TODO This is a hack, needs to be fixed + default="cpp/build/cuda-11.5.0/clang-tidy/release/compile_commands.clangd.json", + help="Path to cmake-generated compilation database" + " file. It is always found inside the root of the " + "cmake build folder. 
So make sure that `cmake` has " + "been run once before running this script!") + argparser.add_argument("-exe", type=str, default="clang-tidy", + help="Path to clang-tidy exe") + argparser.add_argument("-ignore", type=str, default="[.]cu$|examples/kmeans/", + help="Regex used to ignore files from checking") + argparser.add_argument("-select", type=str, default=None, + help="Regex used to select files for checking") + argparser.add_argument("-j", type=int, default=-1, + help="Number of parallel jobs to launch.") + args = argparser.parse_args() + if args.j <= 0: + args.j = mp.cpu_count() + args.ignore_compiled = re.compile(args.ignore) if args.ignore else None + args.select_compiled = re.compile(args.select) if args.select else None + ret = subprocess.check_output("%s --version" % args.exe, shell=True) + ret = ret.decode("utf-8") + version = VERSION_REGEX.search(ret) + if version is None: + raise Exception("Failed to figure out clang-tidy version!") + version = version.group(1) + if version != EXPECTED_VERSION: + raise Exception("clang-tidy exe must be v%s found '%s'" % \ + (EXPECTED_VERSION, version)) + if not os.path.exists(args.cdb): + raise Exception("Compilation database '%s' missing" % args.cdb) + return args + + +def get_all_commands(cdb): + with open(cdb, "r") as fp: + return json.load(fp) + + +def get_gpu_archs(command): + archs = [] + for loc in range(len(command)): + if command[loc] != "-gencode": + continue + arch_flag = command[loc + 1] + match = GPU_ARCH_REGEX.search(arch_flag) + if match is not None: + archs.append("--cuda-gpu-arch=sm_%s" % match.group(1)) + return archs + + +def get_index(arr, item): + try: + return arr.index(item) + except: + return -1 + + +def remove_item(arr, item): + loc = get_index(arr, item) + if loc >= 0: + del arr[loc] + return loc + + +def remove_item_plus_one(arr, item): + loc = get_index(arr, item) + if loc >= 0: + del arr[loc + 1] + del arr[loc] + return loc + + +def get_clang_includes(exe): + dir = os.getenv("CONDA_PREFIX") + if dir is None: + ret = subprocess.check_output("which %s 2>&1" % exe, shell=True) + ret = ret.decode("utf-8") + dir = os.path.dirname(os.path.dirname(ret)) + header = os.path.join(dir, "include", "ClangHeaders") + return ["-I", header] + + +def get_tidy_args(cmd, exe): + command, file = cmd["command"], cmd["file"] + is_cuda = file.endswith(".cu") + command = re.split(SPACES, command) + # compiler is always clang++! + command[0] = "clang++" + # remove compilation and output targets from the original command + remove_item_plus_one(command, "-c") + remove_item_plus_one(command, "-o") + if is_cuda: + # replace nvcc's "-gencode ..." with clang's "--cuda-gpu-arch ..." 
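# (Illustrative aside, not part of the script: nvcc expresses GPU targets
# as two argv tokens per architecture, e.g.
#     -gencode arch=compute_70,code=sm_70
# while clang expects a single flag per architecture, e.g.
#     --cuda-gpu-arch=sm_70
# so get_gpu_archs(command) reads the sm_XX numbers out of those pairs,
# and the while loop that follows removes each "-gencode <spec>" token
# pair from the command line.)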
+ archs = get_gpu_archs(command) + command.extend(archs) + while True: + loc = remove_item_plus_one(command, "-gencode") + if loc < 0: + break + # "-x cuda" is the right usage in clang + loc = get_index(command, "-x") + if loc >= 0: + command[loc + 1] = "cuda" + remove_item_plus_one(command, "-ccbin") + remove_item(command, "--expt-extended-lambda") + remove_item(command, "--diag_suppress=unrecognized_gcc_pragma") + command.extend(get_clang_includes(exe)) + return command, is_cuda + + +def run_clang_tidy_command(tidy_cmd): + cmd = " ".join(tidy_cmd) + result = subprocess.run(cmd, check=False, shell=True, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + status = result.returncode == 0 + if status: + out = "" + else: + out = "CMD: " + cmd + out += result.stdout.decode("utf-8").rstrip() + return status, out + + +def run_clang_tidy(cmd, args): + command, is_cuda = get_tidy_args(cmd, args.exe) + tidy_cmd = [args.exe, + "-header-filter='.*cudf/cpp/(src|include|bench|comms).*'", + cmd["file"], "--", ] + tidy_cmd.extend(command) + status = True + out = "" + if is_cuda: + tidy_cmd.append("--cuda-device-only") + tidy_cmd.append(cmd["file"]) + ret, out1 = run_clang_tidy_command(tidy_cmd) + out += out1 + out += "%s" % SEPARATOR + if not ret: + status = ret + tidy_cmd[-2] = "--cuda-host-only" + ret, out1 = run_clang_tidy_command(tidy_cmd) + if not ret: + status = ret + out += out1 + else: + tidy_cmd.append(cmd["file"]) + ret, out1 = run_clang_tidy_command(tidy_cmd) + if not ret: + status = ret + out += out1 + return status, out, cmd["file"] + + +# yikes! global var :( +results = [] +def collect_result(result): + global results + results.append(result) + + +def print_result(passed, stdout, file): + status_str = "PASSED" if passed else "FAILED" + print("%s File:%s %s %s" % (SEPARATOR, file, status_str, SEPARATOR)) + if stdout: + print(stdout) + print("%s File:%s ENDS %s" % (SEPARATOR, file, SEPARATOR)) + + +def print_results(): + global results + status = True + for passed, stdout, file in results: + print_result(passed, stdout, file) + if not passed: + status = False + return status + + +def run_tidy_for_all_files(args, all_files): + pool = None if args.j == 1 else mp.Pool(args.j) + # actual tidy checker + for cmd in all_files: + # skip files that we don't want to look at + if args.ignore_compiled is not None and \ + re.search(args.ignore_compiled, cmd["file"]) is not None: + continue + if args.select_compiled is not None and \ + re.search(args.select_compiled, cmd["file"]) is None: + continue + if pool is not None: + pool.apply_async(run_clang_tidy, args=(cmd, args), + callback=collect_result) + else: + passed, stdout, file = run_clang_tidy(cmd, args) + collect_result((passed, stdout, file)) + if pool is not None: + pool.close() + pool.join() + return print_results() + + +def main(): + args = parse_args() + # Attempt to making sure that we run this script from root of repo always + if not os.path.exists(".git"): + raise Exception("This needs to always be run from the root of repo") + # Check whether clang-tidy exists + # print(args) + if "exe" not in args and shutil.which("clang-tidy") is not None: + print("clang-tidy not found. Exiting...") + return + all_files = get_all_commands(args.cdb) + status = run_tidy_for_all_files(args, all_files) + if not status: + raise Exception("clang-tidy failed! 
Refer to the errors above.") + + +if __name__ == "__main__": + main() diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 1d12fac1938..995c6702cf8 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -164,13 +164,13 @@ struct compare_functor { // This functor performs null aware binop between two columns or a column and a scalar by // iterating over them on the device struct null_considering_binop { - auto get_device_view(cudf::scalar const& scalar_item) const + [[nodiscard]] auto get_device_view(cudf::scalar const& scalar_item) const { return get_scalar_device_view( static_cast&>(const_cast(scalar_item))); } - auto get_device_view(column_device_view const& col_item) const { return col_item; } + [[nodiscard]] auto get_device_view(column_device_view const& col_item) const { return col_item; } template void populate_out_col(LhsViewT const& lhsv, diff --git a/cpp/src/binaryop/compiled/operation.cuh b/cpp/src/binaryop/compiled/operation.cuh index 313fc34567d..4b5f78dc400 100644 --- a/cpp/src/binaryop/compiled/operation.cuh +++ b/cpp/src/binaryop/compiled/operation.cuh @@ -179,8 +179,8 @@ struct PyMod { std::enable_if_t<(std::is_floating_point_v>)>* = nullptr> __device__ inline auto operator()(TypeLhs x, TypeRhs y) -> double { - double x1 = static_cast(x); - double y1 = static_cast(y); + auto x1 = static_cast(x); + auto y1 = static_cast(y); return fmod(fmod(x1, y1) + y1, y1); } diff --git a/cpp/src/binaryop/compiled/util.cpp b/cpp/src/binaryop/compiled/util.cpp index 146e53aae59..9481c236142 100644 --- a/cpp/src/binaryop/compiled/util.cpp +++ b/cpp/src/binaryop/compiled/util.cpp @@ -64,7 +64,7 @@ template struct is_binary_operation_supported { // For types where Out type is fixed. (eg. 
comparison types) template - inline constexpr bool operator()(void) + inline constexpr bool operator()() { if constexpr (column_device_view::has_element_accessor() and column_device_view::has_element_accessor()) { @@ -80,7 +80,7 @@ struct is_binary_operation_supported { } template - inline constexpr bool operator()(void) + inline constexpr bool operator()() { if constexpr (column_device_view::has_element_accessor() and column_device_view::has_element_accessor() and diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 34c0cea683e..3412733f0b2 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -113,7 +113,7 @@ __global__ void concatenate_masks_kernel(column_device_view const* views, thrust::upper_bound( thrust::seq, output_offsets, output_offsets + number_of_views, mask_index) - output_offsets - 1; - bool bit_is_set = 1; + bool bit_is_set = true; if (source_view_index < number_of_views) { size_type const column_element_index = mask_index - output_offsets[source_view_index]; bit_is_set = views[source_view_index].is_valid(column_element_index); diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index f8c0006ed45..a74b97da5a1 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1017,7 +1017,7 @@ std::vector contiguous_split(cudf::table_view const& input, rmm::device_buffer d_indices_and_source_info(indices_size + src_buf_info_size + offset_stack_size, stream, rmm::mr::get_current_device_resource()); - size_type* d_indices = reinterpret_cast(d_indices_and_source_info.data()); + auto* d_indices = reinterpret_cast(d_indices_and_source_info.data()); src_buf_info* d_src_buf_info = reinterpret_cast( reinterpret_cast(d_indices_and_source_info.data()) + indices_size); size_type* d_offset_stack = @@ -1198,8 +1198,8 @@ std::vector contiguous_split(cudf::table_view const& input, rmm::device_buffer d_src_and_dst_buffers(src_bufs_size + dst_bufs_size + offset_stack_size, stream, rmm::mr::get_current_device_resource()); - uint8_t const** d_src_bufs = reinterpret_cast(d_src_and_dst_buffers.data()); - uint8_t** d_dst_bufs = reinterpret_cast( + auto const** d_src_bufs = reinterpret_cast(d_src_and_dst_buffers.data()); + uint8_t** d_dst_bufs = reinterpret_cast( reinterpret_cast(d_src_and_dst_buffers.data()) + src_bufs_size); // setup src buffers diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp index 05330a7c492..fa3d19bdcfd 100644 --- a/cpp/src/groupby/sort/functors.hpp +++ b/cpp/src/groupby/sort/functors.hpp @@ -55,7 +55,7 @@ struct store_result_functor { /** * @brief Check if the groupby keys are presorted */ - bool is_presorted() const { return keys_are_sorted == sorted::YES; } + [[nodiscard]] bool is_presorted() const { return keys_are_sorted == sorted::YES; } /** * @brief Get the grouped values diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index 9ebb516ee14..50e3b812b62 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -49,7 +49,7 @@ struct var_transform { { if (d_values.is_null(i)) return 0.0; - ResultType x = static_cast(values_iter[i]); + auto x = static_cast(values_iter[i]); size_type group_idx = d_group_labels[i]; size_type group_size = d_group_sizes[group_idx]; diff --git a/cpp/src/groupby/sort/group_tdigest.cu b/cpp/src/groupby/sort/group_tdigest.cu index b7b45341ad2..f48ab852f24 100644 --- a/cpp/src/groupby/sort/group_tdigest.cu +++ 
b/cpp/src/groupby/sort/group_tdigest.cu @@ -327,7 +327,7 @@ __global__ void generate_cluster_limits_kernel(int delta, // compute the first cluster limit double nearest_w; int nearest_w_index; // group-relative index into the input stream - while (1) { + while (true) { cur_weight = next_limit < 0 ? 0 : max(cur_weight + 1, nearest_w); if (cur_weight >= total_weight) { break; } diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh index a3f954920c8..64ab69cd377 100644 --- a/cpp/src/hash/concurrent_unordered_map.cuh +++ b/cpp/src/hash/concurrent_unordered_map.cuh @@ -242,7 +242,7 @@ class concurrent_unordered_map { __host__ __device__ mapped_type get_unused_element() const { return m_unused_element; } - __host__ __device__ size_type capacity() const { return m_capacity; } + [[nodiscard]] __host__ __device__ size_type capacity() const { return m_capacity; } private: /** diff --git a/cpp/src/hash/concurrent_unordered_multimap.cuh b/cpp/src/hash/concurrent_unordered_multimap.cuh index 2b92c9142ca..cdf5b6a8649 100644 --- a/cpp/src/hash/concurrent_unordered_multimap.cuh +++ b/cpp/src/hash/concurrent_unordered_multimap.cuh @@ -503,7 +503,7 @@ class concurrent_unordered_multimap { if (count_collisions) m_collisions = 0; } - unsigned long long get_num_collisions() const { return m_collisions; } + [[nodiscard]] unsigned long long get_num_collisions() const { return m_collisions; } void print() { diff --git a/cpp/src/hash/hash_allocator.cuh b/cpp/src/hash/hash_allocator.cuh index 0c4acccf33d..db836917808 100644 --- a/cpp/src/hash/hash_allocator.cuh +++ b/cpp/src/hash/hash_allocator.cuh @@ -26,7 +26,7 @@ template struct managed_allocator { - typedef T value_type; + using value_type = T; rmm::mr::device_memory_resource* mr = new rmm::mr::managed_memory_resource; managed_allocator() = default; @@ -62,7 +62,7 @@ bool operator!=(const managed_allocator&, const managed_allocator&) template struct default_allocator { - typedef T value_type; + using value_type = T; rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); default_allocator() = default; diff --git a/cpp/src/hash/managed.cuh b/cpp/src/hash/managed.cuh index c6cc60a6917..c5aab78589e 100644 --- a/cpp/src/hash/managed.cuh +++ b/cpp/src/hash/managed.cuh @@ -22,7 +22,7 @@ struct managed { static void* operator new(size_t n) { - void* ptr = 0; + void* ptr = nullptr; cudaError_t result = cudaMallocManaged(&ptr, n); if (cudaSuccess != result || 0 == ptr) throw std::bad_alloc(); return ptr; diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index 0e0ce8c4335..f368ae9fab5 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -168,7 +168,7 @@ std::unique_ptr from_dlpack(DLManagedTensor const* managed_tensor, data_type const dtype = DLDataType_to_data_type(tensor.dtype); size_t const byte_width = size_of(dtype); - size_t const num_rows = static_cast(tensor.shape[0]); + auto const num_rows = static_cast(tensor.shape[0]); size_t const bytes = num_rows * byte_width; // For 2D tensors, if the strides pointer is not null, then strides[1] is the diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index 9d3db35fea6..c1fa10d19b7 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -15,7 +15,8 @@ */ #include "avro.h" -#include + +#include #include namespace cudf { @@ -75,7 +76,7 @@ bool container::parse(file_metadata* md, size_t max_num_rows, size_t first_row) sig4 |= get_raw() << 24; if (sig4 != avro_magic) { return false; } for 
(;;) { - uint32_t num_md_items = static_cast(get_encoded()); + auto num_md_items = static_cast(get_encoded()); if (num_md_items == 0) { break; } for (uint32_t i = 0; i < num_md_items; i++) { auto const key = get_encoded(); @@ -103,8 +104,8 @@ bool container::parse(file_metadata* md, size_t max_num_rows, size_t first_row) auto const block_size = static_cast(get_encoded()); if (block_size <= 0 || object_count <= 0 || m_cur + block_size + 16 > m_end) { break; } if (object_count > first_row) { - uint32_t block_row = static_cast(total_object_count); - max_block_size = std::max(max_block_size, block_size); + auto block_row = static_cast(total_object_count); + max_block_size = std::max(max_block_size, block_size); total_object_count += object_count; if (!md->block_list.size()) { md->skip_rows = static_cast(first_row); diff --git a/cpp/src/io/avro/avro.h b/cpp/src/io/avro/avro.h index f84693fdba3..3dd989ffa79 100644 --- a/cpp/src/io/avro/avro.h +++ b/cpp/src/io/avro/avro.h @@ -19,11 +19,11 @@ #include "avro_common.h" #include +#include +#include +#include +#include #include -#include -#include -#include -#include #include #include @@ -85,7 +85,7 @@ class schema_parser { bool parse(std::vector& schema, const std::string& str); protected: - bool more_data() const { return (m_cur < m_end); } + [[nodiscard]] bool more_data() const { return (m_cur < m_end); } std::string get_str(); protected: @@ -103,7 +103,7 @@ class container { { } - auto bytecount() const { return m_cur - m_base; } + [[nodiscard]] auto bytecount() const { return m_cur - m_base; } template T get_raw() diff --git a/cpp/src/io/avro/avro_common.h b/cpp/src/io/avro/avro_common.h index 17f12da3165..1df6d176e95 100644 --- a/cpp/src/io/avro/avro_common.h +++ b/cpp/src/io/avro/avro_common.h @@ -17,8 +17,9 @@ #pragma once #include -#include -#include + +#include +#include namespace cudf { namespace io { diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index cb1c32458a3..7985d5df345 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -120,7 +120,7 @@ avro_decode_row(schemadesc_s const* schema, if (dataptr != nullptr && row < max_rows) { static_cast(dataptr)[row] = v; } } else { // string or enum size_t count = 0; - const char* ptr = 0; + const char* ptr = nullptr; if (kind == type_enum) { // dictionary size_t idx = schema[i].count + v; if (idx < global_dictionary.size()) { diff --git a/cpp/src/io/comp/brotli_dict.cpp b/cpp/src/io/comp/brotli_dict.cpp index 3e6939bb816..ef0fab51be6 100644 --- a/cpp/src/io/comp/brotli_dict.cpp +++ b/cpp/src/io/comp/brotli_dict.cpp @@ -49,7 +49,8 @@ THE SOFTWARE. 
*/ #include "brotli_dict.h" -#include + +#include namespace cudf { namespace io { @@ -6528,7 +6529,7 @@ static const brotli_dictionary_s g_dictionary = { 136, 224, 164, 184, 224, 164, 149, 224, 165, 141, 224, 164, 176, 224, 164, 191, 224, 164, 175, 224, 164, 164, 224, 164, 190}}; -const brotli_dictionary_s* get_brotli_dictionary(void) { return &g_dictionary; } +const brotli_dictionary_s* get_brotli_dictionary() { return &g_dictionary; } } // namespace io } // namespace cudf diff --git a/cpp/src/io/comp/brotli_dict.h b/cpp/src/io/comp/brotli_dict.h index 4c1fec1492c..315fbd9712b 100644 --- a/cpp/src/io/comp/brotli_dict.h +++ b/cpp/src/io/comp/brotli_dict.h @@ -79,7 +79,7 @@ struct brotli_dictionary_s { constexpr int brotli_min_dictionary_word_length = 4; constexpr int brotli_max_dictionary_word_length = 24; -const brotli_dictionary_s* get_brotli_dictionary(void); +const brotli_dictionary_s* get_brotli_dictionary(); } // namespace io } // namespace cudf diff --git a/cpp/src/io/comp/brotli_tables.h b/cpp/src/io/comp/brotli_tables.h index 6e869999329..72a9b40bf95 100644 --- a/cpp/src/io/comp/brotli_tables.h +++ b/cpp/src/io/comp/brotli_tables.h @@ -2149,14 +2149,14 @@ CONSTANT uint8_t kContextLookup[2048] = { 7, }; -typedef struct CmdLutElement { +using CmdLutElement = struct CmdLutElement { uint8_t insert_len_extra_bits; uint8_t copy_len_extra_bits; int8_t distance_code; uint8_t context; uint16_t insert_len_offset; uint16_t copy_len_offset; -} CmdLutElement; +}; CONSTANT CmdLutElement kCmdLut[brotli_num_command_symbols] = { {0x00, 0x00, 0, 0x00, 0x0000, 0x0002}, {0x00, 0x00, 0, 0x01, 0x0000, 0x0003}, diff --git a/cpp/src/io/comp/cpu_unbz2.cpp b/cpp/src/io/comp/cpu_unbz2.cpp index 7f37b62e9c2..113623a2e67 100644 --- a/cpp/src/io/comp/cpu_unbz2.cpp +++ b/cpp/src/io/comp/cpu_unbz2.cpp @@ -81,8 +81,9 @@ For more information on these sources, see the manual. 
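The brotli and bzip2 hunks in this stretch also fold in modernize-use-using: C-style typedef struct { ... } T; becomes using T = struct { ... };, and empty C-style parameter lists drop the redundant void. A self-contained sketch with hypothetical names, not taken from the patch:

// Before: typedef struct { unsigned char hi, lo; } byte_pair;
using byte_pair = struct {
  unsigned char hi;
  unsigned char lo;
};

// Before: const byte_pair* get_table(void);
const byte_pair* get_table();

const byte_pair* get_table()
{
  static byte_pair const table{0x12, 0x34};
  return &table;
}

int main() { return get_table()->hi == 0x12 ? 0 : 1; }
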
#include "io_uncomp.h" #include "unbz2.h" -#include -#include + +#include +#include #include namespace cudf { @@ -111,15 +112,15 @@ namespace io { #define BZ_MAX_SELECTORS (2 + (900000 / BZ_G_SIZE)) -typedef struct { +using huff_s = struct { int32_t minLen; int32_t limit[BZ_MAX_CODE_LEN]; int32_t base[BZ_MAX_CODE_LEN]; uint16_t perm[BZ_MAX_ALPHA_SIZE]; -} huff_s; +}; // Decoder state -typedef struct { +using unbz_state_s = struct { // Input const uint8_t* cur; const uint8_t* end; @@ -153,7 +154,7 @@ typedef struct { uint8_t len[BZ_MAX_ALPHA_SIZE]; huff_s ht[BZ_N_GROUPS]; -} unbz_state_s; +}; // return next 32 bits static inline uint32_t next32bits(const unbz_state_s* s) @@ -530,7 +531,8 @@ int32_t cpu_bz2_uncompress( int ret; size_t last_valid_block_in, last_valid_block_out; - if (dest == NULL || destLen == NULL || source == NULL || sourceLen < 12) return BZ_PARAM_ERROR; + if (dest == nullptr || destLen == nullptr || source == nullptr || sourceLen < 12) + return BZ_PARAM_ERROR; s.currBlockNo = 0; s.cur = source; diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu index 8229245276b..b4a42a66133 100644 --- a/cpp/src/io/comp/debrotli.cu +++ b/cpp/src/io/comp/debrotli.cu @@ -201,8 +201,8 @@ inline __device__ uint32_t Log2Floor(uint32_t value) { return 32 - __clz(value); /// @brief initializes the bit reader __device__ void initbits(debrotli_state_s* s, const uint8_t* base, size_t len, size_t pos = 0) { - const uint8_t* p = base + pos; - uint32_t prefix_bytes = (uint32_t)(((size_t)p) & 3); + const uint8_t* p = base + pos; + auto prefix_bytes = (uint32_t)(((size_t)p) & 3); p -= prefix_bytes; s->base = base; s->end = base + len; @@ -248,7 +248,7 @@ inline __device__ uint32_t getbits(debrotli_state_s* s, uint32_t n) inline __device__ uint32_t getbits_bytealign(debrotli_state_s* s) { - uint32_t n = (uint32_t)((-(int32_t)s->bitpos) & 7); + auto n = (uint32_t)((-(int32_t)s->bitpos) & 7); uint32_t bits = showbits(s, n); skipbits(s, n); return bits; @@ -315,7 +315,7 @@ static __device__ uint8_t* local_alloc(debrotli_state_s* s, uint32_t bytes) int heap_used = s->heap_used; auto const len = allocation_size(bytes); if (heap_used + len <= s->heap_limit) { - uint8_t* ptr = reinterpret_cast(&s->heap[heap_used]); + auto* ptr = reinterpret_cast(&s->heap[heap_used]); s->heap_used = (uint16_t)(heap_used + len); return ptr; } else { @@ -351,9 +351,9 @@ static __device__ uint8_t* ext_heap_alloc(uint32_t bytes, uint8_t* ext_heap_base, uint32_t ext_heap_size) { - uint32_t len = (bytes + 0xf) & ~0xf; - volatile uint32_t* heap_ptr = reinterpret_cast(ext_heap_base); - uint32_t first_free_block = ~0; + uint32_t len = (bytes + 0xf) & ~0xf; + volatile auto* heap_ptr = reinterpret_cast(ext_heap_base); + uint32_t first_free_block = ~0; for (;;) { uint32_t blk_next, blk_prev; first_free_block = atomicExch((unsigned int*)heap_ptr, first_free_block); @@ -421,10 +421,10 @@ static __device__ void ext_heap_free(void* ptr, uint8_t* ext_heap_base, uint32_t ext_heap_size) { - uint32_t len = (bytes + 0xf) & ~0xf; - volatile uint32_t* heap_ptr = (volatile uint32_t*)ext_heap_base; - uint32_t first_free_block = ~0; - uint32_t cur_blk = static_cast(static_cast(ptr) - ext_heap_base); + uint32_t len = (bytes + 0xf) & ~0xf; + volatile auto* heap_ptr = (volatile uint32_t*)ext_heap_base; + uint32_t first_free_block = ~0; + auto cur_blk = static_cast(static_cast(ptr) - ext_heap_base); for (;;) { first_free_block = atomicExch((unsigned int*)heap_ptr, first_free_block); if (first_free_block != ~0) { break; } @@ -1299,7 
+1299,7 @@ static __device__ void InverseMoveToFrontTransform(debrotli_state_s* s, uint8_t* uint32_t i = 1; uint32_t upper_bound = s->mtf_upper_bound; uint32_t* mtf = &s->mtf[1]; // Make mtf[-1] addressable. - uint8_t* mtf_u8 = reinterpret_cast(mtf); + auto* mtf_u8 = reinterpret_cast(mtf); uint32_t pattern = 0x03020100; // Little-endian // Initialize list using 4 consequent values pattern. @@ -1419,12 +1419,12 @@ static __device__ debrotli_huff_tree_group_s* HuffmanTreeGroupInit(debrotli_stat uint32_t max_symbol, uint32_t ntrees) { - debrotli_huff_tree_group_s* group = reinterpret_cast(local_alloc( + auto* group = reinterpret_cast(local_alloc( s, sizeof(debrotli_huff_tree_group_s) + ntrees * sizeof(uint16_t*) - sizeof(uint16_t*))); - group->alphabet_size = (uint16_t)alphabet_size; - group->max_symbol = (uint16_t)max_symbol; - group->num_htrees = (uint16_t)ntrees; - group->htrees[0] = nullptr; + group->alphabet_size = (uint16_t)alphabet_size; + group->max_symbol = (uint16_t)max_symbol; + group->num_htrees = (uint16_t)ntrees; + group->htrees[0] = nullptr; return group; } @@ -1640,7 +1640,7 @@ static __device__ void ProcessCommands(debrotli_state_s* s, const brotli_diction const uint8_t *context_map_slice, *dist_context_map_slice; int dist_rb_idx; uint32_t blen_L, blen_I, blen_D; - uint8_t* const dict_scratch = reinterpret_cast( + auto* const dict_scratch = reinterpret_cast( &s->hs); // 24+13 bytes (max length of a dictionary word including prefix & suffix) int context_mode; @@ -1808,7 +1808,7 @@ static __device__ void ProcessCommands(debrotli_state_s* s, const brotli_diction pos = meta_block_len; copy_length = 0; } else { - int32_t offset = (int32_t)words->offsets_by_length[copy_length]; + auto offset = (int32_t)words->offsets_by_length[copy_length]; uint32_t shift = words->size_bits_by_length[copy_length]; uint32_t address = distance_code - max_distance - 1; int32_t word_idx = address & ((1 << shift) - 1); @@ -1927,8 +1927,8 @@ extern "C" __global__ void __launch_bounds__(block_size, 2) if (z >= count) { return; } // Thread0: initializes shared state and decode stream header if (!t) { - uint8_t const* src = static_cast(inputs[z].srcDevice); - size_t src_size = inputs[z].srcSize; + auto const* src = static_cast(inputs[z].srcDevice); + size_t src_size = inputs[z].srcSize; if (src && src_size >= 8) { s->error = 0; s->out = s->outbase = static_cast(inputs[z].dstDevice); @@ -2084,7 +2084,7 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs, { uint32_t count32 = (count > 0) ? count : 0; uint32_t fb_heap_size; - uint8_t* scratch_u8 = static_cast(scratch); + auto* scratch_u8 = static_cast(scratch); dim3 dim_block(block_size, 1); dim3 dim_grid(count32, 1); // TODO: Check max grid dimensions vs max expected count diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index dab8ce1afa5..508e960430d 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -926,8 +926,8 @@ __device__ void copy_stored(inflate_state_s* s, int t) __syncthreads(); if (t == 0) { // Reset bitstream to end of block - uint8_t* p = cur + len; - uint32_t prefix_bytes = (uint32_t)(((size_t)p) & 3); + uint8_t* p = cur + len; + auto prefix_bytes = (uint32_t)(((size_t)p) & 3); p -= prefix_bytes; s->cur = p; s->bitbuf.x = (p < s->end) ? *reinterpret_cast(p) : 0; @@ -952,7 +952,7 @@ __device__ void prefetch_warp(volatile inflate_state_s* s, int t) const uint8_t* cur_p = s->pref.cur_p; const uint8_t* end = s->end; while (shuffle((t == 0) ? 
s->pref.run : 0)) { - int32_t cur_lo = (int32_t)(size_t)cur_p; + auto cur_lo = (int32_t)(size_t)cur_p; int do_pref = shuffle((t == 0) ? (cur_lo - *(volatile int32_t*)&s->cur < prefetch_size - 32 * 4 - 4) : 0); if (do_pref) { @@ -1035,7 +1035,7 @@ __global__ void __launch_bounds__(block_size) inflate_state_s* state = &state_g; if (!t) { - uint8_t* p = const_cast(static_cast(inputs[z].srcDevice)); + auto* p = const_cast(static_cast(inputs[z].srcDevice)); size_t src_size = inputs[z].srcSize; uint32_t prefix_bytes; // Parse header if needed @@ -1181,8 +1181,8 @@ __global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_inp src_align_bytes = (uint32_t)(3 & reinterpret_cast(src)); src_align_bits = src_align_bytes << 3; while (len >= 32) { - const uint32_t* src32 = reinterpret_cast(src - src_align_bytes); - uint32_t copy_cnt = min(len >> 2, 1024); + const auto* src32 = reinterpret_cast(src - src_align_bytes); + uint32_t copy_cnt = min(len >> 2, 1024); if (t < copy_cnt) { uint32_t v = src32[t]; if (src_align_bits != 0) { v = __funnelshift_r(v, src32[t + 1], src_align_bits); } diff --git a/cpp/src/io/comp/gpuinflate.h b/cpp/src/io/comp/gpuinflate.h index 3ca9c9eee10..29856bcd3f3 100644 --- a/cpp/src/io/comp/gpuinflate.h +++ b/cpp/src/io/comp/gpuinflate.h @@ -16,7 +16,7 @@ #pragma once -#include +#include #include diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu index d55c06a7d96..9f0a610f8f7 100644 --- a/cpp/src/io/comp/snap.cu +++ b/cpp/src/io/comp/snap.cu @@ -55,9 +55,9 @@ static inline __device__ uint32_t snap_hash(uint32_t v) */ static inline __device__ uint32_t fetch4(const uint8_t* src) { - uint32_t src_align = 3 & reinterpret_cast(src); - const uint32_t* src32 = reinterpret_cast(src - src_align); - uint32_t v = src32[0]; + uint32_t src_align = 3 & reinterpret_cast(src); + const auto* src32 = reinterpret_cast(src - src_align); + uint32_t v = src32[0]; return (src_align) ? 
__funnelshift_r(v, src32[1], src_align * 8) : v; } @@ -268,15 +268,15 @@ __global__ void __launch_bounds__(128) const uint8_t* src; if (!t) { - const uint8_t* src = static_cast(inputs[blockIdx.x].srcDevice); - uint32_t src_len = static_cast(inputs[blockIdx.x].srcSize); - uint8_t* dst = static_cast(inputs[blockIdx.x].dstDevice); - uint32_t dst_len = static_cast(inputs[blockIdx.x].dstSize); - uint8_t* end = dst + dst_len; - s->src = src; - s->src_len = src_len; - s->dst_base = dst; - s->end = end; + const auto* src = static_cast(inputs[blockIdx.x].srcDevice); + auto src_len = static_cast(inputs[blockIdx.x].srcSize); + auto* dst = static_cast(inputs[blockIdx.x].dstDevice); + auto dst_len = static_cast(inputs[blockIdx.x].dstSize); + uint8_t* end = dst + dst_len; + s->src = src; + s->src_len = src_len; + s->dst_base = dst; + s->end = end; while (src_len > 0x7f) { if (dst < end) { dst[0] = src_len | 0x80; } dst++; diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 94721fb9ce1..66d73074af0 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -22,7 +22,7 @@ #include -#include // memset +#include // memset #include // uncompress @@ -196,17 +196,16 @@ bool OpenZipArchive(zip_archive_s* dst, const uint8_t* raw, size_t len) for (ptrdiff_t i = len - sizeof(zip_eocd_s) - 2; i + sizeof(zip_eocd_s) + 2 + 0xffff >= len && i >= 0; i--) { - const zip_eocd_s* eocd = reinterpret_cast(raw + i); + const auto* eocd = reinterpret_cast(raw + i); if (eocd->sig == 0x06054b50 && eocd->disk_id == eocd->start_disk // multi-file archives not supported && eocd->num_entries == eocd->total_entries && eocd->cdir_size >= sizeof(zip_cdfh_s) * eocd->num_entries && eocd->cdir_offset < len && i + *reinterpret_cast(eocd + 1) <= static_cast(len)) { - const zip_cdfh_s* cdfh = reinterpret_cast(raw + eocd->cdir_offset); - dst->eocd = eocd; + const auto* cdfh = reinterpret_cast(raw + eocd->cdir_offset); + dst->eocd = eocd; if (i >= static_cast(sizeof(zip64_eocdl))) { - const zip64_eocdl* eocdl = - reinterpret_cast(raw + i - sizeof(zip64_eocdl)); + const auto* eocdl = reinterpret_cast(raw + i - sizeof(zip64_eocdl)); if (eocdl->sig == 0x07064b50) { dst->eocdl = eocdl; } } // Start of central directory diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index bdd9ddaf1ea..791a16bc912 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -87,10 +87,10 @@ inline __device__ volatile uint8_t& byte_access(unsnap_state_s* s, uint32_t pos) */ __device__ void snappy_prefetch_bytestream(unsnap_state_s* s, int t) { - const uint8_t* base = s->base; - uint32_t end = (uint32_t)(s->end - base); - uint32_t align_bytes = (uint32_t)(0x20 - (0x1f & reinterpret_cast(base))); - int32_t pos = min(align_bytes, end); + const uint8_t* base = s->base; + auto end = (uint32_t)(s->end - base); + auto align_bytes = (uint32_t)(0x20 - (0x1f & reinterpret_cast(base))); + int32_t pos = min(align_bytes, end); int32_t blen; // Start by prefetching up to the next a 32B-aligned location if (t < pos) { s->q.buf[t] = base[t]; } @@ -278,7 +278,7 @@ inline __device__ uint32_t get_len5_mask(uint32_t v0, uint32_t v1) __device__ void snappy_decode_symbols(unsnap_state_s* s, uint32_t t) { uint32_t cur = 0; - uint32_t end = static_cast(s->end - s->base); + auto end = static_cast(s->end - s->base); uint32_t bytes_left = s->uncompressed_size; uint32_t dst_pos = 0; int32_t batch = 0; @@ -498,7 +498,7 @@ template __device__ void snappy_process_symbols(unsnap_state_s* s, int t, Storage& temp_storage) { const 
uint8_t* literal_base = s->base; - uint8_t* out = static_cast(s->in.dstDevice); + auto* out = static_cast(s->in.dstDevice); int batch = 0; do { @@ -610,7 +610,7 @@ __device__ void snappy_process_symbols(unsnap_state_s* s, int t, Storage& temp_s __syncwarp(); if (t == 0) { s->q.batch_len[batch] = 0; } batch = (batch + 1) & (batch_count - 1); - } while (1); + } while (true); } /** @@ -639,7 +639,7 @@ __global__ void __launch_bounds__(block_size) if (t < batch_count) { s->q.batch_len[t] = 0; } __syncthreads(); if (!t) { - const uint8_t* cur = static_cast(s->in.srcDevice); + const auto* cur = static_cast(s->in.srcDevice); const uint8_t* end = cur + s->in.srcSize; s->error = 0; if (log_cyclecount) { s->tstart = clock(); } diff --git a/cpp/src/io/csv/csv_gpu.h b/cpp/src/io/csv/csv_gpu.h index 9b83028fa92..ec45dea3072 100644 --- a/cpp/src/io/csv/csv_gpu.h +++ b/cpp/src/io/csv/csv_gpu.h @@ -48,8 +48,8 @@ constexpr uint32_t rowofs_block_bytes = rowofs_block_dim * 32; // 16KB/threadbl * Format: row_count * 4 + id, where `row_count` is the number of rows * in a character block, and `id` is the row parser state at the end of the block. */ -typedef uint32_t rowctx32_t; -typedef uint64_t rowctx64_t; +using rowctx32_t = uint32_t; +using rowctx64_t = uint64_t; /** * Packed row context format @@ -61,7 +61,7 @@ typedef uint64_t rowctx64_t; * always zero (EOF input state implies a zero row count) and therefore * stored as 64-bit. */ -typedef uint64_t packed_rowctx_t; +using packed_rowctx_t = uint64_t; /** * @brief return a row context from a {count, id} pair @@ -116,7 +116,7 @@ inline __host__ __device__ rowctx32_t get_row_context(packed_rowctx_t packed_ctx inline __host__ __device__ rowctx64_t select_row_context(rowctx64_t sel_ctx, packed_rowctx_t packed_ctx) { - uint32_t ctxid = static_cast(sel_ctx & 3); + auto ctxid = static_cast(sel_ctx & 3); rowctx32_t ctx = get_row_context(packed_ctx, ctxid); return (sel_ctx & ~3) + ctx; } diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 1b66df860a3..1517226952a 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -126,7 +126,7 @@ struct column_to_strings_fn { // fails to compile var-templs); // template - constexpr static bool is_not_handled(void) + constexpr static bool is_not_handled() { // Note: the case (not std::is_same_v) // is already covered by is_integral) diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp index 01418fd3bd6..416beaebe5d 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.hpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp @@ -47,17 +47,17 @@ class aggregate_orc_metadata { /** * @brief Sums up the number of rows of each source */ - size_type calc_num_rows() const; + [[nodiscard]] size_type calc_num_rows() const; /** * @brief Number of columns in a ORC file. 
*/ - size_type calc_num_cols() const; + [[nodiscard]] size_type calc_num_cols() const; /** * @brief Sums up the number of stripes of each source */ - size_type calc_num_stripes() const; + [[nodiscard]] size_type calc_num_stripes() const; public: std::vector per_file_metadata; @@ -67,26 +67,29 @@ class aggregate_orc_metadata { aggregate_orc_metadata(std::vector> const& sources); - auto const& get_schema(int schema_idx) const { return per_file_metadata[0].ff.types[schema_idx]; } + [[nodiscard]] auto const& get_schema(int schema_idx) const + { + return per_file_metadata[0].ff.types[schema_idx]; + } auto get_col_type(int col_idx) const { return per_file_metadata[0].ff.types[col_idx]; } - auto get_num_rows() const { return num_rows; } + [[nodiscard]] auto get_num_rows() const { return num_rows; } auto get_num_cols() const { return per_file_metadata[0].get_num_columns(); } - auto get_num_stripes() const { return num_stripes; } + [[nodiscard]] auto get_num_stripes() const { return num_stripes; } - auto const& get_types() const { return per_file_metadata[0].ff.types; } + [[nodiscard]] auto const& get_types() const { return per_file_metadata[0].ff.types; } - int get_row_index_stride() const { return per_file_metadata[0].ff.rowIndexStride; } + [[nodiscard]] int get_row_index_stride() const { return per_file_metadata[0].ff.rowIndexStride; } - auto is_row_grp_idx_present() const { return row_grp_idx_present; } + [[nodiscard]] auto is_row_grp_idx_present() const { return row_grp_idx_present; } /** * @brief Returns the name of the given column from the given source. */ - std::string const& column_name(const int source_idx, const int column_id) const + [[nodiscard]] std::string const& column_name(const int source_idx, const int column_id) const { CUDF_EXPECTS(source_idx <= static_cast(per_file_metadata.size()), "Out of range source_idx provided"); @@ -98,7 +101,7 @@ class aggregate_orc_metadata { * * Full name includes ancestor columns' names. */ - std::string const& column_path(const int source_idx, const int column_id) const + [[nodiscard]] std::string const& column_path(const int source_idx, const int column_id) const { CUDF_EXPECTS(source_idx <= static_cast(per_file_metadata.size()), "Out of range source_idx provided"); diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h index 4fa3480c90a..311f18bf72e 100644 --- a/cpp/src/io/orc/orc.h +++ b/cpp/src/io/orc/orc.h @@ -25,10 +25,10 @@ #include #include +#include +#include #include #include -#include -#include #include #include @@ -87,7 +87,7 @@ struct Stream { // Returns index of the column in the table, if any // Stream of the 'column 0' does not have a corresponding column in the table - std::optional column_index() const noexcept + [[nodiscard]] std::optional column_index() const noexcept { return column_id.value_or(0) > 0 ? std::optional{*column_id - 1} : std::optional{}; @@ -540,14 +540,14 @@ class OrcDecompressor { public: OrcDecompressor(CompressionKind kind, uint32_t blockSize); const uint8_t* Decompress(const uint8_t* srcBytes, size_t srcLen, size_t* dstLen); - uint32_t GetLog2MaxCompressionRatio() const { return m_log2MaxRatio; } - uint32_t GetMaxUncompressedBlockSize(uint32_t block_len) const + [[nodiscard]] uint32_t GetLog2MaxCompressionRatio() const { return m_log2MaxRatio; } + [[nodiscard]] uint32_t GetMaxUncompressedBlockSize(uint32_t block_len) const { return (block_len < (m_blockSize >> m_log2MaxRatio)) ? 
block_len << m_log2MaxRatio : m_blockSize; } - CompressionKind GetKind() const { return m_kind; } - uint32_t GetBlockSize() const { return m_blockSize; } + [[nodiscard]] CompressionKind GetKind() const { return m_kind; } + [[nodiscard]] uint32_t GetBlockSize() const { return m_blockSize; } protected: CompressionKind const m_kind; @@ -603,16 +603,16 @@ class metadata { public: explicit metadata(datasource* const src); - size_t get_total_rows() const { return ff.numberOfRows; } - int get_num_stripes() const { return ff.stripes.size(); } - int get_num_columns() const { return ff.types.size(); } + [[nodiscard]] size_t get_total_rows() const { return ff.numberOfRows; } + [[nodiscard]] int get_num_stripes() const { return ff.stripes.size(); } + [[nodiscard]] int get_num_columns() const { return ff.types.size(); } /** * @brief Returns the name of the column with the given ID. * * Name might not be unique in the ORC file, since columns with different parents are allowed to * have the same names. */ - std::string const& column_name(size_type column_id) const + [[nodiscard]] std::string const& column_name(size_type column_id) const { CUDF_EXPECTS(column_id < get_num_columns(), "Out of range column id provided"); return column_names[column_id]; @@ -623,22 +623,25 @@ class metadata { * * Each column in the ORC file has a unique path. */ - std::string const& column_path(size_type column_id) const + [[nodiscard]] std::string const& column_path(size_type column_id) const { CUDF_EXPECTS(column_id < get_num_columns(), "Out of range column id provided"); return column_paths[column_id]; } - int get_row_index_stride() const { return ff.rowIndexStride; } + [[nodiscard]] int get_row_index_stride() const { return ff.rowIndexStride; } /** * @brief Returns the ID of the parent column of the given column. */ - size_type parent_id(size_type column_id) const { return parents.at(column_id).value().id; } + [[nodiscard]] size_type parent_id(size_type column_id) const + { + return parents.at(column_id).value().id; + } /** * @brief Returns the index the given column has in its parent's children list. */ - size_type field_index(size_type column_id) const + [[nodiscard]] size_type field_index(size_type column_id) const { return parents.at(column_id).value().field_idx; } @@ -646,7 +649,7 @@ class metadata { /** * @brief Returns whether the given column has a parent. 
*/ - size_type column_has_parent(size_type column_id) const + [[nodiscard]] size_type column_has_parent(size_type column_id) const { return parents.at(column_id).has_value(); } @@ -693,7 +696,7 @@ struct orc_column_device_view : public column_device_view { struct rowgroup_rows { size_type begin; size_type end; - constexpr auto size() const noexcept { return end - begin; } + [[nodiscard]] constexpr auto size() const noexcept { return end - begin; } }; } // namespace orc diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 21c52f9295b..817b9fd7b01 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -371,7 +371,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( size_t decomp_offset = 0; uint32_t max_uncomp_block_size = 0; uint32_t start_pos = 0; - uint32_t start_pos_uncomp = (uint32_t)num_compressed_blocks; + auto start_pos_uncomp = (uint32_t)num_compressed_blocks; for (size_t i = 0; i < compinfo.size(); ++i) { auto dst_base = static_cast(decomp_data.data()); compinfo[i].uncompressed_data = dst_base + decomp_offset; diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 05bc25597c2..dc09b3e7dd8 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1179,7 +1179,7 @@ __global__ void __launch_bounds__(block_size) row_in = s->chunk.start_row + s->top.nulls_desc_row - prev_parent_null_count; if (row_in + nrows > first_row && row_in < first_row + max_num_rows && - s->chunk.valid_map_base != NULL) { + s->chunk.valid_map_base != nullptr) { int64_t dst_row = row_in - first_row; int64_t dst_pos = max(dst_row, (int64_t)0); uint32_t startbit = -static_cast(min(dst_row, (int64_t)0)); @@ -1325,14 +1325,14 @@ static __device__ void DecodeRowPositions(orcdec_state_s* s, s->top.data.cur_row + s->top.data.nrows < s->top.data.end_row) { uint32_t nrows = min(s->top.data.end_row - (s->top.data.cur_row + s->top.data.nrows), min((row_decoder_buffer_size - s->u.rowdec.nz_count) * 2, blockDim.x)); - if (s->chunk.valid_map_base != NULL) { + if (s->chunk.valid_map_base != nullptr) { // We have a present stream uint32_t rmax = s->top.data.end_row - min((uint32_t)first_row, s->top.data.end_row); - uint32_t r = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row); + auto r = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row); uint32_t valid = (t < nrows && r < rmax) ? (((const uint8_t*)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1 : 0; - volatile uint16_t* row_ofs_plus1 = (volatile uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count]; + volatile auto* row_ofs_plus1 = (volatile uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count]; uint32_t nz_pos, row_plus1, nz_count = s->u.rowdec.nz_count, last_row; if (t < nrows) { row_ofs_plus1[t] = valid; } lengths_to_positions(row_ofs_plus1, nrows, t); diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 660ec025d00..02ae191d55a 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1040,7 +1040,7 @@ __global__ void __launch_bounds__(block_size) uint32_t string_idx = (t < numvals) ? 
dict_data[s->cur_row + t] : 0; if (cid == CI_DICTIONARY) { // Encoding string contents - const char* ptr = 0; + const char* ptr = nullptr; uint32_t count = 0; if (t < numvals) { auto string_val = string_column->element(string_idx); diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index b197751d925..276a1f49abf 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -428,7 +428,7 @@ extern "C" __global__ void __launch_bounds__(128, 8) uint32_t rowgroups_in_chunk = s->chunk.num_rowgroups; s->rowgroup_start = s->chunk.rowgroup_id; s->rowgroup_end = s->rowgroup_start + rowgroups_in_chunk; - s->is_compressed = (strm_info != NULL); + s->is_compressed = (strm_info != nullptr); } __syncthreads(); while (s->rowgroup_start < s->rowgroup_end) { @@ -480,7 +480,7 @@ __global__ void __launch_bounds__(block_size) device_2dspan rowgroup_bounds, device_2dspan set_counts) { - typedef cub::BlockReduce BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; auto const column_id = blockIdx.x; diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index 3a1e8bf898a..810dfe87320 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -70,8 +70,8 @@ struct timezone_file { std::vector ttype; std::vector posix_tz_string; - auto timecnt() const { return header.timecnt; } - auto typecnt() const { return header.typecnt; } + [[nodiscard]] auto timecnt() const { return header.timecnt; } + [[nodiscard]] auto typecnt() const { return header.typecnt; } // Based on https://tools.ietf.org/id/draft-murchison-tzdist-tzif-00.html static constexpr auto leap_second_rec_size(bool is_64bit) noexcept @@ -222,7 +222,7 @@ class posix_parser { /** * @brief Returns the next character in the input. 
*/ - char next_character() const { return *cur; } + [[nodiscard]] char next_character() const { return *cur; } private: typename Container::const_iterator cur; diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh index b25dfd0a621..a14d94df540 100644 --- a/cpp/src/io/orc/timezone.cuh +++ b/cpp/src/io/orc/timezone.cuh @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include @@ -121,7 +121,7 @@ class timezone_table { : gmt_offset{gmt_offset}, ttimes{std::move(ttimes)}, offsets{std::move(offsets)} { } - timezone_table_view view() const { return {gmt_offset, ttimes, offsets}; } + [[nodiscard]] timezone_table_view view() const { return {gmt_offset, ttimes, offsets}; } }; /** diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index b7264cb81ac..105c473c15e 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -181,7 +181,7 @@ class orc_column_view { auto is_string() const noexcept { return cudf_column.type().id() == type_id::STRING; } void set_dict_stride(size_t stride) noexcept { _dict_stride = stride; } - auto dict_stride() const noexcept { return _dict_stride; } + [[nodiscard]] auto dict_stride() const noexcept { return _dict_stride; } /** * @brief Function that associates an existing dictionary chunk allocation @@ -192,14 +192,14 @@ class orc_column_view { dict = host_dict; d_dict = dev_dict; } - auto host_dict_chunk(size_t rowgroup) const + [[nodiscard]] auto host_dict_chunk(size_t rowgroup) const { CUDF_EXPECTS(is_string(), "Dictionary chunks are only present in string columns."); return &dict[rowgroup * _dict_stride + _str_idx]; } - auto device_dict_chunk() const { return d_dict; } + [[nodiscard]] auto device_dict_chunk() const { return d_dict; } - auto const& decimal_offsets() const { return d_decimal_offsets; } + [[nodiscard]] auto const& decimal_offsets() const { return d_decimal_offsets; } void attach_decimal_offsets(uint32_t* sizes_ptr) { d_decimal_offsets = sizes_ptr; } /** @@ -211,39 +211,39 @@ class orc_column_view { stripe_dict = host_stripe_dict; d_stripe_dict = dev_stripe_dict; } - auto host_stripe_dict(size_t stripe) const + [[nodiscard]] auto host_stripe_dict(size_t stripe) const { CUDF_EXPECTS(is_string(), "Stripe dictionary is only present in string columns."); return &stripe_dict[stripe * _dict_stride + _str_idx]; } - auto device_stripe_dict() const noexcept { return d_stripe_dict; } + [[nodiscard]] auto device_stripe_dict() const noexcept { return d_stripe_dict; } // Index in the table - uint32_t index() const noexcept { return _index; } + [[nodiscard]] uint32_t index() const noexcept { return _index; } // Id in the ORC file - auto id() const noexcept { return _index + 1; } + [[nodiscard]] auto id() const noexcept { return _index + 1; } - auto is_child() const noexcept { return _is_child; } + [[nodiscard]] auto is_child() const noexcept { return _is_child; } auto parent_index() const noexcept { return _parent_index.value(); } auto child_begin() const noexcept { return children.cbegin(); } auto child_end() const noexcept { return children.cend(); } auto num_children() const noexcept { return children.size(); } - auto type_width() const noexcept { return _type_width; } + [[nodiscard]] auto type_width() const noexcept { return _type_width; } auto size() const noexcept { return cudf_column.size(); } auto null_count() const noexcept { return cudf_column.null_count(); } auto null_mask() const noexcept { return cudf_column.null_mask(); } - bool nullable() const noexcept { return null_mask() != 
nullptr; } + [[nodiscard]] bool nullable() const noexcept { return null_mask() != nullptr; } auto user_defined_nullable() const noexcept { return nullable_from_metadata; } - auto scale() const noexcept { return _scale; } - auto precision() const noexcept { return _precision; } + [[nodiscard]] auto scale() const noexcept { return _scale; } + [[nodiscard]] auto precision() const noexcept { return _precision; } void set_orc_encoding(ColumnEncodingKind e) noexcept { _encoding_kind = e; } - auto orc_kind() const noexcept { return _type_kind; } - auto orc_encoding() const noexcept { return _encoding_kind; } - std::string_view orc_name() const noexcept { return name; } + [[nodiscard]] auto orc_kind() const noexcept { return _type_kind; } + [[nodiscard]] auto orc_encoding() const noexcept { return _encoding_kind; } + [[nodiscard]] std::string_view orc_name() const noexcept { return name; } private: column_view cudf_column; diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 2738a77e50a..903ceaa1714 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -62,14 +62,14 @@ struct orc_table_view { rmm::device_uvector d_string_column_indices; auto num_columns() const noexcept { return columns.size(); } - size_type num_rows() const noexcept; + [[nodiscard]] size_type num_rows() const noexcept; auto num_string_columns() const noexcept { return string_column_indices.size(); } auto& column(uint32_t idx) { return columns.at(idx); } - auto const& column(uint32_t idx) const { return columns.at(idx); } + [[nodiscard]] auto const& column(uint32_t idx) const { return columns.at(idx); } auto& string_column(uint32_t idx) { return columns.at(string_column_indices.at(idx)); } - auto const& string_column(uint32_t idx) const + [[nodiscard]] auto const& string_column(uint32_t idx) const { return columns.at(string_column_indices.at(idx)); } @@ -85,8 +85,8 @@ struct stripe_rowgroups { uint32_t first; // first rowgroup in the stripe uint32_t size; // number of rowgroups in the stripe stripe_rowgroups(uint32_t id, uint32_t first, uint32_t size) : id{id}, first{first}, size{size} {} - auto cbegin() const { return thrust::make_counting_iterator(first); } - auto cend() const { return thrust::make_counting_iterator(first + size); } + [[nodiscard]] auto cbegin() const { return thrust::make_counting_iterator(first); } + [[nodiscard]] auto cend() const { return thrust::make_counting_iterator(first + size); } }; /** @@ -123,10 +123,10 @@ class orc_streams { std::vector offsets; size_t non_rle_data_size = 0; size_t rle_data_size = 0; - auto data_size() const { return non_rle_data_size + rle_data_size; } + [[nodiscard]] auto data_size() const { return non_rle_data_size + rle_data_size; } }; - orc_stream_offsets compute_offsets(host_span columns, - size_t num_rowgroups) const; + [[nodiscard]] orc_stream_offsets compute_offsets(host_span columns, + size_t num_rowgroups) const; operator std::vector const &() const { return streams; } diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index 71452bd7809..53739a26beb 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -20,8 +20,8 @@ #include "parquet_common.hpp" #include -#include -#include +#include +#include #include #include diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 751d6b62319..df4310fcd63 100644 --- a/cpp/src/io/parquet/page_data.cu +++ 
b/cpp/src/io/parquet/page_data.cu @@ -102,7 +102,7 @@ struct page_state_s { */ __device__ uint32_t device_str2hash32(const char* key, size_t len, uint32_t seed = 33) { - const uint8_t* p = reinterpret_cast(key); + const auto* p = reinterpret_cast(key); uint32_t h1 = seed, k1; const uint32_t c1 = 0xcc9e2d51; const uint32_t c2 = 0x1b873593; @@ -513,7 +513,7 @@ __device__ void gpuInitStringDescriptors(volatile page_state_s* s, int target_po */ inline __device__ void gpuOutputString(volatile page_state_s* s, int src_pos, void* dstv) { - const char* ptr = NULL; + const char* ptr = nullptr; size_t len = 0; if (s->dict_base) { @@ -522,10 +522,9 @@ inline __device__ void gpuOutputString(volatile page_state_s* s, int src_pos, vo sizeof(string_index_pair) : 0; if (dict_pos < (uint32_t)s->dict_size) { - const string_index_pair* src = - reinterpret_cast(s->dict_base + dict_pos); - ptr = src->first; - len = src->second; + const auto* src = reinterpret_cast(s->dict_base + dict_pos); + ptr = src->first; + len = src->second; } } else { // Plain encoding @@ -540,9 +539,9 @@ inline __device__ void gpuOutputString(volatile page_state_s* s, int src_pos, vo *static_cast(dstv) = device_str2hash32(ptr, len); } else { // Output string descriptor - string_index_pair* dst = static_cast(dstv); - dst->first = ptr; - dst->second = len; + auto* dst = static_cast(dstv); + dst->first = ptr; + dst->second = len; } } @@ -1016,7 +1015,7 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, cur += InitLevelSection(s, cur, end, level_type::DEFINITION); s->dict_bits = 0; - s->dict_base = 0; + s->dict_base = nullptr; s->dict_size = 0; switch (s->page.encoding) { case Encoding::PLAIN_DICTIONARY: @@ -1133,7 +1132,7 @@ static __device__ void store_validity(PageNestingInfo* pni, int bit_offset = pni->valid_map_offset % 32; // if we fit entirely in the output word if (bit_offset + value_count <= 32) { - uint32_t relevant_mask = static_cast((static_cast(1) << value_count) - 1); + auto relevant_mask = static_cast((static_cast(1) << value_count) - 1); if (relevant_mask == ~0) { pni->valid_map[word_offset] = valid_mask; diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index ec6b24b3b4e..2074304251f 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -1068,7 +1068,7 @@ __global__ void __launch_bounds__(128, 8) } if (t == 0) { uint8_t* base = s->page.page_data + s->page.max_hdr_size; - uint32_t actual_data_size = static_cast(s->cur - base); + auto actual_data_size = static_cast(s->cur - base); uint32_t compressed_bfr_size = GetMaxCompressedBfrSize(actual_data_size); s->page.max_data_size = actual_data_size; s->comp_in.srcDevice = base; @@ -1244,7 +1244,7 @@ class header_encoder { *header_end = current_header_ptr; } - inline __device__ uint8_t* get_ptr(void) { return current_header_ptr; } + inline __device__ uint8_t* get_ptr() { return current_header_ptr; } inline __device__ void set_ptr(uint8_t* ptr) { current_header_ptr = ptr; } }; diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 21610638843..b4fa9b4ae82 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -19,8 +19,8 @@ #include "parquet_common.hpp" #include -#include -#include +#include +#include #include #include #include @@ -65,11 +65,11 @@ struct MilliSeconds { }; struct MicroSeconds { }; -typedef struct TimeUnit_isset { - TimeUnit_isset() : MILLIS(false), MICROS(false) {} - bool MILLIS; - bool MICROS; -} TimeUnit_isset; +using TimeUnit_isset = 
struct TimeUnit_isset { + TimeUnit_isset() {} + bool MILLIS{false}; + bool MICROS{false}; +}; struct TimeUnit { TimeUnit_isset isset; @@ -97,35 +97,21 @@ struct BsonType { }; // thrift generated code simplified. -typedef struct LogicalType_isset { - LogicalType_isset() - : STRING(false), - MAP(false), - LIST(false), - ENUM(false), - DECIMAL(false), - DATE(false), - TIME(false), - TIMESTAMP(false), - INTEGER(false), - UNKNOWN(false), - JSON(false), - BSON(false) - { - } - bool STRING; - bool MAP; - bool LIST; - bool ENUM; - bool DECIMAL; - bool DATE; - bool TIME; - bool TIMESTAMP; - bool INTEGER; - bool UNKNOWN; - bool JSON; - bool BSON; -} LogicalType_isset; +using LogicalType_isset = struct LogicalType_isset { + LogicalType_isset() {} + bool STRING{false}; + bool MAP{false}; + bool LIST{false}; + bool ENUM{false}; + bool DECIMAL{false}; + bool DATE{false}; + bool TIME{false}; + bool TIMESTAMP{false}; + bool INTEGER{false}; + bool UNKNOWN{false}; + bool JSON{false}; + bool BSON{false}; +}; struct LogicalType { LogicalType_isset isset; @@ -197,16 +183,19 @@ struct SchemaElement { // required int32 num; // }; // } - bool is_stub() const { return repetition_type == REPEATED && num_children == 1; } + [[nodiscard]] bool is_stub() const { return repetition_type == REPEATED && num_children == 1; } // https://github.com/apache/parquet-cpp/blob/642da05/src/parquet/schema.h#L49-L50 // One-level LIST encoding: Only allows required lists with required cells: // repeated value_type name - bool is_one_level_list() const { return repetition_type == REPEATED and num_children == 0; } + [[nodiscard]] bool is_one_level_list() const + { + return repetition_type == REPEATED and num_children == 0; + } // in parquet terms, a group is a level of nesting in the schema. a group // can be a struct or a list - bool is_struct() const + [[nodiscard]] bool is_struct() const { return type == UNDEFINED_TYPE && // this assumption might be a little weak. @@ -369,7 +358,7 @@ class CompactProtocolReader { m_base = m_cur = base; m_end = base + len; } - ptrdiff_t bytecount() const noexcept { return m_cur - m_base; } + [[nodiscard]] ptrdiff_t bytecount() const noexcept { return m_cur - m_base; } unsigned int getb() noexcept { return (m_cur < m_end) ? 
*m_cur++ : 0; } void skip_bytes(size_t bytecnt) noexcept { diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index fc4afe951db..885f36aeca4 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -337,7 +337,7 @@ class aggregate_reader_metadata { /** * @brief Sums up the number of rows of each source */ - size_type calc_num_rows() const + [[nodiscard]] size_type calc_num_rows() const { return std::accumulate( per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { @@ -348,7 +348,7 @@ class aggregate_reader_metadata { /** * @brief Sums up the number of row groups of each source */ - size_type calc_num_row_groups() const + [[nodiscard]] size_type calc_num_row_groups() const { return std::accumulate( per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { @@ -381,16 +381,16 @@ class aggregate_reader_metadata { } } - auto const& get_row_group(size_type row_group_index, size_type src_idx) const + [[nodiscard]] auto const& get_row_group(size_type row_group_index, size_type src_idx) const { CUDF_EXPECTS(src_idx >= 0 && src_idx < static_cast(per_file_metadata.size()), "invalid source index"); return per_file_metadata[src_idx].row_groups[row_group_index]; } - auto const& get_column_metadata(size_type row_group_index, - size_type src_idx, - int schema_idx) const + [[nodiscard]] auto const& get_column_metadata(size_type row_group_index, + size_type src_idx, + int schema_idx) const { auto col = std::find_if( per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(), @@ -401,13 +401,16 @@ class aggregate_reader_metadata { return col->meta_data; } - auto get_num_rows() const { return num_rows; } + [[nodiscard]] auto get_num_rows() const { return num_rows; } - auto get_num_row_groups() const { return num_row_groups; } + [[nodiscard]] auto get_num_row_groups() const { return num_row_groups; } - auto const& get_schema(int schema_idx) const { return per_file_metadata[0].schema[schema_idx]; } + [[nodiscard]] auto const& get_schema(int schema_idx) const + { + return per_file_metadata[0].schema[schema_idx]; + } - auto const& get_key_value_metadata() const { return agg_keyval_map; } + [[nodiscard]] auto const& get_key_value_metadata() const { return agg_keyval_map; } /** * @brief Gets the concrete nesting depth of output cudf columns @@ -416,7 +419,7 @@ class aggregate_reader_metadata { * * @return comma-separated index column names in quotes */ - inline int get_output_nesting_depth(int schema_index) const + [[nodiscard]] inline int get_output_nesting_depth(int schema_index) const { auto& pfm = per_file_metadata[0]; int depth = 0; @@ -441,7 +444,7 @@ class aggregate_reader_metadata { * * @return comma-separated index column names in quotes */ - std::string get_pandas_index() const + [[nodiscard]] std::string get_pandas_index() const { auto it = agg_keyval_map.find("pandas"); if (it != agg_keyval_map.end()) { @@ -472,7 +475,7 @@ class aggregate_reader_metadata { * * @param names List of column names to load, where index column name(s) will be added */ - std::vector get_pandas_index_names() const + [[nodiscard]] std::vector get_pandas_index_names() const { std::vector names; auto str = get_pandas_index(); @@ -511,9 +514,9 @@ class aggregate_reader_metadata { * * @return List of row group indexes and its starting row */ - auto select_row_groups(std::vector> const& row_groups, - size_type& row_start, - size_type& row_count) const + [[nodiscard]] auto select_row_groups(std::vector> 
const& row_groups, + size_type& row_start, + size_type& row_count) const { if (!row_groups.empty()) { std::vector selection; @@ -570,10 +573,10 @@ class aggregate_reader_metadata { * @return input column information, output column information, list of output column schema * indices */ - auto select_columns(std::vector const& use_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id) const + [[nodiscard]] auto select_columns(std::vector const& use_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) const { auto find_schema_child = [&](SchemaElement const& schema_elem, std::string const& name) { auto const& col_schema_idx = std::find_if( diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index b302516ba39..a9306275b26 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -166,12 +166,12 @@ struct aggregate_writer_metadata { return global_rowgroup_base; } - bool schema_matches(std::vector const& schema) const + [[nodiscard]] bool schema_matches(std::vector const& schema) const { return this->schema == schema; } auto& file(size_t p) { return files[p]; } - size_t num_files() const { return files.size(); } + [[nodiscard]] size_t num_files() const { return files.size(); } private: int32_t version = 0; @@ -678,18 +678,18 @@ struct parquet_column_view { std::vector const& schema_tree, rmm::cuda_stream_view stream); - column_view leaf_column_view() const; - gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; + [[nodiscard]] column_view leaf_column_view() const; + [[nodiscard]] gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; - column_view cudf_column_view() const { return cudf_col; } - parquet::Type physical_type() const { return schema_node.type; } + [[nodiscard]] column_view cudf_column_view() const { return cudf_col; } + [[nodiscard]] parquet::Type physical_type() const { return schema_node.type; } std::vector const& get_path_in_schema() { return path_in_schema; } // LIST related member functions - uint8_t max_def_level() const noexcept { return _max_def_level; } - uint8_t max_rep_level() const noexcept { return _max_rep_level; } - bool is_list() const noexcept { return _is_list; } + [[nodiscard]] uint8_t max_def_level() const noexcept { return _max_def_level; } + [[nodiscard]] uint8_t max_rep_level() const noexcept { return _max_rep_level; } + [[nodiscard]] bool is_list() const noexcept { return _is_list; } private: // Schema related members diff --git a/cpp/src/io/statistics/statistics.cuh b/cpp/src/io/statistics/statistics.cuh index 755f3416b1d..15fe2544930 100644 --- a/cpp/src/io/statistics/statistics.cuh +++ b/cpp/src/io/statistics/statistics.cuh @@ -20,13 +20,15 @@ */ #pragma once -#include #include #include #include + #include +#include + namespace cudf { namespace io { diff --git a/cpp/src/io/statistics/typed_statistics_chunk.cuh b/cpp/src/io/statistics/typed_statistics_chunk.cuh index 0992a557491..8e35fcf3c44 100644 --- a/cpp/src/io/statistics/typed_statistics_chunk.cuh +++ b/cpp/src/io/statistics/typed_statistics_chunk.cuh @@ -92,24 +92,20 @@ struct typed_statistics_chunk { using E = typename detail::extrema_type::type; using A = typename detail::aggregation_type::type; - uint32_t non_nulls; //!< number of non-null values in chunk - uint32_t null_count; //!< number of null values in chunk + uint32_t non_nulls{0}; //!< number of non-null values in chunk + uint32_t null_count{0}; //!< 
number of null values in chunk E minimum_value; E maximum_value; A aggregate; - uint8_t has_minmax; //!< Nonzero if min_value and max_values are valid - uint8_t has_sum; //!< Nonzero if sum is valid + uint8_t has_minmax{false}; //!< Nonzero if min_value and max_values are valid + uint8_t has_sum{false}; //!< Nonzero if sum is valid __device__ typed_statistics_chunk() - : non_nulls(0), - null_count(0), - minimum_value(detail::minimum_identity()), + : minimum_value(detail::minimum_identity()), maximum_value(detail::maximum_identity()), - aggregate(0), - has_minmax(false), - has_sum(false) // Set to true when storing + aggregate(0) { } @@ -140,22 +136,17 @@ template struct typed_statistics_chunk { using E = typename detail::extrema_type::type; - uint32_t non_nulls; //!< number of non-null values in chunk - uint32_t null_count; //!< number of null values in chunk + uint32_t non_nulls{0}; //!< number of non-null values in chunk + uint32_t null_count{0}; //!< number of null values in chunk E minimum_value; E maximum_value; - uint8_t has_minmax; //!< Nonzero if min_value and max_values are valid - uint8_t has_sum; //!< Nonzero if sum is valid + uint8_t has_minmax{false}; //!< Nonzero if min_value and max_values are valid + uint8_t has_sum{false}; //!< Nonzero if sum is valid __device__ typed_statistics_chunk() - : non_nulls(0), - null_count(0), - minimum_value(detail::minimum_identity()), - maximum_value(detail::maximum_identity()), - has_minmax(false), - has_sum(false) // Set to true when storing + : minimum_value(detail::minimum_identity()), maximum_value(detail::maximum_identity()) { } diff --git a/cpp/src/io/utilities/block_utils.cuh b/cpp/src/io/utilities/block_utils.cuh index 2b4f69df10f..d73f0ebc9b7 100644 --- a/cpp/src/io/utilities/block_utils.cuh +++ b/cpp/src/io/utilities/block_utils.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include +#include namespace cudf { namespace io { @@ -32,7 +32,7 @@ inline __device__ T shuffle_xor(T var, uint32_t delta) return __shfl_xor_sync(~0, var, delta); } -inline __device__ void syncwarp(void) { __syncwarp(); } +inline __device__ void syncwarp() { __syncwarp(); } inline __device__ uint32_t ballot(int pred) { return __ballot_sync(~0, pred); } @@ -126,18 +126,18 @@ inline __device__ double Int128ToDouble_rn(uint64_t lo, int64_t hi) inline __device__ uint32_t unaligned_load32(const uint8_t* p) { - uint32_t ofs = 3 & reinterpret_cast(p); - const uint32_t* p32 = reinterpret_cast(p - ofs); - uint32_t v = p32[0]; + uint32_t ofs = 3 & reinterpret_cast(p); + const auto* p32 = reinterpret_cast(p - ofs); + uint32_t v = p32[0]; return (ofs) ? 
__funnelshift_r(v, p32[1], ofs * 8) : v; } inline __device__ uint64_t unaligned_load64(const uint8_t* p) { - uint32_t ofs = 3 & reinterpret_cast(p); - const uint32_t* p32 = reinterpret_cast(p - ofs); - uint32_t v0 = p32[0]; - uint32_t v1 = p32[1]; + uint32_t ofs = 3 & reinterpret_cast(p); + const auto* p32 = reinterpret_cast(p - ofs); + uint32_t v0 = p32[0]; + uint32_t v1 = p32[1]; if (ofs) { v0 = __funnelshift_r(v0, v1, ofs * 8); v1 = __funnelshift_r(v1, p32[2], ofs * 8); @@ -148,8 +148,8 @@ inline __device__ uint64_t unaligned_load64(const uint8_t* p) template inline __device__ void memcpy_block(void* dstv, const void* srcv, uint32_t len, uint32_t t) { - uint8_t* dst = static_cast(dstv); - const uint8_t* src = static_cast(srcv); + auto* dst = static_cast(dstv); + const auto* src = static_cast(srcv); uint32_t dst_align_bytes, src_align_bytes, src_align_bits; // Align output to 32-bit dst_align_bytes = 3 & -reinterpret_cast(dst); @@ -166,8 +166,8 @@ inline __device__ void memcpy_block(void* dstv, const void* srcv, uint32_t len, src_align_bytes = (uint32_t)(3 & reinterpret_cast(src)); src_align_bits = src_align_bytes * 8; while (len >= 4) { - const uint32_t* src32 = reinterpret_cast(src - src_align_bytes); - uint32_t copy_cnt = min(len >> 2, nthreads); + const auto* src32 = reinterpret_cast(src - src_align_bytes); + uint32_t copy_cnt = min(len >> 2, nthreads); uint32_t v; if (t < copy_cnt) { v = src32[t]; diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 642f3518edd..63d0103ddec 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -49,9 +49,9 @@ class file_sink : public data_sink { size_t bytes_written() override { return _bytes_written; } - bool supports_device_write() const override { return _cufile_out != nullptr; } + [[nodiscard]] bool supports_device_write() const override { return _cufile_out != nullptr; } - bool is_device_write_preferred(size_t size) const override + [[nodiscard]] bool is_device_write_preferred(size_t size) const override { return _cufile_out != nullptr && _cufile_out->is_cufile_io_preferred(size); } @@ -109,13 +109,13 @@ class host_buffer_sink : public data_sink { */ class void_sink : public data_sink { public: - explicit void_sink() : _bytes_written(0) {} + explicit void_sink() {} virtual ~void_sink() {} void host_write(void const* data, size_t size) override { _bytes_written += size; } - bool supports_device_write() const override { return true; } + [[nodiscard]] bool supports_device_write() const override { return true; } void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { @@ -146,7 +146,10 @@ class user_sink_wrapper : public data_sink { void host_write(void const* data, size_t size) override { user_sink->host_write(data, size); } - bool supports_device_write() const override { return user_sink->supports_device_write(); } + [[nodiscard]] bool supports_device_write() const override + { + return user_sink->supports_device_write(); + } void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 3de6f35cb0d..6f864ab509f 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -40,9 +40,9 @@ class file_source : public datasource { virtual ~file_source() = default; - bool supports_device_read() const override { return _cufile_in != nullptr; } + [[nodiscard]] bool supports_device_read() 
const override { return _cufile_in != nullptr; } - bool is_device_read_preferred(size_t size) const override + [[nodiscard]] bool is_device_read_preferred(size_t size) const override { return _cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size); } @@ -79,7 +79,7 @@ class file_source : public datasource { return _cufile_in->read_async(offset, read_size, dst, stream); } - size_t size() const override { return _file.size(); } + [[nodiscard]] size_t size() const override { return _file.size(); } protected: detail::file_wrapper _file; @@ -102,7 +102,7 @@ class memory_mapped_source : public file_source { if (_file.size() != 0) map(_file.desc(), offset, size); } - virtual ~memory_mapped_source() + ~memory_mapped_source() override { if (_map_addr != nullptr) { munmap(_map_addr, _map_size); } } @@ -210,7 +210,10 @@ class user_datasource_wrapper : public datasource { return source->host_read(offset, size); } - bool supports_device_read() const override { return source->supports_device_read(); } + [[nodiscard]] bool supports_device_read() const override + { + return source->supports_device_read(); + } size_t device_read(size_t offset, size_t size, @@ -227,7 +230,7 @@ class user_datasource_wrapper : public datasource { return source->device_read(offset, size, stream); } - size_t size() const override { return source->size(); } + [[nodiscard]] size_t size() const override { return source->size(); } private: datasource* const source; ///< A non-owning pointer to the user-implemented datasource diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 7178418bbbf..fcee4e43a20 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -45,8 +45,8 @@ class file_wrapper { explicit file_wrapper(std::string const& filepath, int flags); explicit file_wrapper(std::string const& filepath, int flags, mode_t mode); ~file_wrapper(); - auto size() const { return _size; } - auto desc() const { return fd; } + [[nodiscard]] auto size() const { return _size; } + [[nodiscard]] auto desc() const { return fd; } }; /** @@ -184,7 +184,7 @@ struct cufile_registered_file { register_handle(); } - auto const& handle() const noexcept { return cf_handle; } + [[nodiscard]] auto const& handle() const noexcept { return cf_handle; } ~cufile_registered_file(); diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index cbf914b8da6..367bbfcbdfa 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -77,9 +77,9 @@ class hostdevice_vector { return false; } - size_t max_size() const noexcept { return max_elements; } - size_t size() const noexcept { return num_elements; } - size_t memory_size() const noexcept { return sizeof(T) * num_elements; } + [[nodiscard]] size_t max_size() const noexcept { return max_elements; } + [[nodiscard]] size_t size() const noexcept { return num_elements; } + [[nodiscard]] size_t memory_size() const noexcept { return sizeof(T) * num_elements; } T& operator[](size_t i) const { return h_data[i]; } T* host_ptr(size_t offset = 0) const { return h_data + offset; } diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 6da3296055c..878b36191ac 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -68,7 +68,7 @@ struct parse_options { cudf::detail::optional_trie trie_na; bool multi_delimiter; - parse_options_view view() 
const
+  [[nodiscard]] parse_options_view view() const
   {
     return {delimiter,
             terminator,
diff --git a/cpp/src/io/utilities/thread_pool.hpp b/cpp/src/io/utilities/thread_pool.hpp
index c57082034db..952ab58813a 100644
--- a/cpp/src/io/utilities/thread_pool.hpp
+++ b/cpp/src/io/utilities/thread_pool.hpp
@@ -44,7 +44,7 @@ namespace detail {
  * and/or obtain its eventual return value.
  */
 class thread_pool {
-  typedef std::uint_fast32_t ui32;
+  using ui32 = std::uint_fast32_t;

 public:
  /**
@@ -79,7 +79,7 @@ class thread_pool {
   *
   * @return The number of queued tasks.
   */
-  size_t get_tasks_queued() const
+  [[nodiscard]] size_t get_tasks_queued() const
  {
    const std::scoped_lock lock(queue_mutex);
    return tasks.size();
  }

  /**
@@ -90,7 +90,7 @@ class thread_pool {
   *
   * @return The number of running tasks.
   */
-  ui32 get_tasks_running() const { return tasks_total - (ui32)get_tasks_queued(); }
+  [[nodiscard]] ui32 get_tasks_running() const { return tasks_total - (ui32)get_tasks_queued(); }

  /**
   * @brief Get the total number of unfinished tasks - either still in the queue, or running in a
@@ -98,14 +98,14 @@ class thread_pool {
   *
   * @return The total number of tasks.
   */
-  ui32 get_tasks_total() const { return tasks_total; }
+  [[nodiscard]] ui32 get_tasks_total() const { return tasks_total; }

  /**
   * @brief Get the number of threads in the pool.
   *
   * @return The number of threads.
   */
-  ui32 get_thread_count() const { return thread_count; }
+  [[nodiscard]] ui32 get_thread_count() const { return thread_count; }

  /**
   * @brief Parallelize a loop by splitting it into blocks, submitting each block separately to the
diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh
index 4005d6101bd..c2115c3caa4 100644
--- a/cpp/src/join/hash_join.cuh
+++ b/cpp/src/join/hash_join.cuh
@@ -242,13 +242,13 @@ struct hash_join::hash_join_impl {
                                 rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr) const;

-  std::size_t inner_join_size(cudf::table_view const& probe,
-                              null_equality compare_nulls,
-                              rmm::cuda_stream_view stream) const;
+  [[nodiscard]] std::size_t inner_join_size(cudf::table_view const& probe,
+                                            null_equality compare_nulls,
+                                            rmm::cuda_stream_view stream) const;

-  std::size_t left_join_size(cudf::table_view const& probe,
-                             null_equality compare_nulls,
-                             rmm::cuda_stream_view stream) const;
+  [[nodiscard]] std::size_t left_join_size(cudf::table_view const& probe,
+                                           null_equality compare_nulls,
+                                           rmm::cuda_stream_view stream) const;

   std::size_t full_join_size(cudf::table_view const& probe,
                              null_equality compare_nulls,
diff --git a/cpp/src/lists/copying/gather.cu b/cpp/src/lists/copying/gather.cu
index fe45cdfc338..8d2de8997d1 100644
--- a/cpp/src/lists/copying/gather.cu
+++ b/cpp/src/lists/copying/gather.cu
@@ -53,8 +53,8 @@ namespace detail {
  * @endcode
  */
 struct list_gatherer {
-  typedef size_type argument_type;
-  typedef size_type result_type;
+  using argument_type = size_type;
+  using result_type = size_type;

   size_t offset_count;
   size_type const* base_offsets;
diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu
index 7b3b7b0f3fd..66b26148ede 100644
--- a/cpp/src/partitioning/partitioning.cu
+++ b/cpp/src/partitioning/partitioning.cu
@@ -257,7 +257,7 @@ __global__ void copy_block_partitions(InputIter input_iter,
     reinterpret_cast(block_output + OPTIMIZED_BLOCK_SIZE * OPTIMIZED_ROWS_PER_THREAD);
   auto partition_offset_global = partition_offset_shared + num_partitions + 1;

-  typedef cub::BlockScan BlockScan;
+  using BlockScan = cub::BlockScan;
   __shared__ typename BlockScan::TempStorage
temp_storage; // use ELEMENTS_PER_THREAD=2 to support upto 1024 partitions diff --git a/cpp/src/quantiles/quantiles_util.hpp b/cpp/src/quantiles/quantiles_util.hpp index 7af1e47087b..a5dc643a688 100644 --- a/cpp/src/quantiles/quantiles_util.hpp +++ b/cpp/src/quantiles/quantiles_util.hpp @@ -45,8 +45,8 @@ CUDF_HOST_DEVICE inline Result linear(T lhs, T rhs, double frac) // Underflow may occur when converting int64 to double // detail: https://github.com/rapidsai/cudf/issues/1417 - double dlhs = static_cast(lhs); - double drhs = static_cast(rhs); + auto dlhs = static_cast(lhs); + auto drhs = static_cast(rhs); double one_minus_frac = 1.0 - frac; return static_cast(one_minus_frac * dlhs + frac * drhs); } @@ -55,8 +55,8 @@ template CUDF_HOST_DEVICE inline Result midpoint(T lhs, T rhs) { // TODO: try std::midpoint (C++20) if available - double dlhs = static_cast(lhs); - double drhs = static_cast(rhs); + auto dlhs = static_cast(lhs); + auto drhs = static_cast(rhs); return static_cast(dlhs / 2 + drhs / 2); } diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index bc1947dfeed..7c52856b147 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -950,9 +950,9 @@ __launch_bounds__(block_size) __global__ int64_t following_window = following_window_begin[i]; // compute bounds - size_type start = static_cast( + auto start = static_cast( min(static_cast(input.size()), max(0L, i - preceding_window + 1))); - size_type end = static_cast( + auto end = static_cast( min(static_cast(input.size()), max(0L, i + following_window + 1))); size_type start_index = min(start, end); size_type end_index = max(start, end); diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu index 9618f325fce..84ae2b73bba 100644 --- a/cpp/src/strings/capitalize.cu +++ b/cpp/src/strings/capitalize.cu @@ -108,7 +108,7 @@ struct base_fn { if (!d_chars) d_offsets[idx] = 0; } - Derived& derived = static_cast(*this); + auto& derived = static_cast(*this); auto const d_str = d_column.element(idx); offset_type bytes = 0; auto d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu index 9482d4db9b8..8f364f5c9bc 100644 --- a/cpp/src/strings/combine/join_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -61,9 +61,9 @@ struct compute_size_and_concatenate_fn { // If d_chars != nullptr: only concatenate strings. 
char* d_chars{nullptr}; - __device__ bool output_is_null(size_type const idx, - size_type const start_idx, - size_type const end_idx) const noexcept + [[nodiscard]] __device__ bool output_is_null(size_type const idx, + size_type const start_idx, + size_type const end_idx) const noexcept { if (func.is_null_list(lists_dv, idx)) { return true; } return empty_list_policy == output_if_empty_list::NULL_ELEMENT && start_idx == end_idx; @@ -127,13 +127,16 @@ struct compute_size_and_concatenate_fn { struct scalar_separator_fn { string_scalar_device_view const d_separator; - __device__ bool is_null_list(column_device_view const& lists_dv, - size_type const idx) const noexcept + [[nodiscard]] __device__ bool is_null_list(column_device_view const& lists_dv, + size_type const idx) const noexcept { return lists_dv.is_null(idx); } - __device__ string_view separator(size_type const) const noexcept { return d_separator.value(); } + [[nodiscard]] __device__ string_view separator(size_type const) const noexcept + { + return d_separator.value(); + } }; template @@ -222,13 +225,13 @@ struct column_separators_fn { column_device_view const separators_dv; string_scalar_device_view const sep_narep_dv; - __device__ bool is_null_list(column_device_view const& lists_dv, - size_type const idx) const noexcept + [[nodiscard]] __device__ bool is_null_list(column_device_view const& lists_dv, + size_type const idx) const noexcept { return lists_dv.is_null(idx) || (separators_dv.is_null(idx) && !sep_narep_dv.is_valid()); } - __device__ string_view separator(size_type const idx) const noexcept + [[nodiscard]] __device__ string_view separator(size_type const idx) const noexcept { return separators_dv.is_valid(idx) ? separators_dv.element(idx) : sep_narep_dv.value(); diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index 9376a0082a8..efdee65c1f6 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -53,7 +53,7 @@ struct contains_fn { __device__ bool operator()(size_type idx) { - if (d_strings.is_null(idx)) return 0; + if (d_strings.is_null(idx)) return false; string_view d_str = d_strings.element(idx); int32_t begin = 0; int32_t end = bmatch ? 
1 // match only the beginning of the string; diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 8d0c5704a7b..cd3dc3b46f3 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -156,7 +156,7 @@ struct format_compiler { device_span format_items() { return device_span(d_items); } - int8_t subsecond_precision() const { return specifiers.at('f'); } + [[nodiscard]] int8_t subsecond_precision() const { return specifiers.at('f'); } }; /** @@ -194,7 +194,7 @@ struct parse_datetime { * * @return `1x10^exponent` for `0 <= exponent <= 9` */ - __device__ constexpr int64_t power_of_ten(int32_t const exponent) const + [[nodiscard]] __device__ constexpr int64_t power_of_ten(int32_t const exponent) const { constexpr int64_t powers_of_ten[] = { 1L, 10L, 100L, 1000L, 10000L, 100000L, 1000000L, 10000000L, 100000000L, 1000000000L}; @@ -202,7 +202,7 @@ struct parse_datetime { } // Walk the format_items to parse the string into date/time components - __device__ timestamp_components parse_into_parts(string_view const& d_string) const + [[nodiscard]] __device__ timestamp_components parse_into_parts(string_view const& d_string) const { timestamp_components timeparts = {1970, 1, 1, 0}; // init to epoch time @@ -310,7 +310,7 @@ struct parse_datetime { return timeparts; } - __device__ int64_t timestamp_from_parts(timestamp_components const& timeparts) const + [[nodiscard]] __device__ int64_t timestamp_from_parts(timestamp_components const& timeparts) const { auto const ymd = // convenient chrono class handles the leap year calculations for us cuda::std::chrono::year_month_day( @@ -689,7 +689,7 @@ struct from_timestamp_base { * modulo(-1,60) -> 59 * @endcode */ - __device__ int32_t modulo_time(int64_t time, int64_t base) const + [[nodiscard]] __device__ int32_t modulo_time(int64_t time, int64_t base) const { return static_cast(((time % base) + base) % base); }; @@ -707,12 +707,12 @@ struct from_timestamp_base { * scale( 61,60) -> 1 * @endcode */ - __device__ int64_t scale_time(int64_t time, int64_t base) const + [[nodiscard]] __device__ int64_t scale_time(int64_t time, int64_t base) const { return (time - ((time < 0) * (base - 1L))) / base; }; - __device__ time_components get_time_components(int64_t tstamp) const + [[nodiscard]] __device__ time_components get_time_components(int64_t tstamp) const { time_components result = {0}; if constexpr (std::is_same_v) { return result; } @@ -855,7 +855,7 @@ struct datetime_formatter : public from_timestamp_base { } // from https://howardhinnant.github.io/date/date.html - __device__ thrust::pair get_iso_week_year( + [[nodiscard]] __device__ thrust::pair get_iso_week_year( cuda::std::chrono::year_month_day const& ymd) const { auto const days = cuda::std::chrono::sys_days(ymd); @@ -885,8 +885,8 @@ struct datetime_formatter : public from_timestamp_base { static_cast(year)); } - __device__ int8_t get_week_of_year(cuda::std::chrono::sys_days const days, - cuda::std::chrono::sys_days const start) const + [[nodiscard]] __device__ int8_t get_week_of_year(cuda::std::chrono::sys_days const days, + cuda::std::chrono::sys_days const start) const { return days < start ? 
0 diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index f286149ea46..66e6f31cca2 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -156,7 +156,7 @@ struct format_compiler { format_item const* compiled_format_items() { return d_items.data(); } - size_type items_count() const { return static_cast(d_items.size()); } + [[nodiscard]] size_type items_count() const { return static_cast(d_items.size()); } }; template diff --git a/cpp/src/strings/findall.cu b/cpp/src/strings/findall.cu index 8d96f0de415..c82ab4f81c3 100644 --- a/cpp/src/strings/findall.cu +++ b/cpp/src/strings/findall.cu @@ -67,7 +67,7 @@ struct findall_fn { string_view d_str = d_strings.element(idx); auto const nchars = d_str.length(); int32_t spos = 0; - int32_t epos = static_cast(nchars); + auto epos = static_cast(nchars); size_type column_count = 0; while (spos <= nchars) { if (prog.find(idx, d_str, spos, epos) <= 0) break; // no more matches found diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index c61fb8905f5..ae807db10e6 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -72,7 +72,7 @@ enum class parse_result { */ class parser { protected: - CUDF_HOST_DEVICE inline parser() : input(nullptr), input_len(0), pos(nullptr) {} + CUDF_HOST_DEVICE inline parser() {} CUDF_HOST_DEVICE inline parser(const char* _input, int64_t _input_len) : input(_input), input_len(_input_len), pos(_input) { @@ -177,9 +177,9 @@ class parser { } protected: - char const* input; - int64_t input_len; - char const* pos; + char const* input{nullptr}; + int64_t input_len{0}; + char const* pos{nullptr}; CUDF_HOST_DEVICE inline bool is_whitespace(char c) { return c <= ' '; } }; @@ -220,18 +220,10 @@ enum json_element_type { NONE, OBJECT, ARRAY, VALUE }; */ class json_state : private parser { public: - __device__ json_state() - : parser(), - cur_el_start(nullptr), - cur_el_type(json_element_type::NONE), - parent_el_type(json_element_type::NONE) - { - } + __device__ json_state() : parser() {} __device__ json_state(const char* _input, int64_t _input_len, get_json_object_options _options) : parser(_input, _input_len), - cur_el_start(nullptr), - cur_el_type(json_element_type::NONE), - parent_el_type(json_element_type::NONE), + options(_options) { } @@ -340,7 +332,7 @@ class json_state : private parser { // next parse_result result = next_element_internal(false); if (result != parse_result::SUCCESS) { return result; } - } while (1); + } while (true); return parse_result::ERROR; } @@ -486,12 +478,12 @@ class json_state : private parser { return (c == '\"') || (options.get_allow_single_quotes() && (c == '\'')); } - const char* cur_el_start; // pointer to the first character of the -value- of the current - // element - not the name - string_view cur_el_name; // name of the current element (if applicable) - json_element_type cur_el_type; // type of the current element - json_element_type parent_el_type; // parent element type - get_json_object_options options; // behavior options + const char* cur_el_start{nullptr}; // pointer to the first character of the -value- of the + // current element - not the name + string_view cur_el_name; // name of the current element (if applicable) + json_element_type cur_el_type{json_element_type::NONE}; // type of the current element + json_element_type parent_el_type{json_element_type::NONE}; // parent element type + get_json_object_options 
options; // behavior options }; enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END }; @@ -501,26 +493,23 @@ enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, * an array of these operators applied to the incoming json string, */ struct path_operator { - CUDF_HOST_DEVICE inline path_operator() - : type(path_operator_type::ERROR), index(-1), expected_type{NONE} - { - } + CUDF_HOST_DEVICE inline path_operator() {} CUDF_HOST_DEVICE inline path_operator(path_operator_type _type, json_element_type _expected_type = NONE) - : type(_type), index(-1), expected_type{_expected_type} + : type(_type), expected_type{_expected_type} { } - path_operator_type type; // operator type + path_operator_type type{path_operator_type::ERROR}; // operator type // the expected element type we're applying this operation to. // for example: // - you cannot retrieve a subscripted field (eg [5]) from an object. // - you cannot retrieve a field by name (eg .book) from an array. // - you -can- use .* for both arrays and objects // a value of NONE imples any type accepted - json_element_type expected_type; // the expected type of the element we're working with - string_view name; // name to match against (if applicable) - int index; // index for subscript operator + json_element_type expected_type{NONE}; // the expected type of the element we're working with + string_view name; // name to match against (if applicable) + int index{-1}; // index for subscript operator }; /** diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index 20868077cf4..f2a27d1b11d 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -122,7 +122,7 @@ std::unique_ptr pad( if (d_strings.is_null(idx)) return; string_view d_str = d_strings.element(idx); char* ptr = d_chars + d_offsets[idx]; - int32_t pad = static_cast(width - d_str.length()); + auto pad = static_cast(width - d_str.length()); auto right_pad = (width & 1) ? pad / 2 : (pad - pad / 2); // odd width = right-justify auto left_pad = pad - right_pad; // e.g. width=7 gives "++foxx+" while width=6 gives "+fox++" diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 8fbd82b8dc7..7be88d01387 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -20,7 +20,7 @@ #include #include -#include +#include namespace cudf { namespace strings { diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h index 63d7933eebe..3131767de59 100644 --- a/cpp/src/strings/regex/regcomp.h +++ b/cpp/src/strings/regex/regcomp.h @@ -51,9 +51,9 @@ enum InstType { * @brief Class type for regex compiler instruction. 
*/ struct reclass { - int32_t builtins; // bit mask identifying builtin classes + int32_t builtins{0}; // bit mask identifying builtin classes std::u32string literals; // ranges as pairs of utf-8 characters - reclass() : builtins(0) {} + reclass() {} reclass(int m) : builtins(m) {} }; @@ -99,20 +99,20 @@ class reprog { int32_t add_class(reclass cls); void set_groups_count(int32_t groups); - int32_t groups_count() const; + [[nodiscard]] int32_t groups_count() const; - const reinst* insts_data() const; - int32_t insts_count() const; + [[nodiscard]] const reinst* insts_data() const; + [[nodiscard]] int32_t insts_count() const; reinst& inst_at(int32_t id); reclass& class_at(int32_t id); - int32_t classes_count() const; + [[nodiscard]] int32_t classes_count() const; - const int32_t* starts_data() const; - int32_t starts_count() const; + [[nodiscard]] const int32_t* starts_data() const; + [[nodiscard]] int32_t starts_count() const; void set_start_inst(int32_t id); - int32_t get_start_inst() const; + [[nodiscard]] int32_t get_start_inst() const; void optimize1(); void optimize2(); diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index d6b8307c3fb..a9928a6bd49 100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ -132,32 +132,38 @@ class reprog_device { /** * @brief Returns the number of regex instructions. */ - __host__ __device__ int32_t insts_counts() const { return _insts_count; } + [[nodiscard]] __host__ __device__ int32_t insts_counts() const { return _insts_count; } /** * @brief Returns true if this is an empty program. */ - __device__ bool is_empty() const { return insts_counts() == 0 || get_inst(0)->type == END; } + [[nodiscard]] __device__ bool is_empty() const + { + return insts_counts() == 0 || get_inst(0)->type == END; + } /** * @brief Returns the number of regex groups found in the expression. */ - CUDF_HOST_DEVICE inline int32_t group_counts() const { return _num_capturing_groups; } + [[nodiscard]] CUDF_HOST_DEVICE inline int32_t group_counts() const + { + return _num_capturing_groups; + } /** * @brief Returns the regex instruction object for a given index. */ - __device__ inline reinst* get_inst(int32_t idx) const; + [[nodiscard]] __device__ inline reinst* get_inst(int32_t idx) const; /** * @brief Returns the regex class object for a given index. */ - __device__ inline reclass_device get_class(int32_t idx) const; + [[nodiscard]] __device__ inline reclass_device get_class(int32_t idx) const; /** * @brief Returns the start-instruction-ids vector. */ - __device__ inline int32_t* startinst_ids() const; + [[nodiscard]] __device__ inline int32_t* startinst_ids() const; /** * @brief Does a find evaluation using the compiled expression on the given string. diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 0e11e9c1bbd..50aab8c3ac4 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -246,7 +246,7 @@ __device__ inline int32_t reprog_device::regexec( expanded = false; for (int16_t i = 0; i < jnk.list1->size; i++) { - int32_t inst_id = static_cast(jnk.list1->inst_ids[i]); + auto inst_id = static_cast(jnk.list1->inst_ids[i]); int2& range = jnk.list1->ranges[i]; const reinst* inst = get_inst(inst_id); int32_t id_activate = -1; @@ -283,7 +283,7 @@ __device__ inline int32_t reprog_device::regexec( break; case BOW: { auto codept = utf8_to_codepoint(c); - char32_t last_c = static_cast(pos ? dstr[pos - 1] : 0); + auto last_c = static_cast(pos ? 
dstr[pos - 1] : 0); auto last_codept = utf8_to_codepoint(last_c); bool cur_alphaNumeric = (codept < 0x010000) && IS_ALPHANUM(_codepoint_flags[codept]); bool last_alphaNumeric = @@ -296,7 +296,7 @@ __device__ inline int32_t reprog_device::regexec( } case NBOW: { auto codept = utf8_to_codepoint(c); - char32_t last_c = static_cast(pos ? dstr[pos - 1] : 0); + auto last_c = static_cast(pos ? dstr[pos - 1] : 0); auto last_codept = utf8_to_codepoint(last_c); bool cur_alphaNumeric = (codept < 0x010000) && IS_ALPHANUM(_codepoint_flags[codept]); bool last_alphaNumeric = @@ -323,7 +323,7 @@ __device__ inline int32_t reprog_device::regexec( bool continue_execute = true; jnk.list2->reset(); for (int16_t i = 0; continue_execute && i < jnk.list1->size; i++) { - int32_t inst_id = static_cast(jnk.list1->inst_ids[i]); + auto inst_id = static_cast(jnk.list1->inst_ids[i]); int2& range = jnk.list1->ranges[i]; const reinst* inst = get_inst(inst_id); int32_t id_activate = -1; @@ -415,11 +415,11 @@ __device__ inline int32_t reprog_device::call_regexec( auto const schar = get_inst(_startinst_id)->u1.c; auto const relists_size = relist::alloc_size(_insts_count); - u_char* listmem = reinterpret_cast(_relists_mem); // beginning of relist buffer; + auto* listmem = reinterpret_cast(_relists_mem); // beginning of relist buffer; listmem += (idx * relists_size * 2); // two relist ptrs in reljunk: - relist* list1 = new (listmem) relist(static_cast(_insts_count)); - relist* list2 = new (listmem + relists_size) relist(static_cast(_insts_count)); + auto* list1 = new (listmem) relist(static_cast(_insts_count)); + auto* list2 = new (listmem + relists_size) relist(static_cast(_insts_count)); reljunk jnk(list1, list2, stype, schar); return regexec(dstr, jnk, begin, end, group_id); diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu index 4f93bbd6e7b..b286812226b 100644 --- a/cpp/src/strings/regex/regexec.cu +++ b/cpp/src/strings/regex/regexec.cu @@ -67,8 +67,7 @@ reprog_device::reprog_device(reprog& prog) _num_capturing_groups{prog.groups_count()}, _insts_count{prog.insts_count()}, _starts_count{prog.starts_count()}, - _classes_count{prog.classes_count()}, - _relists_mem{nullptr} + _classes_count{prog.classes_count()} { } diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index c6e52a79059..aae911e8ed6 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -547,7 +547,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, */ struct base_whitespace_split_tokenizer { // count the tokens only between non-whitespace characters - __device__ size_type count_tokens(size_type idx) const + [[nodiscard]] __device__ size_type count_tokens(size_type idx) const { if (d_strings.is_null(idx)) return 0; const string_view d_str = d_strings.element(idx); diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 2e6dbe62cf1..f3b642132e3 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -250,9 +250,8 @@ __global__ void kernel_data_normalizer(unsigned char const* strings, chars_per_thread[char_for_thread] = num_new_chars; - typedef cub:: - BlockStore - BlockStore; + using BlockStore = + cub::BlockStore; __shared__ typename BlockStore::TempStorage temp_storage; // Now we perform coalesced writes back to global memory using cub. diff --git a/cpp/src/text/subword/detail/tokenizer_utils.cuh b/cpp/src/text/subword/detail/tokenizer_utils.cuh index 48ee0fc2b51..dcd241fc045 100644 --- a/cpp/src/text/subword/detail/tokenizer_utils.cuh +++ b/cpp/src/text/subword/detail/tokenizer_utils.cuh @@ -20,7 +20,7 @@ #include -#include +#include namespace nvtext { namespace detail { diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu index 0af34eb8092..75c79381032 100644 --- a/cpp/src/text/subword/load_hash_file.cu +++ b/cpp/src/text/subword/load_hash_file.cu @@ -29,9 +29,9 @@ #include #include +#include #include #include -#include #include namespace nvtext { diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index ff720daa5cb..f6b10cfc583 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -116,17 +116,17 @@ struct column_info { * */ struct hierarchy_info { - hierarchy_info() : simple_per_row_size(0), complex_type_count(0), max_branch_depth(0) {} + hierarchy_info() {} // These two fields act as an optimization. If we find that the entire table // is just fixed-width types, we do not need to do the more expensive kernel call that // traverses the individual columns. So if complex_type_count is 0, we can just // return a column where every row contains the value simple_per_row_size - size_type simple_per_row_size; // in bits - size_type complex_type_count; + size_type simple_per_row_size{0}; // in bits + size_type complex_type_count{0}; // max depth of span branches present in the hierarchy. 
- size_type max_branch_depth; + size_type max_branch_depth{0}; }; /** diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp index 4afa96f08d7..8a742b50baa 100644 --- a/cpp/tests/column/column_view_shallow_test.cpp +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -163,7 +163,7 @@ TYPED_TEST(ColumnViewShallowTests, shallow_hash_update_data) col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_VALID)); auto col_view_new = cudf::column_view{*col}; EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_new)); - col_view_new.null_count(); + [[maybe_unused]] auto const nulls = col_view_new.null_count(); EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_new)); auto col_view_new2 = cudf::column_view{*col}; EXPECT_EQ(shallow_hash(col_view_new), shallow_hash(col_view_new2)); @@ -332,7 +332,7 @@ TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_update_data) col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_VALID)); auto col_view_new = cudf::column_view{*col}; EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_new)); - col_view_new.null_count(); + [[maybe_unused]] auto const nulls = col_view_new.null_count(); EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_new)); auto col_view_new2 = cudf::column_view{*col}; EXPECT_TRUE(is_shallow_equivalent(col_view_new, col_view_new2)); diff --git a/cpp/tests/copying/concatenate_tests.cu b/cpp/tests/copying/concatenate_tests.cu index 306037e6473..a306736d131 100644 --- a/cpp/tests/copying/concatenate_tests.cu +++ b/cpp/tests/copying/concatenate_tests.cu @@ -343,7 +343,7 @@ TEST_F(OverflowTest, OverflowTest) // primitive column { - constexpr size_type size = static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto size = static_cast(static_cast(1024) * 1024 * 1024); // try and concatenate 6 char columns of size 1 billion each auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, size); @@ -355,7 +355,7 @@ TEST_F(OverflowTest, OverflowTest) // string column, overflow on chars { - constexpr size_type size = static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto size = static_cast(static_cast(1024) * 1024 * 1024); // try and concatenate 6 string columns of with 1 billion chars in each auto offsets = cudf::test::fixed_width_column_wrapper{0, size}; @@ -370,7 +370,7 @@ TEST_F(OverflowTest, OverflowTest) // string column, overflow on offsets (rows) { - constexpr size_type size = static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto size = static_cast(static_cast(1024) * 1024 * 1024); // try and concatenate 6 string columns 1 billion rows each auto many_offsets = cudf::make_fixed_width_column(data_type{type_id::INT32}, size + 1); @@ -385,8 +385,7 @@ TEST_F(OverflowTest, OverflowTest) // list, structs too long { - constexpr size_type inner_size = - static_cast(static_cast(512) * 1024 * 1024); + constexpr auto inner_size = static_cast(static_cast(512) * 1024 * 1024); // struct std::vector> children; @@ -408,9 +407,8 @@ TEST_F(OverflowTest, OverflowTest) // struct, list child too long { - constexpr size_type inner_size = - static_cast(static_cast(512) * 1024 * 1024); - constexpr size_type size = 3; + constexpr auto inner_size = static_cast(static_cast(512) * 1024 * 1024); + constexpr size_type size = 3; // list auto offsets = cudf::test::fixed_width_column_wrapper{0, 0, 0, inner_size}; @@ -437,7 +435,7 @@ TEST_F(OverflowTest, Presliced) // primitive column { - constexpr size_type size = 
static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto size = static_cast(static_cast(1024) * 1024 * 1024); // try and concatenate 4 char columns of size ~1/2 billion each auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, size); @@ -454,7 +452,7 @@ TEST_F(OverflowTest, Presliced) // struct column { - constexpr size_type size = static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto size = static_cast(static_cast(1024) * 1024 * 1024); // try and concatenate 4 char columns of size ~1/2 billion each std::vector> children; @@ -542,8 +540,7 @@ TEST_F(OverflowTest, Presliced) // list, structs too long { - constexpr size_type inner_size = - static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto inner_size = static_cast(static_cast(1024) * 1024 * 1024); // struct std::vector> children; @@ -616,8 +613,7 @@ TEST_F(OverflowTest, Presliced) // struct, list child elements too long { - constexpr size_type inner_size = - static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto inner_size = static_cast(static_cast(1024) * 1024 * 1024); constexpr size_type num_rows = 4; constexpr size_type list_size = inner_size / num_rows; @@ -656,7 +652,7 @@ TEST_F(OverflowTest, BigColumnsSmallSlices) // primitive column { - constexpr size_type size = static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto size = static_cast(static_cast(1024) * 1024 * 1024); auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, size); auto sliced = cudf::slice(*many_chars, {16, 32}); @@ -668,8 +664,7 @@ TEST_F(OverflowTest, BigColumnsSmallSlices) // strings column { - constexpr size_type inner_size = - static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto inner_size = static_cast(static_cast(1024) * 1024 * 1024); constexpr size_type num_rows = 1024; constexpr size_type string_size = inner_size / num_rows; @@ -696,8 +691,7 @@ TEST_F(OverflowTest, BigColumnsSmallSlices) // list column { - constexpr size_type inner_size = - static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto inner_size = static_cast(static_cast(1024) * 1024 * 1024); constexpr size_type num_rows = 1024; constexpr size_type list_size = inner_size / num_rows; @@ -724,8 +718,7 @@ TEST_F(OverflowTest, BigColumnsSmallSlices) // struct { - constexpr size_type inner_size = - static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto inner_size = static_cast(static_cast(1024) * 1024 * 1024); constexpr size_type num_rows = 1024; constexpr size_type list_size = inner_size / num_rows; diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp index 4468bc69640..2f02f4cba02 100644 --- a/cpp/tests/copying/copy_tests.cpp +++ b/cpp/tests/copying/copy_tests.cpp @@ -70,30 +70,38 @@ TYPED_TEST(CopyTest, CopyIfElseTestLong) // make sure we span at least 2 warps int num_els = 64; - bool mask[] = {1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + bool mask[] = {true, false, true, false, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, false, false, false, false, true, true, true, + true, true, true, true, true, true, false, false, false, false, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); - 
bool lhs_v[] = {1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + bool lhs_v[] = {true, true, true, true, false, false, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}; wrapper lhs_w({5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, lhs_v); - bool rhs_v[] = {1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + bool rhs_v[] = {true, true, true, true, true, true, false, false, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}; wrapper rhs_w({6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}, rhs_v); - bool exp_v[] = {1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + bool exp_v[] = {true, true, true, true, false, false, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}; wrapper expected_w({5, 6, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, @@ -309,13 +317,13 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestScalarColumn) int num_els = 4; - bool mask[] = {1, 0, 0, 1}; + bool mask[] = {true, false, false, true}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); cudf::numeric_scalar lhs_w(5); const auto rhs = cudf::test::make_type_param_vector({6, 6, 6, 6}); - bool rhs_v[] = {1, 0, 1, 1}; + bool rhs_v[] = {true, false, true, true}; wrapper rhs_w(rhs.begin(), rhs.end(), rhs_v); const auto expected = cudf::test::make_type_param_vector({5, 6, 6, 5}); @@ -331,12 +339,12 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestColumnScalar) int num_els = 4; - bool mask[] = {1, 0, 0, 1}; - bool mask_v[] = {1, 1, 1, 0}; + bool mask[] = {true, false, false, true}; + bool mask_v[] = {true, true, true, false}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els, mask_v); const auto lhs = cudf::test::make_type_param_vector({5, 5, 5, 5}); - bool lhs_v[] = {0, 1, 1, 1}; + bool lhs_v[] = {false, true, true, true}; wrapper lhs_w(lhs.begin(), 
lhs.end(), lhs_v); cudf::numeric_scalar rhs_w(6); @@ -354,7 +362,7 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestScalarScalar) int num_els = 4; - bool mask[] = {1, 0, 0, 1}; + bool mask[] = {true, false, false, true}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); cudf::numeric_scalar lhs_w(5); @@ -399,12 +407,12 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestScalarColumn) int num_els = 4; - bool mask[] = {1, 0, 0, 1}; + bool mask[] = {true, false, false, true}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); auto lhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(5), true); - bool rhs_v[] = {1, 0, 1, 1}; + bool rhs_v[] = {true, false, true, true}; wrapper rhs_w({6, 6, 6, 6}, rhs_v); wrapper expected_w({5, 6, 6, 5}, rhs_v); @@ -419,10 +427,10 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestColumnScalar) int num_els = 4; - bool mask[] = {1, 0, 0, 1}; + bool mask[] = {true, false, false, true}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); - bool lhs_v[] = {0, 1, 1, 1}; + bool lhs_v[] = {false, true, true, true}; wrapper lhs_w({5, 5, 5, 5}, lhs_v); auto rhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(6), true); @@ -439,7 +447,7 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestScalarScalar) int num_els = 4; - bool mask[] = {1, 0, 0, 1}; + bool mask[] = {true, false, false, true}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); auto lhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(5), true); @@ -477,8 +485,8 @@ TEST_F(StringsCopyIfElseTest, CopyIfElse) std::vector h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {1, 1, 0, 1, 0, 1}; - bool mask_v[] = {1, 1, 1, 1, 1, 0}; + bool mask[] = {true, true, false, true, false, true}; + bool mask_v[] = {true, true, true, true, true, false}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); auto results = cudf::copy_if_else(strings1, strings2, mask_w); @@ -504,8 +512,8 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseScalarColumn) std::vector h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {1, 0, 1, 0, 1, 0}; - bool mask_v[] = {1, 1, 1, 1, 1, 0}; + bool mask[] = {true, false, true, false, true, false}; + bool mask_v[] = {true, true, true, true, true, false}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); auto results = cudf::copy_if_else(strings1, strings2, mask_w); @@ -532,7 +540,7 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseColumnScalar) std::vector h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {0, 1, 1, 1, 0, 1}; + bool mask[] = {false, true, true, true, false, true}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6); auto results = cudf::copy_if_else(strings2, strings1, mask_w); @@ -560,14 +568,14 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseScalarScalar) cudf::string_scalar string2{h_string2[0], false}; constexpr cudf::size_type mask_size = 6; - bool mask[] = {1, 0, 1, 0, 1, 0}; + bool mask[] = {true, false, true, false, true, false}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + mask_size); auto results = cudf::copy_if_else(string1, string2, mask_w); std::vector h_expected; - for (cudf::size_type idx = 0; idx < static_cast(mask_size); ++idx) { - if 
(mask[idx]) { + for (bool idx : mask) { + if (idx) { h_expected.push_back(h_string1[0]); } else { h_expected.push_back(h_string2[0]); @@ -649,8 +657,8 @@ TEST_F(DictionaryCopyIfElseTest, ColumnColumn) cudf::test::dictionary_column_wrapper input2( h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {1, 1, 0, 1, 0, 1}; - bool mask_v[] = {1, 1, 1, 1, 1, 0}; + bool mask[] = {true, true, false, true, false, true}; + bool mask_v[] = {true, true, true, true, true, false}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); auto results = cudf::copy_if_else(input1, input2, mask_w); @@ -676,7 +684,7 @@ TEST_F(DictionaryCopyIfElseTest, ColumnScalar) cudf::test::dictionary_column_wrapper input2( h_strings.begin(), h_strings.end(), valids); - bool mask[] = {0, 1, 1, 1, 0, 1}; + bool mask[] = {false, true, true, true, false, true}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6); auto results = cudf::copy_if_else(input2, input1, mask_w); diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index 1199dfb44f2..2591f395914 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -36,8 +36,7 @@ namespace cudf { namespace test { using namespace cudf; - -typedef thrust::tuple expected_value; +using expected_value = thrust::tuple; template struct TDigestAllTypes : public cudf::test::BaseFixture { diff --git a/cpp/tests/hash_map/multimap_test.cu b/cpp/tests/hash_map/multimap_test.cu index 4a0e3807a4c..456ba951a45 100644 --- a/cpp/tests/hash_map/multimap_test.cu +++ b/cpp/tests/hash_map/multimap_test.cu @@ -66,20 +66,19 @@ class MultimapTest : public cudf::test::BaseFixture { rmm::cuda_stream_default.synchronize(); } - ~MultimapTest() {} + ~MultimapTest() override {} }; // Google Test can only do a parameterized typed-test over a single type, so we // have to nest multiple types inside of the KeyValueTypes struct above // KeyValueTypes implies key_type = type1, value_type = type2 // This list is the types across which Google Test will run our tests -typedef ::testing::Types, - KeyValueTypes, - KeyValueTypes, - KeyValueTypes, - KeyValueTypes, - KeyValueTypes> - Implementations; +using Implementations = ::testing::Types, + KeyValueTypes, + KeyValueTypes, + KeyValueTypes, + KeyValueTypes, + KeyValueTypes>; TYPED_TEST_SUITE(MultimapTest, Implementations); diff --git a/cpp/tests/hashing/hash_test.cpp b/cpp/tests/hashing/hash_test.cpp index 1a73fb3abc9..da933b44b8d 100644 --- a/cpp/tests/hashing/hash_test.cpp +++ b/cpp/tests/hashing/hash_test.cpp @@ -38,7 +38,7 @@ TEST_F(HashTest, MultiValue) "The quick brown fox", "jumps over the lazy dog.", "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}); + R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}); using limits = std::numeric_limits; fixed_width_column_wrapper const ints_col({0, 100, -100, limits::min(), limits::max()}); @@ -71,13 +71,13 @@ TEST_F(HashTest, MultiValueNulls) "The quick brown fox", "jumps over the lazy dog.", "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}, + R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}, {0, 1, 1, 0, 1}); strings_column_wrapper const strings_col2({"different but null", "The quick brown fox", "jumps over the lazy dog.", "I am Jack's complete lack of null value", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}, + R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}, {0, 1, 1, 0, 1}); // Nulls with different values should be equal @@ 
-478,7 +478,7 @@ TEST_F(MD5HashTest, MultiValue) "A very long (greater than 128 bytes/char string) to test a multi hash-step data point in the " "MD5 hash function. This string needed to be longer.", "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}); + R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}); strings_column_wrapper const md5_string_results1({"d41d8cd98f00b204e9800998ecf8427e", "682240021651ae166d08fe2a014d5c09", @@ -525,7 +525,7 @@ TEST_F(MD5HashTest, MultiValueNulls) "A very long (greater than 128 bytes/char string) to test a multi hash-step data point in the " "MD5 hash function. This string needed to be longer.", "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}, + R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}, {1, 0, 0, 1, 0}); strings_column_wrapper const strings_col2( {"", @@ -567,7 +567,7 @@ TEST_F(MD5HashTest, StringListsNulls) "A very long (greater than 128 bytes/char string) to test a multi hash-step data point in the " "MD5 hash function. This string needed to be longer. It needed to be even longer.", "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}); + R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}); lists_column_wrapper strings_list_col( {{""}, diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index b7835b4d4d1..868b19254ca 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -1901,7 +1901,7 @@ class TestSource : public cudf::io::datasource { return read_size; } - size_t size() const override { return str.size(); } + [[nodiscard]] size_t size() const override { return str.size(); } }; TEST_F(CsvReaderTest, UserImplementedSource) diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 837ac96ef21..a31cd22ee3e 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -85,8 +85,8 @@ std::unique_ptr create_random_fixed_table(cudf::size_type num_colum } std::vector> columns(num_columns); std::transform(src_cols.begin(), src_cols.end(), columns.begin(), [](column_wrapper& in) { - auto ret = in.release(); - ret->has_nulls(); + auto ret = in.release(); + [[maybe_unused]] auto nulls = ret->has_nulls(); // pre-cache the null count return ret; }); return std::make_unique(std::move(columns)); @@ -162,8 +162,8 @@ inline auto random_values(size_t size) } struct SkipRowTest { - int test_calls; - SkipRowTest(void) : test_calls(0) {} + int test_calls{0}; + SkipRowTest() {} std::unique_ptr
get_expected_result(const std::string& filepath, int skip_rows, @@ -773,12 +773,12 @@ TEST_F(OrcChunkedWriterTest, Metadata) TEST_F(OrcChunkedWriterTest, Strings) { - bool mask1[] = {1, 1, 0, 1, 1, 1, 1}; + bool mask1[] = {true, true, false, true, true, true, true}; std::vector h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"}; str_col strings1(h_strings1.begin(), h_strings1.end(), mask1); table_view tbl1({strings1}); - bool mask2[] = {0, 1, 1, 1, 1, 1, 1}; + bool mask2[] = {false, true, true, true, true, true, true}; std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"}; str_col strings2(h_strings2.begin(), h_strings2.end(), mask2); table_view tbl2({strings2}); @@ -885,8 +885,9 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) int num_els = 31; - bool mask[] = {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true}; T c1a[num_els]; std::fill(c1a, c1a + num_els, static_cast(5)); @@ -927,8 +928,9 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) int num_els = 33; - bool mask[] = {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true}; T c1a[num_els]; std::fill(c1a, c1a + num_els, static_cast(5)); diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 9c656abb666..b45670fd265 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -76,7 +76,8 @@ std::unique_ptr create_fixed_table(cudf::size_type num_columns, columns.begin(), [](cudf::test::fixed_width_column_wrapper& in) { auto ret = in.release(); - ret->has_nulls(); + // pre-cache the null count + [[maybe_unused]] auto const nulls = ret->has_nulls(); return ret; }); return std::make_unique(std::move(columns)); @@ -1086,7 +1087,7 @@ class custom_test_data_sink : public cudf::io::data_sink { outfile_.write(static_cast(data), size); } - bool supports_device_write() const override { return true; } + [[nodiscard]] bool supports_device_write() const override { return true; } void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { @@ -1413,13 +1414,13 @@ TEST_F(ParquetChunkedWriterTest, Strings) { std::vector> cols; - bool mask1[] = {1, 1, 0, 1, 1, 1, 1}; + bool mask1[] = {true, true, false, true, true, true, true}; std::vector h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"}; cudf::test::strings_column_wrapper strings1(h_strings1.begin(), h_strings1.end(), mask1); cols.push_back(strings1.release()); cudf::table tbl1(std::move(cols)); - bool mask2[] = {0, 1, 1, 1, 1, 1, 1}; + bool mask2[] = {false, true, true, true, true, true, true}; std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), mask2); cols.push_back(strings2.release()); @@ -2052,8 +2053,9 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize) int num_els = 31; std::vector> cols; - bool mask[] = {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1}; + bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true}; T c1a[num_els]; std::fill(c1a, c1a + num_els, static_cast(5)); @@ -2099,8 +2101,9 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize2) int num_els = 33; std::vector> cols; - bool mask[] = {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true}; T c1a[num_els]; std::fill(c1a, c1a + num_els, static_cast(5)); @@ -2149,7 +2152,7 @@ class custom_test_memmap_sink : public cudf::io::data_sink { void host_write(void const* data, size_t size) override { mm_writer->host_write(data, size); } - bool supports_device_write() const override { return supports_device_writes; } + [[nodiscard]] bool supports_device_write() const override { return supports_device_writes; } void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp index fe65fe0474a..7540dfd94c5 100644 --- a/cpp/tests/replace/replace_tests.cpp +++ b/cpp/tests/replace/replace_tests.cpp @@ -298,7 +298,7 @@ struct ReplaceTest : cudf::test::BaseFixture { std::srand(number_of_instantiations++); } - ~ReplaceTest() {} + ~ReplaceTest() override {} }; /** diff --git a/cpp/tests/scalar/factories_test.cpp b/cpp/tests/scalar/factories_test.cpp index 3e89e435bc0..7cd8b655231 100644 --- a/cpp/tests/scalar/factories_test.cpp +++ b/cpp/tests/scalar/factories_test.cpp @@ -114,7 +114,7 @@ TYPED_TEST(DefaultScalarFactory, TypeCast) auto numeric_s = static_cast*>(s.get()); - EXPECT_NO_THROW(numeric_s->value()); + EXPECT_NO_THROW((void)numeric_s->value()); EXPECT_FALSE(numeric_s->is_valid()); EXPECT_FALSE(s->is_valid()); } diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp index ff9f79ea87f..c1552ab3f57 100644 --- a/cpp/tests/strings/chars_types_tests.cpp +++ b/cpp/tests/strings/chars_types_tests.cpp @@ -51,13 +51,20 @@ TEST_P(CharsTypes, AllTypes) "de", "\t\r\n\f "}; - bool expecteds[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, // decimal - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, // numeric - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, // digit - 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, // alpha - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // space - 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // upper - 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}; // lower + bool expecteds[] = {false, false, false, false, false, false, false, false, + false, false, false, false, false, true, false, false, // decimal + false, false, false, false, false, false, false, false, + false, true, false, true, false, true, false, false, // numeric + false, false, false, false, false, false, false, false, + false, false, false, true, false, true, false, false, // digit + true, true, false, true, false, false, false, false, + false, false, false, false, false, false, true, false, // alpha + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, true, // space + false, false, false, true, false, false, false, false, + false, false, false, 
false, false, false, false, false, // upper + false, true, false, false, false, false, false, false, + false, false, false, false, false, false, true, false}; // lower auto is_parm = GetParam(); diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 2bb1c6dac8e..516882bd8ad 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -183,7 +183,7 @@ TEST_F(StringsExtractTests, ExtractAllTest) auto results = cudf::strings::extract_all(sv, "(\\d+) (\\w+)"); - bool valids[] = {1, 1, 1, 0, 0, 0, 1}; + bool valids[] = {true, true, true, false, false, false, true}; using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{"123", "banana", "7", "eleven"}, LCW{"41", "apple"}, diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index 38f905078a7..d35cb5c3b9d 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -68,7 +68,7 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair) strings[idx] = thrust::pair{nullptr, 0}; nulls++; } else { - cudf::size_type length = (cudf::size_type)strlen(str); + auto length = (cudf::size_type)strlen(str); memcpy(h_buffer.data() + offset, str, length); strings[idx] = thrust::pair{d_buffer.data() + offset, length}; offset += length; @@ -130,7 +130,7 @@ TEST_F(StringsFactoriesTest, CreateColumnFromOffsets) h_null_mask = (h_null_mask << 1); const char* str = h_test_strings[idx]; if (str) { - cudf::size_type length = (cudf::size_type)strlen(str); + auto length = (cudf::size_type)strlen(str); memcpy(h_buffer.data() + offset, str, length); offset += length; h_null_mask |= 1; diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp index dfcc646a8f6..2dfe50d2ef5 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/strings/json_tests.cpp @@ -389,7 +389,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); - cudf::test::strings_column_wrapper expected_raw{"[\"0-553-21311-3\",\"0-395-19395-8\"]"}; + cudf::test::strings_column_wrapper expected_raw{R"(["0-553-21311-3","0-395-19395-8"])"}; auto expected = drop_whitespace(expected_raw); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); @@ -402,7 +402,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{ - "[\"reference\",\"fiction\",\"fiction\",\"fiction\"]"}; + R"(["reference","fiction","fiction","fiction"])"}; auto expected = drop_whitespace(expected_raw); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); @@ -415,7 +415,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{ - "[\"Sayings of the Century\",\"Sword of Honour\",\"Moby Dick\",\"The Lord of the Rings\"]"}; + R"(["Sayings of the Century","Sword of Honour","Moby Dick","The Lord of the Rings"])"}; auto expected = drop_whitespace(expected_raw); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); @@ -470,7 +470,7 @@ TEST_F(JsonPathTests, GetJsonObjectEmptyQuery) { // empty query -> null { - cudf::test::strings_column_wrapper input{"{\"a\" : \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a" : "b"})"}; std::string json_path(""); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -497,7 +497,7 @@ TEST_F(JsonPathTests, 
GetJsonObjectEmptyInputsAndOutputs) // returning something, but it happens to be empty. so we expect // a valid, but empty row { - cudf::test::strings_column_wrapper input{"{\"store\": { \"bicycle\" : \"\" } }"}; + cudf::test::strings_column_wrapper input{R"({"store": { "bicycle" : "" } })"}; std::string json_path("$.store.bicycle"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -513,7 +513,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) // can't have more than one root operator, or a root operator anywhere other // than the beginning { - cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$$"); auto query = [&]() { auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -523,7 +523,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) // invalid index { - cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[auh46h-]"); auto query = [&]() { auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -533,7 +533,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) // invalid index { - cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[[]]"); auto query = [&]() { auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -543,7 +543,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) // negative index { - cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[-1]"); auto query = [&]() { auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -553,7 +553,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) // child operator with no name specified { - cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("."); auto query = [&]() { auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -562,7 +562,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) } { - cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("]["); auto query = [&]() { auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -571,7 +571,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) } { - cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("6hw6,56i3"); auto query = [&]() { auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -585,7 +585,7 @@ TEST_F(JsonPathTests, GetJsonObjectInvalidQuery) { // non-existent field { - cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[*].c"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -596,7 +596,7 @@ TEST_F(JsonPathTests, GetJsonObjectInvalidQuery) // non-existent field { - cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; 
std::string json_path("$[*].c[2]"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -667,8 +667,8 @@ TEST_F(JsonPathTests, MixedOutput) // clang-format off cudf::test::strings_column_wrapper expected({ - "{\"b\" : \"c\"}", - "{\"b\" : \"c\"}", + R"({"b" : "c"})", + R"({"b" : "c"})", "", "[\"y\",500]", "", @@ -786,7 +786,7 @@ TEST_F(JsonPathTests, StripQuotes) // a valid, but empty row { - cudf::test::strings_column_wrapper input{"{\"store\": { \"bicycle\" : \"\" } }"}; + cudf::test::strings_column_wrapper input{R"({"store": { "bicycle" : "" } })"}; std::string json_path("$.store.bicycle"); cudf::strings::get_json_object_options options; @@ -858,8 +858,8 @@ TEST_F(JsonPathTests, AllowSingleQuotes) // clang-format off cudf::test::strings_column_wrapper expected({ - "{\'b\' : \'c\'}", - "{\'b\' : \"c\"}", + R"({'b' : 'c'})", + R"({'b' : "c"})", "", "[\'y\',500]", "", @@ -902,7 +902,7 @@ TEST_F(JsonPathTests, StringsWithSpecialChars) // clang-format off cudf::test::strings_column_wrapper expected({ - "[{\"key\" : \"value[\"}]", + R"([{"key" : "value["}])", }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); @@ -928,7 +928,7 @@ TEST_F(JsonPathTests, StringsWithSpecialChars) // clang-format off cudf::test::strings_column_wrapper expected({ - "[}{}][][{[\\\"}}[\\\"]", + R"([}{}][][{[\"}}[\"])", }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); @@ -961,8 +961,8 @@ TEST_F(JsonPathTests, EscapeSequences) // clang-format off cudf::test::strings_column_wrapper expected({ - "\\\" \\\\ \\/ \\b \\f \\n \\r \\t", - "\\u1248 \\uacdf \\uACDF \\u10EF" + R"(\" \\ \/ \b \f \n \r \t)", + R"(\u1248 \uacdf \uACDF \u10EF)" }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); diff --git a/cpp/tests/table/table_view_tests.cu b/cpp/tests/table/table_view_tests.cu index c94963525a0..a1c0c49a881 100644 --- a/cpp/tests/table/table_view_tests.cu +++ b/cpp/tests/table/table_view_tests.cu @@ -123,7 +123,7 @@ TEST_F(TableViewTest, SelectOutOfBounds) fixed_width_column_wrapper col4{{4, 5, 6, 7}}; cudf::table_view t{{col1, col2}}; - EXPECT_THROW(t.select({2, 3, 4}), std::out_of_range); + EXPECT_THROW((void)t.select({2, 3, 4}), std::out_of_range); } TEST_F(TableViewTest, SelectNoColumns) From 12a0f596e5f1adceab8f386f89954598fb812757 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 20 Jan 2022 11:05:25 -0500 Subject: [PATCH 10/14] Remove libcudacxx patch needed for nvcc 11.4 (#10057) The `libcudacxx.patch` was required to fix issues with libcudacxx 1.6 and incorrect detection of the arm nvcc 11.4 compiler. As we move to libcudacxx 1.7 this patch is not needed, and should be removed. 
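For context on what the removed patch was working around: `__CUDACC_VER_BUILD__` can have three digits, so packing it into a two-digit slot lets the build number spill into the minor-version digits. A minimal sketch of the arithmetic, using a hypothetical nvcc version that is not taken from this PR:

```python
# Hypothetical nvcc 11.4 release with a three-digit build number.
major, minor, build = 11, 4, 152

old = major * 10000 + minor * 100 + build    # 110552 -> compares as >= 110500,
                                             # i.e. looks like an 11.5 compiler
new = major * 100000 + minor * 1000 + build  # 1104152 -> stays below 1105000
print(old, new)
```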
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/10057 --- cpp/cmake/libcudacxx.patch | 21 -------------------- cpp/cmake/thirdparty/get_cucollections.cmake | 2 +- cpp/cmake/thirdparty/get_libcudacxx.cmake | 6 +----- 3 files changed, 2 insertions(+), 27 deletions(-) delete mode 100644 cpp/cmake/libcudacxx.patch diff --git a/cpp/cmake/libcudacxx.patch b/cpp/cmake/libcudacxx.patch deleted file mode 100644 index 3cdc40ef084..00000000000 --- a/cpp/cmake/libcudacxx.patch +++ /dev/null @@ -1,21 +0,0 @@ -diff --git a/include/cuda/std/detail/__config b/include/cuda/std/detail/__config -index d55a43688..654142d7e 100644 ---- a/include/cuda/std/detail/__config -+++ b/include/cuda/std/detail/__config -@@ -23,7 +23,7 @@ - #define _LIBCUDACXX_CUDACC_VER_MINOR __CUDACC_VER_MINOR__ - #define _LIBCUDACXX_CUDACC_VER_BUILD __CUDACC_VER_BUILD__ - #define _LIBCUDACXX_CUDACC_VER \ -- _LIBCUDACXX_CUDACC_VER_MAJOR * 10000 + _LIBCUDACXX_CUDACC_VER_MINOR * 100 + \ -+ _LIBCUDACXX_CUDACC_VER_MAJOR * 100000 + _LIBCUDACXX_CUDACC_VER_MINOR * 1000 + \ - _LIBCUDACXX_CUDACC_VER_BUILD - - #define _LIBCUDACXX_HAS_NO_LONG_DOUBLE -@@ -64,7 +64,7 @@ - # endif - #endif - --#if defined(_LIBCUDACXX_COMPILER_MSVC) || (defined(_LIBCUDACXX_CUDACC_VER) && (_LIBCUDACXX_CUDACC_VER < 110500)) -+#if defined(_LIBCUDACXX_COMPILER_MSVC) || (defined(_LIBCUDACXX_CUDACC_VER) && (_LIBCUDACXX_CUDACC_VER < 1105000)) - # define _LIBCUDACXX_HAS_NO_INT128 - #endif diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 16e7a58b020..c964c85156c 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -21,7 +21,7 @@ function(find_and_configure_cucollections) cuco 0.0 GLOBAL_TARGETS cuco::cuco CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections - GIT_TAG 193de1aa74f5721717f991ca757dc610c852bb17 + GIT_TAG 0ca860b824f5dc22cf8a41f09912e62e11f07d82 OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" ) diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index 0917adcd764..4b2917bc11e 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -16,11 +16,7 @@ function(find_and_configure_libcudacxx) include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) - rapids_cpm_libcudacxx( - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports PATCH_COMMAND patch --reject-file=- -p1 -N < - ${CUDF_SOURCE_DIR}/cmake/libcudacxx.patch || true - ) + rapids_cpm_libcudacxx(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) set(LIBCUDACXX_INCLUDE_DIR "${libcudacxx_SOURCE_DIR}/include" From 09035d606cebc66e7efa28e6b0702a698a67cff2 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Thu, 20 Jan 2022 14:36:44 -0600 Subject: [PATCH 11/14] Use fsspec.parquet for improved read_parquet performance from remote storage (#9589) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **Important Note**: ~Marking this as WIP until the `fsspec.parquet` module is available in a filesystem_spec release~ (fsspec.parquet module is available) This PR modifies `cudf.read_parquet` and `dask_cudf.read_parquet` to leverage the new `fsspec.parquet.open_parquet_file` function for optimized data transfer/caching from remote storage. 
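A minimal usage sketch of the new read path (`storage_options` elided as in the examples below; per this change, precaching methods other than `"parquet"` or `None` raise a `ValueError`):

```python
import cudf

# With use_python_file_object=True (now the default), remote files are opened
# through fsspec.parquet.open_parquet_file. Precaching can be tuned via the
# new open_file_options argument, or disabled with {"method": None}.
df = cudf.read_parquet(
    "gs://my-bucket/criteo-parquet/day_0.parquet",
    columns=["I10"],
    open_file_options={"precache_options": {"method": "parquet"}},
)
```
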
The ~long-term~ goal is to remove the temporary data-transfer optimizations that we currently use in cudf.read_parquet. **Performance Motivation**: ```python In [1]: import cudf, dask_cudf ...: path = [ ...: "gs://my-bucket/criteo-parquet/day_0.parquet", ...: "gs://my-bucket/criteo-parquet/day_1.parquet", ...: ] # cudf BEFORE In [2]: %time df = cudf.read_parquet(path, columns=["I10"], storage_options=…) CPU times: user 11.1 s, sys: 11.5 s, total: 22.6 s Wall time: 24.4 s # cudf AFTER In [2]: %time df = cudf.read_parquet(path, columns=["I10"], storage_options=…) CPU times: user 3.48 s, sys: 722 ms, total: 4.2 s Wall time: 6.32 s # (Threaded) Dask-cudf BEFORE In [2]: %time df = dask_cudf.read_parquet(path, columns=["I10"], storage_options=…).compute() CPU times: user 27.1 s, sys: 15.5 s, total: 42.6 s Wall time: 57.6 s # (Threaded) Dask-cudf AFTER In [2]: %time df = dask_cudf.read_parquet(path, columns=["I10"], storage_options=…).compute() CPU times: user 3.43 s, sys: 851 ms, total: 4.28 s Wall time: 13.1 s ``` Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - https://github.com/brandon-b-miller - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/cudf/pull/9589 --- python/cudf/cudf/io/csv.py | 7 - python/cudf/cudf/io/parquet.py | 181 ++++++----------- python/cudf/cudf/tests/test_parquet.py | 13 +- python/cudf/cudf/tests/test_s3.py | 43 ++-- python/cudf/cudf/utils/ioutils.py | 183 +++++++++++++----- python/dask_cudf/dask_cudf/io/parquet.py | 31 +-- .../dask_cudf/dask_cudf/io/tests/test_s3.py | 20 +- 7 files changed, 257 insertions(+), 221 deletions(-) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 01f1fdf9020..4694243ad18 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -59,17 +59,10 @@ def read_csv( "`read_csv` does not yet support reading multiple files" ) - # Only need to pass byte_ranges to get_filepath_or_buffer - # if `use_python_file_object=False` - byte_ranges = None - if not use_python_file_object and byte_range: - byte_ranges = [byte_range] - filepath_or_buffer, compression = ioutils.get_filepath_or_buffer( path_or_data=filepath_or_buffer, compression=compression, iotypes=(BytesIO, StringIO, NativeFile), - byte_ranges=byte_ranges, use_python_file_object=use_python_file_object, **kwargs, ) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 3e73e0c9e3d..a919b00692d 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1,14 +1,11 @@ # Copyright (c) 2019-2022, NVIDIA CORPORATION. -import io -import json import warnings from collections import defaultdict from contextlib import ExitStack from typing import Dict, List, Tuple from uuid import uuid4 -import fsspec import numpy as np from pyarrow import dataset as ds, parquet as pq @@ -310,103 +307,6 @@ def _process_dataset( ) -def _get_byte_ranges(file_list, row_groups, columns, fs, **kwargs): - - # This utility is used to collect the footer metadata - # from a parquet file. This metadata is used to define - # the exact byte-ranges that will be needed to read the - # target column-chunks from the file. - # - # This utility is only used for remote storage. - # - # The calculated byte-range information is used within - # cudf.io.ioutils.get_filepath_or_buffer (which uses - # _fsspec_data_transfer to convert non-local fsspec file - # objects into local byte buffers). 
- - if row_groups is None: - if columns is None: - return None, None, None # No reason to construct this - row_groups = [None for path in file_list] - - # Construct a list of required byte-ranges for every file - all_byte_ranges, all_footers, all_sizes = [], [], [] - for path, rgs in zip(file_list, row_groups): - - # Step 0 - Get size of file - if fs is None: - file_size = path.size - else: - file_size = fs.size(path) - - # Step 1 - Get 32 KB from tail of file. - # - # This "sample size" can be tunable, but should - # always be >= 8 bytes (so we can read the footer size) - tail_size = min(kwargs.get("footer_sample_size", 32_000), file_size,) - if fs is None: - path.seek(file_size - tail_size) - footer_sample = path.read(tail_size) - else: - footer_sample = fs.tail(path, tail_size) - - # Step 2 - Read the footer size and re-read a larger - # tail if necessary - footer_size = int.from_bytes(footer_sample[-8:-4], "little") - if tail_size < (footer_size + 8): - if fs is None: - path.seek(file_size - (footer_size + 8)) - footer_sample = path.read(footer_size + 8) - else: - footer_sample = fs.tail(path, footer_size + 8) - - # Step 3 - Collect required byte ranges - byte_ranges = [] - md = pq.ParquetFile(io.BytesIO(footer_sample)).metadata - column_set = None if columns is None else set(columns) - if column_set is not None: - schema = md.schema.to_arrow_schema() - has_pandas_metadata = ( - schema.metadata is not None and b"pandas" in schema.metadata - ) - if has_pandas_metadata: - md_index = [ - ind - for ind in json.loads( - schema.metadata[b"pandas"].decode("utf8") - ).get("index_columns", []) - # Ignore RangeIndex information - if not isinstance(ind, dict) - ] - column_set |= set(md_index) - for r in range(md.num_row_groups): - # Skip this row-group if we are targetting - # specific row-groups - if rgs is None or r in rgs: - row_group = md.row_group(r) - for c in range(row_group.num_columns): - column = row_group.column(c) - name = column.path_in_schema - # Skip this column if we are targetting a - # specific columns - split_name = name.split(".")[0] - if ( - column_set is None - or name in column_set - or split_name in column_set - ): - file_offset0 = column.dictionary_page_offset - if file_offset0 is None: - file_offset0 = column.data_page_offset - num_bytes = column.total_compressed_size - byte_ranges.append((file_offset0, num_bytes)) - - all_byte_ranges.append(byte_ranges) - all_footers.append(footer_sample) - all_sizes.append(file_size) - return all_byte_ranges, all_footers, all_sizes - - @ioutils.doc_read_parquet() def read_parquet( filepath_or_buffer, @@ -418,13 +318,24 @@ def read_parquet( num_rows=None, strings_to_categorical=False, use_pandas_metadata=True, - use_python_file_object=False, + use_python_file_object=True, categorical_partitions=True, + open_file_options=None, *args, **kwargs, ): """{docstring}""" + # Do not allow the user to set file-opening options + # when `use_python_file_object=False` is specified + if use_python_file_object is False: + if open_file_options: + raise ValueError( + "open_file_options is not currently supported when " + "use_python_file_object is set to False." + ) + open_file_options = {} + # Multiple sources are passed as a list. If a single source is passed, # wrap it in a list for unified processing downstream. 
if not is_list_like(filepath_or_buffer): @@ -470,38 +381,18 @@ def read_parquet( raise ValueError("cudf cannot apply filters to open file objects.") filepath_or_buffer = paths if paths else filepath_or_buffer - # Check if we should calculate the specific byte-ranges - # needed for each parquet file. We always do this when we - # have a file-system object to work with and it is not a - # local filesystem object. We can also do it without a - # file-system object for `AbstractBufferedFile` buffers - byte_ranges, footers, file_sizes = None, None, None - if not use_python_file_object: - need_byte_ranges = fs is not None and not ioutils._is_local_filesystem( - fs - ) - if need_byte_ranges or ( - filepath_or_buffer - and isinstance( - filepath_or_buffer[0], fsspec.spec.AbstractBufferedFile, - ) - ): - byte_ranges, footers, file_sizes = _get_byte_ranges( - filepath_or_buffer, row_groups, columns, fs, **kwargs - ) - filepaths_or_buffers = [] + if use_python_file_object: + open_file_options = _default_open_file_options( + open_file_options, columns, row_groups, fs=fs, + ) for i, source in enumerate(filepath_or_buffer): - tmp_source, compression = ioutils.get_filepath_or_buffer( path_or_data=source, compression=None, fs=fs, - byte_ranges=byte_ranges[i] if byte_ranges else None, - footer=footers[i] if footers else None, - file_size=file_sizes[i] if file_sizes else None, - add_par1_magic=True, use_python_file_object=use_python_file_object, + open_file_options=open_file_options, **kwargs, ) @@ -953,3 +844,41 @@ def __enter__(self): def __exit__(self, *args): self.close() + + +def _default_open_file_options( + open_file_options, columns, row_groups, fs=None +): + """ + Set default fields in open_file_options. + + Copies and updates `open_file_options` to + include column and row-group information + under the "precache_options" key. 
By default, + we set "method" to "parquet", but precaching + will be disabled if the user chooses `method=None` + + Parameters + ---------- + open_file_options : dict or None + columns : list + row_groups : list + fs : fsspec.AbstractFileSystem, Optional + """ + if fs and ioutils._is_local_filesystem(fs): + # Quick return for local fs + return open_file_options or {} + # Assume remote storage if `fs` was not specified + open_file_options = (open_file_options or {}).copy() + precache_options = open_file_options.pop("precache_options", {}).copy() + if precache_options.get("method", "parquet") == "parquet": + precache_options.update( + { + "method": "parquet", + "engine": precache_options.get("engine", "pyarrow"), + "columns": columns, + "row_groups": row_groups, + } + ) + open_file_options["precache_options"] = precache_options + return open_file_options diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 519f24b7ca6..21556aad1eb 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -748,7 +748,10 @@ def test_parquet_reader_arrow_nativefile(parquet_path_or_buf): assert_eq(expect, got) -def test_parquet_reader_use_python_file_object(parquet_path_or_buf): +@pytest.mark.parametrize("use_python_file_object", [True, False]) +def test_parquet_reader_use_python_file_object( + parquet_path_or_buf, use_python_file_object +): # Check that the non-default `use_python_file_object=True` # option works as expected expect = cudf.read_parquet(parquet_path_or_buf("filepath")) @@ -756,11 +759,15 @@ def test_parquet_reader_use_python_file_object(parquet_path_or_buf): # Pass open fsspec file with fs.open(paths[0], mode="rb") as fil: - got1 = cudf.read_parquet(fil, use_python_file_object=True) + got1 = cudf.read_parquet( + fil, use_python_file_object=use_python_file_object + ) assert_eq(expect, got1) # Pass path only - got2 = cudf.read_parquet(paths[0], use_python_file_object=True) + got2 = cudf.read_parquet( + paths[0], use_python_file_object=use_python_file_object + ) assert_eq(expect, got2) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 5738e1f0d00..da1ffc1fc16 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -131,6 +131,9 @@ def pdf_ext(scope="module"): df["Integer"] = np.array([i for i in range(size)]) df["List"] = [[i] for i in range(size)] df["Struct"] = [{"a": i} for i in range(size)] + df["String"] = (["Alpha", "Beta", "Gamma", "Delta"] * (-(size // -4)))[ + :size + ] return df @@ -225,9 +228,16 @@ def test_write_csv(s3_base, s3so, pdf, chunksize): @pytest.mark.parametrize("bytes_per_thread", [32, 1024]) @pytest.mark.parametrize("columns", [None, ["Float", "String"]]) -@pytest.mark.parametrize("use_python_file_object", [False, True]) +@pytest.mark.parametrize("precache", [None, "parquet"]) +@pytest.mark.parametrize("use_python_file_object", [True, False]) def test_read_parquet( - s3_base, s3so, pdf, bytes_per_thread, columns, use_python_file_object + s3_base, + s3so, + pdf, + bytes_per_thread, + columns, + precache, + use_python_file_object, ): fname = "test_parquet_reader.parquet" bname = "parquet" @@ -239,10 +249,15 @@ def test_read_parquet( with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): got1 = cudf.read_parquet( "s3://{}/{}".format(bname, fname), - use_python_file_object=use_python_file_object, + open_file_options=( + {"precache_options": {"method": precache}} + if use_python_file_object + else None + ), 
storage_options=s3so, bytes_per_thread=bytes_per_thread, columns=columns, + use_python_file_object=use_python_file_object, ) expect = pdf[columns] if columns else pdf assert_eq(expect, got1) @@ -256,25 +271,18 @@ def test_read_parquet( with fs.open("s3://{}/{}".format(bname, fname), mode="rb") as f: got2 = cudf.read_parquet( f, - use_python_file_object=use_python_file_object, bytes_per_thread=bytes_per_thread, columns=columns, + use_python_file_object=use_python_file_object, ) assert_eq(expect, got2) @pytest.mark.parametrize("bytes_per_thread", [32, 1024]) @pytest.mark.parametrize("columns", [None, ["List", "Struct"]]) -@pytest.mark.parametrize("use_python_file_object", [False, True]) @pytest.mark.parametrize("index", [None, "Integer"]) def test_read_parquet_ext( - s3_base, - s3so, - pdf_ext, - bytes_per_thread, - columns, - use_python_file_object, - index, + s3_base, s3so, pdf_ext, bytes_per_thread, columns, index, ): fname = "test_parquet_reader_ext.parquet" bname = "parquet" @@ -290,7 +298,6 @@ def test_read_parquet_ext( with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): got1 = cudf.read_parquet( "s3://{}/{}".format(bname, fname), - use_python_file_object=use_python_file_object, storage_options=s3so, bytes_per_thread=bytes_per_thread, footer_sample_size=3200, @@ -326,12 +333,12 @@ def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns): assert_eq(expect, got) -@pytest.mark.parametrize("python_file", [True, False]) -def test_read_parquet_filters(s3_base, s3so, pdf, python_file): +@pytest.mark.parametrize("precache", [None, "parquet"]) +def test_read_parquet_filters(s3_base, s3so, pdf_ext, precache): fname = "test_parquet_reader_filters.parquet" bname = "parquet" buffer = BytesIO() - pdf.to_parquet(path=buffer) + pdf_ext.to_parquet(path=buffer) buffer.seek(0) filters = [("String", "==", "Omega")] with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): @@ -339,11 +346,11 @@ def test_read_parquet_filters(s3_base, s3so, pdf, python_file): "s3://{}/{}".format(bname, fname), storage_options=s3so, filters=filters, - use_python_file_object=python_file, + open_file_options={"precache_options": {"method": precache}}, ) # All row-groups should be filtered out - assert_eq(pdf.iloc[:0], got.reset_index(drop=True)) + assert_eq(pdf_ext.iloc[:0], got.reset_index(drop=True)) @pytest.mark.parametrize("partition_cols", [None, ["String"]]) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 6f958860dad..8f8a40ae4ab 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -3,6 +3,7 @@ import datetime import os import urllib +import warnings from io import BufferedWriter, BytesIO, IOBase, TextIOWrapper from threading import Thread @@ -17,6 +18,13 @@ from cudf.utils.docutils import docfmt_partial +try: + import fsspec.parquet as fsspec_parquet + +except ImportError: + fsspec_parquet = None + + _docstring_remote_sources = """ - cuDF supports local and remote data stores. See configuration details for available sources @@ -160,10 +168,17 @@ use_pandas_metadata : boolean, default True If True and dataset has custom PANDAS schema metadata, ensure that index columns are also loaded. -use_python_file_object : boolean, default False +use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. This option is likely to improve - performance when making small reads from larger parquet files. 
+ AbstractBufferedFile objects at IO time. Setting this argument to `False` + will require the entire file to be copied to host memory, and is highly + discouraged. +open_file_options : dict, optional + Dictionary of key-value pairs to pass to the function used to open remote + files. By default, this will be `fsspec.parquet.open_parquet_file`. To + deactivate optimized precaching, set the "method" to `None` under the + "precache_options" key. Note that the `open_file_func` key can also be + used to specify a custom file-open function. Returns ------- @@ -1220,6 +1235,100 @@ def _get_filesystem_and_paths(path_or_data, **kwargs): return fs, return_paths +def _set_context(obj, stack): + # Helper function to place open file on context stack + if stack is None: + return obj + return stack.enter_context(obj) + + +def _open_remote_files( + paths, + fs, + context_stack=None, + open_file_func=None, + precache_options=None, + **kwargs, +): + """Return a list of open file-like objects given + a list of remote file paths. + + Parameters + ---------- + paths : list(str) + List of file-path strings. + fs : fsspec.AbstractFileSystem + Fsspec file-system object. + context_stack : contextlib.ExitStack, Optional + Context manager to use for open files. + open_file_func : Callable, Optional + Call-back function to use for opening. If this argument + is specified, all other arguments will be ignored. + precache_options : dict, optional + Dictionary of key-word arguments to pass to use for + precaching. Unless the input contains ``{"method": None}``, + ``fsspec.parquet.open_parquet_file`` will be used for remote + storage. + **kwargs : + Key-word arguments to be passed to format-specific + open functions. + """ + + # Just use call-back function if one was specified + if open_file_func is not None: + return [ + _set_context(open_file_func(path, **kwargs), context_stack) + for path in paths + ] + + # Check if the "precache" option is supported. + # In the future, fsspec should do this check for us + precache_options = (precache_options or {}).copy() + precache = precache_options.pop("method", None) + if precache not in ("parquet", None): + raise ValueError(f"{precache} not a supported `precache` option.") + + # Check that "parts" caching (used for all format-aware file handling) + # is supported by the installed fsspec/s3fs version + if precache == "parquet" and not fsspec_parquet: + warnings.warn( + f"This version of fsspec ({fsspec.__version__}) does " + f"not support parquet-optimized precaching. Please upgrade " + f"to the latest fsspec version for better performance." + ) + precache = None + + if precache == "parquet": + # Use fsspec.parquet module. + # TODO: Use `cat_ranges` to collect "known" + # parts for all files at once. 
+ row_groups = precache_options.pop("row_groups", None) or ( + [None] * len(paths) + ) + return [ + ArrowPythonFile( + _set_context( + fsspec_parquet.open_parquet_file( + path, + fs=fs, + row_groups=rgs, + **precache_options, + **kwargs, + ), + context_stack, + ) + ) + for path, rgs in zip(paths, row_groups) + ] + + # Default open - Use pyarrow filesystem API + pa_fs = PyFileSystem(FSSpecHandler(fs)) + return [ + _set_context(pa_fs.open_input_file(fpath), context_stack) + for fpath in paths + ] + + def get_filepath_or_buffer( path_or_data, compression, @@ -1228,6 +1337,7 @@ def get_filepath_or_buffer( iotypes=(BytesIO, NativeFile), byte_ranges=None, use_python_file_object=False, + open_file_options=None, **kwargs, ): """Return either a filepath string to data, or a memory buffer of data. @@ -1249,6 +1359,9 @@ def get_filepath_or_buffer( use_python_file_object : boolean, default False If True, Arrow-backed PythonFile objects will be used in place of fsspec AbstractBufferedFile objects. + open_file_options : dict, optional + Optional dictionary of key-word arguments to pass to + `_open_remote_files` (used for remote storage only). Returns ------- @@ -1282,19 +1395,14 @@ def get_filepath_or_buffer( else: if use_python_file_object: - pa_fs = PyFileSystem(FSSpecHandler(fs)) - path_or_data = [ - pa_fs.open_input_file(fpath) for fpath in paths - ] + path_or_data = _open_remote_files( + paths, fs, **(open_file_options or {}), + ) else: path_or_data = [ BytesIO( _fsspec_data_transfer( - fpath, - fs=fs, - mode=mode, - byte_ranges=byte_ranges, - **kwargs, + fpath, fs=fs, mode=mode, **kwargs, ) ) for fpath in paths @@ -1309,9 +1417,7 @@ def get_filepath_or_buffer( path_or_data = ArrowPythonFile(path_or_data) else: path_or_data = BytesIO( - _fsspec_data_transfer( - path_or_data, mode=mode, byte_ranges=byte_ranges, **kwargs - ) + _fsspec_data_transfer(path_or_data, mode=mode, **kwargs) ) return path_or_data, compression @@ -1545,10 +1651,7 @@ def _ensure_filesystem(passed_filesystem, path, **kwargs): def _fsspec_data_transfer( path_or_fob, fs=None, - byte_ranges=None, - footer=None, file_size=None, - add_par1_magic=None, bytes_per_thread=256_000_000, max_gap=64_000, mode="rb", @@ -1568,48 +1671,22 @@ def _fsspec_data_transfer( file_size = file_size or fs.size(path_or_fob) # Check if a direct read makes the most sense - if not byte_ranges and bytes_per_thread >= file_size: + if bytes_per_thread >= file_size: if file_like: return path_or_fob.read() else: - return fs.open(path_or_fob, mode=mode, cache_type="none").read() + return fs.open(path_or_fob, mode=mode, cache_type="all").read() # Threaded read into "local" buffer buf = np.zeros(file_size, dtype="b") - if byte_ranges: - - # Optimize/merge the ranges - byte_ranges = _merge_ranges( - byte_ranges, max_block=bytes_per_thread, max_gap=max_gap, - ) - - # Call multi-threaded data transfer of - # remote byte-ranges to local buffer - _read_byte_ranges( - path_or_fob, byte_ranges, buf, fs=fs, **kwargs, - ) - - # Add Header & Footer bytes - if footer is not None: - footer_size = len(footer) - buf[-footer_size:] = np.frombuffer( - footer[-footer_size:], dtype="b" - ) - # Add parquet magic bytes (optional) - if add_par1_magic: - buf[:4] = np.frombuffer(b"PAR1", dtype="b") - if footer is None: - buf[-4:] = np.frombuffer(b"PAR1", dtype="b") - - else: - byte_ranges = [ - (b, min(bytes_per_thread, file_size - b)) - for b in range(0, file_size, bytes_per_thread) - ] - _read_byte_ranges( - path_or_fob, byte_ranges, buf, fs=fs, **kwargs, - ) + byte_ranges = [ + (b, 
min(bytes_per_thread, file_size - b)) + for b in range(0, file_size, bytes_per_thread) + ] + _read_byte_ranges( + path_or_fob, byte_ranges, buf, fs=fs, **kwargs, + ) return buf.tobytes() diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index a49d73493ec..ac5795fa2ec 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -20,7 +20,9 @@ import cudf from cudf.core.column import as_column, build_categorical_column from cudf.io import write_to_dataset +from cudf.io.parquet import _default_open_file_options from cudf.utils.dtypes import cudf_dtype_from_pa_type +from cudf.utils.ioutils import _is_local_filesystem, _open_remote_files class CudfEngine(ArrowDatasetEngine): @@ -64,6 +66,7 @@ def _read_paths( partitions=None, partitioning=None, partition_keys=None, + open_file_options=None, **kwargs, ): @@ -75,15 +78,15 @@ def _read_paths( # Non-local filesystem handling paths_or_fobs = paths - if not cudf.utils.ioutils._is_local_filesystem(fs): - - # Convert paths to file objects for remote data - paths_or_fobs = [ - stack.enter_context( - fs.open(path, mode="rb", cache_type="none") - ) - for path in paths - ] + if not _is_local_filesystem(fs): + paths_or_fobs = _open_remote_files( + paths_or_fobs, + fs, + context_stack=stack, + **_default_open_file_options( + open_file_options, columns, row_groups + ), + ) # Use cudf to read in data df = cudf.read_parquet( @@ -150,6 +153,7 @@ def read_partition( partitions=(), partitioning=None, schema=None, + open_file_options=None, **kwargs, ): @@ -168,7 +172,10 @@ def read_partition( if not isinstance(pieces, list): pieces = [pieces] + # Extract supported kwargs from `kwargs` strings_to_cats = kwargs.get("strings_to_categorical", False) + read_kwargs = kwargs.get("read", {}) + read_kwargs.update(open_file_options or {}) # Assume multi-piece read paths = [] @@ -192,7 +199,7 @@ def read_partition( partitions=partitions, partitioning=partitioning, partition_keys=last_partition_keys, - **kwargs.get("read", {}), + **read_kwargs, ) ) paths = rgs = [] @@ -215,13 +222,13 @@ def read_partition( partitions=partitions, partitioning=partitioning, partition_keys=last_partition_keys, - **kwargs.get("read", {}), + **read_kwargs, ) ) df = cudf.concat(dfs) if len(dfs) > 1 else dfs[0] # Re-set "object" dtypes align with pa schema - set_object_dtypes_from_pa_schema(df, kwargs.get("schema", None)) + set_object_dtypes_from_pa_schema(df, schema) if index and (index[0] in df.columns): df = df.set_index(index[0]) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index ad53f5cfe0f..83ff1273b36 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -6,6 +6,7 @@ from io import BytesIO import pandas as pd +import pyarrow.fs as pa_fs import pytest import dask_cudf @@ -115,7 +116,15 @@ def test_read_csv(s3_base, s3so): assert df.a.sum().compute() == 4 -def test_read_parquet(s3_base, s3so): +@pytest.mark.parametrize( + "open_file_options", + [ + {"precache_options": {"method": None}}, + {"precache_options": {"method": "parquet"}}, + {"open_file_func": None}, + ], +) +def test_read_parquet(s3_base, s3so, open_file_options): pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2.1, 2.2, 2.3, 2.4]}) buffer = BytesIO() pdf.to_parquet(path=buffer) @@ -123,8 +132,15 @@ def test_read_parquet(s3_base, s3so): with s3_context( s3_base=s3_base, bucket="daskparquet", files={"file.parq": buffer} ): + 
if "open_file_func" in open_file_options: + fs = pa_fs.S3FileSystem( + endpoint_override=s3so["client_kwargs"]["endpoint_url"], + ) + open_file_options["open_file_func"] = fs.open_input_file df = dask_cudf.read_parquet( - "s3://daskparquet/*.parq", storage_options=s3so + "s3://daskparquet/*.parq", + storage_options=s3so, + open_file_options=open_file_options, ) assert df.a.sum().compute() == 10 assert df.b.sum().compute() == 9 From 1b93126ee20da94acf28c07c2fb2ebba14376ea8 Mon Sep 17 00:00:00 2001 From: Jordan Jacobelli Date: Thu, 20 Jan 2022 22:05:21 +0100 Subject: [PATCH 12/14] Prepare upload scripts for Python 3.7 removal (#10092) As we will remove Python 3.7, we need to update the Python version in the upload scripts Authors: - Jordan Jacobelli (https://github.com/Ethyling) Approvers: - Sevag Hanssian (https://github.com/sevagh) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/10092 --- ci/cpu/prebuild.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ci/cpu/prebuild.sh b/ci/cpu/prebuild.sh index 746c0005a47..8a2c9d9be7c 100755 --- a/ci/cpu/prebuild.sh +++ b/ci/cpu/prebuild.sh @@ -4,12 +4,13 @@ set -e DEFAULT_CUDA_VER="11.5" +DEFAULT_PYTHON_VER="3.8" #Always upload cudf Python package export UPLOAD_CUDF=1 #Upload libcudf once per CUDA -if [[ "$PYTHON" == "3.7" ]]; then +if [[ "$PYTHON" == "${DEFAULT_PYTHON_VER}" ]]; then export UPLOAD_LIBCUDF=1 else export UPLOAD_LIBCUDF=0 @@ -23,7 +24,7 @@ else fi #We only want to upload libcudf_kafka once per python/CUDA combo -if [[ "$PYTHON" == "3.7" ]] && [[ "$CUDA" == "${DEFAULT_CUDA_VER}" ]]; then +if [[ "$PYTHON" == "${DEFAULT_PYTHON_VER}" ]] && [[ "$CUDA" == "${DEFAULT_CUDA_VER}" ]]; then export UPLOAD_LIBCUDF_KAFKA=1 else export UPLOAD_LIBCUDF_KAFKA=0 From 53a31d1b0198412ffa002870d0762a4e719a4e0f Mon Sep 17 00:00:00 2001 From: MithunR Date: Thu, 20 Jan 2022 15:00:49 -0800 Subject: [PATCH 13/14] ORC writer API changes for granular statistics (#10058) Depends on #10041. The erstwhile ORC writer API exposed only a binary choice to choose the level of statistics: ENABLED/DISABLED. This commit allows the ORC writer to further choose whether statistics are collected at the ROW_GROUP or STRIPE level. This commit also includes the relevant changes to `java/` and `python/`. Authors: - MithunR (https://github.com/mythrocks) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Jason Lowe (https://github.com/jlowe) - GALI PREM SAGAR (https://github.com/galipremsagar) - Christopher Harris (https://github.com/cwharris) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/10058 --- .../io/orc/orc_writer_benchmark.cpp | 19 +++-- cpp/include/cudf/io/orc.hpp | 84 ++++++++++++++----- cpp/src/io/orc/writer_impl.cu | 44 ++++++---- cpp/src/io/orc/writer_impl.hpp | 8 +- java/src/main/native/src/TableJni.cpp | 4 +- .../cudf/_fuzz_testing/tests/fuzz_test_orc.py | 4 +- python/cudf/cudf/_lib/orc.pyx | 30 +++++-- python/cudf/cudf/io/orc.py | 8 +- python/cudf/cudf/tests/test_orc.py | 63 +++++++++++++- 9 files changed, 200 insertions(+), 64 deletions(-) diff --git a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp index be1a2073057..b0eba17359f 100644 --- a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp +++ b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "cudf/io/types.hpp" #include #include @@ -65,8 +66,14 @@ void BM_orc_write_varying_inout(benchmark::State& state) void BM_orc_write_varying_options(benchmark::State& state) { - auto const compression = static_cast(state.range(0)); - auto const enable_stats = state.range(1) != 0; + auto const compression = static_cast(state.range(0)); + auto const stats_freq = [&] { + switch (state.range(2)) { + case 0: return cudf::io::STATISTICS_NONE; + case 1: return cudf::io::ORC_STATISTICS_STRIPE; + default: return cudf::io::ORC_STATISTICS_ROW_GROUP; + } + }(); auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), int32_t(type_group_id::FLOATING_POINT), @@ -85,7 +92,7 @@ void BM_orc_write_varying_options(benchmark::State& state) cudf_io::orc_writer_options const options = cudf_io::orc_writer_options::builder(source_sink.make_sink_info(), view) .compression(compression) - .enable_statistics(enable_stats); + .enable_statistics(stats_freq); cudf_io::write_orc(options); } @@ -113,6 +120,8 @@ BENCHMARK_DEFINE_F(OrcWrite, writer_options) BENCHMARK_REGISTER_F(OrcWrite, writer_options) ->ArgsProduct({{int32_t(cudf::io::compression_type::NONE), int32_t(cudf::io::compression_type::SNAPPY)}, - {0, 1}}) + {int32_t{cudf::io::STATISTICS_NONE}, + int32_t{cudf::io::ORC_STATISTICS_STRIPE}, + int32_t{cudf::io::ORC_STATISTICS_ROW_GROUP}}}) ->Unit(benchmark::kMillisecond) ->UseManualTime(); diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 51f82bc4061..108251dd646 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -434,6 +434,18 @@ table_with_metadata read_orc( */ class orc_writer_options_builder; +/** + * @brief Constants to disambiguate statistics terminology for ORC. + * + * ORC refers to its finest granularity of row-grouping as "row group", + * which corresponds to Parquet "pages". + * Similarly, ORC's "stripe" corresponds to a Parquet "row group". + * The following constants disambiguate the terminology for the statistics + * collected at each level. + */ +static constexpr statistics_freq ORC_STATISTICS_STRIPE = statistics_freq::STATISTICS_ROWGROUP; +static constexpr statistics_freq ORC_STATISTICS_ROW_GROUP = statistics_freq::STATISTICS_PAGE; + /** * @brief Settings to use for `write_orc()`. */ @@ -442,8 +454,8 @@ class orc_writer_options { sink_info _sink; // Specify the compression format to use compression_type _compression = compression_type::AUTO; - // Enable writing column statistics - bool _enable_statistics = true; + // Specify frequency of statistics collection + statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP; // Maximum size of each stripe (unless smaller than a single row group) size_t _stripe_size_bytes = default_stripe_size_bytes; // Maximum number of rows in stripe (unless smaller than a single row group) @@ -501,7 +513,15 @@ class orc_writer_options { /** * @brief Whether writing column statistics is enabled/disabled. 
*/ - [[nodiscard]] bool is_enabled_statistics() const { return _enable_statistics; } + [[nodiscard]] bool is_enabled_statistics() const + { + return _stats_freq != statistics_freq::STATISTICS_NONE; + } + + /** + * @brief Returns frequency of statistics collection. + */ + [[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; } /** * @brief Returns maximum stripe size, in bytes. @@ -550,11 +570,16 @@ class orc_writer_options { void set_compression(compression_type comp) { _compression = comp; } /** - * @brief Enable/Disable writing column statistics. + * @brief Choose granularity of statistics collection. * - * @param val Boolean value to enable/disable statistics. + * The granularity can be set to: + * - cudf::io::STATISTICS_NONE: No statistics are collected. + * - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe. + * - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group. + * + * @param val Frequency of statistics collection. */ - void enable_statistics(bool val) { _enable_statistics = val; } + void enable_statistics(statistics_freq val) { _stats_freq = val; } /** * @brief Sets the maximum stripe size, in bytes. @@ -647,14 +672,19 @@ class orc_writer_options_builder { } /** - * @brief Enable/Disable writing column statistics. + * @brief Choose granularity of column statistics to be written + * + * The granularity can be set to: + * - cudf::io::STATISTICS_NONE: No statistics are collected. + * - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe. + * - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group. * - * @param val Boolean value to enable/disable. + * @param val Level of statistics collection. * @return this for chaining. */ - orc_writer_options_builder& enable_statistics(bool val) + orc_writer_options_builder& enable_statistics(statistics_freq val) { - options._enable_statistics = val; + options._stats_freq = val; return *this; } @@ -775,8 +805,8 @@ class chunked_orc_writer_options { sink_info _sink; // Specify the compression format to use compression_type _compression = compression_type::AUTO; - // Enable writing column statistics - bool _enable_statistics = true; + // Specify granularity of statistics collection + statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP; // Maximum size of each stripe (unless smaller than a single row group) size_t _stripe_size_bytes = default_stripe_size_bytes; // Maximum number of rows in stripe (unless smaller than a single row group) @@ -825,9 +855,9 @@ class chunked_orc_writer_options { [[nodiscard]] compression_type get_compression() const { return _compression; } /** - * @brief Whether writing column statistics is enabled/disabled. + * @brief Returns granularity of statistics collection. */ - [[nodiscard]] bool is_enabled_statistics() const { return _enable_statistics; } + [[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; } /** * @brief Returns maximum stripe size, in bytes. @@ -871,11 +901,16 @@ class chunked_orc_writer_options { void set_compression(compression_type comp) { _compression = comp; } /** - * @brief Enable/Disable writing column statistics. + * @brief Choose granularity of statistics collection + * + * The granularity can be set to: + * - cudf::io::STATISTICS_NONE: No statistics are collected. + * - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe. 
+ * - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group. * - * @param val Boolean value to enable/disable. + * @param val Frequency of statistics collection. */ - void enable_statistics(bool val) { _enable_statistics = val; } + void enable_statistics(statistics_freq val) { _stats_freq = val; } /** * @brief Sets the maximum stripe size, in bytes. @@ -958,14 +993,19 @@ class chunked_orc_writer_options_builder { } /** - * @brief Enable/Disable writing column statistics. + * @brief Choose granularity of statistics collection + * + * The granularity can be set to: + * - cudf::io::STATISTICS_NONE: No statistics are collected. + * - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe. + * - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group. * - * @param val Boolean value to enable/disable. + * @param val Frequency of statistics collection. * @return this for chaining. */ - chunked_orc_writer_options_builder& enable_statistics(bool val) + chunked_orc_writer_options_builder& enable_statistics(statistics_freq val) { - options._enable_statistics = val; + options._stats_freq = val; return *this; } diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 105c473c15e..a917dbf93a5 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1063,15 +1063,15 @@ void set_stat_desc_leaf_cols(device_span columns, } writer::impl::encoded_statistics writer::impl::gather_statistic_blobs( - bool are_statistics_enabled, + statistics_freq stats_freq, orc_table_view const& orc_table, file_segmentation const& segmentation) { - auto const num_rowgroup_blobs = segmentation.rowgroups.count(); - auto const num_stripe_blobs = segmentation.num_stripes() * orc_table.num_columns(); - auto const num_file_blobs = orc_table.num_columns(); - auto const num_stat_blobs = num_rowgroup_blobs + num_stripe_blobs + num_file_blobs; - + auto const num_rowgroup_blobs = segmentation.rowgroups.count(); + auto const num_stripe_blobs = segmentation.num_stripes() * orc_table.num_columns(); + auto const num_file_blobs = orc_table.num_columns(); + auto const num_stat_blobs = num_rowgroup_blobs + num_stripe_blobs + num_file_blobs; + auto const are_statistics_enabled = stats_freq != statistics_freq::STATISTICS_NONE; if (not are_statistics_enabled or num_stat_blobs == 0) { return {}; } hostdevice_vector stat_desc(orc_table.num_columns(), stream); @@ -1164,17 +1164,27 @@ writer::impl::encoded_statistics writer::impl::gather_statistic_blobs( hostdevice_vector blobs( stat_merge[num_stat_blobs - 1].start_chunk + stat_merge[num_stat_blobs - 1].num_chunks, stream); - gpu::orc_encode_statistics( - blobs.device_ptr(), stat_merge.device_ptr(), stat_chunks.data(), num_stat_blobs, stream); + // Skip rowgroup blobs when encoding, if chosen granularity is coarser than "ROW_GROUP". + auto const is_granularity_rowgroup = stats_freq == ORC_STATISTICS_ROW_GROUP; + auto const num_skip = is_granularity_rowgroup ? 
0 : num_rowgroup_blobs; + gpu::orc_encode_statistics(blobs.device_ptr(), + stat_merge.device_ptr(num_skip), + stat_chunks.data() + num_skip, + num_stat_blobs - num_skip, + stream); stat_merge.device_to_host(stream); blobs.device_to_host(stream, true); - std::vector rowgroup_blobs(num_rowgroup_blobs); - for (size_t i = 0; i < num_rowgroup_blobs; i++) { - auto const stat_begin = blobs.host_ptr(rowgroup_stat_merge[i].start_chunk); - auto const stat_end = stat_begin + rowgroup_stat_merge[i].num_chunks; - rowgroup_blobs[i].assign(stat_begin, stat_end); - } + auto rowgroup_blobs = [&]() -> std::vector { + if (not is_granularity_rowgroup) { return {}; } + std::vector rowgroup_blobs(num_rowgroup_blobs); + for (size_t i = 0; i < num_rowgroup_blobs; i++) { + auto const stat_begin = blobs.host_ptr(rowgroup_stat_merge[i].start_chunk); + auto const stat_end = stat_begin + rowgroup_stat_merge[i].num_chunks; + rowgroup_blobs[i].assign(stat_begin, stat_end); + } + return rowgroup_blobs; + }(); std::vector stripe_blobs(num_stripe_blobs); for (size_t i = 0; i < num_stripe_blobs; i++) { @@ -1351,7 +1361,7 @@ writer::impl::impl(std::unique_ptr sink, max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, row_index_stride{options.get_row_index_stride()}, compression_kind_(to_orc_compression(options.get_compression())), - enable_statistics_(options.is_enabled_statistics()), + stats_freq_(options.get_statistics_freq()), single_write_mode(mode == SingleWriteMode::YES), kv_meta(options.get_key_value_metadata()), out_sink_(std::move(sink)) @@ -1372,7 +1382,7 @@ writer::impl::impl(std::unique_ptr sink, max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, row_index_stride{options.get_row_index_stride()}, compression_kind_(to_orc_compression(options.get_compression())), - enable_statistics_(options.is_enabled_statistics()), + stats_freq_(options.get_statistics_freq()), single_write_mode(mode == SingleWriteMode::YES), kv_meta(options.get_key_value_metadata()), out_sink_(std::move(sink)) @@ -1954,7 +1964,7 @@ void writer::impl::write(table_view const& table) ProtobufWriter pbw_(&buffer_); - auto const statistics = gather_statistic_blobs(enable_statistics_, orc_table, segmentation); + auto const statistics = gather_statistic_blobs(stats_freq_, orc_table, segmentation); // Write stripes std::vector> write_tasks; diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 903ceaa1714..69bb6029ee0 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -293,13 +293,13 @@ class writer::impl { /** * @brief Returns column statistics encoded in ORC protobuf format. 
* - * @param are_statistics_enabled True if statistics are to be included in the output file + * @param statistics_freq Frequency of statistics to be included in the output file * @param orc_table Table information to be written * @param columns List of columns * @param segmentation stripe and rowgroup ranges * @return The statistic blobs */ - encoded_statistics gather_statistic_blobs(bool are_statistics_enabled, + encoded_statistics gather_statistic_blobs(statistics_freq statistics_freq, orc_table_view const& orc_table, file_segmentation const& segmentation); @@ -365,8 +365,8 @@ class writer::impl { size_t compression_blocksize_ = DEFAULT_COMPRESSION_BLOCKSIZE; CompressionKind compression_kind_ = CompressionKind::NONE; - bool enable_dictionary_ = true; - bool enable_statistics_ = true; + bool enable_dictionary_ = true; + statistics_freq stats_freq_ = ORC_STATISTICS_ROW_GROUP; // Overall file metadata. Filled in during the process and written during write_chunked_end() cudf::io::orc::FileFooter ff; diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 03faf9be021..22b089fa93a 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1733,7 +1733,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink) .metadata(&metadata) .compression(static_cast(j_compression)) - .enable_statistics(true) + .enable_statistics(ORC_STATISTICS_ROW_GROUP) .key_value_metadata(kv_metadata) .build(); auto writer_ptr = std::make_unique(opts); @@ -1776,7 +1776,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin( chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink) .metadata(&metadata) .compression(static_cast(j_compression)) - .enable_statistics(true) + .enable_statistics(ORC_STATISTICS_ROW_GROUP) .key_value_metadata(kv_metadata) .build(); auto writer_ptr = std::make_unique(opts); diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py index b3fd7e8c5a7..977038d1fcb 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import io import sys @@ -74,7 +74,7 @@ def orc_reader_stripes_test(input_tuple, columns, stripes): data_handle=OrcWriter, params={ "compression": [None, "snappy"], - "enable_statistics": [True, False], + "enable_statistics": ["NONE", "STRIPE", "ROWGROUP"], }, ) def orc_writer_test(pdf, compression, enable_statistics): diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index cbba1796c26..ce4f183e795 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -9,6 +9,7 @@ from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector +cimport cudf._lib.cpp.io.types as cudf_io_types from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.io.orc cimport ( @@ -144,10 +145,27 @@ cdef compression_type _get_comp_type(object compression): raise ValueError(f"Unsupported `compression` type {compression}") +cdef cudf_io_types.statistics_freq _get_orc_stat_freq(object statistics): + """ + Convert ORC statistics terms to CUDF convention: + - ORC "STRIPE" == CUDF "ROWGROUP" + - ORC "ROWGROUP" == CUDF "PAGE" + """ + statistics = str(statistics).upper() + if statistics == "NONE": + return cudf_io_types.statistics_freq.STATISTICS_NONE + elif statistics == "STRIPE": + return cudf_io_types.statistics_freq.STATISTICS_ROWGROUP + elif statistics == "ROWGROUP": + return cudf_io_types.statistics_freq.STATISTICS_PAGE + else: + raise ValueError(f"Unsupported `statistics_freq` type {statistics}") + + cpdef write_orc(table, object path_or_buf, object compression=None, - bool enable_statistics=True, + object statistics="ROWGROUP", object stripe_size_bytes=None, object stripe_size_rows=None, object row_index_stride=None): @@ -189,7 +207,7 @@ cpdef write_orc(table, sink_info_c, table_view_from_table(table, ignore_index=True) ).metadata(tbl_meta.get()) .compression(compression_) - .enable_statistics( (True if enable_statistics else False)) + .enable_statistics(_get_orc_stat_freq(statistics)) .build() ) if stripe_size_bytes is not None: @@ -268,15 +286,15 @@ cdef class ORCWriter: cdef unique_ptr[orc_chunked_writer] writer cdef sink_info sink cdef unique_ptr[data_sink] _data_sink - cdef bool enable_stats + cdef cudf_io_types.statistics_freq stat_freq cdef compression_type comp_type cdef object index cdef unique_ptr[table_input_metadata] tbl_meta def __cinit__(self, object path, object index=None, - object compression=None, bool enable_statistics=True): + object compression=None, object statistics="ROWGROUP"): self.sink = make_sink_info(path, self._data_sink) - self.enable_stats = enable_statistics + self.stat_freq = _get_orc_stat_freq(statistics) self.comp_type = _get_comp_type(compression) self.index = index self.initialized = False @@ -350,7 +368,7 @@ cdef class ORCWriter: .metadata(self.tbl_meta.get()) .key_value_metadata(move(user_data)) .compression(self.comp_type) - .enable_statistics(self.enable_stats) + .enable_statistics(self.stat_freq) .build() ) self.writer.reset(new orc_chunked_writer(args)) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index c1cce3f996f..5c35d004ac0 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
import datetime import warnings @@ -395,7 +395,7 @@ def to_orc( df, fname, compression=None, - enable_statistics=True, + statistics="ROWGROUP", stripe_size_bytes=None, stripe_size_rows=None, row_index_stride=None, @@ -431,7 +431,7 @@ def to_orc( df, file_obj, compression, - enable_statistics, + statistics, stripe_size_bytes, stripe_size_rows, row_index_stride, @@ -441,7 +441,7 @@ def to_orc( df, path_or_buf, compression, - enable_statistics, + statistics, stripe_size_bytes, stripe_size_rows, row_index_stride, diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 44812f5aba4..8689f773a02 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -391,6 +391,64 @@ def test_orc_writer(datadir, tmpdir, reference_file, columns, compression): assert_eq(expect, got) +@pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"]) +def test_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): + reference_file = "TestOrcFile.demo-12-zlib.orc" + pdf_fname = datadir / reference_file + gdf_fname = tmpdir.join("gdf.orc") + + try: + orcfile = pa.orc.ORCFile(pdf_fname) + except Exception as excpr: + if type(excpr).__name__ == "ArrowIOError": + pytest.skip(".orc file is not found") + else: + print(type(excpr).__name__) + + expect = orcfile.read().to_pandas() + cudf.from_pandas(expect).to_orc(gdf_fname.strpath, statistics=stats_freq) + got = pa.orc.ORCFile(gdf_fname).read().to_pandas() + + assert_eq(expect, got) + + +@pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"]) +def test_chunked_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): + reference_file = "TestOrcFile.test1.orc" + pdf_fname = datadir / reference_file + gdf_fname = tmpdir.join("chunked_gdf.orc") + + try: + orcfile = pa.orc.ORCFile(pdf_fname) + except Exception as excpr: + if type(excpr).__name__ == "ArrowIOError": + pytest.skip(".orc file is not found") + else: + print(type(excpr).__name__) + + columns = [ + "boolean1", + "byte1", + "short1", + "int1", + "long1", + "float1", + "double1", + ] + pdf = orcfile.read(columns=columns).to_pandas() + gdf = cudf.from_pandas(pdf) + expect = pd.concat([pdf, pdf]).reset_index(drop=True) + + writer = ORCWriter(gdf_fname, statistics=stats_freq) + writer.write_table(gdf) + writer.write_table(gdf) + writer.close() + + got = pa.orc.ORCFile(gdf_fname).read().to_pandas() + + assert_eq(expect, got) + + @pytest.mark.parametrize("compression", [None, "snappy"]) @pytest.mark.parametrize( "reference_file, columns", @@ -592,8 +650,9 @@ def normalized_equals(value1, value2): return value1 == value2 +@pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) @pytest.mark.parametrize("nrows", [1, 100, 6000000]) -def test_orc_write_statistics(tmpdir, datadir, nrows): +def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): supported_stat_types = supported_numpy_dtypes + ["str"] # Can't write random bool columns until issue #6763 is fixed if nrows == 6000000: @@ -609,7 +668,7 @@ def test_orc_write_statistics(tmpdir, datadir, nrows): fname = tmpdir.join("gdf.orc") # Write said dataframe to ORC with cuDF - gdf.to_orc(fname.strpath) + gdf.to_orc(fname.strpath, statistics=stats_freq) # Read back written ORC's statistics orc_file = pa.orc.ORCFile(fname) From 5a4c5f36f082ad9cf1dcc6a2e48a96218ec2093d Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Fri, 21 Jan 2022 13:24:33 -0600 Subject: [PATCH 14/14] Fix for appending decimal128 under list and struct types (#10105) I know that this is 
past the freeze date. This is a fix for a P1 bug that we just found when trying to build Scalar values of Lists and Structs that contain Decimal128 values. We might be able to work around it some other way, but it would take a lot of changes to the existing Spark plugin code to do that so I wanted to try this first. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Kuhu Shukla (https://github.com/kuhushukla) - Niranjan Artal (https://github.com/nartal1) --- .../main/java/ai/rapids/cudf/HostColumnVector.java | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java index e21a4ac81c6..0fe7d7a5df8 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1136,6 +1136,8 @@ private void appendChildOrNull(ColumnBuilder childBuilder, Object listElement) { childBuilder.append((Short) listElement); } else if (listElement instanceof BigDecimal) { childBuilder.append((BigDecimal) listElement); + } else if (listElement instanceof BigInteger) { + childBuilder.append((BigInteger) listElement); } else if (listElement instanceof List) { childBuilder.append((List) listElement); } else if (listElement instanceof StructData) { @@ -1230,18 +1232,20 @@ public final ColumnBuilder append(boolean value) { return this; } - public final ColumnBuilder append(BigDecimal value) { + public ColumnBuilder append(BigDecimal value) { + return append(value.setScale(-type.getScale(), RoundingMode.UNNECESSARY).unscaledValue()); + } + + public ColumnBuilder append(BigInteger unscaledVal) { growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); assert currentIndex < rows; - // Rescale input decimal with UNNECESSARY policy, which accepts no precision loss. - BigInteger unscaledVal = value.setScale(-type.getScale(), RoundingMode.UNNECESSARY).unscaledValue(); if (type.typeId == DType.DTypeEnum.DECIMAL32) { data.setInt(currentIndex * type.getSizeInBytes(), unscaledVal.intValueExact()); } else if (type.typeId == DType.DTypeEnum.DECIMAL64) { data.setLong(currentIndex * type.getSizeInBytes(), unscaledVal.longValueExact()); } else if (type.typeId == DType.DTypeEnum.DECIMAL128) { assert currentIndex < rows; - byte[] unscaledValueBytes = value.unscaledValue().toByteArray(); + byte[] unscaledValueBytes = unscaledVal.toByteArray(); byte[] result = convertDecimal128FromJavaToCudf(unscaledValueBytes); data.setBytes(currentIndex*DType.DTypeEnum.DECIMAL128.sizeInBytes, result, 0, result.length); } else {