From 1c0dbc04e67b19b9f0e6d7450f91552538ce2fa1 Mon Sep 17 00:00:00 2001 From: MithunR Date: Wed, 24 May 2023 17:49:03 -0700 Subject: [PATCH] Fix null counts for output of row/column conversion: Fixes #1169. This commit fixes the null counts in row/column conversion, for all output columns. The prior fix in #1155 only fixed the count for STRING outputs. There are 2 additional/tangential fixes in this change: 1. `cudf::detail::null_count()` is used in place of `cudf::null_count()`, thus running on the current stream. 2. The Java `RowConversion.convertFromRowsFixedWidthOptimized()` now uses the `convertFromRowsFixedWidthOptimized()` native function instead of `convertFromRows()`. This should prove faster. Signed-off-by: MithunR --- src/main/cpp/src/row_conversion.cu | 35 ++++++++++++------- .../spark/rapids/jni/RowConversion.java | 2 +- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/src/main/cpp/src/row_conversion.cu b/src/main/cpp/src/row_conversion.cu index 9952ed59ea..aa772908dc 100644 --- a/src/main/cpp/src/row_conversion.cu +++ b/src/main/cpp/src/row_conversion.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -2031,6 +2032,19 @@ convert_to_rows_fixed_width_optimized(table_view const &tbl, rmm::cuda_stream_vi } } +namespace { + +/// @brief Calculates and sets null counts for specified columns +void fixup_null_counts(std::vector>& output_columns, + rmm::cuda_stream_view stream) { + for (auto &col : output_columns) { + col->set_null_count( + cudf::detail::null_count(col->view().null_mask(), 0, col->size(), stream)); + } +} + +} + /** * @brief convert from JCUDF row format to cudf columns * @@ -2260,23 +2274,21 @@ std::unique_ptr convert_from_rows(lists_column_view const &input, if (schema[i].id() == type_id::STRING) { // stuff real string column auto string_data = string_row_offset_columns[string_idx].release()->release(); - auto const null_count = [&] { - // Null-count not set previously. Calculate, on the fly. - auto const &null_mask = *string_data.null_mask; - return null_mask.data() ? - cudf::null_count(static_cast(null_mask.data()), 0, - num_rows) : - 0; - }(); output_columns[i] = make_strings_column(num_rows, std::move(string_col_offsets[string_idx]), std::move(string_data_cols[string_idx]), - std::move(*string_data.null_mask.release()), null_count); + std::move(*string_data.null_mask.release()), 0); + // Null count set to 0, temporarily. Will be fixed up before return. string_idx++; } } } + // Set null counts, because output_columns are modified via mutable-view, + // in the kernel above. + // TODO(future): Consider setting null count in the kernel itself. + fixup_null_counts(output_columns, stream); + return std::make_unique
(std::move(output_columns)); } @@ -2335,9 +2347,8 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized( // Set null counts, because output_columns are modified via mutable-view, // in the kernel above. // TODO(future): Consider setting null count in the kernel itself. - for (auto &col : output_columns) { - col->set_null_count(cudf::null_count(col->view().null_mask(), 0, col->size())); - } + fixup_null_counts(output_columns, stream); + return std::make_unique
(std::move(output_columns)); } else { CUDF_FAIL("Only fixed width types are currently supported"); diff --git a/src/main/java/com/nvidia/spark/rapids/jni/RowConversion.java b/src/main/java/com/nvidia/spark/rapids/jni/RowConversion.java index af0b0352dd..fdb7f40480 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/RowConversion.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/RowConversion.java @@ -163,7 +163,7 @@ public static Table convertFromRowsFixedWidthOptimized(ColumnView vec, DType ... scale[i] = schema[i].getScale(); } - return new Table(convertFromRows(vec.getNativeView(), types, scale)); + return new Table(convertFromRowsFixedWidthOptimized(vec.getNativeView(), types, scale)); } private static native long[] convertToRows(long nativeHandle);