From 6f381ba6696fa6692992c99b82d41a343b2f64fe Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 25 Jan 2022 09:25:04 -0600 Subject: [PATCH 1/2] Java utilities to aid in accelerating aggregations on 128-bit types --- .../ai/rapids/cudf/Aggregation128Utils.java | 67 ++++++++ java/src/main/native/CMakeLists.txt | 6 +- .../native/src/Aggregation128UtilsJni.cpp | 47 ++++++ .../main/native/src/aggregation128_utils.cu | 150 ++++++++++++++++++ .../main/native/src/aggregation128_utils.hpp | 69 ++++++++ .../rapids/cudf/Aggregation128UtilsTest.java | 80 ++++++++++ 6 files changed, 417 insertions(+), 2 deletions(-) create mode 100644 java/src/main/java/ai/rapids/cudf/Aggregation128Utils.java create mode 100644 java/src/main/native/src/Aggregation128UtilsJni.cpp create mode 100644 java/src/main/native/src/aggregation128_utils.cu create mode 100644 java/src/main/native/src/aggregation128_utils.hpp create mode 100644 java/src/test/java/ai/rapids/cudf/Aggregation128UtilsTest.java diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation128Utils.java b/java/src/main/java/ai/rapids/cudf/Aggregation128Utils.java new file mode 100644 index 00000000000..9a0ac709e3e --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/Aggregation128Utils.java @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package ai.rapids.cudf; + +/** + * Utility methods for breaking apart and reassembling 128-bit values during aggregations + * to enable hash-based aggregations and detect overflows. + */ +public class Aggregation128Utils { + static { + NativeDepsLoader.loadNativeDeps(); + } + + /** + * Extract a 32-bit chunk from a 128-bit value. + * @param col column of 128-bit values (e.g.: DECIMAL128) + * @param outType integer type to use for the output column (e.g.: UINT32 or INT32) + * @param chunkIdx index of the 32-bit chunk to extract where 0 is the least significant chunk + * and 3 is the most significant chunk + * @return column containing the specified 32-bit chunk of the input column values. A null input + * row will result in a corresponding null output row. + */ + public static ColumnVector extractInt32Chunk(ColumnView col, DType outType, int chunkIdx) { + return new ColumnVector(extractInt32Chunk(col.getNativeView(), + outType.getTypeId().getNativeId(), chunkIdx)); + } + + /** + * Reassemble a column of 128-bit values from a table of four 64-bit integer columns and check + * for overflow. The 128-bit value is reconstructed by overlapping the 64-bit values by 32-bits. + * The least significant 32-bits of the least significant 64-bit value are used directly as the + * least significant 32-bits of the final 128-bit value, and the remaining 32-bits are added to + * the next most significant 64-bit value. The lower 32-bits of that sum become the next most + * significant 32-bits in the final 128-bit value, and the remaining 32-bits are added to the + * next most significant 64-bit input value, and so on. + * + * @param chunks table of four 64-bit integer columns with the columns ordered from least + * significant to most significant. The last column must be of type INT64. + * @param type the type to use for the resulting 128-bit value column + * @return table containing a boolean column and a 128-bit value column of the requested type. 
+ * The boolean value will be true if an overflow was detected for that row's value when + * it was reassembled. A null input row will result in a corresponding null output row. + */ + public static Table combineInt64SumChunks(Table chunks, DType type) { + return new Table(combineInt64SumChunks(chunks.getNativeView(), + type.getTypeId().getNativeId(), + type.getScale())); + } + + private static native long extractInt32Chunk(long columnView, int outTypeId, int chunkIdx); + + private static native long[] combineInt64SumChunks(long chunksTableView, int dtype, int scale); +} diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 00747efff27..ffbeeb155e0 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -219,7 +219,7 @@ endif() add_library( cudfjni SHARED - src/row_conversion.cu + src/Aggregation128UtilsJni.cpp src/AggregationJni.cpp src/CudfJni.cpp src/CudaJni.cpp @@ -236,7 +236,9 @@ add_library( src/RmmJni.cpp src/ScalarJni.cpp src/TableJni.cpp + src/aggregation128_utils.cu src/map_lookup.cu + src/row_conversion.cu src/check_nvcomp_output_sizes.cu ) diff --git a/java/src/main/native/src/Aggregation128UtilsJni.cpp b/java/src/main/native/src/Aggregation128UtilsJni.cpp new file mode 100644 index 00000000000..71c36cb724a --- /dev/null +++ b/java/src/main/native/src/Aggregation128UtilsJni.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "aggregation128_utils.hpp" +#include "cudf_jni_apis.hpp" +#include "dtype_utils.hpp" + +extern "C" { + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation128Utils_extractInt32Chunk( + JNIEnv *env, jclass, jlong j_column_view, jint j_out_dtype, jint j_chunk_idx) { + JNI_NULL_CHECK(env, j_column_view, "column is null", 0); + try { + cudf::jni::auto_set_device(env); + auto cview = reinterpret_cast(j_column_view); + auto dtype = cudf::jni::make_data_type(j_out_dtype, 0); + return cudf::jni::release_as_jlong(cudf::jni::extract_chunk32(*cview, dtype, j_chunk_idx)); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Aggregation128Utils_combineInt64SumChunks( + JNIEnv *env, jclass, jlong j_table_view, jint j_dtype, jint j_scale) { + JNI_NULL_CHECK(env, j_table_view, "table is null", 0); + try { + cudf::jni::auto_set_device(env); + auto tview = reinterpret_cast(j_table_view); + std::unique_ptr result = + cudf::jni::assemble128_from_sum(*tview, cudf::jni::make_data_type(j_dtype, j_scale)); + return cudf::jni::convert_table_for_return(env, result); + } + CATCH_STD(env, 0); +} +} diff --git a/java/src/main/native/src/aggregation128_utils.cu b/java/src/main/native/src/aggregation128_utils.cu new file mode 100644 index 00000000000..4bd428dab03 --- /dev/null +++ b/java/src/main/native/src/aggregation128_utils.cu @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "aggregation128_utils.hpp" + +namespace { + +// Iterate every 4th 32-bit value, i.e.: one "chunk" of a __int128_t value +class chunk_strided_range { +public: + typedef typename thrust::iterator_difference::type difference_type; + + struct stride_functor : public thrust::unary_function { + __device__ inline difference_type operator()(difference_type i) const { return i * 4; } + }; + + typedef typename thrust::counting_iterator CountingIterator; + typedef typename thrust::transform_iterator TransformIterator; + typedef typename thrust::permutation_iterator + PermutationIterator; + + typedef PermutationIterator iterator; + + chunk_strided_range(int32_t const *start, int32_t const *finish, int chunk_idx) + : start(start + chunk_idx), finish(finish + chunk_idx) {} + + iterator begin() const { + return PermutationIterator(start, TransformIterator(CountingIterator(0), stride_functor())); + } + + iterator end() const { return begin() + ((finish - start) + 3) / 4; } + +private: + int32_t const *start; + int32_t const *finish; +}; + +// Functor to reassemble a 128-bit value from four 64-bit chunks with overflow detection. 
+class chunk_assembler : public thrust::unary_function<cudf::size_type, __int128_t> {
+public:
+  chunk_assembler(bool *overflows, uint64_t const *chunks0, uint64_t const *chunks1,
+                  uint64_t const *chunks2, int64_t const *chunks3)
+      : overflows(overflows), chunks0(chunks0), chunks1(chunks1), chunks2(chunks2),
+        chunks3(chunks3) {}
+
+  __device__ __int128_t operator()(cudf::size_type i) const {
+    // Starting with the least significant input and moving to the most significant, propagate the
+    // upper 32-bits of the previous column into the next column, i.e.: propagate the "carry" bits
+    // of each 64-bit chunk into the next chunk.
+    uint64_t const c0 = chunks0[i];
+    uint64_t const c1 = chunks1[i] + (c0 >> 32);
+    uint64_t const c2 = chunks2[i] + (c1 >> 32);
+    int64_t const c3 = chunks3[i] + (c2 >> 32);
+    uint64_t const lower64 = (c1 << 32) | static_cast<uint32_t>(c0);
+    int64_t const upper64 = (c3 << 32) | static_cast<uint32_t>(c2);
+
+    // check for overflow by ensuring the sign bit matches the top carry bits
+    int32_t const replicated_sign_bit = static_cast<int32_t>(c3) >> 31;
+    int32_t const top_carry_bits = static_cast<int32_t>(c3 >> 32);
+std::unique_ptr<cudf::column> extract_chunk32(cudf::column_view const &in_col,
+                                              cudf::data_type type, int chunk_idx,
+                                              rmm::cuda_stream_view stream) {
+  CUDF_EXPECTS(in_col.type().id() == cudf::type_id::DECIMAL128, "not a 128-bit type");
+  CUDF_EXPECTS(chunk_idx >= 0 && chunk_idx < 4, "invalid chunk index");
+  CUDF_EXPECTS(type.id() == cudf::type_id::INT32 || type.id() == cudf::type_id::UINT32,
+               "not a 32-bit integer type");
+  auto const num_rows = in_col.size();
+  auto out_col = cudf::make_fixed_width_column(type, num_rows, copy_bitmask(in_col));
+  auto out_view = out_col->mutable_view();
+  auto const in_begin = in_col.begin<int32_t>();
+  auto const in_end = in_begin + in_col.size() * 4;
+  chunk_strided_range range(in_begin, in_end, chunk_idx);
+  thrust::copy(rmm::exec_policy(stream), range.begin(), range.end(), out_view.data<int32_t>());
+  return out_col;
+}
+
+// Reassemble a column of 128-bit values from four 64-bit integer columns with overflow detection.
+std::unique_ptr<cudf::table> assemble128_from_sum(cudf::table_view const &chunks_table,
+                                                  cudf::data_type output_type,
+                                                  rmm::cuda_stream_view stream) {
+  CUDF_EXPECTS(output_type.id() == cudf::type_id::DECIMAL128, "not a 128-bit type");
+  CUDF_EXPECTS(chunks_table.num_columns() == 4, "must be 4 column table");
+  auto const num_rows = chunks_table.num_rows();
+  auto const chunks0 = chunks_table.column(0);
+  auto const chunks1 = chunks_table.column(1);
+  auto const chunks2 = chunks_table.column(2);
+  auto const chunks3 = chunks_table.column(3);
+  CUDF_EXPECTS(cudf::size_of(chunks0.type()) == 8 && cudf::size_of(chunks1.type()) == 8 &&
+                   cudf::size_of(chunks2.type()) == 8 &&
+                   chunks3.type().id() == cudf::type_id::INT64,
+               "chunks type mismatch");
+  std::vector<std::unique_ptr<cudf::column>> columns;
assembled_view = columns[1]->mutable_view(); + thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + assembled_view.begin<__int128_t>(), + chunk_assembler(overflows_view.begin(), chunks0.begin(), + chunks1.begin(), chunks2.begin(), + chunks3.begin())); + return std::make_unique(std::move(columns)); +} + +} // namespace cudf::jni diff --git a/java/src/main/native/src/aggregation128_utils.hpp b/java/src/main/native/src/aggregation128_utils.hpp new file mode 100644 index 00000000000..30c1032b795 --- /dev/null +++ b/java/src/main/native/src/aggregation128_utils.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +namespace cudf::jni { + +/** + * @brief Extract a 32-bit integer column from a column of 128-bit values. + * + * Given a 128-bit input column, a 32-bit integer column is returned corresponding to + * the index of which 32-bit chunk of the original 128-bit values to extract. + * 0 corresponds to the least significant chunk, and 3 corresponds to the most + * significant chunk. + * + * A null input row will result in a corresponding null output row. 
+ * + * @param col Column of 128-bit values + * @param dtype Integer type to use for the output column (e.g.: UINT32 or INT32) + * @param chunk_idx Index of the 32-bit chunk to extract + * @param stream CUDA stream to use + * @return A column containing the extracted 32-bit integer values + */ +std::unique_ptr +extract_chunk32(cudf::column_view const &col, cudf::data_type dtype, int chunk_idx, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +/** + * @brief Reassemble a 128-bit column from four 64-bit integer columns with overflow detection. + * + * The 128-bit value is reconstructed by overlapping the 64-bit values by 32-bits. The least + * significant 32-bits of the least significant 64-bit value are used directly as the least + * significant 32-bits of the final 128-bit value, and the remaining 32-bits are added to the next + * most significant 64-bit value. The lower 32-bits of that sum become the next most significant + * 32-bits in the final 128-bit value, and the remaining 32-bits are added to the next most + * significant 64-bit input value, and so on. + * + * A null input row will result in a corresponding null output row. + * + * @param chunks_table Table of four 64-bit integer columns with the columns ordered from least + * significant to most significant. The last column must be an INT64 column. + * @param output_type The type to use for the resulting 128-bit value column + * @param stream CUDA stream to use + * @return Table containing a boolean column and a 128-bit value column of the + * requested type. The boolean value will be true if an overflow was detected + * for that row's value. 
+std::unique_ptr<cudf::table>
+assemble128_from_sum(cudf::table_view const &chunks_table, cudf::data_type output_type,
+                     rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+ */ + +package ai.rapids.cudf; + +import org.junit.jupiter.api.Test; + +import java.math.BigInteger; + +public class Aggregation128UtilsTest extends CudfTestBase { + @Test + public void testExtractInt32Chunks() { + BigInteger[] intvals = new BigInteger[] { + null, + new BigInteger("123456789abcdef0f0debc9a78563412", 16), + new BigInteger("123456789abcdef0f0debc9a78563412", 16), + new BigInteger("123456789abcdef0f0debc9a78563412", 16), + null + }; + try (ColumnVector cv = ColumnVector.decimalFromBigInt(-38, intvals); + ColumnVector chunk1 = Aggregation128Utils.extractInt32Chunk(cv, DType.UINT32, 0); + ColumnVector chunk2 = Aggregation128Utils.extractInt32Chunk(cv, DType.UINT32, 1); + ColumnVector chunk3 = Aggregation128Utils.extractInt32Chunk(cv, DType.UINT32, 2); + ColumnVector chunk4 = Aggregation128Utils.extractInt32Chunk(cv, DType.INT32, 3); + Table actualChunks = new Table(chunk1, chunk2, chunk3, chunk4); + ColumnVector expectedChunk1 = ColumnVector.fromBoxedUnsignedInts( + null, 0x78563412, 0x78563412, 0x78563412, null); + ColumnVector expectedChunk2 = ColumnVector.fromBoxedUnsignedInts( + null, -0x0f214366, -0x0f214366, -0x0f214366, null); + ColumnVector expectedChunk3 = ColumnVector.fromBoxedUnsignedInts( + null, -0x65432110, -0x65432110, -0x65432110, null); + ColumnVector expectedChunk4 = ColumnVector.fromBoxedInts( + null, 0x12345678, 0x12345678, 0x12345678, null); + Table expectedChunks = new Table(expectedChunk1, expectedChunk2, expectedChunk3, expectedChunk4)) { + AssertUtils.assertTablesAreEqual(expectedChunks, actualChunks); + } + } + + @Test + public void testCombineInt64SumChunks() { + try (ColumnVector chunks0 = ColumnVector.fromBoxedUnsignedLongs( + null, 0L, 1L, 0L, 0L, 0x12345678L, 0x123456789L, 0x1234567812345678L, 0xfedcba9876543210L); + ColumnVector chunks1 = ColumnVector.fromBoxedUnsignedLongs( + null, 0L, 2L, 0L, 0L, 0x9abcdef0L, 0x9abcdef01L, 0x1122334455667788L, 0xaceaceaceaceaceaL); + ColumnVector chunks2 = 
ColumnVector.fromBoxedUnsignedLongs( + null, 0L, 3L, 0L, 0L, 0x11223344L, 0x556677889L, 0x99aabbccddeeff00L, 0xbdfbdfbdfbdfbdfbL); + ColumnVector chunks3 = ColumnVector.fromBoxedLongs( + null, 0L, -1L, 0x100000000L, 0x80000000L, 0x55667788L, 0x01234567L, 0x66554434L, -0x42042043L); + Table chunksTable = new Table(chunks0, chunks1, chunks2, chunks3); + Table actual = Aggregation128Utils.combineInt64SumChunks(chunksTable, DType.create(DType.DTypeEnum.DECIMAL128, -20)); + ColumnVector expectedOverflows = ColumnVector.fromBoxedBooleans( + null, false, false, true, true, false, false, true, false); + ColumnVector expectedValues = ColumnVector.decimalFromBigInt(-20, + null, + new BigInteger("0", 16), + new BigInteger("-fffffffcfffffffdffffffff", 16), + new BigInteger("0", 16), + new BigInteger("-80000000000000000000000000000000", 16), + new BigInteger("55667788112233449abcdef012345678", 16), + new BigInteger("123456c56677892abcdef0223456789", 16), + new BigInteger("ef113244679ace0012345678", 16), + new BigInteger("7bf7bf7ba8ca8ca8e9ab678276543210", 16)); + Table expected = new Table(expectedOverflows, expectedValues)) { + AssertUtils.assertTablesAreEqual(expected, actual); + } + } +} From c214cf43cc0a9f2548afdbab0cc5e777de0134ad Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Thu, 3 Feb 2022 14:33:12 -0600 Subject: [PATCH 2/2] Simplify construction of the stride iterator --- .../main/native/src/aggregation128_utils.cu | 43 +++++-------------- 1 file changed, 10 insertions(+), 33 deletions(-) diff --git a/java/src/main/native/src/aggregation128_utils.cu b/java/src/main/native/src/aggregation128_utils.cu index 4bd428dab03..865f607ff7d 100644 --- a/java/src/main/native/src/aggregation128_utils.cu +++ b/java/src/main/native/src/aggregation128_utils.cu @@ -23,41 +23,13 @@ #include #include #include +#include +#include #include "aggregation128_utils.hpp" namespace { -// Iterate every 4th 32-bit value, i.e.: one "chunk" of a __int128_t value -class chunk_strided_range { 
-public:
-  typedef typename thrust::iterator_difference<int32_t const *>::type difference_type;
-
-  struct stride_functor : public thrust::unary_function<difference_type, difference_type> {
-    __device__ inline difference_type operator()(difference_type i) const { return i * 4; }
-  };
-
-  typedef typename thrust::counting_iterator<difference_type> CountingIterator;
-  typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
-  typedef typename thrust::permutation_iterator<int32_t const *, TransformIterator>
-      PermutationIterator;
-
-  typedef PermutationIterator iterator;
-
-  chunk_strided_range(int32_t const *start, int32_t const *finish, int chunk_idx)
-      : start(start + chunk_idx), finish(finish + chunk_idx) {}
-
-  iterator begin() const {
-    return PermutationIterator(start, TransformIterator(CountingIterator(0), stride_functor()));
-  }
-
-  iterator end() const { return begin() + ((finish - start) + 3) / 4; }
-
-private:
-  int32_t const *start;
-  int32_t const *finish;
-};
-
 // Functor to reassemble a 128-bit value from four 64-bit chunks with overflow detection.
 class chunk_assembler : public thrust::unary_function<cudf::size_type, __int128_t> {
 public:
@@ -111,9 +83,14 @@ std::unique_ptr<cudf::column> extract_chunk32(cudf::column_view const &in_col, c
   auto out_col = cudf::make_fixed_width_column(type, num_rows, copy_bitmask(in_col));
   auto out_view = out_col->mutable_view();
   auto const in_begin = in_col.begin<int32_t>();
-  auto const in_end = in_begin + in_col.size() * 4;
-  chunk_strided_range range(in_begin, in_end, chunk_idx);
-  thrust::copy(rmm::exec_policy(stream), range.begin(), range.end(), out_view.data<int32_t>());
+
+  // Build an iterator for every fourth 32-bit value, i.e.: one "chunk" of a __int128_t value
+  thrust::transform_iterator transform_iter{thrust::counting_iterator{0},
+                                            [] __device__(auto i) { return i * 4; }};
+  thrust::permutation_iterator stride_iter{in_begin + chunk_idx, transform_iter};
+
+  thrust::copy(rmm::exec_policy(stream), stride_iter, stride_iter + num_rows,
+               out_view.data<int32_t>());
   return out_col;
 }