Commit

Java utilities to aid in accelerating aggregations on 128-bit types (#10201)

This adds a couple of custom kernels for Java to help accelerate sum aggregations on 128-bit types and check for overflows.  The first kernel extracts a 32-bit chunk from a 128-bit type, which can be used to feed four 32-bit chunks into a sum aggregation.  The second kernel takes the resulting upscaled 64-bit integer results and reassembles the parts into a 128-bit type column along with a boolean column indicating whether the value overflowed.

By splitting the 128-bit type into 32-bit chunks, a sum aggregation on DECIMAL128, which is otherwise a sort-based aggregation, can be turned into a hash-based aggregation on 32-bit integer inputs for improved performance.  As a bonus, this approach can also check for overflow, which is difficult to do when aggregating DECIMAL128 sums directly.
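To sketch how the two kernels are intended to compose (an illustration only; the helper name sumDecimal128, the hashSum callback standing in for the 64-bit hash-based sum step, and the scale parameter are hypothetical and not part of this change):

    // Hypothetical helper: split a DECIMAL128 column into 32-bit chunks, run a caller-supplied
    // hash-based 64-bit sum over them, then reassemble the results with overflow flags.
    static Table sumDecimal128(ColumnView decimalCol,
                               java.util.function.Function<Table, Table> hashSum, int scale) {
      try (ColumnVector c0 = Aggregation128Utils.extractInt32Chunk(decimalCol, DType.UINT32, 0);
           ColumnVector c1 = Aggregation128Utils.extractInt32Chunk(decimalCol, DType.UINT32, 1);
           ColumnVector c2 = Aggregation128Utils.extractInt32Chunk(decimalCol, DType.UINT32, 2);
           ColumnVector c3 = Aggregation128Utils.extractInt32Chunk(decimalCol, DType.INT32, 3);
           Table chunks = new Table(c0, c1, c2, c3);
           // hashSum stands in for the 64-bit sum aggregation over the four chunk columns,
           // producing sums ordered least to most significant with the last column as INT64
           Table sums = hashSum.apply(chunks)) {
        // Column 0 of the result is a BOOL8 overflow indicator, column 1 the DECIMAL128 sums.
        return Aggregation128Utils.combineInt64SumChunks(
            sums, DType.create(DType.DTypeEnum.DECIMAL128, scale));
      }
    }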

Authors:
  - Jason Lowe (https://github.com/jlowe)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Robert (Bobby) Evans (https://github.com/revans2)

URL: #10201
jlowe authored Feb 4, 2022
1 parent c191d16 commit 4e8cb4f
Showing 6 changed files with 394 additions and 2 deletions.
67 changes: 67 additions & 0 deletions java/src/main/java/ai/rapids/cudf/Aggregation128Utils.java
@@ -0,0 +1,67 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package ai.rapids.cudf;

/**
 * Utility methods for breaking apart and reassembling 128-bit values during aggregations
 * to enable hash-based aggregations and detect overflows.
 */
public class Aggregation128Utils {
  static {
    NativeDepsLoader.loadNativeDeps();
  }

  /**
   * Extract a 32-bit chunk from a 128-bit value.
   * @param col column of 128-bit values (e.g.: DECIMAL128)
   * @param outType integer type to use for the output column (e.g.: UINT32 or INT32)
   * @param chunkIdx index of the 32-bit chunk to extract where 0 is the least significant chunk
   *                 and 3 is the most significant chunk
   * @return column containing the specified 32-bit chunk of the input column values. A null input
   *         row will result in a corresponding null output row.
   */
  public static ColumnVector extractInt32Chunk(ColumnView col, DType outType, int chunkIdx) {
    return new ColumnVector(extractInt32Chunk(col.getNativeView(),
        outType.getTypeId().getNativeId(), chunkIdx));
  }

  /**
   * Reassemble a column of 128-bit values from a table of four 64-bit integer columns and check
   * for overflow. The 128-bit value is reconstructed by overlapping the 64-bit values by 32-bits.
   * The least significant 32-bits of the least significant 64-bit value are used directly as the
   * least significant 32-bits of the final 128-bit value, and the remaining 32-bits are added to
   * the next most significant 64-bit value. The lower 32-bits of that sum become the next most
   * significant 32-bits in the final 128-bit value, and the remaining 32-bits are added to the
   * next most significant 64-bit input value, and so on.
   *
   * @param chunks table of four 64-bit integer columns with the columns ordered from least
   *               significant to most significant. The last column must be of type INT64.
   * @param type the type to use for the resulting 128-bit value column
   * @return table containing a boolean column and a 128-bit value column of the requested type.
   *         The boolean value will be true if an overflow was detected for that row's value when
   *         it was reassembled. A null input row will result in a corresponding null output row.
   */
  public static Table combineInt64SumChunks(Table chunks, DType type) {
    return new Table(combineInt64SumChunks(chunks.getNativeView(),
        type.getTypeId().getNativeId(),
        type.getScale()));
  }

  private static native long extractInt32Chunk(long columnView, int outTypeId, int chunkIdx);

  private static native long[] combineInt64SumChunks(long chunksTableView, int dtype, int scale);
}
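To make the 32-bit overlap described in the combineInt64SumChunks javadoc concrete, here is a small host-side sketch of the per-row reassembly and overflow check (an illustration only, not part of this commit), mirroring the arithmetic with plain Java longs:

    // Hypothetical host-side mirror of the device-side reassembly: c0..c2 are the unsigned
    // 64-bit chunk sums (least significant first) and c3 is the signed most-significant sum.
    static java.math.BigInteger assembleOnHost(long c0, long c1, long c2, long c3,
                                               boolean[] overflow) {
      // Propagate the upper 32 "carry" bits of each chunk into the next, least significant first.
      long s1 = c1 + (c0 >>> 32);
      long s2 = c2 + (s1 >>> 32);
      long s3 = c3 + (s2 >>> 32);
      long lower64 = (s1 << 32) | (c0 & 0xFFFFFFFFL);
      long upper64 = (s3 << 32) | (s2 & 0xFFFFFFFFL);
      // Overflow if the remaining carry bits do not match the replicated sign bit.
      overflow[0] = ((int) s3 >> 31) != (int) (s3 >> 32);
      // Combine into a 128-bit two's-complement value for inspection.
      return java.math.BigInteger.valueOf(upper64).shiftLeft(64)
          .or(java.math.BigInteger.valueOf(lower64)
              .and(new java.math.BigInteger("FFFFFFFFFFFFFFFF", 16)));
    }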
6 changes: 4 additions & 2 deletions java/src/main/native/CMakeLists.txt
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
@@ -219,7 +219,7 @@ endif()

add_library(
  cudfjni SHARED
  src/row_conversion.cu
  src/Aggregation128UtilsJni.cpp
  src/AggregationJni.cpp
  src/CudfJni.cpp
  src/CudaJni.cpp
@@ -236,7 +236,9 @@ add_library(
  src/RmmJni.cpp
  src/ScalarJni.cpp
  src/TableJni.cpp
  src/aggregation128_utils.cu
  src/map_lookup.cu
  src/row_conversion.cu
  src/check_nvcomp_output_sizes.cu
)

47 changes: 47 additions & 0 deletions java/src/main/native/src/Aggregation128UtilsJni.cpp
@@ -0,0 +1,47 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "aggregation128_utils.hpp"
#include "cudf_jni_apis.hpp"
#include "dtype_utils.hpp"

extern "C" {

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation128Utils_extractInt32Chunk(
    JNIEnv *env, jclass, jlong j_column_view, jint j_out_dtype, jint j_chunk_idx) {
  JNI_NULL_CHECK(env, j_column_view, "column is null", 0);
  try {
    cudf::jni::auto_set_device(env);
    auto cview = reinterpret_cast<cudf::column_view const *>(j_column_view);
    auto dtype = cudf::jni::make_data_type(j_out_dtype, 0);
    return cudf::jni::release_as_jlong(cudf::jni::extract_chunk32(*cview, dtype, j_chunk_idx));
  }
  CATCH_STD(env, 0);
}

JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Aggregation128Utils_combineInt64SumChunks(
    JNIEnv *env, jclass, jlong j_table_view, jint j_dtype, jint j_scale) {
  JNI_NULL_CHECK(env, j_table_view, "table is null", 0);
  try {
    cudf::jni::auto_set_device(env);
    auto tview = reinterpret_cast<cudf::table_view const *>(j_table_view);
    std::unique_ptr<cudf::table> result =
        cudf::jni::assemble128_from_sum(*tview, cudf::jni::make_data_type(j_dtype, j_scale));
    return cudf::jni::convert_table_for_return(env, result);
  }
  CATCH_STD(env, 0);
}
}
127 changes: 127 additions & 0 deletions java/src/main/native/src/aggregation128_utils.cu
@@ -0,0 +1,127 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cstddef>
#include <utility>
#include <vector>

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/utilities/error.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/transform_iterator.h>

#include "aggregation128_utils.hpp"

namespace {

// Functor to reassemble a 128-bit value from four 64-bit chunks with overflow detection.
class chunk_assembler : public thrust::unary_function<cudf::size_type, __int128_t> {
public:
  chunk_assembler(bool *overflows, uint64_t const *chunks0, uint64_t const *chunks1,
                  uint64_t const *chunks2, int64_t const *chunks3)
      : overflows(overflows), chunks0(chunks0), chunks1(chunks1), chunks2(chunks2),
        chunks3(chunks3) {}

  __device__ __int128_t operator()(cudf::size_type i) const {
    // Starting with the least significant input and moving to the most significant, propagate the
    // upper 32-bits of the previous column into the next column, i.e.: propagate the "carry" bits
    // of each 64-bit chunk into the next chunk.
    uint64_t const c0 = chunks0[i];
    uint64_t const c1 = chunks1[i] + (c0 >> 32);
    uint64_t const c2 = chunks2[i] + (c1 >> 32);
    int64_t const c3 = chunks3[i] + (c2 >> 32);
    uint64_t const lower64 = (c1 << 32) | static_cast<uint32_t>(c0);
    int64_t const upper64 = (c3 << 32) | static_cast<uint32_t>(c2);

    // check for overflow by ensuring the sign bit matches the top carry bits
    int32_t const replicated_sign_bit = static_cast<int32_t>(c3) >> 31;
    int32_t const top_carry_bits = static_cast<int32_t>(c3 >> 32);
    overflows[i] = (replicated_sign_bit != top_carry_bits);

    return (static_cast<__int128_t>(upper64) << 64) | lower64;
  }

private:
  // output column for overflow detected
  bool *const overflows;

  // input columns for the four 64-bit values
  uint64_t const *const chunks0;
  uint64_t const *const chunks1;
  uint64_t const *const chunks2;
  int64_t const *const chunks3;
};

} // anonymous namespace

namespace cudf::jni {

// Extract a 32-bit chunk from a 128-bit value.
std::unique_ptr<cudf::column> extract_chunk32(cudf::column_view const &in_col, cudf::data_type type,
                                              int chunk_idx, rmm::cuda_stream_view stream) {
  CUDF_EXPECTS(in_col.type().id() == cudf::type_id::DECIMAL128, "not a 128-bit type");
  CUDF_EXPECTS(chunk_idx >= 0 && chunk_idx < 4, "invalid chunk index");
  CUDF_EXPECTS(type.id() == cudf::type_id::INT32 || type.id() == cudf::type_id::UINT32,
               "not a 32-bit integer type");
  auto const num_rows = in_col.size();
  auto out_col = cudf::make_fixed_width_column(type, num_rows, copy_bitmask(in_col));
  auto out_view = out_col->mutable_view();
  auto const in_begin = in_col.begin<int32_t>();

  // Build an iterator for every fourth 32-bit value, i.e.: one "chunk" of a __int128_t value
  thrust::transform_iterator transform_iter{thrust::counting_iterator{0},
                                            [] __device__(auto i) { return i * 4; }};
  thrust::permutation_iterator stride_iter{in_begin + chunk_idx, transform_iter};

  thrust::copy(rmm::exec_policy(stream), stride_iter, stride_iter + num_rows,
               out_view.data<int32_t>());
  return out_col;
}

// Reassemble a column of 128-bit values from four 64-bit integer columns with overflow detection.
std::unique_ptr<cudf::table> assemble128_from_sum(cudf::table_view const &chunks_table,
                                                  cudf::data_type output_type,
                                                  rmm::cuda_stream_view stream) {
  CUDF_EXPECTS(output_type.id() == cudf::type_id::DECIMAL128, "not a 128-bit type");
  CUDF_EXPECTS(chunks_table.num_columns() == 4, "must be 4 column table");
  auto const num_rows = chunks_table.num_rows();
  auto const chunks0 = chunks_table.column(0);
  auto const chunks1 = chunks_table.column(1);
  auto const chunks2 = chunks_table.column(2);
  auto const chunks3 = chunks_table.column(3);
  CUDF_EXPECTS(cudf::size_of(chunks0.type()) == 8 && cudf::size_of(chunks1.type()) == 8 &&
                   cudf::size_of(chunks2.type()) == 8 &&
                   chunks3.type().id() == cudf::type_id::INT64,
               "chunks type mismatch");
  std::vector<std::unique_ptr<cudf::column>> columns;
  columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, num_rows,
                                                  copy_bitmask(chunks0)));
  columns.push_back(cudf::make_fixed_width_column(output_type, num_rows, copy_bitmask(chunks0)));
  auto overflows_view = columns[0]->mutable_view();
  auto assembled_view = columns[1]->mutable_view();
  thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator<cudf::size_type>(0),
                    thrust::make_counting_iterator<cudf::size_type>(num_rows),
                    assembled_view.begin<__int128_t>(),
                    chunk_assembler(overflows_view.begin<bool>(), chunks0.begin<uint64_t>(),
                                    chunks1.begin<uint64_t>(), chunks2.begin<uint64_t>(),
                                    chunks3.begin<int64_t>()));
  return std::make_unique<cudf::table>(std::move(columns));
}

} // namespace cudf::jni
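For reference, what extract_chunk32 produces per row is the k-th little-endian 32-bit limb of the 128-bit two's-complement value; a host-side equivalent (an illustration only, not part of this commit) would be:

    // Hypothetical helper: low 32 bits of v >> (32 * k) for a 128-bit two's-complement value v.
    static int chunk32(java.math.BigInteger v, int k) {
      return v.shiftRight(32 * k).intValue();
    }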
69 changes: 69 additions & 0 deletions java/src/main/native/src/aggregation128_utils.hpp
@@ -0,0 +1,69 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <memory>

#include <cudf/column/column_view.hpp>
#include <cudf/table/table.hpp>
#include <rmm/cuda_stream_view.hpp>

namespace cudf::jni {

/**
* @brief Extract a 32-bit integer column from a column of 128-bit values.
*
* Given a 128-bit input column, a 32-bit integer column is returned corresponding to
* the index of which 32-bit chunk of the original 128-bit values to extract.
* 0 corresponds to the least significant chunk, and 3 corresponds to the most
* significant chunk.
*
* A null input row will result in a corresponding null output row.
*
* @param col Column of 128-bit values
* @param dtype Integer type to use for the output column (e.g.: UINT32 or INT32)
* @param chunk_idx Index of the 32-bit chunk to extract
* @param stream CUDA stream to use
* @return A column containing the extracted 32-bit integer values
*/
std::unique_ptr<cudf::column>
extract_chunk32(cudf::column_view const &col, cudf::data_type dtype, int chunk_idx,
                rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @brief Reassemble a 128-bit column from four 64-bit integer columns with overflow detection.
*
* The 128-bit value is reconstructed by overlapping the 64-bit values by 32-bits. The least
* significant 32-bits of the least significant 64-bit value are used directly as the least
* significant 32-bits of the final 128-bit value, and the remaining 32-bits are added to the next
* most significant 64-bit value. The lower 32-bits of that sum become the next most significant
* 32-bits in the final 128-bit value, and the remaining 32-bits are added to the next most
* significant 64-bit input value, and so on.
*
* A null input row will result in a corresponding null output row.
*
* @param chunks_table Table of four 64-bit integer columns with the columns ordered from least
* significant to most significant. The last column must be an INT64 column.
* @param output_type The type to use for the resulting 128-bit value column
* @param stream CUDA stream to use
* @return Table containing a boolean column and a 128-bit value column of the
* requested type. The boolean value will be true if an overflow was detected
* for that row's value.
*/
std::unique_ptr<cudf::table>
assemble128_from_sum(cudf::table_view const &chunks_table, cudf::data_type output_type,
                     rmm::cuda_stream_view stream = rmm::cuda_stream_default);

} // namespace cudf::jni
