Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Java utilities to aid in accelerating aggregations on 128-bit types #10201

Merged
merged 2 commits into from
Feb 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions java/src/main/java/ai/rapids/cudf/Aggregation128Utils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package ai.rapids.cudf;

/**
* Utility methods for breaking apart and reassembling 128-bit values during aggregations
* to enable hash-based aggregations and detect overflows.
*/
public class Aggregation128Utils {
static {
NativeDepsLoader.loadNativeDeps();
}

/**
* Extract a 32-bit chunk from a 128-bit value.
* @param col column of 128-bit values (e.g.: DECIMAL128)
* @param outType integer type to use for the output column (e.g.: UINT32 or INT32)
* @param chunkIdx index of the 32-bit chunk to extract where 0 is the least significant chunk
* and 3 is the most significant chunk
* @return column containing the specified 32-bit chunk of the input column values. A null input
* row will result in a corresponding null output row.
*/
public static ColumnVector extractInt32Chunk(ColumnView col, DType outType, int chunkIdx) {
return new ColumnVector(extractInt32Chunk(col.getNativeView(),
outType.getTypeId().getNativeId(), chunkIdx));
}

/**
* Reassemble a column of 128-bit values from a table of four 64-bit integer columns and check
* for overflow. The 128-bit value is reconstructed by overlapping the 64-bit values by 32-bits.
* The least significant 32-bits of the least significant 64-bit value are used directly as the
* least significant 32-bits of the final 128-bit value, and the remaining 32-bits are added to
* the next most significant 64-bit value. The lower 32-bits of that sum become the next most
* significant 32-bits in the final 128-bit value, and the remaining 32-bits are added to the
* next most significant 64-bit input value, and so on.
*
* @param chunks table of four 64-bit integer columns with the columns ordered from least
* significant to most significant. The last column must be of type INT64.
* @param type the type to use for the resulting 128-bit value column
* @return table containing a boolean column and a 128-bit value column of the requested type.
* The boolean value will be true if an overflow was detected for that row's value when
* it was reassembled. A null input row will result in a corresponding null output row.
*/
public static Table combineInt64SumChunks(Table chunks, DType type) {
return new Table(combineInt64SumChunks(chunks.getNativeView(),
type.getTypeId().getNativeId(),
type.getScale()));
}

private static native long extractInt32Chunk(long columnView, int outTypeId, int chunkIdx);

private static native long[] combineInt64SumChunks(long chunksTableView, int dtype, int scale);
}
6 changes: 4 additions & 2 deletions java/src/main/native/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand Down Expand Up @@ -219,7 +219,7 @@ endif()

add_library(
cudfjni SHARED
src/row_conversion.cu
src/Aggregation128UtilsJni.cpp
src/AggregationJni.cpp
src/CudfJni.cpp
src/CudaJni.cpp
Expand All @@ -236,7 +236,9 @@ add_library(
src/RmmJni.cpp
src/ScalarJni.cpp
src/TableJni.cpp
src/aggregation128_utils.cu
src/map_lookup.cu
src/row_conversion.cu
src/check_nvcomp_output_sizes.cu
)

Expand Down
47 changes: 47 additions & 0 deletions java/src/main/native/src/Aggregation128UtilsJni.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "aggregation128_utils.hpp"
#include "cudf_jni_apis.hpp"
#include "dtype_utils.hpp"

extern "C" {

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation128Utils_extractInt32Chunk(
JNIEnv *env, jclass, jlong j_column_view, jint j_out_dtype, jint j_chunk_idx) {
JNI_NULL_CHECK(env, j_column_view, "column is null", 0);
try {
cudf::jni::auto_set_device(env);
auto cview = reinterpret_cast<cudf::column_view const *>(j_column_view);
auto dtype = cudf::jni::make_data_type(j_out_dtype, 0);
return cudf::jni::release_as_jlong(cudf::jni::extract_chunk32(*cview, dtype, j_chunk_idx));
}
CATCH_STD(env, 0);
}

JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Aggregation128Utils_combineInt64SumChunks(
JNIEnv *env, jclass, jlong j_table_view, jint j_dtype, jint j_scale) {
JNI_NULL_CHECK(env, j_table_view, "table is null", 0);
try {
cudf::jni::auto_set_device(env);
auto tview = reinterpret_cast<cudf::table_view const *>(j_table_view);
std::unique_ptr<cudf::table> result =
cudf::jni::assemble128_from_sum(*tview, cudf::jni::make_data_type(j_dtype, j_scale));
return cudf::jni::convert_table_for_return(env, result);
}
CATCH_STD(env, 0);
}
}
127 changes: 127 additions & 0 deletions java/src/main/native/src/aggregation128_utils.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cstddef>
#include <utility>
#include <vector>

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/utilities/error.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/transform_iterator.h>

#include "aggregation128_utils.hpp"

namespace {

// Functor to reassemble a 128-bit value from four 64-bit chunks with overflow detection.
class chunk_assembler : public thrust::unary_function<cudf::size_type, __int128_t> {
public:
chunk_assembler(bool *overflows, uint64_t const *chunks0, uint64_t const *chunks1,
uint64_t const *chunks2, int64_t const *chunks3)
: overflows(overflows), chunks0(chunks0), chunks1(chunks1), chunks2(chunks2),
chunks3(chunks3) {}

__device__ __int128_t operator()(cudf::size_type i) const {
// Starting with the least significant input and moving to the most significant, propagate the
// upper 32-bits of the previous column into the next column, i.e.: propagate the "carry" bits
// of each 64-bit chunk into the next chunk.
uint64_t const c0 = chunks0[i];
uint64_t const c1 = chunks1[i] + (c0 >> 32);
uint64_t const c2 = chunks2[i] + (c1 >> 32);
int64_t const c3 = chunks3[i] + (c2 >> 32);
uint64_t const lower64 = (c1 << 32) | static_cast<uint32_t>(c0);
int64_t const upper64 = (c3 << 32) | static_cast<uint32_t>(c2);

// check for overflow by ensuring the sign bit matches the top carry bits
int32_t const replicated_sign_bit = static_cast<int32_t>(c3) >> 31;
int32_t const top_carry_bits = static_cast<int32_t>(c3 >> 32);
overflows[i] = (replicated_sign_bit != top_carry_bits);

return (static_cast<__int128_t>(upper64) << 64) | lower64;
}

private:
// output column for overflow detected
bool *const overflows;

// input columns for the four 64-bit values
uint64_t const *const chunks0;
uint64_t const *const chunks1;
uint64_t const *const chunks2;
int64_t const *const chunks3;
};

} // anonymous namespace

namespace cudf::jni {

// Extract a 32-bit chunk from a 128-bit value.
std::unique_ptr<cudf::column> extract_chunk32(cudf::column_view const &in_col, cudf::data_type type,
int chunk_idx, rmm::cuda_stream_view stream) {
jrhemstad marked this conversation as resolved.
Show resolved Hide resolved
CUDF_EXPECTS(in_col.type().id() == cudf::type_id::DECIMAL128, "not a 128-bit type");
CUDF_EXPECTS(chunk_idx >= 0 && chunk_idx < 4, "invalid chunk index");
CUDF_EXPECTS(type.id() == cudf::type_id::INT32 || type.id() == cudf::type_id::UINT32,
"not a 32-bit integer type");
auto const num_rows = in_col.size();
auto out_col = cudf::make_fixed_width_column(type, num_rows, copy_bitmask(in_col));
auto out_view = out_col->mutable_view();
auto const in_begin = in_col.begin<int32_t>();

// Build an iterator for every fourth 32-bit value, i.e.: one "chunk" of a __int128_t value
thrust::transform_iterator transform_iter{thrust::counting_iterator{0},
[] __device__(auto i) { return i * 4; }};
thrust::permutation_iterator stride_iter{in_begin + chunk_idx, transform_iter};

thrust::copy(rmm::exec_policy(stream), stride_iter, stride_iter + num_rows,
out_view.data<int32_t>());
return out_col;
}

// Reassemble a column of 128-bit values from four 64-bit integer columns with overflow detection.
std::unique_ptr<cudf::table> assemble128_from_sum(cudf::table_view const &chunks_table,
cudf::data_type output_type,
rmm::cuda_stream_view stream) {
CUDF_EXPECTS(output_type.id() == cudf::type_id::DECIMAL128, "not a 128-bit type");
CUDF_EXPECTS(chunks_table.num_columns() == 4, "must be 4 column table");
auto const num_rows = chunks_table.num_rows();
auto const chunks0 = chunks_table.column(0);
auto const chunks1 = chunks_table.column(1);
auto const chunks2 = chunks_table.column(2);
auto const chunks3 = chunks_table.column(3);
CUDF_EXPECTS(cudf::size_of(chunks0.type()) == 8 && cudf::size_of(chunks1.type()) == 8 &&
cudf::size_of(chunks2.type()) == 8 &&
chunks3.type().id() == cudf::type_id::INT64,
"chunks type mismatch");
std::vector<std::unique_ptr<cudf::column>> columns;
columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, num_rows,
copy_bitmask(chunks0)));
columns.push_back(cudf::make_fixed_width_column(output_type, num_rows, copy_bitmask(chunks0)));
auto overflows_view = columns[0]->mutable_view();
auto assembled_view = columns[1]->mutable_view();
thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(num_rows),
assembled_view.begin<__int128_t>(),
chunk_assembler(overflows_view.begin<bool>(), chunks0.begin<uint64_t>(),
chunks1.begin<uint64_t>(), chunks2.begin<uint64_t>(),
chunks3.begin<int64_t>()));
return std::make_unique<cudf::table>(std::move(columns));
}

} // namespace cudf::jni
69 changes: 69 additions & 0 deletions java/src/main/native/src/aggregation128_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <memory>

#include <cudf/column/column_view.hpp>
#include <cudf/table/table.hpp>
#include <rmm/cuda_stream_view.hpp>

namespace cudf::jni {

/**
* @brief Extract a 32-bit integer column from a column of 128-bit values.
*
* Given a 128-bit input column, a 32-bit integer column is returned corresponding to
* the index of which 32-bit chunk of the original 128-bit values to extract.
* 0 corresponds to the least significant chunk, and 3 corresponds to the most
* significant chunk.
*
* A null input row will result in a corresponding null output row.
*
* @param col Column of 128-bit values
* @param dtype Integer type to use for the output column (e.g.: UINT32 or INT32)
* @param chunk_idx Index of the 32-bit chunk to extract
* @param stream CUDA stream to use
* @return A column containing the extracted 32-bit integer values
*/
std::unique_ptr<cudf::column>
extract_chunk32(cudf::column_view const &col, cudf::data_type dtype, int chunk_idx,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @brief Reassemble a 128-bit column from four 64-bit integer columns with overflow detection.
*
* The 128-bit value is reconstructed by overlapping the 64-bit values by 32-bits. The least
* significant 32-bits of the least significant 64-bit value are used directly as the least
* significant 32-bits of the final 128-bit value, and the remaining 32-bits are added to the next
* most significant 64-bit value. The lower 32-bits of that sum become the next most significant
* 32-bits in the final 128-bit value, and the remaining 32-bits are added to the next most
* significant 64-bit input value, and so on.
*
* A null input row will result in a corresponding null output row.
*
* @param chunks_table Table of four 64-bit integer columns with the columns ordered from least
* significant to most significant. The last column must be an INT64 column.
* @param output_type The type to use for the resulting 128-bit value column
* @param stream CUDA stream to use
* @return Table containing a boolean column and a 128-bit value column of the
* requested type. The boolean value will be true if an overflow was detected
* for that row's value.
*/
std::unique_ptr<cudf::table>
assemble128_from_sum(cudf::table_view const &chunks_table, cudf::data_type output_type,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);

} // namespace cudf::jni
Loading