Skip to content

Commit

Permalink
Merge branch 'branch-24.06' into bug-cuda-ctx-always-global-init
Browse files Browse the repository at this point in the history
  • Loading branch information
vuule authored Apr 1, 2024
2 parents bcc53b1 + aab6137 commit 34c8b26
Show file tree
Hide file tree
Showing 40 changed files with 1,485 additions and 257 deletions.
2 changes: 1 addition & 1 deletion ci/build_docs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ rapids-dependency-file-generator \
--file_key docs \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"

rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n docs
rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n docs
conda activate docs

rapids-print-env
Expand Down
2 changes: 1 addition & 1 deletion ci/check_style.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ rapids-dependency-file-generator \
--file_key checks \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"

rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n checks
rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n checks
conda activate checks

RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
Expand Down
4 changes: 2 additions & 2 deletions ci/test_cpp_common.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.

set -euo pipefail

Expand All @@ -14,7 +14,7 @@ rapids-dependency-file-generator \
--file_key test_cpp \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${ENV_YAML_DIR}/env.yaml"

rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n test
rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test

# Temporarily allow unbound variables for conda activation.
set +u
Expand Down
4 changes: 2 additions & 2 deletions ci/test_java.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.

set -euo pipefail

Expand All @@ -14,7 +14,7 @@ rapids-dependency-file-generator \
--file_key test_java \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${ENV_YAML_DIR}/env.yaml"

rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n test
rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test

export CMAKE_GENERATOR=Ninja

Expand Down
4 changes: 2 additions & 2 deletions ci/test_notebooks.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

set -euo pipefail

Expand All @@ -14,7 +14,7 @@ rapids-dependency-file-generator \
--file_key test_notebooks \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"

rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n test
rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test

# Temporarily allow unbound variables for conda activation.
set +u
Expand Down
4 changes: 2 additions & 2 deletions ci/test_python_common.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.

# Common setup steps shared by Python test jobs

Expand All @@ -16,7 +16,7 @@ rapids-dependency-file-generator \
--file_key test_python \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"

rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n test
rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test

# Temporarily allow unbound variables for conda activation.
set +u
Expand Down
8 changes: 8 additions & 0 deletions ci/test_python_cudf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ EXITCODE=0
trap "EXITCODE=1" ERR
set +e

rapids-logger "pytest pylibcudf"
pushd python/cudf/cudf/pylibcudf_tests
python -m pytest \
--cache-clear \
--dist=worksteal \
.
popd

rapids-logger "pytest cudf"
./ci/run_cudf_pytests.sh \
--junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \
Expand Down
8 changes: 8 additions & 0 deletions ci/test_wheel_cudf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,14 @@ if [[ "$(arch)" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then
rapids-logger "Run smoke tests for cudf"
python ./ci/wheel_smoke_test_cudf.py
else
rapids-logger "pytest pylibcudf"
pushd python/cudf/cudf/pylibcudf_tests
python -m pytest \
--cache-clear \
--dist=worksteal \
.
popd

rapids-logger "pytest cudf"
pushd python/cudf/cudf/tests
python -m pytest \
Expand Down
3 changes: 3 additions & 0 deletions cpp/include/cudf/copying.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,8 @@ std::unique_ptr<column> empty_like(scalar const& input);
* If the `mask_alloc` allocates a validity mask that mask is also uninitialized
* and the validity bits and the null count should be set by the caller.
*
* @throws cudf::data_type_error if input type is not of fixed width.
*
* @param input Immutable view of input column to emulate
* @param mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN
* @param mr Device memory resource used to allocate the returned column's device memory
Expand Down Expand Up @@ -360,6 +362,7 @@ void copy_range_in_place(column_view const& source,
*
* @throws std::out_of_range for any invalid range.
* @throws cudf::data_type_error if @p target and @p source have different types.
* @throws cudf::data_type_error if the data type is not fixed width, string, or dictionary
*
* @param source The column to copy from inside the range
* @param target The column to copy from outside the range
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/copying/copy.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -122,7 +122,8 @@ std::unique_ptr<column> allocate_like(column_view const& input,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
CUDF_EXPECTS(is_fixed_width(input.type()), "Expects only fixed-width type column");
CUDF_EXPECTS(
is_fixed_width(input.type()), "Expects only fixed-width type column", cudf::data_type_error);
mask_state allocate_mask = should_allocate_mask(mask_alloc, input.nullable());

return std::make_unique<column>(input.type(),
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/copying/copy_range.cu
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ struct out_of_place_copy_range_dispatch {
std::enable_if_t<not cudf::is_rep_layout_compatible<T>(), std::unique_ptr<cudf::column>>
operator()(Args...)
{
CUDF_FAIL("Unsupported type for out of place copy.");
CUDF_FAIL("Unsupported type for out of place copy.", cudf::data_type_error);
}
};

Expand Down
11 changes: 10 additions & 1 deletion cpp/src/copying/scatter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,9 @@ struct column_scalar_scatterer_impl<string_view, MapIterator> {
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr) const
{
CUDF_EXPECTS(source.get().type() == target.type(), "scalar and column types must match");
CUDF_EXPECTS(source.get().type() == target.type(),
"scalar and column types must match",
cudf::data_type_error);

auto const scalar_impl = static_cast<string_scalar const*>(&source.get());
auto const source_view = string_view(scalar_impl->data(), scalar_impl->size());
Expand All @@ -166,6 +168,9 @@ struct column_scalar_scatterer_impl<list_view, MapIterator> {
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr) const
{
CUDF_EXPECTS(source.get().type() == target.type(),
"scalar and column types must match",
cudf::data_type_error);
auto result =
lists::detail::scatter(source, scatter_iter, scatter_iter + scatter_rows, target, stream, mr);

Expand Down Expand Up @@ -249,6 +254,10 @@ struct column_scalar_scatterer_impl<struct_view, MapIterator> {
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr) const
{
CUDF_EXPECTS(source.get().type() == target.type(),
"scalar and column types must match",
cudf::data_type_error);

// For each field of `source`, copy construct a scalar from the field
// and dispatch to the corresponding scalar scatterer

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/parquet/decode_fixed.cu
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ __device__ inline void gpuDecodeValues(
constexpr int max_batch_size = num_warps * cudf::detail::warp_size;

PageNestingDecodeInfo* nesting_info_base = s->nesting_info;
int const dtype = s->col.data_type & 7;
int const dtype = s->col.physical_type;

// decode values
int pos = start;
Expand All @@ -187,7 +187,7 @@ __device__ inline void gpuDecodeValues(
uint32_t dtype_len = s->dtype_len;
void* dst =
nesting_info_base[leaf_level_index].data_out + static_cast<size_t>(dst_pos) * dtype_len;
if (s->col.converted_type == DECIMAL) {
if (s->col.logical_type.has_value() && s->col.logical_type->type == LogicalType::DECIMAL) {
switch (dtype) {
case INT32: gpuOutputFast(s, sb, src_pos, static_cast<uint32_t*>(dst)); break;
case INT64: gpuOutputFast(s, sb, src_pos, static_cast<uint2*>(dst)); break;
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/decode_preprocess.cu
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size)
// we only need to preprocess hierarchies with repetition in them (ie, hierarchies
// containing lists anywhere within).
compute_string_sizes =
compute_string_sizes && ((s->col.data_type & 7) == BYTE_ARRAY && s->dtype_len != 4);
compute_string_sizes && s->col.physical_type == BYTE_ARRAY && !s->col.is_strings_to_cat;

// early out optimizations:

Expand Down
18 changes: 10 additions & 8 deletions cpp/src/io/parquet/page_data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
if (s->dict_base) {
out_thread0 = (s->dict_bits > 0) ? 64 : 32;
} else {
switch (s->col.data_type & 7) {
switch (s->col.physical_type) {
case BOOLEAN: [[fallthrough]];
case BYTE_ARRAY: [[fallthrough]];
case FIXED_LEN_BYTE_ARRAY: out_thread0 = 64; break;
Expand Down Expand Up @@ -123,16 +123,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
// be needed in the other DecodeXXX kernels.
if (s->dict_base) {
src_target_pos = gpuDecodeDictionaryIndices<false>(s, sb, src_target_pos, t & 0x1f).first;
} else if ((s->col.data_type & 7) == BOOLEAN) {
} else if (s->col.physical_type == BOOLEAN) {
src_target_pos = gpuDecodeRleBooleans(s, sb, src_target_pos, t & 0x1f);
} else if ((s->col.data_type & 7) == BYTE_ARRAY or
(s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) {
} else if (s->col.physical_type == BYTE_ARRAY or
s->col.physical_type == FIXED_LEN_BYTE_ARRAY) {
gpuInitStringDescriptors<false>(s, sb, src_target_pos, t & 0x1f);
}
if (t == 32) { s->dict_pos = src_target_pos; }
} else {
// WARP1..WARP3: Decode values
int const dtype = s->col.data_type & 7;
int const dtype = s->col.physical_type;
src_pos += t - out_thread0;

// the position in the output column/buffer
Expand Down Expand Up @@ -166,10 +166,12 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
uint32_t dtype_len = s->dtype_len;
void* dst =
nesting_info_base[leaf_level_index].data_out + static_cast<size_t>(dst_pos) * dtype_len;
auto const is_decimal =
s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL;
if (dtype == BYTE_ARRAY) {
if (s->col.converted_type == DECIMAL) {
if (is_decimal) {
auto const [ptr, len] = gpuGetStringData(s, sb, val_src_pos);
auto const decimal_precision = s->col.decimal_precision;
auto const decimal_precision = s->col.logical_type->precision();
if (decimal_precision <= MAX_DECIMAL32_PRECISION) {
gpuOutputByteArrayAsInt(ptr, len, static_cast<int32_t*>(dst));
} else if (decimal_precision <= MAX_DECIMAL64_PRECISION) {
Expand All @@ -182,7 +184,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
}
} else if (dtype == BOOLEAN) {
gpuOutputBoolean(sb, val_src_pos, static_cast<uint8_t*>(dst));
} else if (s->col.converted_type == DECIMAL) {
} else if (is_decimal) {
switch (dtype) {
case INT32: gpuOutputFast(s, sb, val_src_pos, static_cast<uint32_t*>(dst)); break;
case INT64: gpuOutputFast(s, sb, val_src_pos, static_cast<uint2*>(dst)); break;
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/io/parquet/page_data.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,7 @@ template <typename state_buf>
inline __device__ void gpuOutputString(page_state_s* s, state_buf* sb, int src_pos, void* dstv)
{
auto [ptr, len] = gpuGetStringData(s, sb, src_pos);
// make sure to only hash `BYTE_ARRAY` when specified with the output type size
if (s->dtype_len == 4 and (s->col.data_type & 7) == BYTE_ARRAY) {
if (s->col.is_strings_to_cat and s->col.physical_type == BYTE_ARRAY) {
// Output hash. This hash value is used if the option to convert strings to
// categoricals is enabled. The seed value is chosen arbitrarily.
uint32_t constexpr hash_seed = 33;
Expand Down
58 changes: 30 additions & 28 deletions cpp/src/io/parquet/page_decode.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ gpuInitStringDescriptors(page_state_s* s, [[maybe_unused]] state_buf* sb, int ta

while (pos < target_pos) {
int len = 0;
if ((s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) {
if (s->col.physical_type == FIXED_LEN_BYTE_ARRAY) {
if (k < dict_size) { len = s->dtype_len_in; }
} else {
if (k + 4 <= dict_size) {
Expand Down Expand Up @@ -1144,11 +1144,11 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
if (s->page.num_input_values > 0) {
uint8_t* cur = s->page.page_data;
uint8_t* end = cur + s->page.uncompressed_page_size;

uint32_t dtype_len_out = s->col.data_type >> 3;
s->ts_scale = 0;
s->ts_scale = 0;
// Validate data type
auto const data_type = s->col.data_type & 7;
auto const data_type = s->col.physical_type;
auto const is_decimal =
s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL;
switch (data_type) {
case BOOLEAN:
s->dtype_len = 1; // Boolean are stored as 1 byte on the output
Expand All @@ -1159,13 +1159,15 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
if (s->col.ts_clock_rate) {
int32_t units = 0;
// Duration types are not included because no scaling is done when reading
if (s->col.converted_type == TIMESTAMP_MILLIS) {
units = cudf::timestamp_ms::period::den;
} else if (s->col.converted_type == TIMESTAMP_MICROS) {
units = cudf::timestamp_us::period::den;
} else if (s->col.logical_type.has_value() and
s->col.logical_type->is_timestamp_nanos()) {
units = cudf::timestamp_ns::period::den;
if (s->col.logical_type.has_value()) {
auto const& lt = s->col.logical_type.value();
if (lt.is_timestamp_millis()) {
units = cudf::timestamp_ms::period::den;
} else if (lt.is_timestamp_micros()) {
units = cudf::timestamp_us::period::den;
} else if (lt.is_timestamp_nanos()) {
units = cudf::timestamp_ns::period::den;
}
}
if (units and units != s->col.ts_clock_rate) {
s->ts_scale = (s->col.ts_clock_rate < units) ? -(units / s->col.ts_clock_rate)
Expand All @@ -1176,8 +1178,8 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
case DOUBLE: s->dtype_len = 8; break;
case INT96: s->dtype_len = 12; break;
case BYTE_ARRAY:
if (s->col.converted_type == DECIMAL) {
auto const decimal_precision = s->col.decimal_precision;
if (is_decimal) {
auto const decimal_precision = s->col.logical_type->precision();
s->dtype_len = [decimal_precision]() {
if (decimal_precision <= MAX_DECIMAL32_PRECISION) {
return sizeof(int32_t);
Expand All @@ -1192,14 +1194,14 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
}
break;
default: // FIXED_LEN_BYTE_ARRAY:
s->dtype_len = dtype_len_out;
s->dtype_len = s->col.type_length;
if (s->dtype_len <= 0) { s->set_error_code(decode_error::INVALID_DATA_TYPE); }
break;
}
// Special check for downconversions
s->dtype_len_in = s->dtype_len;
if (data_type == FIXED_LEN_BYTE_ARRAY) {
if (s->col.converted_type == DECIMAL) {
if (is_decimal) {
s->dtype_len = [dtype_len = s->dtype_len]() {
if (dtype_len <= sizeof(int32_t)) {
return sizeof(int32_t);
Expand All @@ -1213,17 +1215,17 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
s->dtype_len = sizeof(string_index_pair);
}
} else if (data_type == INT32) {
if (dtype_len_out == 1) {
// INT8 output
s->dtype_len = 1;
} else if (dtype_len_out == 2) {
// INT16 output
s->dtype_len = 2;
} else if (s->col.converted_type == TIME_MILLIS) {
// INT64 output
s->dtype_len = 8;
// check for smaller bitwidths
if (s->col.logical_type.has_value()) {
auto const& lt = s->col.logical_type.value();
if (lt.type == LogicalType::INTEGER) {
s->dtype_len = lt.bit_width() / 8;
} else if (lt.is_time_millis()) {
// cudf outputs as INT64
s->dtype_len = 8;
}
}
} else if (data_type == BYTE_ARRAY && dtype_len_out == 4) {
} else if (data_type == BYTE_ARRAY && s->col.is_strings_to_cat) {
s->dtype_len = 4; // HASH32 output
} else if (data_type == INT96) {
s->dtype_len = 8; // Convert to 64-bit timestamp
Expand Down Expand Up @@ -1298,7 +1300,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
case Encoding::PLAIN_DICTIONARY:
case Encoding::RLE_DICTIONARY:
// RLE-packed dictionary indices, first byte indicates index length in bits
if (((s->col.data_type & 7) == BYTE_ARRAY) && (s->col.str_dict_index)) {
if (s->col.physical_type == BYTE_ARRAY && s->col.str_dict_index != nullptr) {
// String dictionary: use index
s->dict_base = reinterpret_cast<uint8_t const*>(s->col.str_dict_index);
s->dict_size = s->col.dict_page->num_input_values * sizeof(string_index_pair);
Expand All @@ -1316,7 +1318,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
case Encoding::PLAIN:
s->dict_size = static_cast<int32_t>(end - cur);
s->dict_val = 0;
if ((s->col.data_type & 7) == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; }
if (s->col.physical_type == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; }
break;
case Encoding::RLE: {
// first 4 bytes are length of RLE data
Expand Down
Loading

0 comments on commit 34c8b26

Please sign in to comment.