Skip to content

Commit

Permalink
Merge branch 'branch-24.06' into improve-distinct-join
Browse files Browse the repository at this point in the history
  • Loading branch information
PointKernel authored May 6, 2024
2 parents 7c5934f + 4dc6162 commit 01d1dd6
Show file tree
Hide file tree
Showing 161 changed files with 6,452 additions and 1,790 deletions.
8 changes: 4 additions & 4 deletions build.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.

# cuDF build script

Expand Down Expand Up @@ -109,8 +109,8 @@ function buildAll {
}

function buildLibCudfJniInDocker {
local cudaVersion="11.5.0"
local imageName="cudf-build:${cudaVersion}-devel-centos7"
local cudaVersion="11.8.0"
local imageName="cudf-build:${cudaVersion}-devel-rocky8"
local CMAKE_GENERATOR="${CMAKE_GENERATOR:-Ninja}"
local workspaceDir="/rapids"
local localMavenRepo=${LOCAL_MAVEN_REPO:-"$HOME/.m2/repository"}
Expand All @@ -120,7 +120,7 @@ function buildLibCudfJniInDocker {
mkdir -p "$CUDF_JAR_JAVA_BUILD_DIR/libcudf-cmake-build"
mkdir -p "$HOME/.ccache" "$HOME/.m2"
nvidia-docker build \
-f java/ci/Dockerfile.centos7 \
-f java/ci/Dockerfile.rocky \
--build-arg CUDA_VERSION=${cudaVersion} \
-t $imageName .
nvidia-docker run -it -u $(id -u):$(id -g) --rm \
Expand Down
5 changes: 4 additions & 1 deletion conda/recipes/cudf/conda_build_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ c_compiler_version:
cxx_compiler_version:
- 11

sysroot_version:
c_stdlib:
- sysroot

c_stdlib_version:
- "2.17"

cmake_version:
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ requirements:
- {{ compiler('cuda') }}
{% endif %}
- cuda-version ={{ cuda_version }}
- sysroot_{{ target_platform }} {{ sysroot_version }}
- {{ stdlib("c") }}
host:
- python
- cython >=3.0.3
Expand Down
5 changes: 4 additions & 1 deletion conda/recipes/cudf_kafka/conda_build_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ c_compiler_version:
cxx_compiler_version:
- 11

sysroot_version:
c_stdlib:
- sysroot

c_stdlib_version:
- "2.17"

cmake_version:
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf_kafka/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ requirements:
- {{ compiler('cuda') }}
{% endif %}
- cuda-version ={{ cuda_version }}
- sysroot_{{ target_platform }} {{ sysroot_version }}
- {{ stdlib("c") }}
host:
- python
- cython >=3.0.3
Expand Down
5 changes: 4 additions & 1 deletion conda/recipes/libcudf/conda_build_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@ cuda_compiler:
cuda11_compiler:
- nvcc

sysroot_version:
c_stdlib:
- sysroot

c_stdlib_version:
- "2.17"

cmake_version:
Expand Down
4 changes: 2 additions & 2 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ requirements:
{% endif %}
- cuda-version ={{ cuda_version }}
- ninja
- sysroot_{{ target_platform }} {{ sysroot_version }}
- {{ stdlib("c") }}
host:
- librmm ={{ minor_version }}
- libkvikio ={{ minor_version }}
Expand Down Expand Up @@ -170,7 +170,7 @@ outputs:
{% endif %}
- cuda-version ={{ cuda_version }}
- ninja
- sysroot_{{ target_platform }} {{ sysroot_version }}
- {{ stdlib("c") }}
host:
- {{ pin_subpackage('libcudf', exact=True) }}
{% if cuda_major == "11" %}
Expand Down
4 changes: 3 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -395,8 +395,9 @@ add_library(
src/io/orc/dict_enc.cu
src/io/orc/orc.cpp
src/io/orc/reader_impl.cu
src/io/orc/reader_impl_chunking.cu
src/io/orc/reader_impl_decode.cu
src/io/orc/reader_impl_helpers.cpp
src/io/orc/reader_impl_preprocess.cu
src/io/orc/stats_enc.cu
src/io/orc/stripe_data.cu
src/io/orc/stripe_enc.cu
Expand Down Expand Up @@ -429,6 +430,7 @@ add_library(
src/io/text/multibyte_split.cu
src/io/utilities/arrow_io_source.cpp
src/io/utilities/column_buffer.cpp
src/io/utilities/column_buffer_strings.cu
src/io/utilities/config_utils.cpp
src/io/utilities/data_casting.cu
src/io/utilities/data_sink.cpp
Expand Down
5 changes: 5 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,11 @@ target_link_libraries(MULTIBYTE_SPLIT_NVBENCH PRIVATE ZLIB::ZLIB)
# ---------------------------------------------------------------------------------
ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp)

# ##################################################################################################
# * reshape benchmark
# ---------------------------------------------------------------------------------
ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)

add_custom_target(
run_benchmarks
DEPENDS CUDF_BENCHMARKS
Expand Down
106 changes: 83 additions & 23 deletions cpp/benchmarks/io/orc/orc_reader_input.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -24,31 +24,59 @@

#include <nvbench/nvbench.cuh>

namespace {

// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
// run on most GPUs, but large enough to allow highest throughput
constexpr int64_t data_size = 512 << 20;
constexpr cudf::size_type num_cols = 64;
constexpr std::size_t data_size = 512 << 20;
constexpr std::size_t Mbytes = 1024 * 1024;

template <bool is_chunked_read>
void orc_read_common(cudf::size_type num_rows_to_read,
cuio_source_sink_pair& source_sink,
nvbench::state& state)
{
cudf::io::orc_reader_options read_opts =
cudf::io::orc_reader_options::builder(source_sink.make_source_info());
auto const read_opts =
cudf::io::orc_reader_options::builder(source_sink.make_source_info()).build();

auto mem_stats_logger = cudf::memory_stats_logger(); // init stats logger
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(
nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
try_drop_l3_cache();

timer.start();
auto const result = cudf::io::read_orc(read_opts);
timer.stop();

CUDF_EXPECTS(result.tbl->num_columns() == num_cols, "Unexpected number of columns");
CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows");
});
if constexpr (is_chunked_read) {
state.exec(
nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch&, auto& timer) {
try_drop_l3_cache();
auto const output_limit_MB =
static_cast<std::size_t>(state.get_int64("chunk_read_limit_MB"));
auto const read_limit_MB = static_cast<std::size_t>(state.get_int64("pass_read_limit_MB"));

auto reader =
cudf::io::chunked_orc_reader(output_limit_MB * Mbytes, read_limit_MB * Mbytes, read_opts);
cudf::size_type num_rows{0};

timer.start();
do {
auto chunk = reader.read_chunk();
num_rows += chunk.tbl->num_rows();
} while (reader.has_next());
timer.stop();

CUDF_EXPECTS(num_rows == num_rows_to_read, "Unexpected number of rows");
});
} else { // not is_chunked_read
state.exec(
nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch&, auto& timer) {
try_drop_l3_cache();

timer.start();
auto const result = cudf::io::read_orc(read_opts);
timer.stop();

CUDF_EXPECTS(result.tbl->num_columns() == num_cols, "Unexpected number of columns");
CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows");
});
}

auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
Expand All @@ -57,6 +85,8 @@ void orc_read_common(cudf::size_type num_rows_to_read,
state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
}

} // namespace

template <data_type DataType, cudf::io::io_type IOType>
void BM_orc_read_data(nvbench::state& state,
nvbench::type_list<nvbench::enum_type<DataType>, nvbench::enum_type<IOType>>)
Expand All @@ -79,13 +109,11 @@ void BM_orc_read_data(nvbench::state& state,
return view.num_rows();
}();

orc_read_common(num_rows_written, source_sink, state);
orc_read_common<false>(num_rows_written, source_sink, state);
}

template <cudf::io::io_type IOType, cudf::io::compression_type Compression>
void BM_orc_read_io_compression(
nvbench::state& state,
nvbench::type_list<nvbench::enum_type<IOType>, nvbench::enum_type<Compression>>)
template <cudf::io::io_type IOType, cudf::io::compression_type Compression, bool chunked_read>
void orc_read_io_compression(nvbench::state& state)
{
auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL_SIGNED),
static_cast<int32_t>(data_type::FLOAT),
Expand All @@ -95,15 +123,21 @@ void BM_orc_read_io_compression(
static_cast<int32_t>(data_type::LIST),
static_cast<int32_t>(data_type::STRUCT)});

cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
auto const [cardinality, run_length] = [&]() -> std::pair<cudf::size_type, cudf::size_type> {
if constexpr (chunked_read) {
return {0, 4};
} else {
return {static_cast<cudf::size_type>(state.get_int64("cardinality")),
static_cast<cudf::size_type>(state.get_int64("run_length"))};
}
}();
cuio_source_sink_pair source_sink(IOType);

auto const num_rows_written = [&]() {
auto const tbl = create_random_table(
cycle_dtypes(d_type, num_cols),
table_size_bytes{data_size},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
data_profile_builder{}.cardinality(cardinality).avg_run_length(run_length));
auto const view = tbl->view();

cudf::io::orc_writer_options opts =
Expand All @@ -113,7 +147,23 @@ void BM_orc_read_io_compression(
return view.num_rows();
}();

orc_read_common(num_rows_written, source_sink, state);
orc_read_common<chunked_read>(num_rows_written, source_sink, state);
}

// nvbench entry point for the regular (non-chunked) ORC read benchmark.
// Both the IO type and the compression type arrive as nvbench type axes; the work itself is
// delegated to `orc_read_io_compression` with the chunked-read flag turned off.
template <cudf::io::io_type IOType, cudf::io::compression_type Compression>
void BM_orc_read_io_compression(
  nvbench::state& state,
  nvbench::type_list<nvbench::enum_type<IOType>, nvbench::enum_type<Compression>>)
{
  constexpr bool use_chunked_reader = false;
  orc_read_io_compression<IOType, Compression, use_chunked_reader>(state);
}

// nvbench entry point for the chunked ORC read benchmark.
// Only the compression type is a type axis here; the work itself is delegated to
// `orc_read_io_compression` with the chunked-read flag turned on.
template <cudf::io::compression_type Compression>
void BM_orc_chunked_read_io_compression(nvbench::state& state,
                                        nvbench::type_list<nvbench::enum_type<Compression>>)
{
  // The IO type is pinned to HOST_BUFFER instead of being swept as a benchmark axis.
  constexpr auto io_kind          = cudf::io::io_type::HOST_BUFFER;
  constexpr bool use_chunked_reader = true;
  orc_read_io_compression<io_kind, Compression, use_chunked_reader>(state);
}

using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL_SIGNED,
Expand Down Expand Up @@ -146,3 +196,13 @@ NVBENCH_BENCH_TYPES(BM_orc_read_io_compression, NVBENCH_TYPE_AXES(io_list, compr
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32});

// Registers the chunked ORC read benchmark. It sweeps the same compression type axis as
// `BM_orc_read_io_compression` so results can be compared. Unlike that benchmark it has no
// IO type axis (HOST_BUFFER only) and no "cardinality"/"run_length" axes: for the chunked case
// `orc_read_io_compression` uses fixed values for those instead of reading them from the state.
NVBENCH_BENCH_TYPES(BM_orc_chunked_read_io_compression, NVBENCH_TYPE_AXES(compression_list))
.set_name("orc_chunked_read_io_compression")
.set_type_axes_names({"compression"})
.set_min_samples(4)
// The input is approximately 520MB with 127K rows.
// The limits below are given in MB; `orc_read_common` multiplies each by `Mbytes` (1024 * 1024)
// before passing them to `cudf::io::chunked_orc_reader` ("chunk_read_limit_MB" as the first
// argument, "pass_read_limit_MB" as the second).
.add_int64_axis("chunk_read_limit_MB", {50, 250, 700})
.add_int64_axis("pass_read_limit_MB", {50, 250, 700});
59 changes: 59 additions & 0 deletions cpp/benchmarks/reshape/interleave.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/reshape.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

// Benchmarks `cudf::interleave_columns` on a table made only of random string columns.
//
// Axes read from `state`:
//  - "num_rows":  number of rows in each generated column
//  - "row_width": upper bound passed to the STRING distribution of the data profile
//                 (presumably the maximum string length -- confirm in generate_input.hpp)
//  - "columns":   number of string columns in the source table
//
// Configurations whose num_rows * row_width * columns product reaches the `cudf::size_type`
// limit are skipped without generating any data.
static void bench_interleave(nvbench::state& state)
{
  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
  auto const num_cols  = static_cast<cudf::size_type>(state.get_int64("columns"));

  // Evaluate the product in std::size_t; the individual factors are 32-bit size_type values.
  auto const size_bound = static_cast<std::size_t>(num_rows) *
                          static_cast<std::size_t>(row_width) *
                          static_cast<std::size_t>(num_cols);
  if (size_bound >= static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
    state.skip("Skip benchmarks greater than size_type limit");
    // Marking the state as skipped does not leave this function by itself; return explicitly so
    // the oversized table is never generated and the benchmark body is never executed.
    return;
  }

  data_profile const str_profile = data_profile_builder().distribution(
    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
  std::vector<cudf::type_id> types(num_cols, cudf::type_id::STRING);
  auto const source_table = create_random_table(types, row_count{num_rows}, str_profile);

  auto const source_view = source_table->view();
  auto const stream      = cudf::get_default_stream();

  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));

  // Sum the character bytes of every input column (the "columns" axis goes up to 100), so the
  // reported memory traffic covers the whole table rather than only its first two columns.
  std::size_t total_chars{0};
  for (cudf::size_type col_idx = 0; col_idx < source_view.num_columns(); ++col_idx) {
    total_chars += static_cast<std::size_t>(
      cudf::strings_column_view(source_view.column(col_idx)).chars_size(stream));
  }
  state.add_global_memory_reads<nvbench::int8_t>(total_chars);   // all bytes are read
  state.add_global_memory_writes<nvbench::int8_t>(total_chars);  // all bytes are written

  // The launch object is not needed here, so the parameter is left unnamed.
  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
    [[maybe_unused]] auto result = cudf::interleave_columns(source_view);
  });
}

// Registers `bench_interleave` under the name "interleave_strings" with three int64 axes:
// "row_width", "num_rows" and "columns". `bench_interleave` skips any combination whose
// num_rows * row_width * columns product reaches the cudf::size_type limit.
NVBENCH_BENCH(bench_interleave)
.set_name("interleave_strings")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
.add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
.add_int64_axis("columns", {2, 10, 100});
5 changes: 1 addition & 4 deletions cpp/cmake/thirdparty/get_nvbench.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2023, NVIDIA CORPORATION.
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand All @@ -18,9 +18,6 @@ function(find_and_configure_nvbench)
include(${rapids-cmake-dir}/cpm/nvbench.cmake)
include(${rapids-cmake-dir}/cpm/package_override.cmake)

set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches")
rapids_cpm_package_override("${cudf_patch_dir}/nvbench_override.json")

rapids_cpm_nvbench(BUILD_STATIC)

endfunction()
Expand Down
9 changes: 0 additions & 9 deletions cpp/cmake/thirdparty/patches/nvbench_override.json

This file was deleted.

Loading

0 comments on commit 01d1dd6

Please sign in to comment.