Skip to content

Commit

Permalink
Merge branch 'branch-22.04' of https://github.com/rapidsai/cudf into …
Browse files Browse the repository at this point in the history
…df.diff
  • Loading branch information
skirui-source committed Feb 3, 2022
2 parents e0722ae + a25a2ec commit 82f941b
Show file tree
Hide file tree
Showing 132 changed files with 3,972 additions and 2,653 deletions.
1 change: 1 addition & 0 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ if buildAll || hasArg libcudf; then
fi
echo "$MSG"
python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${LIB_BUILD_DIR}/ninja_log.html
cp ${LIB_BUILD_DIR}/.ninja_log ${LIB_BUILD_DIR}/ninja.log
fi

if [[ ${INSTALL_TARGET} != "" ]]; then
Expand Down
2 changes: 1 addition & 1 deletion ci/checks/style.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ LANG=C.UTF-8
. /opt/conda/etc/profile.d/conda.sh
conda activate rapids

FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.12/cmake-format-rapids-cmake.json
FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.04/cmake-format-rapids-cmake.json
export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json
mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE})
wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL}
Expand Down
1 change: 1 addition & 0 deletions ci/cpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ if [ "$BUILD_LIBCUDF" == '1' ]; then
gpuci_logger "Copying build metrics results"
mkdir -p "$WORKSPACE/build-metrics"
cp "$LIBCUDF_BUILD_DIR/ninja_log.html" "$WORKSPACE/build-metrics/BuildMetrics.html"
cp "$LIBCUDF_BUILD_DIR/ninja.log" "$WORKSPACE/build-metrics/ninja.log"
fi

gpuci_logger "Build conda pkg for libcudf_kafka"
Expand Down
29 changes: 4 additions & 25 deletions ci/cpu/prebuild.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,11 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
set -e

DEFAULT_CUDA_VER="11.5"
DEFAULT_PYTHON_VER="3.8"

#Always upload cudf Python package
#Always upload cudf packages
export UPLOAD_CUDF=1

#Upload libcudf once per CUDA
if [[ "$PYTHON" == "${DEFAULT_PYTHON_VER}" ]]; then
export UPLOAD_LIBCUDF=1
else
export UPLOAD_LIBCUDF=0
fi

# upload cudf_kafka for all versions of Python
if [[ "$CUDA" == "${DEFAULT_CUDA_VER}" ]]; then
export UPLOAD_CUDF_KAFKA=1
else
export UPLOAD_CUDF_KAFKA=0
fi

#We only want to upload libcudf_kafka once per python/CUDA combo
if [[ "$PYTHON" == "${DEFAULT_PYTHON_VER}" ]] && [[ "$CUDA" == "${DEFAULT_CUDA_VER}" ]]; then
export UPLOAD_LIBCUDF_KAFKA=1
else
export UPLOAD_LIBCUDF_KAFKA=0
fi
export UPLOAD_LIBCUDF=1
export UPLOAD_CUDF_KAFKA=1
export UPLOAD_LIBCUDF_KAFKA=1

if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then
#If project flash is not active, always build both
Expand Down
5 changes: 4 additions & 1 deletion ci/release/update-version.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,13 @@ sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g'
# rapids-cmake version
sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake

# cmake-format rapids-cmake definitions
sed_runner 's/'"branch-.*\/cmake-format-rapids-cmake.json"'/'"branch-${NEXT_SHORT_TAG}\/cmake-format-rapids-cmake.json"'/g' ci/checks/style.sh

# doxyfile update
sed_runner 's/PROJECT_NUMBER = .*/PROJECT_NUMBER = '${NEXT_FULL_TAG}'/g' cpp/doxygen/Doxyfile

# RTD update
# sphinx docs update
sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/cudf/source/conf.py
sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/cudf/source/conf.py

Expand Down
8 changes: 5 additions & 3 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -425,13 +425,11 @@ add_library(
src/strings/copying/concatenate.cu
src/strings/copying/copying.cu
src/strings/copying/shift.cu
src/strings/count_matches.cu
src/strings/extract/extract.cu
src/strings/extract/extract_all.cu
src/strings/filling/fill.cu
src/strings/filter_chars.cu
src/strings/findall.cu
src/strings/find.cu
src/strings/find_multiple.cu
src/strings/padding.cu
src/strings/json/json_path.cu
src/strings/regex/regcomp.cpp
Expand All @@ -441,6 +439,10 @@ add_library(
src/strings/replace/multi_re.cu
src/strings/replace/replace.cu
src/strings/replace/replace_re.cu
src/strings/search/findall.cu
src/strings/search/findall_record.cu
src/strings/search/find.cu
src/strings/search/find_multiple.cu
src/strings/split/partition.cu
src/strings/split/split.cu
src/strings/split/split_record.cu
Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
# Copyright (c) 2018-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand Down Expand Up @@ -123,7 +123,7 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp

# ##################################################################################################
# * stream_compaction benchmark -------------------------------------------------------------------
ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates.cpp)
ConfigureNVBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates.cpp)

# ##################################################################################################
# * join benchmark --------------------------------------------------------------------------------
Expand Down
5 changes: 3 additions & 2 deletions cpp/benchmarks/fixture/benchmark_fixture.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>
/**
 * @brief Returns a shared handle to a single pool memory resource.
 *
 * The pool is created on the first call, on top of a function-local static
 * `cuda_memory_resource`; every call returns a copy of the same `shared_ptr`.
 *
 * @return Shared pointer to the pool memory resource
 */
inline auto make_pool_instance()
{
  static rmm::mr::cuda_memory_resource cuda_mr;
  // The pool is heap-allocated and owned by the shared_ptr. Wrapping the address of a static
  // pool object in an owning shared_ptr would make the last owner `delete` storage that was
  // never obtained from `new`.
  static auto pool_mr =
    std::make_shared<rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>>(&cuda_mr);
  return pool_mr;
}
} // namespace

Expand Down
17 changes: 8 additions & 9 deletions cpp/benchmarks/io/csv/csv_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,7 @@ void BM_csv_read_varying_input(benchmark::State& state)

cuio_source_sink_pair source_sink(source_type);
cudf_io::csv_writer_options options =
cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view)
.include_header(true)
.rows_per_chunk(1 << 14); // TODO: remove once default is sensible
cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view).include_header(true);
cudf_io::write_csv(options);

cudf_io::csv_reader_options const read_options =
Expand All @@ -59,6 +57,7 @@ void BM_csv_read_varying_input(benchmark::State& state)

state.SetBytesProcessed(data_size * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

void BM_csv_read_varying_options(benchmark::State& state)
Expand All @@ -79,23 +78,22 @@ void BM_csv_read_varying_options(benchmark::State& state)
auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();

std::vector<char> csv_data;
cuio_source_sink_pair source_sink(io_type::HOST_BUFFER);
cudf_io::csv_writer_options options =
cudf_io::csv_writer_options::builder(cudf_io::sink_info{&csv_data}, view)
cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view)
.include_header(true)
.line_terminator("\r\n")
.rows_per_chunk(1 << 14); // TODO: remove once default is sensible
.line_terminator("\r\n");
cudf_io::write_csv(options);

cudf_io::csv_reader_options read_options =
cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_data.data(), csv_data.size()})
cudf_io::csv_reader_options::builder(source_sink.make_source_info())
.use_cols_indexes(cols_to_read)
.thousands('\'')
.windowslinetermination(true)
.comment('#')
.prefix("BM_");

size_t const chunk_size = csv_data.size() / num_chunks;
size_t const chunk_size = source_sink.size() / num_chunks;
cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks;
auto mem_stats_logger = cudf::memory_stats_logger();
for (auto _ : state) {
Expand Down Expand Up @@ -132,6 +130,7 @@ void BM_csv_read_varying_options(benchmark::State& state)
auto const data_processed = data_size * cols_to_read.size() / view.num_columns();
state.SetBytesProcessed(data_processed * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

#define CSV_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \
Expand Down
10 changes: 5 additions & 5 deletions cpp/benchmarks/io/csv/csv_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,13 @@ void BM_csv_write_varying_inout(benchmark::State& state)
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
cudf_io::csv_writer_options options =
cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view)
.include_header(true)
.rows_per_chunk(1 << 14); // TODO: remove once default is sensible
cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view).include_header(true);
cudf_io::write_csv(options);
}

state.SetBytesProcessed(data_size * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

void BM_csv_write_varying_options(benchmark::State& state)
Expand All @@ -71,12 +70,12 @@ void BM_csv_write_varying_options(benchmark::State& state)
auto const view = tbl->view();

std::string const na_per(na_per_len, '#');
std::vector<char> csv_data;
cuio_source_sink_pair source_sink(io_type::HOST_BUFFER);
auto mem_stats_logger = cudf::memory_stats_logger();
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
cudf_io::csv_writer_options options =
cudf_io::csv_writer_options::builder(cudf_io::sink_info{&csv_data}, view)
cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view)
.include_header(true)
.na_rep(na_per)
.rows_per_chunk(rows_per_chunk);
Expand All @@ -85,6 +84,7 @@ void BM_csv_write_varying_options(benchmark::State& state)

state.SetBytesProcessed(data_size * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

#define CSV_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \
Expand Down
15 changes: 14 additions & 1 deletion cpp/benchmarks/io/cuio_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include <benchmarks/io/cuio_common.hpp>

#include <fstream>
#include <numeric>
#include <string>

Expand Down Expand Up @@ -53,13 +54,25 @@ cudf_io::source_info cuio_source_sink_pair::make_source_info()
/**
 * @brief Builds the `sink_info` that corresponds to this pair's IO type.
 *
 * @return Sink targeting the byte-counting sink (VOID), the file path (FILEPATH) or the
 * host buffer (HOST_BUFFER); fails via `CUDF_FAIL` for any other type
 */
cudf_io::sink_info cuio_source_sink_pair::make_sink_info()
{
  switch (type) {
    // VOID output goes to `void_sink`, which is what size() queries for the VOID case.
    case io_type::VOID: return cudf_io::sink_info(&void_sink);
    case io_type::FILEPATH: return cudf_io::sink_info(file_name);
    case io_type::HOST_BUFFER: return cudf_io::sink_info(&buffer);
    default: CUDF_FAIL("invalid output type");
  }
}

/**
 * @brief Reports the number of bytes held by the sink side of this pair.
 *
 * VOID: the count accumulated by `void_sink`; HOST_BUFFER: the size of `buffer`;
 * FILEPATH: the end position of the file named `file_name`. Any other type fails via
 * `CUDF_FAIL`.
 *
 * NOTE(review): when the file cannot be opened, `tellg()` signals failure with -1, which the
 * cast below turns into a very large value — confirm whether callers need that case handled.
 */
size_t cuio_source_sink_pair::size()
{
  if (type == io_type::VOID) { return void_sink.bytes_written(); }

  if (type == io_type::FILEPATH) {
    // Opening with `ate` positions the stream at the end, so tellg() yields the file length.
    std::ifstream file_at_end(file_name, std::ifstream::ate | std::ifstream::binary);
    return static_cast<size_t>(file_at_end.tellg());
  }

  if (type == io_type::HOST_BUFFER) { return buffer.size(); }

  CUDF_FAIL("invalid output type");
}

std::vector<cudf::type_id> dtypes_for_column_selection(std::vector<cudf::type_id> const& data_types,
column_selection col_sel)
{
Expand Down
12 changes: 12 additions & 0 deletions cpp/benchmarks/io/cuio_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,15 @@ std::string random_file_in_dir(std::string const& dir_path);
* @brief Class to create a coupled `source_info` and `sink_info` of given type.
*/
class cuio_source_sink_pair {
class bytes_written_only_sink : public cudf::io::data_sink {
size_t _bytes_written = 0;

public:
void host_write(void const* data, size_t size) override { _bytes_written += size; }
void flush() override {}
size_t bytes_written() override { return _bytes_written; }
};

public:
cuio_source_sink_pair(io_type type);
~cuio_source_sink_pair()
Expand Down Expand Up @@ -66,12 +75,15 @@ class cuio_source_sink_pair {
*/
cudf::io::sink_info make_sink_info();

[[nodiscard]] size_t size();

private:
static temp_directory const tmpdir;

io_type const type;
std::vector<char> buffer;
std::string const file_name;
bytes_written_only_sink void_sink;
};

/**
Expand Down
23 changes: 13 additions & 10 deletions cpp/benchmarks/io/orc/orc_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,13 @@ void BM_orc_read_varying_input(benchmark::State& state)

state.SetBytesProcessed(data_size * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

std::vector<std::string> get_col_names(std::vector<char> const& orc_data)
std::vector<std::string> get_col_names(cudf_io::source_info const& source)
{
cudf_io::orc_reader_options const read_options =
cudf_io::orc_reader_options::builder(cudf_io::source_info{orc_data.data(), orc_data.size()})
.num_rows(1);
cudf_io::orc_reader_options::builder(source).num_rows(1);
return cudf_io::read_orc(read_options).metadata.column_names;
}

Expand All @@ -88,25 +88,26 @@ void BM_orc_read_varying_options(benchmark::State& state)
auto const use_np_dtypes = (flags & 2) != 0;
auto const ts_type = cudf::data_type{static_cast<cudf::type_id>(state.range(state_idx++))};

// skip_rows is not supported on nested types
auto const data_types =
dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING),
int32_t(cudf::type_id::LIST)}),
int32_t(cudf::type_id::STRING)}),
col_sel);
auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();

std::vector<char> orc_data;
cuio_source_sink_pair source_sink(io_type::HOST_BUFFER);
cudf_io::orc_writer_options options =
cudf_io::orc_writer_options::builder(cudf_io::sink_info{&orc_data}, view);
cudf_io::orc_writer_options::builder(source_sink.make_sink_info(), view);
cudf_io::write_orc(options);

auto const cols_to_read = select_column_names(get_col_names(orc_data), col_sel);
auto const cols_to_read =
select_column_names(get_col_names(source_sink.make_source_info()), col_sel);
cudf_io::orc_reader_options read_options =
cudf_io::orc_reader_options::builder(cudf_io::source_info{orc_data.data(), orc_data.size()})
cudf_io::orc_reader_options::builder(source_sink.make_source_info())
.columns(cols_to_read)
.use_index(use_index)
.use_np_dtypes(use_np_dtypes)
Expand Down Expand Up @@ -148,6 +149,7 @@ void BM_orc_read_varying_options(benchmark::State& state)
auto const data_processed = data_size * cols_to_read.size() / view.num_columns();
state.SetBytesProcessed(data_processed * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

#define ORC_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \
Expand Down Expand Up @@ -179,11 +181,12 @@ BENCHMARK_REGISTER_F(OrcRead, column_selection)
->Unit(benchmark::kMillisecond)
->UseManualTime();

// Need an API to get the number of stripes to enable row_selection::STRIPES here
BENCHMARK_DEFINE_F(OrcRead, row_selection)
(::benchmark::State& state) { BM_orc_read_varying_options(state); }
BENCHMARK_REGISTER_F(OrcRead, row_selection)
->ArgsProduct({{int32_t(column_selection::ALL)},
{int32_t(row_selection::STRIPES), int32_t(row_selection::NROWS)},
{int32_t(row_selection::NROWS)},
{1, 8},
{0b11}, // defaults
{int32_t(cudf::type_id::EMPTY)}})
Expand Down
2 changes: 2 additions & 0 deletions cpp/benchmarks/io/orc/orc_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ void BM_orc_write_varying_inout(benchmark::State& state)

state.SetBytesProcessed(data_size * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

void BM_orc_write_varying_options(benchmark::State& state)
Expand Down Expand Up @@ -98,6 +99,7 @@ void BM_orc_write_varying_options(benchmark::State& state)

state.SetBytesProcessed(data_size * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

#define ORC_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \
Expand Down
Loading

0 comments on commit 82f941b

Please sign in to comment.