Merge branch 'branch-22.04' into refactor/isin
vyasr committed Feb 8, 2022
2 parents 8d83af6 + bd98bfe commit 0dc70a6
Showing 156 changed files with 4,435 additions and 1,891 deletions.
239 changes: 237 additions & 2 deletions CHANGELOG.md

Large diffs are not rendered by default.

21 changes: 10 additions & 11 deletions build.sh
@@ -185,12 +185,9 @@ if buildAll || hasArg libcudf; then
 fi
 
 # get the current count before the compile starts
-FILES_IN_CCACHE=""
-if [[ "$BUILD_REPORT_INCL_CACHE_STATS" == "ON" && -x "$(command -v ccache)" ]]; then
-FILES_IN_CCACHE=$(ccache -s | grep "files in cache")
-echo "$FILES_IN_CCACHE"
-# zero the ccache statistics
-ccache -z
+if [[ "$BUILD_REPORT_INCL_CACHE_STATS" == "ON" && -x "$(command -v sccache)" ]]; then
+# zero the sccache statistics
+sccache --zero-stats
 fi
 
 cmake -S $REPODIR/cpp -B ${LIB_BUILD_DIR} \
@@ -216,11 +213,12 @@ if buildAll || hasArg libcudf; then
 echo "Formatting build metrics"
 python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt xml > ${LIB_BUILD_DIR}/ninja_log.xml
 MSG="<p>"
-# get some ccache stats after the compile
-if [[ "$BUILD_REPORT_INCL_CACHE_STATS"=="ON" && -x "$(command -v ccache)" ]]; then
-MSG="${MSG}<br/>$FILES_IN_CCACHE"
-HIT_RATE=$(ccache -s | grep "cache hit rate")
-MSG="${MSG}<br/>${HIT_RATE}"
+# get some sccache stats after the compile
+if [[ "$BUILD_REPORT_INCL_CACHE_STATS" == "ON" && -x "$(command -v sccache)" ]]; then
+COMPILE_REQUESTS=$(sccache -s | grep "Compile requests \+ [0-9]\+$" | awk '{ print $NF }')
+CACHE_HITS=$(sccache -s | grep "Cache hits \+ [0-9]\+$" | awk '{ print $NF }')
+HIT_RATE=$(echo - | awk "{printf \"%.2f\n\", $CACHE_HITS / $COMPILE_REQUESTS * 100}")
+MSG="${MSG}<br/>cache hit rate ${HIT_RATE} %"
 fi
 MSG="${MSG}<br/>parallel setting: $PARALLEL_LEVEL"
 MSG="${MSG}<br/>parallel build time: $compile_total seconds"
@@ -230,6 +228,7 @@ if buildAll || hasArg libcudf; then
 fi
 echo "$MSG"
 python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${LIB_BUILD_DIR}/ninja_log.html
+cp ${LIB_BUILD_DIR}/.ninja_log ${LIB_BUILD_DIR}/ninja.log
 fi
 
 if [[ ${INSTALL_TARGET} != "" ]]; then
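
The hit-rate computation above leans on awk because bash arithmetic is integer-only. A standalone sketch of the same pipeline, assuming `sccache -s` prints summary rows such as "Compile requests    1742" and "Cache hits    1690":

    # mirror of the build-report snippet; guard against a fresh cache with no requests
    COMPILE_REQUESTS=$(sccache -s | grep "Compile requests \+ [0-9]\+$" | awk '{ print $NF }')
    CACHE_HITS=$(sccache -s | grep "Cache hits \+ [0-9]\+$" | awk '{ print $NF }')
    if [[ -n "$COMPILE_REQUESTS" && "$COMPILE_REQUESTS" -gt 0 ]]; then
        # awk does the floating-point division that bash cannot
        HIT_RATE=$(echo - | awk "{printf \"%.2f\n\", $CACHE_HITS / $COMPILE_REQUESTS * 100}")
        echo "cache hit rate ${HIT_RATE} %"
    fi
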
7 changes: 7 additions & 0 deletions ci/cpu/build.sh
@@ -31,6 +31,10 @@ if [[ "$BUILD_MODE" = "branch" && "$SOURCE_BRANCH" = branch-* ]] ; then
 export VERSION_SUFFIX=`date +%y%m%d`
 fi
 
+export CMAKE_CUDA_COMPILER_LAUNCHER="sccache"
+export CMAKE_CXX_COMPILER_LAUNCHER="sccache"
+export CMAKE_C_COMPILER_LAUNCHER="sccache"
+
 ################################################################################
 # SETUP - Check environment
 ################################################################################
@@ -77,6 +81,8 @@ if [ "$BUILD_LIBCUDF" == '1' ]; then
 gpuci_conda_retry build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libcudf $CONDA_BUILD_ARGS
 mkdir -p ${CONDA_BLD_DIR}/libcudf/work
 cp -r ${CONDA_BLD_DIR}/work/* ${CONDA_BLD_DIR}/libcudf/work
+gpuci_logger "sccache stats"
+sccache --show-stats
 
 # Copy libcudf build metrics results
 LIBCUDF_BUILD_DIR=$CONDA_BLD_DIR/libcudf/work/cpp/build
@@ -85,6 +91,7 @@ if [ "$BUILD_LIBCUDF" == '1' ]; then
 gpuci_logger "Copying build metrics results"
 mkdir -p "$WORKSPACE/build-metrics"
 cp "$LIBCUDF_BUILD_DIR/ninja_log.html" "$WORKSPACE/build-metrics/BuildMetrics.html"
+cp "$LIBCUDF_BUILD_DIR/ninja.log" "$WORKSPACE/build-metrics/ninja.log"
 fi
 
 gpuci_logger "Build conda pkg for libcudf_kafka"
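
A minimal sketch of how the three launcher exports are consumed, assuming CMake 3.17 or newer (which reads CMAKE_<LANG>_COMPILER_LAUNCHER from the environment and prefixes every compiler invocation with the launcher); paths are illustrative:

    export CMAKE_C_COMPILER_LAUNCHER="sccache"
    export CMAKE_CXX_COMPILER_LAUNCHER="sccache"
    export CMAKE_CUDA_COMPILER_LAUNCHER="sccache"
    sccache --zero-stats            # start from clean counters
    cmake -S cpp -B build -GNinja   # compiler calls are now wrapped by sccache
    cmake --build build
    sccache --show-stats            # hits vs. misses for this build
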
10 changes: 7 additions & 3 deletions ci/gpu/build.sh
@@ -36,6 +36,10 @@ export DASK_DISTRIBUTED_GIT_TAG='2022.01.0'
 # ucx-py version
 export UCX_PY_VERSION='0.25.*'
 
+export CMAKE_CUDA_COMPILER_LAUNCHER="sccache"
+export CMAKE_CXX_COMPILER_LAUNCHER="sccache"
+export CMAKE_C_COMPILER_LAUNCHER="sccache"
+
 ################################################################################
 # TRAP - Setup trap for removing jitify cache
 ################################################################################
@@ -245,15 +249,15 @@ fi
 
 cd "$WORKSPACE/python/cudf"
 gpuci_logger "Python py.test for cuDF"
-py.test -n 6 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term
+py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope
 
 cd "$WORKSPACE/python/dask_cudf"
 gpuci_logger "Python py.test for dask-cudf"
-py.test -n 6 --cache-clear --basetemp="$WORKSPACE/dask-cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-dask-cudf.xml" -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:"$WORKSPACE/python/dask_cudf/dask-cudf-coverage.xml" --cov-report term
+py.test -n 8 --cache-clear --basetemp="$WORKSPACE/dask-cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-dask-cudf.xml" -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:"$WORKSPACE/python/dask_cudf/dask-cudf-coverage.xml" --cov-report term
 
 cd "$WORKSPACE/python/custreamz"
 gpuci_logger "Python py.test for cuStreamz"
-py.test -n 6 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml="$WORKSPACE/junit-custreamz.xml" -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:"$WORKSPACE/python/custreamz/custreamz-coverage.xml" --cov-report term
+py.test -n 8 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml="$WORKSPACE/junit-custreamz.xml" -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:"$WORKSPACE/python/custreamz/custreamz-coverage.xml" --cov-report term
 
 gpuci_logger "Test notebooks"
 "$WORKSPACE/ci/gpu/test-notebooks.sh" 2>&1 | tee nbtest.log
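
A reduced sketch of the new pytest invocation, assuming pytest-xdist is installed: -n 8 spawns eight workers, and --dist=loadscope keeps all tests from one module or class on the same worker, so module- and class-scoped fixtures are built once per scope instead of on every worker. The test path is illustrative:

    py.test -n 8 --dist=loadscope --cache-clear python/cudf/cudf/tests
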
10 changes: 5 additions & 5 deletions ci/utils/nbtestlog2junitxml.py
@@ -7,11 +7,11 @@
 from enum import Enum
 
 
-startingPatt = re.compile("^STARTING: ([\w\.\-]+)$")
-skippingPatt = re.compile("^SKIPPING: ([\w\.\-]+)\s*(\(([\w\.\-\ \,]+)\))?\s*$")
-exitCodePatt = re.compile("^EXIT CODE: (\d+)$")
-folderPatt = re.compile("^FOLDER: ([\w\.\-]+)$")
-timePatt = re.compile("^real\s+([\d\.ms]+)$")
+startingPatt = re.compile(r"^STARTING: ([\w\.\-]+)$")
+skippingPatt = re.compile(r"^SKIPPING: ([\w\.\-]+)\s*(\(([\w\.\-\ \,]+)\))?\s*$")
+exitCodePatt = re.compile(r"^EXIT CODE: (\d+)$")
+folderPatt = re.compile(r"^FOLDER: ([\w\.\-]+)$")
+timePatt = re.compile(r"^real\s+([\d\.ms]+)$")
 linePatt = re.compile("^" + ("-" * 80) + "$")
 
 
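
The only change in this file is the r prefix on the pattern strings: in a plain literal, sequences like \w and \d are invalid string escapes that newer CPython versions flag with a DeprecationWarning, while a raw string hands the backslashes to the regex engine untouched. A hypothetical shell check on a recent CPython:

    python -W error -c 'import re; re.compile("^EXIT CODE: (\d+)$")'    # invalid-escape warning promoted to an error
    python -W error -c 'import re; re.compile(r"^EXIT CODE: (\d+)$")'   # compiles cleanly
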
8 changes: 5 additions & 3 deletions conda/recipes/libcudf/meta.yaml
@@ -22,13 +22,15 @@ build:
 - PARALLEL_LEVEL
 - VERSION_SUFFIX
 - PROJECT_FLASH
-- CCACHE_DIR
-- CCACHE_NOHASHDIR
-- CCACHE_COMPILERCHECK
 - CMAKE_GENERATOR
 - CMAKE_C_COMPILER_LAUNCHER
 - CMAKE_CXX_COMPILER_LAUNCHER
 - CMAKE_CUDA_COMPILER_LAUNCHER
+- SCCACHE_S3_KEY_PREFIX=libcudf-aarch64 # [aarch64]
+- SCCACHE_S3_KEY_PREFIX=libcudf-linux64 # [linux64]
+- SCCACHE_BUCKET=rapids-sccache
+- SCCACHE_REGION=us-west-2
+- SCCACHE_IDLE_TIMEOUT=32768
 run_exports:
 - {{ pin_subpackage("libcudf", max_pin="x.x") }}
 
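
The SCCACHE_* entries whitelist S3 cache settings into the conda build environment (the per-arch key prefixes keep aarch64 and linux64 object caches separate). A sketch of the equivalent shell configuration, assuming AWS credentials are already present in the build environment:

    export SCCACHE_BUCKET="rapids-sccache"          # shared S3 bucket backing the cache
    export SCCACHE_REGION="us-west-2"
    export SCCACHE_S3_KEY_PREFIX="libcudf-linux64"  # one key space per architecture
    export SCCACHE_IDLE_TIMEOUT=32768               # seconds the server stays alive while idle
    sccache --start-server || true                  # harmless if a server is already running
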
27 changes: 21 additions & 6 deletions cpp/benchmarks/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -56,8 +56,9 @@ add_custom_command(
 function(ConfigureBench CMAKE_BENCH_NAME)
 add_executable(${CMAKE_BENCH_NAME} ${ARGN})
 set_target_properties(
-${CMAKE_BENCH_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY
-"$<BUILD_INTERFACE:${CUDF_BINARY_DIR}/benchmarks>"
+${CMAKE_BENCH_NAME}
+PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUDF_BINARY_DIR}/benchmarks>"
+INSTALL_RPATH "\$ORIGIN/../../../lib"
 )
 target_link_libraries(
 ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main
@@ -69,19 +70,33 @@ function(ConfigureBench CMAKE_BENCH_NAME)
 APPEND
 COMMENT "Adding ${CMAKE_BENCH_NAME}"
 )
+
+install(
+TARGETS ${CMAKE_BENCH_NAME}
+COMPONENT testing
+DESTINATION bin/benchmarks/libcudf
+EXCLUDE_FROM_ALL
+)
 endfunction()
 
 # This function takes in a benchmark name and benchmark source for nvbench benchmarks and handles
 # setting all of the associated properties and linking to build the benchmark
 function(ConfigureNVBench CMAKE_BENCH_NAME)
 add_executable(${CMAKE_BENCH_NAME} ${ARGN})
 set_target_properties(
-${CMAKE_BENCH_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY
-"$<BUILD_INTERFACE:${CUDF_BINARY_DIR}/benchmarks>"
+${CMAKE_BENCH_NAME}
+PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUDF_BINARY_DIR}/benchmarks>"
+INSTALL_RPATH "\$ORIGIN/../../../lib"
 )
 target_link_libraries(
 ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen nvbench::main
 )
+install(
+TARGETS ${CMAKE_BENCH_NAME}
+COMPONENT testing
+DESTINATION bin/benchmarks/libcudf
+EXCLUDE_FROM_ALL
+)
 endfunction()
 
 # ##################################################################################################
@@ -123,7 +138,7 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp
 
 # ##################################################################################################
 # * stream_compaction benchmark -------------------------------------------------------------------
-ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates.cpp)
+ConfigureNVBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates.cpp)
 
 # ##################################################################################################
 # * join benchmark --------------------------------------------------------------------------------
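
Because the new install rules combine EXCLUDE_FROM_ALL with COMPONENT testing, a plain `cmake --install` skips the benchmarks; they are only installed when that component is requested explicitly, landing in bin/benchmarks/libcudf, where the \$ORIGIN/../../../lib rpath resolves to the lib directory under the same prefix. A sketch with an illustrative prefix:

    cmake --build build
    cmake --install build --component testing --prefix /opt/cudf
    ls /opt/cudf/bin/benchmarks/libcudf   # benchmark executables land here
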
27 changes: 14 additions & 13 deletions cpp/benchmarks/common/generate_input.cpp
@@ -65,45 +65,46 @@ T get_distribution_mean(distribution_params<T> const& dist)
 }
 
 // Utilities to determine the mean size of an element, given the data profile
-template <typename T>
-std::enable_if_t<cudf::is_fixed_width<T>(), size_t> avg_element_size(data_profile const& profile)
+template <typename T, CUDF_ENABLE_IF(cudf::is_fixed_width<T>())>
+size_t non_fixed_width_size(data_profile const& profile)
 {
-return sizeof(T);
+CUDF_FAIL("Should not be called, use `size_of` for this type instead");
 }
 
-template <typename T>
-std::enable_if_t<!cudf::is_fixed_width<T>(), size_t> avg_element_size(data_profile const& profile)
+template <typename T, CUDF_ENABLE_IF(!cudf::is_fixed_width<T>())>
+size_t non_fixed_width_size(data_profile const& profile)
 {
 CUDF_FAIL("not implemented!");
 }
 
 template <>
-size_t avg_element_size<cudf::string_view>(data_profile const& profile)
+size_t non_fixed_width_size<cudf::string_view>(data_profile const& profile)
 {
 auto const dist = profile.get_distribution_params<cudf::string_view>().length_params;
 return get_distribution_mean(dist);
 }
 
 template <>
-size_t avg_element_size<cudf::list_view>(data_profile const& profile)
+size_t non_fixed_width_size<cudf::list_view>(data_profile const& profile)
 {
 auto const dist_params = profile.get_distribution_params<cudf::list_view>();
 auto const single_level_mean = get_distribution_mean(dist_params.length_params);
 auto const element_size = cudf::size_of(cudf::data_type{dist_params.element_type});
 return element_size * pow(single_level_mean, dist_params.max_depth);
 }
 
-struct avg_element_size_fn {
+struct non_fixed_width_size_fn {
 template <typename T>
 size_t operator()(data_profile const& profile)
 {
-return avg_element_size<T>(profile);
+return non_fixed_width_size<T>(profile);
 }
 };
 
-size_t avg_element_bytes(data_profile const& profile, cudf::type_id tid)
+size_t avg_element_size(data_profile const& profile, cudf::data_type dtype)
 {
-return cudf::type_dispatcher(cudf::data_type(tid), avg_element_size_fn{}, profile);
+if (cudf::is_fixed_width(dtype)) { return cudf::size_of(dtype); }
+return cudf::type_dispatcher(dtype, non_fixed_width_size_fn{}, profile);
 }
 
 /**
@@ -419,7 +420,7 @@ std::unique_ptr<cudf::column> create_random_column<cudf::string_view>(data_profi
 random_value_fn<uint32_t>{profile.get_distribution_params<cudf::string_view>().length_params};
 auto valid_dist = std::bernoulli_distribution{1. - profile.get_null_frequency()};
 
-auto const avg_string_len = avg_element_size<cudf::string_view>(profile);
+auto const avg_string_len = non_fixed_width_size<cudf::string_view>(profile);
 auto const cardinality = std::min(profile.get_cardinality(), num_rows);
 string_column_data samples(cardinality, cardinality * avg_string_len);
 for (cudf::size_type si = 0; si < cardinality; ++si) {
@@ -593,7 +594,7 @@ std::unique_ptr<cudf::table> create_random_table(std::vector<cudf::type_id> cons
 auto const out_dtype_ids = repeat_dtypes(dtype_ids, num_cols);
 size_t const avg_row_bytes =
 std::accumulate(out_dtype_ids.begin(), out_dtype_ids.end(), 0ul, [&](size_t sum, auto tid) {
-return sum + avg_element_bytes(profile, tid);
+return sum + avg_element_size(profile, cudf::data_type(tid));
 });
 cudf::size_type const num_rows = table_bytes.size / avg_row_bytes;
 
17 changes: 8 additions & 9 deletions cpp/benchmarks/io/csv/csv_reader.cpp
@@ -43,9 +43,7 @@ void BM_csv_read_varying_input(benchmark::State& state)
 
 cuio_source_sink_pair source_sink(source_type);
 cudf_io::csv_writer_options options =
-cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view)
-.include_header(true)
-.rows_per_chunk(1 << 14); // TODO: remove once default is sensible
+cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view).include_header(true);
 cudf_io::write_csv(options);
 
 cudf_io::csv_reader_options const read_options =
@@ -59,6 +57,7 @@ void BM_csv_read_varying_input(benchmark::State& state)
 
 state.SetBytesProcessed(data_size * state.iterations());
 state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
+state.counters["encoded_file_size"] = source_sink.size();
 }
 
 void BM_csv_read_varying_options(benchmark::State& state)
@@ -79,23 +78,22 @@ void BM_csv_read_varying_options(benchmark::State& state)
 auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
 auto const view = tbl->view();
 
-std::vector<char> csv_data;
+cuio_source_sink_pair source_sink(io_type::HOST_BUFFER);
 cudf_io::csv_writer_options options =
-cudf_io::csv_writer_options::builder(cudf_io::sink_info{&csv_data}, view)
+cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view)
 .include_header(true)
-.line_terminator("\r\n")
-.rows_per_chunk(1 << 14); // TODO: remove once default is sensible
+.line_terminator("\r\n");
 cudf_io::write_csv(options);
 
 cudf_io::csv_reader_options read_options =
-cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_data.data(), csv_data.size()})
+cudf_io::csv_reader_options::builder(source_sink.make_source_info())
 .use_cols_indexes(cols_to_read)
 .thousands('\'')
 .windowslinetermination(true)
 .comment('#')
 .prefix("BM_");
 
-size_t const chunk_size = csv_data.size() / num_chunks;
+size_t const chunk_size = source_sink.size() / num_chunks;
 cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks;
 auto mem_stats_logger = cudf::memory_stats_logger();
 for (auto _ : state) {
@@ -132,6 +130,7 @@ void BM_csv_read_varying_options(benchmark::State& state)
 auto const data_processed = data_size * cols_to_read.size() / view.num_columns();
 state.SetBytesProcessed(data_processed * state.iterations());
 state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
+state.counters["encoded_file_size"] = source_sink.size();
 }
 
 #define CSV_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \
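
The new encoded_file_size counter is reported by Google Benchmark alongside peak_memory_usage in each benchmark's output. A sketch of inspecting it, with an illustrative binary name:

    ./CSV_READER_BENCH --benchmark_filter='csv_read' --benchmark_counters_tabular=true
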
10 changes: 5 additions & 5 deletions cpp/benchmarks/io/csv/csv_writer.cpp
@@ -46,14 +46,13 @@ void BM_csv_write_varying_inout(benchmark::State& state)
 for (auto _ : state) {
 cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
 cudf_io::csv_writer_options options =
-cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view)
-.include_header(true)
-.rows_per_chunk(1 << 14); // TODO: remove once default is sensible
+cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view).include_header(true);
 cudf_io::write_csv(options);
 }
 
 state.SetBytesProcessed(data_size * state.iterations());
 state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
+state.counters["encoded_file_size"] = source_sink.size();
 }
 
 void BM_csv_write_varying_options(benchmark::State& state)
@@ -71,12 +70,12 @@ void BM_csv_write_varying_options(benchmark::State& state)
 auto const view = tbl->view();
 
 std::string const na_per(na_per_len, '#');
-std::vector<char> csv_data;
+cuio_source_sink_pair source_sink(io_type::HOST_BUFFER);
 auto mem_stats_logger = cudf::memory_stats_logger();
 for (auto _ : state) {
 cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
 cudf_io::csv_writer_options options =
-cudf_io::csv_writer_options::builder(cudf_io::sink_info{&csv_data}, view)
+cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view)
 .include_header(true)
 .na_rep(na_per)
 .rows_per_chunk(rows_per_chunk);
@@ -85,6 +84,7 @@ void BM_csv_write_varying_options(benchmark::State& state)
 
 state.SetBytesProcessed(data_size * state.iterations());
 state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
+state.counters["encoded_file_size"] = source_sink.size();
 }
 
 #define CSV_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \
(The remaining changed files in this commit are not shown.)
