Skip to content

Commit

Permalink
Merge branch 'branch-22.02' into chunked-partitioned-parq-write
Browse files Browse the repository at this point in the history
  • Loading branch information
devavret committed Jan 12, 2022
2 parents e1d608a + a43682e commit 64aae8d
Show file tree
Hide file tree
Showing 89 changed files with 2,879 additions and 1,063 deletions.
43 changes: 35 additions & 8 deletions build.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

# Copyright (c) 2019-2021, NVIDIA CORPORATION.
# Copyright (c) 2019-2022, NVIDIA CORPORATION.

# cuDF build script

Expand All @@ -17,7 +17,7 @@ ARGS=$*
# script, and that this script resides in the repo dir!
REPODIR=$(cd $(dirname $0); pwd)

VALIDARGS="clean libcudf cudf dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n -l --allgpuarch --disable_nvtx --show_depr_warn --ptds -h"
VALIDARGS="clean libcudf cudf dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n -l --allgpuarch --disable_nvtx --show_depr_warn --ptds -h --build_metrics --incl_cache_stats"
HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [-l] [--cmake-args=\\\"<args>\\\"]
clean - remove all existing build artifacts and configuration (start
over)
Expand All @@ -37,6 +37,8 @@ HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafk
--disable_nvtx - disable inserting NVTX profiling ranges
--show_depr_warn - show cmake deprecation warnings
--ptds - enable per-thread default stream
--build_metrics - generate build metrics report for libcudf
--incl_cache_stats - include cache statistics in build metrics report
--cmake-args=\\\"<args>\\\" - pass arbitrary list of CMake configuration options (escape all quotes in argument)
-h | --h[elp] - print this text
Expand All @@ -61,6 +63,8 @@ BUILD_NVTX=ON
BUILD_TESTS=OFF
BUILD_DISABLE_DEPRECATION_WARNING=ON
BUILD_PER_THREAD_DEFAULT_STREAM=OFF
BUILD_REPORT_METRICS=OFF
BUILD_REPORT_INCL_CACHE_STATS=OFF

# Set defaults for vars that may not have been defined externally
# FIXME: if INSTALL_PREFIX is not set, check PREFIX, then check
Expand Down Expand Up @@ -144,6 +148,14 @@ fi
if hasArg --ptds; then
BUILD_PER_THREAD_DEFAULT_STREAM=ON
fi
if hasArg --build_metrics; then
BUILD_REPORT_METRICS=ON
fi

if hasArg --incl_cache_stats; then
BUILD_REPORT_INCL_CACHE_STATS=ON
fi


# If clean given, run it prior to any other steps
if hasArg clean; then
Expand Down Expand Up @@ -174,8 +186,11 @@ if buildAll || hasArg libcudf; then

# get the current count before the compile starts
FILES_IN_CCACHE=""
if [ -x "$(command -v ccache)" ]; then
# Only query and reset ccache when cache statistics were requested
# (--incl_cache_stats) and a ccache executable is available.
# The spaces around == are required: inside [[ ]], "$VAR"=="ON" is a single
# non-empty word and therefore always evaluates to true.
if [[ "$BUILD_REPORT_INCL_CACHE_STATS" == "ON" && -x "$(command -v ccache)" ]]; then
    # remember the "files in cache" line as it was before the compile
    FILES_IN_CCACHE=$(ccache -s | grep "files in cache")
    echo "$FILES_IN_CCACHE"
    # zero the ccache statistics
    ccache -z
fi

cmake -S $REPODIR/cpp -B ${LIB_BUILD_DIR} \
Expand All @@ -197,12 +212,24 @@ if buildAll || hasArg libcudf; then
compile_total=$(( compile_end - compile_start ))

# Record build times
if [[ -f "${LIB_BUILD_DIR}/.ninja_log" ]]; then
echo "Formatting build times"
# Build-metrics report: generated only when --build_metrics set
# BUILD_REPORT_METRICS=ON and ninja left a .ninja_log in the build directory.
# The spaces around == are required: inside [[ ]], "$VAR"=="ON" is a single
# non-empty word and therefore always evaluates to true.
if [[ "$BUILD_REPORT_METRICS" == "ON" && -f "${LIB_BUILD_DIR}/.ninja_log" ]]; then
    echo "Formatting build metrics"
    python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt xml > ${LIB_BUILD_DIR}/ninja_log.xml
    message="$FILES_IN_CCACHE <p>$PARALLEL_LEVEL parallel build time is $compile_total seconds"
    echo "$message"
    python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$message" > ${LIB_BUILD_DIR}/ninja_log.html
    # MSG accumulates the HTML fragment passed to the report via --msg
    MSG="<p>"
    # get some ccache stats after the compile (only when --incl_cache_stats was given)
    if [[ "$BUILD_REPORT_INCL_CACHE_STATS" == "ON" && -x "$(command -v ccache)" ]]; then
        MSG="${MSG}<br/>$FILES_IN_CCACHE"
        HIT_RATE=$(ccache -s | grep "cache hit rate")
        MSG="${MSG}<br/>${HIT_RATE}"
    fi
    MSG="${MSG}<br/>parallel setting: $PARALLEL_LEVEL"
    MSG="${MSG}<br/>parallel build time: $compile_total seconds"
    # report the shared library size when the library exists in the build directory
    if [[ -f "${LIB_BUILD_DIR}/libcudf.so" ]]; then
        LIBCUDF_FS=$(ls -lh ${LIB_BUILD_DIR}/libcudf.so | awk '{print $5}')
        MSG="${MSG}<br/>libcudf.so size: $LIBCUDF_FS"
    fi
    echo "$MSG"
    python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${LIB_BUILD_DIR}/ninja_log.html
fi

if [[ ${INSTALL_TARGET} != "" ]]; then
Expand Down
2 changes: 1 addition & 1 deletion ci/benchmark/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ conda list --show-channel-urls
################################################################################

logger "Build libcudf..."
if [[ ${BUILD_MODE} == "pull-request" ]]; then
if [[ "${BUILD_MODE}" == "pull-request" ]]; then
"$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests --ptds
else
"$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests -l --ptds
Expand Down
8 changes: 8 additions & 0 deletions ci/cpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,14 @@ if [ "$BUILD_LIBCUDF" == '1' ]; then
mkdir -p ${CONDA_BLD_DIR}/libcudf/work
cp -r ${CONDA_BLD_DIR}/work/* ${CONDA_BLD_DIR}/libcudf/work

# Copy libcudf build metrics results
LIBCUDF_BUILD_DIR=$CONDA_BLD_DIR/libcudf/work/cpp/build
echo "Checking for build metrics log $LIBCUDF_BUILD_DIR/ninja_log.html"
if [[ -f "$LIBCUDF_BUILD_DIR/ninja_log.html" ]]; then
gpuci_logger "Copying build metrics results"
mkdir -p "$WORKSPACE/build-metrics"
cp "$LIBCUDF_BUILD_DIR/ninja_log.html" "$WORKSPACE/build-metrics/BuildMetrics.html"
fi

gpuci_logger "Build conda pkg for libcudf_kafka"
gpuci_conda_retry build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libcudf_kafka $CONDA_BUILD_ARGS
Expand Down
2 changes: 1 addition & 1 deletion ci/cpu/upload.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ export GPUCI_RETRY_SLEEP=30
export LABEL_OPTION=${LABEL_OPTION:-"--label main"}

# Skip uploads unless BUILD_MODE == "branch"
if [ ${BUILD_MODE} != "branch" ]; then
if [ "${BUILD_MODE}" != "branch" ]; then
echo "Skipping upload"
return 0
fi
Expand Down
10 changes: 4 additions & 6 deletions ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then
################################################################################

gpuci_logger "Build from source"
if [[ ${BUILD_MODE} == "pull-request" ]]; then
if [[ "${BUILD_MODE}" == "pull-request" ]]; then
"$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests --ptds
else
"$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests -l --ptds
Expand Down Expand Up @@ -181,12 +181,10 @@ else
done

# Copy libcudf build time results
echo "Checking for build time log $LIB_BUILD_DIR/ninja_log.html"
if [[ -f "$LIB_BUILD_DIR/ninja_log.html" ]]; then
echo "Checking for build time log $LIB_BUILD_DIR/ninja_log.xml"
if [[ -f "$LIB_BUILD_DIR/ninja_log.xml" ]]; then
gpuci_logger "Copying build time results"
cp "$LIB_BUILD_DIR/ninja_log.xml" "$WORKSPACE/test-results/buildtimes-junit.xml"
mkdir -p "$WORKSPACE/build-metrics"
cp "$LIB_BUILD_DIR/ninja_log.html" "$WORKSPACE/build-metrics/BuildMetrics.html"
fi

################################################################################
Expand Down Expand Up @@ -224,7 +222,7 @@ else
install_dask

gpuci_logger "Build python libs from source"
if [[ ${BUILD_MODE} == "pull-request" ]]; then
if [[ "${BUILD_MODE}" == "pull-request" ]]; then
"$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka --ptds
else
"$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka -l --ptds
Expand Down
10 changes: 7 additions & 3 deletions ci/gpu/java.sh
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ conda config --show-sources
conda list --show-channel-urls

gpuci_logger "Install dependencies"
gpuci_conda_retry install -y \
gpuci_mamba_retry install -y \
"cudatoolkit=$CUDA_REL" \
"rapids-build-env=$MINOR_VERSION.*" \
"rapids-notebook-env=$MINOR_VERSION.*" \
Expand All @@ -86,10 +86,14 @@ gpuci_conda_retry install -y \
"ucx-py=${UCX_PY_VERSION}" \
"openjdk=8.*" \
"maven"
# "mamba install openjdk" adds an activation script to set JAVA_HOME but this is
# not triggered on installation. Re-activating the conda environment will set
# this environment variable so that CMake can find JNI.
conda activate rapids

# https://docs.rapids.ai/maintainers/depmgmt/
# gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env
# gpuci_conda_retry install -y "your-pkg=1.0.0"
# gpuci_mamba_retry install -y "your-pkg=1.0.0"


gpuci_logger "Check compiler versions"
Expand Down Expand Up @@ -130,7 +134,7 @@ KAFKA_CONDA_FILE=`basename "$KAFKA_CONDA_FILE" .tar.bz2` #get filename without e
KAFKA_CONDA_FILE=${KAFKA_CONDA_FILE//-/=} #convert to conda install

gpuci_logger "Installing $CUDF_CONDA_FILE & $KAFKA_CONDA_FILE"
conda install -c ${CONDA_ARTIFACT_PATH} "$CUDF_CONDA_FILE" "$KAFKA_CONDA_FILE"
gpuci_mamba_retry install -c ${CONDA_ARTIFACT_PATH} "$CUDF_CONDA_FILE" "$KAFKA_CONDA_FILE"

install_dask

Expand Down
7 changes: 4 additions & 3 deletions conda/recipes/cudf_kafka/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
{% set cuda_version = '.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %}
{% set py_version = environ.get('python', '3.8') %}
{% set py_version = environ.get('PY_VER', '3.8') %}
{% set py_version_numeric = py_version.replace('.', '') %}

package:
name: cudf_kafka
Expand All @@ -14,7 +15,7 @@ source:

build:
number: {{ GIT_DESCRIBE_NUMBER }}
string: py{{ py_version.replace('.', '') }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
string: py{{ py_version_numeric }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
script_env:
- CC
- CXX
Expand All @@ -34,7 +35,7 @@ requirements:
run:
- python {{ py_version }}
- libcudf_kafka {{ version }}
- python-confluent-kafka >=1.7.0,<1.8.0a0=py{{ py_version.replace('.', '') }}*
- python-confluent-kafka >=1.7.0,<1.8.0a0=py{{ py_version_numeric }}*
- cudf {{ version }}

test: # [linux64]
Expand Down
11 changes: 6 additions & 5 deletions conda/recipes/custreamz/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
{% set cuda_version = '.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %}
{% set py_version = environ.get('python', '3.8') %}
{% set py_version = environ.get('PY_VER', '3.8') %}
{% set py_version_numeric = py_version.replace('.', '') %}

package:
name: custreamz
Expand All @@ -14,7 +15,7 @@ source:

build:
number: {{ GIT_DESCRIBE_NUMBER }}
string: py{{ py_version.replace('.', '') }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
string: py{{ py_version_numeric }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
script_env:
- VERSION_SUFFIX
- PARALLEL_LEVEL
Expand All @@ -25,15 +26,15 @@ build:
requirements:
host:
- python {{ py_version }}
- python-confluent-kafka >=1.7.0,<1.8.0a0=py{{ py_version.replace('.', '') }}*
- python-confluent-kafka >=1.7.0,<1.8.0a0=py{{ py_version_numeric }}*
- cudf_kafka {{ version }}
run:
- python {{ py_version }}
- streamz
- streamz
- cudf {{ version }}
- dask>=2021.11.1,<=2021.11.2
- distributed>=2021.11.1,<=2021.11.2
- python-confluent-kafka >=1.7.0,<1.8.0a0=py{{ py_version.replace('.', '') }}*
- python-confluent-kafka >=1.7.0,<1.8.0a0=py{{ py_version_numeric }}*
- cudf_kafka {{ version }}

test: # [linux64]
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/libcudf/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then
# This assumes the script is executed from the root of the repo directory
./build.sh -v libcudf --allgpuarch --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib\"
else
./build.sh -v libcudf tests --allgpuarch --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib\"
./build.sh -v libcudf tests --allgpuarch --build_metrics --incl_cache_stats --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib\"
fi
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ test:
- test -f $PREFIX/include/cudf/lists/explode.hpp
- test -f $PREFIX/include/cudf/lists/drop_list_duplicates.hpp
- test -f $PREFIX/include/cudf/lists/extract.hpp
- test -f $PREFIX/include/cudf/lists/filling.hpp
- test -f $PREFIX/include/cudf/lists/contains.hpp
- test -f $PREFIX/include/cudf/lists/gather.hpp
- test -f $PREFIX/include/cudf/lists/lists_column_view.hpp
Expand Down
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,7 @@ set_target_properties(
)

target_compile_options(
cudftestutil PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
cudftestutil PUBLIC "$<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>>"
"$<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>>"
)

Expand Down
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ ConfigureBench(
groupby/group_shift_benchmark.cu
groupby/group_struct_benchmark.cu
groupby/group_no_requests_benchmark.cu
groupby/group_scan_benchmark.cu
)

# ##################################################################################################
Expand Down
29 changes: 29 additions & 0 deletions cpp/benchmarks/groupby/group_benchmark_common.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <random>

/**
 * @brief Returns a pseudo-random integer drawn uniformly from the closed range [min, max].
 *
 * The engine is seeded once with a fixed value and is shared by all calls of the same
 * instantiation, so successive calls advance one common sequence.
 *
 * @tparam T Integer type of the bounds and of the result
 * @param min Smallest value that may be returned
 * @param max Largest value that may be returned
 * @return A value in [min, max]
 */
template <typename T>
T random_int(T min, T max)
{
  constexpr unsigned seed = 13377331;
  static std::mt19937 engine{seed};
  // Built per call on purpose: a function-local static distribution is initialised
  // only once and would silently keep the bounds of the very first call.
  std::uniform_int_distribution<T> uniform{min, max};

  return uniform(engine);
}
21 changes: 6 additions & 15 deletions cpp/benchmarks/groupby/group_no_requests_benchmark.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,32 +14,23 @@
* limitations under the License.
*/

#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/groupby/group_benchmark_common.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/copying.hpp>
#include <cudf/detail/aggregation/aggregation.hpp>
#include <cudf/groupby.hpp>
#include <cudf/sorting.hpp>
#include <cudf/table/table.hpp>

#include <cudf_test/column_wrapper.hpp>
#include <fixture/benchmark_fixture.hpp>
#include <synchronization/synchronization.hpp>

#include <memory>
#include <random>

// Fixture type named for the groupby benchmarks; it inherits cudf::benchmark
// unchanged (no members or overrides are added here).
class Groupby : public cudf::benchmark {
};

/**
 * @brief Returns a pseudo-random integer drawn uniformly from the closed range [min, max].
 *
 * The engine is seeded once with a fixed value and is shared by all calls of the same
 * instantiation, so successive calls advance one common sequence.
 *
 * @tparam T Integer type of the bounds and of the result
 * @param min Smallest value that may be returned
 * @param max Largest value that may be returned
 * @return A value in [min, max]
 */
template <typename T>
T random_int(T min, T max)
{
  constexpr unsigned seed = 13377331;
  static std::mt19937 engine{seed};
  // The distribution is rebuilt on every call so that each call honours its own
  // bounds; as a function-local static it would keep the first call's min/max forever.
  std::uniform_int_distribution<T> uniform{min, max};

  return uniform(engine);
}

void BM_basic_no_requests(benchmark::State& state)
{
using wrapper = cudf::test::fixed_width_column_wrapper<int64_t>;
Expand Down
Loading

0 comments on commit 64aae8d

Please sign in to comment.