Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-22.06' into mwilson/cuio-chunked-stats-p2
Browse files Browse the repository at this point in the history
  • Loading branch information
hyperbolic2346 committed May 5, 2022
2 parents 0cba0a6 + 14b5169 commit 3447424
Show file tree
Hide file tree
Showing 251 changed files with 8,467 additions and 4,073 deletions.
11 changes: 11 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION.

repos:
- repo: https://github.com/PyCQA/isort
rev: 5.6.4
Expand Down Expand Up @@ -56,6 +58,15 @@ repos:
hooks:
- id: pydocstyle
args: ["--config=python/.flake8"]
exclude: |
(?x)^(
ci|
cpp|
conda|
docs|
java|
notebooks
)
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v11.1.0
hooks:
Expand Down
359 changes: 171 additions & 188 deletions CONTRIBUTING.md

Large diffs are not rendered by default.

19 changes: 17 additions & 2 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -112,28 +112,39 @@ function buildLibCudfJniInDocker {
local localMavenRepo=${LOCAL_MAVEN_REPO:-"$HOME/.m2/repository"}
local workspaceRepoDir="$workspaceDir/cudf"
local workspaceMavenRepoDir="$workspaceDir/.m2/repository"
local workspaceCcacheDir="$workspaceDir/.ccache"
mkdir -p "$CUDF_JAR_JAVA_BUILD_DIR/libcudf-cmake-build"
mkdir -p "$HOME/.ccache" "$HOME/.m2"
nvidia-docker build \
-f java/ci/Dockerfile.centos7 \
--build-arg CUDA_VERSION=${cudaVersion} \
-t $imageName .
nvidia-docker run -it -u $(id -u):$(id -g) --rm \
-e PARALLEL_LEVEL \
-e CCACHE_DISABLE \
-e CCACHE_DIR="$workspaceCcacheDir" \
-v "/etc/group:/etc/group:ro" \
-v "/etc/passwd:/etc/passwd:ro" \
-v "/etc/shadow:/etc/shadow:ro" \
-v "/etc/sudoers.d:/etc/sudoers.d:ro" \
-v "$HOME/.ccache:$workspaceCcacheDir:rw" \
-v "$REPODIR:$workspaceRepoDir:rw" \
-v "$localMavenRepo:$workspaceMavenRepoDir:rw" \
--workdir "$workspaceRepoDir/java/target/libcudf-cmake-build" \
${imageName} \
scl enable devtoolset-9 \
"cmake $workspaceRepoDir/cpp \
-G${CMAKE_GENERATOR} \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_LINKER_LAUNCHER=ccache \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DCUDA_STATIC_RUNTIME=ON \
-DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES} \
-DCMAKE_INSTALL_PREFIX==/usr/local/rapids \
-DUSE_NVTX=ON -DCUDF_USE_ARROW_STATIC=ON \
-DCMAKE_INSTALL_PREFIX=/usr/local/rapids \
-DUSE_NVTX=ON \
-DCUDF_USE_ARROW_STATIC=ON \
-DCUDF_ENABLE_ARROW_S3=OFF \
-DBUILD_TESTS=OFF \
-DPER_THREAD_DEFAULT_STREAM=ON \
Expand All @@ -145,6 +156,10 @@ function buildLibCudfJniInDocker {
-Dmaven.repo.local=$workspaceMavenRepoDir \
-DskipTests=${SKIP_TESTS:-false} \
-Dparallel.level=${PARALLEL_LEVEL} \
-Dcmake.ccache.opts='-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_LINKER_LAUNCHER=ccache' \
-DCUDF_CPP_BUILD_DIR=$workspaceRepoDir/java/target/libcudf-cmake-build \
-DCUDA_STATIC_RUNTIME=ON \
-DPER_THREAD_DEFAULT_STREAM=ON \
Expand Down
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ outputs:
- test -f $PREFIX/include/cudf/detail/calendrical_month_sequence.cuh
- test -f $PREFIX/include/cudf/detail/concatenate.hpp
- test -f $PREFIX/include/cudf/detail/copy.hpp
- test -f $PREFIX/include/cudf/detail/copy.cuh
- test -f $PREFIX/include/cudf/detail/datetime.hpp
- test -f $PREFIX/include/cudf/detail/fill.hpp
- test -f $PREFIX/include/cudf/detail/gather.hpp
Expand Down
5 changes: 4 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,6 @@ add_library(
src/ast/expression_parser.cpp
src/ast/expressions.cpp
src/binaryop/binaryop.cpp
src/binaryop/compiled/binary_ops.cu
src/binaryop/compiled/Add.cu
src/binaryop/compiled/ATan2.cu
src/binaryop/compiled/BitwiseAnd.cu
Expand Down Expand Up @@ -220,6 +219,7 @@ add_library(
src/binaryop/compiled/ShiftRightUnsigned.cu
src/binaryop/compiled/Sub.cu
src/binaryop/compiled/TrueDiv.cu
src/binaryop/compiled/binary_ops.cu
src/binaryop/compiled/util.cpp
src/labeling/label_bins.cu
src/bitmask/null_mask.cu
Expand All @@ -238,6 +238,7 @@ add_library(
src/copying/gather.cu
src/copying/get_element.cu
src/copying/pack.cpp
src/copying/purge_nonempty_nulls.cu
src/copying/reverse.cu
src/copying/sample.cu
src/copying/scatter.cu
Expand Down Expand Up @@ -302,6 +303,8 @@ add_library(
src/io/comp/cpu_unbz2.cpp
src/io/comp/debrotli.cu
src/io/comp/gpuinflate.cu
src/io/comp/nvcomp_adapter.cpp
src/io/comp/nvcomp_adapter.cu
src/io/comp/snap.cu
src/io/comp/uncomp.cpp
src/io/comp/unsnap.cu
Expand Down
12 changes: 10 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -194,10 +194,18 @@ ConfigureBench(FILL_BENCH filling/repeat.cpp)
# ##################################################################################################
# * groupby benchmark -----------------------------------------------------------------------------
ConfigureBench(
GROUPBY_BENCH groupby/group_sum.cu groupby/group_nth.cu groupby/group_shift.cu
groupby/group_struct.cu groupby/group_no_requests.cu groupby/group_scan.cu
GROUPBY_BENCH
groupby/group_sum.cu
groupby/group_nth.cu
groupby/group_shift.cu
groupby/group_struct.cu
groupby/group_no_requests.cu
groupby/group_scan.cu
groupby/group_rank_benchmark.cu
)

ConfigureNVBench(GROUPBY_NVBENCH groupby/group_rank_benchmark.cu)

# ##################################################################################################
# * hashing benchmark -----------------------------------------------------------------------------
ConfigureBench(HASHING_BENCH hashing/hash.cpp hashing/partition.cpp)
Expand Down
109 changes: 109 additions & 0 deletions cpp/benchmarks/groupby/group_rank_benchmark.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/rmm_pool_raii.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/groupby.hpp>
#include <cudf/sorting.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <nvbench/nvbench.cuh>

/**
 * @brief Benchmarks groupby rank-scan aggregations for a given `cudf::rank_method`.
 *
 * Builds a two-column random INT64 table (column 0 = group keys drawn uniformly
 * from [0, num_groups], column 1 = values to rank), optionally pre-sorts it, and
 * times `groupby::scan` with a rank aggregation.
 *
 * nvbench axes:
 *  - "data_size":  number of rows in the input table
 *  - "is_sorted":  0 = unsorted input, 1 = input pre-sorted (groupby told sorted::YES)
 */
template <cudf::rank_method method>
static void nvbench_groupby_rank(nvbench::state& state,
                                 nvbench::type_list<nvbench::enum_type<method>>)
{
  using namespace cudf;
  using type = int64_t;
  // Derive the type id from the `type` alias so the two cannot drift apart.
  constexpr auto dtype = type_to_id<type>();
  cudf::rmm_pool_raii pool_raii;

  bool const is_sorted              = state.get_int64("is_sorted");
  cudf::size_type const column_size = state.get_int64("data_size");
  constexpr int num_groups          = 100;

  data_profile profile;
  profile.set_null_frequency(std::nullopt);  // no nulls in the generated data
  profile.set_cardinality(0);
  profile.set_distribution_params<type>(dtype, distribution_id::UNIFORM, 0, num_groups);

  auto source_table = create_random_table({dtype, dtype}, row_count{column_size}, profile);

  // values to be pre-sorted too for groupby rank
  if (is_sorted) source_table = cudf::sort(*source_table);

  table_view keys{{source_table->view().column(0)}};
  column_view order_by{source_table->view().column(1)};

  auto agg = cudf::make_rank_aggregation<groupby_scan_aggregation>(method);
  std::vector<groupby::scan_request> requests;
  requests.emplace_back(groupby::scan_request());
  requests[0].values = order_by;
  requests[0].aggregations.push_back(std::move(agg));

  groupby::groupby gb_obj(keys, null_policy::EXCLUDE, is_sorted ? sorted::YES : sorted::NO);

  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
    // groupby scan uses sort implementation. `scan` exposes no stream
    // parameter here, so the launch stream is intentionally unused and we rely
    // on exec_tag::sync for correct timing.
    auto result = gb_obj.scan(requests);
  });
}

// Map each benchmarked rank method to short and fully-qualified display strings
// for nvbench's type axis.
NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
  cudf::rank_method,
  [](cudf::rank_method value) {
    switch (value) {
      case cudf::rank_method::FIRST: return "FIRST";
      case cudf::rank_method::AVERAGE: return "AVERAGE";
      case cudf::rank_method::MIN: return "MIN";
      case cudf::rank_method::MAX: return "MAX";
      case cudf::rank_method::DENSE: return "DENSE";
      default: return "unknown";
    }
  },
  [](cudf::rank_method value) {
    switch (value) {
      case cudf::rank_method::FIRST: return "cudf::rank_method::FIRST";
      case cudf::rank_method::AVERAGE: return "cudf::rank_method::AVERAGE";
      case cudf::rank_method::MIN: return "cudf::rank_method::MIN";
      case cudf::rank_method::MAX: return "cudf::rank_method::MAX";
      case cudf::rank_method::DENSE: return "cudf::rank_method::DENSE";
      default: return "unknown";
    }
  })

using methods = nvbench::enum_type_list<cudf::rank_method::AVERAGE,
                                        cudf::rank_method::DENSE,
                                        cudf::rank_method::FIRST,
                                        cudf::rank_method::MAX,
                                        cudf::rank_method::MIN>;

NVBENCH_BENCH_TYPES(nvbench_groupby_rank, NVBENCH_TYPE_AXES(methods))
  .set_type_axes_names({"rank_method"})
  .set_name("groupby_rank")
  .add_int64_axis("data_size",
                  {
                    1000000,    // 1M
                    10000000,   // 10M
                    100000000,  // 100M
                  })

  .add_int64_axis("is_sorted", {0, 1});
3 changes: 3 additions & 0 deletions cpp/benchmarks/io/cuio_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include <benchmarks/io/cuio_common.hpp>

#include <cstdio>
#include <fstream>
#include <numeric>
#include <string>
Expand Down Expand Up @@ -145,6 +146,8 @@ std::vector<cudf::size_type> segments_in_chunk(int num_segments, int num_chunks,
// Executes the command and returns stderr output
std::string exec_cmd(std::string_view cmd)
{
// Prevent the output from the command from mixing with the original process' output
std::fflush(nullptr);
// Switch stderr and stdout to only capture stderr
auto const redirected_cmd = std::string{"( "}.append(cmd).append(" 3>&2 2>&1 1>&3) 2>/dev/null");
std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(redirected_cmd.c_str(), "r"), pclose);
Expand Down
2 changes: 1 addition & 1 deletion cpp/benchmarks/reduction/segment_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ std::pair<std::unique_ptr<column>, thrust::device_vector<size_type>> make_test_d

thrust::device_vector<size_type> d_offsets(offset_it, offset_it + num_segments + 1);

return std::make_pair(std::move((input->release())[0]), d_offsets);
return std::pair(std::move((input->release())[0]), d_offsets);
}

template <typename InputType, typename OutputType, aggregation::Kind kind>
Expand Down
41 changes: 41 additions & 0 deletions cpp/benchmarks/stream_compaction/distinct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include <cudf/column/column_view.hpp>
#include <cudf/detail/stream_compaction.hpp>
#include <cudf/lists/list_view.cuh>
#include <cudf/types.hpp>

#include <nvbench/nvbench.cuh>
Expand Down Expand Up @@ -55,3 +56,43 @@ NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type))
.set_name("distinct")
.set_type_axes_names({"Type"})
.add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});

// Benchmarks cudf::detail::distinct() on a single random column of `Type`,
// comparing a LIST column against a non-nested column with a comparable number
// of distinct rows.
//
// nvbench axes:
//  - "ColumnSize":     target column size in bytes (table_size_bytes)
//  - "null_frequency": fraction of null rows in the generated column
template <typename Type>
void nvbench_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
{
  // RAII helper that installs an RMM pool memory resource for this run.
  cudf::rmm_pool_raii pool_raii;

  auto const size             = state.get_int64("ColumnSize");
  auto const dtype            = cudf::type_to_id<Type>();
  double const null_frequency = state.get_float64("null_frequency");

  data_profile table_data_profile;
  if (dtype == cudf::type_id::LIST) {
    // Lists: element count and INT32 child values both drawn uniformly from
    // [0, 4], nesting depth fixed at 1.
    table_data_profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, 4);
    table_data_profile.set_distribution_params(
      cudf::type_id::INT32, distribution_id::UNIFORM, 0, 4);
    table_data_profile.set_list_depth(1);
  } else {
    // We're comparing distinct() on a non-nested column to that on a list column with the same
    // number of distinct rows. The max list size is 4 and the number of distinct values in the
    // list's child is 5. So the number of distinct rows in the list = 1 + 5 + 5^2 + 5^3 + 5^4 = 781
    // We want this column to also have 781 distinct values.
    table_data_profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, 781);
  }
  table_data_profile.set_null_frequency(null_frequency);

  // Single-column table sized by bytes (not rows); fixed seed 0 for
  // reproducibility across runs.
  auto const table = create_random_table(
    {dtype}, table_size_bytes{static_cast<size_t>(size)}, table_data_profile, 0);

  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    rmm::cuda_stream_view stream_view{launch.get_stream()};
    // Key column index {0}; nulls compare equal.
    auto result = cudf::detail::distinct(*table, {0}, cudf::null_equality::EQUAL, stream_view);
  });
}

NVBENCH_BENCH_TYPES(nvbench_distinct_list,
                    NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, cudf::list_view>))
  .set_name("distinct_list")
  .set_type_axes_names({"Type"})
  .add_float64_axis("null_frequency", {0.0, 0.1})
  .add_int64_axis("ColumnSize", {100'000'000});
24 changes: 19 additions & 5 deletions cpp/benchmarks/text/subword.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf_test/column_wrapper.hpp>

Expand Down Expand Up @@ -53,9 +54,9 @@ static std::string create_hash_vocab_file()
return hash_file;
}

static void BM_cuda_tokenizer_cudf(benchmark::State& state)
static void BM_subword_tokenizer(benchmark::State& state)
{
uint32_t nrows = 1000;
auto const nrows = static_cast<cudf::size_type>(state.range(0));
std::vector<const char*> h_strings(nrows, "This is a test ");
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
std::string hash_file = create_hash_vocab_file();
Expand All @@ -67,6 +68,7 @@ static void BM_cuda_tokenizer_cudf(benchmark::State& state)
//
auto vocab = nvtext::load_vocabulary_file(hash_file);
for (auto _ : state) {
cuda_event_timer raii(state, true);
auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
*vocab,
max_sequence_length,
Expand All @@ -76,6 +78,18 @@ static void BM_cuda_tokenizer_cudf(benchmark::State& state)
MAX_ROWS_TENSOR);
}
}
BENCHMARK(BM_cuda_tokenizer_cudf);

BENCHMARK_MAIN();
class Subword : public cudf::benchmark {
};

#define SUBWORD_BM_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(Subword, name)(::benchmark::State & state) { BM_subword_tokenizer(state); } \
BENCHMARK_REGISTER_F(Subword, name) \
->RangeMultiplier(2) \
->Range(1 << 10, 1 << 17) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

SUBWORD_BM_BENCHMARK_DEFINE(BM_subword_tokenizer);

// BENCHMARK_MAIN();
4 changes: 2 additions & 2 deletions cpp/docs/DEVELOPER_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,7 @@ The preferred style for how inputs are passed in and outputs are returned is the

Sometimes it is necessary for functions to have multiple outputs. There are a few ways this can be
done in C++ (including creating a `struct` for the output). One convenient way to do this is
using `std::tie` and `std::make_pair`. Note that objects passed to `std::make_pair` will invoke
using `std::tie` and `std::pair`. Note that objects passed to `std::pair` will invoke
either the copy constructor or the move constructor of the object, and it may be preferable to move
non-trivially copyable objects (and required for types with deleted copy constructors, like
`std::unique_ptr`).
Expand All @@ -585,7 +585,7 @@ std::pair<table, table> return_two_tables(void){
// Do stuff with out0, out1

// Return a std::pair of the two outputs
return std::make_pair(std::move(out0), std::move(out1));
return std::pair(std::move(out0), std::move(out1));
}

cudf::table out0;
Expand Down
Loading

0 comments on commit 3447424

Please sign in to comment.