Skip to content

Commit

Permalink
Merge branch 'branch-0.19' into fix-spark-hash
Browse files Browse the repository at this point in the history
  • Loading branch information
jlowe committed Mar 24, 2021
2 parents b42b6c5 + 267d29b commit e070822
Show file tree
Hide file tree
Showing 113 changed files with 4,963 additions and 856 deletions.
10 changes: 5 additions & 5 deletions .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ python/dask_cudf/ @rapidsai/cudf-dask-codeowners
java/ @rapidsai/cudf-java-codeowners

#build/ops code owners
.github/ @rapidsai/ops-codeowners
/ci/ @rapidsai/ops-codeowners
.github/ @rapidsai/ops-codeowners
/ci/ @rapidsai/ops-codeowners
conda/ @rapidsai/ops-codeowners
**/Dockerfile @rapidsai/ops-codeowners
**/.dockerignore @rapidsai/ops-codeowners
docker/ @rapidsai/ops-codeowners
/Dockerfile @rapidsai/ops-codeowners
/.dockerignore @rapidsai/ops-codeowners
/docker/ @rapidsai/ops-codeowners
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,17 @@

**NOTE:** For the latest stable [README.md](https://github.com/rapidsai/cudf/blob/main/README.md) ensure you are on the `main` branch.

## Resources

- [cuDF Reference Documentation](https://docs.rapids.ai/api/cudf/stable/): Python API reference, tutorials, and topic guides.
- [libcudf Reference Documentation](https://docs.rapids.ai/api/libcudf/stable/): C/C++ CUDA library API reference.
- [Getting Started](https://rapids.ai/start.html): Instructions for installing cuDF.
- [RAPIDS Community](https://rapids.ai/community.html): Get help, contribute, and collaborate.
- [GitHub repository](https://github.com/rapidsai/cudf): Download the cuDF source code.
- [Issue tracker](https://github.com/rapidsai/cudf/issues): Report issues or request features.

## Overview

Built based on the [Apache Arrow](http://arrow.apache.org/) columnar memory format, cuDF is a GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data.

cuDF provides a pandas-like API that will be familiar to data engineers & data scientists, so they can use it to easily accelerate their workflows without going into the details of CUDA programming.
Expand Down
1 change: 1 addition & 0 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ fi
# Build libcudf_kafka library
if hasArg libcudf_kafka; then
cmake -S $REPODIR/cpp/libcudf_kafka -B ${KAFKA_LIB_BUILD_DIR} \
${CUDF_CMAKE_CUDA_ARCHITECTURES} \
-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE}

Expand Down
3 changes: 3 additions & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ test:
- test -f $PREFIX/include/cudf/ast/linearizer.hpp
- test -f $PREFIX/include/cudf/ast/operators.hpp
- test -f $PREFIX/include/cudf/binaryop.hpp
- test -f $PREFIX/include/cudf/labeling/label_bins.hpp
- test -f $PREFIX/include/cudf/column/column_factories.hpp
- test -f $PREFIX/include/cudf/column/column.hpp
- test -f $PREFIX/include/cudf/column/column_view.hpp
Expand All @@ -66,6 +67,7 @@ test:
- test -f $PREFIX/include/cudf/datetime.hpp
- test -f $PREFIX/include/cudf/detail/aggregation/aggregation.hpp
- test -f $PREFIX/include/cudf/detail/aggregation/result_cache.hpp
- test -f $PREFIX/include/cudf/detail/label_bins.hpp
- test -f $PREFIX/include/cudf/detail/binaryop.hpp
- test -f $PREFIX/include/cudf/detail/concatenate.hpp
- test -f $PREFIX/include/cudf/detail/copy.hpp
Expand Down Expand Up @@ -132,6 +134,7 @@ test:
- test -f $PREFIX/include/cudf/join.hpp
- test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp
- test -f $PREFIX/include/cudf/lists/detail/copying.hpp
- test -f $PREFIX/include/cudf/lists/detail/drop_list_duplicates.hpp
- test -f $PREFIX/include/cudf/lists/detail/sorting.hpp
- test -f $PREFIX/include/cudf/lists/count_elements.hpp
- test -f $PREFIX/include/cudf/lists/explode.hpp
Expand Down
28 changes: 22 additions & 6 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#=============================================================================
# Copyright (c) 2018-2020, NVIDIA CORPORATION.
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -156,6 +156,7 @@ add_library(cudf
src/binaryop/jit/code/kernel.cpp
src/binaryop/jit/code/operation.cpp
src/binaryop/jit/code/traits.cpp
src/labeling/label_bins.cu
src/bitmask/null_mask.cu
src/column/column.cu
src/column/column_device_view.cu
Expand Down Expand Up @@ -194,7 +195,7 @@ add_library(cudf
src/groupby/hash/groupby.cu
src/groupby/sort/group_argmax.cu
src/groupby/sort/group_argmin.cu
src/groupby/sort/groupby.cu
src/groupby/sort/aggregate.cpp
src/groupby/sort/group_collect.cu
src/groupby/sort/group_count.cu
src/groupby/sort/group_max.cu
Expand All @@ -204,6 +205,11 @@ add_library(cudf
src/groupby/sort/group_quantiles.cu
src/groupby/sort/group_std.cu
src/groupby/sort/group_sum.cu
src/groupby/sort/scan.cpp
src/groupby/sort/group_count_scan.cu
src/groupby/sort/group_max_scan.cu
src/groupby/sort/group_min_scan.cu
src/groupby/sort/group_sum_scan.cu
src/groupby/sort/sort_helper.cu
src/hash/hashing.cu
src/interop/dlpack.cpp
Expand Down Expand Up @@ -410,7 +416,7 @@ target_compile_options(cudf

target_compile_definitions(cudf
PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_DEFINITIONS}>"
"$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_DEFINITIONS}>"
"$<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_DEFINITIONS}>>"
)

# Disable Jitify log printing. See https://github.com/NVIDIA/jitify/issues/79
Expand Down Expand Up @@ -505,10 +511,11 @@ add_library(cudftestutil STATIC

target_compile_options(cudftestutil
PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>"
"$<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>>"
)

target_compile_features(cudftestutil PUBLIC cxx_std_14 cuda_std_14)
target_compile_features(cudftestutil
PUBLIC cxx_std_14 $<BUILD_INTERFACE:cuda_std_14>)

target_link_libraries(cudftestutil
PUBLIC GTest::gmock
Expand All @@ -522,7 +529,7 @@ target_include_directories(cudftestutil

install(TARGETS cudftestutil
DESTINATION lib
EXPORT cudf-targets)
EXPORT cudf-testing-targets)

add_library(cudf::cudftestutil ALIAS cudftestutil)

Expand Down Expand Up @@ -600,6 +607,11 @@ install(EXPORT cudf-targets
NAMESPACE cudf::
DESTINATION "${INSTALL_CONFIGDIR}")

install(EXPORT cudf-testing-targets
FILE cudf-testing-targets.cmake
NAMESPACE cudf::
DESTINATION "${INSTALL_CONFIGDIR}")

################################################################################################
# - build export -------------------------------------------------------------------------------
configure_package_config_file(cmake/cudf-build-config.cmake.in ${CUDF_BINARY_DIR}/cudf-config.cmake
Expand Down Expand Up @@ -628,6 +640,10 @@ export(EXPORT cudf-targets
FILE ${CUDF_BINARY_DIR}/cudf-targets.cmake
NAMESPACE cudf::)

export(EXPORT cudf-testing-targets
FILE ${CUDF_BINARY_DIR}/cudf-testing-targets.cmake
NAMESPACE cudf::)


###################################################################################################
# - make documentation ----------------------------------------------------------------------------
Expand Down
10 changes: 8 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,12 @@ ConfigureBench(AST_BENCH ast/transform_benchmark.cpp)
ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cu)

###################################################################################################
# - subword tokenizer benchmark -------------------------------------------------------------------
ConfigureBench(SUBWORD_TOKENIZER_BENCH text/subword_benchmark.cpp)
# - nvtext benchmark -------------------------------------------------------------------
ConfigureBench(TEXT_BENCH
text/normalize_benchmark.cpp
text/normalize_spaces_benchmark.cpp
text/tokenize_benchmark.cpp
text/subword_benchmark.cpp)

###################################################################################################
# - strings benchmark -------------------------------------------------------------------
Expand All @@ -191,6 +195,8 @@ ConfigureBench(STRINGS_BENCH
string/filter_benchmark.cpp
string/find_benchmark.cpp
string/replace_benchmark.cpp
string/replace_re_benchmark.cpp
string/split_benchmark.cpp
string/substring_benchmark.cpp
string/translate_benchmark.cpp
string/url_decode_benchmark.cpp)
84 changes: 84 additions & 0 deletions cpp/benchmarks/string/replace_re_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "string_bench_args.hpp"

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/strings/replace_re.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

// Fixture type for the regex-replace benchmarks in this file; it adds nothing
// of its own on top of cudf::benchmark and is the fixture named in
// BENCHMARK_DEFINE_F / BENCHMARK_REGISTER_F further down.
class StringReplace : public cudf::benchmark {
};

// Selects which replace call BM_replace issues on each iteration. The
// enumerator names are also used verbatim as the benchmark names by
// STRINGS_BENCHMARK_DEFINE, so renaming one renames the registered benchmark.
enum replace_type { replace_re, replace_re_multi, replace_backref };

/**
 * @brief Times one of the regex replace calls on a randomly generated strings column.
 *
 * `state.range(0)` is used as the number of rows and `state.range(1)` as the
 * upper distribution parameter for the generated STRING column.
 *
 * @param state Benchmark state providing the range arguments and the iteration loop
 * @param rt    Which replace call to issue on every iteration
 */
static void BM_replace(benchmark::State& state, replace_type rt)
{
  auto const num_rows = static_cast<cudf::size_type>(state.range(0));
  auto const max_len  = static_cast<cudf::size_type>(state.range(1));

  // Single random STRING column with a NORMAL distribution between 0 and max_len.
  data_profile profile;
  profile.set_distribution_params(cudf::type_id::STRING, distribution_id::NORMAL, 0, max_len);
  auto const random_table =
    create_random_table({cudf::type_id::STRING}, 1, row_count{num_rows}, profile);

  cudf::strings_column_view source(random_table->view().column(0));

  // Replacement strings for the multi-pattern variant: one per pattern below.
  cudf::test::strings_column_wrapper replacements({"#", ""});

  for (auto _ : state) {
    cuda_event_timer raii(state, true, 0);

    if (rt == replace_type::replace_re) {
      // Single pattern (runs of digits); no replacement argument is passed.
      cudf::strings::replace_re(source, "\\d+");
    } else if (rt == replace_type::replace_re_multi) {
      // Two patterns (digit runs, whitespace runs) paired with `replacements`.
      cudf::strings::replace_re(
        source, {"\\d+", "\\s+"}, cudf::strings_column_view(replacements));
    } else if (rt == replace_type::replace_backref) {
      // Digit runs captured as group 1 and re-emitted inside "#...X".
      cudf::strings::replace_with_backrefs(source, "(\\d+)", "#\\1X");
    }
  }

  state.SetBytesProcessed(state.iterations() * source.chars_size());
}

/**
 * @brief Supplies the (row count, row length) argument ranges for the replace benchmarks.
 *
 * Forwards fixed minimum / maximum / multiplier values for both dimensions to
 * generate_string_bench_args.
 *
 * @param b Benchmark instance handed to generate_string_bench_args
 */
static void generate_bench_args(benchmark::internal::Benchmark* b)
{
  // Row-count range: 4096 up to 16777216, multiplier 8.
  constexpr int rows_lo   = 1 << 12;
  constexpr int rows_hi   = 1 << 24;
  constexpr int rows_step = 8;

  // Row-length range: 32 up to 8192, multiplier 4.
  constexpr int len_lo   = 1 << 5;
  constexpr int len_hi   = 1 << 13;
  constexpr int len_step = 4;

  generate_string_bench_args(b, rows_lo, rows_hi, rows_step, len_lo, len_hi, len_step);
}

// Defines a benchmark named `name` on the StringReplace fixture whose body
// calls BM_replace(st, name), then registers it with the argument ranges from
// generate_bench_args, manual timing, and millisecond reporting units.
// `name` is passed as BM_replace's replace_type argument, so it must be one of
// the replace_type enumerators.
#define STRINGS_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(StringReplace, name) \
(::benchmark::State & st) { BM_replace(st, name); } \
BENCHMARK_REGISTER_F(StringReplace, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// One registered benchmark per replace_type enumerator.
STRINGS_BENCHMARK_DEFINE(replace_re)
STRINGS_BENCHMARK_DEFINE(replace_re_multi)
STRINGS_BENCHMARK_DEFINE(replace_backref)
85 changes: 85 additions & 0 deletions cpp/benchmarks/string/translate_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "string_bench_args.hpp"

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/strings/strings_column_view.hpp>
#include <cudf/strings/translate.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <algorithm>

#include <thrust/iterator/counting_iterator.h>

// Fixture type for the translate benchmarks in this file; it adds nothing of
// its own on top of cudf::benchmark and is the fixture named in
// BENCHMARK_DEFINE_F / BENCHMARK_REGISTER_F further down.
class StringTranslate : public cudf::benchmark {
};

// One translation-table entry as handed to cudf::strings::translate in
// BM_translate. Presumably (character to match, character to substitute) --
// confirm against the translate API documentation.
using entry_type = std::pair<cudf::char_utf8, cudf::char_utf8>;

/**
 * @brief Times cudf::strings::translate on a randomly generated strings column.
 *
 * `state.range(0)` is used as the number of rows and `state.range(1)` as the
 * upper distribution parameter for the generated STRING column.
 *
 * @param state       Benchmark state providing the range arguments and the iteration loop
 * @param entry_count Number of entries placed in the translation table
 */
static void BM_translate(benchmark::State& state, int entry_count)
{
  auto const num_rows = static_cast<cudf::size_type>(state.range(0));
  auto const max_len  = static_cast<cudf::size_type>(state.range(1));

  // Single random STRING column with a NORMAL distribution between 0 and max_len.
  data_profile profile;
  profile.set_distribution_params(cudf::type_id::STRING, distribution_id::NORMAL, 0, max_len);
  auto const random_table =
    create_random_table({cudf::type_id::STRING}, 1, row_count{num_rows}, profile);

  cudf::strings_column_view source(random_table->view().column(0));

  // Entry i pairs the character i places above '!' with the one i places below '~'.
  std::vector<entry_type> table_entries;
  table_entries.reserve(entry_count);
  for (int idx = 0; idx < entry_count; ++idx) {
    table_entries.push_back(entry_type{'!' + idx, '~' - idx});
  }

  for (auto _ : state) {
    cuda_event_timer raii(state, true, 0);
    cudf::strings::translate(source, table_entries);
  }

  state.SetBytesProcessed(state.iterations() * source.chars_size());
}

/**
 * @brief Supplies the (row count, row length) argument ranges for the translate benchmarks.
 *
 * Forwards fixed minimum / maximum / multiplier values for both dimensions to
 * generate_string_bench_args.
 *
 * @param b Benchmark instance handed to generate_string_bench_args
 */
static void generate_bench_args(benchmark::internal::Benchmark* b)
{
  // Row-count range: 4096 up to 16777216, multiplier 8.
  constexpr int rows_lo   = 1 << 12;
  constexpr int rows_hi   = 1 << 24;
  constexpr int rows_step = 8;

  // Row-length range: 32 up to 8192, multiplier 4.
  constexpr int len_lo   = 1 << 5;
  constexpr int len_hi   = 1 << 13;
  constexpr int len_step = 4;

  generate_string_bench_args(b, rows_lo, rows_hi, rows_step, len_lo, len_hi, len_step);
}

// Defines a benchmark named `name` on the StringTranslate fixture whose body
// calls BM_translate(st, entries), then registers it with the argument ranges
// from generate_bench_args, manual timing, and millisecond reporting units.
// `entries` becomes BM_translate's entry_count, i.e. the translation-table size.
#define STRINGS_BENCHMARK_DEFINE(name, entries) \
BENCHMARK_DEFINE_F(StringTranslate, name) \
(::benchmark::State & st) { BM_translate(st, entries); } \
BENCHMARK_REGISTER_F(StringTranslate, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// Three table sizes: 5, 25 and 50 entries.
STRINGS_BENCHMARK_DEFINE(translate_small, 5)
STRINGS_BENCHMARK_DEFINE(translate_medium, 25)
STRINGS_BENCHMARK_DEFINE(translate_large, 50)
Loading

0 comments on commit e070822

Please sign in to comment.