Merge branch 'branch-0.19' of github.com:rapidsai/cudf into groupby_span_uvector
karthikeyann committed Mar 25, 2021
2 parents d2f6b3b + f1f1d0f commit 7bf0331
Showing 67 changed files with 1,868 additions and 778 deletions.
10 changes: 5 additions & 5 deletions .github/CODEOWNERS
@@ -14,9 +14,9 @@ python/dask_cudf/ @rapidsai/cudf-dask-codeowners
java/ @rapidsai/cudf-java-codeowners

#build/ops code owners
-.github/ @rapidsai/ops-codeowners
-/ci/ @rapidsai/ops-codeowners
+.github/ @rapidsai/ops-codeowners
+/ci/ @rapidsai/ops-codeowners
conda/ @rapidsai/ops-codeowners
-**/Dockerfile @rapidsai/ops-codeowners
-**/.dockerignore @rapidsai/ops-codeowners
-docker/ @rapidsai/ops-codeowners
+/Dockerfile @rapidsai/ops-codeowners
+/.dockerignore @rapidsai/ops-codeowners
+/docker/ @rapidsai/ops-codeowners
11 changes: 11 additions & 0 deletions README.md
@@ -4,6 +4,17 @@

**NOTE:** For the latest stable [README.md](https://github.com/rapidsai/cudf/blob/main/README.md) ensure you are on the `main` branch.

## Resources

- [cuDF Reference Documentation](https://docs.rapids.ai/api/cudf/stable/): Python API reference, tutorials, and topic guides.
- [libcudf Reference Documentation](https://docs.rapids.ai/api/libcudf/stable/): C/C++ CUDA library API reference.
- [Getting Started](https://rapids.ai/start.html): Instructions for installing cuDF.
- [RAPIDS Community](https://rapids.ai/community.html): Get help, contribute, and collaborate.
- [GitHub repository](https://github.com/rapidsai/cudf): Download the cuDF source code.
- [Issue tracker](https://github.com/rapidsai/cudf/issues): Report issues or request features.

## Overview

Built based on the [Apache Arrow](http://arrow.apache.org/) columnar memory format, cuDF is a GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data.

cuDF provides a pandas-like API that will be familiar to data engineers & data scientists, so they can use it to easily accelerate their workflows without going into the details of CUDA programming.
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
@@ -177,6 +177,7 @@ ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cu)
ConfigureBench(TEXT_BENCH
text/normalize_benchmark.cpp
text/normalize_spaces_benchmark.cpp
+text/tokenize_benchmark.cpp
text/subword_benchmark.cpp)

###################################################################################################
92 changes: 92 additions & 0 deletions cpp/benchmarks/text/tokenize_benchmark.cpp
@@ -0,0 +1,92 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/string/string_bench_args.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <nvtext/ngrams_tokenize.hpp>
#include <nvtext/tokenize.hpp>

class TextTokenize : public cudf::benchmark {
};

enum class tokenize_type { single, multi, count, count_multi, ngrams };

static void BM_tokenize(benchmark::State& state, tokenize_type tt)
{
  auto const n_rows         = static_cast<cudf::size_type>(state.range(0));
  auto const max_str_length = static_cast<cudf::size_type>(state.range(1));
  data_profile table_profile;
  table_profile.set_distribution_params(
    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
  auto const table =
    create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
  cudf::strings_column_view input(table->view().column(0));
  cudf::test::strings_column_wrapper delimiters({" ", "+", "-"});

  for (auto _ : state) {
    cuda_event_timer raii(state, true, 0);
    switch (tt) {
      case tokenize_type::single: nvtext::tokenize(input); break;
      case tokenize_type::multi:
        nvtext::tokenize(input, cudf::strings_column_view(delimiters));
        break;
      case tokenize_type::count: nvtext::count_tokens(input); break;
      case tokenize_type::count_multi:
        nvtext::count_tokens(input, cudf::strings_column_view(delimiters));
        break;
      case tokenize_type::ngrams:
        // default is bigrams
        nvtext::ngrams_tokenize(input);
        break;
    }
  }

  state.SetBytesProcessed(state.iterations() * input.chars_size());
}

static void generate_bench_args(benchmark::internal::Benchmark* b)
{
  int const min_rows   = 1 << 12;
  int const max_rows   = 1 << 24;
  int const row_mult   = 8;
  int const min_rowlen = 1 << 5;
  int const max_rowlen = 1 << 13;
  int const len_mult   = 4;
  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
}

#define NVTEXT_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(TextTokenize, name) \
(::benchmark::State & st) { BM_tokenize(st, tokenize_type::name); } \
BENCHMARK_REGISTER_F(TextTokenize, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

NVTEXT_BENCHMARK_DEFINE(single)
NVTEXT_BENCHMARK_DEFINE(multi)
NVTEXT_BENCHMARK_DEFINE(count)
NVTEXT_BENCHMARK_DEFINE(count_multi)
NVTEXT_BENCHMARK_DEFINE(ngrams)
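
Note (not part of this commit): generate_string_bench_args is declared in benchmarks/string/string_bench_args.hpp, which is not shown in this diff. As a rough sketch under that assumption, it sweeps the (row count, row length) grid via google-benchmark's Args, which is where state.range(0) and state.range(1) above come from; the real helper may also skip oversized combinations.

#include <benchmark/benchmark.h>

// Hypothetical stand-in for the helper declared in string_bench_args.hpp.
inline void generate_string_bench_args(benchmark::internal::Benchmark* b,
                                       int min_rows, int max_rows, int row_mult,
                                       int min_rowlen, int max_rowlen, int len_mult)
{
  for (int rows = min_rows; rows <= max_rows; rows *= row_mult) {
    for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) {
      // One benchmark case per pair: state.range(0) = rows, state.range(1) = rowlen.
      b->Args({rows, rowlen});
    }
  }
}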
4 changes: 0 additions & 4 deletions cpp/cmake/Modules/ConfigureCUDA.cmake
@@ -29,10 +29,6 @@ enable_language(CUDA)

if(CMAKE_COMPILER_IS_GNUCXX)
list(APPEND CUDF_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations)
-  if(CUDF_BUILD_TESTS OR CUDF_BUILD_BENCHMARKS)
-    # Suppress parentheses warning which causes gmock to fail
-    list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-Wno-parentheses)
-  endif()
endif()

list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr)
1 change: 0 additions & 1 deletion cpp/include/cudf/column/column_factories.hpp
@@ -21,7 +21,6 @@
#include <cudf/utilities/traits.hpp>

#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_vector.hpp>

namespace cudf {
/**
18 changes: 8 additions & 10 deletions cpp/include/cudf/concatenate.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -17,9 +17,9 @@

#include <cudf/column/column_view.hpp>
#include <cudf/table/table_view.hpp>
+#include <cudf/utilities/span.hpp>

#include <memory>
-#include <vector>

namespace cudf {
/**
@@ -36,13 +36,13 @@ namespace cudf {
*
* Returns empty `device_buffer` if the column is not nullable
*
- * @param views Vector of column views whose bitmask will to be concatenated
+ * @param views host_span of column views whose bitmask will to be concatenated
* @param mr Device memory resource used for allocating the new device_buffer
* @return rmm::device_buffer A `device_buffer` containing the bitmasks of all
* the column views in the views vector
*/
rmm::device_buffer concatenate_masks(
-  std::vector<column_view> const& views,
+  host_span<column_view const> views,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
@@ -51,14 +51,13 @@ rmm::device_buffer concatenate_masks(
* @throws cudf::logic_error
* If types of the input columns mismatch
*
- * @param columns_to_concat The column views to be concatenated into a single
- * column
+ * @param columns_to_concat host_span of column views to be concatenated into a single column
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return Unique pointer to a single table having all the rows from the
* elements of `columns_to_concat` respectively in the same order.
*/
std::unique_ptr<column> concatenate(
-  std::vector<column_view> const& columns_to_concat,
+  host_span<column_view const> columns_to_concat,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
@@ -82,14 +81,13 @@ std::unique_ptr<column> concatenate(
* @throws cudf::logic_error
* If number of columns mismatch
*
- * @param tables_to_concat The table views to be concatenated into a single
- * table
+ * @param tables_to_concat host_span of table views to be concatenated into a single table
* @param mr Device memory resource used to allocate the returned table's device memory.
* @return Unique pointer to a single table having all the rows from the
* elements of `tables_to_concat` respectively in the same order.
*/
std::unique_ptr<table> concatenate(
-  std::vector<table_view> const& tables_to_concat,
+  host_span<table_view const> tables_to_concat,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
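
Note (not part of this diff): existing callers that hold a std::vector should not need source changes, assuming cudf::host_span keeps its implicit converting constructor from standard containers. A minimal sketch of a call site against the new overloads:

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/concatenate.hpp>

#include <memory>
#include <vector>

// The vector binds to host_span<column_view const> at the call boundary,
// so the signature change is transparent to this caller.
std::unique_ptr<cudf::column> concat_all(std::vector<cudf::column_view> const& views)
{
  return cudf::concatenate(views);
}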
9 changes: 5 additions & 4 deletions cpp/include/cudf/detail/concatenate.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
#include <cudf/concatenate.hpp>
#include <cudf/detail/concatenate.hpp>
#include <cudf/table/table_view.hpp>
+#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

@@ -34,8 +35,8 @@ namespace detail {
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-void concatenate_masks(rmm::device_vector<column_device_view> const& d_views,
-                       rmm::device_vector<size_t> const& d_offsets,
+void concatenate_masks(device_span<column_device_view const> d_views,
+                       device_span<size_t const> d_offsets,
bitmask_type* dest_mask,
size_type output_size,
rmm::cuda_stream_view stream);
@@ -45,7 +46,7 @@ void concatenate_masks(rmm::device_vector<column_device_view> const& d_views,
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-void concatenate_masks(std::vector<column_view> const& views,
+void concatenate_masks(host_span<column_view const> views,
bitmask_type* dest_mask,
rmm::cuda_stream_view stream);

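
Note (not part of this diff): on the device side the detail API now accepts device_span instead of rmm::device_vector, matching the device_uvector migration this branch (groupby_span_uvector) is part of. A sketch, assuming cudf::device_span can view an rmm::device_uvector:

#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

// A caller-owned device_uvector can be viewed as a device_span, so no copy
// into an rmm::device_vector is needed before calling the detail API.
void build_offsets(rmm::cuda_stream_view stream)
{
  rmm::device_uvector<size_t> d_offsets(8, stream);
  cudf::device_span<size_t const> offsets_view{d_offsets};
  (void)offsets_view;  // pass offsets_view to detail::concatenate_masks(...)
}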
11 changes: 6 additions & 5 deletions cpp/include/cudf/detail/concatenate.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
#include <cudf/column/column_view.hpp>
#include <cudf/concatenate.hpp>
#include <cudf/table/table_view.hpp>
+#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

@@ -27,22 +28,22 @@ namespace cudf {
//! Inner interfaces and implementations
namespace detail {
/**
- * @copydoc cudf::concatenate(std::vector<column_view> const&,rmm::mr::device_memory_resource*)
+ * @copydoc cudf::concatenate(host_span<column_view const>,rmm::mr::device_memory_resource*)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> concatenate(
-  std::vector<column_view> const& columns_to_concat,
+  host_span<column_view const> columns_to_concat,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
- * @copydoc cudf::concatenate(std::vector<table_view> const&,rmm::mr::device_memory_resource*)
+ * @copydoc cudf::concatenate(host_span<table_view const>,rmm::mr::device_memory_resource*)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<table> concatenate(
-  std::vector<table_view> const& tables_to_concat,
+  host_span<table_view const> tables_to_concat,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

49 changes: 45 additions & 4 deletions cpp/include/cudf/detail/utilities/hash_functions.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@

#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/utilities/assert.cuh>
+#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/strings/string_view.cuh>
#include <hash/hash_constants.hpp>

@@ -570,9 +571,7 @@ struct SparkMurmurHash3_32 {
template <typename T, std::enable_if_t<std::is_floating_point<T>::value>* = nullptr>
hash_value_type CUDA_DEVICE_CALLABLE compute_floating_point(T const& key) const
{
-    if (key == T{0.0}) {
-      return compute(T{0.0});
-    } else if (isnan(key)) {
+    if (isnan(key)) {
      T nan = std::numeric_limits<T>::quiet_NaN();
      return compute(nan);
    } else {
@@ -630,6 +629,48 @@ hash_value_type CUDA_DEVICE_CALLABLE SparkMurmurHash3_32<bool>::operator()(bool
return this->compute<uint32_t>(key);
}

template <>
hash_value_type CUDA_DEVICE_CALLABLE
SparkMurmurHash3_32<int8_t>::operator()(int8_t const& key) const
{
return this->compute<uint32_t>(key);
}

template <>
hash_value_type CUDA_DEVICE_CALLABLE
SparkMurmurHash3_32<uint8_t>::operator()(uint8_t const& key) const
{
return this->compute<uint32_t>(key);
}

template <>
hash_value_type CUDA_DEVICE_CALLABLE
SparkMurmurHash3_32<int16_t>::operator()(int16_t const& key) const
{
return this->compute<uint32_t>(key);
}

template <>
hash_value_type CUDA_DEVICE_CALLABLE
SparkMurmurHash3_32<uint16_t>::operator()(uint16_t const& key) const
{
return this->compute<uint32_t>(key);
}

template <>
hash_value_type CUDA_DEVICE_CALLABLE
SparkMurmurHash3_32<numeric::decimal32>::operator()(numeric::decimal32 const& key) const
{
return this->compute<uint64_t>(key.value());
}

template <>
hash_value_type CUDA_DEVICE_CALLABLE
SparkMurmurHash3_32<numeric::decimal64>::operator()(numeric::decimal64 const& key) const
{
return this->compute<uint64_t>(key.value());
}

/**
* @brief Specialization of MurmurHash3_32 operator for strings.
*/
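
Note (not part of this diff): the added specializations all follow one shape: promote narrow integer keys to 32 bits before hashing, and hash decimal keys through their underlying integer representation. This appears to mirror how Spark widens bytes and shorts to ints and hashes small decimals via their unscaled value. Below is a simplified, host-side sketch of that widen-then-hash pattern (the real functor is the device-side SparkMurmurHash3_32 above):

#include <cstdint>
#include <cstring>

struct widen_then_hash {
  // Generic path: hash the key's raw bytes (toy stand-in for MurmurHash3_32).
  template <typename T>
  uint32_t operator()(T const& key) const { return compute(key); }

  // Narrow integers are promoted to a 32-bit value first, as in the
  // int8_t/int16_t specializations added above.
  uint32_t operator()(int8_t key) const { return compute(static_cast<uint32_t>(key)); }
  uint32_t operator()(int16_t key) const { return compute(static_cast<uint32_t>(key)); }

 private:
  template <typename T>
  uint32_t compute(T const& key) const
  {
    unsigned char bytes[sizeof(T)];
    std::memcpy(bytes, &key, sizeof(T));
    uint32_t h = 0;
    for (unsigned char b : bytes) h = h * 31u + b;  // placeholder mixing
    return h;
  }
};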
5 changes: 3 additions & 2 deletions cpp/include/cudf/dictionary/detail/concatenate.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@

#include <cudf/column/column.hpp>
#include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

@@ -36,7 +37,7 @@ namespace detail {
* @return New column with concatenated results.
*/
std::unique_ptr<column> concatenate(
-  std::vector<column_view> const& columns,
+  host_span<column_view const> columns,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
