Skip to content

Commit

Permalink
Add row hasher with nested column support (#10641)
Browse files Browse the repository at this point in the history
Contributes to #10186

Authors:
  - Devavret Makkar (https://github.com/devavret)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: #10641
  • Loading branch information
devavret authored Apr 29, 2022
1 parent 15e4982 commit 3c4e72e
Show file tree
Hide file tree
Showing 15 changed files with 880 additions and 114 deletions.
41 changes: 41 additions & 0 deletions cpp/benchmarks/stream_compaction/distinct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include <cudf/column/column_view.hpp>
#include <cudf/detail/stream_compaction.hpp>
#include <cudf/lists/list_view.cuh>
#include <cudf/types.hpp>

#include <nvbench/nvbench.cuh>
Expand Down Expand Up @@ -55,3 +56,43 @@ NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type))
.set_name("distinct")
.set_type_axes_names({"Type"})
.add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});

/**
 * @brief Benchmarks `cudf::detail::distinct` on a single randomly generated column.
 *
 * When `Type` maps to `cudf::type_id::LIST` the column is a depth-1 list whose lengths and INT32
 * child values are both drawn uniformly from [0, 4]. Otherwise the column is a flat column of
 * `Type` whose value range is chosen to give the same number of distinct rows as the list case.
 *
 * @tparam Type Element type of the benchmarked column (taken from the nvbench type axis)
 * @param state nvbench state; the "ColumnSize" and "null_frequency" axes are read from it
 */
template <typename Type>
void nvbench_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
{
  cudf::rmm_pool_raii pool_raii;

  // Note: "ColumnSize" is forwarded as `table_size_bytes`, i.e. it is a byte budget.
  auto const size             = state.get_int64("ColumnSize");
  auto const dtype            = cudf::type_to_id<Type>();
  double const null_frequency = state.get_float64("null_frequency");

  data_profile table_data_profile;
  if (dtype == cudf::type_id::LIST) {
    table_data_profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, 4);
    table_data_profile.set_distribution_params(
      cudf::type_id::INT32, distribution_id::UNIFORM, 0, 4);
    table_data_profile.set_list_depth(1);
  } else {
    // We're comparing distinct() on a non-nested column to that on a list column with the same
    // number of distinct rows. The max list size is 4 and the number of distinct values in the
    // list's child is 5. So the number of distinct rows in the list = 1 + 5 + 5^2 + 5^3 + 5^4 = 781
    // We want this column to also have 781 distinct values. The bounds are inclusive (just as
    // [0, 4] yields 5 values above), so the upper bound must be 780, not 781.
    table_data_profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, 780);
  }
  table_data_profile.set_null_frequency(null_frequency);

  // NOTE(review): the trailing 0 is presumably the random seed -- confirm against
  // create_random_table's signature.
  auto const table = create_random_table(
    {dtype}, table_size_bytes{static_cast<size_t>(size)}, table_data_profile, 0);

  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    rmm::cuda_stream_view stream_view{launch.get_stream()};
    // The result is intentionally discarded; only the call itself is being timed.
    auto result = cudf::detail::distinct(*table, {0}, cudf::null_equality::EQUAL, stream_view);
  });
}

NVBENCH_BENCH_TYPES(nvbench_distinct_list,
NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, cudf::list_view>))
.set_name("distinct_list")
.set_type_axes_names({"Type"})
.add_float64_axis("null_frequency", {0.0, 0.1})
.add_int64_axis("ColumnSize", {100'000'000});
5 changes: 3 additions & 2 deletions cpp/include/cudf/detail/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,19 +33,20 @@ namespace detail {
// Detail-level `hash` declaration: takes the input table, the hash function to apply (default
// `hash_id::HASH_MURMUR3`), a seed (default `cudf::DEFAULT_HASH_SEED`), the CUDA stream and the
// device memory resource, and returns a `std::unique_ptr<column>`.
std::unique_ptr<column> hash(
  table_view const& input,
  hash_id hash_function               = hash_id::HASH_MURMUR3,
  uint32_t seed                       = cudf::DEFAULT_HASH_SEED,
  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

// Declaration only: takes the input table, a seed (default `cudf::DEFAULT_HASH_SEED`), the CUDA
// stream and the device memory resource, and returns a `std::unique_ptr<column>`.
// NOTE(review): presumably produces one 32-bit MurmurHash3 value per input row -- confirm against
// the definition.
std::unique_ptr<column> murmur_hash3_32(
table_view const& input,
uint32_t seed = cudf::DEFAULT_HASH_SEED,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

// Declaration only: parameterized on the per-element hash functor template `hash_function`.
// Takes the input table, a seed (default `cudf::DEFAULT_HASH_SEED`), the CUDA stream and the
// device memory resource, and returns a `std::unique_ptr<column>`.
template <template <typename> class hash_function>
std::unique_ptr<column> serial_murmur_hash3_32(
  table_view const& input,
  uint32_t seed                       = cudf::DEFAULT_HASH_SEED,
  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

Expand Down
8 changes: 4 additions & 4 deletions cpp/include/cudf/detail/iterator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ struct null_replaced_value_accessor {
* @brief validity accessor of column with null bitmask
* A unary functor that returns validity at index `i`.
*
* @tparam safe If false, the accessor will throw a logic_error if the column is not nullable. If
* true, the accessor checks for nullability and if col is not nullable, returns true.
*/
template <bool safe = false>
Expand Down Expand Up @@ -306,12 +306,12 @@ auto make_pair_rep_iterator(column_device_view const& column)
*
* Dereferencing the returned iterator for element `i` will return the validity
* of `column[i]`
* If `safe` = false, the column must be nullable.
* When safe = true, if the column is not nullable then the validity is always true.
*
* @throws cudf::logic_error if the column is not nullable and safe = false
*
* @tparam safe If false, the accessor will throw a logic_error if the column is not nullable. If
* true, the accessor checks for nullability and if col is not nullable, returns true.
* @param column The column to iterate
* @return auto Iterator that returns validities of column elements.
Expand Down
28 changes: 28 additions & 0 deletions cpp/include/cudf/detail/utilities/algorithm.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <utility>

namespace cudf::detail {

/**
 * @brief Device-callable left fold over the range `[first, last)`.
 *
 * Starting from `init`, repeatedly replaces the running value with `op(running, *it)` for each
 * element in order, and returns the final running value. An empty range returns `init` unchanged.
 *
 * @tparam Iterator Iterator type; must support `!=`, pre-increment and dereference
 * @tparam T Type of the running value and of the result
 * @tparam BinaryOp Callable invocable as `op(T, element)` with a result assignable to `T`
 * @param first Iterator to the first element
 * @param last Iterator one past the last element
 * @param init Initial running value
 * @param op Binary operation combining the running value with the next element
 * @return The folded value
 */
template <typename Iterator, typename T, typename BinaryOp>
__device__ __forceinline__ T accumulate(Iterator first, Iterator last, T init, BinaryOp op)
{
  for (; first != last; ++first) {
    // Hand the running value to `op` as an rvalue so it is not copied on every step.
    init = op(std::move(init), *first);
  }
  return init;
}
} // namespace cudf::detail
10 changes: 3 additions & 7 deletions cpp/include/cudf/detail/utilities/column.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,9 @@ struct linked_column_view : public column_view_base {
*/
inline LinkedColVector table_to_linked_columns(table_view const& table)
{
  // View the table's columns through an iterator that wraps each column_view in a
  // shared linked_column_view on dereference.
  auto linked_it = thrust::make_transform_iterator(
    table.begin(), [](auto const& c) { return std::make_shared<linked_column_view>(c); });
  // Build the result directly from the iterator range, one entry per column.
  return LinkedColVector(linked_it, linked_it + table.num_columns());
}

} // namespace cudf::detail
Loading

0 comments on commit 3c4e72e

Please sign in to comment.