Skip to content

Commit

Permalink
Add shallow hash function and shallow equality comparison for column_…
Browse files Browse the repository at this point in the history
…view (#9185)

Fixes #9140 
Added `shallow_hash(column_view)`
Added unit tests

It computes a hash value from the shallow state of a `column_view`:
type, size, data pointer, null_mask pointer, offset, and the hash values of the children.
`null_count` is not used since it is a cached value: it depends on the contents of `null_mask` and may or may not have been pre-computed.

Fixes #9139
Added `is_shallow_equivalent(column_view, column_view)` ~shallow_equal~
Added unit tests

It compares two `column_view`s based on their shallow state:
type, size, data pointer, null_mask pointer, offset, and the `column_view`s of the children.
`null_count` is not used since it is a cached value: it depends on the contents of `null_mask` and may or may not have been pre-computed.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Jake Hemstad (https://github.com/jrhemstad)
  - David Wendt (https://github.com/davidwendt)

URL: #9185
  • Loading branch information
karthikeyann authored Sep 22, 2021
1 parent 10fd071 commit 1cb527f
Show file tree
Hide file tree
Showing 7 changed files with 599 additions and 0 deletions.
41 changes: 41 additions & 0 deletions cpp/include/cudf/column/column_view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -633,4 +633,45 @@ column_view bit_cast(column_view const& input, data_type type);
*/
mutable_column_view bit_cast(mutable_column_view const& input, data_type type);

namespace detail {
/**
* @brief Computes a hash value from the shallow state of the specified column
*
* The shallow state consists of the column's type, size, head (data) pointer, null mask pointer,
* offset, and, recursively, the shallow state of its children. The null count is not part of the
* hash. For an empty column (or any descendant of an empty column) only the type and the child
* hierarchy contribute to the hash.
*
* For any two columns, if `is_shallow_equivalent(c0,c1)` then `shallow_hash(c0) ==
* shallow_hash(c1)`.
*
* The complexity of computing the hash value of `input` is `O( count_descendants(input) )`, i.e.,
* it is independent of the number of elements in the column.
*
* This function does _not_ inspect the elements of `input` nor access any device memory or launch
* any kernels.
*
* @param input The `column_view` whose shallow hash is computed
* @return The hash value derived from the shallow state of `input`.
*/
std::size_t shallow_hash(column_view const& input);

/**
* @brief Uses only shallow state to determine if two `column_view`s view equivalent columns
*
* Two columns are considered equivalent if, for any operation `F`:
* ```
* is_shallow_equivalent(c0, c1) ==> The results of F(c0) and F(c1) are equivalent
* ```
* For any two non-empty columns, `is_shallow_equivalent(c0,c1)` is true only if they view the exact
* same physical column. In other words, two physically independent columns may have exactly
* equivalent elements but their shallow state would not be equivalent.
*
* The shallow state that is compared is the type, size, head (data) pointer, null mask pointer,
* offset, and, recursively, the children. The null count is not compared. When both columns are
* empty, only the types and the child hierarchies are compared.
*
* The complexity of this function is `O( min(count_descendants(lhs), count_descendants(rhs)) )`,
* i.e., it is independent of the number of elements in either column.
*
* This function does _not_ inspect the elements of `lhs` or `rhs` nor access any device memory nor
* launch any kernels.
*
* @param lhs The left `column_view` to compare
* @param rhs The right `column_view` to compare
* @return `true` if `lhs` and `rhs` have equivalent shallow state, `false` otherwise
*/
bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs);
} // namespace detail
} // namespace cudf
36 changes: 36 additions & 0 deletions cpp/include/cudf/detail/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@

#include <rmm/cuda_stream_view.hpp>

#include <cstddef>
#include <functional>

namespace cudf {
namespace detail {

Expand Down Expand Up @@ -53,5 +56,38 @@ std::unique_ptr<column> serial_murmur_hash3_32(
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/* Copyright 2005-2014 Daniel James.
*
* Use, modification and distribution is subject to the Boost Software
* License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
*/
/**
* @brief Combines two hashed values into a single hashed value.
*
* Adapted from Boost hash_combine function, modified for 64-bit
* https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html
*
* @param lhs The first hashed value
* @param rhs The second hashed value
* @return Combined hash value
*/
constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs)
{
  // Boost-style mixing step: `rhs`, the 64-bit constant and two shifted copies of `lhs`
  // are summed, and the sum is XOR-ed into `lhs` to form the combined value.
  return lhs ^ (rhs + 0x9e3779b97f4a7c15 + (lhs << 6) + (lhs >> 2));
}
} // namespace detail
} // namespace cudf

// specialization of std::hash for cudf::data_type
namespace std {
template <>
struct hash<cudf::data_type> {
  /**
   * @brief Hashes a `cudf::data_type` from its type id and its scale.
   *
   * @param type The data type to hash
   * @return `hash_combine` of the hashed type id and the hashed scale
   */
  std::size_t operator()(cudf::data_type const& type) const noexcept
  {
    std::hash<int32_t> const int_hasher{};
    // The type id is cast to a 32-bit integer before hashing; the scale is hashed as-is.
    auto const id_hash    = int_hasher(static_cast<int32_t>(type.id()));
    auto const scale_hash = int_hasher(type.scale());
    return cudf::detail::hash_combine(id_hash, scale_hash);
  }
};
} // namespace std
12 changes: 12 additions & 0 deletions cpp/include/cudf/detail/utilities/hash_functions.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,12 @@ struct MurmurHash3_32 {
return h;
}

/* Copyright 2005-2014 Daniel James.
*
* Use, modification and distribution is subject to the Boost Software
* License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
*/
/**
* @brief Combines two hash values into a new single hash value. Called
* repeatedly to create a hash value from several variables.
Expand Down Expand Up @@ -795,6 +801,12 @@ struct IdentityHash {
IdentityHash() = default;
constexpr IdentityHash(uint32_t seed) : m_seed(seed) {}

/* Copyright 2005-2014 Daniel James.
*
* Use, modification and distribution is subject to the Boost Software
* License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
*/
/**
* @brief Combines two hash values into a new single hash value. Called
* repeatedly to create a hash value from several variables.
Expand Down
12 changes: 12 additions & 0 deletions cpp/include/cudf_test/type_lists.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,18 @@ using FixedWidthTypesWithoutChrono = Concat<NumericTypes, FixedPointTypes>;
*/
using ComparableTypes = Concat<NumericTypes, ChronoTypes, StringTypes>;

/**
* @brief Provides a list of all compound types for use in GTest typed tests.
*
* The list contains the string, dictionary, list, and struct element types
* (`cudf::string_view`, `cudf::dictionary32`, `cudf::list_view`, `cudf::struct_view`).
*
* Example:
* ```
* // Invokes all typed fixture tests for all compound types in libcudf
* TYPED_TEST_CASE(MyTypedFixture, cudf::test::CompoundTypes);
* ```
*/
using CompoundTypes =
cudf::test::Types<cudf::string_view, cudf::dictionary32, cudf::list_view, cudf::struct_view>;

/**
* @brief Provides a list of all types supported in libcudf for use in a GTest
* typed test.
Expand Down
55 changes: 55 additions & 0 deletions cpp/src/column/column_view.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,15 @@
*/

#include <cudf/column/column_view.hpp>
#include <cudf/detail/hashing.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/traits.hpp>

#include <thrust/iterator/transform_iterator.h>

#include <algorithm>
#include <exception>
#include <numeric>
#include <vector>
Expand Down Expand Up @@ -76,6 +78,59 @@ size_type column_view_base::null_count(size_type begin, size_type end) const
? 0
: cudf::count_unset_bits(null_mask(), offset() + begin, offset() + end);
}

// Wrapper around a single hash value whose `operator^` applies `hash_combine`, so that a
// parameter pack of hashes can be reduced with a fold expression over `^`.
struct HashValue {
  std::size_t hash;  // the wrapped hash value

  explicit HashValue(std::size_t value) : hash{value} {}

  // Not a bitwise XOR: merges the two wrapped hashes via `hash_combine(hash, rhs.hash)`.
  HashValue operator^(HashValue const& rhs) const
  {
    auto const combined = hash_combine(hash, rhs.hash);
    return HashValue{combined};
  }
};

/**
 * @brief Hashes every argument with `std::hash` and combines the results from left to right.
 *
 * Each argument `t` of type `T` is hashed with `std::hash<T>`; the individual hashes are wrapped
 * in `HashValue` and reduced with a left fold over `HashValue::operator^`.
 *
 * Arguments are taken by const reference so that `Ts` is always deduced as the plain value type.
 * With a forwarding-reference pack (`Ts&&...`) an lvalue argument deduces `Ts` as `T&`, and
 * `std::hash<T&>` is not an enabled specialization, so such a call would not compile.
 *
 * The function is intentionally not `constexpr`: neither `std::hash<T>::operator()` nor the
 * `HashValue` constructor is `constexpr`, so no instantiation could be a constant expression.
 *
 * At least one argument is required (a unary fold over `^` is ill-formed for an empty pack).
 *
 * @tparam Ts Types of the values to hash; each must have an enabled `std::hash` specialization
 * @param ts Values to hash
 * @return The combined hash value of all arguments
 */
template <typename... Ts>
std::size_t hash(Ts const&... ts)
{
  return (... ^ HashValue(std::hash<Ts>{}(ts))).hash;
}

/**
 * @brief Recursively hashes the shallow state of `c` and of all its descendants.
 *
 * When `c` is empty, or any ancestor was empty, only the type (together with a constant `0`)
 * is hashed for this node; otherwise the type, size, head pointer, null mask pointer and offset
 * are hashed. The hash of every child is then combined into the result in child order.
 *
 * @param c The column view to hash
 * @param is_parent_empty Whether an ancestor of `c` is empty
 * @return The combined shallow hash of `c` and its descendants
 */
std::size_t shallow_hash_impl(column_view const& c, bool is_parent_empty = false)
{
  // The same flag selects this node's hash and is handed down to every child.
  bool const treat_as_empty = is_parent_empty or c.is_empty();

  std::size_t result = treat_as_empty
                         ? hash(c.type(), 0)
                         : hash(c.type(), c.size(), c.head(), c.null_mask(), c.offset());

  for (auto child = c.child_begin(); child != c.child_end(); ++child) {
    result = hash_combine(result, shallow_hash_impl(*child, treat_as_empty));
  }
  return result;
}

// Entry point: hashes `input` as a root column, i.e. with no empty ancestor.
std::size_t shallow_hash(column_view const& input)
{
  return shallow_hash_impl(input, /*is_parent_empty=*/false);
}

/**
 * @brief Recursively compares the shallow state of `lhs` and `rhs` and of their descendants.
 *
 * The types must always match. Size, head pointer, null mask pointer and offset are compared
 * only when neither an ancestor pair nor the current pair is (both) empty. The children are then
 * compared pairwise with the same rule; differing child counts make the views non-equivalent.
 *
 * @param lhs The left column view
 * @param rhs The right column view
 * @param is_parent_empty Whether an ancestor pair was already found to be empty
 * @return `true` if the shallow states are equivalent
 */
bool shallow_equivalent_impl(column_view const& lhs,
                             column_view const& rhs,
                             bool is_parent_empty = false)
{
  // When set, per-node state (size/pointers/offset) is ignored for this node and below.
  bool const ignore_state = is_parent_empty or (lhs.is_empty() and rhs.is_empty());

  if (not(lhs.type() == rhs.type())) { return false; }

  if (not ignore_state) {
    bool const same_state = (lhs.size() == rhs.size()) and (lhs.head() == rhs.head()) and
                            (lhs.null_mask() == rhs.null_mask()) and
                            (lhs.offset() == rhs.offset());
    if (not same_state) { return false; }
  }

  auto const children_match = [ignore_state](auto const& lhs_child, auto const& rhs_child) {
    return shallow_equivalent_impl(lhs_child, rhs_child, ignore_state);
  };
  return std::equal(
    lhs.child_begin(), lhs.child_end(), rhs.child_begin(), rhs.child_end(), children_match);
}
// Entry point: compares `lhs` and `rhs` as root columns, i.e. with no empty ancestor.
bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs)
{
  return shallow_equivalent_impl(lhs, rhs, /*is_parent_empty=*/false);
}
} // namespace detail

// Immutable view constructor
Expand Down
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ endfunction()
# - column tests ----------------------------------------------------------------------------------
ConfigureTest(COLUMN_TEST
column/bit_cast_test.cpp
column/column_view_shallow_test.cpp
column/column_test.cu
column/column_device_view_test.cu
column/compound_test.cu)
Expand Down
Loading

0 comments on commit 1cb527f

Please sign in to comment.