Skip to content

Commit

Permalink
Merge branch 'branch-21.10' into fea-update-apply-signature
Browse files Browse the repository at this point in the history
  • Loading branch information
brandon-b-miller committed Sep 22, 2021
2 parents f2bdcbf + 1cb527f commit 9ccae14
Show file tree
Hide file tree
Showing 26 changed files with 911 additions and 153 deletions.
4 changes: 2 additions & 2 deletions ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ function install_dask {
# Install the main version of dask, distributed, and streamz
gpuci_logger "Install the main version of dask, distributed, and streamz"
set -x
pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps
pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps
pip install "git+https://github.com/dask/distributed.git@2021.07.1" --upgrade --no-deps
pip install "git+https://github.com/dask/dask.git@2021.07.1" --upgrade --no-deps
# Need to uninstall streamz that is already in the env.
pip uninstall -y streamz
pip install "git+https://github.com/python-streamz/streamz.git@master" --upgrade --no-deps
Expand Down
4 changes: 2 additions & 2 deletions conda/environments/cudf_dev_cuda11.0.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ dependencies:
- transformers
- pydata-sphinx-theme
- pip:
- git+https://github.com/dask/dask.git@main
- git+https://github.com/dask/distributed.git@main
- git+https://github.com/dask/dask.git@2021.07.1
- git+https://github.com/dask/distributed.git@2021.07.1
- git+https://github.com/python-streamz/streamz.git@master
- pyorc
4 changes: 2 additions & 2 deletions conda/environments/cudf_dev_cuda11.2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ dependencies:
- transformers
- pydata-sphinx-theme
- pip:
- git+https://github.com/dask/dask.git@main
- git+https://github.com/dask/distributed.git@main
- git+https://github.com/dask/dask.git@2021.07.1
- git+https://github.com/dask/distributed.git@2021.07.1
- git+https://github.com/python-streamz/streamz.git@master
- pyorc
41 changes: 41 additions & 0 deletions cpp/include/cudf/column/column_view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -633,4 +633,45 @@ column_view bit_cast(column_view const& input, data_type type);
*/
mutable_column_view bit_cast(mutable_column_view const& input, data_type type);

namespace detail {
/**
 * @brief Computes a hash value from the shallow state of the specified column
 *
 * For any two columns, if `is_shallow_equivalent(c0,c1)` then `shallow_hash(c0) ==
 * shallow_hash(c1)`.
 *
 * The complexity of computing the hash value of `input` is `O( count_descendants(input) )`, i.e.,
 * it is independent of the number of elements in the column.
 *
 * This function does _not_ inspect the elements of `input`, access any device memory, or launch
 * any kernels.
 *
 * @param input The `column_view` whose shallow state is hashed
 * @return The hash value derived from the shallow state of `input`.
 */
std::size_t shallow_hash(column_view const& input);

/**
 * @brief Uses only shallow state to determine if two `column_view`s view equivalent columns
 *
 * Two columns are considered equivalent if, for any operation `F`:
 * ```
 * is_shallow_equivalent(c0, c1) ==> The results of F(c0) and F(c1) are equivalent
 * ```
 * For any two non-empty columns, `is_shallow_equivalent(c0,c1)` is true only if they view the exact
 * same physical column. In other words, two physically independent columns may have exactly
 * equivalent elements but their shallow state would not be equivalent.
 *
 * The complexity of this function is `O( min(count_descendants(lhs), count_descendants(rhs)) )`,
 * i.e., it is independent of the number of elements in either column.
 *
 * This function does _not_ inspect the elements of `lhs` or `rhs`, access any device memory, or
 * launch any kernels.
 *
 * @param lhs The left `column_view` to compare
 * @param rhs The right `column_view` to compare
 * @return `true` if `lhs` and `rhs` have equivalent shallow state, `false` otherwise
 */
bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs);
} // namespace detail
} // namespace cudf
36 changes: 36 additions & 0 deletions cpp/include/cudf/detail/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@

#include <rmm/cuda_stream_view.hpp>

#include <cstddef>
#include <functional>

namespace cudf {
namespace detail {

Expand Down Expand Up @@ -53,5 +56,38 @@ std::unique_ptr<column> serial_murmur_hash3_32(
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/* Copyright 2005-2014 Daniel James.
*
* Use, modification and distribution is subject to the Boost Software
* License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
*/
/**
 * @brief Mixes a second hash value into a first one, yielding a single combined hash.
 *
 * Follows the Boost `hash_combine` recipe, modified to use a 64-bit constant:
 * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html
 *
 * The operation is not symmetric: swapping `lhs` and `rhs` generally changes the result.
 *
 * @param lhs The hash value acting as the running seed
 * @param rhs The hash value folded into `lhs`
 * @return The combined hash value
 */
constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs)
{
  // Perturbation derived from rhs, the mixing constant and shifted copies of the seed.
  std::size_t const mixed = rhs + 0x9e3779b97f4a7c15 + (lhs << 6) + (lhs >> 2);
  return lhs ^ mixed;
}
} // namespace detail
} // namespace cudf

// std::hash specialization that makes cudf::data_type hashable
namespace std {
template <>
struct hash<cudf::data_type> {
  /**
   * @brief Hashes a `cudf::data_type` by combining the hashes of its type id and its scale.
   *
   * @param type The data type to hash
   * @return The combined hash of `type.id()` and `type.scale()`
   */
  std::size_t operator()(cudf::data_type const& type) const noexcept
  {
    auto const id_hash    = std::hash<int32_t>{}(static_cast<int32_t>(type.id()));
    auto const scale_hash = std::hash<int32_t>{}(type.scale());
    return cudf::detail::hash_combine(id_hash, scale_hash);
  }
};
}  // namespace std
12 changes: 12 additions & 0 deletions cpp/include/cudf/detail/utilities/hash_functions.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,12 @@ struct MurmurHash3_32 {
return h;
}

/* Copyright 2005-2014 Daniel James.
*
* Use, modification and distribution is subject to the Boost Software
* License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
*/
/**
* @brief Combines two hash values into a new single hash value. Called
* repeatedly to create a hash value from several variables.
Expand Down Expand Up @@ -795,6 +801,12 @@ struct IdentityHash {
IdentityHash() = default;
constexpr IdentityHash(uint32_t seed) : m_seed(seed) {}

/* Copyright 2005-2014 Daniel James.
*
* Use, modification and distribution is subject to the Boost Software
* License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
*/
/**
* @brief Combines two hash values into a new single hash value. Called
* repeatedly to create a hash value from several variables.
Expand Down
12 changes: 12 additions & 0 deletions cpp/include/cudf_test/type_lists.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,18 @@ using FixedWidthTypesWithoutChrono = Concat<NumericTypes, FixedPointTypes>;
*/
using ComparableTypes = Concat<NumericTypes, ChronoTypes, StringTypes>;

/**
 * @brief Provides a list of all compound types for use in GTest typed tests.
 *
 * The list consists of `string_view`, `dictionary32`, `list_view` and `struct_view`.
 *
 * Example:
 * ```
 * // Invokes all typed fixture tests for all compound types in libcudf
 * TYPED_TEST_CASE(MyTypedFixture, cudf::test::CompoundTypes);
 * ```
 */
using CompoundTypes =
cudf::test::Types<cudf::string_view, cudf::dictionary32, cudf::list_view, cudf::struct_view>;

/**
* @brief Provides a list of all types supported in libcudf for use in a GTest
* typed test.
Expand Down
55 changes: 55 additions & 0 deletions cpp/src/column/column_view.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,15 @@
*/

#include <cudf/column/column_view.hpp>
#include <cudf/detail/hashing.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/traits.hpp>

#include <thrust/iterator/transform_iterator.h>

#include <algorithm>
#include <exception>
#include <numeric>
#include <vector>
Expand Down Expand Up @@ -76,6 +78,59 @@ size_type column_view_base::null_count(size_type begin, size_type end) const
? 0
: cudf::count_unset_bits(null_mask(), offset() + begin, offset() + end);
}

// Thin wrapper around a hash value whose `^` operator applies hash_combine, so that a
// fold expression over `^` combines a sequence of hashes.
struct HashValue {
  std::size_t hash;
  explicit HashValue(std::size_t h) : hash{h} {}
  // Combines this hash (as the seed) with `other` and wraps the result.
  HashValue operator^(HashValue const& other) const
  {
    auto const combined = hash_combine(hash, other.hash);
    return HashValue{combined};
  }
};

/**
 * @brief Hashes every argument with `std::hash` and folds the results, left to right, through
 * `HashValue::operator^`.
 *
 * Arguments are taken by const reference so that each `Ts` is deduced as a non-reference type.
 * With a forwarding reference, an lvalue argument would deduce `Ts` as `T&`, and `std::hash<T&>`
 * is not a usable specialization, so the call would fail to compile.
 *
 * @tparam Ts Types of the values to hash; each must have a `std::hash` specialization
 * @param ts The values to hash (at least one is required by the fold expression)
 * @return The combined hash of all arguments
 */
template <typename... Ts>
constexpr auto hash(Ts const&... ts)
{
  return (... ^ HashValue(std::hash<Ts>{}(ts))).hash;
}

/**
 * @brief Recursively hashes the shallow state of a column and all of its children.
 *
 * A column that is empty, or that has an empty ancestor, contributes only its type (combined
 * with the value 0); otherwise its type, size, head pointer, null mask pointer and offset are
 * hashed. The hash of each child is then combined into the result in child order.
 *
 * @param c The column to hash
 * @param is_parent_empty Whether an ancestor of `c` is empty
 * @return The shallow hash of `c` and its descendants
 */
std::size_t shallow_hash_impl(column_view const& c, bool is_parent_empty = false)
{
  bool const treat_as_empty = is_parent_empty or c.is_empty();

  std::size_t result = treat_as_empty
                         ? hash(c.type(), 0)
                         : hash(c.type(), c.size(), c.head(), c.null_mask(), c.offset());

  // Children inherit emptiness from this column or any of its ancestors.
  for (auto child = c.child_begin(); child != c.child_end(); ++child) {
    result = hash_combine(result, shallow_hash_impl(*child, treat_as_empty));
  }
  return result;
}

std::size_t shallow_hash(column_view const& input)
{
  // The root column has no parent, hence no empty ancestor.
  bool const is_parent_empty = false;
  return shallow_hash_impl(input, is_parent_empty);
}

/**
 * @brief Recursively compares the shallow state of two columns and their children.
 *
 * The types must always match. Size, head pointer, null mask pointer and offset are compared
 * unless both columns are empty or an ancestor was already treated as empty. Both columns must
 * have the same number of children, and corresponding children must be equivalent as well.
 *
 * @param lhs The left column to compare
 * @param rhs The right column to compare
 * @param is_parent_empty Whether the parents of `lhs` and `rhs` were treated as empty
 * @return `true` if the shallow state of `lhs` and `rhs` is equivalent
 */
bool shallow_equivalent_impl(column_view const& lhs,
                             column_view const& rhs,
                             bool is_parent_empty = false)
{
  if (not(lhs.type() == rhs.type())) { return false; }

  bool const treat_as_empty = is_parent_empty or (lhs.is_empty() and rhs.is_empty());
  if (not treat_as_empty) {
    bool const same_state = (lhs.size() == rhs.size()) and (lhs.head() == rhs.head()) and
                            (lhs.null_mask() == rhs.null_mask()) and
                            (lhs.offset() == rhs.offset());
    if (not same_state) { return false; }
  }

  // Children inherit the emptiness decision made for this pair of columns.
  auto const children_match = [treat_as_empty](auto const& lhs_child, auto const& rhs_child) {
    return shallow_equivalent_impl(lhs_child, rhs_child, treat_as_empty);
  };
  return std::equal(
    lhs.child_begin(), lhs.child_end(), rhs.child_begin(), rhs.child_end(), children_match);
}
bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs)
{
  // The root columns have no parent, hence no empty ancestor.
  bool const is_parent_empty = false;
  return shallow_equivalent_impl(lhs, rhs, is_parent_empty);
}
} // namespace detail

// Immutable view constructor
Expand Down
96 changes: 30 additions & 66 deletions cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,77 +31,50 @@
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>

namespace cudf {
namespace groupby {
namespace detail {

// ArgMin binary operator with tuple of (value, index)
/**
* @brief ArgMin binary operator with index values into input column.
*
* @tparam T Type of the underlying column. Must support '<' operator.
*/
template <typename T>
struct ArgMin {
CUDA_HOST_DEVICE_CALLABLE auto operator()(thrust::tuple<T, size_type> const& lhs,
thrust::tuple<T, size_type> const& rhs) const
{
if (thrust::get<1>(lhs) == cudf::detail::ARGMIN_SENTINEL)
return rhs;
else if (thrust::get<1>(rhs) == cudf::detail::ARGMIN_SENTINEL)
return lhs;
else
return thrust::get<0>(lhs) < thrust::get<0>(rhs) ? lhs : rhs;
}
};

// ArgMax binary operator with tuple of (value, index)
template <typename T>
struct ArgMax {
CUDA_HOST_DEVICE_CALLABLE auto operator()(thrust::tuple<T, size_type> const& lhs,
thrust::tuple<T, size_type> const& rhs) const
{
if (thrust::get<1>(lhs) == cudf::detail::ARGMIN_SENTINEL)
return rhs;
else if (thrust::get<1>(rhs) == cudf::detail::ARGMIN_SENTINEL)
return lhs;
else
return thrust::get<0>(lhs) > thrust::get<0>(rhs) ? lhs : rhs;
}
};

struct get_tuple_second_element {
template <typename T>
__device__ size_type operator()(thrust::tuple<T, size_type> const& rhs) const
column_device_view const d_col;
CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs, size_type const& rhs) const
{
return thrust::get<1>(rhs);
// The extra bounds checking is due to issues github.com/rapidsai/cudf/issues/9156 and
// github.com/NVIDIA/thrust/issues/1525,
// where invalid random values may be passed here by thrust::reduce_by_key
if (lhs < 0 || lhs >= d_col.size() || d_col.is_null(lhs)) { return rhs; }
if (rhs < 0 || rhs >= d_col.size() || d_col.is_null(rhs)) { return lhs; }
return d_col.element<T>(lhs) < d_col.element<T>(rhs) ? lhs : rhs;
}
};

/**
* @brief Functor to store the boolean value to null mask.
* @brief ArgMax binary operator with index values into input column.
*
* @tparam T Type of the underlying column. Must support '<' operator.
*/
struct bool_to_nullmask {
mutable_column_device_view d_result;
__device__ void operator()(size_type i, bool rhs)
template <typename T>
struct ArgMax {
column_device_view const d_col;
CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs, size_type const& rhs) const
{
if (rhs) {
d_result.set_valid(i);
} else {
d_result.set_null(i);
}
// The extra bounds checking is due to issues github.com/rapidsai/cudf/issues/9156 and
// github.com/NVIDIA/thrust/issues/1525,
// where invalid random values may be passed here by thrust::reduce_by_key
if (lhs < 0 || lhs >= d_col.size() || d_col.is_null(lhs)) { return rhs; }
if (rhs < 0 || rhs >= d_col.size() || d_col.is_null(rhs)) { return lhs; }
return d_col.element<T>(rhs) < d_col.element<T>(lhs) ? lhs : rhs;
}
};

/**
* @brief Returns index for non-null element, and SENTINEL for null element in a column.
*
*/
struct null_as_sentinel {
column_device_view const col;
size_type const SENTINEL;
__device__ size_type operator()(size_type i) const { return col.is_null(i) ? SENTINEL : i; }
};

/**
* @brief Value accessor for column which supports dictionary column too.
*
Expand Down Expand Up @@ -191,25 +164,16 @@ struct reduce_functor {
auto resultview = mutable_column_device_view::create(result->mutable_view(), stream);
auto valuesview = column_device_view::create(values, stream);
if constexpr (K == aggregation::ARGMAX || K == aggregation::ARGMIN) {
constexpr auto SENTINEL =
(K == aggregation::ARGMAX ? cudf::detail::ARGMAX_SENTINEL : cudf::detail::ARGMIN_SENTINEL);
auto idx_begin =
cudf::detail::make_counting_transform_iterator(0, null_as_sentinel{*valuesview, SENTINEL});
// dictionary keys are sorted, so dictionary32 index comparison is enough.
auto column_begin = valuesview->begin<DeviceType>();
auto begin = thrust::make_zip_iterator(thrust::make_tuple(column_begin, idx_begin));
auto result_begin = thrust::make_transform_output_iterator(resultview->begin<ResultDType>(),
get_tuple_second_element{});
using OpType =
std::conditional_t<(K == aggregation::ARGMAX), ArgMax<DeviceType>, ArgMin<DeviceType>>;
thrust::reduce_by_key(rmm::exec_policy(stream),
group_labels.data(),
group_labels.data() + group_labels.size(),
begin,
thrust::make_counting_iterator<ResultType>(0),
thrust::make_discard_iterator(),
result_begin,
thrust::equal_to<size_type>{},
OpType{});
resultview->begin<ResultType>(),
thrust::equal_to<ResultType>{},
OpType{*valuesview});
} else {
auto init = OpType::template identity<DeviceType>();
auto begin = cudf::detail::make_counting_transform_iterator(
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/io/orc/orc_common.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -24,6 +24,7 @@ namespace orc {

// ORC rows are divided into groups and assigned indexes for faster seeking
static constexpr uint32_t default_row_index_stride = 10000;
// NOTE(review): presumably the size, in bytes, of the header preceding each compressed
// block in an ORC stream — confirm against the ORC reader/writer that uses it
static constexpr uint32_t BLOCK_HEADER_SIZE = 3;

enum CompressionKind : uint8_t {
NONE = 0,
Expand Down
Loading

0 comments on commit 9ccae14

Please sign in to comment.