-
Notifications
You must be signed in to change notification settings - Fork 915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update sort groupby to use non-atomic operation #9035
Merged
rapids-bot
merged 11 commits into
rapidsai:branch-21.10
from
karthikeyann:fea-noatomic-groupbyreduce
Aug 27, 2021
Merged
Changes from 4 commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
a293aa1
replace update_target_element with reduce_by_key in sort groupby
karthikeyann 2c1040b
reduce compile time of group_argmin/max using output_writer_iterator
karthikeyann 729fcaf
dictionary32 comparison for argmin, argmax
karthikeyann 156ba33
use transform_output_operator for argmin, argmax
karthikeyann 578c37d
Merge branch 'branch-21.10' of github.com:rapidsai/cudf into fea-noat…
karthikeyann aa06c15
remove transform_output_writer_iterator
karthikeyann 9e15dc2
add missing stream, mr
karthikeyann e6257c9
fix null_mask allocation
karthikeyann aea6886
update sort groupby to create base type column for dictionary type
karthikeyann 5bd4321
Merge branch 'branch-21.10' of github.com:rapidsai/cudf into fea-noat…
karthikeyann 4d43263
add groupby templated make_min_aggregation factory
karthikeyann File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
/* | ||
* Copyright (c) 2021, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
// Output writer iterator | ||
#pragma once | ||
|
||
#include <thrust/iterator/iterator_adaptor.h> | ||
|
||
namespace thrust { | ||
namespace detail { | ||
|
||
// Proxy reference that calls BinaryFunction with Iterator value and the rhs of assignment operator | ||
template <typename BinaryFunction, typename Iterator> | ||
class output_writer_iterator_proxy { | ||
public: | ||
__host__ __device__ output_writer_iterator_proxy(const Iterator& index_iter, BinaryFunction fun) | ||
: index_iter(index_iter), fun(fun) | ||
{ | ||
} | ||
template <typename T> | ||
__host__ __device__ output_writer_iterator_proxy operator=(const T& x) | ||
{ | ||
fun(*index_iter, x); | ||
return *this; | ||
} | ||
|
||
private: | ||
Iterator index_iter; | ||
BinaryFunction fun; | ||
}; | ||
|
||
// Register output_writer_iterator_proxy with 'is_proxy_reference' from | ||
// type_traits to enable its use with algorithms. | ||
template <class BinaryFunction, class Iterator> | ||
struct is_proxy_reference<output_writer_iterator_proxy<BinaryFunction, Iterator>> | ||
: public thrust::detail::true_type { | ||
}; | ||
|
||
} // namespace detail | ||
} // namespace thrust | ||
|
||
namespace cudf { | ||
/** | ||
* @brief Transform output iterator with custom writer binary function which takes index and value. | ||
* | ||
* @code {.cpp} | ||
* #include <cudf/utilities/output_writer_iterator.cuh> | ||
* #include <thrust/device_vector.h> | ||
* #include <thrust/iterator/counting_iterator.h> | ||
* #include <thrust/iterator/transform_iterator.h> | ||
* | ||
* struct set_bits_field { | ||
* int* bitfield; | ||
* __device__ inline void set_bit(size_t bit_index) | ||
* { | ||
* atomicOr(&bitfield[bit_index/32], (int{1} << (bit_index % 32))); | ||
* } | ||
* __device__ inline void clear_bit(size_t bit_index) | ||
* { | ||
* atomicAnd(&bitfield[bit_index / 32], ~(int{1} << (bit_index % 32))); | ||
* } | ||
* // Index, value | ||
* __device__ void operator()(size_t i, bool x) | ||
* { | ||
* if (x) | ||
* set_bit(i); | ||
* else | ||
* clear_bit(i); | ||
* } | ||
* }; | ||
* | ||
* thrust::device_vector<int> v(1, 0x0000); | ||
* auto result_begin = cudf::make_output_writer_iterator(thrust::make_counting_iterator(0), | ||
* set_bits_field{v.data().get()}); | ||
* auto value = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [] __device__ | ||
* (int x) { return x%2; | ||
* }); | ||
* thrust::copy(thrust::device, value, value+32, result_begin); | ||
* | ||
* #include <cudf/utilities/output_writer_iterator.cuh> | ||
* #include <thrust/device_vector.h> | ||
* #include <thrust/iterator/counting_iterator.h> | ||
* #include <thrust/iterator/transform_iterator.h> | ||
* | ||
* struct set_bits_field { | ||
* int* bitfield; | ||
* __device__ inline void set_bit(size_t bit_index) | ||
* { | ||
* atomicOr(&bitfield[bit_index/32], (int{1} << (bit_index % 32))); | ||
* } | ||
* __device__ inline void clear_bit(size_t bit_index) | ||
* { | ||
* atomicAnd(&bitfield[bit_index / 32], ~(int{1} << (bit_index % 32))); | ||
* } | ||
* // Index, value | ||
* __device__ void operator()(size_t i, bool x) | ||
* { | ||
* if (x) | ||
* set_bit(i); | ||
* else | ||
* clear_bit(i); | ||
* } | ||
* }; | ||
* | ||
* thrust::device_vector<int> v(1, 0x0000); | ||
* auto result_begin = cudf::make_output_writer_iterator(thrust::make_counting_iterator(0), | ||
* set_bits_field{v.data().get()}); | ||
* auto value = thrust::make_transform_iterator(thrust::make_counting_iterator(0), | ||
* [] __device__ (int x) { return x%2; }); | ||
* thrust::copy(thrust::device, value, value+32, result_begin); | ||
* int(v[0]); // returns 0xaaaaaaaa; | ||
* @endcode | ||
* | ||
* | ||
* @tparam BinaryFunction Binary function to be called with the Iterator value and the rhs of | ||
* assignment operator. | ||
* @tparam Iterator iterator type that acts as index of the output. | ||
*/ | ||
template <typename BinaryFunction, typename Iterator> | ||
class output_writer_iterator | ||
: public thrust::iterator_adaptor< | ||
output_writer_iterator<BinaryFunction, Iterator>, | ||
Iterator, | ||
thrust::use_default, | ||
thrust::use_default, | ||
thrust::use_default, | ||
thrust::detail::output_writer_iterator_proxy<BinaryFunction, Iterator>> { | ||
public: | ||
// parent class. | ||
typedef thrust::iterator_adaptor< | ||
output_writer_iterator<BinaryFunction, Iterator>, | ||
Iterator, | ||
thrust::use_default, | ||
thrust::use_default, | ||
thrust::use_default, | ||
thrust::detail::output_writer_iterator_proxy<BinaryFunction, Iterator>> | ||
super_t; | ||
// friend thrust::iterator_core_access to allow it access to the private interface dereference() | ||
friend class thrust::iterator_core_access; | ||
__host__ __device__ output_writer_iterator(Iterator const& x, BinaryFunction fun) | ||
: super_t(x), fun(fun) | ||
{ | ||
} | ||
|
||
private: | ||
BinaryFunction fun; | ||
|
||
// thrust::iterator_core_access accesses this function | ||
__host__ __device__ typename super_t::reference dereference() const | ||
{ | ||
return thrust::detail::output_writer_iterator_proxy<BinaryFunction, Iterator>( | ||
this->base_reference(), fun); | ||
} | ||
}; | ||
|
||
template <typename BinaryFunction, typename Iterator> | ||
output_writer_iterator<BinaryFunction, Iterator> __host__ __device__ | ||
make_output_writer_iterator(Iterator out, BinaryFunction fun) | ||
{ | ||
return output_writer_iterator<BinaryFunction, Iterator>(out, fun); | ||
} // end make_output_writer_iterator | ||
} // namespace cudf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks like a lot of machinery and I'm not clear about what it's purpose is.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I created this first to test, if I can set
element
andset_valid
at the same time using singlereduce_by_key
withthrust::optional
. But usingthrust::optional
was always slow.So, I reverted to use 2
reduce_by_key
, 1 for element, another for null_mask.To use
reduce_by_key
with null_mask, it needs a temporary bool buffer andvalid_if
.To avoid this, I used this
transform_output_writer_iterator
.Anyway, this is purely to avoid allocation of temporary bool buffer, but it doesn't affect performance much.
I will revert to using temporary bool buffer, and remove this iterator. (I thought that there may be other use cases for this iterator in cudf. Especially with null_mask.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This machinery is the bare minimum to use a proxy object for assignments in thrust. The question is if the proxy object is useful. If it is, then the machinery is just overhead.
A better name, imho, would be
assignment_iterator
From what I see the use case is overriding the assignment operation. In an output iterator, the assignment operator is used to take the value on the rhs and assign it to the lhs. here, we intercept that assignment operation and call a binary operator
binop(lhs, rhs)
that can override the assignment operation.This PR uses the
assignment_iterator
with a lambda that captures a null mask. The proxy intercepts the lhsint
and a rhsbool
and then invokes the lambda which callsset_valid
orset_null
on the captured mask. It is the output version ofmake_validity_iterator
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we should be more specific here. I would like to see a
make_validity_output_iterator
that is very focused on dealing with null masks of a particular column. With an interface like this, we can start to experiment with opportunistic coalescing of null mask assignments using cooperative groups. Also, this API makes it very obvious that we are doing individual bit assignments and that coalescing them by changing the calling code could prove more performant.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The implementation may well use the proxy-based iterator you've created, but there would be a very clear use case for it, and other developers will have an easy to use API for that use case.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, I don't have a concrete use case in the back of my head right now
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jrhemstad I'm still confused. Why exactly is cooperative groups not an option here? I was thinking we could try
coalesced_threads()
, followed bylabeled_partition()
andreduce()
, then usethread_rank() == 0
to indicate which threads should write the output, and use atomics to make sure uncoordinated writes don't interfere with one another. I doubt this will be fast in the majority of use cases, but I am not sure why it would not work. It would be an experiment to see if we can speed up the best-case scenario, where all/most writes are sequential.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
They can. Determining which threads are active is not the problem.
The problem is coordinating and detecting which threads are attempting to update the same bits in a given 4B or 8B word. For example, if any two threads in a grid want to update bits
i
andi+1
, how do you detect that scenario without some form of communication?coalesced_threads
doesn't help you here because whether or not the threads are coalesced doesn't tell you anything about what bits they are updating.Furthermore, even if the two threads are both active at the same time in the same warp (which you could never rely on) how do you detect that
t0
wants to update biti
andt1
update biti+1
when it could just as well bet0
updatesi
andt1
updatesi + 1042
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's where @karthikeyann 's proxy class comes in to play. It knows the index and the value, and the lambda is responsible for using that index and value to assign the appropriate bit to a captured null mask. It appears we have enough information to attempt opportunistic concurrency here. I'm not saying it will be beneficial, only that it seems possible and might be worth an experiment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the proxy class has the index, and we can pass this to
labeled_partition
.For the sake of the experiment, the communication would be limited to 32 threads within a warp. Communicating across warps would require shmem, and since we're in a lambda called by thrust, I'm not sure if/how that would work.
coalesced_threads
is just to prevent UB/hang,labeled_partition
would be responsible for determining what writes can be coalesced by usingidx/32
as the label (with some offset, if necessary).