// null_mask.cu
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/bit.hpp>
#include <cudf/utilities/error.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_vector.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <thrust/binary_search.h>
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/extrema.h>
#include <cub/cub.cuh>
#include <algorithm>
#include <numeric>
#include <type_traits>
namespace cudf {
size_type state_null_count(mask_state state, size_type size)
{
switch (state) {
case mask_state::UNALLOCATED: return 0;
case mask_state::UNINITIALIZED: return UNKNOWN_NULL_COUNT;
case mask_state::ALL_NULL: return size;
case mask_state::ALL_VALID: return 0;
default: CUDF_FAIL("Invalid null mask state.");
}
}
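// For example (editor's note): state_null_count(mask_state::ALL_NULL, 100)
// yields 100, while mask_state::UNINITIALIZED yields UNKNOWN_NULL_COUNT as a
// sentinel telling callers that the count must be computed on demand.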
// Computes the required allocation size of a bitmask
std::size_t bitmask_allocation_size_bytes(size_type number_of_bits, std::size_t padding_boundary)
{
CUDF_EXPECTS(padding_boundary > 0, "Invalid padding boundary");
auto necessary_bytes = cudf::util::div_rounding_up_safe<size_type>(number_of_bits, CHAR_BIT);
auto padded_bytes = padding_boundary * cudf::util::div_rounding_up_safe<size_type>(
necessary_bytes, padding_boundary);
return padded_bytes;
}
// Computes number of *actual* bitmask_type elements needed
size_type num_bitmask_words(size_type number_of_bits)
{
return cudf::util::div_rounding_up_safe<size_type>(number_of_bits,
detail::size_in_bits<bitmask_type>());
}
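// A worked example of the two sizing helpers above (editor's sketch; the
// numbers follow from the formulas, assuming 32-bit bitmask words and the
// header's default 64-byte padding boundary):
//
//   auto bytes = bitmask_allocation_size_bytes(100);  // ceil(100/8) = 13, padded up to 64
//   auto words = num_bitmask_words(100);              // ceil(100/32) = 4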
namespace detail {
// Create a device_buffer for a null mask
rmm::device_buffer create_null_mask(size_type size,
mask_state state,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource *mr)
{
size_type mask_size{0};
if (state != mask_state::UNALLOCATED) { mask_size = bitmask_allocation_size_bytes(size); }
rmm::device_buffer mask(mask_size, stream, mr);
if (state != mask_state::UNINITIALIZED) {
uint8_t fill_value = (state == mask_state::ALL_VALID) ? 0xff : 0x00;
CUDA_TRY(cudaMemsetAsync(
static_cast<bitmask_type *>(mask.data()), fill_value, mask_size, stream.value()));
}
return mask;
}
namespace {
__global__ void set_null_mask_kernel(bitmask_type *__restrict__ destination,
size_type begin_bit,
size_type end_bit,
bool valid,
size_type number_of_mask_words)
{
// Pointer to the first word overlapping [begin_bit, end_bit)
auto destination_words = destination + word_index(begin_bit);
auto const last_word = word_index(end_bit) - word_index(begin_bit);
bitmask_type const fill_value = valid ? ~bitmask_type{0} : bitmask_type{0};
for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x;
destination_word_index < number_of_mask_words;
destination_word_index += blockDim.x * gridDim.x) {
if (destination_word_index == 0 || destination_word_index == last_word) {
// Boundary words: modify only the bits that fall inside [begin_bit, end_bit)
bitmask_type mask = ~bitmask_type{0};
if (destination_word_index == 0) {
mask = ~(set_least_significant_bits(intra_word_index(begin_bit)));
}
if (destination_word_index == last_word) {
mask = mask & set_least_significant_bits(intra_word_index(end_bit));
}
destination_words[destination_word_index] =
valid ? destination_words[destination_word_index] | mask
: destination_words[destination_word_index] & ~mask;
} else {
// Interior words lie fully inside the range and are overwritten wholesale
destination_words[destination_word_index] = fill_value;
}
}
}
} // namespace
// Set the pre-allocated null mask over the bit range [begin_bit, end_bit) to valid if
// valid==true, or to null otherwise.
void set_null_mask(bitmask_type *bitmask,
size_type begin_bit,
size_type end_bit,
bool valid,
rmm::cuda_stream_view stream)
{
CUDF_FUNC_RANGE();
CUDF_EXPECTS(begin_bit >= 0, "Invalid range.");
CUDF_EXPECTS(begin_bit < end_bit, "Invalid bit range.");
if (bitmask != nullptr) {
auto number_of_mask_words =
num_bitmask_words(end_bit) - begin_bit / detail::size_in_bits<bitmask_type>();
cudf::detail::grid_1d config(number_of_mask_words, 256);
set_null_mask_kernel<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
static_cast<bitmask_type *>(bitmask), begin_bit, end_bit, valid, number_of_mask_words);
CHECK_CUDA(stream.value());
}
}
} // namespace detail
// Create a device_buffer for a null mask
rmm::device_buffer create_null_mask(size_type size,
mask_state state,
rmm::mr::device_memory_resource *mr)
{
return detail::create_null_mask(size, state, rmm::cuda_stream_default, mr);
}
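// A minimal usage sketch (editor's addition; assumes the header declares the
// usual defaulted memory-resource argument):
//
//   rmm::device_buffer mask = cudf::create_null_mask(1000, cudf::mask_state::ALL_VALID);
//   // mask now holds bitmask_allocation_size_bytes(1000) bytes, all bits set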
// Set the pre-allocated null mask over the bit range [begin_bit, end_bit) to valid if
// valid==true, or to null otherwise.
void set_null_mask(bitmask_type *bitmask, size_type begin_bit, size_type end_bit, bool valid)
{
return detail::set_null_mask(bitmask, begin_bit, end_bit, valid);
}
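// A usage sketch continuing the example above (editor's addition): mark bits
// [2, 8) as null while leaving the rest of the 1000-bit mask untouched.
//
//   cudf::set_null_mask(static_cast<cudf::bitmask_type *>(mask.data()), 2, 8, false);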
namespace {
/**
* @brief Counts the number of non-zero bits in a bitmask in the range
* `[first_bit_index, last_bit_index]`.
*
* Expects `0 <= first_bit_index <= last_bit_index`.
*
* @param[in] bitmask The bitmask whose non-zero bits will be counted.
* @param[in] first_bit_index The index (inclusive) of the first bit to count
* @param[in] last_bit_index The index (inclusive) of the last bit to count
* @param[out] global_count The number of non-zero bits in the specified range
*/
template <size_type block_size>
__global__ void count_set_bits_kernel(bitmask_type const *bitmask,
size_type first_bit_index,
size_type last_bit_index,
size_type *global_count)
{
constexpr auto const word_size{detail::size_in_bits<bitmask_type>()};
auto const first_word_index{word_index(first_bit_index)};
auto const last_word_index{word_index(last_bit_index)};
auto const tid = threadIdx.x + blockIdx.x * blockDim.x;
auto thread_word_index = tid + first_word_index;
size_type thread_count{0};
// First, just count the bits in all words
while (thread_word_index <= last_word_index) {
thread_count += __popc(bitmask[thread_word_index]);
thread_word_index += blockDim.x * gridDim.x;
}
// Subtract any slack bits that were over-counted in the first and last words.
// Two threads handle this -- one for the first word, one for the last.
if (tid < 2) {
bool const first{tid == 0};
bool const last{not first};
size_type bit_index = (first) ? first_bit_index : last_bit_index;
size_type word_index = (first) ? first_word_index : last_word_index;
size_type num_slack_bits = bit_index % word_size;
if (last) { num_slack_bits = word_size - num_slack_bits - 1; }
if (num_slack_bits > 0) {
bitmask_type word = bitmask[word_index];
auto slack_mask = (first) ? set_least_significant_bits(num_slack_bits)
: set_most_significant_bits(num_slack_bits);
thread_count -= __popc(word & slack_mask);
}
}
using BlockReduce = cub::BlockReduce<size_type, block_size>;
__shared__ typename BlockReduce::TempStorage temp_storage;
size_type block_count{BlockReduce(temp_storage).Sum(thread_count)};
if (threadIdx.x == 0) { atomicAdd(global_count, block_count); }
}
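// Worked example of the slack-bit correction above (editor's note): counting
// the inclusive range [5, 70] with 32-bit words popcounts words 0..2 in full,
// so thread 0 subtracts the bits in positions [0, 5) of word 0 (5 % 32 = 5
// slack bits) and thread 1 subtracts the bits above position 70 in word 2
// (32 - 70 % 32 - 1 = 25 slack bits).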
/**
* @brief For each range `[first_bit_indices[i], last_bit_indices[i])`
* (where 0 <= i < `num_ranges`), counts the number of set bits that lie
* outside the range but inside its boundary words (i.e. the words containing
* either the `first_bit_indices[i]`th bit or the `(last_bit_indices[i] - 1)`th
* bit) and subtracts that count from the range's tally in `null_counts`.
*
* Expects `0 <= first_bit_indices[i] <= last_bit_indices[i]`.
*
* @param[in] bitmask The bitmask whose non-zero bits outside the range in the
* boundary words will be counted.
* @param[in] num_ranges The number of ranges
* @param[in] first_bit_indices The indices (inclusive) of the first bit in each
* range
* @param[in] last_bit_indices The indices (exclusive) of the last bit in each
* range
* @param[in,out] null_counts The number of non-zero bits in each range to be
* updated
*/
template <typename OffsetIterator, typename OutputIterator>
__global__ void subtract_set_bits_range_boundaries_kernel(bitmask_type const *bitmask,
size_type num_ranges,
OffsetIterator first_bit_indices,
OffsetIterator last_bit_indices,
OutputIterator null_counts)
{
constexpr size_type const word_size_in_bits{detail::size_in_bits<bitmask_type>()};
cudf::size_type const tid = threadIdx.x + blockIdx.x * blockDim.x;
cudf::size_type range_id = tid;
while (range_id < num_ranges) {
size_type const first_bit_index = *(first_bit_indices + range_id);
size_type const last_bit_index = *(last_bit_indices + range_id);
size_type delta = 0;
size_type num_slack_bits = 0;
// compute delta due to the preceding bits in the first word in the range
num_slack_bits = intra_word_index(first_bit_index);
if (num_slack_bits > 0) {
bitmask_type word = bitmask[word_index(first_bit_index)];
bitmask_type slack_mask = set_least_significant_bits(num_slack_bits);
delta -= __popc(word & slack_mask);
}
// compute delta due to the following bits in the last word in the range
num_slack_bits = (last_bit_index % word_size_in_bits) == 0
? 0
: word_size_in_bits - intra_word_index(last_bit_index);
if (num_slack_bits > 0) {
bitmask_type word = bitmask[word_index(last_bit_index)];
bitmask_type slack_mask = set_most_significant_bits(num_slack_bits);
delta -= __popc(word & slack_mask);
}
size_type updated_null_count = *(null_counts + range_id) + delta;
*(null_counts + range_id) = updated_null_count;
range_id += blockDim.x * gridDim.x;
}
}
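// Worked example (editor's note) for the exclusive-end variant above: the
// range [5, 70) has intra_word_index(5) = 5 leading slack bits in word 0 and
// 32 - intra_word_index(70) = 26 trailing slack bits in word 2 (positions
// 70..95), whose set bits are subtracted from the range's tally.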
/**
* @brief Copies the bits starting at the specified offset from a source
* bitmask into the destination bitmask.
*
* Bit `i` in `destination` will be equal to bit `i + offset` from `source`.
*
* @param destination The mask to copy into
* @param source The mask to copy from
* @param source_begin_bit The offset into `source` from which to begin the copy
* @param source_end_bit The offset into `source` at which copying ends (exclusive)
* @param number_of_mask_words The number of `cudf::bitmask_type` words to copy
*/
// TODO: Also make binops test that uses offset in column_view
__global__ void copy_offset_bitmask(bitmask_type *__restrict__ destination,
bitmask_type const *__restrict__ source,
size_type source_begin_bit,
size_type source_end_bit,
size_type number_of_mask_words)
{
for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x;
destination_word_index < number_of_mask_words;
destination_word_index += blockDim.x * gridDim.x) {
destination[destination_word_index] = detail::get_mask_offset_word(
source, destination_word_index, source_begin_bit, source_end_bit);
}
}
/**
* @brief Computes the bitwise AND of an array of bitmasks
*
* @param destination The bitmask to write result into
* @param source Array of source mask pointers. All masks must be the same size
* @param begin_bit Array of offsets into the corresponding @p source masks.
* Must be the same size as the @p source array
* @param num_sources Number of masks in the @p source array
* @param source_size Number of bits in each mask in @p source
* @param number_of_mask_words The number of words of type bitmask_type to write
*/
__global__ void offset_bitmask_and(bitmask_type *__restrict__ destination,
bitmask_type const *const *__restrict__ source,
size_type const *__restrict__ begin_bit,
size_type num_sources,
size_type source_size,
size_type number_of_mask_words)
{
for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x;
destination_word_index < number_of_mask_words;
destination_word_index += blockDim.x * gridDim.x) {
bitmask_type destination_word = ~bitmask_type{0}; // All bits 1
for (size_type i = 0; i < num_sources; i++) {
destination_word &= detail::get_mask_offset_word(
source[i], destination_word_index, begin_bit[i], begin_bit[i] + source_size);
}
destination[destination_word_index] = destination_word;
}
}
// convert [first_bit_index,last_bit_index) to
// [first_word_index,last_word_index)
struct to_word_index : public thrust::unary_function<size_type, size_type> {
const bool _inclusive = false;
size_type const *const _d_bit_indices = nullptr;
/**
* @brief Constructor of a functor that converts bit indices to bitmask word
* indices.
*
* @param[in] inclusive Flag that indicates whether bit indices are inclusive
* or exclusive.
* @param[in] d_bit_indices Pointer to an array of bit indices
*/
__host__ to_word_index(bool inclusive, size_type const *d_bit_indices)
: _inclusive(inclusive), _d_bit_indices(d_bit_indices)
{
}
__device__ size_type operator()(const size_type &i) const
{
auto bit_index = _d_bit_indices[i];
return word_index(bit_index) + ((_inclusive || intra_word_index(bit_index) == 0) ? 0 : 1);
}
};
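// Example of the inclusive/exclusive distinction above (editor's note): with
// 32-bit words, bit index 64 maps to word 2 either way, but bit index 70 maps
// to word 2 when inclusive and to word 3 (one past the last overlapped word)
// when exclusive, because intra_word_index(70) != 0.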
} // namespace
namespace detail {
// Create a bitmask from a specific range
rmm::device_buffer copy_bitmask(bitmask_type const *mask,
size_type begin_bit,
size_type end_bit,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource *mr)
{
CUDF_FUNC_RANGE();
CUDF_EXPECTS(begin_bit >= 0, "Invalid range.");
CUDF_EXPECTS(begin_bit <= end_bit, "Invalid bit range.");
rmm::device_buffer dest_mask{};
auto num_bytes = bitmask_allocation_size_bytes(end_bit - begin_bit);
if ((mask == nullptr) || (num_bytes == 0)) { return dest_mask; }
if (begin_bit == 0) {
dest_mask = rmm::device_buffer{static_cast<void const *>(mask), num_bytes, stream, mr};
} else {
auto number_of_mask_words = num_bitmask_words(end_bit - begin_bit);
dest_mask = rmm::device_buffer{num_bytes, stream, mr};
cudf::detail::grid_1d config(number_of_mask_words, 256);
copy_offset_bitmask<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
static_cast<bitmask_type *>(dest_mask.data()),
mask,
begin_bit,
end_bit,
number_of_mask_words);
CHECK_CUDA(stream.value());
}
return dest_mask;
}
// Create a bitmask from a column view
rmm::device_buffer copy_bitmask(column_view const &view,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource *mr)
{
CUDF_FUNC_RANGE();
rmm::device_buffer null_mask{0, stream, mr};
if (view.nullable()) {
null_mask =
copy_bitmask(view.null_mask(), view.offset(), view.offset() + view.size(), stream, mr);
}
return null_mask;
}
// In-place bitwise AND of the masks
void inplace_bitmask_and(bitmask_type *dest_mask,
std::vector<bitmask_type const *> const &masks,
std::vector<size_type> const &begin_bits,
size_type mask_size,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource *mr)
{
CUDF_EXPECTS(std::all_of(begin_bits.begin(), begin_bits.end(), [](auto b) { return b >= 0; }),
"Invalid range.");
CUDF_EXPECTS(mask_size > 0, "Invalid bit range.");
CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }),
"Mask pointer cannot be null");
auto number_of_mask_words = num_bitmask_words(mask_size);
rmm::device_vector<bitmask_type const *> d_masks(masks);
rmm::device_vector<size_type> d_begin_bits(begin_bits);
cudf::detail::grid_1d config(number_of_mask_words, 256);
offset_bitmask_and<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
dest_mask,
d_masks.data().get(),
d_begin_bits.data().get(),
d_masks.size(),
mask_size,
number_of_mask_words);
CHECK_CUDA(stream.value());
}
// Bitwise AND of the masks
rmm::device_buffer bitmask_and(std::vector<bitmask_type const *> const &masks,
std::vector<size_type> const &begin_bits,
size_type mask_size,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource *mr)
{
rmm::device_buffer dest_mask{};
auto num_bytes = bitmask_allocation_size_bytes(mask_size);
dest_mask = rmm::device_buffer{num_bytes, stream, mr};
inplace_bitmask_and(
static_cast<bitmask_type *>(dest_mask.data()), masks, begin_bits, mask_size, stream, mr);
return dest_mask;
}
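// A sketch of combining two 1000-bit masks (editor's addition; m1, m2, and mr
// are assumed to be valid device mask pointers and a device memory resource):
//
//   rmm::device_buffer combined = cudf::detail::bitmask_and(
//     {m1, m2}, {0, 0}, 1000, rmm::cuda_stream_default, mr);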
cudf::size_type count_set_bits(bitmask_type const *bitmask,
size_type start,
size_type stop,
rmm::cuda_stream_view stream = rmm::cuda_stream_default)
{
if (nullptr == bitmask) { return 0; }
CUDF_EXPECTS(start >= 0, "Invalid range.");
CUDF_EXPECTS(start <= stop, "Invalid bit range.");
std::size_t num_bits_to_count = stop - start;
if (num_bits_to_count == 0) { return 0; }
auto num_words = num_bitmask_words(num_bits_to_count);
constexpr size_type block_size{256};
cudf::detail::grid_1d grid(num_words, block_size);
rmm::device_scalar<size_type> non_zero_count(0, stream);
count_set_bits_kernel<block_size>
<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
bitmask, start, stop - 1, non_zero_count.data());
return non_zero_count.value(stream);
}
cudf::size_type count_unset_bits(bitmask_type const *bitmask,
size_type start,
size_type stop,
rmm::cuda_stream_view stream = rmm::cuda_stream_default)
{
if (nullptr == bitmask) { return 0; }
auto num_bits = (stop - start);
return (num_bits - detail::count_set_bits(bitmask, start, stop, stream));
}
std::vector<size_type> segmented_count_set_bits(bitmask_type const *bitmask,
std::vector<size_type> const &indices,
rmm::cuda_stream_view stream)
{
CUDF_EXPECTS(indices.size() % 2 == 0,
"Array of indices needs to have an even number of elements.");
for (size_t i = 0; i < indices.size() / 2; i++) {
auto begin = indices[i * 2];
auto end = indices[i * 2 + 1];
CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative.");
CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index.");
}
if (indices.empty()) {
return std::vector<size_type>{};
} else if (bitmask == nullptr) {
std::vector<size_type> ret(indices.size() / 2);
for (size_t i = 0; i < indices.size() / 2; i++) {
ret[i] = indices[2 * i + 1] - indices[2 * i];
}
return ret;
}
size_type num_ranges = indices.size() / 2;
thrust::host_vector<size_type> h_first_indices(num_ranges);
thrust::host_vector<size_type> h_last_indices(num_ranges);
thrust::stable_partition_copy(thrust::seq,
std::begin(indices),
std::end(indices),
thrust::make_counting_iterator(0),
h_first_indices.begin(),
h_last_indices.begin(),
[](auto i) { return (i % 2) == 0; });
rmm::device_vector<size_type> d_first_indices = h_first_indices;
rmm::device_vector<size_type> d_last_indices = h_last_indices;
rmm::device_vector<size_type> d_null_counts(num_ranges, 0);
auto word_num_set_bits = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[bitmask] __device__(auto i) { return static_cast<size_type>(__popc(bitmask[i])); });
auto first_word_indices = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
// We cannot use a lambda, as cub::DeviceSegmentedReduce::Sum() requires
// first_word_indices and last_word_indices to have the same type.
to_word_index(true, d_first_indices.data().get()));
auto last_word_indices = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
// We cannot use a lambda, as cub::DeviceSegmentedReduce::Sum() requires
// first_word_indices and last_word_indices to have the same type.
to_word_index(false, d_last_indices.data().get()));
// First, allocate temporary memory for the segmented reduction
size_t temp_storage_bytes{0};
CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr,
temp_storage_bytes,
word_num_set_bits,
d_null_counts.begin(),
num_ranges,
first_word_indices,
last_word_indices,
stream.value()));
rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);
// Second, perform the segmented reduction
CUDA_TRY(cub::DeviceSegmentedReduce::Sum(d_temp_storage.data(),
temp_storage_bytes,
word_num_set_bits,
d_null_counts.begin(),
num_ranges,
first_word_indices,
last_word_indices,
stream.value()));
CHECK_CUDA(stream.value());
// Third, adjust the counts at segment boundaries (if the segments are not
// word-aligned)
constexpr size_type block_size{256};
cudf::detail::grid_1d grid(num_ranges, block_size);
subtract_set_bits_range_boundaries_kernel<<<grid.num_blocks,
grid.num_threads_per_block,
0,
stream.value()>>>(
bitmask, num_ranges, d_first_indices.begin(), d_last_indices.begin(), d_null_counts.begin());
CHECK_CUDA(stream.value());
std::vector<size_type> ret(num_ranges);
CUDA_TRY(cudaMemcpyAsync(ret.data(),
d_null_counts.data().get(),
num_ranges * sizeof(size_type),
cudaMemcpyDeviceToHost,
stream.value()));
stream.synchronize(); // now ret is valid.
return ret;
}
std::vector<size_type> segmented_count_unset_bits(bitmask_type const *bitmask,
std::vector<size_type> const &indices,
rmm::cuda_stream_view stream)
{
if (indices.empty()) {
return std::vector<size_type>{};
} else if (bitmask == nullptr) {
return std::vector<size_type>(indices.size() / 2, 0);
}
auto ret = segmented_count_set_bits(bitmask, indices, stream);
for (size_t i = 0; i < ret.size(); i++) {
auto begin = indices[i * 2];
auto end = indices[i * 2 + 1];
ret[i] = (end - begin) - ret[i];
}
return ret;
}
// Returns the bitwise AND of the null masks of all columns in the table view
rmm::device_buffer bitmask_and(table_view const &view,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource *mr)
{
CUDF_FUNC_RANGE();
rmm::device_buffer null_mask{0, stream, mr};
if (view.num_rows() == 0 or view.num_columns() == 0) { return null_mask; }
std::vector<bitmask_type const *> masks;
std::vector<size_type> offsets;
for (auto &&col : view) {
if (col.nullable()) {
masks.push_back(col.null_mask());
offsets.push_back(col.offset());
}
}
if (masks.size() > 0) {
return cudf::detail::bitmask_and(masks, offsets, view.num_rows(), stream, mr);
}
return null_mask;
}
} // namespace detail
// Count non-zero bits in the specified range
cudf::size_type count_set_bits(bitmask_type const *bitmask, size_type start, size_type stop)
{
CUDF_FUNC_RANGE();
return detail::count_set_bits(bitmask, start, stop);
}
// Count zero bits in the specified range
cudf::size_type count_unset_bits(bitmask_type const *bitmask, size_type start, size_type stop)
{
CUDF_FUNC_RANGE();
return detail::count_unset_bits(bitmask, start, stop);
}
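// Usage sketch (editor's addition; `mask_ptr` is an assumed device bitmask
// covering at least 1000 bits): a range's null count is its number of unset
// bits.
//
//   cudf::size_type valids = cudf::count_set_bits(mask_ptr, 0, 1000);
//   cudf::size_type nulls  = cudf::count_unset_bits(mask_ptr, 0, 1000);  // 1000 - valids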
// Count non-zero bits in the specified ranges
std::vector<size_type> segmented_count_set_bits(bitmask_type const *bitmask,
std::vector<size_type> const &indices)
{
CUDF_FUNC_RANGE();
return detail::segmented_count_set_bits(bitmask, indices, rmm::cuda_stream_default);
}
// Count zero bits in the specified ranges
std::vector<size_type> segmented_count_unset_bits(bitmask_type const *bitmask,
std::vector<size_type> const &indices)
{
CUDF_FUNC_RANGE();
return detail::segmented_count_unset_bits(bitmask, indices, rmm::cuda_stream_default);
}
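// Usage sketch (editor's addition; `mask_ptr` as above): `indices` flattens
// [begin, end) pairs, so {0, 100, 100, 250} describes the two ranges
// [0, 100) and [100, 250).
//
//   std::vector<cudf::size_type> nulls =
//     cudf::segmented_count_unset_bits(mask_ptr, {0, 100, 100, 250});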
// Create a bitmask from a specific range
rmm::device_buffer copy_bitmask(bitmask_type const *mask,
size_type begin_bit,
size_type end_bit,
rmm::mr::device_memory_resource *mr)
{
return detail::copy_bitmask(mask, begin_bit, end_bit, rmm::cuda_stream_default, mr);
}
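// Usage sketch (editor's addition): copy bits [3, 51) of `mask_ptr` into a
// fresh buffer whose bit 0 corresponds to source bit 3 (assuming the header's
// defaulted memory-resource argument):
//
//   rmm::device_buffer shifted = cudf::copy_bitmask(mask_ptr, 3, 51);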
// Create a bitmask from a column view
rmm::device_buffer copy_bitmask(column_view const &view, rmm::mr::device_memory_resource *mr)
{
return detail::copy_bitmask(view, rmm::cuda_stream_default, mr);
}
rmm::device_buffer bitmask_and(table_view const &view, rmm::mr::device_memory_resource *mr)
{
return detail::bitmask_and(view, rmm::cuda_stream_default, mr);
}
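// Usage sketch (editor's addition; `tbl` is an assumed cudf::table_view): the
// result is an empty buffer when no column is nullable, otherwise the AND of
// all columns' null masks.
//
//   rmm::device_buffer merged = cudf::bitmask_and(tbl);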
} // namespace cudf