Skip to content

Commit

Permalink
Refactor scatter for list columns (#8255)
Browse files Browse the repository at this point in the history
This PR refactors `scatter` for `LIST` type columns. Previously, child columns were constructed with nested `for_each_n` calls: the outer loop iterated over the rows and the inner loop iterated over the elements of each row. These loops can be replaced with a single `transform`, because the offsets of the column being constructed are already known.

For each element, we first look up the `unbound_list_view` it belongs to by binary searching the offsets vector. The corresponding element to copy from can then be retrieved by dereferencing the bound `list_view` at the proper intra-list index.

The refactor for struct types is different. Currently, the implementation wraps every child in a lists column and dispatches to the list type specialization. This is fine, but the wrapping process deep copies the list offsets and the child column before dispatching. We can simplify it by wrapping the child with a view instead.

Since `scatter.cuh` is included in many other files, separating out the scatter implementation details helps reduce compilation time while refactoring the code. Most helper functions are moved into `scatter_helper.cu`.

Benchmarking code for scattering lists is added. Benchmark snapshot is below:
```
Benchmark                                                                      Time             CPU      Time Old      Time New       CPU Old       CPU New
-----------------------------------------------------------------------------------------------------------------------------------------------------------
ScatterLists/double_type_colesce_o/1024/64/manual_time                      -0.1073         -0.0926        110648         98781        129731        117724
ScatterLists/double_type_colesce_o/4096/64/manual_time                      -0.1177         -0.1015        113393        100045        132412        118971
ScatterLists/double_type_colesce_o/32768/64/manual_time                     -0.3785         -0.3391        167288        103962        185599        122663
ScatterLists/double_type_colesce_o/262144/64/manual_time                    -0.3175         -0.2834        171123        116785        188191        134865
ScatterLists/double_type_colesce_o/2097152/64/manual_time                   -0.2581         -0.2426        270225        200472        290363        219934
ScatterLists/double_type_colesce_o/16777216/64/manual_time                  -0.8464         -0.8438       6205089        953139       6224867        972548
ScatterLists/double_type_colesce_o/33554432/64/manual_time                  -0.8437         -0.8423      12087712       1889483      12107066       1909170
ScatterLists/double_type_colesce_o/1024/512/manual_time                     -0.3487         -0.3111        150169         97810        169463        116736
ScatterLists/double_type_colesce_o/4096/512/manual_time                     -0.3499         -0.3116        151978         98794        170918        117661
ScatterLists/double_type_colesce_o/32768/512/manual_time                    -0.4337         -0.3901        196663        111364        215048        131162
ScatterLists/double_type_colesce_o/262144/512/manual_time                   -0.8083         -0.7844        590691        113251        607891        131089
ScatterLists/double_type_colesce_o/2097152/512/manual_time                  -0.7018         -0.6815        641149        191192        661107        210559
ScatterLists/double_type_colesce_o/16777216/512/manual_time                 -0.6893         -0.6842       2581320        802057       2601542        821602
ScatterLists/double_type_colesce_o/33554432/512/manual_time                 -0.8277         -0.8259       9150244       1576769       9169846       1596137
ScatterLists/double_type_colesce_o/1024/2048/manual_time                    -0.6584         -0.6178        284006         97008        303179        115869
ScatterLists/double_type_colesce_o/4096/2048/manual_time                    -0.6648         -0.6250        289209         96934        308413        115647
ScatterLists/double_type_colesce_o/32768/2048/manual_time                   -0.7433         -0.7089        386115         99120        404566        117774
ScatterLists/double_type_colesce_o/262144/2048/manual_time                  -0.8214         -0.7984        611876        109305        629110        126803
ScatterLists/double_type_colesce_o/2097152/2048/manual_time                 -0.9107         -0.9024       2098263        187417       2118254        206798
ScatterLists/double_type_colesce_o/16777216/2048/manual_time                -0.6869         -0.6816       2527109        791306       2546819        810805
ScatterLists/double_type_colesce_o/33554432/2048/manual_time                -0.5102         -0.5070       3018595       1478458       3038315       1497923
```

Authors:
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - AJ Schmidt (https://github.com/ajschmidt8)
  - David Wendt (https://github.com/davidwendt)
  - MithunR (https://github.com/mythrocks)

URL: #8255
  • Loading branch information
isVoid authored Jun 7, 2021
1 parent badb501 commit ae8ee8a
Show file tree
Hide file tree
Showing 7 changed files with 805 additions and 618 deletions.
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ test:
- test -f $PREFIX/include/cudf/lists/detail/drop_list_duplicates.hpp
- test -f $PREFIX/include/cudf/lists/detail/interleave_columns.hpp
- test -f $PREFIX/include/cudf/lists/detail/sorting.hpp
- test -f $PREFIX/include/cudf/lists/detail/scatter_helper.cuh
- test -f $PREFIX/include/cudf/lists/combine.hpp
- test -f $PREFIX/include/cudf/lists/count_elements.hpp
- test -f $PREFIX/include/cudf/lists/explode.hpp
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ add_library(cudf
src/lists/copying/copying.cu
src/lists/copying/gather.cu
src/lists/copying/segmented_gather.cu
src/lists/copying/scatter_helper.cu
src/lists/count_elements.cu
src/lists/drop_list_duplicates.cu
src/lists/explode.cu
Expand Down
4 changes: 4 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ ConfigureBench(GATHER_BENCH copying/gather_benchmark.cu)
# - scatter benchmark -----------------------------------------------------------------------------
ConfigureBench(SCATTER_BENCH copying/scatter_benchmark.cu)

###################################################################################################
# - lists scatter benchmark -----------------------------------------------------------------------
ConfigureBench(SCATTER_LISTS_BENCH lists/copying/scatter_lists_benchmark.cu)

###################################################################################################
# - contiguous_split benchmark -------------------------------------------------------------------
ConfigureBench(CONTIGUOUS_SPLIT_BENCH copying/contiguous_split_benchmark.cu)
Expand Down
131 changes: 131 additions & 0 deletions cpp/benchmarks/lists/copying/scatter_lists_benchmark.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/column/column_factories.hpp>
#include <cudf/copying.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <thrust/random.h>
#include <thrust/sequence.h>
#include <thrust/shuffle.h>

#include <cmath>

namespace cudf {

/// Benchmark fixture under which the list-scatter benchmarks are defined and registered.
class ScatterLists : public cudf::benchmark {};

/**
 * @brief Benchmarks `scatter` on a table holding a single lists column.
 *
 * Builds a source and a target lists column whose rows all contain
 * `state.range(1)` elements of `TypeParam`, then times scattering every source
 * row into the target.
 *
 * The scatter map starts as the reversed row sequence
 * `num_rows - 1, ..., 0`. When `coalesce` is false the map is additionally
 * shuffled so that destination rows are visited in a scrambled order.
 *
 * @tparam TypeParam Element type stored in the child column of the lists.
 * @tparam coalesce  If true keep the reversed sequential scatter map; if false
 *                   shuffle it.
 * @param state Google Benchmark state; `range(0)` is the requested number of
 *              child elements and `range(1)` the number of elements per row.
 */
template <class TypeParam, bool coalesce>
void BM_lists_scatter(::benchmark::State& state)
{
  auto stream = rmm::cuda_stream_default;
  auto mr     = rmm::mr::get_current_device_resource();

  const auto base_size            = static_cast<size_type>(state.range(0));
  const auto num_elements_per_row = static_cast<size_type>(state.range(1));
  const auto num_rows =
    static_cast<size_type>(std::ceil(static_cast<double>(base_size) / num_elements_per_row));

  // The offsets generated below end at `num_rows * num_elements_per_row`. Because
  // `num_rows` is rounded up, that can exceed `base_size` (e.g. 1024 elements with
  // 2048 per row), so the child columns are sized to match the offsets exactly.
  const size_type num_child_elements = num_rows * num_elements_per_row;

  const data_type child_type{type_to_id<TypeParam>()};
  const data_type offsets_type{type_to_id<offset_type>()};

  // Child (leaf) columns, filled with 0, 1, 2, ...
  auto source_base_col = make_fixed_width_column(
    child_type, num_child_elements, mask_state::UNALLOCATED, stream, mr);
  auto target_base_col = make_fixed_width_column(
    child_type, num_child_elements, mask_state::UNALLOCATED, stream, mr);
  thrust::sequence(rmm::exec_policy(stream),
                   source_base_col->mutable_view().begin<TypeParam>(),
                   source_base_col->mutable_view().end<TypeParam>());
  thrust::sequence(rmm::exec_policy(stream),
                   target_base_col->mutable_view().begin<TypeParam>(),
                   target_base_col->mutable_view().end<TypeParam>());

  // Offsets columns: 0, n, 2n, ... so that every row holds exactly n elements.
  auto source_offsets =
    make_fixed_width_column(offsets_type, num_rows + 1, mask_state::UNALLOCATED, stream, mr);
  auto target_offsets =
    make_fixed_width_column(offsets_type, num_rows + 1, mask_state::UNALLOCATED, stream, mr);
  thrust::sequence(rmm::exec_policy(stream),
                   source_offsets->mutable_view().begin<offset_type>(),
                   source_offsets->mutable_view().end<offset_type>(),
                   0,
                   num_elements_per_row);
  thrust::sequence(rmm::exec_policy(stream),
                   target_offsets->mutable_view().begin<offset_type>(),
                   target_offsets->mutable_view().end<offset_type>(),
                   0,
                   num_elements_per_row);

  auto source = make_lists_column(num_rows,
                                  std::move(source_offsets),
                                  std::move(source_base_col),
                                  0,
                                  cudf::create_null_mask(num_rows, mask_state::UNALLOCATED),
                                  stream,
                                  mr);
  auto target = make_lists_column(num_rows,
                                  std::move(target_offsets),
                                  std::move(target_base_col),
                                  0,
                                  cudf::create_null_mask(num_rows, mask_state::UNALLOCATED),
                                  stream,
                                  mr);

  // Scatter map: num_rows - 1, num_rows - 2, ..., 0
  auto scatter_map = make_fixed_width_column(
    data_type{type_to_id<size_type>()}, num_rows, mask_state::UNALLOCATED, stream, mr);
  auto m_scatter_map = scatter_map->mutable_view();
  thrust::sequence(rmm::exec_policy(stream),
                   m_scatter_map.begin<size_type>(),
                   m_scatter_map.end<size_type>(),
                   num_rows - 1,
                   -1);

  if (not coalesce) {
    // Shuffle the whole map. The range must be [begin, end): passing `begin`
    // twice is an empty range and would leave the map unshuffled.
    thrust::default_random_engine g;
    thrust::shuffle(rmm::exec_policy(stream),
                    m_scatter_map.begin<size_type>(),
                    m_scatter_map.end<size_type>(),
                    g);
  }

  for (auto _ : state) {
    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
    scatter(table_view{{*source}}, *scatter_map, table_view{{*target}}, false, mr);
  }

  // Counts the child elements twice per iteration (factor of 2 kept from the
  // original accounting), using the number of elements actually allocated.
  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * num_child_elements * 2 *
                          sizeof(TypeParam));
}

/*
 * Defines and registers one `ScatterLists` benchmark called `name` that runs
 * `BM_lists_scatter<type, coalesce>`.
 *
 * The benchmark is registered with two argument ranges: `range(0)` spans
 * 1 << 10 to 1 << 25 and `range(1)` spans 64 to 2048, both with a range
 * multiplier of 8. Manual timing is used, so the reported time is whatever
 * the benchmark body records itself.
 */
#define SBM_BENCHMARK_DEFINE(name, type, coalesce) \
BENCHMARK_DEFINE_F(ScatterLists, name)(::benchmark::State & state) \
{ \
BM_lists_scatter<type, coalesce>(state); \
} \
BENCHMARK_REGISTER_F(ScatterLists, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 10, 1 << 25}, {64, 2048}}) /* 1K-32M base elements, 64-2048 elements per row */ \
->UseManualTime();

// Benchmarks over lists of `double`: the `_o` variant instantiates
// BM_lists_scatter with `coalesce = true`, the `_x` variant with `coalesce = false`.
SBM_BENCHMARK_DEFINE(double_type_colesce_o, double, true);
SBM_BENCHMARK_DEFINE(double_type_colesce_x, double, false);

} // namespace cudf
Loading

0 comments on commit ae8ee8a

Please sign in to comment.