Skip to content

Commit

Permalink
Load balance optimization for contiguous_split (#9755)
Browse files Browse the repository at this point in the history
The existing `contiguous_split` implementation was vulnerable to situations where `number of columns N * number of splits M` was < the number of SMs on the gpu.   This PR implements a postprocessing step which attempts to distribute the amount of bytes to be copied as evenly as possible across all available SMs.   

PR has been updated to repartition using a constant chunk size of 1 MB.  This yields better results than the initial approach.

Before/after benchmarks for some particularly degenerate cases (T4)

```
Before (4 partitions)
4GB, 4 columns, no splits                      43.3 ms         43.3 ms            8 bytes_per_second=46.1738G/s
After
4GB, 4 columns, no splits                      10.1 ms         10.1 ms            8 bytes_per_second=198.642G/s
```
```
Before (2 partitions)
1GB, 1 column + validity, no splits         114 ms          114 ms            8 bytes_per_second=17.5212G/s
After
1GB, 1 column + validity, no splits         10.5 ms         10.6 ms            8 bytes_per_second=189.784G/s
```

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Elias Stehle (https://github.com/elstehle)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: #9755
  • Loading branch information
nvdbaranec authored Jan 13, 2022
1 parent dbe65f1 commit c07fdab
Show file tree
Hide file tree
Showing 3 changed files with 277 additions and 63 deletions.
65 changes: 43 additions & 22 deletions cpp/benchmarks/copying/contiguous_split_benchmark.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -34,10 +34,18 @@ void BM_contiguous_split_common(benchmark::State& state,
int64_t bytes_total)
{
// generate splits
cudf::size_type split_stride = num_rows / num_splits;
std::vector<cudf::size_type> splits;
for (int idx = 0; idx < num_rows; idx += split_stride) {
splits.push_back(std::min(idx + split_stride, static_cast<cudf::size_type>(num_rows)));
if (num_splits > 0) {
cudf::size_type const split_stride = num_rows / num_splits;
// start after the first element.
auto iter = thrust::make_counting_iterator(1);
splits.reserve(num_splits);
std::transform(iter,
iter + num_splits,
std::back_inserter(splits),
[split_stride, num_rows](cudf::size_type i) {
return std::min(i * split_stride, static_cast<cudf::size_type>(num_rows));
});
}

std::vector<std::unique_ptr<cudf::column>> columns(src_cols.size());
Expand All @@ -53,21 +61,22 @@ void BM_contiguous_split_common(benchmark::State& state,
auto result = cudf::contiguous_split(src_table, splits);
}

state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * bytes_total);
// it's 2x bytes_total because we're both reading and writing.
state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * bytes_total * 2);
}

class ContiguousSplit : public cudf::benchmark {
};

void BM_contiguous_split(benchmark::State& state)
{
int64_t total_desired_bytes = state.range(0);
cudf::size_type num_cols = state.range(1);
cudf::size_type num_splits = state.range(2);
bool include_validity = state.range(3) == 0 ? false : true;
int64_t const total_desired_bytes = state.range(0);
cudf::size_type const num_cols = state.range(1);
cudf::size_type const num_splits = state.range(2);
bool const include_validity = state.range(3) == 0 ? false : true;

cudf::size_type el_size = 4; // ints and floats
int64_t num_rows = total_desired_bytes / (num_cols * el_size);
int64_t const num_rows = total_desired_bytes / (num_cols * el_size);

// generate input table
srand(31337);
Expand All @@ -85,8 +94,10 @@ void BM_contiguous_split(benchmark::State& state)
}
}

size_t total_bytes = total_desired_bytes;
if (include_validity) { total_bytes += num_rows / (sizeof(cudf::bitmask_type) * 8); }
int64_t const total_bytes =
total_desired_bytes +
(include_validity ? (max(int64_t{1}, (num_rows / 32)) * sizeof(cudf::bitmask_type) * num_cols)
: 0);

BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes);
}
Expand All @@ -102,17 +113,17 @@ int rand_range(int r)

void BM_contiguous_split_strings(benchmark::State& state)
{
int64_t total_desired_bytes = state.range(0);
cudf::size_type num_cols = state.range(1);
cudf::size_type num_splits = state.range(2);
bool include_validity = state.range(3) == 0 ? false : true;
int64_t const total_desired_bytes = state.range(0);
cudf::size_type const num_cols = state.range(1);
cudf::size_type const num_splits = state.range(2);
bool const include_validity = state.range(3) == 0 ? false : true;

const int64_t string_len = 8;
constexpr int64_t string_len = 8;
std::vector<const char*> h_strings{
"aaaaaaaa", "bbbbbbbb", "cccccccc", "dddddddd", "eeeeeeee", "ffffffff", "gggggggg", "hhhhhhhh"};

int64_t col_len_bytes = total_desired_bytes / num_cols;
int64_t num_rows = col_len_bytes / string_len;
int64_t const col_len_bytes = total_desired_bytes / num_cols;
int64_t const num_rows = col_len_bytes / string_len;

// generate input table
srand(31337);
Expand All @@ -133,8 +144,10 @@ void BM_contiguous_split_strings(benchmark::State& state)
}
}

size_t total_bytes = total_desired_bytes + (num_rows * sizeof(cudf::size_type));
if (include_validity) { total_bytes += num_rows / (sizeof(cudf::bitmask_type) * 8); }
int64_t const total_bytes =
total_desired_bytes + ((num_rows + 1) * sizeof(cudf::offset_type)) +
(include_validity ? (max(int64_t{1}, (num_rows / 32)) * sizeof(cudf::bitmask_type) * num_cols)
: 0);

BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes);
}
Expand All @@ -157,12 +170,16 @@ CSBM_BENCHMARK_DEFINE(6Gb10ColsValidity, (int64_t)6 * 1024 * 1024 * 1024, 10, 25
CSBM_BENCHMARK_DEFINE(4Gb512ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 256, 0);
CSBM_BENCHMARK_DEFINE(4Gb512ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 256, 1);
CSBM_BENCHMARK_DEFINE(4Gb10ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 256, 0);
CSBM_BENCHMARK_DEFINE(46b10ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 256, 1);
CSBM_BENCHMARK_DEFINE(4Gb10ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 256, 1);
CSBM_BENCHMARK_DEFINE(4Gb4ColsNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 4, 0, 1);
CSBM_BENCHMARK_DEFINE(4Gb4ColsValidityNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 4, 0, 1);

CSBM_BENCHMARK_DEFINE(1Gb512ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 256, 0);
CSBM_BENCHMARK_DEFINE(1Gb512ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 256, 1);
CSBM_BENCHMARK_DEFINE(1Gb10ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 256, 0);
CSBM_BENCHMARK_DEFINE(1Gb10ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 256, 1);
CSBM_BENCHMARK_DEFINE(1Gb1ColNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1);
CSBM_BENCHMARK_DEFINE(1Gb1ColValidityNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1);

#define CSBM_STRINGS_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \
BENCHMARK_DEFINE_F(ContiguousSplitStrings, name)(::benchmark::State & state) \
Expand All @@ -179,8 +196,12 @@ CSBM_STRINGS_BENCHMARK_DEFINE(4Gb512ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1
CSBM_STRINGS_BENCHMARK_DEFINE(4Gb512ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 256, 1);
CSBM_STRINGS_BENCHMARK_DEFINE(4Gb10ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 256, 0);
CSBM_STRINGS_BENCHMARK_DEFINE(4Gb10ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 256, 1);
CSBM_STRINGS_BENCHMARK_DEFINE(4Gb4ColsNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 4, 0, 0);
CSBM_STRINGS_BENCHMARK_DEFINE(4Gb4ColsValidityNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 4, 0, 1);

CSBM_STRINGS_BENCHMARK_DEFINE(1Gb512ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 256, 0);
CSBM_STRINGS_BENCHMARK_DEFINE(1Gb512ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 256, 1);
CSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 256, 0);
CSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 256, 1);
CSBM_STRINGS_BENCHMARK_DEFINE(1Gb1ColNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 0);
CSBM_STRINGS_BENCHMARK_DEFINE(1Gb1ColValidityNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1);
Loading

0 comments on commit c07fdab

Please sign in to comment.