diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index f70b3991473fa..64c0febfc4ede 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -19,9 +19,11 @@ #include "arrow/util/endian.h" #include "arrow/util/simd.h" +#include "arrow/util/small_vector.h" #include "arrow/util/ubsan.h" #include +#include #include #include @@ -35,6 +37,9 @@ namespace arrow::util::internal { // SIMD implementations // +// TODO have all decode and encode routines take an explicit width? This would simplify +// testing and benchmarking quite a bit... + #if defined(ARROW_HAVE_SSE4_2) template void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t num_values, int64_t stride, @@ -672,14 +677,24 @@ inline void DoMergeStreams(const uint8_t** src_streams, int width, int64_t nvalu template void ByteStreamSplitEncodeScalar(const uint8_t* raw_values, const int64_t num_values, - uint8_t* output_buffer_raw) { + uint8_t* out) { std::array dest_streams; for (int stream = 0; stream < kNumStreams; ++stream) { - dest_streams[stream] = &output_buffer_raw[stream * num_values]; + dest_streams[stream] = &out[stream * num_values]; } DoSplitStreams(raw_values, kNumStreams, num_values, dest_streams.data()); } +inline void ByteStreamSplitEncodeScalarDynamic(const uint8_t* raw_values, int width, + const int64_t num_values, uint8_t* out) { + ::arrow::internal::SmallVector dest_streams; + dest_streams.resize(width); + for (int stream = 0; stream < width; ++stream) { + dest_streams[stream] = &out[stream * num_values]; + } + DoSplitStreams(raw_values, width, num_values, dest_streams.data()); +} + template void ByteStreamSplitDecodeScalar(const uint8_t* data, int64_t num_values, int64_t stride, uint8_t* out) { @@ -690,26 +705,57 @@ void ByteStreamSplitDecodeScalar(const uint8_t* data, int64_t num_values, int64_ DoMergeStreams(src_streams.data(), kNumStreams, num_values, out); } -template -void inline ByteStreamSplitEncode(const uint8_t* raw_values, const int64_t num_values, - uint8_t* output_buffer_raw) { +inline void ByteStreamSplitDecodeScalarDynamic(const uint8_t* data, int width, + int64_t num_values, int64_t stride, + uint8_t* out) { + ::arrow::internal::SmallVector src_streams; + src_streams.resize(width); + for (int stream = 0; stream < width; ++stream) { + src_streams[stream] = &data[stream * stride]; + } + DoMergeStreams(src_streams.data(), width, num_values, out); +} + +inline void ByteStreamSplitEncode(const uint8_t* raw_values, int width, + const int64_t num_values, uint8_t* out) { #if defined(ARROW_HAVE_SIMD_SPLIT) - return ByteStreamSplitEncodeSimd(raw_values, num_values, - output_buffer_raw); +#define ByteStreamSplitEncodePerhapsSimd ByteStreamSplitEncodeSimd #else - return ByteStreamSplitEncodeScalar(raw_values, num_values, - output_buffer_raw); +#define ByteStreamSplitEncodePerhapsSimd ByteStreamSplitEncodeScalar #endif + switch (width) { + case 2: + return ByteStreamSplitEncodeScalar<2>(raw_values, num_values, out); + case 4: + return ByteStreamSplitEncodePerhapsSimd<4>(raw_values, num_values, out); + case 8: + return ByteStreamSplitEncodePerhapsSimd<8>(raw_values, num_values, out); + case 16: + return ByteStreamSplitEncodeScalar<16>(raw_values, num_values, out); + } + return ByteStreamSplitEncodeScalarDynamic(raw_values, width, num_values, out); +#undef ByteStreamSplitEncodePerhapsSimd } -template -void inline ByteStreamSplitDecode(const uint8_t* data, int64_t num_values, int64_t stride, - uint8_t* out) { +inline void ByteStreamSplitDecode(const uint8_t* data, int width, int64_t num_values, + int64_t stride, uint8_t* out) { #if defined(ARROW_HAVE_SIMD_SPLIT) - return ByteStreamSplitDecodeSimd(data, num_values, stride, out); +#define ByteStreamSplitDecodePerhapsSimd ByteStreamSplitDecodeSimd #else - return ByteStreamSplitDecodeScalar(data, num_values, stride, out); +#define ByteStreamSplitDecodePerhapsSimd ByteStreamSplitDecodeScalar #endif + switch (width) { + case 2: + return ByteStreamSplitDecodeScalar<2>(data, num_values, stride, out); + case 4: + return ByteStreamSplitDecodePerhapsSimd<4>(data, num_values, stride, out); + case 8: + return ByteStreamSplitDecodePerhapsSimd<8>(data, num_values, stride, out); + case 16: + return ByteStreamSplitDecodeScalar<16>(data, num_values, stride, out); + } + return ByteStreamSplitDecodeScalarDynamic(data, width, num_values, stride, out); +#undef ByteStreamSplitDecodePerhapsSimd } } // namespace arrow::util::internal diff --git a/cpp/src/arrow/util/byte_stream_split_test.cc b/cpp/src/arrow/util/byte_stream_split_test.cc index 71c6063179ea6..7cfd332140325 100644 --- a/cpp/src/arrow/util/byte_stream_split_test.cc +++ b/cpp/src/arrow/util/byte_stream_split_test.cc @@ -63,29 +63,12 @@ class TestByteStreamSplitSpecialized : public ::testing::Test { public: static constexpr int kWidth = static_cast(sizeof(T)); - using EncodeFunc = NamedFunc)>>; - using DecodeFunc = NamedFunc)>>; + using EncodeFunc = NamedFunc>; + using DecodeFunc = NamedFunc>; void SetUp() override { - encode_funcs_.push_back({"reference", &ReferenceEncode}); - encode_funcs_.push_back({"scalar", &ByteStreamSplitEncodeScalar}); - decode_funcs_.push_back({"scalar", &ByteStreamSplitDecodeScalar}); -#if defined(ARROW_HAVE_SIMD_SPLIT) - encode_funcs_.push_back({"simd", &ByteStreamSplitEncodeSimd}); - decode_funcs_.push_back({"simd", &ByteStreamSplitDecodeSimd}); -#endif -#if defined(ARROW_HAVE_SSE4_2) - encode_funcs_.push_back({"sse2", &ByteStreamSplitEncodeSse2}); - decode_funcs_.push_back({"sse2", &ByteStreamSplitDecodeSse2}); -#endif -#if defined(ARROW_HAVE_AVX2) - encode_funcs_.push_back({"avx2", &ByteStreamSplitEncodeAvx2}); - decode_funcs_.push_back({"avx2", &ByteStreamSplitDecodeAvx2}); -#endif -#if defined(ARROW_HAVE_AVX512) - encode_funcs_.push_back({"avx512", &ByteStreamSplitEncodeAvx512}); - decode_funcs_.push_back({"avx512", &ByteStreamSplitDecodeAvx512}); -#endif + decode_funcs_ = MakeDecodeFuncs(); + encode_funcs_ = MakeEncodeFuncs(); } void TestRoundtrip(int64_t num_values) { @@ -98,12 +81,12 @@ class TestByteStreamSplitSpecialized : public ::testing::Test { for (const auto& encode_func : encode_funcs_) { ARROW_SCOPED_TRACE("encode_func = ", encode_func); encoded.assign(encoded.size(), 0); - encode_func.func(reinterpret_cast(input.data()), num_values, + encode_func.func(reinterpret_cast(input.data()), kWidth, num_values, encoded.data()); for (const auto& decode_func : decode_funcs_) { ARROW_SCOPED_TRACE("decode_func = ", decode_func); decoded.assign(decoded.size(), T{}); - decode_func.func(encoded.data(), num_values, /*stride=*/num_values, + decode_func.func(encoded.data(), kWidth, num_values, /*stride=*/num_values, reinterpret_cast(decoded.data())); ASSERT_EQ(decoded, input); } @@ -129,7 +112,8 @@ class TestByteStreamSplitSpecialized : public ::testing::Test { int64_t offset = 0; while (offset < num_values) { auto chunk_size = std::min(num_values - offset, chunk_size_dist(gen)); - decode_func.func(encoded.data() + offset, chunk_size, /*stride=*/num_values, + decode_func.func(encoded.data() + offset, kWidth, chunk_size, + /*stride=*/num_values, reinterpret_cast(decoded.data() + offset)); offset += chunk_size; } @@ -156,6 +140,74 @@ class TestByteStreamSplitSpecialized : public ::testing::Test { return input; } + template + static std::vector MakeDecodeFuncs() { + std::vector funcs; + funcs.push_back({"scalar", &ByteStreamSplitDecodeScalarDynamic}); + funcs.push_back( + {"scalar", DynamicWidthDecodeFromStatic(&ByteStreamSplitDecodeScalar)}); +#if defined(ARROW_HAVE_SIMD_SPLIT) + if constexpr (kSimdImplemented) { + funcs.push_back( + {"simd", DynamicWidthDecodeFromStatic(&ByteStreamSplitDecodeSimd)}); +#if defined(ARROW_HAVE_SSE4_2) + funcs.push_back( + {"sse2", DynamicWidthDecodeFromStatic(&ByteStreamSplitDecodeSse2)}); +#endif +#if defined(ARROW_HAVE_AVX2) + funcs.push_back( + {"avx2", DynamicWidthDecodeFromStatic(&ByteStreamSplitDecodeAvx2)}); +#endif +#if defined(ARROW_HAVE_AVX512) + funcs.push_back( + {"avx512", DynamicWidthDecodeFromStatic(&ByteStreamSplitDecodeAvx512)}); +#endif + } +#endif // defined(ARROW_HAVE_SIMD_SPLIT) + return funcs; + } + + template + static std::vector MakeEncodeFuncs() { + std::vector funcs; + funcs.push_back({"reference", &ReferenceByteStreamSplitEncode}); + funcs.push_back({"reference", &ByteStreamSplitEncodeScalarDynamic}); + funcs.push_back( + {"scalar", DynamicWidthEncodeFromStatic(&ByteStreamSplitEncodeScalar)}); +#if defined(ARROW_HAVE_SIMD_SPLIT) + if constexpr (kSimdImplemented) { + funcs.push_back( + {"simd", DynamicWidthEncodeFromStatic(&ByteStreamSplitEncodeSimd)}); +#if defined(ARROW_HAVE_SSE4_2) + funcs.push_back( + {"sse2", DynamicWidthEncodeFromStatic(&ByteStreamSplitEncodeSse2)}); +#endif +#if defined(ARROW_HAVE_AVX2) + funcs.push_back( + {"avx2", DynamicWidthEncodeFromStatic(&ByteStreamSplitEncodeAvx2)}); +#endif +#if defined(ARROW_HAVE_AVX512) + funcs.push_back( + {"avx512", DynamicWidthEncodeFromStatic(&ByteStreamSplitEncodeAvx512)}); +#endif + } +#endif // defined(ARROW_HAVE_SIMD_SPLIT) + return funcs; + } + + static std::function DynamicWidthDecodeFromStatic( + std::function)> wrapped) { + return [wrapped](const uint8_t* data, int width, int64_t num_values, int64_t stride, + uint8_t* out) { wrapped(data, num_values, stride, out); }; + } + + static std::function DynamicWidthEncodeFromStatic( + std::function)> wrapped) { + return [wrapped](const uint8_t* data, int width, int64_t num_values, uint8_t* out) { + wrapped(data, num_values, out); + }; + } + std::vector encode_funcs_; std::vector decode_funcs_; diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index a3d1746536647..6f3bfad96067e 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -853,8 +853,9 @@ std::shared_ptr ByteStreamSplitEncoder::FlushValues() { AllocateBuffer(this->memory_pool(), EstimatedDataEncodedSize()); uint8_t* output_buffer_raw = output_buffer->mutable_data(); const uint8_t* raw_values = sink_.data(); - ::arrow::util::internal::ByteStreamSplitEncode( - raw_values, num_values_in_buffer_, output_buffer_raw); + ::arrow::util::internal::ByteStreamSplitEncode( + raw_values, /*width=*/static_cast(sizeof(T)), num_values_in_buffer_, + output_buffer_raw); sink_.Reset(); num_values_in_buffer_ = 0; return std::move(output_buffer); @@ -3621,8 +3622,9 @@ int ByteStreamSplitDecoder::Decode(T* buffer, int max_values) { const int num_decoded_previously = num_values_in_buffer_ - num_values_; const uint8_t* data = data_ + num_decoded_previously; - ::arrow::util::internal::ByteStreamSplitDecode( - data, values_to_decode, num_values_in_buffer_, reinterpret_cast(buffer)); + ::arrow::util::internal::ByteStreamSplitDecode(data, kNumStreams, values_to_decode, + num_values_in_buffer_, + reinterpret_cast(buffer)); num_values_ -= values_to_decode; len_ -= sizeof(T) * values_to_decode; return values_to_decode; @@ -3642,18 +3644,17 @@ int ByteStreamSplitDecoder::DecodeArrow( const int num_decoded_previously = num_values_in_buffer_ - num_values_; const uint8_t* data = data_ + num_decoded_previously; - int offset = 0; -#if defined(ARROW_HAVE_SIMD_SPLIT) - // Use fast decoding into intermediate buffer. This will also decode - // some null values, but it's fast enough that we don't care. + // Decode into intermediate buffer. T* decode_out = EnsureDecodeBuffer(values_decoded); - ::arrow::util::internal::ByteStreamSplitDecode( - data, values_decoded, num_values_in_buffer_, - reinterpret_cast(decode_out)); - - // XXX If null_count is 0, we could even append in bulk or decode directly into - // builder + ::arrow::util::internal::ByteStreamSplitDecode(data, kNumStreams, values_decoded, + num_values_in_buffer_, + reinterpret_cast(decode_out)); + + // If null_count is 0, we could even append in bulk or decode directly into + // builder. We could also decode in chunks, or use SpacedExpand. We don't + // bother currently, because DecodeArrow methods are only called for ByteArray. + int64_t offset = 0; VisitNullBitmapInline( valid_bits, valid_bits_offset, num_values, null_count, [&]() { @@ -3662,22 +3663,6 @@ int ByteStreamSplitDecoder::DecodeArrow( }, [&]() { builder->UnsafeAppendNull(); }); -#else - // XXX should operate over runs of 0s / 1s - VisitNullBitmapInline( - valid_bits, valid_bits_offset, num_values, null_count, - [&]() { - uint8_t gathered_byte_data[kNumStreams]; - for (int b = 0; b < kNumStreams; ++b) { - const int64_t byte_index = b * num_values_in_buffer_ + offset; - gathered_byte_data[b] = data[byte_index]; - } - builder->UnsafeAppend(SafeLoadAs(&gathered_byte_data[0])); - ++offset; - }, - [&]() { builder->UnsafeAppendNull(); }); -#endif - num_values_ -= values_decoded; len_ -= sizeof(T) * values_decoded; return values_decoded; diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 76c411244b22d..f8fb210771ece 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -17,6 +17,11 @@ #include "benchmark/benchmark.h" +#include +#include +#include +#include + #include "arrow/array.h" #include "arrow/array/builder_binary.h" #include "arrow/array/builder_dict.h" @@ -31,10 +36,6 @@ #include "parquet/platform.h" #include "parquet/schema.h" -#include -#include -#include - using arrow::default_memory_pool; using arrow::MemoryPool; @@ -361,138 +362,221 @@ static void BM_PlainDecodingSpacedDouble(benchmark::State& state) { } BENCHMARK(BM_PlainDecodingSpacedDouble)->Apply(BM_SpacedArgs); -template +template +struct ByteStreamSplitDummyValue { + static constexpr T value() { return static_cast(42); } +}; + +template +struct ByteStreamSplitDummyValue> { + using Array = std::array; + + static constexpr Array value() { + Array array; + array.fill(ByteStreamSplitDummyValue::value()); + return array; + } +}; + +template static void BM_ByteStreamSplitDecode(benchmark::State& state, DecodeFunc&& decode_func) { - std::vector values(state.range(0), 64.0); + const std::vector values(state.range(0), ByteStreamSplitDummyValue::value()); const uint8_t* values_raw = reinterpret_cast(values.data()); - std::vector output(state.range(0), 0); + std::vector output(state.range(0)); for (auto _ : state) { - decode_func(values_raw, static_cast(values.size()), - static_cast(values.size()), - reinterpret_cast(output.data())); + if constexpr (kIsDynamicWidthDecode) { + decode_func(values_raw, + /*width=*/static_cast(sizeof(T)), + /*num_values=*/static_cast(values.size()), + /*stride=*/static_cast(values.size()), + reinterpret_cast(output.data())); + } else { + decode_func(values_raw, + /*num_values=*/static_cast(values.size()), + /*stride=*/static_cast(values.size()), + reinterpret_cast(output.data())); + } benchmark::ClobberMemory(); } state.SetBytesProcessed(state.iterations() * values.size() * sizeof(T)); + state.SetItemsProcessed(state.iterations() * values.size()); } -template +template static void BM_ByteStreamSplitEncode(benchmark::State& state, EncodeFunc&& encode_func) { - std::vector values(state.range(0), 64.0); + const std::vector values(state.range(0), ByteStreamSplitDummyValue::value()); const uint8_t* values_raw = reinterpret_cast(values.data()); - std::vector output(state.range(0) * sizeof(T), 0); + std::vector output(state.range(0) * sizeof(T)); for (auto _ : state) { - encode_func(values_raw, values.size(), output.data()); + if constexpr (kIsDynamicWidthDecode) { + encode_func(values_raw, /*width=*/static_cast(sizeof(T)), values.size(), + output.data()); + } else { + encode_func(values_raw, values.size(), output.data()); + } benchmark::ClobberMemory(); } state.SetBytesProcessed(state.iterations() * values.size() * sizeof(T)); + state.SetItemsProcessed(state.iterations() * values.size()); +} + +static void BM_ByteStreamSplitDecode_Float_Generic(benchmark::State& state) { + BM_ByteStreamSplitDecode(state, + ::arrow::util::internal::ByteStreamSplitDecode); +} + +static void BM_ByteStreamSplitDecode_Double_Generic(benchmark::State& state) { + BM_ByteStreamSplitDecode(state, + ::arrow::util::internal::ByteStreamSplitDecode); +} + +template +static void BM_ByteStreamSplitDecode_FLBA_Generic(benchmark::State& state) { + BM_ByteStreamSplitDecode, true>( + state, ::arrow::util::internal::ByteStreamSplitDecode); +} + +static void BM_ByteStreamSplitEncode_Float_Generic(benchmark::State& state) { + BM_ByteStreamSplitEncode(state, + ::arrow::util::internal::ByteStreamSplitEncode); +} + +static void BM_ByteStreamSplitEncode_Double_Generic(benchmark::State& state) { + BM_ByteStreamSplitEncode(state, + ::arrow::util::internal::ByteStreamSplitEncode); +} + +template +static void BM_ByteStreamSplitEncode_FLBA_Generic(benchmark::State& state) { + BM_ByteStreamSplitEncode, true>( + state, ::arrow::util::internal::ByteStreamSplitEncode); } static void BM_ByteStreamSplitDecode_Float_Scalar(benchmark::State& state) { - BM_ByteStreamSplitDecode( + BM_ByteStreamSplitDecode( state, ::arrow::util::internal::ByteStreamSplitDecodeScalar); } static void BM_ByteStreamSplitDecode_Double_Scalar(benchmark::State& state) { - BM_ByteStreamSplitDecode( + BM_ByteStreamSplitDecode( state, ::arrow::util::internal::ByteStreamSplitDecodeScalar); } static void BM_ByteStreamSplitEncode_Float_Scalar(benchmark::State& state) { - BM_ByteStreamSplitEncode( + BM_ByteStreamSplitEncode( state, ::arrow::util::internal::ByteStreamSplitEncodeScalar); } static void BM_ByteStreamSplitEncode_Double_Scalar(benchmark::State& state) { - BM_ByteStreamSplitEncode( + BM_ByteStreamSplitEncode( state, ::arrow::util::internal::ByteStreamSplitEncodeScalar); } -BENCHMARK(BM_ByteStreamSplitDecode_Float_Scalar)->Range(MIN_RANGE, MAX_RANGE); -BENCHMARK(BM_ByteStreamSplitDecode_Double_Scalar)->Range(MIN_RANGE, MAX_RANGE); -BENCHMARK(BM_ByteStreamSplitEncode_Float_Scalar)->Range(MIN_RANGE, MAX_RANGE); -BENCHMARK(BM_ByteStreamSplitEncode_Double_Scalar)->Range(MIN_RANGE, MAX_RANGE); +static void ByteStreamSplitApply(::benchmark::internal::Benchmark* bench) { + // Reduce the number of variations by only testing the two range ends. + bench->Arg(MIN_RANGE)->Arg(MAX_RANGE); +} + +BENCHMARK(BM_ByteStreamSplitDecode_Float_Generic)->Apply(ByteStreamSplitApply); +BENCHMARK(BM_ByteStreamSplitDecode_Double_Generic)->Apply(ByteStreamSplitApply); +BENCHMARK_TEMPLATE(BM_ByteStreamSplitDecode_FLBA_Generic, 2)->Apply(ByteStreamSplitApply); +BENCHMARK_TEMPLATE(BM_ByteStreamSplitDecode_FLBA_Generic, 7)->Apply(ByteStreamSplitApply); +BENCHMARK_TEMPLATE(BM_ByteStreamSplitDecode_FLBA_Generic, 16) + ->Apply(ByteStreamSplitApply); + +BENCHMARK(BM_ByteStreamSplitEncode_Float_Generic)->Apply(ByteStreamSplitApply); +BENCHMARK(BM_ByteStreamSplitEncode_Double_Generic)->Apply(ByteStreamSplitApply); +BENCHMARK_TEMPLATE(BM_ByteStreamSplitEncode_FLBA_Generic, 2)->Apply(ByteStreamSplitApply); +BENCHMARK_TEMPLATE(BM_ByteStreamSplitEncode_FLBA_Generic, 7)->Apply(ByteStreamSplitApply); +BENCHMARK_TEMPLATE(BM_ByteStreamSplitEncode_FLBA_Generic, 16) + ->Apply(ByteStreamSplitApply); + +BENCHMARK(BM_ByteStreamSplitDecode_Float_Scalar)->Apply(ByteStreamSplitApply); +BENCHMARK(BM_ByteStreamSplitDecode_Double_Scalar)->Apply(ByteStreamSplitApply); +BENCHMARK(BM_ByteStreamSplitEncode_Float_Scalar)->Apply(ByteStreamSplitApply); +BENCHMARK(BM_ByteStreamSplitEncode_Double_Scalar)->Apply(ByteStreamSplitApply); #if defined(ARROW_HAVE_SSE4_2) static void BM_ByteStreamSplitDecode_Float_Sse2(benchmark::State& state) { - BM_ByteStreamSplitDecode( + BM_ByteStreamSplitDecode( state, ::arrow::util::internal::ByteStreamSplitDecodeSse2); } static void BM_ByteStreamSplitDecode_Double_Sse2(benchmark::State& state) { - BM_ByteStreamSplitDecode( + BM_ByteStreamSplitDecode( state, ::arrow::util::internal::ByteStreamSplitDecodeSse2); } static void BM_ByteStreamSplitEncode_Float_Sse2(benchmark::State& state) { - BM_ByteStreamSplitEncode( + BM_ByteStreamSplitEncode( state, ::arrow::util::internal::ByteStreamSplitEncodeSse2); } static void BM_ByteStreamSplitEncode_Double_Sse2(benchmark::State& state) { - BM_ByteStreamSplitEncode( + BM_ByteStreamSplitEncode( state, ::arrow::util::internal::ByteStreamSplitEncodeSse2); } -BENCHMARK(BM_ByteStreamSplitDecode_Float_Sse2)->Range(MIN_RANGE, MAX_RANGE); -BENCHMARK(BM_ByteStreamSplitDecode_Double_Sse2)->Range(MIN_RANGE, MAX_RANGE); -BENCHMARK(BM_ByteStreamSplitEncode_Float_Sse2)->Range(MIN_RANGE, MAX_RANGE); -BENCHMARK(BM_ByteStreamSplitEncode_Double_Sse2)->Range(MIN_RANGE, MAX_RANGE); +BENCHMARK(BM_ByteStreamSplitDecode_Float_Sse2)->Apply(ByteStreamSplitApply); +BENCHMARK(BM_ByteStreamSplitDecode_Double_Sse2)->Apply(ByteStreamSplitApply); +BENCHMARK(BM_ByteStreamSplitEncode_Float_Sse2)->Apply(ByteStreamSplitApply); +BENCHMARK(BM_ByteStreamSplitEncode_Double_Sse2)->Apply(ByteStreamSplitApply); #endif #if defined(ARROW_HAVE_AVX2) static void BM_ByteStreamSplitDecode_Float_Avx2(benchmark::State& state) { - BM_ByteStreamSplitDecode( + BM_ByteStreamSplitDecode( state, ::arrow::util::internal::ByteStreamSplitDecodeAvx2); } static void BM_ByteStreamSplitDecode_Double_Avx2(benchmark::State& state) { - BM_ByteStreamSplitDecode( + BM_ByteStreamSplitDecode( state, ::arrow::util::internal::ByteStreamSplitDecodeAvx2); } static void BM_ByteStreamSplitEncode_Float_Avx2(benchmark::State& state) { - BM_ByteStreamSplitEncode( + BM_ByteStreamSplitEncode( state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); } static void BM_ByteStreamSplitEncode_Double_Avx2(benchmark::State& state) { - BM_ByteStreamSplitEncode( + BM_ByteStreamSplitEncode( state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); } -BENCHMARK(BM_ByteStreamSplitDecode_Float_Avx2)->Range(MIN_RANGE, MAX_RANGE); -BENCHMARK(BM_ByteStreamSplitDecode_Double_Avx2)->Range(MIN_RANGE, MAX_RANGE); -BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx2)->Range(MIN_RANGE, MAX_RANGE); -BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx2)->Range(MIN_RANGE, MAX_RANGE); +BENCHMARK(BM_ByteStreamSplitDecode_Float_Avx2)->Apply(ByteStreamSplitApply); +BENCHMARK(BM_ByteStreamSplitDecode_Double_Avx2)->Apply(ByteStreamSplitApply); +BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx2)->Apply(ByteStreamSplitApply); +BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx2)->Apply(ByteStreamSplitApply); #endif #if defined(ARROW_HAVE_AVX512) static void BM_ByteStreamSplitDecode_Float_Avx512(benchmark::State& state) { - BM_ByteStreamSplitDecode( + BM_ByteStreamSplitDecode( state, ::arrow::util::internal::ByteStreamSplitDecodeAvx512); } static void BM_ByteStreamSplitDecode_Double_Avx512(benchmark::State& state) { - BM_ByteStreamSplitDecode( + BM_ByteStreamSplitDecode( state, ::arrow::util::internal::ByteStreamSplitDecodeAvx512); } static void BM_ByteStreamSplitEncode_Float_Avx512(benchmark::State& state) { - BM_ByteStreamSplitEncode( + BM_ByteStreamSplitEncode( state, ::arrow::util::internal::ByteStreamSplitEncodeAvx512); } static void BM_ByteStreamSplitEncode_Double_Avx512(benchmark::State& state) { - BM_ByteStreamSplitEncode( + BM_ByteStreamSplitEncode( state, ::arrow::util::internal::ByteStreamSplitEncodeAvx512); } -BENCHMARK(BM_ByteStreamSplitDecode_Float_Avx512)->Range(MIN_RANGE, MAX_RANGE); -BENCHMARK(BM_ByteStreamSplitDecode_Double_Avx512)->Range(MIN_RANGE, MAX_RANGE); -BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx512)->Range(MIN_RANGE, MAX_RANGE); -BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx512)->Range(MIN_RANGE, MAX_RANGE); +BENCHMARK(BM_ByteStreamSplitDecode_Float_Avx512)->Apply(ByteStreamSplitApply); +BENCHMARK(BM_ByteStreamSplitDecode_Double_Avx512)->Apply(ByteStreamSplitApply); +BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx512)->Apply(ByteStreamSplitApply); +BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx512)->Apply(ByteStreamSplitApply); #endif template