From e1b71e665ae93d9589a65681da3ae811ef53b66d Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 26 Jun 2021 21:59:56 -0500 Subject: [PATCH 01/80] multibyte-split scaffolding --- cpp/CMakeLists.txt | 3 +- cpp/include/cudf/io/text/multibyte_split.hpp | 20 +++++ cpp/src/io/text/multibyte_split.cu | 42 ++++++++++ cpp/tests/CMakeLists.txt | 5 ++ cpp/tests/io/text/multibyte_split_test.cpp | 82 ++++++++++++++++++++ 5 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 cpp/include/cudf/io/text/multibyte_split.hpp create mode 100644 cpp/src/io/text/multibyte_split.cu create mode 100644 cpp/tests/io/text/multibyte_split_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 678f202d106..36a8a730880 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -221,8 +221,8 @@ add_library(cudf src/interop/dlpack.cpp src/interop/from_arrow.cu src/interop/to_arrow.cu - src/io/avro/avro.cpp src/io/avro/avro_gpu.cu + src/io/avro/avro.cpp src/io/avro/reader_impl.cu src/io/comp/brotli_dict.cpp src/io/comp/cpu_unbz2.cpp @@ -257,6 +257,7 @@ add_library(cudf src/io/parquet/writer_impl.cu src/io/statistics/orc_column_statistics.cu src/io/statistics/parquet_column_statistics.cu + src/io/text/multibyte_split.cu src/io/utilities/column_buffer.cpp src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp new file mode 100644 index 00000000000..f51e0c5ee2e --- /dev/null +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -0,0 +1,20 @@ +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace io { +namespace text { + +std::unique_ptr multibyte_split( + std::istream& input, + std::string delimeter, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu new file mode 100644 index 00000000000..09a6aa4053e --- /dev/null +++ b/cpp/src/io/text/multibyte_split.cu @@ -0,0 +1,42 @@ +#include + +#include +#include + +#include +#include +#include + +namespace { + +} + +namespace cudf { +namespace io { +namespace text { +namespace detail { + +std::unique_ptr multibyte_split(std::istream& input, + std::string delimeter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FAIL(); +} + +} // namespace detail + +std::unique_ptr multibyte_split(std::istream& input, + std::string delimeter, + rmm::mr::device_memory_resource* mr) +{ + char c; + while (input.readsome(&c, 1) > 0) { std::cout << std::bitset<8>(c) << std::endl; } + std::cout << std::endl; + + CUDF_FAIL(); +} + +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4360b418e95..d99e28c588c 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -193,6 +193,11 @@ if(CUDF_ENABLE_ARROW_S3) target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") endif() +################################################################################################### +# - io tests -------------------------------------------------------------------------------------- +ConfigureTest(MULTIBYTE_SPLIT_TEST + io/text/multibyte_split_test.cpp) + ################################################################################################### # - sort tests ------------------------------------------------------------------------------------ ConfigureTest(SORT_TEST diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp new file mode 100644 index 00000000000..209b5675a7e --- /dev/null +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +#include + +#include + +#include + +using namespace cudf; +using namespace test; + +constexpr bool print_all{false}; + +struct MultibyteSplitTest : public BaseFixture { +}; + +TEST_F(MultibyteSplitTest, Simple) +{ + std::string separator = "😎"; // F0 9F 98 8E | 11110000 11111001 1100010 11101000 + std::string input = + "here😎" + "is😎" + "some😎" + "simple😎" + "text😎" + "seperated😎" + "by😎" + "emojis😎" + "which😎" + "are😎" + "multple😎" + "bytes😎" + "and😎" + "used😎" + "as😎" + "delimeters."; + + auto expected = strings_column_wrapper{"here", + "is", + "some", + "simple", + "text", + "seperated", + "by", + "emojis", + "which", + "are", + "multple", + "bytes", + "and", + "used", + "as", + "delimeters."}; + + auto input_stream = std::basic_istringstream(input); + + auto out = cudf::io::text::multibyte_split(input_stream, separator); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); +} From 836773a60141e94d6b60540d92238d0139bc6cae Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sun, 27 Jun 2021 00:05:22 -0500 Subject: [PATCH 02/80] cudf::io::text::input_stream --- cpp/CMakeLists.txt | 1 + .../cudf/io/text/host_input_stream.hpp | 28 ++++++++++ cpp/include/cudf/io/text/input_stream.hpp | 18 +++++++ cpp/include/cudf/io/text/multibyte_split.hpp | 4 +- cpp/src/io/text/host_input_stream.cpp | 35 ++++++++++++ cpp/src/io/text/multibyte_split.cu | 54 ++++++++++++++++--- cpp/tests/io/text/multibyte_split_test.cpp | 7 ++- 7 files changed, 137 insertions(+), 10 deletions(-) create mode 100644 cpp/include/cudf/io/text/host_input_stream.hpp create mode 100644 cpp/include/cudf/io/text/input_stream.hpp create mode 100644 cpp/src/io/text/host_input_stream.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 36a8a730880..b5b1de9900a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -257,6 +257,7 @@ add_library(cudf src/io/parquet/writer_impl.cu src/io/statistics/orc_column_statistics.cu src/io/statistics/parquet_column_statistics.cu + src/io/text/host_input_stream.cpp src/io/text/multibyte_split.cu src/io/utilities/column_buffer.cpp src/io/utilities/data_sink.cpp diff --git a/cpp/include/cudf/io/text/host_input_stream.hpp b/cpp/include/cudf/io/text/host_input_stream.hpp new file mode 100644 index 00000000000..e68eecb0765 --- /dev/null +++ b/cpp/include/cudf/io/text/host_input_stream.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include + +#include + +#include + +#include + +namespace cudf { +namespace io { +namespace text { + +class host_input_stream : public cudf::io::text::input_stream { + public: + host_input_stream(std::istream& source_stream) : _source_stream(source_stream) {} + + uint32_t readsome(cudf::device_span destination, rmm::cuda_stream_view stream) override; + + private: + std::istream& _source_stream; + thrust::host_vector _host_buffer{}; +}; + +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/include/cudf/io/text/input_stream.hpp b/cpp/include/cudf/io/text/input_stream.hpp new file mode 100644 index 00000000000..f977f70f5fd --- /dev/null +++ b/cpp/include/cudf/io/text/input_stream.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include + +#include + +namespace cudf { +namespace io { +namespace text { + +class input_stream { + public: + virtual uint32_t readsome(cudf::device_span destination, rmm::cuda_stream_view stream) = 0; +}; + +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index f51e0c5ee2e..3de019db8f3 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -1,3 +1,5 @@ +#include + #include #include @@ -11,7 +13,7 @@ namespace io { namespace text { std::unique_ptr multibyte_split( - std::istream& input, + cudf::io::text::input_stream& input, std::string delimeter, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/text/host_input_stream.cpp b/cpp/src/io/text/host_input_stream.cpp new file mode 100644 index 00000000000..6eb5364eede --- /dev/null +++ b/cpp/src/io/text/host_input_stream.cpp @@ -0,0 +1,35 @@ +#include +#include + +#include + +#include + +#include + +namespace cudf { +namespace io { +namespace text { + +uint32_t host_input_stream::readsome(cudf::device_span destination, + rmm::cuda_stream_view stream) +{ + auto read_size = destination.size(); + + if (_host_buffer.size() < read_size) { _host_buffer.resize(read_size); } + + read_size = _source_stream.readsome(_host_buffer.data(), read_size); + + CUDA_TRY(cudaMemcpyAsync( // + destination.data(), + _host_buffer.data(), + read_size, + cudaMemcpyHostToDevice, + stream.value())); + + return read_size; +} + +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 09a6aa4053e..dcf440f54cb 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -1,4 +1,5 @@ #include +#include #include #include @@ -9,32 +10,71 @@ namespace { +__global__ void multibyte_split_kernel(cudf::device_span data) +{ + auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_idx < data.size()) { + printf("bid(%i) tid(%i) %c\n", + static_cast(blockIdx.x), + static_cast(threadIdx.x), + data[thread_idx]); + } } +} // namespace + namespace cudf { namespace io { namespace text { namespace detail { -std::unique_ptr multibyte_split(std::istream& input, +std::unique_ptr multibyte_split(cudf::io::text::input_stream& input, std::string delimeter, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + auto constexpr bytes_per_thread = 32; + auto constexpr threads_per_block = 1024; + auto constexpr blocks_per_pass = 1; + auto constexpr bytes_per_pass = bytes_per_thread * threads_per_block * blocks_per_pass; + + auto input_buffer_a = rmm::device_uvector(bytes_per_pass, stream); + auto stream_a = stream; + + auto input_buffer_b = rmm::device_uvector(bytes_per_pass, stream); + auto stream_b = stream; + + uint32_t bytes_read = 0; + + while (true) { + stream_a.synchronize(); + + auto bytes_read = input.readsome(input_buffer_a, stream_a); + + if (bytes_read == 0) { + break; // nothing left to process. + } + + multibyte_split_kernel<<>>( + cudf::device_span(input_buffer_a).first(bytes_read)); + + std::swap(stream_a, stream_b); + std::swap(input_buffer_a, input_buffer_b); + } + + stream_b.synchronize(); + CUDF_FAIL(); } } // namespace detail -std::unique_ptr multibyte_split(std::istream& input, +std::unique_ptr multibyte_split(cudf::io::text::input_stream& input, std::string delimeter, rmm::mr::device_memory_resource* mr) { - char c; - while (input.readsome(&c, 1) > 0) { std::cout << std::bitset<8>(c) << std::endl; } - std::cout << std::endl; - - CUDF_FAIL(); + return detail::multibyte_split(input, delimeter, {}, mr); } } // namespace text diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 209b5675a7e..d6035f53880 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -19,6 +19,8 @@ #include #include +#include + #include #include @@ -74,9 +76,10 @@ TEST_F(MultibyteSplitTest, Simple) "as", "delimeters."}; - auto input_stream = std::basic_istringstream(input); + auto input_stream = std::basic_istringstream(input); + auto input_stream_io = cudf::io::text::host_input_stream(input_stream); - auto out = cudf::io::text::multibyte_split(input_stream, separator); + auto out = cudf::io::text::multibyte_split(input_stream_io, separator); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); } From 3e06c1895019ca6b0a0eb844bb94da8b683132b7 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 2 Jul 2021 17:06:01 -0500 Subject: [PATCH 03/80] trie test scaffolding --- cpp/include/cudf/io/text/trie.hpp | 22 ++++++++++ cpp/src/io/text/multibyte_split.cu | 68 ++++++++++++++++++------------ cpp/tests/CMakeLists.txt | 3 ++ cpp/tests/io/text/trie_test.cpp | 52 +++++++++++++++++++++++ 4 files changed, 119 insertions(+), 26 deletions(-) create mode 100644 cpp/include/cudf/io/text/trie.hpp create mode 100644 cpp/tests/io/text/trie_test.cpp diff --git a/cpp/include/cudf/io/text/trie.hpp b/cpp/include/cudf/io/text/trie.hpp new file mode 100644 index 00000000000..f4a2fc5f150 --- /dev/null +++ b/cpp/include/cudf/io/text/trie.hpp @@ -0,0 +1,22 @@ +#include +#include + +namespace cudf { +namespace io { +namespace text { + +namespace { + +struct trie_builder_node { +}; + +} // namespace + +struct trie { + trie(std::string const& pattern) : trie(std::vector{pattern}) {} + trie(std::vector const& patterns) {} +}; + +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index dcf440f54cb..1b75d8a7155 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -1,5 +1,7 @@ #include #include +#include +#include #include #include @@ -10,15 +12,33 @@ namespace { +template +inline constexpr auto ceil_div(Dividend dividend, Divisor divisor) +{ + return dividend / divisor + (dividend % divisor != 0); +} + +struct trie_state { + uint8_t placeholder; +}; + +template __global__ void multibyte_split_kernel(cudf::device_span data) { auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + auto const data_begin = thread_idx * BYTES_PER_THREAD; + auto data_end = data_begin + BYTES_PER_THREAD; + + if (data_end > data.size()) { data_end = data.size(); } - if (thread_idx < data.size()) { - printf("bid(%i) tid(%i) %c\n", - static_cast(blockIdx.x), - static_cast(threadIdx.x), - data[thread_idx]); + if (data_end < data.size()) { // + printf("bid(%i) tid(%i) : whole\n", blockIdx.x, threadIdx.x); + } else if (data_begin < data.size()) { + printf("bid(%i) tid(%i) : partial\n", blockIdx.x, threadIdx.x); + } + + for (uint32_t i = data_begin; i < data_end; i++) { + printf("bid(%i) tid(%i) %3i: %c\n", blockIdx.x, threadIdx.x, i, data[i]); } } @@ -34,36 +54,32 @@ std::unique_ptr multibyte_split(cudf::io::text::input_stream& inpu rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto constexpr bytes_per_thread = 32; - auto constexpr threads_per_block = 1024; - auto constexpr blocks_per_pass = 1; - auto constexpr bytes_per_pass = bytes_per_thread * threads_per_block * blocks_per_pass; + auto constexpr BYTES_PER_THREAD = 32; + auto constexpr THREADS_PER_TILE = 256; + auto constexpr BYTES_PER_TILE = BYTES_PER_THREAD * THREADS_PER_TILE; + auto constexpr TILES_PER_CHUNK = 1024; + auto constexpr BYTES_PER_CHUNK = BYTES_PER_TILE * TILES_PER_CHUNK; - auto input_buffer_a = rmm::device_uvector(bytes_per_pass, stream); - auto stream_a = stream; + auto input_buffer = rmm::device_uvector(BYTES_PER_CHUNK, stream); + auto const input_span = cudf::device_span(input_buffer); - auto input_buffer_b = rmm::device_uvector(bytes_per_pass, stream); - auto stream_b = stream; - - uint32_t bytes_read = 0; + // TODO: call state initalization kernels while (true) { - stream_a.synchronize(); - - auto bytes_read = input.readsome(input_buffer_a, stream_a); + uint32_t num_bytes_read = input.readsome(input_span, stream); - if (bytes_read == 0) { - break; // nothing left to process. + if (num_bytes_read == 0) { + // if there's no more data to read, we're done. + break; } - multibyte_split_kernel<<>>( - cudf::device_span(input_buffer_a).first(bytes_read)); + auto num_tiles = ceil_div(num_bytes_read, BYTES_PER_TILE); - std::swap(stream_a, stream_b); - std::swap(input_buffer_a, input_buffer_b); + auto kernel = multibyte_split_kernel; + kernel<<>>(input_span.first(num_bytes_read)); } - stream_b.synchronize(); + // TODO: call state finalization kernels CUDF_FAIL(); } @@ -74,7 +90,7 @@ std::unique_ptr multibyte_split(cudf::io::text::input_stream& inpu std::string delimeter, rmm::mr::device_memory_resource* mr) { - return detail::multibyte_split(input, delimeter, {}, mr); + return detail::multibyte_split(input, delimeter, rmm::cuda_stream_default, mr); } } // namespace text diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d99e28c588c..dc074547234 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -198,6 +198,9 @@ endif() ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) +ConfigureTest(TRIE_TEST + io/text/trie_test.cpp) + ################################################################################################### # - sort tests ------------------------------------------------------------------------------------ ConfigureTest(SORT_TEST diff --git a/cpp/tests/io/text/trie_test.cpp b/cpp/tests/io/text/trie_test.cpp new file mode 100644 index 00000000000..bcc32e01b17 --- /dev/null +++ b/cpp/tests/io/text/trie_test.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +#include +#include + +#include + +#include + +using namespace cudf; +using namespace test; + +constexpr bool print_all{false}; + +struct TrieTest : public BaseFixture { +}; + +TEST_F(TrieTest, CanMatchSinglePattern) +{ + auto pattern = cudf::io::text::trie{"abac"}; + + (void)pattern; +} + +TEST_F(TrieTest, CanMatchMultiplePatterns) +{ + auto patterns = std::vector{"abac", "abad"}; + auto pattern = cudf::io::text::trie(patterns); + + (void)pattern; +} From ac14dbd2b3944fb160df28562ef269c987a14a75 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 7 Jul 2021 09:47:37 -0500 Subject: [PATCH 04/80] superstate + tests --- cpp/include/cudf/io/text/superstate.hpp | 129 ++++++++++++++++++++++++ cpp/tests/CMakeLists.txt | 7 +- cpp/tests/io/text/superstate_test.cpp | 128 +++++++++++++++++++++++ 3 files changed, 262 insertions(+), 2 deletions(-) create mode 100644 cpp/include/cudf/io/text/superstate.hpp create mode 100644 cpp/tests/io/text/superstate_test.cpp diff --git a/cpp/include/cudf/io/text/superstate.hpp b/cpp/include/cudf/io/text/superstate.hpp new file mode 100644 index 00000000000..3c6c31ffaa3 --- /dev/null +++ b/cpp/include/cudf/io/text/superstate.hpp @@ -0,0 +1,129 @@ +#pragma once + +#include +#include +#include + +namespace { + +constexpr unsigned floorlog2(unsigned x) { return x == 1 ? 0 : 1 + floorlog2(x >> 1); } + +constexpr unsigned ceillog2(unsigned x) { return x == 1 ? 0 : floorlog2(x - 1) + 1; } + +template +struct rep { +}; + +template +struct rep> { + using type = uint8_t; +}; + +template +struct rep> { + using type = uint16_t; +}; + +template +struct rep> { + using type = uint32_t; +}; + +template +struct rep> { + using type = uint64_t; +}; + +template +struct superstate_policy { + static_assert(N > 1 and N <= 16, "superstate supports no more than 16 unique states"); + static constexpr uint8_t BITS = ceillog2(N); + static constexpr uint8_t MASK = (1 << BITS) - 1; + using Data = typename rep::type; +}; + +} // namespace + +namespace cudf { +namespace io { +namespace text { + +template +struct superstate { + public: + static constexpr uint8_t BITS = superstate_policy::BITS; + static constexpr uint8_t MASK = superstate_policy::MASK; + + using Data = typename superstate_policy::Data; + using Index = uint8_t; + + private: + Data _data; + + public: + /** + * @brief creates a superstate which represents all possible states and + * applied transitions + */ + constexpr superstate() : _data(0) + { + for (auto i = 0; i < N; i++) { _data |= i << (i * BITS); } + } + + explicit inline constexpr superstate(Data data) : _data(data) {} + + inline constexpr Data data() const { return _data; } + + explicit inline constexpr operator State() const { return static_cast(_data & MASK); } + + inline constexpr State get(Index idx) const + { + return static_cast((_data >> idx * BITS) & MASK); + } + + inline constexpr void set(Index idx, State state) + { + // removing `& MASK` here may result in less instructions, but will result in UB. This may + // be a fine trade-off, as integer-overflow was never an intended use case. + _data |= (static_cast(state) & MASK) << idx * BITS; + } + + inline constexpr void reset(Index idx, State state) + { + _data &= ~(MASK << idx * BITS); + _data |= static_cast(state) << idx * BITS; + } + + template + inline constexpr superstate apply(BinaryOp const& op, RHS const& rhs) + { + superstate result(0); + for (uint8_t pre = 0; pre < N; pre++) { + auto const mid = get(pre); + auto const post = op(mid, rhs); + result.set(pre, post); + } + return result; + } +}; + +template +inline constexpr superstate operator+(superstate lhs, Instruction rhs) +{ + return lhs.apply( // + [](State state, Instruction rhs) { return state + rhs; }, + rhs); +} + +template +inline constexpr superstate operator+(superstate lhs, superstate rhs) +{ + using Index = typename superstate::Index; + return lhs.apply( // + [](State state, superstate rhs) { return rhs.get(static_cast(state)); }, + rhs); +} + +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index dc074547234..4076e997654 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -195,12 +195,15 @@ endif() ################################################################################################### # - io tests -------------------------------------------------------------------------------------- -ConfigureTest(MULTIBYTE_SPLIT_TEST - io/text/multibyte_split_test.cpp) +ConfigureTest(SUPERSTATE_TEST + io/text/superstate_test.cpp) ConfigureTest(TRIE_TEST io/text/trie_test.cpp) +ConfigureTest(MULTIBYTE_SPLIT_TEST + io/text/multibyte_split_test.cpp) + ################################################################################################### # - sort tests ------------------------------------------------------------------------------------ ConfigureTest(SORT_TEST diff --git a/cpp/tests/io/text/superstate_test.cpp b/cpp/tests/io/text/superstate_test.cpp new file mode 100644 index 00000000000..c59f8f4bd69 --- /dev/null +++ b/cpp/tests/io/text/superstate_test.cpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include + +enum class state : uint8_t { a, b, c, error }; +enum class instruction : uint8_t { inc, dec, swap_ac }; + +inline constexpr state operator+(state const& lhs, instruction const& rhs) +{ + switch (rhs) { + case instruction::inc: + switch (lhs) { + case state::a: return state::b; + case state::b: return state::c; + case state::c: return state::a; + case state::error: return state::error; + } + case instruction::dec: + switch (lhs) { + case state::a: return state::c; + case state::b: return state::a; + case state::c: return state::b; + case state::error: return state::error; + } + case instruction::swap_ac: + switch (lhs) { + case state::a: return state::c; + case state::b: return state::b; + case state::c: return state::a; + case state::error: return state::error; + } + } + + return state::error; +} + +using superstate = cudf::io::text::superstate<4, state>; + +struct SuperstateTest : public cudf::test::BaseFixture { +}; + +TEST_F(SuperstateTest, CanInitializeAllStates) +{ + auto value = superstate(); + + EXPECT_EQ(value.data(), 0b11100100); +} + +TEST_F(SuperstateTest, CanInitializeSpecificValue) +{ + auto value = superstate(0b01010101); + + EXPECT_EQ(value.data(), 0b01010101); +} + +TEST_F(SuperstateTest, CanTransitionExplicitly) +{ + auto value = superstate(); + + auto machine = [](state const& lhs, uint8_t const& rhs) { + return static_cast(static_cast(lhs) + rhs); + }; + + // this call test the overflow capability of individual states within a superstate. It is + // possible this becomes UB in the future, in which case this `TEST_F` should be removed. + value = value.apply(machine, 5); + + EXPECT_EQ(value.data(), 0b00111001); + EXPECT_EQ(value.get(0), static_cast(1)); +} + +TEST_F(SuperstateTest, CanTransitionAllStataes) +{ + auto value = superstate(); + + value = value + instruction::inc; + + EXPECT_EQ(value.data(), 0b11001001); + EXPECT_EQ(value.get(0), state::b); + + value = value + instruction::swap_ac; + + EXPECT_EQ(value.data(), 0b11100001); + EXPECT_EQ(value.get(0), state::b); + + value = value + instruction::dec; + + EXPECT_EQ(value.data(), 0b11011000); + EXPECT_EQ(value.get(0), state::a); +} + +TEST_F(SuperstateTest, CanConcatenateSuperstates) +{ + auto a = superstate() + instruction::inc + instruction::swap_ac; + auto b = superstate() + instruction::dec + instruction::swap_ac; + auto c = superstate() + instruction::swap_ac + instruction::inc; + + auto value = a + b + c; + auto expected = superstate() + // + instruction::inc + instruction::swap_ac + // + instruction::dec + instruction::swap_ac + // + instruction::swap_ac + instruction::inc; + + EXPECT_EQ(value.data(), expected.data()); +} + +CUDF_TEST_PROGRAM_MAIN() From ea8cee21a9c0d4b6f2c2eaa4de6a5a653f71b507 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 7 Jul 2021 10:51:14 -0500 Subject: [PATCH 05/80] added device trie --- cpp/include/cudf/io/text/trie.hpp | 128 ++++++++++++++++++++++++-- cpp/src/io/text/multibyte_split.cu | 7 ++ cpp/tests/io/text/superstate_test.cpp | 6 +- cpp/tests/io/text/trie_test.cpp | 4 +- 4 files changed, 133 insertions(+), 12 deletions(-) diff --git a/cpp/include/cudf/io/text/trie.hpp b/cpp/include/cudf/io/text/trie.hpp index f4a2fc5f150..827f30a3522 100644 --- a/cpp/include/cudf/io/text/trie.hpp +++ b/cpp/include/cudf/io/text/trie.hpp @@ -1,20 +1,136 @@ +#include +#include + +#include #include #include -namespace cudf { -namespace io { -namespace text { - namespace { struct trie_builder_node { + bool is_accepting; + std::unordered_map> children; + + void insert(std::string s) { insert(s.c_str(), s.size()); } + + trie_builder_node& insert(char const* s, uint16_t size) + { + if (size == 0) { + is_accepting = true; + return *this; + } + + if (children[*s] == nullptr) { children[*s] = std::make_unique(); } + + return children[*s]->insert(s + 1, size - 1); + } }; } // namespace +namespace cudf { +namespace io { +namespace text { + +struct trie_device_view { + uint16_t const* layer_offsets; + char const* tokens; + uint16_t const* transitions; + bool const* accepting; +}; + struct trie { - trie(std::string const& pattern) : trie(std::vector{pattern}) {} - trie(std::vector const& patterns) {} + // could compress all of this to 32 bits without major perf reduction: + // 1) merge accepting state in to the most significant bit of the + // corrosponding transition, and use a mask to access both values. 2) change + // layer_offsets to uint8_t, max string length would be 253 2^8-3 (two values + // reserved: empty string, and error state) + private: + rmm::device_uvector _layer_offsets; + rmm::device_uvector _tokens; + rmm::device_uvector _transitions; + rmm::device_uvector _accepting; + + public: + trie(rmm::device_uvector&& layer_offsets, + rmm::device_uvector&& tokens, + rmm::device_uvector&& transitions, + rmm::device_uvector&& accepting) + : _layer_offsets(std::move(layer_offsets)), + _tokens(std::move(tokens)), + _transitions(std::move(transitions)), + _accepting(std::move(_accepting)) + { + } + + static trie create(std::string const& pattern, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + + { + return create(std::vector{pattern}, stream, mr); + } + + static trie create(std::vector const& patterns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + { + std::vector layer_offsets; + std::vector tokens; + std::vector transitions; + std::vector accepting; + + // create the trie tree + auto root = std::make_unique(); + for (auto& pattern : patterns) { root->insert(pattern); } + + // flatten + auto sum = 1; + layer_offsets.emplace_back(0); + transitions.emplace_back(sum); + accepting.emplace_back(root->is_accepting); + + auto nodes = std::queue>(); + nodes.push(std::move(root)); + + while (nodes.size()) { + layer_offsets.emplace_back(sum); + auto layer_size = nodes.size(); + for (uint32_t i = 0; i < layer_size; i++) { + auto node = std::move(nodes.front()); + nodes.pop(); + sum += node->children.size(); + transitions.emplace_back(sum); + for (auto& item : node->children) { + accepting.emplace_back(item.second->is_accepting); + tokens.emplace_back(item.first); + nodes.push(std::move(item.second)); + } + } + } + + accepting.emplace_back(false); + + // allocate device memory + + auto device_layer_offsets = rmm::device_uvector(layer_offsets.size(), stream, mr); + auto device_tokens = rmm::device_uvector(tokens.size(), stream, mr); + auto device_transitions = rmm::device_uvector(transitions.size(), stream, mr); + auto device_accepting = rmm::device_uvector(accepting.size(), stream, mr); + + // TODO: copy host buffers to device + + return trie{std::move(device_layer_offsets), + std::move(device_tokens), + std::move(device_transitions), + std::move(device_accepting)}; + } + + trie_device_view view() const + { + return trie_device_view{ + _layer_offsets.data(), _tokens.data(), _transitions.data(), _accepting.data()}; + } }; } // namespace text diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 1b75d8a7155..386f60f2030 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -29,6 +30,8 @@ __global__ void multibyte_split_kernel(cudf::device_span data) auto const data_begin = thread_idx * BYTES_PER_THREAD; auto data_end = data_begin + BYTES_PER_THREAD; + // superstate<16> match_state; + if (data_end > data.size()) { data_end = data.size(); } if (data_end < data.size()) { // @@ -39,7 +42,11 @@ __global__ void multibyte_split_kernel(cudf::device_span data) for (uint32_t i = data_begin; i < data_end; i++) { printf("bid(%i) tid(%i) %3i: %c\n", blockIdx.x, threadIdx.x, i, data[i]); + + // match_state = match_state.apply(machine, data[i]); } + + // match_state is now the block-partial reduction, so we should set it. } } // namespace diff --git a/cpp/tests/io/text/superstate_test.cpp b/cpp/tests/io/text/superstate_test.cpp index c59f8f4bd69..9120eb620a7 100644 --- a/cpp/tests/io/text/superstate_test.cpp +++ b/cpp/tests/io/text/superstate_test.cpp @@ -14,14 +14,12 @@ * limitations under the License. */ -#include +#include #include #include +#include #include -#include - -#include enum class state : uint8_t { a, b, c, error }; enum class instruction : uint8_t { inc, dec, swap_ac }; diff --git a/cpp/tests/io/text/trie_test.cpp b/cpp/tests/io/text/trie_test.cpp index bcc32e01b17..1fbecd6e905 100644 --- a/cpp/tests/io/text/trie_test.cpp +++ b/cpp/tests/io/text/trie_test.cpp @@ -38,7 +38,7 @@ struct TrieTest : public BaseFixture { TEST_F(TrieTest, CanMatchSinglePattern) { - auto pattern = cudf::io::text::trie{"abac"}; + auto pattern = cudf::io::text::trie::create("abac", {}); (void)pattern; } @@ -46,7 +46,7 @@ TEST_F(TrieTest, CanMatchSinglePattern) TEST_F(TrieTest, CanMatchMultiplePatterns) { auto patterns = std::vector{"abac", "abad"}; - auto pattern = cudf::io::text::trie(patterns); + auto pattern = cudf::io::text::trie::create(patterns, {}); (void)pattern; } From a4a8dd092a95a8bd95aafc812357a2a8d616a11c Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 7 Jul 2021 13:13:20 -0500 Subject: [PATCH 06/80] add superstate to multibyte_split --- cpp/include/cudf/io/text/trie.hpp | 54 +++++++++++++++++++++- cpp/src/io/text/multibyte_split.cu | 22 +++++++-- cpp/tests/io/text/multibyte_split_test.cpp | 4 +- 3 files changed, 72 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/io/text/trie.hpp b/cpp/include/cudf/io/text/trie.hpp index 827f30a3522..4f56b55905c 100644 --- a/cpp/include/cudf/io/text/trie.hpp +++ b/cpp/include/cudf/io/text/trie.hpp @@ -37,6 +37,30 @@ struct trie_device_view { char const* tokens; uint16_t const* transitions; bool const* accepting; + + inline constexpr uint16_t transition(uint16_t idx, char c) + { + auto pos = transitions[idx]; + auto end = transitions[idx + 1]; + while (pos < end) { + if (c == tokens[pos - 1]) { return pos; } + pos++; + } + + return transition_init(c); + } + + inline constexpr uint16_t transition_init(char c) + { + auto pos = transitions[0]; + auto end = transitions[1]; + while (pos < end) { + if (c == tokens[pos - 1]) { return pos; } + pos++; + } + + return 0; + } }; struct trie { @@ -78,7 +102,7 @@ struct trie { std::vector layer_offsets; std::vector tokens; std::vector transitions; - std::vector accepting; + std::vector accepting; // create the trie tree auto root = std::make_unique(); @@ -118,7 +142,33 @@ struct trie { auto device_transitions = rmm::device_uvector(transitions.size(), stream, mr); auto device_accepting = rmm::device_uvector(accepting.size(), stream, mr); - // TODO: copy host buffers to device + // copy host buffers to device + + RMM_CUDA_TRY(cudaMemcpyAsync(device_layer_offsets.data(), + layer_offsets.data(), + layer_offsets.size() * sizeof(uint16_t), + cudaMemcpyDefault, + stream.value())); + + RMM_CUDA_TRY(cudaMemcpyAsync(device_tokens.data(), + tokens.data(), + tokens.size() * sizeof(char), + cudaMemcpyDefault, + stream.value())); + + RMM_CUDA_TRY(cudaMemcpyAsync(device_transitions.data(), + transitions.data(), + transitions.size() * sizeof(uint16_t), + cudaMemcpyDefault, + stream.value())); + + RMM_CUDA_TRY(cudaMemcpyAsync(device_accepting.data(), + accepting.data(), + accepting.size() * sizeof(bool), + cudaMemcpyDefault, + stream.value())); + + // create owning container return trie{std::move(device_layer_offsets), std::move(device_tokens), diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 386f60f2030..bcb6cceb33b 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -24,13 +24,14 @@ struct trie_state { }; template -__global__ void multibyte_split_kernel(cudf::device_span data) +__global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, + cudf::device_span data) { auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; auto const data_begin = thread_idx * BYTES_PER_THREAD; auto data_end = data_begin + BYTES_PER_THREAD; - // superstate<16> match_state; + cudf::io::text::superstate<16> x; if (data_end > data.size()) { data_end = data.size(); } @@ -40,8 +41,18 @@ __global__ void multibyte_split_kernel(cudf::device_span data) printf("bid(%i) tid(%i) : partial\n", blockIdx.x, threadIdx.x); } + auto machine = [&](uint8_t const& state, char const& byte) { + return trie.transition(state, byte); + }; + for (uint32_t i = data_begin; i < data_end; i++) { - printf("bid(%i) tid(%i) %3i: %c\n", blockIdx.x, threadIdx.x, i, data[i]); + x = x.apply(machine, data[i]); + printf("bid(%i) tid(%i) %3i: %c - %u\n", + blockIdx.x, + threadIdx.x, + i, + data[i], + static_cast(x.get(0))); // match_state = match_state.apply(machine, data[i]); } @@ -72,6 +83,8 @@ std::unique_ptr multibyte_split(cudf::io::text::input_stream& inpu // TODO: call state initalization kernels + auto const trie = cudf::io::text::trie::create(delimeter, stream); + while (true) { uint32_t num_bytes_read = input.readsome(input_span, stream); @@ -83,7 +96,8 @@ std::unique_ptr multibyte_split(cudf::io::text::input_stream& inpu auto num_tiles = ceil_div(num_bytes_read, BYTES_PER_TILE); auto kernel = multibyte_split_kernel; - kernel<<>>(input_span.first(num_bytes_read)); + kernel<<>>(trie.view(), + input_span.first(num_bytes_read)); } // TODO: call state finalization kernels diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index d6035f53880..6c27cfa6270 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -44,7 +44,7 @@ TEST_F(MultibyteSplitTest, Simple) std::string input = "here😎" "is😎" - "some😎" + "another😎" "simple😎" "text😎" "seperated😎" @@ -61,7 +61,7 @@ TEST_F(MultibyteSplitTest, Simple) auto expected = strings_column_wrapper{"here", "is", - "some", + "another", "simple", "text", "seperated", From 094d2d25bbdb8edbbf7937aa31984f14e1f15425 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 8 Jul 2021 16:14:01 -0500 Subject: [PATCH 07/80] cub block scan superstates --- cpp/include/cudf/io/text/superstate.hpp | 20 +++-- cpp/src/io/text/multibyte_split.cu | 105 ++++++++++++++++++------ 2 files changed, 94 insertions(+), 31 deletions(-) diff --git a/cpp/include/cudf/io/text/superstate.hpp b/cpp/include/cudf/io/text/superstate.hpp index 3c6c31ffaa3..c1a78ddd389 100644 --- a/cpp/include/cudf/io/text/superstate.hpp +++ b/cpp/include/cudf/io/text/superstate.hpp @@ -105,23 +105,31 @@ struct superstate { } return result; } + + template + inline constexpr superstate apply(BinaryOp const& op) + { + superstate result(0); + for (uint8_t pre = 0; pre < N; pre++) { + auto const mid = get(pre); + auto const post = op(mid); + result.set(pre, post); + } + return result; + } }; template inline constexpr superstate operator+(superstate lhs, Instruction rhs) { - return lhs.apply( // - [](State state, Instruction rhs) { return state + rhs; }, - rhs); + return lhs.apply([&](State state) { return state + rhs; }); } template inline constexpr superstate operator+(superstate lhs, superstate rhs) { using Index = typename superstate::Index; - return lhs.apply( // - [](State state, superstate rhs) { return rhs.get(static_cast(state)); }, - rhs); + return lhs.apply([&](State state) { return rhs.get(static_cast(state)); }); } } // namespace text diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index bcb6cceb33b..9f1d5773adc 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -7,6 +7,8 @@ #include #include +#include + #include #include #include @@ -23,41 +25,100 @@ struct trie_state { uint8_t placeholder; }; +using superstate = cudf::io::text::superstate<16>; + +auto constexpr BYTES_PER_THREAD = 8; +auto constexpr THREADS_PER_TILE = 256; +auto constexpr BYTES_PER_TILE = BYTES_PER_THREAD * THREADS_PER_TILE; +auto constexpr TILES_PER_CHUNK = 1024; +auto constexpr BYTES_PER_CHUNK = BYTES_PER_TILE * TILES_PER_CHUNK; + +// multibyte_split works by splitting up inputs in to 32 inputs (bytes) per thread, and transforming +// them in to data structures called "superstates". these superstates are created by searching a +// trie, but instead of a tradition trie where the search begins at a single node at the beginning, +// we allow our search to begin anywhere within the trie tree. The position within the trie tree is +// stored as a "partial match path", which indicates "we can get from here to there by a set of +// specific transitions". By scanning together superstates, we effectively know "we can get here +// from the beginning by following the inputs". By doing this, each thread knows exactly what state +// it begins in. From there, each thread can then take deterministic action. In this case, the +// deterministic action is counting and outputting delimiter offsets when a delimiter is found. + +struct BlockPrefixCallbackOp { + // Running prefix + superstate running_total; + // Constructor + __device__ BlockPrefixCallbackOp(superstate running_total) : running_total(running_total) {} + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide scan. + __device__ superstate operator()(superstate const& block_aggregate) + { + superstate old_prefix = running_total; + running_total = old_prefix + block_aggregate; + return old_prefix; + } +}; + template __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, cudf::device_span data) { + typedef cub::BlockScan BlockScan; + + __shared__ union { + typename BlockScan::TempStorage scan; + } temp_storage; + auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; auto const data_begin = thread_idx * BYTES_PER_THREAD; auto data_end = data_begin + BYTES_PER_THREAD; - cudf::io::text::superstate<16> x; - if (data_end > data.size()) { data_end = data.size(); } - if (data_end < data.size()) { // - printf("bid(%i) tid(%i) : whole\n", blockIdx.x, threadIdx.x); - } else if (data_begin < data.size()) { - printf("bid(%i) tid(%i) : partial\n", blockIdx.x, threadIdx.x); + superstate thread_data[BYTES_PER_THREAD]; + + for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { + auto const element_idx = data_begin + i; + if (element_idx >= data.size()) { + thread_data[i] = superstate(); + } else { + thread_data[i] = superstate().apply([&](uint8_t state) { // + return trie.transition(state, data[element_idx]); + }); + } } - auto machine = [&](uint8_t const& state, char const& byte) { - return trie.transition(state, byte); - }; + BlockPrefixCallbackOp prefix_op({}); + + __syncthreads(); + + BlockScan(temp_storage.scan) + .InclusiveScan( // + thread_data, + thread_data, + [](superstate const& lhs, superstate const& rhs) { return lhs + rhs; }, + prefix_op); - for (uint32_t i = data_begin; i < data_end; i++) { - x = x.apply(machine, data[i]); - printf("bid(%i) tid(%i) %3i: %c - %u\n", - blockIdx.x, - threadIdx.x, - i, - data[i], - static_cast(x.get(0))); + __syncthreads(); - // match_state = match_state.apply(machine, data[i]); + if (data_end < data.size()) { // + printf("bid(%2i) tid(%2i) : whole\n", blockIdx.x, threadIdx.x); + } else if (data_begin < data.size()) { + printf("bid(%2i) tid(%2i) : partial\n", blockIdx.x, threadIdx.x); } - // match_state is now the block-partial reduction, so we should set it. + for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { + auto const element_idx = thread_idx * BYTES_PER_THREAD + i; + if (element_idx >= data.size()) { + break; + } else { + printf("bid(%2i) tid(%2i) %3i: %c - %u\n", + blockIdx.x, + threadIdx.x, + i, + data[data_begin + i], + static_cast(thread_data[i].get(0))); + } + } } } // namespace @@ -72,12 +133,6 @@ std::unique_ptr multibyte_split(cudf::io::text::input_stream& inpu rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto constexpr BYTES_PER_THREAD = 32; - auto constexpr THREADS_PER_TILE = 256; - auto constexpr BYTES_PER_TILE = BYTES_PER_THREAD * THREADS_PER_TILE; - auto constexpr TILES_PER_CHUNK = 1024; - auto constexpr BYTES_PER_CHUNK = BYTES_PER_TILE * TILES_PER_CHUNK; - auto input_buffer = rmm::device_uvector(BYTES_PER_CHUNK, stream); auto const input_span = cudf::device_span(input_buffer); From 1117ab853a9a09b69b670d1527f199492de4f039 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 9 Jul 2021 00:39:11 -0500 Subject: [PATCH 08/80] block-wide superstate matching --- cpp/src/io/text/multibyte_split.cu | 66 +++++++++++++++++++--- cpp/tests/io/text/multibyte_split_test.cpp | 32 +++++------ 2 files changed, 72 insertions(+), 26 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 9f1d5773adc..fd0c275a1de 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -91,6 +91,35 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, __syncthreads(); + for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { + auto const element_idx = thread_idx * BYTES_PER_THREAD + i; + if (element_idx < data.size()) { + printf( + "bid(%2i) tid(%2i) %3i: %c - %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u " + "%2u\n", + blockIdx.x, + threadIdx.x, + i, + data[data_begin + i], + static_cast(thread_data[i].get(0)), + static_cast(thread_data[i].get(1)), + static_cast(thread_data[i].get(2)), + static_cast(thread_data[i].get(3)), + static_cast(thread_data[i].get(4)), + static_cast(thread_data[i].get(5)), + static_cast(thread_data[i].get(6)), + static_cast(thread_data[i].get(7)), + static_cast(thread_data[i].get(8)), + static_cast(thread_data[i].get(9)), + static_cast(thread_data[i].get(10)), + static_cast(thread_data[i].get(11)), + static_cast(thread_data[i].get(12)), + static_cast(thread_data[i].get(13)), + static_cast(thread_data[i].get(14)), + static_cast(thread_data[i].get(15))); + } + } + BlockScan(temp_storage.scan) .InclusiveScan( // thread_data, @@ -108,17 +137,36 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { auto const element_idx = thread_idx * BYTES_PER_THREAD + i; - if (element_idx >= data.size()) { - break; - } else { - printf("bid(%2i) tid(%2i) %3i: %c - %u\n", - blockIdx.x, - threadIdx.x, - i, - data[data_begin + i], - static_cast(thread_data[i].get(0))); + if (element_idx < data.size()) { + printf( + "bid(%2i) tid(%2i) %3i: %c - %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u " + "%2u\n", + blockIdx.x, + threadIdx.x, + i, + data[data_begin + i], + static_cast(thread_data[i].get(0)), + static_cast(thread_data[i].get(1)), + static_cast(thread_data[i].get(2)), + static_cast(thread_data[i].get(3)), + static_cast(thread_data[i].get(4)), + static_cast(thread_data[i].get(5)), + static_cast(thread_data[i].get(6)), + static_cast(thread_data[i].get(7)), + static_cast(thread_data[i].get(8)), + static_cast(thread_data[i].get(9)), + static_cast(thread_data[i].get(10)), + static_cast(thread_data[i].get(11)), + static_cast(thread_data[i].get(12)), + static_cast(thread_data[i].get(13)), + static_cast(thread_data[i].get(14)), + static_cast(thread_data[i].get(15))); } } + + // every thread and every value on every thread now knows it's actual state. + + // but we still need each thread to know it's next match... } } // namespace diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 6c27cfa6270..8bde56af573 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -40,8 +40,17 @@ struct MultibyteSplitTest : public BaseFixture { TEST_F(MultibyteSplitTest, Simple) { - std::string separator = "😎"; // F0 9F 98 8E | 11110000 11111001 1100010 11101000 + std::string separator = "😎deli"; // F0 9F 98 8E | 11110000 11111001 1100010 11101000 std::string input = + "aaa😎" + "bbb😎" + "ccc😎" + "ddd😎" + "eee😎" + "fff😎" + "ggg😎" + "hhh😎" + "___😎" "here😎" "is😎" "another😎" @@ -59,22 +68,11 @@ TEST_F(MultibyteSplitTest, Simple) "as😎" "delimeters."; - auto expected = strings_column_wrapper{"here", - "is", - "another", - "simple", - "text", - "seperated", - "by", - "emojis", - "which", - "are", - "multple", - "bytes", - "and", - "used", - "as", - "delimeters."}; + auto expected = strings_column_wrapper{ + "aaa", "bbb", "ccc", "ddd", "eee", "fff", "ggg", "hhh", "___", + "here", "is", "another", "simple", "text", "seperated", "by", "emojis", "which", + "are", "multple", "bytes", "and", "used", "as", "delimeters.", + }; auto input_stream = std::basic_istringstream(input); auto input_stream_io = cudf::io::text::host_input_stream(input_stream); From 51b1444693b841483c28caa54fe4c30ebdec8b57 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 9 Jul 2021 01:33:50 -0500 Subject: [PATCH 09/80] fix superstate constructor bug where only the first 8 states were initialized --- cpp/include/cudf/io/text/superstate.hpp | 2 +- cpp/src/io/text/multibyte_split.cu | 31 +--------------------- cpp/tests/io/text/multibyte_split_test.cpp | 2 +- 3 files changed, 3 insertions(+), 32 deletions(-) diff --git a/cpp/include/cudf/io/text/superstate.hpp b/cpp/include/cudf/io/text/superstate.hpp index c1a78ddd389..7f5c43a005c 100644 --- a/cpp/include/cudf/io/text/superstate.hpp +++ b/cpp/include/cudf/io/text/superstate.hpp @@ -67,7 +67,7 @@ struct superstate { */ constexpr superstate() : _data(0) { - for (auto i = 0; i < N; i++) { _data |= i << (i * BITS); } + for (auto i = 0; i < N; i++) { _data |= static_cast(i) << (i * BITS); } } explicit inline constexpr superstate(Data data) : _data(data) {} diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index fd0c275a1de..22eb6a2941f 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -91,35 +91,6 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, __syncthreads(); - for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { - auto const element_idx = thread_idx * BYTES_PER_THREAD + i; - if (element_idx < data.size()) { - printf( - "bid(%2i) tid(%2i) %3i: %c - %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u " - "%2u\n", - blockIdx.x, - threadIdx.x, - i, - data[data_begin + i], - static_cast(thread_data[i].get(0)), - static_cast(thread_data[i].get(1)), - static_cast(thread_data[i].get(2)), - static_cast(thread_data[i].get(3)), - static_cast(thread_data[i].get(4)), - static_cast(thread_data[i].get(5)), - static_cast(thread_data[i].get(6)), - static_cast(thread_data[i].get(7)), - static_cast(thread_data[i].get(8)), - static_cast(thread_data[i].get(9)), - static_cast(thread_data[i].get(10)), - static_cast(thread_data[i].get(11)), - static_cast(thread_data[i].get(12)), - static_cast(thread_data[i].get(13)), - static_cast(thread_data[i].get(14)), - static_cast(thread_data[i].get(15))); - } - } - BlockScan(temp_storage.scan) .InclusiveScan( // thread_data, @@ -136,7 +107,7 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, } for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { - auto const element_idx = thread_idx * BYTES_PER_THREAD + i; + auto const element_idx = data_begin + i; if (element_idx < data.size()) { printf( "bid(%2i) tid(%2i) %3i: %c - %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u " diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 8bde56af573..218e36ed3f2 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -40,7 +40,7 @@ struct MultibyteSplitTest : public BaseFixture { TEST_F(MultibyteSplitTest, Simple) { - std::string separator = "😎deli"; // F0 9F 98 8E | 11110000 11111001 1100010 11101000 + std::string separator = "😎delimeters."; // F0 9F 98 8E | 11110000 11111001 1100010 11101000 std::string input = "aaa😎" "bbb😎" From d1f7eb3e99086989820c4986904ce8f73abb555b Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 9 Jul 2021 02:06:24 -0500 Subject: [PATCH 10/80] multibyte_split multiple delimeter support --- cpp/include/cudf/io/text/multibyte_split.hpp | 2 +- cpp/include/cudf/io/text/trie.hpp | 4 +- cpp/src/io/text/multibyte_split.cu | 49 +++++++++----------- cpp/tests/io/text/multibyte_split_test.cpp | 40 ++++++++-------- 4 files changed, 46 insertions(+), 49 deletions(-) diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index 3de019db8f3..20c93b3b7de 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -14,7 +14,7 @@ namespace text { std::unique_ptr multibyte_split( cudf::io::text::input_stream& input, - std::string delimeter, + std::vector const& delimeters, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } diff --git a/cpp/include/cudf/io/text/trie.hpp b/cpp/include/cudf/io/text/trie.hpp index 4f56b55905c..1e90667e159 100644 --- a/cpp/include/cudf/io/text/trie.hpp +++ b/cpp/include/cudf/io/text/trie.hpp @@ -61,6 +61,8 @@ struct trie_device_view { return 0; } + + inline constexpr bool is_match(uint16_t idx) { return accepting[idx]; } }; struct trie { @@ -83,7 +85,7 @@ struct trie { : _layer_offsets(std::move(layer_offsets)), _tokens(std::move(tokens)), _transitions(std::move(transitions)), - _accepting(std::move(_accepting)) + _accepting(std::move(accepting)) { } diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 22eb6a2941f..bf546555f30 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -28,7 +28,7 @@ struct trie_state { using superstate = cudf::io::text::superstate<16>; auto constexpr BYTES_PER_THREAD = 8; -auto constexpr THREADS_PER_TILE = 256; +auto constexpr THREADS_PER_TILE = 32; auto constexpr BYTES_PER_TILE = BYTES_PER_THREAD * THREADS_PER_TILE; auto constexpr TILES_PER_CHUNK = 1024; auto constexpr BYTES_PER_CHUNK = BYTES_PER_TILE * TILES_PER_CHUNK; @@ -107,31 +107,24 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, } for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { + auto const real_state = thread_data[i].get(0); auto const element_idx = data_begin + i; if (element_idx < data.size()) { - printf( - "bid(%2i) tid(%2i) %3i: %c - %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u " - "%2u\n", - blockIdx.x, - threadIdx.x, - i, - data[data_begin + i], - static_cast(thread_data[i].get(0)), - static_cast(thread_data[i].get(1)), - static_cast(thread_data[i].get(2)), - static_cast(thread_data[i].get(3)), - static_cast(thread_data[i].get(4)), - static_cast(thread_data[i].get(5)), - static_cast(thread_data[i].get(6)), - static_cast(thread_data[i].get(7)), - static_cast(thread_data[i].get(8)), - static_cast(thread_data[i].get(9)), - static_cast(thread_data[i].get(10)), - static_cast(thread_data[i].get(11)), - static_cast(thread_data[i].get(12)), - static_cast(thread_data[i].get(13)), - static_cast(thread_data[i].get(14)), - static_cast(thread_data[i].get(15))); + if (trie.is_match(real_state)) { + printf("bid(%2i) tid(%2i) %3i: %c - %2u MATCH\n", + blockIdx.x, + threadIdx.x, + i, + data[data_begin + i], + static_cast(real_state)); + } else { + printf("bid(%2i) tid(%2i) %3i: %c - %2u\n", + blockIdx.x, + threadIdx.x, + i, + data[data_begin + i], + static_cast(real_state)); + } } } @@ -148,7 +141,7 @@ namespace text { namespace detail { std::unique_ptr multibyte_split(cudf::io::text::input_stream& input, - std::string delimeter, + std::vector const& delimeters, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -157,7 +150,7 @@ std::unique_ptr multibyte_split(cudf::io::text::input_stream& inpu // TODO: call state initalization kernels - auto const trie = cudf::io::text::trie::create(delimeter, stream); + auto const trie = cudf::io::text::trie::create(delimeters, stream); while (true) { uint32_t num_bytes_read = input.readsome(input_span, stream); @@ -182,10 +175,10 @@ std::unique_ptr multibyte_split(cudf::io::text::input_stream& inpu } // namespace detail std::unique_ptr multibyte_split(cudf::io::text::input_stream& input, - std::string delimeter, + std::vector const& delimeters, rmm::mr::device_memory_resource* mr) { - return detail::multibyte_split(input, delimeter, rmm::cuda_stream_default, mr); + return detail::multibyte_split(input, delimeters, rmm::cuda_stream_default, mr); } } // namespace text diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 218e36ed3f2..f8209ca11ba 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -40,29 +40,31 @@ struct MultibyteSplitTest : public BaseFixture { TEST_F(MultibyteSplitTest, Simple) { - std::string separator = "😎delimeters."; // F0 9F 98 8E | 11110000 11111001 1100010 11101000 + // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 + // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 + auto separators = std::vector({"😀", "😎", ",", "::"}); std::string input = - "aaa😎" - "bbb😎" - "ccc😎" - "ddd😎" - "eee😎" - "fff😎" - "ggg😎" - "hhh😎" - "___😎" - "here😎" - "is😎" - "another😎" - "simple😎" + "aaa😀" + "bbb😀" + "ccc😀" + "ddd😀" + "eee😀" + "fff::" + "ggg😀" + "hhh😀" + "___," + "here," + "is," + "another," + "simple😀" "text😎" "seperated😎" "by😎" - "emojis😎" - "which😎" + "emojis," + "which," "are😎" - "multple😎" - "bytes😎" + "multiple," + "bytes::" "and😎" "used😎" "as😎" @@ -77,7 +79,7 @@ TEST_F(MultibyteSplitTest, Simple) auto input_stream = std::basic_istringstream(input); auto input_stream_io = cudf::io::text::host_input_stream(input_stream); - auto out = cudf::io::text::multibyte_split(input_stream_io, separator); + auto out = cudf::io::text::multibyte_split(input_stream_io, separators); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); } From a628d737f0561be5e737362bf152607a6c72c4dd Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 9 Jul 2021 16:42:25 -0500 Subject: [PATCH 11/80] scan output-offsets in multibyte_split --- cpp/src/io/text/multibyte_split.cu | 103 ++++++++++++++++++----------- 1 file changed, 63 insertions(+), 40 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index bf546555f30..546744e0818 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -33,16 +34,6 @@ auto constexpr BYTES_PER_TILE = BYTES_PER_THREAD * THREADS_PER_TILE; auto constexpr TILES_PER_CHUNK = 1024; auto constexpr BYTES_PER_CHUNK = BYTES_PER_TILE * TILES_PER_CHUNK; -// multibyte_split works by splitting up inputs in to 32 inputs (bytes) per thread, and transforming -// them in to data structures called "superstates". these superstates are created by searching a -// trie, but instead of a tradition trie where the search begins at a single node at the beginning, -// we allow our search to begin anywhere within the trie tree. The position within the trie tree is -// stored as a "partial match path", which indicates "we can get from here to there by a set of -// specific transitions". By scanning together superstates, we effectively know "we can get here -// from the beginning by following the inputs". By doing this, each thread knows exactly what state -// it begins in. From there, each thread can then take deterministic action. In this case, the -// deterministic action is counting and outputting delimiter offsets when a delimiter is found. - struct BlockPrefixCallbackOp { // Running prefix superstate running_total; @@ -56,16 +47,37 @@ struct BlockPrefixCallbackOp { running_total = old_prefix + block_aggregate; return old_prefix; } + + static rmm::device_uvector create_temp_storage(uint32_t num_elements, + rmm::cuda_stream_view stream) + { + auto num_prefixes = ceil_div(num_elements, BYTES_PER_TILE); + + return rmm::device_uvector(num_prefixes, stream); + } }; +// multibyte_split works by splitting up inputs in to 32 inputs (bytes) per thread, and transforming +// them in to data structures called "superstates". these superstates are created by searching a +// trie, but instead of a tradition trie where the search begins at a single node at the beginning, +// we allow our search to begin anywhere within the trie tree. The position within the trie tree is +// stored as a "partial match path", which indicates "we can get from here to there by a set of +// specific transitions". By scanning together superstates, we effectively know "we can get here +// from the beginning by following the inputs". By doing this, each thread knows exactly what state +// it begins in. From there, each thread can then take deterministic action. In this case, the +// deterministic action is counting and outputting delimiter offsets when a delimiter is found. + template __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, - cudf::device_span data) + cudf::device_span data, + uint32_t* result_count) { - typedef cub::BlockScan BlockScan; + typedef cub::BlockScan SuperstateBlockScan; + typedef cub::BlockScan ResultOffsetBlockScan; __shared__ union { - typename BlockScan::TempStorage scan; + typename SuperstateBlockScan::TempStorage superstate_scan; + typename ResultOffsetBlockScan::TempStorage result_offset_scan; } temp_storage; auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -79,6 +91,8 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { auto const element_idx = data_begin + i; if (element_idx >= data.size()) { + // this check is not necessary if we gaurantee no OOB accesses, which we can do because of + // the batch-read/batch-process approach. Keeping the check in for now, though. thread_data[i] = superstate(); } else { thread_data[i] = superstate().apply([&](uint8_t state) { // @@ -91,7 +105,7 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, __syncthreads(); - BlockScan(temp_storage.scan) + SuperstateBlockScan(temp_storage.superstate_scan) .InclusiveScan( // thread_data, thread_data, @@ -100,37 +114,36 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, __syncthreads(); - if (data_end < data.size()) { // - printf("bid(%2i) tid(%2i) : whole\n", blockIdx.x, threadIdx.x); - } else if (data_begin < data.size()) { - printf("bid(%2i) tid(%2i) : partial\n", blockIdx.x, threadIdx.x); - } + uint32_t thread_offsets[BYTES_PER_THREAD]; for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { - auto const real_state = thread_data[i].get(0); auto const element_idx = data_begin + i; if (element_idx < data.size()) { - if (trie.is_match(real_state)) { - printf("bid(%2i) tid(%2i) %3i: %c - %2u MATCH\n", - blockIdx.x, - threadIdx.x, - i, - data[data_begin + i], - static_cast(real_state)); - } else { - printf("bid(%2i) tid(%2i) %3i: %c - %2u\n", - blockIdx.x, - threadIdx.x, - i, - data[data_begin + i], - static_cast(real_state)); - } + thread_offsets[i] = trie.is_match(thread_data[i].get(0)); + } else { + thread_offsets[i] = false; } } - // every thread and every value on every thread now knows it's actual state. - - // but we still need each thread to know it's next match... + uint32_t matches_in_block; + + ResultOffsetBlockScan(temp_storage.result_offset_scan) + .ExclusiveScan( + thread_offsets, + thread_offsets, + [](uint32_t const& lhs, uint32_t const& rhs) { return lhs + rhs; }, + matches_in_block); + + if (threadIdx.x == 0) { *result_count = matches_in_block; } + + // for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { + // auto const element_idx = data_begin + i; + // if (element_idx < data.size()) { + // thread_offsets[i] = trie.is_match(thread_data[i].get(0)); + // } else { + // thread_offsets[i] = false; + // } + // } } } // namespace @@ -152,6 +165,8 @@ std::unique_ptr multibyte_split(cudf::io::text::input_stream& inpu auto const trie = cudf::io::text::trie::create(delimeters, stream); + auto num_results = rmm::device_scalar(0, stream); + while (true) { uint32_t num_bytes_read = input.readsome(input_span, stream); @@ -163,10 +178,18 @@ std::unique_ptr multibyte_split(cudf::io::text::input_stream& inpu auto num_tiles = ceil_div(num_bytes_read, BYTES_PER_TILE); auto kernel = multibyte_split_kernel; - kernel<<>>(trie.view(), - input_span.first(num_bytes_read)); + kernel<<>>( // + trie.view(), + input_span.first(num_bytes_read), + num_results.data()); } + auto host_num_results = num_results.value(stream); + + stream.synchronize(); + + std::cout << "num results: " << host_num_results << std::endl; + // TODO: call state finalization kernels CUDF_FAIL(); From e1cc84dfd22a342b97e3226cbaedfbcacf038f73 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 9 Jul 2021 17:14:46 -0500 Subject: [PATCH 12/80] printf offsets in multibyte_split --- .../io/orc/orc_reader_benchmark.cpp | 2 +- cpp/src/io/text/multibyte_split.cu | 50 +++++++++---------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp index 2f3f454fda6..bc1aef11784 100644 --- a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp @@ -124,7 +124,7 @@ void BM_orc_read_varying_options(benchmark::State& state) // Need to assume that an additional "overflow" stripe is present stripes_to_read.push_back(num_stripes); } - read_options.set_stripes(stripes_to_read); + read_options.set_stripes({stripes_to_read}); } break; case row_selection::NROWS: read_options.set_skip_rows(chunk * chunk_row_cnt); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 546744e0818..7b7e23d43f3 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -73,11 +73,11 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, uint32_t* result_count) { typedef cub::BlockScan SuperstateBlockScan; - typedef cub::BlockScan ResultOffsetBlockScan; + typedef cub::BlockScan OffsetBlockScan; __shared__ union { typename SuperstateBlockScan::TempStorage superstate_scan; - typename ResultOffsetBlockScan::TempStorage result_offset_scan; + typename OffsetBlockScan::TempStorage offset_scan; } temp_storage; auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -86,29 +86,29 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, if (data_end > data.size()) { data_end = data.size(); } - superstate thread_data[BYTES_PER_THREAD]; + superstate thread_superstates[BYTES_PER_THREAD]; for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { auto const element_idx = data_begin + i; if (element_idx >= data.size()) { // this check is not necessary if we gaurantee no OOB accesses, which we can do because of // the batch-read/batch-process approach. Keeping the check in for now, though. - thread_data[i] = superstate(); + thread_superstates[i] = superstate(); } else { - thread_data[i] = superstate().apply([&](uint8_t state) { // + thread_superstates[i] = superstate().apply([&](uint8_t state) { // return trie.transition(state, data[element_idx]); }); } } - BlockPrefixCallbackOp prefix_op({}); - __syncthreads(); + BlockPrefixCallbackOp prefix_op({}); + SuperstateBlockScan(temp_storage.superstate_scan) .InclusiveScan( // - thread_data, - thread_data, + thread_superstates, + thread_superstates, [](superstate const& lhs, superstate const& rhs) { return lhs + rhs; }, prefix_op); @@ -117,33 +117,33 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, uint32_t thread_offsets[BYTES_PER_THREAD]; for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { - auto const element_idx = data_begin + i; - if (element_idx < data.size()) { - thread_offsets[i] = trie.is_match(thread_data[i].get(0)); - } else { - thread_offsets[i] = false; - } + thread_offsets[i] = trie.is_match(thread_superstates[i].get(0)); } + __syncthreads(); + uint32_t matches_in_block; - ResultOffsetBlockScan(temp_storage.result_offset_scan) + OffsetBlockScan(temp_storage.offset_scan) .ExclusiveScan( thread_offsets, thread_offsets, [](uint32_t const& lhs, uint32_t const& rhs) { return lhs + rhs; }, matches_in_block); - if (threadIdx.x == 0) { *result_count = matches_in_block; } + __syncthreads(); - // for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { - // auto const element_idx = data_begin + i; - // if (element_idx < data.size()) { - // thread_offsets[i] = trie.is_match(thread_data[i].get(0)); - // } else { - // thread_offsets[i] = false; - // } - // } + for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { + printf("bid(%2u) tid(%2u) byte(%2u): %c %2u - %2u\n", // + blockIdx.x, + threadIdx.x, + i, + data[data_begin + i], + thread_offsets[i], + static_cast(trie.is_match(thread_superstates[i].get(0)))); + } + + if (threadIdx.x == 0) { *result_count = matches_in_block; } } } // namespace From c7177bce28856d5b1c13b9210641e160302e6c11 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 9 Jul 2021 17:45:00 -0500 Subject: [PATCH 13/80] add match-length to trie to adjust for output offset in multibyte_split --- cpp/include/cudf/io/text/trie.hpp | 41 +++++++++++----------- cpp/src/io/text/multibyte_split.cu | 2 +- cpp/tests/io/text/multibyte_split_test.cpp | 4 +-- 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/cpp/include/cudf/io/text/trie.hpp b/cpp/include/cudf/io/text/trie.hpp index 1e90667e159..9e931ce48ae 100644 --- a/cpp/include/cudf/io/text/trie.hpp +++ b/cpp/include/cudf/io/text/trie.hpp @@ -8,21 +8,21 @@ namespace { struct trie_builder_node { - bool is_accepting; + uint8_t match_length; std::unordered_map> children; void insert(std::string s) { insert(s.c_str(), s.size()); } - trie_builder_node& insert(char const* s, uint16_t size) + trie_builder_node& insert(char const* s, uint16_t size, uint8_t depth = 0) { if (size == 0) { - is_accepting = true; + match_length = depth; return *this; } if (children[*s] == nullptr) { children[*s] = std::make_unique(); } - return children[*s]->insert(s + 1, size - 1); + return children[*s]->insert(s + 1, size - 1, depth + 1); } }; @@ -36,7 +36,7 @@ struct trie_device_view { uint16_t const* layer_offsets; char const* tokens; uint16_t const* transitions; - bool const* accepting; + uint8_t const* match_length; inline constexpr uint16_t transition(uint16_t idx, char c) { @@ -62,12 +62,13 @@ struct trie_device_view { return 0; } - inline constexpr bool is_match(uint16_t idx) { return accepting[idx]; } + inline constexpr bool is_match(uint16_t idx) { return static_cast(get_match_length(idx)); } + inline constexpr uint8_t get_match_length(uint16_t idx) { return match_length[idx]; } }; struct trie { // could compress all of this to 32 bits without major perf reduction: - // 1) merge accepting state in to the most significant bit of the + // 1) merge is_accepting state in to the most significant bit of the // corrosponding transition, and use a mask to access both values. 2) change // layer_offsets to uint8_t, max string length would be 253 2^8-3 (two values // reserved: empty string, and error state) @@ -75,17 +76,17 @@ struct trie { rmm::device_uvector _layer_offsets; rmm::device_uvector _tokens; rmm::device_uvector _transitions; - rmm::device_uvector _accepting; + rmm::device_uvector _match_length; public: trie(rmm::device_uvector&& layer_offsets, rmm::device_uvector&& tokens, rmm::device_uvector&& transitions, - rmm::device_uvector&& accepting) + rmm::device_uvector&& match_length) : _layer_offsets(std::move(layer_offsets)), _tokens(std::move(tokens)), _transitions(std::move(transitions)), - _accepting(std::move(accepting)) + _match_length(std::move(match_length)) { } @@ -104,7 +105,7 @@ struct trie { std::vector layer_offsets; std::vector tokens; std::vector transitions; - std::vector accepting; + std::vector match_length; // create the trie tree auto root = std::make_unique(); @@ -114,7 +115,7 @@ struct trie { auto sum = 1; layer_offsets.emplace_back(0); transitions.emplace_back(sum); - accepting.emplace_back(root->is_accepting); + match_length.emplace_back(root->match_length); auto nodes = std::queue>(); nodes.push(std::move(root)); @@ -128,21 +129,21 @@ struct trie { sum += node->children.size(); transitions.emplace_back(sum); for (auto& item : node->children) { - accepting.emplace_back(item.second->is_accepting); + match_length.emplace_back(item.second->match_length); tokens.emplace_back(item.first); nodes.push(std::move(item.second)); } } } - accepting.emplace_back(false); + match_length.emplace_back(false); // allocate device memory auto device_layer_offsets = rmm::device_uvector(layer_offsets.size(), stream, mr); auto device_tokens = rmm::device_uvector(tokens.size(), stream, mr); auto device_transitions = rmm::device_uvector(transitions.size(), stream, mr); - auto device_accepting = rmm::device_uvector(accepting.size(), stream, mr); + auto device_match_length = rmm::device_uvector(match_length.size(), stream, mr); // copy host buffers to device @@ -164,9 +165,9 @@ struct trie { cudaMemcpyDefault, stream.value())); - RMM_CUDA_TRY(cudaMemcpyAsync(device_accepting.data(), - accepting.data(), - accepting.size() * sizeof(bool), + RMM_CUDA_TRY(cudaMemcpyAsync(device_match_length.data(), + match_length.data(), + match_length.size() * sizeof(uint8_t), cudaMemcpyDefault, stream.value())); @@ -175,13 +176,13 @@ struct trie { return trie{std::move(device_layer_offsets), std::move(device_tokens), std::move(device_transitions), - std::move(device_accepting)}; + std::move(device_match_length)}; } trie_device_view view() const { return trie_device_view{ - _layer_offsets.data(), _tokens.data(), _transitions.data(), _accepting.data()}; + _layer_offsets.data(), _tokens.data(), _transitions.data(), _match_length.data()}; } }; diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 7b7e23d43f3..f7159e4595d 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -140,7 +140,7 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, i, data[data_begin + i], thread_offsets[i], - static_cast(trie.is_match(thread_superstates[i].get(0)))); + static_cast(trie.get_match_length(thread_superstates[i].get(0)))); } if (threadIdx.x == 0) { *result_count = matches_in_block; } diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index f8209ca11ba..7d2b86d1fc3 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -42,7 +42,7 @@ TEST_F(MultibyteSplitTest, Simple) { // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 - auto separators = std::vector({"😀", "😎", ",", "::"}); + auto delimiters = std::vector({"😀", "😎", ",", "::"}); std::string input = "aaa😀" "bbb😀" @@ -79,7 +79,7 @@ TEST_F(MultibyteSplitTest, Simple) auto input_stream = std::basic_istringstream(input); auto input_stream_io = cudf::io::text::host_input_stream(input_stream); - auto out = cudf::io::text::multibyte_split(input_stream_io, separators); + auto out = cudf::io::text::multibyte_split(input_stream_io, delimiters); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); } From 42dc014d0a2eef646e3d30efe7f6ad1c4bbff209 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 9 Jul 2021 18:10:58 -0500 Subject: [PATCH 14/80] adjust multibyte_split test case to expect delimiters to be retained in output --- cpp/tests/io/text/multibyte_split_test.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 7d2b86d1fc3..35babfb3328 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -71,9 +71,10 @@ TEST_F(MultibyteSplitTest, Simple) "delimeters."; auto expected = strings_column_wrapper{ - "aaa", "bbb", "ccc", "ddd", "eee", "fff", "ggg", "hhh", "___", - "here", "is", "another", "simple", "text", "seperated", "by", "emojis", "which", - "are", "multple", "bytes", "and", "used", "as", "delimeters.", + "aaa😀", "bbb😀", "ccc😀", "ddd😀", "eee😀", "fff::", "ggg😀", + "hhh😀", "___,", "here,", "is,", "another,", "simple😀", "text😎", + "seperated😎", "by😎", "emojis,", "which,", "are😎", "multiple,", "bytes::", + "and😎", "used😎", "as😎", "delimeters.", }; auto input_stream = std::basic_istringstream(input); From 5171711d8a77ba0754bdb4a069e133a9c99e9b07 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 9 Jul 2021 18:22:06 -0500 Subject: [PATCH 15/80] printf match_begin and match_end for multibyte_split --- cpp/src/io/text/multibyte_split.cu | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index f7159e4595d..519a380cc46 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -133,17 +133,25 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, __syncthreads(); + if (threadIdx.x == 0) { *result_count = matches_in_block; } + for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { - printf("bid(%2u) tid(%2u) byte(%2u): %c %2u - %2u\n", // + auto const match_length = trie.get_match_length(thread_superstates[i].get(0)); + + if (match_length == 0) { continue; } + + auto const match_end = data_begin + i + 1; + auto const match_begin = match_end - match_length; + + printf("bid(%2u) tid(%2u) byte(%2u): %c %2u - [%3u, %3u)\n", // blockIdx.x, threadIdx.x, i, data[data_begin + i], thread_offsets[i], - static_cast(trie.get_match_length(thread_superstates[i].get(0)))); + match_begin, + match_end); } - - if (threadIdx.x == 0) { *result_count = matches_in_block; } } } // namespace From 6b62cebf310eef1fab7e62ae784246958fd32473 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 9 Jul 2021 19:40:22 -0500 Subject: [PATCH 16/80] multibyte_split test passing --- cpp/include/cudf/io/text/multibyte_split.hpp | 2 +- cpp/src/io/text/multibyte_split.cu | 72 ++++++++++++++------ cpp/tests/io/text/multibyte_split_test.cpp | 14 ++-- 3 files changed, 57 insertions(+), 31 deletions(-) diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index 20c93b3b7de..52bd66e9405 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -13,7 +13,7 @@ namespace io { namespace text { std::unique_ptr multibyte_split( - cudf::io::text::input_stream& input, + cudf::string_scalar const& input, std::vector const& delimeters, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 519a380cc46..07eb24b691a 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -1,7 +1,9 @@ #include +#include #include #include #include +#include #include #include @@ -69,8 +71,9 @@ struct BlockPrefixCallbackOp { template __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, - cudf::device_span data, - uint32_t* result_count) + cudf::device_span data, + uint32_t* result_count, + cudf::device_span results) { typedef cub::BlockScan SuperstateBlockScan; typedef cub::BlockScan OffsetBlockScan; @@ -151,6 +154,8 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, thread_offsets[i], match_begin, match_end); + + if (results.size() > 0) { results[thread_offsets[i]] = match_end; } } } @@ -161,36 +166,31 @@ namespace io { namespace text { namespace detail { -std::unique_ptr multibyte_split(cudf::io::text::input_stream& input, +std::unique_ptr multibyte_split(cudf::string_scalar const& input, std::vector const& delimeters, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto input_buffer = rmm::device_uvector(BYTES_PER_CHUNK, stream); - auto const input_span = cudf::device_span(input_buffer); + // auto input_buffer = rmm::device_uvector(BYTES_PER_CHUNK, stream); + // auto const input_span = cudf::device_span(input_buffer); // TODO: call state initalization kernels auto const trie = cudf::io::text::trie::create(delimeters, stream); auto num_results = rmm::device_scalar(0, stream); + auto num_tiles = ceil_div(input.size(), BYTES_PER_TILE); - while (true) { - uint32_t num_bytes_read = input.readsome(input_span, stream); - - if (num_bytes_read == 0) { - // if there's no more data to read, we're done. - break; - } + auto offsets = rmm::device_uvector(0, stream); - auto num_tiles = ceil_div(num_bytes_read, BYTES_PER_TILE); + // count the results - auto kernel = multibyte_split_kernel; - kernel<<>>( // - trie.view(), - input_span.first(num_bytes_read), - num_results.data()); - } + auto kernel = multibyte_split_kernel; + kernel<<>>( // + trie.view(), + cudf::device_span(input.data(), input.size()), + num_results.data(), + offsets); auto host_num_results = num_results.value(stream); @@ -198,14 +198,46 @@ std::unique_ptr multibyte_split(cudf::io::text::input_stream& inpu std::cout << "num results: " << host_num_results << std::endl; + // allocate the results + + offsets = rmm::device_uvector(host_num_results + 2, stream); + offsets.set_element_to_zero_async(0, stream); + cudf::size_type const x = offsets.size() - 1; + cudf::size_type const y = input.size(); + offsets.set_element_async(x, y, stream); + + // materialize the results + + kernel<<>>( // + trie.view(), + cudf::device_span(input.data(), input.size()), + num_results.data(), + cudf::device_span(offsets.data() + 1, host_num_results)); + + stream.synchronize(); + // TODO: call state finalization kernels + return cudf::make_strings_column( // + cudf::device_span(input.data(), input.size()), + offsets); + CUDF_FAIL(); + + /* + std::unique_ptr make_strings_column( + cudf::device_span strings, + cudf::device_span offsets, + cudf::device_span null_mask = {}, + size_type null_count = cudf::UNKNOWN_NULL_COUNT, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + */ } } // namespace detail -std::unique_ptr multibyte_split(cudf::io::text::input_stream& input, +std::unique_ptr multibyte_split(cudf::string_scalar const& input, std::vector const& delimeters, rmm::mr::device_memory_resource* mr) { diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 35babfb3328..dc8c8cc3a0c 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -18,14 +18,11 @@ #include #include #include - -#include - #include #include +#include #include - #include #include @@ -43,7 +40,7 @@ TEST_F(MultibyteSplitTest, Simple) // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 auto delimiters = std::vector({"😀", "😎", ",", "::"}); - std::string input = + cudf::string_scalar input( "aaa😀" "bbb😀" "ccc😀" @@ -68,7 +65,7 @@ TEST_F(MultibyteSplitTest, Simple) "and😎" "used😎" "as😎" - "delimeters."; + "delimeters."); auto expected = strings_column_wrapper{ "aaa😀", "bbb😀", "ccc😀", "ddd😀", "eee😀", "fff::", "ggg😀", @@ -77,10 +74,7 @@ TEST_F(MultibyteSplitTest, Simple) "and😎", "used😎", "as😎", "delimeters.", }; - auto input_stream = std::basic_istringstream(input); - auto input_stream_io = cudf::io::text::host_input_stream(input_stream); - - auto out = cudf::io::text::multibyte_split(input_stream_io, delimiters); + auto out = cudf::io::text::multibyte_split(input, delimiters); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); } From a2c9756ce891e50296311c7813b9fc4e4231c959 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 12 Jul 2021 12:47:07 -0500 Subject: [PATCH 17/80] add multibyte_split comments, break test intentionally to work on multi-block scaling --- cpp/src/io/text/multibyte_split.cu | 46 +++++++++--------------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 07eb24b691a..ba2d28a4fe9 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -30,35 +30,12 @@ struct trie_state { using superstate = cudf::io::text::superstate<16>; -auto constexpr BYTES_PER_THREAD = 8; +// keep BYTES_PER_TILE below input size to force multi-tile execution. +auto constexpr BYTES_PER_THREAD = 2; auto constexpr THREADS_PER_TILE = 32; auto constexpr BYTES_PER_TILE = BYTES_PER_THREAD * THREADS_PER_TILE; auto constexpr TILES_PER_CHUNK = 1024; auto constexpr BYTES_PER_CHUNK = BYTES_PER_TILE * TILES_PER_CHUNK; - -struct BlockPrefixCallbackOp { - // Running prefix - superstate running_total; - // Constructor - __device__ BlockPrefixCallbackOp(superstate running_total) : running_total(running_total) {} - // Callback operator to be entered by the first warp of threads in the block. - // Thread-0 is responsible for returning a value for seeding the block-wide scan. - __device__ superstate operator()(superstate const& block_aggregate) - { - superstate old_prefix = running_total; - running_total = old_prefix + block_aggregate; - return old_prefix; - } - - static rmm::device_uvector create_temp_storage(uint32_t num_elements, - rmm::cuda_stream_view stream) - { - auto num_prefixes = ceil_div(num_elements, BYTES_PER_TILE); - - return rmm::device_uvector(num_prefixes, stream); - } -}; - // multibyte_split works by splitting up inputs in to 32 inputs (bytes) per thread, and transforming // them in to data structures called "superstates". these superstates are created by searching a // trie, but instead of a tradition trie where the search begins at a single node at the beginning, @@ -89,6 +66,8 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, if (data_end > data.size()) { data_end = data.size(); } + // STEP 1 + 2: Load inputs, transform to individual superstates + superstate thread_superstates[BYTES_PER_THREAD]; for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { @@ -104,18 +83,16 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, } } - __syncthreads(); - - BlockPrefixCallbackOp prefix_op({}); + // STEP 3: Scan superstates can to produce absolute thread states. + __syncthreads(); SuperstateBlockScan(temp_storage.superstate_scan) .InclusiveScan( // thread_superstates, thread_superstates, - [](superstate const& lhs, superstate const& rhs) { return lhs + rhs; }, - prefix_op); + [](superstate const& lhs, superstate const& rhs) { return lhs + rhs; }); - __syncthreads(); + // STEP 4: Populate match flags uint32_t thread_offsets[BYTES_PER_THREAD]; @@ -123,10 +100,11 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, thread_offsets[i] = trie.is_match(thread_superstates[i].get(0)); } - __syncthreads(); + // STEP 5: Scan match flags to produce match offsets uint32_t matches_in_block; + __syncthreads(); OffsetBlockScan(temp_storage.offset_scan) .ExclusiveScan( thread_offsets, @@ -134,10 +112,12 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, [](uint32_t const& lhs, uint32_t const& rhs) { return lhs + rhs; }, matches_in_block); - __syncthreads(); + // Step 6: Assign final block-aggregate match offset as the total number of matches. if (threadIdx.x == 0) { *result_count = matches_in_block; } + // Step 7: Assign results from each thread using match offsets. + for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { auto const match_length = trie.get_match_length(thread_superstates[i].get(0)); From 21b8b25ba933fa1baa6e15a56bab930e5380d5fe Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 13 Jul 2021 13:32:02 -0500 Subject: [PATCH 18/80] multibyte_split add multi-block support --- cpp/src/io/text/multibyte_split.cu | 279 ++++++++++++++++++++--------- 1 file changed, 191 insertions(+), 88 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index ba2d28a4fe9..01579375ebb 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -30,9 +31,56 @@ struct trie_state { using superstate = cudf::io::text::superstate<16>; +template +struct scan_tile_state_view { + bool* tile_status; + T* tile_state; + + __device__ void initialize(cudf::size_type num_tiles) + { + auto thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_idx < num_tiles) { tile_status[thread_idx] = false; } + } + + __device__ void set_state(cudf::size_type tile_idx, T value) + { + cub::ThreadStore(tile_state + tile_idx, value); + __threadfence(); + cub::ThreadStore(tile_status + tile_idx, true); + } + + __device__ T get_state_sync(cudf::size_type tile_idx) + { + while (cub::ThreadLoad(tile_status + tile_idx) == false) { __threadfence(); } + return cub::ThreadLoad(tile_state + tile_idx); + } +}; + +template +struct scan_tile_state { + rmm::device_uvector tile_status; + rmm::device_uvector tile_state; + + scan_tile_state(cudf::size_type num_tiles, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + : tile_status(rmm::device_uvector(num_tiles + 1, stream, mr)), + tile_state(rmm::device_uvector(num_tiles + 1, stream, mr)) + + { + } + + operator scan_tile_state_view() + { + return scan_tile_state_view{tile_status.data(), tile_state.data()}; + } + + T back_element(rmm::cuda_stream_view s) const { return tile_state.back_element(s); } +}; + // keep BYTES_PER_TILE below input size to force multi-tile execution. -auto constexpr BYTES_PER_THREAD = 2; -auto constexpr THREADS_PER_TILE = 32; +auto constexpr BYTES_PER_THREAD = 4; +auto constexpr THREADS_PER_TILE = 4; auto constexpr BYTES_PER_TILE = BYTES_PER_THREAD * THREADS_PER_TILE; auto constexpr TILES_PER_CHUNK = 1024; auto constexpr BYTES_PER_CHUNK = BYTES_PER_TILE * TILES_PER_CHUNK; @@ -46,18 +94,90 @@ auto constexpr BYTES_PER_CHUNK = BYTES_PER_TILE * TILES_PER_CHUNK; // it begins in. From there, each thread can then take deterministic action. In this case, the // deterministic action is counting and outputting delimiter offsets when a delimiter is found. -template -__global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, +struct SuperstateScan { + typedef cub::BlockScan BlockScan; + + struct _TempStorage { + typename BlockScan::TempStorage scan; + superstate block_aggregate; + superstate exclusive_prefix; + superstate inclusive_prefix; + }; + + _TempStorage& _temp_storage; + + using TempStorage = cub::Uninitialized<_TempStorage>; + + __device__ inline SuperstateScan(TempStorage& temp_storage) : _temp_storage(temp_storage.Alias()) + { + } + + __device__ inline void Scan(scan_tile_state_view tile_state, + cudf::io::text::trie_device_view trie, + char (&thread_data)[BYTES_PER_THREAD], + uint32_t (&thread_state)[BYTES_PER_THREAD]) + { + // create a state that represents all possible starting states. + auto thread_superstate = superstate(); + + // transition all possible states + for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { + thread_superstate = thread_superstate.apply([&](uint8_t state) { // + return trie.transition(state, thread_data[i]); + }); + } + + auto prefix_callback = [&] __device__(superstate const& block_aggregate) -> superstate { + if (threadIdx.x == 0) { + _temp_storage.block_aggregate = block_aggregate; + _temp_storage.exclusive_prefix = tile_state.get_state_sync(blockIdx.x); + _temp_storage.inclusive_prefix = _temp_storage.exclusive_prefix + block_aggregate; + tile_state.set_state(blockIdx.x + 1, _temp_storage.inclusive_prefix); + } + return _temp_storage.exclusive_prefix; + }; + + BlockScan(_temp_storage.scan) + .ExclusiveScan( // + thread_superstate, + thread_superstate, + thrust::plus(), + prefix_callback); + + // transition from known state to known state + thread_state[0] = trie.transition(thread_superstate.get(0), thread_data[0]); + + for (uint32_t i = 1; i < BYTES_PER_THREAD; i++) { + thread_state[i] = trie.transition(thread_state[i - 1], thread_data[i]); + } + } +}; + +__global__ void multibyte_split_init_kernel(cudf::size_type num_tiles, + scan_tile_state_view tile_superstates, + scan_tile_state_view tile_output_offsets) +{ + tile_superstates.initialize(num_tiles); + tile_superstates.set_state(0, superstate()); + tile_output_offsets.initialize(num_tiles); + tile_output_offsets.set_state(0, 0); +} + +__global__ void multibyte_split_kernel(cudf::size_type num_tiles, + scan_tile_state_view tile_superstates, + scan_tile_state_view tile_output_offsets, + cudf::io::text::trie_device_view trie, cudf::device_span data, - uint32_t* result_count, - cudf::device_span results) + cudf::device_span string_offsets) { - typedef cub::BlockScan SuperstateBlockScan; - typedef cub::BlockScan OffsetBlockScan; + typedef cub::BlockScan OffsetScan; __shared__ union { - typename SuperstateBlockScan::TempStorage superstate_scan; - typename OffsetBlockScan::TempStorage offset_scan; + typename SuperstateScan::TempStorage superstate_scan; + struct { + typename OffsetScan::TempStorage offset_scan; + uint32_t offset_scan_exclusive_prefix; + }; } temp_storage; auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -68,58 +188,47 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, // STEP 1 + 2: Load inputs, transform to individual superstates - superstate thread_superstates[BYTES_PER_THREAD]; + char thread_data[BYTES_PER_THREAD]; - for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { - auto const element_idx = data_begin + i; - if (element_idx >= data.size()) { - // this check is not necessary if we gaurantee no OOB accesses, which we can do because of - // the batch-read/batch-process approach. Keeping the check in for now, though. - thread_superstates[i] = superstate(); - } else { - thread_superstates[i] = superstate().apply([&](uint8_t state) { // - return trie.transition(state, data[element_idx]); - }); - } - } + for (auto i = 0; i < BYTES_PER_THREAD; i++) { thread_data[i] = data[data_begin + i]; } - // STEP 3: Scan superstates can to produce absolute thread states. + uint32_t thread_states[BYTES_PER_THREAD]; - __syncthreads(); - SuperstateBlockScan(temp_storage.superstate_scan) - .InclusiveScan( // - thread_superstates, - thread_superstates, - [](superstate const& lhs, superstate const& rhs) { return lhs + rhs; }); + SuperstateScan(temp_storage.superstate_scan) + .Scan(tile_superstates, trie, thread_data, thread_states); // STEP 4: Populate match flags uint32_t thread_offsets[BYTES_PER_THREAD]; for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { - thread_offsets[i] = trie.is_match(thread_superstates[i].get(0)); + thread_offsets[i] = trie.is_match(thread_states[i]); } // STEP 5: Scan match flags to produce match offsets - uint32_t matches_in_block; + __syncthreads(); // required before temp_memory re-use - __syncthreads(); - OffsetBlockScan(temp_storage.offset_scan) - .ExclusiveScan( + auto prefix_callback = [&] __device__(uint32_t const& block_aggregate) -> uint32_t { + if (threadIdx.x == 0) { + temp_storage.offset_scan_exclusive_prefix = tile_output_offsets.get_state_sync(blockIdx.x); + auto inclusive_prefix = temp_storage.offset_scan_exclusive_prefix + block_aggregate; + tile_output_offsets.set_state(blockIdx.x + 1, inclusive_prefix); + } + return temp_storage.offset_scan_exclusive_prefix; + }; + + OffsetScan(temp_storage.offset_scan) + .ExclusiveScan( // thread_offsets, thread_offsets, - [](uint32_t const& lhs, uint32_t const& rhs) { return lhs + rhs; }, - matches_in_block); - - // Step 6: Assign final block-aggregate match offset as the total number of matches. + thrust::plus(), + prefix_callback); - if (threadIdx.x == 0) { *result_count = matches_in_block; } - - // Step 7: Assign results from each thread using match offsets. + // Step 7: Assign string_offsets from each thread using match offsets. for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { - auto const match_length = trie.get_match_length(thread_superstates[i].get(0)); + auto const match_length = trie.get_match_length(thread_states[i]); if (match_length == 0) { continue; } @@ -135,7 +244,9 @@ __global__ void multibyte_split_kernel(cudf::io::text::trie_device_view trie, match_begin, match_end); - if (results.size() > 0) { results[thread_offsets[i]] = match_end; } + if (string_offsets.size() > thread_offsets[i]) { // + string_offsets[thread_offsets[i]] = match_end; + } } } @@ -151,68 +262,60 @@ std::unique_ptr multibyte_split(cudf::string_scalar const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - // auto input_buffer = rmm::device_uvector(BYTES_PER_CHUNK, stream); - // auto const input_span = cudf::device_span(input_buffer); - - // TODO: call state initalization kernels - auto const trie = cudf::io::text::trie::create(delimeters, stream); - auto num_results = rmm::device_scalar(0, stream); - auto num_tiles = ceil_div(input.size(), BYTES_PER_TILE); + auto num_tiles = ceil_div(input.size(), BYTES_PER_TILE); + + // pattern-match and count delimiters - auto offsets = rmm::device_uvector(0, stream); + auto tile_superstates = scan_tile_state>(num_tiles, stream); + auto tile_offsets = scan_tile_state(num_tiles, stream); + auto num_init_blocks = ceil_div(num_tiles, THREADS_PER_TILE); - // count the results + multibyte_split_init_kernel<<>>( // + num_tiles, + tile_superstates, + tile_offsets); - auto kernel = multibyte_split_kernel; - kernel<<>>( // + multibyte_split_kernel<<>>( // + num_tiles, + tile_superstates, + tile_offsets, trie.view(), cudf::device_span(input.data(), input.size()), - num_results.data(), - offsets); - - auto host_num_results = num_results.value(stream); + cudf::device_span(static_cast(nullptr), 0)); - stream.synchronize(); + // allocate string offsets - std::cout << "num results: " << host_num_results << std::endl; + auto num_results = tile_offsets.back_element(stream); + auto string_offsets = rmm::device_uvector(num_results + 2, stream); + auto const x = string_offsets.size() - 1; + auto const y = input.size(); - // allocate the results + std::cout << "num_results: " << num_results << std::endl; - offsets = rmm::device_uvector(host_num_results + 2, stream); - offsets.set_element_to_zero_async(0, stream); - cudf::size_type const x = offsets.size() - 1; - cudf::size_type const y = input.size(); - offsets.set_element_async(x, y, stream); + // first and last element are set manually to zero and size of input, respectively. + // kernel is only responsible for determining delimiter offsets + string_offsets.set_element_to_zero_async(0, stream); + string_offsets.set_element_async(x, y, stream); - // materialize the results + // pattern-match and materialize string offsets - kernel<<>>( // + multibyte_split_kernel<<>>( // + num_tiles, + tile_superstates, + tile_offsets, trie.view(), cudf::device_span(input.data(), input.size()), - num_results.data(), - cudf::device_span(offsets.data() + 1, host_num_results)); - - stream.synchronize(); - - // TODO: call state finalization kernels + cudf::device_span(string_offsets).subspan(1, num_results)); return cudf::make_strings_column( // cudf::device_span(input.data(), input.size()), - offsets); - - CUDF_FAIL(); - - /* - std::unique_ptr make_strings_column( - cudf::device_span strings, - cudf::device_span offsets, - cudf::device_span null_mask = {}, - size_type null_count = cudf::UNKNOWN_NULL_COUNT, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - */ + string_offsets, + {}, + 0, + stream, + mr); } } // namespace detail From f59a93e5169693b12fce69fe00cfa4f84d7f0bc0 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 13 Jul 2021 16:26:38 -0500 Subject: [PATCH 19/80] rename BYTES_PER_TILE to ITEMS_PER_TILE --- cpp/src/io/text/multibyte_split.cu | 75 ++++++++++++------------------ 1 file changed, 31 insertions(+), 44 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 01579375ebb..eff67f66513 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -25,10 +25,6 @@ inline constexpr auto ceil_div(Dividend dividend, Divisor divisor) return dividend / divisor + (dividend % divisor != 0); } -struct trie_state { - uint8_t placeholder; -}; - using superstate = cudf::io::text::superstate<16>; template @@ -78,12 +74,12 @@ struct scan_tile_state { T back_element(rmm::cuda_stream_view s) const { return tile_state.back_element(s); } }; -// keep BYTES_PER_TILE below input size to force multi-tile execution. -auto constexpr BYTES_PER_THREAD = 4; -auto constexpr THREADS_PER_TILE = 4; -auto constexpr BYTES_PER_TILE = BYTES_PER_THREAD * THREADS_PER_TILE; +// keep ITEMS_PER_TILE below input size to force multi-tile execution. +auto constexpr ITEMS_PER_THREAD = 4; +auto constexpr THREADS_PER_TILE = 32; +auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; auto constexpr TILES_PER_CHUNK = 1024; -auto constexpr BYTES_PER_CHUNK = BYTES_PER_TILE * TILES_PER_CHUNK; +auto constexpr BYTES_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; // multibyte_split works by splitting up inputs in to 32 inputs (bytes) per thread, and transforming // them in to data structures called "superstates". these superstates are created by searching a // trie, but instead of a tradition trie where the search begins at a single node at the beginning, @@ -94,7 +90,7 @@ auto constexpr BYTES_PER_CHUNK = BYTES_PER_TILE * TILES_PER_CHUNK; // it begins in. From there, each thread can then take deterministic action. In this case, the // deterministic action is counting and outputting delimiter offsets when a delimiter is found. -struct SuperstateScan { +struct PatternScan { typedef cub::BlockScan BlockScan; struct _TempStorage { @@ -108,20 +104,18 @@ struct SuperstateScan { using TempStorage = cub::Uninitialized<_TempStorage>; - __device__ inline SuperstateScan(TempStorage& temp_storage) : _temp_storage(temp_storage.Alias()) - { - } + __device__ inline PatternScan(TempStorage& temp_storage) : _temp_storage(temp_storage.Alias()) {} __device__ inline void Scan(scan_tile_state_view tile_state, cudf::io::text::trie_device_view trie, - char (&thread_data)[BYTES_PER_THREAD], - uint32_t (&thread_state)[BYTES_PER_THREAD]) + char (&thread_data)[ITEMS_PER_THREAD], + uint32_t (&thread_state)[ITEMS_PER_THREAD]) { // create a state that represents all possible starting states. auto thread_superstate = superstate(); // transition all possible states - for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { + for (uint32_t i = 0; i < ITEMS_PER_THREAD; i++) { thread_superstate = thread_superstate.apply([&](uint8_t state) { // return trie.transition(state, thread_data[i]); }); @@ -138,16 +132,12 @@ struct SuperstateScan { }; BlockScan(_temp_storage.scan) - .ExclusiveScan( // - thread_superstate, - thread_superstate, - thrust::plus(), - prefix_callback); + .ExclusiveSum(thread_superstate, thread_superstate, prefix_callback); // transition from known state to known state thread_state[0] = trie.transition(thread_superstate.get(0), thread_data[0]); - for (uint32_t i = 1; i < BYTES_PER_THREAD; i++) { + for (uint32_t i = 1; i < ITEMS_PER_THREAD; i++) { thread_state[i] = trie.transition(thread_state[i - 1], thread_data[i]); } } @@ -173,7 +163,7 @@ __global__ void multibyte_split_kernel(cudf::size_type num_tiles, typedef cub::BlockScan OffsetScan; __shared__ union { - typename SuperstateScan::TempStorage superstate_scan; + typename PatternScan::TempStorage pattern_scan; struct { typename OffsetScan::TempStorage offset_scan; uint32_t offset_scan_exclusive_prefix; @@ -181,31 +171,32 @@ __global__ void multibyte_split_kernel(cudf::size_type num_tiles, } temp_storage; auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; - auto const data_begin = thread_idx * BYTES_PER_THREAD; - auto data_end = data_begin + BYTES_PER_THREAD; + auto const data_begin = thread_idx * ITEMS_PER_THREAD; - if (data_end > data.size()) { data_end = data.size(); } + // STEP 1: Load inputs - // STEP 1 + 2: Load inputs, transform to individual superstates + char thread_data[ITEMS_PER_THREAD]; - char thread_data[BYTES_PER_THREAD]; + for (auto i = 0; i < ITEMS_PER_THREAD; i++) { // + thread_data[i] = data[data_begin + i]; + } - for (auto i = 0; i < BYTES_PER_THREAD; i++) { thread_data[i] = data[data_begin + i]; } + // STEP 2: Scan inputs to determine absolute thread states - uint32_t thread_states[BYTES_PER_THREAD]; + uint32_t thread_states[ITEMS_PER_THREAD]; - SuperstateScan(temp_storage.superstate_scan) + PatternScan(temp_storage.pattern_scan) // .Scan(tile_superstates, trie, thread_data, thread_states); - // STEP 4: Populate match flags + // STEP 3: Flag matches - uint32_t thread_offsets[BYTES_PER_THREAD]; + uint32_t thread_offsets[ITEMS_PER_THREAD]; - for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { + for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) { thread_offsets[i] = trie.is_match(thread_states[i]); } - // STEP 5: Scan match flags to produce match offsets + // STEP 4: Scan flags to determine absolute thread output offset __syncthreads(); // required before temp_memory re-use @@ -219,15 +210,11 @@ __global__ void multibyte_split_kernel(cudf::size_type num_tiles, }; OffsetScan(temp_storage.offset_scan) - .ExclusiveScan( // - thread_offsets, - thread_offsets, - thrust::plus(), - prefix_callback); + .ExclusiveSum(thread_offsets, thread_offsets, prefix_callback); - // Step 7: Assign string_offsets from each thread using match offsets. + // Step 5: Assign string_offsets from each thread using match offsets. - for (uint32_t i = 0; i < BYTES_PER_THREAD; i++) { + for (uint32_t i = 0; i < ITEMS_PER_THREAD; i++) { auto const match_length = trie.get_match_length(thread_states[i]); if (match_length == 0) { continue; } @@ -239,7 +226,7 @@ __global__ void multibyte_split_kernel(cudf::size_type num_tiles, blockIdx.x, threadIdx.x, i, - data[data_begin + i], + thread_data[i], thread_offsets[i], match_begin, match_end); @@ -264,7 +251,7 @@ std::unique_ptr multibyte_split(cudf::string_scalar const& input, { auto const trie = cudf::io::text::trie::create(delimeters, stream); - auto num_tiles = ceil_div(input.size(), BYTES_PER_TILE); + auto num_tiles = ceil_div(input.size(), ITEMS_PER_TILE); // pattern-match and count delimiters From 5fa112a1f4949f21956f6e6389d4c0c89fcef937 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 14 Jul 2021 00:47:09 -0500 Subject: [PATCH 20/80] add bounds check to multibyte_split load and flag --- cpp/src/io/text/multibyte_split.cu | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index eff67f66513..eab7e135537 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -75,7 +75,7 @@ struct scan_tile_state { }; // keep ITEMS_PER_TILE below input size to force multi-tile execution. -auto constexpr ITEMS_PER_THREAD = 4; +auto constexpr ITEMS_PER_THREAD = 32; auto constexpr THREADS_PER_TILE = 32; auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; auto constexpr TILES_PER_CHUNK = 1024; @@ -170,14 +170,15 @@ __global__ void multibyte_split_kernel(cudf::size_type num_tiles, }; } temp_storage; - auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; - auto const data_begin = thread_idx * ITEMS_PER_THREAD; + int32_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + int32_t const data_begin = thread_idx * ITEMS_PER_THREAD; + int32_t const num_valid = data.size() - data_begin; // STEP 1: Load inputs char thread_data[ITEMS_PER_THREAD]; - for (auto i = 0; i < ITEMS_PER_THREAD; i++) { // + for (int32_t i = 0; i < ITEMS_PER_THREAD and i < num_valid; i++) { // thread_data[i] = data[data_begin + i]; } @@ -193,7 +194,7 @@ __global__ void multibyte_split_kernel(cudf::size_type num_tiles, uint32_t thread_offsets[ITEMS_PER_THREAD]; for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) { - thread_offsets[i] = trie.is_match(thread_states[i]); + thread_offsets[i] = i < num_valid and trie.is_match(thread_states[i]); } // STEP 4: Scan flags to determine absolute thread output offset @@ -214,7 +215,7 @@ __global__ void multibyte_split_kernel(cudf::size_type num_tiles, // Step 5: Assign string_offsets from each thread using match offsets. - for (uint32_t i = 0; i < ITEMS_PER_THREAD; i++) { + for (int32_t i = 0; i < ITEMS_PER_THREAD and i < num_valid; i++) { auto const match_length = trie.get_match_length(thread_states[i]); if (match_length == 0) { continue; } From cf42fd042ff0d47c111debb690b4803d08012387 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 14 Jul 2021 01:11:40 -0500 Subject: [PATCH 21/80] multibyte_split benchmark scaffolding --- cpp/benchmarks/CMakeLists.txt | 5 ++ .../io/text/multibyte_split_benchmark.cpp | 59 +++++++++++++++++++ cpp/src/io/text/multibyte_split.cu | 4 +- 3 files changed, 66 insertions(+), 2 deletions(-) create mode 100644 cpp/benchmarks/io/text/multibyte_split_benchmark.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index e8ccb24f44c..7c6491a8f14 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -233,3 +233,8 @@ ConfigureBench(STRINGS_BENCH # - json benchmark ------------------------------------------------------------------- ConfigureBench(JSON_BENCH string/json_benchmark.cpp) + +################################################################################################### +# - io benchmark --------------------------------------------------------------------- +ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK + io/text/multibyte_split_benchmark.cpp) diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp new file mode 100644 index 00000000000..e40b991874c --- /dev/null +++ b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +using cudf::test::fixed_width_column_wrapper; + +static void BM_multibyte_split(benchmark::State& state) +{ + std::string host_input = ""; + int32_t num_chars = state.range(0); + + for (auto i = 0; i < num_chars; i++) { host_input += "x"; } + + cudf::string_scalar input(host_input); + + auto delimiters = std::vector({"😀", "😎", ",", "::"}); + + for (auto _ : state) { + cuda_event_timer raii(state, true); + auto output = cudf::io::text::multibyte_split(input, delimiters); + } + + state.SetBytesProcessed(state.iterations() * num_chars); +} + +class MultibyteSplitBenchmark : public cudf::benchmark { +}; + +#define TRANSPOSE_BM_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(MultibyteSplitBenchmark, name)(::benchmark::State & state) \ + { \ + BM_multibyte_split(state); \ + } \ + BENCHMARK_REGISTER_F(MultibyteSplitBenchmark, name) \ + ->Range(1 << 15, 1 << 30) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +TRANSPOSE_BM_BENCHMARK_DEFINE(multibyte_split_simple); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index eab7e135537..721fce8d8fe 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -76,7 +76,7 @@ struct scan_tile_state { // keep ITEMS_PER_TILE below input size to force multi-tile execution. auto constexpr ITEMS_PER_THREAD = 32; -auto constexpr THREADS_PER_TILE = 32; +auto constexpr THREADS_PER_TILE = 128; auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; auto constexpr TILES_PER_CHUNK = 1024; auto constexpr BYTES_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; @@ -280,7 +280,7 @@ std::unique_ptr multibyte_split(cudf::string_scalar const& input, auto const x = string_offsets.size() - 1; auto const y = input.size(); - std::cout << "num_results: " << num_results << std::endl; + // std::cout << "num_results: " << num_results << std::endl; // first and last element are set manually to zero and size of input, respectively. // kernel is only responsible for determining delimiter offsets From e6e9741acd8cf2b54db821d1ddead8085468d658 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 14 Jul 2021 10:48:44 -0500 Subject: [PATCH 22/80] multibyte_split increase threads per block and adjust test case. --- cpp/src/io/text/multibyte_split.cu | 4 +--- cpp/tests/io/text/multibyte_split_test.cpp | 14 ++++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 721fce8d8fe..7c648285ce2 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -76,7 +76,7 @@ struct scan_tile_state { // keep ITEMS_PER_TILE below input size to force multi-tile execution. auto constexpr ITEMS_PER_THREAD = 32; -auto constexpr THREADS_PER_TILE = 128; +auto constexpr THREADS_PER_TILE = 512; auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; auto constexpr TILES_PER_CHUNK = 1024; auto constexpr BYTES_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; @@ -280,8 +280,6 @@ std::unique_ptr multibyte_split(cudf::string_scalar const& input, auto const x = string_offsets.size() - 1; auto const y = input.size(); - // std::cout << "num_results: " << num_results << std::endl; - // first and last element are set manually to zero and size of input, respectively. // kernel is only responsible for determining delimiter offsets string_offsets.set_element_to_zero_async(0, stream); diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index dc8c8cc3a0c..2075b4da117 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -65,14 +65,16 @@ TEST_F(MultibyteSplitTest, Simple) "and😎" "used😎" "as😎" - "delimeters."); + "delimeters.😎" + "::" + "," + "😀"); auto expected = strings_column_wrapper{ - "aaa😀", "bbb😀", "ccc😀", "ddd😀", "eee😀", "fff::", "ggg😀", - "hhh😀", "___,", "here,", "is,", "another,", "simple😀", "text😎", - "seperated😎", "by😎", "emojis,", "which,", "are😎", "multiple,", "bytes::", - "and😎", "used😎", "as😎", "delimeters.", - }; + "aaa😀", "bbb😀", "ccc😀", "ddd😀", "eee😀", "fff::", "ggg😀", "hhh😀", + "___,", "here,", "is,", "another,", "simple😀", "text😎", "seperated😎", "by😎", + "emojis,", "which,", "are😎", "multiple,", "bytes::", "and😎", "used😎", "as😎", + "delimeters.😎", "::", ",", "😀", ""}; auto out = cudf::io::text::multibyte_split(input, delimiters); From b5c2e05c6d6a196ce89179822cf28e2f38e44736 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 15 Jul 2021 19:36:51 -0500 Subject: [PATCH 23/80] use circular buffer in multibyte_split to allow for stream inputs --- cpp/CMakeLists.txt | 2 +- .../{input_stream.hpp => device_istream.hpp} | 4 +- ...put_stream.hpp => host_device_istream.hpp} | 10 +- cpp/include/cudf/io/text/multibyte_split.hpp | 9 +- ...put_stream.cpp => host_device_istream.cpp} | 10 +- cpp/src/io/text/multibyte_split.cu | 192 +++++++++++++++--- cpp/tests/io/text/multibyte_split_test.cpp | 2 +- cpp/tests/io/text/trie_test.cpp | 2 +- 8 files changed, 191 insertions(+), 40 deletions(-) rename cpp/include/cudf/io/text/{input_stream.hpp => device_istream.hpp} (58%) rename cpp/include/cudf/io/text/{host_input_stream.hpp => host_device_istream.hpp} (60%) rename cpp/src/io/text/{host_input_stream.cpp => host_device_istream.cpp} (63%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index b5b1de9900a..5c19d3eaa9c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -257,7 +257,7 @@ add_library(cudf src/io/parquet/writer_impl.cu src/io/statistics/orc_column_statistics.cu src/io/statistics/parquet_column_statistics.cu - src/io/text/host_input_stream.cpp + src/io/text/host_device_istream.cpp src/io/text/multibyte_split.cu src/io/utilities/column_buffer.cpp src/io/utilities/data_sink.cpp diff --git a/cpp/include/cudf/io/text/input_stream.hpp b/cpp/include/cudf/io/text/device_istream.hpp similarity index 58% rename from cpp/include/cudf/io/text/input_stream.hpp rename to cpp/include/cudf/io/text/device_istream.hpp index f977f70f5fd..65daae8c5c5 100644 --- a/cpp/include/cudf/io/text/input_stream.hpp +++ b/cpp/include/cudf/io/text/device_istream.hpp @@ -8,9 +8,11 @@ namespace cudf { namespace io { namespace text { -class input_stream { +class device_istream { public: virtual uint32_t readsome(cudf::device_span destination, rmm::cuda_stream_view stream) = 0; + virtual uint32_t tellg() = 0; + virtual void seekg(uint32_t pos) = 0; }; } // namespace text diff --git a/cpp/include/cudf/io/text/host_input_stream.hpp b/cpp/include/cudf/io/text/host_device_istream.hpp similarity index 60% rename from cpp/include/cudf/io/text/host_input_stream.hpp rename to cpp/include/cudf/io/text/host_device_istream.hpp index e68eecb0765..c4970c31179 100644 --- a/cpp/include/cudf/io/text/host_input_stream.hpp +++ b/cpp/include/cudf/io/text/host_device_istream.hpp @@ -1,6 +1,6 @@ #pragma once -#include +#include #include @@ -12,12 +12,16 @@ namespace cudf { namespace io { namespace text { -class host_input_stream : public cudf::io::text::input_stream { +class host_device_istream : public cudf::io::text::device_istream { public: - host_input_stream(std::istream& source_stream) : _source_stream(source_stream) {} + host_device_istream(std::istream& source_stream) : _source_stream(source_stream) {} uint32_t readsome(cudf::device_span destination, rmm::cuda_stream_view stream) override; + uint32_t tellg() override; + + void seekg(uint32_t pos) override; + private: std::istream& _source_stream; thrust::host_vector _host_buffer{}; diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index 52bd66e9405..e4ea512d8a8 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -1,4 +1,4 @@ -#include +#include #include @@ -17,6 +17,11 @@ std::unique_ptr multibyte_split( std::vector const& delimeters, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -} +std::unique_ptr multibyte_split( + cudf::io::text::device_istream& input, + std::vector const& delimeters, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace text } // namespace io } // namespace cudf diff --git a/cpp/src/io/text/host_input_stream.cpp b/cpp/src/io/text/host_device_istream.cpp similarity index 63% rename from cpp/src/io/text/host_input_stream.cpp rename to cpp/src/io/text/host_device_istream.cpp index 6eb5364eede..85e6ef04601 100644 --- a/cpp/src/io/text/host_input_stream.cpp +++ b/cpp/src/io/text/host_device_istream.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -11,8 +11,8 @@ namespace cudf { namespace io { namespace text { -uint32_t host_input_stream::readsome(cudf::device_span destination, - rmm::cuda_stream_view stream) +uint32_t host_device_istream::readsome(cudf::device_span destination, + rmm::cuda_stream_view stream) { auto read_size = destination.size(); @@ -30,6 +30,10 @@ uint32_t host_input_stream::readsome(cudf::device_span destination, return read_size; } +uint32_t host_device_istream::tellg() { return _source_stream.tellg(); } + +void host_device_istream::seekg(uint32_t pos) { _source_stream.seekg(pos); } + } // namespace text } // namespace io } // namespace cudf diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 7c648285ce2..9d50963f0dd 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include @@ -29,26 +29,32 @@ using superstate = cudf::io::text::superstate<16>; template struct scan_tile_state_view { + uint64_t num_tiles; bool* tile_status; T* tile_state; - __device__ void initialize(cudf::size_type num_tiles) + __device__ void initialize(cudf::size_type base_tile_idx, cudf::size_type count) { auto thread_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_idx < num_tiles) { tile_status[thread_idx] = false; } + if (thread_idx < count) { // + tile_status[(base_tile_idx + thread_idx) % num_tiles] = false; + } } - __device__ void set_state(cudf::size_type tile_idx, T value) + __device__ void set_inclusive_prefix(cudf::size_type tile_idx, T value) { - cub::ThreadStore(tile_state + tile_idx, value); + cub::ThreadStore(tile_state + ((tile_idx + num_tiles) % num_tiles), value); __threadfence(); - cub::ThreadStore(tile_status + tile_idx, true); + cub::ThreadStore(tile_status + ((tile_idx + num_tiles) % num_tiles), true); } - __device__ T get_state_sync(cudf::size_type tile_idx) + __device__ T get_inclusive_prefix(cudf::size_type tile_idx) { - while (cub::ThreadLoad(tile_status + tile_idx) == false) { __threadfence(); } - return cub::ThreadLoad(tile_state + tile_idx); + while (cub::ThreadLoad(tile_status + ((tile_idx + num_tiles) % num_tiles)) == + false) { + __threadfence(); + } + return cub::ThreadLoad(tile_state + ((tile_idx + num_tiles) % num_tiles)); } }; @@ -60,26 +66,33 @@ struct scan_tile_state { scan_tile_state(cudf::size_type num_tiles, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : tile_status(rmm::device_uvector(num_tiles + 1, stream, mr)), - tile_state(rmm::device_uvector(num_tiles + 1, stream, mr)) - + : tile_status(rmm::device_uvector(num_tiles, stream, mr)), + tile_state(rmm::device_uvector(num_tiles, stream, mr)) { } operator scan_tile_state_view() { - return scan_tile_state_view{tile_status.data(), tile_state.data()}; + return scan_tile_state_view{tile_status.size(), tile_status.data(), tile_state.data()}; } - T back_element(rmm::cuda_stream_view s) const { return tile_state.back_element(s); } + void set_seed_async(T const seed, rmm::cuda_stream_view stream) + { + auto x = tile_status.size(); + bool y = true; + tile_state.set_element_async(x - 1, seed, stream); + tile_status.set_element_async(x - 1, y, stream); + } + + T back_element(rmm::cuda_stream_view stream) const { return tile_state.back_element(stream); } }; // keep ITEMS_PER_TILE below input size to force multi-tile execution. -auto constexpr ITEMS_PER_THREAD = 32; -auto constexpr THREADS_PER_TILE = 512; +auto constexpr ITEMS_PER_THREAD = 4; +auto constexpr THREADS_PER_TILE = 4; auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; -auto constexpr TILES_PER_CHUNK = 1024; -auto constexpr BYTES_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; +auto constexpr TILES_PER_CHUNK = 4; +auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; // multibyte_split works by splitting up inputs in to 32 inputs (bytes) per thread, and transforming // them in to data structures called "superstates". these superstates are created by searching a // trie, but instead of a tradition trie where the search begins at a single node at the beginning, @@ -124,9 +137,14 @@ struct PatternScan { auto prefix_callback = [&] __device__(superstate const& block_aggregate) -> superstate { if (threadIdx.x == 0) { _temp_storage.block_aggregate = block_aggregate; - _temp_storage.exclusive_prefix = tile_state.get_state_sync(blockIdx.x); + _temp_storage.exclusive_prefix = tile_state.get_inclusive_prefix(blockIdx.x - 1); _temp_storage.inclusive_prefix = _temp_storage.exclusive_prefix + block_aggregate; - tile_state.set_state(blockIdx.x + 1, _temp_storage.inclusive_prefix); + tile_state.set_inclusive_prefix(blockIdx.x, _temp_storage.inclusive_prefix); + + printf("bid(%2u) tid(%2u): prefix = %2u %2u\n", + blockIdx.x, + threadIdx.x, + _temp_storage.exclusive_prefix); } return _temp_storage.exclusive_prefix; }; @@ -143,17 +161,17 @@ struct PatternScan { } }; -__global__ void multibyte_split_init_kernel(cudf::size_type num_tiles, +__global__ void multibyte_split_init_kernel(cudf::size_type base_tile_idx, + cudf::size_type num_tiles, scan_tile_state_view tile_superstates, scan_tile_state_view tile_output_offsets) { - tile_superstates.initialize(num_tiles); - tile_superstates.set_state(0, superstate()); - tile_output_offsets.initialize(num_tiles); - tile_output_offsets.set_state(0, 0); + tile_superstates.initialize(base_tile_idx, num_tiles); + tile_output_offsets.initialize(base_tile_idx, num_tiles); } -__global__ void multibyte_split_kernel(cudf::size_type num_tiles, +__global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, + cudf::size_type num_tiles, scan_tile_state_view tile_superstates, scan_tile_state_view tile_output_offsets, cudf::io::text::trie_device_view trie, @@ -186,6 +204,10 @@ __global__ void multibyte_split_kernel(cudf::size_type num_tiles, uint32_t thread_states[ITEMS_PER_THREAD]; + // is first tile -> blockscan not prefix callback + // is last tile <- num valid < 32 + // AliasTemporiaries + PatternScan(temp_storage.pattern_scan) // .Scan(tile_superstates, trie, thread_data, thread_states); @@ -203,9 +225,10 @@ __global__ void multibyte_split_kernel(cudf::size_type num_tiles, auto prefix_callback = [&] __device__(uint32_t const& block_aggregate) -> uint32_t { if (threadIdx.x == 0) { - temp_storage.offset_scan_exclusive_prefix = tile_output_offsets.get_state_sync(blockIdx.x); + temp_storage.offset_scan_exclusive_prefix = + tile_output_offsets.get_inclusive_prefix(blockIdx.x - 1); auto inclusive_prefix = temp_storage.offset_scan_exclusive_prefix + block_aggregate; - tile_output_offsets.set_state(blockIdx.x + 1, inclusive_prefix); + tile_output_offsets.set_inclusive_prefix(blockIdx.x, inclusive_prefix); } return temp_storage.offset_scan_exclusive_prefix; }; @@ -261,11 +284,16 @@ std::unique_ptr multibyte_split(cudf::string_scalar const& input, auto num_init_blocks = ceil_div(num_tiles, THREADS_PER_TILE); multibyte_split_init_kernel<<>>( // + 0, num_tiles, tile_superstates, tile_offsets); + tile_superstates.set_seed_async(superstate<16>(), stream); + tile_offsets.set_seed_async(0, stream); + multibyte_split_kernel<<>>( // + 0, num_tiles, tile_superstates, tile_offsets, @@ -285,10 +313,20 @@ std::unique_ptr multibyte_split(cudf::string_scalar const& input, string_offsets.set_element_to_zero_async(0, stream); string_offsets.set_element_async(x, y, stream); + multibyte_split_init_kernel<<>>( // + 0, + num_tiles, + tile_superstates, + tile_offsets); + + tile_superstates.set_seed_async(superstate<16>(), stream); + tile_offsets.set_seed_async(0, stream); + // pattern-match and materialize string offsets multibyte_split_kernel<<>>( // num_tiles, + 0, tile_superstates, tile_offsets, trie.view(), @@ -304,6 +342,97 @@ std::unique_ptr multibyte_split(cudf::string_scalar const& input, mr); } +std::unique_ptr multibyte_split(cudf::io::text::device_istream& input, + std::vector const& delimeters, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const trie = cudf::io::text::trie::create(delimeters, stream); + + // pattern-match and count delimiters + + auto tile_superstates = scan_tile_state>(TILES_PER_CHUNK * 2, stream); + auto tile_offsets = scan_tile_state(TILES_PER_CHUNK * 2, stream); + + rmm::device_uvector input_buffer(ITEMS_PER_CHUNK, stream); + + uint32_t starting_position = input.tellg(); + uint32_t bytes_read; + + // TODO: Set seed state. + + tile_superstates.set_seed_async(superstate<16>(), stream); + tile_offsets.set_seed_async(0, stream); + + for (auto base_tile_idx = 0; bytes_read = input.readsome(input_buffer, stream) > 0; + base_tile_idx += TILES_PER_CHUNK) { + // reset the next chunk of tile state + multibyte_split_init_kernel<<>>( // + base_tile_idx, + TILES_PER_CHUNK, + tile_superstates, + tile_offsets); + + multibyte_split_kernel<<>>( // + base_tile_idx, + TILES_PER_CHUNK, + tile_superstates, + tile_offsets, + trie.view(), + cudf::device_span(input_buffer).first(bytes_read), + cudf::device_span(static_cast(nullptr), 0)); + } + + // allocate string offsets + + auto num_results = tile_offsets.back_element(stream); + auto string_offsets = rmm::device_uvector(num_results + 2, stream); + + // first and last element are set manually to zero and size of input, respectively. + // kernel is only responsible for determining delimiter offsets + // auto const x = string_offsets.size() - 1; + // auto const y = input.size(); + // string_offsets.set_element_to_zero_async(0, stream); + // string_offsets.set_element_async(x, y, stream); + + // pattern-match and materialize string offsets + input.seekg(starting_position); + + // TODO: Set seed state. + + tile_superstates.set_seed_async(superstate<16>(), stream); + tile_offsets.set_seed_async(0, stream); + + for (auto base_tile_idx = 0; bytes_read = input.readsome(input_buffer, stream) > 0; + base_tile_idx += TILES_PER_CHUNK) { + // reset the next chunk of tile state + multibyte_split_init_kernel<<>>( // + base_tile_idx, + TILES_PER_CHUNK, + tile_superstates, + tile_offsets); + + multibyte_split_kernel<<>>( // + base_tile_idx, + TILES_PER_CHUNK, + tile_superstates, + tile_offsets, + trie.view(), + cudf::device_span(input_buffer).first(bytes_read), + cudf::device_span(string_offsets).subspan(1, num_results)); + } + + CUDF_FAIL(); + + // return cudf::make_strings_column( // + // cudf::device_span(input.data(), input.size()), + // string_offsets, + // {}, + // 0, + // stream, + // mr); +} // namespace detail + } // namespace detail std::unique_ptr multibyte_split(cudf::string_scalar const& input, @@ -313,6 +442,13 @@ std::unique_ptr multibyte_split(cudf::string_scalar const& input, return detail::multibyte_split(input, delimeters, rmm::cuda_stream_default, mr); } +// std::unique_ptr multibyte_split(cudf::io::text::device_istream& input, +// std::vector const& delimeters, +// rmm::mr::device_memory_resource* mr) +// { +// return detail::multibyte_split(input, delimeters, rmm::cuda_stream_default, mr); +// } + } // namespace text } // namespace io } // namespace cudf diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 2075b4da117..cec50aac160 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/tests/io/text/trie_test.cpp b/cpp/tests/io/text/trie_test.cpp index 1fbecd6e905..2beb8497e4b 100644 --- a/cpp/tests/io/text/trie_test.cpp +++ b/cpp/tests/io/text/trie_test.cpp @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include From 738af4850fa6afb7ee0cccd32bd114b0fd440179 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 15 Jul 2021 23:30:05 -0500 Subject: [PATCH 24/80] update multibyte_split to work with streaming inputs --- cpp/include/cudf/io/text/device_istream.hpp | 5 +- .../cudf/io/text/host_device_istream.hpp | 6 +- cpp/src/io/text/host_device_istream.cpp | 22 ++- cpp/src/io/text/multibyte_split.cu | 155 +++++++++++------- cpp/tests/io/text/multibyte_split_test.cpp | 11 +- 5 files changed, 126 insertions(+), 73 deletions(-) diff --git a/cpp/include/cudf/io/text/device_istream.hpp b/cpp/include/cudf/io/text/device_istream.hpp index 65daae8c5c5..276b2b09c2d 100644 --- a/cpp/include/cudf/io/text/device_istream.hpp +++ b/cpp/include/cudf/io/text/device_istream.hpp @@ -10,9 +10,8 @@ namespace text { class device_istream { public: - virtual uint32_t readsome(cudf::device_span destination, rmm::cuda_stream_view stream) = 0; - virtual uint32_t tellg() = 0; - virtual void seekg(uint32_t pos) = 0; + virtual uint32_t read(cudf::device_span destination, rmm::cuda_stream_view stream) = 0; + virtual void reset() = 0; }; } // namespace text diff --git a/cpp/include/cudf/io/text/host_device_istream.hpp b/cpp/include/cudf/io/text/host_device_istream.hpp index c4970c31179..8d043cf895f 100644 --- a/cpp/include/cudf/io/text/host_device_istream.hpp +++ b/cpp/include/cudf/io/text/host_device_istream.hpp @@ -16,11 +16,9 @@ class host_device_istream : public cudf::io::text::device_istream { public: host_device_istream(std::istream& source_stream) : _source_stream(source_stream) {} - uint32_t readsome(cudf::device_span destination, rmm::cuda_stream_view stream) override; + uint32_t read(cudf::device_span destination, rmm::cuda_stream_view stream) override; - uint32_t tellg() override; - - void seekg(uint32_t pos) override; + void reset() override; private: std::istream& _source_stream; diff --git a/cpp/src/io/text/host_device_istream.cpp b/cpp/src/io/text/host_device_istream.cpp index 85e6ef04601..e488ae3e263 100644 --- a/cpp/src/io/text/host_device_istream.cpp +++ b/cpp/src/io/text/host_device_istream.cpp @@ -11,28 +11,34 @@ namespace cudf { namespace io { namespace text { -uint32_t host_device_istream::readsome(cudf::device_span destination, - rmm::cuda_stream_view stream) +uint32_t host_device_istream::read(cudf::device_span destination, + rmm::cuda_stream_view stream) { auto read_size = destination.size(); if (_host_buffer.size() < read_size) { _host_buffer.resize(read_size); } - read_size = _source_stream.readsome(_host_buffer.data(), read_size); + _source_stream.read(_host_buffer.data(), read_size); + + auto read_size_actual = _source_stream.gcount(); CUDA_TRY(cudaMemcpyAsync( // destination.data(), _host_buffer.data(), - read_size, + read_size_actual, cudaMemcpyHostToDevice, stream.value())); - return read_size; -} + std::cout << "tried to read: " << read_size << ", and got: " << read_size_actual << std::endl; -uint32_t host_device_istream::tellg() { return _source_stream.tellg(); } + return read_size_actual; +} -void host_device_istream::seekg(uint32_t pos) { _source_stream.seekg(pos); } +void host_device_istream::reset() +{ + _source_stream.clear(); + _source_stream.seekg(0, _source_stream.beg); // +} } // namespace text } // namespace io diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 9d50963f0dd..f49bea3a341 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -84,14 +84,19 @@ struct scan_tile_state { tile_status.set_element_async(x - 1, y, stream); } - T back_element(rmm::cuda_stream_view stream) const { return tile_state.back_element(stream); } + // T back_element(rmm::cuda_stream_view stream) const { return tile_state.back_element(stream); } + + T get_inclusive_prefix(cudf::size_type tile_idx, rmm::cuda_stream_view stream) + { + return tile_state.element((tile_idx + tile_status.size()) % tile_status.size(), stream); + } }; // keep ITEMS_PER_TILE below input size to force multi-tile execution. -auto constexpr ITEMS_PER_THREAD = 4; -auto constexpr THREADS_PER_TILE = 4; +auto constexpr ITEMS_PER_THREAD = 2; +auto constexpr THREADS_PER_TILE = 2; auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; -auto constexpr TILES_PER_CHUNK = 4; +auto constexpr TILES_PER_CHUNK = 2; auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; // multibyte_split works by splitting up inputs in to 32 inputs (bytes) per thread, and transforming // them in to data structures called "superstates". these superstates are created by searching a @@ -119,7 +124,8 @@ struct PatternScan { __device__ inline PatternScan(TempStorage& temp_storage) : _temp_storage(temp_storage.Alias()) {} - __device__ inline void Scan(scan_tile_state_view tile_state, + __device__ inline void Scan(cudf::size_type base_tile_idx, + scan_tile_state_view tile_state, cudf::io::text::trie_device_view trie, char (&thread_data)[ITEMS_PER_THREAD], uint32_t (&thread_state)[ITEMS_PER_THREAD]) @@ -136,12 +142,14 @@ struct PatternScan { auto prefix_callback = [&] __device__(superstate const& block_aggregate) -> superstate { if (threadIdx.x == 0) { - _temp_storage.block_aggregate = block_aggregate; - _temp_storage.exclusive_prefix = tile_state.get_inclusive_prefix(blockIdx.x - 1); + _temp_storage.block_aggregate = block_aggregate; + _temp_storage.exclusive_prefix = + tile_state.get_inclusive_prefix(base_tile_idx + blockIdx.x - 1); _temp_storage.inclusive_prefix = _temp_storage.exclusive_prefix + block_aggregate; - tile_state.set_inclusive_prefix(blockIdx.x, _temp_storage.inclusive_prefix); + tile_state.set_inclusive_prefix(base_tile_idx + blockIdx.x, _temp_storage.inclusive_prefix); - printf("bid(%2u) tid(%2u): prefix = %2u %2u\n", + printf("base_tile_idx(%2u) bid(%2u) tid(%2u): prefix = %2u %2u\n", + static_cast(base_tile_idx), blockIdx.x, threadIdx.x, _temp_storage.exclusive_prefix); @@ -191,6 +199,16 @@ __global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, int32_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; int32_t const data_begin = thread_idx * ITEMS_PER_THREAD; int32_t const num_valid = data.size() - data_begin; + int32_t const char_begin = base_tile_idx * ITEMS_PER_TILE; + + if (threadIdx.x == 0) { + printf("base_tile_idx(%2u) bid(%2u) tid(%2u) data_size(%2u) num_valid(%2i)\n", + static_cast(base_tile_idx), + blockIdx.x, + threadIdx.x, + static_cast(data.size()), + num_valid); + } // STEP 1: Load inputs @@ -198,18 +216,21 @@ __global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, for (int32_t i = 0; i < ITEMS_PER_THREAD and i < num_valid; i++) { // thread_data[i] = data[data_begin + i]; + + printf("base_tile_idx(%2u) bid(%2u) tid(%2u) byte(%2u): %c\n", // + static_cast(base_tile_idx), + blockIdx.x, + threadIdx.x, + i, + thread_data[i]); } // STEP 2: Scan inputs to determine absolute thread states uint32_t thread_states[ITEMS_PER_THREAD]; - // is first tile -> blockscan not prefix callback - // is last tile <- num valid < 32 - // AliasTemporiaries - PatternScan(temp_storage.pattern_scan) // - .Scan(tile_superstates, trie, thread_data, thread_states); + .Scan(base_tile_idx, tile_superstates, trie, thread_data, thread_states); // STEP 3: Flag matches @@ -226,9 +247,9 @@ __global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, auto prefix_callback = [&] __device__(uint32_t const& block_aggregate) -> uint32_t { if (threadIdx.x == 0) { temp_storage.offset_scan_exclusive_prefix = - tile_output_offsets.get_inclusive_prefix(blockIdx.x - 1); + tile_output_offsets.get_inclusive_prefix(base_tile_idx + blockIdx.x - 1); auto inclusive_prefix = temp_storage.offset_scan_exclusive_prefix + block_aggregate; - tile_output_offsets.set_inclusive_prefix(blockIdx.x, inclusive_prefix); + tile_output_offsets.set_inclusive_prefix(base_tile_idx + blockIdx.x, inclusive_prefix); } return temp_storage.offset_scan_exclusive_prefix; }; @@ -243,10 +264,11 @@ __global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, if (match_length == 0) { continue; } - auto const match_end = data_begin + i + 1; + auto const match_end = char_begin + data_begin + i + 1; auto const match_begin = match_end - match_length; - printf("bid(%2u) tid(%2u) byte(%2u): %c %2u - [%3u, %3u)\n", // + printf("base_tile_idx(%2u) bid(%2u) tid(%2u) byte(%2u): %c %2u - [%3u, %3u)\n", // + static_cast(base_tile_idx), blockIdx.x, threadIdx.x, i, @@ -303,7 +325,7 @@ std::unique_ptr multibyte_split(cudf::string_scalar const& input, // allocate string offsets - auto num_results = tile_offsets.back_element(stream); + auto num_results = tile_offsets.get_inclusive_prefix(num_tiles - 1, stream); auto string_offsets = rmm::device_uvector(num_results + 2, stream); auto const x = string_offsets.size() - 1; auto const y = input.size(); @@ -325,8 +347,8 @@ std::unique_ptr multibyte_split(cudf::string_scalar const& input, // pattern-match and materialize string offsets multibyte_split_kernel<<>>( // - num_tiles, 0, + num_tiles, tile_superstates, tile_offsets, trie.view(), @@ -347,25 +369,27 @@ std::unique_ptr multibyte_split(cudf::io::text::device_istream& in rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const trie = cudf::io::text::trie::create(delimeters, stream); - - // pattern-match and count delimiters - + auto const trie = cudf::io::text::trie::create(delimeters, stream); auto tile_superstates = scan_tile_state>(TILES_PER_CHUNK * 2, stream); auto tile_offsets = scan_tile_state(TILES_PER_CHUNK * 2, stream); rmm::device_uvector input_buffer(ITEMS_PER_CHUNK, stream); - uint32_t starting_position = input.tellg(); + std::cout << "ITEMS_PER_CHUNK: " << ITEMS_PER_CHUNK << std::endl; + + // uint32_t starting_position = input.tellg(); uint32_t bytes_read; // TODO: Set seed state. - tile_superstates.set_seed_async(superstate<16>(), stream); - tile_offsets.set_seed_async(0, stream); + cudf::size_type bytes_total = 0; - for (auto base_tile_idx = 0; bytes_read = input.readsome(input_buffer, stream) > 0; + for (auto base_tile_idx = 0; (bytes_read = input.read(input_buffer, stream)) > 0; base_tile_idx += TILES_PER_CHUNK) { + bytes_total += bytes_read; + + std::cout << "btid: " << base_tile_idx << ", bytes_read: " << bytes_read << std::endl; + // reset the next chunk of tile state multibyte_split_init_kernel<<>>( // base_tile_idx, @@ -373,6 +397,11 @@ std::unique_ptr multibyte_split(cudf::io::text::device_istream& in tile_superstates, tile_offsets); + if (base_tile_idx == 0) { + tile_superstates.set_seed_async(superstate<16>(), stream); + tile_offsets.set_seed_async(0, stream); + } + multibyte_split_kernel<<>>( // base_tile_idx, TILES_PER_CHUNK, @@ -381,37 +410,42 @@ std::unique_ptr multibyte_split(cudf::io::text::device_istream& in trie.view(), cudf::device_span(input_buffer).first(bytes_read), cudf::device_span(static_cast(nullptr), 0)); + + stream.synchronize(); } // allocate string offsets - auto num_results = tile_offsets.back_element(stream); + auto num_tiles = ceil_div(bytes_total, ITEMS_PER_TILE); + auto num_results = tile_offsets.get_inclusive_prefix(num_tiles - 1, stream); auto string_offsets = rmm::device_uvector(num_results + 2, stream); + std::cout << "num results: " << num_results << std::endl; + // first and last element are set manually to zero and size of input, respectively. // kernel is only responsible for determining delimiter offsets - // auto const x = string_offsets.size() - 1; - // auto const y = input.size(); - // string_offsets.set_element_to_zero_async(0, stream); - // string_offsets.set_element_async(x, y, stream); + auto const x = string_offsets.size() - 1; + string_offsets.set_element_to_zero_async(0, stream); + string_offsets.set_element_async(x, bytes_total, stream); // pattern-match and materialize string offsets - input.seekg(starting_position); - - // TODO: Set seed state. + input.reset(); - tile_superstates.set_seed_async(superstate<16>(), stream); - tile_offsets.set_seed_async(0, stream); - - for (auto base_tile_idx = 0; bytes_read = input.readsome(input_buffer, stream) > 0; + for (auto base_tile_idx = 0; (bytes_read = input.read(input_buffer, stream)) > 0; base_tile_idx += TILES_PER_CHUNK) { // reset the next chunk of tile state + multibyte_split_init_kernel<<>>( // base_tile_idx, TILES_PER_CHUNK, tile_superstates, tile_offsets); + if (base_tile_idx == 0) { + tile_superstates.set_seed_async(superstate<16>(), stream); + tile_offsets.set_seed_async(0, stream); + } + multibyte_split_kernel<<>>( // base_tile_idx, TILES_PER_CHUNK, @@ -420,18 +454,29 @@ std::unique_ptr multibyte_split(cudf::io::text::device_istream& in trie.view(), cudf::device_span(input_buffer).first(bytes_read), cudf::device_span(string_offsets).subspan(1, num_results)); + + stream.synchronize(); } - CUDF_FAIL(); + input.reset(); - // return cudf::make_strings_column( // - // cudf::device_span(input.data(), input.size()), - // string_offsets, - // {}, - // 0, - // stream, - // mr); -} // namespace detail + input_buffer = rmm::device_uvector(bytes_total, stream); + bytes_read = input.read(input_buffer, stream); + + auto result = cudf::make_strings_column( // + input_buffer, + string_offsets, + {}, + 0, + stream, + mr); + + stream.synchronize(); + + // return cudf::make_empty_column(cudf::data_type{cudf::type_id::DICTIONARY32}); + + return result; +} } // namespace detail @@ -442,12 +487,12 @@ std::unique_ptr multibyte_split(cudf::string_scalar const& input, return detail::multibyte_split(input, delimeters, rmm::cuda_stream_default, mr); } -// std::unique_ptr multibyte_split(cudf::io::text::device_istream& input, -// std::vector const& delimeters, -// rmm::mr::device_memory_resource* mr) -// { -// return detail::multibyte_split(input, delimeters, rmm::cuda_stream_default, mr); -// } +std::unique_ptr multibyte_split(cudf::io::text::device_istream& input, + std::vector const& delimeters, + rmm::mr::device_memory_resource* mr) +{ + return detail::multibyte_split(input, delimeters, rmm::cuda_stream_default, mr); +} } // namespace text } // namespace io diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index cec50aac160..94eebd82cc0 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -35,12 +35,12 @@ constexpr bool print_all{false}; struct MultibyteSplitTest : public BaseFixture { }; -TEST_F(MultibyteSplitTest, Simple) +TEST_F(MultibyteSplitTest, Simple1) { // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 auto delimiters = std::vector({"😀", "😎", ",", "::"}); - cudf::string_scalar input( + auto host_input = std::string( "aaa😀" "bbb😀" "ccc😀" @@ -76,7 +76,12 @@ TEST_F(MultibyteSplitTest, Simple) "emojis,", "which,", "are😎", "multiple,", "bytes::", "and😎", "used😎", "as😎", "delimeters.😎", "::", ",", "😀", ""}; - auto out = cudf::io::text::multibyte_split(input, delimiters); + auto host_input_stream = std::basic_stringstream(host_input); + auto device_input_stream = cudf::io::text::host_device_istream(host_input_stream); + // auto device_input = cudf::string_scalar(host_input); + + auto out = cudf::io::text::multibyte_split(device_input_stream, delimiters); + // auto out = cudf::io::text::multibyte_split(input, delimiters); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); } From 0121b22a019720c8c7426681fe6ddbce8d7a01b5 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 16 Jul 2021 00:39:24 -0500 Subject: [PATCH 25/80] consolidate two passes of stream-scanning to a single function --- cpp/src/io/text/multibyte_split.cu | 132 +++++++++++------------------ 1 file changed, 49 insertions(+), 83 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index f49bea3a341..940f6b2c602 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -148,11 +148,11 @@ struct PatternScan { _temp_storage.inclusive_prefix = _temp_storage.exclusive_prefix + block_aggregate; tile_state.set_inclusive_prefix(base_tile_idx + blockIdx.x, _temp_storage.inclusive_prefix); - printf("base_tile_idx(%2u) bid(%2u) tid(%2u): prefix = %2u %2u\n", - static_cast(base_tile_idx), - blockIdx.x, - threadIdx.x, - _temp_storage.exclusive_prefix); + // printf("base_tile_idx(%2u) bid(%2u) tid(%2u): prefix = %2u %2u\n", + // static_cast(base_tile_idx), + // blockIdx.x, + // threadIdx.x, + // _temp_storage.exclusive_prefix); } return _temp_storage.exclusive_prefix; }; @@ -201,28 +201,12 @@ __global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, int32_t const num_valid = data.size() - data_begin; int32_t const char_begin = base_tile_idx * ITEMS_PER_TILE; - if (threadIdx.x == 0) { - printf("base_tile_idx(%2u) bid(%2u) tid(%2u) data_size(%2u) num_valid(%2i)\n", - static_cast(base_tile_idx), - blockIdx.x, - threadIdx.x, - static_cast(data.size()), - num_valid); - } - // STEP 1: Load inputs char thread_data[ITEMS_PER_THREAD]; for (int32_t i = 0; i < ITEMS_PER_THREAD and i < num_valid; i++) { // thread_data[i] = data[data_begin + i]; - - printf("base_tile_idx(%2u) bid(%2u) tid(%2u) byte(%2u): %c\n", // - static_cast(base_tile_idx), - blockIdx.x, - threadIdx.x, - i, - thread_data[i]); } // STEP 2: Scan inputs to determine absolute thread states @@ -267,16 +251,6 @@ __global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, auto const match_end = char_begin + data_begin + i + 1; auto const match_begin = match_end - match_length; - printf("base_tile_idx(%2u) bid(%2u) tid(%2u) byte(%2u): %c %2u - [%3u, %3u)\n", // - static_cast(base_tile_idx), - blockIdx.x, - threadIdx.x, - i, - thread_data[i], - thread_offsets[i], - match_begin, - match_end); - if (string_offsets.size() > thread_offsets[i]) { // string_offsets[thread_offsets[i]] = match_end; } @@ -364,32 +338,24 @@ std::unique_ptr multibyte_split(cudf::string_scalar const& input, mr); } -std::unique_ptr multibyte_split(cudf::io::text::device_istream& input, - std::vector const& delimeters, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +cudf::size_type scan_full_stream(cudf::io::text::device_istream& input, + cudf::io::text::trie const& trie, + scan_tile_state>& tile_superstates, + scan_tile_state& tile_offsets, + device_span output_buffer, + rmm::cuda_stream_view stream) { - auto const trie = cudf::io::text::trie::create(delimeters, stream); - auto tile_superstates = scan_tile_state>(TILES_PER_CHUNK * 2, stream); - auto tile_offsets = scan_tile_state(TILES_PER_CHUNK * 2, stream); - - rmm::device_uvector input_buffer(ITEMS_PER_CHUNK, stream); - - std::cout << "ITEMS_PER_CHUNK: " << ITEMS_PER_CHUNK << std::endl; - - // uint32_t starting_position = input.tellg(); uint32_t bytes_read; + cudf::size_type bytes_total = 0; - // TODO: Set seed state. + rmm::device_uvector input_buffer(ITEMS_PER_CHUNK, stream); - cudf::size_type bytes_total = 0; + // this function can be updated to interleave two kernel executions, such that two input buffers for (auto base_tile_idx = 0; (bytes_read = input.read(input_buffer, stream)) > 0; base_tile_idx += TILES_PER_CHUNK) { bytes_total += bytes_read; - std::cout << "btid: " << base_tile_idx << ", bytes_read: " << bytes_read << std::endl; - // reset the next chunk of tile state multibyte_split_init_kernel<<>>( // base_tile_idx, @@ -408,20 +374,38 @@ std::unique_ptr multibyte_split(cudf::io::text::device_istream& in tile_superstates, tile_offsets, trie.view(), - cudf::device_span(input_buffer).first(bytes_read), - cudf::device_span(static_cast(nullptr), 0)); + device_span(input_buffer).first(bytes_read), + output_buffer); stream.synchronize(); } + return bytes_total; +} + +std::unique_ptr multibyte_split(cudf::io::text::device_istream& input, + std::vector const& delimeters, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const trie = cudf::io::text::trie::create(delimeters, stream); + auto tile_superstates = scan_tile_state>(TILES_PER_CHUNK * 2, stream); + auto tile_offsets = scan_tile_state(TILES_PER_CHUNK * 2, stream); + + auto bytes_total = + scan_full_stream(input, + trie, + tile_superstates, + tile_offsets, + cudf::device_span(static_cast(nullptr), 0), + stream); + // allocate string offsets auto num_tiles = ceil_div(bytes_total, ITEMS_PER_TILE); auto num_results = tile_offsets.get_inclusive_prefix(num_tiles - 1, stream); auto string_offsets = rmm::device_uvector(num_results + 2, stream); - std::cout << "num results: " << num_results << std::endl; - // first and last element are set manually to zero and size of input, respectively. // kernel is only responsible for determining delimiter offsets auto const x = string_offsets.size() - 1; @@ -431,50 +415,32 @@ std::unique_ptr multibyte_split(cudf::io::text::device_istream& in // pattern-match and materialize string offsets input.reset(); - for (auto base_tile_idx = 0; (bytes_read = input.read(input_buffer, stream)) > 0; - base_tile_idx += TILES_PER_CHUNK) { - // reset the next chunk of tile state - - multibyte_split_init_kernel<<>>( // - base_tile_idx, - TILES_PER_CHUNK, - tile_superstates, - tile_offsets); - - if (base_tile_idx == 0) { - tile_superstates.set_seed_async(superstate<16>(), stream); - tile_offsets.set_seed_async(0, stream); - } - - multibyte_split_kernel<<>>( // - base_tile_idx, - TILES_PER_CHUNK, - tile_superstates, - tile_offsets, - trie.view(), - cudf::device_span(input_buffer).first(bytes_read), - cudf::device_span(string_offsets).subspan(1, num_results)); + scan_full_stream(input, + trie, + tile_superstates, + tile_offsets, + cudf::device_span(string_offsets).subspan(1, num_results), + stream); - stream.synchronize(); - } + // copy chars + auto string_chars = rmm::device_uvector(bytes_total, stream); input.reset(); + input.read(string_chars, stream); - input_buffer = rmm::device_uvector(bytes_total, stream); - bytes_read = input.read(input_buffer, stream); - + // copy chars and offsets to make new strings column. auto result = cudf::make_strings_column( // - input_buffer, + string_chars, string_offsets, {}, 0, stream, mr); + // This synchronization is required to keep input_buffer in scope long enough to copy. Can be + // by using `std::unique_ptr` overload, or making a new one that accepts `device_uvector`. stream.synchronize(); - // return cudf::make_empty_column(cudf::data_type{cudf::type_id::DICTIONARY32}); - return result; } From a233ca2024ddfa8a0c4b88b7d0fc45b5bcbbc9ed Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 16 Jul 2021 10:13:11 -0500 Subject: [PATCH 26/80] add tile_state partial to multibyte_split but dont use yet --- cpp/include/cudf/io/text/trie.hpp | 46 ++++++++-------- cpp/src/io/text/multibyte_split.cu | 84 ++++++++++++++++++------------ 2 files changed, 73 insertions(+), 57 deletions(-) diff --git a/cpp/include/cudf/io/text/trie.hpp b/cpp/include/cudf/io/text/trie.hpp index 9e931ce48ae..fa9c62ad56e 100644 --- a/cpp/include/cudf/io/text/trie.hpp +++ b/cpp/include/cudf/io/text/trie.hpp @@ -147,29 +147,29 @@ struct trie { // copy host buffers to device - RMM_CUDA_TRY(cudaMemcpyAsync(device_layer_offsets.data(), - layer_offsets.data(), - layer_offsets.size() * sizeof(uint16_t), - cudaMemcpyDefault, - stream.value())); - - RMM_CUDA_TRY(cudaMemcpyAsync(device_tokens.data(), - tokens.data(), - tokens.size() * sizeof(char), - cudaMemcpyDefault, - stream.value())); - - RMM_CUDA_TRY(cudaMemcpyAsync(device_transitions.data(), - transitions.data(), - transitions.size() * sizeof(uint16_t), - cudaMemcpyDefault, - stream.value())); - - RMM_CUDA_TRY(cudaMemcpyAsync(device_match_length.data(), - match_length.data(), - match_length.size() * sizeof(uint8_t), - cudaMemcpyDefault, - stream.value())); + CUDA_TRY(cudaMemcpyAsync(device_layer_offsets.data(), + layer_offsets.data(), + layer_offsets.size() * sizeof(uint16_t), + cudaMemcpyDefault, + stream.value())); + + CUDA_TRY(cudaMemcpyAsync(device_tokens.data(), + tokens.data(), + tokens.size() * sizeof(char), + cudaMemcpyDefault, + stream.value())); + + CUDA_TRY(cudaMemcpyAsync(device_transitions.data(), + transitions.data(), + transitions.size() * sizeof(uint16_t), + cudaMemcpyDefault, + stream.value())); + + CUDA_TRY(cudaMemcpyAsync(device_match_length.data(), + match_length.data(), + match_length.size() * sizeof(uint8_t), + cudaMemcpyDefault, + stream.value())); // create owning container diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 940f6b2c602..9f1ac43f672 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -27,76 +27,90 @@ inline constexpr auto ceil_div(Dividend dividend, Divisor divisor) using superstate = cudf::io::text::superstate<16>; +enum class scan_tile_status : uint8_t { + uninitialized, + partial, + inclusive, +}; + template struct scan_tile_state_view { uint64_t num_tiles; - bool* tile_status; - T* tile_state; + scan_tile_status* tile_status; + T* tile_partial; + T* tile_inclusive; - __device__ void initialize(cudf::size_type base_tile_idx, cudf::size_type count) + __device__ inline void initialize(cudf::size_type base_tile_idx, cudf::size_type count) { auto thread_idx = blockIdx.x * blockDim.x + threadIdx.x; if (thread_idx < count) { // - tile_status[(base_tile_idx + thread_idx) % num_tiles] = false; + tile_status[(base_tile_idx + thread_idx) % num_tiles] = scan_tile_status::uninitialized; } } - __device__ void set_inclusive_prefix(cudf::size_type tile_idx, T value) + __device__ inline void set_inclusive_prefix(cudf::size_type tile_idx, T value) { - cub::ThreadStore(tile_state + ((tile_idx + num_tiles) % num_tiles), value); + auto const offset = (tile_idx + num_tiles) % num_tiles; + cub::ThreadStore(tile_inclusive + offset, value); __threadfence(); - cub::ThreadStore(tile_status + ((tile_idx + num_tiles) % num_tiles), true); + cub::ThreadStore(tile_status + offset, scan_tile_status::inclusive); } - __device__ T get_inclusive_prefix(cudf::size_type tile_idx) + __device__ inline T get_inclusive_prefix(cudf::size_type tile_idx) { - while (cub::ThreadLoad(tile_status + ((tile_idx + num_tiles) % num_tiles)) == - false) { + auto const offset = (tile_idx + num_tiles) % num_tiles; + while (cub::ThreadLoad(tile_status + offset) != scan_tile_status::inclusive) { __threadfence(); } - return cub::ThreadLoad(tile_state + ((tile_idx + num_tiles) % num_tiles)); + return cub::ThreadLoad(tile_inclusive + offset); } }; template struct scan_tile_state { - rmm::device_uvector tile_status; - rmm::device_uvector tile_state; + rmm::device_uvector tile_status; + rmm::device_uvector tile_state_partial; + rmm::device_uvector tile_state_inclusive; scan_tile_state(cudf::size_type num_tiles, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : tile_status(rmm::device_uvector(num_tiles, stream, mr)), - tile_state(rmm::device_uvector(num_tiles, stream, mr)) + : tile_status(rmm::device_uvector(num_tiles, stream, mr)), + tile_state_partial(rmm::device_uvector(num_tiles, stream, mr)), + tile_state_inclusive(rmm::device_uvector(num_tiles, stream, mr)) { } operator scan_tile_state_view() { - return scan_tile_state_view{tile_status.size(), tile_status.data(), tile_state.data()}; + return scan_tile_state_view{tile_status.size(), + tile_status.data(), + tile_state_partial.data(), + tile_state_inclusive.data()}; } - void set_seed_async(T const seed, rmm::cuda_stream_view stream) + inline void set_seed_async(T const seed, rmm::cuda_stream_view stream) { auto x = tile_status.size(); - bool y = true; - tile_state.set_element_async(x - 1, seed, stream); + auto y = scan_tile_status::inclusive; + tile_state_inclusive.set_element_async(x - 1, seed, stream); tile_status.set_element_async(x - 1, y, stream); } // T back_element(rmm::cuda_stream_view stream) const { return tile_state.back_element(stream); } - T get_inclusive_prefix(cudf::size_type tile_idx, rmm::cuda_stream_view stream) + inline T get_inclusive_prefix(cudf::size_type tile_idx, rmm::cuda_stream_view stream) const { - return tile_state.element((tile_idx + tile_status.size()) % tile_status.size(), stream); + auto const offset = (tile_idx + tile_status.size()) % tile_status.size(); + return tile_state_inclusive.element(offset, stream); } }; // keep ITEMS_PER_TILE below input size to force multi-tile execution. -auto constexpr ITEMS_PER_THREAD = 2; -auto constexpr THREADS_PER_TILE = 2; +auto constexpr ITEMS_PER_THREAD = 32; +auto constexpr THREADS_PER_TILE = 32; auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; -auto constexpr TILES_PER_CHUNK = 2; +auto constexpr TILES_PER_CHUNK = 32; auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; // multibyte_split works by splitting up inputs in to 32 inputs (bytes) per thread, and transforming // them in to data structures called "superstates". these superstates are created by searching a @@ -124,7 +138,7 @@ struct PatternScan { __device__ inline PatternScan(TempStorage& temp_storage) : _temp_storage(temp_storage.Alias()) {} - __device__ inline void Scan(cudf::size_type base_tile_idx, + __device__ inline void Scan(cudf::size_type tile_idx, scan_tile_state_view tile_state, cudf::io::text::trie_device_view trie, char (&thread_data)[ITEMS_PER_THREAD], @@ -141,15 +155,16 @@ struct PatternScan { } auto prefix_callback = [&] __device__(superstate const& block_aggregate) -> superstate { + if (threadIdx.x < THREADS_PER_TILE) {} + if (threadIdx.x == 0) { - _temp_storage.block_aggregate = block_aggregate; - _temp_storage.exclusive_prefix = - tile_state.get_inclusive_prefix(base_tile_idx + blockIdx.x - 1); + _temp_storage.block_aggregate = block_aggregate; + _temp_storage.exclusive_prefix = tile_state.get_inclusive_prefix(tile_idx - 1); _temp_storage.inclusive_prefix = _temp_storage.exclusive_prefix + block_aggregate; - tile_state.set_inclusive_prefix(base_tile_idx + blockIdx.x, _temp_storage.inclusive_prefix); + tile_state.set_inclusive_prefix(tile_idx, _temp_storage.inclusive_prefix); - // printf("base_tile_idx(%2u) bid(%2u) tid(%2u): prefix = %2u %2u\n", - // static_cast(base_tile_idx), + // printf("tile_idx(%2u) bid(%2u) tid(%2u): prefix = %2u %2u\n", + // static_cast(tile_idx), // blockIdx.x, // threadIdx.x, // _temp_storage.exclusive_prefix); @@ -196,6 +211,7 @@ __global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, }; } temp_storage; + int32_t const tile_idx = base_tile_idx + blockIdx.x; int32_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; int32_t const data_begin = thread_idx * ITEMS_PER_THREAD; int32_t const num_valid = data.size() - data_begin; @@ -214,7 +230,7 @@ __global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, uint32_t thread_states[ITEMS_PER_THREAD]; PatternScan(temp_storage.pattern_scan) // - .Scan(base_tile_idx, tile_superstates, trie, thread_data, thread_states); + .Scan(tile_idx, tile_superstates, trie, thread_data, thread_states); // STEP 3: Flag matches @@ -231,9 +247,9 @@ __global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, auto prefix_callback = [&] __device__(uint32_t const& block_aggregate) -> uint32_t { if (threadIdx.x == 0) { temp_storage.offset_scan_exclusive_prefix = - tile_output_offsets.get_inclusive_prefix(base_tile_idx + blockIdx.x - 1); + tile_output_offsets.get_inclusive_prefix(tile_idx - 1); auto inclusive_prefix = temp_storage.offset_scan_exclusive_prefix + block_aggregate; - tile_output_offsets.set_inclusive_prefix(base_tile_idx + blockIdx.x, inclusive_prefix); + tile_output_offsets.set_inclusive_prefix(tile_idx, inclusive_prefix); } return temp_storage.offset_scan_exclusive_prefix; }; From 494605899baf5b99b8383c34407e273e35fb1c1f Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 16 Jul 2021 13:25:43 -0500 Subject: [PATCH 27/80] add reusable tilestate callback to `multibyte_split` --- cpp/src/io/text/multibyte_split.cu | 122 ++++++++++++++++++++--------- 1 file changed, 87 insertions(+), 35 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 9f1ac43f672..f9bce334df0 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -28,7 +28,7 @@ inline constexpr auto ceil_div(Dividend dividend, Divisor divisor) using superstate = cudf::io::text::superstate<16>; enum class scan_tile_status : uint8_t { - uninitialized, + invalid, partial, inclusive, }; @@ -44,10 +44,18 @@ struct scan_tile_state_view { { auto thread_idx = blockIdx.x * blockDim.x + threadIdx.x; if (thread_idx < count) { // - tile_status[(base_tile_idx + thread_idx) % num_tiles] = scan_tile_status::uninitialized; + tile_status[(base_tile_idx + thread_idx) % num_tiles] = scan_tile_status::invalid; } } + __device__ inline void set_partial_prefix(cudf::size_type tile_idx, T value) + { + auto const offset = (tile_idx + num_tiles) % num_tiles; + cub::ThreadStore(tile_inclusive + offset, value); + __threadfence(); + cub::ThreadStore(tile_status + offset, scan_tile_status::partial); + } + __device__ inline void set_inclusive_prefix(cudf::size_type tile_idx, T value) { auto const offset = (tile_idx + num_tiles) % num_tiles; @@ -56,6 +64,22 @@ struct scan_tile_state_view { cub::ThreadStore(tile_status + offset, scan_tile_status::inclusive); } + __device__ inline T get_prefix(cudf::size_type tile_idx, scan_tile_status& status) + { + auto const offset = (tile_idx + num_tiles) % num_tiles; + + while ((status = cub::ThreadLoad(tile_status + offset)) == + scan_tile_status::invalid) { + __threadfence(); + } + + if (status == scan_tile_status::partial) { + return cub::ThreadLoad(tile_partial + offset); + } else { + return cub::ThreadLoad(tile_inclusive + offset); + } + } + __device__ inline T get_inclusive_prefix(cudf::size_type tile_idx) { auto const offset = (tile_idx + num_tiles) % num_tiles; @@ -106,11 +130,13 @@ struct scan_tile_state { } }; +auto constexpr DO_AGGREGATE_PARTIALS = false; + // keep ITEMS_PER_TILE below input size to force multi-tile execution. -auto constexpr ITEMS_PER_THREAD = 32; -auto constexpr THREADS_PER_TILE = 32; +auto constexpr ITEMS_PER_THREAD = 2; +auto constexpr THREADS_PER_TILE = 2; auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; -auto constexpr TILES_PER_CHUNK = 32; +auto constexpr TILES_PER_CHUNK = 2; auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; // multibyte_split works by splitting up inputs in to 32 inputs (bytes) per thread, and transforming // them in to data structures called "superstates". these superstates are created by searching a @@ -122,14 +148,62 @@ auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; // it begins in. From there, each thread can then take deterministic action. In this case, the // deterministic action is counting and outputting delimiter offsets when a delimiter is found. +template +struct scan_tile_state_callback { + struct _TempStorage { + T exclusive_prefix; + }; + + using TempStorage = cub::Uninitialized<_TempStorage>; + + __device__ inline scan_tile_state_callback(TempStorage& temp_storage, + scan_tile_state_view& tile_state, + cudf::size_type tile_idx) + : _temp_storage(temp_storage.Alias()), _tile_state(tile_state), _tile_idx(tile_idx) + { + } + + __device__ inline T operator()(T const& block_aggregate) + { + if (threadIdx.x == 0) { + if constexpr (DO_AGGREGATE_PARTIALS) { + // scan partials to form prefix + auto predecessor_idx = _tile_idx - 1; + auto predecessor_status = scan_tile_status::invalid; + auto window_partial = T{}; + + do { + auto predecessor_prefix = _tile_state.get_prefix(predecessor_idx, predecessor_status); + window_partial = predecessor_prefix + window_partial; + } while (predecessor_status != scan_tile_status::inclusive); + + _temp_storage.exclusive_prefix = window_partial; + } else { + // wait for prefix + _temp_storage.exclusive_prefix = _tile_state.get_inclusive_prefix(_tile_idx - 1); + } + + auto inclusive_prefix = _temp_storage.exclusive_prefix + block_aggregate; + _tile_state.set_inclusive_prefix(_tile_idx, inclusive_prefix); + } + + __syncthreads(); // TODO: remove if unnecessary. + + return _temp_storage.exclusive_prefix; + } + + _TempStorage& _temp_storage; + scan_tile_state_view& _tile_state; + cudf::size_type _tile_idx; +}; + struct PatternScan { typedef cub::BlockScan BlockScan; + typedef scan_tile_state_callback BlockScanCallback; struct _TempStorage { typename BlockScan::TempStorage scan; - superstate block_aggregate; - superstate exclusive_prefix; - superstate inclusive_prefix; + typename BlockScanCallback::TempStorage scan_callback; }; _TempStorage& _temp_storage; @@ -154,23 +228,7 @@ struct PatternScan { }); } - auto prefix_callback = [&] __device__(superstate const& block_aggregate) -> superstate { - if (threadIdx.x < THREADS_PER_TILE) {} - - if (threadIdx.x == 0) { - _temp_storage.block_aggregate = block_aggregate; - _temp_storage.exclusive_prefix = tile_state.get_inclusive_prefix(tile_idx - 1); - _temp_storage.inclusive_prefix = _temp_storage.exclusive_prefix + block_aggregate; - tile_state.set_inclusive_prefix(tile_idx, _temp_storage.inclusive_prefix); - - // printf("tile_idx(%2u) bid(%2u) tid(%2u): prefix = %2u %2u\n", - // static_cast(tile_idx), - // blockIdx.x, - // threadIdx.x, - // _temp_storage.exclusive_prefix); - } - return _temp_storage.exclusive_prefix; - }; + auto prefix_callback = BlockScanCallback(_temp_storage.scan_callback, tile_state, tile_idx); BlockScan(_temp_storage.scan) .ExclusiveSum(thread_superstate, thread_superstate, prefix_callback); @@ -202,12 +260,13 @@ __global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, cudf::device_span string_offsets) { typedef cub::BlockScan OffsetScan; + typedef scan_tile_state_callback OffsetScanCallback; __shared__ union { typename PatternScan::TempStorage pattern_scan; struct { typename OffsetScan::TempStorage offset_scan; - uint32_t offset_scan_exclusive_prefix; + typename OffsetScanCallback::TempStorage offset_scan_callback; }; } temp_storage; @@ -244,15 +303,8 @@ __global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, __syncthreads(); // required before temp_memory re-use - auto prefix_callback = [&] __device__(uint32_t const& block_aggregate) -> uint32_t { - if (threadIdx.x == 0) { - temp_storage.offset_scan_exclusive_prefix = - tile_output_offsets.get_inclusive_prefix(tile_idx - 1); - auto inclusive_prefix = temp_storage.offset_scan_exclusive_prefix + block_aggregate; - tile_output_offsets.set_inclusive_prefix(tile_idx, inclusive_prefix); - } - return temp_storage.offset_scan_exclusive_prefix; - }; + auto prefix_callback = + OffsetScanCallback(temp_storage.offset_scan_callback, tile_output_offsets, tile_idx); OffsetScan(temp_storage.offset_scan) .ExclusiveSum(thread_offsets, thread_offsets, prefix_callback); From d69aecabd459a64dfc942ebedc851b27e3b91136 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 16 Jul 2021 14:27:22 -0500 Subject: [PATCH 28/80] begin working on warp-reduce window aggregation of tile state in multibyte_split --- cpp/src/io/text/multibyte_split.cu | 53 ++++++++++++++++------ cpp/tests/io/text/multibyte_split_test.cpp | 1 + 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index f9bce334df0..d3117d2680c 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -133,8 +134,8 @@ struct scan_tile_state { auto constexpr DO_AGGREGATE_PARTIALS = false; // keep ITEMS_PER_TILE below input size to force multi-tile execution. -auto constexpr ITEMS_PER_THREAD = 2; -auto constexpr THREADS_PER_TILE = 2; +auto constexpr ITEMS_PER_THREAD = 1; +auto constexpr THREADS_PER_TILE = 32; auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; auto constexpr TILES_PER_CHUNK = 2; auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; @@ -150,7 +151,10 @@ auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; template struct scan_tile_state_callback { + using WarpReduce = cub::WarpReduce; + struct _TempStorage { + typename WarpReduce::TempStorage reduce; T exclusive_prefix; }; @@ -165,26 +169,49 @@ struct scan_tile_state_callback { __device__ inline T operator()(T const& block_aggregate) { - if (threadIdx.x == 0) { - if constexpr (DO_AGGREGATE_PARTIALS) { - // scan partials to form prefix - auto predecessor_idx = _tile_idx - 1; - auto predecessor_status = scan_tile_status::invalid; - auto window_partial = T{}; + auto predecessor_idx = _tile_idx - 1 - threadIdx.x; + auto predecessor_status = scan_tile_status::invalid; + + if constexpr (DO_AGGREGATE_PARTIALS) { + // scan partials to form prefix + auto window_partial = T{}; + if (threadIdx.x == 0) { do { auto predecessor_prefix = _tile_state.get_prefix(predecessor_idx, predecessor_status); window_partial = predecessor_prefix + window_partial; + predecessor_idx--; } while (predecessor_status != scan_tile_status::inclusive); _temp_storage.exclusive_prefix = window_partial; - } else { - // wait for prefix - _temp_storage.exclusive_prefix = _tile_state.get_inclusive_prefix(_tile_idx - 1); + } + } else { + // wait for prefix + if (threadIdx.x == 0) { + _temp_storage.exclusive_prefix = _tile_state.get_inclusive_prefix(predecessor_idx); } - auto inclusive_prefix = _temp_storage.exclusive_prefix + block_aggregate; - _tile_state.set_inclusive_prefix(_tile_idx, inclusive_prefix); + if (threadIdx.x < 1) { // setting this to 2 hangs. 1 is fine. :( + + auto predecessor_prefix = _tile_state.get_prefix(predecessor_idx, predecessor_status); + // auto fun_value = WarpReduce(_temp_storage.reduce) // + // .TailSegmentedReduce(predecessor_prefix, + // predecessor_status == + // scan_tile_status::inclusive, + // [](T const& lhs, T const& rhs) { return rhs + + // lhs; }); + + // printf("tile_idx(%2lu) bid(%2u) tid(%2u) pred_status(%2u) fun(%2u %2u)\n", + // _tile_idx, + // blockIdx.x, + // threadIdx.x, + // static_cast(predecessor_status), + // fun_value); + } + } + + if (threadIdx.x == 0) { + _tile_state.set_inclusive_prefix(_tile_idx, _temp_storage.exclusive_prefix + block_aggregate); } __syncthreads(); // TODO: remove if unnecessary. diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 94eebd82cc0..55896218480 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -84,4 +84,5 @@ TEST_F(MultibyteSplitTest, Simple1) // auto out = cudf::io::text::multibyte_split(input, delimiters); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); + CUDF_FAIL(); } From 079d1ea588201e71ffb40a932d259a99aa297662 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 17 Jul 2021 11:15:29 -0500 Subject: [PATCH 29/80] fix multibyte_split bug where non-streaming approach would hang --- .../io/text/multibyte_split_benchmark.cpp | 10 +- cpp/src/io/text/host_device_istream.cpp | 2 +- cpp/src/io/text/multibyte_split.cu | 96 +++++++++++-------- cpp/tests/io/text/multibyte_split_test.cpp | 52 +++++++++- 4 files changed, 117 insertions(+), 43 deletions(-) diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp index e40b991874c..0fc197c693c 100644 --- a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp +++ b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp @@ -14,12 +14,16 @@ * limitations under the License. */ -#include #include #include + +#include #include #include #include + +#include + #include using cudf::test::fixed_width_column_wrapper; @@ -35,8 +39,12 @@ static void BM_multibyte_split(benchmark::State& state) auto delimiters = std::vector({"😀", "😎", ",", "::"}); + // auto host_input_stream = std::basic_stringstream(host_input); + // auto device_input_stream = cudf::io::text::host_device_istream(host_input_stream); + for (auto _ : state) { cuda_event_timer raii(state, true); + // auto output = cudf::io::text::multibyte_split(device_input_stream, delimiters); auto output = cudf::io::text::multibyte_split(input, delimiters); } diff --git a/cpp/src/io/text/host_device_istream.cpp b/cpp/src/io/text/host_device_istream.cpp index e488ae3e263..6c5c14811b5 100644 --- a/cpp/src/io/text/host_device_istream.cpp +++ b/cpp/src/io/text/host_device_istream.cpp @@ -29,7 +29,7 @@ uint32_t host_device_istream::read(cudf::device_span destination, cudaMemcpyHostToDevice, stream.value())); - std::cout << "tried to read: " << read_size << ", and got: " << read_size_actual << std::endl; + // std::cout << "tried to read: " << read_size << ", and got: " << read_size_actual << std::endl; return read_size_actual; } diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index d3117d2680c..f45ec700af3 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -29,6 +29,7 @@ inline constexpr auto ceil_div(Dividend dividend, Divisor divisor) using superstate = cudf::io::text::superstate<16>; enum class scan_tile_status : uint8_t { + oob, invalid, partial, inclusive, @@ -41,18 +42,20 @@ struct scan_tile_state_view { T* tile_partial; T* tile_inclusive; - __device__ inline void initialize(cudf::size_type base_tile_idx, cudf::size_type count) + __device__ inline void initialize_status(cudf::size_type base_tile_idx, + cudf::size_type count, + scan_tile_status status) { auto thread_idx = blockIdx.x * blockDim.x + threadIdx.x; if (thread_idx < count) { // - tile_status[(base_tile_idx + thread_idx) % num_tiles] = scan_tile_status::invalid; + tile_status[(base_tile_idx + thread_idx) % num_tiles] = status; } } __device__ inline void set_partial_prefix(cudf::size_type tile_idx, T value) { auto const offset = (tile_idx + num_tiles) % num_tiles; - cub::ThreadStore(tile_inclusive + offset, value); + cub::ThreadStore(tile_partial + offset, value); __threadfence(); cub::ThreadStore(tile_status + offset, scan_tile_status::partial); } @@ -131,13 +134,13 @@ struct scan_tile_state { } }; -auto constexpr DO_AGGREGATE_PARTIALS = false; +auto constexpr PARTIAL_AGGRIGATION_STRATEGY = 2; // keep ITEMS_PER_TILE below input size to force multi-tile execution. -auto constexpr ITEMS_PER_THREAD = 1; -auto constexpr THREADS_PER_TILE = 32; +auto constexpr ITEMS_PER_THREAD = 32; +auto constexpr THREADS_PER_TILE = 128; auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; -auto constexpr TILES_PER_CHUNK = 2; +auto constexpr TILES_PER_CHUNK = 1024; auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; // multibyte_split works by splitting up inputs in to 32 inputs (bytes) per thread, and transforming // them in to data structures called "superstates". these superstates are created by searching a @@ -169,11 +172,22 @@ struct scan_tile_state_callback { __device__ inline T operator()(T const& block_aggregate) { + if (threadIdx.x == 0) { + _tile_state.set_partial_prefix(_tile_idx, block_aggregate); // + } + auto predecessor_idx = _tile_idx - 1 - threadIdx.x; auto predecessor_status = scan_tile_status::invalid; - if constexpr (DO_AGGREGATE_PARTIALS) { + if constexpr (PARTIAL_AGGRIGATION_STRATEGY == 0) { + if (threadIdx.x == 0) { + _temp_storage.exclusive_prefix = _tile_state.get_inclusive_prefix(predecessor_idx); + } + } + + if constexpr (PARTIAL_AGGRIGATION_STRATEGY == 1) { // scan partials to form prefix + auto window_partial = T{}; if (threadIdx.x == 0) { @@ -185,28 +199,26 @@ struct scan_tile_state_callback { _temp_storage.exclusive_prefix = window_partial; } - } else { - // wait for prefix - if (threadIdx.x == 0) { - _temp_storage.exclusive_prefix = _tile_state.get_inclusive_prefix(predecessor_idx); + } + + if constexpr (PARTIAL_AGGRIGATION_STRATEGY == 2) { + auto window_partial = T{}; + if (threadIdx.x < 32) { + do { + auto predecessor_prefix = _tile_state.get_prefix(predecessor_idx, predecessor_status); + + window_partial = + WarpReduce(_temp_storage.reduce) // + .TailSegmentedReduce(predecessor_prefix, + predecessor_status == scan_tile_status::inclusive, + [](T const& lhs, T const& rhs) { return rhs + lhs; }) + + window_partial; + predecessor_idx -= 32; + } while (__all_sync(0xffffffff, predecessor_status != scan_tile_status::inclusive)); } - if (threadIdx.x < 1) { // setting this to 2 hangs. 1 is fine. :( - - auto predecessor_prefix = _tile_state.get_prefix(predecessor_idx, predecessor_status); - // auto fun_value = WarpReduce(_temp_storage.reduce) // - // .TailSegmentedReduce(predecessor_prefix, - // predecessor_status == - // scan_tile_status::inclusive, - // [](T const& lhs, T const& rhs) { return rhs + - // lhs; }); - - // printf("tile_idx(%2lu) bid(%2u) tid(%2u) pred_status(%2u) fun(%2u %2u)\n", - // _tile_idx, - // blockIdx.x, - // threadIdx.x, - // static_cast(predecessor_status), - // fun_value); + if (threadIdx.x == 0) { + _temp_storage.exclusive_prefix = window_partial; // } } @@ -272,10 +284,11 @@ struct PatternScan { __global__ void multibyte_split_init_kernel(cudf::size_type base_tile_idx, cudf::size_type num_tiles, scan_tile_state_view tile_superstates, - scan_tile_state_view tile_output_offsets) + scan_tile_state_view tile_output_offsets, + scan_tile_status status = scan_tile_status::invalid) { - tile_superstates.initialize(base_tile_idx, num_tiles); - tile_output_offsets.initialize(base_tile_idx, num_tiles); + tile_superstates.initialize_status(base_tile_idx, num_tiles, status); + tile_output_offsets.initialize_status(base_tile_idx, num_tiles, status); } __global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, @@ -370,9 +383,9 @@ std::unique_ptr multibyte_split(cudf::string_scalar const& input, // pattern-match and count delimiters - auto tile_superstates = scan_tile_state>(num_tiles, stream); - auto tile_offsets = scan_tile_state(num_tiles, stream); - auto num_init_blocks = ceil_div(num_tiles, THREADS_PER_TILE); + auto tile_superstates = scan_tile_state>(num_tiles + 1, stream); + auto tile_offsets = scan_tile_state(num_tiles + 1, stream); + auto num_init_blocks = ceil_div(num_tiles + 1, THREADS_PER_TILE); multibyte_split_init_kernel<<>>( // 0, @@ -447,6 +460,16 @@ cudf::size_type scan_full_stream(cudf::io::text::device_istream& input, // this function can be updated to interleave two kernel executions, such that two input buffers + multibyte_split_init_kernel<<>>( // + -TILES_PER_CHUNK, + TILES_PER_CHUNK, + tile_superstates, + tile_offsets, + scan_tile_status::oob); + + tile_superstates.set_seed_async(superstate<16>(), stream); + tile_offsets.set_seed_async(0, stream); + for (auto base_tile_idx = 0; (bytes_read = input.read(input_buffer, stream)) > 0; base_tile_idx += TILES_PER_CHUNK) { bytes_total += bytes_read; @@ -458,11 +481,6 @@ cudf::size_type scan_full_stream(cudf::io::text::device_istream& input, tile_superstates, tile_offsets); - if (base_tile_idx == 0) { - tile_superstates.set_seed_async(superstate<16>(), stream); - tile_offsets.set_seed_async(0, stream); - } - multibyte_split_kernel<<>>( // base_tile_idx, TILES_PER_CHUNK, diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 55896218480..11660f0683b 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -35,7 +35,7 @@ constexpr bool print_all{false}; struct MultibyteSplitTest : public BaseFixture { }; -TEST_F(MultibyteSplitTest, Simple1) +TEST_F(MultibyteSplitTest, SimpleStreaming) { // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 @@ -84,5 +84,53 @@ TEST_F(MultibyteSplitTest, Simple1) // auto out = cudf::io::text::multibyte_split(input, delimiters); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); - CUDF_FAIL(); + // CUDF_FAIL(); +} + +TEST_F(MultibyteSplitTest, SimplePreloaded) +{ + // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 + // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 + auto delimiters = std::vector({"😀", "😎", ",", "::"}); + auto host_input = std::string( + "aaa😀" + "bbb😀" + "ccc😀" + "ddd😀" + "eee😀" + "fff::" + "ggg😀" + "hhh😀" + "___," + "here," + "is," + "another," + "simple😀" + "text😎" + "seperated😎" + "by😎" + "emojis," + "which," + "are😎" + "multiple," + "bytes::" + "and😎" + "used😎" + "as😎" + "delimeters.😎" + "::" + "," + "😀"); + + auto expected = strings_column_wrapper{ + "aaa😀", "bbb😀", "ccc😀", "ddd😀", "eee😀", "fff::", "ggg😀", "hhh😀", + "___,", "here,", "is,", "another,", "simple😀", "text😎", "seperated😎", "by😎", + "emojis,", "which,", "are😎", "multiple,", "bytes::", "and😎", "used😎", "as😎", + "delimeters.😎", "::", ",", "😀", ""}; + + auto device_input = cudf::string_scalar(host_input); + auto out = cudf::io::text::multibyte_split(device_input, delimiters); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); + // CUDF_FAIL(); } From 970aac2f36f55f0978f0a6883056352a94b8a91a Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sun, 18 Jul 2021 14:14:24 -0500 Subject: [PATCH 30/80] interleaved streaming io for multibyte_split --- .../io/text/multibyte_split_benchmark.cpp | 10 +-- cpp/src/io/text/host_device_istream.cpp | 2 + cpp/src/io/text/multibyte_split.cu | 65 ++++++++++++++++--- 3 files changed, 63 insertions(+), 14 deletions(-) diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp index 0fc197c693c..6b90ae3e077 100644 --- a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp +++ b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp @@ -39,13 +39,13 @@ static void BM_multibyte_split(benchmark::State& state) auto delimiters = std::vector({"😀", "😎", ",", "::"}); - // auto host_input_stream = std::basic_stringstream(host_input); - // auto device_input_stream = cudf::io::text::host_device_istream(host_input_stream); + auto host_input_stream = std::basic_stringstream(host_input); + auto device_input_stream = cudf::io::text::host_device_istream(host_input_stream); for (auto _ : state) { cuda_event_timer raii(state, true); - // auto output = cudf::io::text::multibyte_split(device_input_stream, delimiters); - auto output = cudf::io::text::multibyte_split(input, delimiters); + auto output = cudf::io::text::multibyte_split(device_input_stream, delimiters); + // auto output = cudf::io::text::multibyte_split(input, delimiters); } state.SetBytesProcessed(state.iterations() * num_chars); @@ -60,7 +60,7 @@ class MultibyteSplitBenchmark : public cudf::benchmark { BM_multibyte_split(state); \ } \ BENCHMARK_REGISTER_F(MultibyteSplitBenchmark, name) \ - ->Range(1 << 15, 1 << 30) \ + ->Range(1 << 30, 1 << 30) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); diff --git a/cpp/src/io/text/host_device_istream.cpp b/cpp/src/io/text/host_device_istream.cpp index 6c5c14811b5..c5fa7ea9a8a 100644 --- a/cpp/src/io/text/host_device_istream.cpp +++ b/cpp/src/io/text/host_device_istream.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -14,6 +15,7 @@ namespace text { uint32_t host_device_istream::read(cudf::device_span destination, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE() auto read_size = destination.size(); if (_host_buffer.size() < read_size) { _host_buffer.resize(read_size); } diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index f45ec700af3..14b344ac8d7 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -140,7 +140,7 @@ auto constexpr PARTIAL_AGGRIGATION_STRATEGY = 2; auto constexpr ITEMS_PER_THREAD = 32; auto constexpr THREADS_PER_TILE = 128; auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; -auto constexpr TILES_PER_CHUNK = 1024; +auto constexpr TILES_PER_CHUNK = 256; auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; // multibyte_split works by splitting up inputs in to 32 inputs (bytes) per thread, and transforming // them in to data structures called "superstates". these superstates are created by searching a @@ -457,8 +457,24 @@ cudf::size_type scan_full_stream(cudf::io::text::device_istream& input, cudf::size_type bytes_total = 0; rmm::device_uvector input_buffer(ITEMS_PER_CHUNK, stream); + rmm::device_uvector input_buffer_next(ITEMS_PER_CHUNK, stream); + rmm::device_uvector input_buffer_next_next(ITEMS_PER_CHUNK, stream); - // this function can be updated to interleave two kernel executions, such that two input buffers + cudaEvent_t my_event; + cudaEvent_t my_event_next; + cudaEvent_t my_event_next_next; + cudaEventCreate(&my_event); + cudaEventCreate(&my_event_next); + cudaEventCreate(&my_event_next_next); + + cudaStream_t my_stream; + cudaStream_t my_stream_next; + cudaStream_t my_stream_next_next; + cudaStreamCreate(&my_stream); + cudaStreamCreate(&my_stream_next); + cudaStreamCreate(&my_stream_next_next); + + // this function interleaves three kernel executions multibyte_split_init_kernel<<>>( // -TILES_PER_CHUNK, @@ -470,18 +486,18 @@ cudf::size_type scan_full_stream(cudf::io::text::device_istream& input, tile_superstates.set_seed_async(superstate<16>(), stream); tile_offsets.set_seed_async(0, stream); - for (auto base_tile_idx = 0; (bytes_read = input.read(input_buffer, stream)) > 0; + for (auto base_tile_idx = 0; (bytes_read = input.read(input_buffer, my_stream)) > 0; base_tile_idx += TILES_PER_CHUNK) { bytes_total += bytes_read; // reset the next chunk of tile state - multibyte_split_init_kernel<<>>( // + multibyte_split_init_kernel<<>>( // base_tile_idx, TILES_PER_CHUNK, tile_superstates, tile_offsets); - multibyte_split_kernel<<>>( // + multibyte_split_kernel<<>>( // base_tile_idx, TILES_PER_CHUNK, tile_superstates, @@ -490,9 +506,38 @@ cudf::size_type scan_full_stream(cudf::io::text::device_istream& input, device_span(input_buffer).first(bytes_read), output_buffer); - stream.synchronize(); + cudaEventRecord(my_event, my_stream); + + std::swap(my_event_next_next, my_event_next); + std::swap(my_event_next, my_event); + + std::swap(my_stream_next_next, my_stream_next); + std::swap(my_stream_next, my_stream); + + std::swap(input_buffer_next_next, input_buffer_next); + std::swap(input_buffer_next, input_buffer); + + // std::swap(my_event, my_event_next); + // std::swap(my_stream, my_stream_next); + // std::swap(input_buffer, input_buffer_next); + + cudaStreamSynchronize(my_stream); + + // cudaStreamWaitEvent(my_stream, my_event, 0); } + cudaStreamWaitEvent(stream.value(), my_event, 0); + cudaStreamWaitEvent(stream.value(), my_event_next, 0); + cudaStreamWaitEvent(stream.value(), my_event_next_next, 0); + + cudaEventDestroy(my_event); + cudaEventDestroy(my_event_next); + cudaEventDestroy(my_event_next_next); + + cudaStreamDestroy(my_stream); + cudaStreamDestroy(my_stream_next); + cudaStreamDestroy(my_stream_next_next); + return bytes_total; } @@ -501,9 +546,11 @@ std::unique_ptr multibyte_split(cudf::io::text::device_istream& in rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const trie = cudf::io::text::trie::create(delimeters, stream); - auto tile_superstates = scan_tile_state>(TILES_PER_CHUNK * 2, stream); - auto tile_offsets = scan_tile_state(TILES_PER_CHUNK * 2, stream); + auto const trie = cudf::io::text::trie::create(delimeters, stream); + // probaly only need to b (n * 3 + 1), where 1 is the seed, but 4 makes the reads align better, + // maybe? + auto tile_superstates = scan_tile_state>(TILES_PER_CHUNK * 4, stream); + auto tile_offsets = scan_tile_state(TILES_PER_CHUNK * 4, stream); auto bytes_total = scan_full_stream(input, From fee7ebbf63d45e7347ecc84093b01f28c8d1ee08 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sun, 18 Jul 2021 19:14:27 -0500 Subject: [PATCH 31/80] use no-copy string column construction in multibyte_split --- .../io/text/multibyte_split_benchmark.cpp | 17 +- cpp/src/io/text/multibyte_split.cu | 211 ++++++++++++------ cpp/tests/io/text/multibyte_split_test.cpp | 2 +- 3 files changed, 153 insertions(+), 77 deletions(-) diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp index 6b90ae3e077..a9eb67b6c29 100644 --- a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp +++ b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp @@ -30,22 +30,21 @@ using cudf::test::fixed_width_column_wrapper; static void BM_multibyte_split(benchmark::State& state) { - std::string host_input = ""; - int32_t num_chars = state.range(0); - - for (auto i = 0; i < num_chars; i++) { host_input += "x"; } - - cudf::string_scalar input(host_input); - auto delimiters = std::vector({"😀", "😎", ",", "::"}); + int32_t num_chars = state.range(0); + auto host_input = std::string(num_chars, 'x'); + auto device_input = cudf::string_scalar(host_input); + auto host_input_stream = std::basic_stringstream(host_input); auto device_input_stream = cudf::io::text::host_device_istream(host_input_stream); + cudaDeviceSynchronize(); + for (auto _ : state) { cuda_event_timer raii(state, true); - auto output = cudf::io::text::multibyte_split(device_input_stream, delimiters); - // auto output = cudf::io::text::multibyte_split(input, delimiters); + // auto output = cudf::io::text::multibyte_split(device_input_stream, delimiters); + auto output = cudf::io::text::multibyte_split(device_input, delimiters); } state.SetBytesProcessed(state.iterations() * num_chars); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 14b344ac8d7..055d5b43321 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -137,11 +137,12 @@ struct scan_tile_state { auto constexpr PARTIAL_AGGRIGATION_STRATEGY = 2; // keep ITEMS_PER_TILE below input size to force multi-tile execution. -auto constexpr ITEMS_PER_THREAD = 32; -auto constexpr THREADS_PER_TILE = 128; +auto constexpr ITEMS_PER_THREAD = 32; // influences register pressure +auto constexpr THREADS_PER_TILE = 128; // must be >= 32 for warp-reduce. influences shmem usage. auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; -auto constexpr TILES_PER_CHUNK = 256; +auto constexpr TILES_PER_CHUNK = 256; // blocks in streaming launch auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; +auto constexpr TILES_PER_PASS = 512; // blocks in non-streaming launch // multibyte_split works by splitting up inputs in to 32 inputs (bytes) per thread, and transforming // them in to data structures called "superstates". these superstates are created by searching a // trie, but instead of a tradition trie where the search begins at a single node at the beginning, @@ -297,7 +298,8 @@ __global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, scan_tile_state_view tile_output_offsets, cudf::io::text::trie_device_view trie, cudf::device_span data, - cudf::device_span string_offsets) + cudf::device_span string_offsets, + cudf::device_span data_out) { typedef cub::BlockScan OffsetScan; typedef scan_tile_state_callback OffsetScanCallback; @@ -363,6 +365,12 @@ __global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, string_offsets[thread_offsets[i]] = match_end; } } + + if (data_out.size() > 0) { + for (int32_t i = 0; i < ITEMS_PER_THREAD and i < num_valid; i++) { // + data_out[data_begin + i] = thread_data[i]; + } + } } } // namespace @@ -372,6 +380,38 @@ namespace io { namespace text { namespace detail { +template +std::unique_ptr create_column(rmm::device_uvector&& values) +{ + auto size = values.size(); + auto dtype = cudf::data_type{cudf::type_to_id()}; + + CUDF_EXPECTS(dtype.id() != type_id::EMPTY, "column type_id cannot be EMPTY"); + + return std::make_unique(dtype, size, values.release(), rmm::device_buffer(), 0); +} + +std::unique_ptr create_char_column(rmm::device_uvector&& values) +{ + auto size = values.size(); + auto dtype = cudf::data_type{type_id::INT8}; + + return std::make_unique(dtype, size, values.release(), rmm::device_buffer(), 0); +} + +std::unique_ptr create_strings_column(rmm::device_uvector&& chars, + rmm::device_uvector&& offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto num_strings = offsets.size() - 1; + auto chars_column = create_char_column(std::move(chars)); + auto offsets_column = create_column(std::move(offsets)); + + return cudf::make_strings_column( + num_strings, std::move(offsets_column), std::move(chars_column), 0, {}, stream, mr); +} + std::unique_ptr multibyte_split(cudf::string_scalar const& input, std::vector const& delimeters, rmm::cuda_stream_view stream, @@ -383,32 +423,56 @@ std::unique_ptr multibyte_split(cudf::string_scalar const& input, // pattern-match and count delimiters - auto tile_superstates = scan_tile_state>(num_tiles + 1, stream); - auto tile_offsets = scan_tile_state(num_tiles + 1, stream); - auto num_init_blocks = ceil_div(num_tiles + 1, THREADS_PER_TILE); + auto tile_superstates = + scan_tile_state>(num_tiles + 1, stream); // CHECK IF THIS IS TOO BIG + auto tile_offsets = scan_tile_state(num_tiles + 1, stream); - multibyte_split_init_kernel<<>>( // - 0, - num_tiles, + multibyte_split_init_kernel<<>>( // + -TILES_PER_PASS, + TILES_PER_PASS, tile_superstates, - tile_offsets); + tile_offsets, + scan_tile_status::oob); tile_superstates.set_seed_async(superstate<16>(), stream); tile_offsets.set_seed_async(0, stream); - multibyte_split_kernel<<>>( // - 0, - num_tiles, - tile_superstates, - tile_offsets, - trie.view(), - cudf::device_span(input.data(), input.size()), - cudf::device_span(static_cast(nullptr), 0)); + for (int32_t base_tile_idx = 0; base_tile_idx < num_tiles; base_tile_idx += TILES_PER_PASS) { + auto num_tiles_this_pass = std::min(num_tiles - base_tile_idx, TILES_PER_PASS); + + auto offset = base_tile_idx * ITEMS_PER_TILE; + auto num_valid = input.size() - offset; + + // std::cout << "tip: " << num_tiles_this_pass // + // << " offset: " << offset // + // << " num_valid: " << num_valid << std::endl; + + multibyte_split_init_kernel<<>>( // + base_tile_idx, + TILES_PER_PASS, + tile_superstates, + tile_offsets); + + multibyte_split_kernel<<>>( // + base_tile_idx, + TILES_PER_PASS, + tile_superstates, + tile_offsets, + trie.view(), + cudf::device_span(input.data() + offset, num_valid), + cudf::device_span(static_cast(nullptr), 0), + cudf::device_span(static_cast(nullptr), 0)); + + stream.synchronize(); + } + + // std::cout << "done with first pass" << std::endl; // allocate string offsets auto num_results = tile_offsets.get_inclusive_prefix(num_tiles - 1, stream); - auto string_offsets = rmm::device_uvector(num_results + 2, stream); + auto string_offsets = rmm::device_uvector(num_results + 2, stream, mr); + auto string_chars = rmm::device_uvector(input.size(), stream, mr); auto const x = string_offsets.size() - 1; auto const y = input.size(); @@ -417,33 +481,52 @@ std::unique_ptr multibyte_split(cudf::string_scalar const& input, string_offsets.set_element_to_zero_async(0, stream); string_offsets.set_element_async(x, y, stream); - multibyte_split_init_kernel<<>>( // + multibyte_split_init_kernel<<>>( // 0, num_tiles, tile_superstates, - tile_offsets); + tile_offsets, + scan_tile_status::oob); tile_superstates.set_seed_async(superstate<16>(), stream); tile_offsets.set_seed_async(0, stream); - // pattern-match and materialize string offsets + for (int32_t base_tile_idx = 0; base_tile_idx < num_tiles; base_tile_idx += TILES_PER_PASS) { + auto num_tiles_this_pass = std::min(num_tiles - base_tile_idx, TILES_PER_PASS); - multibyte_split_kernel<<>>( // - 0, - num_tiles, - tile_superstates, - tile_offsets, - trie.view(), - cudf::device_span(input.data(), input.size()), - cudf::device_span(string_offsets).subspan(1, num_results)); - - return cudf::make_strings_column( // - cudf::device_span(input.data(), input.size()), - string_offsets, - {}, - 0, - stream, - mr); + auto offset = base_tile_idx * ITEMS_PER_TILE; + auto num_valid = input.size() - offset; + + // std::cout << "tip: " << num_tiles_this_pass // + // << " offset: " << offset // + // << " num_valid: " << num_valid << std::endl; + + multibyte_split_init_kernel<<>>( // + base_tile_idx, + TILES_PER_PASS, + tile_superstates, + tile_offsets); + + multibyte_split_kernel<<>>( // + base_tile_idx, + TILES_PER_PASS, + tile_superstates, + tile_offsets, + trie.view(), + cudf::device_span(input.data() + offset, num_valid), + cudf::device_span(string_offsets).subspan(1, num_results), + string_chars); + + stream.synchronize(); + } + + // std::cout << "done with second pass" << std::endl; + + auto res = create_strings_column(std::move(string_chars), std::move(string_offsets), stream, mr); + + stream.synchronize(); + + return res; } cudf::size_type scan_full_stream(cudf::io::text::device_istream& input, @@ -451,6 +534,7 @@ cudf::size_type scan_full_stream(cudf::io::text::device_istream& input, scan_tile_state>& tile_superstates, scan_tile_state& tile_offsets, device_span output_buffer, + device_span output_char_buffer, rmm::cuda_stream_view stream) { uint32_t bytes_read; @@ -503,8 +587,9 @@ cudf::size_type scan_full_stream(cudf::io::text::device_istream& input, tile_superstates, tile_offsets, trie.view(), - device_span(input_buffer).first(bytes_read), - output_buffer); + device_span(input_buffer).first(bytes_read), + output_buffer, + output_char_buffer); cudaEventRecord(my_event, my_stream); @@ -518,8 +603,13 @@ cudf::size_type scan_full_stream(cudf::io::text::device_istream& input, std::swap(input_buffer_next, input_buffer); // std::swap(my_event, my_event_next); + // std::swap(my_event_next, my_event_next_next); + // std::swap(my_stream, my_stream_next); + // std::swap(my_stream_next, my_stream_next_next); + // std::swap(input_buffer, input_buffer_next); + // std::swap(input_buffer_next, input_buffer_next_next); cudaStreamSynchronize(my_stream); @@ -552,19 +642,20 @@ std::unique_ptr multibyte_split(cudf::io::text::device_istream& in auto tile_superstates = scan_tile_state>(TILES_PER_CHUNK * 4, stream); auto tile_offsets = scan_tile_state(TILES_PER_CHUNK * 4, stream); - auto bytes_total = - scan_full_stream(input, - trie, - tile_superstates, - tile_offsets, - cudf::device_span(static_cast(nullptr), 0), - stream); + auto bytes_total = scan_full_stream(input, + trie, + tile_superstates, + tile_offsets, + cudf::device_span(static_cast(nullptr), 0), + cudf::device_span(static_cast(nullptr), 0), + stream); // allocate string offsets auto num_tiles = ceil_div(bytes_total, ITEMS_PER_TILE); auto num_results = tile_offsets.get_inclusive_prefix(num_tiles - 1, stream); - auto string_offsets = rmm::device_uvector(num_results + 2, stream); + auto string_offsets = rmm::device_uvector(num_results + 2, stream, mr); + auto string_chars = rmm::device_uvector(bytes_total, stream, mr); // first and last element are set manually to zero and size of input, respectively. // kernel is only responsible for determining delimiter offsets @@ -579,29 +670,15 @@ std::unique_ptr multibyte_split(cudf::io::text::device_istream& in trie, tile_superstates, tile_offsets, - cudf::device_span(string_offsets).subspan(1, num_results), + cudf::device_span(string_offsets).subspan(1, num_results), + string_chars, stream); - // copy chars - auto string_chars = rmm::device_uvector(bytes_total, stream); - - input.reset(); - input.read(string_chars, stream); - - // copy chars and offsets to make new strings column. - auto result = cudf::make_strings_column( // - string_chars, - string_offsets, - {}, - 0, - stream, - mr); + auto res = create_strings_column(std::move(string_chars), std::move(string_offsets), stream, mr); - // This synchronization is required to keep input_buffer in scope long enough to copy. Can be - // by using `std::unique_ptr` overload, or making a new one that accepts `device_uvector`. stream.synchronize(); - return result; + return res; } } // namespace detail diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 11660f0683b..e28e9bc03f3 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -30,7 +30,7 @@ using namespace cudf; using namespace test; -constexpr bool print_all{false}; +constexpr bool print_all{true}; struct MultibyteSplitTest : public BaseFixture { }; From e5a5204a8f9c58a37ec6b2aab9448684b1254755 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sun, 18 Jul 2021 23:44:38 -0500 Subject: [PATCH 32/80] document multibyte_split minimum tile count requirements --- .../io/text/multibyte_split_benchmark.cpp | 4 +- cpp/src/io/text/multibyte_split.cu | 39 +++++++++++-------- cpp/tests/io/text/multibyte_split_test.cpp | 6 +-- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp index a9eb67b6c29..700dd11c5a2 100644 --- a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp +++ b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp @@ -43,8 +43,8 @@ static void BM_multibyte_split(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true); - // auto output = cudf::io::text::multibyte_split(device_input_stream, delimiters); - auto output = cudf::io::text::multibyte_split(device_input, delimiters); + auto output = cudf::io::text::multibyte_split(device_input_stream, delimiters); + // auto output = cudf::io::text::multibyte_split(device_input, delimiters); } state.SetBytesProcessed(state.iterations() * num_chars); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 055d5b43321..b4502521179 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -14,7 +14,6 @@ #include #include -#include #include #include @@ -143,15 +142,6 @@ auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; auto constexpr TILES_PER_CHUNK = 256; // blocks in streaming launch auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; auto constexpr TILES_PER_PASS = 512; // blocks in non-streaming launch -// multibyte_split works by splitting up inputs in to 32 inputs (bytes) per thread, and transforming -// them in to data structures called "superstates". these superstates are created by searching a -// trie, but instead of a tradition trie where the search begins at a single node at the beginning, -// we allow our search to begin anywhere within the trie tree. The position within the trie tree is -// stored as a "partial match path", which indicates "we can get from here to there by a set of -// specific transitions". By scanning together superstates, we effectively know "we can get here -// from the beginning by following the inputs". By doing this, each thread knows exactly what state -// it begins in. From there, each thread can then take deterministic action. In this case, the -// deterministic action is counting and outputting delimiter offsets when a delimiter is found. template struct scan_tile_state_callback { @@ -282,6 +272,16 @@ struct PatternScan { } }; +// multibyte_split works by splitting up inputs in to 32 inputs (bytes) per thread, and transforming +// them in to data structures called "superstates". these superstates are created by searching a +// trie, but instead of a tradition trie where the search begins at a single node at the beginning, +// we allow our search to begin anywhere within the trie tree. The position within the trie tree is +// stored as a "partial match path", which indicates "we can get from here to there by a set of +// specific transitions". By scanning together superstates, we effectively know "we can get here +// from the beginning by following the inputs". By doing this, each thread knows exactly what state +// it begins in. From there, each thread can then take deterministic action. In this case, the +// deterministic action is counting and outputting delimiter offsets when a delimiter is found. + __global__ void multibyte_split_init_kernel(cudf::size_type base_tile_idx, cudf::size_type num_tiles, scan_tile_state_view tile_superstates, @@ -420,12 +420,15 @@ std::unique_ptr multibyte_split(cudf::string_scalar const& input, auto const trie = cudf::io::text::trie::create(delimeters, stream); auto num_tiles = ceil_div(input.size(), ITEMS_PER_TILE); + // must be at least 32 when using warp-reduce on partials + // must be at least 1 more than max possible concurrent tiles + // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s + auto num_tile_states = std::max(32, TILES_PER_PASS + 32); // pattern-match and count delimiters - auto tile_superstates = - scan_tile_state>(num_tiles + 1, stream); // CHECK IF THIS IS TOO BIG - auto tile_offsets = scan_tile_state(num_tiles + 1, stream); + auto tile_superstates = scan_tile_state>(num_tile_states, stream); + auto tile_offsets = scan_tile_state(num_tile_states, stream); multibyte_split_init_kernel<<>>( // -TILES_PER_PASS, @@ -637,10 +640,12 @@ std::unique_ptr multibyte_split(cudf::io::text::device_istream& in rmm::mr::device_memory_resource* mr) { auto const trie = cudf::io::text::trie::create(delimeters, stream); - // probaly only need to b (n * 3 + 1), where 1 is the seed, but 4 makes the reads align better, - // maybe? - auto tile_superstates = scan_tile_state>(TILES_PER_CHUNK * 4, stream); - auto tile_offsets = scan_tile_state(TILES_PER_CHUNK * 4, stream); + // must be at least 32 when using warp-reduce on partials + // must be at least 1 more than max possible concurrent tiles + // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s + auto num_tile_states = std::max(32, TILES_PER_CHUNK * 3 + 32); + auto tile_superstates = scan_tile_state>(num_tile_states, stream); + auto tile_offsets = scan_tile_state(num_tile_states, stream); auto bytes_total = scan_full_stream(input, trie, diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index e28e9bc03f3..b92b28e1b61 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -78,10 +78,7 @@ TEST_F(MultibyteSplitTest, SimpleStreaming) auto host_input_stream = std::basic_stringstream(host_input); auto device_input_stream = cudf::io::text::host_device_istream(host_input_stream); - // auto device_input = cudf::string_scalar(host_input); - - auto out = cudf::io::text::multibyte_split(device_input_stream, delimiters); - // auto out = cudf::io::text::multibyte_split(input, delimiters); + auto out = cudf::io::text::multibyte_split(device_input_stream, delimiters); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); // CUDF_FAIL(); @@ -132,5 +129,4 @@ TEST_F(MultibyteSplitTest, SimplePreloaded) auto out = cudf::io::text::multibyte_split(device_input, delimiters); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); - // CUDF_FAIL(); } From 65af4debd75f176e6ba52456baf419b0eb401cd6 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 21 Jul 2021 23:41:17 -0500 Subject: [PATCH 33/80] multibyte_split tunable concurrency via stream pool --- .../io/text/multibyte_split_benchmark.cpp | 16 +- .../cudf/io/text/host_device_istream.hpp | 5 +- cpp/src/io/text/multibyte_split.cu | 167 ++++++++++-------- cpp/tests/io/text/multibyte_split_test.cpp | 1 - 4 files changed, 116 insertions(+), 73 deletions(-) diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp index 700dd11c5a2..aacc9cf0ea1 100644 --- a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp +++ b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp @@ -24,6 +24,8 @@ #include +#include +#include #include using cudf::test::fixed_width_column_wrapper; @@ -36,8 +38,18 @@ static void BM_multibyte_split(benchmark::State& state) auto host_input = std::string(num_chars, 'x'); auto device_input = cudf::string_scalar(host_input); - auto host_input_stream = std::basic_stringstream(host_input); - auto device_input_stream = cudf::io::text::host_device_istream(host_input_stream); + auto temp_file_name = std::string("io.x"); + close(mkstemp(const_cast(temp_file_name.data()))); + { + auto temp_fostream = std::ofstream(temp_file_name, std::ofstream::out); + temp_fostream << host_input; + temp_fostream.close(); + } + auto temp_fistream = std::ifstream(temp_file_name, std::ifstream::in); + + auto host_input_stream = std::basic_stringstream(host_input); + // auto device_input_stream = cudf::io::text::host_device_istream(host_input_stream); + auto device_input_stream = cudf::io::text::host_device_istream(temp_fistream); cudaDeviceSynchronize(); diff --git a/cpp/include/cudf/io/text/host_device_istream.hpp b/cpp/include/cudf/io/text/host_device_istream.hpp index 8d043cf895f..002874d98cd 100644 --- a/cpp/include/cudf/io/text/host_device_istream.hpp +++ b/cpp/include/cudf/io/text/host_device_istream.hpp @@ -6,6 +6,8 @@ #include +#include + #include namespace cudf { @@ -22,7 +24,8 @@ class host_device_istream : public cudf::io::text::device_istream { private: std::istream& _source_stream; - thrust::host_vector _host_buffer{}; + thrust::host_vector> + _host_buffer{}; }; } // namespace text diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index b4502521179..558884fe477 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -532,35 +533,82 @@ std::unique_ptr multibyte_split(cudf::string_scalar const& input, return res; } +struct chunk { + chunk(rmm::device_buffer&& buffer, std::size_t size) : _buffer(std::move(buffer)), _size(size) {} + + operator device_span() + { + return device_span(static_cast(_buffer.data()), _size); + } + + uint32_t size() const { return _size; } + + rmm::cuda_stream_view stream() const { return _buffer.stream(); } + + private: + rmm::device_buffer _buffer; + std::size_t _size; +}; + +struct chunk_reader { + chunk_reader(cudf::io::text::device_istream& input, rmm::cuda_stream_pool& stream_pool) + : _input(input), _stream_pool(stream_pool) + { + auto buffers = std::vector(stream_pool.get_pool_size()); + for (uint32_t i = 0; i < stream_pool.get_pool_size(); i++) { + buffers[i] = rmm::device_buffer(ITEMS_PER_CHUNK, _stream_pool.get_stream(i)); + } + } + chunk get_next_chunk(uint32_t size) + { + auto stream = _stream_pool.get_stream(i++); + auto chunk_buffer = rmm::device_buffer(size, stream); + auto chunk_span = + device_span(static_cast(chunk_buffer.data()), chunk_buffer.size()); + cudaStreamSynchronize(stream); + size = _input.read(chunk_span, stream); + return chunk(std::move(chunk_buffer), size); + } + + private: + cudf::io::text::device_istream& _input; + rmm::cuda_stream_pool& _stream_pool; + uint32_t i = 0; +}; + +void fork_stream_to_pool(rmm::cuda_stream_view stream, rmm::cuda_stream_pool& stream_pool) +{ + cudaEvent_t event; + cudaEventCreate(&event); + cudaEventRecord(event, stream); + for (uint32_t i = 0; i < stream_pool.get_pool_size(); i++) { + cudaStreamWaitEvent(stream_pool.get_stream(i), event, 0); + } + cudaEventDestroy(event); +} + +void join_pool_to_stream(rmm::cuda_stream_pool& stream_pool, rmm::cuda_stream_view stream) +{ + cudaEvent_t event; + cudaEventCreate(&event); + for (uint32_t i = 0; i < stream_pool.get_pool_size(); i++) { + cudaEventRecord(event, stream_pool.get_stream(i)); + cudaStreamWaitEvent(stream, event, 0); + } + cudaEventDestroy(event); +} + cudf::size_type scan_full_stream(cudf::io::text::device_istream& input, cudf::io::text::trie const& trie, scan_tile_state>& tile_superstates, scan_tile_state& tile_offsets, device_span output_buffer, device_span output_char_buffer, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream, + rmm::cuda_stream_pool& stream_pool) { - uint32_t bytes_read; cudf::size_type bytes_total = 0; - rmm::device_uvector input_buffer(ITEMS_PER_CHUNK, stream); - rmm::device_uvector input_buffer_next(ITEMS_PER_CHUNK, stream); - rmm::device_uvector input_buffer_next_next(ITEMS_PER_CHUNK, stream); - - cudaEvent_t my_event; - cudaEvent_t my_event_next; - cudaEvent_t my_event_next_next; - cudaEventCreate(&my_event); - cudaEventCreate(&my_event_next); - cudaEventCreate(&my_event_next_next); - - cudaStream_t my_stream; - cudaStream_t my_stream_next; - cudaStream_t my_stream_next_next; - cudaStreamCreate(&my_stream); - cudaStreamCreate(&my_stream_next); - cudaStreamCreate(&my_stream_next_next); - // this function interleaves three kernel executions multibyte_split_init_kernel<<>>( // @@ -573,63 +621,35 @@ cudf::size_type scan_full_stream(cudf::io::text::device_istream& input, tile_superstates.set_seed_async(superstate<16>(), stream); tile_offsets.set_seed_async(0, stream); - for (auto base_tile_idx = 0; (bytes_read = input.read(input_buffer, my_stream)) > 0; - base_tile_idx += TILES_PER_CHUNK) { - bytes_total += bytes_read; + fork_stream_to_pool(stream, stream_pool); + + auto reader = chunk_reader(input, stream_pool); + + for (auto base_tile_idx = 0; true; base_tile_idx += TILES_PER_CHUNK) { + auto chunk = reader.get_next_chunk(ITEMS_PER_CHUNK); + + if (chunk.size() == 0) { break; } + + bytes_total += chunk.size(); // reset the next chunk of tile state - multibyte_split_init_kernel<<>>( // + multibyte_split_init_kernel<<>>( // base_tile_idx, TILES_PER_CHUNK, tile_superstates, tile_offsets); - - multibyte_split_kernel<<>>( // + multibyte_split_kernel<<>>( // base_tile_idx, TILES_PER_CHUNK, tile_superstates, tile_offsets, trie.view(), - device_span(input_buffer).first(bytes_read), + chunk, output_buffer, output_char_buffer); - - cudaEventRecord(my_event, my_stream); - - std::swap(my_event_next_next, my_event_next); - std::swap(my_event_next, my_event); - - std::swap(my_stream_next_next, my_stream_next); - std::swap(my_stream_next, my_stream); - - std::swap(input_buffer_next_next, input_buffer_next); - std::swap(input_buffer_next, input_buffer); - - // std::swap(my_event, my_event_next); - // std::swap(my_event_next, my_event_next_next); - - // std::swap(my_stream, my_stream_next); - // std::swap(my_stream_next, my_stream_next_next); - - // std::swap(input_buffer, input_buffer_next); - // std::swap(input_buffer_next, input_buffer_next_next); - - cudaStreamSynchronize(my_stream); - - // cudaStreamWaitEvent(my_stream, my_event, 0); } - cudaStreamWaitEvent(stream.value(), my_event, 0); - cudaStreamWaitEvent(stream.value(), my_event_next, 0); - cudaStreamWaitEvent(stream.value(), my_event_next_next, 0); - - cudaEventDestroy(my_event); - cudaEventDestroy(my_event_next); - cudaEventDestroy(my_event_next_next); - - cudaStreamDestroy(my_stream); - cudaStreamDestroy(my_stream_next); - cudaStreamDestroy(my_stream_next_next); + join_pool_to_stream(stream_pool, stream); return bytes_total; } @@ -643,17 +663,21 @@ std::unique_ptr multibyte_split(cudf::io::text::device_istream& in // must be at least 32 when using warp-reduce on partials // must be at least 1 more than max possible concurrent tiles // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s - auto num_tile_states = std::max(32, TILES_PER_CHUNK * 3 + 32); + auto concurrency = 3; + auto num_tile_states = std::max(32, TILES_PER_CHUNK * concurrency + 32); auto tile_superstates = scan_tile_state>(num_tile_states, stream); auto tile_offsets = scan_tile_state(num_tile_states, stream); + auto stream_pool = rmm::cuda_stream_pool(concurrency); + auto bytes_total = scan_full_stream(input, trie, tile_superstates, tile_offsets, cudf::device_span(static_cast(nullptr), 0), cudf::device_span(static_cast(nullptr), 0), - stream); + stream, + stream_pool); // allocate string offsets @@ -677,12 +701,11 @@ std::unique_ptr multibyte_split(cudf::io::text::device_istream& in tile_offsets, cudf::device_span(string_offsets).subspan(1, num_results), string_chars, - stream); + stream, + stream_pool); auto res = create_strings_column(std::move(string_chars), std::move(string_offsets), stream, mr); - stream.synchronize(); - return res; } @@ -692,14 +715,20 @@ std::unique_ptr multibyte_split(cudf::string_scalar const& input, std::vector const& delimeters, rmm::mr::device_memory_resource* mr) { - return detail::multibyte_split(input, delimeters, rmm::cuda_stream_default, mr); + auto stream = rmm::cuda_stream_default; + auto result = detail::multibyte_split(input, delimeters, stream, mr); + stream.synchronize(); + return result; } std::unique_ptr multibyte_split(cudf::io::text::device_istream& input, std::vector const& delimeters, rmm::mr::device_memory_resource* mr) { - return detail::multibyte_split(input, delimeters, rmm::cuda_stream_default, mr); + auto stream = rmm::cuda_stream_default; + auto result = detail::multibyte_split(input, delimeters, stream, mr); + stream.synchronize(); + return result; } } // namespace text diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index b92b28e1b61..1779e11060b 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -81,7 +81,6 @@ TEST_F(MultibyteSplitTest, SimpleStreaming) auto out = cudf::io::text::multibyte_split(device_input_stream, delimiters); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); - // CUDF_FAIL(); } TEST_F(MultibyteSplitTest, SimplePreloaded) From a4fe128df49d72e072bf9460d10abc36d88298f5 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 22 Jul 2021 19:11:52 -0500 Subject: [PATCH 34/80] multibyte_split remove device_istream replace with data_chunk_reader --- cpp/CMakeLists.txt | 1 - .../io/text/multibyte_split_benchmark.cpp | 11 +- .../cudf/io/text/data_chunk_source.hpp | 44 ++++ .../io/text/data_chunk_source_factories.hpp | 89 ++++++++ .../cudf/io/text/host_device_istream.hpp | 33 --- cpp/include/cudf/io/text/multibyte_split.hpp | 9 +- cpp/src/io/text/host_device_istream.cpp | 47 ----- cpp/src/io/text/multibyte_split.cu | 199 ++---------------- cpp/tests/io/text/multibyte_split_test.cpp | 23 +- cpp/tests/io/text/trie_test.cpp | 2 - 10 files changed, 165 insertions(+), 293 deletions(-) create mode 100644 cpp/include/cudf/io/text/data_chunk_source.hpp create mode 100644 cpp/include/cudf/io/text/data_chunk_source_factories.hpp delete mode 100644 cpp/include/cudf/io/text/host_device_istream.hpp delete mode 100644 cpp/src/io/text/host_device_istream.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ffa2d714c59..597cbef5a83 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -289,7 +289,6 @@ add_library(cudf src/io/parquet/writer_impl.cu src/io/statistics/orc_column_statistics.cu src/io/statistics/parquet_column_statistics.cu - src/io/text/host_device_istream.cpp src/io/text/multibyte_split.cu src/io/utilities/column_buffer.cpp src/io/utilities/data_sink.cpp diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp index aacc9cf0ea1..473e71aafea 100644 --- a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp +++ b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include @@ -45,17 +45,16 @@ static void BM_multibyte_split(benchmark::State& state) temp_fostream << host_input; temp_fostream.close(); } - auto temp_fistream = std::ifstream(temp_file_name, std::ifstream::in); - auto host_input_stream = std::basic_stringstream(host_input); - // auto device_input_stream = cudf::io::text::host_device_istream(host_input_stream); - auto device_input_stream = cudf::io::text::host_device_istream(temp_fistream); + auto source = cudf::io::text::make_source_from_file(temp_file_name); + // auto source = cudf::text::io::make_source(device_input); + // auto source = cudf::text::io::make_source(host_input); cudaDeviceSynchronize(); for (auto _ : state) { cuda_event_timer raii(state, true); - auto output = cudf::io::text::multibyte_split(device_input_stream, delimiters); + auto output = cudf::io::text::multibyte_split(*source, delimiters); // auto output = cudf::io::text::multibyte_split(device_input, delimiters); } diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp new file mode 100644 index 00000000000..b4238532b03 --- /dev/null +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -0,0 +1,44 @@ +#pragma once + +#include + +#include +#include + +namespace cudf { +namespace io { +namespace text { + +struct data_chunk { + data_chunk(rmm::device_buffer&& buffer, std::size_t size) + : _buffer(std::move(buffer)), _size(size) + { + } + + operator cudf::device_span() + { + return cudf::device_span(static_cast(_buffer.data()), _size); + } + + uint32_t size() const { return _size; } + + rmm::cuda_stream_view stream() const { return _buffer.stream(); } + + private: + rmm::device_buffer _buffer; + std::size_t _size; +}; + +class data_chunk_reader { + public: + virtual data_chunk get_next_chunk(uint32_t size, rmm::cuda_stream_view stream) = 0; +}; + +class data_chunk_source { + public: + virtual std::unique_ptr create_reader() = 0; +}; + +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp new file mode 100644 index 00000000000..4bf768fafef --- /dev/null +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -0,0 +1,89 @@ +#pragma once + +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include + +namespace cudf { +namespace io { +namespace text { + +namespace { + +class file_data_chunk_reader : public data_chunk_reader { + public: + file_data_chunk_reader(std::string const& filename) + : _filestream(std::ifstream(filename, std::ifstream::in)) + { + CUDA_TRY(cudaEventCreate(&prev_host_copy_event)); // + } + + ~file_data_chunk_reader() + { + CUDA_TRY(cudaEventDestroy(prev_host_copy_event)); // + } + + data_chunk get_next_chunk(uint32_t read_size, rmm::cuda_stream_view stream) override + { + CUDA_TRY(cudaEventSynchronize(prev_host_copy_event)); + + if (_host_buffer.size() < read_size) { _host_buffer.resize(read_size); } + + _filestream.read(_host_buffer.data(), read_size); + + read_size = _filestream.gcount(); + + auto chunk_buffer = rmm::device_buffer(read_size, stream); + + CUDA_TRY(cudaMemcpyAsync( // + chunk_buffer.data(), + _host_buffer.data(), + read_size, + cudaMemcpyHostToDevice, + stream.value())); + + CUDA_TRY(cudaEventRecord(prev_host_copy_event, stream.value())); + + return data_chunk(std::move(chunk_buffer), read_size); + } + + private: + cudaEvent_t prev_host_copy_event; + std::ifstream _filestream; + thrust::host_vector> + _host_buffer{}; +}; + +class file_data_chunk_source : public data_chunk_source { + public: + file_data_chunk_source(std::string filename) : _filename(filename) {} + std::unique_ptr create_reader() override + { + return std::make_unique(_filename); + } + + private: + std::string _filename; +}; + +} // namespace + +std::unique_ptr make_source(std::string& data); +std::unique_ptr make_source(cudf::string_scalar& data); +std::unique_ptr make_source_from_file(std::string filename) +{ + return std::make_unique(filename); +} + +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/include/cudf/io/text/host_device_istream.hpp b/cpp/include/cudf/io/text/host_device_istream.hpp deleted file mode 100644 index 002874d98cd..00000000000 --- a/cpp/include/cudf/io/text/host_device_istream.hpp +++ /dev/null @@ -1,33 +0,0 @@ -#pragma once - -#include - -#include - -#include - -#include - -#include - -namespace cudf { -namespace io { -namespace text { - -class host_device_istream : public cudf::io::text::device_istream { - public: - host_device_istream(std::istream& source_stream) : _source_stream(source_stream) {} - - uint32_t read(cudf::device_span destination, rmm::cuda_stream_view stream) override; - - void reset() override; - - private: - std::istream& _source_stream; - thrust::host_vector> - _host_buffer{}; -}; - -} // namespace text -} // namespace io -} // namespace cudf diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index e4ea512d8a8..a1f484aabce 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -1,4 +1,4 @@ -#include +#include #include @@ -13,12 +13,7 @@ namespace io { namespace text { std::unique_ptr multibyte_split( - cudf::string_scalar const& input, - std::vector const& delimeters, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr multibyte_split( - cudf::io::text::device_istream& input, + data_chunk_source& source, std::vector const& delimeters, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/text/host_device_istream.cpp b/cpp/src/io/text/host_device_istream.cpp deleted file mode 100644 index c5fa7ea9a8a..00000000000 --- a/cpp/src/io/text/host_device_istream.cpp +++ /dev/null @@ -1,47 +0,0 @@ -#include -#include -#include - -#include - -#include - -#include - -namespace cudf { -namespace io { -namespace text { - -uint32_t host_device_istream::read(cudf::device_span destination, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE() - auto read_size = destination.size(); - - if (_host_buffer.size() < read_size) { _host_buffer.resize(read_size); } - - _source_stream.read(_host_buffer.data(), read_size); - - auto read_size_actual = _source_stream.gcount(); - - CUDA_TRY(cudaMemcpyAsync( // - destination.data(), - _host_buffer.data(), - read_size_actual, - cudaMemcpyHostToDevice, - stream.value())); - - // std::cout << "tried to read: " << read_size << ", and got: " << read_size_actual << std::endl; - - return read_size_actual; -} - -void host_device_istream::reset() -{ - _source_stream.clear(); - _source_stream.seekg(0, _source_stream.beg); // -} - -} // namespace text -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 558884fe477..e2b97f9c85c 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include @@ -413,169 +413,6 @@ std::unique_ptr create_strings_column(rmm::device_uvector&& chars, num_strings, std::move(offsets_column), std::move(chars_column), 0, {}, stream, mr); } -std::unique_ptr multibyte_split(cudf::string_scalar const& input, - std::vector const& delimeters, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const trie = cudf::io::text::trie::create(delimeters, stream); - - auto num_tiles = ceil_div(input.size(), ITEMS_PER_TILE); - // must be at least 32 when using warp-reduce on partials - // must be at least 1 more than max possible concurrent tiles - // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s - auto num_tile_states = std::max(32, TILES_PER_PASS + 32); - - // pattern-match and count delimiters - - auto tile_superstates = scan_tile_state>(num_tile_states, stream); - auto tile_offsets = scan_tile_state(num_tile_states, stream); - - multibyte_split_init_kernel<<>>( // - -TILES_PER_PASS, - TILES_PER_PASS, - tile_superstates, - tile_offsets, - scan_tile_status::oob); - - tile_superstates.set_seed_async(superstate<16>(), stream); - tile_offsets.set_seed_async(0, stream); - - for (int32_t base_tile_idx = 0; base_tile_idx < num_tiles; base_tile_idx += TILES_PER_PASS) { - auto num_tiles_this_pass = std::min(num_tiles - base_tile_idx, TILES_PER_PASS); - - auto offset = base_tile_idx * ITEMS_PER_TILE; - auto num_valid = input.size() - offset; - - // std::cout << "tip: " << num_tiles_this_pass // - // << " offset: " << offset // - // << " num_valid: " << num_valid << std::endl; - - multibyte_split_init_kernel<<>>( // - base_tile_idx, - TILES_PER_PASS, - tile_superstates, - tile_offsets); - - multibyte_split_kernel<<>>( // - base_tile_idx, - TILES_PER_PASS, - tile_superstates, - tile_offsets, - trie.view(), - cudf::device_span(input.data() + offset, num_valid), - cudf::device_span(static_cast(nullptr), 0), - cudf::device_span(static_cast(nullptr), 0)); - - stream.synchronize(); - } - - // std::cout << "done with first pass" << std::endl; - - // allocate string offsets - - auto num_results = tile_offsets.get_inclusive_prefix(num_tiles - 1, stream); - auto string_offsets = rmm::device_uvector(num_results + 2, stream, mr); - auto string_chars = rmm::device_uvector(input.size(), stream, mr); - auto const x = string_offsets.size() - 1; - auto const y = input.size(); - - // first and last element are set manually to zero and size of input, respectively. - // kernel is only responsible for determining delimiter offsets - string_offsets.set_element_to_zero_async(0, stream); - string_offsets.set_element_async(x, y, stream); - - multibyte_split_init_kernel<<>>( // - 0, - num_tiles, - tile_superstates, - tile_offsets, - scan_tile_status::oob); - - tile_superstates.set_seed_async(superstate<16>(), stream); - tile_offsets.set_seed_async(0, stream); - - for (int32_t base_tile_idx = 0; base_tile_idx < num_tiles; base_tile_idx += TILES_PER_PASS) { - auto num_tiles_this_pass = std::min(num_tiles - base_tile_idx, TILES_PER_PASS); - - auto offset = base_tile_idx * ITEMS_PER_TILE; - auto num_valid = input.size() - offset; - - // std::cout << "tip: " << num_tiles_this_pass // - // << " offset: " << offset // - // << " num_valid: " << num_valid << std::endl; - - multibyte_split_init_kernel<<>>( // - base_tile_idx, - TILES_PER_PASS, - tile_superstates, - tile_offsets); - - multibyte_split_kernel<<>>( // - base_tile_idx, - TILES_PER_PASS, - tile_superstates, - tile_offsets, - trie.view(), - cudf::device_span(input.data() + offset, num_valid), - cudf::device_span(string_offsets).subspan(1, num_results), - string_chars); - - stream.synchronize(); - } - - // std::cout << "done with second pass" << std::endl; - - auto res = create_strings_column(std::move(string_chars), std::move(string_offsets), stream, mr); - - stream.synchronize(); - - return res; -} - -struct chunk { - chunk(rmm::device_buffer&& buffer, std::size_t size) : _buffer(std::move(buffer)), _size(size) {} - - operator device_span() - { - return device_span(static_cast(_buffer.data()), _size); - } - - uint32_t size() const { return _size; } - - rmm::cuda_stream_view stream() const { return _buffer.stream(); } - - private: - rmm::device_buffer _buffer; - std::size_t _size; -}; - -struct chunk_reader { - chunk_reader(cudf::io::text::device_istream& input, rmm::cuda_stream_pool& stream_pool) - : _input(input), _stream_pool(stream_pool) - { - auto buffers = std::vector(stream_pool.get_pool_size()); - for (uint32_t i = 0; i < stream_pool.get_pool_size(); i++) { - buffers[i] = rmm::device_buffer(ITEMS_PER_CHUNK, _stream_pool.get_stream(i)); - } - } - chunk get_next_chunk(uint32_t size) - { - auto stream = _stream_pool.get_stream(i++); - auto chunk_buffer = rmm::device_buffer(size, stream); - auto chunk_span = - device_span(static_cast(chunk_buffer.data()), chunk_buffer.size()); - cudaStreamSynchronize(stream); - size = _input.read(chunk_span, stream); - return chunk(std::move(chunk_buffer), size); - } - - private: - cudf::io::text::device_istream& _input; - rmm::cuda_stream_pool& _stream_pool; - uint32_t i = 0; -}; - void fork_stream_to_pool(rmm::cuda_stream_view stream, rmm::cuda_stream_pool& stream_pool) { cudaEvent_t event; @@ -598,7 +435,7 @@ void join_pool_to_stream(rmm::cuda_stream_pool& stream_pool, rmm::cuda_stream_vi cudaEventDestroy(event); } -cudf::size_type scan_full_stream(cudf::io::text::device_istream& input, +cudf::size_type scan_full_stream(cudf::io::text::data_chunk_source& source, cudf::io::text::trie const& trie, scan_tile_state>& tile_superstates, scan_tile_state& tile_offsets, @@ -623,22 +460,23 @@ cudf::size_type scan_full_stream(cudf::io::text::device_istream& input, fork_stream_to_pool(stream, stream_pool); - auto reader = chunk_reader(input, stream_pool); + auto reader = source.create_reader(); for (auto base_tile_idx = 0; true; base_tile_idx += TILES_PER_CHUNK) { - auto chunk = reader.get_next_chunk(ITEMS_PER_CHUNK); + auto chunk_stream = stream_pool.get_stream(); + auto chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, chunk_stream); if (chunk.size() == 0) { break; } bytes_total += chunk.size(); // reset the next chunk of tile state - multibyte_split_init_kernel<<>>( // + multibyte_split_init_kernel<<>>( // base_tile_idx, TILES_PER_CHUNK, tile_superstates, tile_offsets); - multibyte_split_kernel<<>>( // + multibyte_split_kernel<<>>( // base_tile_idx, TILES_PER_CHUNK, tile_superstates, @@ -654,7 +492,7 @@ cudf::size_type scan_full_stream(cudf::io::text::device_istream& input, return bytes_total; } -std::unique_ptr multibyte_split(cudf::io::text::device_istream& input, +std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source& source, std::vector const& delimeters, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -670,7 +508,7 @@ std::unique_ptr multibyte_split(cudf::io::text::device_istream& in auto stream_pool = rmm::cuda_stream_pool(concurrency); - auto bytes_total = scan_full_stream(input, + auto bytes_total = scan_full_stream(source, trie, tile_superstates, tile_offsets, @@ -692,10 +530,7 @@ std::unique_ptr multibyte_split(cudf::io::text::device_istream& in string_offsets.set_element_to_zero_async(0, stream); string_offsets.set_element_async(x, bytes_total, stream); - // pattern-match and materialize string offsets - input.reset(); - - scan_full_stream(input, + scan_full_stream(source, trie, tile_superstates, tile_offsets, @@ -711,22 +546,12 @@ std::unique_ptr multibyte_split(cudf::io::text::device_istream& in } // namespace detail -std::unique_ptr multibyte_split(cudf::string_scalar const& input, - std::vector const& delimeters, - rmm::mr::device_memory_resource* mr) -{ - auto stream = rmm::cuda_stream_default; - auto result = detail::multibyte_split(input, delimeters, stream, mr); - stream.synchronize(); - return result; -} - -std::unique_ptr multibyte_split(cudf::io::text::device_istream& input, +std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source& source, std::vector const& delimeters, rmm::mr::device_memory_resource* mr) { auto stream = rmm::cuda_stream_default; - auto result = detail::multibyte_split(input, delimeters, stream, mr); + auto result = detail::multibyte_split(source, delimeters, stream, mr); stream.synchronize(); return result; } diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 1779e11060b..53d200d8ccf 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include @@ -76,19 +76,20 @@ TEST_F(MultibyteSplitTest, SimpleStreaming) "emojis,", "which,", "are😎", "multiple,", "bytes::", "and😎", "used😎", "as😎", "delimeters.😎", "::", ",", "😀", ""}; - auto host_input_stream = std::basic_stringstream(host_input); - auto device_input_stream = cudf::io::text::host_device_istream(host_input_stream); - auto out = cudf::io::text::multibyte_split(device_input_stream, delimiters); + CUDF_FAIL(); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); + // auto source = cudf::io::text::make_source(host_input); + // auto out = cudf::io::text::multibyte_split(*source, delimiters); + + // CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); } TEST_F(MultibyteSplitTest, SimplePreloaded) { // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 - auto delimiters = std::vector({"😀", "😎", ",", "::"}); - auto host_input = std::string( + auto delimiters = std::vector({"😀", "😎", ",", "::"}); + auto device_input = cudf::string_scalar( "aaa😀" "bbb😀" "ccc😀" @@ -124,8 +125,10 @@ TEST_F(MultibyteSplitTest, SimplePreloaded) "emojis,", "which,", "are😎", "multiple,", "bytes::", "and😎", "used😎", "as😎", "delimeters.😎", "::", ",", "😀", ""}; - auto device_input = cudf::string_scalar(host_input); - auto out = cudf::io::text::multibyte_split(device_input, delimiters); + CUDF_FAIL(); + + // auto source = cudf::io::text::make_source(device_input); + // auto out = cudf::io::text::multibyte_split(*source, delimiters); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); + // CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); } diff --git a/cpp/tests/io/text/trie_test.cpp b/cpp/tests/io/text/trie_test.cpp index 2beb8497e4b..49217fecf1c 100644 --- a/cpp/tests/io/text/trie_test.cpp +++ b/cpp/tests/io/text/trie_test.cpp @@ -19,8 +19,6 @@ #include #include -#include - #include #include From 9bc6c89104ffba64866e0670bb9fe107057aa7a7 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 23 Jul 2021 10:21:26 -0500 Subject: [PATCH 35/80] add data_chunk_source factories, nvtx ranges to multibyte_split, use temp dir for benchmark files --- cpp/benchmarks/io/cuio_benchmark_common.hpp | 2 + .../io/text/multibyte_split_benchmark.cpp | 19 ++-- .../cudf/io/text/data_chunk_source.hpp | 17 +--- .../io/text/data_chunk_source_factories.hpp | 98 ++++++++++++++++--- cpp/src/io/text/multibyte_split.cu | 60 ++++++------ cpp/tests/io/text/multibyte_split_test.cpp | 57 +---------- 6 files changed, 137 insertions(+), 116 deletions(-) diff --git a/cpp/benchmarks/io/cuio_benchmark_common.hpp b/cpp/benchmarks/io/cuio_benchmark_common.hpp index 2c49386a901..7107585dbcc 100644 --- a/cpp/benchmarks/io/cuio_benchmark_common.hpp +++ b/cpp/benchmarks/io/cuio_benchmark_common.hpp @@ -33,6 +33,8 @@ using cudf::io::io_type; benchmark(name##_buffer_output, type_or_group, static_cast(io_type::HOST_BUFFER)); \ benchmark(name##_void_output, type_or_group, static_cast(io_type::VOID)); +std::string random_file_in_dir(std::string const& dir_path); + /** * @brief Class to create a coupled `source_info` and `sink_info` of given type. */ diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp index 473e71aafea..a3255d2cb5a 100644 --- a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp +++ b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp @@ -15,12 +15,15 @@ */ #include +#include #include +#include +#include + #include #include #include -#include #include @@ -30,6 +33,8 @@ using cudf::test::fixed_width_column_wrapper; +temp_directory const temp_dir("cudf_gbench"); + static void BM_multibyte_split(benchmark::State& state) { auto delimiters = std::vector({"😀", "😎", ",", "::"}); @@ -38,7 +43,8 @@ static void BM_multibyte_split(benchmark::State& state) auto host_input = std::string(num_chars, 'x'); auto device_input = cudf::string_scalar(host_input); - auto temp_file_name = std::string("io.x"); + auto temp_file_name = random_file_in_dir(temp_dir.path()); + close(mkstemp(const_cast(temp_file_name.data()))); { auto temp_fostream = std::ofstream(temp_file_name, std::ofstream::out); @@ -46,16 +52,15 @@ static void BM_multibyte_split(benchmark::State& state) temp_fostream.close(); } - auto source = cudf::io::text::make_source_from_file(temp_file_name); - // auto source = cudf::text::io::make_source(device_input); - // auto source = cudf::text::io::make_source(host_input); - cudaDeviceSynchronize(); + auto source = cudf::io::text::make_source_from_file(temp_file_name); + // auto source = cudf::io::text::make_source(device_input); + // auto source = cudf::io::text::make_source(host_input); + for (auto _ : state) { cuda_event_timer raii(state, true); auto output = cudf::io::text::multibyte_split(*source, delimiters); - // auto output = cudf::io::text::multibyte_split(device_input, delimiters); } state.SetBytesProcessed(state.iterations() * num_chars); diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp index b4238532b03..48671664aea 100644 --- a/cpp/include/cudf/io/text/data_chunk_source.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -10,23 +10,14 @@ namespace io { namespace text { struct data_chunk { - data_chunk(rmm::device_buffer&& buffer, std::size_t size) - : _buffer(std::move(buffer)), _size(size) - { - } + data_chunk(device_span data) : _data(data) {} - operator cudf::device_span() - { - return cudf::device_span(static_cast(_buffer.data()), _size); - } + operator cudf::device_span() { return _data; } - uint32_t size() const { return _size; } - - rmm::cuda_stream_view stream() const { return _buffer.stream(); } + uint32_t size() const { return _data.size(); } private: - rmm::device_buffer _buffer; - std::size_t _size; + device_span _data; }; class data_chunk_reader { diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index 4bf768fafef..042abdd9df9 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -11,7 +12,9 @@ #include #include +#include #include +#include namespace cudf { namespace io { @@ -19,33 +22,45 @@ namespace text { namespace { -class file_data_chunk_reader : public data_chunk_reader { +class istream_data_chunk_reader : public data_chunk_reader { public: - file_data_chunk_reader(std::string const& filename) - : _filestream(std::ifstream(filename, std::ifstream::in)) + istream_data_chunk_reader(std::unique_ptr datastream) + : _datastream(std::move(datastream)), _buffers() { CUDA_TRY(cudaEventCreate(&prev_host_copy_event)); // } - ~file_data_chunk_reader() + ~istream_data_chunk_reader() { CUDA_TRY(cudaEventDestroy(prev_host_copy_event)); // } + device_span find_or_create_data(uint32_t size, rmm::cuda_stream_view stream) + { + auto search = _buffers.find(stream.value()); + + if (search == _buffers.end() || search->second.size() < size) { + _buffers[stream.value()] = rmm::device_buffer(size, stream); + } + + return device_span(static_cast(_buffers[stream.value()].data()), size); + } + data_chunk get_next_chunk(uint32_t read_size, rmm::cuda_stream_view stream) override { + CUDF_FUNC_RANGE(); CUDA_TRY(cudaEventSynchronize(prev_host_copy_event)); if (_host_buffer.size() < read_size) { _host_buffer.resize(read_size); } - _filestream.read(_host_buffer.data(), read_size); + _datastream->read(_host_buffer.data(), read_size); - read_size = _filestream.gcount(); + read_size = _datastream->gcount(); - auto chunk_buffer = rmm::device_buffer(read_size, stream); + auto chunk_span = find_or_create_data(read_size, stream); CUDA_TRY(cudaMemcpyAsync( // - chunk_buffer.data(), + chunk_span.data(), _host_buffer.data(), read_size, cudaMemcpyHostToDevice, @@ -53,37 +68,92 @@ class file_data_chunk_reader : public data_chunk_reader { CUDA_TRY(cudaEventRecord(prev_host_copy_event, stream.value())); - return data_chunk(std::move(chunk_buffer), read_size); + return data_chunk(chunk_span); } private: + std::unique_ptr _datastream; + std::unordered_map _buffers; cudaEvent_t prev_host_copy_event; - std::ifstream _filestream; thrust::host_vector> _host_buffer{}; }; +class device_span_data_chunk_reader : public data_chunk_reader { + public: + device_span_data_chunk_reader(device_span data) : _data(data) {} + + data_chunk get_next_chunk(uint32_t read_size, rmm::cuda_stream_view stream) override + { + if (read_size > _data.size() - _position) { read_size = _data.size() - _position; } + + auto chunk_span = _data.subspan(_position, read_size); + + _position += read_size; + + return data_chunk(chunk_span); + } + + private: + device_span _data; + uint64_t _position = 0; +}; + class file_data_chunk_source : public data_chunk_source { public: file_data_chunk_source(std::string filename) : _filename(filename) {} std::unique_ptr create_reader() override { - return std::make_unique(_filename); + return std::make_unique( + std::make_unique(_filename, std::ifstream::in)); } private: std::string _filename; }; +class string_data_chunk_source : public data_chunk_source { + public: + string_data_chunk_source(std::string const& data) : _data(data) {} + std::unique_ptr create_reader() override + { + return std::make_unique(std::make_unique(_data)); + } + + private: + std::string const& _data; +}; + +class device_span_data_chunk_source : public data_chunk_source { + public: + device_span_data_chunk_source(device_span data) : _data(data) {} + std::unique_ptr create_reader() override + { + return std::make_unique(_data); + } + + private: + device_span _data; +}; + } // namespace -std::unique_ptr make_source(std::string& data); -std::unique_ptr make_source(cudf::string_scalar& data); -std::unique_ptr make_source_from_file(std::string filename) +std::unique_ptr make_source(std::string const& data) +{ + return std::make_unique(data); +} + +std::unique_ptr make_source_from_file(std::string const& filename) { return std::make_unique(filename); } +std::unique_ptr make_source(cudf::string_scalar& data) +{ + auto data_span = device_span(data.data(), data.size()); + return std::make_unique(data_span); +} + } // namespace text } // namespace io } // namespace cudf diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index e2b97f9c85c..ae2419ca67c 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -134,15 +135,14 @@ struct scan_tile_state { } }; -auto constexpr PARTIAL_AGGRIGATION_STRATEGY = 2; +auto constexpr PARTIAL_AGGRIGATION_STRATEGY = 1; // keep ITEMS_PER_TILE below input size to force multi-tile execution. auto constexpr ITEMS_PER_THREAD = 32; // influences register pressure auto constexpr THREADS_PER_TILE = 128; // must be >= 32 for warp-reduce. influences shmem usage. auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; -auto constexpr TILES_PER_CHUNK = 256; // blocks in streaming launch +auto constexpr TILES_PER_CHUNK = 512; // blocks in streaming launch auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; -auto constexpr TILES_PER_PASS = 512; // blocks in non-streaming launch template struct scan_tile_state_callback { @@ -435,15 +435,16 @@ void join_pool_to_stream(rmm::cuda_stream_pool& stream_pool, rmm::cuda_stream_vi cudaEventDestroy(event); } -cudf::size_type scan_full_stream(cudf::io::text::data_chunk_source& source, - cudf::io::text::trie const& trie, - scan_tile_state>& tile_superstates, - scan_tile_state& tile_offsets, - device_span output_buffer, - device_span output_char_buffer, - rmm::cuda_stream_view stream, - rmm::cuda_stream_pool& stream_pool) +cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_source& source, + cudf::io::text::trie const& trie, + scan_tile_state>& tile_superstates, + scan_tile_state& tile_offsets, + device_span output_buffer, + device_span output_char_buffer, + rmm::cuda_stream_view stream, + rmm::cuda_stream_pool& stream_pool) { + CUDF_FUNC_RANGE(); cudf::size_type bytes_total = 0; // this function interleaves three kernel executions @@ -497,25 +498,27 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source& rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); auto const trie = cudf::io::text::trie::create(delimeters, stream); // must be at least 32 when using warp-reduce on partials // must be at least 1 more than max possible concurrent tiles // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s - auto concurrency = 3; + auto concurrency = 2; auto num_tile_states = std::max(32, TILES_PER_CHUNK * concurrency + 32); auto tile_superstates = scan_tile_state>(num_tile_states, stream); auto tile_offsets = scan_tile_state(num_tile_states, stream); auto stream_pool = rmm::cuda_stream_pool(concurrency); - auto bytes_total = scan_full_stream(source, - trie, - tile_superstates, - tile_offsets, - cudf::device_span(static_cast(nullptr), 0), - cudf::device_span(static_cast(nullptr), 0), - stream, - stream_pool); + auto bytes_total = + multibyte_split_scan_full_source(source, + trie, + tile_superstates, + tile_offsets, + cudf::device_span(static_cast(nullptr), 0), + cudf::device_span(static_cast(nullptr), 0), + stream, + stream_pool); // allocate string offsets @@ -530,14 +533,15 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source& string_offsets.set_element_to_zero_async(0, stream); string_offsets.set_element_async(x, bytes_total, stream); - scan_full_stream(source, - trie, - tile_superstates, - tile_offsets, - cudf::device_span(string_offsets).subspan(1, num_results), - string_chars, - stream, - stream_pool); + multibyte_split_scan_full_source( + source, + trie, + tile_superstates, + tile_offsets, + cudf::device_span(string_offsets).subspan(1, num_results), + string_chars, + stream, + stream_pool); auto res = create_strings_column(std::move(string_chars), std::move(string_offsets), stream, mr); diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 53d200d8ccf..957a9b70ec6 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -76,59 +76,8 @@ TEST_F(MultibyteSplitTest, SimpleStreaming) "emojis,", "which,", "are😎", "multiple,", "bytes::", "and😎", "used😎", "as😎", "delimeters.😎", "::", ",", "😀", ""}; - CUDF_FAIL(); + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiters); - // auto source = cudf::io::text::make_source(host_input); - // auto out = cudf::io::text::multibyte_split(*source, delimiters); - - // CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); -} - -TEST_F(MultibyteSplitTest, SimplePreloaded) -{ - // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 - // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 - auto delimiters = std::vector({"😀", "😎", ",", "::"}); - auto device_input = cudf::string_scalar( - "aaa😀" - "bbb😀" - "ccc😀" - "ddd😀" - "eee😀" - "fff::" - "ggg😀" - "hhh😀" - "___," - "here," - "is," - "another," - "simple😀" - "text😎" - "seperated😎" - "by😎" - "emojis," - "which," - "are😎" - "multiple," - "bytes::" - "and😎" - "used😎" - "as😎" - "delimeters.😎" - "::" - "," - "😀"); - - auto expected = strings_column_wrapper{ - "aaa😀", "bbb😀", "ccc😀", "ddd😀", "eee😀", "fff::", "ggg😀", "hhh😀", - "___,", "here,", "is,", "another,", "simple😀", "text😎", "seperated😎", "by😎", - "emojis,", "which,", "are😎", "multiple,", "bytes::", "and😎", "used😎", "as😎", - "delimeters.😎", "::", ",", "😀", ""}; - - CUDF_FAIL(); - - // auto source = cudf::io::text::make_source(device_input); - // auto out = cudf::io::text::multibyte_split(*source, delimiters); - - // CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); } From 08b3069731ec6591b2c39ce7815ffe0bf2b2d359 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 23 Jul 2021 10:26:42 -0500 Subject: [PATCH 36/80] use make_device_uvector_async in trie.hpp --- cpp/include/cudf/io/text/trie.hpp | 49 ++++++------------------------- 1 file changed, 9 insertions(+), 40 deletions(-) diff --git a/cpp/include/cudf/io/text/trie.hpp b/cpp/include/cudf/io/text/trie.hpp index fa9c62ad56e..ca936f7ae6a 100644 --- a/cpp/include/cudf/io/text/trie.hpp +++ b/cpp/include/cudf/io/text/trie.hpp @@ -1,3 +1,5 @@ +#include + #include #include @@ -109,7 +111,9 @@ struct trie { // create the trie tree auto root = std::make_unique(); - for (auto& pattern : patterns) { root->insert(pattern); } + for (auto& pattern : patterns) { + root->insert(pattern); + } // flatten auto sum = 1; @@ -138,45 +142,10 @@ struct trie { match_length.emplace_back(false); - // allocate device memory - - auto device_layer_offsets = rmm::device_uvector(layer_offsets.size(), stream, mr); - auto device_tokens = rmm::device_uvector(tokens.size(), stream, mr); - auto device_transitions = rmm::device_uvector(transitions.size(), stream, mr); - auto device_match_length = rmm::device_uvector(match_length.size(), stream, mr); - - // copy host buffers to device - - CUDA_TRY(cudaMemcpyAsync(device_layer_offsets.data(), - layer_offsets.data(), - layer_offsets.size() * sizeof(uint16_t), - cudaMemcpyDefault, - stream.value())); - - CUDA_TRY(cudaMemcpyAsync(device_tokens.data(), - tokens.data(), - tokens.size() * sizeof(char), - cudaMemcpyDefault, - stream.value())); - - CUDA_TRY(cudaMemcpyAsync(device_transitions.data(), - transitions.data(), - transitions.size() * sizeof(uint16_t), - cudaMemcpyDefault, - stream.value())); - - CUDA_TRY(cudaMemcpyAsync(device_match_length.data(), - match_length.data(), - match_length.size() * sizeof(uint8_t), - cudaMemcpyDefault, - stream.value())); - - // create owning container - - return trie{std::move(device_layer_offsets), - std::move(device_tokens), - std::move(device_transitions), - std::move(device_match_length)}; + return trie{detail::make_device_uvector_async(layer_offsets, stream, mr), + detail::make_device_uvector_async(tokens, stream, mr), + detail::make_device_uvector_async(transitions, stream, mr), + detail::make_device_uvector_async(match_length, stream, mr)}; } trie_device_view view() const From 70887918bc13107ba661ec0634d11a6d3c13d59d Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 23 Jul 2021 10:39:04 -0500 Subject: [PATCH 37/80] rm device_istream --- .../io/text/data_chunk_source_factories.hpp | 1 - cpp/include/cudf/io/text/device_istream.hpp | 19 ------------------- 2 files changed, 20 deletions(-) delete mode 100644 cpp/include/cudf/io/text/device_istream.hpp diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index 042abdd9df9..b292b256401 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -2,7 +2,6 @@ #include #include -#include #include #include diff --git a/cpp/include/cudf/io/text/device_istream.hpp b/cpp/include/cudf/io/text/device_istream.hpp deleted file mode 100644 index 276b2b09c2d..00000000000 --- a/cpp/include/cudf/io/text/device_istream.hpp +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once - -#include - -#include - -namespace cudf { -namespace io { -namespace text { - -class device_istream { - public: - virtual uint32_t read(cudf::device_span destination, rmm::cuda_stream_view stream) = 0; - virtual void reset() = 0; -}; - -} // namespace text -} // namespace io -} // namespace cudf From b61c14f74059f3722c21c643c43e9f691730a74f Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 23 Jul 2021 15:14:08 -0500 Subject: [PATCH 38/80] multibyte_split add some docs, add more test cases --- .../cudf/io/text/data_chunk_source.hpp | 12 ++++ .../io/text/data_chunk_source_factories.hpp | 43 +++++++++++++++ cpp/tests/io/text/multibyte_split_test.cpp | 55 ++++++++++++++++++- 3 files changed, 109 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp index 48671664aea..f9e4ade57b7 100644 --- a/cpp/include/cudf/io/text/data_chunk_source.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -9,6 +9,9 @@ namespace cudf { namespace io { namespace text { +/** + * @brief represents a possibly-shared view over device memory. + */ struct data_chunk { data_chunk(device_span data) : _data(data) {} @@ -20,11 +23,20 @@ struct data_chunk { device_span _data; }; +/** + * @brief a reader capable of producing views over device memory + * + */ class data_chunk_reader { public: virtual data_chunk get_next_chunk(uint32_t size, rmm::cuda_stream_view stream) = 0; }; +/** + * @brief a data source capable of creating a reader which can produce views of the data source in + * device memory. + * + */ class data_chunk_source { public: virtual std::unique_ptr create_reader() = 0; diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index b292b256401..bab0c4c088e 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -21,11 +21,17 @@ namespace text { namespace { +/** + * @brief a reader which produces views of device memory which contain a copy of the data from an + * istream. + * + */ class istream_data_chunk_reader : public data_chunk_reader { public: istream_data_chunk_reader(std::unique_ptr datastream) : _datastream(std::move(datastream)), _buffers() { + // create an event to track the completion of the last device-to-host copy. CUDA_TRY(cudaEventCreate(&prev_host_copy_event)); // } @@ -48,16 +54,23 @@ class istream_data_chunk_reader : public data_chunk_reader { data_chunk get_next_chunk(uint32_t read_size, rmm::cuda_stream_view stream) override { CUDF_FUNC_RANGE(); + + // synchronize on the last host-to-device copy, so we don't clobber the host buffer. CUDA_TRY(cudaEventSynchronize(prev_host_copy_event)); + // resize the host buffer as necessary to contain the requested number of bytes if (_host_buffer.size() < read_size) { _host_buffer.resize(read_size); } + // read data from the host istream in to the pinned host memory buffer _datastream->read(_host_buffer.data(), read_size); + // adjust the read size to reflect how many bytes were actually read from the data stream read_size = _datastream->gcount(); + // get a view over some device memory we can use to buffer the read data on to device. auto chunk_span = find_or_create_data(read_size, stream); + // copy the host-pinned data on to device CUDA_TRY(cudaMemcpyAsync( // chunk_span.data(), _host_buffer.data(), @@ -65,8 +78,10 @@ class istream_data_chunk_reader : public data_chunk_reader { cudaMemcpyHostToDevice, stream.value())); + // record the host-to-device copy. CUDA_TRY(cudaEventRecord(prev_host_copy_event, stream.value())); + // return the view over device memory so it can be processed. return data_chunk(chunk_span); } @@ -78,18 +93,27 @@ class istream_data_chunk_reader : public data_chunk_reader { _host_buffer{}; }; +/** + * @brief a reader which produces view of device memory which represent a subset of the input device + * span + * + */ class device_span_data_chunk_reader : public data_chunk_reader { public: device_span_data_chunk_reader(device_span data) : _data(data) {} data_chunk get_next_chunk(uint32_t read_size, rmm::cuda_stream_view stream) override { + // limit the read size to the number of bytes remaining in the device_span. if (read_size > _data.size() - _position) { read_size = _data.size() - _position; } + // create a view over the device span auto chunk_span = _data.subspan(_position, read_size); + // increment position _position += read_size; + // return the view over device memory so it can be processed. return data_chunk(chunk_span); } @@ -98,6 +122,10 @@ class device_span_data_chunk_reader : public data_chunk_reader { uint64_t _position = 0; }; +/** + * @brief a file data source which creates an istream_data_chunk_reader + * + */ class file_data_chunk_source : public data_chunk_source { public: file_data_chunk_source(std::string filename) : _filename(filename) {} @@ -111,6 +139,9 @@ class file_data_chunk_source : public data_chunk_source { std::string _filename; }; +/** + * @brief a host string data source which creates an istream_data_chunk_reader + */ class string_data_chunk_source : public data_chunk_source { public: string_data_chunk_source(std::string const& data) : _data(data) {} @@ -123,6 +154,9 @@ class string_data_chunk_source : public data_chunk_source { std::string const& _data; }; +/** + * @brief a device span data source which creates an istream_data_chunk_reader + */ class device_span_data_chunk_source : public data_chunk_source { public: device_span_data_chunk_source(device_span data) : _data(data) {} @@ -137,16 +171,25 @@ class device_span_data_chunk_source : public data_chunk_source { } // namespace +/** + * @brief Creates a data source capable of producing device-buffered views of the given string. + */ std::unique_ptr make_source(std::string const& data) { return std::make_unique(data); } +/** + * @brief Creates a data source capable of producing device-buffered views of the file + */ std::unique_ptr make_source_from_file(std::string const& filename) { return std::make_unique(filename); } +/** + * @brief Creates a data source capable of producing views of the given device string scalar + */ std::unique_ptr make_source(cudf::string_scalar& data) { auto data_span = device_span(data.data(), data.size()); diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 957a9b70ec6..dd393207c83 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -35,7 +35,60 @@ constexpr bool print_all{true}; struct MultibyteSplitTest : public BaseFixture { }; -TEST_F(MultibyteSplitTest, SimpleStreaming) +TEST_F(MultibyteSplitTest, NondeterministicMatching) +{ + // bug: test fails because PatternScan does not account for NFAs (repeated 'a' char) + auto delimiters = std::vector({"abac"}); + auto host_input = std::string("ababacabacab"); + + auto expected = strings_column_wrapper{"ababac", "abac", "ab"}; + + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiters); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); +} + +TEST_F(MultibyteSplitTest, DelimiterAtEnd) +{ + auto delimiters = std::vector({":"}); + auto host_input = std::string("abcdefg:"); + + auto expected = strings_column_wrapper{"abcdefg:", ""}; + + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiters); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); +} + +TEST_F(MultibyteSplitTest, LargeInput) +{ + // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 + // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 + auto delimiters = std::vector({"😀", "😎", ",", "::"}); + + // TODO: figure out why CUDF_TEST_EXPECT_COLUMNS_EQUAL fails when the input is larger + // like when changing std::string(100, ...) -> std::string(1000, ...) + auto host_input = std::string(std::string(100, 'w') + "😀" + // + std::string(100, 'x') + "😀" + // + std::string(100, 'y') + "😀" + // + std::string(100, 'z') + "😀" + // + std::string(100, '_')); + + auto expected = strings_column_wrapper{std::string(100, 'w') + "😀", + std::string(100, 'x') + "😀", + std::string(100, 'y') + "😀", + std::string(100, 'z') + "😀", + std::string(100, '_')}; + + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiters); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); +} + +TEST_F(MultibyteSplitTest, MultipleDelimiters) { // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 From 017f05db82ee99222dc156d80293d3d0b55fe908 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 23 Jul 2021 15:16:45 -0500 Subject: [PATCH 39/80] revert CMakeLists ordering --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 597cbef5a83..1e1062e53e2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -253,8 +253,8 @@ add_library(cudf src/interop/dlpack.cpp src/interop/from_arrow.cu src/interop/to_arrow.cu - src/io/avro/avro_gpu.cu src/io/avro/avro.cpp + src/io/avro/avro_gpu.cu src/io/avro/reader_impl.cu src/io/comp/brotli_dict.cpp src/io/comp/cpu_unbz2.cpp From f432e687d9b5278ed1283b27349bbe213cca7896 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sun, 25 Jul 2021 12:27:17 -0500 Subject: [PATCH 40/80] convert trie storage from SOA to AOS --- cpp/include/cudf/io/text/trie.hpp | 84 ++++++++++++++----------------- 1 file changed, 38 insertions(+), 46 deletions(-) diff --git a/cpp/include/cudf/io/text/trie.hpp b/cpp/include/cudf/io/text/trie.hpp index ca936f7ae6a..2c087b746e3 100644 --- a/cpp/include/cudf/io/text/trie.hpp +++ b/cpp/include/cudf/io/text/trie.hpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -34,18 +35,21 @@ namespace cudf { namespace io { namespace text { +struct trie_node { + char token; + uint8_t match_length; + uint8_t transitions_begin; +}; + struct trie_device_view { - uint16_t const* layer_offsets; - char const* tokens; - uint16_t const* transitions; - uint8_t const* match_length; + device_span _nodes; inline constexpr uint16_t transition(uint16_t idx, char c) { - auto pos = transitions[idx]; - auto end = transitions[idx + 1]; + auto pos = _nodes[idx].transitions_begin; + auto end = _nodes[idx + 1].transitions_begin; while (pos < end) { - if (c == tokens[pos - 1]) { return pos; } + if (c == _nodes[pos].token) { return pos; } pos++; } @@ -54,10 +58,10 @@ struct trie_device_view { inline constexpr uint16_t transition_init(char c) { - auto pos = transitions[0]; - auto end = transitions[1]; + auto pos = _nodes[0].transitions_begin; + auto end = _nodes[1].transitions_begin; while (pos < end) { - if (c == tokens[pos - 1]) { return pos; } + if (c == _nodes[pos].token) { return pos; } pos++; } @@ -65,7 +69,7 @@ struct trie_device_view { } inline constexpr bool is_match(uint16_t idx) { return static_cast(get_match_length(idx)); } - inline constexpr uint8_t get_match_length(uint16_t idx) { return match_length[idx]; } + inline constexpr uint8_t get_match_length(uint16_t idx) { return _nodes[idx].match_length; } }; struct trie { @@ -75,22 +79,10 @@ struct trie { // layer_offsets to uint8_t, max string length would be 253 2^8-3 (two values // reserved: empty string, and error state) private: - rmm::device_uvector _layer_offsets; - rmm::device_uvector _tokens; - rmm::device_uvector _transitions; - rmm::device_uvector _match_length; + rmm::device_uvector _nodes; public: - trie(rmm::device_uvector&& layer_offsets, - rmm::device_uvector&& tokens, - rmm::device_uvector&& transitions, - rmm::device_uvector&& match_length) - : _layer_offsets(std::move(layer_offsets)), - _tokens(std::move(tokens)), - _transitions(std::move(transitions)), - _match_length(std::move(match_length)) - { - } + trie(rmm::device_uvector&& nodes) : _nodes(std::move(nodes)) {} static trie create(std::string const& pattern, rmm::cuda_stream_view stream, @@ -104,9 +96,8 @@ struct trie { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - std::vector layer_offsets; std::vector tokens; - std::vector transitions; + std::vector transitions; std::vector match_length; // create the trie tree @@ -117,42 +108,43 @@ struct trie { // flatten auto sum = 1; - layer_offsets.emplace_back(0); transitions.emplace_back(sum); match_length.emplace_back(root->match_length); - auto nodes = std::queue>(); - nodes.push(std::move(root)); + auto builder_nodes = std::queue>(); + builder_nodes.push(std::move(root)); + + tokens.emplace_back(0); - while (nodes.size()) { - layer_offsets.emplace_back(sum); - auto layer_size = nodes.size(); + while (builder_nodes.size()) { + auto layer_size = builder_nodes.size(); for (uint32_t i = 0; i < layer_size; i++) { - auto node = std::move(nodes.front()); - nodes.pop(); + auto node = std::move(builder_nodes.front()); + builder_nodes.pop(); sum += node->children.size(); transitions.emplace_back(sum); for (auto& item : node->children) { match_length.emplace_back(item.second->match_length); tokens.emplace_back(item.first); - nodes.push(std::move(item.second)); + builder_nodes.push(std::move(item.second)); } } } - match_length.emplace_back(false); + tokens.emplace_back(0); - return trie{detail::make_device_uvector_async(layer_offsets, stream, mr), - detail::make_device_uvector_async(tokens, stream, mr), - detail::make_device_uvector_async(transitions, stream, mr), - detail::make_device_uvector_async(match_length, stream, mr)}; - } + match_length.emplace_back(0); - trie_device_view view() const - { - return trie_device_view{ - _layer_offsets.data(), _tokens.data(), _transitions.data(), _match_length.data()}; + std::vector trie_nodes; + + for (uint32_t i = 0; i < tokens.size(); i++) { + trie_nodes.emplace_back(trie_node{tokens[i], match_length[i], transitions[i]}); + } + + return trie{detail::make_device_uvector_async(trie_nodes, stream, mr)}; } + + trie_device_view view() const { return trie_device_view{_nodes}; } }; } // namespace text From f1d3b4af7dd2c4ed46701b5f000c79b16fb0ea88 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 26 Jul 2021 18:59:03 -0500 Subject: [PATCH 41/80] fix spelling mistakes --- cpp/include/cudf/io/text/multibyte_split.hpp | 2 +- cpp/src/io/text/multibyte_split.cu | 16 ++++++++-------- cpp/tests/io/text/multibyte_split_test.cpp | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index a1f484aabce..20912831b48 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -14,7 +14,7 @@ namespace text { std::unique_ptr multibyte_split( data_chunk_source& source, - std::vector const& delimeters, + std::vector const& delimiters, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace text diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index ae2419ca67c..f7baf8d02b5 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -135,7 +135,7 @@ struct scan_tile_state { } }; -auto constexpr PARTIAL_AGGRIGATION_STRATEGY = 1; +auto constexpr PARTIAL_AGGREGATION_STRATEGY = 1; // keep ITEMS_PER_TILE below input size to force multi-tile execution. auto constexpr ITEMS_PER_THREAD = 32; // influences register pressure @@ -171,13 +171,13 @@ struct scan_tile_state_callback { auto predecessor_idx = _tile_idx - 1 - threadIdx.x; auto predecessor_status = scan_tile_status::invalid; - if constexpr (PARTIAL_AGGRIGATION_STRATEGY == 0) { + if constexpr (PARTIAL_AGGREGATION_STRATEGY == 0) { if (threadIdx.x == 0) { _temp_storage.exclusive_prefix = _tile_state.get_inclusive_prefix(predecessor_idx); } } - if constexpr (PARTIAL_AGGRIGATION_STRATEGY == 1) { + if constexpr (PARTIAL_AGGREGATION_STRATEGY == 1) { // scan partials to form prefix auto window_partial = T{}; @@ -193,7 +193,7 @@ struct scan_tile_state_callback { } } - if constexpr (PARTIAL_AGGRIGATION_STRATEGY == 2) { + if constexpr (PARTIAL_AGGREGATION_STRATEGY == 2) { auto window_partial = T{}; if (threadIdx.x < 32) { do { @@ -494,12 +494,12 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour } std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source& source, - std::vector const& delimeters, + std::vector const& delimiters, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto const trie = cudf::io::text::trie::create(delimeters, stream); + auto const trie = cudf::io::text::trie::create(delimiters, stream); // must be at least 32 when using warp-reduce on partials // must be at least 1 more than max possible concurrent tiles // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s @@ -551,11 +551,11 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source& } // namespace detail std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source& source, - std::vector const& delimeters, + std::vector const& delimiters, rmm::mr::device_memory_resource* mr) { auto stream = rmm::cuda_stream_default; - auto result = detail::multibyte_split(source, delimeters, stream, mr); + auto result = detail::multibyte_split(source, delimiters, stream, mr); stream.synchronize(); return result; } diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index dd393207c83..37382dd357b 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -118,7 +118,7 @@ TEST_F(MultibyteSplitTest, MultipleDelimiters) "and😎" "used😎" "as😎" - "delimeters.😎" + "delimiters.😎" "::" "," "😀"); @@ -127,7 +127,7 @@ TEST_F(MultibyteSplitTest, MultipleDelimiters) "aaa😀", "bbb😀", "ccc😀", "ddd😀", "eee😀", "fff::", "ggg😀", "hhh😀", "___,", "here,", "is,", "another,", "simple😀", "text😎", "seperated😎", "by😎", "emojis,", "which,", "are😎", "multiple,", "bytes::", "and😎", "used😎", "as😎", - "delimeters.😎", "::", ",", "😀", ""}; + "delimiters.😎", "::", ",", "😀", ""}; auto source = cudf::io::text::make_source(host_input); auto out = cudf::io::text::multibyte_split(*source, delimiters); From 51ac35c5c942a0dae87678febeab80f8b026a24c Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 26 Jul 2021 23:43:25 -0500 Subject: [PATCH 42/80] break multibyte_split by adding queue/multistate support --- cpp/include/cudf/io/text/trie.hpp | 88 +++++++++-- cpp/src/io/text/multibyte_split.cu | 32 ++-- cpp/tests/io/text/multibyte_split_test.cpp | 174 ++++++++++----------- 3 files changed, 170 insertions(+), 124 deletions(-) diff --git a/cpp/include/cudf/io/text/trie.hpp b/cpp/include/cudf/io/text/trie.hpp index 2c087b746e3..1e6f32c8f03 100644 --- a/cpp/include/cudf/io/text/trie.hpp +++ b/cpp/include/cudf/io/text/trie.hpp @@ -38,38 +38,94 @@ namespace text { struct trie_node { char token; uint8_t match_length; - uint8_t transitions_begin; + uint8_t child_begin; +}; + +struct trie_path_part { + uint32_t head; + uint32_t tail; +}; + +struct trie_queue { + static uint32_t const N = 8; + trie_path_part values[N]; + uint32_t pos; + uint32_t end; + + inline constexpr uint32_t size() { return end - pos; } + + inline constexpr trie_path_part peek() { return values[pos % N]; } + + inline constexpr trie_path_part dequeue() { return values[pos++ % N]; } + + inline constexpr void enqueue(trie_path_part value) + { + if (size() < N) { values[end++ % N] = value; } + } }; struct trie_device_view { device_span _nodes; - inline constexpr uint16_t transition(uint16_t idx, char c) + template + inline constexpr void transition_init( // + char c, + trie_path_part (&parts)[N], + uint32_t& pos, + uint32_t& end) { - auto pos = _nodes[idx].transitions_begin; - auto end = _nodes[idx + 1].transitions_begin; - while (pos < end) { - if (c == _nodes[pos].token) { return pos; } - pos++; + for (uint32_t curr = 0; curr < _nodes.size() - 1; curr++) { + transition_enqueue_all(c, parts, pos, end, curr, curr); } - - return transition_init(c); } - inline constexpr uint16_t transition_init(char c) + template + inline constexpr void transition( // + char c, + trie_path_part (&parts)[N], + uint32_t& pos, + uint32_t& end) { - auto pos = _nodes[0].transitions_begin; - auto end = _nodes[1].transitions_begin; - while (pos < end) { - if (c == _nodes[pos].token) { return pos; } - pos++; + auto size = end - pos; + transition_enqueue_all(c, parts, pos, end, 0, 0); + for (uint32_t i = 0; i < size; i++) { + auto partial = parts[pos++ % N]; + transition_enqueue_all(c, parts, pos, end, partial.head, partial.tail); } + } - return 0; + template + inline constexpr void transition_enqueue_all( // + char c, + trie_path_part (&parts)[N], + uint32_t& pos, + uint32_t& end, + uint32_t const& head, + uint32_t const& curr) + { + for (uint32_t tail = _nodes[curr].child_begin; tail < _nodes[curr + 1].child_begin; tail++) { + if (end - pos < N) { // + if (_nodes[tail].token == c) { // + parts[end++ % N] = {head, tail}; + } + } + } } inline constexpr bool is_match(uint16_t idx) { return static_cast(get_match_length(idx)); } inline constexpr uint8_t get_match_length(uint16_t idx) { return _nodes[idx].match_length; } + + template + inline constexpr uint8_t get_match_length(trie_path_part (&parts)[N], + uint32_t& pos, + uint32_t& end) + { + int8_t val = 0; + for (uint32_t i = pos; i != end; i++) { + val = max(val, get_match_length(parts[i % N].tail)); + } + return val; + } }; struct trie { diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index f7baf8d02b5..876bbfc9150 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -138,10 +138,10 @@ struct scan_tile_state { auto constexpr PARTIAL_AGGREGATION_STRATEGY = 1; // keep ITEMS_PER_TILE below input size to force multi-tile execution. -auto constexpr ITEMS_PER_THREAD = 32; // influences register pressure -auto constexpr THREADS_PER_TILE = 128; // must be >= 32 for warp-reduce. influences shmem usage. +auto constexpr ITEMS_PER_THREAD = 32; // influences register pressure +auto constexpr THREADS_PER_TILE = 32; // must be >= 32 for warp-reduce. influences shmem usage. auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; -auto constexpr TILES_PER_CHUNK = 512; // blocks in streaming launch +auto constexpr TILES_PER_CHUNK = 1; // blocks in streaming launch auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; template @@ -249,27 +249,19 @@ struct PatternScan { char (&thread_data)[ITEMS_PER_THREAD], uint32_t (&thread_state)[ITEMS_PER_THREAD]) { - // create a state that represents all possible starting states. - auto thread_superstate = superstate(); - - // transition all possible states - for (uint32_t i = 0; i < ITEMS_PER_THREAD; i++) { - thread_superstate = thread_superstate.apply([&](uint8_t state) { // - return trie.transition(state, thread_data[i]); - }); - } - - auto prefix_callback = BlockScanCallback(_temp_storage.scan_callback, tile_state, tile_idx); - - BlockScan(_temp_storage.scan) - .ExclusiveSum(thread_superstate, thread_superstate, prefix_callback); + cudf::io::text::trie_path_part parts[4]; + uint32_t pos = 0; + uint32_t end = 0; - // transition from known state to known state - thread_state[0] = trie.transition(thread_superstate.get(0), thread_data[0]); + trie.transition_init(thread_data[0], parts, pos, end); for (uint32_t i = 1; i < ITEMS_PER_THREAD; i++) { - thread_state[i] = trie.transition(thread_state[i - 1], thread_data[i]); + trie.transition(thread_data[i], parts, pos, end); } + + // at this point, `parts` should contain the possible matches for this thread. + + // but now we have to join them across threads. And then across blocks. } }; diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 37382dd357b..f0b17561355 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -30,8 +30,6 @@ using namespace cudf; using namespace test; -constexpr bool print_all{true}; - struct MultibyteSplitTest : public BaseFixture { }; @@ -46,91 +44,91 @@ TEST_F(MultibyteSplitTest, NondeterministicMatching) auto source = cudf::io::text::make_source(host_input); auto out = cudf::io::text::multibyte_split(*source, delimiters); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); -} - -TEST_F(MultibyteSplitTest, DelimiterAtEnd) -{ - auto delimiters = std::vector({":"}); - auto host_input = std::string("abcdefg:"); - - auto expected = strings_column_wrapper{"abcdefg:", ""}; - - auto source = cudf::io::text::make_source(host_input); - auto out = cudf::io::text::multibyte_split(*source, delimiters); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); -} - -TEST_F(MultibyteSplitTest, LargeInput) -{ - // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 - // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 - auto delimiters = std::vector({"😀", "😎", ",", "::"}); - - // TODO: figure out why CUDF_TEST_EXPECT_COLUMNS_EQUAL fails when the input is larger - // like when changing std::string(100, ...) -> std::string(1000, ...) - auto host_input = std::string(std::string(100, 'w') + "😀" + // - std::string(100, 'x') + "😀" + // - std::string(100, 'y') + "😀" + // - std::string(100, 'z') + "😀" + // - std::string(100, '_')); - - auto expected = strings_column_wrapper{std::string(100, 'w') + "😀", - std::string(100, 'x') + "😀", - std::string(100, 'y') + "😀", - std::string(100, 'z') + "😀", - std::string(100, '_')}; - - auto source = cudf::io::text::make_source(host_input); - auto out = cudf::io::text::multibyte_split(*source, delimiters); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } -TEST_F(MultibyteSplitTest, MultipleDelimiters) -{ - // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 - // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 - auto delimiters = std::vector({"😀", "😎", ",", "::"}); - auto host_input = std::string( - "aaa😀" - "bbb😀" - "ccc😀" - "ddd😀" - "eee😀" - "fff::" - "ggg😀" - "hhh😀" - "___," - "here," - "is," - "another," - "simple😀" - "text😎" - "seperated😎" - "by😎" - "emojis," - "which," - "are😎" - "multiple," - "bytes::" - "and😎" - "used😎" - "as😎" - "delimiters.😎" - "::" - "," - "😀"); - - auto expected = strings_column_wrapper{ - "aaa😀", "bbb😀", "ccc😀", "ddd😀", "eee😀", "fff::", "ggg😀", "hhh😀", - "___,", "here,", "is,", "another,", "simple😀", "text😎", "seperated😎", "by😎", - "emojis,", "which,", "are😎", "multiple,", "bytes::", "and😎", "used😎", "as😎", - "delimiters.😎", "::", ",", "😀", ""}; - - auto source = cudf::io::text::make_source(host_input); - auto out = cudf::io::text::multibyte_split(*source, delimiters); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, print_all); -} +// TEST_F(MultibyteSplitTest, DelimiterAtEnd) +// { +// auto delimiters = std::vector({":"}); +// auto host_input = std::string("abcdefg:"); + +// auto expected = strings_column_wrapper{"abcdefg:", ""}; + +// auto source = cudf::io::text::make_source(host_input); +// auto out = cudf::io::text::multibyte_split(*source, delimiters); + +// CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +// } + +// TEST_F(MultibyteSplitTest, LargeInput) +// { +// // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 +// // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 +// auto delimiters = std::vector({"😀", "😎", ",", "::"}); + +// // TODO: figure out why CUDF_TEST_EXPECT_COLUMNS_EQUAL fails when the input is larger +// // like when changing std::string(100, ...) -> std::string(1000, ...) +// auto host_input = std::string(std::string(100, 'w') + "😀" + // +// std::string(100, 'x') + "😀" + // +// std::string(100, 'y') + "😀" + // +// std::string(100, 'z') + "😀" + // +// std::string(100, '_')); + +// auto expected = strings_column_wrapper{std::string(100, 'w') + "😀", +// std::string(100, 'x') + "😀", +// std::string(100, 'y') + "😀", +// std::string(100, 'z') + "😀", +// std::string(100, '_')}; + +// auto source = cudf::io::text::make_source(host_input); +// auto out = cudf::io::text::multibyte_split(*source, delimiters); + +// CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +// } + +// TEST_F(MultibyteSplitTest, MultipleDelimiters) +// { +// // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 +// // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 +// auto delimiters = std::vector({"😀", "😎", ",", "::"}); +// auto host_input = std::string( +// "aaa😀" +// "bbb😀" +// "ccc😀" +// "ddd😀" +// "eee😀" +// "fff::" +// "ggg😀" +// "hhh😀" +// "___," +// "here," +// "is," +// "another," +// "simple😀" +// "text😎" +// "seperated😎" +// "by😎" +// "emojis," +// "which," +// "are😎" +// "multiple," +// "bytes::" +// "and😎" +// "used😎" +// "as😎" +// "delimiters.😎" +// "::" +// "," +// "😀"); + +// auto expected = strings_column_wrapper{ +// "aaa😀", "bbb😀", "ccc😀", "ddd😀", "eee😀", "fff::", "ggg😀", "hhh😀", +// "___,", "here,", "is,", "another,", "simple😀", "text😎", "seperated😎", "by😎", +// "emojis,", "which,", "are😎", "multiple,", "bytes::", "and😎", "used😎", "as😎", +// "delimiters.😎", "::", ",", "😀", ""}; + +// auto source = cudf::io::text::make_source(host_input); +// auto out = cudf::io::text::multibyte_split(*source, delimiters); + +// CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +// } From 1fb36ee15771e79a97008413f4d71c652806b407 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 28 Jul 2021 20:24:58 -0500 Subject: [PATCH 43/80] fix `abac` pattern matching test, introduce new bug :( --- cpp/include/cudf/io/text/multistate.hpp | 73 +++++++++++++ cpp/include/cudf/io/text/trie.hpp | 84 +++++---------- cpp/src/io/text/multibyte_split.cu | 116 +++++++++++++++------ cpp/tests/io/text/multibyte_split_test.cpp | 78 ++++++++------ 4 files changed, 233 insertions(+), 118 deletions(-) create mode 100644 cpp/include/cudf/io/text/multistate.hpp diff --git a/cpp/include/cudf/io/text/multistate.hpp b/cpp/include/cudf/io/text/multistate.hpp new file mode 100644 index 00000000000..d1c618a9486 --- /dev/null +++ b/cpp/include/cudf/io/text/multistate.hpp @@ -0,0 +1,73 @@ +#pragma once + +#include + +namespace cudf { +namespace io { +namespace text { + +struct multistate_segment { + public: + inline constexpr multistate_segment() : _data(0) {} + inline constexpr multistate_segment(uint8_t head, uint8_t tail) + : _data((head & 0b1111) | (tail << 4)) + { + } + + inline constexpr uint8_t get_head() const { return _data & 0b1111; } + inline constexpr uint8_t get_tail() const { return _data >> 4; } + + private: + uint8_t _data; +}; + +struct multistate { + public: + inline constexpr void enqueue(uint8_t head, uint8_t tail) + { + _segments[_size++] = multistate_segment(head, tail); + } + + inline constexpr uint8_t size() const { return _size; } + + inline constexpr uint8_t max_tail() const + { + uint8_t maximum = 0; + + for (uint8_t i = 0; i < _size; i++) { + maximum = std::max(maximum, get_tail(i)); + } + + return maximum; + } + + inline constexpr uint8_t get_head(uint8_t idx) const { return _segments[idx].get_head(); } + inline constexpr uint8_t get_tail(uint8_t idx) const { return _segments[idx].get_tail(); } + + private: + static auto constexpr N = 7; + uint8_t _size = 0; + multistate_segment _segments[N]; +}; + +// lhs contains only zero? + +inline constexpr multistate operator+(multistate const& lhs, multistate const& rhs) +{ + // combine two multistates together by full-joining LHS tails to RHS heads, + // and taking the corosponding LHS heads and RHS tails. + + multistate result; + for (uint8_t lhs_idx = 0; lhs_idx < lhs.size(); lhs_idx++) { + auto tail = lhs.get_tail(lhs_idx); + for (uint8_t rhs_idx = 0; rhs_idx < rhs.size(); rhs_idx++) { + auto head = rhs.get_head(rhs_idx); + if (tail == head) { result.enqueue(lhs.get_head(lhs_idx), rhs.get_tail(rhs_idx)); } + } + } + return result; +} + +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/include/cudf/io/text/trie.hpp b/cpp/include/cudf/io/text/trie.hpp index 1e6f32c8f03..aa95d17891d 100644 --- a/cpp/include/cudf/io/text/trie.hpp +++ b/cpp/include/cudf/io/text/trie.hpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -41,73 +42,43 @@ struct trie_node { uint8_t child_begin; }; -struct trie_path_part { - uint32_t head; - uint32_t tail; -}; - -struct trie_queue { - static uint32_t const N = 8; - trie_path_part values[N]; - uint32_t pos; - uint32_t end; - - inline constexpr uint32_t size() { return end - pos; } - - inline constexpr trie_path_part peek() { return values[pos % N]; } - - inline constexpr trie_path_part dequeue() { return values[pos++ % N]; } - - inline constexpr void enqueue(trie_path_part value) - { - if (size() < N) { values[end++ % N] = value; } - } -}; - struct trie_device_view { device_span _nodes; - template - inline constexpr void transition_init( // - char c, - trie_path_part (&parts)[N], - uint32_t& pos, - uint32_t& end) + inline constexpr multistate transition_init(char c) { - for (uint32_t curr = 0; curr < _nodes.size() - 1; curr++) { - transition_enqueue_all(c, parts, pos, end, curr, curr); + auto result = multistate(); + + result.enqueue(0, 0); + + for (uint8_t curr = 0; curr < _nodes.size() - 1; curr++) { + transition_enqueue_all(c, result, curr, curr); } + return result; } - template - inline constexpr void transition( // - char c, - trie_path_part (&parts)[N], - uint32_t& pos, - uint32_t& end) + inline constexpr multistate transition(char c, multistate const& states) { - auto size = end - pos; - transition_enqueue_all(c, parts, pos, end, 0, 0); - for (uint32_t i = 0; i < size; i++) { - auto partial = parts[pos++ % N]; - transition_enqueue_all(c, parts, pos, end, partial.head, partial.tail); + auto result = multistate(); + + result.enqueue(0, 0); + + for (uint8_t i = 0; i < states.size(); i++) { + transition_enqueue_all(c, result, states.get_head(i), states.get_tail(i)); } + + return result; } - template inline constexpr void transition_enqueue_all( // char c, - trie_path_part (&parts)[N], - uint32_t& pos, - uint32_t& end, - uint32_t const& head, - uint32_t const& curr) + multistate& states, + uint8_t head, + uint8_t curr) { for (uint32_t tail = _nodes[curr].child_begin; tail < _nodes[curr + 1].child_begin; tail++) { - if (end - pos < N) { // - if (_nodes[tail].token == c) { // - parts[end++ % N] = {head, tail}; - } + if (_nodes[tail].token == c) { // + states.enqueue(head, tail); } } } @@ -116,13 +87,12 @@ struct trie_device_view { inline constexpr uint8_t get_match_length(uint16_t idx) { return _nodes[idx].match_length; } template - inline constexpr uint8_t get_match_length(trie_path_part (&parts)[N], - uint32_t& pos, - uint32_t& end) + inline constexpr uint8_t get_match_length(multistate const& states) { int8_t val = 0; - for (uint32_t i = pos; i != end; i++) { - val = max(val, get_match_length(parts[i % N].tail)); + for (uint8_t i = 0; i < states.size(); i++) { + auto match_length = get_match_length(states.get_tail(i)); + if (match_length > val) { val = match_length; } } return val; } diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 876bbfc9150..ce794e72fac 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include @@ -27,7 +27,7 @@ inline constexpr auto ceil_div(Dividend dividend, Divisor divisor) return dividend / divisor + (dividend % divisor != 0); } -using superstate = cudf::io::text::superstate<16>; +using multistate = cudf::io::text::multistate; enum class scan_tile_status : uint8_t { oob, @@ -138,10 +138,10 @@ struct scan_tile_state { auto constexpr PARTIAL_AGGREGATION_STRATEGY = 1; // keep ITEMS_PER_TILE below input size to force multi-tile execution. -auto constexpr ITEMS_PER_THREAD = 32; // influences register pressure -auto constexpr THREADS_PER_TILE = 32; // must be >= 32 for warp-reduce. influences shmem usage. +auto constexpr ITEMS_PER_THREAD = 32; // influences register pressure +auto constexpr THREADS_PER_TILE = 128; // must be >= 32 for warp-reduce. influences shmem usage. auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; -auto constexpr TILES_PER_CHUNK = 1; // blocks in streaming launch +auto constexpr TILES_PER_CHUNK = 512; // blocks in streaming launch auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; template @@ -229,8 +229,8 @@ struct scan_tile_state_callback { }; struct PatternScan { - typedef cub::BlockScan BlockScan; - typedef scan_tile_state_callback BlockScanCallback; + typedef cub::BlockScan BlockScan; + typedef scan_tile_state_callback BlockScanCallback; struct _TempStorage { typename BlockScan::TempStorage scan; @@ -244,50 +244,102 @@ struct PatternScan { __device__ inline PatternScan(TempStorage& temp_storage) : _temp_storage(temp_storage.Alias()) {} __device__ inline void Scan(cudf::size_type tile_idx, - scan_tile_state_view tile_state, + scan_tile_state_view tile_state, cudf::io::text::trie_device_view trie, char (&thread_data)[ITEMS_PER_THREAD], uint32_t (&thread_state)[ITEMS_PER_THREAD]) { - cudf::io::text::trie_path_part parts[4]; - uint32_t pos = 0; - uint32_t end = 0; - - trie.transition_init(thread_data[0], parts, pos, end); + auto thread_multistate = trie.transition_init(thread_data[0]); + + if (blockIdx.x == 0 and threadIdx.x < 2) { + for (uint8_t i = 0; i < thread_multistate.size(); i++) { + printf("bid(%3u) tid(%3u) |--- : idx(%2u) head(%2u) tail(%2u)\n", + blockIdx.x, + threadIdx.x, + static_cast(i), + static_cast(thread_multistate.get_head(i)), + static_cast(thread_multistate.get_tail(i))); + } + } for (uint32_t i = 1; i < ITEMS_PER_THREAD; i++) { - trie.transition(thread_data[i], parts, pos, end); + thread_multistate = trie.transition(thread_data[i], thread_multistate); + } + + auto prefix_callback = BlockScanCallback(_temp_storage.scan_callback, tile_state, tile_idx); + + if (blockIdx.x == 0 and threadIdx.x < 2) { + for (uint8_t i = 0; i < thread_multistate.size(); i++) { + printf("bid(%3u) tid(%3u) -|-- : idx(%2u) head(%2u) tail(%2u)\n", + blockIdx.x, + threadIdx.x, + static_cast(i), + static_cast(thread_multistate.get_head(i)), + static_cast(thread_multistate.get_tail(i))); + } } - // at this point, `parts` should contain the possible matches for this thread. + // everything is correct up to this point, but exclusive sum produces a multistate with no + // segments. + + BlockScan(_temp_storage.scan) + .ExclusiveSum(thread_multistate, thread_multistate, prefix_callback); - // but now we have to join them across threads. And then across blocks. + if (blockIdx.x == 0 and threadIdx.x < 2) { + for (uint8_t i = 0; i < thread_multistate.size(); i++) { + printf("bid(%3u) tid(%3u) --|- : idx(%2u) head(%2u) tail(%2u)\n", + blockIdx.x, + threadIdx.x, + static_cast(i), + static_cast(thread_multistate.get_head(i)), + static_cast(thread_multistate.get_tail(i))); + } + } + + __syncthreads(); + + for (uint32_t i = 0; i < ITEMS_PER_THREAD; i++) { + thread_multistate = trie.transition(thread_data[i], thread_multistate); + + thread_state[i] = thread_multistate.max_tail(); + } + + if (blockIdx.x == 0 and threadIdx.x < 2) { + for (uint8_t i = 0; i < thread_multistate.size(); i++) { + printf("bid(%3u) tid(%3u) ---| : idx(%2u) head(%2u) tail(%2u)\n", + blockIdx.x, + threadIdx.x, + static_cast(i), + static_cast(thread_multistate.get_head(i)), + static_cast(thread_multistate.get_tail(i))); + } + } } }; // multibyte_split works by splitting up inputs in to 32 inputs (bytes) per thread, and transforming -// them in to data structures called "superstates". these superstates are created by searching a +// them in to data structures called "multistates". these multistates are created by searching a // trie, but instead of a tradition trie where the search begins at a single node at the beginning, // we allow our search to begin anywhere within the trie tree. The position within the trie tree is // stored as a "partial match path", which indicates "we can get from here to there by a set of -// specific transitions". By scanning together superstates, we effectively know "we can get here +// specific transitions". By scanning together multistates, we effectively know "we can get here // from the beginning by following the inputs". By doing this, each thread knows exactly what state // it begins in. From there, each thread can then take deterministic action. In this case, the // deterministic action is counting and outputting delimiter offsets when a delimiter is found. __global__ void multibyte_split_init_kernel(cudf::size_type base_tile_idx, cudf::size_type num_tiles, - scan_tile_state_view tile_superstates, + scan_tile_state_view tile_multistates, scan_tile_state_view tile_output_offsets, scan_tile_status status = scan_tile_status::invalid) { - tile_superstates.initialize_status(base_tile_idx, num_tiles, status); + tile_multistates.initialize_status(base_tile_idx, num_tiles, status); tile_output_offsets.initialize_status(base_tile_idx, num_tiles, status); } __global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, cudf::size_type num_tiles, - scan_tile_state_view tile_superstates, + scan_tile_state_view tile_multistates, scan_tile_state_view tile_output_offsets, cudf::io::text::trie_device_view trie, cudf::device_span data, @@ -324,7 +376,7 @@ __global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, uint32_t thread_states[ITEMS_PER_THREAD]; PatternScan(temp_storage.pattern_scan) // - .Scan(tile_idx, tile_superstates, trie, thread_data, thread_states); + .Scan(tile_idx, tile_multistates, trie, thread_data, thread_states); // STEP 3: Flag matches @@ -429,7 +481,7 @@ void join_pool_to_stream(rmm::cuda_stream_pool& stream_pool, rmm::cuda_stream_vi cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_source& source, cudf::io::text::trie const& trie, - scan_tile_state>& tile_superstates, + scan_tile_state& tile_multistates, scan_tile_state& tile_offsets, device_span output_buffer, device_span output_char_buffer, @@ -444,11 +496,15 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour multibyte_split_init_kernel<<>>( // -TILES_PER_CHUNK, TILES_PER_CHUNK, - tile_superstates, + tile_multistates, tile_offsets, scan_tile_status::oob); - tile_superstates.set_seed_async(superstate<16>(), stream); + auto multistate_seed = multistate(); + + multistate_seed.enqueue(0, 0); + + tile_multistates.set_seed_async(multistate_seed, stream); tile_offsets.set_seed_async(0, stream); fork_stream_to_pool(stream, stream_pool); @@ -467,12 +523,12 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour multibyte_split_init_kernel<<>>( // base_tile_idx, TILES_PER_CHUNK, - tile_superstates, + tile_multistates, tile_offsets); multibyte_split_kernel<<>>( // base_tile_idx, TILES_PER_CHUNK, - tile_superstates, + tile_multistates, tile_offsets, trie.view(), chunk, @@ -497,7 +553,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source& // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s auto concurrency = 2; auto num_tile_states = std::max(32, TILES_PER_CHUNK * concurrency + 32); - auto tile_superstates = scan_tile_state>(num_tile_states, stream); + auto tile_multistates = scan_tile_state(num_tile_states, stream); auto tile_offsets = scan_tile_state(num_tile_states, stream); auto stream_pool = rmm::cuda_stream_pool(concurrency); @@ -505,7 +561,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source& auto bytes_total = multibyte_split_scan_full_source(source, trie, - tile_superstates, + tile_multistates, tile_offsets, cudf::device_span(static_cast(nullptr), 0), cudf::device_span(static_cast(nullptr), 0), @@ -528,7 +584,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source& multibyte_split_scan_full_source( source, trie, - tile_superstates, + tile_multistates, tile_offsets, cudf::device_span(string_offsets).subspan(1, num_results), string_chars, diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index f0b17561355..81784ca0022 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -47,44 +47,60 @@ TEST_F(MultibyteSplitTest, NondeterministicMatching) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } -// TEST_F(MultibyteSplitTest, DelimiterAtEnd) -// { -// auto delimiters = std::vector({":"}); -// auto host_input = std::string("abcdefg:"); +TEST_F(MultibyteSplitTest, DelimiterAtEnd) +{ + auto delimiters = std::vector({":"}); + auto host_input = std::string("abcdefg:"); -// auto expected = strings_column_wrapper{"abcdefg:", ""}; + auto expected = strings_column_wrapper{"abcdefg:", ""}; -// auto source = cudf::io::text::make_source(host_input); -// auto out = cudf::io::text::multibyte_split(*source, delimiters); + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiters); -// CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); -// } + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +} -// TEST_F(MultibyteSplitTest, LargeInput) -// { -// // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 -// // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 -// auto delimiters = std::vector({"😀", "😎", ",", "::"}); +TEST_F(MultibyteSplitTest, LargeInput) +{ + // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 + // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 + auto delimiters = std::vector({"😀", "😎", ",", "::"}); + + // TODO: figure out why CUDF_TEST_EXPECT_COLUMNS_EQUAL fails when the input is larger + // like when changing std::string(100, ...) -> std::string(1000, ...) + auto host_input = std::string(std::string(100, 'w') + "😀" + // + std::string(100, 'x') + "😀" + // + std::string(100, 'y') + "😀" + // + std::string(100, 'z') + "😀" + // + std::string(100, '_')); + + auto expected = strings_column_wrapper{std::string(100, 'w') + "😀", + std::string(100, 'x') + "😀", + std::string(100, 'y') + "😀", + std::string(100, 'z') + "😀", + std::string(100, '_')}; -// // TODO: figure out why CUDF_TEST_EXPECT_COLUMNS_EQUAL fails when the input is larger -// // like when changing std::string(100, ...) -> std::string(1000, ...) -// auto host_input = std::string(std::string(100, 'w') + "😀" + // -// std::string(100, 'x') + "😀" + // -// std::string(100, 'y') + "😀" + // -// std::string(100, 'z') + "😀" + // -// std::string(100, '_')); + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiters); -// auto expected = strings_column_wrapper{std::string(100, 'w') + "😀", -// std::string(100, 'x') + "😀", -// std::string(100, 'y') + "😀", -// std::string(100, 'z') + "😀", -// std::string(100, '_')}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +} -// auto source = cudf::io::text::make_source(host_input); -// auto out = cudf::io::text::multibyte_split(*source, delimiters); +TEST_F(MultibyteSplitTest, LongDelimiter) +{ + auto delimiters = std::vector({"===="}); + auto host_input = std::string( + "..............................==" + "==.............................."); -// CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); -// } + auto expected = + strings_column_wrapper{"..............................====", ".............................."}; + + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiters); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, debug_output_level::ALL_ERRORS); +} // TEST_F(MultibyteSplitTest, MultipleDelimiters) // { @@ -130,5 +146,5 @@ TEST_F(MultibyteSplitTest, NondeterministicMatching) // auto source = cudf::io::text::make_source(host_input); // auto out = cudf::io::text::multibyte_split(*source, delimiters); -// CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +// CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, debug_output_level::ALL_ERRORS); // } From ecf440a7df31ceb453356137cc24c08d4ea2bef6 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 28 Jul 2021 21:57:45 -0500 Subject: [PATCH 44/80] fix multibyte_split aggregation strategy to avoid assuming T{} is an identity value --- cpp/src/io/text/multibyte_split.cu | 12 +-- cpp/tests/io/text/multibyte_split_test.cpp | 92 +++++++++++----------- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index ce794e72fac..5ddfcec8b97 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -180,20 +180,20 @@ struct scan_tile_state_callback { if constexpr (PARTIAL_AGGREGATION_STRATEGY == 1) { // scan partials to form prefix - auto window_partial = T{}; - if (threadIdx.x == 0) { - do { + auto window_partial = _tile_state.get_prefix(predecessor_idx, predecessor_status); + while (predecessor_status != scan_tile_status::inclusive) { + predecessor_idx--; auto predecessor_prefix = _tile_state.get_prefix(predecessor_idx, predecessor_status); window_partial = predecessor_prefix + window_partial; - predecessor_idx--; - } while (predecessor_status != scan_tile_status::inclusive); - + } _temp_storage.exclusive_prefix = window_partial; } } if constexpr (PARTIAL_AGGREGATION_STRATEGY == 2) { + // TODO: T{} is not gauranteed to be an identity value, so use an existing value instead. + // otherwise, this is bugged for multistate. auto window_partial = T{}; if (threadIdx.x < 32) { do { diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 81784ca0022..4dba8276d19 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -102,49 +102,49 @@ TEST_F(MultibyteSplitTest, LongDelimiter) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, debug_output_level::ALL_ERRORS); } -// TEST_F(MultibyteSplitTest, MultipleDelimiters) -// { -// // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 -// // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 -// auto delimiters = std::vector({"😀", "😎", ",", "::"}); -// auto host_input = std::string( -// "aaa😀" -// "bbb😀" -// "ccc😀" -// "ddd😀" -// "eee😀" -// "fff::" -// "ggg😀" -// "hhh😀" -// "___," -// "here," -// "is," -// "another," -// "simple😀" -// "text😎" -// "seperated😎" -// "by😎" -// "emojis," -// "which," -// "are😎" -// "multiple," -// "bytes::" -// "and😎" -// "used😎" -// "as😎" -// "delimiters.😎" -// "::" -// "," -// "😀"); - -// auto expected = strings_column_wrapper{ -// "aaa😀", "bbb😀", "ccc😀", "ddd😀", "eee😀", "fff::", "ggg😀", "hhh😀", -// "___,", "here,", "is,", "another,", "simple😀", "text😎", "seperated😎", "by😎", -// "emojis,", "which,", "are😎", "multiple,", "bytes::", "and😎", "used😎", "as😎", -// "delimiters.😎", "::", ",", "😀", ""}; - -// auto source = cudf::io::text::make_source(host_input); -// auto out = cudf::io::text::multibyte_split(*source, delimiters); - -// CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, debug_output_level::ALL_ERRORS); -// } +TEST_F(MultibyteSplitTest, MultipleDelimiters) +{ + // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 + // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 + auto delimiters = std::vector({"😀", "😎", ",", "::"}); + auto host_input = std::string( + "aaa😀" + "bbb😀" + "ccc😀" + "ddd😀" + "eee😀" + "fff::" + "ggg😀" + "hhh😀" + "___," + "here," + "is," + "another," + "simple😀" + "text😎" + "seperated😎" + "by😎" + "emojis," + "which," + "are😎" + "multiple," + "bytes::" + "and😎" + "used😎" + "as😎" + "delimiters.😎" + "::" + "," + "😀"); + + auto expected = strings_column_wrapper{ + "aaa😀", "bbb😀", "ccc😀", "ddd😀", "eee😀", "fff::", "ggg😀", "hhh😀", + "___,", "here,", "is,", "another,", "simple😀", "text😎", "seperated😎", "by😎", + "emojis,", "which,", "are😎", "multiple,", "bytes::", "and😎", "used😎", "as😎", + "delimiters.😎", "::", ",", "😀", ""}; + + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiters); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, debug_output_level::ALL_ERRORS); +} From fc014e5237c0c7b8931f20a5af5bb1bfa7cfefef Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 28 Jul 2021 22:52:09 -0500 Subject: [PATCH 45/80] add second host buffer to istream_data_chunk_reader to facilitate overlapping h2d copies~ --- .../io/text/data_chunk_source_factories.hpp | 34 ++++++++----- cpp/src/io/text/multibyte_split.cu | 49 ------------------- 2 files changed, 23 insertions(+), 60 deletions(-) diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index bab0c4c088e..90aa11af55e 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -27,17 +27,26 @@ namespace { * */ class istream_data_chunk_reader : public data_chunk_reader { + struct host_ticket { + cudaEvent_t event; + thrust::host_vector> buffer; + }; + public: istream_data_chunk_reader(std::unique_ptr datastream) - : _datastream(std::move(datastream)), _buffers() + : _datastream(std::move(datastream)), _buffers(), _tickets(1) { // create an event to track the completion of the last device-to-host copy. - CUDA_TRY(cudaEventCreate(&prev_host_copy_event)); // + for (uint32_t i = 0; i < _tickets.size(); i++) { + CUDA_TRY(cudaEventCreate(&(_tickets[i].event))); + } } ~istream_data_chunk_reader() { - CUDA_TRY(cudaEventDestroy(prev_host_copy_event)); // + for (uint32_t i = 0; i < _tickets.size(); i++) { + CUDA_TRY(cudaEventDestroy(_tickets[i].event)); + } } device_span find_or_create_data(uint32_t size, rmm::cuda_stream_view stream) @@ -55,14 +64,18 @@ class istream_data_chunk_reader : public data_chunk_reader { { CUDF_FUNC_RANGE(); + auto& ticket = _tickets[_next_ticket_idx]; + + _next_ticket_idx = (_next_ticket_idx + 1) % _tickets.size(); + // synchronize on the last host-to-device copy, so we don't clobber the host buffer. - CUDA_TRY(cudaEventSynchronize(prev_host_copy_event)); + CUDA_TRY(cudaEventSynchronize(ticket.event)); // resize the host buffer as necessary to contain the requested number of bytes - if (_host_buffer.size() < read_size) { _host_buffer.resize(read_size); } + if (ticket.buffer.size() < read_size) { ticket.buffer.resize(read_size); } // read data from the host istream in to the pinned host memory buffer - _datastream->read(_host_buffer.data(), read_size); + _datastream->read(ticket.buffer.data(), read_size); // adjust the read size to reflect how many bytes were actually read from the data stream read_size = _datastream->gcount(); @@ -73,24 +86,23 @@ class istream_data_chunk_reader : public data_chunk_reader { // copy the host-pinned data on to device CUDA_TRY(cudaMemcpyAsync( // chunk_span.data(), - _host_buffer.data(), + ticket.buffer.data(), read_size, cudaMemcpyHostToDevice, stream.value())); // record the host-to-device copy. - CUDA_TRY(cudaEventRecord(prev_host_copy_event, stream.value())); + CUDA_TRY(cudaEventRecord(ticket.event, stream.value())); // return the view over device memory so it can be processed. return data_chunk(chunk_span); } private: + uint32_t _next_ticket_idx = 0; std::unique_ptr _datastream; std::unordered_map _buffers; - cudaEvent_t prev_host_copy_event; - thrust::host_vector> - _host_buffer{}; + std::vector _tickets; }; /** diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 5ddfcec8b97..d8af2ef00a7 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -251,69 +251,20 @@ struct PatternScan { { auto thread_multistate = trie.transition_init(thread_data[0]); - if (blockIdx.x == 0 and threadIdx.x < 2) { - for (uint8_t i = 0; i < thread_multistate.size(); i++) { - printf("bid(%3u) tid(%3u) |--- : idx(%2u) head(%2u) tail(%2u)\n", - blockIdx.x, - threadIdx.x, - static_cast(i), - static_cast(thread_multistate.get_head(i)), - static_cast(thread_multistate.get_tail(i))); - } - } - for (uint32_t i = 1; i < ITEMS_PER_THREAD; i++) { thread_multistate = trie.transition(thread_data[i], thread_multistate); } auto prefix_callback = BlockScanCallback(_temp_storage.scan_callback, tile_state, tile_idx); - if (blockIdx.x == 0 and threadIdx.x < 2) { - for (uint8_t i = 0; i < thread_multistate.size(); i++) { - printf("bid(%3u) tid(%3u) -|-- : idx(%2u) head(%2u) tail(%2u)\n", - blockIdx.x, - threadIdx.x, - static_cast(i), - static_cast(thread_multistate.get_head(i)), - static_cast(thread_multistate.get_tail(i))); - } - } - - // everything is correct up to this point, but exclusive sum produces a multistate with no - // segments. - BlockScan(_temp_storage.scan) .ExclusiveSum(thread_multistate, thread_multistate, prefix_callback); - if (blockIdx.x == 0 and threadIdx.x < 2) { - for (uint8_t i = 0; i < thread_multistate.size(); i++) { - printf("bid(%3u) tid(%3u) --|- : idx(%2u) head(%2u) tail(%2u)\n", - blockIdx.x, - threadIdx.x, - static_cast(i), - static_cast(thread_multistate.get_head(i)), - static_cast(thread_multistate.get_tail(i))); - } - } - - __syncthreads(); - for (uint32_t i = 0; i < ITEMS_PER_THREAD; i++) { thread_multistate = trie.transition(thread_data[i], thread_multistate); thread_state[i] = thread_multistate.max_tail(); } - - if (blockIdx.x == 0 and threadIdx.x < 2) { - for (uint8_t i = 0; i < thread_multistate.size(); i++) { - printf("bid(%3u) tid(%3u) ---| : idx(%2u) head(%2u) tail(%2u)\n", - blockIdx.x, - threadIdx.x, - static_cast(i), - static_cast(thread_multistate.get_head(i)), - static_cast(thread_multistate.get_tail(i))); - } - } } }; From 896ed318fb39ae4a839e80efdf414af7040fbe30 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 28 Jul 2021 22:54:10 -0500 Subject: [PATCH 46/80] actually add second buffer to istream_data_chunk_reader --- cpp/include/cudf/io/text/data_chunk_source_factories.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index 90aa11af55e..7a492d1ee7d 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -34,7 +34,7 @@ class istream_data_chunk_reader : public data_chunk_reader { public: istream_data_chunk_reader(std::unique_ptr datastream) - : _datastream(std::move(datastream)), _buffers(), _tickets(1) + : _datastream(std::move(datastream)), _buffers(), _tickets(2) { // create an event to track the completion of the last device-to-host copy. for (uint32_t i = 0; i < _tickets.size(); i++) { From 2f75b50c9f26904dd76dab4435e6524a9951e9e7 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 30 Jul 2021 01:07:26 -0500 Subject: [PATCH 47/80] clean up multibyte_split code --- .../cudf/io/text/data_chunk_source.hpp | 16 ++ .../io/text/data_chunk_source_factories.hpp | 16 ++ cpp/include/cudf/io/text/multibyte_split.hpp | 20 ++- cpp/include/cudf/io/text/multistate.hpp | 16 ++ cpp/include/cudf/io/text/superstate.hpp | 137 ------------------ cpp/src/io/text/multibyte_split.cu | 105 ++++++-------- cpp/tests/CMakeLists.txt | 12 +- cpp/tests/io/text/superstate_test.cpp | 126 ---------------- cpp/tests/io/text/trie_test.cpp | 50 ------- 9 files changed, 108 insertions(+), 390 deletions(-) delete mode 100644 cpp/include/cudf/io/text/superstate.hpp delete mode 100644 cpp/tests/io/text/superstate_test.cpp delete mode 100644 cpp/tests/io/text/trie_test.cpp diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp index f9e4ade57b7..f0eb9dcd164 100644 --- a/cpp/include/cudf/io/text/data_chunk_source.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #pragma once #include diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index 7a492d1ee7d..91a07dde292 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #pragma once #include diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index 20912831b48..93b9660d443 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -1,8 +1,24 @@ -#include +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once #include +#include -#include #include #include diff --git a/cpp/include/cudf/io/text/multistate.hpp b/cpp/include/cudf/io/text/multistate.hpp index d1c618a9486..5a7c4bde86f 100644 --- a/cpp/include/cudf/io/text/multistate.hpp +++ b/cpp/include/cudf/io/text/multistate.hpp @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #pragma once #include diff --git a/cpp/include/cudf/io/text/superstate.hpp b/cpp/include/cudf/io/text/superstate.hpp deleted file mode 100644 index 7f5c43a005c..00000000000 --- a/cpp/include/cudf/io/text/superstate.hpp +++ /dev/null @@ -1,137 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace { - -constexpr unsigned floorlog2(unsigned x) { return x == 1 ? 0 : 1 + floorlog2(x >> 1); } - -constexpr unsigned ceillog2(unsigned x) { return x == 1 ? 0 : floorlog2(x - 1) + 1; } - -template -struct rep { -}; - -template -struct rep> { - using type = uint8_t; -}; - -template -struct rep> { - using type = uint16_t; -}; - -template -struct rep> { - using type = uint32_t; -}; - -template -struct rep> { - using type = uint64_t; -}; - -template -struct superstate_policy { - static_assert(N > 1 and N <= 16, "superstate supports no more than 16 unique states"); - static constexpr uint8_t BITS = ceillog2(N); - static constexpr uint8_t MASK = (1 << BITS) - 1; - using Data = typename rep::type; -}; - -} // namespace - -namespace cudf { -namespace io { -namespace text { - -template -struct superstate { - public: - static constexpr uint8_t BITS = superstate_policy::BITS; - static constexpr uint8_t MASK = superstate_policy::MASK; - - using Data = typename superstate_policy::Data; - using Index = uint8_t; - - private: - Data _data; - - public: - /** - * @brief creates a superstate which represents all possible states and - * applied transitions - */ - constexpr superstate() : _data(0) - { - for (auto i = 0; i < N; i++) { _data |= static_cast(i) << (i * BITS); } - } - - explicit inline constexpr superstate(Data data) : _data(data) {} - - inline constexpr Data data() const { return _data; } - - explicit inline constexpr operator State() const { return static_cast(_data & MASK); } - - inline constexpr State get(Index idx) const - { - return static_cast((_data >> idx * BITS) & MASK); - } - - inline constexpr void set(Index idx, State state) - { - // removing `& MASK` here may result in less instructions, but will result in UB. This may - // be a fine trade-off, as integer-overflow was never an intended use case. - _data |= (static_cast(state) & MASK) << idx * BITS; - } - - inline constexpr void reset(Index idx, State state) - { - _data &= ~(MASK << idx * BITS); - _data |= static_cast(state) << idx * BITS; - } - - template - inline constexpr superstate apply(BinaryOp const& op, RHS const& rhs) - { - superstate result(0); - for (uint8_t pre = 0; pre < N; pre++) { - auto const mid = get(pre); - auto const post = op(mid, rhs); - result.set(pre, post); - } - return result; - } - - template - inline constexpr superstate apply(BinaryOp const& op) - { - superstate result(0); - for (uint8_t pre = 0; pre < N; pre++) { - auto const mid = get(pre); - auto const post = op(mid); - result.set(pre, post); - } - return result; - } -}; - -template -inline constexpr superstate operator+(superstate lhs, Instruction rhs) -{ - return lhs.apply([&](State state) { return state + rhs; }); -} - -template -inline constexpr superstate operator+(superstate lhs, superstate rhs) -{ - using Index = typename superstate::Index; - return lhs.apply([&](State state) { return rhs.get(static_cast(state)); }); -} - -} // namespace text -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index d8af2ef00a7..354f9f2b99c 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -1,15 +1,29 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #include #include #include #include #include #include -#include #include #include #include -#include #include #include @@ -135,15 +149,6 @@ struct scan_tile_state { } }; -auto constexpr PARTIAL_AGGREGATION_STRATEGY = 1; - -// keep ITEMS_PER_TILE below input size to force multi-tile execution. -auto constexpr ITEMS_PER_THREAD = 32; // influences register pressure -auto constexpr THREADS_PER_TILE = 128; // must be >= 32 for warp-reduce. influences shmem usage. -auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; -auto constexpr TILES_PER_CHUNK = 512; // blocks in streaming launch -auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; - template struct scan_tile_state_callback { using WarpReduce = cub::WarpReduce; @@ -171,47 +176,16 @@ struct scan_tile_state_callback { auto predecessor_idx = _tile_idx - 1 - threadIdx.x; auto predecessor_status = scan_tile_status::invalid; - if constexpr (PARTIAL_AGGREGATION_STRATEGY == 0) { - if (threadIdx.x == 0) { - _temp_storage.exclusive_prefix = _tile_state.get_inclusive_prefix(predecessor_idx); - } - } - - if constexpr (PARTIAL_AGGREGATION_STRATEGY == 1) { - // scan partials to form prefix - - if (threadIdx.x == 0) { - auto window_partial = _tile_state.get_prefix(predecessor_idx, predecessor_status); - while (predecessor_status != scan_tile_status::inclusive) { - predecessor_idx--; - auto predecessor_prefix = _tile_state.get_prefix(predecessor_idx, predecessor_status); - window_partial = predecessor_prefix + window_partial; - } - _temp_storage.exclusive_prefix = window_partial; - } - } - - if constexpr (PARTIAL_AGGREGATION_STRATEGY == 2) { - // TODO: T{} is not gauranteed to be an identity value, so use an existing value instead. - // otherwise, this is bugged for multistate. - auto window_partial = T{}; - if (threadIdx.x < 32) { - do { - auto predecessor_prefix = _tile_state.get_prefix(predecessor_idx, predecessor_status); - - window_partial = - WarpReduce(_temp_storage.reduce) // - .TailSegmentedReduce(predecessor_prefix, - predecessor_status == scan_tile_status::inclusive, - [](T const& lhs, T const& rhs) { return rhs + lhs; }) + - window_partial; - predecessor_idx -= 32; - } while (__all_sync(0xffffffff, predecessor_status != scan_tile_status::inclusive)); - } + // scan partials to form prefix - if (threadIdx.x == 0) { - _temp_storage.exclusive_prefix = window_partial; // + if (threadIdx.x == 0) { + auto window_partial = _tile_state.get_prefix(predecessor_idx, predecessor_status); + while (predecessor_status != scan_tile_status::inclusive) { + predecessor_idx--; + auto predecessor_prefix = _tile_state.get_prefix(predecessor_idx, predecessor_status); + window_partial = predecessor_prefix + window_partial; } + _temp_storage.exclusive_prefix = window_partial; } if (threadIdx.x == 0) { @@ -228,6 +202,13 @@ struct scan_tile_state_callback { cudf::size_type _tile_idx; }; +// keep ITEMS_PER_TILE below input size to force multi-tile execution. +auto constexpr ITEMS_PER_THREAD = 32; // influences register pressure +auto constexpr THREADS_PER_TILE = 128; // must be >= 32 for warp-reduce. influences shmem usage. +auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; +auto constexpr TILES_PER_CHUNK = 512; // blocks in streaming launch +auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; + struct PatternScan { typedef cub::BlockScan BlockScan; typedef scan_tile_state_callback BlockScanCallback; @@ -347,24 +328,20 @@ __global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, OffsetScan(temp_storage.offset_scan) .ExclusiveSum(thread_offsets, thread_offsets, prefix_callback); - // Step 5: Assign string_offsets from each thread using match offsets. + // Step 5: Assign outputs from each thread using match offsets. - for (int32_t i = 0; i < ITEMS_PER_THREAD and i < num_valid; i++) { - auto const match_length = trie.get_match_length(thread_states[i]); - - if (match_length == 0) { continue; } - - auto const match_end = char_begin + data_begin + i + 1; - auto const match_begin = match_end - match_length; - - if (string_offsets.size() > thread_offsets[i]) { // - string_offsets[thread_offsets[i]] = match_end; + if (data_out.size() > 0) { + for (int32_t i = 0; i < ITEMS_PER_THREAD and i < num_valid; i++) { + data_out[data_begin + i] = thread_data[i]; } } - if (data_out.size() > 0) { - for (int32_t i = 0; i < ITEMS_PER_THREAD and i < num_valid; i++) { // - data_out[data_begin + i] = thread_data[i]; + if (string_offsets.size() > 0) { + for (int32_t i = 0; i < ITEMS_PER_THREAD and i < num_valid; i++) { + if (trie.get_match_length(thread_states[i]) > 0) { + auto const match_end = char_begin + data_begin + i + 1; + string_offsets[thread_offsets[i]] = match_end; + } } } } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index cc1741a7b5a..8fe44f88db8 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -195,21 +195,11 @@ ConfigureTest(ORC_TEST io/orc_test.cpp) ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) +ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) if(CUDF_ENABLE_ARROW_S3) target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") endif() -################################################################################################### -# - io tests -------------------------------------------------------------------------------------- -ConfigureTest(SUPERSTATE_TEST - io/text/superstate_test.cpp) - -ConfigureTest(TRIE_TEST - io/text/trie_test.cpp) - -ConfigureTest(MULTIBYTE_SPLIT_TEST - io/text/multibyte_split_test.cpp) - ################################################################################################### # - sort tests ------------------------------------------------------------------------------------ ConfigureTest(SORT_TEST diff --git a/cpp/tests/io/text/superstate_test.cpp b/cpp/tests/io/text/superstate_test.cpp deleted file mode 100644 index 9120eb620a7..00000000000 --- a/cpp/tests/io/text/superstate_test.cpp +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include - -enum class state : uint8_t { a, b, c, error }; -enum class instruction : uint8_t { inc, dec, swap_ac }; - -inline constexpr state operator+(state const& lhs, instruction const& rhs) -{ - switch (rhs) { - case instruction::inc: - switch (lhs) { - case state::a: return state::b; - case state::b: return state::c; - case state::c: return state::a; - case state::error: return state::error; - } - case instruction::dec: - switch (lhs) { - case state::a: return state::c; - case state::b: return state::a; - case state::c: return state::b; - case state::error: return state::error; - } - case instruction::swap_ac: - switch (lhs) { - case state::a: return state::c; - case state::b: return state::b; - case state::c: return state::a; - case state::error: return state::error; - } - } - - return state::error; -} - -using superstate = cudf::io::text::superstate<4, state>; - -struct SuperstateTest : public cudf::test::BaseFixture { -}; - -TEST_F(SuperstateTest, CanInitializeAllStates) -{ - auto value = superstate(); - - EXPECT_EQ(value.data(), 0b11100100); -} - -TEST_F(SuperstateTest, CanInitializeSpecificValue) -{ - auto value = superstate(0b01010101); - - EXPECT_EQ(value.data(), 0b01010101); -} - -TEST_F(SuperstateTest, CanTransitionExplicitly) -{ - auto value = superstate(); - - auto machine = [](state const& lhs, uint8_t const& rhs) { - return static_cast(static_cast(lhs) + rhs); - }; - - // this call test the overflow capability of individual states within a superstate. It is - // possible this becomes UB in the future, in which case this `TEST_F` should be removed. - value = value.apply(machine, 5); - - EXPECT_EQ(value.data(), 0b00111001); - EXPECT_EQ(value.get(0), static_cast(1)); -} - -TEST_F(SuperstateTest, CanTransitionAllStataes) -{ - auto value = superstate(); - - value = value + instruction::inc; - - EXPECT_EQ(value.data(), 0b11001001); - EXPECT_EQ(value.get(0), state::b); - - value = value + instruction::swap_ac; - - EXPECT_EQ(value.data(), 0b11100001); - EXPECT_EQ(value.get(0), state::b); - - value = value + instruction::dec; - - EXPECT_EQ(value.data(), 0b11011000); - EXPECT_EQ(value.get(0), state::a); -} - -TEST_F(SuperstateTest, CanConcatenateSuperstates) -{ - auto a = superstate() + instruction::inc + instruction::swap_ac; - auto b = superstate() + instruction::dec + instruction::swap_ac; - auto c = superstate() + instruction::swap_ac + instruction::inc; - - auto value = a + b + c; - auto expected = superstate() + // - instruction::inc + instruction::swap_ac + // - instruction::dec + instruction::swap_ac + // - instruction::swap_ac + instruction::inc; - - EXPECT_EQ(value.data(), expected.data()); -} - -CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/text/trie_test.cpp b/cpp/tests/io/text/trie_test.cpp deleted file mode 100644 index 49217fecf1c..00000000000 --- a/cpp/tests/io/text/trie_test.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -#include -#include - -#include - -#include - -using namespace cudf; -using namespace test; - -constexpr bool print_all{false}; - -struct TrieTest : public BaseFixture { -}; - -TEST_F(TrieTest, CanMatchSinglePattern) -{ - auto pattern = cudf::io::text::trie::create("abac", {}); - - (void)pattern; -} - -TEST_F(TrieTest, CanMatchMultiplePatterns) -{ - auto patterns = std::vector{"abac", "abad"}; - auto pattern = cudf::io::text::trie::create(patterns, {}); - - (void)pattern; -} From 162e9cf6b42a3b4744769620740fac83273720ac Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 30 Jul 2021 01:12:11 -0500 Subject: [PATCH 48/80] adjust copyright --- .../io/text/multibyte_split_benchmark.cpp | 2 +- cpp/include/cudf/io/text/trie.hpp | 18 ++++++++++++++++++ cpp/tests/io/text/multibyte_split_test.cpp | 2 +- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp index a3255d2cb5a..0a9ffe7cbed 100644 --- a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp +++ b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cudf/io/text/trie.hpp b/cpp/include/cudf/io/text/trie.hpp index aa95d17891d..2e49e6dbc21 100644 --- a/cpp/include/cudf/io/text/trie.hpp +++ b/cpp/include/cudf/io/text/trie.hpp @@ -1,3 +1,21 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + #include #include #include diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 4dba8276d19..35791f9242b 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From ade11507edc89d2490f0f746800d5f2067f218d2 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 30 Jul 2021 01:21:32 -0500 Subject: [PATCH 49/80] remove confusing test case in multibyte_split --- cpp/include/cudf/io/text/trie.hpp | 5 ----- cpp/tests/io/text/multibyte_split_test.cpp | 26 ++++------------------ 2 files changed, 4 insertions(+), 27 deletions(-) diff --git a/cpp/include/cudf/io/text/trie.hpp b/cpp/include/cudf/io/text/trie.hpp index 2e49e6dbc21..8618e79bdeb 100644 --- a/cpp/include/cudf/io/text/trie.hpp +++ b/cpp/include/cudf/io/text/trie.hpp @@ -117,11 +117,6 @@ struct trie_device_view { }; struct trie { - // could compress all of this to 32 bits without major perf reduction: - // 1) merge is_accepting state in to the most significant bit of the - // corrosponding transition, and use a mask to access both values. 2) change - // layer_offsets to uint8_t, max string length would be 253 2^8-3 (two values - // reserved: empty string, and error state) private: rmm::device_uvector _nodes; diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 35791f9242b..ca0760392ef 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -30,12 +30,14 @@ using namespace cudf; using namespace test; +// 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 +// 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 + struct MultibyteSplitTest : public BaseFixture { }; TEST_F(MultibyteSplitTest, NondeterministicMatching) { - // bug: test fails because PatternScan does not account for NFAs (repeated 'a' char) auto delimiters = std::vector({"abac"}); auto host_input = std::string("ababacabacab"); @@ -62,11 +64,9 @@ TEST_F(MultibyteSplitTest, DelimiterAtEnd) TEST_F(MultibyteSplitTest, LargeInput) { - // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 - // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 auto delimiters = std::vector({"😀", "😎", ",", "::"}); - // TODO: figure out why CUDF_TEST_EXPECT_COLUMNS_EQUAL fails when the input is larger + // TODO: figure out why CUDF_TEST_EXPECT_COLUMNS_EQUAL segfaults when the input is larger // like when changing std::string(100, ...) -> std::string(1000, ...) auto host_input = std::string(std::string(100, 'w') + "😀" + // std::string(100, 'x') + "😀" + // @@ -86,26 +86,8 @@ TEST_F(MultibyteSplitTest, LargeInput) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } -TEST_F(MultibyteSplitTest, LongDelimiter) -{ - auto delimiters = std::vector({"===="}); - auto host_input = std::string( - "..............................==" - "==.............................."); - - auto expected = - strings_column_wrapper{"..............................====", ".............................."}; - - auto source = cudf::io::text::make_source(host_input); - auto out = cudf::io::text::multibyte_split(*source, delimiters); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, debug_output_level::ALL_ERRORS); -} - TEST_F(MultibyteSplitTest, MultipleDelimiters) { - // 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 - // 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 auto delimiters = std::vector({"😀", "😎", ",", "::"}); auto host_input = std::string( "aaa😀" From 8e080126506b072f094bbc04b132f78757dadd7d Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 30 Jul 2021 18:56:06 -0500 Subject: [PATCH 50/80] limit multibyte_split to 32 threads, because of a bug that needs fixing. add overlapping matches test, which also fails --- cpp/src/io/text/multibyte_split.cu | 16 +++++---- cpp/tests/io/text/multibyte_split_test.cpp | 39 ++++++++++++++-------- 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 354f9f2b99c..65ea4ac4c4f 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -63,6 +63,8 @@ struct scan_tile_state_view { { auto thread_idx = blockIdx.x * blockDim.x + threadIdx.x; if (thread_idx < count) { // + // this is probably UB without taking in to account tile_states being assigned multiple ties + // due to modulo operator tile_status[(base_tile_idx + thread_idx) % num_tiles] = status; } } @@ -202,12 +204,12 @@ struct scan_tile_state_callback { cudf::size_type _tile_idx; }; -// keep ITEMS_PER_TILE below input size to force multi-tile execution. -auto constexpr ITEMS_PER_THREAD = 32; // influences register pressure -auto constexpr THREADS_PER_TILE = 128; // must be >= 32 for warp-reduce. influences shmem usage. +auto constexpr ITEMS_PER_THREAD = 32; // influences register pressure +auto constexpr THREADS_PER_TILE = 32; // must be >= 32 for warp-reduce. bugged for > 32, needs fix auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; -auto constexpr TILES_PER_CHUNK = 512; // blocks in streaming launch -auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; +auto constexpr TILES_PER_CHUNK = 512; +// keep ITEMS_PER_CHUNK below input size to force multi-tile execution. +auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; struct PatternScan { typedef cub::BlockScan BlockScan; @@ -475,11 +477,11 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source& rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto const trie = cudf::io::text::trie::create(delimiters, stream); + auto const trie = cudf::io::text::trie::create(delimiters, stream); + auto concurrency = 2; // must be at least 32 when using warp-reduce on partials // must be at least 1 more than max possible concurrent tiles // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s - auto concurrency = 2; auto num_tile_states = std::max(32, TILES_PER_CHUNK * concurrency + 32); auto tile_multistates = scan_tile_state(num_tile_states, stream); auto tile_offsets = scan_tile_state(num_tile_states, stream); diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index ca0760392ef..17405641cf5 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -64,21 +64,21 @@ TEST_F(MultibyteSplitTest, DelimiterAtEnd) TEST_F(MultibyteSplitTest, LargeInput) { - auto delimiters = std::vector({"😀", "😎", ",", "::"}); + auto delimiters = std::vector({":::::", "....."}); + + auto host_input = std::string(); + auto host_expected = std::vector(); - // TODO: figure out why CUDF_TEST_EXPECT_COLUMNS_EQUAL segfaults when the input is larger - // like when changing std::string(100, ...) -> std::string(1000, ...) - auto host_input = std::string(std::string(100, 'w') + "😀" + // - std::string(100, 'x') + "😀" + // - std::string(100, 'y') + "😀" + // - std::string(100, 'z') + "😀" + // - std::string(100, '_')); + for (auto i = 0; i < 1000; i++) { + host_input += ":::::"; + host_input += "....."; + host_expected.emplace_back(std::string(":::::")); + host_expected.emplace_back(std::string(".....")); + } - auto expected = strings_column_wrapper{std::string(100, 'w') + "😀", - std::string(100, 'x') + "😀", - std::string(100, 'y') + "😀", - std::string(100, 'z') + "😀", - std::string(100, '_')}; + host_expected.emplace_back(std::string("")); + + auto expected = strings_column_wrapper{host_expected.begin(), host_expected.end()}; auto source = cudf::io::text::make_source(host_input); auto out = cudf::io::text::multibyte_split(*source, delimiters); @@ -86,6 +86,19 @@ TEST_F(MultibyteSplitTest, LargeInput) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } +// TEST_F(MultibyteSplitTest, OverlappingMatchErasure) +// { +// auto delimiters = std::vector({":::::"}); + +// auto host_input = std::string(":::::" ":::::"); +// auto expected = strings_column_wrapper{":::::", ":::::"}; + +// auto source = cudf::io::text::make_source(host_input); +// auto out = cudf::io::text::multibyte_split(*source, delimiters); + +// CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +// } + TEST_F(MultibyteSplitTest, MultipleDelimiters) { auto delimiters = std::vector({"😀", "😎", ",", "::"}); From 5ad2148e9b7a489a971b844d7f36aee66b74eb1b Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 30 Jul 2021 20:22:52 -0500 Subject: [PATCH 51/80] fix emoji bits documentation --- cpp/tests/io/text/multibyte_split_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 17405641cf5..b6f53ac00fa 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -30,8 +30,8 @@ using namespace cudf; using namespace test; -// 😀 | F0 9F 98 80 | 11110000 10011111 01100010 01010000 -// 😎 | F0 9F 98 8E | 11110000 10011111 01100010 11101000 +// 😀 | F0 9F 98 80 | 11110000 10011111 10011000 10000000 +// 😎 | F0 9F 98 8E | 11110000 10011111 10011000 10001110 struct MultibyteSplitTest : public BaseFixture { }; From 511ab9ff46d3982536dad43d269c34181b2d9b66 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 2 Aug 2021 13:13:10 -0500 Subject: [PATCH 52/80] style adjustments and documentation update to multibyte_split --- conda/recipes/libcudf/meta.yaml | 9 +++++++-- cpp/include/cudf/io/text/multistate.hpp | 22 ++++++++++++++++++++++ cpp/include/cudf/io/text/trie.hpp | 5 ++++- cpp/tests/io/text/multibyte_split_test.cpp | 22 +++++++++++----------- 4 files changed, 44 insertions(+), 14 deletions(-) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 6c4175a2539..37e33a6d135 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -119,10 +119,9 @@ test: - test -f $PREFIX/include/cudf/hashing.hpp - test -f $PREFIX/include/cudf/interop.hpp - test -f $PREFIX/include/cudf/io/avro.hpp + - test -f $PREFIX/include/cudf/io/csv.hpp - test -f $PREFIX/include/cudf/io/data_sink.hpp - test -f $PREFIX/include/cudf/io/datasource.hpp - - test -f $PREFIX/include/cudf/io/orc_metadata.hpp - - test -f $PREFIX/include/cudf/io/csv.hpp - test -f $PREFIX/include/cudf/io/detail/avro.hpp - test -f $PREFIX/include/cudf/io/detail/csv.hpp - test -f $PREFIX/include/cudf/io/detail/json.hpp @@ -130,8 +129,14 @@ test: - test -f $PREFIX/include/cudf/io/detail/parquet.hpp - test -f $PREFIX/include/cudf/io/detail/utils.hpp - test -f $PREFIX/include/cudf/io/json.hpp + - test -f $PREFIX/include/cudf/io/orc_metadata.hpp - test -f $PREFIX/include/cudf/io/orc.hpp - test -f $PREFIX/include/cudf/io/parquet.hpp + - test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp + - test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp + - test -f $PREFIX/include/cudf/io/text/multibyte_split.hpp + - test -f $PREFIX/include/cudf/io/text/multistate.hpp + - test -f $PREFIX/include/cudf/io/text/trie.hpp - test -f $PREFIX/include/cudf/io/types.hpp - test -f $PREFIX/include/cudf/ipc.hpp - test -f $PREFIX/include/cudf/join.hpp diff --git a/cpp/include/cudf/io/text/multistate.hpp b/cpp/include/cudf/io/text/multistate.hpp index 5a7c4bde86f..82c1c37fec9 100644 --- a/cpp/include/cudf/io/text/multistate.hpp +++ b/cpp/include/cudf/io/text/multistate.hpp @@ -22,6 +22,10 @@ namespace cudf { namespace io { namespace text { +/** + * @brief represents a single (begin, end] pair of possible state transition history. + * + */ struct multistate_segment { public: inline constexpr multistate_segment() : _data(0) {} @@ -37,6 +41,9 @@ struct multistate_segment { uint8_t _data; }; +/** + * @brief Holds up to 7 transition history segments + */ struct multistate { public: inline constexpr void enqueue(uint8_t head, uint8_t tail) @@ -68,6 +75,21 @@ struct multistate { // lhs contains only zero? +/** + * @brief associatively inner-joins transition histories. + * + * Examples: + * <(0, 5]> + <(5, 9]> = <(0, 9]> + * <(0, 5]> + <(6, 9]> = <> + * <(0, 1], (0, 2]> + <(2, 3], (1, 4]> = <(0, 4], (0, 3]> + * <(0, 1], (0, 2]> + <(1, 3]> = <(0, 3]> + * + * Head and tail value are limited to [0, 1, ..., 16] + * + * @param lhs past segments + * @param rhs future segments + * @return full join of past and future segments + */ inline constexpr multistate operator+(multistate const& lhs, multistate const& rhs) { // combine two multistates together by full-joining LHS tails to RHS heads, diff --git a/cpp/include/cudf/io/text/trie.hpp b/cpp/include/cudf/io/text/trie.hpp index 8618e79bdeb..9a8689ca099 100644 --- a/cpp/include/cudf/io/text/trie.hpp +++ b/cpp/include/cudf/io/text/trie.hpp @@ -35,7 +35,10 @@ struct trie_builder_node { void insert(std::string s) { insert(s.c_str(), s.size()); } - trie_builder_node& insert(char const* s, uint16_t size, uint8_t depth = 0) + trie_builder_node& insert(char const* s, uint16_t size) { return this->insert(s, size, 0); } + + private: + trie_builder_node& insert(char const* s, uint16_t size, uint8_t depth) { if (size == 0) { match_length = depth; diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index b6f53ac00fa..54f73210d72 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -25,8 +25,6 @@ #include #include -#include - using namespace cudf; using namespace test; @@ -86,18 +84,20 @@ TEST_F(MultibyteSplitTest, LargeInput) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } -// TEST_F(MultibyteSplitTest, OverlappingMatchErasure) -// { -// auto delimiters = std::vector({":::::"}); +TEST_F(MultibyteSplitTest, OverlappingMatchErasure) +{ + auto delimiters = std::vector({":::::"}); -// auto host_input = std::string(":::::" ":::::"); -// auto expected = strings_column_wrapper{":::::", ":::::"}; + auto host_input = std::string( + ":::::" + ":::::"); + auto expected = strings_column_wrapper{":::::", ":::::"}; -// auto source = cudf::io::text::make_source(host_input); -// auto out = cudf::io::text::multibyte_split(*source, delimiters); + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiters); -// CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); -// } + // CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); // this use case it not yet supported. +} TEST_F(MultibyteSplitTest, MultipleDelimiters) { From 69280e8ccb764e32d1426b72b27b206aef9f2b61 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 2 Aug 2021 13:30:19 -0500 Subject: [PATCH 53/80] move tile-scanning utilites to detail namespace --- conda/recipes/libcudf/meta.yaml | 4 +- .../cudf/io/text/{ => detail}/multistate.hpp | 2 + .../cudf/io/text/detail/tile_state.hpp | 174 ++++++++++++++ .../cudf/io/text/{ => detail}/trie.hpp | 16 +- cpp/src/io/text/multibyte_split.cu | 213 +++--------------- 5 files changed, 212 insertions(+), 197 deletions(-) rename cpp/include/cudf/io/text/{ => detail}/multistate.hpp (98%) create mode 100644 cpp/include/cudf/io/text/detail/tile_state.hpp rename cpp/include/cudf/io/text/{ => detail}/trie.hpp (96%) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 37e33a6d135..2c6ebda3376 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -134,9 +134,9 @@ test: - test -f $PREFIX/include/cudf/io/parquet.hpp - test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp - test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp + - test -f $PREFIX/include/cudf/io/text/detail/multistate.hpp + - test -f $PREFIX/include/cudf/io/text/detail/trie.hpp - test -f $PREFIX/include/cudf/io/text/multibyte_split.hpp - - test -f $PREFIX/include/cudf/io/text/multistate.hpp - - test -f $PREFIX/include/cudf/io/text/trie.hpp - test -f $PREFIX/include/cudf/io/types.hpp - test -f $PREFIX/include/cudf/ipc.hpp - test -f $PREFIX/include/cudf/join.hpp diff --git a/cpp/include/cudf/io/text/multistate.hpp b/cpp/include/cudf/io/text/detail/multistate.hpp similarity index 98% rename from cpp/include/cudf/io/text/multistate.hpp rename to cpp/include/cudf/io/text/detail/multistate.hpp index 82c1c37fec9..fc9fb9552fd 100644 --- a/cpp/include/cudf/io/text/multistate.hpp +++ b/cpp/include/cudf/io/text/detail/multistate.hpp @@ -21,6 +21,7 @@ namespace cudf { namespace io { namespace text { +namespace detail { /** * @brief represents a single (begin, end] pair of possible state transition history. @@ -106,6 +107,7 @@ inline constexpr multistate operator+(multistate const& lhs, multistate const& r return result; } +} // namespace detail } // namespace text } // namespace io } // namespace cudf diff --git a/cpp/include/cudf/io/text/detail/tile_state.hpp b/cpp/include/cudf/io/text/detail/tile_state.hpp new file mode 100644 index 00000000000..95c4ec8beca --- /dev/null +++ b/cpp/include/cudf/io/text/detail/tile_state.hpp @@ -0,0 +1,174 @@ + +#pragma once + +#include + +namespace cudf { +namespace io { +namespace text { +namespace detail { + +enum class scan_tile_status : uint8_t { + oob, + invalid, + partial, + inclusive, +}; + +template +struct scan_tile_state_view { + uint64_t num_tiles; + scan_tile_status* tile_status; + T* tile_partial; + T* tile_inclusive; + + __device__ inline void initialize_status(cudf::size_type base_tile_idx, + cudf::size_type count, + scan_tile_status status) + { + auto thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_idx < count) { // + // this is UB if tile_status gets assigned from multiple threads. + tile_status[(base_tile_idx + thread_idx) % num_tiles] = status; + } + } + + __device__ inline void set_partial_prefix(cudf::size_type tile_idx, T value) + { + auto const offset = (tile_idx + num_tiles) % num_tiles; + cub::ThreadStore(tile_partial + offset, value); + __threadfence(); + cub::ThreadStore(tile_status + offset, scan_tile_status::partial); + } + + __device__ inline void set_inclusive_prefix(cudf::size_type tile_idx, T value) + { + auto const offset = (tile_idx + num_tiles) % num_tiles; + cub::ThreadStore(tile_inclusive + offset, value); + __threadfence(); + cub::ThreadStore(tile_status + offset, scan_tile_status::inclusive); + } + + __device__ inline T get_prefix(cudf::size_type tile_idx, scan_tile_status& status) + { + auto const offset = (tile_idx + num_tiles) % num_tiles; + + while ((status = cub::ThreadLoad(tile_status + offset)) == + scan_tile_status::invalid) { + __threadfence(); + } + + if (status == scan_tile_status::partial) { + return cub::ThreadLoad(tile_partial + offset); + } else { + return cub::ThreadLoad(tile_inclusive + offset); + } + } + + __device__ inline T get_inclusive_prefix(cudf::size_type tile_idx) + { + auto const offset = (tile_idx + num_tiles) % num_tiles; + while (cub::ThreadLoad(tile_status + offset) != scan_tile_status::inclusive) { + __threadfence(); + } + return cub::ThreadLoad(tile_inclusive + offset); + } +}; + +template +struct scan_tile_state { + rmm::device_uvector tile_status; + rmm::device_uvector tile_state_partial; + rmm::device_uvector tile_state_inclusive; + + scan_tile_state(cudf::size_type num_tiles, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + : tile_status(rmm::device_uvector(num_tiles, stream, mr)), + tile_state_partial(rmm::device_uvector(num_tiles, stream, mr)), + tile_state_inclusive(rmm::device_uvector(num_tiles, stream, mr)) + { + } + + operator scan_tile_state_view() + { + return scan_tile_state_view{tile_status.size(), + tile_status.data(), + tile_state_partial.data(), + tile_state_inclusive.data()}; + } + + inline void set_seed_async(T const seed, rmm::cuda_stream_view stream) + { + auto x = tile_status.size(); + auto y = scan_tile_status::inclusive; + tile_state_inclusive.set_element_async(x - 1, seed, stream); + tile_status.set_element_async(x - 1, y, stream); + } + + // T back_element(rmm::cuda_stream_view stream) const { return tile_state.back_element(stream); } + + inline T get_inclusive_prefix(cudf::size_type tile_idx, rmm::cuda_stream_view stream) const + { + auto const offset = (tile_idx + tile_status.size()) % tile_status.size(); + return tile_state_inclusive.element(offset, stream); + } +}; + +template +struct scan_tile_state_callback { + using WarpReduce = cub::WarpReduce; + + struct _TempStorage { + typename WarpReduce::TempStorage reduce; + T exclusive_prefix; + }; + + using TempStorage = cub::Uninitialized<_TempStorage>; + + __device__ inline scan_tile_state_callback(TempStorage& temp_storage, + scan_tile_state_view& tile_state, + cudf::size_type tile_idx) + : _temp_storage(temp_storage.Alias()), _tile_state(tile_state), _tile_idx(tile_idx) + { + } + + __device__ inline T operator()(T const& block_aggregate) + { + if (threadIdx.x == 0) { + _tile_state.set_partial_prefix(_tile_idx, block_aggregate); // + } + + auto predecessor_idx = _tile_idx - 1 - threadIdx.x; + auto predecessor_status = scan_tile_status::invalid; + + // scan partials to form prefix + + if (threadIdx.x == 0) { + auto window_partial = _tile_state.get_prefix(predecessor_idx, predecessor_status); + while (predecessor_status != scan_tile_status::inclusive) { + predecessor_idx--; + auto predecessor_prefix = _tile_state.get_prefix(predecessor_idx, predecessor_status); + window_partial = predecessor_prefix + window_partial; + } + _temp_storage.exclusive_prefix = window_partial; + } + + if (threadIdx.x == 0) { + _tile_state.set_inclusive_prefix(_tile_idx, _temp_storage.exclusive_prefix + block_aggregate); + } + + __syncthreads(); // TODO: remove if unnecessary. + + return _temp_storage.exclusive_prefix; + } + + _TempStorage& _temp_storage; + scan_tile_state_view& _tile_state; + cudf::size_type _tile_idx; +}; + +} // namespace detail +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/include/cudf/io/text/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp similarity index 96% rename from cpp/include/cudf/io/text/trie.hpp rename to cpp/include/cudf/io/text/detail/trie.hpp index 9a8689ca099..7ea520d3145 100644 --- a/cpp/include/cudf/io/text/trie.hpp +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include #include @@ -27,7 +27,10 @@ #include #include -namespace { +namespace cudf { +namespace io { +namespace text { +namespace detail { struct trie_builder_node { uint8_t match_length; @@ -51,12 +54,6 @@ struct trie_builder_node { } }; -} // namespace - -namespace cudf { -namespace io { -namespace text { - struct trie_node { char token; uint8_t match_length; @@ -183,12 +180,13 @@ struct trie { trie_nodes.emplace_back(trie_node{tokens[i], match_length[i], transitions[i]}); } - return trie{detail::make_device_uvector_async(trie_nodes, stream, mr)}; + return trie{cudf::detail::make_device_uvector_async(trie_nodes, stream, mr)}; } trie_device_view view() const { return trie_device_view{_nodes}; } }; +} // namespace detail } // namespace text } // namespace io } // namespace cudf diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 65ea4ac4c4f..9ab6319ccec 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -18,17 +18,16 @@ #include #include #include -#include -#include +#include +#include +#include #include #include #include #include -#include #include -#include #include #include @@ -41,168 +40,7 @@ inline constexpr auto ceil_div(Dividend dividend, Divisor divisor) return dividend / divisor + (dividend % divisor != 0); } -using multistate = cudf::io::text::multistate; - -enum class scan_tile_status : uint8_t { - oob, - invalid, - partial, - inclusive, -}; - -template -struct scan_tile_state_view { - uint64_t num_tiles; - scan_tile_status* tile_status; - T* tile_partial; - T* tile_inclusive; - - __device__ inline void initialize_status(cudf::size_type base_tile_idx, - cudf::size_type count, - scan_tile_status status) - { - auto thread_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_idx < count) { // - // this is probably UB without taking in to account tile_states being assigned multiple ties - // due to modulo operator - tile_status[(base_tile_idx + thread_idx) % num_tiles] = status; - } - } - - __device__ inline void set_partial_prefix(cudf::size_type tile_idx, T value) - { - auto const offset = (tile_idx + num_tiles) % num_tiles; - cub::ThreadStore(tile_partial + offset, value); - __threadfence(); - cub::ThreadStore(tile_status + offset, scan_tile_status::partial); - } - - __device__ inline void set_inclusive_prefix(cudf::size_type tile_idx, T value) - { - auto const offset = (tile_idx + num_tiles) % num_tiles; - cub::ThreadStore(tile_inclusive + offset, value); - __threadfence(); - cub::ThreadStore(tile_status + offset, scan_tile_status::inclusive); - } - - __device__ inline T get_prefix(cudf::size_type tile_idx, scan_tile_status& status) - { - auto const offset = (tile_idx + num_tiles) % num_tiles; - - while ((status = cub::ThreadLoad(tile_status + offset)) == - scan_tile_status::invalid) { - __threadfence(); - } - - if (status == scan_tile_status::partial) { - return cub::ThreadLoad(tile_partial + offset); - } else { - return cub::ThreadLoad(tile_inclusive + offset); - } - } - - __device__ inline T get_inclusive_prefix(cudf::size_type tile_idx) - { - auto const offset = (tile_idx + num_tiles) % num_tiles; - while (cub::ThreadLoad(tile_status + offset) != scan_tile_status::inclusive) { - __threadfence(); - } - return cub::ThreadLoad(tile_inclusive + offset); - } -}; - -template -struct scan_tile_state { - rmm::device_uvector tile_status; - rmm::device_uvector tile_state_partial; - rmm::device_uvector tile_state_inclusive; - - scan_tile_state(cudf::size_type num_tiles, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : tile_status(rmm::device_uvector(num_tiles, stream, mr)), - tile_state_partial(rmm::device_uvector(num_tiles, stream, mr)), - tile_state_inclusive(rmm::device_uvector(num_tiles, stream, mr)) - { - } - - operator scan_tile_state_view() - { - return scan_tile_state_view{tile_status.size(), - tile_status.data(), - tile_state_partial.data(), - tile_state_inclusive.data()}; - } - - inline void set_seed_async(T const seed, rmm::cuda_stream_view stream) - { - auto x = tile_status.size(); - auto y = scan_tile_status::inclusive; - tile_state_inclusive.set_element_async(x - 1, seed, stream); - tile_status.set_element_async(x - 1, y, stream); - } - - // T back_element(rmm::cuda_stream_view stream) const { return tile_state.back_element(stream); } - - inline T get_inclusive_prefix(cudf::size_type tile_idx, rmm::cuda_stream_view stream) const - { - auto const offset = (tile_idx + tile_status.size()) % tile_status.size(); - return tile_state_inclusive.element(offset, stream); - } -}; - -template -struct scan_tile_state_callback { - using WarpReduce = cub::WarpReduce; - - struct _TempStorage { - typename WarpReduce::TempStorage reduce; - T exclusive_prefix; - }; - - using TempStorage = cub::Uninitialized<_TempStorage>; - - __device__ inline scan_tile_state_callback(TempStorage& temp_storage, - scan_tile_state_view& tile_state, - cudf::size_type tile_idx) - : _temp_storage(temp_storage.Alias()), _tile_state(tile_state), _tile_idx(tile_idx) - { - } - - __device__ inline T operator()(T const& block_aggregate) - { - if (threadIdx.x == 0) { - _tile_state.set_partial_prefix(_tile_idx, block_aggregate); // - } - - auto predecessor_idx = _tile_idx - 1 - threadIdx.x; - auto predecessor_status = scan_tile_status::invalid; - - // scan partials to form prefix - - if (threadIdx.x == 0) { - auto window_partial = _tile_state.get_prefix(predecessor_idx, predecessor_status); - while (predecessor_status != scan_tile_status::inclusive) { - predecessor_idx--; - auto predecessor_prefix = _tile_state.get_prefix(predecessor_idx, predecessor_status); - window_partial = predecessor_prefix + window_partial; - } - _temp_storage.exclusive_prefix = window_partial; - } - - if (threadIdx.x == 0) { - _tile_state.set_inclusive_prefix(_tile_idx, _temp_storage.exclusive_prefix + block_aggregate); - } - - __syncthreads(); // TODO: remove if unnecessary. - - return _temp_storage.exclusive_prefix; - } - - _TempStorage& _temp_storage; - scan_tile_state_view& _tile_state; - cudf::size_type _tile_idx; -}; +using cudf::io::text::detail::multistate; auto constexpr ITEMS_PER_THREAD = 32; // influences register pressure auto constexpr THREADS_PER_TILE = 32; // must be >= 32 for warp-reduce. bugged for > 32, needs fix @@ -213,7 +51,7 @@ auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; struct PatternScan { typedef cub::BlockScan BlockScan; - typedef scan_tile_state_callback BlockScanCallback; + typedef cudf::io::text::detail::scan_tile_state_callback BlockScanCallback; struct _TempStorage { typename BlockScan::TempStorage scan; @@ -227,8 +65,8 @@ struct PatternScan { __device__ inline PatternScan(TempStorage& temp_storage) : _temp_storage(temp_storage.Alias()) {} __device__ inline void Scan(cudf::size_type tile_idx, - scan_tile_state_view tile_state, - cudf::io::text::trie_device_view trie, + cudf::io::text::detail::scan_tile_state_view tile_state, + cudf::io::text::detail::trie_device_view trie, char (&thread_data)[ITEMS_PER_THREAD], uint32_t (&thread_state)[ITEMS_PER_THREAD]) { @@ -261,27 +99,30 @@ struct PatternScan { // it begins in. From there, each thread can then take deterministic action. In this case, the // deterministic action is counting and outputting delimiter offsets when a delimiter is found. -__global__ void multibyte_split_init_kernel(cudf::size_type base_tile_idx, - cudf::size_type num_tiles, - scan_tile_state_view tile_multistates, - scan_tile_state_view tile_output_offsets, - scan_tile_status status = scan_tile_status::invalid) +__global__ void multibyte_split_init_kernel( + cudf::size_type base_tile_idx, + cudf::size_type num_tiles, + cudf::io::text::detail::scan_tile_state_view tile_multistates, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + cudf::io::text::detail::scan_tile_status status = + cudf::io::text::detail::scan_tile_status::invalid) { tile_multistates.initialize_status(base_tile_idx, num_tiles, status); tile_output_offsets.initialize_status(base_tile_idx, num_tiles, status); } -__global__ void multibyte_split_kernel(cudf::size_type base_tile_idx, - cudf::size_type num_tiles, - scan_tile_state_view tile_multistates, - scan_tile_state_view tile_output_offsets, - cudf::io::text::trie_device_view trie, - cudf::device_span data, - cudf::device_span string_offsets, - cudf::device_span data_out) +__global__ void multibyte_split_kernel( + cudf::size_type base_tile_idx, + cudf::size_type num_tiles, + cudf::io::text::detail::scan_tile_state_view tile_multistates, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + cudf::io::text::detail::trie_device_view trie, + cudf::device_span data, + cudf::device_span string_offsets, + cudf::device_span data_out) { typedef cub::BlockScan OffsetScan; - typedef scan_tile_state_callback OffsetScanCallback; + typedef cudf::io::text::detail::scan_tile_state_callback OffsetScanCallback; __shared__ union { typename PatternScan::TempStorage pattern_scan; @@ -410,7 +251,7 @@ void join_pool_to_stream(rmm::cuda_stream_pool& stream_pool, rmm::cuda_stream_vi } cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_source& source, - cudf::io::text::trie const& trie, + cudf::io::text::detail::trie const& trie, scan_tile_state& tile_multistates, scan_tile_state& tile_offsets, device_span output_buffer, @@ -428,7 +269,7 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour TILES_PER_CHUNK, tile_multistates, tile_offsets, - scan_tile_status::oob); + cudf::io::text::detail::scan_tile_status::oob); auto multistate_seed = multistate(); @@ -477,7 +318,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source& rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto const trie = cudf::io::text::trie::create(delimiters, stream); + auto const trie = cudf::io::text::detail::trie::create(delimiters, stream); auto concurrency = 2; // must be at least 32 when using warp-reduce on partials // must be at least 1 more than max possible concurrent tiles From 2d37dc96ec03d2e946cf1a999d9c5aabf5e6488f Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 2 Aug 2021 14:45:49 -0500 Subject: [PATCH 54/80] remove "inline" from constexpr members in cudf::io::text --- .../cudf/io/text/detail/multistate.hpp | 23 ++++++++----------- cpp/include/cudf/io/text/detail/trie.hpp | 12 +++++----- cpp/src/io/text/multibyte_split.cu | 2 +- 3 files changed, 17 insertions(+), 20 deletions(-) diff --git a/cpp/include/cudf/io/text/detail/multistate.hpp b/cpp/include/cudf/io/text/detail/multistate.hpp index fc9fb9552fd..164a1ae61d5 100644 --- a/cpp/include/cudf/io/text/detail/multistate.hpp +++ b/cpp/include/cudf/io/text/detail/multistate.hpp @@ -29,14 +29,11 @@ namespace detail { */ struct multistate_segment { public: - inline constexpr multistate_segment() : _data(0) {} - inline constexpr multistate_segment(uint8_t head, uint8_t tail) - : _data((head & 0b1111) | (tail << 4)) - { - } + constexpr multistate_segment() : _data(0) {} + constexpr multistate_segment(uint8_t head, uint8_t tail) : _data((head & 0b1111) | (tail << 4)) {} - inline constexpr uint8_t get_head() const { return _data & 0b1111; } - inline constexpr uint8_t get_tail() const { return _data >> 4; } + constexpr uint8_t get_head() const { return _data & 0b1111; } + constexpr uint8_t get_tail() const { return _data >> 4; } private: uint8_t _data; @@ -47,14 +44,14 @@ struct multistate_segment { */ struct multistate { public: - inline constexpr void enqueue(uint8_t head, uint8_t tail) + constexpr void enqueue(uint8_t head, uint8_t tail) { _segments[_size++] = multistate_segment(head, tail); } - inline constexpr uint8_t size() const { return _size; } + constexpr uint8_t size() const { return _size; } - inline constexpr uint8_t max_tail() const + constexpr uint8_t max_tail() const { uint8_t maximum = 0; @@ -65,8 +62,8 @@ struct multistate { return maximum; } - inline constexpr uint8_t get_head(uint8_t idx) const { return _segments[idx].get_head(); } - inline constexpr uint8_t get_tail(uint8_t idx) const { return _segments[idx].get_tail(); } + constexpr uint8_t get_head(uint8_t idx) const { return _segments[idx].get_head(); } + constexpr uint8_t get_tail(uint8_t idx) const { return _segments[idx].get_tail(); } private: static auto constexpr N = 7; @@ -91,7 +88,7 @@ struct multistate { * @param rhs future segments * @return full join of past and future segments */ -inline constexpr multistate operator+(multistate const& lhs, multistate const& rhs) +constexpr multistate operator+(multistate const& lhs, multistate const& rhs) { // combine two multistates together by full-joining LHS tails to RHS heads, // and taking the corosponding LHS heads and RHS tails. diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp index 7ea520d3145..14f66ec4f73 100644 --- a/cpp/include/cudf/io/text/detail/trie.hpp +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -63,7 +63,7 @@ struct trie_node { struct trie_device_view { device_span _nodes; - inline constexpr multistate transition_init(char c) + constexpr multistate transition_init(char c) { auto result = multistate(); @@ -75,7 +75,7 @@ struct trie_device_view { return result; } - inline constexpr multistate transition(char c, multistate const& states) + constexpr multistate transition(char c, multistate const& states) { auto result = multistate(); @@ -88,7 +88,7 @@ struct trie_device_view { return result; } - inline constexpr void transition_enqueue_all( // + constexpr void transition_enqueue_all( // char c, multistate& states, uint8_t head, @@ -101,11 +101,11 @@ struct trie_device_view { } } - inline constexpr bool is_match(uint16_t idx) { return static_cast(get_match_length(idx)); } - inline constexpr uint8_t get_match_length(uint16_t idx) { return _nodes[idx].match_length; } + constexpr bool is_match(uint16_t idx) { return static_cast(get_match_length(idx)); } + constexpr uint8_t get_match_length(uint16_t idx) { return _nodes[idx].match_length; } template - inline constexpr uint8_t get_match_length(multistate const& states) + constexpr uint8_t get_match_length(multistate const& states) { int8_t val = 0; for (uint8_t i = 0; i < states.size(); i++) { diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 9ab6319ccec..bde781df164 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -35,7 +35,7 @@ namespace { template -inline constexpr auto ceil_div(Dividend dividend, Divisor divisor) +constexpr decltype(auto) ceil_div(Dividend dividend, Divisor divisor) { return dividend / divisor + (dividend % divisor != 0); } From 9c6bf2abb700f7330a52d9241bc9a42dd56e48a5 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 2 Aug 2021 23:03:59 -0500 Subject: [PATCH 55/80] fix large input bug in multibyte_split where offsets were not accounted for correctly --- .../cudf/io/text/data_chunk_source.hpp | 7 ++ cpp/src/io/text/multibyte_split.cu | 76 +++++++++++-------- cpp/tests/io/text/multibyte_split_test.cpp | 10 ++- 3 files changed, 58 insertions(+), 35 deletions(-) diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp index f0eb9dcd164..10ec735dad5 100644 --- a/cpp/include/cudf/io/text/data_chunk_source.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -45,6 +45,13 @@ struct data_chunk { */ class data_chunk_reader { public: + /** + * @brief Get the next chunk of data + * + * @param size desired number of bytes + * @param stream stream to associate allocations or perform work required to obtain chunk + * @return a chunk of data up to @param size bytes, or less if no more data is avaialable + */ virtual data_chunk get_next_chunk(uint32_t size, rmm::cuda_stream_view stream) = 0; }; diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index bde781df164..cb59fee6c83 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -42,12 +43,12 @@ constexpr decltype(auto) ceil_div(Dividend dividend, Divisor divisor) using cudf::io::text::detail::multistate; -auto constexpr ITEMS_PER_THREAD = 32; // influences register pressure -auto constexpr THREADS_PER_TILE = 32; // must be >= 32 for warp-reduce. bugged for > 32, needs fix -auto constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; -auto constexpr TILES_PER_CHUNK = 512; +int32_t constexpr ITEMS_PER_THREAD = 32; // influences register pressure +int32_t constexpr THREADS_PER_TILE = 32; // must be >= 32. bugged for > 32, needs fix +int32_t constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; +int32_t constexpr TILES_PER_CHUNK = 512; // keep ITEMS_PER_CHUNK below input size to force multi-tile execution. -auto constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; +int32_t constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; struct PatternScan { typedef cub::BlockScan BlockScan; @@ -117,14 +118,18 @@ __global__ void multibyte_split_kernel( cudf::io::text::detail::scan_tile_state_view tile_multistates, cudf::io::text::detail::scan_tile_state_view tile_output_offsets, cudf::io::text::detail::trie_device_view trie, - cudf::device_span data, - cudf::device_span string_offsets, - cudf::device_span data_out) + int32_t chunk_input_offset, + cudf::device_span chunk_input_chars, + cudf::device_span abs_output_delimiter_offsets, + cudf::device_span abs_output_chars) { - typedef cub::BlockScan OffsetScan; - typedef cudf::io::text::detail::scan_tile_state_callback OffsetScanCallback; + using InputLoad = + cub::BlockLoad; + using OffsetScan = cub::BlockScan; + using OffsetScanCallback = cudf::io::text::detail::scan_tile_state_callback; __shared__ union { + typename InputLoad::TempStorage input_load; typename PatternScan::TempStorage pattern_scan; struct { typename OffsetScan::TempStorage offset_scan; @@ -132,39 +137,39 @@ __global__ void multibyte_split_kernel( }; } temp_storage; - int32_t const tile_idx = base_tile_idx + blockIdx.x; - int32_t const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; - int32_t const data_begin = thread_idx * ITEMS_PER_THREAD; - int32_t const num_valid = data.size() - data_begin; - int32_t const char_begin = base_tile_idx * ITEMS_PER_TILE; + int32_t const tile_idx = base_tile_idx + blockIdx.x; + int32_t const tile_input_offset = blockIdx.x * ITEMS_PER_TILE; + int32_t const thread_input_offset = tile_input_offset + threadIdx.x * ITEMS_PER_THREAD; + int32_t const thread_input_size = chunk_input_chars.size() - thread_input_offset; // STEP 1: Load inputs - char thread_data[ITEMS_PER_THREAD]; + char thread_chars[ITEMS_PER_THREAD]; - for (int32_t i = 0; i < ITEMS_PER_THREAD and i < num_valid; i++) { // - thread_data[i] = data[data_begin + i]; - } + InputLoad(temp_storage.input_load) + .Load(chunk_input_chars.data() + tile_input_offset, + thread_chars, + chunk_input_chars.size() - tile_input_offset); // STEP 2: Scan inputs to determine absolute thread states uint32_t thread_states[ITEMS_PER_THREAD]; + __syncthreads(); // required before temp_memory re-use PatternScan(temp_storage.pattern_scan) // - .Scan(tile_idx, tile_multistates, trie, thread_data, thread_states); + .Scan(tile_idx, tile_multistates, trie, thread_chars, thread_states); // STEP 3: Flag matches uint32_t thread_offsets[ITEMS_PER_THREAD]; for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) { - thread_offsets[i] = i < num_valid and trie.is_match(thread_states[i]); + thread_offsets[i] = i < thread_input_size and trie.is_match(thread_states[i]); } // STEP 4: Scan flags to determine absolute thread output offset __syncthreads(); // required before temp_memory re-use - auto prefix_callback = OffsetScanCallback(temp_storage.offset_scan_callback, tile_output_offsets, tile_idx); @@ -173,17 +178,21 @@ __global__ void multibyte_split_kernel( // Step 5: Assign outputs from each thread using match offsets. - if (data_out.size() > 0) { - for (int32_t i = 0; i < ITEMS_PER_THREAD and i < num_valid; i++) { - data_out[data_begin + i] = thread_data[i]; + if (blockIdx.x == 0 and threadIdx.x == 0) { + printf("tile(%2u), cio(%9i)\n", tile_idx, chunk_input_offset); + } + + if (abs_output_chars.size() > 0) { + for (int32_t i = 0; i < ITEMS_PER_THREAD and i < thread_input_size; i++) { + abs_output_chars[chunk_input_offset + thread_input_offset + i] = thread_chars[i]; } } - if (string_offsets.size() > 0) { - for (int32_t i = 0; i < ITEMS_PER_THREAD and i < num_valid; i++) { + if (abs_output_delimiter_offsets.size() > 0) { + for (int32_t i = 0; i < ITEMS_PER_THREAD and i < thread_input_size; i++) { if (trie.get_match_length(thread_states[i]) > 0) { - auto const match_end = char_begin + data_begin + i + 1; - string_offsets[thread_offsets[i]] = match_end; + auto const match_end = base_tile_idx * ITEMS_PER_TILE + thread_input_offset + i + 1; + abs_output_delimiter_offsets[thread_offsets[i]] = match_end; } } } @@ -260,7 +269,7 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour rmm::cuda_stream_pool& stream_pool) { CUDF_FUNC_RANGE(); - cudf::size_type bytes_total = 0; + cudf::size_type chunk_offset = 0; // this function interleaves three kernel executions @@ -288,8 +297,6 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour if (chunk.size() == 0) { break; } - bytes_total += chunk.size(); - // reset the next chunk of tile state multibyte_split_init_kernel<<>>( // base_tile_idx, @@ -302,14 +309,17 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour tile_multistates, tile_offsets, trie.view(), + chunk_offset, chunk, output_buffer, output_char_buffer); + + chunk_offset += chunk.size(); } join_pool_to_stream(stream_pool, stream); - return bytes_total; + return chunk_offset; } std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source& source, diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 54f73210d72..f5fa8455edf 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -67,7 +67,7 @@ TEST_F(MultibyteSplitTest, LargeInput) auto host_input = std::string(); auto host_expected = std::vector(); - for (auto i = 0; i < 1000; i++) { + for (auto i = 0; i < (32 * 32 * 512); i++) { host_input += ":::::"; host_input += "....."; host_expected.emplace_back(std::string(":::::")); @@ -81,7 +81,13 @@ TEST_F(MultibyteSplitTest, LargeInput) auto source = cudf::io::text::make_source(host_input); auto out = cudf::io::text::multibyte_split(*source, delimiters); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(cudf::strings_column_view(expected).chars(), + cudf::strings_column_view(*out).chars()); + + // CUDF_TEST_EXPECT_COLUMNS_EQUAL(cudf::strings_column_view(expected).offsets(), + // cudf::strings_column_view(*out).offsets()); + + // CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } TEST_F(MultibyteSplitTest, OverlappingMatchErasure) From ee817b15432f63e5c7a1b53619f3c9d87bb9b470 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 2 Aug 2021 23:09:22 -0500 Subject: [PATCH 56/80] improve data_chunk_reader docs --- cpp/include/cudf/io/text/data_chunk_source.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp index 10ec735dad5..a7e1c9f139c 100644 --- a/cpp/include/cudf/io/text/data_chunk_source.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -46,7 +46,12 @@ struct data_chunk { class data_chunk_reader { public: /** - * @brief Get the next chunk of data + * @brief Get the next chunk of bytes from the data source + * + * Performs any necessary work to read and prepare the underlying data source for consumption as a + * view over device memory. Common implementations may read from a file, copy data from host + * memory, allocate temporary memory, perform iterative decompression, or even launch device + * kernels. * * @param size desired number of bytes * @param stream stream to associate allocations or perform work required to obtain chunk From 4cdbee5d58669ec31aea405d306dba0bbbf18740 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 2 Aug 2021 23:17:25 -0500 Subject: [PATCH 57/80] make multibyte_split accept data_chunk_source as a const& arg --- cpp/include/cudf/io/text/data_chunk_source.hpp | 2 +- cpp/include/cudf/io/text/data_chunk_source_factories.hpp | 6 +++--- cpp/include/cudf/io/text/multibyte_split.hpp | 2 +- cpp/src/io/text/multibyte_split.cu | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp index a7e1c9f139c..3132c94f3bf 100644 --- a/cpp/include/cudf/io/text/data_chunk_source.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -67,7 +67,7 @@ class data_chunk_reader { */ class data_chunk_source { public: - virtual std::unique_ptr create_reader() = 0; + virtual std::unique_ptr create_reader() const = 0; }; } // namespace text diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index 91a07dde292..2d0893be014 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -157,7 +157,7 @@ class device_span_data_chunk_reader : public data_chunk_reader { class file_data_chunk_source : public data_chunk_source { public: file_data_chunk_source(std::string filename) : _filename(filename) {} - std::unique_ptr create_reader() override + std::unique_ptr create_reader() const override { return std::make_unique( std::make_unique(_filename, std::ifstream::in)); @@ -173,7 +173,7 @@ class file_data_chunk_source : public data_chunk_source { class string_data_chunk_source : public data_chunk_source { public: string_data_chunk_source(std::string const& data) : _data(data) {} - std::unique_ptr create_reader() override + std::unique_ptr create_reader() const override { return std::make_unique(std::make_unique(_data)); } @@ -188,7 +188,7 @@ class string_data_chunk_source : public data_chunk_source { class device_span_data_chunk_source : public data_chunk_source { public: device_span_data_chunk_source(device_span data) : _data(data) {} - std::unique_ptr create_reader() override + std::unique_ptr create_reader() const override { return std::make_unique(_data); } diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index 93b9660d443..88f4c7d3819 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -29,7 +29,7 @@ namespace io { namespace text { std::unique_ptr multibyte_split( - data_chunk_source& source, + data_chunk_source const& source, std::vector const& delimiters, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index cb59fee6c83..a27e58eb150 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -259,7 +259,7 @@ void join_pool_to_stream(rmm::cuda_stream_pool& stream_pool, rmm::cuda_stream_vi cudaEventDestroy(event); } -cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_source& source, +cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_source const& source, cudf::io::text::detail::trie const& trie, scan_tile_state& tile_multistates, scan_tile_state& tile_offsets, @@ -322,7 +322,7 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour return chunk_offset; } -std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source& source, +std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::vector const& delimiters, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -379,7 +379,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source& } // namespace detail -std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source& source, +std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::vector const& delimiters, rmm::mr::device_memory_resource* mr) { From c3783dbcab14e91bacf4c0154d8bf6402edf24cd Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 2 Aug 2021 23:27:16 -0500 Subject: [PATCH 58/80] add tile_state.hpp to meta.yaml --- conda/recipes/libcudf/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 2c6ebda3376..437ea8bc8ed 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -135,6 +135,7 @@ test: - test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp - test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp - test -f $PREFIX/include/cudf/io/text/detail/multistate.hpp + - test -f $PREFIX/include/cudf/io/text/detail/tile_state.hpp - test -f $PREFIX/include/cudf/io/text/detail/trie.hpp - test -f $PREFIX/include/cudf/io/text/multibyte_split.hpp - test -f $PREFIX/include/cudf/io/types.hpp From 432399c209bdfb2fb6e68cbe6dcf231b707391e9 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 3 Aug 2021 16:25:46 -0500 Subject: [PATCH 59/80] create bad-case scenario benchmark --- .../io/text/multibyte_split_benchmark.cpp | 41 +++++++++++++++---- .../cudf/io/text/detail/multistate.hpp | 4 +- cpp/src/io/text/multibyte_split.cu | 20 ++++----- 3 files changed, 42 insertions(+), 23 deletions(-) diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp index 0a9ffe7cbed..f022de09502 100644 --- a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp +++ b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp @@ -35,11 +35,19 @@ using cudf::test::fixed_width_column_wrapper; temp_directory const temp_dir("cudf_gbench"); +enum data_chunk_source_type { + file, + host, + device, +}; + static void BM_multibyte_split(benchmark::State& state) { - auto delimiters = std::vector({"😀", "😎", ",", "::"}); + auto num_chars = state.range(0); + auto source_type = static_cast(state.range(1)); - int32_t num_chars = state.range(0); + // it would be better if we initialized these chars on gpu, then scattered-in some delimiters, + // then copied them back to host auto host_input = std::string(num_chars, 'x'); auto device_input = cudf::string_scalar(host_input); @@ -54,9 +62,25 @@ static void BM_multibyte_split(benchmark::State& state) cudaDeviceSynchronize(); - auto source = cudf::io::text::make_source_from_file(temp_file_name); - // auto source = cudf::io::text::make_source(device_input); - // auto source = cudf::io::text::make_source(host_input); + auto source = std::unique_ptr(nullptr); + + switch (source_type) { + case data_chunk_source_type::file: // + source = cudf::io::text::make_source_from_file(temp_file_name); + state.SetLabel("from file"); + break; + case data_chunk_source_type::host: // + source = cudf::io::text::make_source(host_input); + state.SetLabel("from host"); + break; + case data_chunk_source_type::device: // + source = cudf::io::text::make_source(device_input); + state.SetLabel("from device"); + break; + default: CUDF_FAIL(); + } + + auto delimiters = std::vector({"x"}); for (auto _ : state) { cuda_event_timer raii(state, true); @@ -75,8 +99,11 @@ class MultibyteSplitBenchmark : public cudf::benchmark { BM_multibyte_split(state); \ } \ BENCHMARK_REGISTER_F(MultibyteSplitBenchmark, name) \ - ->Range(1 << 30, 1 << 30) \ + ->ArgsProduct({{1 << 15, 1 << 30}, \ + {data_chunk_source_type::file, \ + data_chunk_source_type::host, \ + data_chunk_source_type::device}}) \ ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); + ->Unit(::benchmark::kMillisecond); TRANSPOSE_BM_BENCHMARK_DEFINE(multibyte_split_simple); diff --git a/cpp/include/cudf/io/text/detail/multistate.hpp b/cpp/include/cudf/io/text/detail/multistate.hpp index 164a1ae61d5..5ccf6765028 100644 --- a/cpp/include/cudf/io/text/detail/multistate.hpp +++ b/cpp/include/cudf/io/text/detail/multistate.hpp @@ -71,8 +71,6 @@ struct multistate { multistate_segment _segments[N]; }; -// lhs contains only zero? - /** * @brief associatively inner-joins transition histories. * @@ -91,7 +89,7 @@ struct multistate { constexpr multistate operator+(multistate const& lhs, multistate const& rhs) { // combine two multistates together by full-joining LHS tails to RHS heads, - // and taking the corosponding LHS heads and RHS tails. + // and taking the corresponding LHS heads and RHS tails. multistate result; for (uint8_t lhs_idx = 0; lhs_idx < lhs.size(); lhs_idx++) { diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index a27e58eb150..1193cbc17d8 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -114,7 +114,6 @@ __global__ void multibyte_split_init_kernel( __global__ void multibyte_split_kernel( cudf::size_type base_tile_idx, - cudf::size_type num_tiles, cudf::io::text::detail::scan_tile_state_view tile_multistates, cudf::io::text::detail::scan_tile_state_view tile_output_offsets, cudf::io::text::detail::trie_device_view trie, @@ -178,10 +177,6 @@ __global__ void multibyte_split_kernel( // Step 5: Assign outputs from each thread using match offsets. - if (blockIdx.x == 0 and threadIdx.x == 0) { - printf("tile(%2u), cio(%9i)\n", tile_idx, chunk_input_offset); - } - if (abs_output_chars.size() > 0) { for (int32_t i = 0; i < ITEMS_PER_THREAD and i < thread_input_size; i++) { abs_output_chars[chunk_input_offset + thread_input_offset + i] = thread_chars[i]; @@ -271,8 +266,6 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour CUDF_FUNC_RANGE(); cudf::size_type chunk_offset = 0; - // this function interleaves three kernel executions - multibyte_split_init_kernel<<>>( // -TILES_PER_CHUNK, TILES_PER_CHUNK, @@ -281,8 +274,7 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour cudf::io::text::detail::scan_tile_status::oob); auto multistate_seed = multistate(); - - multistate_seed.enqueue(0, 0); + multistate_seed.enqueue(0, 0); // this represents the first state in the pattern. tile_multistates.set_seed_async(multistate_seed, stream); tile_offsets.set_seed_async(0, stream); @@ -297,15 +289,17 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour if (chunk.size() == 0) { break; } + auto tiles_in_launch = ceil_div(chunk.size(), ITEMS_PER_TILE); + // reset the next chunk of tile state - multibyte_split_init_kernel<<>>( // + multibyte_split_init_kernel<<>>( // base_tile_idx, - TILES_PER_CHUNK, + tiles_in_launch, tile_multistates, tile_offsets); - multibyte_split_kernel<<>>( // + + multibyte_split_kernel<<>>( // base_tile_idx, - TILES_PER_CHUNK, tile_multistates, tile_offsets, trie.view(), From ad21c4fc379111e5fa5a1685f22e66d24e292aa5 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 4 Aug 2021 00:19:09 -0500 Subject: [PATCH 60/80] remove data_chunk in favor of device_span until it becomes clear an raii type is required --- .../cudf/io/text/data_chunk_source.hpp | 24 +++++++------------ .../io/text/data_chunk_source_factories.hpp | 8 +++---- 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp index 3132c94f3bf..012cb564bbf 100644 --- a/cpp/include/cudf/io/text/data_chunk_source.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -26,21 +26,13 @@ namespace io { namespace text { /** - * @brief represents a possibly-shared view over device memory. - */ -struct data_chunk { - data_chunk(device_span data) : _data(data) {} - - operator cudf::device_span() { return _data; } - - uint32_t size() const { return _data.size(); } - - private: - device_span _data; -}; - -/** - * @brief a reader capable of producing views over device memory + * @brief a reader capable of producing views over device memory. + * + * The data chunk reader API encapsulates the idea of statefully traversing and loading a data + * source. A data source may be a file, a region of device memory, or a region of host memory. + * Reading data from these data sources efficiently requires different strategies dependings on the + * type of data source, type of compression, capabilities of the host and device, the data's + * destination. Whole-file decompression should be hidden behind this interface * */ class data_chunk_reader { @@ -57,7 +49,7 @@ class data_chunk_reader { * @param stream stream to associate allocations or perform work required to obtain chunk * @return a chunk of data up to @param size bytes, or less if no more data is avaialable */ - virtual data_chunk get_next_chunk(uint32_t size, rmm::cuda_stream_view stream) = 0; + virtual device_span get_next_chunk(uint32_t size, rmm::cuda_stream_view stream) = 0; }; /** diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index 2d0893be014..64f3522f92e 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -76,7 +76,7 @@ class istream_data_chunk_reader : public data_chunk_reader { return device_span(static_cast(_buffers[stream.value()].data()), size); } - data_chunk get_next_chunk(uint32_t read_size, rmm::cuda_stream_view stream) override + device_span get_next_chunk(uint32_t read_size, rmm::cuda_stream_view stream) override { CUDF_FUNC_RANGE(); @@ -111,7 +111,7 @@ class istream_data_chunk_reader : public data_chunk_reader { CUDA_TRY(cudaEventRecord(ticket.event, stream.value())); // return the view over device memory so it can be processed. - return data_chunk(chunk_span); + return chunk_span; } private: @@ -130,7 +130,7 @@ class device_span_data_chunk_reader : public data_chunk_reader { public: device_span_data_chunk_reader(device_span data) : _data(data) {} - data_chunk get_next_chunk(uint32_t read_size, rmm::cuda_stream_view stream) override + device_span get_next_chunk(uint32_t read_size, rmm::cuda_stream_view stream) override { // limit the read size to the number of bytes remaining in the device_span. if (read_size > _data.size() - _position) { read_size = _data.size() - _position; } @@ -142,7 +142,7 @@ class device_span_data_chunk_reader : public data_chunk_reader { _position += read_size; // return the view over device memory so it can be processed. - return data_chunk(chunk_span); + return chunk_span; } private: From 18e0863f7428c43b3c82ec88cb5890e0f1f5ab24 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 4 Aug 2021 02:03:26 -0500 Subject: [PATCH 61/80] use std::vector instread of stream_pool --- cpp/src/io/text/multibyte_split.cu | 53 +++++++++++++++++++----------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 1193cbc17d8..4b7ae0a47fb 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -232,28 +232,37 @@ std::unique_ptr create_strings_column(rmm::device_uvector&& chars, num_strings, std::move(offsets_column), std::move(chars_column), 0, {}, stream, mr); } -void fork_stream_to_pool(rmm::cuda_stream_view stream, rmm::cuda_stream_pool& stream_pool) +void fork_stream(std::vector streams, rmm::cuda_stream_view stream) { cudaEvent_t event; cudaEventCreate(&event); cudaEventRecord(event, stream); - for (uint32_t i = 0; i < stream_pool.get_pool_size(); i++) { - cudaStreamWaitEvent(stream_pool.get_stream(i), event, 0); + for (uint32_t i = 0; i < streams.size(); i++) { + cudaStreamWaitEvent(streams[i], event, 0); } cudaEventDestroy(event); } -void join_pool_to_stream(rmm::cuda_stream_pool& stream_pool, rmm::cuda_stream_view stream) +void join_stream(std::vector streams, rmm::cuda_stream_view stream) { cudaEvent_t event; cudaEventCreate(&event); - for (uint32_t i = 0; i < stream_pool.get_pool_size(); i++) { - cudaEventRecord(event, stream_pool.get_stream(i)); + for (uint32_t i = 0; i < streams.size(); i++) { + cudaEventRecord(event, streams[i]); cudaStreamWaitEvent(stream, event, 0); } cudaEventDestroy(event); } +std::vector get_streams(int32_t count, rmm::cuda_stream_pool& stream_pool) +{ + auto streams = std::vector(); + for (int32_t i = 0; i < count; i++) { + streams.emplace_back(stream_pool.get_stream()); + } + return streams; +} + cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_source const& source, cudf::io::text::detail::trie const& trie, scan_tile_state& tile_multistates, @@ -261,7 +270,7 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour device_span output_buffer, device_span output_char_buffer, rmm::cuda_stream_view stream, - rmm::cuda_stream_pool& stream_pool) + std::vector const& streams) { CUDF_FUNC_RANGE(); cudf::size_type chunk_offset = 0; @@ -279,13 +288,14 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour tile_multistates.set_seed_async(multistate_seed, stream); tile_offsets.set_seed_async(0, stream); - fork_stream_to_pool(stream, stream_pool); + fork_stream(streams, stream); auto reader = source.create_reader(); - for (auto base_tile_idx = 0; true; base_tile_idx += TILES_PER_CHUNK) { - auto chunk_stream = stream_pool.get_stream(); - auto chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, chunk_stream); + for (int32_t i = 0; true; i++) { + auto base_tile_idx = i * TILES_PER_CHUNK; + auto chunk_stream = streams[i % streams.size()]; + auto chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, chunk_stream); if (chunk.size() == 0) { break; } @@ -311,7 +321,7 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour chunk_offset += chunk.size(); } - join_pool_to_stream(stream_pool, stream); + join_stream(streams, stream); return chunk_offset; } @@ -319,7 +329,8 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::vector const& delimiters, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* mr, + rmm::cuda_stream_pool& stream_pool) { CUDF_FUNC_RANGE(); auto const trie = cudf::io::text::detail::trie::create(delimiters, stream); @@ -331,7 +342,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source auto tile_multistates = scan_tile_state(num_tile_states, stream); auto tile_offsets = scan_tile_state(num_tile_states, stream); - auto stream_pool = rmm::cuda_stream_pool(concurrency); + auto streams = get_streams(concurrency, stream_pool); auto bytes_total = multibyte_split_scan_full_source(source, @@ -341,10 +352,9 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source cudf::device_span(static_cast(nullptr), 0), cudf::device_span(static_cast(nullptr), 0), stream, - stream_pool); - - // allocate string offsets + streams); + // allocate results auto num_tiles = ceil_div(bytes_total, ITEMS_PER_TILE); auto num_results = tile_offsets.get_inclusive_prefix(num_tiles - 1, stream); auto string_offsets = rmm::device_uvector(num_results + 2, stream, mr); @@ -364,7 +374,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source cudf::device_span(string_offsets).subspan(1, num_results), string_chars, stream, - stream_pool); + streams); auto res = create_strings_column(std::move(string_chars), std::move(string_offsets), stream, mr); @@ -377,9 +387,12 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source std::vector const& delimiters, rmm::mr::device_memory_resource* mr) { - auto stream = rmm::cuda_stream_default; - auto result = detail::multibyte_split(source, delimiters, stream, mr); + auto stream = rmm::cuda_stream_default; + auto stream_pool = rmm::cuda_stream_pool(2); + auto result = detail::multibyte_split(source, delimiters, stream, mr, stream_pool); + stream.synchronize(); + return result; } From 45e5b6549aff97f154ed7f8915af14c858878b16 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 4 Aug 2021 02:12:45 -0500 Subject: [PATCH 62/80] rename ticket to h_ticket --- .../cudf/io/text/data_chunk_source_factories.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index 64f3522f92e..76903b25d97 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -80,18 +80,18 @@ class istream_data_chunk_reader : public data_chunk_reader { { CUDF_FUNC_RANGE(); - auto& ticket = _tickets[_next_ticket_idx]; + auto& h_ticket = _tickets[_next_ticket_idx]; _next_ticket_idx = (_next_ticket_idx + 1) % _tickets.size(); // synchronize on the last host-to-device copy, so we don't clobber the host buffer. - CUDA_TRY(cudaEventSynchronize(ticket.event)); + CUDA_TRY(cudaEventSynchronize(h_ticket.event)); // resize the host buffer as necessary to contain the requested number of bytes - if (ticket.buffer.size() < read_size) { ticket.buffer.resize(read_size); } + if (h_ticket.buffer.size() < read_size) { h_ticket.buffer.resize(read_size); } // read data from the host istream in to the pinned host memory buffer - _datastream->read(ticket.buffer.data(), read_size); + _datastream->read(h_ticket.buffer.data(), read_size); // adjust the read size to reflect how many bytes were actually read from the data stream read_size = _datastream->gcount(); @@ -102,13 +102,13 @@ class istream_data_chunk_reader : public data_chunk_reader { // copy the host-pinned data on to device CUDA_TRY(cudaMemcpyAsync( // chunk_span.data(), - ticket.buffer.data(), + h_ticket.buffer.data(), read_size, cudaMemcpyHostToDevice, stream.value())); // record the host-to-device copy. - CUDA_TRY(cudaEventRecord(ticket.event, stream.value())); + CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); // return the view over device memory so it can be processed. return chunk_span; From ee122a81747c06676dd49e53aa339bce6c03077a Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 4 Aug 2021 18:16:38 -0500 Subject: [PATCH 63/80] adjust `scan_tile_state_view::get_prefix` to make the purpose of thread fence more obvious. --- .../cudf/io/text/detail/tile_state.hpp | 28 ++++++------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/cpp/include/cudf/io/text/detail/tile_state.hpp b/cpp/include/cudf/io/text/detail/tile_state.hpp index 95c4ec8beca..fe62486cd35 100644 --- a/cpp/include/cudf/io/text/detail/tile_state.hpp +++ b/cpp/include/cudf/io/text/detail/tile_state.hpp @@ -53,9 +53,11 @@ struct scan_tile_state_view { { auto const offset = (tile_idx + num_tiles) % num_tiles; - while ((status = cub::ThreadLoad(tile_status + offset)) == - scan_tile_status::invalid) { + while (true) { + status = cub::ThreadLoad(tile_status + offset); + // prevent break-condition from being hoisted out of the loop? __threadfence(); + if (status != scan_tile_status::invalid) { break; } } if (status == scan_tile_status::partial) { @@ -64,15 +66,6 @@ struct scan_tile_state_view { return cub::ThreadLoad(tile_inclusive + offset); } } - - __device__ inline T get_inclusive_prefix(cudf::size_type tile_idx) - { - auto const offset = (tile_idx + num_tiles) % num_tiles; - while (cub::ThreadLoad(tile_status + offset) != scan_tile_status::inclusive) { - __threadfence(); - } - return cub::ThreadLoad(tile_inclusive + offset); - } }; template @@ -100,14 +93,12 @@ struct scan_tile_state { inline void set_seed_async(T const seed, rmm::cuda_stream_view stream) { - auto x = tile_status.size(); - auto y = scan_tile_status::inclusive; - tile_state_inclusive.set_element_async(x - 1, seed, stream); - tile_status.set_element_async(x - 1, y, stream); + auto size = tile_status.size(); + auto status = scan_tile_status::inclusive; + tile_state_inclusive.set_element_async(size - 1, seed, stream); + tile_status.set_element_async(size - 1, status, stream); } - // T back_element(rmm::cuda_stream_view stream) const { return tile_state.back_element(stream); } - inline T get_inclusive_prefix(cudf::size_type tile_idx, rmm::cuda_stream_view stream) const { auto const offset = (tile_idx + tile_status.size()) % tile_status.size(); @@ -117,10 +108,7 @@ struct scan_tile_state { template struct scan_tile_state_callback { - using WarpReduce = cub::WarpReduce; - struct _TempStorage { - typename WarpReduce::TempStorage reduce; T exclusive_prefix; }; From ca6bbac41dd675b1018df6b29f4b5ddace0e7ddd Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 5 Aug 2021 20:51:43 -0500 Subject: [PATCH 64/80] fix UB in multibyte_split concurrent kernel execution, improve perf --- .../cudf/io/text/detail/tile_state.hpp | 2 -- cpp/include/cudf/io/text/multibyte_split.hpp | 1 - cpp/src/io/text/multibyte_split.cu | 19 +++++++++++++------ cpp/tests/io/text/multibyte_split_test.cpp | 10 ++-------- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/cpp/include/cudf/io/text/detail/tile_state.hpp b/cpp/include/cudf/io/text/detail/tile_state.hpp index fe62486cd35..031561203a1 100644 --- a/cpp/include/cudf/io/text/detail/tile_state.hpp +++ b/cpp/include/cudf/io/text/detail/tile_state.hpp @@ -146,8 +146,6 @@ struct scan_tile_state_callback { _tile_state.set_inclusive_prefix(_tile_idx, _temp_storage.exclusive_prefix + block_aggregate); } - __syncthreads(); // TODO: remove if unnecessary. - return _temp_storage.exclusive_prefix; } diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index 88f4c7d3819..6fe5358ac83 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -21,7 +21,6 @@ #include -#include #include namespace cudf { diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 4b7ae0a47fb..462969631c2 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -30,7 +30,6 @@ #include #include -#include #include namespace { @@ -43,12 +42,11 @@ constexpr decltype(auto) ceil_div(Dividend dividend, Divisor divisor) using cudf::io::text::detail::multistate; -int32_t constexpr ITEMS_PER_THREAD = 32; // influences register pressure -int32_t constexpr THREADS_PER_TILE = 32; // must be >= 32. bugged for > 32, needs fix +int32_t constexpr ITEMS_PER_THREAD = 32; +int32_t constexpr THREADS_PER_TILE = 128; int32_t constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; -int32_t constexpr TILES_PER_CHUNK = 512; -// keep ITEMS_PER_CHUNK below input size to force multi-tile execution. -int32_t constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; +int32_t constexpr TILES_PER_CHUNK = 1024; +int32_t constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; struct PatternScan { typedef cub::BlockScan BlockScan; @@ -292,6 +290,9 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour auto reader = source.create_reader(); + cudaEvent_t last_launch_event; + cudaEventCreate(&last_launch_event); + for (int32_t i = 0; true; i++) { auto base_tile_idx = i * TILES_PER_CHUNK; auto chunk_stream = streams[i % streams.size()]; @@ -308,6 +309,8 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour tile_multistates, tile_offsets); + cudaStreamWaitEvent(chunk_stream, last_launch_event, 0); + multibyte_split_kernel<<>>( // base_tile_idx, tile_multistates, @@ -318,9 +321,13 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour output_buffer, output_char_buffer); + cudaEventRecord(last_launch_event, chunk_stream); + chunk_offset += chunk.size(); } + cudaEventDestroy(last_launch_event); + join_stream(streams, stream); return chunk_offset; diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index f5fa8455edf..345d97a8081 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -67,7 +67,7 @@ TEST_F(MultibyteSplitTest, LargeInput) auto host_input = std::string(); auto host_expected = std::vector(); - for (auto i = 0; i < (32 * 32 * 512); i++) { + for (auto i = 0; i < (32 * 128 * 1024); i++) { host_input += ":::::"; host_input += "....."; host_expected.emplace_back(std::string(":::::")); @@ -81,13 +81,7 @@ TEST_F(MultibyteSplitTest, LargeInput) auto source = cudf::io::text::make_source(host_input); auto out = cudf::io::text::multibyte_split(*source, delimiters); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(cudf::strings_column_view(expected).chars(), - cudf::strings_column_view(*out).chars()); - - // CUDF_TEST_EXPECT_COLUMNS_EQUAL(cudf::strings_column_view(expected).offsets(), - // cudf::strings_column_view(*out).offsets()); - - // CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } TEST_F(MultibyteSplitTest, OverlappingMatchErasure) From d68d9511df6ffb9dea132ddf54f0b817512f3ea1 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 5 Aug 2021 21:19:34 -0500 Subject: [PATCH 65/80] add error messages to multibyte_split to indicate unsupported use cases --- .../cudf/io/text/detail/multistate.hpp | 7 +++--- cpp/include/cudf/io/text/detail/trie.hpp | 24 +++++++++++++++++-- cpp/src/io/text/multibyte_split.cu | 9 ++++++- 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/io/text/detail/multistate.hpp b/cpp/include/cudf/io/text/detail/multistate.hpp index 5ccf6765028..d7b0275b9cc 100644 --- a/cpp/include/cudf/io/text/detail/multistate.hpp +++ b/cpp/include/cudf/io/text/detail/multistate.hpp @@ -29,6 +29,7 @@ namespace detail { */ struct multistate_segment { public: + static auto constexpr max_states = 16; constexpr multistate_segment() : _data(0) {} constexpr multistate_segment(uint8_t head, uint8_t tail) : _data((head & 0b1111) | (tail << 4)) {} @@ -44,6 +45,7 @@ struct multistate_segment { */ struct multistate { public: + static auto constexpr max_segments = 7; constexpr void enqueue(uint8_t head, uint8_t tail) { _segments[_size++] = multistate_segment(head, tail); @@ -66,9 +68,8 @@ struct multistate { constexpr uint8_t get_tail(uint8_t idx) const { return _segments[idx].get_tail(); } private: - static auto constexpr N = 7; - uint8_t _size = 0; - multistate_segment _segments[N]; + uint8_t _size = 0; + multistate_segment _segments[max_segments]; }; /** diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp index 14f66ec4f73..3fa3344c91d 100644 --- a/cpp/include/cudf/io/text/detail/trie.hpp +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -23,8 +23,10 @@ #include #include +#include #include #include +#include #include namespace cudf { @@ -118,10 +120,18 @@ struct trie_device_view { struct trie { private: + cudf::size_type _max_duplicate_tokens; rmm::device_uvector _nodes; public: - trie(rmm::device_uvector&& nodes) : _nodes(std::move(nodes)) {} + trie(cudf::size_type max_duplicate_tokens, rmm::device_uvector&& nodes) + : _max_duplicate_tokens(max_duplicate_tokens), _nodes(std::move(nodes)) + { + } + + cudf::size_type size() const { return _nodes.size(); } + + cudf::size_type max_duplicate_tokens() const { return _max_duplicate_tokens; } static trie create(std::string const& pattern, rmm::cuda_stream_view stream, @@ -175,12 +185,22 @@ struct trie { match_length.emplace_back(0); std::vector trie_nodes; + auto token_counts = std::unordered_map(); for (uint32_t i = 0; i < tokens.size(); i++) { trie_nodes.emplace_back(trie_node{tokens[i], match_length[i], transitions[i]}); + token_counts[tokens[i]]++; } - return trie{cudf::detail::make_device_uvector_async(trie_nodes, stream, mr)}; + auto most_common_token = + std::max_element(token_counts.begin(), token_counts.end(), [](auto const& a, auto const& b) { + return a.second < b.second; + }); + + auto max_duplicate_tokens = most_common_token->second; + + return trie{max_duplicate_tokens, + cudf::detail::make_device_uvector_sync(trie_nodes, stream, mr)}; } trie_device_view view() const { return trie_device_view{_nodes}; } diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 462969631c2..e0a8add7cf5 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -340,7 +340,14 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source rmm::cuda_stream_pool& stream_pool) { CUDF_FUNC_RANGE(); - auto const trie = cudf::io::text::detail::trie::create(delimiters, stream); + auto const trie = cudf::io::text::detail::trie::create(delimiters, stream); + + CUDF_EXPECTS(trie.max_duplicate_tokens() <= multistate::max_segments, + "delimiters must be representable by a trie with no more than 7 duplicate tokens"); + + CUDF_EXPECTS(trie.size() <= multistate_segment::max_states, + "delimiters must be representable by a trie with no more than 16 unique states"); + auto concurrency = 2; // must be at least 32 when using warp-reduce on partials // must be at least 1 more than max possible concurrent tiles From 9684646dddf61656732ab7d1d193760015ed99be Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sun, 8 Aug 2021 22:39:24 -0500 Subject: [PATCH 66/80] remove __threadfence() in favor of cuda::atomic --- .../cudf/io/text/detail/tile_state.hpp | 42 ++++++------------- cpp/src/io/text/multibyte_split.cu | 28 +++++++++++-- 2 files changed, 37 insertions(+), 33 deletions(-) diff --git a/cpp/include/cudf/io/text/detail/tile_state.hpp b/cpp/include/cudf/io/text/detail/tile_state.hpp index 031561203a1..f40d0aa8054 100644 --- a/cpp/include/cudf/io/text/detail/tile_state.hpp +++ b/cpp/include/cudf/io/text/detail/tile_state.hpp @@ -3,6 +3,8 @@ #include +#include + namespace cudf { namespace io { namespace text { @@ -18,47 +20,36 @@ enum class scan_tile_status : uint8_t { template struct scan_tile_state_view { uint64_t num_tiles; - scan_tile_status* tile_status; + cuda::atomic* tile_status; T* tile_partial; T* tile_inclusive; - __device__ inline void initialize_status(cudf::size_type base_tile_idx, - cudf::size_type count, - scan_tile_status status) + __device__ inline void set_status(cudf::size_type tile_idx, scan_tile_status status) { - auto thread_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_idx < count) { // - // this is UB if tile_status gets assigned from multiple threads. - tile_status[(base_tile_idx + thread_idx) % num_tiles] = status; - } + auto const offset = (tile_idx + num_tiles) % num_tiles; + tile_status[offset].store(status); } __device__ inline void set_partial_prefix(cudf::size_type tile_idx, T value) { auto const offset = (tile_idx + num_tiles) % num_tiles; cub::ThreadStore(tile_partial + offset, value); - __threadfence(); - cub::ThreadStore(tile_status + offset, scan_tile_status::partial); + tile_status[offset].store(scan_tile_status::partial); } __device__ inline void set_inclusive_prefix(cudf::size_type tile_idx, T value) { auto const offset = (tile_idx + num_tiles) % num_tiles; cub::ThreadStore(tile_inclusive + offset, value); - __threadfence(); - cub::ThreadStore(tile_status + offset, scan_tile_status::inclusive); + tile_status[offset].store(scan_tile_status::inclusive); } __device__ inline T get_prefix(cudf::size_type tile_idx, scan_tile_status& status) { auto const offset = (tile_idx + num_tiles) % num_tiles; - while (true) { - status = cub::ThreadLoad(tile_status + offset); - // prevent break-condition from being hoisted out of the loop? - __threadfence(); - if (status != scan_tile_status::invalid) { break; } - } + while ((status = tile_status[offset].load(cuda::memory_order_relaxed)) == + scan_tile_status::invalid) {} if (status == scan_tile_status::partial) { return cub::ThreadLoad(tile_partial + offset); @@ -70,14 +61,15 @@ struct scan_tile_state_view { template struct scan_tile_state { - rmm::device_uvector tile_status; + rmm::device_uvector> tile_status; rmm::device_uvector tile_state_partial; rmm::device_uvector tile_state_inclusive; scan_tile_state(cudf::size_type num_tiles, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : tile_status(rmm::device_uvector(num_tiles, stream, mr)), + : tile_status(rmm::device_uvector>( + num_tiles, stream, mr)), tile_state_partial(rmm::device_uvector(num_tiles, stream, mr)), tile_state_inclusive(rmm::device_uvector(num_tiles, stream, mr)) { @@ -91,14 +83,6 @@ struct scan_tile_state { tile_state_inclusive.data()}; } - inline void set_seed_async(T const seed, rmm::cuda_stream_view stream) - { - auto size = tile_status.size(); - auto status = scan_tile_status::inclusive; - tile_state_inclusive.set_element_async(size - 1, seed, stream); - tile_status.set_element_async(size - 1, status, stream); - } - inline T get_inclusive_prefix(cudf::size_type tile_idx, rmm::cuda_stream_view stream) const { auto const offset = (tile_idx + tile_status.size()) % tile_status.size(); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index e0a8add7cf5..db32960abf9 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -106,8 +106,25 @@ __global__ void multibyte_split_init_kernel( cudf::io::text::detail::scan_tile_status status = cudf::io::text::detail::scan_tile_status::invalid) { - tile_multistates.initialize_status(base_tile_idx, num_tiles, status); - tile_output_offsets.initialize_status(base_tile_idx, num_tiles, status); + auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_idx < num_tiles) { + auto const tile_idx = base_tile_idx + thread_idx; + tile_multistates.set_status(tile_idx, status); + tile_output_offsets.set_status(tile_idx, status); + } +} + +__global__ void multibyte_split_seed_kernel( + cudf::io::text::detail::scan_tile_state_view tile_multistates, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + multistate tile_multistate_seed, + uint32_t tile_output_offset) +{ + auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_idx == 0) { + tile_multistates.set_inclusive_prefix(-1, tile_multistate_seed); + tile_output_offsets.set_inclusive_prefix(-1, tile_output_offset); + } } __global__ void multibyte_split_kernel( @@ -283,8 +300,11 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour auto multistate_seed = multistate(); multistate_seed.enqueue(0, 0); // this represents the first state in the pattern. - tile_multistates.set_seed_async(multistate_seed, stream); - tile_offsets.set_seed_async(0, stream); + multibyte_split_seed_kernel<<<1, 1, 0, stream.value()>>>( // + tile_multistates, + tile_offsets, + multistate_seed, + 0); fork_stream(streams, stream); From d3de0625c62cfcc8044ac69d4ae1313033492ea0 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 12 Aug 2021 19:00:53 -0500 Subject: [PATCH 67/80] improve multibyte_split benchmarks --- .../io/text/multibyte_split_benchmark.cpp | 109 ++++++++++++------ 1 file changed, 74 insertions(+), 35 deletions(-) diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp index f022de09502..57913a8d24e 100644 --- a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp +++ b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp @@ -14,15 +14,19 @@ * limitations under the License. */ +#include #include #include #include #include + #include #include #include +#include +#include #include #include @@ -41,38 +45,76 @@ enum data_chunk_source_type { device, }; -static void BM_multibyte_split(benchmark::State& state) +static cudf::string_scalar create_random_input(int32_t num_chars, + int32_t num_delims, + double deviation, + std::string delim) { - auto num_chars = state.range(0); - auto source_type = static_cast(state.range(1)); + auto const num_rows = num_delims; + auto const num_delim_chars = delim.size() * num_delims; + auto const num_value_chars = num_chars - num_delim_chars; + auto const value_size_max = static_cast(num_value_chars / num_rows); + auto const value_size_min = static_cast(value_size_max * (1 - deviation)); + + data_profile table_profile; + + table_profile.set_distribution_params( // + cudf::type_id::STRING, + distribution_id::NORMAL, + value_size_min, + value_size_max); + + auto const values_table = create_random_table( // + {cudf::type_id::STRING}, + 1, + row_count{num_rows}, + table_profile); + + auto delim_scalar = cudf::make_string_scalar(delim); + auto delims_column = cudf::make_column_from_scalar(*delim_scalar, num_rows); + auto input_table = cudf::table_view({values_table->get_column(0).view(), delims_column->view()}); + auto input_column = cudf::strings::concatenate(input_table); + + // extract the chars from the returned strings column. + auto input_column_contents = input_column->release(); + auto chars_column_contents = input_column_contents.children[1]->release(); + auto chars_buffer = chars_column_contents.data.release(); + + // turn the chars in to a string scalar. + return cudf::string_scalar(std::move(*chars_buffer)); +} - // it would be better if we initialized these chars on gpu, then scattered-in some delimiters, - // then copied them back to host - auto host_input = std::string(num_chars, 'x'); - auto device_input = cudf::string_scalar(host_input); +static void BM_multibyte_split(benchmark::State& state) +{ + auto file_size_approx = state.range(0); + auto delimiter_count = state.range(1); + auto source_type = static_cast(state.range(2)); + auto device_input = create_random_input(file_size_approx, delimiter_count, 0.1, "::"); + // auto host_input = std::string(file_size_approx, 'x'); - auto temp_file_name = random_file_in_dir(temp_dir.path()); + // auto temp_file_name = random_file_in_dir(temp_dir.path()); - close(mkstemp(const_cast(temp_file_name.data()))); - { - auto temp_fostream = std::ofstream(temp_file_name, std::ofstream::out); - temp_fostream << host_input; - temp_fostream.close(); - } + // close(mkstemp(const_cast(temp_file_name.data()))); + // { + // auto temp_fostream = std::ofstream(temp_file_name, std::ofstream::out); + // temp_fostream << host_input; + // temp_fostream.close(); + // } cudaDeviceSynchronize(); auto source = std::unique_ptr(nullptr); switch (source_type) { - case data_chunk_source_type::file: // - source = cudf::io::text::make_source_from_file(temp_file_name); - state.SetLabel("from file"); - break; - case data_chunk_source_type::host: // - source = cudf::io::text::make_source(host_input); - state.SetLabel("from host"); - break; + case data_chunk_source_type::file: // + // source = + // cudf::io::text::make_source_from_file(temp_file_name); + // state.SetLabel("from file"); + // break; + case data_chunk_source_type::host: // + // source = cudf::io::text::make_source(host_input); + // state.SetLabel("from host"); + // break; case data_chunk_source_type::device: // source = cudf::io::text::make_source(device_input); state.SetLabel("from device"); @@ -80,30 +122,27 @@ static void BM_multibyte_split(benchmark::State& state) default: CUDF_FAIL(); } - auto delimiters = std::vector({"x"}); + auto delimiters = std::vector({"::"}); for (auto _ : state) { cuda_event_timer raii(state, true); auto output = cudf::io::text::multibyte_split(*source, delimiters); } - state.SetBytesProcessed(state.iterations() * num_chars); + state.SetBytesProcessed(state.iterations() * device_input.size()); } class MultibyteSplitBenchmark : public cudf::benchmark { }; -#define TRANSPOSE_BM_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(MultibyteSplitBenchmark, name)(::benchmark::State & state) \ - { \ - BM_multibyte_split(state); \ - } \ - BENCHMARK_REGISTER_F(MultibyteSplitBenchmark, name) \ - ->ArgsProduct({{1 << 15, 1 << 30}, \ - {data_chunk_source_type::file, \ - data_chunk_source_type::host, \ - data_chunk_source_type::device}}) \ - ->UseManualTime() \ +#define TRANSPOSE_BM_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(MultibyteSplitBenchmark, name)(::benchmark::State & state) \ + { \ + BM_multibyte_split(state); \ + } \ + BENCHMARK_REGISTER_F(MultibyteSplitBenchmark, name) \ + ->ArgsProduct({{1 << 30, 1 << 30}, {1 << 15, 1 << 15}, {data_chunk_source_type::device}}) \ + ->UseManualTime() \ ->Unit(::benchmark::kMillisecond); TRANSPOSE_BM_BENCHMARK_DEFINE(multibyte_split_simple); From d3921404fa348b3edd344e1a43ab0bd8ec220a55 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 12 Aug 2021 19:01:24 -0500 Subject: [PATCH 68/80] provide explicit memory_order for tile state status stores. --- cpp/include/cudf/io/text/detail/tile_state.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/text/detail/tile_state.hpp b/cpp/include/cudf/io/text/detail/tile_state.hpp index f40d0aa8054..e7787f64e4f 100644 --- a/cpp/include/cudf/io/text/detail/tile_state.hpp +++ b/cpp/include/cudf/io/text/detail/tile_state.hpp @@ -27,7 +27,7 @@ struct scan_tile_state_view { __device__ inline void set_status(cudf::size_type tile_idx, scan_tile_status status) { auto const offset = (tile_idx + num_tiles) % num_tiles; - tile_status[offset].store(status); + tile_status[offset].store(status, cuda::memory_order_relaxed); } __device__ inline void set_partial_prefix(cudf::size_type tile_idx, T value) From 42b8c881b68b6c2cac935a0993e5ccae474faacc Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 12 Aug 2021 19:46:59 -0500 Subject: [PATCH 69/80] improve multibyte_split benchmarks --- .../io/text/multibyte_split_benchmark.cpp | 45 +++++++++++-------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp index 57913a8d24e..00892b387d4 100644 --- a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp +++ b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp @@ -46,13 +46,14 @@ enum data_chunk_source_type { }; static cudf::string_scalar create_random_input(int32_t num_chars, - int32_t num_delims, + double delim_factor, double deviation, std::string delim) { - auto const num_rows = num_delims; - auto const num_delim_chars = delim.size() * num_delims; + auto const num_delims = static_cast((num_chars * delim_factor) / delim.size()); + auto const num_delim_chars = num_delims * delim.size(); auto const num_value_chars = num_chars - num_delim_chars; + auto const num_rows = num_delims; auto const value_size_max = static_cast(num_value_chars / num_rows); auto const value_size_min = static_cast(value_size_max * (1 - deviation)); @@ -86,10 +87,18 @@ static cudf::string_scalar create_random_input(int32_t num_chars, static void BM_multibyte_split(benchmark::State& state) { - auto file_size_approx = state.range(0); - auto delimiter_count = state.range(1); - auto source_type = static_cast(state.range(2)); - auto device_input = create_random_input(file_size_approx, delimiter_count, 0.1, "::"); + auto source_type = static_cast(state.range(0)); + auto delim_size = state.range(1); + auto delim_percent = state.range(2); + auto file_size_approx = state.range(3); + + CUDF_EXPECTS(delim_percent >= 1, "delimiter percent must be at least 1"); + CUDF_EXPECTS(delim_percent <= 50, "delimiter percent must be at most 50"); + + auto delim = std::string(":", delim_size); + + auto delim_factor = static_cast(delim_percent) / 100; + auto device_input = create_random_input(file_size_approx, delim_factor, 0.1, delim); // auto host_input = std::string(file_size_approx, 'x'); // auto temp_file_name = random_file_in_dir(temp_dir.path()); @@ -106,15 +115,15 @@ static void BM_multibyte_split(benchmark::State& state) auto source = std::unique_ptr(nullptr); switch (source_type) { - case data_chunk_source_type::file: // - // source = - // cudf::io::text::make_source_from_file(temp_file_name); - // state.SetLabel("from file"); - // break; - case data_chunk_source_type::host: // - // source = cudf::io::text::make_source(host_input); - // state.SetLabel("from host"); - // break; + // case data_chunk_source_type::file: // + // source = + // cudf::io::text::make_source_from_file(temp_file_name); + // state.SetLabel("from file"); + // break; + // case data_chunk_source_type::host: // + // source = cudf::io::text::make_source(host_input); + // state.SetLabel("from host"); + // break; case data_chunk_source_type::device: // source = cudf::io::text::make_source(device_input); state.SetLabel("from device"); @@ -122,7 +131,7 @@ static void BM_multibyte_split(benchmark::State& state) default: CUDF_FAIL(); } - auto delimiters = std::vector({"::"}); + auto delimiters = std::vector({delim}); for (auto _ : state) { cuda_event_timer raii(state, true); @@ -141,7 +150,7 @@ class MultibyteSplitBenchmark : public cudf::benchmark { BM_multibyte_split(state); \ } \ BENCHMARK_REGISTER_F(MultibyteSplitBenchmark, name) \ - ->ArgsProduct({{1 << 30, 1 << 30}, {1 << 15, 1 << 15}, {data_chunk_source_type::device}}) \ + ->ArgsProduct({{data_chunk_source_type::device}, {1, 4, 7}, {1, 25}, {1 << 15, 1 << 30}}) \ ->UseManualTime() \ ->Unit(::benchmark::kMillisecond); From 40d81e88cb05a7e302ee348b6bd160ab5a87ae61 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 13 Aug 2021 20:03:55 -0500 Subject: [PATCH 70/80] add file and host benchmarks for multibyte_split --- .../io/text/multibyte_split_benchmark.cpp | 68 +++++++++++-------- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp index 00892b387d4..e7ad1516c4d 100644 --- a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp +++ b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp @@ -29,6 +29,8 @@ #include #include +#include + #include #include @@ -40,9 +42,9 @@ using cudf::test::fixed_width_column_wrapper; temp_directory const temp_dir("cudf_gbench"); enum data_chunk_source_type { + device, file, host, - device, }; static cudf::string_scalar create_random_input(int32_t num_chars, @@ -99,34 +101,37 @@ static void BM_multibyte_split(benchmark::State& state) auto delim_factor = static_cast(delim_percent) / 100; auto device_input = create_random_input(file_size_approx, delim_factor, 0.1, delim); - // auto host_input = std::string(file_size_approx, 'x'); - - // auto temp_file_name = random_file_in_dir(temp_dir.path()); - - // close(mkstemp(const_cast(temp_file_name.data()))); - // { - // auto temp_fostream = std::ofstream(temp_file_name, std::ofstream::out); - // temp_fostream << host_input; - // temp_fostream.close(); - // } + auto host_input = thrust::host_vector(device_input.size()); + auto host_string = std::string(host_input.data(), host_input.size()); + + cudaMemcpyAsync(host_input.data(), + device_input.data(), + device_input.size() * sizeof(char), + cudaMemcpyDeviceToHost, + rmm::cuda_stream_default); + + auto temp_file_name = random_file_in_dir(temp_dir.path()); + + close(mkstemp(const_cast(temp_file_name.data()))); + { + auto temp_fostream = std::ofstream(temp_file_name, std::ofstream::out); + temp_fostream.write(host_input.data(), host_input.size()); + temp_fostream.close(); + } cudaDeviceSynchronize(); auto source = std::unique_ptr(nullptr); switch (source_type) { - // case data_chunk_source_type::file: // - // source = - // cudf::io::text::make_source_from_file(temp_file_name); - // state.SetLabel("from file"); - // break; - // case data_chunk_source_type::host: // - // source = cudf::io::text::make_source(host_input); - // state.SetLabel("from host"); - // break; + case data_chunk_source_type::file: // + source = cudf::io::text::make_source_from_file(temp_file_name); + break; + case data_chunk_source_type::host: // + source = cudf::io::text::make_source(host_string); + break; case data_chunk_source_type::device: // source = cudf::io::text::make_source(device_input); - state.SetLabel("from device"); break; default: CUDF_FAIL(); } @@ -144,14 +149,19 @@ static void BM_multibyte_split(benchmark::State& state) class MultibyteSplitBenchmark : public cudf::benchmark { }; -#define TRANSPOSE_BM_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(MultibyteSplitBenchmark, name)(::benchmark::State & state) \ - { \ - BM_multibyte_split(state); \ - } \ - BENCHMARK_REGISTER_F(MultibyteSplitBenchmark, name) \ - ->ArgsProduct({{data_chunk_source_type::device}, {1, 4, 7}, {1, 25}, {1 << 15, 1 << 30}}) \ - ->UseManualTime() \ +#define TRANSPOSE_BM_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(MultibyteSplitBenchmark, name)(::benchmark::State & state) \ + { \ + BM_multibyte_split(state); \ + } \ + BENCHMARK_REGISTER_F(MultibyteSplitBenchmark, name) \ + ->ArgsProduct({{data_chunk_source_type::device, \ + data_chunk_source_type::file, \ + data_chunk_source_type::host}, \ + {1, 4, 7}, \ + {1, 25}, \ + {1 << 15, 1 << 30}}) \ + ->UseManualTime() \ ->Unit(::benchmark::kMillisecond); TRANSPOSE_BM_BENCHMARK_DEFINE(multibyte_split_simple); From 31713399d44621259dc19430a476ec562b88156a Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 13 Aug 2021 20:20:21 -0500 Subject: [PATCH 71/80] make use of div_rounding_up_safe --- cpp/src/io/text/multibyte_split.cu | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index db32960abf9..46f2f81c36a 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -34,12 +35,6 @@ namespace { -template -constexpr decltype(auto) ceil_div(Dividend dividend, Divisor divisor) -{ - return dividend / divisor + (dividend % divisor != 0); -} - using cudf::io::text::detail::multistate; int32_t constexpr ITEMS_PER_THREAD = 32; @@ -320,7 +315,8 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour if (chunk.size() == 0) { break; } - auto tiles_in_launch = ceil_div(chunk.size(), ITEMS_PER_TILE); + auto tiles_in_launch = + cudf::util::div_rounding_up_safe(chunk.size(), static_cast(ITEMS_PER_TILE)); // reset the next chunk of tile state multibyte_split_init_kernel<<>>( // @@ -389,7 +385,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source streams); // allocate results - auto num_tiles = ceil_div(bytes_total, ITEMS_PER_TILE); + auto num_tiles = cudf::util::div_rounding_up_safe(bytes_total, ITEMS_PER_TILE); auto num_results = tile_offsets.get_inclusive_prefix(num_tiles - 1, stream); auto string_offsets = rmm::device_uvector(num_results + 2, stream, mr); auto string_chars = rmm::device_uvector(bytes_total, stream, mr); From 63c4bb017ec442ca9156b46436b5adc4b70f9939 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 13 Aug 2021 20:24:42 -0500 Subject: [PATCH 72/80] remove unused temp storage from tile state callback --- .../cudf/io/text/detail/tile_state.hpp | 32 +++++++------------ cpp/src/io/text/multibyte_split.cu | 17 ++++------ 2 files changed, 17 insertions(+), 32 deletions(-) diff --git a/cpp/include/cudf/io/text/detail/tile_state.hpp b/cpp/include/cudf/io/text/detail/tile_state.hpp index e7787f64e4f..849d857597b 100644 --- a/cpp/include/cudf/io/text/detail/tile_state.hpp +++ b/cpp/include/cudf/io/text/detail/tile_state.hpp @@ -92,48 +92,38 @@ struct scan_tile_state { template struct scan_tile_state_callback { - struct _TempStorage { - T exclusive_prefix; - }; - - using TempStorage = cub::Uninitialized<_TempStorage>; - - __device__ inline scan_tile_state_callback(TempStorage& temp_storage, - scan_tile_state_view& tile_state, + __device__ inline scan_tile_state_callback(scan_tile_state_view& tile_state, cudf::size_type tile_idx) - : _temp_storage(temp_storage.Alias()), _tile_state(tile_state), _tile_idx(tile_idx) + : _tile_state(tile_state), _tile_idx(tile_idx) { } __device__ inline T operator()(T const& block_aggregate) { + T exclusive_prefix; + if (threadIdx.x == 0) { - _tile_state.set_partial_prefix(_tile_idx, block_aggregate); // - } + _tile_state.set_partial_prefix(_tile_idx, block_aggregate); - auto predecessor_idx = _tile_idx - 1 - threadIdx.x; - auto predecessor_status = scan_tile_status::invalid; + auto predecessor_idx = _tile_idx - 1; + auto predecessor_status = scan_tile_status::invalid; - // scan partials to form prefix + // scan partials to form prefix - if (threadIdx.x == 0) { auto window_partial = _tile_state.get_prefix(predecessor_idx, predecessor_status); while (predecessor_status != scan_tile_status::inclusive) { predecessor_idx--; auto predecessor_prefix = _tile_state.get_prefix(predecessor_idx, predecessor_status); window_partial = predecessor_prefix + window_partial; } - _temp_storage.exclusive_prefix = window_partial; - } + exclusive_prefix = window_partial; - if (threadIdx.x == 0) { - _tile_state.set_inclusive_prefix(_tile_idx, _temp_storage.exclusive_prefix + block_aggregate); + _tile_state.set_inclusive_prefix(_tile_idx, exclusive_prefix + block_aggregate); } - return _temp_storage.exclusive_prefix; + return exclusive_prefix; } - _TempStorage& _temp_storage; scan_tile_state_view& _tile_state; cudf::size_type _tile_idx; }; diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 46f2f81c36a..fceb6115e11 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -49,7 +49,6 @@ struct PatternScan { struct _TempStorage { typename BlockScan::TempStorage scan; - typename BlockScanCallback::TempStorage scan_callback; }; _TempStorage& _temp_storage; @@ -70,7 +69,7 @@ struct PatternScan { thread_multistate = trie.transition(thread_data[i], thread_multistate); } - auto prefix_callback = BlockScanCallback(_temp_storage.scan_callback, tile_state, tile_idx); + auto prefix_callback = BlockScanCallback(tile_state, tile_idx); BlockScan(_temp_storage.scan) .ExclusiveSum(thread_multistate, thread_multistate, prefix_callback); @@ -140,10 +139,7 @@ __global__ void multibyte_split_kernel( __shared__ union { typename InputLoad::TempStorage input_load; typename PatternScan::TempStorage pattern_scan; - struct { - typename OffsetScan::TempStorage offset_scan; - typename OffsetScanCallback::TempStorage offset_scan_callback; - }; + typename OffsetScan::TempStorage offset_scan; } temp_storage; int32_t const tile_idx = base_tile_idx + blockIdx.x; @@ -164,8 +160,8 @@ __global__ void multibyte_split_kernel( uint32_t thread_states[ITEMS_PER_THREAD]; - __syncthreads(); // required before temp_memory re-use - PatternScan(temp_storage.pattern_scan) // + __syncthreads(); // required before temp_memory re-use + PatternScan(temp_storage.pattern_scan) .Scan(tile_idx, tile_multistates, trie, thread_chars, thread_states); // STEP 3: Flag matches @@ -178,10 +174,9 @@ __global__ void multibyte_split_kernel( // STEP 4: Scan flags to determine absolute thread output offset - __syncthreads(); // required before temp_memory re-use - auto prefix_callback = - OffsetScanCallback(temp_storage.offset_scan_callback, tile_output_offsets, tile_idx); + auto prefix_callback = OffsetScanCallback(tile_output_offsets, tile_idx); + __syncthreads(); // required before temp_memory re-use OffsetScan(temp_storage.offset_scan) .ExclusiveSum(thread_offsets, thread_offsets, prefix_callback); From 05cdecfed850b8000e9fd78964063f5752dc51a4 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 17 Aug 2021 14:25:22 -0500 Subject: [PATCH 73/80] simplify multibyte_split api to accept only a single delimiter --- .../io/text/multibyte_split_benchmark.cpp | 4 +- cpp/include/cudf/io/text/multibyte_split.hpp | 2 +- cpp/src/io/text/multibyte_split.cu | 8 +- cpp/tests/io/text/multibyte_split_test.cpp | 96 +++++++++---------- 4 files changed, 53 insertions(+), 57 deletions(-) diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp index e7ad1516c4d..13b3a29decb 100644 --- a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp +++ b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp @@ -136,11 +136,9 @@ static void BM_multibyte_split(benchmark::State& state) default: CUDF_FAIL(); } - auto delimiters = std::vector({delim}); - for (auto _ : state) { cuda_event_timer raii(state, true); - auto output = cudf::io::text::multibyte_split(*source, delimiters); + auto output = cudf::io::text::multibyte_split(*source, delim); } state.SetBytesProcessed(state.iterations() * device_input.size()); diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index 6fe5358ac83..d42ee9f510e 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -29,7 +29,7 @@ namespace text { std::unique_ptr multibyte_split( data_chunk_source const& source, - std::vector const& delimiters, + std::string const& delimiter, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace text diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index fceb6115e11..f5fdb917239 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -345,13 +345,13 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour } std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, - std::vector const& delimiters, + std::string const& delimiter, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr, rmm::cuda_stream_pool& stream_pool) { CUDF_FUNC_RANGE(); - auto const trie = cudf::io::text::detail::trie::create(delimiters, stream); + auto const trie = cudf::io::text::detail::trie::create({delimiter}, stream); CUDF_EXPECTS(trie.max_duplicate_tokens() <= multistate::max_segments, "delimiters must be representable by a trie with no more than 7 duplicate tokens"); @@ -409,12 +409,12 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source } // namespace detail std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, - std::vector const& delimiters, + std::string const& delimiter, rmm::mr::device_memory_resource* mr) { auto stream = rmm::cuda_stream_default; auto stream_pool = rmm::cuda_stream_pool(2); - auto result = detail::multibyte_split(source, delimiters, stream, mr, stream_pool); + auto result = detail::multibyte_split(source, delimiter, stream, mr, stream_pool); stream.synchronize(); diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 345d97a8081..d1fa787e000 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -36,57 +36,54 @@ struct MultibyteSplitTest : public BaseFixture { TEST_F(MultibyteSplitTest, NondeterministicMatching) { - auto delimiters = std::vector({"abac"}); + auto delimiter = std::string("abac"); auto host_input = std::string("ababacabacab"); auto expected = strings_column_wrapper{"ababac", "abac", "ab"}; auto source = cudf::io::text::make_source(host_input); - auto out = cudf::io::text::multibyte_split(*source, delimiters); + auto out = cudf::io::text::multibyte_split(*source, delimiter); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } TEST_F(MultibyteSplitTest, DelimiterAtEnd) { - auto delimiters = std::vector({":"}); + auto delimiter = std::string(":"); auto host_input = std::string("abcdefg:"); auto expected = strings_column_wrapper{"abcdefg:", ""}; auto source = cudf::io::text::make_source(host_input); - auto out = cudf::io::text::multibyte_split(*source, delimiters); + auto out = cudf::io::text::multibyte_split(*source, delimiter); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } TEST_F(MultibyteSplitTest, LargeInput) { - auto delimiters = std::vector({":::::", "....."}); - auto host_input = std::string(); auto host_expected = std::vector(); - for (auto i = 0; i < (32 * 128 * 1024); i++) { - host_input += ":::::"; - host_input += "....."; - host_expected.emplace_back(std::string(":::::")); - host_expected.emplace_back(std::string(".....")); + for (auto i = 0; i < (2 * 32 * 128 * 1024); i++) { + host_input += "...:|"; + host_expected.emplace_back(std::string("...:|")); } host_expected.emplace_back(std::string("")); auto expected = strings_column_wrapper{host_expected.begin(), host_expected.end()}; - auto source = cudf::io::text::make_source(host_input); - auto out = cudf::io::text::multibyte_split(*source, delimiters); + auto delimiter = std::string("...:|"); + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiter); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } TEST_F(MultibyteSplitTest, OverlappingMatchErasure) { - auto delimiters = std::vector({":::::"}); + auto delimiter = "::"; auto host_input = std::string( ":::::" @@ -94,49 +91,50 @@ TEST_F(MultibyteSplitTest, OverlappingMatchErasure) auto expected = strings_column_wrapper{":::::", ":::::"}; auto source = cudf::io::text::make_source(host_input); - auto out = cudf::io::text::multibyte_split(*source, delimiters); + auto out = cudf::io::text::multibyte_split(*source, delimiter); // CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); // this use case it not yet supported. } -TEST_F(MultibyteSplitTest, MultipleDelimiters) +TEST_F(MultibyteSplitTest, HandpickedInput) { - auto delimiters = std::vector({"😀", "😎", ",", "::"}); + auto delimiters = "::|"; auto host_input = std::string( - "aaa😀" - "bbb😀" - "ccc😀" - "ddd😀" - "eee😀" - "fff::" - "ggg😀" - "hhh😀" - "___," - "here," - "is," - "another," - "simple😀" - "text😎" - "seperated😎" - "by😎" - "emojis," - "which," - "are😎" - "multiple," - "bytes::" - "and😎" - "used😎" - "as😎" - "delimiters.😎" - "::" - "," - "😀"); + "aaa::|" + "bbb::|" + "ccc::|" + "ddd::|" + "eee::|" + "fff::|" + "ggg::|" + "hhh::|" + "___::|" + "here::|" + "is::|" + "another::|" + "simple::|" + "text::|" + "seperated::|" + "by::|" + "emojis::|" + "which::|" + "are::|" + "multiple::|" + "bytes::|" + "and::|" + "used::|" + "as::|" + "delimiters.::|" + "::|" + "::|" + "::|"); auto expected = strings_column_wrapper{ - "aaa😀", "bbb😀", "ccc😀", "ddd😀", "eee😀", "fff::", "ggg😀", "hhh😀", - "___,", "here,", "is,", "another,", "simple😀", "text😎", "seperated😎", "by😎", - "emojis,", "which,", "are😎", "multiple,", "bytes::", "and😎", "used😎", "as😎", - "delimiters.😎", "::", ",", "😀", ""}; + "aaa::|", "bbb::|", "ccc::|", "ddd::|", "eee::|", "fff::|", + "ggg::|", "hhh::|", "___::|", "here::|", "is::|", "another::|", + "simple::|", "text::|", "seperated::|", "by::|", "emojis::|", "which::|", + "are::|", "multiple::|", "bytes::|", "and::|", "used::|", "as::|", + "delimiters.::|", "::|", "::|", "::|", ""}; auto source = cudf::io::text::make_source(host_input); auto out = cudf::io::text::multibyte_split(*source, delimiters); From a4d4d7993a170c462215f544e00f8408e938c21f Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 18 Aug 2021 21:01:59 -0500 Subject: [PATCH 74/80] add strings column factory which takes device_uvectors --- cpp/include/cudf/column/column_factories.hpp | 20 ++++++++++ cpp/src/io/text/multibyte_split.cu | 41 ++----------------- cpp/src/strings/strings_column_factories.cu | 42 ++++++++++++++++++++ 3 files changed, 66 insertions(+), 37 deletions(-) diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index bdb7fd48e60..ebd7f5bbef0 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -442,6 +442,26 @@ std::unique_ptr make_strings_column( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Construct a STRING type column given offsets, columns, and optional null count and null + * mask. + * + * @param[in] num_strings The number of strings the column represents. + * @param[in] offsets The offset values for this column. The number of elements is one more than the + * total number of strings so the `offset[last] - offset[0]` is the total number of bytes in the + * strings vector. + * @param[in] chars The char bytes for all the strings for this column. Individual strings are + * identified by the offsets and the nullmask. + * @param[in] null_mask The bits specifying the null strings in device memory. Arrow format for + * nulls is used for interpreting this bitmask. + * @param[in] null_count The number of null string entries. + */ +std::unique_ptr make_strings_column(size_type num_strings, + rmm::device_uvector&& offsets, + rmm::device_uvector&& chars, + rmm::device_buffer&& null_mask = {}, + size_type null_count = cudf::UNKNOWN_NULL_COUNT); + /** * @brief Construct a LIST type column given offsets column, child column, null mask and null * count. diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index f5fdb917239..89ba0f45c8e 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -205,38 +205,6 @@ namespace io { namespace text { namespace detail { -template -std::unique_ptr create_column(rmm::device_uvector&& values) -{ - auto size = values.size(); - auto dtype = cudf::data_type{cudf::type_to_id()}; - - CUDF_EXPECTS(dtype.id() != type_id::EMPTY, "column type_id cannot be EMPTY"); - - return std::make_unique(dtype, size, values.release(), rmm::device_buffer(), 0); -} - -std::unique_ptr create_char_column(rmm::device_uvector&& values) -{ - auto size = values.size(); - auto dtype = cudf::data_type{type_id::INT8}; - - return std::make_unique(dtype, size, values.release(), rmm::device_buffer(), 0); -} - -std::unique_ptr create_strings_column(rmm::device_uvector&& chars, - rmm::device_uvector&& offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto num_strings = offsets.size() - 1; - auto chars_column = create_char_column(std::move(chars)); - auto offsets_column = create_column(std::move(offsets)); - - return cudf::make_strings_column( - num_strings, std::move(offsets_column), std::move(chars_column), 0, {}, stream, mr); -} - void fork_stream(std::vector streams, rmm::cuda_stream_view stream) { cudaEvent_t event; @@ -387,9 +355,9 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source // first and last element are set manually to zero and size of input, respectively. // kernel is only responsible for determining delimiter offsets - auto const x = string_offsets.size() - 1; + auto string_count = static_cast(string_offsets.size() - 1); string_offsets.set_element_to_zero_async(0, stream); - string_offsets.set_element_async(x, bytes_total, stream); + string_offsets.set_element_async(string_count, bytes_total, stream); multibyte_split_scan_full_source( source, @@ -401,9 +369,8 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source stream, streams); - auto res = create_strings_column(std::move(string_chars), std::move(string_offsets), stream, mr); - - return res; + return cudf::make_strings_column( + string_count, std::move(string_offsets), std::move(string_chars)); } } // namespace detail diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index abf1f9599dc..c89f1b756d6 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -137,4 +137,46 @@ std::unique_ptr make_strings_column(size_type num_strings, std::move(children)); } +std::unique_ptr make_strings_column(size_type num_strings, + rmm::device_uvector&& offsets, + rmm::device_uvector&& chars, + rmm::device_buffer&& null_mask, + size_type null_count) +{ + CUDF_FUNC_RANGE(); + + auto const offsets_size = static_cast(offsets.size()); + auto const chars_size = static_cast(chars.size()); + + if (null_count > 0) CUDF_EXPECTS(null_mask.size() > 0, "Column with nulls must be nullable."); + + CUDF_EXPECTS(num_strings == offsets_size - 1, "Invalid offsets column size for strings column."); + + auto offsets_column = std::make_unique( // + data_type{type_id::INT32}, + offsets_size, + offsets.release(), + rmm::device_buffer(), + 0); + + auto chars_column = std::make_unique( // + data_type{type_id::INT8}, + chars_size, + chars.release(), + rmm::device_buffer(), + 0); + + auto children = std::vector>(); + + children.emplace_back(std::move(offsets_column)); + children.emplace_back(std::move(chars_column)); + + return std::make_unique(data_type{type_id::STRING}, + num_strings, + rmm::device_buffer{}, + std::move(null_mask), + null_count, + std::move(children)); +} + } // namespace cudf From cef897d5875e9abfb71aff7727c83cfc5ba5d9c6 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 19 Aug 2021 15:49:15 -0500 Subject: [PATCH 75/80] add docs to cudf::io::text::detail::trie --- .../cudf/io/text/detail/multistate.hpp | 82 +++++++++++++++---- cpp/include/cudf/io/text/detail/trie.hpp | 76 +++++++++++------ cpp/src/io/text/multibyte_split.cu | 4 +- 3 files changed, 119 insertions(+), 43 deletions(-) diff --git a/cpp/include/cudf/io/text/detail/multistate.hpp b/cpp/include/cudf/io/text/detail/multistate.hpp index d7b0275b9cc..d3c8909ab51 100644 --- a/cpp/include/cudf/io/text/detail/multistate.hpp +++ b/cpp/include/cudf/io/text/detail/multistate.hpp @@ -24,35 +24,74 @@ namespace text { namespace detail { /** - * @brief represents a single (begin, end] pair of possible state transition history. - * + * @brief Represents up to 7 segments */ -struct multistate_segment { - public: - static auto constexpr max_states = 16; - constexpr multistate_segment() : _data(0) {} - constexpr multistate_segment(uint8_t head, uint8_t tail) : _data((head & 0b1111) | (tail << 4)) {} +struct multistate { + private: + /** + * @brief represents a (head, tail] segment, stored as a single 8 bit value + */ + struct multistate_segment { + public: + /** + * @brief Creates a segment which represents (0, 0] + */ + + constexpr multistate_segment() : _data(0) {} + /** + * @brief Creates a segment which represents (head, tail] + * + * @param head the (head, ____] value. Undefined behavior for values >= 16 + * @param tail the (____, tail] value. Undefined behavior for values >= 16 + */ + + constexpr multistate_segment(uint8_t head, uint8_t tail) : _data((head & 0b1111) | (tail << 4)) + { + } - constexpr uint8_t get_head() const { return _data & 0b1111; } - constexpr uint8_t get_tail() const { return _data >> 4; } + /** + * @brief Get's the (head, ____] value from the segment. + */ + constexpr uint8_t get_head() const { return _data & 0b1111; } - private: - uint8_t _data; -}; + /** + * @brief Get's the (____, tail] value from the segment. + */ + constexpr uint8_t get_tail() const { return _data >> 4; } + + private: + uint8_t _data; + }; -/** - * @brief Holds up to 7 transition history segments - */ -struct multistate { public: - static auto constexpr max_segments = 7; + /** + * @brief The maximum state (head or tail) this multistate can represent + */ + + static auto constexpr max_segment_value = 15; + /** + * @brief The maximum number of segments this multistate can represent + */ + static auto constexpr max_segment_count = 7; + + /** + * @brief Enqueues a (head, tail] segment to this multistate + * + * @note: The behavior of this function is undefined if size() => max_segment_count + */ constexpr void enqueue(uint8_t head, uint8_t tail) { _segments[_size++] = multistate_segment(head, tail); } + /** + * @brief get's the number of segments this multistate represents + */ constexpr uint8_t size() const { return _size; } + /** + * @brief get's the highest (____, tail] value this multistate represents + */ constexpr uint8_t max_tail() const { uint8_t maximum = 0; @@ -64,12 +103,19 @@ struct multistate { return maximum; } + /** + * @brief get's the Nth (head, ____] value state this multistate represents + */ constexpr uint8_t get_head(uint8_t idx) const { return _segments[idx].get_head(); } + + /** + * @brief get's the Nth (____, tail] value state this multistate represents + */ constexpr uint8_t get_tail(uint8_t idx) const { return _segments[idx].get_tail(); } private: uint8_t _size = 0; - multistate_segment _segments[max_segments]; + multistate_segment _segments[max_segment_count]; }; /** diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp index 3fa3344c91d..01971d273ec 100644 --- a/cpp/include/cudf/io/text/detail/trie.hpp +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -34,28 +34,6 @@ namespace io { namespace text { namespace detail { -struct trie_builder_node { - uint8_t match_length; - std::unordered_map> children; - - void insert(std::string s) { insert(s.c_str(), s.size()); } - - trie_builder_node& insert(char const* s, uint16_t size) { return this->insert(s, size, 0); } - - private: - trie_builder_node& insert(char const* s, uint16_t size, uint8_t depth) - { - if (size == 0) { - match_length = depth; - return *this; - } - - if (children[*s] == nullptr) { children[*s] = std::make_unique(); } - - return children[*s]->insert(s + 1, size - 1, depth + 1); - } -}; - struct trie_node { char token; uint8_t match_length; @@ -118,21 +96,65 @@ struct trie_device_view { } }; +/** + * @brief A flat trie contained in device memory. + */ struct trie { private: cudf::size_type _max_duplicate_tokens; rmm::device_uvector _nodes; - public: trie(cudf::size_type max_duplicate_tokens, rmm::device_uvector&& nodes) : _max_duplicate_tokens(max_duplicate_tokens), _nodes(std::move(nodes)) { } + /** + * @brief Used to build a hierarchical trie which can then be flattened. + */ + struct trie_builder_node { + uint8_t match_length; + std::unordered_map> children; + + /** + * @brief Insert the string in to the trie tree, growing the trie as necessary + */ + void insert(std::string s) { insert(s.c_str(), s.size(), 0); } + + private: + trie_builder_node& insert(char const* s, uint16_t size, uint8_t depth) + { + if (size == 0) { + match_length = depth; + return *this; + } + + if (children[*s] == nullptr) { children[*s] = std::make_unique(); } + + return children[*s]->insert(s + 1, size - 1, depth + 1); + } + }; + + public: + /** + * @brief Gets the number of nodes contained in this trie. + */ cudf::size_type size() const { return _nodes.size(); } + /** + * @brief A pessimistic count of duplicate tokens in the trie. Used to determine the maximum + * possible stack size required to compute matches of this trie in parallel. + */ cudf::size_type max_duplicate_tokens() const { return _max_duplicate_tokens; } + /** + * @brief Create a trie which represents the given pattern. + * + * @param pattern The pattern to store in the trie + * @param stream The stream to use for allocation and copy + * @param mr Memory resource to use for the device memory allocation + * @return The trie. + */ static trie create(std::string const& pattern, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) @@ -141,6 +163,14 @@ struct trie { return create(std::vector{pattern}, stream, mr); } + /** + * @brief Create a trie which represents the given pattern. + * + * @param pattern The patterns to store in the trie + * @param stream The stream to use for allocation and copy + * @param mr Memory resource to use for the device memory allocation + * @return The trie. + */ static trie create(std::vector const& patterns, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 89ba0f45c8e..860f4a510ff 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -321,10 +321,10 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source CUDF_FUNC_RANGE(); auto const trie = cudf::io::text::detail::trie::create({delimiter}, stream); - CUDF_EXPECTS(trie.max_duplicate_tokens() <= multistate::max_segments, + CUDF_EXPECTS(trie.max_duplicate_tokens() <= multistate::max_segment_count, "delimiters must be representable by a trie with no more than 7 duplicate tokens"); - CUDF_EXPECTS(trie.size() <= multistate_segment::max_states, + CUDF_EXPECTS(trie.size() <= multistate::max_segment_value, "delimiters must be representable by a trie with no more than 16 unique states"); auto concurrency = 2; From 89ce0aa1ef65057746e8b6e0544b0e3f9732c84c Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 23 Aug 2021 16:54:14 -0500 Subject: [PATCH 76/80] add more documentation and comments to multibyte_split related code --- .../io/text/multibyte_split_benchmark.cpp | 2 - cpp/include/cudf/io/text/detail/trie.hpp | 48 ++++++++++++++----- cpp/src/io/text/multibyte_split.cu | 13 +++-- 3 files changed, 43 insertions(+), 20 deletions(-) diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp index 13b3a29decb..dce4521338e 100644 --- a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp +++ b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp @@ -112,11 +112,9 @@ static void BM_multibyte_split(benchmark::State& state) auto temp_file_name = random_file_in_dir(temp_dir.path()); - close(mkstemp(const_cast(temp_file_name.data()))); { auto temp_fostream = std::ofstream(temp_file_name, std::ofstream::out); temp_fostream.write(host_input.data(), host_input.size()); - temp_fostream.close(); } cudaDeviceSynchronize(); diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp index 01971d273ec..d14fe15b0a9 100644 --- a/cpp/include/cudf/io/text/detail/trie.hpp +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -43,6 +43,9 @@ struct trie_node { struct trie_device_view { device_span _nodes; + /** + * @brief create a multistate which contains all partial path matches for the given token. + */ constexpr multistate transition_init(char c) { auto result = multistate(); @@ -55,6 +58,13 @@ struct trie_device_view { return result; } + /** + * @brief create a new multistate by transitioning all states in the multistate by the given token + * + * Eliminates any partial matches that cannot transition using the given token. + * + * @note always enqueues (0, 0] as the first state of the returned multistate. + */ constexpr multistate transition(char c, multistate const& states) { auto result = multistate(); @@ -68,22 +78,20 @@ struct trie_device_view { return result; } - constexpr void transition_enqueue_all( // - char c, - multistate& states, - uint8_t head, - uint8_t curr) - { - for (uint32_t tail = _nodes[curr].child_begin; tail < _nodes[curr + 1].child_begin; tail++) { - if (_nodes[tail].token == c) { // - states.enqueue(head, tail); - } - } - } - + /** + * @brief returns true if the given index is associated with a matching state. + */ constexpr bool is_match(uint16_t idx) { return static_cast(get_match_length(idx)); } + + /** + * @brief returns the match length if the given index is associated with a matching state, + * otherwise zero. + */ constexpr uint8_t get_match_length(uint16_t idx) { return _nodes[idx].match_length; } + /** + * @brief returns the longest matching state of any state in the multistate. + */ template constexpr uint8_t get_match_length(multistate const& states) { @@ -94,6 +102,20 @@ struct trie_device_view { } return val; } + + private: + constexpr void transition_enqueue_all( // + char c, + multistate& states, + uint8_t head, + uint8_t curr) + { + for (uint32_t tail = _nodes[curr].child_begin; tail < _nodes[curr + 1].child_begin; tail++) { + if (_nodes[tail].token == c) { // + states.enqueue(head, tail); + } + } + } }; /** diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 860f4a510ff..d530ccec02e 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -190,7 +190,7 @@ __global__ void multibyte_split_kernel( if (abs_output_delimiter_offsets.size() > 0) { for (int32_t i = 0; i < ITEMS_PER_THREAD and i < thread_input_size; i++) { - if (trie.get_match_length(thread_states[i]) > 0) { + if (trie.is_match(thread_states[i])) { auto const match_end = base_tile_idx * ITEMS_PER_TILE + thread_input_offset + i + 1; abs_output_delimiter_offsets[thread_offsets[i]] = match_end; } @@ -258,6 +258,9 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour auto multistate_seed = multistate(); multistate_seed.enqueue(0, 0); // this represents the first state in the pattern. + // Seeding the tile state with an identity value allows the 0th tile to follow the same logic as + // the Nth tile, assuming it can look up an inclusive prefix. Without this seed, the 0th block + // would have to follow seperate logic. multibyte_split_seed_kernel<<<1, 1, 0, stream.value()>>>( // tile_multistates, tile_offsets, @@ -321,11 +324,11 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source CUDF_FUNC_RANGE(); auto const trie = cudf::io::text::detail::trie::create({delimiter}, stream); - CUDF_EXPECTS(trie.max_duplicate_tokens() <= multistate::max_segment_count, - "delimiters must be representable by a trie with no more than 7 duplicate tokens"); + CUDF_EXPECTS(trie.max_duplicate_tokens() < multistate::max_segment_count, + "delimiter contains too many duplicate tokens to produce a deterministic result."); - CUDF_EXPECTS(trie.size() <= multistate::max_segment_value, - "delimiters must be representable by a trie with no more than 16 unique states"); + CUDF_EXPECTS(trie.size() < multistate::max_segment_value, + "delimiter contains too many total tokens to produce a deterministic result."); auto concurrency = 2; // must be at least 32 when using warp-reduce on partials From d2735dd13382a722b6043cdf1357d5bbd5a1aa38 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 23 Aug 2021 17:42:46 -0500 Subject: [PATCH 77/80] adjust multibyte_split benchmark deviation math to be representative of intent. --- cpp/benchmarks/io/text/multibyte_split_benchmark.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp index dce4521338e..cb8a61caa57 100644 --- a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp +++ b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp @@ -56,8 +56,9 @@ static cudf::string_scalar create_random_input(int32_t num_chars, auto const num_delim_chars = num_delims * delim.size(); auto const num_value_chars = num_chars - num_delim_chars; auto const num_rows = num_delims; - auto const value_size_max = static_cast(num_value_chars / num_rows); - auto const value_size_min = static_cast(value_size_max * (1 - deviation)); + auto const value_size_avg = static_cast(num_value_chars / num_rows); + auto const value_size_min = static_cast(value_size_avg * (1 - deviation)); + auto const value_size_max = static_cast(value_size_avg * (1 + deviation)); data_profile table_profile; @@ -100,7 +101,7 @@ static void BM_multibyte_split(benchmark::State& state) auto delim = std::string(":", delim_size); auto delim_factor = static_cast(delim_percent) / 100; - auto device_input = create_random_input(file_size_approx, delim_factor, 0.1, delim); + auto device_input = create_random_input(file_size_approx, delim_factor, 0.05, delim); auto host_input = thrust::host_vector(device_input.size()); auto host_string = std::string(host_input.data(), host_input.size()); From 615534ddb0eaeacc2a6b94332584136b19229e90 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 24 Aug 2021 10:42:45 -0500 Subject: [PATCH 78/80] multibyte_split: replace typedef with using and replace uint32_t with std::size_t where appropriate --- cpp/include/cudf/io/text/data_chunk_source.hpp | 2 +- .../cudf/io/text/data_chunk_source_factories.hpp | 12 ++++++------ cpp/src/io/text/multibyte_split.cu | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp index 012cb564bbf..3cfc338442f 100644 --- a/cpp/include/cudf/io/text/data_chunk_source.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -49,7 +49,7 @@ class data_chunk_reader { * @param stream stream to associate allocations or perform work required to obtain chunk * @return a chunk of data up to @param size bytes, or less if no more data is avaialable */ - virtual device_span get_next_chunk(uint32_t size, rmm::cuda_stream_view stream) = 0; + virtual device_span get_next_chunk(std::size_t size, rmm::cuda_stream_view stream) = 0; }; /** diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index 76903b25d97..7ce860467d9 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -53,19 +53,19 @@ class istream_data_chunk_reader : public data_chunk_reader { : _datastream(std::move(datastream)), _buffers(), _tickets(2) { // create an event to track the completion of the last device-to-host copy. - for (uint32_t i = 0; i < _tickets.size(); i++) { + for (std::size_t i = 0; i < _tickets.size(); i++) { CUDA_TRY(cudaEventCreate(&(_tickets[i].event))); } } ~istream_data_chunk_reader() { - for (uint32_t i = 0; i < _tickets.size(); i++) { + for (std::size_t i = 0; i < _tickets.size(); i++) { CUDA_TRY(cudaEventDestroy(_tickets[i].event)); } } - device_span find_or_create_data(uint32_t size, rmm::cuda_stream_view stream) + device_span find_or_create_data(std::size_t size, rmm::cuda_stream_view stream) { auto search = _buffers.find(stream.value()); @@ -76,7 +76,7 @@ class istream_data_chunk_reader : public data_chunk_reader { return device_span(static_cast(_buffers[stream.value()].data()), size); } - device_span get_next_chunk(uint32_t read_size, rmm::cuda_stream_view stream) override + device_span get_next_chunk(std::size_t read_size, rmm::cuda_stream_view stream) override { CUDF_FUNC_RANGE(); @@ -115,7 +115,7 @@ class istream_data_chunk_reader : public data_chunk_reader { } private: - uint32_t _next_ticket_idx = 0; + std::size_t _next_ticket_idx = 0; std::unique_ptr _datastream; std::unordered_map _buffers; std::vector _tickets; @@ -130,7 +130,7 @@ class device_span_data_chunk_reader : public data_chunk_reader { public: device_span_data_chunk_reader(device_span data) : _data(data) {} - device_span get_next_chunk(uint32_t read_size, rmm::cuda_stream_view stream) override + device_span get_next_chunk(std::size_t read_size, rmm::cuda_stream_view stream) override { // limit the read size to the number of bytes remaining in the device_span. if (read_size > _data.size() - _position) { read_size = _data.size() - _position; } diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index d530ccec02e..662ec744680 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -44,8 +44,8 @@ int32_t constexpr TILES_PER_CHUNK = 1024; int32_t constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; struct PatternScan { - typedef cub::BlockScan BlockScan; - typedef cudf::io::text::detail::scan_tile_state_callback BlockScanCallback; + using BlockScan = cub::BlockScan; + using BlockScanCallback = cudf::io::text::detail::scan_tile_state_callback; struct _TempStorage { typename BlockScan::TempStorage scan; From bd67026fd24c22bce7d9c8c966417668ac2bf4e0 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 24 Aug 2021 10:48:44 -0500 Subject: [PATCH 79/80] make data_chunk_reader::get_next_chunk docs more informative. --- cpp/include/cudf/io/text/data_chunk_source.hpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp index 3cfc338442f..6ee1fa033d0 100644 --- a/cpp/include/cudf/io/text/data_chunk_source.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -45,11 +45,14 @@ class data_chunk_reader { * memory, allocate temporary memory, perform iterative decompression, or even launch device * kernels. * - * @param size desired number of bytes + * @param size number of bytes to read. * @param stream stream to associate allocations or perform work required to obtain chunk - * @return a chunk of data up to @param size bytes, or less if no more data is avaialable + * @return a chunk of data up to @param size bytes. May return less than @param size bytes if + * reader reaches end of underlying data source. Returned data must be accessed in stream order + * relative to the specified @param stream. */ - virtual device_span get_next_chunk(std::size_t size, rmm::cuda_stream_view stream) = 0; + virtual device_span get_next_chunk(std::size_t size, + rmm::cuda_stream_view stream) = 0; }; /** From a61fd09aa4f2c8fce1587f75e5735cff29398e1a Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 24 Aug 2021 10:59:15 -0500 Subject: [PATCH 80/80] fix style --- cpp/include/cudf/io/text/data_chunk_source_factories.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index 7ce860467d9..f6807c1c9a8 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -76,7 +76,8 @@ class istream_data_chunk_reader : public data_chunk_reader { return device_span(static_cast(_buffers[stream.value()].data()), size); } - device_span get_next_chunk(std::size_t read_size, rmm::cuda_stream_view stream) override + device_span get_next_chunk(std::size_t read_size, + rmm::cuda_stream_view stream) override { CUDF_FUNC_RANGE(); @@ -130,7 +131,8 @@ class device_span_data_chunk_reader : public data_chunk_reader { public: device_span_data_chunk_reader(device_span data) : _data(data) {} - device_span get_next_chunk(std::size_t read_size, rmm::cuda_stream_view stream) override + device_span get_next_chunk(std::size_t read_size, + rmm::cuda_stream_view stream) override { // limit the read size to the number of bytes remaining in the device_span. if (read_size > _data.size() - _position) { read_size = _data.size() - _position; }