From 0b345d4008d0d77a4bc66fc37be1e28b4dd5c01f Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 19 Oct 2022 12:53:19 +0000 Subject: [PATCH 01/10] add strip_delimiters option to read_text --- cpp/benchmarks/io/text/multibyte_split.cpp | 5 +- cpp/include/cudf/io/text/multibyte_split.hpp | 30 ++++++++-- cpp/src/io/text/multibyte_split.cu | 60 +++++++++++++++++++- cpp/tests/io/text/multibyte_split_test.cpp | 30 ++++++++++ python/cudf/cudf/_lib/cpp/io/text.pxd | 10 +++- python/cudf/cudf/_lib/text.pyx | 22 +++---- python/cudf/cudf/io/text.py | 2 + python/cudf/cudf/tests/test_text.py | 12 ++++ 8 files changed, 149 insertions(+), 22 deletions(-) diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp index 380766fee46..c0e82b34623 100644 --- a/cpp/benchmarks/io/text/multibyte_split.cpp +++ b/cpp/benchmarks/io/text/multibyte_split.cpp @@ -121,6 +121,7 @@ static void bench_multibyte_split(nvbench::state& state, auto const delim_percent = state.get_int64("delim_percent"); auto const file_size_approx = state.get_int64("size_approx"); auto const byte_range_percent = state.get_int64("byte_range_percent"); + auto const strip_delimiters = bool(state.get_int64("strip_delimiters")); auto const byte_range_factor = static_cast(byte_range_percent) / 100; CUDF_EXPECTS(delim_percent >= 1, "delimiter percent must be at least 1"); @@ -182,12 +183,13 @@ static void bench_multibyte_split(nvbench::state& state, auto const range_size = static_cast(device_input.size() * byte_range_factor); auto const range_offset = (device_input.size() - range_size) / 2; cudf::io::text::byte_range_info range{range_offset, range_size}; + cudf::io::text::parse_options options{range, strip_delimiters}; std::unique_ptr output; state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { try_drop_l3_cache(); - output = cudf::io::text::multibyte_split(*source, delim, range); + output = cudf::io::text::multibyte_split(*source, delim, options); }); state.add_buffer_size(mem_stats_logger.peak_memory_usage(), "pmu", "Peak Memory Usage"); @@ -203,6 +205,7 @@ using source_type_list = nvbench::enum_type_list multibyte_split( data_chunk_source const& source, std::string const& delimiter, - std::optional byte_range = std::nullopt, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + parse_options options = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr multibyte_split( + data_chunk_source const& source, + std::string const& delimiter, + std::optional byte_range, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr multibyte_split(data_chunk_source const& source, std::string const& delimiter, diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 29cec0e8c3f..86e6d5cd931 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -21,12 +21,14 @@ #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -551,6 +553,7 @@ class output_builder { std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, byte_range_info byte_range, + bool strip_delimiters, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr, rmm::cuda_stream_pool& stream_pool) @@ -771,8 +774,50 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source [baseline = *first_row_offset] __device__(byte_offset global_offset) { return static_cast(global_offset - baseline); }); - auto string_count = offsets.size() - 1; + if (strip_delimiters) { + rmm::device_uvector new_offsets{offsets.size(), stream, mr}; + auto const it = cudf::detail::make_counting_transform_iterator( + int32_t{0}, + [ofs = offsets.data(), + delim_size = static_cast(delimiter.size()), + last_row = static_cast(string_count) - 1, + insert_end] __device__(int32_t i) { + // if we inserted a virtual delimiter at the end, we musn't strip it + if (i == last_row && insert_end) { + return ofs[i + 1] - ofs[i]; + } else { + return std::max(ofs[i + 1] - ofs[i] - delim_size, 0); + } + }); + new_offsets.set_element_to_zero_async(0, stream); + thrust::inclusive_scan( + rmm::exec_policy(stream), it, it + offsets.size() - 1, new_offsets.begin() + 1); + auto const new_char_count = static_cast(new_offsets.back_element(stream)); + rmm::device_uvector new_chars{new_char_count, stream, mr}; + // copy the chars over in striped accesses of slice_size chars, 8 chars per thread on avg + auto const slice_size = cudf::util::div_rounding_up_safe(new_char_count, string_count * 8); + auto const counting_it = thrust::make_counting_iterator(std::size_t{}); + thrust::for_each_n(rmm::exec_policy(stream), + counting_it, + string_count * slice_size, + [chars = chars.data(), + new_chars = new_chars.data(), + ofs = offsets.data(), + new_ofs = new_offsets.data(), + slice_size] __device__(std::size_t i) { + auto const row = i / slice_size; + auto const stripe = i % slice_size; + auto const in_begin = ofs[row]; + auto const out_begin = new_ofs[row]; + auto const out_size = new_ofs[row + 1] - out_begin; + for (int32_t i = stripe; i < out_size; i += slice_size) { + new_chars[out_begin + i] = chars[in_begin + i]; + } + }); + offsets = std::move(new_offsets); + chars = std::move(new_chars); + } return cudf::make_strings_column(string_count, std::move(offsets), std::move(chars)); } @@ -783,12 +828,21 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source std::string const& delimiter, std::optional byte_range, rmm::mr::device_memory_resource* mr) +{ + return multibyte_split( + source, delimiter, parse_options{byte_range.value_or(create_byte_range_info_max())}, mr); +} + +std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, + std::string const& delimiter, + parse_options options, + rmm::mr::device_memory_resource* mr) { auto stream = cudf::get_default_stream(); auto stream_pool = rmm::cuda_stream_pool(2); auto result = detail::multibyte_split( - source, delimiter, byte_range.value_or(create_byte_range_info_max()), stream, mr, stream_pool); + source, delimiter, options.byte_range, options.strip_delimiters, stream, mr, stream_pool); return result; } @@ -797,7 +851,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source std::string const& delimiter, rmm::mr::device_memory_resource* mr) { - return multibyte_split(source, delimiter, std::nullopt, mr); + return multibyte_split(source, delimiter, parse_options{}, mr); } } // namespace text diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 43debf3d5b3..d84d1a5bb24 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -146,6 +146,36 @@ TEST_F(MultibyteSplitTest, OverlappingMatchErasure) // CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); // this use case it not yet supported. } +TEST_F(MultibyteSplitTest, DelimiterErasure) +{ + auto delimiter = "\r\n"; + + auto host_input = std::string("line\r\nanother line\r\nthird line\r\n"); + auto expected = strings_column_wrapper{"line", "another line", "third line", ""}; + + cudf::io::text::parse_options options; + options.strip_delimiters = true; + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiter, options); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +} + +TEST_F(MultibyteSplitTest, DelimiterErasureOverlap) +{ + auto delimiter = "::"; + + auto host_input = std::string("::a:::b::c::::d"); + auto expected = strings_column_wrapper{"", "a", "", "b", "c", "", "", "d"}; + + cudf::io::text::parse_options options; + options.strip_delimiters = true; + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiter, options); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +} + TEST_F(MultibyteSplitTest, HandpickedInput) { auto delimiters = "::|"; diff --git a/python/cudf/cudf/_lib/cpp/io/text.pxd b/python/cudf/cudf/_lib/cpp/io/text.pxd index 7bbe870dad3..368b014ea4b 100644 --- a/python/cudf/cudf/_lib/cpp/io/text.pxd +++ b/python/cudf/cudf/_lib/cpp/io/text.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport uint64_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -37,9 +38,12 @@ cdef extern from "cudf/io/text/data_chunk_source_factories.hpp" \ cdef extern from "cudf/io/text/multibyte_split.hpp" \ namespace "cudf::io::text" nogil: - unique_ptr[column] multibyte_split(data_chunk_source source, - string delimiter) except + + cdef cppclass parse_options: + byte_range_info byte_range + bool strip_delimiters + + parse_options() except + unique_ptr[column] multibyte_split(data_chunk_source source, string delimiter, - byte_range_info byte_range) except + + parse_options options) except + diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx index 31a5617af58..be11132497e 100644 --- a/python/cudf/cudf/_lib/text.pyx +++ b/python/cudf/cudf/_lib/text.pyx @@ -19,12 +19,14 @@ from cudf._lib.cpp.io.text cimport ( make_source_from_bgzip_file, make_source_from_file, multibyte_split, + parse_options, ) def read_text(object filepaths_or_buffers, object delimiter=None, object byte_range=None, + object strip_delimiters=False, object compression=None, object compression_offsets=None): """ @@ -44,6 +46,7 @@ def read_text(object filepaths_or_buffers, cdef byte_range_info c_byte_range cdef uint64_t c_compression_begin_offset cdef uint64_t c_compression_end_offset + cdef parse_options c_options if compression is None: if isinstance(filepaths_or_buffers, TextIOBase): @@ -71,19 +74,18 @@ def read_text(object filepaths_or_buffers, else: raise ValueError("Only bgzip compression is supported at the moment") - if (byte_range is None): - with nogil: - c_col = move(multibyte_split(dereference(datasource), delim)) - else: + c_options = parse_options() + if byte_range is not None: c_byte_range_offset = byte_range[0] c_byte_range_size = byte_range[1] - c_byte_range = byte_range_info( + c_options.byte_range = byte_range_info( c_byte_range_offset, c_byte_range_size) - with nogil: - c_col = move(multibyte_split( - dereference(datasource), - delim, - c_byte_range)) + c_options.strip_delimiters = strip_delimiters + with nogil: + c_col = move(multibyte_split( + dereference(datasource), + delim, + c_options)) return {None: Column.from_unique_ptr(move(c_col))} diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index 23983f01966..f341edbf6c1 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -14,6 +14,7 @@ def read_text( filepath_or_buffer, delimiter=None, byte_range=None, + strip_delimiters=False, compression=None, compression_offsets=None, **kwargs, @@ -35,6 +36,7 @@ def read_text( filepath_or_buffer, delimiter=delimiter, byte_range=byte_range, + strip_delimiters=strip_delimiters, compression=compression, compression_offsets=compression_offsets, ) diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 7f41d606473..2c5c96781fe 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -847,6 +847,18 @@ def test_read_text_in_memory(datadir): assert_eq(expected, actual) +def test_read_text_in_memory_strip_delimiter(datadir): + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series(["x", "y", "z"]) + + actual = cudf.read_text( + StringIO("x::y::z"), delimiter="::", strip_delimiters=True + ) + + assert_eq(expected, actual) + + def test_read_text_bgzip(datadir): chess_file_compressed = str(datadir) + "/chess.pgn.gz" chess_file = str(datadir) + "/chess.pgn" From 3e4d464eff25e02bd6828da2626f8f17459c8bbf Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 19 Oct 2022 18:22:00 +0000 Subject: [PATCH 02/10] use gathering make_string_column --- cpp/src/io/text/multibyte_split.cu | 55 ++++++++---------------------- 1 file changed, 15 insertions(+), 40 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 86e6d5cd931..9989bf42d63 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -776,50 +777,24 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source }); auto string_count = offsets.size() - 1; if (strip_delimiters) { - rmm::device_uvector new_offsets{offsets.size(), stream, mr}; - auto const it = cudf::detail::make_counting_transform_iterator( - int32_t{0}, + auto it = cudf::detail::make_counting_transform_iterator( + 0, [ofs = offsets.data(), - delim_size = static_cast(delimiter.size()), - last_row = static_cast(string_count) - 1, - insert_end] __device__(int32_t i) { - // if we inserted a virtual delimiter at the end, we musn't strip it - if (i == last_row && insert_end) { - return ofs[i + 1] - ofs[i]; + chars = chars.data(), + delim_size = static_cast(delimiter.size()), + last_row = static_cast(string_count) - 1] __device__(size_type row) { + auto const begin = ofs[row]; + auto const len = ofs[row + 1] - begin; + if (row == last_row) { + return thrust::make_pair(chars + begin, len); } else { - return std::max(ofs[i + 1] - ofs[i] - delim_size, 0); - } + return thrust::make_pair(chars + begin, std::max(0, len - delim_size)); + }; }); - new_offsets.set_element_to_zero_async(0, stream); - thrust::inclusive_scan( - rmm::exec_policy(stream), it, it + offsets.size() - 1, new_offsets.begin() + 1); - auto const new_char_count = static_cast(new_offsets.back_element(stream)); - rmm::device_uvector new_chars{new_char_count, stream, mr}; - // copy the chars over in striped accesses of slice_size chars, 8 chars per thread on avg - auto const slice_size = cudf::util::div_rounding_up_safe(new_char_count, string_count * 8); - auto const counting_it = thrust::make_counting_iterator(std::size_t{}); - thrust::for_each_n(rmm::exec_policy(stream), - counting_it, - string_count * slice_size, - [chars = chars.data(), - new_chars = new_chars.data(), - ofs = offsets.data(), - new_ofs = new_offsets.data(), - slice_size] __device__(std::size_t i) { - auto const row = i / slice_size; - auto const stripe = i % slice_size; - auto const in_begin = ofs[row]; - auto const out_begin = new_ofs[row]; - auto const out_size = new_ofs[row + 1] - out_begin; - for (int32_t i = stripe; i < out_size; i += slice_size) { - new_chars[out_begin + i] = chars[in_begin + i]; - } - }); - offsets = std::move(new_offsets); - chars = std::move(new_chars); + return cudf::strings::detail::make_strings_column(it, it + string_count, stream, mr); + } else { + return cudf::make_strings_column(string_count, std::move(offsets), std::move(chars)); } - - return cudf::make_strings_column(string_count, std::move(offsets), std::move(chars)); } } // namespace detail From 84a975ab7479eeb26402f11a8878aa2af23f3bee Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 19 Oct 2022 18:45:56 +0000 Subject: [PATCH 03/10] restore original handling of delimiter in last row --- cpp/src/io/text/multibyte_split.cu | 11 ++++++-- cpp/tests/io/text/multibyte_split_test.cpp | 32 ++++++++++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 9989bf42d63..2416ff95e6c 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -761,7 +761,11 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source auto global_offsets = row_offset_storage.gather(stream, mr); bool const insert_begin = *first_row_offset == 0; - bool const insert_end = not last_row_offset.has_value() or last_row_offset == chunk_offset; + // insert a value at the end either if we found no delimiter past the byte range (to complete the + // last row), or if we found a delimiter at the very end (to add an empty row for the entry after + // the delimiter) + bool const insert_end = not last_row_offset.has_value() or + (last_row_offset == chunk_offset and chunk_offset <= byte_range_end); rmm::device_uvector offsets{ global_offsets.size() + insert_begin + insert_end, stream, mr}; if (insert_begin) { offsets.set_element_to_zero_async(0, stream); } @@ -782,10 +786,11 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source [ofs = offsets.data(), chars = chars.data(), delim_size = static_cast(delimiter.size()), - last_row = static_cast(string_count) - 1] __device__(size_type row) { + last_row = static_cast(string_count) - 1, + insert_end] __device__(size_type row) { auto const begin = ofs[row]; auto const len = ofs[row + 1] - begin; - if (row == last_row) { + if (row == last_row && insert_end) { return thrust::make_pair(chars + begin, len); } else { return thrust::make_pair(chars + begin, std::max(0, len - delim_size)); diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index d84d1a5bb24..d923deba2fd 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -91,6 +91,22 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } +TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange2) +{ + auto delimiter = std::string(":"); + auto host_input = std::string("abcdefg:"); + + auto expected = strings_column_wrapper{"abcdefg:"}; + + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split( + *source, + delimiter, + cudf::io::text::byte_range_info{0, static_cast(host_input.size() - 1)}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +} + TEST_F(MultibyteSplitTest, LargeInputSparse) { auto host_input = std::string(1024 * 1024 * 32, '.'); @@ -161,6 +177,22 @@ TEST_F(MultibyteSplitTest, DelimiterErasure) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } +TEST_F(MultibyteSplitTest, DelimiterErasureByteRange) +{ + auto delimiter = "\r\n"; + + auto host_input = std::string("line\r\nanother line\r\nthird line\r\n"); + auto expected = strings_column_wrapper{"line", "another line", "third line"}; + + cudf::io::text::parse_options options; + options.strip_delimiters = true; + options.byte_range = cudf::io::text::byte_range_info(0, host_input.size() - 1); + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiter, options); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +} + TEST_F(MultibyteSplitTest, DelimiterErasureOverlap) { auto delimiter = "::"; From 7c3028341c29b11b31ee0b0806aa3876e19062e0 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 20 Oct 2022 09:01:05 +0000 Subject: [PATCH 04/10] fix byte range matching last delimiter at end --- cpp/src/io/text/multibyte_split.cu | 11 ++++++----- cpp/tests/io/text/multibyte_split_test.cpp | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 2416ff95e6c..72a6d94acf8 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -751,19 +751,20 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source join_stream(streams, stream); // if the input was empty, we didn't find a delimiter at all, - // or the first delimiter was also the last: empty output + // or the first delimiter was also the last (but not the last in the file): empty output if (chunk_offset == 0 or not first_row_offset.has_value() or - first_row_offset == last_row_offset) { + (first_row_offset == last_row_offset and first_row_offset < byte_range_end)) { return make_empty_column(type_id::STRING); } auto chars = char_storage.gather(stream, mr); auto global_offsets = row_offset_storage.gather(stream, mr); + // insert an offset at the beginning if we started at the beginning of the input bool const insert_begin = *first_row_offset == 0; - // insert a value at the end either if we found no delimiter past the byte range (to complete the - // last row), or if we found a delimiter at the very end (to add an empty row for the entry after - // the delimiter) + // insert an offset at the end either if we found no delimiter past the byte range (to complete + // the last row), or if we found a delimiter at the very end (to add an empty row for the entry + // after the delimiter) bool const insert_end = not last_row_offset.has_value() or (last_row_offset == chunk_offset and chunk_offset <= byte_range_end); rmm::device_uvector offsets{ diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index d923deba2fd..a841c77a64f 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -421,6 +421,22 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRangesSingleByte) } } +TEST_F(MultibyteSplitTest, SingletonRangeAtEnd) +{ + // we want a delimiter at the end of the file to create a new empty row. To make concatenation of + // disjoint byte ranges work, we need to make sure this property is preserved even if the byte + // range contains only the last delimiter + using namespace cudf::io::text; + auto host_input = std::string("ab:cd:"); + auto delimiter = std::string(":"); + auto source = make_source(host_input); + auto expected = strings_column_wrapper{""}; + + auto out = multibyte_split(*source, delimiter, byte_range_info{5, 1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, debug_output_level::ALL_ERRORS); +} + TEST_F(MultibyteSplitTest, EmptyInput) { using namespace cudf::io::text; From d7c3afc190ab81b38aa1ea2a1fe65607e0dd1bde Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 20 Oct 2022 16:44:07 +0000 Subject: [PATCH 05/10] strip empty row with delimiter at end --- cpp/src/io/text/multibyte_split.cu | 14 ++++++-------- cpp/tests/io/text/multibyte_split_test.cpp | 17 +++++++---------- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 72a6d94acf8..465564856fe 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -751,9 +751,9 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source join_stream(streams, stream); // if the input was empty, we didn't find a delimiter at all, - // or the first delimiter was also the last (but not the last in the file): empty output + // or the first delimiter was also the last: empty output if (chunk_offset == 0 or not first_row_offset.has_value() or - (first_row_offset == last_row_offset and first_row_offset < byte_range_end)) { + first_row_offset == last_row_offset) { return make_empty_column(type_id::STRING); } @@ -761,12 +761,10 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source auto global_offsets = row_offset_storage.gather(stream, mr); // insert an offset at the beginning if we started at the beginning of the input - bool const insert_begin = *first_row_offset == 0; - // insert an offset at the end either if we found no delimiter past the byte range (to complete - // the last row), or if we found a delimiter at the very end (to add an empty row for the entry - // after the delimiter) - bool const insert_end = not last_row_offset.has_value() or - (last_row_offset == chunk_offset and chunk_offset <= byte_range_end); + bool const insert_begin = first_row_offset.value_or(0) == 0; + // insert an offset at the end if we have not terminated the last row + bool const insert_end = + not(last_row_offset.has_value() or global_offsets.back_element(stream) == chunk_offset); rmm::device_uvector offsets{ global_offsets.size() + insert_begin + insert_end, stream, mr}; if (insert_begin) { offsets.set_element_to_zero_async(0, stream); } diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index a841c77a64f..40cd8c47ee7 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -67,7 +67,7 @@ TEST_F(MultibyteSplitTest, DelimiterAtEnd) auto delimiter = std::string(":"); auto host_input = std::string("abcdefg:"); - auto expected = strings_column_wrapper{"abcdefg:", ""}; + auto expected = strings_column_wrapper{"abcdefg:"}; auto source = cudf::io::text::make_source(host_input); auto out = cudf::io::text::multibyte_split(*source, delimiter); @@ -80,7 +80,7 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange) auto delimiter = std::string(":"); auto host_input = std::string("abcdefg:"); - auto expected = strings_column_wrapper{"abcdefg:", ""}; + auto expected = strings_column_wrapper{"abcdefg:"}; auto source = cudf::io::text::make_source(host_input); auto out = cudf::io::text::multibyte_split( @@ -136,8 +136,6 @@ TEST_F(MultibyteSplitTest, LargeInput) host_expected.emplace_back(std::string("...:|")); } - host_expected.emplace_back(std::string("")); - auto expected = strings_column_wrapper{host_expected.begin(), host_expected.end()}; auto delimiter = std::string("...:|"); @@ -167,7 +165,7 @@ TEST_F(MultibyteSplitTest, DelimiterErasure) auto delimiter = "\r\n"; auto host_input = std::string("line\r\nanother line\r\nthird line\r\n"); - auto expected = strings_column_wrapper{"line", "another line", "third line", ""}; + auto expected = strings_column_wrapper{"line", "another line", "third line"}; cudf::io::text::parse_options options; options.strip_delimiters = true; @@ -246,7 +244,7 @@ TEST_F(MultibyteSplitTest, HandpickedInput) "ggg::|", "hhh::|", "___::|", "here::|", "is::|", "another::|", "simple::|", "text::|", "seperated::|", "by::|", "emojis::|", "which::|", "are::|", "multiple::|", "bytes::|", "and::|", "used::|", "as::|", - "delimiters.::|", "::|", "::|", "::|", ""}; + "delimiters.::|", "::|", "::|", "::|"}; auto source = cudf::io::text::make_source(host_input); auto out = cudf::io::text::multibyte_split(*source, delimiters); @@ -423,14 +421,13 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRangesSingleByte) TEST_F(MultibyteSplitTest, SingletonRangeAtEnd) { - // we want a delimiter at the end of the file to create a new empty row. To make concatenation of - // disjoint byte ranges work, we need to make sure this property is preserved even if the byte - // range contains only the last delimiter + // we want a delimiter at the end of the file to not create a new empty row even if it is the only + // character in the byte range using namespace cudf::io::text; auto host_input = std::string("ab:cd:"); auto delimiter = std::string(":"); auto source = make_source(host_input); - auto expected = strings_column_wrapper{""}; + auto expected = strings_column_wrapper{}; auto out = multibyte_split(*source, delimiter, byte_range_info{5, 1}); From 8ae57072ed675784beb84f8b2f76605c12c929c5 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 26 Oct 2022 09:58:56 +0000 Subject: [PATCH 06/10] fix OOB access when no delimiter was found --- cpp/src/io/text/multibyte_split.cu | 3 ++- cpp/tests/io/text/multibyte_split_test.cpp | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 465564856fe..0d699fc72fd 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -764,7 +764,8 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source bool const insert_begin = first_row_offset.value_or(0) == 0; // insert an offset at the end if we have not terminated the last row bool const insert_end = - not(last_row_offset.has_value() or global_offsets.back_element(stream) == chunk_offset); + not(last_row_offset.has_value() or + (global_offsets.size() > 0 and global_offsets.back_element(stream) == chunk_offset)); rmm::device_uvector offsets{ global_offsets.size() + insert_begin + insert_end, stream, mr}; if (insert_begin) { offsets.set_element_to_zero_async(0, stream); } diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 40cd8c47ee7..2da7073b334 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -62,6 +62,19 @@ TEST_F(MultibyteSplitTest, NondeterministicMatching) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } +TEST_F(MultibyteSplitTest, NoDelimiter) +{ + auto delimiter = std::string(":"); + auto host_input = std::string("abcdefg"); + + auto expected = strings_column_wrapper{"abcdefg"}; + + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiter); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +} + TEST_F(MultibyteSplitTest, DelimiterAtEnd) { auto delimiter = std::string(":"); From d99bfe599e4bf25d3dc5f2417210a83958e82dad Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 26 Oct 2022 09:59:36 +0000 Subject: [PATCH 07/10] fix python test --- python/cudf/cudf/tests/test_text.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 2c5c96781fe..627bf0a68bb 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -827,14 +827,20 @@ def test_read_text_byte_range(datadir): def test_read_text_byte_range_large(tmpdir): - content = str([["\n" if x % 5 == 0 else "x"] for x in range(0, 3000)]) - delimiter = "1." + content = "".join(("\n" if x % 5 == 4 else "x") for x in range(0, 3000)) + delimiter = "\n" temp_file = str(tmpdir) + "/temp.txt" with open(temp_file, "w") as f: f.write(content) - cudf.read_text(temp_file, delimiter=delimiter) + expected = cudf.Series(["xxxx\n" for i in range(0, 200)]) + + actual = cudf.read_text( + temp_file, delimiter=delimiter, byte_range=[1000, 1000] + ) + + assert_eq(expected, actual) def test_read_text_in_memory(datadir): From 01ba6f11243774db8c6aaeac4473ba4bfa557fcc Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 26 Oct 2022 10:03:21 +0000 Subject: [PATCH 08/10] ad missing docstring --- python/cudf/cudf/utils/ioutils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 0a0647f1297..f7eaef57942 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1215,6 +1215,8 @@ delimiter : string, default None The delimiter that should be used for splitting text chunks into separate cudf column rows. The delimiter may be one or more characters. +strip_delimiters : boolean, default False + If True, removes delimiters that are normally kept at the end of strings. byte_range : list or tuple, default None Byte range within the input file to be read. The first number is the offset in bytes, the second number is the range size in bytes. From 6c2a3b0a40c4f76b49209a1c1d711d126ca931f9 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 26 Oct 2022 22:49:56 +0200 Subject: [PATCH 09/10] Reword parse_options docstring Co-authored-by: Bradley Dice --- cpp/include/cudf/io/text/multibyte_split.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index 8524033d10c..a7edc9be0e4 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -39,7 +39,7 @@ struct parse_options { */ byte_range_info byte_range = create_byte_range_info_max(); /** - * @brief Should delimiters at the end of rows be stripped from the output column? + * @brief Whether delimiters at the end of rows should be stripped from the output column */ bool strip_delimiters = false; }; From ca5568ab4577ac856a5e91578a04a670fa7b3ab2 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 27 Oct 2022 15:11:17 +0200 Subject: [PATCH 10/10] Extend docstring --- python/cudf/cudf/utils/ioutils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index f7eaef57942..5298e470a91 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1216,7 +1216,12 @@ The delimiter that should be used for splitting text chunks into separate cudf column rows. The delimiter may be one or more characters. strip_delimiters : boolean, default False - If True, removes delimiters that are normally kept at the end of strings. + Unlike the `str.split()` function, `read_text` preserves the delimiter + at the end of a field in output by default, meaning `a;b;c` will turn into + `['a;','b;','c']` when using `;` as a delimiter. + Setting this option to `True` will strip these trailing delimiters, + leaving only the contents between delimiters in the resulting column: + `['a','b','c']` byte_range : list or tuple, default None Byte range within the input file to be read. The first number is the offset in bytes, the second number is the range size in bytes.