From 8d49db5bbd959d8b1ea28cbebf896e6e911716f5 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 27 Oct 2022 13:21:38 -0500 Subject: [PATCH 1/4] Ignore python docs build artifacts (#12000) This PR gitignores some of the python docs build artifcats that keep showing up in `git status` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12000 --- .gitignore | 2 ++ docs/cudf/source/user_guide/10min.ipynb | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index aaac92ff643..91a7ecc49f7 100644 --- a/.gitignore +++ b/.gitignore @@ -165,3 +165,5 @@ dask-worker-space/ # Sphinx docs & build artifacts docs/cudf/source/api_docs/generated/* docs/cudf/source/api_docs/api/* +docs/cudf/source/user_guide/example_output/* +docs/cudf/source/user_guide/cudf.*Dtype.*.rst diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index b9278151e64..ce6c55fe134 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -5474,7 +5474,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Writing to parquet files, using the CPU via PyArrow." + "Writing to parquet files with GPU-accelerated parquet writer" ] }, { @@ -5749,7 +5749,7 @@ } ], "source": [ - "ddf.to_parquet('example_files') " + "ddf.to_parquet('example_output/ddf_parquet_files')" ] }, { From b4ca89492b5e4bf8d54102a4082c2d362350e783 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 27 Oct 2022 20:26:22 +0200 Subject: [PATCH 2/4] Add `strip_delimiters` option to `read_text` (#11946) This adds a `strip_delimiters` post-processing option to `read_text`. I needed to implement some lightweight striping because a thread-per-row parallelization of the string gather gave pretty bad performance. For consistency, I also removed the special-case handling of delimiters at the end (previously adding an empty row), to match the read_csv behavior. Benchmark results: ``` benchmarks/MULTIBYTE_SPLIT_NVBENCH --axis size_approx[pow2]=30 --axis byte_range_percent=100 --axis T=device --axis delim_size=4 ``` ### [0] Tesla T4 | T | strip_delimiters | delim_percent | size_approx | CPU Time | Noise | Peak Memory Usage | Encoded file size | |--------|------------------|---------------|-------------------|------------|-------|-------------------|-------------------| | device | 0 | 1 | 2^30 = 1073741824 | 178.133 ms | 0.36% | 3.709 GiB | 1014.442 MiB | | device | 1 | 1 | 2^30 = 1073741824 | 188.328 ms | 0.31% | 4.690 GiB | 1014.442 MiB | | device | 0 | 25 | 2^30 = 1073741824 | 206.188 ms | 0.03% | 5.292 GiB | 953.075 MiB | | device | 1 | 25 | 2^30 = 1073741824 | 242.534 ms | 0.50% | 5.975 GiB | 953.075 MiB | Closes #11625 Authors: - Tobias Ribizel (https://github.com/upsj) Approvers: - David Wendt (https://github.com/davidwendt) - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/11946 --- cpp/benchmarks/io/text/multibyte_split.cpp | 5 +- cpp/include/cudf/io/text/multibyte_split.hpp | 30 +++++- cpp/src/io/text/multibyte_split.cu | 48 ++++++++-- cpp/tests/io/text/multibyte_split_test.cpp | 98 +++++++++++++++++++- python/cudf/cudf/_lib/cpp/io/text.pxd | 10 +- python/cudf/cudf/_lib/text.pyx | 22 +++-- python/cudf/cudf/io/text.py | 2 + python/cudf/cudf/tests/test_text.py | 24 ++++- python/cudf/cudf/utils/ioutils.py | 7 ++ 9 files changed, 212 insertions(+), 34 deletions(-) diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp index 380766fee46..c0e82b34623 100644 --- a/cpp/benchmarks/io/text/multibyte_split.cpp +++ b/cpp/benchmarks/io/text/multibyte_split.cpp @@ -121,6 +121,7 @@ static void bench_multibyte_split(nvbench::state& state, auto const delim_percent = state.get_int64("delim_percent"); auto const file_size_approx = state.get_int64("size_approx"); auto const byte_range_percent = state.get_int64("byte_range_percent"); + auto const strip_delimiters = bool(state.get_int64("strip_delimiters")); auto const byte_range_factor = static_cast(byte_range_percent) / 100; CUDF_EXPECTS(delim_percent >= 1, "delimiter percent must be at least 1"); @@ -182,12 +183,13 @@ static void bench_multibyte_split(nvbench::state& state, auto const range_size = static_cast(device_input.size() * byte_range_factor); auto const range_offset = (device_input.size() - range_size) / 2; cudf::io::text::byte_range_info range{range_offset, range_size}; + cudf::io::text::parse_options options{range, strip_delimiters}; std::unique_ptr output; state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { try_drop_l3_cache(); - output = cudf::io::text::multibyte_split(*source, delim, range); + output = cudf::io::text::multibyte_split(*source, delim, options); }); state.add_buffer_size(mem_stats_logger.peak_memory_usage(), "pmu", "Peak Memory Usage"); @@ -203,6 +205,7 @@ using source_type_list = nvbench::enum_type_list multibyte_split( data_chunk_source const& source, std::string const& delimiter, - std::optional byte_range = std::nullopt, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + parse_options options = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr multibyte_split( + data_chunk_source const& source, + std::string const& delimiter, + std::optional byte_range, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr multibyte_split(data_chunk_source const& source, std::string const& delimiter, diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 29cec0e8c3f..0d699fc72fd 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -21,13 +21,16 @@ #include #include +#include #include #include #include #include #include #include +#include #include +#include #include #include @@ -551,6 +554,7 @@ class output_builder { std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, byte_range_info byte_range, + bool strip_delimiters, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr, rmm::cuda_stream_pool& stream_pool) @@ -756,8 +760,12 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source auto chars = char_storage.gather(stream, mr); auto global_offsets = row_offset_storage.gather(stream, mr); - bool const insert_begin = *first_row_offset == 0; - bool const insert_end = not last_row_offset.has_value() or last_row_offset == chunk_offset; + // insert an offset at the beginning if we started at the beginning of the input + bool const insert_begin = first_row_offset.value_or(0) == 0; + // insert an offset at the end if we have not terminated the last row + bool const insert_end = + not(last_row_offset.has_value() or + (global_offsets.size() > 0 and global_offsets.back_element(stream) == chunk_offset)); rmm::device_uvector offsets{ global_offsets.size() + insert_begin + insert_end, stream, mr}; if (insert_begin) { offsets.set_element_to_zero_async(0, stream); } @@ -771,10 +779,27 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source [baseline = *first_row_offset] __device__(byte_offset global_offset) { return static_cast(global_offset - baseline); }); - auto string_count = offsets.size() - 1; - - return cudf::make_strings_column(string_count, std::move(offsets), std::move(chars)); + if (strip_delimiters) { + auto it = cudf::detail::make_counting_transform_iterator( + 0, + [ofs = offsets.data(), + chars = chars.data(), + delim_size = static_cast(delimiter.size()), + last_row = static_cast(string_count) - 1, + insert_end] __device__(size_type row) { + auto const begin = ofs[row]; + auto const len = ofs[row + 1] - begin; + if (row == last_row && insert_end) { + return thrust::make_pair(chars + begin, len); + } else { + return thrust::make_pair(chars + begin, std::max(0, len - delim_size)); + }; + }); + return cudf::strings::detail::make_strings_column(it, it + string_count, stream, mr); + } else { + return cudf::make_strings_column(string_count, std::move(offsets), std::move(chars)); + } } } // namespace detail @@ -783,12 +808,21 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source std::string const& delimiter, std::optional byte_range, rmm::mr::device_memory_resource* mr) +{ + return multibyte_split( + source, delimiter, parse_options{byte_range.value_or(create_byte_range_info_max())}, mr); +} + +std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, + std::string const& delimiter, + parse_options options, + rmm::mr::device_memory_resource* mr) { auto stream = cudf::get_default_stream(); auto stream_pool = rmm::cuda_stream_pool(2); auto result = detail::multibyte_split( - source, delimiter, byte_range.value_or(create_byte_range_info_max()), stream, mr, stream_pool); + source, delimiter, options.byte_range, options.strip_delimiters, stream, mr, stream_pool); return result; } @@ -797,7 +831,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source std::string const& delimiter, rmm::mr::device_memory_resource* mr) { - return multibyte_split(source, delimiter, std::nullopt, mr); + return multibyte_split(source, delimiter, parse_options{}, mr); } } // namespace text diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 43debf3d5b3..2da7073b334 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -62,12 +62,25 @@ TEST_F(MultibyteSplitTest, NondeterministicMatching) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } +TEST_F(MultibyteSplitTest, NoDelimiter) +{ + auto delimiter = std::string(":"); + auto host_input = std::string("abcdefg"); + + auto expected = strings_column_wrapper{"abcdefg"}; + + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiter); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +} + TEST_F(MultibyteSplitTest, DelimiterAtEnd) { auto delimiter = std::string(":"); auto host_input = std::string("abcdefg:"); - auto expected = strings_column_wrapper{"abcdefg:", ""}; + auto expected = strings_column_wrapper{"abcdefg:"}; auto source = cudf::io::text::make_source(host_input); auto out = cudf::io::text::multibyte_split(*source, delimiter); @@ -80,7 +93,7 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange) auto delimiter = std::string(":"); auto host_input = std::string("abcdefg:"); - auto expected = strings_column_wrapper{"abcdefg:", ""}; + auto expected = strings_column_wrapper{"abcdefg:"}; auto source = cudf::io::text::make_source(host_input); auto out = cudf::io::text::multibyte_split( @@ -91,6 +104,22 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } +TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange2) +{ + auto delimiter = std::string(":"); + auto host_input = std::string("abcdefg:"); + + auto expected = strings_column_wrapper{"abcdefg:"}; + + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split( + *source, + delimiter, + cudf::io::text::byte_range_info{0, static_cast(host_input.size() - 1)}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +} + TEST_F(MultibyteSplitTest, LargeInputSparse) { auto host_input = std::string(1024 * 1024 * 32, '.'); @@ -120,8 +149,6 @@ TEST_F(MultibyteSplitTest, LargeInput) host_expected.emplace_back(std::string("...:|")); } - host_expected.emplace_back(std::string("")); - auto expected = strings_column_wrapper{host_expected.begin(), host_expected.end()}; auto delimiter = std::string("...:|"); @@ -146,6 +173,52 @@ TEST_F(MultibyteSplitTest, OverlappingMatchErasure) // CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); // this use case it not yet supported. } +TEST_F(MultibyteSplitTest, DelimiterErasure) +{ + auto delimiter = "\r\n"; + + auto host_input = std::string("line\r\nanother line\r\nthird line\r\n"); + auto expected = strings_column_wrapper{"line", "another line", "third line"}; + + cudf::io::text::parse_options options; + options.strip_delimiters = true; + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiter, options); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +} + +TEST_F(MultibyteSplitTest, DelimiterErasureByteRange) +{ + auto delimiter = "\r\n"; + + auto host_input = std::string("line\r\nanother line\r\nthird line\r\n"); + auto expected = strings_column_wrapper{"line", "another line", "third line"}; + + cudf::io::text::parse_options options; + options.strip_delimiters = true; + options.byte_range = cudf::io::text::byte_range_info(0, host_input.size() - 1); + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiter, options); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +} + +TEST_F(MultibyteSplitTest, DelimiterErasureOverlap) +{ + auto delimiter = "::"; + + auto host_input = std::string("::a:::b::c::::d"); + auto expected = strings_column_wrapper{"", "a", "", "b", "c", "", "", "d"}; + + cudf::io::text::parse_options options; + options.strip_delimiters = true; + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiter, options); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +} + TEST_F(MultibyteSplitTest, HandpickedInput) { auto delimiters = "::|"; @@ -184,7 +257,7 @@ TEST_F(MultibyteSplitTest, HandpickedInput) "ggg::|", "hhh::|", "___::|", "here::|", "is::|", "another::|", "simple::|", "text::|", "seperated::|", "by::|", "emojis::|", "which::|", "are::|", "multiple::|", "bytes::|", "and::|", "used::|", "as::|", - "delimiters.::|", "::|", "::|", "::|", ""}; + "delimiters.::|", "::|", "::|", "::|"}; auto source = cudf::io::text::make_source(host_input); auto out = cudf::io::text::multibyte_split(*source, delimiters); @@ -359,6 +432,21 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRangesSingleByte) } } +TEST_F(MultibyteSplitTest, SingletonRangeAtEnd) +{ + // we want a delimiter at the end of the file to not create a new empty row even if it is the only + // character in the byte range + using namespace cudf::io::text; + auto host_input = std::string("ab:cd:"); + auto delimiter = std::string(":"); + auto source = make_source(host_input); + auto expected = strings_column_wrapper{}; + + auto out = multibyte_split(*source, delimiter, byte_range_info{5, 1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, debug_output_level::ALL_ERRORS); +} + TEST_F(MultibyteSplitTest, EmptyInput) { using namespace cudf::io::text; diff --git a/python/cudf/cudf/_lib/cpp/io/text.pxd b/python/cudf/cudf/_lib/cpp/io/text.pxd index 7bbe870dad3..368b014ea4b 100644 --- a/python/cudf/cudf/_lib/cpp/io/text.pxd +++ b/python/cudf/cudf/_lib/cpp/io/text.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport uint64_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -37,9 +38,12 @@ cdef extern from "cudf/io/text/data_chunk_source_factories.hpp" \ cdef extern from "cudf/io/text/multibyte_split.hpp" \ namespace "cudf::io::text" nogil: - unique_ptr[column] multibyte_split(data_chunk_source source, - string delimiter) except + + cdef cppclass parse_options: + byte_range_info byte_range + bool strip_delimiters + + parse_options() except + unique_ptr[column] multibyte_split(data_chunk_source source, string delimiter, - byte_range_info byte_range) except + + parse_options options) except + diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx index 31a5617af58..be11132497e 100644 --- a/python/cudf/cudf/_lib/text.pyx +++ b/python/cudf/cudf/_lib/text.pyx @@ -19,12 +19,14 @@ from cudf._lib.cpp.io.text cimport ( make_source_from_bgzip_file, make_source_from_file, multibyte_split, + parse_options, ) def read_text(object filepaths_or_buffers, object delimiter=None, object byte_range=None, + object strip_delimiters=False, object compression=None, object compression_offsets=None): """ @@ -44,6 +46,7 @@ def read_text(object filepaths_or_buffers, cdef byte_range_info c_byte_range cdef uint64_t c_compression_begin_offset cdef uint64_t c_compression_end_offset + cdef parse_options c_options if compression is None: if isinstance(filepaths_or_buffers, TextIOBase): @@ -71,19 +74,18 @@ def read_text(object filepaths_or_buffers, else: raise ValueError("Only bgzip compression is supported at the moment") - if (byte_range is None): - with nogil: - c_col = move(multibyte_split(dereference(datasource), delim)) - else: + c_options = parse_options() + if byte_range is not None: c_byte_range_offset = byte_range[0] c_byte_range_size = byte_range[1] - c_byte_range = byte_range_info( + c_options.byte_range = byte_range_info( c_byte_range_offset, c_byte_range_size) - with nogil: - c_col = move(multibyte_split( - dereference(datasource), - delim, - c_byte_range)) + c_options.strip_delimiters = strip_delimiters + with nogil: + c_col = move(multibyte_split( + dereference(datasource), + delim, + c_options)) return {None: Column.from_unique_ptr(move(c_col))} diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index 23983f01966..f341edbf6c1 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -14,6 +14,7 @@ def read_text( filepath_or_buffer, delimiter=None, byte_range=None, + strip_delimiters=False, compression=None, compression_offsets=None, **kwargs, @@ -35,6 +36,7 @@ def read_text( filepath_or_buffer, delimiter=delimiter, byte_range=byte_range, + strip_delimiters=strip_delimiters, compression=compression, compression_offsets=compression_offsets, ) diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 7f41d606473..627bf0a68bb 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -827,14 +827,20 @@ def test_read_text_byte_range(datadir): def test_read_text_byte_range_large(tmpdir): - content = str([["\n" if x % 5 == 0 else "x"] for x in range(0, 3000)]) - delimiter = "1." + content = "".join(("\n" if x % 5 == 4 else "x") for x in range(0, 3000)) + delimiter = "\n" temp_file = str(tmpdir) + "/temp.txt" with open(temp_file, "w") as f: f.write(content) - cudf.read_text(temp_file, delimiter=delimiter) + expected = cudf.Series(["xxxx\n" for i in range(0, 200)]) + + actual = cudf.read_text( + temp_file, delimiter=delimiter, byte_range=[1000, 1000] + ) + + assert_eq(expected, actual) def test_read_text_in_memory(datadir): @@ -847,6 +853,18 @@ def test_read_text_in_memory(datadir): assert_eq(expected, actual) +def test_read_text_in_memory_strip_delimiter(datadir): + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series(["x", "y", "z"]) + + actual = cudf.read_text( + StringIO("x::y::z"), delimiter="::", strip_delimiters=True + ) + + assert_eq(expected, actual) + + def test_read_text_bgzip(datadir): chess_file_compressed = str(datadir) + "/chess.pgn.gz" chess_file = str(datadir) + "/chess.pgn" diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 0a0647f1297..5298e470a91 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1215,6 +1215,13 @@ delimiter : string, default None The delimiter that should be used for splitting text chunks into separate cudf column rows. The delimiter may be one or more characters. +strip_delimiters : boolean, default False + Unlike the `str.split()` function, `read_text` preserves the delimiter + at the end of a field in output by default, meaning `a;b;c` will turn into + `['a;','b;','c']` when using `;` as a delimiter. + Setting this option to `True` will strip these trailing delimiters, + leaving only the contents between delimiters in the resulting column: + `['a','b','c']` byte_range : list or tuple, default None Byte range within the input file to be read. The first number is the offset in bytes, the second number is the range size in bytes. From 43eb7a07c8ed1afdf5b80d2912d6e8993e5262d9 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 27 Oct 2022 21:26:55 +0200 Subject: [PATCH 3/4] Refactor multibyte_split `output_builder` (#11945) This PR moves the `output_builder` and `split_device_span` classes out of `multibyte_split` and adds an iterator for the `split_device_span`, enabling it to be used directly in Thrust algorithms. I also included a fix from #11875 to make the integration easier once that is merged. Authors: - Tobias Ribizel (https://github.com/upsj) Approvers: - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/11945 --- cpp/src/io/text/multibyte_split.cu | 217 +------------ cpp/src/io/utilities/output_builder.cuh | 357 +++++++++++++++++++++ cpp/tests/io/text/multibyte_split_test.cpp | 64 ++++ 3 files changed, 423 insertions(+), 215 deletions(-) create mode 100644 cpp/src/io/utilities/output_builder.cuh diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 0d699fc72fd..1177be6b63f 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -14,10 +14,7 @@ * limitations under the License. */ -// Can be removed once we use Thrust 1.16+ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wpragmas" -#pragma GCC diagnostic ignored "-Wsizeof-array-div" +#include #include #include @@ -48,54 +45,12 @@ #include #include -#pragma GCC diagnostic pop - #include #include #include #include #include -namespace cudf { - -/** - * @brief A device span consisting of two separate device_spans acting as if they were part of a - * single span. The first head.size() entries are served from the first span, the remaining - * tail.size() entries are served from the second span. - * - * @tparam T The type of elements in the span. - */ -template -class split_device_span { - public: - explicit constexpr split_device_span(device_span head, device_span tail = {}) - : _head{head}, _tail{tail} - { - } - - [[nodiscard]] constexpr T& operator[](size_type i) - { - return i < _head.size() ? _head[i] : _tail[i - _head.size()]; - } - - [[nodiscard]] constexpr const T& operator[](size_type i) const - { - return i < _head.size() ? _head[i] : _tail[i - _head.size()]; - } - - [[nodiscard]] constexpr size_type size() const { return _head.size() + _tail.size(); } - - [[nodiscard]] constexpr device_span head() const { return _head; } - - [[nodiscard]] constexpr device_span tail() const { return _tail; } - - private: - device_span _head; - device_span _tail; -}; - -} // namespace cudf - namespace { using cudf::io::text::detail::multistate; @@ -385,172 +340,6 @@ std::vector get_streams(int32_t count, rmm::cuda_stream_p return streams; } -/** - * @brief A chunked storage class that provides preallocated memory for algorithms with known - * worst-case output size. It provides functionality to retrieve the next chunk to write to, for - * reporting how much memory was actually written and for gathering all previously written outputs - * into a single contiguous vector. - * - * @tparam T The output element type. - */ -template -class output_builder { - public: - using size_type = typename rmm::device_uvector::size_type; - - /** - * @brief Initializes an output builder with given worst-case output size and stream. - * - * @param max_write_size the maximum number of elements that will be written into a - * split_device_span returned from `next_output`. - * @param stream the stream used to allocate the first chunk of memory. - * @param mr optional, the memory resource to use for allocation. - */ - output_builder(size_type max_write_size, - size_type max_growth, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : _size{0}, _max_write_size{max_write_size}, _max_growth{max_growth} - { - CUDF_EXPECTS(max_write_size > 0, "Internal error"); - _chunks.emplace_back(0, stream, mr); - _chunks.back().reserve(max_write_size * 2, stream); - } - - output_builder(output_builder&&) = delete; - output_builder(const output_builder&) = delete; - output_builder& operator=(output_builder&&) = delete; - output_builder& operator=(const output_builder&) = delete; - - /** - * @brief Returns the next free chunk of `max_write_size` elements from the underlying storage. - * Must be followed by a call to `advance_output` after the memory has been written to. - * - * @param stream The stream to allocate a new chunk of memory with, if necessary. - * This should be the stream that will write to the `split_device_span`. - * @return A `split_device_span` starting directly after the last output and providing at least - * `max_write_size` entries of storage. - */ - [[nodiscard]] split_device_span next_output(rmm::cuda_stream_view stream) - { - auto head_it = _chunks.end() - (_chunks.size() > 1 and _chunks.back().is_empty() ? 2 : 1); - auto head_span = get_free_span(*head_it); - if (head_span.size() >= _max_write_size) { return split_device_span{head_span}; } - if (head_it == _chunks.end() - 1) { - // insert a new vector of double size - auto const next_chunk_size = - std::min(_max_growth * _max_write_size, 2 * _chunks.back().capacity()); - _chunks.emplace_back(0, stream, _chunks.back().memory_resource()); - _chunks.back().reserve(next_chunk_size, stream); - } - auto tail_span = get_free_span(_chunks.back()); - CUDF_EXPECTS(head_span.size() + tail_span.size() >= _max_write_size, "Internal error"); - return split_device_span{head_span, tail_span}; - } - - /** - * @brief Advances the output sizes after a `split_device_span` returned from `next_output` was - * written to. - * - * @param actual_size The number of elements that were written to the result of the previous - * `next_output` call. - */ - void advance_output(size_type actual_size, rmm::cuda_stream_view stream) - { - CUDF_EXPECTS(actual_size <= _max_write_size, "Internal error"); - if (_chunks.size() < 2) { - auto const new_size = _chunks.back().size() + actual_size; - inplace_resize(_chunks.back(), new_size, stream); - } else { - auto& tail = _chunks.back(); - auto& prev = _chunks.rbegin()[1]; - auto const prev_advance = std::min(actual_size, prev.capacity() - prev.size()); - auto const tail_advance = actual_size - prev_advance; - inplace_resize(prev, prev.size() + prev_advance, stream); - inplace_resize(tail, tail.size() + tail_advance, stream); - } - _size += actual_size; - } - - /** - * @brief Returns the first element that was written to the output. - * Requires a previous call to `next_output` and `advance_output` and `size() > 0`. - * @param stream The stream used to access the element. - * @return The first element that was written to the output. - */ - [[nodiscard]] T front_element(rmm::cuda_stream_view stream) const - { - return _chunks.front().front_element(stream); - } - - /** - * @brief Returns the last element that was written to the output. - * Requires a previous call to `next_output` and `advance_output` and `size() > 0`. - * @param stream The stream used to access the element. - * @return The last element that was written to the output. - */ - [[nodiscard]] T back_element(rmm::cuda_stream_view stream) const - { - auto const& last_nonempty_chunk = - _chunks.size() > 1 and _chunks.back().is_empty() ? _chunks.rbegin()[1] : _chunks.back(); - return last_nonempty_chunk.back_element(stream); - } - - [[nodiscard]] size_type size() const { return _size; } - - /** - * @brief Gathers all previously written outputs into a single contiguous vector. - * - * @param stream The stream used to allocate and gather the output vector. All previous write - * operations to the output buffer must have finished or happened on this stream. - * @param mr The memory resource used to allocate the output vector. - * @return The output vector. - */ - rmm::device_uvector gather(rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - rmm::device_uvector output{size(), stream, mr}; - auto output_it = output.begin(); - for (auto const& chunk : _chunks) { - output_it = thrust::copy( - rmm::exec_policy_nosync(stream), chunk.begin(), chunk.begin() + chunk.size(), output_it); - } - return output; - } - - private: - /** - * @brief Resizes a vector without reallocating - * - * @param vector The vector - * @param new_size The new size. Must be smaller than the vector's capacity - */ - static void inplace_resize(rmm::device_uvector& vector, - size_type new_size, - rmm::cuda_stream_view stream) - { - CUDF_EXPECTS(new_size <= vector.capacity(), "Internal error"); - vector.resize(new_size, stream); - } - - /** - * @brief Returns the span consisting of all currently unused elements in the vector - * (`i >= size() and i < capacity()`). - * - * @param vector The vector. - * @return The span of unused elements. - */ - static device_span get_free_span(rmm::device_uvector& vector) - { - return device_span{vector.data() + vector.size(), vector.capacity() - vector.size()}; - } - - size_type _size; - size_type _max_write_size; - size_type _max_growth; - std::vector> _chunks; -}; - std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, byte_range_info byte_range, @@ -732,9 +521,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source chunk->data() + std::min(sentinel - chunk_offset, chunk->size()); auto const output_size = end - begin; auto char_output = char_storage.next_output(scan_stream); - auto const split = begin + std::min(output_size, char_output.head().size()); - thrust::copy(rmm::exec_policy_nosync(scan_stream), begin, split, char_output.head().begin()); - thrust::copy(rmm::exec_policy_nosync(scan_stream), split, end, char_output.tail().begin()); + thrust::copy(rmm::exec_policy_nosync(scan_stream), begin, end, char_output.begin()); char_storage.advance_output(output_size, scan_stream); } diff --git a/cpp/src/io/utilities/output_builder.cuh b/cpp/src/io/utilities/output_builder.cuh new file mode 100644 index 00000000000..e45143480fc --- /dev/null +++ b/cpp/src/io/utilities/output_builder.cuh @@ -0,0 +1,357 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include + +#include + +#include + +namespace cudf { + +template +class split_device_span_iterator; + +/** + * @brief A device span consisting of two separate device_spans acting as if they were part of a + * single span. The first head.size() entries are served from the first span, the remaining + * tail.size() entries are served from the second span. + * + * @tparam T The type of elements in the span. + */ +template +class split_device_span { + public: + using element_type = T; + using value_type = std::remove_cv; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using pointer = T*; + using iterator = split_device_span_iterator; + using const_pointer = T const*; + using reference = T&; + using const_reference = T const&; + + split_device_span() = default; + + explicit constexpr split_device_span(device_span head, device_span tail = {}) + : _head{head}, _tail{tail} + { + } + + [[nodiscard]] constexpr reference operator[](size_type i) const + { + return i < _head.size() ? _head[i] : _tail[i - _head.size()]; + } + + [[nodiscard]] constexpr size_type size() const { return _head.size() + _tail.size(); } + + [[nodiscard]] constexpr device_span head() const { return _head; } + + [[nodiscard]] constexpr device_span tail() const { return _tail; } + + [[nodiscard]] constexpr iterator begin() const; + + [[nodiscard]] constexpr iterator end() const; + + private: + device_span _head; + device_span _tail; +}; + +/** + * @brief A random access iterator indexing into a split_device_span. + * + * @tparam T The type of elements in the underlying span. + */ +template +class split_device_span_iterator { + using it = split_device_span_iterator; + + public: + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using value_type = T; + using pointer = value_type*; + using reference = value_type&; + using iterator_category = std::random_access_iterator_tag; + + split_device_span_iterator() = default; + + constexpr split_device_span_iterator(split_device_span span, size_type offset) + : _span{span}, _offset{offset} + { + } + + [[nodiscard]] constexpr reference operator*() const { return _span[_offset]; } + + [[nodiscard]] constexpr reference operator[](size_type i) const { return _span[_offset + i]; } + + [[nodiscard]] constexpr friend bool operator==(const it& lhs, const it& rhs) + { + return lhs._offset == rhs._offset; + } + + [[nodiscard]] constexpr friend bool operator!=(const it& lhs, const it& rhs) + { + return !(lhs == rhs); + } + [[nodiscard]] constexpr friend bool operator<(const it& lhs, const it& rhs) + { + return lhs._offset < rhs._offset; + } + + [[nodiscard]] constexpr friend bool operator>=(const it& lhs, const it& rhs) + { + return !(lhs < rhs); + } + + [[nodiscard]] constexpr friend bool operator>(const it& lhs, const it& rhs) { return rhs < lhs; } + + [[nodiscard]] constexpr friend bool operator<=(const it& lhs, const it& rhs) + { + return !(lhs > rhs); + } + + [[nodiscard]] constexpr friend difference_type operator-(const it& lhs, const it& rhs) + { + return lhs._offset - rhs._offset; + } + + [[nodiscard]] constexpr friend it operator+(it lhs, difference_type i) { return lhs += i; } + + constexpr it& operator+=(difference_type i) + { + _offset += i; + return *this; + } + + constexpr it& operator-=(difference_type i) { return *this += -i; } + + constexpr it& operator++() { return *this += 1; } + + constexpr it& operator--() { return *this -= 1; } + + constexpr it operator++(int) + { + auto result = *this; + ++*this; + return result; + } + + constexpr it operator--(int) + { + auto result = *this; + --*this; + return result; + } + + private: + split_device_span _span; + size_type _offset; +}; + +template +[[nodiscard]] constexpr split_device_span_iterator split_device_span::begin() const +{ + return {*this, 0}; +} + +template +[[nodiscard]] constexpr split_device_span_iterator split_device_span::end() const +{ + return {*this, size()}; +} + +/** + * @brief A chunked storage class that provides preallocated memory for algorithms with known + * worst-case output size. It provides functionality to retrieve the next chunk to write to, for + * reporting how much memory was actually written and for gathering all previously written outputs + * into a single contiguous vector. + * + * @tparam T The output element type. + */ +template +class output_builder { + public: + using size_type = typename rmm::device_uvector::size_type; + + /** + * @brief Initializes an output builder with given worst-case output size and stream. + * + * @param max_write_size the maximum number of elements that will be written into a + * split_device_span returned from `next_output`. + * @param stream the stream used to allocate the first chunk of memory. + * @param mr optional, the memory resource to use for allocation. + */ + output_builder(size_type max_write_size, + size_type max_growth, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + : _size{0}, _max_write_size{max_write_size}, _max_growth{max_growth} + { + CUDF_EXPECTS(max_write_size > 0, "Internal error"); + _chunks.emplace_back(0, stream, mr); + _chunks.back().reserve(max_write_size * 2, stream); + } + + output_builder(output_builder&&) = delete; + output_builder(const output_builder&) = delete; + output_builder& operator=(output_builder&&) = delete; + output_builder& operator=(const output_builder&) = delete; + + /** + * @brief Returns the next free chunk of `max_write_size` elements from the underlying storage. + * Must be followed by a call to `advance_output` after the memory has been written to. + * + * @param stream The stream to allocate a new chunk of memory with, if necessary. + * This should be the stream that will write to the `split_device_span`. + * @return A `split_device_span` starting directly after the last output and providing at least + * `max_write_size` entries of storage. + */ + [[nodiscard]] split_device_span next_output(rmm::cuda_stream_view stream) + { + auto head_it = _chunks.end() - (_chunks.size() > 1 and _chunks.back().is_empty() ? 2 : 1); + auto head_span = get_free_span(*head_it); + if (head_span.size() >= _max_write_size) { return split_device_span{head_span}; } + if (head_it == _chunks.end() - 1) { + // insert a new device_uvector of double size + auto const next_chunk_size = + std::min(_max_growth * _max_write_size, 2 * _chunks.back().capacity()); + _chunks.emplace_back(0, stream, _chunks.back().memory_resource()); + _chunks.back().reserve(next_chunk_size, stream); + } + auto tail_span = get_free_span(_chunks.back()); + CUDF_EXPECTS(head_span.size() + tail_span.size() >= _max_write_size, "Internal error"); + return split_device_span{head_span, tail_span}; + } + + /** + * @brief Advances the output sizes after a `split_device_span` returned from `next_output` was + * written to. + * + * @param actual_size The number of elements that were written to the result of the previous + * `next_output` call. + * @param stream The stream on which to resize the vectors. Since this function will not + * reallocate, this only changes the stream of the internally stored vectors, + * impacting their subsequent copy and destruction behavior. + */ + void advance_output(size_type actual_size, rmm::cuda_stream_view stream) + { + CUDF_EXPECTS(actual_size <= _max_write_size, "Internal error"); + if (_chunks.size() < 2) { + auto const new_size = _chunks.back().size() + actual_size; + inplace_resize(_chunks.back(), new_size, stream); + } else { + auto& tail = _chunks.back(); + auto& prev = _chunks.rbegin()[1]; + auto const prev_advance = std::min(actual_size, prev.capacity() - prev.size()); + auto const tail_advance = actual_size - prev_advance; + inplace_resize(prev, prev.size() + prev_advance, stream); + inplace_resize(tail, tail.size() + tail_advance, stream); + } + _size += actual_size; + } + + /** + * @brief Returns the first element that was written to the output. + * Requires a previous call to `next_output` and `advance_output` and `size() > 0`. + * @param stream The stream used to access the element. + * @return The first element that was written to the output. + */ + [[nodiscard]] T front_element(rmm::cuda_stream_view stream) const + { + return _chunks.front().front_element(stream); + } + + /** + * @brief Returns the last element that was written to the output. + * Requires a previous call to `next_output` and `advance_output` and `size() > 0`. + * @param stream The stream used to access the element. + * @return The last element that was written to the output. + */ + [[nodiscard]] T back_element(rmm::cuda_stream_view stream) const + { + auto const& last_nonempty_chunk = + _chunks.size() > 1 and _chunks.back().is_empty() ? _chunks.rbegin()[1] : _chunks.back(); + return last_nonempty_chunk.back_element(stream); + } + + [[nodiscard]] size_type size() const { return _size; } + + /** + * @brief Gathers all previously written outputs into a single contiguous vector. + * + * @param stream The stream used to allocate and gather the output vector. All previous write + * operations to the output buffer must have finished or happened on this stream. + * @param mr The memory resource used to allocate the output vector. + * @return The output vector. + */ + rmm::device_uvector gather(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + rmm::device_uvector output{size(), stream, mr}; + auto output_it = output.begin(); + for (auto const& chunk : _chunks) { + output_it = thrust::copy( + rmm::exec_policy_nosync(stream), chunk.begin(), chunk.begin() + chunk.size(), output_it); + } + return output; + } + + private: + /** + * @brief Resizes a vector without reallocating + * + * @param vector The vector + * @param new_size The new size. Must be smaller than the vector's capacity + * @param stream The stream on which to resize the vector. Since this function will not + * reallocate, this only changes the stream of `vector`, impacting its subsequent + * copy and destruction behavior. + */ + static void inplace_resize(rmm::device_uvector& vector, + size_type new_size, + rmm::cuda_stream_view stream) + { + CUDF_EXPECTS(new_size <= vector.capacity(), "Internal error"); + vector.resize(new_size, stream); + } + + /** + * @brief Returns the span consisting of all currently unused elements in the vector + * (`i >= size() and i < capacity()`). + * + * @param vector The vector. + * @return The span of unused elements. + */ + static device_span get_free_span(rmm::device_uvector& vector) + { + return device_span{vector.data() + vector.size(), vector.capacity() - vector.size()}; + } + + size_type _size; + size_type _max_write_size; + size_type _max_growth; + std::vector> _chunks; +}; + +} // namespace cudf diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 2da7073b334..2783b006982 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include #include @@ -26,6 +28,7 @@ #include #include #include +#include using namespace cudf; using namespace test; @@ -499,4 +502,65 @@ TEST_F(MultibyteSplitTest, EmptyRangeSingleByte) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, debug_output_level::ALL_ERRORS); } +TEST_F(MultibyteSplitTest, EmptySplitDeviceSpan) +{ + cudf::split_device_span span; + ASSERT_EQ(span.size(), 0); + ASSERT_EQ(span.head().size(), 0); + ASSERT_EQ(span.head().data(), nullptr); + ASSERT_EQ(span.tail().size(), 0); + ASSERT_EQ(span.tail().data(), nullptr); +} + +TEST_F(MultibyteSplitTest, SplitDeviceSpan) +{ + int i = 0; + int j = 1; + cudf::split_device_span span{{&i, 1}, {&j, 1}}; + ASSERT_EQ(span.size(), 2); + ASSERT_EQ(span.head().size(), 1); + ASSERT_EQ(span.head().data(), &i); + ASSERT_EQ(span.tail().size(), 1); + ASSERT_EQ(span.tail().data(), &j); + ASSERT_EQ(&span[0], &i); + ASSERT_EQ(&span[1], &j); + ASSERT_EQ(&*span.begin(), &i); + ASSERT_EQ(&*(span.begin() + 1), &j); + ASSERT_NE(span.begin() + 1, span.end()); + ASSERT_EQ(span.begin() + 2, span.end()); +} + +TEST_F(MultibyteSplitTest, OutputBuilder) +{ + auto const stream = cudf::get_default_stream(); + cudf::output_builder builder{10, 4, stream}; + auto const output = builder.next_output(stream); + ASSERT_GE(output.size(), 10); + ASSERT_EQ(output.tail().size(), 0); + ASSERT_EQ(output.tail().data(), nullptr); + ASSERT_EQ(builder.size(), 0); + builder.advance_output(1, stream); + ASSERT_EQ(builder.size(), 1); + auto const output2 = builder.next_output(stream); + ASSERT_EQ(output2.head().data(), output.head().data() + 1); + builder.advance_output(10, stream); + ASSERT_EQ(builder.size(), 11); + auto const output3 = builder.next_output(stream); + ASSERT_EQ(output3.head().size(), 9); + ASSERT_EQ(output3.head().data(), output.head().data() + 11); + ASSERT_EQ(output3.tail().size(), 40); + builder.advance_output(9, stream); + ASSERT_EQ(builder.size(), 20); + auto const output4 = builder.next_output(stream); + ASSERT_EQ(output4.head().size(), 0); + ASSERT_EQ(output4.tail().size(), output3.tail().size()); + ASSERT_EQ(output4.tail().data(), output3.tail().data()); + builder.advance_output(1, stream); + auto const output5 = builder.next_output(stream); + ASSERT_EQ(output5.head().size(), 39); + ASSERT_EQ(output5.head().data(), output4.tail().data() + 1); + ASSERT_EQ(output5.tail().size(), 0); + ASSERT_EQ(output5.tail().data(), nullptr); +} + CUDF_TEST_PROGRAM_MAIN() From bac20048c488827747ba7ba9c596af9f38aceff7 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 27 Oct 2022 15:03:48 -0500 Subject: [PATCH 4/4] Add pivot_table and crosstab to docs. (#12014) This PR resolves #12012 by adding `cudf.pivot_table` and `cudf.crosstab` to the documentation. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/12014 --- docs/cudf/source/api_docs/general_functions.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/cudf/source/api_docs/general_functions.rst b/docs/cudf/source/api_docs/general_functions.rst index 272d95e84bc..40e1b766dc9 100644 --- a/docs/cudf/source/api_docs/general_functions.rst +++ b/docs/cudf/source/api_docs/general_functions.rst @@ -14,6 +14,8 @@ Data manipulations cudf.get_dummies cudf.melt cudf.pivot + cudf.pivot_table + cudf.crosstab cudf.unstack Top-level conversions