# Add BGZIP multibyte_split benchmark (#11723)
This refactors #11652 to extract the BGZIP I/O utilities and adds another `source_type` to the `multibyte_split` benchmark that creates a compressed input file using `zlib`.

A quick benchmark shows the BGZIP source is around 2.5x slower than reading from a device buffer, at a compression ratio of around 1:5.

### [0] Tesla T4

| source_type | delim_size | delim_percent | size_approx       | byte_range_percent | Time       | Peak Memory Usage | Encoded file size |
|-------------|------------|---------------|-------------------|--------------------|------------|-------------------|-------------------|
| bgzip       | 1          | 1             | 2^30 = 1073741824 | 100                | 507.479 ms | 4.022 GiB         | 1006.638 MiB      |
| file        | 1          | 1             | 2^30 = 1073741824 | 100                | 339.860 ms | 3.947 GiB         | 1006.638 MiB      |
| device      | 1          | 1             | 2^30 = 1073741824 | 100                | 201.556 ms | 3.947 GiB         | 1006.638 MiB      |
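For orientation, here is a minimal sketch (not part of this PR) of the path the new source type exercises. It assumes the `make_source_from_bgzip_file` factory and the `multibyte_split(source, delimiter)` overload used elsewhere in this diff; `split_bgzip_file` is a hypothetical helper name.

```cpp
// Hypothetical helper: write `text` as a BGZIP file, then split it on newlines
// through the new data chunk source.
#include <cudf/column/column.hpp>
#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/detail/bgzip_utils.hpp>
#include <cudf/io/text/multibyte_split.hpp>

#include <fstream>
#include <memory>
#include <string>

std::unique_ptr<cudf::column> split_bgzip_file(std::string const& path, std::string const& text)
{
  {
    std::ofstream output_stream(path, std::ofstream::out);
    // a single compressed block, then the empty block that marks end-of-file
    cudf::io::text::detail::bgzip::write_compressed_block(output_stream,
                                                          {text.data(), text.size()});
    cudf::io::text::detail::bgzip::write_uncompressed_block(output_stream, {});
  }
  auto const source = cudf::io::text::make_source_from_bgzip_file(path);
  return cudf::io::text::multibyte_split(*source, "\n");
}
```

The benchmark's `file_bgzip` case does essentially this, but chunks the input into blocks of roughly 64 KiB so each block stays below the 16-bit BGZIP block size limit.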

Authors:
  - Tobias Ribizel (https://github.com/upsj)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Bradley Dice (https://github.com/bdice)
  - Jordan Jacobelli (https://github.com/Ethyling)

URL: #11723
upsj authored Oct 10, 2022
1 parent fc5b675 commit 4eb9c6c
Showing 9 changed files with 473 additions and 185 deletions.
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
@@ -152,6 +152,7 @@ outputs:
- test -f $PREFIX/include/cudf/io/text/byte_range_info.hpp
- test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp
- test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp
- test -f $PREFIX/include/cudf/io/text/detail/bgzip_utils.hpp
- test -f $PREFIX/include/cudf/io/text/detail/multistate.hpp
- test -f $PREFIX/include/cudf/io/text/detail/tile_state.hpp
- test -f $PREFIX/include/cudf/io/text/detail/trie.hpp
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -356,6 +356,7 @@ add_library(
src/io/text/byte_range_info.cpp
src/io/text/data_chunk_source_factories.cpp
src/io/text/bgzip_data_chunk_source.cu
src/io/text/bgzip_utils.cpp
src/io/text/multibyte_split.cu
src/io/utilities/column_buffer.cpp
src/io/utilities/config_utils.cpp
3 changes: 2 additions & 1 deletion cpp/benchmarks/CMakeLists.txt
@@ -301,7 +301,8 @@ ConfigureNVBench(NESTED_JSON_NVBENCH io/json/nested_json.cpp)

# ##################################################################################################
# * io benchmark ---------------------------------------------------------------------
ConfigureNVBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split.cpp)
ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp)
target_link_libraries(MULTIBYTE_SPLIT_NVBENCH PRIVATE ZLIB::ZLIB)

add_custom_target(
run_benchmarks
70 changes: 57 additions & 13 deletions cpp/benchmarks/io/text/multibyte_split.cpp
@@ -25,6 +25,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/detail/bgzip_utils.hpp>
#include <cudf/io/text/multibyte_split.hpp>
#include <cudf/scalar/scalar_factories.hpp>
#include <cudf/strings/combine.hpp>
@@ -40,10 +41,25 @@
#include <cstdio>
#include <fstream>
#include <memory>
#include <random>

temp_directory const temp_dir("cudf_nvbench");

enum class data_chunk_source_type { device, file, host, host_pinned };
enum class data_chunk_source_type { device, file, host, host_pinned, file_bgzip };

NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
data_chunk_source_type,
[](auto value) {
switch (value) {
case data_chunk_source_type::device: return "device";
case data_chunk_source_type::file: return "file";
case data_chunk_source_type::host: return "host";
case data_chunk_source_type::host_pinned: return "host_pinned";
case data_chunk_source_type::file_bgzip: return "file_bgzip";
default: return "Unknown";
}
},
[](auto) { return std::string{}; })

static cudf::string_scalar create_random_input(int32_t num_chars,
double delim_factor,
Expand Down Expand Up @@ -78,14 +94,32 @@ static cudf::string_scalar create_random_input(int32_t num_chars,
return cudf::string_scalar(std::move(*chars_buffer));
}

static void bench_multibyte_split(nvbench::state& state)
static void write_bgzip_file(cudf::host_span<char const> host_data, std::ostream& output_stream)
{
// a bit of variability with a decent amount of padding so we don't overflow 16 bit block sizes
std::uniform_int_distribution<std::size_t> chunk_size_dist{64000, 65000};
std::default_random_engine rng{};
std::size_t pos = 0;
while (pos < host_data.size()) {
auto const remainder = host_data.size() - pos;
auto const chunk_size = std::min(remainder, chunk_size_dist(rng));
cudf::io::text::detail::bgzip::write_compressed_block(output_stream,
{host_data.data() + pos, chunk_size});
pos += chunk_size;
}
// empty block denotes EOF
cudf::io::text::detail::bgzip::write_uncompressed_block(output_stream, {});
}

template <data_chunk_source_type source_type>
static void bench_multibyte_split(nvbench::state& state,
nvbench::type_list<nvbench::enum_type<source_type>>)
{
cudf::rmm_pool_raii pool_raii;

auto const source_type = static_cast<data_chunk_source_type>(state.get_int64("source_type"));
auto const delim_size = state.get_int64("delim_size");
auto const delim_percent = state.get_int64("delim_percent");
auto const file_size_approx = state.get_int64("size_approx");
auto const delim_size = state.get_int64("delim_size");
auto const delim_percent = state.get_int64("delim_percent");
auto const file_size_approx = state.get_int64("size_approx");
auto const byte_range_percent = state.get_int64("byte_range_percent");

auto const byte_range_factor = static_cast<double>(byte_range_percent) / 100;
@@ -104,7 +138,8 @@ static void bench_multibyte_split(nvbench::state& state)
auto host_pinned_input =
thrust::host_vector<char, thrust::system::cuda::experimental::pinned_allocator<char>>{};

if (source_type == data_chunk_source_type::host || source_type == data_chunk_source_type::file) {
if (source_type == data_chunk_source_type::host || source_type == data_chunk_source_type::file ||
source_type == data_chunk_source_type::file_bgzip) {
host_input = cudf::detail::make_std_vector_sync<char>(
{device_input.data(), static_cast<std::size_t>(device_input.size())},
cudf::default_stream_value);
@@ -131,6 +166,14 @@
return cudf::io::text::make_source(host_pinned_input);
case data_chunk_source_type::device: //
return cudf::io::text::make_source(device_input);
case data_chunk_source_type::file_bgzip: {
auto const temp_file_name = random_file_in_dir(temp_dir.path());
{
std::ofstream output_stream(temp_file_name, std::ofstream::out);
write_bgzip_file(host_input, output_stream);
}
return cudf::io::text::make_source_from_bgzip_file(temp_file_name);
}
default: CUDF_FAIL();
}
}();
@@ -152,13 +195,14 @@ static void bench_multibyte_split(nvbench::state& state)
state.add_buffer_size(range_size, "efs", "Encoded file size");
}

NVBENCH_BENCH(bench_multibyte_split)
using source_type_list = nvbench::enum_type_list<data_chunk_source_type::device,
data_chunk_source_type::file,
data_chunk_source_type::host,
data_chunk_source_type::host_pinned,
data_chunk_source_type::file_bgzip>;

NVBENCH_BENCH_TYPES(bench_multibyte_split, NVBENCH_TYPE_AXES(source_type_list))
.set_name("multibyte_split")
.add_int64_axis("source_type",
{static_cast<int>(data_chunk_source_type::device),
static_cast<int>(data_chunk_source_type::file),
static_cast<int>(data_chunk_source_type::host),
static_cast<int>(data_chunk_source_type::host_pinned)})
.add_int64_axis("delim_size", {1, 4, 7})
.add_int64_axis("delim_percent", {1, 25})
.add_int64_power_of_two_axis("size_approx", {15, 30})
112 changes: 112 additions & 0 deletions cpp/include/cudf/io/text/detail/bgzip_utils.hpp
@@ -0,0 +1,112 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <zlib.h>

#include <cudf/utilities/error.hpp>
#include <cudf/utilities/span.hpp>

#include <algorithm>
#include <array>
#include <fstream>
#include <limits>

namespace cudf::io::text::detail::bgzip {

struct header {
int block_size;
int extra_length;
[[nodiscard]] int data_size() const { return block_size - extra_length - 20; }
};

struct footer {
uint32_t crc;
uint32_t decompressed_size;
};

/**
* @brief Reads the full BGZIP header from the given input stream. Afterwards, the stream position
* is at the first data byte.
*
* @param input_stream The input stream
* @return The header storing the compressed size and extra subfield length
*/
header read_header(std::istream& input_stream);

/**
* @brief Reads the full BGZIP footer from the given input stream. Afterwards, the stream position
* is after the last footer byte.
*
* @param input_stream The input stream
* @return The footer storing uncompressed size and CRC32
*/
footer read_footer(std::istream& input_stream);

/**
* @brief Writes a header for data of the given compressed size to the given stream.
*
* @param output_stream The output stream
* @param compressed_size The size of the compressed data
* @param pre_size_subfields Any GZIP extra subfields (need to be valid) to be placed before the
* BGZIP block size subfield
* @param post_size_subfields Any subfields to be placed after the BGZIP block size subfield
*/
void write_header(std::ostream& output_stream,
uint16_t compressed_size,
host_span<char const> pre_size_subfields,
host_span<char const> post_size_subfields);

/**
* @brief Writes a footer for the given uncompressed data to the given stream.
*
* @param output_stream The output stream
* @param data The data for which uncompressed size and CRC32 will be computed and written
*/
void write_footer(std::ostream& output_stream, host_span<char const> data);

/**
* @brief Writes the given data to the given stream as an uncompressed deflate block with BGZIP
* header and footer.
*
* @param output_stream The output stream
* @param data The uncompressed data
* @param pre_size_subfields Any GZIP extra subfields (need to be valid) to be placed before the
* BGZIP block size subfield
* @param post_size_subfields Any subfields to be placed after the BGZIP block size subfield
*/
void write_uncompressed_block(std::ostream& output_stream,
host_span<char const> data,
host_span<char const> pre_size_subfields = {},
host_span<char const> post_size_subfields = {});

/**
* @brief Writes the given data to the given stream as a compressed deflate block with BGZIP
* header and footer.
*
* @param output_stream The output stream
* @param data The uncompressed data
* @param pre_size_subfields Any GZIP extra subfields (need to be valid) to be placed before the
* BGZIP block size subfield
* @param post_size_subfields Any subfields to be placed after the BGZIP block size subfield
*/
void write_compressed_block(std::ostream& output_stream,
host_span<char const> data,
host_span<char const> pre_size_subfields = {},
host_span<char const> post_size_subfields = {});

} // namespace cudf::io::text::detail::bgzip
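Below is a minimal usage sketch (not part of the commit) of the interface declared above. It relies only on the documented stream-position behavior of `read_header`/`read_footer`, and uses `std::stringstream` in place of a file stream.

```cpp
#include <cudf/io/text/detail/bgzip_utils.hpp>

#include <cassert>
#include <ios>
#include <sstream>
#include <string>

int main()
{
  namespace bgzip = cudf::io::text::detail::bgzip;
  std::string const payload = "chr1\t100\nchr1\t200\n";
  std::stringstream stream;

  // one compressed block followed by the empty block that terminates a BGZIP file
  bgzip::write_compressed_block(stream, {payload.data(), payload.size()});
  bgzip::write_uncompressed_block(stream, {});

  // read_header leaves the stream at the first data byte, so skipping
  // header.data_size() bytes lands exactly on the footer
  auto const header = bgzip::read_header(stream);
  stream.seekg(header.data_size(), std::ios_base::cur);
  auto const footer = bgzip::read_footer(stream);
  assert(footer.decompressed_size == payload.size());
  return 0;
}
```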
72 changes: 5 additions & 67 deletions cpp/src/io/text/bgzip_data_chunk_source.cu
@@ -20,6 +20,7 @@

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/detail/bgzip_utils.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>

@@ -36,7 +37,6 @@
#include <limits>

namespace cudf::io::text {

namespace {

/**
@@ -64,68 +64,6 @@ struct bgzip_nvcomp_transform_functor {

class bgzip_data_chunk_reader : public data_chunk_reader {
private:
template <typename IntType>
static IntType read_int(char* data)
{
IntType result{};
// we assume little-endian
std::memcpy(&result, &data[0], sizeof(result));
return result;
}

struct bgzip_header {
int block_size;
int extra_length;
[[nodiscard]] int data_size() const { return block_size - extra_length - 20; }
};

bgzip_header read_header()
{
std::array<char, 12> buffer{};
_data_stream->read(buffer.data(), sizeof(buffer));
std::array<uint8_t, 4> const expected_header{{31, 139, 8, 4}};
CUDF_EXPECTS(
std::equal(
expected_header.begin(), expected_header.end(), reinterpret_cast<uint8_t*>(buffer.data())),
"malformed BGZIP header");
// we ignore the remaining bytes of the fixed header, since they don't matter to us
auto const extra_length = read_int<uint16_t>(&buffer[10]);
uint16_t extra_offset{};
// read all the extra subfields
while (extra_offset < extra_length) {
auto const remaining_size = extra_length - extra_offset;
CUDF_EXPECTS(remaining_size >= 4, "invalid extra field length");
// a subfield consists of 2 identifier bytes and a uint16 length
// 66/67 identifies a BGZIP block size field, we skip all other fields
_data_stream->read(buffer.data(), 4);
extra_offset += 4;
auto const subfield_size = read_int<uint16_t>(&buffer[2]);
if (buffer[0] == 66 && buffer[1] == 67) {
// the block size subfield contains a single uint16 value, which is block_size - 1
CUDF_EXPECTS(subfield_size == sizeof(uint16_t), "malformed BGZIP extra subfield");
_data_stream->read(buffer.data(), sizeof(uint16_t));
_data_stream->seekg(remaining_size - 6, std::ios_base::cur);
auto const block_size_minus_one = read_int<uint16_t>(&buffer[0]);
return {block_size_minus_one + 1, extra_length};
} else {
_data_stream->seekg(subfield_size, std::ios_base::cur);
extra_offset += subfield_size;
}
}
CUDF_FAIL("missing BGZIP size extra subfield");
}

struct bgzip_footer {
uint32_t decompressed_size;
};

bgzip_footer read_footer()
{
std::array<char, 8> buffer{};
_data_stream->read(buffer.data(), sizeof(buffer));
return {read_int<uint32_t>(&buffer[4])};
}

template <typename T>
using pinned_host_vector =
thrust::host_vector<T, thrust::system::cuda::experimental::pinned_allocator<T>>;
@@ -258,13 +196,13 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
return available_decompressed_size - read_pos;
}

void read_block(bgzip_header header, std::istream& stream)
void read_block(detail::bgzip::header header, std::istream& stream)
{
h_compressed_blocks.resize(h_compressed_blocks.size() + header.data_size());
stream.read(h_compressed_blocks.data() + compressed_size(), header.data_size());
}

void add_block_offsets(bgzip_header header, bgzip_footer footer)
void add_block_offsets(detail::bgzip::header header, detail::bgzip::footer footer)
{
max_decompressed_size =
std::max<std::size_t>(footer.decompressed_size, max_decompressed_size);
@@ -294,9 +232,9 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
// peek is necessary if we are already at the end, but didn't try to read another byte
_data_stream->peek();
if (_data_stream->eof() || _compressed_pos > _compressed_end) { break; }
auto header = read_header();
auto header = detail::bgzip::read_header(*_data_stream);
_curr_blocks.read_block(header, *_data_stream);
auto footer = read_footer();
auto footer = detail::bgzip::read_footer(*_data_stream);
_curr_blocks.add_block_offsets(header, footer);
// for the last GZIP block, we restrict ourselves to the bytes up to _local_end
// but only for the reader, not for decompression!
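For context, here is a hedged sketch (not in the diff) of the block-walking pattern that the reader above now delegates to the shared `detail::bgzip` helpers: skip each compressed payload and accumulate the decompressed sizes reported by the footers. `total_decompressed_size` is a hypothetical illustration, not a cudf function.

```cpp
#include <cudf/io/text/detail/bgzip_utils.hpp>

#include <cstddef>
#include <ios>
#include <istream>

std::size_t total_decompressed_size(std::istream& stream)
{
  namespace bgzip = cudf::io::text::detail::bgzip;
  std::size_t total = 0;
  while (true) {
    // peek forces EOF detection before we attempt to read another header
    stream.peek();
    if (stream.eof()) { break; }
    auto const header = bgzip::read_header(stream);        // stream now at first data byte
    stream.seekg(header.data_size(), std::ios_base::cur);  // skip the compressed payload
    auto const footer = bgzip::read_footer(stream);        // stream now past the footer
    total += footer.decompressed_size;
  }
  return total;
}
```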
