-
Notifications
You must be signed in to change notification settings - Fork 915
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add BGZIP multibyte_split benchmark (#11723)
This refactors #11652 to extract the BGZIP IO and adds another `source_type` to the `multibyte_split` benchmark, creating a compressed file using `zlib`. A quick benchmark shows performance results around 2.5x slower than reading from a device buffer at around 1:5 compression ratio ### [0] Tesla T4 | source_type | delim_size | delim_percent | size_approx | byte_range_percent | Time | Peak Memory Usage | Encoded file size | |-------------|------------|---------------|-------------------|--------------------|------------|-------------------|-------------------| | bgzip | 1 | 1 | 2^30 = 1073741824 | 100 | 507.479 ms | 4.022 GiB | 1006.638 MiB | | file | 1 | 1 | 2^30 = 1073741824 | 100 | 339.860 ms | 3.947 GiB | 1006.638 MiB | | device | 1 | 1 | 2^30 = 1073741824 | 100 | 201.556 ms | 3.947 GiB | 1006.638 MiB | Authors: - Tobias Ribizel (https://github.com/upsj) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) - Jordan Jacobelli (https://github.com/Ethyling) URL: #11723
- Loading branch information
Showing
9 changed files
with
473 additions
and
185 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
/* | ||
* Copyright (c) 2022, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <zlib.h> | ||
|
||
#include <cudf/utilities/error.hpp> | ||
#include <cudf/utilities/span.hpp> | ||
|
||
#include <algorithm> | ||
#include <array> | ||
#include <fstream> | ||
#include <limits> | ||
|
||
namespace cudf::io::text::detail::bgzip { | ||
|
||
/// @brief Size information of a BGZIP block, as returned by read_header(). | ||
struct header { | ||
/// Size value of the block (documented by read_header() as the compressed size). | ||
int block_size; | ||
/// Length of the extra subfields. | ||
int extra_length; | ||
/// Computes `block_size - extra_length - 20`. | ||
/// NOTE(review): the constant 20 presumably covers the fixed-size header and footer bytes of a | ||
/// block - confirm against the read_header()/read_footer() definitions. | ||
[[nodiscard]] int data_size() const { return block_size - extra_length - 20; } | ||
}; | ||
|
||
/// @brief Contents of a BGZIP block footer, as returned by read_footer(). | ||
struct footer { | ||
/// CRC32 value stored in the footer. | ||
uint32_t crc; | ||
/// Uncompressed size stored in the footer. | ||
uint32_t decompressed_size; | ||
}; | ||
|
||
/** | ||
* @brief Reads the full BGZIP header from the given input stream. Afterwards, the stream position | ||
* is at the first data byte. | ||
* | ||
* @param input_stream The input stream | ||
* @return The header storing the compressed size and extra subfield length, returned as a | ||
* `header` (members `block_size` and `extra_length`) | ||
*/ | ||
header read_header(std::istream& input_stream); | ||
|
||
/** | ||
* @brief Reads the full BGZIP footer from the given input stream. Afterwards, the stream position | ||
* is after the last footer byte. | ||
* | ||
* @param input_stream The input stream | ||
* @return The footer storing uncompressed size and CRC32, returned as a `footer` (members | ||
* `decompressed_size` and `crc`) | ||
*/ | ||
footer read_footer(std::istream& input_stream); | ||
|
||
/** | ||
* @brief Writes a header for data of the given compressed size to the given stream. | ||
* | ||
* @param output_stream The output stream | ||
* @param compressed_size The size of the compressed data. The parameter is a `uint16_t`, so | ||
* sizes above 65535 cannot be represented. | ||
* @param pre_size_subfields Any GZIP extra subfields (need to be valid) to be placed before the | ||
* BGZIP block size subfield | ||
* @param post_size_subfields Any subfields to be placed after the BGZIP block size subfield | ||
*/ | ||
void write_header(std::ostream& output_stream, | ||
uint16_t compressed_size, | ||
host_span<char const> pre_size_subfields, | ||
host_span<char const> post_size_subfields); | ||
|
||
/** | ||
* @brief Writes a footer for the given uncompressed data to the given stream. | ||
* | ||
* @param output_stream The output stream | ||
* @param data The uncompressed data for which the uncompressed size and CRC32 will be computed | ||
* and written | ||
*/ | ||
void write_footer(std::ostream& output_stream, host_span<char const> data); | ||
|
||
/** | ||
* @brief Writes the given data to the given stream as an uncompressed deflate block with BGZIP | ||
* header and footer. | ||
* | ||
* @param output_stream The output stream | ||
* @param data The uncompressed data | ||
* @param pre_size_subfields Any GZIP extra subfields (need to be valid) to be placed before the | ||
* BGZIP block size subfield (defaults to `{}`) | ||
* @param post_size_subfields Any subfields to be placed after the BGZIP block size subfield | ||
* (defaults to `{}`) | ||
*/ | ||
void write_uncompressed_block(std::ostream& output_stream, | ||
host_span<char const> data, | ||
host_span<char const> pre_size_subfields = {}, | ||
host_span<char const> post_size_subfields = {}); | ||
|
||
/** | ||
* @brief Writes the given data to the given stream as a compressed deflate block with BGZIP | ||
* header and footer. | ||
* | ||
* @param output_stream The output stream | ||
* @param data The uncompressed data | ||
* @param pre_size_subfields Any GZIP extra subfields (need to be valid) to be placed before the | ||
* BGZIP block size subfield (defaults to `{}`) | ||
* @param post_size_subfields Any subfields to be placed after the BGZIP block size subfield | ||
* (defaults to `{}`) | ||
*/ | ||
void write_compressed_block(std::ostream& output_stream, | ||
host_span<char const> data, | ||
host_span<char const> pre_size_subfields = {}, | ||
host_span<char const> post_size_subfields = {}); | ||
|
||
} // namespace cudf::io::text::detail::bgzip |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.