Skip to content

Commit

Permalink
[FEATURE] Adds compression formats to magic_header.
Browse files Browse the repository at this point in the history
  • Loading branch information
rrahn committed Aug 14, 2019
1 parent 56460f4 commit 78932ba
Show file tree
Hide file tree
Showing 4 changed files with 113 additions and 70 deletions.
22 changes: 11 additions & 11 deletions include/seqan3/contrib/stream/bgzf_stream_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ template <>
struct CompressionContext<detail::bgzf_compression>:
CompressionContext<detail::gz_compression>
{
static constexpr size_t BLOCK_HEADER_LENGTH = detail::magic_header<detail::bgzf_compression>.size();
static constexpr size_t BLOCK_HEADER_LENGTH = detail::bgzf_compression::magic_header.size();
unsigned char headerPos;
};

Expand Down Expand Up @@ -192,7 +192,7 @@ _compressBlock(TDestValue *dstBegin, TDestCapacity dstCapacity,
assert(sizeof(unsigned) == 4u);

// 1. COPY HEADER
std::ranges::copy(detail::magic_header<detail::bgzf_compression>, dstBegin);
std::ranges::copy(detail::bgzf_compression::magic_header, dstBegin);

// 2. COMPRESS
compressInit(ctx);
Expand Down Expand Up @@ -234,15 +234,15 @@ _compressBlock(TDestValue *dstBegin, TDestCapacity dstCapacity,
inline bool
_bgzfCheckHeader(char const * header)
{
const char FLG_FEXTRA = detail::magic_header<detail::bgzf_compression>[3];
const char BGZF_ID1 = detail::magic_header<detail::bgzf_compression>[12];
const char BGZF_ID2 = detail::magic_header<detail::bgzf_compression>[13];
const char BGZF_SLEN = detail::magic_header<detail::bgzf_compression>[14];
const char BGZF_XLEN = detail::magic_header<detail::bgzf_compression>[10];

return (header[0] == static_cast<char>(detail::magic_header<detail::gz_compression>[0]) &&
header[1] == static_cast<char>(detail::magic_header<detail::gz_compression>[1]) &&
header[2] == static_cast<char>(detail::magic_header<detail::gz_compression>[2]) &&
const char FLG_FEXTRA = detail::bgzf_compression::magic_header[3];
const char BGZF_ID1 = detail::bgzf_compression::magic_header[12];
const char BGZF_ID2 = detail::bgzf_compression::magic_header[13];
const char BGZF_SLEN = detail::bgzf_compression::magic_header[14];
const char BGZF_XLEN = detail::bgzf_compression::magic_header[10];

return (header[0] == static_cast<char>(detail::gz_compression::magic_header[0]) &&
header[1] == static_cast<char>(detail::gz_compression::magic_header[1]) &&
header[2] == static_cast<char>(detail::gz_compression::magic_header[2]) &&
(header[3] & FLG_FEXTRA) != 0 &&
_bgzfUnpack16(header + 10) == BGZF_XLEN &&
header[12] == BGZF_ID1 &&
Expand Down
115 changes: 64 additions & 51 deletions include/seqan3/io/detail/magic_header.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,83 +13,96 @@
#pragma once

#include <array>
#include <tuple>

#include <seqan3/core/type_traits/basic.hpp>
#include <seqan3/core/type_traits/template_inspection.hpp>
#include <seqan3/core/platform.hpp>
#include <seqan3/core/type_list/type_list.hpp>
#include <seqan3/core/type_list/traits.hpp>
#include <seqan3/std/type_traits>

namespace seqan3::detail
{

//!\brief Defines a magic byte sequence to disambiguate different compression formats. Default is empty.
//!\ingroup io
template <typename header_tag_t>
inline constexpr std::array<char, 0> magic_header{};

//!\brief A tag signifying a gz compressed file.
//!\ingroup io
struct gz_compression
{};
{
//!\brief The valid file extensions for gz compression.
static inline const std::vector<std::string> file_extensions
{
{"gz"}
};

/*!\brief The magic byte sequence to disambiguate gz compressed files.
* \ingroup io
*
* \details
*
* Specialises seqan3::detail::magic_header for seqan3::detail::gz_compression.
*/
template <>
inline constexpr std::array<char, 3> magic_header<gz_compression>{'\x1f', '\x8b', '\x08'};
//!\brief The magic byte sequence to disambiguate gz compressed files.
static inline constexpr std::array<char, 3> magic_header{'\x1f', '\x8b', '\x08'};
};

//!\brief A tag signifying a bz2 compressed file.
//!\ingroup io
struct bz2_compression
{};
{
//!\brief The valid file extensions for bz2 compression.
static inline const std::vector<std::string> file_extensions
{
{"bz2"}
};

/*!\brief The magic byte sequence to disambiguate bz2 compressed files.
* \ingroup io
*
* \details
*
* Specialises seqan3::detail::magic_header for seqan3::detail::bz2_compression.
*/
template <>
inline constexpr std::array<char, 3> magic_header<bz2_compression>{'\x42', '\x5a', '\x68'};
//!\brief The magic byte sequence to disambiguate bz2 compressed files.
static inline constexpr std::array<char, 3> magic_header{'\x42', '\x5a', '\x68'};
};

//!\brief A tag signifying a zstd compressed file.
//!\ingroup io
struct zstd_compression
{};
{
//!\brief The valid file extensions for zstd compression.
static inline const std::vector<std::string> file_extensions
{
{"zst"}
};

/*!\brief The magic byte sequence to disambiguate zstd compressed files.
* \ingroup io
*
* \details
*
* Specialises seqan3::detail::magic_header for seqan3::detail::zstd_compression.
*/
template <>
inline constexpr std::array<char, 4> magic_header<zstd_compression>{'\x28', '\xb5', '\x2f', '\xfd'};
//!\brief The magic byte sequence to disambiguate zstd compressed files.
static inline constexpr std::array<char, 4> magic_header{'\x28', '\xb5', '\x2f', '\xfd'};
};

//!\brief A tag signifying a bgzf compressed file.
//!\ingroup io
struct bgzf_compression
{};
{
//!\brief The valid file extensions for bgzf compression.
static inline const std::vector<std::string> file_extensions
{
{"bgzf"}
};

/*!\brief The magic byte sequence to disambiguate bgzf compressed files.
//!\brief The magic byte sequence to disambiguate bgzf compressed files.
static inline constexpr std::array<char, 18> magic_header
{
// ID1 ID2 CM
gz_compression::magic_header[0], gz_compression::magic_header[1], gz_compression::magic_header[2],
// FLG [MTIME ] XFL OS [XLEN ]
'\x04', '\x00', '\x00', '\x00', '\x00', '\x00', '\xff', '\x06', '\x00',
// B C [SLEN ] [BSIZE ]
'\x42', '\x43', '\x02', '\x00', '\x00', '\x00'
};
};

/*!\brief A seqan3::type_list containing the available compression formats.
* \ingroup io
*
* \details
*
* Specialises seqan3::detail::magic_header for seqan3::detail::bgzf_compression.
*/
template <>
inline constexpr std::array<char, 18> magic_header<bgzf_compression>
{
// ID1 ID2 CM
magic_header<gz_compression>[0], magic_header<gz_compression>[1], magic_header<gz_compression>[2],
// FLG [MTIME ] XFL OS [XLEN ]
'\x04', '\x00', '\x00', '\x00', '\x00', '\x00', '\xff', '\x06', '\x00',
// B C [SLEN ] [BSIZE ]
'\x42', '\x43', '\x02', '\x00', '\x00', '\x00'
};
using compression_formats = pack_traits::drop_front<void
// NOTE(review): `void` looks like a leading placeholder so that every conditionally compiled entry below can
// start with a comma; presumably pack_traits::drop_front strips it again, leaving only the enabled formats.
#if SEQAN3_HAS_ZLIB
,gz_compression
,bgzf_compression
#endif // SEQAN3_HAS_ZLIB
#if SEQAN3_HAS_BZIP2
,bz2_compression
#endif // SEQAN3_HAS_BZIP2
#if SEQAN3_HAS_ZSTD
,zstd_compression
#endif // SEQAN3_HAS_ZSTD
>;

} // namespace seqan3::detail
25 changes: 17 additions & 8 deletions include/seqan3/io/detail/misc_input.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ inline auto make_secondary_istream(std::basic_istream<char_t> & primary_stream,

// extract "magic header"
std::istreambuf_iterator<char_t> it{primary_stream};
std::array<char, magic_header<bgzf_compression>.size()> magic_number{}; // Largest magic header from bgzf
std::array<char, bgzf_compression::magic_header.size()> magic_number{}; // Largest magic header from bgzf
size_t read_chars = 0;
for (; read_chars < magic_number.size(); ++read_chars)
{
Expand All @@ -99,13 +99,22 @@ inline auto make_secondary_istream(std::basic_istream<char_t> & primary_stream,
for (size_t i = 0 ; i < read_chars; ++i)
primary_stream.unget();

std::string const extension = filename.extension().string();
std::string extension{};
if (filename.has_extension())
extension = filename.extension().string().substr(1);

// Tests whether the given extension is one of the file extensions listed for the given compression tag.
auto contains_extension = [] (auto compression_tag, auto const & extension)
{
return std::ranges::find(decltype(compression_tag)::file_extensions, extension) !=
std::ranges::end(decltype(compression_tag)::file_extensions);
};

// set return value appropriately
if (read_chars == magic_number.size() && contrib::_bgzfCheckHeader(magic_number.data())) // BGZF
{
#ifdef SEQAN3_HAS_ZLIB
if ((extension == ".gz") || (extension == ".bgzf"))
if (contains_extension(gz_compression{}, extension) || contains_extension(bgzf_compression{}, extension))
filename.replace_extension();

return {new contrib::basic_bgzf_istream<char_t>{primary_stream},
Expand All @@ -114,29 +123,29 @@ inline auto make_secondary_istream(std::basic_istream<char_t> & primary_stream,
throw file_open_error{"Trying to read from a bgzf file, but no ZLIB available."};
#endif
}
else if (starts_with(magic_number, magic_header<gz_compression>)) // GZIP
else if (starts_with(magic_number, gz_compression::magic_header)) // GZIP
{
#ifdef SEQAN3_HAS_ZLIB
if ((extension == ".gz") || (extension == ".bgzf"))
if (contains_extension(gz_compression{}, extension) || contains_extension(bgzf_compression{}, extension))
filename.replace_extension();

return {new contrib::basic_gz_istream<char_t>{primary_stream}, stream_deleter_default};
#else
throw file_open_error{"Trying to read from a gzipped file, but no ZLIB available."};
#endif
}
else if (starts_with(magic_number, magic_header<bz2_compression>)) // BZip2
else if (starts_with(magic_number, bz2_compression::magic_header)) // BZip2
{
#ifdef SEQAN3_HAS_BZIP2
if (extension == ".bz2")
if (contains_extension(bz2_compression{}, extension))
filename.replace_extension();

return {new contrib::basic_bz2_istream<char_t>{primary_stream}, stream_deleter_default};
#else
throw file_open_error{"Trying to read from a bzipped file, but no libbz2 available."};
#endif
}
else if (starts_with(magic_number, magic_header<zstd_compression>)) // ZStd
else if (starts_with(magic_number, zstd_compression::magic_header)) // ZStd
{
throw file_open_error{"Trying to read from a zst'ed file, but SeqAn does not yet support this."};
}
Expand Down
21 changes: 21 additions & 0 deletions test/unit/io/detail/misc_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <string>
#include <vector>

#include <seqan3/io/detail/magic_header.hpp>
#include <seqan3/io/detail/misc.hpp>
#include <seqan3/std/ranges>
#include <seqan3/test/tmp_filename.hpp>
Expand Down Expand Up @@ -51,3 +52,23 @@ TEST(misc, valid_file_extensions)
for (std::string & ext : dummy_file::format2::file_extensions)
EXPECT_NE(cmp_lambda(ext), all_extensions.end());
}

// Verifies that the extensions reported for detail::compression_formats contain the extension of every
// compression format that is enabled in this build.
TEST(misc, valid_compression_extensions)
{
    std::vector<std::string> const valid_compression = detail::valid_file_extensions<detail::compression_formats>();

    // Returns true if `extension` is among the reported compression extensions.
    auto is_listed = [&valid_compression] (std::string const & extension)
    {
        return std::find(valid_compression.begin(), valid_compression.end(), extension) != valid_compression.end();
    };

    // The guards below use the same `#if SEQAN3_HAS_*` form that decides whether a format is added to
    // detail::compression_formats, so a macro that is defined as 0 does not make this test expect a format
    // that is absent from the list.
#if SEQAN3_HAS_ZLIB
    // expect gz and bgzf
    EXPECT_TRUE(is_listed("gz"));
    EXPECT_TRUE(is_listed("bgzf"));
#endif // SEQAN3_HAS_ZLIB

#if SEQAN3_HAS_BZIP2
    EXPECT_TRUE(is_listed("bz2"));
#endif // SEQAN3_HAS_BZIP2

#if SEQAN3_HAS_ZSTD
    EXPECT_TRUE(is_listed("zst"));
#endif // SEQAN3_HAS_ZSTD
}

0 comments on commit 78932ba

Please sign in to comment.