Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] compression streams writing wrong format #2458

Merged
merged 3 commits into from
Mar 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 9 additions & 11 deletions include/seqan3/contrib/span/span
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,9 @@

#include <array> // for array
#include <cstddef> // for ptrdiff_t
#include <iterator> // for iterators
#include <type_traits> // for remove_cv, etc

#include <seqan3/std/iterator> // for iterator concepts
#include <seqan3/std/ranges> // for range concepts
#include <seqan3/std/type_traits> // for remove_cvref_t, etc

namespace std
{
Expand Down Expand Up @@ -236,13 +234,13 @@ public:
constexpr span& operator=(const span&) noexcept = default;

template <contiguous_iterator iterator_t>
requires is_convertible_v<remove_reference_t<iter_reference_t<iterator_t>>(*)[], element_type(*)[]>
requires is_convertible_v<remove_cvref_t<iter_reference_t<iterator_t>>(*)[], element_type(*)[]>
constexpr span(iterator_t f, index_type count) noexcept : data_{addressof(*f)}, size_{count}
{}

template <contiguous_iterator iterator, typename sentinel_t>
requires sized_sentinel_for<sentinel_t, iterator> &&
is_convertible_v<remove_reference_t<iter_reference_t<iterator>>(*)[], element_type(*)[]> &&
is_convertible_v<remove_cvref_t<iter_reference_t<iterator>>(*)[], element_type(*)[]> &&
(!is_convertible_v<sentinel_t, size_t>)
constexpr span(iterator f, sentinel_t l) noexcept : data_{addressof(*f)}, size_{static_cast<index_type>(distance(f, l))}
{}
Expand All @@ -257,11 +255,11 @@ public:
inline constexpr span(const array<value_type, span_sz>& arr) noexcept : data_{arr.data()}, size_{span_sz} {}

template <typename range_t>
requires (!same_as<remove_reference_t<remove_cv_t<range_t>>, span>) && // guard for recursive instantiation in constructible_from
(!is_span<remove_reference_t<range_t>>::value) &&
(!is_std_array<remove_reference_t<range_t>>::value) &&
(!is_array_v<remove_reference_t<range_t>>) &&
is_convertible_v<remove_reference_t<iter_reference_t<ranges::iterator_t<range_t>>>(*)[],
requires (!same_as<remove_cvref_t<range_t>, span>) && // guard for recursive instantiation in constructible_from
(!is_span<remove_cvref_t<range_t>>::value) &&
(!is_std_array<remove_cvref_t<range_t>>::value) &&
(!is_array_v<remove_cvref_t<range_t>>) &&
is_convertible_v<remove_cvref_t<ranges::range_reference_t<range_t>>(*)[],
element_type(*)[]> &&
ranges::contiguous_range<range_t> &&
ranges::sized_range<range_t> &&
Expand Down Expand Up @@ -430,7 +428,7 @@ template <class span_tp, size_t span_sz>
span(const array<span_tp, span_sz>&) -> span<const span_tp, span_sz>;

template <ranges::contiguous_range range_t>
span(range_t &&) -> span<ranges::range_value_t<range_t>>;
span(range_t &&) -> span<remove_reference_t<ranges::range_reference_t<range_t&>>>;
} // namespace std

//!\endcond
16 changes: 8 additions & 8 deletions include/seqan3/io/detail/magic_header.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,14 +99,14 @@ struct bgzf_compression
static_assert(seqan3::detail::weakly_equality_comparable_with<char_t, char>,
"The given char type of the span must be comparable with char.");

return (header[0] == magic_header[0] && // GZ_ID1
header[1] == magic_header[1] && // GZ_ID2
header[2] == magic_header[2] && // GZ_CM
(header[3] & magic_header[3]) != 0 && // FLG_FEXTRA
to_little_endian(*reinterpret_cast<uint16_t *>(&header[10])) == magic_header[10] && // BGZF_ID1
header[12] == magic_header[12] && // BGZF_ID2
header[13] == magic_header[13] && // BGZF_SLEN
to_little_endian(*reinterpret_cast<uint16_t *>(&header[14])) == magic_header[14]); // BGZF_XLEN
return (header[0] == magic_header[0] && // GZ_ID1
header[1] == magic_header[1] && // GZ_ID2
header[2] == magic_header[2] && // GZ_CM
(header[3] & magic_header[3]) != 0 && // FLG_FEXTRA
to_little_endian(*reinterpret_cast<uint16_t const *>(&header[10])) == magic_header[10] && // BGZF_ID1
header[12] == magic_header[12] && // BGZF_ID2
header[13] == magic_header[13] && // BGZF_SLEN
to_little_endian(*reinterpret_cast<uint16_t const *>(&header[14])) == magic_header[14]); // BGZF_XLEN
}
};

Expand Down
2 changes: 1 addition & 1 deletion include/seqan3/io/detail/misc_input.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
#include <string>
#include <tuple>


#ifdef SEQAN3_HAS_BZIP2
#include <seqan3/contrib/stream/bz2_istream.hpp>
#endif
Expand All @@ -31,6 +30,7 @@
#include <seqan3/contrib/stream/gz_istream.hpp>
#endif
#include <seqan3/io/detail/magic_header.hpp>
#include <seqan3/io/exception.hpp>
#include <seqan3/utility/detail/exposition_only_concept.hpp>

namespace seqan3::detail
Expand Down
33 changes: 21 additions & 12 deletions include/seqan3/io/detail/misc_output.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,20 @@

#pragma once

#include <seqan3/std/filesystem>
#include <iostream>
#include <string>
#include <tuple>

#include <seqan3/utility/detail/exposition_only_concept.hpp>
#ifdef SEQAN3_HAS_BZIP2
#include <seqan3/contrib/stream/bz2_ostream.hpp>
#endif
#ifdef SEQAN3_HAS_ZLIB
#include <seqan3/contrib/stream/bgzf_ostream.hpp>
#include <seqan3/contrib/stream/gz_ostream.hpp>
#endif
#include <seqan3/std/filesystem>
#include <seqan3/io/exception.hpp>
#include <seqan3/utility/detail/exposition_only_concept.hpp>

namespace seqan3::detail
{
Expand All @@ -46,26 +47,34 @@ inline auto make_secondary_ostream(std::basic_ostream<char_t> & primary_stream,

std::string extension = filename.extension().string();

if ((extension == ".gz") || (extension == ".bgzf") || (extension == ".bam"))
if (extension == ".gz")
{
#ifdef SEQAN3_HAS_ZLIB
#ifdef SEQAN3_HAS_ZLIB
filename.replace_extension("");
return {new contrib::basic_gz_ostream<char_t>{primary_stream}, stream_deleter_default};
#else
throw file_open_error{"Trying to write a gzipped file, but no ZLIB available."};
#endif
}
else if ((extension == ".bgzf") || (extension == ".bam"))
{
#ifdef SEQAN3_HAS_ZLIB
if (extension != ".bam") // remove extension except for bam
filename.replace_extension("");

return {new contrib::basic_bgzf_ostream<char_t>{primary_stream},
stream_deleter_default};
#else
throw file_open_error{"Trying to write a gzipped file, but no ZLIB available."};
#endif
return {new contrib::basic_bgzf_ostream<char_t>{primary_stream}, stream_deleter_default};
#else
throw file_open_error{"Trying to write a bgzf'ed file, but no ZLIB available."};
#endif
}
else if (extension == ".bz2")
{
#ifdef SEQAN3_HAS_BZIP2
#ifdef SEQAN3_HAS_BZIP2
filename.replace_extension("");
return {new contrib::basic_bz2_ostream<char_t>{primary_stream}, stream_deleter_default};
#else
#else
throw file_open_error{"Trying to write a bzipped file, but no libbz2 available."};
#endif
#endif
}
else if (extension == ".zst")
{
Expand Down
1 change: 1 addition & 0 deletions test/unit/io/detail/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
seqan3_test(detail_record_test.cpp)
seqan3_test(in_file_iterator_test.cpp)
seqan3_test(misc_test.cpp)
seqan3_test(misc_output_test.cpp)
seqan3_test(out_file_iterator_test.cpp)
seqan3_test(ignore_output_iterator_test.cpp)
seqan3_test(record_like_test.cpp)
Expand Down
68 changes: 68 additions & 0 deletions test/unit/io/detail/misc_output_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// -----------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
// -----------------------------------------------------------------------------------------------------

#include <gtest/gtest.h>

#include <fstream>
#include <vector>

#include <seqan3/core/debug_stream.hpp>

#include <seqan3/io/detail/misc_input.hpp>
#include <seqan3/io/detail/misc_output.hpp>
#include <seqan3/test/tmp_filename.hpp>

inline seqan3::test::tmp_filename tmp_compressed_file(std::string const & file_extension)
{
std::string const tmp_file_name = "io_misc_output_test.txt." + file_extension;
seqan3::test::tmp_filename tmp_file{tmp_file_name.c_str()};

// We need a copy of the path because `make_secondary_ostream` will strip the compression extension.
auto file_path = tmp_file.get_path();
std::ofstream filestream{file_path};
auto stream_ptr = seqan3::detail::make_secondary_ostream(filestream, file_path);
*stream_ptr << std::string(8, 'a') << '\n';

return tmp_file;
}

inline std::vector<char> read_file_content(std::filesystem::path const & path)
{
std::ifstream filestream{path};
using char_t = decltype(filestream)::char_type;
return {std::istreambuf_iterator{filestream}, std::istreambuf_iterator<char_t>{}};
}

#ifdef SEQAN3_HAS_ZLIB
TEST(misc_output, issue2455_gz)
{
seqan3::test::tmp_filename const compressed_file = tmp_compressed_file("gz");
std::vector<char> const file_content = read_file_content(compressed_file.get_path());

EXPECT_TRUE(seqan3::detail::starts_with(file_content, seqan3::detail::gz_compression::magic_header));
// gz should not have a valid bgzf header (the gz header is a prefix of the bgzf header)
EXPECT_FALSE(seqan3::detail::bgzf_compression::validate_header(std::span{file_content}));
}

TEST(misc_output, issue2455_bgzf)
{
seqan3::test::tmp_filename const compressed_file = tmp_compressed_file("bgzf");
std::vector<char> const file_content = read_file_content(compressed_file.get_path());

EXPECT_TRUE(seqan3::detail::bgzf_compression::validate_header(std::span{file_content}));
}
#endif

#ifdef SEQAN3_HAS_BZIP2
TEST(misc_output, issue2455_bz)
{
seqan3::test::tmp_filename const compressed_file = tmp_compressed_file("bz2");
std::vector<char> const file_content = read_file_content(compressed_file.get_path());

EXPECT_TRUE(seqan3::detail::starts_with(file_content, seqan3::detail::bz2_compression::magic_header));
}
#endif
1 change: 0 additions & 1 deletion test/unit/io/detail/misc_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

#include <seqan3/io/detail/magic_header.hpp>
#include <seqan3/io/detail/misc.hpp>
#include <seqan3/std/ranges>
#include <seqan3/test/tmp_filename.hpp>

struct dummy_file
Expand Down
2 changes: 1 addition & 1 deletion test/unit/io/sam_file/sam_file_output_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -650,7 +650,7 @@ TEST(compression, by_filename_gz)

std::string buffer = compression_by_filename_impl(filename);
buffer[9] = '\x00'; // zero out OS byte.
EXPECT_EQ(buffer, expected_bgzf);
EXPECT_EQ(buffer, expected_gz);
}

TEST(compression, by_stream_gz)
Expand Down
2 changes: 1 addition & 1 deletion test/unit/io/sequence_file/sequence_file_output_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,7 @@ TEST(compression, by_filename_gz)

std::string buffer = compression_by_filename_impl(filename);
buffer[9] = '\x00'; // zero out OS byte
EXPECT_EQ(buffer, expected_bgzf);
EXPECT_EQ(buffer, expected_gz);
}

TEST(compression, by_stream_gz)
Expand Down
2 changes: 1 addition & 1 deletion test/unit/io/structure_file/structure_file_output_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ TEST_F(structure_file_output_compression, by_filename_gz)
seqan3::test::tmp_filename filename{"structure_file_output_test.dbn.gz"};
std::string buffer = compression_by_filename_impl(filename);
buffer[9] = '\x00'; // zero out OS byte
EXPECT_EQ(buffer, expected_bgzf);
EXPECT_EQ(buffer, expected_gz);
}

TEST_F(structure_file_output_compression, by_stream_gz)
Expand Down