From c3cb612907f999f447ff73c7c9941d1f2e462673 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Tue, 23 Mar 2021 11:59:03 +0100 Subject: [PATCH] [FIX] compression streams writing wrong format --- include/seqan3/io/detail/misc_input.hpp | 2 +- include/seqan3/io/detail/misc_output.hpp | 33 +++++---- test/unit/io/detail/CMakeLists.txt | 1 + test/unit/io/detail/misc_output_test.cpp | 68 +++++++++++++++++++ test/unit/io/detail/misc_test.cpp | 1 - .../unit/io/sam_file/sam_file_output_test.cpp | 2 +- .../sequence_file_output_test.cpp | 2 +- .../structure_file_output_test.cpp | 2 +- 8 files changed, 94 insertions(+), 17 deletions(-) create mode 100644 test/unit/io/detail/misc_output_test.cpp diff --git a/include/seqan3/io/detail/misc_input.hpp b/include/seqan3/io/detail/misc_input.hpp index c300eaa18c7..7c161ff5afe 100644 --- a/include/seqan3/io/detail/misc_input.hpp +++ b/include/seqan3/io/detail/misc_input.hpp @@ -21,7 +21,6 @@ #include #include - #ifdef SEQAN3_HAS_BZIP2 #include #endif @@ -31,6 +30,7 @@ #include #endif #include +#include #include namespace seqan3::detail diff --git a/include/seqan3/io/detail/misc_output.hpp b/include/seqan3/io/detail/misc_output.hpp index 568b0c59f62..f712b511fde 100644 --- a/include/seqan3/io/detail/misc_output.hpp +++ b/include/seqan3/io/detail/misc_output.hpp @@ -12,11 +12,11 @@ #pragma once +#include #include #include #include -#include #ifdef SEQAN3_HAS_BZIP2 #include #endif @@ -24,7 +24,8 @@ #include #include #endif -#include +#include +#include namespace seqan3::detail { @@ -46,26 +47,34 @@ inline auto make_secondary_ostream(std::basic_ostream & primary_stream, std::string extension = filename.extension().string(); - if ((extension == ".gz") || (extension == ".bgzf") || (extension == ".bam")) + if (extension == ".gz") { - #ifdef SEQAN3_HAS_ZLIB +#ifdef SEQAN3_HAS_ZLIB + filename.replace_extension(""); + return {new contrib::basic_gz_ostream{primary_stream}, stream_deleter_default}; +#else + throw file_open_error{"Trying to write a gzipped file, but no ZLIB available."}; +#endif + } + else if ((extension == ".bgzf") || (extension == ".bam")) + { +#ifdef SEQAN3_HAS_ZLIB if (extension != ".bam") // remove extension except for bam filename.replace_extension(""); - return {new contrib::basic_bgzf_ostream{primary_stream}, - stream_deleter_default}; - #else - throw file_open_error{"Trying to write a gzipped file, but no ZLIB available."}; - #endif + return {new contrib::basic_bgzf_ostream{primary_stream}, stream_deleter_default}; +#else + throw file_open_error{"Trying to write a bgzf'ed file, but no ZLIB available."}; +#endif } else if (extension == ".bz2") { - #ifdef SEQAN3_HAS_BZIP2 +#ifdef SEQAN3_HAS_BZIP2 filename.replace_extension(""); return {new contrib::basic_bz2_ostream{primary_stream}, stream_deleter_default}; - #else +#else throw file_open_error{"Trying to write a bzipped file, but no libbz2 available."}; - #endif +#endif } else if (extension == ".zst") { diff --git a/test/unit/io/detail/CMakeLists.txt b/test/unit/io/detail/CMakeLists.txt index 32763995c23..a6ce1f3d492 100644 --- a/test/unit/io/detail/CMakeLists.txt +++ b/test/unit/io/detail/CMakeLists.txt @@ -1,6 +1,7 @@ seqan3_test(detail_record_test.cpp) seqan3_test(in_file_iterator_test.cpp) seqan3_test(misc_test.cpp) +seqan3_test(misc_output_test.cpp) seqan3_test(out_file_iterator_test.cpp) seqan3_test(ignore_output_iterator_test.cpp) seqan3_test(record_like_test.cpp) diff --git a/test/unit/io/detail/misc_output_test.cpp b/test/unit/io/detail/misc_output_test.cpp new file mode 100644 index 00000000000..2094ed199fd --- /dev/null +++ b/test/unit/io/detail/misc_output_test.cpp @@ -0,0 +1,68 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +#include + +#include +#include + +#include + +#include +#include +#include + +inline seqan3::test::tmp_filename tmp_compressed_file(std::string const & file_extension) +{ + std::string const tmp_file_name = "io_misc_output_test.txt." + file_extension; + seqan3::test::tmp_filename tmp_file{tmp_file_name.c_str()}; + + // We need a copy of the path because `make_secondary_ostream` will strip the compression extension. + auto file_path = tmp_file.get_path(); + std::ofstream filestream{file_path}; + auto stream_ptr = seqan3::detail::make_secondary_ostream(filestream, file_path); + *stream_ptr << std::string(8, 'a') << '\n'; + + return tmp_file; +} + +inline std::vector read_file_content(std::filesystem::path const & path) +{ + std::ifstream filestream{path}; + using char_t = decltype(filestream)::char_type; + return {std::istreambuf_iterator{filestream}, std::istreambuf_iterator{}}; +} + +#ifdef SEQAN3_HAS_ZLIB +TEST(misc_output, issue2455_gz) +{ + seqan3::test::tmp_filename const compressed_file = tmp_compressed_file("gz"); + std::vector const file_content = read_file_content(compressed_file.get_path()); + + EXPECT_TRUE(seqan3::detail::starts_with(file_content, seqan3::detail::gz_compression::magic_header)); + // gz should not have a valid bgzf header (the gz header is a prefix of the bgzf header) + EXPECT_FALSE(seqan3::detail::bgzf_compression::validate_header(std::span{file_content})); +} + +TEST(misc_output, issue2455_bgzf) +{ + seqan3::test::tmp_filename const compressed_file = tmp_compressed_file("bgzf"); + std::vector const file_content = read_file_content(compressed_file.get_path()); + + EXPECT_TRUE(seqan3::detail::bgzf_compression::validate_header(std::span{file_content})); +} +#endif + +#ifdef SEQAN3_HAS_BZIP2 +TEST(misc_output, issue2455_bz) +{ + seqan3::test::tmp_filename const compressed_file = tmp_compressed_file("bz2"); + std::vector const file_content = read_file_content(compressed_file.get_path()); + + EXPECT_TRUE(seqan3::detail::starts_with(file_content, seqan3::detail::bz2_compression::magic_header)); +} +#endif diff --git a/test/unit/io/detail/misc_test.cpp b/test/unit/io/detail/misc_test.cpp index 50d41cc9178..2ca06958daf 100644 --- a/test/unit/io/detail/misc_test.cpp +++ b/test/unit/io/detail/misc_test.cpp @@ -12,7 +12,6 @@ #include #include -#include #include struct dummy_file diff --git a/test/unit/io/sam_file/sam_file_output_test.cpp b/test/unit/io/sam_file/sam_file_output_test.cpp index 575bae12263..bc31df56290 100644 --- a/test/unit/io/sam_file/sam_file_output_test.cpp +++ b/test/unit/io/sam_file/sam_file_output_test.cpp @@ -650,7 +650,7 @@ TEST(compression, by_filename_gz) std::string buffer = compression_by_filename_impl(filename); buffer[9] = '\x00'; // zero out OS byte. - EXPECT_EQ(buffer, expected_bgzf); + EXPECT_EQ(buffer, expected_gz); } TEST(compression, by_stream_gz) diff --git a/test/unit/io/sequence_file/sequence_file_output_test.cpp b/test/unit/io/sequence_file/sequence_file_output_test.cpp index 0367538f12a..d5cb90fd58c 100644 --- a/test/unit/io/sequence_file/sequence_file_output_test.cpp +++ b/test/unit/io/sequence_file/sequence_file_output_test.cpp @@ -528,7 +528,7 @@ TEST(compression, by_filename_gz) std::string buffer = compression_by_filename_impl(filename); buffer[9] = '\x00'; // zero out OS byte - EXPECT_EQ(buffer, expected_bgzf); + EXPECT_EQ(buffer, expected_gz); } TEST(compression, by_stream_gz) diff --git a/test/unit/io/structure_file/structure_file_output_test.cpp b/test/unit/io/structure_file/structure_file_output_test.cpp index 2a710a20451..40d58b9b1cd 100644 --- a/test/unit/io/structure_file/structure_file_output_test.cpp +++ b/test/unit/io/structure_file/structure_file_output_test.cpp @@ -481,7 +481,7 @@ TEST_F(structure_file_output_compression, by_filename_gz) seqan3::test::tmp_filename filename{"structure_file_output_test.dbn.gz"}; std::string buffer = compression_by_filename_impl(filename); buffer[9] = '\x00'; // zero out OS byte - EXPECT_EQ(buffer, expected_bgzf); + EXPECT_EQ(buffer, expected_gz); } TEST_F(structure_file_output_compression, by_stream_gz)