diff --git a/CHANGELOG.md b/CHANGELOG.md index c9c8e955cf..3d6a66f036 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,6 +43,10 @@ If possible, provide tooling that performs the changes, e.g. a shell-script. ## Notable Bug-fixes +#### I/O +* Empty SAM/BAM files must at least write a header to ensure a valid file + ([\#3081](https://github.com/seqan/seqan3/pull/3081)). + ## API changes #### Dependencies diff --git a/include/seqan3/io/sam_file/detail/format_sam_base.hpp b/include/seqan3/io/sam_file/detail/format_sam_base.hpp index 51390a4938..88362d7468 100644 --- a/include/seqan3/io/sam_file/detail/format_sam_base.hpp +++ b/include/seqan3/io/sam_file/detail/format_sam_base.hpp @@ -98,9 +98,8 @@ class format_sam_base sam_file_header & hdr, ref_seqs_type & /*ref_id_to_pos_map*/); - template - void - write_header(stream_t & stream, sam_file_output_options const & options, sam_file_header & header); + template + void write_header(stream_t & stream, sam_file_output_options const & options, header_type & header); }; /*!\brief Checks for known reference ids or adds a new reference is and assigns a reference id to `ref_id`. @@ -702,115 +701,117 @@ inline void format_sam_base::read_header(stream_view_type && stream_view, * according to the rules of the official * [SAM format specifications](https://samtools.github.io/hts-specs/SAMv1.pdf). */ -template -inline void format_sam_base::write_header(stream_t & stream, - sam_file_output_options const & options, - sam_file_header & header) +template +inline void +format_sam_base::write_header(stream_t & stream, sam_file_output_options const & options, header_type & header) { - // ----------------------------------------------------------------- - // Check Header - // ----------------------------------------------------------------- + if constexpr (!detail::decays_to_ignore_v) + { + // ----------------------------------------------------------------- + // Check Header + // ----------------------------------------------------------------- - // (@HD) Check header line - // The format version string will be taken from the local member variable - if (!header.sorting.empty() - && !(header.sorting == "unknown" || header.sorting == "unsorted" || header.sorting == "queryname" - || header.sorting == "coordinate")) - throw format_error{"SAM format error: The header.sorting member must be " - "one of [unknown, unsorted, queryname, coordinate]."}; + // (@HD) Check header line + // The format version string will be taken from the local member variable + if (!header.sorting.empty() + && !(header.sorting == "unknown" || header.sorting == "unsorted" || header.sorting == "queryname" + || header.sorting == "coordinate")) + throw format_error{"SAM format error: The header.sorting member must be " + "one of [unknown, unsorted, queryname, coordinate]."}; - if (!header.grouping.empty() - && !(header.grouping == "none" || header.grouping == "query" || header.grouping == "reference")) - throw format_error{"SAM format error: The header.grouping member must be " - "one of [none, query, reference]."}; + if (!header.grouping.empty() + && !(header.grouping == "none" || header.grouping == "query" || header.grouping == "reference")) + throw format_error{"SAM format error: The header.grouping member must be " + "one of [none, query, reference]."}; - // (@SQ) Check Reference Sequence Dictionary lines + // (@SQ) Check Reference Sequence Dictionary lines - // TODO + // TODO - // - sorting order be one of ... - // - grouping can be one of ... - // - reference names must be unique - // - ids of read groups must be unique - // - program ids need to be unique - // many more small semantic things, like fits REGEX + // - sorting order be one of ... + // - grouping can be one of ... + // - reference names must be unique + // - ids of read groups must be unique + // - program ids need to be unique + // many more small semantic things, like fits REGEX - // ----------------------------------------------------------------- - // Write Header - // ----------------------------------------------------------------- - std::ostreambuf_iterator stream_it{stream}; + // ----------------------------------------------------------------- + // Write Header + // ----------------------------------------------------------------- + std::ostreambuf_iterator stream_it{stream}; - // (@HD) Write header line [required]. - stream << "@HD\tVN:"; - std::ranges::copy(format_version, stream_it); + // (@HD) Write header line [required]. + stream << "@HD\tVN:"; + std::ranges::copy(format_version, stream_it); - if (!header.sorting.empty()) - stream << "\tSO:" << header.sorting; + if (!header.sorting.empty()) + stream << "\tSO:" << header.sorting; - if (!header.subsorting.empty()) - stream << "\tSS:" << header.subsorting; + if (!header.subsorting.empty()) + stream << "\tSS:" << header.subsorting; - if (!header.grouping.empty()) - stream << "\tGO:" << header.grouping; + if (!header.grouping.empty()) + stream << "\tGO:" << header.grouping; - detail::write_eol(stream_it, options.add_carriage_return); + detail::write_eol(stream_it, options.add_carriage_return); - // (@SQ) Write Reference Sequence Dictionary lines [required]. - for (auto const & [ref_name, ref_info] : views::zip(header.ref_ids(), header.ref_id_info)) - { - stream << "@SQ\tSN:"; + // (@SQ) Write Reference Sequence Dictionary lines [required]. + for (auto const & [ref_name, ref_info] : views::zip(header.ref_ids(), header.ref_id_info)) + { + stream << "@SQ\tSN:"; - std::ranges::copy(ref_name, stream_it); + std::ranges::copy(ref_name, stream_it); - stream << "\tLN:" << get<0>(ref_info); + stream << "\tLN:" << get<0>(ref_info); - if (!get<1>(ref_info).empty()) - stream << "\t" << get<1>(ref_info); + if (!get<1>(ref_info).empty()) + stream << "\t" << get<1>(ref_info); - detail::write_eol(stream_it, options.add_carriage_return); - } + detail::write_eol(stream_it, options.add_carriage_return); + } - // Write read group (@RG) lines if specified. - for (auto const & read_group : header.read_groups) - { - stream << "@RG" - << "\tID:" << get<0>(read_group); + // Write read group (@RG) lines if specified. + for (auto const & read_group : header.read_groups) + { + stream << "@RG" + << "\tID:" << get<0>(read_group); - if (!get<1>(read_group).empty()) - stream << "\t" << get<1>(read_group); + if (!get<1>(read_group).empty()) + stream << "\t" << get<1>(read_group); - detail::write_eol(stream_it, options.add_carriage_return); - } + detail::write_eol(stream_it, options.add_carriage_return); + } - // Write program (@PG) lines if specified. - for (auto const & program : header.program_infos) - { - stream << "@PG" - << "\tID:" << program.id; + // Write program (@PG) lines if specified. + for (auto const & program : header.program_infos) + { + stream << "@PG" + << "\tID:" << program.id; - if (!program.name.empty()) - stream << "\tPN:" << program.name; + if (!program.name.empty()) + stream << "\tPN:" << program.name; - if (!program.command_line_call.empty()) - stream << "\tCL:" << program.command_line_call; + if (!program.command_line_call.empty()) + stream << "\tCL:" << program.command_line_call; - if (!program.previous.empty()) - stream << "\tPP:" << program.previous; + if (!program.previous.empty()) + stream << "\tPP:" << program.previous; - if (!program.description.empty()) - stream << "\tDS:" << program.description; + if (!program.description.empty()) + stream << "\tDS:" << program.description; - if (!program.version.empty()) - stream << "\tVN:" << program.version; + if (!program.version.empty()) + stream << "\tVN:" << program.version; - detail::write_eol(stream_it, options.add_carriage_return); - } + detail::write_eol(stream_it, options.add_carriage_return); + } - // Write comment (@CO) lines if specified. - for (auto const & comment : header.comments) - { - stream << "@CO\t" << comment; - detail::write_eol(stream_it, options.add_carriage_return); + // Write comment (@CO) lines if specified. + for (auto const & comment : header.comments) + { + stream << "@CO\t" << comment; + detail::write_eol(stream_it, options.add_carriage_return); + } } } diff --git a/include/seqan3/io/sam_file/format_bam.hpp b/include/seqan3/io/sam_file/format_bam.hpp index e5719d5266..3f27e43a7e 100644 --- a/include/seqan3/io/sam_file/format_bam.hpp +++ b/include/seqan3/io/sam_file/format_bam.hpp @@ -137,6 +137,10 @@ class format_bam : private detail::format_sam_base [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value), [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score)); + //!\privatesection + template + void write_header(stream_t & stream, sam_file_output_options const & options, header_type & header); + private: //!\brief A variable that tracks whether the content of header has been read or not. bool header_was_read{false}; @@ -726,8 +730,9 @@ inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & st if constexpr (detail::decays_to_ignore_v) { throw format_error{"BAM can only be written with a header but you did not provide enough information! " - "You can either construct the output file with ref_ids and ref_seqs information and " - "the header will be created for you, or you can access the `header` member directly."}; + "You can either construct the output file with reference names and reference length " + "information and the header will be created for you, or you can access the `header` member " + "directly."}; } else { @@ -745,28 +750,7 @@ inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & st // --------------------------------------------------------------------- if (!header_was_written) { - stream << "BAM\1"; - std::ostringstream os; - write_header(os, options, header); // write SAM header to temporary stream to query the size. - int32_t l_text{static_cast(os.str().size())}; - std::ranges::copy_n(reinterpret_cast(&l_text), 4, stream_it); // write read id - - stream << os.str(); - - int32_t n_ref{static_cast(header.ref_ids().size())}; - std::ranges::copy_n(reinterpret_cast(&n_ref), 4, stream_it); // write read id - - for (int32_t ridx = 0; ridx < n_ref; ++ridx) - { - int32_t l_name{static_cast(header.ref_ids()[ridx].size()) + 1}; // plus null character - std::ranges::copy_n(reinterpret_cast(&l_name), 4, stream_it); // write l_name - // write reference name: - std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it); - stream_it = '\0'; - // write reference sequence length: - std::ranges::copy_n(reinterpret_cast(&get<0>(header.ref_id_info[ridx])), 4, stream_it); - } - + write_header(stream, options, header); header_was_written = true; } @@ -961,6 +945,58 @@ inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & st } // if constexpr (!detail::decays_to_ignore_v) } +//!\copydoc seqan3::detail::format_sam_base::write_header +template +inline void format_bam::write_header(stream_t & stream, sam_file_output_options const & options, header_type & header) +{ + if constexpr (detail::decays_to_ignore_v) + { + throw format_error{"BAM can only be written with a header but you did not provide enough information! " + "You can either construct the output file with reference names and reference length " + "information and the header will be created for you, or you can access the `header` member " + "directly."}; + } + else + { + detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()}; + + std::ranges::copy_n("BAM\1", 4, stream_it); // Do not copy the null terminator + + // write SAM header to temporary stream first to query its size. + std::ostringstream os; + detail::format_sam_base::write_header(os, options, header); +#if SEQAN3_COMPILER_IS_GCC && (__GNUC__ == 10) + int32_t const l_text{static_cast(os.str().size())}; +#else + int32_t const l_text{static_cast(os.view().size())}; +#endif + std::ranges::copy_n(reinterpret_cast(&l_text), 4, stream_it); // write text length + +#if SEQAN3_COMPILER_IS_GCC && (__GNUC__ == 10) + auto header_view = os.str(); +#else + auto header_view = os.view(); +#endif + std::ranges::copy(header_view, stream_it); + + assert(header.ref_ids().size() < (1ull << 32)); + int32_t const n_ref{static_cast(header.ref_ids().size())}; + std::ranges::copy_n(reinterpret_cast(&n_ref), 4, stream_it); // write number of references + + for (int32_t ridx = 0; ridx < n_ref; ++ridx) + { + assert(header.ref_ids()[ridx].size() + 1 < (1ull << 32)); + int32_t const l_name{static_cast(header.ref_ids()[ridx].size()) + 1}; // plus null character + std::ranges::copy_n(reinterpret_cast(&l_name), 4, stream_it); // write l_name + // write reference name: + std::ranges::copy(header.ref_ids()[ridx], stream_it); + stream_it = '\0'; // ++ is not necessary for ostream_iterator + // write reference sequence length: + std::ranges::copy_n(reinterpret_cast(&get<0>(header.ref_id_info[ridx])), 4, stream_it); + } + } +} + //!\copydoc seqan3::format_sam::read_sam_dict_vector template inline void format_bam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant, diff --git a/include/seqan3/io/sam_file/format_sam.hpp b/include/seqan3/io/sam_file/format_sam.hpp index 07f4958ebf..fb44980a30 100644 --- a/include/seqan3/io/sam_file/format_sam.hpp +++ b/include/seqan3/io/sam_file/format_sam.hpp @@ -113,7 +113,7 @@ namespace seqan3 * * \remark For a complete overview, take a look at \ref io_sam_file */ -class format_sam : private detail::format_sam_base +class format_sam : protected detail::format_sam_base { public: /*!\name Constructors, destructor and assignment diff --git a/include/seqan3/io/sam_file/output.hpp b/include/seqan3/io/sam_file/output.hpp index 3fee46743d..95e53a6c9e 100644 --- a/include/seqan3/io/sam_file/output.hpp +++ b/include/seqan3/io/sam_file/output.hpp @@ -103,7 +103,8 @@ class sam_file_output field::header_ptr>; static_assert( - []() constexpr { + []() constexpr + { for (field f : selected_field_ids::as_array) if (!field_ids::contains(f)) return false; @@ -149,8 +150,24 @@ class sam_file_output sam_file_output(sam_file_output &&) = default; //!\brief Move assignment is defaulted. sam_file_output & operator=(sam_file_output &&) = default; - //!\brief Destructor is defaulted. - ~sam_file_output() = default; + //!\brief The destructor will write the header if it has not been written before. + ~sam_file_output() + { + if (header_has_been_written) + return; + + assert(!format.valueless_by_exception()); + + std::visit( + [&](auto & f) + { + if constexpr (std::same_as) + f.write_header(*secondary_stream, options, std::ignore); + else + f.write_header(*secondary_stream, options, *header_ptr); + }, + format); + } /*!\brief Construct from filename. * \param[in] filename Path to the file you wish to open. @@ -592,6 +609,9 @@ class sam_file_output protected: //!\privatesection + //!\brief This is needed during deconstruction to know whether a header still needs to be written. + bool header_has_been_written{false}; + //!\brief A larger (compared to stl default) stream buffer to use when reading from a file. std::vector stream_buffer{std::vector(1'000'000)}; @@ -690,6 +710,8 @@ class sam_file_output } }, format); + + header_has_been_written = true; // when writing a record, the header is written automatically } //!\brief Befriend iterator so it can access the buffers. diff --git a/include/seqan3/io/sam_file/output_format_concept.hpp b/include/seqan3/io/sam_file/output_format_concept.hpp index e8fa5db246..e3c4b21c4d 100644 --- a/include/seqan3/io/sam_file/output_format_concept.hpp +++ b/include/seqan3/io/sam_file/output_format_concept.hpp @@ -38,8 +38,8 @@ namespace seqan3::detail * * \details * - * Exposes the protected member function `write_alignment_record` from the given `format_type`, such that the file can - * call the proper function for the selected format. + * Exposes the protected member function `write_alignment_record` and `write_header` from the given `format_type`, such + * that the file can call the proper function for the selected format. * * \remark For a complete overview, take a look at \ref io_sam_file */ @@ -55,6 +55,13 @@ struct sam_file_output_format_exposer : public format_type { format_type::write_alignment_record(std::forward(args)...); } + + //!\brief Forwards to `format_type::write_header`. + template + void write_header(stream_t & stream, sam_file_output_options const & options, header_type & header) + { + format_type::write_header(stream, options, header); + } }; } // namespace seqan3::detail diff --git a/test/unit/io/sam_file/format_bam_test.cpp b/test/unit/io/sam_file/format_bam_test.cpp index c68f2ab932..378887bae5 100644 --- a/test/unit/io/sam_file/format_bam_test.cpp +++ b/test/unit/io/sam_file/format_bam_test.cpp @@ -27,11 +27,19 @@ struct sam_file_read : public sam_file_data // pass --no-PG to samtools if you do not want PG tags which may be added automatically // To print out something like the below strings, store a file in SAM format (e.g. test.sam) + // transform it to BAM via + // samtools view --no-PG -b test.sam > test.bam // and use the folliwng commandline call: // samtools view --no-PG -u test.bam | bgzip -d | hexdump -v -e '"\\\x" 1/1 "%02x"' | sed 's;\\;Y, Y\\;g' | sed 's;$;Y;' | tr 'Y' "'" | cut -c 4- | sed -e "s/.\{112\}/&\n/g" using stream_type = std::istringstream; + std::string minimal_header{'\x42', '\x41', '\x4d', '\x01', '\x1c', '\x00', '\x00', '\x00', '\x40', '\x48', '\x44', + '\x09', '\x56', '\x4e', '\x3a', '\x31', '\x2e', '\x36', '\x0a', '\x40', '\x53', '\x51', + '\x09', '\x53', '\x4e', '\x3a', '\x72', '\x65', '\x66', '\x09', '\x4c', '\x4e', '\x3a', + '\x33', '\x34', '\x0a', '\x01', '\x00', '\x00', '\x00', '\x04', '\x00', '\x00', '\x00', + '\x72', '\x65', '\x66', '\x00', '\x22', '\x00', '\x00', '\x00'}; + std::string big_header_input{ '\x42', '\x41', '\x4D', '\x01', '\xB7', '\x01', '\x00', '\x00', '\x40', '\x48', '\x44', '\x09', '\x56', '\x4E', '\x3A', '\x31', '\x2E', '\x36', '\x09', '\x53', '\x4F', '\x3A', '\x63', '\x6F', '\x6F', '\x72', '\x64', '\x69', diff --git a/test/unit/io/sam_file/format_sam_test.cpp b/test/unit/io/sam_file/format_sam_test.cpp index c442296a37..36d43f4e8e 100644 --- a/test/unit/io/sam_file/format_sam_test.cpp +++ b/test/unit/io/sam_file/format_sam_test.cpp @@ -20,6 +20,11 @@ struct sam_file_read : public sam_file_data using stream_type = std::istringstream; + std::string minimal_header{ + R"(@HD VN:1.6 +@SQ SN:ref LN:34 +)"}; + std::string big_header_input{ R"(@HD VN:1.6 SO:coordinate SS:coordinate:queryname GO:none @PG ID:qc PN:quality_control CL:qc -f file1 DS:trim reads with low qual VN:1.0.0 diff --git a/test/unit/io/sam_file/sam_file_format_test_template.hpp b/test/unit/io/sam_file/sam_file_format_test_template.hpp index d442028d01..80a3e6f3b3 100644 --- a/test/unit/io/sam_file/sam_file_format_test_template.hpp +++ b/test/unit/io/sam_file/sam_file_format_test_template.hpp @@ -441,6 +441,17 @@ TYPED_TEST_P(sam_file_write, output_concept) EXPECT_TRUE((seqan3::sam_file_output_format)); } +TYPED_TEST_P(sam_file_write, no_records) +{ + { + auto ref_lengths = this->ref_sequences | std::views::transform([] (auto const & v) { return v.size(); }); + seqan3::sam_file_output fout{this->ostream, this->ref_ids, ref_lengths, TypeParam{}, sam_fields{}}; + } + + this->ostream.flush(); + EXPECT_EQ(this->ostream.str(), this->minimal_header); +} + TYPED_TEST_P(sam_file_write, write_empty_members) { { @@ -708,7 +719,8 @@ TYPED_TEST_P(sam_file_write, special_cases) TYPED_TEST_P(sam_file_write, format_errors) { - seqan3::sam_file_output fout{this->ostream, TypeParam{}, sam_fields{}}; + auto ref_lengths = this->ref_sequences | std::views::transform([] (auto const & v) { return v.size(); }); + seqan3::sam_file_output fout{this->ostream, this->ref_ids, ref_lengths, TypeParam{}, sam_fields{}}; // ensure that only a ref_id that is listed in the header is allowed EXPECT_THROW(fout.emplace_back(&(this->header), @@ -757,6 +769,7 @@ REGISTER_TYPED_TEST_SUITE_P(sam_file_read, issue2423); REGISTER_TYPED_TEST_SUITE_P(sam_file_write, + no_records, write_empty_members, output_concept, default_options_all_members_specified,