diff --git a/include/seqan3/io/alignment_file/detail.hpp b/include/seqan3/io/alignment_file/detail.hpp index 007d1568dfe..fa657357936 100644 --- a/include/seqan3/io/alignment_file/detail.hpp +++ b/include/seqan3/io/alignment_file/detail.hpp @@ -204,6 +204,24 @@ std::vector get_cigar_vector(alignment_type && alignment, return result; } +/*!\brief Creates a CIGAR string (SAM format) given an std::vector. + * \ingroup alignment_file + * \param cigar_vector The std::vector of seqan3::cigar elements to be transformed into a std::string. + * \returns The CIGAR string (std::string). + * + * \details + * + * The transformation is done by printing the vector with the seqan3::debug_stream. + */ +std::string get_cigar_string(std::vector const & cigar_vector) +{ + std::stringstream ss; + debug_stream_type ds{ss}; + ds << cigar_vector; + ss.flush(); + return ss.str(); +} + /*!\brief Creates a CIGAR string (SAM format) given an alignment represented by two aligned sequences. * \ingroup alignment_file * @@ -252,11 +270,7 @@ std::string get_cigar_string(alignment_type && alignment, uint32_t const query_end_pos = 0, bool const extended_cigar = false) { - std::stringstream ss; - debug_stream_type ds{ss}; - ds << get_cigar_vector(alignment, query_start_pos, query_end_pos, extended_cigar); - ss.flush(); - return ss.str(); + return get_cigar_string(get_cigar_vector(alignment, query_start_pos, query_end_pos, extended_cigar)); } /*!\brief Transforms an alignment represented by two aligned sequences into the diff --git a/include/seqan3/io/alignment_file/format_bam.hpp b/include/seqan3/io/alignment_file/format_bam.hpp index a3a0553ff1a..d5aa3dbb9f0 100644 --- a/include/seqan3/io/alignment_file/format_bam.hpp +++ b/include/seqan3/io/alignment_file/format_bam.hpp @@ -715,6 +715,7 @@ class alignment_file_output_format : alignment_file_output_format @@ -729,6 +730,7 @@ class alignment_file_output_format : alignment_file_output_format ref_offset, [[maybe_unused]] align_type && align, + [[maybe_unused]] cigar_type && cigar_vector, [[maybe_unused]] uint16_t flag, [[maybe_unused]] uint8_t mapq, [[maybe_unused]] mate_type && mate, @@ -848,24 +850,37 @@ class alignment_file_output_format : alignment_file_output_format(align)) && !std::ranges::empty(get<1>(align))) + { + ref_length = std::ranges::distance(get<1>(align)); - for (auto chr : get<1>(align)) - if (chr == gap{}) - ++off_end; + // compute possible distance from alignment end to sequence end + // which indicates soft clipping at the end. + // This should be replaced by a free count_gaps function for + // aligned sequences which is more efficient if possible. + auto off_end{std::ranges::distance(seq) - offset}; - off_end -= std::ranges::distance(get<1>(align)); + for (auto chr : get<1>(align)) + if (chr == gap{}) + ++off_end; - std::vector cigar_vector = detail::get_cigar_vector(align, offset, off_end); + off_end -= ref_length; + cigar_vector = detail::get_cigar_vector(align, offset, off_end); + } + else + { + int32_t d{}; // dummy seq length + for (auto & [co, op] : cigar_vector) + alignment_file_input_format::update_alignment_lengths(ref_length, d, op.to_char(), co); + } if (cigar_vector.size() > 65535) // cannot be represented with 16bits, must be written into the sam tag CG { - tag_dict["CG"_tag] = detail::get_cigar_string(align, offset, off_end); + tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector); cigar_vector.resize(2); cigar_vector[0] = cigar{static_cast(std::ranges::distance(seq)), 'S'_cigar_op}; cigar_vector[1] = cigar{static_cast(std::ranges::distance(get<1>(align))), 'N'_cigar_op}; @@ -880,7 +895,7 @@ class alignment_file_output_format : alignment_file_output_format(std::min(std::ranges::distance(id) + 1, 255), 2), /* mapq */ mapq, - /* bin */ reg2bin(ref_offset.value_or(-1), std::ranges::distance(get<1>(align))), + /* bin */ reg2bin(ref_offset.value_or(-1), ref_length), /* n_cigar_op */ static_cast(cigar_vector.size()), /* flag */ flag, /* l_seq */ static_cast(std::ranges::distance(seq)), diff --git a/include/seqan3/io/alignment_file/format_sam.hpp b/include/seqan3/io/alignment_file/format_sam.hpp index 2a5d9376e08..0beeb2401ca 100644 --- a/include/seqan3/io/alignment_file/format_sam.hpp +++ b/include/seqan3/io/alignment_file/format_sam.hpp @@ -489,7 +489,7 @@ class alignment_file_input_format * \param[in] op The cigar operation. * \param[in] cigar_count The cigar count value to add to the length depending. */ - void update_alignment_lengths(int32_t & ref_length, int32_t & seq_length, char op, uint32_t cigar_count) + static void update_alignment_lengths(int32_t & ref_length, int32_t & seq_length, char op, uint32_t cigar_count) { if (is_char<'M'>(op) || is_char<'='>(op) || is_char<'X'>(op)) { @@ -1097,6 +1097,7 @@ class alignment_file_output_format ref_id_type && ref_id, std::optional ref_offset, align_type && align, + std::vector const & cigar_vector, uint16_t flag, uint8_t mapq, mate_type && mate, @@ -1291,6 +1292,11 @@ class alignment_file_output_format write_range(stream_it, detail::get_cigar_string(std::forward(align), offset, off_end)); } + else if (!cigar_vector.empty()) // both fields are never non-empty (fields asserted on file level) + { + for (auto & c : cigar_vector) + stream << c.to_char(); // each letter returns a small_vector instead of char so write_range doesn't work + } else { stream << '*'; diff --git a/include/seqan3/io/alignment_file/output.hpp b/include/seqan3/io/alignment_file/output.hpp index 8c08a824fc7..6ac5cdc0d3e 100644 --- a/include/seqan3/io/alignment_file/output.hpp +++ b/include/seqan3/io/alignment_file/output.hpp @@ -75,6 +75,7 @@ namespace seqan3 * 12. field::TAGS * 13. field::EVALUE * 14. field::BIT_SCORE + * 15. field::CIGAR * * There is an additional field called seqan3::field::HEADER_PTR. It is used to transfer * header information from seqan3::alignment_file_input to seqan3::alignment_file_output, @@ -209,6 +210,7 @@ class alignment_file_output field::REF_ID, field::REF_OFFSET, field::ALIGNMENT, + field::CIGAR, field::MAPQ, field::FLAG, field::QUAL, @@ -228,6 +230,13 @@ class alignment_file_output "please refer to the documentation of " "seqan3::alignment_file_output::field_ids for the accepted values."); + static_assert([] () constexpr + { + return !(selected_field_ids::contains(field::ALIGNMENT) && + selected_field_ids::contains(field::CIGAR)); + }(), + "You may not select field::ALIGNMENT and field::CIGAR at the same time."); + /*!\name Range associated types * \brief Most of the range associated types are `void` for output ranges. * \{ @@ -513,6 +522,7 @@ class alignment_file_output detail::get_or(r, std::ignore), detail::get_or(r, std::optional{}), detail::get_or(r, default_align_t{}), + detail::get_or(r, std::vector{}), detail::get_or(r, 0u), detail::get_or(r, 0u), detail::get_or(r, default_mate_t{}), @@ -561,6 +571,7 @@ class alignment_file_output detail::get_or(t, std::ignore), detail::get_or(t, std::optional{}), detail::get_or(t, default_align_t{}), + detail::get_or(t, std::vector{}), detail::get_or(t, 0u), detail::get_or(t, 0u), detail::get_or(t, default_mate_t{}), @@ -775,7 +786,7 @@ class alignment_file_output template void write_record(record_header_ptr_t && record_header_ptr, pack_type && ...remainder) { - static_assert((sizeof...(pack_type) == 14), "Wrong parameter list passed to write_record."); + static_assert((sizeof...(pack_type) == 15), "Wrong parameter list passed to write_record."); assert(!format.valueless_by_exception()); diff --git a/include/seqan3/io/alignment_file/output_format_concept.hpp b/include/seqan3/io/alignment_file/output_format_concept.hpp index 01dab7a96a5..e60433b00c5 100644 --- a/include/seqan3/io/alignment_file/output_format_concept.hpp +++ b/include/seqan3/io/alignment_file/output_format_concept.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -64,6 +65,7 @@ SEQAN3_CONCEPT AlignmentFileOutputFormat = std::optional & ref_id, std::optional & ref_offset, std::pair>, std::vector>> & align, + std::vector & cigar, uint16_t & flag, uint8_t & mapq, std::tuple, std::optional, int32_t> & mate, @@ -84,6 +86,7 @@ SEQAN3_CONCEPT AlignmentFileOutputFormat = ref_id, ref_offset, align, + cigar, flag, mapq, mate, @@ -111,6 +114,7 @@ SEQAN3_CONCEPT AlignmentFileOutputFormat = ref_id_type && ref_id, ref_offset_type && ref_offset, align_type && align, + std::vector & cigar, flag_type && flag, mapq_type && mapq, mate_type && mate, @@ -145,6 +149,7 @@ SEQAN3_CONCEPT AlignmentFileOutputFormat = * \param[in] ref_id The data for seqan3::field::REF_ID, e.g. the reference id.. * \param[in] ref_offset The data for seqan3::field::REF_OFFSET, i.e. the start position of the alignment in \p ref_seq. * \param[in] align The data for seqan3::field::ALIGN, e.g. the alignment between query and ref. + * \param[in] cigar The data for seqan3::field::CIGAR, e.g. representing the alignment between query and ref. * \param[in] flag The data for seqan3::field::FLAG, e.g. the SAM mapping flag value. * \param[in] mapq The data for seqan3::field::MAPQ, e.g. the mapping quality value. * \param[in] mate The data for seqan3::field::MATE, e.g. the mate information of paired reads. diff --git a/test/unit/io/alignment_file/alignment_file_format_test_template.hpp b/test/unit/io/alignment_file/alignment_file_format_test_template.hpp index 76432492291..36ecf68ed6f 100644 --- a/test/unit/io/alignment_file/alignment_file_format_test_template.hpp +++ b/test/unit/io/alignment_file/alignment_file_format_test_template.hpp @@ -495,6 +495,47 @@ TYPED_TEST_P(alignment_file_write, with_header) EXPECT_EQ(this->ostream.str(), this->verbose_output); } +TYPED_TEST_P(alignment_file_write, cigar_vector) +{ + std::vector> cigar_v + { + {{1, 'S'_cigar_op}, {1, 'M'_cigar_op}, {1, 'D'_cigar_op}, {1, 'M'_cigar_op}, {1, 'I'_cigar_op}}, + {{7, 'M'_cigar_op}, {1, 'D'_cigar_op}, {1, 'M'_cigar_op}, {1, 'S'_cigar_op}}, + {{1, 'S'_cigar_op}, {1, 'M'_cigar_op}, {1, 'P'_cigar_op}, {1, 'M'_cigar_op}, {1, 'I'_cigar_op}, + {1, 'M'_cigar_op}, {1, 'I'_cigar_op}, {1, 'D'_cigar_op}, {1, 'M'_cigar_op}, {1, 'S'_cigar_op}} + }; + + { + this->tag_dicts[0]["NM"_tag] = 7; + this->tag_dicts[0]["AS"_tag] = 2; + this->tag_dicts[1]["xy"_tag] = std::vector{3,4,5}; + + alignment_file_output fout{this->ostream, TypeParam{}, fields{}}; + + for (size_t i = 0; i < 3; ++i) + { + ASSERT_NO_THROW(fout.emplace_back(&(this->header), this->ids[i], this->flags[i], 0/*ref_id*/, + this->ref_offsets[i], this->mapqs[i], cigar_v[i], + this->offsets[i], this->mates[i], this->seqs[i], this->quals[i], + this->tag_dicts[i])); + } + } + + this->ostream.flush(); + EXPECT_EQ(this->ostream.str(), this->simple_three_reads_output); +} + TYPED_TEST_P(alignment_file_write, special_cases) { std::optional rid; @@ -573,5 +614,6 @@ REGISTER_TYPED_TEST_CASE_P(alignment_file_write, default_options_all_members_specified, write_ref_id_with_different_types, with_header, + cigar_vector, special_cases, format_errors);