From 58ab484ee02727286ef354d128225c79a61881bd Mon Sep 17 00:00:00 2001 From: Lydia Buntrock Date: Mon, 29 Mar 2021 18:33:07 +0200 Subject: [PATCH 1/3] [MISC] Move `parse_cigar` from `format_sam_base` into `cigar`. Signed-off-by: Lydia Buntrock --- include/seqan3/io/sam_file/detail/cigar.hpp | 65 ++++++++++++++++ .../io/sam_file/detail/format_sam_base.hpp | 74 +------------------ include/seqan3/io/sam_file/format_bam.hpp | 6 +- include/seqan3/io/sam_file/format_sam.hpp | 2 +- 4 files changed, 70 insertions(+), 77 deletions(-) diff --git a/include/seqan3/io/sam_file/detail/cigar.hpp b/include/seqan3/io/sam_file/detail/cigar.hpp index 6335355123..49c846a57e 100644 --- a/include/seqan3/io/sam_file/detail/cigar.hpp +++ b/include/seqan3/io/sam_file/detail/cigar.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -97,6 +98,70 @@ template return assign_char_to(operators[key], cigar::operation{}); } +/*!\brief Updates the sequence lengths by `cigar_count` depending on the cigar operation `op`. + * \param[in, out] ref_length The reference sequence's length. + * \param[in, out] seq_length The query sequence's length. + * \param[in] cigar_operation The cigar operation. + * \param[in] cigar_count The cigar count value to add to the length depending on the cigar operation. + */ +inline void update_alignment_lengths(int32_t & ref_length, + int32_t & seq_length, + char const cigar_operation, + uint32_t const cigar_count) +{ + switch (cigar_operation) + { + case 'M': case '=': case 'X': ref_length += cigar_count, seq_length += cigar_count; break; + case 'D': case 'N': ref_length += cigar_count; break; + case 'I' : seq_length += cigar_count; break; + case 'S': case 'H': case 'P': break; // no op (soft-clipping or padding does not increase either length) + default: throw format_error{"Illegal cigar operation: " + std::string{cigar_operation}}; + } +} + +/*!\brief Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)). + * \tparam cigar_input_type The type of a single pass input view over the cigar string; must model + * std::ranges::input_range. + * \param[in] cigar_input The single pass input view over the cigar string to parse. + * + * \returns A tuple of size three containing (1) std::vector over seqan3::cigar, that describes + * the alignment, (2) the aligned reference length, (3) the aligned query sequence length. + * + * \details + * + * For example, the view over the cigar string "1H4M1D2M2S" will return + * `{[(H,1), (M,4), (D,1), (M,2), (S,2)], 7, 6}`. + */ +template +inline std::tuple, int32_t, int32_t> parse_cigar(cigar_input_type && cigar_input) +{ + std::vector operations{}; + std::array buffer{}; // buffer to parse numbers with from_chars. Biggest number should fit in uint64_t + char cigar_operation{}; + uint32_t cigar_count{}; + int32_t ref_length{}, seq_length{}; // length of aligned part for ref and query + + // transform input into a single input view if it isn't already + auto cigar_view = cigar_input | views::single_pass_input; + + // parse the rest of the cigar + // ------------------------------------------------------------------------------------------------------------- + while (std::ranges::begin(cigar_view) != std::ranges::end(cigar_view)) // until stream is not empty + { + auto buff_end = (std::ranges::copy(cigar_view | views::take_until_or_throw(!is_digit), buffer.data())).out; + cigar_operation = *std::ranges::begin(cigar_view); + std::ranges::next(std::ranges::begin(cigar_view)); + + if (std::from_chars(buffer.begin(), buff_end, cigar_count).ec != std::errc{}) + throw format_error{"Corrupted cigar string encountered"}; + + update_alignment_lengths(ref_length, seq_length, cigar_operation, cigar_count); + operations.emplace_back(cigar_count, cigar::operation{}.assign_char(cigar_operation)); + } + + return {operations, ref_length, seq_length}; +} + /*!\brief Creates a cigar string (SAM format) given a seqan3::detail::pairwise_alignment represented by two * seqan3::aligned_sequence's. * \ingroup io_sam_file diff --git a/include/seqan3/io/sam_file/detail/format_sam_base.hpp b/include/seqan3/io/sam_file/detail/format_sam_base.hpp index 521acf7643..bfd03d6a11 100644 --- a/include/seqan3/io/sam_file/detail/format_sam_base.hpp +++ b/include/seqan3/io/sam_file/detail/format_sam_base.hpp @@ -43,7 +43,7 @@ #include #include #include -#include +// #include #include #include #include @@ -95,11 +95,6 @@ class format_sam_base header_type & header, ref_seqs_type & /*tag*/); - static void update_alignment_lengths(int32_t & ref_length, - int32_t & seq_length, - char const cigar_operation, - uint32_t const cigar_count); - template void construct_alignment(align_type & align, std::vector & cigar_vector, @@ -110,9 +105,6 @@ class format_sam_base void transfer_soft_clipping_to(std::vector const & cigar_vector, int32_t & sc_begin, int32_t & sc_end) const; - template - std::tuple, int32_t, int32_t> parse_cigar(cigar_input_type && cigar_input) const; - template void read_field(stream_view_type && stream_view, detail::ignore_t const & SEQAN3_DOXYGEN_ONLY(target)); @@ -190,27 +182,6 @@ inline void format_sam_base::check_and_assign_ref_id(ref_id_type & ref_id, } } -/*!\brief Updates the sequence lengths by `cigar_count` depending on the cigar operation `op`. - * \param[in, out] ref_length The reference sequence's length. - * \param[in, out] seq_length The query sequence's length. - * \param[in] cigar_operation The cigar operation. - * \param[in] cigar_count The cigar count value to add to the length depending on the cigar operation. - */ -inline void format_sam_base::update_alignment_lengths(int32_t & ref_length, - int32_t & seq_length, - char const cigar_operation, - uint32_t const cigar_count) -{ - switch (cigar_operation) - { - case 'M': case '=': case 'X': ref_length += cigar_count, seq_length += cigar_count; break; - case 'D': case 'N': ref_length += cigar_count; break; - case 'I' : seq_length += cigar_count; break; - case 'S': case 'H': case 'P': break; // no op (soft-clipping or padding does not increase either length) - default: throw format_error{"Illegal cigar operation: " + std::string{cigar_operation}}; - } -} - /*!\brief Transfer soft clipping information from the \p cigar_vector to \p sc_begin and \p sc_end. * \param[in] cigar_vector The cigar information to parse for soft-clipping. * \param[out] sc_begin The soft clipping at the beginning of the alignment to set. @@ -247,49 +218,6 @@ inline void format_sam_base::transfer_soft_clipping_to(std::vector const sc_end = cigar_count_at(second_last_index); } -/*!\brief Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)). - * \tparam cigar_input_type The type of a single pass input view over the cigar string; must model - * std::ranges::input_range. - * \param[in] cigar_input The single pass input view over the cigar string to parse. - * - * \returns A tuple of size three containing (1) std::vector over seqan3::cigar, that describes - * the alignment, (2) the aligned reference length, (3) the aligned query sequence length. - * - * \details - * - * For example, the view over the cigar string "1H4M1D2M2S" will return - * `{[(H,1), (M,4), (D,1), (M,2), (S,2)], 7, 6}`. - */ -template -inline std::tuple, int32_t, int32_t> format_sam_base::parse_cigar(cigar_input_type && cigar_input) const -{ - std::vector operations{}; - std::array buffer{}; // buffer to parse numbers with from_chars. Biggest number should fit in uint64_t - char cigar_operation{}; - uint32_t cigar_count{}; - int32_t ref_length{}, seq_length{}; // length of aligned part for ref and query - - // transform input into a single input view if it isn't already - auto cigar_view = cigar_input | views::single_pass_input; - - // parse the rest of the cigar - // ------------------------------------------------------------------------------------------------------------- - while (std::ranges::begin(cigar_view) != std::ranges::end(cigar_view)) // until stream is not empty - { - auto buff_end = (std::ranges::copy(cigar_view | views::take_until_or_throw(!is_digit), buffer.data())).out; - cigar_operation = *std::ranges::begin(cigar_view); - std::ranges::next(std::ranges::begin(cigar_view)); - - if (std::from_chars(buffer.begin(), buff_end, cigar_count).ec != std::errc{}) - throw format_error{"Corrupted cigar string encountered"}; - - update_alignment_lengths(ref_length, seq_length, cigar_operation, cigar_count); - operations.emplace_back(cigar_count, cigar::operation{}.assign_char(cigar_operation)); - } - - return {operations, ref_length, seq_length}; -} - /*!\brief Construct the field::alignment depending on the given information. * \tparam align_type The alignment type. * \tparam ref_seqs_type The type of reference sequences (might decay to ignore). diff --git a/include/seqan3/io/sam_file/format_bam.hpp b/include/seqan3/io/sam_file/format_bam.hpp index e08f450b3e..99f5217d2e 100644 --- a/include/seqan3/io/sam_file/format_bam.hpp +++ b/include/seqan3/io/sam_file/format_bam.hpp @@ -572,7 +572,7 @@ inline void format_bam::read_alignment_record(stream_type & stream, "record.")}; auto cigar_view = std::views::all(std::get(it->second)); - std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_cigar(cigar_view); + std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view); offset_tmp = soft_clipping_end = 0; transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end); tag_dict.erase(it); // remove redundant information @@ -746,7 +746,7 @@ inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & s { int32_t dummy_seq_length{}; for (auto & [count, operation] : cigar_vector) - update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count); + detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count); } else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align))) { @@ -1147,7 +1147,7 @@ inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint operation = cigar_mapping[operation_and_count & cigar_mask]; count = operation_and_count >> 4; - update_alignment_lengths(ref_length, seq_length, operation, count); + detail::update_alignment_lengths(ref_length, seq_length, operation, count); operations.emplace_back(count, cigar::operation{}.assign_char(operation)); --n_cigar_op; } diff --git a/include/seqan3/io/sam_file/format_sam.hpp b/include/seqan3/io/sam_file/format_sam.hpp index cb5eecf652..ae894fe597 100644 --- a/include/seqan3/io/sam_file/format_sam.hpp +++ b/include/seqan3/io/sam_file/format_sam.hpp @@ -460,7 +460,7 @@ inline void format_sam::read_alignment_record(stream_type & stream, { if (!is_char<'*'>(*std::ranges::begin(stream_view))) // no cigar information given { - std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_cigar(field_view); + std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(field_view); transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end); // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying } From 5d912fa06dfdf0c8da0a57b430959b7fed450c91 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Wed, 7 Apr 2021 11:06:18 +0200 Subject: [PATCH 2/3] Apply suggestions from code review --- include/seqan3/io/sam_file/detail/cigar.hpp | 1 - include/seqan3/io/sam_file/detail/format_sam_base.hpp | 1 - 2 files changed, 2 deletions(-) diff --git a/include/seqan3/io/sam_file/detail/cigar.hpp b/include/seqan3/io/sam_file/detail/cigar.hpp index 49c846a57e..f43366552e 100644 --- a/include/seqan3/io/sam_file/detail/cigar.hpp +++ b/include/seqan3/io/sam_file/detail/cigar.hpp @@ -27,7 +27,6 @@ #include #include -#include namespace seqan3::detail { diff --git a/include/seqan3/io/sam_file/detail/format_sam_base.hpp b/include/seqan3/io/sam_file/detail/format_sam_base.hpp index bfd03d6a11..6c0d1fab5b 100644 --- a/include/seqan3/io/sam_file/detail/format_sam_base.hpp +++ b/include/seqan3/io/sam_file/detail/format_sam_base.hpp @@ -43,7 +43,6 @@ #include #include #include -// #include #include #include #include From fe462a6fd1f3b21efbed329a1e2c68ac003724f9 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Wed, 7 Apr 2021 11:06:43 +0200 Subject: [PATCH 3/3] Update include/seqan3/io/sam_file/detail/cigar.hpp --- include/seqan3/io/sam_file/detail/cigar.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/include/seqan3/io/sam_file/detail/cigar.hpp b/include/seqan3/io/sam_file/detail/cigar.hpp index f43366552e..ea10dd2f8f 100644 --- a/include/seqan3/io/sam_file/detail/cigar.hpp +++ b/include/seqan3/io/sam_file/detail/cigar.hpp @@ -27,7 +27,6 @@ #include #include - namespace seqan3::detail { //!\brief Comparator that is able two compare two views