Skip to content

Commit

Permalink
[MISC] Move parse_cigar from format_sam_base into cigar. (#2502)
Browse files Browse the repository at this point in the history
* [MISC] Move `parse_cigar` from `format_sam_base` into `cigar`.

Signed-off-by: Lydia Buntrock <[email protected]>

* Apply suggestions from code review

* Update include/seqan3/io/sam_file/detail/cigar.hpp

Co-authored-by: Enrico Seiler <[email protected]>
  • Loading branch information
Irallia and eseiler authored Apr 7, 2021
1 parent 30e966c commit 5d4aa4b
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 79 deletions.
67 changes: 65 additions & 2 deletions include/seqan3/io/sam_file/detail/cigar.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,9 @@
#include <seqan3/range/views/single_pass_input.hpp>
#include <seqan3/range/views/take_until.hpp>
#include <seqan3/range/views/zip.hpp>
#include <seqan3/utility/char_operations/predicate.hpp>
#include <seqan3/utility/tuple/concept.hpp>

#include <range/v3/view/zip.hpp>

namespace seqan3::detail
{
//!\brief Comparator that is able two compare two views
Expand Down Expand Up @@ -97,6 +96,70 @@ template <typename reference_char_type, typename query_char_type>
return assign_char_to(operators[key], cigar::operation{});
}

/*!\brief Updates the sequence lengths by `cigar_count` depending on the cigar operation `op`.
* \param[in, out] ref_length The reference sequence's length.
* \param[in, out] seq_length The query sequence's length.
* \param[in] cigar_operation The cigar operation.
* \param[in] cigar_count The cigar count value to add to the length depending on the cigar operation.
*/
inline void update_alignment_lengths(int32_t & ref_length,
int32_t & seq_length,
char const cigar_operation,
uint32_t const cigar_count)
{
switch (cigar_operation)
{
case 'M': case '=': case 'X': ref_length += cigar_count, seq_length += cigar_count; break;
case 'D': case 'N': ref_length += cigar_count; break;
case 'I' : seq_length += cigar_count; break;
case 'S': case 'H': case 'P': break; // no op (soft-clipping or padding does not increase either length)
default: throw format_error{"Illegal cigar operation: " + std::string{cigar_operation}};
}
}

/*!\brief Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
* \tparam cigar_input_type The type of a single pass input view over the cigar string; must model
* std::ranges::input_range.
* \param[in] cigar_input The single pass input view over the cigar string to parse.
*
* \returns A tuple of size three containing (1) std::vector over seqan3::cigar, that describes
* the alignment, (2) the aligned reference length, (3) the aligned query sequence length.
*
* \details
*
* For example, the view over the cigar string "1H4M1D2M2S" will return
* `{[(H,1), (M,4), (D,1), (M,2), (S,2)], 7, 6}`.
*/
template <typename cigar_input_type>
inline std::tuple<std::vector<cigar>, int32_t, int32_t> parse_cigar(cigar_input_type && cigar_input)
{
std::vector<cigar> operations{};
std::array<char, 20> buffer{}; // buffer to parse numbers with from_chars. Biggest number should fit in uint64_t
char cigar_operation{};
uint32_t cigar_count{};
int32_t ref_length{}, seq_length{}; // length of aligned part for ref and query

// transform input into a single input view if it isn't already
auto cigar_view = cigar_input | views::single_pass_input;

// parse the rest of the cigar
// -------------------------------------------------------------------------------------------------------------
while (std::ranges::begin(cigar_view) != std::ranges::end(cigar_view)) // until stream is not empty
{
auto buff_end = (std::ranges::copy(cigar_view | views::take_until_or_throw(!is_digit), buffer.data())).out;
cigar_operation = *std::ranges::begin(cigar_view);
std::ranges::next(std::ranges::begin(cigar_view));

if (std::from_chars(buffer.begin(), buff_end, cigar_count).ec != std::errc{})
throw format_error{"Corrupted cigar string encountered"};

update_alignment_lengths(ref_length, seq_length, cigar_operation, cigar_count);
operations.emplace_back(cigar_count, cigar::operation{}.assign_char(cigar_operation));
}

return {operations, ref_length, seq_length};
}

/*!\brief Creates a cigar string (SAM format) given a seqan3::detail::pairwise_alignment represented by two
* seqan3::aligned_sequence's.
* \ingroup io_sam_file
Expand Down
73 changes: 0 additions & 73 deletions include/seqan3/io/sam_file/detail/format_sam_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
#include <seqan3/range/views/to.hpp>
#include <seqan3/range/views/to_char.hpp>
#include <seqan3/range/views/zip.hpp>
#include <seqan3/utility/char_operations/predicate.hpp>
#include <seqan3/utility/detail/exposition_only_concept.hpp>
#include <seqan3/utility/detail/type_name_as_string.hpp>
#include <seqan3/utility/tuple/concept.hpp>
Expand Down Expand Up @@ -95,11 +94,6 @@ class format_sam_base
header_type & header,
ref_seqs_type & /*tag*/);

static void update_alignment_lengths(int32_t & ref_length,
int32_t & seq_length,
char const cigar_operation,
uint32_t const cigar_count);

template <typename align_type, typename ref_seqs_type>
void construct_alignment(align_type & align,
std::vector<cigar> & cigar_vector,
Expand All @@ -110,9 +104,6 @@ class format_sam_base

void transfer_soft_clipping_to(std::vector<cigar> const & cigar_vector, int32_t & sc_begin, int32_t & sc_end) const;

template <typename cigar_input_type>
std::tuple<std::vector<cigar>, int32_t, int32_t> parse_cigar(cigar_input_type && cigar_input) const;

template <typename stream_view_type>
void read_field(stream_view_type && stream_view, detail::ignore_t const & SEQAN3_DOXYGEN_ONLY(target));

Expand Down Expand Up @@ -190,27 +181,6 @@ inline void format_sam_base::check_and_assign_ref_id(ref_id_type & ref_id,
}
}

/*!\brief Updates the sequence lengths by `cigar_count` depending on the cigar operation `op`.
* \param[in, out] ref_length The reference sequence's length.
* \param[in, out] seq_length The query sequence's length.
* \param[in] cigar_operation The cigar operation.
* \param[in] cigar_count The cigar count value to add to the length depending on the cigar operation.
*/
inline void format_sam_base::update_alignment_lengths(int32_t & ref_length,
int32_t & seq_length,
char const cigar_operation,
uint32_t const cigar_count)
{
switch (cigar_operation)
{
case 'M': case '=': case 'X': ref_length += cigar_count, seq_length += cigar_count; break;
case 'D': case 'N': ref_length += cigar_count; break;
case 'I' : seq_length += cigar_count; break;
case 'S': case 'H': case 'P': break; // no op (soft-clipping or padding does not increase either length)
default: throw format_error{"Illegal cigar operation: " + std::string{cigar_operation}};
}
}

/*!\brief Transfer soft clipping information from the \p cigar_vector to \p sc_begin and \p sc_end.
* \param[in] cigar_vector The cigar information to parse for soft-clipping.
* \param[out] sc_begin The soft clipping at the beginning of the alignment to set.
Expand Down Expand Up @@ -247,49 +217,6 @@ inline void format_sam_base::transfer_soft_clipping_to(std::vector<cigar> const
sc_end = cigar_count_at(second_last_index);
}

/*!\brief Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
* \tparam cigar_input_type The type of a single pass input view over the cigar string; must model
* std::ranges::input_range.
* \param[in] cigar_input The single pass input view over the cigar string to parse.
*
* \returns A tuple of size three containing (1) std::vector over seqan3::cigar, that describes
* the alignment, (2) the aligned reference length, (3) the aligned query sequence length.
*
* \details
*
* For example, the view over the cigar string "1H4M1D2M2S" will return
* `{[(H,1), (M,4), (D,1), (M,2), (S,2)], 7, 6}`.
*/
template <typename cigar_input_type>
inline std::tuple<std::vector<cigar>, int32_t, int32_t> format_sam_base::parse_cigar(cigar_input_type && cigar_input) const
{
std::vector<cigar> operations{};
std::array<char, 20> buffer{}; // buffer to parse numbers with from_chars. Biggest number should fit in uint64_t
char cigar_operation{};
uint32_t cigar_count{};
int32_t ref_length{}, seq_length{}; // length of aligned part for ref and query

// transform input into a single input view if it isn't already
auto cigar_view = cigar_input | views::single_pass_input;

// parse the rest of the cigar
// -------------------------------------------------------------------------------------------------------------
while (std::ranges::begin(cigar_view) != std::ranges::end(cigar_view)) // until stream is not empty
{
auto buff_end = (std::ranges::copy(cigar_view | views::take_until_or_throw(!is_digit), buffer.data())).out;
cigar_operation = *std::ranges::begin(cigar_view);
std::ranges::next(std::ranges::begin(cigar_view));

if (std::from_chars(buffer.begin(), buff_end, cigar_count).ec != std::errc{})
throw format_error{"Corrupted cigar string encountered"};

update_alignment_lengths(ref_length, seq_length, cigar_operation, cigar_count);
operations.emplace_back(cigar_count, cigar::operation{}.assign_char(cigar_operation));
}

return {operations, ref_length, seq_length};
}

/*!\brief Construct the field::alignment depending on the given information.
* \tparam align_type The alignment type.
* \tparam ref_seqs_type The type of reference sequences (might decay to ignore).
Expand Down
6 changes: 3 additions & 3 deletions include/seqan3/io/sam_file/format_bam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,7 @@ inline void format_bam::read_alignment_record(stream_type & stream,
"record.")};

auto cigar_view = std::views::all(std::get<std::string>(it->second));
std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_cigar(cigar_view);
std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
offset_tmp = soft_clipping_end = 0;
transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
tag_dict.erase(it); // remove redundant information
Expand Down Expand Up @@ -746,7 +746,7 @@ inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & s
{
int32_t dummy_seq_length{};
for (auto & [count, operation] : cigar_vector)
update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
}
else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
{
Expand Down Expand Up @@ -1147,7 +1147,7 @@ inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint
operation = cigar_mapping[operation_and_count & cigar_mask];
count = operation_and_count >> 4;

update_alignment_lengths(ref_length, seq_length, operation, count);
detail::update_alignment_lengths(ref_length, seq_length, operation, count);
operations.emplace_back(count, cigar::operation{}.assign_char(operation));
--n_cigar_op;
}
Expand Down
2 changes: 1 addition & 1 deletion include/seqan3/io/sam_file/format_sam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ inline void format_sam::read_alignment_record(stream_type & stream,
{
if (!is_char<'*'>(*std::ranges::begin(stream_view))) // no cigar information given
{
std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_cigar(field_view);
std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(field_view);
transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
// the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
}
Expand Down

1 comment on commit 5d4aa4b

@vercel
Copy link

@vercel vercel bot commented on 5d4aa4b Apr 7, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.