Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MISC] Move parse_cigar from format_sam_base into cigar. #2502

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions include/seqan3/io/sam_file/detail/cigar.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <seqan3/range/views/single_pass_input.hpp>
#include <seqan3/range/views/take_until.hpp>
#include <seqan3/range/views/zip.hpp>
#include <seqan3/utility/char_operations/predicate.hpp>
#include <seqan3/utility/tuple/concept.hpp>

eseiler marked this conversation as resolved.
Show resolved Hide resolved
#include <range/v3/view/zip.hpp>
eseiler marked this conversation as resolved.
Show resolved Hide resolved
Expand Down Expand Up @@ -97,6 +98,70 @@ template <typename reference_char_type, typename query_char_type>
return assign_char_to(operators[key], cigar::operation{});
}

/*!\brief Updates the sequence lengths by `cigar_count` depending on the cigar operation `op`.
* \param[in, out] ref_length The reference sequence's length.
* \param[in, out] seq_length The query sequence's length.
* \param[in] cigar_operation The cigar operation.
* \param[in] cigar_count The cigar count value to add to the length depending on the cigar operation.
*/
inline void update_alignment_lengths(int32_t & ref_length,
int32_t & seq_length,
char const cigar_operation,
uint32_t const cigar_count)
{
switch (cigar_operation)
{
case 'M': case '=': case 'X': ref_length += cigar_count, seq_length += cigar_count; break;
case 'D': case 'N': ref_length += cigar_count; break;
case 'I' : seq_length += cigar_count; break;
case 'S': case 'H': case 'P': break; // no op (soft-clipping or padding does not increase either length)
default: throw format_error{"Illegal cigar operation: " + std::string{cigar_operation}};
}
}

/*!\brief Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
* \tparam cigar_input_type The type of a single pass input view over the cigar string; must model
* std::ranges::input_range.
* \param[in] cigar_input The single pass input view over the cigar string to parse.
*
* \returns A tuple of size three containing (1) std::vector over seqan3::cigar, that describes
* the alignment, (2) the aligned reference length, (3) the aligned query sequence length.
*
* \details
*
* For example, the view over the cigar string "1H4M1D2M2S" will return
* `{[(H,1), (M,4), (D,1), (M,2), (S,2)], 7, 6}`.
*/
template <typename cigar_input_type>
inline std::tuple<std::vector<cigar>, int32_t, int32_t> parse_cigar(cigar_input_type && cigar_input)
{
std::vector<cigar> operations{};
std::array<char, 20> buffer{}; // buffer to parse numbers with from_chars. Biggest number should fit in uint64_t
char cigar_operation{};
uint32_t cigar_count{};
int32_t ref_length{}, seq_length{}; // length of aligned part for ref and query

// transform input into a single input view if it isn't already
auto cigar_view = cigar_input | views::single_pass_input;

// parse the rest of the cigar
// -------------------------------------------------------------------------------------------------------------
while (std::ranges::begin(cigar_view) != std::ranges::end(cigar_view)) // until stream is not empty
{
auto buff_end = (std::ranges::copy(cigar_view | views::take_until_or_throw(!is_digit), buffer.data())).out;
cigar_operation = *std::ranges::begin(cigar_view);
std::ranges::next(std::ranges::begin(cigar_view));

if (std::from_chars(buffer.begin(), buff_end, cigar_count).ec != std::errc{})
throw format_error{"Corrupted cigar string encountered"};

update_alignment_lengths(ref_length, seq_length, cigar_operation, cigar_count);
operations.emplace_back(cigar_count, cigar::operation{}.assign_char(cigar_operation));
}

return {operations, ref_length, seq_length};
}

/*!\brief Creates a cigar string (SAM format) given a seqan3::detail::pairwise_alignment represented by two
* seqan3::aligned_sequence's.
* \ingroup io_sam_file
Expand Down
74 changes: 1 addition & 73 deletions include/seqan3/io/sam_file/detail/format_sam_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
#include <seqan3/range/views/to.hpp>
#include <seqan3/range/views/to_char.hpp>
#include <seqan3/range/views/zip.hpp>
#include <seqan3/utility/char_operations/predicate.hpp>
// #include <seqan3/utility/char_operations/predicate.hpp>
eseiler marked this conversation as resolved.
Show resolved Hide resolved
#include <seqan3/utility/detail/exposition_only_concept.hpp>
#include <seqan3/utility/detail/type_name_as_string.hpp>
#include <seqan3/utility/tuple/concept.hpp>
Expand Down Expand Up @@ -95,11 +95,6 @@ class format_sam_base
header_type & header,
ref_seqs_type & /*tag*/);

static void update_alignment_lengths(int32_t & ref_length,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This static void does not appear in the include/seqan3/io/sam_file/detail/cigar.hpp. Is it just unnecessary ? :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As the function is not in a class anymore, I think the static is not necessary.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A static member of a class would mean that the member (in this case a member function) is independent of the actual object. Maybe think of it as "there is one (global) instance of this member" and the member exists without any instance of the class existing.

It also means that you do not need an instance of the format_sam_base (or any derived) class to invoke this function. You can use format_sam_base::update_alignment_lengths instead of some_instance.update_alignment_lengths.

int32_t & seq_length,
char const cigar_operation,
uint32_t const cigar_count);

template <typename align_type, typename ref_seqs_type>
void construct_alignment(align_type & align,
std::vector<cigar> & cigar_vector,
Expand All @@ -110,9 +105,6 @@ class format_sam_base

void transfer_soft_clipping_to(std::vector<cigar> const & cigar_vector, int32_t & sc_begin, int32_t & sc_end) const;

template <typename cigar_input_type>
std::tuple<std::vector<cigar>, int32_t, int32_t> parse_cigar(cigar_input_type && cigar_input) const;

template <typename stream_view_type>
void read_field(stream_view_type && stream_view, detail::ignore_t const & SEQAN3_DOXYGEN_ONLY(target));

Expand Down Expand Up @@ -190,27 +182,6 @@ inline void format_sam_base::check_and_assign_ref_id(ref_id_type & ref_id,
}
}

/*!\brief Updates the sequence lengths by `cigar_count` depending on the cigar operation `op`.
* \param[in, out] ref_length The reference sequence's length.
* \param[in, out] seq_length The query sequence's length.
* \param[in] cigar_operation The cigar operation.
* \param[in] cigar_count The cigar count value to add to the length depending on the cigar operation.
*/
inline void format_sam_base::update_alignment_lengths(int32_t & ref_length,
int32_t & seq_length,
char const cigar_operation,
uint32_t const cigar_count)
{
switch (cigar_operation)
{
case 'M': case '=': case 'X': ref_length += cigar_count, seq_length += cigar_count; break;
case 'D': case 'N': ref_length += cigar_count; break;
case 'I' : seq_length += cigar_count; break;
case 'S': case 'H': case 'P': break; // no op (soft-clipping or padding does not increase either length)
default: throw format_error{"Illegal cigar operation: " + std::string{cigar_operation}};
}
}

/*!\brief Transfer soft clipping information from the \p cigar_vector to \p sc_begin and \p sc_end.
* \param[in] cigar_vector The cigar information to parse for soft-clipping.
* \param[out] sc_begin The soft clipping at the beginning of the alignment to set.
Expand Down Expand Up @@ -247,49 +218,6 @@ inline void format_sam_base::transfer_soft_clipping_to(std::vector<cigar> const
sc_end = cigar_count_at(second_last_index);
}

/*!\brief Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
* \tparam cigar_input_type The type of a single pass input view over the cigar string; must model
* std::ranges::input_range.
* \param[in] cigar_input The single pass input view over the cigar string to parse.
*
* \returns A tuple of size three containing (1) std::vector over seqan3::cigar, that describes
* the alignment, (2) the aligned reference length, (3) the aligned query sequence length.
*
* \details
*
* For example, the view over the cigar string "1H4M1D2M2S" will return
* `{[(H,1), (M,4), (D,1), (M,2), (S,2)], 7, 6}`.
*/
template <typename cigar_input_type>
inline std::tuple<std::vector<cigar>, int32_t, int32_t> format_sam_base::parse_cigar(cigar_input_type && cigar_input) const
{
std::vector<cigar> operations{};
std::array<char, 20> buffer{}; // buffer to parse numbers with from_chars. Biggest number should fit in uint64_t
char cigar_operation{};
uint32_t cigar_count{};
int32_t ref_length{}, seq_length{}; // length of aligned part for ref and query

// transform input into a single input view if it isn't already
auto cigar_view = cigar_input | views::single_pass_input;

// parse the rest of the cigar
// -------------------------------------------------------------------------------------------------------------
while (std::ranges::begin(cigar_view) != std::ranges::end(cigar_view)) // until stream is not empty
{
auto buff_end = (std::ranges::copy(cigar_view | views::take_until_or_throw(!is_digit), buffer.data())).out;
cigar_operation = *std::ranges::begin(cigar_view);
std::ranges::next(std::ranges::begin(cigar_view));

if (std::from_chars(buffer.begin(), buff_end, cigar_count).ec != std::errc{})
throw format_error{"Corrupted cigar string encountered"};

update_alignment_lengths(ref_length, seq_length, cigar_operation, cigar_count);
operations.emplace_back(cigar_count, cigar::operation{}.assign_char(cigar_operation));
}

return {operations, ref_length, seq_length};
}

/*!\brief Construct the field::alignment depending on the given information.
* \tparam align_type The alignment type.
* \tparam ref_seqs_type The type of reference sequences (might decay to ignore).
Expand Down
6 changes: 3 additions & 3 deletions include/seqan3/io/sam_file/format_bam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,7 @@ inline void format_bam::read_alignment_record(stream_type & stream,
"record.")};

auto cigar_view = std::views::all(std::get<std::string>(it->second));
std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_cigar(cigar_view);
std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
offset_tmp = soft_clipping_end = 0;
transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
tag_dict.erase(it); // remove redundant information
Expand Down Expand Up @@ -746,7 +746,7 @@ inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & s
{
int32_t dummy_seq_length{};
for (auto & [count, operation] : cigar_vector)
update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
}
else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
{
Expand Down Expand Up @@ -1147,7 +1147,7 @@ inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint
operation = cigar_mapping[operation_and_count & cigar_mask];
count = operation_and_count >> 4;

update_alignment_lengths(ref_length, seq_length, operation, count);
detail::update_alignment_lengths(ref_length, seq_length, operation, count);
operations.emplace_back(count, cigar::operation{}.assign_char(operation));
--n_cigar_op;
}
Expand Down
2 changes: 1 addition & 1 deletion include/seqan3/io/sam_file/format_sam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ inline void format_sam::read_alignment_record(stream_type & stream,
{
if (!is_char<'*'>(*std::ranges::begin(stream_view))) // no cigar information given
{
std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_cigar(field_view);
std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(field_view);
transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
// the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
}
Expand Down