-
Notifications
You must be signed in to change notification settings - Fork 82
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[FEATURE] Add free function seqan3::cigar_from_alignment.
- Loading branch information
1 parent
9b98f11
commit b86720d
Showing
8 changed files
with
431 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
241 changes: 241 additions & 0 deletions
241
include/seqan3/alignment/cigar_conversion/cigar_from_alignment.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,241 @@ | ||
// ----------------------------------------------------------------------------------------------------- | ||
// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin | ||
// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik | ||
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License | ||
// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md | ||
// ----------------------------------------------------------------------------------------------------- | ||
|
||
/*!\file | ||
* \brief Provides the function seqan3::cigar_from_alignment and a helper struct seqan3::cigar_clipped_bases. | ||
* \author Svenja Mehringer <svenja.mehringer AT fu-berlin.de> | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <vector> | ||
|
||
#include <seqan3/alphabet/cigar/cigar.hpp> | ||
#include <seqan3/alphabet/gap/gap.hpp> | ||
#include <seqan3/utility/views/zip.hpp> | ||
|
||
namespace seqan3 | ||
{ | ||
|
||
/*!\brief Helper struct to specialise soft and hard clipping when using seqan3::cigar_from_alignment. | ||
* \ingroup cigar_conversion | ||
* | ||
* A CIGAR string might have hard or soft clipping at the front or back, e.g., `2H3S100M3S2H`. | ||
*/ | ||
struct cigar_clipped_bases | ||
{ | ||
uint32_t hard_front{}; //!< The number of hard clipped bases at the front of the CIGAR string. | ||
uint32_t hard_back{}; //!< The number of hard clipped bases at the back of the CIGAR string. | ||
uint32_t soft_front{}; //!< The number of soft clipped bases at the front of the CIGAR string. | ||
uint32_t soft_back{}; //!< The number of soft clipped bases at the back of the CIGAR string. | ||
}; | ||
|
||
/*!\brief Creates a CIGAR string (SAM format) given a seqan3::detail::pairwise_alignment represented by two | ||
* `seqan3::aligned_sequence`s. | ||
* \ingroup cigar_conversion | ||
* | ||
* \tparam alignment_type Must model seqan3::detail::pairwise_alignment. | ||
* \param alignment The alignment, represented by a pair of aligned sequences, | ||
* to be transformed into CIGAR vector based on the | ||
* second (query) sequence. | ||
* \param clipped_bases Provides information on whether the query sequence was cropped (hard clipping) before the | ||
* alignment or whether part of the query sequence does not take part (soft clipping) in the | ||
* alignment. | ||
* \param extended_cigar Whether to print the extended CIGAR alphabet or not. See `seqan3::cigar::operation`. | ||
* \returns A std::vector\<seqan3::cigar\> representing the alignment. | ||
* | ||
* \details | ||
* | ||
* \note The resulting `cigar_vector` is based on the query sequence, which is the second sequence in the `alignment` | ||
* pair. | ||
* | ||
* ### Example: | ||
* | ||
* Given the following alignment, reference sequence on top and the query sequence at | ||
* the bottom: | ||
* ``` | ||
* ATGG--CGTAGAGCTT | ||
* |||X |||X| ||| | ||
* ATGCCCCGTTG--CTT | ||
* ``` | ||
* In this case, the function `seqan3::cigar_from_alignment` will return the following CIGAR vector: | ||
* | ||
* ``` | ||
* [('M',4),('I',2),('M',5),('D',2),('M',3)] | ||
* ``` | ||
* | ||
* The extended CIGAR string would look like this: | ||
* | ||
* ``` | ||
* [('=',3)('X',1)('I',2)('=',3)('X',1)('=',1)('D',2)('=',3)] | ||
* ``` | ||
* | ||
* \include test/snippet/alignment/cigar_conversion/cigar_from_alignment.cpp | ||
* | ||
* ### Soft and Hard clipping | ||
* | ||
* The terms soft and hard clipping were introduced by the | ||
* [SAM specifications](https://samtools.github.io/hts-specs/SAMv1.pdf). A SAM file is only storing a semi alignment | ||
* represented by the CIGAR string. The semi alignment of a query sequence is most often the result of a | ||
* [read mapping.](https://training.galaxyproject.org/training-material/topics/sequence-analysis/tutorials/mapping/tutorial.html) | ||
* | ||
* #### Hard clipping | ||
* | ||
* Before aligning a query or read to the reference, some tools crop the query sequence because the quality is bad at | ||
* one end (e.g., Illumina reads tend to display a bad sequence quality towards the end of the read). | ||
* | ||
* To inform the user of a SAM file that query sequences were altered, hard-clipping information is appended to the | ||
* CIGAR string. E.g. `100M50H` indicates that a read of former length `150` has been cropped at the right end by `50` | ||
* bases. The sequence in the SAM file will thus only be 100 bases long. | ||
* | ||
* #### Soft clipping | ||
* | ||
* In contrast to hard clipping, soft clipping indicates that the read was cropped and the respective | ||
* bases do not participate in the alignment, but they are still part of the reported sequence. | ||
* E.g., `100M50S` indicates that a read of length `150` has been aligned without the rightmost `50` bases. | ||
* The sequence in the SAM file will still be 150 bases long. | ||
* | ||
* #### Adding soft and hard clipping with `seqan3::cigar_from_alignment` | ||
* | ||
* You can add the respective clipping information by passing an instance of seqan3::cigar_clipped_bases: | ||
* | ||
* \include test/snippet/alignment/cigar_conversion/cigar_from_alignment_with_clipping.cpp | ||
* | ||
* \sa seqan3::cigar_clipped_bases | ||
* \sa seqan3::sam_file_output | ||
* \sa seqan3::cigar | ||
* \sa seqan3::aligned_sequence | ||
*/ | ||
template <typename alignment_type> | ||
inline auto cigar_from_alignment(alignment_type const & alignment, | ||
cigar_clipped_bases const & clipped_bases = {}, | ||
bool const extended_cigar = false) | ||
{ | ||
static_assert((tuple_like<std::remove_cvref_t<alignment_type>> | ||
&& std::tuple_size_v<std::remove_cvref_t<alignment_type>> == 2), | ||
"The alignment must be a std::pair or std::tuple of size 2."); | ||
|
||
static_assert( | ||
(std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(alignment))>> | ||
&& std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(alignment))>>), | ||
"The alignment must contain two ranges whose value_type is comparable to seqan3::gap."); | ||
|
||
/*brief: Compares two seqan3::aligned_sequence values and returns their cigar operation. | ||
* param reference_char The aligned character of the reference to compare. | ||
* param query_char The aligned character of the query to compare. | ||
* param extended_cigar Whether to print the extended cigar alphabet or not. See cigar operation. | ||
* returns A seqan3::cigar::operation representing the alignment operation between the two values. | ||
* | ||
* The resulting CIGAR operation is based on the query character (`query_char`). | ||
* | ||
* ### Example: | ||
* | ||
* The following alignment column shows the reference char ('C') on top and a | ||
* gap for the query char at the bottom. | ||
* ``` | ||
* ... C ... | ||
* | | ||
* ... - ... | ||
* ``` | ||
* In this case, `to_cigar_op` will return | ||
* 'D' since the query char is "deleted". | ||
* | ||
* The next alignment column shows the reference char ('C') on top and a | ||
* query char ('G') at the bottom. | ||
* ``` | ||
* ... C ... | ||
* | | ||
* ... G ... | ||
* ``` | ||
* In this case, `to_cigar_op` will return | ||
* 'M', for the basic CIGAR the two bases are aligned, while | ||
* in the extended CIGAR alphabet (`extended_cigar` = `true`) the function | ||
* will return an 'X' since the bases are aligned but are not | ||
* equal. | ||
* \sa seqan3::aligned_sequence | ||
*/ | ||
auto to_cigar_op = [extended_cigar](auto const reference_char, auto const query_char) | ||
{ | ||
// note that N is not considered because it is equivalent to D but has a special meaning: | ||
// SAM spec: "For mRNA-to-genome alignment, an N operation represents an intron. For other types of alignments, | ||
// the interpretation of N is not defined." | ||
// as we cannot know the meaning, the user has to change D -> N themself | ||
constexpr std::array<char, 6> operators{'M', 'D', 'I', 'P', 'X', '='}; // contains the possible cigar operators. | ||
|
||
// no gaps -> 00 -> 0 | ||
// gap only in query -> 01 -> 1 | ||
// gap only in reference -> 10 -> 2 | ||
// both gaps -> 11 -> 3 | ||
uint8_t key = (static_cast<uint8_t>(reference_char == gap{}) << 1) | static_cast<uint8_t>(query_char == gap{}); | ||
|
||
// mismatch -> 100 -> 4 | ||
// match -> 101 -> 5 | ||
if (extended_cigar && (key == 0)) // in extended format, refine the substitution operator to match/mismatch. | ||
key |= ((1 << 2) | static_cast<uint8_t>(query_char == reference_char)); // maps to [4, 5]. | ||
|
||
return assign_char_to(operators[key], cigar::operation{}); | ||
}; | ||
|
||
using std::get; | ||
|
||
auto & ref_seq = get<0>(alignment); | ||
auto & query_seq = get<1>(alignment); | ||
|
||
if (ref_seq.size() != query_seq.size()) | ||
throw std::logic_error{"The aligned sequences (including gaps) must have the same length."}; | ||
|
||
if (std::ranges::empty(ref_seq)) // only check ref_seq because query_seq was checked to have to same size | ||
throw std::logic_error{"The aligned sequences may not be empty."}; | ||
|
||
std::vector<cigar> result{}; | ||
|
||
// Add (H)ard-clipping at the start of the query | ||
if (clipped_bases.hard_front) | ||
result.emplace_back(clipped_bases.hard_front, 'H'_cigar_operation); | ||
|
||
// Add (S)oft-clipping at the start of the query | ||
if (clipped_bases.soft_front) | ||
result.emplace_back(clipped_bases.soft_front, 'S'_cigar_operation); | ||
|
||
// Create cigar string from alignment | ||
// ------------------------------------------------------------------------- | ||
// initialize first operation and count value: | ||
cigar::operation operation{to_cigar_op(ref_seq[0], query_seq[0])}; | ||
uint32_t count{0}; | ||
|
||
// go through alignment columns | ||
for (auto && [reference_char, query_char] : views::zip(ref_seq, query_seq)) | ||
{ | ||
cigar::operation next_op = to_cigar_op(reference_char, query_char); | ||
|
||
if (operation == next_op) | ||
{ | ||
++count; | ||
} | ||
else | ||
{ | ||
result.emplace_back(count, operation); | ||
operation = next_op; | ||
count = 1; | ||
} | ||
} | ||
|
||
// append last cigar element | ||
result.emplace_back(count, operation); | ||
|
||
// Add (S)oft-clipping at the end of the query | ||
if (clipped_bases.soft_back) | ||
result.emplace_back(clipped_bases.soft_back, 'S'_cigar_operation); | ||
|
||
// Add (H)ard-clipping at the end of the query | ||
if (clipped_bases.hard_back) | ||
result.emplace_back(clipped_bases.hard_back, 'H'_cigar_operation); | ||
|
||
return result; | ||
} | ||
|
||
} // namespace seqan3 |
31 changes: 31 additions & 0 deletions
31
test/snippet/alignment/cigar_conversion/cigar_from_alignment.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#include <seqan3/alignment/aligned_sequence/aligned_sequence_concept.hpp> | ||
#include <seqan3/alignment/cigar_conversion/cigar_from_alignment.hpp> | ||
#include <seqan3/alignment/decorator/gap_decorator.hpp> | ||
#include <seqan3/alphabet/gap/gapped.hpp> | ||
#include <seqan3/alphabet/nucleotide/dna5.hpp> | ||
#include <seqan3/core/debug_stream.hpp> | ||
#include <seqan3/utility/views/slice.hpp> | ||
|
||
int main() | ||
{ | ||
using namespace seqan3::literals; | ||
|
||
seqan3::dna5_vector reference = "ATGGCGTAGAGCTTCCCCCCCCCCCCCCCCC"_dna5; | ||
seqan3::dna5_vector read = "ATGCCCCGTTGCTT"_dna5; // length 14 | ||
|
||
// Align the full query against the first 14 bases of the reference. | ||
seqan3::gap_decorator aligned_reference{reference | seqan3::views::slice(0, 14)}; | ||
seqan3::gap_decorator aligned_read{read}; | ||
// Insert gaps to represent the alignment: | ||
seqan3::insert_gap(aligned_read, aligned_read.begin() + 11, 2); | ||
seqan3::insert_gap(aligned_reference, aligned_reference.begin() + 4, 2); | ||
|
||
seqan3::debug_stream << aligned_reference << '\n' << aligned_read << '\n'; | ||
// prints: | ||
// ATGG--CGTAGAGCTT | ||
// ATGCCCCGTTG--CTT | ||
|
||
auto cigar_sequence = seqan3::cigar_from_alignment(std::tie(aligned_reference, aligned_read)); | ||
|
||
seqan3::debug_stream << cigar_sequence << '\n'; // prints [4M,2I,5M,2D,3M] | ||
} |
1 change: 1 addition & 0 deletions
1
test/snippet/alignment/cigar_conversion/cigar_from_alignment.err
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
[4M,2I,5M,2D,3M] |
29 changes: 29 additions & 0 deletions
29
test/snippet/alignment/cigar_conversion/cigar_from_alignment_with_clipping.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#include <seqan3/alignment/aligned_sequence/aligned_sequence_concept.hpp> | ||
#include <seqan3/alignment/cigar_conversion/cigar_from_alignment.hpp> | ||
#include <seqan3/alignment/decorator/gap_decorator.hpp> | ||
#include <seqan3/alphabet/gap/gapped.hpp> | ||
#include <seqan3/alphabet/nucleotide/dna5.hpp> | ||
#include <seqan3/core/debug_stream.hpp> | ||
#include <seqan3/utility/views/slice.hpp> | ||
|
||
int main() | ||
{ | ||
using namespace seqan3::literals; | ||
|
||
seqan3::dna5_vector reference = "ATGGCGTAGAGCTTCCCCCCCCCCCCCCCCC"_dna5; | ||
seqan3::dna5_vector read = "ATGCCCCGTTGCTT"_dna5; // length 14 | ||
|
||
// Let's say, we want to ignore the last 2 bases of the query because the quality is low. | ||
// We thus only align the first 12 bases, the last two will be soft-clipped bases in the CIGAR string. | ||
seqan3::gap_decorator aligned_reference{reference | seqan3::views::slice(0, 12)}; | ||
seqan3::gap_decorator aligned_query{read | seqan3::views::slice(0, 12)}; | ||
// insert gaps | ||
seqan3::insert_gap(aligned_reference, aligned_reference.begin() + 4, 2); | ||
seqan3::insert_gap(aligned_query, aligned_query.begin() + 11, 2); | ||
|
||
auto cigar_sequence = | ||
seqan3::cigar_from_alignment(std::tie(aligned_reference, aligned_query), | ||
{.hard_front = 1, .hard_back = 0, .soft_front = 0, .soft_back = 2}); | ||
|
||
seqan3::debug_stream << cigar_sequence << '\n'; // prints [1H,4M,2I,5M,2D,1M,2S] | ||
} |
1 change: 1 addition & 0 deletions
1
test/snippet/alignment/cigar_conversion/cigar_from_alignment_with_clipping.err
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
[1H,4M,2I,5M,2D,1M,2S] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
seqan3_test (alignment_from_cigar_test.cpp) | ||
seqan3_test (cigar_from_alignment_test.cpp) |
Oops, something went wrong.