From b86720da43320a5157c8aaa914296e524f2707a5 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Fri, 14 Oct 2022 11:39:16 +0200 Subject: [PATCH] [FEATURE] Add free function seqan3::cigar_from_alignment. --- .../seqan3/alignment/cigar_conversion/all.hpp | 1 + .../cigar_conversion/cigar_from_alignment.hpp | 241 ++++++++++++++++++ .../cigar_conversion/cigar_from_alignment.cpp | 31 +++ .../cigar_conversion/cigar_from_alignment.err | 1 + .../cigar_from_alignment_with_clipping.cpp | 29 +++ .../cigar_from_alignment_with_clipping.err | 1 + .../alignment/cigar_conversion/CMakeLists.txt | 1 + .../cigar_from_alignment_test.cpp | 126 +++++++++ 8 files changed, 431 insertions(+) create mode 100644 include/seqan3/alignment/cigar_conversion/cigar_from_alignment.hpp create mode 100644 test/snippet/alignment/cigar_conversion/cigar_from_alignment.cpp create mode 100644 test/snippet/alignment/cigar_conversion/cigar_from_alignment.err create mode 100644 test/snippet/alignment/cigar_conversion/cigar_from_alignment_with_clipping.cpp create mode 100644 test/snippet/alignment/cigar_conversion/cigar_from_alignment_with_clipping.err create mode 100644 test/unit/alignment/cigar_conversion/cigar_from_alignment_test.cpp diff --git a/include/seqan3/alignment/cigar_conversion/all.hpp b/include/seqan3/alignment/cigar_conversion/all.hpp index 80f55a7cc22..1e5094fd7de 100644 --- a/include/seqan3/alignment/cigar_conversion/all.hpp +++ b/include/seqan3/alignment/cigar_conversion/all.hpp @@ -18,3 +18,4 @@ #pragma once #include +#include diff --git a/include/seqan3/alignment/cigar_conversion/cigar_from_alignment.hpp b/include/seqan3/alignment/cigar_conversion/cigar_from_alignment.hpp new file mode 100644 index 00000000000..a76be1a6440 --- /dev/null +++ b/include/seqan3/alignment/cigar_conversion/cigar_from_alignment.hpp @@ -0,0 +1,241 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides the function seqan3::cigar_from_alignment and a helper struct seqan3::cigar_clipped_bases. + * \author Svenja Mehringer + */ + +#pragma once + +#include + +#include +#include +#include + +namespace seqan3 +{ + +/*!\brief Helper struct to specialise soft and hard clipping when using seqan3::cigar_from_alignment. + * \ingroup cigar_conversion + * + * A CIGAR string might have hard or soft clipping at the front or back, e.g., `2H3S100M3S2H`. + */ +struct cigar_clipped_bases +{ + uint32_t hard_front{}; //!< The number of hard clipped bases at the front of the CIGAR string. + uint32_t hard_back{}; //!< The number of hard clipped bases at the back of the CIGAR string. + uint32_t soft_front{}; //!< The number of soft clipped bases at the front of the CIGAR string. + uint32_t soft_back{}; //!< The number of soft clipped bases at the back of the CIGAR string. +}; + +/*!\brief Creates a CIGAR string (SAM format) given a seqan3::detail::pairwise_alignment represented by two + * `seqan3::aligned_sequence`s. + * \ingroup cigar_conversion + * + * \tparam alignment_type Must model seqan3::detail::pairwise_alignment. + * \param alignment The alignment, represented by a pair of aligned sequences, + * to be transformed into CIGAR vector based on the + * second (query) sequence. + * \param clipped_bases Provides information on whether the query sequence was cropped (hard clipping) before the + * alignment or whether part of the query sequence does not take part (soft clipping) in the + * alignment. + * \param extended_cigar Whether to print the extended CIGAR alphabet or not. See `seqan3::cigar::operation`. + * \returns A std::vector\ representing the alignment. + * + * \details + * + * \note The resulting `cigar_vector` is based on the query sequence, which is the second sequence in the `alignment` + * pair. + * + * ### Example: + * + * Given the following alignment, reference sequence on top and the query sequence at + * the bottom: + * ``` + * ATGG--CGTAGAGCTT + * |||X |||X| ||| + * ATGCCCCGTTG--CTT + * ``` + * In this case, the function `seqan3::cigar_from_alignment` will return the following CIGAR vector: + * + * ``` + * [('M',4),('I',2),('M',5),('D',2),('M',3)] + * ``` + * + * The extended CIGAR string would look like this: + * + * ``` + * [('=',3)('X',1)('I',2)('=',3)('X',1)('=',1)('D',2)('=',3)] + * ``` + * + * \include test/snippet/alignment/cigar_conversion/cigar_from_alignment.cpp + * + * ### Soft and Hard clipping + * + * The terms soft and hard clipping were introduced by the + * [SAM specifications](https://samtools.github.io/hts-specs/SAMv1.pdf). A SAM file is only storing a semi alignment + * represented by the CIGAR string. The semi alignment of a query sequence is most often the result of a + * [read mapping.](https://training.galaxyproject.org/training-material/topics/sequence-analysis/tutorials/mapping/tutorial.html) + * + * #### Hard clipping + * + * Before aligning a query or read to the reference, some tools crop the query sequence because the quality is bad at + * one end (e.g., Illumina reads tend to display a bad sequence quality towards the end of the read). + * + * To inform the user of a SAM file that query sequences were altered, hard-clipping information is appended to the + * CIGAR string. E.g. `100M50H` indicates that a read of former length `150` has been cropped at the right end by `50` + * bases. The sequence in the SAM file will thus only be 100 bases long. + * + * #### Soft clipping + * + * In contrast to hard clipping, soft clipping indicates that the read was cropped and the respective + * bases do not participate in the alignment, but they are still part of the reported sequence. + * E.g., `100M50S` indicates that a read of length `150` has been aligned without the rightmost `50` bases. + * The sequence in the SAM file will still be 150 bases long. + * + * #### Adding soft and hard clipping with `seqan3::cigar_from_alignment` + * + * You can add the respective clipping information by passing an instance of seqan3::cigar_clipped_bases: + * + * \include test/snippet/alignment/cigar_conversion/cigar_from_alignment_with_clipping.cpp + * + * \sa seqan3::cigar_clipped_bases + * \sa seqan3::sam_file_output + * \sa seqan3::cigar + * \sa seqan3::aligned_sequence + */ +template +inline auto cigar_from_alignment(alignment_type const & alignment, + cigar_clipped_bases const & clipped_bases = {}, + bool const extended_cigar = false) +{ + static_assert((tuple_like> + && std::tuple_size_v> == 2), + "The alignment must be a std::pair or std::tuple of size 2."); + + static_assert( + (std::equality_comparable_with(alignment))>> + && std::equality_comparable_with(alignment))>>), + "The alignment must contain two ranges whose value_type is comparable to seqan3::gap."); + + /*brief: Compares two seqan3::aligned_sequence values and returns their cigar operation. + * param reference_char The aligned character of the reference to compare. + * param query_char The aligned character of the query to compare. + * param extended_cigar Whether to print the extended cigar alphabet or not. See cigar operation. + * returns A seqan3::cigar::operation representing the alignment operation between the two values. + * + * The resulting CIGAR operation is based on the query character (`query_char`). + * + * ### Example: + * + * The following alignment column shows the reference char ('C') on top and a + * gap for the query char at the bottom. + * ``` + * ... C ... + * | + * ... - ... + * ``` + * In this case, `to_cigar_op` will return + * 'D' since the query char is "deleted". + * + * The next alignment column shows the reference char ('C') on top and a + * query char ('G') at the bottom. + * ``` + * ... C ... + * | + * ... G ... + * ``` + * In this case, `to_cigar_op` will return + * 'M', for the basic CIGAR the two bases are aligned, while + * in the extended CIGAR alphabet (`extended_cigar` = `true`) the function + * will return an 'X' since the bases are aligned but are not + * equal. + * \sa seqan3::aligned_sequence + */ + auto to_cigar_op = [extended_cigar](auto const reference_char, auto const query_char) + { + // note that N is not considered because it is equivalent to D but has a special meaning: + // SAM spec: "For mRNA-to-genome alignment, an N operation represents an intron. For other types of alignments, + // the interpretation of N is not defined." + // as we cannot know the meaning, the user has to change D -> N themself + constexpr std::array operators{'M', 'D', 'I', 'P', 'X', '='}; // contains the possible cigar operators. + + // no gaps -> 00 -> 0 + // gap only in query -> 01 -> 1 + // gap only in reference -> 10 -> 2 + // both gaps -> 11 -> 3 + uint8_t key = (static_cast(reference_char == gap{}) << 1) | static_cast(query_char == gap{}); + + // mismatch -> 100 -> 4 + // match -> 101 -> 5 + if (extended_cigar && (key == 0)) // in extended format, refine the substitution operator to match/mismatch. + key |= ((1 << 2) | static_cast(query_char == reference_char)); // maps to [4, 5]. + + return assign_char_to(operators[key], cigar::operation{}); + }; + + using std::get; + + auto & ref_seq = get<0>(alignment); + auto & query_seq = get<1>(alignment); + + if (ref_seq.size() != query_seq.size()) + throw std::logic_error{"The aligned sequences (including gaps) must have the same length."}; + + if (std::ranges::empty(ref_seq)) // only check ref_seq because query_seq was checked to have to same size + throw std::logic_error{"The aligned sequences may not be empty."}; + + std::vector result{}; + + // Add (H)ard-clipping at the start of the query + if (clipped_bases.hard_front) + result.emplace_back(clipped_bases.hard_front, 'H'_cigar_operation); + + // Add (S)oft-clipping at the start of the query + if (clipped_bases.soft_front) + result.emplace_back(clipped_bases.soft_front, 'S'_cigar_operation); + + // Create cigar string from alignment + // ------------------------------------------------------------------------- + // initialize first operation and count value: + cigar::operation operation{to_cigar_op(ref_seq[0], query_seq[0])}; + uint32_t count{0}; + + // go through alignment columns + for (auto && [reference_char, query_char] : views::zip(ref_seq, query_seq)) + { + cigar::operation next_op = to_cigar_op(reference_char, query_char); + + if (operation == next_op) + { + ++count; + } + else + { + result.emplace_back(count, operation); + operation = next_op; + count = 1; + } + } + + // append last cigar element + result.emplace_back(count, operation); + + // Add (S)oft-clipping at the end of the query + if (clipped_bases.soft_back) + result.emplace_back(clipped_bases.soft_back, 'S'_cigar_operation); + + // Add (H)ard-clipping at the end of the query + if (clipped_bases.hard_back) + result.emplace_back(clipped_bases.hard_back, 'H'_cigar_operation); + + return result; +} + +} // namespace seqan3 diff --git a/test/snippet/alignment/cigar_conversion/cigar_from_alignment.cpp b/test/snippet/alignment/cigar_conversion/cigar_from_alignment.cpp new file mode 100644 index 00000000000..dfd189badda --- /dev/null +++ b/test/snippet/alignment/cigar_conversion/cigar_from_alignment.cpp @@ -0,0 +1,31 @@ +#include +#include +#include +#include +#include +#include +#include + +int main() +{ + using namespace seqan3::literals; + + seqan3::dna5_vector reference = "ATGGCGTAGAGCTTCCCCCCCCCCCCCCCCC"_dna5; + seqan3::dna5_vector read = "ATGCCCCGTTGCTT"_dna5; // length 14 + + // Align the full query against the first 14 bases of the reference. + seqan3::gap_decorator aligned_reference{reference | seqan3::views::slice(0, 14)}; + seqan3::gap_decorator aligned_read{read}; + // Insert gaps to represent the alignment: + seqan3::insert_gap(aligned_read, aligned_read.begin() + 11, 2); + seqan3::insert_gap(aligned_reference, aligned_reference.begin() + 4, 2); + + seqan3::debug_stream << aligned_reference << '\n' << aligned_read << '\n'; + // prints: + // ATGG--CGTAGAGCTT + // ATGCCCCGTTG--CTT + + auto cigar_sequence = seqan3::cigar_from_alignment(std::tie(aligned_reference, aligned_read)); + + seqan3::debug_stream << cigar_sequence << '\n'; // prints [4M,2I,5M,2D,3M] +} diff --git a/test/snippet/alignment/cigar_conversion/cigar_from_alignment.err b/test/snippet/alignment/cigar_conversion/cigar_from_alignment.err new file mode 100644 index 00000000000..b1d0504acbd --- /dev/null +++ b/test/snippet/alignment/cigar_conversion/cigar_from_alignment.err @@ -0,0 +1 @@ +[4M,2I,5M,2D,3M] diff --git a/test/snippet/alignment/cigar_conversion/cigar_from_alignment_with_clipping.cpp b/test/snippet/alignment/cigar_conversion/cigar_from_alignment_with_clipping.cpp new file mode 100644 index 00000000000..4924abda3c3 --- /dev/null +++ b/test/snippet/alignment/cigar_conversion/cigar_from_alignment_with_clipping.cpp @@ -0,0 +1,29 @@ +#include +#include +#include +#include +#include +#include +#include + +int main() +{ + using namespace seqan3::literals; + + seqan3::dna5_vector reference = "ATGGCGTAGAGCTTCCCCCCCCCCCCCCCCC"_dna5; + seqan3::dna5_vector read = "ATGCCCCGTTGCTT"_dna5; // length 14 + + // Let's say, we want to ignore the last 2 bases of the query because the quality is low. + // We thus only align the first 12 bases, the last two will be soft-clipped bases in the CIGAR string. + seqan3::gap_decorator aligned_reference{reference | seqan3::views::slice(0, 12)}; + seqan3::gap_decorator aligned_query{read | seqan3::views::slice(0, 12)}; + // insert gaps + seqan3::insert_gap(aligned_reference, aligned_reference.begin() + 4, 2); + seqan3::insert_gap(aligned_query, aligned_query.begin() + 11, 2); + + auto cigar_sequence = + seqan3::cigar_from_alignment(std::tie(aligned_reference, aligned_query), + {.hard_front = 1, .hard_back = 0, .soft_front = 0, .soft_back = 2}); + + seqan3::debug_stream << cigar_sequence << '\n'; // prints [1H,4M,2I,5M,2D,1M,2S] +} diff --git a/test/snippet/alignment/cigar_conversion/cigar_from_alignment_with_clipping.err b/test/snippet/alignment/cigar_conversion/cigar_from_alignment_with_clipping.err new file mode 100644 index 00000000000..490ad1629b5 --- /dev/null +++ b/test/snippet/alignment/cigar_conversion/cigar_from_alignment_with_clipping.err @@ -0,0 +1 @@ +[1H,4M,2I,5M,2D,1M,2S] diff --git a/test/unit/alignment/cigar_conversion/CMakeLists.txt b/test/unit/alignment/cigar_conversion/CMakeLists.txt index 6f964341430..36a289d28c7 100644 --- a/test/unit/alignment/cigar_conversion/CMakeLists.txt +++ b/test/unit/alignment/cigar_conversion/CMakeLists.txt @@ -1 +1,2 @@ seqan3_test (alignment_from_cigar_test.cpp) +seqan3_test (cigar_from_alignment_test.cpp) diff --git a/test/unit/alignment/cigar_conversion/cigar_from_alignment_test.cpp b/test/unit/alignment/cigar_conversion/cigar_from_alignment_test.cpp new file mode 100644 index 00000000000..89360cf52fb --- /dev/null +++ b/test/unit/alignment/cigar_conversion/cigar_from_alignment_test.cpp @@ -0,0 +1,126 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using seqan3::operator""_cigar_operation; +using seqan3::operator""_dna5; + +struct cigar_conversion_data : public ::testing::Test +{ + seqan3::dna5_vector ref = "ACTGATCGAGAGGATCTAGAGGAGATCGTAGGAC"_dna5; + + std::vector simple_cigar{{1, 'S'_cigar_operation}, // 1S1M1D1M1I + {1, 'M'_cigar_operation}, + {1, 'D'_cigar_operation}, + {1, 'M'_cigar_operation}, + {1, 'I'_cigar_operation}}; + + std::vector> simple_cigar_gapped_ref{'A'_dna5, 'C'_dna5, 'T'_dna5, seqan3::gap{}}; + std::vector> simple_cigar_gapped_seq{'C'_dna5, seqan3::gap{}, 'G'_dna5, 'T'_dna5}; + + std::vector cigar_with_padding{{1, 'S'_cigar_operation}, // 1S1M1P1M1I1M1I1D1M1S + {1, 'M'_cigar_operation}, + {1, 'P'_cigar_operation}, + {1, 'M'_cigar_operation}, + {1, 'I'_cigar_operation}, + {1, 'M'_cigar_operation}, + {1, 'I'_cigar_operation}, + {1, 'D'_cigar_operation}, + {1, 'M'_cigar_operation}, + {1, 'S'_cigar_operation}}; + + // same as `cigar_with_padding` but M are substituted by `=` or `X` depending on match or mismatch + std::vector extended_cigar_with_padding{{1, 'S'_cigar_operation}, // 1S1=1P1X1I1X1I1D1=1S + {1, '='_cigar_operation}, + {1, 'P'_cigar_operation}, + {1, 'X'_cigar_operation}, + {1, 'I'_cigar_operation}, + {1, 'X'_cigar_operation}, + {1, 'I'_cigar_operation}, + {1, 'D'_cigar_operation}, + {1, '='_cigar_operation}, + {1, 'S'_cigar_operation}}; + + std::vector> cigar_with_padding_gapped_ref = + {'T'_dna5, seqan3::gap{}, 'G'_dna5, seqan3::gap{}, 'A'_dna5, seqan3::gap{}, 'T'_dna5, 'C'_dna5}; + std::vector> cigar_with_padding_gapped_seq = + {'T'_dna5, seqan3::gap{}, 'A'_dna5, 'G'_dna5, 'T'_dna5, 'A'_dna5, seqan3::gap{}, 'C'_dna5}; + + std::vector cigar_with_hard_clipping{{1, 'H'_cigar_operation}, // 1H2S7M1D1M1S2H + {2, 'S'_cigar_operation}, + {7, 'M'_cigar_operation}, + {1, 'D'_cigar_operation}, + {1, 'M'_cigar_operation}, + {1, 'S'_cigar_operation}, + {2, 'H'_cigar_operation}}; + + std::vector> cigar_with_hard_clipping_gapped_ref = + {'C'_dna5, 'T'_dna5, 'G'_dna5, 'A'_dna5, 'T'_dna5, 'C'_dna5, 'G'_dna5, 'A'_dna5, 'G'_dna5}; + std::vector> cigar_with_hard_clipping_gapped_seq = + {'A'_dna5, 'G'_dna5, 'G'_dna5, 'C'_dna5, 'T'_dna5, 'G'_dna5, 'N'_dna5, seqan3::gap{}, 'A'_dna5}; +}; + +struct cigar_from_alignment : public cigar_conversion_data +{}; + +TEST_F(cigar_from_alignment, empty_sequences) +{ + std::vector> empty{}; + EXPECT_THROW(seqan3::cigar_from_alignment(std::tie(empty, empty)), std::logic_error); +} + +TEST_F(cigar_from_alignment, aligned_sequences_do_not_have_the_same_length) +{ + std::vector> too_short{'A'_dna5}; + EXPECT_THROW(seqan3::cigar_from_alignment(std::tie(this->simple_cigar_gapped_ref, too_short)), std::logic_error); +} + +TEST_F(cigar_from_alignment, simple_cigar) +{ + auto cigar = seqan3::cigar_from_alignment(std::tie(this->simple_cigar_gapped_ref, this->simple_cigar_gapped_seq), + {.soft_front = 1}); + + EXPECT_RANGE_EQ(cigar, this->simple_cigar); +} + +TEST_F(cigar_from_alignment, with_padding) +{ + auto cigar = + seqan3::cigar_from_alignment(std::tie(this->cigar_with_padding_gapped_ref, this->cigar_with_padding_gapped_seq), + {.soft_front = 1, .soft_back = 1}); + + EXPECT_RANGE_EQ(cigar, this->cigar_with_padding); +} + +TEST_F(cigar_from_alignment, extended_cigar) +{ + auto cigar = + seqan3::cigar_from_alignment(std::tie(this->cigar_with_padding_gapped_ref, this->cigar_with_padding_gapped_seq), + {.soft_front = 1, .soft_back = 1}, + true /* output extended cigar */); + + EXPECT_RANGE_EQ(cigar, this->extended_cigar_with_padding); +} + +TEST_F(cigar_from_alignment, hard_clipping) +{ + auto cigar = seqan3::cigar_from_alignment( + std::tie(this->cigar_with_hard_clipping_gapped_ref, this->cigar_with_hard_clipping_gapped_seq), + {.hard_front = 1, .hard_back = 2, .soft_front = 2, .soft_back = 1}); + + EXPECT_RANGE_EQ(cigar, this->cigar_with_hard_clipping); +}