From 9b98f1136e774fce828053959fc8df28ff6a8d56 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Fri, 14 Oct 2022 11:39:00 +0200 Subject: [PATCH] [FEATURE] Add free function seqan3::alignment_from_cigar. --- .../cigar_conversion/alignment_from_cigar.hpp | 226 ++++++++++++++++++ .../seqan3/alignment/cigar_conversion/all.hpp | 20 ++ .../cigar_conversion/alignment_from_cigar.cpp | 22 ++ .../cigar_conversion/alignment_from_cigar.err | 1 + .../alignment_from_cigar_io.cpp | 34 +++ .../alignment_from_cigar_io.err | 3 + .../alignment/cigar_conversion/CMakeLists.txt | 1 + .../alignment_from_cigar_test.cpp | 160 +++++++++++++ 8 files changed, 467 insertions(+) create mode 100644 include/seqan3/alignment/cigar_conversion/alignment_from_cigar.hpp create mode 100644 include/seqan3/alignment/cigar_conversion/all.hpp create mode 100644 test/snippet/alignment/cigar_conversion/alignment_from_cigar.cpp create mode 100644 test/snippet/alignment/cigar_conversion/alignment_from_cigar.err create mode 100644 test/snippet/alignment/cigar_conversion/alignment_from_cigar_io.cpp create mode 100644 test/snippet/alignment/cigar_conversion/alignment_from_cigar_io.err create mode 100644 test/unit/alignment/cigar_conversion/CMakeLists.txt create mode 100644 test/unit/alignment/cigar_conversion/alignment_from_cigar_test.cpp diff --git a/include/seqan3/alignment/cigar_conversion/alignment_from_cigar.hpp b/include/seqan3/alignment/cigar_conversion/alignment_from_cigar.hpp new file mode 100644 index 0000000000..1b06b1b6e3 --- /dev/null +++ b/include/seqan3/alignment/cigar_conversion/alignment_from_cigar.hpp @@ -0,0 +1,226 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides the function seqan3::alignment_from_cigar. + * \author Svenja Mehringer + */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace seqan3 +{ + +/*!\brief Construct an alignment from a CIGAR string and the corresponding sequences. + * \ingroup cigar_conversion + * \tparam reference_type The type of the reference sequence for a SAM record. + * \tparam sequence_type The type of the read sequence for a SAM record. + * \param[in] cigar_vector The CIGAR information to convert to an alignment. + * \param[in] reference The reference sequence to which the `query` was aligned to, the alignment being represented by `cigar_vector`. + * \param[in] zero_based_reference_start_position The start position of the alignment in the reference sequence. The + * position is zero-based. When using our seqan3::sam_file_input, note + * that the seqan3::sam_file_input::record_type::reference_position() + * is always zero-based. + * \param[in] query The query or read sequence of the alignment represented by `cigar_vector`. + * \returns An alignment represented by a `std::tuple` of size 2 holding 2 `seqan3::gap_decorator`s. At position 0 is + * the aligned reference sequence and at position 1 the aligned read sequence. + * + * ### Quick background on the CIGAR string + * + * The CIGAR string is a compact representation of an aligned read against a reference and was introduced by + * the [SAM](https://samtools.github.io/hts-specs/SAMv1.pdf) format. The SAM format stores the result of mapping + * short/long read sequences from a sequencing experiment (e.g., Illumina/Nanopore) against a reference (e.g., hg38). + * + * ### Conversion to an alignment + * + * You can reconstruct a full alignment from a CIGAR string, if you have the respective sequences at hand: + * + * \include test/snippet/alignment/cigar_conversion/alignment_from_cigar.cpp + * + * ### Quick explanation of the alignment representation + * + * In seqan3, an alignment is represented by a `std::tuple` of size 2 that holds 2 `seqan3::aligned_sequence`s. + * + * The data structure that we use most often to model `seqan3::aligned_sequence` is the `seqan3::gap_decorator`. + * It is a lightweight data structure that only holds a view on the sequence (no copy is made) and on top can hold + * `seqan3::gap`s. + * + * In the above example, the read sequence `ACGT` is aligned to the reference with one gap: + * `AC-GA` where `-` represents a gap. + * In the CIGAR string, the gap in the query/read is represented by `1D`. + * + * The full alignment consist of two aligned sequences (read and reference). + * In the above example, the alignment + * ``` + * position 01234 + * reference ACTGA + * read AC-GA + * ``` + * is represented by a tuple of the aligned reference at the first position (std::get<0>) and the aligned read at the + * second position (std::get<1>): `(ACTGA,AC-GA)`. + * + * ### IO Example + * + * A more realistic example is extracting the information directly from a SAM file: + * + * \include test/snippet/alignment/cigar_conversion/alignment_from_cigar_io.cpp + * + * \sa seqan3::sam_file_input + * \sa seqan3::cigar + */ +template +inline auto alignment_from_cigar(std::vector const & cigar_vector, + reference_type const & reference, + uint32_t const zero_based_reference_start_position, + sequence_type const & query) +{ + if (cigar_vector.empty()) + throw std::logic_error{"An empty CIGAR is not a valid alignment representation."}; + + // compute the length of the aligned region in the reference sequence + // ------------------------------------------------------------------------- + // this requires a first stream over the cigar vector. + uint32_t reference_length{0}; + uint32_t query_length{0}; + + for (auto [cigar_count, cigar_operation] : cigar_vector) + { + if (('M'_cigar_operation == cigar_operation) || ('='_cigar_operation == cigar_operation) + || ('X'_cigar_operation == cigar_operation)) + { + reference_length += cigar_count; + query_length += cigar_count; + } + else if ('D'_cigar_operation == cigar_operation) + { + reference_length += cigar_count; + } + else if ('I'_cigar_operation == cigar_operation) + { + query_length += cigar_count; + } + } + + if (static_cast(zero_based_reference_start_position + reference_length) > std::ranges::size(reference)) + throw std::logic_error{"The CIGAR string indicates a reference length of at least " + + std::to_string(zero_based_reference_start_position + reference_length) + + ", but the supplied reference sequence is only of size" + + std::to_string(std::ranges::size(reference)) + "."}; + + // Get soft clipping at the start and end of the CIGAR string + // ------------------------------------------------------------------------- + uint32_t soft_clipping_start{0}; + uint32_t soft_clipping_end{0}; + + // Checks whether the given index in the cigar vector is a soft clip. + auto soft_clipping_at = [&cigar_vector](size_t const index) + { + return cigar_vector[index] == 'S'_cigar_operation; + }; + // Checks whether the given index in the cigar vector is a hard clip. + auto hard_clipping_at = [&](size_t const index) + { + return cigar_vector[index] == 'H'_cigar_operation; + }; + // Checks whether the given cigar vector has at least min_size many elements. + auto vector_size_at_least = [&](size_t const min_size) + { + return cigar_vector.size() >= min_size; + }; + // Returns the cigar count of the ith cigar element in the given cigar vector. + auto cigar_count_at = [&](size_t const index) + { + return get<0>(cigar_vector[index]); + }; + + // check for soft clipping at the first two positions + // cigar is non-empty, checked at the very beginning. + if (soft_clipping_at(0)) + soft_clipping_start = cigar_count_at(0); + else if (vector_size_at_least(2) && hard_clipping_at(0) && soft_clipping_at(1)) + soft_clipping_start = cigar_count_at(1); + + // Check for soft clipping at the last two positions to validate the CIGAR string. + // Even if the two following arithmetics overflow, they are protected by the corresponding if expressions below. + auto const last_index = cigar_vector.size() - 1; + auto const second_last_index = last_index - 1; + + if (vector_size_at_least(2) && soft_clipping_at(last_index)) + soft_clipping_end = cigar_count_at(last_index); + else if (vector_size_at_least(3) && hard_clipping_at(last_index) && soft_clipping_at(second_last_index)) + soft_clipping_end = cigar_count_at(second_last_index); + + if (soft_clipping_start + query_length + soft_clipping_end != std::ranges::size(query)) + throw std::logic_error{"The CIGAR string indicates a query/read sequence length of " + + std::to_string(soft_clipping_start + query_length + soft_clipping_end) + + ", but the supplied query/read sequence is of size" + + std::to_string(std::ranges::size(query)) + "."}; + + // Assign the sequence to the alignment (a tuple of 2 gap decorators) + // ------------------------------------------------------------------------- + using gapped_reference_type = gap_decorator; + using gapped_sequence_type = gap_decorator; + using alignment_type = std::tuple; + + alignment_type alignment{}; + + assign_unaligned(get<0>(alignment), + reference + | views::slice(zero_based_reference_start_position, + zero_based_reference_start_position + reference_length)); + // query_length already accounts for soft clipping at begin and end + assign_unaligned(get<1>(alignment), query | views::slice(soft_clipping_start, soft_clipping_start + query_length)); + + // Insert gaps into the alignment based on the cigar vector + // ------------------------------------------------------------------------- + using std::get; + auto current_ref_pos = std::ranges::begin(get<0>(alignment)); + auto current_read_pos = std::ranges::begin(get<1>(alignment)); + + for (auto [cigar_count, cigar_operation] : cigar_vector) + { + if (('M'_cigar_operation == cigar_operation) || ('='_cigar_operation == cigar_operation) + || ('X'_cigar_operation == cigar_operation)) + { + std::ranges::advance(current_ref_pos, cigar_count); + std::ranges::advance(current_read_pos, cigar_count); + } + else if (('D'_cigar_operation == cigar_operation) || ('N'_cigar_operation == cigar_operation)) + { + // insert gaps into query + current_read_pos = get<1>(alignment).insert_gap(current_read_pos, cigar_count); + ++current_read_pos; + std::ranges::advance(current_ref_pos, cigar_count); + } + else if (('I'_cigar_operation == cigar_operation)) // Insert gaps into ref + { + current_ref_pos = get<0>(alignment).insert_gap(current_ref_pos, cigar_count); + ++current_ref_pos; + std::ranges::advance(current_read_pos, cigar_count); + } + else if (('P'_cigar_operation == cigar_operation)) // skip padding + { + current_ref_pos = get<0>(alignment).insert_gap(current_ref_pos, cigar_count); + ++current_ref_pos; + + current_read_pos = get<1>(alignment).insert_gap(current_read_pos, cigar_count); + ++current_read_pos; + } + // S and H are ignored as they are handled by cropping the sequence + } + + return alignment; +} + +} // namespace seqan3 diff --git a/include/seqan3/alignment/cigar_conversion/all.hpp b/include/seqan3/alignment/cigar_conversion/all.hpp new file mode 100644 index 0000000000..80f55a7cc2 --- /dev/null +++ b/include/seqan3/alignment/cigar_conversion/all.hpp @@ -0,0 +1,20 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Meta-header for the \link cigar_conversion Alignment / CIGAR Conversion submodule \endlink. + * \author Svenja Mehringer + */ + +/*!\defgroup cigar_conversion CIGAR Conversion + * \brief The CIGAR Conversion submodule contains utility functions to convert a CIGAR to an alignment or vice versa. + * \ingroup alignment + */ + +#pragma once + +#include diff --git a/test/snippet/alignment/cigar_conversion/alignment_from_cigar.cpp b/test/snippet/alignment/cigar_conversion/alignment_from_cigar.cpp new file mode 100644 index 0000000000..f27f5a7222 --- /dev/null +++ b/test/snippet/alignment/cigar_conversion/alignment_from_cigar.cpp @@ -0,0 +1,22 @@ +#include +#include +#include +#include + +using namespace seqan3::literals; + +int main() +{ + // CIGAR string = 2M1D2M + std::vector cigar_vector{{2, 'M'_cigar_operation}, + {1, 'D'_cigar_operation}, + {2, 'M'_cigar_operation}}; + + uint32_t reference_start_position{0}; // The read is aligned at the start of the reference. + seqan3::dna5_vector reference = "ACTGATCGAGAGGATCTAGAGGAGATCGTAGGAC"_dna5; + seqan3::dna5_vector query = "ACGA"_dna5; + + auto alignment = alignment_from_cigar(cigar_vector, reference, reference_start_position, query); + + seqan3::debug_stream << alignment << '\n'; // prints (ACTGA,AC-GA) +} diff --git a/test/snippet/alignment/cigar_conversion/alignment_from_cigar.err b/test/snippet/alignment/cigar_conversion/alignment_from_cigar.err new file mode 100644 index 0000000000..269fcf4f88 --- /dev/null +++ b/test/snippet/alignment/cigar_conversion/alignment_from_cigar.err @@ -0,0 +1 @@ +(ACTGA,AC-GA) diff --git a/test/snippet/alignment/cigar_conversion/alignment_from_cigar_io.cpp b/test/snippet/alignment/cigar_conversion/alignment_from_cigar_io.cpp new file mode 100644 index 0000000000..337c27e425 --- /dev/null +++ b/test/snippet/alignment/cigar_conversion/alignment_from_cigar_io.cpp @@ -0,0 +1,34 @@ +#include +#include +#include + +using namespace seqan3::literals; + +auto sam_file_raw = R"(@HD VN:1.6 +@SQ SN:ref LN:34 +read1 41 ref 1 61 1S1M1D1M1I ref 10 300 ACGT !##$ AS:i:2 NM:i:7 +read2 42 ref 2 62 1H7M1D1M1S2H ref 10 300 AGGCTGNAG !##$&'()* xy:B:S,3,4,5 +read3 43 ref 3 63 1S1M1P1M1I1M1I1D1M1S ref 10 300 GGAGTATA !!*+,-./ +)"; + +int main() +{ + // The reference sequence might be read from a different file. + seqan3::dna5_vector reference = "ACTGATCGAGAGGATCTAGAGGAGATCGTAGGAC"_dna5; + + seqan3::sam_file_input fin{std::istringstream{sam_file_raw}, seqan3::format_sam{}}; + // You will probably read it from a file, e.g., like this: + // seqan3::sam_file_input fin{"test.sam"}; + + for (auto && rec : fin) + { + auto alignment = + alignment_from_cigar(rec.cigar_sequence(), reference, rec.reference_position().value(), rec.sequence()); + seqan3::debug_stream << alignment << '\n'; + } + + // prints: + // (ACT-,C-GT) + // (CTGATCGAG,AGGCTGN-A) + // (T-G-A-TC,G-AGTA-T) +} diff --git a/test/snippet/alignment/cigar_conversion/alignment_from_cigar_io.err b/test/snippet/alignment/cigar_conversion/alignment_from_cigar_io.err new file mode 100644 index 0000000000..65e8099578 --- /dev/null +++ b/test/snippet/alignment/cigar_conversion/alignment_from_cigar_io.err @@ -0,0 +1,3 @@ +(ACT-,C-GT) +(CTGATCGAG,AGGCTGN-A) +(T-G-A-TC,G-AGTA-T) diff --git a/test/unit/alignment/cigar_conversion/CMakeLists.txt b/test/unit/alignment/cigar_conversion/CMakeLists.txt new file mode 100644 index 0000000000..6f96434143 --- /dev/null +++ b/test/unit/alignment/cigar_conversion/CMakeLists.txt @@ -0,0 +1 @@ +seqan3_test (alignment_from_cigar_test.cpp) diff --git a/test/unit/alignment/cigar_conversion/alignment_from_cigar_test.cpp b/test/unit/alignment/cigar_conversion/alignment_from_cigar_test.cpp new file mode 100644 index 0000000000..8f792c590b --- /dev/null +++ b/test/unit/alignment/cigar_conversion/alignment_from_cigar_test.cpp @@ -0,0 +1,160 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using seqan3::operator""_cigar_operation; +using seqan3::operator""_dna5; + +struct cigar_conversion_data : public ::testing::Test +{ + seqan3::dna5_vector ref = "ACTGATCGAGAGGATCTAGAGGAGATCGTAGGAC"_dna5; + + std::vector simple_cigar{{1, 'S'_cigar_operation}, // 1S1M1D1M1I + {1, 'M'_cigar_operation}, + {1, 'D'_cigar_operation}, + {1, 'M'_cigar_operation}, + {1, 'I'_cigar_operation}}; + + std::vector> simple_cigar_gapped_ref{'A'_dna5, 'C'_dna5, 'T'_dna5, seqan3::gap{}}; + std::vector> simple_cigar_gapped_seq{'C'_dna5, seqan3::gap{}, 'G'_dna5, 'T'_dna5}; + + std::vector cigar_with_padding{{1, 'S'_cigar_operation}, // 1S1M1P1M1I1M1I1D1M1S + {1, 'M'_cigar_operation}, + {1, 'P'_cigar_operation}, + {1, 'M'_cigar_operation}, + {1, 'I'_cigar_operation}, + {1, 'M'_cigar_operation}, + {1, 'I'_cigar_operation}, + {1, 'D'_cigar_operation}, + {1, 'M'_cigar_operation}, + {1, 'S'_cigar_operation}}; + + // same as `cigar_with_padding` but M are substituted by `=` or `X` depending on match or mismatch + std::vector extended_cigar_with_padding{{1, 'S'_cigar_operation}, // 1S1=1P1X1I1X1I1D1=1S + {1, '='_cigar_operation}, + {1, 'P'_cigar_operation}, + {1, 'X'_cigar_operation}, + {1, 'I'_cigar_operation}, + {1, 'X'_cigar_operation}, + {1, 'I'_cigar_operation}, + {1, 'D'_cigar_operation}, + {1, '='_cigar_operation}, + {1, 'S'_cigar_operation}}; + + std::vector> cigar_with_padding_gapped_ref = + {'T'_dna5, seqan3::gap{}, 'G'_dna5, seqan3::gap{}, 'A'_dna5, seqan3::gap{}, 'T'_dna5, 'C'_dna5}; + std::vector> cigar_with_padding_gapped_seq = + {'T'_dna5, seqan3::gap{}, 'A'_dna5, 'G'_dna5, 'T'_dna5, 'A'_dna5, seqan3::gap{}, 'C'_dna5}; + + std::vector cigar_with_hard_clipping{{1, 'H'_cigar_operation}, // 1H2S7M1D1M1S2H + {2, 'S'_cigar_operation}, + {7, 'M'_cigar_operation}, + {1, 'D'_cigar_operation}, + {1, 'M'_cigar_operation}, + {1, 'S'_cigar_operation}, + {2, 'H'_cigar_operation}}; + + std::vector> cigar_with_hard_clipping_gapped_ref = + {'C'_dna5, 'T'_dna5, 'G'_dna5, 'A'_dna5, 'T'_dna5, 'C'_dna5, 'G'_dna5, 'A'_dna5, 'G'_dna5}; + std::vector> cigar_with_hard_clipping_gapped_seq = + {'A'_dna5, 'G'_dna5, 'G'_dna5, 'C'_dna5, 'T'_dna5, 'G'_dna5, 'N'_dna5, seqan3::gap{}, 'A'_dna5}; +}; + +struct alignment_from_cigar : public cigar_conversion_data +{}; + +TEST_F(alignment_from_cigar, empty_cigar) +{ + std::vector empty_cigar{}; + seqan3::dna5_vector seq{"ACGT"_dna5}; + + // An empty CIGAR string is not valid as it must always fulfil the following: + // "Sum of lengths of the M/I/S/=/X operations shall equal the length of SEQ" + EXPECT_THROW(seqan3::alignment_from_cigar(empty_cigar, this->ref, 0, seq), std::logic_error); +} + +TEST_F(alignment_from_cigar, cigar_covers_too_little_bases_of_read) +{ + seqan3::dna5_vector seq{"ACGT"_dna5}; + std::vector corrupt_cigar{{3, 'M'_cigar_operation}}; // Although seq is of length 4 + + EXPECT_THROW(seqan3::alignment_from_cigar(corrupt_cigar, this->ref, 0, seq), std::logic_error); +} + +TEST_F(alignment_from_cigar, cigar_covers_too_many_bases_of_read) +{ + seqan3::dna5_vector seq{"ACGT"_dna5}; + std::vector corrupt_cigar{{5, 'M'_cigar_operation}}; // Although seq is of length 4 + + EXPECT_THROW(seqan3::alignment_from_cigar(corrupt_cigar, this->ref, 0, seq), std::logic_error); +} + +TEST_F(alignment_from_cigar, cigar_covers_too_many_bases_of_reference) +{ + seqan3::dna5_vector seq{"ACGT"_dna5}; + std::vector corrupt_cigar{{2, 'M'_cigar_operation}, + {40, 'D'_cigar_operation}, // Although reference is only of length 35 + {2, 'M'_cigar_operation}}; + + EXPECT_THROW(seqan3::alignment_from_cigar(corrupt_cigar, this->ref, 0, seq), std::logic_error); +} + +TEST_F(alignment_from_cigar, simple_cigar) +{ + seqan3::dna5_vector seq{"ACGT"_dna5}; + int32_t reference_start_position{0}; + + auto alignment = seqan3::alignment_from_cigar(this->simple_cigar, this->ref, reference_start_position, seq); + + EXPECT_RANGE_EQ(std::get<0>(alignment), this->simple_cigar_gapped_ref); + EXPECT_RANGE_EQ(std::get<1>(alignment), this->simple_cigar_gapped_seq); +} + +TEST_F(alignment_from_cigar, with_padding) +{ + seqan3::dna5_vector seq{"GTAGTACA"_dna5}; + int32_t reference_start_position{2}; + + auto alignment = seqan3::alignment_from_cigar(this->cigar_with_padding, this->ref, reference_start_position, seq); + + EXPECT_RANGE_EQ(std::get<0>(alignment), this->cigar_with_padding_gapped_ref); + EXPECT_RANGE_EQ(std::get<1>(alignment), this->cigar_with_padding_gapped_seq); +} + +TEST_F(alignment_from_cigar, extended_cigar) +{ + seqan3::dna5_vector seq{"GTAGTACA"_dna5}; + int32_t reference_start_position{2}; + + auto alignment = + seqan3::alignment_from_cigar(this->extended_cigar_with_padding, this->ref, reference_start_position, seq); + + EXPECT_RANGE_EQ(std::get<0>(alignment), this->cigar_with_padding_gapped_ref); + EXPECT_RANGE_EQ(std::get<1>(alignment), this->cigar_with_padding_gapped_seq); +} + +TEST_F(alignment_from_cigar, with_hard_clipping) +{ + seqan3::dna5_vector seq{"TTAGGCTGNAG"_dna5}; + int32_t reference_start_position{1}; + + auto alignment = + seqan3::alignment_from_cigar(this->cigar_with_hard_clipping, this->ref, reference_start_position, seq); + + EXPECT_RANGE_EQ(std::get<0>(alignment), this->cigar_with_hard_clipping_gapped_ref); + EXPECT_RANGE_EQ(std::get<1>(alignment), this->cigar_with_hard_clipping_gapped_seq); +}