Skip to content

Commit

Permalink
[FEATURE] SAM/BAM can now write a cigar vector instead of an alignment.
Browse files Browse the repository at this point in the history
  • Loading branch information
smehringer committed Jul 23, 2019
1 parent 90f9cd0 commit 09edc53
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 19 deletions.
24 changes: 19 additions & 5 deletions include/seqan3/io/alignment_file/detail.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,24 @@ std::vector<cigar> get_cigar_vector(alignment_type && alignment,
return result;
}

/*!\brief Creates a CIGAR string (SAM format) given an std::vector<seqan3::cigar>.
* \ingroup alignment_file
* \param cigar_vector The std::vector of seqan3::cigar elements to be transformed into a std::string.
* \returns The CIGAR string (std::string).
*
* \details
*
* The transformation is done by printing the vector with the seqan3::debug_stream.
*/
std::string get_cigar_string(std::vector<cigar> const & cigar_vector)
{
std::stringstream ss;
debug_stream_type ds{ss};
ds << cigar_vector;
ss.flush();
return ss.str();
}

/*!\brief Creates a CIGAR string (SAM format) given an alignment represented by two aligned sequences.
* \ingroup alignment_file
*
Expand Down Expand Up @@ -252,11 +270,7 @@ std::string get_cigar_string(alignment_type && alignment,
uint32_t const query_end_pos = 0,
bool const extended_cigar = false)
{
std::stringstream ss;
debug_stream_type ds{ss};
ds << get_cigar_vector(alignment, query_start_pos, query_end_pos, extended_cigar);
ss.flush();
return ss.str();
return get_cigar_string(get_cigar_vector(alignment, query_start_pos, query_end_pos, extended_cigar));
}

/*!\brief Transforms an alignment represented by two aligned sequences into the
Expand Down
39 changes: 27 additions & 12 deletions include/seqan3/io/alignment_file/format_bam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -715,6 +715,7 @@ class alignment_file_output_format<format_bam> : alignment_file_output_format<fo
typename ref_seq_type,
typename ref_id_type,
typename align_type,
typename cigar_type,
typename qual_type,
typename mate_type,
typename tag_dict_type>
Expand All @@ -729,6 +730,7 @@ class alignment_file_output_format<format_bam> : alignment_file_output_format<fo
[[maybe_unused]] ref_id_type && ref_id,
[[maybe_unused]] std::optional<int32_t> ref_offset,
[[maybe_unused]] align_type && align,
[[maybe_unused]] cigar_type && cigar_vector,
[[maybe_unused]] uint16_t flag,
[[maybe_unused]] uint8_t mapq,
[[maybe_unused]] mate_type && mate,
Expand Down Expand Up @@ -848,24 +850,37 @@ class alignment_file_output_format<format_bam> : alignment_file_output_format<fo
// ---------------------------------------------------------------------
// Writing the Record
// ---------------------------------------------------------------------
int32_t ref_length{};

// compute possible distance from alignment end to sequence end
// which indicates soft clipping at the end.
// This should be replaced by a free count_gaps function for
// aligned sequences which is more efficient if possible.
auto off_end{std::ranges::distance(seq) - offset};
// if alignment is non-empty, replace cigar_vector.
// else, compute the ref_length from given cigar_vector which is needed to fill field `bin`.
if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
{
ref_length = std::ranges::distance(get<1>(align));

for (auto chr : get<1>(align))
if (chr == gap{})
++off_end;
// compute possible distance from alignment end to sequence end
// which indicates soft clipping at the end.
// This should be replaced by a free count_gaps function for
// aligned sequences which is more efficient if possible.
auto off_end{std::ranges::distance(seq) - offset};

off_end -= std::ranges::distance(get<1>(align));
for (auto chr : get<1>(align))
if (chr == gap{})
++off_end;

std::vector<cigar> cigar_vector = detail::get_cigar_vector(align, offset, off_end);
off_end -= ref_length;
cigar_vector = detail::get_cigar_vector(align, offset, off_end);
}
else
{
int32_t d{}; // dummy seq length
for (auto & [co, op] : cigar_vector)
alignment_file_input_format<format_sam>::update_alignment_lengths(ref_length, d, op.to_char(), co);
}

if (cigar_vector.size() > 65535) // cannot be represented with 16bits, must be written into the sam tag CG
{
tag_dict["CG"_tag] = detail::get_cigar_string(align, offset, off_end);
tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
cigar_vector.resize(2);
cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_op};
cigar_vector[1] = cigar{static_cast<uint32_t>(std::ranges::distance(get<1>(align))), 'N'_cigar_op};
Expand All @@ -880,7 +895,7 @@ class alignment_file_output_format<format_bam> : alignment_file_output_format<fo
/* pos */ ref_offset.value_or(-1),
/* l_read_name */ std::max<uint8_t>(std::min<size_t>(std::ranges::distance(id) + 1, 255), 2),
/* mapq */ mapq,
/* bin */ reg2bin(ref_offset.value_or(-1), std::ranges::distance(get<1>(align))),
/* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
/* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
/* flag */ flag,
/* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
Expand Down
8 changes: 7 additions & 1 deletion include/seqan3/io/alignment_file/format_sam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ class alignment_file_input_format<format_sam>
* \param[in] op The cigar operation.
* \param[in] cigar_count The cigar count value to add to the length depending.
*/
void update_alignment_lengths(int32_t & ref_length, int32_t & seq_length, char op, uint32_t cigar_count)
static void update_alignment_lengths(int32_t & ref_length, int32_t & seq_length, char op, uint32_t cigar_count)
{
if (is_char<'M'>(op) || is_char<'='>(op) || is_char<'X'>(op))
{
Expand Down Expand Up @@ -1097,6 +1097,7 @@ class alignment_file_output_format<format_sam>
ref_id_type && ref_id,
std::optional<int32_t> ref_offset,
align_type && align,
std::vector<cigar> const & cigar_vector,
uint16_t flag,
uint8_t mapq,
mate_type && mate,
Expand Down Expand Up @@ -1291,6 +1292,11 @@ class alignment_file_output_format<format_sam>

write_range(stream_it, detail::get_cigar_string(std::forward<align_type>(align), offset, off_end));
}
else if (!cigar_vector.empty()) // both fields are never non-empty (fields asserted on file level)
{
for (auto & c : cigar_vector)
stream << c.to_char(); // each letter returns a small_vector instead of char so write_range doesn't work
}
else
{
stream << '*';
Expand Down
13 changes: 12 additions & 1 deletion include/seqan3/io/alignment_file/output.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ namespace seqan3
* 12. field::TAGS
* 13. field::EVALUE
* 14. field::BIT_SCORE
* 15. field::CIGAR
*
* There is an additional field called seqan3::field::HEADER_PTR. It is used to transfer
* header information from seqan3::alignment_file_input to seqan3::alignment_file_output,
Expand Down Expand Up @@ -209,6 +210,7 @@ class alignment_file_output
field::REF_ID,
field::REF_OFFSET,
field::ALIGNMENT,
field::CIGAR,
field::MAPQ,
field::FLAG,
field::QUAL,
Expand All @@ -228,6 +230,13 @@ class alignment_file_output
"please refer to the documentation of "
"seqan3::alignment_file_output::field_ids for the accepted values.");

static_assert([] () constexpr
{
return !(selected_field_ids::contains(field::ALIGNMENT) &&
selected_field_ids::contains(field::CIGAR));
}(),
"You may not select field::ALIGNMENT and field::CIGAR at the same time.");

/*!\name Range associated types
* \brief Most of the range associated types are `void` for output ranges.
* \{
Expand Down Expand Up @@ -513,6 +522,7 @@ class alignment_file_output
detail::get_or<field::REF_ID>(r, std::ignore),
detail::get_or<field::REF_OFFSET>(r, std::optional<int32_t>{}),
detail::get_or<field::ALIGNMENT>(r, default_align_t{}),
detail::get_or<field::CIGAR>(r, std::vector<cigar>{}),
detail::get_or<field::FLAG>(r, 0u),
detail::get_or<field::MAPQ>(r, 0u),
detail::get_or<field::MATE>(r, default_mate_t{}),
Expand Down Expand Up @@ -561,6 +571,7 @@ class alignment_file_output
detail::get_or<selected_field_ids::index_of(field::REF_ID)>(t, std::ignore),
detail::get_or<selected_field_ids::index_of(field::REF_OFFSET)>(t, std::optional<int32_t>{}),
detail::get_or<selected_field_ids::index_of(field::ALIGNMENT)>(t, default_align_t{}),
detail::get_or<selected_field_ids::index_of(field::CIGAR)>(t, std::vector<cigar>{}),
detail::get_or<selected_field_ids::index_of(field::FLAG)>(t, 0u),
detail::get_or<selected_field_ids::index_of(field::MAPQ)>(t, 0u),
detail::get_or<selected_field_ids::index_of(field::MATE)>(t, default_mate_t{}),
Expand Down Expand Up @@ -775,7 +786,7 @@ class alignment_file_output
template <typename record_header_ptr_t, typename ...pack_type>
void write_record(record_header_ptr_t && record_header_ptr, pack_type && ...remainder)
{
static_assert((sizeof...(pack_type) == 14), "Wrong parameter list passed to write_record.");
static_assert((sizeof...(pack_type) == 15), "Wrong parameter list passed to write_record.");

assert(!format.valueless_by_exception());

Expand Down
5 changes: 5 additions & 0 deletions include/seqan3/io/alignment_file/output_format_concept.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <seqan3/alphabet/nucleotide/dna5.hpp>
#include <seqan3/alphabet/quality/aliases.hpp>
#include <seqan3/alphabet/quality/phred42.hpp>
#include <seqan3/alphabet/cigar/cigar.hpp>
#include <seqan3/core/type_list.hpp>
#include <seqan3/io/alignment_file/header.hpp>
#include <seqan3/io/alignment_file/output_options.hpp>
Expand Down Expand Up @@ -64,6 +65,7 @@ SEQAN3_CONCEPT AlignmentFileOutputFormat =
std::optional<int32_t> & ref_id,
std::optional<int32_t> & ref_offset,
std::pair<std::vector<gapped<dna4>>, std::vector<gapped<dna4>>> & align,
std::vector<cigar> & cigar,
uint16_t & flag,
uint8_t & mapq,
std::tuple<std::optional<int32_t>, std::optional<int32_t>, int32_t> & mate,
Expand All @@ -84,6 +86,7 @@ SEQAN3_CONCEPT AlignmentFileOutputFormat =
ref_id,
ref_offset,
align,
cigar,
flag,
mapq,
mate,
Expand Down Expand Up @@ -111,6 +114,7 @@ SEQAN3_CONCEPT AlignmentFileOutputFormat =
ref_id_type && ref_id,
ref_offset_type && ref_offset,
align_type && align,
std::vector<cigar> & cigar,
flag_type && flag,
mapq_type && mapq,
mate_type && mate,
Expand Down Expand Up @@ -145,6 +149,7 @@ SEQAN3_CONCEPT AlignmentFileOutputFormat =
* \param[in] ref_id The data for seqan3::field::REF_ID, e.g. the reference id..
* \param[in] ref_offset The data for seqan3::field::REF_OFFSET, i.e. the start position of the alignment in \p ref_seq.
* \param[in] align The data for seqan3::field::ALIGN, e.g. the alignment between query and ref.
* \param[in] cigar The data for seqan3::field::CIGAR, e.g. representing the alignment between query and ref.
* \param[in] flag The data for seqan3::field::FLAG, e.g. the SAM mapping flag value.
* \param[in] mapq The data for seqan3::field::MAPQ, e.g. the mapping quality value.
* \param[in] mate The data for seqan3::field::MATE, e.g. the mate information of paired reads.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,47 @@ TYPED_TEST_P(alignment_file_write, with_header)
EXPECT_EQ(this->ostream.str(), this->verbose_output);
}

TYPED_TEST_P(alignment_file_write, cigar_vector)
{
std::vector<std::vector<cigar>> cigar_v
{
{{1, 'S'_cigar_op}, {1, 'M'_cigar_op}, {1, 'D'_cigar_op}, {1, 'M'_cigar_op}, {1, 'I'_cigar_op}},
{{7, 'M'_cigar_op}, {1, 'D'_cigar_op}, {1, 'M'_cigar_op}, {1, 'S'_cigar_op}},
{{1, 'S'_cigar_op}, {1, 'M'_cigar_op}, {1, 'P'_cigar_op}, {1, 'M'_cigar_op}, {1, 'I'_cigar_op},
{1, 'M'_cigar_op}, {1, 'I'_cigar_op}, {1, 'D'_cigar_op}, {1, 'M'_cigar_op}, {1, 'S'_cigar_op}}
};

{
this->tag_dicts[0]["NM"_tag] = 7;
this->tag_dicts[0]["AS"_tag] = 2;
this->tag_dicts[1]["xy"_tag] = std::vector<uint16_t>{3,4,5};

alignment_file_output fout{this->ostream, TypeParam{}, fields<field::HEADER_PTR,
field::ID,
field::FLAG,
field::REF_ID,
field::REF_OFFSET,
field::MAPQ,
field::CIGAR, // cigar instead of alignment
field::OFFSET,
field::MATE,
field::SEQ,
field::QUAL,
field::TAGS>{}};

for (size_t i = 0; i < 3; ++i)
{
ASSERT_NO_THROW(fout.emplace_back(&(this->header), this->ids[i], this->flags[i], 0/*ref_id*/,
this->ref_offsets[i], this->mapqs[i], cigar_v[i],
this->offsets[i], this->mates[i], this->seqs[i], this->quals[i],
this->tag_dicts[i]));
}
}

this->ostream.flush();
EXPECT_EQ(this->ostream.str(), this->simple_three_reads_output);
}

TYPED_TEST_P(alignment_file_write, special_cases)
{
std::optional<int32_t> rid;
Expand Down Expand Up @@ -573,5 +614,6 @@ REGISTER_TYPED_TEST_CASE_P(alignment_file_write,
default_options_all_members_specified,
write_ref_id_with_different_types,
with_header,
cigar_vector,
special_cases,
format_errors);

0 comments on commit 09edc53

Please sign in to comment.