From feff8776a36eda5411a69e95e22f9cc295811fcf Mon Sep 17 00:00:00 2001 From: smehringer Date: Thu, 20 Jun 2019 17:14:30 +0200 Subject: [PATCH 1/6] [FIX] rank_type overflows for in_t == char. --- include/seqan3/alphabet/detail/convert.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/seqan3/alphabet/detail/convert.hpp b/include/seqan3/alphabet/detail/convert.hpp index 8150f189fb..d1d301186c 100755 --- a/include/seqan3/alphabet/detail/convert.hpp +++ b/include/seqan3/alphabet/detail/convert.hpp @@ -39,7 +39,8 @@ constexpr std::array> convert_through_char_representa [] () constexpr { std::array> ret{}; - for (typename in_t::rank_type i = 0; i < alphabet_size; ++i) + // for (decltype(alphabet_size) i = 0; ...) causes indefinite compilation :( + for (auto i = decltype(alphabet_size){0}; i < alphabet_size; ++i) assign_char_to(to_char(assign_rank_to(i, in_t{})), ret[i]); return ret; }() From 18017bba51262f0c059a047b2b96037ab2743b95 Mon Sep 17 00:00:00 2001 From: smehringer Date: Fri, 28 Jun 2019 14:45:24 +0200 Subject: [PATCH 2/6] [FIX] Allow Contigous ranges to be queried in the ref_dict independent of the container. --- .../seqan3/io/alignment_file/format_bam.hpp | 21 +++++++- .../seqan3/io/alignment_file/format_sam.hpp | 30 +++++++++--- include/seqan3/io/alignment_file/header.hpp | 7 ++- include/seqan3/io/alignment_file/input.hpp | 19 ++++++-- include/seqan3/io/alignment_file/output.hpp | 48 +++++++++++-------- .../alignment_file_format_test_template.hpp | 42 ++++++++++++++++ 6 files changed, 134 insertions(+), 33 deletions(-) diff --git a/include/seqan3/io/alignment_file/format_bam.hpp b/include/seqan3/io/alignment_file/format_bam.hpp index 8fb4a0d253..17b48d80fa 100644 --- a/include/seqan3/io/alignment_file/format_bam.hpp +++ b/include/seqan3/io/alignment_file/format_bam.hpp @@ -866,7 +866,25 @@ class alignment_file_output_format : alignment_file_output_format && + std::ranges::SizedRange && + ForwardingRange) + { + id_it = header.ref_dict.find(std::span{std::ranges::data(id_source), + std::ranges::size(id_source)}); + } + else + { + using header_ref_id_type = std::remove_reference_t; + + static_assert(ImplicitlyConvertibleTo, + "The ref_id type is not convertible to the reference id information stored in the " + "reference dictionary of the header object."); + + id_it = header.ref_dict.find(id_source); + } if (id_it == header.ref_dict.end()) { @@ -874,6 +892,7 @@ class alignment_file_output_format : alignment_file_output_formatsecond; } } diff --git a/include/seqan3/io/alignment_file/format_sam.hpp b/include/seqan3/io/alignment_file/format_sam.hpp index e198f079d8..3417cb88b5 100644 --- a/include/seqan3/io/alignment_file/format_sam.hpp +++ b/include/seqan3/io/alignment_file/format_sam.hpp @@ -838,7 +838,7 @@ class alignment_file_input_format if (is_char<'S'>(*std::ranges::begin(stream_view))) // SQ (sequence dictionary) tag { ref_info_present_in_header = true; - std::string id; + value_type_t id; std::tuple info{}; parse_tag_value(id); // parse required SN (sequence name) tag @@ -1100,15 +1100,31 @@ class alignment_file_output_format !std::Integral> && !detail::is_type_specialisation_of_v, std::optional>) { - static_assert(ImplicitlyConvertibleTo, - "The ref_id type is not convertible to the reference id information stored in the " - "reference dictionary of the header object."); if (options.sam_require_header && !std::ranges::empty(ref_id)) { - if ((header.ref_dict).count(ref_id) == 0) // no reference id matched - throw format_error{std::string("The ref_id '") + std::string(ref_id) + - "' was not in the list of references"}; + auto id_it = header.ref_dict.end(); + + if constexpr (std::ranges::ContiguousRange && + std::ranges::SizedRange && + ForwardingRange) + { + id_it = header.ref_dict.find(std::span{std::ranges::data(ref_id), std::ranges::size(ref_id)}); + } + else + { + using header_ref_id_type = std::remove_reference_t; + + static_assert(ImplicitlyConvertibleTo, + "The ref_id type is not convertible to the reference id information stored in the " + "reference dictionary of the header object."); + + id_it = header.ref_dict.find(ref_id); + } + + if (id_it == header.ref_dict.end()) // no reference id matched + throw format_error{detail::to_string("The ref_id '", ref_id, "' was not in the list of references:", + header.ref_ids())}; } } diff --git a/include/seqan3/io/alignment_file/header.hpp b/include/seqan3/io/alignment_file/header.hpp index 5acc64e974..34364c6ece 100644 --- a/include/seqan3/io/alignment_file/header.hpp +++ b/include/seqan3/io/alignment_file/header.hpp @@ -16,8 +16,11 @@ #include #include +#include +#include #include #include +#include #include namespace seqan3 @@ -88,7 +91,9 @@ class alignment_file_header //!\brief Stream deleter with default behaviour (ownership assumed). static void ref_ids_deleter_default(ref_ids_type * ptr) { delete ptr; } //!\brief The key's type of ref_dict. - using key_type = std::ranges::all_view>; + using key_type = std::conditional_t>, + std::span const>, + all_view>>; //!\brief The pointer to reference ids information (non-owning if reference information is given). ref_ids_ptr_t ref_ids_ptr{new ref_ids_type{}, ref_ids_deleter_default}; diff --git a/include/seqan3/io/alignment_file/input.hpp b/include/seqan3/io/alignment_file/input.hpp index 0362af21ac..7092a042ea 100644 --- a/include/seqan3/io/alignment_file/input.hpp +++ b/include/seqan3/io/alignment_file/input.hpp @@ -913,16 +913,27 @@ class alignment_file_input template void set_references(typename traits_type::ref_ids & ref_ids, ref_sequences_t && ref_sequences) { - assert(std::ranges::size(ref_ids) == std::ranges::size(ref_sequences)); + assert(std::ranges::distance(ref_ids) == std::ranges::distance(ref_sequences)); header_ptr = std::unique_ptr{std::make_unique(ref_ids)}; reference_sequences_ptr = &ref_sequences; // initialise reference map and ref_dict if ref_ids are non-empty - for (size_t idx = 0; idx < ref_ids.size(); ++idx) + for (int32_t idx = 0; idx < std::ranges::distance(ref_ids); ++idx) { - header_ptr->ref_id_info.emplace_back(std::ranges::size(ref_sequences[idx]), ""); - header_ptr->ref_dict[header_ptr->ref_ids()[idx]] = idx; + header_ptr->ref_id_info.emplace_back(std::ranges::distance(ref_sequences[idx]), ""); + + if constexpr (std::ranges::ContiguousRange> && + std::ranges::SizedRange> && + ForwardingRange>) + { + auto && id = header_ptr->ref_ids()[idx]; + header_ptr->ref_dict[std::span{std::ranges::data(id), std::ranges::size(id)}] = idx; + } + else + { + header_ptr->ref_dict[header_ptr->ref_ids()[idx]] = idx; + } } } //!\} diff --git a/include/seqan3/io/alignment_file/output.hpp b/include/seqan3/io/alignment_file/output.hpp index e04d497835..8c08a824fc 100644 --- a/include/seqan3/io/alignment_file/output.hpp +++ b/include/seqan3/io/alignment_file/output.hpp @@ -389,16 +389,7 @@ class alignment_file_output alignment_file_output{filename, selected_field_ids{}} { - assert(std::ranges::size(ref_ids) == std::ranges::size(ref_lengths)); - - header_ptr = std::make_unique>(std::forward(ref_ids)); - - // fill ref_dict - for (size_t idx = 0; idx < std::ranges::size(ref_ids); ++idx) - { - header_ptr->ref_id_info.push_back({ref_lengths[idx], ""}); - header_ptr->ref_dict[(header_ptr->ref_ids()[idx])] = idx; - } + initialise_header_information(ref_ids, ref_lengths); } /*!\brief Construct from an existing stream and with specified format. @@ -436,16 +427,7 @@ class alignment_file_output selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) : alignment_file_output{std::forward(stream), file_format{}, selected_field_ids{}} { - assert(std::ranges::size(ref_ids) == std::ranges::size(ref_lengths)); - - header_ptr = std::make_unique>(std::forward(ref_ids)); - - // fill ref_dict - for (uint32_t idx = 0; idx < std::ranges::size(ref_ids); ++idx) - { - header_ptr->ref_id_info.emplace_back(ref_lengths[idx], ""); - header_ptr->ref_dict[header_ptr->ref_ids()[idx]] = idx; - } + initialise_header_information(ref_ids, ref_lengths); } //!\} @@ -763,6 +745,32 @@ class alignment_file_output //!\brief The file header object (will be set on construction). std::unique_ptr header_ptr; + //!\brief Fill the header reference dictionary, with the given info. + template + void initialise_header_information(ref_ids_type_ && ref_ids, ref_lengths_type && ref_lengths) + { + assert(std::ranges::size(ref_ids) == std::ranges::size(ref_lengths)); + + header_ptr = std::make_unique>(std::forward(ref_ids)); + + for (int32_t idx = 0; idx < std::ranges::distance(ref_ids); ++idx) + { + header_ptr->ref_id_info.emplace_back(ref_lengths[idx], ""); + + if constexpr (std::ranges::ContiguousRange> && + std::ranges::SizedRange> && + ForwardingRange>) + { + auto && id = header_ptr->ref_ids()[idx]; + header_ptr->ref_dict[std::span{std::ranges::data(id), std::ranges::size(id)}] = idx; + } + else + { + header_ptr->ref_dict[header_ptr->ref_ids()[idx]] = idx; + } + } + } + //!\brief Write record to format. template void write_record(record_header_ptr_t && record_header_ptr, pack_type && ...remainder) diff --git a/test/unit/io/alignment_file/alignment_file_format_test_template.hpp b/test/unit/io/alignment_file/alignment_file_format_test_template.hpp index 457d49201e..6fbef85056 100644 --- a/test/unit/io/alignment_file/alignment_file_format_test_template.hpp +++ b/test/unit/io/alignment_file/alignment_file_format_test_template.hpp @@ -485,6 +485,47 @@ TYPED_TEST_P(alignment_file_write, default_options_all_members_specified) EXPECT_EQ(ostream.str(), this->simple_three_reads_output); } +TYPED_TEST_P(alignment_file_write, write_ref_id_with_different_types) +{ + detail::alignment_file_output_format format; + + std::ostringstream ostream; + + alignment_file_header header{std::vector{this->ref_id}}; + header.ref_id_info.push_back({this->ref_seq.size(), ""}); + header.ref_dict[this->ref_id] = 0; + + this->tag_dicts[0]["NM"_tag] = 7; + this->tag_dicts[0]["AS"_tag] = 2; + this->tag_dicts[1]["xy"_tag] = std::vector{3,4,5}; + + // header ref_id_type is std::string + + // std::string + ASSERT_NO_THROW(format.write(ostream, output_options, header, this->seqs[0], this->quals[0], this->ids[0], + this->offsets[0], std::string{}, + /*----------------------->*/ this->ref_id, + this->ref_offsets[0], this->alignments[0], + this->flags[0], this->mapqs[0], this->mates[0], this->tag_dicts[0], 0, 0)); + // std::string_view + ASSERT_NO_THROW(format.write(ostream, output_options, header, this->seqs[1], this->quals[1], this->ids[1], + this->offsets[1], std::string{}, + /*----------------------->*/ std::string_view{this->ref_id}, + this->ref_offsets[1], this->alignments[1], + this->flags[1], this->mapqs[1], this->mates[1], this->tag_dicts[1], 0, 0)); + + // view on string + ASSERT_NO_THROW(format.write(ostream, output_options, header, this->seqs[2], this->quals[2], this->ids[2], + this->offsets[2], std::string{}, + /*----------------------->*/ this->ref_id | view::take(20), + this->ref_offsets[2], this->alignments[2], + this->flags[2], this->mapqs[2], this->mates[2], this->tag_dicts[2], 0, 0)); + + ostream.flush(); + + EXPECT_EQ(ostream.str(), this->simple_three_reads_output); +} + TYPED_TEST_P(alignment_file_write, with_header) { detail::alignment_file_output_format format; @@ -597,6 +638,7 @@ REGISTER_TYPED_TEST_CASE_P(alignment_file_read, REGISTER_TYPED_TEST_CASE_P(alignment_file_write, output_concept, default_options_all_members_specified, + write_ref_id_with_different_types, with_header, special_cases, format_errors); From d9bb7ab3c3f93678f136f4d42966248d74e80d83 Mon Sep 17 00:00:00 2001 From: smehringer Date: Fri, 28 Jun 2019 14:46:07 +0200 Subject: [PATCH 3/6] [FIX] Empty id needs to be written as '*\0' in BAM for backward compatibility. --- .../seqan3/io/alignment_file/format_bam.hpp | 7 ++++-- .../alignment_file_format_test_template.hpp | 24 +++++++++++++++++++ .../io/alignment_file/format_bam_test.cpp | 2 +- .../io/alignment_file/format_sam_test.cpp | 2 +- 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/include/seqan3/io/alignment_file/format_bam.hpp b/include/seqan3/io/alignment_file/format_bam.hpp index 17b48d80fa..c5f3eafa2e 100644 --- a/include/seqan3/io/alignment_file/format_bam.hpp +++ b/include/seqan3/io/alignment_file/format_bam.hpp @@ -836,7 +836,7 @@ class alignment_file_output_format : alignment_file_output_format(std::ranges::distance(id) + 1), + /* l_read_name */ std::max(std::ranges::distance(id) + 1, 2) /* empty id is repr. by '*\0' */, /* mapq */ mapq, /* bin */ reg2bin(ref_offset.value_or(-1), std::ranges::distance(get<1>(align))), /* n_cigar_op */ static_cast(cigar.size()), @@ -915,7 +915,10 @@ class alignment_file_output_format : alignment_file_output_format(&core), sizeof(core), stream_it); // write core - std::ranges::copy_n(std::ranges::begin(id), std::ranges::distance(id), stream_it); // write read id + if (std::ranges::distance(id) == 0) // empty id is represented as * for backward compatibility + stream_it = '*'; + else + std::ranges::copy_n(std::ranges::begin(id), std::ranges::distance(id), stream_it); // write read id stream_it = '\0'; // write cigar diff --git a/test/unit/io/alignment_file/alignment_file_format_test_template.hpp b/test/unit/io/alignment_file/alignment_file_format_test_template.hpp index 6fbef85056..a8dc07fc45 100644 --- a/test/unit/io/alignment_file/alignment_file_format_test_template.hpp +++ b/test/unit/io/alignment_file/alignment_file_format_test_template.hpp @@ -461,6 +461,29 @@ TYPED_TEST_P(alignment_file_write, output_concept) EXPECT_TRUE((AlignmentFileOutputFormat)); } +TYPED_TEST_P(alignment_file_write, write_empty_members) +{ + detail::alignment_file_output_format format; + + std::ostringstream ostream; + { + alignment_file_header header{std::vector{this->ref_id}}; + header.ref_id_info.push_back({this->ref_seq.size(), ""}); + header.ref_dict[this->ref_id] = 0; + + using default_align_t = std::pair>, std::span>>; + using default_mate_t = std::tuple, int32_t>; + + ASSERT_NO_THROW(format.write(ostream, output_options, header, std::string_view{}, std::string_view{}, + std::string_view{}, 0, std::string_view{}, std::string_view{}, + std::optional{std::nullopt}, default_align_t{}, 0, 0, + default_mate_t{}, sam_tag_dictionary{}, 0, 0)); + } + + ostream.flush(); + EXPECT_EQ(ostream.str(), this->empty_input); +} + TYPED_TEST_P(alignment_file_write, default_options_all_members_specified) { detail::alignment_file_output_format format; @@ -636,6 +659,7 @@ REGISTER_TYPED_TEST_CASE_P(alignment_file_read, format_error_ref_id_not_in_reference_information); REGISTER_TYPED_TEST_CASE_P(alignment_file_write, + write_empty_members, output_concept, default_options_all_members_specified, write_ref_id_with_different_types, diff --git a/test/unit/io/alignment_file/format_bam_test.cpp b/test/unit/io/alignment_file/format_bam_test.cpp index cb33f3e6ab..5c67c27c5d 100644 --- a/test/unit/io/alignment_file/format_bam_test.cpp +++ b/test/unit/io/alignment_file/format_bam_test.cpp @@ -146,7 +146,7 @@ struct alignment_file_read : public alignment_file_data '\x00', '\x04', '\x00', '\x00', '\x00', '\x72', '\x65', '\x66', '\x00', '\x22', '\x00', '\x00', '\x00', '\x22', '\x00', '\x00', '\x00', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\x02', '\x00', '\x48', '\x12', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\xFF', '\xFF', - '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\x00', '\x00', '\x00', '\x00', '\x2A', '\x00', '\x0A' + '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\x00', '\x00', '\x00', '\x00', '\x2A', '\x00' }; std::string empty_cigar{ diff --git a/test/unit/io/alignment_file/format_sam_test.cpp b/test/unit/io/alignment_file/format_sam_test.cpp index 901079c9f9..d111b2e51d 100644 --- a/test/unit/io/alignment_file/format_sam_test.cpp +++ b/test/unit/io/alignment_file/format_sam_test.cpp @@ -54,7 +54,7 @@ read3 43 ref 3 63 1S1M1D1M1I1M1I1D1M1S = 10 300 GGAGTATA !!*+,-./ "\tbf:B:f,3.5,0.1,43.8\n" "read3\t43\tref\t3\t63\t1S1M1D1M1I1M1I1D1M1S\tref\t10\t300\tGGAGTATA\t!!*+,-./\n"}; - std::string empty_input{"*\t0\t*\t0\t0\t*\t*\t0\t0\t*\t*\n"}; + std::string empty_input{"@HD\tVN:1.6\n@SQ\tSN:ref\tLN:34\n*\t0\t*\t0\t0\t*\t*\t0\t0\t*\t*\n"}; std::string empty_cigar{"read1\t41\tref\t1\t61\t*\tref\t10\t300\tACGT\t!##$\n"}; From 84ffea782a4324584b9d2aaef71e9a468a995ab0 Mon Sep 17 00:00:00 2001 From: smehringer Date: Thu, 20 Jun 2019 10:09:41 +0200 Subject: [PATCH 4/6] [FIX] SAM should not have requirements on the types it does not use anyway. --- include/seqan3/io/alignment_file/format_sam.hpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/include/seqan3/io/alignment_file/format_sam.hpp b/include/seqan3/io/alignment_file/format_sam.hpp index 3417cb88b5..9ce4af6b5b 100644 --- a/include/seqan3/io/alignment_file/format_sam.hpp +++ b/include/seqan3/io/alignment_file/format_sam.hpp @@ -988,7 +988,9 @@ class alignment_file_output_format typename align_type, typename qual_type, typename mate_type, - typename tag_dict_type> + typename tag_dict_type, + typename e_value_type, + typename bit_score_type> void write(stream_type & stream, alignment_file_output_options const & options, header_type && header, @@ -1004,8 +1006,8 @@ class alignment_file_output_format uint8_t mapq, mate_type && mate, tag_dict_type && tag_dict, - double SEQAN3_DOXYGEN_ONLY(e_value), - double SEQAN3_DOXYGEN_ONLY(bit_score)) + e_value_type && SEQAN3_DOXYGEN_ONLY(e_value), + bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score)) { /* Note the following general things: * @@ -1033,11 +1035,6 @@ class alignment_file_output_format "The id object must be a std::ranges::ForwardRange over " "letters that model seqan3::Alphabet."); - static_assert((std::ranges::ForwardRange && - Alphabet>), - "The ref_seq object must be a std::ranges::ForwardRange " - "over letters that model seqan3::Alphabet."); - if constexpr (!detail::decays_to_ignore_v) { static_assert((std::ranges::ForwardRange || From 2d792c8fb37a4807051ddead65a6946f1d4b875c Mon Sep 17 00:00:00 2001 From: smehringer Date: Thu, 20 Jun 2019 11:50:03 +0200 Subject: [PATCH 5/6] [MISC] Restrict the id alphabet to char. There is no use case to change this and it is required in BAM. --- include/seqan3/io/alignment_file/input.hpp | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/include/seqan3/io/alignment_file/input.hpp b/include/seqan3/io/alignment_file/input.hpp index 7092a042ea..b1b23f9155 100644 --- a/include/seqan3/io/alignment_file/input.hpp +++ b/include/seqan3/io/alignment_file/input.hpp @@ -78,11 +78,8 @@ namespace seqan3 * \brief Type template of the seqan3::field::SEQ, a container template over `sequence_alphabet`; * must model seqan3::SequenceContainer. */ -/*!\typedef using id_alphabet - * \brief Alphabet of the characters for the seqan3::field::ID; must model seqan3::Alphabet. - */ /*!\typedef using id_container - * \brief Type template of the seqan3::field::ID, a container template over `id_alphabet`; + * \brief Type template of the seqan3::field::ID, a container template over `char`; * must model seqan3::SequenceContainer. */ /*!\typedef using quality_alphabet @@ -122,8 +119,7 @@ SEQAN3_CONCEPT AlignmentFileInputTraits = requires (t v) requires SequenceContainer>; // field::ID - requires WritableAlphabet; - requires SequenceContainer>; + requires SequenceContainer>; // field::QUAL requires WritableQualityAlphabet; @@ -200,9 +196,6 @@ struct alignment_file_input_default_traits template using sequence_container = std::vector<_sequence_alphabet>; - //!\brief The alphabet for an identifier string is char. - using id_alphabet = char; - //!\brief The string type for an identifier is std::basic_string. template using id_container = std::basic_string<_id_alphabet>; @@ -417,8 +410,7 @@ class alignment_file_input using sequence_type = typename traits_type::template sequence_container< typename traits_type::sequence_alphabet>; //!\brief The type of field::ID (default std::string by default). - using id_type = typename traits_type::template id_container< - typename traits_type::id_alphabet>; + using id_type = typename traits_type::template id_container; //!\brief The type of field::OFFSET is fixed to int32_t. using offset_type = int32_t; /*!\brief The type of field::REF_SEQ (default depends on construction). From 4b63d48952ef30d1ca34f3be3deb59f9b6e4bde3 Mon Sep 17 00:00:00 2001 From: smehringer Date: Thu, 27 Jun 2019 09:19:51 +0200 Subject: [PATCH 6/6] [FIX] Truncate read ids that are longer than 255 chars when writing BAM. --- include/seqan3/io/alignment_file/format_bam.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/seqan3/io/alignment_file/format_bam.hpp b/include/seqan3/io/alignment_file/format_bam.hpp index c5f3eafa2e..582aad1eab 100644 --- a/include/seqan3/io/alignment_file/format_bam.hpp +++ b/include/seqan3/io/alignment_file/format_bam.hpp @@ -836,7 +836,7 @@ class alignment_file_output_format : alignment_file_output_format(std::ranges::distance(id) + 1, 2) /* empty id is repr. by '*\0' */, + /* l_read_name */ std::max(std::min(std::ranges::distance(id) + 1, 255), 2), /* mapq */ mapq, /* bin */ reg2bin(ref_offset.value_or(-1), std::ranges::distance(get<1>(align))), /* n_cigar_op */ static_cast(cigar.size()), @@ -918,7 +918,7 @@ class alignment_file_output_format : alignment_file_output_format