Skip to content

Commit

Permalink
Merge pull request #3256 from eseiler/feature/user_tags
Browse files Browse the repository at this point in the history
[FEATURE] Accept user-defined tags
  • Loading branch information
eseiler authored Aug 12, 2024
2 parents 5c917af + d0eabcf commit 1fcf9a6
Show file tree
Hide file tree
Showing 14 changed files with 149 additions and 352 deletions.
1 change: 0 additions & 1 deletion doc/cookbook/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,6 @@ Search for keywords with `Ctrl + F`.
\include test/snippet/io/sam_file/sam_file_input_front.cpp
\include test/snippet/io/sam_file/sam_file_input_get_header.cpp
\include test/snippet/io/sam_file/sam_file_input_my_traits.cpp
\include test/snippet/io/sam_file/sam_file_input_options.cpp
\include test/snippet/io/sam_file/sam_file_input_reading_custom_fields.cpp
\include test/snippet/io/sam_file/sam_file_input_reading_filter.cpp
\include test/snippet/io/sam_file/sam_file_input_reading_move_record.cpp
Expand Down
45 changes: 18 additions & 27 deletions include/seqan3/io/sam_file/detail/format_sam_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
#include <seqan3/io/detail/misc.hpp>
#include <seqan3/io/sam_file/detail/cigar.hpp>
#include <seqan3/io/sam_file/header.hpp>
#include <seqan3/io/sam_file/input_options.hpp>
#include <seqan3/io/sam_file/output_format_concept.hpp>
#include <seqan3/utility/detail/type_name_as_string.hpp>
#include <seqan3/utility/views/repeat_n.hpp>
Expand Down Expand Up @@ -83,11 +82,10 @@ class format_sam_base
template <arithmetic arithmetic_target_type>
void read_arithmetic_field(std::string_view const & str, arithmetic_target_type & arithmetic_target);

template <typename stream_view_type, typename ref_ids_type, typename ref_seqs_type, typename seq_legal_alph_type>
template <typename stream_view_type, typename ref_ids_type, typename ref_seqs_type>
void read_header(stream_view_type && stream_view,
sam_file_header<ref_ids_type> & hdr,
ref_seqs_type & /*ref_id_to_pos_map*/,
sam_file_input_options<seq_legal_alph_type> const & options);
ref_seqs_type & /*ref_id_to_pos_map*/);

template <typename stream_t, typename header_type>
void write_header(stream_t & stream, sam_file_output_options const & options, header_type & header);
Expand Down Expand Up @@ -260,7 +258,6 @@ inline void format_sam_base::read_arithmetic_field(std::string_view const & str,
* \tparam stream_view_type The type of the stream as a view.
* \param[in, out] stream_view The stream view to iterate over.
* \param[in, out] hdr The header (passed by reference) in which the parsed values are stored.
* \param[in] options The options to alter the parsing process.
*
* \throws seqan3::format_error if any unexpected character or format is encountered.
*
Expand All @@ -272,14 +269,16 @@ inline void format_sam_base::read_arithmetic_field(std::string_view const & str,
* The function throws a seqan3::format_error if the format is not in a correct state (e.g. required fields are not
* given), but throwing might occur downstream of the actual error.
*
* If any unknown tag was encountered, a warning will be emitted to std::cerr. This can be configured with
* seqan3::sam_file_input_options::stream_warnings_to.
* User-defined tags are not checked for conformance to the [TAG]:[VALUE] format and are stored as strings in:
* * HD: seqan3::sam_file_header::user_tags
* * SQ: seqan3::sam_file_header::ref_id_info
* * RG: seqan3::sam_file_header::read_groups
* * PG: seqan3::sam_file_header::program_infos / seqan3::sam_file_program_info_t::user_tags
*/
template <typename stream_view_type, typename ref_ids_type, typename ref_seqs_type, typename seq_legal_alph_type>
template <typename stream_view_type, typename ref_ids_type, typename ref_seqs_type>
inline void format_sam_base::read_header(stream_view_type && stream_view,
sam_file_header<ref_ids_type> & hdr,
ref_seqs_type & /*ref_id_to_pos_map*/,
sam_file_input_options<seq_legal_alph_type> const & options)
ref_seqs_type & /*ref_id_to_pos_map*/)
{
auto it = std::ranges::begin(stream_view);
auto end = std::ranges::end(stream_view);
Expand Down Expand Up @@ -341,20 +340,6 @@ inline void format_sam_base::read_header(stream_view_type && stream_view,
read_forward_range_field(string_buffer, value);
};

auto consume_unsupported_tag_and_print_warning =
[&](char const * const header_tag, std::array<char, 2> const raw_tag)
{
// Not using `copy_next_tag_value_into_buffer` because we do not care whether the tag is valid.
// E.g., `pb5.0.0` instead of `pb:5.0.0`, would break the parsing if we used `copy_next_tag_value_into_buffer`.
take_until_predicate(is_char<'\t'> || is_char<'\n'>);

if (options.stream_warnings_to == nullptr)
return;

*options.stream_warnings_to << "Unsupported tag found in SAM header @" << header_tag << ": \"" << raw_tag[0]
<< raw_tag[1] << string_buffer << "\"\n";
};

while (it != end && is_char<'@'>(*it))
{
++it; // skip @
Expand Down Expand Up @@ -391,9 +376,9 @@ inline void format_sam_base::read_header(stream_view_type && stream_view,
header_entry = std::addressof(hdr.grouping);
break;
}
default: // unsupported header tag
default: // unknown/user tag
{
consume_unsupported_tag_and_print_warning("HD", raw_tag);
parse_and_append_unhandled_tag_to_string(hdr.user_tags, raw_tag);
}
}

Expand Down Expand Up @@ -565,7 +550,7 @@ inline void format_sam_base::read_header(stream_view_type && stream_view,
}
default: // unknown/user tag
{
consume_unsupported_tag_and_print_warning("PG", raw_tag);
parse_and_append_unhandled_tag_to_string(tmp.user_tags, raw_tag);
}
}

Expand Down Expand Up @@ -669,6 +654,9 @@ format_sam_base::write_header(stream_t & stream, sam_file_output_options const &
if (!header.grouping.empty())
stream << "\tGO:" << header.grouping;

if (!header.user_tags.empty())
stream << '\t' << header.user_tags;

detail::write_eol(stream_it, options.add_carriage_return);

// (@SQ) Write Reference Sequence Dictionary lines [required].
Expand Down Expand Up @@ -719,6 +707,9 @@ format_sam_base::write_header(stream_t & stream, sam_file_output_options const &
if (!program.version.empty())
stream << "\tVN:" << program.version;

if (!program.user_tags.empty())
stream << '\t' << program.user_tags;

detail::write_eol(stream_it, options.add_carriage_return);
}

Expand Down
41 changes: 21 additions & 20 deletions include/seqan3/io/sam_file/format_bam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ class format_bam : private detail::format_sam_base
typename e_value_type,
typename bit_score_type>
void read_alignment_record(stream_type & stream,
sam_file_input_options<seq_legal_alph_type> const & options,
sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
ref_seqs_type & ref_seqs,
sam_file_header<ref_ids_type> & header,
stream_pos_type & position_buffer,
Expand Down Expand Up @@ -257,24 +257,25 @@ template <typename stream_type, // constraints checked by file
typename tag_dict_type,
typename e_value_type,
typename bit_score_type>
inline void format_bam::read_alignment_record(stream_type & stream,
sam_file_input_options<seq_legal_alph_type> const & options,
ref_seqs_type & ref_seqs,
sam_file_header<ref_ids_type> & header,
stream_pos_type & position_buffer,
seq_type & seq,
qual_type & qual,
id_type & id,
ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
ref_id_type & ref_id,
ref_offset_type & ref_offset,
cigar_type & cigar_vector,
flag_type & flag,
mapq_type & mapq,
mate_type & mate,
tag_dict_type & tag_dict,
e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
inline void
format_bam::read_alignment_record(stream_type & stream,
sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
ref_seqs_type & ref_seqs,
sam_file_header<ref_ids_type> & header,
stream_pos_type & position_buffer,
seq_type & seq,
qual_type & qual,
id_type & id,
ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
ref_id_type & ref_id,
ref_offset_type & ref_offset,
cigar_type & cigar_vector,
flag_type & flag,
mapq_type & mapq,
mate_type & mate,
tag_dict_type & tag_dict,
e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
{
static_assert(detail::decays_to_ignore_v<ref_offset_type>
|| detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
Expand Down Expand Up @@ -304,7 +305,7 @@ inline void format_bam::read_alignment_record(stream_type & stream,
read_integral_byte_field(stream_view, l_text);

if (l_text > 0) // header text is present
read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs, options);
read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs);

read_integral_byte_field(stream_view, n_ref);

Expand Down
41 changes: 21 additions & 20 deletions include/seqan3/io/sam_file/format_sam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ class format_sam : protected detail::format_sam_base
typename e_value_type,
typename bit_score_type>
void read_alignment_record(stream_type & stream,
sam_file_input_options<seq_legal_alph_type> const & options,
sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
ref_seqs_type & ref_seqs,
sam_file_header<ref_ids_type> & header,
stream_pos_type & position_buffer,
Expand Down Expand Up @@ -355,24 +355,25 @@ template <typename stream_type, // constraints checked by file
typename tag_dict_type,
typename e_value_type,
typename bit_score_type>
inline void format_sam::read_alignment_record(stream_type & stream,
sam_file_input_options<seq_legal_alph_type> const & options,
ref_seqs_type & ref_seqs,
sam_file_header<ref_ids_type> & header,
stream_pos_type & position_buffer,
seq_type & seq,
qual_type & qual,
id_type & id,
ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
ref_id_type & ref_id,
ref_offset_type & ref_offset,
cigar_type & cigar_vector,
flag_type & flag,
mapq_type & mapq,
mate_type & mate,
tag_dict_type & tag_dict,
e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
inline void
format_sam::read_alignment_record(stream_type & stream,
sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
ref_seqs_type & ref_seqs,
sam_file_header<ref_ids_type> & header,
stream_pos_type & position_buffer,
seq_type & seq,
qual_type & qual,
id_type & id,
ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
ref_id_type & ref_id,
ref_offset_type & ref_offset,
cigar_type & cigar_vector,
flag_type & flag,
mapq_type & mapq,
mate_type & mate,
tag_dict_type & tag_dict,
e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
{
static_assert(detail::decays_to_ignore_v<ref_offset_type>
|| detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
Expand All @@ -389,7 +390,7 @@ inline void format_sam::read_alignment_record(stream_type & stream,
// -------------------------------------------------------------------------------------------------------------
if (is_char<'@'>(*stream_it)) // we always read the header if present
{
read_header(stream_view, header, ref_seqs, options);
read_header(stream_view, header, ref_seqs);

if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // file has no records
return;
Expand Down
3 changes: 3 additions & 0 deletions include/seqan3/io/sam_file/header.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ struct sam_file_program_info_t
std::string previous; //!< The id of the previous program if program calls were chained.
std::string description; //!< A description of the program and/or program call.
std::string version; //!< The program/tool version.
std::string user_tags; //!< Additional user-defined tags.
};

/*!\brief Stores the header information of SAM/BAM files.
Expand Down Expand Up @@ -213,6 +214,8 @@ class sam_file_header
* * **SM:** Sample. Use pool name where a pool is being sequenced.
*/
std::vector<std::pair<std::string, std::string>> read_groups;

std::string user_tags; //!< Additional user-defined tags.
};

} // namespace seqan3
18 changes: 3 additions & 15 deletions include/seqan3/io/sam_file/input_options.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@

#pragma once

#include <iostream>

#include <seqan3/core/platform.hpp>

namespace seqan3
Expand All @@ -19,22 +17,12 @@ namespace seqan3
/*!\brief The options type defines various option members that influence the behaviour of all or some formats.
* \ingroup io_sam_file
*
* \note As of now, there are no specific options for the SAM format. This class may be used in the future for possible
* SAM parsing extensions.
* \remark For a complete overview, take a look at \ref io_sam_file
*/
template <typename sequence_legal_alphabet>
struct sam_file_input_options
{
/*!\brief The stream to write warnings to. Defaults to std::cerr.
* \details
* ### Example
* \include test/snippet/io/sam_file/sam_file_input_options.cpp
* Output to std::cerr:
* \include test/snippet/io/sam_file/sam_file_input_options.err
* Output to std::cout:
* \include test/snippet/io/sam_file/sam_file_input_options.out
* \experimentalapi{Experimental since version 3.4.}
*/
std::ostream * stream_warnings_to{std::addressof(std::cerr)};
};
{};

} // namespace seqan3
Loading

0 comments on commit 1fcf9a6

Please sign in to comment.