Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE] Accept user-defined tags #3256

Merged
merged 2 commits into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion doc/cookbook/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,6 @@ Search for keywords with `Strg + F`.
\include test/snippet/io/sam_file/sam_file_input_front.cpp
\include test/snippet/io/sam_file/sam_file_input_get_header.cpp
\include test/snippet/io/sam_file/sam_file_input_my_traits.cpp
\include test/snippet/io/sam_file/sam_file_input_options.cpp
\include test/snippet/io/sam_file/sam_file_input_reading_custom_fields.cpp
\include test/snippet/io/sam_file/sam_file_input_reading_filter.cpp
\include test/snippet/io/sam_file/sam_file_input_reading_move_record.cpp
Expand Down
45 changes: 18 additions & 27 deletions include/seqan3/io/sam_file/detail/format_sam_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
#include <seqan3/io/detail/misc.hpp>
#include <seqan3/io/sam_file/detail/cigar.hpp>
#include <seqan3/io/sam_file/header.hpp>
#include <seqan3/io/sam_file/input_options.hpp>
#include <seqan3/io/sam_file/output_format_concept.hpp>
#include <seqan3/utility/detail/type_name_as_string.hpp>
#include <seqan3/utility/views/repeat_n.hpp>
Expand Down Expand Up @@ -83,11 +82,10 @@ class format_sam_base
template <arithmetic arithmetic_target_type>
void read_arithmetic_field(std::string_view const & str, arithmetic_target_type & arithmetic_target);

template <typename stream_view_type, typename ref_ids_type, typename ref_seqs_type, typename seq_legal_alph_type>
template <typename stream_view_type, typename ref_ids_type, typename ref_seqs_type>
void read_header(stream_view_type && stream_view,
sam_file_header<ref_ids_type> & hdr,
ref_seqs_type & /*ref_id_to_pos_map*/,
sam_file_input_options<seq_legal_alph_type> const & options);
ref_seqs_type & /*ref_id_to_pos_map*/);

template <typename stream_t, typename header_type>
void write_header(stream_t & stream, sam_file_output_options const & options, header_type & header);
Expand Down Expand Up @@ -260,7 +258,6 @@ inline void format_sam_base::read_arithmetic_field(std::string_view const & str,
* \tparam stream_view_type The type of the stream as a view.
* \param[in, out] stream_view The stream view to iterate over.
* \param[in, out] hdr The header (as a pointer) to store the parsed values.
* \param[in] options The options to alter the parsing process.
*
* \throws seqan3::format_error if any unexpected character or format is encountered.
*
Expand All @@ -272,14 +269,16 @@ inline void format_sam_base::read_arithmetic_field(std::string_view const & str,
* The function throws a seqan3::format_error if the format is not in a correct state (e.g. required fields are not
* given), but throwing might occur downstream of the actual error.
*
* If any unknown tag was encountered, a warning will be emitted to std::cerr. This can be configured with
* seqan3::sam_file_input_options::stream_warnings_to.
* Any user-defined tags are not checked for correctness ([TAG]:[VALUE]) and are stored as strings:
* * HD: seqan3::sam_file_header::user_tags
* * SQ: seqan3::sam_file_header::ref_id_info
* * RG: seqan3::sam_file_header::read_groups
* * PG: seqan3::sam_file_header::program_infos / seqan3::sam_file_program_info_t::user_tags
*/
template <typename stream_view_type, typename ref_ids_type, typename ref_seqs_type, typename seq_legal_alph_type>
template <typename stream_view_type, typename ref_ids_type, typename ref_seqs_type>
inline void format_sam_base::read_header(stream_view_type && stream_view,
sam_file_header<ref_ids_type> & hdr,
ref_seqs_type & /*ref_id_to_pos_map*/,
sam_file_input_options<seq_legal_alph_type> const & options)
ref_seqs_type & /*ref_id_to_pos_map*/)
{
auto it = std::ranges::begin(stream_view);
auto end = std::ranges::end(stream_view);
Expand Down Expand Up @@ -341,20 +340,6 @@ inline void format_sam_base::read_header(stream_view_type && stream_view,
read_forward_range_field(string_buffer, value);
};

auto consume_unsupported_tag_and_print_warning =
[&](char const * const header_tag, std::array<char, 2> const raw_tag)
{
// Not using `copy_next_tag_value_into_buffer` because we do not care whether the tag is valid.
// E.g., `pb5.0.0` instead of `pb:5.0.0`, would break the parsing if we used `copy_next_tag_value_into_buffer`.
take_until_predicate(is_char<'\t'> || is_char<'\n'>);

if (options.stream_warnings_to == nullptr)
return;

*options.stream_warnings_to << "Unsupported tag found in SAM header @" << header_tag << ": \"" << raw_tag[0]
<< raw_tag[1] << string_buffer << "\"\n";
};

while (it != end && is_char<'@'>(*it))
{
++it; // skip @
Expand Down Expand Up @@ -391,9 +376,9 @@ inline void format_sam_base::read_header(stream_view_type && stream_view,
header_entry = std::addressof(hdr.grouping);
break;
}
default: // unsupported header tag
default: // unknown/user tag
{
consume_unsupported_tag_and_print_warning("HD", raw_tag);
parse_and_append_unhandled_tag_to_string(hdr.user_tags, raw_tag);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just for me to understand the format here: do we store all parsed line field tags tab-separated within a string? Or are official field tags from the specification stored in a different format?

Copy link
Member Author

@eseiler eseiler Jun 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All the header stuff (SQ, HD, RG) has structs that store the members defined in the SAM spec. Any user-defined tags are stored as strings. For the alignment records, tags, including user-defined ones, are stored in the SAM tag dictionary.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but do I see it right that here we store the user-defined field tags in a tab-separated string? If so, I feel like it would make more sense to put this into a vector of field tags instead.

Copy link
Member Author

@eseiler eseiler Jun 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that would be better. Would be nice to store a struct with the name and value.

Anyway, I would do it as a follow-up, because it breaks the API for the header tags where we already store the user tags in a string (SQ and RG).

If we do a vector, we should also throw on invalid formatting (it should be TAG:VALUE with some constraints on allowed characters) like samtools does.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good! Would you mind tracking this on the issue board?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do when working on this issue.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

}
}

Expand Down Expand Up @@ -565,7 +550,7 @@ inline void format_sam_base::read_header(stream_view_type && stream_view,
}
default: // unsupported header tag
{
consume_unsupported_tag_and_print_warning("PG", raw_tag);
parse_and_append_unhandled_tag_to_string(tmp.user_tags, raw_tag);
}
}

Expand Down Expand Up @@ -669,6 +654,9 @@ format_sam_base::write_header(stream_t & stream, sam_file_output_options const &
if (!header.grouping.empty())
stream << "\tGO:" << header.grouping;

if (!header.user_tags.empty())
stream << '\t' << header.user_tags;

detail::write_eol(stream_it, options.add_carriage_return);

// (@SQ) Write Reference Sequence Dictionary lines [required].
Expand Down Expand Up @@ -719,6 +707,9 @@ format_sam_base::write_header(stream_t & stream, sam_file_output_options const &
if (!program.version.empty())
stream << "\tVN:" << program.version;

if (!program.user_tags.empty())
stream << '\t' << program.user_tags;

detail::write_eol(stream_it, options.add_carriage_return);
}

Expand Down
41 changes: 21 additions & 20 deletions include/seqan3/io/sam_file/format_bam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ class format_bam : private detail::format_sam_base
typename e_value_type,
typename bit_score_type>
void read_alignment_record(stream_type & stream,
sam_file_input_options<seq_legal_alph_type> const & options,
sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
ref_seqs_type & ref_seqs,
sam_file_header<ref_ids_type> & header,
stream_pos_type & position_buffer,
Expand Down Expand Up @@ -257,24 +257,25 @@ template <typename stream_type, // constraints checked by file
typename tag_dict_type,
typename e_value_type,
typename bit_score_type>
inline void format_bam::read_alignment_record(stream_type & stream,
sam_file_input_options<seq_legal_alph_type> const & options,
ref_seqs_type & ref_seqs,
sam_file_header<ref_ids_type> & header,
stream_pos_type & position_buffer,
seq_type & seq,
qual_type & qual,
id_type & id,
ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
ref_id_type & ref_id,
ref_offset_type & ref_offset,
cigar_type & cigar_vector,
flag_type & flag,
mapq_type & mapq,
mate_type & mate,
tag_dict_type & tag_dict,
e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
inline void
format_bam::read_alignment_record(stream_type & stream,
sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
ref_seqs_type & ref_seqs,
sam_file_header<ref_ids_type> & header,
stream_pos_type & position_buffer,
seq_type & seq,
qual_type & qual,
id_type & id,
ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
ref_id_type & ref_id,
ref_offset_type & ref_offset,
cigar_type & cigar_vector,
flag_type & flag,
mapq_type & mapq,
mate_type & mate,
tag_dict_type & tag_dict,
e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
{
static_assert(detail::decays_to_ignore_v<ref_offset_type>
|| detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
Expand Down Expand Up @@ -304,7 +305,7 @@ inline void format_bam::read_alignment_record(stream_type & stream,
read_integral_byte_field(stream_view, l_text);

if (l_text > 0) // header text is present
read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs, options);
read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs);

read_integral_byte_field(stream_view, n_ref);

Expand Down
41 changes: 21 additions & 20 deletions include/seqan3/io/sam_file/format_sam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ class format_sam : protected detail::format_sam_base
typename e_value_type,
typename bit_score_type>
void read_alignment_record(stream_type & stream,
sam_file_input_options<seq_legal_alph_type> const & options,
sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
ref_seqs_type & ref_seqs,
sam_file_header<ref_ids_type> & header,
stream_pos_type & position_buffer,
Expand Down Expand Up @@ -355,24 +355,25 @@ template <typename stream_type, // constraints checked by file
typename tag_dict_type,
typename e_value_type,
typename bit_score_type>
inline void format_sam::read_alignment_record(stream_type & stream,
sam_file_input_options<seq_legal_alph_type> const & options,
ref_seqs_type & ref_seqs,
sam_file_header<ref_ids_type> & header,
stream_pos_type & position_buffer,
seq_type & seq,
qual_type & qual,
id_type & id,
ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
ref_id_type & ref_id,
ref_offset_type & ref_offset,
cigar_type & cigar_vector,
flag_type & flag,
mapq_type & mapq,
mate_type & mate,
tag_dict_type & tag_dict,
e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
inline void
format_sam::read_alignment_record(stream_type & stream,
sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
eseiler marked this conversation as resolved.
Show resolved Hide resolved
ref_seqs_type & ref_seqs,
sam_file_header<ref_ids_type> & header,
stream_pos_type & position_buffer,
seq_type & seq,
qual_type & qual,
id_type & id,
ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
ref_id_type & ref_id,
ref_offset_type & ref_offset,
cigar_type & cigar_vector,
flag_type & flag,
mapq_type & mapq,
mate_type & mate,
tag_dict_type & tag_dict,
e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
{
static_assert(detail::decays_to_ignore_v<ref_offset_type>
|| detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
Expand All @@ -389,7 +390,7 @@ inline void format_sam::read_alignment_record(stream_type & stream,
// -------------------------------------------------------------------------------------------------------------
if (is_char<'@'>(*stream_it)) // we always read the header if present
{
read_header(stream_view, header, ref_seqs, options);
read_header(stream_view, header, ref_seqs);

if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // file has no records
return;
Expand Down
3 changes: 3 additions & 0 deletions include/seqan3/io/sam_file/header.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ struct sam_file_program_info_t
std::string previous; //!< The id of the previous program if program calls were chained.
std::string description; //!< A description of the program and/or program call.
std::string version; //!< The program/tool version.
std::string user_tags; //!< Additional user-defined tags.
};

/*!\brief Stores the header information of SAM/BAM files.
Expand Down Expand Up @@ -213,6 +214,8 @@ class sam_file_header
* * **SM:** Sample. Use pool name where a pool is being sequenced.
*/
std::vector<std::pair<std::string, std::string>> read_groups;

std::string user_tags; //!< Additional user-defined tags.
};

} // namespace seqan3
18 changes: 3 additions & 15 deletions include/seqan3/io/sam_file/input_options.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@

#pragma once

#include <iostream>

#include <seqan3/core/platform.hpp>

namespace seqan3
Expand All @@ -19,22 +17,12 @@ namespace seqan3
/*!\brief The options type defines various option members that influence the behaviour of all or some formats.
* \ingroup io_sam_file
*
* \note As of now, there are no specific options for the SAM format. This class may be used in the future for possible
* SAM parsing extensions.
* \remark For a complete overview, take a look at \ref io_sam_file
*/
template <typename sequence_legal_alphabet>
struct sam_file_input_options
{
/*!\brief The stream to write warnings to. Defaults to std::cerr.
* \details
* ### Example
* \include test/snippet/io/sam_file/sam_file_input_options.cpp
* Output to std::cerr:
* \include test/snippet/io/sam_file/sam_file_input_options.err
* Output to std::cout:
* \include test/snippet/io/sam_file/sam_file_input_options.out
* \experimentalapi{Experimental since version 3.4.}
*/
std::ostream * stream_warnings_to{std::addressof(std::cerr)};
};
{};
eseiler marked this conversation as resolved.
Show resolved Hide resolved

} // namespace seqan3
Loading
Loading