Skip to content

Commit

Permalink
Merge pull request #3081 from smehringer/io_empty_bam_file
Browse files Browse the repository at this point in the history
[FIX] Write BAM header on sam_file deconstruction if not written before
  • Loading branch information
eseiler authored Oct 31, 2022
2 parents 469ccf4 + fe51c42 commit 334b790
Show file tree
Hide file tree
Showing 9 changed files with 211 additions and 115 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ If possible, provide tooling that performs the changes, e.g. a shell-script.

## Notable Bug-fixes

#### I/O
* Empty SAM/BAM files must at least write a header to ensure a valid file
([\#3081](https://github.com/seqan/seqan3/pull/3081)).

## API changes

#### Dependencies
Expand Down
169 changes: 85 additions & 84 deletions include/seqan3/io/sam_file/detail/format_sam_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,8 @@ class format_sam_base
sam_file_header<ref_ids_type> & hdr,
ref_seqs_type & /*ref_id_to_pos_map*/);

template <typename stream_t, typename ref_ids_type>
void
write_header(stream_t & stream, sam_file_output_options const & options, sam_file_header<ref_ids_type> & header);
template <typename stream_t, typename header_type>
void write_header(stream_t & stream, sam_file_output_options const & options, header_type & header);
};

/*!\brief Checks for known reference ids or adds a new reference is and assigns a reference id to `ref_id`.
Expand Down Expand Up @@ -702,115 +701,117 @@ inline void format_sam_base::read_header(stream_view_type && stream_view,
* according to the rules of the official
* [SAM format specifications](https://samtools.github.io/hts-specs/SAMv1.pdf).
*/
template <typename stream_t, typename ref_ids_type>
inline void format_sam_base::write_header(stream_t & stream,
sam_file_output_options const & options,
sam_file_header<ref_ids_type> & header)
template <typename stream_t, typename header_type>
inline void
format_sam_base::write_header(stream_t & stream, sam_file_output_options const & options, header_type & header)
{
// -----------------------------------------------------------------
// Check Header
// -----------------------------------------------------------------
if constexpr (!detail::decays_to_ignore_v<header_type>)
{
// -----------------------------------------------------------------
// Check Header
// -----------------------------------------------------------------

// (@HD) Check header line
// The format version string will be taken from the local member variable
if (!header.sorting.empty()
&& !(header.sorting == "unknown" || header.sorting == "unsorted" || header.sorting == "queryname"
|| header.sorting == "coordinate"))
throw format_error{"SAM format error: The header.sorting member must be "
"one of [unknown, unsorted, queryname, coordinate]."};
// (@HD) Check header line
// The format version string will be taken from the local member variable
if (!header.sorting.empty()
&& !(header.sorting == "unknown" || header.sorting == "unsorted" || header.sorting == "queryname"
|| header.sorting == "coordinate"))
throw format_error{"SAM format error: The header.sorting member must be "
"one of [unknown, unsorted, queryname, coordinate]."};

if (!header.grouping.empty()
&& !(header.grouping == "none" || header.grouping == "query" || header.grouping == "reference"))
throw format_error{"SAM format error: The header.grouping member must be "
"one of [none, query, reference]."};
if (!header.grouping.empty()
&& !(header.grouping == "none" || header.grouping == "query" || header.grouping == "reference"))
throw format_error{"SAM format error: The header.grouping member must be "
"one of [none, query, reference]."};

// (@SQ) Check Reference Sequence Dictionary lines
// (@SQ) Check Reference Sequence Dictionary lines

// TODO
// TODO

// - sorting order be one of ...
// - grouping can be one of ...
// - reference names must be unique
// - ids of read groups must be unique
// - program ids need to be unique
// many more small semantic things, like fits REGEX
// - sorting order be one of ...
// - grouping can be one of ...
// - reference names must be unique
// - ids of read groups must be unique
// - program ids need to be unique
// many more small semantic things, like fits REGEX

// -----------------------------------------------------------------
// Write Header
// -----------------------------------------------------------------
std::ostreambuf_iterator stream_it{stream};
// -----------------------------------------------------------------
// Write Header
// -----------------------------------------------------------------
std::ostreambuf_iterator stream_it{stream};

// (@HD) Write header line [required].
stream << "@HD\tVN:";
std::ranges::copy(format_version, stream_it);
// (@HD) Write header line [required].
stream << "@HD\tVN:";
std::ranges::copy(format_version, stream_it);

if (!header.sorting.empty())
stream << "\tSO:" << header.sorting;
if (!header.sorting.empty())
stream << "\tSO:" << header.sorting;

if (!header.subsorting.empty())
stream << "\tSS:" << header.subsorting;
if (!header.subsorting.empty())
stream << "\tSS:" << header.subsorting;

if (!header.grouping.empty())
stream << "\tGO:" << header.grouping;
if (!header.grouping.empty())
stream << "\tGO:" << header.grouping;

detail::write_eol(stream_it, options.add_carriage_return);
detail::write_eol(stream_it, options.add_carriage_return);

// (@SQ) Write Reference Sequence Dictionary lines [required].
for (auto const & [ref_name, ref_info] : views::zip(header.ref_ids(), header.ref_id_info))
{
stream << "@SQ\tSN:";
// (@SQ) Write Reference Sequence Dictionary lines [required].
for (auto const & [ref_name, ref_info] : views::zip(header.ref_ids(), header.ref_id_info))
{
stream << "@SQ\tSN:";

std::ranges::copy(ref_name, stream_it);
std::ranges::copy(ref_name, stream_it);

stream << "\tLN:" << get<0>(ref_info);
stream << "\tLN:" << get<0>(ref_info);

if (!get<1>(ref_info).empty())
stream << "\t" << get<1>(ref_info);
if (!get<1>(ref_info).empty())
stream << "\t" << get<1>(ref_info);

detail::write_eol(stream_it, options.add_carriage_return);
}
detail::write_eol(stream_it, options.add_carriage_return);
}

// Write read group (@RG) lines if specified.
for (auto const & read_group : header.read_groups)
{
stream << "@RG"
<< "\tID:" << get<0>(read_group);
// Write read group (@RG) lines if specified.
for (auto const & read_group : header.read_groups)
{
stream << "@RG"
<< "\tID:" << get<0>(read_group);

if (!get<1>(read_group).empty())
stream << "\t" << get<1>(read_group);
if (!get<1>(read_group).empty())
stream << "\t" << get<1>(read_group);

detail::write_eol(stream_it, options.add_carriage_return);
}
detail::write_eol(stream_it, options.add_carriage_return);
}

// Write program (@PG) lines if specified.
for (auto const & program : header.program_infos)
{
stream << "@PG"
<< "\tID:" << program.id;
// Write program (@PG) lines if specified.
for (auto const & program : header.program_infos)
{
stream << "@PG"
<< "\tID:" << program.id;

if (!program.name.empty())
stream << "\tPN:" << program.name;
if (!program.name.empty())
stream << "\tPN:" << program.name;

if (!program.command_line_call.empty())
stream << "\tCL:" << program.command_line_call;
if (!program.command_line_call.empty())
stream << "\tCL:" << program.command_line_call;

if (!program.previous.empty())
stream << "\tPP:" << program.previous;
if (!program.previous.empty())
stream << "\tPP:" << program.previous;

if (!program.description.empty())
stream << "\tDS:" << program.description;
if (!program.description.empty())
stream << "\tDS:" << program.description;

if (!program.version.empty())
stream << "\tVN:" << program.version;
if (!program.version.empty())
stream << "\tVN:" << program.version;

detail::write_eol(stream_it, options.add_carriage_return);
}
detail::write_eol(stream_it, options.add_carriage_return);
}

// Write comment (@CO) lines if specified.
for (auto const & comment : header.comments)
{
stream << "@CO\t" << comment;
detail::write_eol(stream_it, options.add_carriage_return);
// Write comment (@CO) lines if specified.
for (auto const & comment : header.comments)
{
stream << "@CO\t" << comment;
detail::write_eol(stream_it, options.add_carriage_return);
}
}
}

Expand Down
84 changes: 60 additions & 24 deletions include/seqan3/io/sam_file/format_bam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,10 @@ class format_bam : private detail::format_sam_base
[[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
[[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));

//!\privatesection
template <typename stream_t, typename header_type>
void write_header(stream_t & stream, sam_file_output_options const & options, header_type & header);

private:
//!\brief A variable that tracks whether the content of header has been read or not.
bool header_was_read{false};
Expand Down Expand Up @@ -726,8 +730,9 @@ inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & st
if constexpr (detail::decays_to_ignore_v<header_type>)
{
throw format_error{"BAM can only be written with a header but you did not provide enough information! "
"You can either construct the output file with ref_ids and ref_seqs information and "
"the header will be created for you, or you can access the `header` member directly."};
"You can either construct the output file with reference names and reference length "
"information and the header will be created for you, or you can access the `header` member "
"directly."};
}
else
{
Expand All @@ -745,28 +750,7 @@ inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & st
// ---------------------------------------------------------------------
if (!header_was_written)
{
stream << "BAM\1";
std::ostringstream os;
write_header(os, options, header); // write SAM header to temporary stream to query the size.
int32_t l_text{static_cast<int32_t>(os.str().size())};
std::ranges::copy_n(reinterpret_cast<char *>(&l_text), 4, stream_it); // write read id

stream << os.str();

int32_t n_ref{static_cast<int32_t>(header.ref_ids().size())};
std::ranges::copy_n(reinterpret_cast<char *>(&n_ref), 4, stream_it); // write read id

for (int32_t ridx = 0; ridx < n_ref; ++ridx)
{
int32_t l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
std::ranges::copy_n(reinterpret_cast<char *>(&l_name), 4, stream_it); // write l_name
// write reference name:
std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it);
stream_it = '\0';
// write reference sequence length:
std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
}

write_header(stream, options, header);
header_was_written = true;
}

Expand Down Expand Up @@ -961,6 +945,58 @@ inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & st
} // if constexpr (!detail::decays_to_ignore_v<header_type>)
}

//!\copydoc seqan3::detail::format_sam_base::write_header
template <typename stream_t, typename header_type>
inline void format_bam::write_header(stream_t & stream, sam_file_output_options const & options, header_type & header)
{
if constexpr (detail::decays_to_ignore_v<header_type>)
{
throw format_error{"BAM can only be written with a header but you did not provide enough information! "
"You can either construct the output file with reference names and reference length "
"information and the header will be created for you, or you can access the `header` member "
"directly."};
}
else
{
detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};

std::ranges::copy_n("BAM\1", 4, stream_it); // Do not copy the null terminator

// write SAM header to temporary stream first to query its size.
std::ostringstream os;
detail::format_sam_base::write_header(os, options, header);
#if SEQAN3_COMPILER_IS_GCC && (__GNUC__ == 10)
int32_t const l_text{static_cast<int32_t>(os.str().size())};
#else
int32_t const l_text{static_cast<int32_t>(os.view().size())};
#endif
std::ranges::copy_n(reinterpret_cast<char const *>(&l_text), 4, stream_it); // write text length

#if SEQAN3_COMPILER_IS_GCC && (__GNUC__ == 10)
auto header_view = os.str();
#else
auto header_view = os.view();
#endif
std::ranges::copy(header_view, stream_it);

assert(header.ref_ids().size() < (1ull << 32));
int32_t const n_ref{static_cast<int32_t>(header.ref_ids().size())};
std::ranges::copy_n(reinterpret_cast<char const *>(&n_ref), 4, stream_it); // write number of references

for (int32_t ridx = 0; ridx < n_ref; ++ridx)
{
assert(header.ref_ids()[ridx].size() + 1 < (1ull << 32));
int32_t const l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
std::ranges::copy_n(reinterpret_cast<char const *>(&l_name), 4, stream_it); // write l_name
// write reference name:
std::ranges::copy(header.ref_ids()[ridx], stream_it);
stream_it = '\0'; // ++ is not necessary for ostream_iterator
// write reference sequence length:
std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
}
}
}

//!\copydoc seqan3::format_sam::read_sam_dict_vector
template <typename stream_view_type, typename value_type>
inline void format_bam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
Expand Down
2 changes: 1 addition & 1 deletion include/seqan3/io/sam_file/format_sam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ namespace seqan3
*
* \remark For a complete overview, take a look at \ref io_sam_file
*/
class format_sam : private detail::format_sam_base
class format_sam : protected detail::format_sam_base
{
public:
/*!\name Constructors, destructor and assignment
Expand Down
28 changes: 25 additions & 3 deletions include/seqan3/io/sam_file/output.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,8 @@ class sam_file_output
field::header_ptr>;

static_assert(
[]() constexpr {
[]() constexpr
{
for (field f : selected_field_ids::as_array)
if (!field_ids::contains(f))
return false;
Expand Down Expand Up @@ -149,8 +150,24 @@ class sam_file_output
sam_file_output(sam_file_output &&) = default;
//!\brief Move assignment is defaulted.
sam_file_output & operator=(sam_file_output &&) = default;
//!\brief Destructor is defaulted.
~sam_file_output() = default;
//!\brief The destructor will write the header if it has not been written before.
~sam_file_output()
{
if (header_has_been_written)
return;

assert(!format.valueless_by_exception());

std::visit(
[&](auto & f)
{
if constexpr (std::same_as<ref_ids_type, ref_info_not_given>)
f.write_header(*secondary_stream, options, std::ignore);
else
f.write_header(*secondary_stream, options, *header_ptr);
},
format);
}

/*!\brief Construct from filename.
* \param[in] filename Path to the file you wish to open.
Expand Down Expand Up @@ -592,6 +609,9 @@ class sam_file_output

protected:
//!\privatesection
//!\brief This is needed during deconstruction to know whether a header still needs to be written.
bool header_has_been_written{false};

//!\brief A larger (compared to stl default) stream buffer to use when reading from a file.
std::vector<char> stream_buffer{std::vector<char>(1'000'000)};

Expand Down Expand Up @@ -690,6 +710,8 @@ class sam_file_output
}
},
format);

header_has_been_written = true; // when writing a record, the header is written automatically
}

//!\brief Befriend iterator so it can access the buffers.
Expand Down
Loading

1 comment on commit 334b790

@vercel
Copy link

@vercel vercel bot commented on 334b790 Oct 31, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Successfully deployed to the following URLs:

seqan3 – ./

seqan3-seqan.vercel.app
seqan3-git-master-seqan.vercel.app
seqan3.vercel.app

Please sign in to comment.