Skip to content

Commit

Permalink
[REVIEW] Refactor read_sam_dict
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Jan 5, 2023
1 parent 035168b commit 3173b4f
Showing 1 changed file with 53 additions and 54 deletions.
107 changes: 53 additions & 54 deletions include/seqan3/io/sam_file/format_bam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -933,15 +933,38 @@ inline void format_bam::read_sam_dict(std::string_view const tag_str, sam_tag_di
by the length (int32_t) of the array, followed by the values.
*/
auto it = tag_str.begin();
while (it != tag_str.end())

// Deduces int_t from passed argument.
auto parse_integer_into_target = [&]<std::integral int_t>(uint16_t const tag, int_t)
{
int_t tmp{};
read_integral_byte_field(std::string_view{it, tag_str.end()}, tmp);
target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
it += sizeof(tmp);
};

// Deduces array_value_t from passed argument.
auto parse_array_into_target = [&]<arithmetic array_value_t>(uint16_t const tag, array_value_t)
{
int32_t const count = read_sam_dict_vector(target[tag], std::string_view{it, tag_str.end()}, array_value_t{});
it += sizeof(int32_t) /*length is stored within the vector*/ + sizeof(array_value_t) * count;
};

// Read uint16_t from string_view and advance `it`.
auto parse_tag = [&]()
{
uint16_t tag = static_cast<uint16_t>(*it) << 8;
++it; // skip char read before

tag += static_cast<uint16_t>(*it);
tag |= static_cast<uint16_t>(*it);
++it; // skip char read before
return tag;
};

while (it != tag_str.end())
{
uint16_t const tag = parse_tag();

char type_id = *it;
char const type_id{*it};
++it; // skip char read before

switch (type_id)
Expand All @@ -955,86 +978,70 @@ inline void format_bam::read_sam_dict(std::string_view const tag_str, sam_tag_di
// all integer sizes are possible
case 'c': // int8_t
{
int8_t tmp;
read_integral_byte_field(std::string_view{it, tag_str.end()}, tmp);
target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
it += sizeof(tmp);
parse_integer_into_target(tag, int8_t{});
break;
}
case 'C': // uint8_t
{
uint8_t tmp;
read_integral_byte_field(std::string_view{it, tag_str.end()}, tmp);
target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
it += sizeof(tmp);
parse_integer_into_target(tag, uint8_t{});
break;
}
case 's': // int16_t
{
int16_t tmp;
read_integral_byte_field(std::string_view{it, tag_str.end()}, tmp);
target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
it += sizeof(tmp);
parse_integer_into_target(tag, int16_t{});
break;
}
case 'S': // uint16_t
{
uint16_t tmp;
read_integral_byte_field(std::string_view{it, tag_str.end()}, tmp);
target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
it += sizeof(tmp);
parse_integer_into_target(tag, uint16_t{});
break;
}
case 'i': // int32_t
{
int32_t tmp;
read_integral_byte_field(std::string_view{it, tag_str.end()}, tmp);
target[tag] = std::move(tmp); // readable sam format only allows int32_t
it += sizeof(tmp);
parse_integer_into_target(tag, int32_t{});
break;
}
case 'I': // uint32_t
{
uint32_t tmp;
read_integral_byte_field(std::string_view{it, tag_str.end()}, tmp);
target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
it += sizeof(tmp);
parse_integer_into_target(tag, uint32_t{});
break;
}
case 'f': // float
{
float tmp;
float tmp{};
read_float_byte_field(std::string_view{it, tag_str.end()}, tmp);
target[tag] = tmp;
it += sizeof(int32_t);
it += sizeof(float);
break;
}
case 'Z': // string
{
std::string_view v = std::string_view{static_cast<char const *>(it)}; // parses until '\0'
target[tag] = std::string{v};
std::string const v{static_cast<char const *>(it)}; // parses until '\0'
it += v.size() + 1;
target[tag] = std::move(v);
break;
}
case 'H': // byte array, represented as null-terminated string; specification requires even number of bytes
{
std::string_view str = std::string_view{static_cast<char const *>(it)}; // parses until '\0'
std::string_view const str{static_cast<char const *>(it)}; // parses until '\0'

std::vector<std::byte> tmp_vector;
std::vector<std::byte> tmp_vector{};
// std::from_chars cannot directly parse into a std::byte
uint8_t dummy_byte{};

if (str.size() % 2 != 0)
throw format_error{"[CORRUPTED BAM FILE] Hexadecimal tag must have even number of digits."};

for (size_t i = 0; i < str.size(); i += 2)
// H encodes bytes in a hexadecimal format. Two hex values are stored for each byte as characters.
// E.g., '1' and 'A' need one byte each and are read as `\x1A`, which is 26 in decimal.
for (auto hex_begin = str.begin(), hex_end = str.begin() + 2; hex_begin != str.end();
hex_begin += 2, hex_end += 2)
{
auto res = std::from_chars(str.begin() + i, str.begin() + i + 2, dummy_byte, 16);
auto res = std::from_chars(hex_begin, hex_end, dummy_byte, 16);

if (res.ec == std::errc::invalid_argument)
throw format_error{std::string("[CORRUPTED BAM FILE] The string '")
+ std::string(str.begin() + i, str.begin() + i + 2)
+ "' could not be cast into type uint8_t."};
+ std::string(hex_begin, hex_end) + "' could not be cast into type uint8_t."};

if (res.ec == std::errc::result_out_of_range)
throw format_error{std::string("[CORRUPTED BAM FILE] Casting '") + std::string(str)
Expand All @@ -1043,7 +1050,7 @@ inline void format_bam::read_sam_dict(std::string_view const tag_str, sam_tag_di
tmp_vector.push_back(std::byte{dummy_byte});
}

target[tag] = tmp_vector;
target[tag] = std::move(tmp_vector);

it += str.size() + 1;

Expand All @@ -1053,37 +1060,29 @@ inline void format_bam::read_sam_dict(std::string_view const tag_str, sam_tag_di
{
char array_value_type_id = *it;
++it; // skip char read before
int32_t count{0};

switch (array_value_type_id)
{
case 'c': // int8_t
count = read_sam_dict_vector(target[tag], std::string_view{it, tag_str.end()}, int8_t{});
it += sizeof(int32_t) /*length is stored within the vector*/ + sizeof(int8_t) * count;
parse_array_into_target(tag, int8_t{});
break;
case 'C': // uint8_t
count = read_sam_dict_vector(target[tag], std::string_view{it, tag_str.end()}, uint8_t{});
it += sizeof(int32_t) /*length is stored within the vector*/ + sizeof(uint8_t) * count;
parse_array_into_target(tag, uint8_t{});
break;
case 's': // int16_t
count = read_sam_dict_vector(target[tag], std::string_view{it, tag_str.end()}, int16_t{});
it += sizeof(int32_t) /*length is stored within the vector*/ + sizeof(int16_t) * count;
parse_array_into_target(tag, int16_t{});
break;
case 'S': // uint16_t
count = read_sam_dict_vector(target[tag], std::string_view{it, tag_str.end()}, uint16_t{});
it += sizeof(int32_t) /*length is stored within the vector*/ + sizeof(uint16_t) * count;
parse_array_into_target(tag, uint16_t{});
break;
case 'i': // int32_t
count = read_sam_dict_vector(target[tag], std::string_view{it, tag_str.end()}, int32_t{});
it += sizeof(int32_t) /*length is stored within the vector*/ + sizeof(int32_t) * count;
parse_array_into_target(tag, int32_t{});
break;
case 'I': // uint32_t
count = read_sam_dict_vector(target[tag], std::string_view{it, tag_str.end()}, uint32_t{});
it += sizeof(int32_t) /*length is stored within the vector*/ + sizeof(uint32_t) * count;
parse_array_into_target(tag, uint32_t{});
break;
case 'f': // float
count = read_sam_dict_vector(target[tag], std::string_view{it, tag_str.end()}, float{});
it += sizeof(int32_t) /*length is stored within the vector*/ + sizeof(float) * count;
parse_array_into_target(tag, float{});
break;
default:
throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
Expand Down

0 comments on commit 3173b4f

Please sign in to comment.