Skip to content

Commit

Permalink
re #476: handle byte order marks
Browse files Browse the repository at this point in the history
  • Loading branch information
biojppm committed Jan 22, 2025
1 parent 9b6fff2 commit 75c415b
Show file tree
Hide file tree
Showing 7 changed files with 344 additions and 10 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -762,6 +762,11 @@ following situations:
reflects the usual practice of having at most 1 or 2 tag directives;
also, be aware that this feature is under consideration for removal
in YAML 1.3.
* Byte Order Marks: while ryml correctly handles BOMs at the beginning
of the stream or documents (as per the standard), BOMs inside
scalars are ignored. The [standard mandates that they should be
quoted](https://yaml.org/spec/1.2.2/#52-character-encodings) when
emitted, but ryml does not do this.
* ryml tends to be on the permissive side in several cases where the
YAML standard dictates that there should be an error; in many of these
cases, ryml will tolerate the input. This may be good or bad, but in
Expand Down
1 change: 1 addition & 0 deletions changelog/current.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
- add workarounds for problems with codegen of gcc 11,12,13.
- improve CI coverage of gcc and clang optimization levels.
- [BREAKING] Fix [#477](https://github.com/biojppm/rapidyaml/issues/477) ([PR#479](https://github.com/biojppm/rapidyaml/pull/479)): changed `read<std::map>()` to overwrite existing entries. The provided implementations had an inconsistency between `std::map` (which wasn't overwriting) and `std::vector` (which *was* overwriting).
- Fix [#476](https://github.com/biojppm/rapidyaml/issues/476) ([PR#493](https://github.com/biojppm/rapidyaml/pull/493)): add handling of Byte Order Marks.
- [PR#492](https://github.com/biojppm/rapidyaml/pull/492): fix emit of explicit keys when indented:
```yaml
fixed:
Expand Down
14 changes: 14 additions & 0 deletions src/c4/yml/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,20 @@ struct RYML_EXPORT Callbacks
/** @} */


//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------

/** Character encodings that can be signalled by a byte order mark
 * at the start of a YAML stream or document. */
typedef enum {
    NOBOM   = 0, ///< no byte order mark was seen
    UTF8    = 1, ///< UTF-8
    UTF16LE = 2, ///< UTF-16, little endian
    UTF16BE = 3, ///< UTF-16, big endian
    UTF32LE = 4, ///< UTF-32, little endian
    UTF32BE = 5, ///< UTF-32, big endian
} Encoding_e;


//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------
Expand Down
100 changes: 97 additions & 3 deletions src/c4/yml/parse_engine.def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
#include "c4/error.hpp"
#include "c4/charconv.hpp"
#include "c4/utf.hpp"
#include <c4/dump.hpp>

#include <ctype.h>

#include "c4/yml/detail/parser_dbg.hpp"
#include "c4/yml/filter_processor.hpp"
#ifdef RYML_DBG
#include <c4/dump.hpp>
#include "c4/yml/detail/print.hpp"
#endif

Expand Down Expand Up @@ -261,6 +261,9 @@ ParseEngine<EventHandler>::ParseEngine(EventHandler *evt_handler, ParserOptions
, m_evt_handler(evt_handler)
, m_pending_anchors()
, m_pending_tags()
, m_was_inside_qmrk(false)
, m_doc_empty(false)
, m_encoding(NOBOM)
, m_newline_offsets()
, m_newline_offsets_size(0)
, m_newline_offsets_capacity(0)
Expand All @@ -277,6 +280,9 @@ ParseEngine<EventHandler>::ParseEngine(ParseEngine &&that) noexcept
, m_evt_handler(that.m_evt_handler)
, m_pending_anchors(that.m_pending_anchors)
, m_pending_tags(that.m_pending_tags)
, m_was_inside_qmrk(false)
, m_doc_empty(false)
, m_encoding(NOBOM)
, m_newline_offsets(that.m_newline_offsets)
, m_newline_offsets_size(that.m_newline_offsets_size)
, m_newline_offsets_capacity(that.m_newline_offsets_capacity)
Expand All @@ -293,6 +299,9 @@ ParseEngine<EventHandler>::ParseEngine(ParseEngine const& that)
, m_evt_handler(that.m_evt_handler)
, m_pending_anchors(that.m_pending_anchors)
, m_pending_tags(that.m_pending_tags)
, m_was_inside_qmrk(false)
, m_doc_empty(false)
, m_encoding(NOBOM)
, m_newline_offsets()
, m_newline_offsets_size()
, m_newline_offsets_capacity()
Expand All @@ -317,6 +326,9 @@ ParseEngine<EventHandler>& ParseEngine<EventHandler>::operator=(ParseEngine &&th
m_evt_handler = that.m_evt_handler;
m_pending_anchors = that.m_pending_anchors;
m_pending_tags = that.m_pending_tags;
m_was_inside_qmrk = that.m_was_inside_qmrk;
m_doc_empty = that.m_doc_empty;
m_encoding = that.m_encoding;
m_newline_offsets = (that.m_newline_offsets);
m_newline_offsets_size = (that.m_newline_offsets_size);
m_newline_offsets_capacity = (that.m_newline_offsets_capacity);
Expand All @@ -337,6 +349,9 @@ ParseEngine<EventHandler>& ParseEngine<EventHandler>::operator=(ParseEngine cons
m_evt_handler = that.m_evt_handler;
m_pending_anchors = that.m_pending_anchors;
m_pending_tags = that.m_pending_tags;
m_was_inside_qmrk = that.m_was_inside_qmrk;
m_doc_empty = that.m_doc_empty;
m_encoding = that.m_encoding;
if(that.m_newline_offsets_capacity > m_newline_offsets_capacity)
_resize_locations(that.m_newline_offsets_capacity);
_RYML_CB_CHECK(m_evt_handler->m_stack.m_callbacks, m_newline_offsets_capacity >= that.m_newline_offsets_capacity);
Expand All @@ -357,6 +372,9 @@ void ParseEngine<EventHandler>::_clr()
m_evt_handler = {};
m_pending_anchors = {};
m_pending_tags = {};
m_was_inside_qmrk = false;
m_doc_empty = true;
m_encoding = NOBOM;
m_newline_offsets = {};
m_newline_offsets_size = {};
m_newline_offsets_capacity = {};
Expand Down Expand Up @@ -385,11 +403,12 @@ void ParseEngine<EventHandler>::_reset()
m_pending_anchors = {};
m_pending_tags = {};
m_doc_empty = true;
m_was_inside_qmrk = false;
m_encoding = NOBOM;
if(m_options.locations())
{
_prepare_locations();
}
m_was_inside_qmrk = false;
}


Expand Down Expand Up @@ -4351,6 +4370,72 @@ void ParseEngine<EventHandler>::_handle_directive(csubstr rem)
}
}

/** Look for a byte order mark (or a BOM-less UTF16/UTF32 signature, ie
 * zero bytes next to an ASCII character) at the start of the remainder
 * of the current line.
 *
 * When a signature is recognized, the detected encoding is passed to
 * _handle_bom(Encoding_e) and the signature bytes are consumed with
 * _line_progressed().
 *
 * @return true if a signature was recognized and consumed, false
 *         otherwise (including when the remainder of the line is empty).
 *
 * @see https://yaml.org/spec/1.2.2/#52-character-encodings */
template<class EventHandler>
bool ParseEngine<EventHandler>::_handle_bom()
{
    const csubstr rem = m_evt_handler->m_curr->line_contents.rem;
    if(rem.len)
    {
        const csubstr rest = rem.sub(1);
        // Is the character ASCII? The comparison must be done on the
        // unsigned value: where plain char is signed, bytes >= 0x80
        // are negative and would always compare <= '\x7f'.
        const auto is_ascii = [](char c) { return static_cast<unsigned char>(c) <= 0x7fu; };
        // The 4-byte signatures must be tested before the 2-byte ones,
        // because the UTF32 patterns also match the UTF16 patterns.
        if(rem.begins_with("\x00\x00\xfe\xff") || (rem.len >= 4u && rem.begins_with("\x00\x00\x00") && is_ascii(rem.str[3])))
        {
            _c4dbgp("byte order mark: UTF32BE");
            _handle_bom(UTF32BE);
            _line_progressed(4);
            return true;
        }
        else if(rem.begins_with("\xff\xfe\x00\x00") || (rem.len >= 4u && rest.begins_with("\x00\x00\x00") && is_ascii(rem.str[0])))
        {
            _c4dbgp("byte order mark: UTF32LE");
            _handle_bom(UTF32LE);
            _line_progressed(4);
            return true;
        }
        else if(rem.begins_with("\xfe\xff") || (rem.len >= 2u && rem.begins_with('\x00') && is_ascii(rem.str[1])))
        {
            _c4dbgp("byte order mark: UTF16BE");
            _handle_bom(UTF16BE);
            _line_progressed(2);
            return true;
        }
        else if(rem.begins_with("\xff\xfe") || (rem.len >= 2u && rest.begins_with('\x00') && is_ascii(rem.str[0])))
        {
            _c4dbgp("byte order mark: UTF16LE");
            _handle_bom(UTF16LE);
            _line_progressed(2);
            return true;
        }
        else if(rem.begins_with("\xef\xbb\xbf"))
        {
            _c4dbgp("byte order mark: UTF8");
            _handle_bom(UTF8);
            _line_progressed(3);
            return true;
        }
    }
    return false;
}

/** Record the encoding announced by a byte order mark, or validate it
 * against the encoding that was already recorded.
 *
 * @param enc the encoding announced by the byte order mark just found
 *
 * An error is raised when a different encoding was already recorded,
 * or when the first mark is not UTF8 and the current line remainder
 * does not start at the beginning of the source buffer. */
template<class EventHandler>
void ParseEngine<EventHandler>::_handle_bom(Encoding_e enc)
{
    if(m_encoding != NOBOM)
    {
        // an encoding was already recorded: repeating it is harmless,
        // changing it is not allowed
        if(enc != m_encoding)
            _c4err("byte order mark can only be set once");
        return;
    }
    // first byte order mark seen in this buffer
    const char *curr_pos = m_evt_handler->m_curr->line_contents.rem.str;
    const bool at_buffer_start = (curr_pos == m_buf.str);
    const bool acceptable = (enc == UTF8) || at_buffer_start;
    if( ! acceptable)
        _c4err("non-UTF8 byte order mark can appear only at the beginning of the file");
    else
        m_encoding = enc;
}


//-----------------------------------------------------------------------------

Expand Down Expand Up @@ -7202,6 +7287,10 @@ void ParseEngine<EventHandler>::_handle_unk_json()
_set_indentation(m_evt_handler->m_curr->line_contents.current_col(rem));
_line_progressed(1);
}
else if(_handle_bom())
{
_c4dbgp("byte order mark");

Check warning on line 7292 in src/c4/yml/parse_engine.def.hpp

View check run for this annotation

Codecov / codecov/patch

src/c4/yml/parse_engine.def.hpp#L7292

Added line #L7292 was not covered by tests
}
else
{
_RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, ! has_any(SSCL));
Expand Down Expand Up @@ -7288,8 +7377,13 @@ void ParseEngine<EventHandler>::_handle_unk()

if(m_evt_handler->m_curr->line_contents.indentation == 0u && _at_line_begin())
{
const char first = rem.str[0];
_c4dbgp("rtop: zero indent + at line begin");
if(_handle_bom())
{
_c4dbgp("byte order mark!");

Check warning on line 7383 in src/c4/yml/parse_engine.def.hpp

View check run for this annotation

Codecov / codecov/patch

src/c4/yml/parse_engine.def.hpp#L7383

Added line #L7383 was not covered by tests
rem = m_evt_handler->m_curr->line_contents.rem;
}
const char first = rem.str[0];
if(first == '-')
{
_c4dbgp("rtop: suspecting doc");
Expand Down
8 changes: 8 additions & 0 deletions src/c4/yml/parse_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,10 @@ class ParseEngine
/** Get the latest YAML buffer parsed by this object. */
csubstr source() const { return m_buf; }

/** Report the encoding of the YAML buffer most recently parsed by
 * this object. When no byte order mark was recorded, UTF8 is
 * reported, as that is the default assumed by the YAML standard. */
Encoding_e encoding() const
{
    if(m_encoding == NOBOM)
        return UTF8;
    return m_encoding;
}

id_type stack_capacity() const { RYML_ASSERT(m_evt_handler); return m_evt_handler->m_stack.capacity(); }
size_t locations_capacity() const { return m_newline_offsets_capacity; }

Expand Down Expand Up @@ -714,6 +718,8 @@ class ParseEngine
void _handle_annotations_and_indentation_after_start_mapblck(size_t key_indentation, size_t key_line);
size_t _select_indentation_from_annotations(size_t val_indentation, size_t val_line);
void _handle_directive(csubstr rem);
bool _handle_bom();
void _handle_bom(Encoding_e enc);

void _check_tag(csubstr tag);

Expand All @@ -738,6 +744,8 @@ class ParseEngine
bool m_was_inside_qmrk;
bool m_doc_empty = true;

Encoding_e m_encoding = UTF8;

private:

size_t *m_newline_offsets;
Expand Down
Loading

0 comments on commit 75c415b

Please sign in to comment.