diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml index 64ab2935f..62dbf1579 100644 --- a/.github/workflows/clang.yml +++ b/.github/workflows/clang.yml @@ -42,7 +42,55 @@ jobs: - {std: 20, cxx: clang++-10 , bt: Release, os: ubuntu-18.04, bitlinks: shared64 static32} - {std: 11, cxx: clang++-6.0, bt: Debug , os: ubuntu-18.04, bitlinks: shared64 static32} - {std: 11, cxx: clang++-6.0, bt: Release, os: ubuntu-18.04, bitlinks: shared64 static32} - env: {STD: "${{matrix.std}}", CXX_: "${{matrix.cxx}}", BT: "${{matrix.bt}}", BITLINKS: "${{matrix.bitlinks}}", VG: "${{matrix.vg}}", SAN: "${{matrix.san}}", LINT: "${{matrix.lint}}", OS: "${{matrix.os}}"} + env: {STD: "${{matrix.std}}", CXX_: "${{matrix.cxx}}", BT: "${{matrix.bt}}", BITLINKS: "${{matrix.bitlinks}}", + CMAKE_FLAGS: "${{matrix.cmkflags}}", + VG: "${{matrix.vg}}", SAN: "${{matrix.san}}", LINT: "${{matrix.lint}}", OS: "${{matrix.os}}"} + steps: + - {name: checkout, uses: actions/checkout@v2, with: {submodules: recursive}} + - {name: install requirements, run: source .github/reqs.sh && c4_install_test_requirements $OS} + - {name: show info, run: source .github/setenv.sh && c4_show_info} + - name: shared64-configure--------------------------------------------------- + run: source .github/setenv.sh && c4_cfg_test shared64 + - {name: shared64-build, run: source .github/setenv.sh && c4_build_test shared64} + - {name: shared64-run, run: source .github/setenv.sh && c4_run_test shared64} + - {name: shared64-pack, run: source .github/setenv.sh && c4_package shared64} + - name: static64-configure--------------------------------------------------- + run: source .github/setenv.sh && c4_cfg_test static64 + - {name: static64-build, run: source .github/setenv.sh && c4_build_test static64} + - {name: static64-run, run: source .github/setenv.sh && c4_run_test static64} + - {name: static64-pack, run: source .github/setenv.sh && c4_package static64} + - name: 
static32-configure--------------------------------------------------- + run: source .github/setenv.sh && c4_cfg_test static32 + - {name: static32-build, run: source .github/setenv.sh && c4_build_test static32} + - {name: static32-run, run: source .github/setenv.sh && c4_run_test static32} + - {name: static32-pack, run: source .github/setenv.sh && c4_package static32} + - name: shared32-configure--------------------------------------------------- + run: source .github/setenv.sh && c4_cfg_test shared32 + - {name: shared32-build, run: source .github/setenv.sh && c4_build_test shared32} + - {name: shared32-run, run: source .github/setenv.sh && c4_run_test shared32} + - {name: shared32-pack, run: source .github/setenv.sh && c4_package shared32} + + clang_canary_tabtokens: + name: tabtokens/${{matrix.cxx}}/canary/c++${{matrix.std}}/${{matrix.bt}} + if: | + (!contains(github.event.head_commit.message, 'skip all')) || + (!contains(github.event.head_commit.message, 'skip clang')) || + contains(github.event.head_commit.message, 'only clang') + continue-on-error: true + runs-on: ${{matrix.os}} + strategy: + fail-fast: false + matrix: + include: + - {std: 17, cxx: clang++-10 , bt: Debug , os: ubuntu-18.04, bitlinks: static64, cmkflags: "-DRYML_WITH_TAB_TOKENS=ON"} + - {std: 17, cxx: clang++-10 , bt: Release, os: ubuntu-18.04, bitlinks: static64, cmkflags: "-DRYML_WITH_TAB_TOKENS=ON"} + - {std: 20, cxx: clang++-10 , bt: Debug , os: ubuntu-18.04, bitlinks: static64, cmkflags: "-DRYML_WITH_TAB_TOKENS=ON"} + - {std: 20, cxx: clang++-10 , bt: Release, os: ubuntu-18.04, bitlinks: static64, cmkflags: "-DRYML_WITH_TAB_TOKENS=ON"} + - {std: 11, cxx: clang++-6.0, bt: Debug , os: ubuntu-18.04, bitlinks: static64, cmkflags: "-DRYML_WITH_TAB_TOKENS=ON"} + - {std: 11, cxx: clang++-6.0, bt: Release, os: ubuntu-18.04, bitlinks: static64, cmkflags: "-DRYML_WITH_TAB_TOKENS=ON"} + env: {STD: "${{matrix.std}}", CXX_: "${{matrix.cxx}}", BT: "${{matrix.bt}}", BITLINKS: "${{matrix.bitlinks}}", + 
CMAKE_FLAGS: "${{matrix.cmkflags}}", + VG: "${{matrix.vg}}", SAN: "${{matrix.san}}", LINT: "${{matrix.lint}}", OS: "${{matrix.os}}"} steps: - {name: checkout, uses: actions/checkout@v2, with: {submodules: recursive}} - {name: install requirements, run: source .github/reqs.sh && c4_install_test_requirements $OS} @@ -82,21 +130,11 @@ jobs: fail-fast: false matrix: include: - - {std: 11, cxx: clang++-9 , bt: Debug , vg: on, os: ubuntu-18.04} - - {std: 11, cxx: clang++-9 , bt: Release, vg: on, os: ubuntu-18.04} - - {std: 11, cxx: clang++-8 , bt: Debug , vg: on, os: ubuntu-18.04} - - {std: 11, cxx: clang++-8 , bt: Release, vg: on, os: ubuntu-18.04} - - {std: 11, cxx: clang++-7 , bt: Debug , vg: on, os: ubuntu-18.04} - - {std: 11, cxx: clang++-7 , bt: Release, vg: on, os: ubuntu-18.04} - - {std: 11, cxx: clang++-6.0, bt: Debug , vg: on, os: ubuntu-18.04} - - {std: 11, cxx: clang++-6.0, bt: Release, vg: on, os: ubuntu-18.04} - - {std: 11, cxx: clang++-5.0, bt: Debug , vg: on, os: ubuntu-18.04} - - {std: 11, cxx: clang++-5.0, bt: Release, vg: on, os: ubuntu-18.04} - - {std: 11, cxx: clang++-4.0, bt: Debug , vg: on, os: ubuntu-18.04} - - {std: 11, cxx: clang++-4.0, bt: Release, vg: on, os: ubuntu-18.04} - - {std: 11, cxx: clang++-3.9, bt: Debug , vg: on, os: ubuntu-18.04} - - {std: 11, cxx: clang++-3.9, bt: Release, vg: on, os: ubuntu-18.04} - env: {STD: "${{matrix.std}}", CXX_: "${{matrix.cxx}}", BT: "${{matrix.bt}}", BITLINKS: "${{matrix.bitlinks}}", VG: "${{matrix.vg}}", SAN: "${{matrix.san}}", LINT: "${{matrix.lint}}", OS: "${{matrix.os}}"} + - {std: 11, cxx: clang++-10 , bt: Debug , vg: on, os: ubuntu-18.04} + - {std: 11, cxx: clang++-10 , bt: Release, vg: on, os: ubuntu-18.04} + env: {STD: "${{matrix.std}}", CXX_: "${{matrix.cxx}}", BT: "${{matrix.bt}}", BITLINKS: "${{matrix.bitlinks}}", + CMAKE_FLAGS: "${{matrix.cmkflags}}", + VG: "${{matrix.vg}}", SAN: "${{matrix.san}}", LINT: "${{matrix.lint}}", OS: "${{matrix.os}}"} steps: - {name: checkout, uses: 
actions/checkout@v2, with: {submodules: recursive}} - {name: install requirements, run: source .github/reqs.sh && c4_install_test_requirements $OS} @@ -137,10 +175,10 @@ jobs: matrix: include: # clang tidy takes a long time, so don't do multiple bits/linktypes - - {std: 11, cxx: clang++-9, bt: Debug , lint: clang-tidy, bitlinks: shared64 static64, os: ubuntu-18.04} - - {std: 11, cxx: clang++-9, bt: Debug , lint: clang-tidy, bitlinks: shared32 static32, os: ubuntu-18.04} - - {std: 11, cxx: clang++-9, bt: ReleaseWithDebInfo, lint: clang-tidy, bitlinks: shared64 static64, os: ubuntu-18.04} - - {std: 11, cxx: clang++-9, bt: ReleaseWithDebInfo, lint: clang-tidy, bitlinks: shared32 static32, os: ubuntu-18.04} + - {std: 11, cxx: clang++-10, bt: Debug , lint: clang-tidy, bitlinks: shared64 static64, os: ubuntu-18.04} + - {std: 11, cxx: clang++-10, bt: Debug , lint: clang-tidy, bitlinks: shared32 static32, os: ubuntu-18.04} + - {std: 11, cxx: clang++-10, bt: ReleaseWithDebInfo, lint: clang-tidy, bitlinks: shared64 static64, os: ubuntu-18.04} + - {std: 11, cxx: clang++-10, bt: ReleaseWithDebInfo, lint: clang-tidy, bitlinks: shared32 static32, os: ubuntu-18.04} env: {STD: "${{matrix.std}}", CXX_: "${{matrix.cxx}}", BT: "${{matrix.bt}}", BITLINKS: "${{matrix.bitlinks}}", VG: "${{matrix.vg}}", SAN: "${{matrix.san}}", LINT: "${{matrix.lint}}", OS: "${{matrix.os}}"} steps: - {name: checkout, uses: actions/checkout@v2, with: {submodules: recursive}} diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index e30a1f20b..2399b0c07 100644 --- a/.github/workflows/gcc.yml +++ b/.github/workflows/gcc.yml @@ -68,6 +68,52 @@ jobs: - {name: shared32-run, run: source .github/setenv.sh && c4_run_test shared32} - {name: shared32-pack, run: source .github/setenv.sh && c4_package shared32} + gcc_tabtokens: + name: tabtokens/${{matrix.cxx}}/canary/${{matrix.bt}} + if: | + (!contains(github.event.head_commit.message, 'skip all')) || + (!contains(github.event.head_commit.message, 
'skip gcc')) || + contains(github.event.head_commit.message, 'only gcc') + continue-on-error: true + runs-on: ${{matrix.os}} + strategy: + fail-fast: false + matrix: + include: + - {std: 11, cxx: g++-7 , bt: Debug , os: ubuntu-18.04, bitlinks: shared64 static32, cmkflags: "-DRYML_WITH_TAB_TOKENS=ON"} + - {std: 11, cxx: g++-7 , bt: Release, os: ubuntu-18.04, bitlinks: shared64 static32, cmkflags: "-DRYML_WITH_TAB_TOKENS=ON"} + - {std: 20, cxx: g++-10 , bt: Debug , os: ubuntu-18.04, bitlinks: shared64 static32, cmkflags: "-DRYML_WITH_TAB_TOKENS=ON"} + - {std: 20, cxx: g++-10 , bt: Release, os: ubuntu-18.04, bitlinks: shared64 static32, cmkflags: "-DRYML_WITH_TAB_TOKENS=ON"} + - {std: 11, cxx: g++-5 , bt: Debug , os: ubuntu-18.04, bitlinks: shared64 static32, cmkflags: "-DRYML_WITH_TAB_TOKENS=ON"} + - {std: 11, cxx: g++-5 , bt: Release, os: ubuntu-18.04, bitlinks: shared64 static32, cmkflags: "-DRYML_WITH_TAB_TOKENS=ON"} + env: {STD: "${{matrix.std}}", CXX_: "${{matrix.cxx}}", BT: "${{matrix.bt}}", BITLINKS: "${{matrix.bitlinks}}", + CMAKE_FLAGS: "${{matrix.cmkflags}}", + VG: "${{matrix.vg}}", SAN: "${{matrix.san}}", LINT: "${{matrix.lint}}", OS: "${{matrix.os}}"} + steps: + - {name: checkout, uses: actions/checkout@v2, with: {submodules: recursive}} + - {name: install requirements, run: source .github/reqs.sh && c4_install_test_requirements $OS} + - {name: show info, run: source .github/setenv.sh && c4_show_info} + - name: shared64-configure--------------------------------------------------- + run: source .github/setenv.sh && c4_cfg_test shared64 + - {name: shared64-build, run: source .github/setenv.sh && c4_build_test shared64} + - {name: shared64-run, run: source .github/setenv.sh && c4_run_test shared64} + - {name: shared64-pack, run: source .github/setenv.sh && c4_package shared64} + - name: static64-configure--------------------------------------------------- + run: source .github/setenv.sh && c4_cfg_test static64 + - {name: static64-build, run: source 
.github/setenv.sh && c4_build_test static64} + - {name: static64-run, run: source .github/setenv.sh && c4_run_test static64} + - {name: static64-pack, run: source .github/setenv.sh && c4_package static64} + - name: static32-configure--------------------------------------------------- + run: source .github/setenv.sh && c4_cfg_test static32 + - {name: static32-build, run: source .github/setenv.sh && c4_build_test static32} + - {name: static32-run, run: source .github/setenv.sh && c4_run_test static32} + - {name: static32-pack, run: source .github/setenv.sh && c4_package static32} + - name: shared32-configure--------------------------------------------------- + run: source .github/setenv.sh && c4_cfg_test shared32 + - {name: shared32-build, run: source .github/setenv.sh && c4_build_test shared32} + - {name: shared32-run, run: source .github/setenv.sh && c4_run_test shared32} + - {name: shared32-pack, run: source .github/setenv.sh && c4_package shared32} + #---------------------------------------------------------------------------- gcc_extended: name: ${{matrix.cxx}}/extended/${{matrix.bt}} @@ -91,17 +137,6 @@ jobs: - {std: 17, cxx: g++-10, bt: Release, vg: ON, os: ubuntu-18.04} - {std: 20, cxx: g++-10, bt: Debug , vg: ON, os: ubuntu-18.04} - {std: 20, cxx: g++-10, bt: Release, vg: ON, os: ubuntu-18.04} - # - - {std: 11, cxx: g++-9, bt: Debug , os: ubuntu-18.04} - - {std: 11, cxx: g++-9, bt: Release, os: ubuntu-18.04} - - {std: 11, cxx: g++-8, bt: Debug , os: ubuntu-18.04} - - {std: 11, cxx: g++-8, bt: Release, os: ubuntu-18.04} - - {std: 11, cxx: g++-7, bt: Debug , os: ubuntu-18.04} - - {std: 11, cxx: g++-7, bt: Release, os: ubuntu-18.04} - - {std: 11, cxx: g++-6, bt: Debug , os: ubuntu-18.04} - - {std: 11, cxx: g++-6, bt: Release, os: ubuntu-18.04} - - {std: 11, cxx: g++-5, bt: Debug , os: ubuntu-18.04} - - {std: 11, cxx: g++-5, bt: Release, os: ubuntu-18.04} env: {STD: "${{matrix.std}}", CXX_: "${{matrix.cxx}}", BT: "${{matrix.bt}}", BITLINKS: 
"${{matrix.bitlinks}}", VG: "${{matrix.vg}}", SAN: "${{matrix.san}}", LINT: "${{matrix.lint}}", OS: "${{matrix.os}}"} steps: - {name: checkout, uses: actions/checkout@v2, with: {submodules: recursive}} diff --git a/CMakeLists.txt b/CMakeLists.txt index 16c87ae85..515b0b30c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,7 @@ c4_project(VERSION 0.3.0 STANDALONE #------------------------------------------------------- +option(RYML_WITH_TAB_TOKENS "Enable parsing of tabs after ':' and '-'. This is costly and disabled by default." OFF) option(RYML_DEFAULT_CALLBACKS "Enable ryml's default implementation of callbacks: allocate(), free(), error()" ON) option(RYML_BUILD_TOOLS "build tools" OFF) option(RYML_BUILD_API "Enable API generation (python, etc)" OFF) @@ -57,6 +58,10 @@ c4_add_library(ryml INCORPORATE c4core ) +if(RYML_WITH_TAB_TOKENS) + target_compile_definitions(ryml PUBLIC RYML_WITH_TAB_TOKENS) +endif() + if(NOT RYML_DEFAULT_CALLBACKS) target_compile_definitions(ryml PRIVATE RYML_NO_DEFAULT_CALLBACKS) endif() diff --git a/README.md b/README.md index 59f9a31cc..549543370 100644 --- a/README.md +++ b/README.md @@ -657,7 +657,8 @@ sample_location_tracking(); ///< track node locations in the parsed source tr ### Package managers -If you opt for package managers, here's where ryml is available so far (thanks to all the contributors!): +If you opt for package managers, here's where ryml is available so far +(thanks to all the contributors!): * [vcpkg](https://vcpkg.io/en/packages.html): `vcpkg install ryml` * Arch Linux/Manjaro: * [rapidyaml-git (AUR)](https://aur.archlinux.org/packages/rapidyaml-git/) @@ -766,6 +767,9 @@ more about each sample: The following cmake variables can be used to control the build behavior of ryml: + * `RYML_WITH_TAB_TOKENS=ON/OFF`. Enable/disable support for tabs as + valid container tokens after `:` and `-`. Defaults to `OFF`, + because this may cost up to 10% in processing time. * `RYML_DEFAULT_CALLBACKS=ON/OFF`. 
Enable/disable ryml's default implementation of error and allocation callbacks. Defaults to `ON`. * `RYML_STANDALONE=ON/OFF`. ryml uses @@ -787,7 +791,8 @@ ryml is strongly coupled to c4core, and this is reinforced by the fact that c4core is a submodule of the current repo. However, it is still possible to use a c4core version different from the one in the repo (of course, only if there are no incompatibilities between the -versions). You can find out how to achieve this by looking at the [`custom_c4core` sample](./samples/custom_c4core/CMakeLists.txt). +versions). You can find out how to achieve this by looking at the +[`custom_c4core` sample](./samples/custom_c4core/CMakeLists.txt). ------ @@ -814,8 +819,8 @@ be changed.) With that said, here's an example of the Python API: ```python import ryml -# because ryml does not take ownership of the source buffer -# ryml cannot accept strings; only bytes or bytearrays +# ryml cannot accept strings because it does not take ownership of the +# source buffer; only bytes or bytearrays are accepted. src = b"{HELLO: a, foo: b, bar: c, baz: d, seq: [0, 1, 2, 3]}" def check(tree): @@ -914,17 +919,20 @@ See also [the roadmap](./ROADMAP.md) for a list of future work. ryml deliberately makes no effort to follow the standard in the following situations: +* Tab characters after `:` and `-` are not accepted tokens, unless + ryml is compiled with the macro `RYML_WITH_TAB_TOKENS`. This + requirement exists because checking for tabs introduces branching + into the parser's hot code and in some cases costs as much as 10% + in parsing time. * Containers are not accepted as mapping keys: keys must be scalar strings. * Tags are parsed as-is; tag lookup is not supported. * Anchor names must not end with a terminating colon: eg `&anchor: key: val`. -* Tabs after `:` or `-` are not supported. * `%TAG` directives have no effect and are ignored. All schemas are assumed to be the default YAML 2002 schema. 
* `%YAML` directives have no effect and are ignored. -Some of the limitations above will be worked on (tag lookups, tab -tokens). Others (notably container keys) absolutely will not, not in -the data tree at least. +Some of the limitations above will be worked on (e.g. tag +lookups). Others (notably container keys) most likely will not. Also, ryml tends to be on the permissive side where the YAML standard dictates there should be an error; in many of these cases, ryml will @@ -937,12 +945,13 @@ problems, which is a good practice anyway. If you do run into trouble and would like to investigate conformance of your YAML code, beware of existing online YAML linters, many of which are not fully conformant; instead, try using -[https://play.yaml.io](https://play.yaml.io), an amazing tool from the -YAML people which lets you dynamically input your YAML and continuously -see the results from all the existing parsers (kudos to -@ingydotnet). And of course, if you detect anything bad with ryml, -please [open an issue](https://github.com/biojppm/rapidyaml/issues) so -that we can improve. +[https://play.yaml.io](https://play.yaml.io), an amazing tool which +lets you dynamically input your YAML and continuously see the results +from all the existing parsers (kudos to @ingydotnet and the people +from the YAML test suite). And of course, if you detect anything wrong +with ryml, please [open an +issue](https://github.com/biojppm/rapidyaml/issues) so that we can +improve. ### Test suite status diff --git a/changelog/current.md b/changelog/current.md index 169bae950..80bbf049d 100644 --- a/changelog/current.md +++ b/changelog/current.md @@ -109,7 +109,8 @@ As part of the [new feature to track source locations](https://github.com/biojpp ? explicit key # this comment was not parsed correctly ? # trailing empty key was not added to the map ``` -- ryml now parses successfully compact JSON code `{"like":"this"}` without any need for preprocessing.
So the `preprocess_json()` functions and utilities are no longer necessary and have been removed. If you were using these functions, just remove the calls and pass the original source directly to ryml ([PR#210](https://github.com/biojppm/rapidyaml/pulls/210)). +- Fixed parsing of tabs used as whitespace tokens after `:` or `-`. This feature [is costly (see some benchmark results here)](https://github.com/biojppm/rapidyaml/pull/211#issuecomment-1030688035) and thus it is disabled by default, and requires defining a macro or cmake option `RYML_WITH_TAB_TOKENS` to enable ([PR#211](https://github.com/biojppm/rapidyaml/pulls/211)). +- ryml now successfully parses compact JSON code `{"like":"this"}` without any need for preprocessing. This code was not valid YAML 1.1, but was made valid in YAML 1.2. So the `preprocess_json()` functions, used to insert spaces after `:`, are no longer necessary and have been removed. If you were using these functions, remove the calls and just pass the original source directly to ryml's parser ([PR#210](https://github.com/biojppm/rapidyaml/pulls/210)). - Fix handling of indentation when parsing block scalars ([PR#210](https://github.com/biojppm/rapidyaml/pulls/210)): ```yaml --- diff --git a/src/c4/yml/detail/parser_dbg.hpp b/src/c4/yml/detail/parser_dbg.hpp index cc6258bb3..457f1700d 100644 --- a/src/c4/yml/detail/parser_dbg.hpp +++ b/src/c4/yml/detail/parser_dbg.hpp @@ -51,7 +51,7 @@ void _dbg_printf(c4::csubstr fmt, Args&& ...args) # define _c4dbgq(msg) _dbg_printf(msg "\n") # define _c4err(fmt, ...) \ do { if(c4::is_debugger_attached()) { C4_DEBUG_BREAK(); } \ - this->_err("ERROR:\n" "%s:%d: " fmt, __FILE__, __LINE__, ## __VA_ARGS__); } while(0) + this->_err("ERROR:\n" "{}:{}: " fmt, __FILE__, __LINE__, ## __VA_ARGS__); } while(0) #else # define _c4dbgt(fmt, ...) # define _c4dbgpf(fmt, ...) @@ -63,21 +63,20 @@ void _dbg_printf(c4::csubstr fmt, Args&& ...args) #endif #define _c4prsp(sp) sp -#define _c4prc(c) (__c4prc(c) ?
2 : 1), (__c4prc(c) ? __c4prc(c) : &c) #define _c4presc(s) __c4presc(s.str, s.len) -inline const char *__c4prc(const char c) +inline c4::csubstr _c4prc(const char &C4_RESTRICT c) { switch(c) { - case '\n': return "\\n"; - case '\t': return "\\t"; - case '\0': return "\\0"; - case '\r': return "\\r"; - case '\f': return "\\f"; - case '\b': return "\\b"; - case '\v': return "\\v"; - case '\a': return "\\a"; - default: return nullptr; + case '\n': return c4::csubstr("\\n"); + case '\t': return c4::csubstr("\\t"); + case '\0': return c4::csubstr("\\0"); + case '\r': return c4::csubstr("\\r"); + case '\f': return c4::csubstr("\\f"); + case '\b': return c4::csubstr("\\b"); + case '\v': return c4::csubstr("\\v"); + case '\a': return c4::csubstr("\\a"); + default: return c4::csubstr(&c, 1); } } inline void __c4presc(const char *s, size_t len) diff --git a/src/c4/yml/parse.cpp b/src/c4/yml/parse.cpp index be84c4e8a..65953574e 100644 --- a/src/c4/yml/parse.cpp +++ b/src/c4/yml/parse.cpp @@ -11,7 +11,20 @@ #ifdef RYML_DBG #include "c4/yml/detail/print.hpp" #endif -#define RYML_FILTER_ARENA + +#ifndef RYML_ERRMSG_SIZE + #define RYML_ERRMSG_SIZE 1024 +#endif + +//#define RYML_WITH_TAB_TOKENS +#ifdef RYML_WITH_TAB_TOKENS +#define _RYML_WITH_TAB_TOKENS(...) __VA_ARGS__ +#define _RYML_WITH_OR_WITHOUT_TAB_TOKENS(with, without) with +#else +#define _RYML_WITH_TAB_TOKENS(...) 
+#define _RYML_WITH_OR_WITHOUT_TAB_TOKENS(with, without) without +#endif + #if defined(_MSC_VER) # pragma warning(push) @@ -35,7 +48,7 @@ namespace yml { namespace { template -size_t _parse_dump(DumpFn dumpfn, c4::csubstr fmt, Args&& ...args) +void _parse_dump(DumpFn dumpfn, c4::csubstr fmt, Args&& ...args) { char writebuf[256]; auto results = c4::format_dump_resume(dumpfn, writebuf, fmt, std::forward(args)...); @@ -48,7 +61,6 @@ size_t _parse_dump(DumpFn dumpfn, c4::csubstr fmt, Args&& ...args) results = format_dump_resume(dumpfn, results, writebuf, fmt, std::forward(args)...); } } - return results.bufsize; } bool _is_scalar_next__runk(csubstr s) @@ -63,7 +75,7 @@ bool _is_scalar_next__rseq_rval(csubstr s) bool _is_scalar_next__rmap(csubstr s) { - return !(s.begins_with(": ") || s.begins_with_any("#,!&") || s.begins_with("? ")); + return !(s.begins_with(": ") || s.begins_with_any("#,!&") || s.begins_with("? ") _RYML_WITH_TAB_TOKENS(|| s.begins_with(":\t"))); } bool _is_scalar_next__rmap_val(csubstr s) @@ -357,24 +369,23 @@ void Parser::_fmt_msg(DumpFn &&dumpfn) const if(contents.len) { // print the yaml src line - size_t offs; - if( ! m_file.empty()) - offs = _parse_dump(dumpfn, "{}:{}:{}", m_file, m_state->pos.line, m_state->pos.col); - else - offs = _parse_dump(dumpfn, "{}:{}", m_state->pos.line, m_state->pos.col); + size_t offs = 3u + to_chars(substr{}, m_state->pos.line) + to_chars(substr{}, m_state->pos.col); + if(m_file.len) + { + _parse_dump(dumpfn, "{}:", m_file); + offs += m_file.len + 1; + } + _parse_dump(dumpfn, "{}:{}: ", m_state->pos.line, m_state->pos.col); + csubstr maybe_full_content = (contents.len < 80u ? contents : contents.first(80u)); csubstr maybe_ellipsis = (contents.len < 80u ? csubstr{} : csubstr("...")); - _parse_dump(dumpfn, "{}:{}",m_state->pos.line, m_state->pos.col); - _parse_dump(dumpfn, "{}{} (size={})\n", - (contents.len < 80u ? 
contents : contents.first(80u)), - maybe_ellipsis, - contents.len); + _parse_dump(dumpfn, "{}{} (size={})\n", maybe_full_content, maybe_ellipsis, contents.len); // highlight the remaining portion of the previous line size_t firstcol = (size_t)(lc.rem.begin() - lc.full.begin()); size_t lastcol = firstcol + lc.rem.len; for(size_t i = 0; i < offs + firstcol; ++i) dumpfn(" "); dumpfn("^"); - for(size_t i = 0, e = (lc.rem.len < 80u ? lc.rem.len : 80u); i < e; ++i) + for(size_t i = 1, e = (lc.rem.len < 80u ? lc.rem.len : 80u); i < e; ++i) dumpfn("~"); _parse_dump(dumpfn, "{} (cols {}-{})\n", maybe_ellipsis, firstcol+1, lastcol+1); } @@ -397,9 +408,6 @@ void Parser::_fmt_msg(DumpFn &&dumpfn) const template void Parser::_err(csubstr fmt, Args const& C4_RESTRICT ...args) const { -#ifndef RYML_ERRMSG_SIZE - #define RYML_ERRMSG_SIZE 1024 -#endif char errmsg[RYML_ERRMSG_SIZE]; detail::_SubstrWriter writer(errmsg); auto dumpfn = [&writer](csubstr s){ writer.append(s); }; @@ -473,27 +481,27 @@ void Parser::_handle_line() _RYML_CB_ASSERT(m_stack.m_callbacks, ! 
m_state->line_contents.rem.empty()); if(has_any(RSEQ)) { - if(has_any(EXPL)) + if(has_any(FLOW)) { - if(_handle_seq_expl()) + if(_handle_seq_flow()) return; } else { - if(_handle_seq_impl()) + if(_handle_seq_blck()) return; } } else if(has_any(RMAP)) { - if(has_any(EXPL)) + if(has_any(FLOW)) { - if(_handle_map_expl()) + if(_handle_map_flow()) return; } else { - if(_handle_map_impl()) + if(_handle_map_blck()) return; } } @@ -559,7 +567,7 @@ bool Parser::_handle_unk() } } - if(rem.begins_with("- ")) + if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t"))) { _c4dbgpf("it's a seq (as_child={})", start_as_child); _move_key_anchor_to_val_anchor(); @@ -583,23 +591,23 @@ bool Parser::_handle_unk() } else if(rem.begins_with('[')) { - _c4dbgpf("it's a seq, explicit (as_child={})", start_as_child); + _c4dbgpf("it's a seq, flow (as_child={})", start_as_child); _move_key_anchor_to_val_anchor(); _move_key_tag_to_val_tag(); _push_level(/*explicit flow*/true); _start_seq(start_as_child); - add_flags(EXPL); + add_flags(FLOW); _line_progressed(1); return true; } else if(rem.begins_with('{')) { - _c4dbgpf("it's a map, explicit (as_child={})", start_as_child); + _c4dbgpf("it's a map, flow (as_child={})", start_as_child); _move_key_anchor_to_val_anchor(); _move_key_tag_to_val_tag(); _push_level(/*explicit flow*/true); _start_map(start_as_child); - addrem_flags(EXPL|RKEY, RVAL); + addrem_flags(FLOW|RKEY, RVAL); _line_progressed(1); return true; } @@ -674,7 +682,7 @@ bool Parser::_handle_unk() { _c4dbgpf("got a ',' -- it's a seq (as_child={})", start_as_child); _start_seq(start_as_child); - add_flags(EXPL); + add_flags(FLOW); _append_val(_consume_scalar()); _line_progressed(2); } @@ -682,11 +690,11 @@ bool Parser::_handle_unk() { _c4dbgpf("got a ',' -- it's a seq (as_child={})", start_as_child); _start_seq(start_as_child); - add_flags(EXPL); + add_flags(FLOW); _append_val(_consume_scalar()); _line_progressed(1); } - else if(rem.begins_with(": ")) + else 
if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))) { _c4dbgpf("got a ': ' -- it's a map (as_child={})", start_as_child); _start_map_unk(start_as_child); // wait for the val scalar to append the key-val pair @@ -701,7 +709,7 @@ bool Parser::_handle_unk() } else if(rem.begins_with('}')) { - if(!has_all(RMAP|EXPL)) + if(!has_all(RMAP|FLOW)) { _c4err("invalid token: not reading a map"); } @@ -786,7 +794,7 @@ bool Parser::_handle_unk() } } _store_scalar(scalar, is_quoted); - if(rem.begins_with(": ")) + if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))) { _c4dbgpf("got a ': ' next -- it's a map (as_child={})", start_as_child); _push_level(); @@ -827,24 +835,52 @@ bool Parser::_handle_unk() return false; } + //----------------------------------------------------------------------------- -bool Parser::_handle_seq_expl() +C4_ALWAYS_INLINE void Parser::_skipchars(char c) { - _c4dbgpf("handle_seq_expl: node_id={} level={}", m_state->node_id, m_state->level); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.begins_with(c)); + size_t pos = m_state->line_contents.rem.first_not_of(c); + if(pos == npos) + pos = m_state->line_contents.rem.len; // maybe the line is just whitespace + _c4dbgpf("skip {} '{}'", pos, c); + _line_progressed(pos); +} + +template +C4_ALWAYS_INLINE void Parser::_skipchars(const char (&chars)[N]) +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.begins_with_any(chars)); + size_t pos = m_state->line_contents.rem.first_not_of(chars); + if(pos == npos) + pos = m_state->line_contents.rem.len; // maybe the line is just whitespace + _c4dbgpf("skip {} characters", pos); + _line_progressed(pos); +} + + +//----------------------------------------------------------------------------- +bool Parser::_handle_seq_flow() +{ + _c4dbgpf("handle_seq_flow: node_id={} level={}", m_state->node_id, m_state->level); csubstr rem = m_state->line_contents.rem; _RYML_CB_ASSERT(m_stack.m_callbacks, 
has_none(RKEY)); - _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ|EXPL)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ|FLOW)); if(rem.begins_with(' ')) { // with explicit flow, indentation does not matter _c4dbgp("starts with spaces"); - rem = rem.left_of(rem.first_not_of(' ')); - _c4dbgpf("skip {} spaces", rem.len); - _line_progressed(rem.len); + _skipchars(' '); return true; } + _RYML_WITH_TAB_TOKENS(else if(rem.begins_with('\t')) + { + _c4dbgp("starts with tabs"); + _skipchars('\t'); + return true; + }) else if(rem.begins_with('#')) { _c4dbgp("it's a comment"); @@ -881,7 +917,7 @@ bool Parser::_handle_seq_expl() addrem_flags(RNXT, RVAL); // before _push_level! _push_level(/*explicit flow*/true); _start_seq(); - add_flags(EXPL); + add_flags(FLOW); _line_progressed(1); return true; } @@ -891,7 +927,7 @@ bool Parser::_handle_seq_expl() addrem_flags(RNXT, RVAL); // before _push_level! _push_level(/*explicit flow*/true); _start_map(); - addrem_flags(EXPL|RKEY, RVAL); + addrem_flags(FLOW|RKEY, RVAL); _line_progressed(1); return true; } @@ -902,7 +938,7 @@ bool Parser::_handle_seq_expl() _line_progressed(1); return true; } - else if(rem.begins_with(": ")) + else if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))) { _c4dbgpf("found ': ' -- there's an implicit map in the seq node[{}]", m_state->node_id); _start_seqimap(); @@ -950,7 +986,7 @@ bool Parser::_handle_seq_expl() _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL)); if(rem.begins_with(", ")) { - _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(EXPL)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(FLOW)); _c4dbgp("seq: expect next val"); addrem_flags(RVAL, RNXT); _line_progressed(2); @@ -958,7 +994,7 @@ bool Parser::_handle_seq_expl() } else if(rem.begins_with(',')) { - _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(EXPL)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(FLOW)); _c4dbgp("seq: expect next val"); addrem_flags(RVAL, RNXT); _line_progressed(1); @@ -971,7 +1007,7 @@ bool 
Parser::_handle_seq_expl() _line_progressed(1); return true; } - else if(rem.begins_with(": ")) + else if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))) { _c4dbgpf("found ': ' -- there's an implicit map in the seq node[{}]", m_state->node_id); _start_seqimap(); @@ -992,14 +1028,14 @@ bool Parser::_handle_seq_expl() } //----------------------------------------------------------------------------- -bool Parser::_handle_seq_impl() +bool Parser::_handle_seq_blck() { _c4dbgpf("handle_seq_impl: node_id={} level={}", m_state->node_id, m_state->level); csubstr rem = m_state->line_contents.rem; _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ)); _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY)); - _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(EXPL)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(FLOW)); if(rem.begins_with('#')) { @@ -1013,11 +1049,9 @@ bool Parser::_handle_seq_impl() _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL)); if(_handle_indentation()) - { return true; - } - if(rem.begins_with("- ")) + if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t"))) { _c4dbgp("expect another val"); addrem_flags(RVAL, RNXT); @@ -1034,9 +1068,7 @@ bool Parser::_handle_seq_impl() else if(rem.begins_with_any(" \t")) { _RYML_CB_ASSERT(m_stack.m_callbacks, ! _at_line_begin()); - rem = rem.left_of(rem.first_not_of(" \t")); - _c4dbgpf("skipping {} spaces/tabs", rem.len); - _line_progressed(rem.len); + _skipchars(" \t"); return true; } else if(rem.begins_with("...")) @@ -1070,17 +1102,18 @@ bool Parser::_handle_seq_impl() _c4dbgpf("it's a{} scalar", is_quoted ? 
" quoted" : ""); rem = m_state->line_contents.rem; - if(rem.begins_with(' ')) + if(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(rem.begins_with_any(" \t"), rem.begins_with(' '))) { _c4dbgp("skipping whitespace..."); - size_t skip = rem.first_not_of(' '); + size_t skip = rem.first_not_of(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' ')); if(skip == csubstr::npos) skip = rem.len; // maybe the line is just whitespace _line_progressed(skip); rem = rem.sub(skip); } - if(!rem.begins_with('#') && (rem.begins_with(": ") || rem.ends_with(':'))) + _c4dbgpf("rem=[{}]~~~{}~~~", rem.len, rem); + if(!rem.begins_with('#') && (rem.ends_with(':') || rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))) { _c4dbgp("actually, the scalar is the first key of a map, and it opens a new scope"); if(m_key_anchor.empty()) @@ -1108,7 +1141,7 @@ bool Parser::_handle_seq_impl() } return true; } - else if(rem.begins_with("- ")) + else if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t"))) { if(_rval_dash_start_or_continue_seq()) _line_progressed(2); @@ -1122,21 +1155,21 @@ bool Parser::_handle_seq_impl() } else if(rem.begins_with('[')) { - _c4dbgp("val is a child seq, explicit"); + _c4dbgp("val is a child seq, flow"); addrem_flags(RNXT, RVAL); // before _push_level! _push_level(/*explicit flow*/true); _start_seq(); - add_flags(EXPL); + add_flags(FLOW); _line_progressed(1); return true; } else if(rem.begins_with('{')) { - _c4dbgp("val is a child map, explicit"); + _c4dbgp("val is a child map, flow"); addrem_flags(RNXT, RVAL); // before _push_level! 
_push_level(/*explicit flow*/true); _start_map(); - addrem_flags(EXPL|RKEY, RVAL); + addrem_flags(FLOW|RKEY, RVAL); _line_progressed(1); return true; } @@ -1244,23 +1277,28 @@ bool Parser::_rval_dash_start_or_continue_seq() } //----------------------------------------------------------------------------- -bool Parser::_handle_map_expl() +bool Parser::_handle_map_flow() { // explicit flow, ie, inside {}, separated by commas - _c4dbgpf("handle_map_expl: node_id={} level={}", m_state->node_id, m_state->level); + _c4dbgpf("handle_map_flow: node_id={} level={}", m_state->node_id, m_state->level); csubstr rem = m_state->line_contents.rem; - _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RMAP|EXPL)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RMAP|FLOW)); if(rem.begins_with(' ')) { // with explicit flow, indentation does not matter _c4dbgp("starts with spaces"); - rem = rem.left_of(rem.first_not_of(' ')); - _c4dbgpf("skip {} spaces", rem.len); - _line_progressed(rem.len); + _skipchars(' '); return true; } + _RYML_WITH_TAB_TOKENS(else if(rem.begins_with('\t')) + { + // with explicit flow, indentation does not matter + _c4dbgp("starts with tabs"); + _skipchars('\t'); + return true; + }) else if(rem.begins_with('#')) { _c4dbgp("it's a comment"); @@ -1324,7 +1362,7 @@ bool Parser::_handle_map_expl() _store_scalar(rem, is_quoted); rem = m_state->line_contents.rem; csubstr trimmed = rem.triml(" \t"); - if(trimmed.len && (trimmed.begins_with(": ") || trimmed.begins_with_any(":,}"))) + if(trimmed.len && (trimmed.begins_with(": ") || trimmed.begins_with_any(":,}") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))) { _RYML_CB_ASSERT(m_stack.m_callbacks, trimmed.str >= rem.str); size_t num = static_cast(trimmed.str - rem.str); @@ -1334,7 +1372,7 @@ bool Parser::_handle_map_expl() } } - if(rem.begins_with(": ")) + if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))) { _c4dbgp("wait for val"); addrem_flags(RVAL, RKEY|QMRK); @@ -1458,7 +1496,7 @@ bool 
Parser::_handle_map_expl() _push_level(/*explicit flow*/true); _move_scalar_from_top(); _start_seq(); - add_flags(EXPL); + add_flags(FLOW); _line_progressed(1); return true; } @@ -1469,7 +1507,7 @@ bool Parser::_handle_map_expl() _push_level(/*explicit flow*/true); _move_scalar_from_top(); _start_map(); - addrem_flags(EXPL|RKEY, RNXT|RVAL); + addrem_flags(FLOW|RKEY, RNXT|RVAL); _line_progressed(1); return true; } @@ -1520,13 +1558,13 @@ bool Parser::_handle_map_expl() } //----------------------------------------------------------------------------- -bool Parser::_handle_map_impl() +bool Parser::_handle_map_blck() { _c4dbgpf("handle_map_impl: node_id={} level={}", m_state->node_id, m_state->level); csubstr rem = m_state->line_contents.rem; _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RMAP)); - _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(EXPL)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(FLOW)); if(rem.begins_with('#')) { @@ -1587,10 +1625,11 @@ bool Parser::_handle_map_impl() } else if(rem.begins_with_any(" \t")) { - //_RYML_CB_ASSERT(m_stack.m_callbacks, ! _at_line_begin()); - rem = rem.left_of(rem.first_not_of(" \t")); - _c4dbgpf("skip {} spaces/tabs", rem.len); - _line_progressed(rem.len); + size_t pos = rem.first_not_of(" \t"); + if(pos == npos) + pos = rem.len; + _c4dbgpf("skip {} spaces/tabs", pos); + _line_progressed(pos); return true; } else if(rem == '?' || rem.begins_with("? ")) @@ -1613,13 +1652,11 @@ bool Parser::_handle_map_impl() if(rem.begins_with(' ')) { _RYML_CB_ASSERT(m_stack.m_callbacks, ! 
_at_line_begin()); - rem = rem.left_of(rem.first_not_of(' ')); - _c4dbgpf("skip {} spaces", rem.len); - _line_progressed(rem.len); + _skipchars(' '); } return true; } - else if(rem == ':' || rem.begins_with(": ") ) + else if(rem == ':' || rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))) { _c4dbgp("key finished"); if(!has_all(SSCL)) @@ -1724,23 +1761,23 @@ bool Parser::_handle_map_impl() } else if(rem.begins_with('[')) { - _c4dbgp("val is a child seq, explicit"); + _c4dbgp("val is a child seq, flow"); addrem_flags(RKEY, RVAL); // before _push_level! _push_level(/*explicit flow*/true); _move_scalar_from_top(); _start_seq(); - add_flags(EXPL); + add_flags(FLOW); _line_progressed(1); return true; } else if(rem.begins_with('{')) { - _c4dbgp("val is a child map, explicit"); + _c4dbgp("val is a child map, flow"); addrem_flags(RKEY, RVAL); // before _push_level! _push_level(/*explicit flow*/true); _move_scalar_from_top(); _start_map(); - addrem_flags(EXPL|RKEY, RVAL); + addrem_flags(FLOW|RKEY, RVAL); _line_progressed(1); return true; } @@ -2288,16 +2325,22 @@ bool Parser::_scan_scalar(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted) _c4dbgp("RSEQ|RVAL"); if( ! _is_scalar_next__rseq_rval(s)) return false; - s = s.left_of(s.find(" #")); // is there a comment? - s = s.left_of(s.find(": ")); // is there a key-value? 
if(s.ends_with(':')) - s = s.left_of(s.len-1); - if(has_all(EXPL)) + { + --s.len; + } + else + { + auto first = s.first_of_any(": " _RYML_WITH_TAB_TOKENS( , ":\t"), " #"); + if(first) + s.len = first.pos; + } + if(has_all(FLOW)) { _c4dbgp("RSEQ|RVAL|EXPL"); s = s.left_of(s.first_of(",]")); } - s = s.trimr(' '); + s = s.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' ')); } else { @@ -2311,10 +2354,23 @@ bool Parser::_scan_scalar(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted) size_t colon_space = s.find(": "); if(colon_space == npos) { - colon_space = s.find(":"); - _RYML_CB_ASSERT(m_stack.m_callbacks, s.len > 0); - if(colon_space != s.len-1) - colon_space = npos; + _RYML_WITH_OR_WITHOUT_TAB_TOKENS( + // with tab tokens + colon_space = s.find(":\t"); + if(colon_space == npos) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, s.len > 0); + colon_space = s.find(':'); + if(colon_space != s.len-1) + colon_space = npos; + } + , + // without tab tokens + colon_space = s.find(':'); + _RYML_CB_ASSERT(m_stack.m_callbacks, s.len > 0); + if(colon_space != s.len-1) + colon_space = npos; + ) } if(has_all(RKEY)) @@ -2328,7 +2384,7 @@ bool Parser::_scan_scalar(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted) return false; s = s.left_of(colon_space); s = s.left_of(s.first_of("#")); - if(has_any(EXPL)) + if(has_any(FLOW)) s = s.left_of(s.first_of(':')); s = s.trimr(" \t"); if(s.begins_with("---")) @@ -2343,8 +2399,8 @@ bool Parser::_scan_scalar(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted) if(s.begins_with("? ") || s == '?') return false; s = s.left_of(colon_space); - s = s.trimr(' '); - if(has_any(EXPL)) + s = s.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' ')); + if(has_any(FLOW)) { _c4dbgpf("RMAP|RKEY|EXPL: '{}'", s); s = s.left_of(s.first_of(",}")); @@ -2366,12 +2422,10 @@ bool Parser::_scan_scalar(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted) _c4dbgp("RMAP|RVAL"); _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(QMRK)); if( ! 
_is_scalar_next__rmap_val(s)) - { return false; - } s = s.left_of(s.find(" #")); // is there a comment? s = s.left_of(s.find("\t#")); // is there a comment? - if(has_any(EXPL)) + if(has_any(FLOW)) { _c4dbgp("RMAP|RVAL|EXPL"); if(has_none(RSEQIMAP)) @@ -2379,7 +2433,7 @@ bool Parser::_scan_scalar(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted) else s = s.left_of(s.first_of(",]")); } - s = s.trim(' '); + s = s.trim(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' ')); if(s.begins_with("---")) return false; else if(s.begins_with("...")) @@ -2392,7 +2446,7 @@ bool Parser::_scan_scalar(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted) } else if(has_all(RUNK)) { - _c4dbgp("RUNK"); + _c4dbgpf("RUNK '[{}]~~~{}~~~", s.len, s); if( ! _is_scalar_next__runk(s)) { _c4dbgp("RUNK: no scalar next"); @@ -2404,6 +2458,10 @@ bool Parser::_scan_scalar(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted) s = s.left_of(pos); else if(s.ends_with(':')) s = s.left_of(s.len-1); + _RYML_WITH_TAB_TOKENS( + else if((pos = s.find(":\t")) != npos) // TABS + s = s.left_of(pos); + ) else s = s.left_of(s.first_of(',')); s = s.trim(" \t"); @@ -2440,8 +2498,8 @@ csubstr Parser::_extend_scanned_scalar(csubstr s) { if(has_all(RMAP|RKEY|QMRK)) { - size_t scalar_indentation = has_any(EXPL) ? 0 : m_state->scalar_col; - _c4dbgpf("extend_scalar: complex key! indref={} scalar_indentation={} scalar_col={}", m_state->indref, scalar_indentation, m_state->scalar_col); + size_t scalar_indentation = has_any(FLOW) ? 0 : m_state->scalar_col; + _c4dbgpf("extend_scalar: explicit key! 
indref={} scalar_indentation={} scalar_col={}", m_state->indref, scalar_indentation, m_state->scalar_col); csubstr n = _scan_to_next_nonempty_line(scalar_indentation); if(!n.empty()) { @@ -2454,7 +2512,7 @@ csubstr Parser::_extend_scanned_scalar(csubstr s) else if(!s.begins_with_any("*")) // cannot be a plain scalar if it starts with * (that's an anchor reference) { _c4dbgpf("extend_scalar: line ended, scalar='{}'", s); - if(has_none(EXPL)) + if(has_none(FLOW)) { size_t scalar_indentation = m_state->indref + 1; if(has_all(RUNK) && scalar_indentation == 1) @@ -2464,19 +2522,19 @@ csubstr Parser::_extend_scanned_scalar(csubstr s) { _c4dbgpf("rscalar[IMPL]: state_indref={} state_indentation={} scalar_indentation={}", m_state->indref, m_state->line_contents.indentation, scalar_indentation); _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.full.is_super(n)); - substr full = _scan_plain_scalar_impl(s, n, scalar_indentation); + substr full = _scan_plain_scalar_blck(s, n, scalar_indentation); if(full.len >= s.len) s = _filter_plain_scalar(full, scalar_indentation); } } else { - _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(EXPL)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(FLOW)); csubstr n = _scan_to_next_nonempty_line(/*indentation*/0); if(!n.empty()) { - _c4dbgp("rscalar[EXPL]"); - substr full = _scan_plain_scalar_expl(s, n); + _c4dbgp("rscalar[FLOW]"); + substr full = _scan_plain_scalar_flow(s, n); s = _filter_plain_scalar(full, /*indentation*/0); } } @@ -2488,7 +2546,7 @@ csubstr Parser::_extend_scanned_scalar(csubstr s) //----------------------------------------------------------------------------- -substr Parser::_scan_plain_scalar_expl(csubstr currscalar, csubstr peeked_line) +substr Parser::_scan_plain_scalar_flow(csubstr currscalar, csubstr peeked_line) { static constexpr const csubstr chars = "[]{}?#,"; size_t pos = peeked_line.first_of(chars); @@ -2546,7 +2604,7 @@ substr Parser::_scan_plain_scalar_expl(csubstr currscalar, csubstr peeked_line) 
//----------------------------------------------------------------------------- -substr Parser::_scan_plain_scalar_impl(csubstr currscalar, csubstr peeked_line, size_t indentation) +substr Parser::_scan_plain_scalar_blck(csubstr currscalar, csubstr peeked_line, size_t indentation) { _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(currscalar)); // NOTE. there's a problem with _scan_to_next_nonempty_line(), as it counts newlines twice @@ -2961,9 +3019,9 @@ void Parser::_push_level(bool explicit_flow_chars) return; } flag_t st = RUNK; - if(explicit_flow_chars || has_all(EXPL)) + if(explicit_flow_chars || has_all(FLOW)) { - st |= EXPL; + st |= FLOW; } m_stack.push_top(); m_state = &m_stack.top(); @@ -3099,7 +3157,7 @@ void Parser::_end_stream() _c4err("internal error"); } } - else if(has_all(RSEQ|RVAL) && has_none(EXPL)) + else if(has_all(RSEQ|RVAL) && has_none(FLOW)) { _c4dbgp("add last..."); added = _append_val_null(m_state->line_contents.rem.str); @@ -3157,7 +3215,7 @@ void Parser::_end_stream() { _c4dbgpf("popping level: {} (stack sz={})", m_state->level, m_stack.size()); _RYML_CB_ASSERT(m_stack.m_callbacks, ! has_any(SSCL, &m_stack.top())); - if(has_all(RSEQ|EXPL)) + if(has_all(RSEQ|FLOW)) _err("closing ] not found"); _pop_level(); } @@ -3375,7 +3433,7 @@ void Parser::_stop_seq() void Parser::_start_seqimap() { _c4dbgpf("start_seqimap at node={}. has_children={}", m_state->node_id, m_tree->has_children(m_state->node_id)); - _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ|EXPL)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ|FLOW)); // create a map, and turn the last scalar of this sequence // into the key of the map's first child. 
This scalar was // understood to be a value in the sequence, but it is @@ -3403,7 +3461,7 @@ void Parser::_start_seqimap() _start_map(); _store_scalar_null(m_state->line_contents.rem.str); } - add_flags(RSEQIMAP|EXPL); + add_flags(RSEQIMAP|FLOW); } void Parser::_stop_seqimap() @@ -3508,7 +3566,7 @@ void Parser::_move_scalar_from_top() /** @todo this function is a monster and needs love. */ bool Parser::_handle_indentation() { - _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(EXPL)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(FLOW)); if( ! _at_line_begin()) return false; @@ -4997,7 +5055,7 @@ csubstr Parser::_prfl(substr buf, flag_t flags) _prflag(RUNK); _prflag(RMAP); _prflag(RSEQ); - _prflag(EXPL); + _prflag(FLOW); _prflag(QMRK); _prflag(RKEY); _prflag(RVAL); diff --git a/src/c4/yml/parse.hpp b/src/c4/yml/parse.hpp index c5fc12e6b..692115fe5 100644 --- a/src/c4/yml/parse.hpp +++ b/src/c4/yml/parse.hpp @@ -288,8 +288,8 @@ class RYML_EXPORT Parser csubstr _scan_squot_scalar(); csubstr _scan_dquot_scalar(); csubstr _scan_block(); - substr _scan_plain_scalar_impl(csubstr currscalar, csubstr peeked_line, size_t indentation); - substr _scan_plain_scalar_expl(csubstr currscalar, csubstr peeked_line); + substr _scan_plain_scalar_blck(csubstr currscalar, csubstr peeked_line, size_t indentation); + substr _scan_plain_scalar_flow(csubstr currscalar, csubstr peeked_line); substr _scan_complex_key(csubstr currscalar, csubstr peeked_line); csubstr _scan_to_next_nonempty_line(size_t indentation); csubstr _extend_scanned_scalar(csubstr currscalar); @@ -310,10 +310,10 @@ class RYML_EXPORT Parser bool _handle_indentation(); bool _handle_unk(); - bool _handle_map_expl(); - bool _handle_map_impl(); - bool _handle_seq_expl(); - bool _handle_seq_impl(); + bool _handle_map_flow(); + bool _handle_map_blck(); + bool _handle_seq_flow(); + bool _handle_seq_blck(); bool _handle_top(); bool _handle_types(); bool _handle_key_anchors_and_refs(); @@ -363,6 +363,10 @@ class RYML_EXPORT Parser 
void _write_key_anchor(size_t node_id); void _write_val_anchor(size_t node_id); + void _skipchars(char c); + template + void _skipchars(const char (&chars)[N]); + private: static size_t _count_nlines(csubstr src); @@ -374,7 +378,7 @@ class RYML_EXPORT Parser RUNK = 0x01 << 1, ///< reading an unknown: must determine whether scalar, map or seq RMAP = 0x01 << 2, ///< reading a map RSEQ = 0x01 << 3, ///< reading a seq - EXPL = 0x01 << 4, ///< reading is inside explicit flow chars: [] or {} + FLOW = 0x01 << 4, ///< reading is inside explicit flow chars: [] or {} QMRK = 0x01 << 5, ///< reading an explicit key (`? key`) RKEY = 0x01 << 6, ///< reading a scalar as key RVAL = 0x01 << 7, ///< reading a scalar as val diff --git a/test/test_seq_of_map.cpp b/test/test_seq_of_map.cpp index 63d0ad54a..ec1bacf04 100644 --- a/test/test_seq_of_map.cpp +++ b/test/test_seq_of_map.cpp @@ -132,26 +132,25 @@ TEST(seq_of_map, missing_scalars_v3) EXPECT_EQ(t["a"][1].first_child().val(), nullptr); } -TEST(explicit_key, test_suite_NJ66) +#ifdef RYML_WITH_TAB_TOKENS +TEST(seq_of_map, test_suite_6BCT) { - csubstr yaml = R"( -- { single line: value} -- { multi - line: value} -- { multi - line: value} -)"; - test_check_emit_check(yaml, [](Tree const &t){ - ASSERT_TRUE(t.rootref().is_seq()); - ASSERT_EQ(t.rootref().num_children(), 3u); - ASSERT_TRUE(t[0].has_child("single line")); - ASSERT_TRUE(t[1].has_child("multi line")); - ASSERT_TRUE(t[2].has_child("multi line")); - EXPECT_EQ(t[0]["single line"].val(), csubstr("value")); - EXPECT_EQ(t[1]["multi line"].val(), csubstr("value")); - EXPECT_EQ(t[2]["multi line"].val(), csubstr("value")); - }); + Tree t = parse_in_arena(R"( +- foo0: bar0 +- foo1 : bar1 +- foo2 : bar2 +)"); + #ifdef RYML_DBG + print_tree(t); + #endif + ASSERT_TRUE(t[0].is_map()); + ASSERT_TRUE(t[1].is_map()); + ASSERT_TRUE(t[2].is_map()); + EXPECT_EQ(t[0]["foo0"].val(), csubstr("bar0")); + EXPECT_EQ(t[1]["foo1"].val(), csubstr("bar1")); + EXPECT_EQ(t[2]["foo2"].val(), 
csubstr("bar2")); } +#endif //----------------------------------------------------------------------------- @@ -275,7 +274,7 @@ L{N(KEYSEQ|KEYQUO, "implicit block key", L{ N(L{N(KEYSEQ|KEYQUO, "implicit flow key s", L{N("val1"), N("val2")})}), })}); -/* TODO JAVAI 209 + ADD_CASE_TO_GROUP("seq of maps, implicit map in seq, missing scalar", R"({a : [ : foo @@ -294,7 +293,7 @@ L{ N("b", L{N(MAP, L{N("", "foo")}),}), N("c", L{N(MAP, L{N(KEYVAL, "", {})}), N(MAP, L{N(KEYVAL, "", {})}),}), }); -*/ + ADD_CASE_TO_GROUP("seq of maps, implicit with anchors, unresolved", R"( @@ -310,6 +309,7 @@ L{ N(L{N("*a1", AR(KEYREF, "*a1"), "w1"), N("*a2", AR(KEYREF, "*a2"), "w2"), N("*a3", AR(KEYREF, "*a3"), "w3")}), }); + ADD_CASE_TO_GROUP("seq of maps, implicit with anchors, resolved", RESOLVE_REFS, R"( - &a1 a1: v1 diff --git a/test/test_simple_map.cpp b/test/test_simple_map.cpp index f6a4a3c50..8ee021fd6 100644 --- a/test/test_simple_map.cpp +++ b/test/test_simple_map.cpp @@ -207,6 +207,94 @@ TEST(simple_map, no_seq_key_block) } #endif +#ifdef RYML_WITH_TAB_TOKENS +TEST(simple_map, block_tab_tokens) +{ + Tree tree = parse_in_arena(R"( +--- # block, spaces only +a: 0 +b: 1 +c: 2 +--- # block, tabs after token +a: 0 +b: 1 +c: 2 +--- # block, tabs before and after token +a : 0 +b : 1 +c : 2 +--- # block, tabs before token +a : 0 +b : 1 +c : 2 +--- # block, tabs before newline +a : 0 +b : 1 +c : 2 +)"); + EXPECT_EQ(tree.docref(0)["a"].val(), csubstr("0")); + EXPECT_EQ(tree.docref(0)["b"].val(), csubstr("1")); + EXPECT_EQ(tree.docref(0)["c"].val(), csubstr("2")); + EXPECT_EQ(tree.docref(1)["a"].val(), csubstr("0")); + EXPECT_EQ(tree.docref(1)["b"].val(), csubstr("1")); + EXPECT_EQ(tree.docref(1)["c"].val(), csubstr("2")); + EXPECT_EQ(tree.docref(2)["a"].val(), csubstr("0")); + EXPECT_EQ(tree.docref(2)["b"].val(), csubstr("1")); + EXPECT_EQ(tree.docref(2)["c"].val(), csubstr("2")); + EXPECT_EQ(tree.docref(3)["a"].val(), csubstr("0")); + EXPECT_EQ(tree.docref(3)["b"].val(), csubstr("1")); 
+ EXPECT_EQ(tree.docref(3)["c"].val(), csubstr("2")); + EXPECT_EQ(tree.docref(4)["a"].val(), csubstr("0")); + EXPECT_EQ(tree.docref(4)["b"].val(), csubstr("1")); + EXPECT_EQ(tree.docref(4)["c"].val(), csubstr("2")); +} + +TEST(simple_map, flow_tab_tokens) +{ + Tree tree = parse_in_arena(R"( +--- # flow, no tabs +{a: 0, b: 1, c: 2} +--- # flow, tabs after token +{a: 0, b: 1, c: 2} +--- # flow, tabs before and after token +{a : 0, b : 1, c : 2} +--- # flow, tabs before token +{a : 0, b : 1, c : 2} +--- # flow, tabs after val +{a : 0 , b : 1 , c : 2 } +--- # flow, tabs after val and comma +{a : 0 , b : 1 , c : 2 } +--- # flow, tabs everywhere + { + a : 0 , + b : 1 , + c : 2 + } + )"); + EXPECT_EQ(tree.docref(0)["a"].val(), csubstr("0")); + EXPECT_EQ(tree.docref(0)["b"].val(), csubstr("1")); + EXPECT_EQ(tree.docref(0)["c"].val(), csubstr("2")); + EXPECT_EQ(tree.docref(1)["a"].val(), csubstr("0")); + EXPECT_EQ(tree.docref(1)["b"].val(), csubstr("1")); + EXPECT_EQ(tree.docref(1)["c"].val(), csubstr("2")); + EXPECT_EQ(tree.docref(2)["a"].val(), csubstr("0")); + EXPECT_EQ(tree.docref(2)["b"].val(), csubstr("1")); + EXPECT_EQ(tree.docref(2)["c"].val(), csubstr("2")); + EXPECT_EQ(tree.docref(3)["a"].val(), csubstr("0")); + EXPECT_EQ(tree.docref(3)["b"].val(), csubstr("1")); + EXPECT_EQ(tree.docref(3)["c"].val(), csubstr("2")); + EXPECT_EQ(tree.docref(4)["a"].val(), csubstr("0")); + EXPECT_EQ(tree.docref(4)["b"].val(), csubstr("1")); + EXPECT_EQ(tree.docref(4)["c"].val(), csubstr("2")); + EXPECT_EQ(tree.docref(5)["a"].val(), csubstr("0")); + EXPECT_EQ(tree.docref(5)["b"].val(), csubstr("1")); + EXPECT_EQ(tree.docref(5)["c"].val(), csubstr("2")); + EXPECT_EQ(tree.docref(6)["a"].val(), csubstr("0")); + EXPECT_EQ(tree.docref(6)["b"].val(), csubstr("1")); + EXPECT_EQ(tree.docref(6)["c"].val(), csubstr("2")); +} +#endif // RYML_WITH_TAB_TOKENS + //----------------------------------------------------------------------------- 
//----------------------------------------------------------------------------- diff --git a/test/test_simple_seq.cpp b/test/test_simple_seq.cpp index 5404821ef..e505c6f96 100644 --- a/test/test_simple_seq.cpp +++ b/test/test_simple_seq.cpp @@ -109,6 +109,63 @@ TEST(simple_seq, deeply_nested_to_cover_parse_stack_resizes) } +#ifdef RYML_WITH_TAB_TOKENS +TEST(simple_seq, block_tab_tokens) +{ + Tree tree = parse_in_arena(R"( +--- # block, spaces only +- 0 +- 1 +- 2 +--- # block, tabs after +- 0 +- 1 +- 2 +--- # block, tabs after token, and after val +- 0 +- 1 +- 2 +)"); + EXPECT_EQ(tree.docref(0)[0].val(), csubstr("0")); + EXPECT_EQ(tree.docref(0)[1].val(), csubstr("1")); + EXPECT_EQ(tree.docref(0)[2].val(), csubstr("2")); + EXPECT_EQ(tree.docref(1)[0].val(), csubstr("0")); + EXPECT_EQ(tree.docref(1)[1].val(), csubstr("1")); + EXPECT_EQ(tree.docref(1)[2].val(), csubstr("2")); +} + +TEST(simple_seq, flow_tab_tokens) +{ + Tree tree = parse_in_arena(R"( +--- # flow, no tabs +[0, 1, 2] +--- # flow, tabs after +[0, 1, 2] +--- # flow, tabs before and after +[0 , 1 , 2] +--- # flow, tabs everywhere + [ + 0 , + 1 , + 2 , + ] +)"); + EXPECT_EQ(tree.docref(0)[0].val(), csubstr("0")); + EXPECT_EQ(tree.docref(0)[1].val(), csubstr("1")); + EXPECT_EQ(tree.docref(0)[2].val(), csubstr("2")); + EXPECT_EQ(tree.docref(1)[0].val(), csubstr("0")); + EXPECT_EQ(tree.docref(1)[1].val(), csubstr("1")); + EXPECT_EQ(tree.docref(1)[2].val(), csubstr("2")); + EXPECT_EQ(tree.docref(2)[0].val(), csubstr("0")); + EXPECT_EQ(tree.docref(2)[1].val(), csubstr("1")); + EXPECT_EQ(tree.docref(2)[2].val(), csubstr("2")); + EXPECT_EQ(tree.docref(3)[0].val(), csubstr("0")); + EXPECT_EQ(tree.docref(3)[1].val(), csubstr("1")); + EXPECT_EQ(tree.docref(3)[2].val(), csubstr("2")); +} +#endif // RYML_WITH_TAB_TOKENS + + //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- 
//----------------------------------------------------------------------------- @@ -413,6 +470,11 @@ L{ } ); +#ifdef RYML_WITH_TAB_TOKENS +#define _ryml_with_or_without_tabs(with, without) with +#else +#define _ryml_with_or_without_tabs(with, without) without +#endif ADD_CASE_TO_GROUP("simple seq expl, scalars with special chars, colon", R"( - [[], :@] @@ -429,7 +491,7 @@ L{ N(L{N(SEQ), N(":^")}), N(L{N(SEQ), N(":$")}), N(L{N(SEQ), N("::")}), - N(L{N(SEQ), N(": ")}), + N(L{N(SEQ), _ryml_with_or_without_tabs(N(MAP, L{N("", "")}), N(": "))}), N(L{N(SEQ), N(":`")}), } ); diff --git a/test/test_suite/test_suite_parts.cpp b/test/test_suite/test_suite_parts.cpp index a513ec594..2ea3a7818 100644 --- a/test/test_suite/test_suite_parts.cpp +++ b/test/test_suite/test_suite_parts.cpp @@ -110,6 +110,10 @@ constexpr const AllowedFailure allowed_failures[] = { // These tests are skipped because they cover parts of YAML that // are deliberately not implemented by ryml. + #ifndef RYML_WITH_TAB_TOKENS // - or : are supported only when the above macro is defined + _("6BCT-in_yaml" , "tabs after - or :"), + _("J3BT-in_yaml-events" , "tabs after - or :"), + #endif // container keys are not supported _("4FJ6-in_yaml" , "only scalar keys allowed (keys cannot be maps or seqs)"), _("4FJ6-out_yaml" , "only scalar keys allowed (keys cannot be maps or seqs)"), @@ -126,10 +130,10 @@ constexpr const AllowedFailure allowed_failures[] = { _("KZN9-out_yaml" , "only scalar keys allowed (keys cannot be maps or seqs)"), _("LX3P-in_yaml" , "only scalar keys allowed (keys cannot be maps or seqs)"), _("LX3P-out_yaml" , "only scalar keys allowed (keys cannot be maps or seqs)"), - _("M2N8_00-in_yaml" , "only scalar keys allowed (keys cannot be maps or seqs)"),// FIXME but only case 1 - _("M2N8_00-out_yaml" , "only scalar keys allowed (keys cannot be maps or seqs)"),// FIXME but only case 1 - _("M2N8_01-in_yaml-events" , "only scalar keys allowed (keys cannot be maps or seqs)"),// FIXME but only case 1 - 
_("M2N8_01-out_yaml-events", "only scalar keys allowed (keys cannot be maps or seqs)"),// FIXME but only case 1 + _("M2N8_00-in_yaml" , "only scalar keys allowed (keys cannot be maps or seqs)"), + _("M2N8_00-out_yaml" , "only scalar keys allowed (keys cannot be maps or seqs)"), + _("M2N8_01-in_yaml-events" , "only scalar keys allowed (keys cannot be maps or seqs)"), + _("M2N8_01-out_yaml-events", "only scalar keys allowed (keys cannot be maps or seqs)"), _("M5DY-in_yaml" , "only scalar keys allowed (keys cannot be maps or seqs)"), _("M5DY-out_yaml" , "only scalar keys allowed (keys cannot be maps or seqs)"), _("Q9WF-in_yaml" , "only scalar keys allowed (keys cannot be maps or seqs)"), @@ -144,9 +148,6 @@ constexpr const AllowedFailure allowed_failures[] = { _("X38W-out_yaml" , "only scalar keys allowed (keys cannot be maps or seqs)"), _("XW4D-in_yaml" , "only scalar keys allowed (keys cannot be maps or seqs)"), _("XW4D-out_yaml" , "only scalar keys allowed (keys cannot be maps or seqs)"), - // tabs after - or : are not supported - _("6BCT-in_yaml" , "tabs after - or :"), - _("J3BT-in_yaml-events" , "tabs after - or :"), // anchors with : are not supported _("2SXE-in_yaml-events" , "weird characters in anchors, anchors must not end with :"), // tags are parsed as-is; tag lookup is not supported