From bfb5397b1fda493e2e1b7d3eddf7f0c75be6a6cf Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 5 Oct 2023 22:56:49 -0700 Subject: [PATCH] adds comments on the recovering fst behaviour changes --- cpp/src/io/json/nested_json_gpu.cu | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 08a890959e3..c9107357239 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -697,7 +697,9 @@ auto get_transition_table(json_format_cfg_t format) /** * @brief Getting the translation table - * @param recover_from_error Whether or not the tokenizer should recover from invalid lines + * @param recover_from_error Whether or not the tokenizer should recover from invalid lines. If + * `recover_from_error` is true, invalid JSON lines end with the token sequence (`ErrorBegin`, + * `LineEnd`) and incomplete JSON lines (e.g., `{"a":123\n`) are treated as invalid lines. */ auto get_translation_table(bool recover_from_error) { @@ -716,8 +718,11 @@ auto get_translation_table(bool recover_from_error) constexpr auto ErrorBegin = token_t::ErrorBegin; /** - * @brief If and only if `recover_from_error` is true, `recovering_tokens` are returned along with - * a token_t::LineEnd token, otherwise `regular_tokens` is returned. + * @brief Instead of specifying the verbose translation tables twice (i.e., once when + * `recover_from_error` is true and once when it is false), we use `nl_tokens` to specialize the + * translation table where it differs depending on the `recover_from_error` option. If and only if + * `recover_from_error` is true, `recovering_tokens` are returned along with a token_t::LineEnd + * token, otherwise `regular_tokens` is returned. */ auto nl_tokens = [recover_from_error](std::vector regular_tokens, std::vector recovering_tokens) {