From ec838dbffa312aa88bf722b6f8e7b9d1f2626e0f Mon Sep 17 00:00:00 2001 From: Shawn Garbett Date: Tue, 22 Feb 2022 13:12:55 -0600 Subject: [PATCH] This seems to solve a lot of UTF-8 issues. Needs performance evaluation. --- inst/tests/test_as_yaml.R | 38 +++++++++++++++++++++++++++--------- inst/tests/test_write_yaml.R | 15 ++++++++++++-- src/r_emit.c | 15 ++++++++++---- 3 files changed, 53 insertions(+), 15 deletions(-) diff --git a/inst/tests/test_as_yaml.R b/inst/tests/test_as_yaml.R index 868d2ac..cf61d15 100644 --- a/inst/tests/test_as_yaml.R +++ b/inst/tests/test_as_yaml.R @@ -457,18 +457,38 @@ test_no_dots_at_end <- function() } # UTF-8 testing -test_latin1_strings <- function() { - latin1_in <- list("\xAB", "\xBF") +test_latin1_strings_no_enc <- function() { + latin1_in <- list("\xAB\xF4", "\xBF\xE9") + checkEquals("- \n- \n", as.yaml(latin1_in)) +} + +test_latin1_names_no_enc <- function() { + latin1_in <- list("a", "b") + names(latin1_in) <- c("\xE9", "\xF4") as.yaml(latin1_in) + checkEquals(": a\n: b\n", as.yaml(latin1_in)) +} + +test_latin1_strings_enc <- function() { + x <- "\xAB\xF4" + y <- "\xBF\xE9" + Encoding(x) <- "latin1" + Encoding(y) <- "latin1" + latin1_in <- list(x,y) + + checkEquals("- «ô\n- ¿é\n", as.yaml(latin1_in)) } -# This cause a full crash -# -# test_latin1_names <- function() { -# latin1_in <- list("a", "b") -# names(latin1_in) <- c("\xAB", "\xBF") -# as.yaml(latin1_in) -# } +test_latin1_names_enc <- function() { + latin1_in <- list("a", "b") + x <- "\xAB\xF4" + y <- "\xBF\xE9" + Encoding(x) <- "latin1" + Encoding(y) <- "latin1" + names(latin1_in) <- c(x,y) + + checkEquals("«ô: a\n¿é: b\n", as.yaml(latin1_in)) +} diff --git a/inst/tests/test_write_yaml.R b/inst/tests/test_write_yaml.R index 530a6fa..5f9f47a 100644 --- a/inst/tests/test_write_yaml.R +++ b/inst/tests/test_write_yaml.R @@ -7,11 +7,22 @@ test_output_is_written_to_a_file_when_a_filename_is_specified <- function() { } # Latin1 UTF-8 test -test_output_from_latin1_works <- function() +test_output_from_latin1_no_enc_works <- function() { filename <- tempfile() write_yaml(list("\xab"), filename) output <- readLines(filename) unlink(filename) - checkEquals(c("- \xab"), output) + checkEquals(c("- "), output) +} + +test_output_from_latin1_enc_works <- function() +{ + x <- "\xab\xef" + Encoding(x) <- "latin1" + filename <- tempfile() + write_yaml(list(x), filename) + output <- readLines(filename) + unlink(filename) + checkEquals("- «ï", output) } \ No newline at end of file diff --git a/src/r_emit.c b/src/r_emit.c index 22e0abf..61198f7 100644 --- a/src/r_emit.c +++ b/src/r_emit.c @@ -335,13 +335,19 @@ emit_char(emitter, event, s_obj, tag, implicit_tag, scalar_style) yaml_emitter_t *emitter; yaml_event_t *event; SEXP s_obj; - char *tag; + const char *tag; int implicit_tag; yaml_scalar_style_t scalar_style; { +#ifdef DEBUG + Rprintf("emit_char length=%d\n", LENGTH(s_obj)); +#endif + const void *vmax=vmaxget(); + const char *trans=Rf_translateCharUTF8(s_obj); yaml_scalar_event_initialize(event, NULL, (yaml_char_t *)tag, - (yaml_char_t *)Rf_translateCharUTF8(s_obj), LENGTH(s_obj), + (yaml_char_t *)trans, strlen(trans), implicit_tag, implicit_tag, scalar_style); + vmaxset(vmax); return yaml_emitter_emit(emitter, event); } @@ -351,7 +357,7 @@ emit_string(emitter, event, s_obj, tag, implicit_tag) yaml_emitter_t *emitter; yaml_event_t *event; SEXP s_obj; - char *tag; + const char *tag; int implicit_tag; { SEXP s_new_obj = NULL, s_chr = NULL, quoted = NULL; @@ -396,7 +402,7 @@ emit_factor(emitter, event, s_obj, tag, implicit_tag) yaml_emitter_t *emitter; yaml_event_t *event; SEXP s_obj; - char *tag; + const char *tag; int implicit_tag; { SEXP s_levels = NULL, s_level_chr = NULL; @@ -466,6 +472,7 @@ emit_object(emitter, event, s_obj, omap, column_major, precision, s_handlers) #if DEBUG Rprintf("=== Emitting ===\n"); PrintValue(s_obj); + Rprintf("TYPEOF: %d\n", TYPEOF(s_obj)); #endif /* Look for custom handler by class */