Skip to content

Commit

Permalink
Rework filepath (re-)encoding (#438)
Browse files Browse the repository at this point in the history
* Revert filepath reencoding

* Unskip this test

* Re-encode to native just prior to fopen() or mio::make_mmap_source()

* Expose the bytes

* Unconditionally encode paths as UTF-8

* Clean up comments, remove print debugging

* Ensure path returned by `chr_to_file()` is encoded as UTF-8

This guards against the scenario where the tempdir's path has non-ascii characters in it.

Presumably that could arise on, say, Windows if the user name has non-ascii characters:

> tempdir()
[1] "C:\\Users\\jenny\\AppData\\Local\\Temp\\Rtmpg30qBQ"

* Add more `enc2utf8()`

Everytime we use a base R path-handling function, explicitly re-encode the result as UTF-8.

* Inline the reference object

* Test writing to a non-ascii path

* Test read/write of .zip with non-ascii characters in path

* Rewrite this test

* I now think the problem IS in archive, for Windows and unix

Skip this test in non-UTF-8 locales for now

* Add a test re: reading fwf from non-ascii filepath

* Tweaks NEWS

* Make versions of basename() and normalizePath() that uphold UTF-8 everywhere
  • Loading branch information
jennybc authored May 17, 2022
1 parent 0c90270 commit 85143f7
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 30 deletions.
6 changes: 3 additions & 3 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# vroom (development version)

* `vroom()` reads more reliably from filepaths containing non-ascii characters, in a non-UTF-8 locale (#394, #438).

* `vroom_format()` and `vroom_write()` only quote values that contain a
delimiter, quote, or newline. Specifically values that are equal to the `na`
string (or that start with it) are no longer quoted (#426).

* `vroom()` reads more reliably from filepaths containing non-ascii characters (#394).


* Fixed segfault when reading in multiple files and the first file is header-only but subsequent files have at least one row (#430).

* Fixed segfault when `vroom_format()` is given an empty data frame (#425)
Expand Down
31 changes: 17 additions & 14 deletions R/path.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,6 @@ reencode_file <- function(path, encoding) {
return(list(out_file))
}

reencode_filepath <- function(path) {
if (is_windows()) {
enc2utf8(path)
} else {
enc2native(path)
}
}

# These functions adapted from https://github.com/tidyverse/readr/blob/192cb1ca5c445e359f153d2259391e6d324fd0a2/R/source.R
standardise_path <- function(path) {
if (is.raw(path)) {
Expand Down Expand Up @@ -68,7 +60,7 @@ standardise_path <- function(path) {
}
}

as.list(reencode_filepath(path))
as.list(enc2utf8(path))
}

standardise_one_path <- function (path, write = FALSE) {
Expand Down Expand Up @@ -104,10 +96,12 @@ standardise_one_path <- function (path, write = FALSE) {
)
}

p <- split_path_ext(basename(path))
path <- enc2utf8(path)

p <- split_path_ext(basename_utf8(path))

if (write) {
path <- normalizePath(path, mustWork = FALSE)
path <- normalizePath_utf8(path, mustWork = FALSE)
} else {
path <- check_path(path)
}
Expand Down Expand Up @@ -228,8 +222,9 @@ is_url <- function(path) {
}

check_path <- function(path) {
if (file.exists(path))
return(normalizePath(path, "/", mustWork = FALSE))
if (file.exists(path)) {
return(normalizePath_utf8(path, mustWork = FALSE))
}

stop("'", path, "' does not exist",
if (!is_absolute_path(path)) {
Expand Down Expand Up @@ -265,7 +260,7 @@ chr_to_file <- function(x, envir = parent.frame()) {

withr::defer(unlink(out), envir = envir)

normalizePath(out)
normalizePath_utf8(out)
}

detect_compression <- function(path) {
Expand Down Expand Up @@ -319,3 +314,11 @@ detect_compression <- function(path) {

NA_character_
}

basename_utf8 <- function(path) {
enc2utf8(basename(path))
}

normalizePath_utf8 <- function(path, winslash = "/", mustWork = NA) {
enc2utf8(normalizePath(path, winslash = winslash, mustWork = mustWork))
}
31 changes: 27 additions & 4 deletions src/unicode_fopen.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,31 @@
#endif
// clang-format on

#ifdef _WIN32
#include <Rinternals.h>

#ifdef _WIN32
#include <windows.h>
#else
#include "cpp11/r_string.hpp"
#endif

// useful for print debugging file path encoding
// inline void print_hex(const char* string) {
// unsigned char* p = (unsigned char*) string;
// for (int i = 0; i < 300 ; i++) {
// if (p[i] == '\0') break;
// Rprintf("%c 0x%02x ", p[i], p[i]);
// if ((i%16 == 0) && i)
// Rprintf("\n");
// }
// Rprintf("\n");
// }

// This is needed to support wide character paths on windows
inline FILE* unicode_fopen(const char* path, const char* mode) {
FILE* out;
#ifdef _WIN32
// First conver the mode to the wide equivalent
// First convert the mode to the wide equivalent
// Only usage is 2 characters so max 8 bytes + 2 byte null.
wchar_t mode_w[10];
MultiByteToWideChar(CP_UTF8, 0, mode, -1, mode_w, 9);
Expand All @@ -40,7 +55,11 @@ inline FILE* unicode_fopen(const char* path, const char* mode) {
MultiByteToWideChar(CP_UTF8, 0, path, -1, buf, len);
out = _wfopen(buf, mode_w);
#else
out = fopen(path, mode);
// the path has UTF-8 encoding, because we do that unconditionally on the R
// side (but also because cpp11 is eager to use UTF-8)
// however, we need to pass the path to fopen() in the native encoding
const char* native_path = Rf_translateChar(cpp11::r_string(path));
out = fopen(native_path, mode);
#endif

return out;
Expand All @@ -64,6 +83,10 @@ make_mmap_source(const char* file, std::error_code& error) {
free(buf);
return out;
#else
return mio::make_mmap_source(file, error);
// the path has UTF-8 encoding, because we do that unconditionally on the R
// side (but also because cpp11 is eager to use UTF-8)
// however, we need to pass the path to fopen() in the native encoding
const char* native_path = Rf_translateChar(cpp11::r_string(file));
return mio::make_mmap_source(native_path, error);
#endif
}
66 changes: 57 additions & 9 deletions tests/testthat/test-path.R
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
mt <- vroom(vroom_example("mtcars.csv"), col_types = list())

test_that("vroom errors if the file does not exist", {

tf <- tempfile()

expect_error(vroom(tf, col_types = list()), "does not exist")
})

test_that("vroom works with compressed files", {
mt <- vroom(vroom_example("mtcars.csv"), col_types = list())
expect_equal(vroom(vroom_example("mtcars.csv.gz"), col_types = list()), mt)
expect_equal(vroom(vroom_example("mtcars.csv.bz2"), col_types = list()), mt)
expect_equal(vroom(vroom_example("mtcars.csv.xz"), col_types = list()), mt)
Expand All @@ -17,13 +14,15 @@ test_that("vroom works with compressed files", {
test_that("read_file works via https", {
skip_on_cran()

mt <- vroom(vroom_example("mtcars.csv"), col_types = list())
url <- "https://raw.githubusercontent.com/r-lib/vroom/main/inst/extdata/mtcars.csv"
expect_equal(vroom(url, col_types = list()), mt)
})

test_that("vroom works via https on gz file", {
skip_on_cran()

mt <- vroom(vroom_example("mtcars.csv"), col_types = list())
url <- "https://raw.githubusercontent.com/r-lib/vroom/main/inst/extdata/mtcars.csv.gz"
expect_equal(vroom(url, col_types = list()), mt)
})
Expand Down Expand Up @@ -110,11 +109,6 @@ test_that("can read file w/o final newline, w/ multi-byte characters in path", {

# for completeness, w.r.t. test above
test_that("can read file w/ final newline, w/ multi-byte characters in path", {
# (our usage of) mio seems to fail for a non-ascii path, on linux, in a
# non-UTF-8 local
# I'm not convinced it's worth troubleshooting at this point
skip_if(!is_windows() && isTRUE(l10n_info()$`Latin-1`))

pattern <- "yes-trailing-n\u00e8wline-m\u00fblti-byt\u00e9-path-"
tfile <- withr::local_tempfile(pattern = pattern, fileext = ".csv")
writeLines(c("a,b", "A,B"), tfile)
Expand All @@ -124,3 +118,57 @@ test_that("can read file w/ final newline, w/ multi-byte characters in path", {
tibble::tibble(a = "A", b = "B")
)
})

test_that("can write to path with non-ascii characters", {
pattern <- "cr\u00E8me-br\u00FBl\u00E9e-"
tfile <- withr::local_tempfile(pattern = pattern, fileext = ".csv")
dat <- tibble::tibble(a = "A", b = "B")
vroom_write(dat, tfile, delim = ",")
expect_equal(readLines(tfile), c("a,b", "A,B"))
})

test_that("can read/write a compressed file with non-ascii characters in path", {
skip_on_cran()
skip_if_not(rlang::is_installed("archive"))
# https://github.com/r-lib/archive/issues/75
skip_if(l10n_info()$`Latin-1`)

make_temp_path <- function(ext) file.path(tempdir(), paste0("d\u00E4t", ext))

gzfile <- withr::local_file(make_temp_path(".tar.gz"))
bz2file <- withr::local_file(make_temp_path(".tar.bz2"))
xzfile <- withr::local_file(make_temp_path(".tar.xz"))
zipfile <- withr::local_file(make_temp_path(".zip"))

dat <- tibble::tibble(a = "A", b = "B")

vroom_write(dat, gzfile)
vroom_write(dat, bz2file)
vroom_write(dat, xzfile)
vroom_write(dat, zipfile)

expect_equal(detect_compression(gzfile), "gz")
expect_equal(detect_compression(bz2file), "bz2")
expect_equal(detect_compression(xzfile), "xz")
expect_equal(detect_compression(zipfile), "zip")

expect_equal(vroom(gzfile, show_col_types = FALSE), dat)
expect_equal(vroom(bz2file, show_col_types = FALSE), dat)
expect_equal(vroom(xzfile, show_col_types = FALSE), dat)
expect_equal(vroom(zipfile, show_col_types = FALSE), dat)
})

test_that("can read fwf file w/ non-ascii characters in path", {
tfile <- withr::local_tempfile(pattern = "fwf-y\u00F6-", fileext = ".txt")
writeLines(c("A B", "C D"), tfile)

expect_equal(
spec <- fwf_empty(tfile, col_names = c("a", "b")),
list(begin = c(0L, 2L), end = c(1L, NA), col_names = c("a", "b"))
)

expect_equal(
vroom_fwf(tfile, spec, show_col_types = FALSE),
tibble::tibble(a = c("A", "C"), b = c("B", "D"))
)
})

0 comments on commit 85143f7

Please sign in to comment.