From 21c9bf56e7d53ec66548770fdacc61a2e5eb4b0c Mon Sep 17 00:00:00 2001 From: Julia Silge Date: Thu, 3 Oct 2024 14:08:01 -0600 Subject: [PATCH] Use nanoparquet package, to read/write parquet files (#843) * Use nanoparquet package, to read/write parquet files * Update NEWS --- DESCRIPTION | 1 + NEWS.md | 2 ++ R/pin-read-write.R | 8 ++++---- tests/testthat/test-pin-read-write.R | 9 ++++++++- vignettes/pins.Rmd | 2 +- 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c9f86930..9a1aca3e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -54,6 +54,7 @@ Suggests: Microsoft365R, mime, mockery, + nanoparquet, openssl, paws.storage, qs, diff --git a/NEWS.md b/NEWS.md index c599a9f3..5293fca0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,6 +16,8 @@ * Fixed how previously deleted pin versions are detected (#838, @MichalLauer) +* Switched reading and writing with `type = "parquet"` to use the nanoparquet package (#843). + # pins 1.3.0 ## Breaking changes diff --git a/R/pin-read-write.R b/R/pin-read-write.R index 1b9d98b8..061e98e0 100644 --- a/R/pin-read-write.R +++ b/R/pin-read-write.R @@ -194,8 +194,8 @@ write_qs <- function(x, path) { } write_parquet <- function(x, path) { - check_installed("arrow") - arrow::write_parquet(x, path) + check_installed("nanoparquet") + nanoparquet::write_parquet(x, path) invisible(path) } @@ -251,8 +251,8 @@ read_qs <- function(path) { } read_parquet <- function(path) { - check_installed("arrow") - arrow::read_parquet(path) + check_installed("nanoparquet") + nanoparquet::read_parquet(path) } read_arrow <- function(path) { diff --git a/tests/testthat/test-pin-read-write.R b/tests/testthat/test-pin-read-write.R index 1867c714..b7f9eb64 100644 --- a/tests/testthat/test-pin-read-write.R +++ b/tests/testthat/test-pin-read-write.R @@ -1,6 +1,7 @@ test_that("can round trip all types", { skip_if_not_installed("qs") skip_if_not_installed("arrow") + skip_if_not_installed("nanoparquet") board <- board_temp() # Data frames @@ -9,7 
+10,13 @@ test_that("can round trip all types", { expect_equal(pin_read(board, "df-1"), df) pin_write(board, df, "df-2", type = "parquet") - expect_equal(pin_read(board, "df-2"), df) + expect_equal( + withr::with_options( + list(nanoparquet.class = c("tbl_df", "tbl")), + pin_read(board, "df-2") + ), + df + ) pin_write(board, df, "df-3", type = "arrow") expect_equal(pin_read(board, "df-3"), df) diff --git a/vignettes/pins.Rmd b/vignettes/pins.Rmd index 4b6e1fb0..8bd48cdc 100644 --- a/vignettes/pins.Rmd +++ b/vignettes/pins.Rmd @@ -73,7 +73,7 @@ But you can choose another option depending on your goals: - `type = "rds"` uses `writeRDS()` to create a binary R data file. It can save any R object (including trained models) but it's only readable from R, not other languages. - `type = "csv"` uses `write.csv()` to create a CSV file. CSVs are plain text and can be read easily by many applications, but they only support simple columns (e.g. numbers, strings), can take up a lot of disk space, and can be slow to read. -- `type = "parquet"` uses `arrow::write_parquet()` to create a Parquet file. [Parquet](https://parquet.apache.org/) is a modern, language-independent, column-oriented file format for efficient data storage and retrieval. Parquet is an excellent choice for storing tabular data but requires the [arrow](https://arrow.apache.org/docs/r/) package. +- `type = "parquet"` uses `nanoparquet::write_parquet()` to create a Parquet file. [Parquet](https://parquet.apache.org/) is a modern, language-independent, column-oriented file format for efficient data storage and retrieval. Parquet is an excellent choice for storing tabular data but requires the [nanoparquet](https://nanoparquet.r-lib.org/) package. - `type = "arrow"` uses `arrow::write_feather()` to create an Arrow/Feather file. - `type = "json"` uses `jsonlite::write_json()` to create a JSON file. Pretty much every programming language can read json files, but they only work well for nested lists. 
- `type = "qs"` uses `qs::qsave()` to create a binary R data file, like `writeRDS()`. This format achieves faster read/write speeds than RDS, and compresses data more efficiently, making it a good choice for larger objects. Read more on the [qs package](https://github.com/traversc/qs).