From 23ca4e513295177fd6febef755ded123e6d08a35 Mon Sep 17 00:00:00 2001 From: Julia Silge Date: Mon, 6 Mar 2023 09:41:18 -0700 Subject: [PATCH 1/6] Add `type = "parquet"` --- R/pin-read-write.R | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/R/pin-read-write.R b/R/pin-read-write.R index 8a25abd7..65790caa 100644 --- a/R/pin-read-write.R +++ b/R/pin-read-write.R @@ -56,9 +56,9 @@ pin_read <- function(board, name, version = NULL, hash = NULL, ...) { #' When retrieving the pin, this will be stored in the `user` key, to #' avoid potential clashes with the metadata that pins itself uses. #' @param type File type used to save `x` to disk. Must be one of -#' "csv", "json", "rds", "arrow", or "qs". If not supplied, will use JSON for -#' bare lists and RDS for everything else. Be aware that CSV and JSON are -#' plain text formats, while RDS, Arrow, and +#' "csv", "json", "rds", "parquet", "arrow", or "qs". If not supplied, will +#' use JSON for bare lists and RDS for everything else. Be aware that CSV and +#' JSON are plain text formats, while RDS, Parquet, Arrow, and #' [qs](https://CRAN.R-project.org/package=qs) are binary formats. #' @param versioned Should the pin be versioned? 
The default, `NULL`, will #' use the default for `board` @@ -133,6 +133,7 @@ object_write <- function(x, path, type = "rds") { switch(type, rds = write_rds(x, path), json = jsonlite::write_json(x, path, auto_unbox = TRUE), + parquet = write_parquet(x, path), arrow = write_arrow(x, path), pickle = abort("'pickle' pins not supported in R"), joblib = abort("'joblib' pins not supported in R"), @@ -168,13 +169,19 @@ write_qs <- function(x, path) { invisible(path) } +write_parquet <- function(x, path) { + check_installed("arrow") + arrow::write_parquet(x, path) + invisible(path) +} + write_arrow <- function(x, path) { check_installed("arrow") arrow::write_feather(x, path) invisible(path) } -object_types <- c("rds", "json", "arrow", "pickle", "csv", "qs", "file") +object_types <- c("rds", "json", "parquet", "arrow", "pickle", "csv", "qs", "file") object_read <- function(meta) { path <- fs::path(meta$local$dir, meta$file) @@ -189,6 +196,7 @@ object_read <- function(meta) { switch(type, rds = readRDS(path), json = jsonlite::read_json(path, simplifyVector = TRUE), + parquet = read_parquet(path), arrow = read_arrow(path), pickle = abort("'pickle' pins not supported in R"), joblib = abort("'joblib' pins not supported in R"), @@ -217,6 +225,11 @@ read_qs <- function(path) { qs::qread(path, strict = TRUE) } +read_parquet <- function(path) { + check_installed("arrow") + arrow::read_parquet(path) +} + read_arrow <- function(path) { check_installed("arrow") arrow::read_feather(path) From 15b267af690735afd132e8a9a0751bc05248e64a Mon Sep 17 00:00:00 2001 From: Julia Silge Date: Mon, 6 Mar 2023 09:41:31 -0700 Subject: [PATCH 2/6] Update tests --- tests/testthat/_snaps/pin-read-write.md | 2 +- tests/testthat/test-pin-read-write.R | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/testthat/_snaps/pin-read-write.md b/tests/testthat/_snaps/pin-read-write.md index 355bd6dc..f5f24cee 100644 --- a/tests/testthat/_snaps/pin-read-write.md +++ 
b/tests/testthat/_snaps/pin-read-write.md @@ -23,7 +23,7 @@ pin_write(board, mtcars, name = "mtcars", type = "froopy-loops") Condition Error in `object_write()`: - ! `type` must be one of "rds", "json", "arrow", "pickle", "csv", or "qs", not "froopy-loops". + ! `type` must be one of "rds", "json", "parquet", "arrow", "pickle", "csv", or "qs", not "froopy-loops". Code pin_write(board, mtcars, name = "mtcars", metadata = 1) Condition diff --git a/tests/testthat/test-pin-read-write.R b/tests/testthat/test-pin-read-write.R index 47751c4d..fdc47ea7 100644 --- a/tests/testthat/test-pin-read-write.R +++ b/tests/testthat/test-pin-read-write.R @@ -8,13 +8,16 @@ test_that("can round trip all types", { pin_write(board, df, "df-1", type = "rds") expect_equal(pin_read(board, "df-1"), df) - pin_write(board, df, "df-2", type = "arrow") + pin_write(board, df, "df-2", type = "parquet") expect_equal(pin_read(board, "df-2"), df) - pin_write(board, df, "df-3", type = "csv") + pin_write(board, df, "df-3", type = "arrow") + expect_equal(pin_read(board, "df-3"), df) + + pin_write(board, df, "df-4", type = "csv") - expect_equal(pin_read(board, "df-3"), df) + expect_equal(pin_read(board, "df-4"), df) - pin_write(board, df, "df-4", type = "qs") + pin_write(board, df, "df-5", type = "qs") - expect_equal(pin_read(board, "df-4"), df) + expect_equal(pin_read(board, "df-5"), df) # List From e2e24d3fd3facddba469cdbf8c84ae1213b758d4 Mon Sep 17 00:00:00 2001 From: Julia Silge Date: Mon, 6 Mar 2023 09:41:50 -0700 Subject: [PATCH 3/6] Update vignette/README, plus redocument --- README.Rmd | 2 +- README.md | 7 ++++--- man/pin_read.Rd | 6 +++--- vignettes/pins.Rmd | 3 ++- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/README.Rmd b/README.Rmd index 90eaa88c..7a796d7c 100644 --- a/README.Rmd +++ b/README.Rmd @@ -70,7 +70,7 @@ It takes three arguments: the board to pin to, an object, and a name: board %>% pin_write(head(mtcars), "mtcars") ``` -As you can see, the data saved as an `.rds` by default, but depending on what you're saving and who else you want to read it, you might 
use the `type` argument to instead save it as a `csv`, `json`, or `arrow` file. +As you can see, the data saved as an `.rds` by default, but depending on what you're saving and who else you want to read it, you might use the `type` argument to instead save it as a Parquet, Arrow, CSV, or JSON file. You can later retrieve the pinned data with `pin_read()`: diff --git a/README.md b/README.md index 83e53a25..d6bb91b2 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ board <- board_temp() board #> Pin board #> Path: -#> '/var/folders/hv/hzsmmyk9393_m7q3nscx1slc0000gn/T/RtmpTxyyP1/pins-114c073a9ddd2' +#> '/var/folders/hv/hzsmmyk9393_m7q3nscx1slc0000gn/T/RtmpwGre3p/pins-15a8b4f3f602c' #> Cache size: 0 ``` @@ -71,13 +71,14 @@ arguments: the board to pin to, an object, and a name: ``` r board %>% pin_write(head(mtcars), "mtcars") #> Guessing `type = 'rds'` -#> Creating new version '20230223T220424Z-a800d' +#> Creating new version '20230303T233508Z-a800d' #> Writing to pin 'mtcars' ``` As you can see, the data saved as an `.rds` by default, but depending on what you’re saving and who else you want to read it, you might use the -`type` argument to instead save it as a `csv`, `json`, or `arrow` file. +`type` argument to instead save it as a Parquet, Arrow, CSV, or JSON +file. You can later retrieve the pinned data with `pin_read()`: diff --git a/man/pin_read.Rd b/man/pin_read.Rd index 92cc4b52..aac63d6f 100644 --- a/man/pin_read.Rd +++ b/man/pin_read.Rd @@ -38,9 +38,9 @@ you expect. You can find the hash of an existing pin by looking for \item{x}{An object (typically a data frame) to pin.} \item{type}{File type used to save \code{x} to disk. Must be one of -"csv", "json", "rds", "arrow", or "qs". If not supplied, will use JSON for -bare lists and RDS for everything else. Be aware that CSV and JSON are -plain text formats, while RDS, Arrow, and +"csv", "json", "rds", "parquet", "arrow", or "qs". If not supplied, will +use JSON for bare lists and RDS for everything else. 
Be aware that CSV and +JSON are plain text formats, while RDS, Parquet, Arrow, and \href{https://CRAN.R-project.org/package=qs}{qs} are binary formats.} \item{title}{A title for the pin; most important for shared boards so that diff --git a/vignettes/pins.Rmd b/vignettes/pins.Rmd index 04246325..ca5a7430 100644 --- a/vignettes/pins.Rmd +++ b/vignettes/pins.Rmd @@ -63,7 +63,8 @@ But you can choose another option depending on your goals: - `type = "rds"` uses `writeRDS()` to create a binary R data file. It can save any R object but it's only readable from R, not other languages. - `type = "csv"` uses `write.csv()` to create a `.csv` file. CSVs can read by any application, but only support simple columns (e.g. numbers, strings, dates), can take up a lot of disk space, and can be slow to read. -- `type = "arrow"` uses `arrow::write_feather()` to create an arrow/feather file. [Arrow](https://arrow.apache.org) is a modern, language-independent, high-performance file format designed for data science. Not every tool can read arrow files, but support is growing rapidly. +- `type = "parquet"` uses `arrow::write_parquet()` to create a Parquet file. [Parquet](https://parquet.apache.org/) is a modern, language-independent, column-oriented file format for efficient data storage and retrieval. Parquet is a storage format used with [Arrow](https://arrow.apache.org), an in-memory columnar format. +- `type = "arrow"` uses `arrow::write_feather()` to create an Arrow/Feather file. Read the [FAQs from the Arrow project](https://arrow.apache.org/faq/) for more on the differences between Arrow and Parquet as file formats. - `type = "json"` uses `jsonlite::write_json()` to create a `.json` file. Pretty much every programming language can read json files, but they only work well for nested lists. - `type = "qs"` uses `qs::qsave()` to create a binary R data file, like `writeRDS()`. 
This format achieves faster read/write speeds than RDS, and compresses data more efficiently, making it a good choice for larger objects. Read more on the [qs package](https://github.com/traversc/qs). From f899e80e5c3c8238259f9a29ee4a3e1f3e67922f Mon Sep 17 00:00:00 2001 From: Julia Silge Date: Mon, 6 Mar 2023 09:44:16 -0700 Subject: [PATCH 4/6] Update NEWS --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index cd7bde6d..eeaba898 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,8 @@ * `board_s3()` now uses pagination for listing and versioning (#719, @mzorko). +* Added `type = "parquet"` to read and write Parquet files (#729). + # pins 1.1.0 ## Breaking changes From 520e647bc61d5e7d248e0370a56795798b115da1 Mon Sep 17 00:00:00 2001 From: Julia Silge Date: Mon, 6 Mar 2023 10:09:13 -0700 Subject: [PATCH 5/6] Update vignettes/pins.Rmd Co-authored-by: Hadley Wickham --- vignettes/pins.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/pins.Rmd b/vignettes/pins.Rmd index ca5a7430..d50c51cd 100644 --- a/vignettes/pins.Rmd +++ b/vignettes/pins.Rmd @@ -63,7 +63,7 @@ But you can choose another option depending on your goals: - `type = "rds"` uses `writeRDS()` to create a binary R data file. It can save any R object but it's only readable from R, not other languages. - `type = "csv"` uses `write.csv()` to create a `.csv` file. CSVs can read by any application, but only support simple columns (e.g. numbers, strings, dates), can take up a lot of disk space, and can be slow to read. -- `type = "parquet"` uses `arrow::write_parquet()` to create a Parquet file. [Parquet](https://parquet.apache.org/) is a modern, language-independent, column-oriented file format for efficient data storage and retrieval. Parquet is a storage format used with [Arrow](https://arrow.apache.org), an in-memory columnar format. +- `type = "parquet"` uses `arrow::write_parquet()` to create a Parquet file. 
[Parquet](https://parquet.apache.org/) is a modern, language-independent, column-oriented file format for efficient data storage and retrieval. - `type = "arrow"` uses `arrow::write_feather()` to create an Arrow/Feather file. Read the [FAQs from the Arrow project](https://arrow.apache.org/faq/) for more on the differences between Arrow and Parquet as file formats. - `type = "json"` uses `jsonlite::write_json()` to create a `.json` file. Pretty much every programming language can read json files, but they only work well for nested lists. - `type = "qs"` uses `qs::qsave()` to create a binary R data file, like `writeRDS()`. This format achieves faster read/write speeds than RDS, and compresses data more efficiently, making it a good choice for larger objects. Read more on the [qs package](https://github.com/traversc/qs). From 5c0b99f148f423ab4819fa6f014b34021315e0a5 Mon Sep 17 00:00:00 2001 From: Julia Silge Date: Mon, 6 Mar 2023 10:18:11 -0700 Subject: [PATCH 6/6] Update advice on `type` --- vignettes/pins.Rmd | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vignettes/pins.Rmd b/vignettes/pins.Rmd index d50c51cd..f866da55 100644 --- a/vignettes/pins.Rmd +++ b/vignettes/pins.Rmd @@ -61,11 +61,11 @@ The only rule for a pin name is that it can't contain slashes. As you can see from the output, pins has chosen to save this data to an `.rds` file. But you can choose another option depending on your goals: -- `type = "rds"` uses `writeRDS()` to create a binary R data file. It can save any R object but it's only readable from R, not other languages. -- `type = "csv"` uses `write.csv()` to create a `.csv` file. CSVs can read by any application, but only support simple columns (e.g. numbers, strings, dates), can take up a lot of disk space, and can be slow to read. -- `type = "parquet"` uses `arrow::write_parquet()` to create a Parquet file. 
[Parquet](https://parquet.apache.org/) is a modern, language-independent, column-oriented file format for efficient data storage and retrieval. -- `type = "arrow"` uses `arrow::write_feather()` to create an Arrow/Feather file. Read the [FAQs from the Arrow project](https://arrow.apache.org/faq/) for more on the differences between Arrow and Parquet as file formats. -- `type = "json"` uses `jsonlite::write_json()` to create a `.json` file. Pretty much every programming language can read json files, but they only work well for nested lists. +- `type = "rds"` uses `saveRDS()` to create a binary R data file. It can save any R object (including trained models) but it's only readable from R, not other languages. +- `type = "csv"` uses `write.csv()` to create a CSV file. CSVs are plain text and can be read easily by many applications, but they only support simple columns (e.g. numbers, strings), can take up a lot of disk space, and can be slow to read. +- `type = "parquet"` uses `arrow::write_parquet()` to create a Parquet file. [Parquet](https://parquet.apache.org/) is a modern, language-independent, column-oriented file format for efficient data storage and retrieval. Parquet is an excellent choice for storing tabular data but requires the [arrow](https://arrow.apache.org/docs/r/) package. +- `type = "arrow"` uses `arrow::write_feather()` to create an Arrow/Feather file. +- `type = "json"` uses `jsonlite::write_json()` to create a JSON file. Pretty much every programming language can read JSON files, but they only work well for nested lists. - `type = "qs"` uses `qs::qsave()` to create a binary R data file, like `writeRDS()`. This format achieves faster read/write speeds than RDS, and compresses data more efficiently, making it a good choice for larger objects. Read more on the [qs package](https://github.com/traversc/qs). After you've pinned an object, you can read it back with `pin_read()`: