From b13591e107351b9d3c613500ee7b3f26d674516f Mon Sep 17 00:00:00 2001 From: Jeremy Stanley Date: Mon, 6 Apr 2015 13:30:09 -0400 Subject: [PATCH] #29 improve examples --- vignettes/introduction-to-tidyjson.Rmd | 65 ++++++++++++++++---------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/vignettes/introduction-to-tidyjson.Rmd b/vignettes/introduction-to-tidyjson.Rmd index 65af39a..0ed168a 100644 --- a/vignettes/introduction-to-tidyjson.Rmd +++ b/vignettes/introduction-to-tidyjson.Rmd @@ -73,26 +73,26 @@ people <- ' ]' # Structure the data -people %>% # Use the %>% pipe operator to pass json through a pipeline - as.tbl_json %>% # Parse the JSON and setup a 'tbl_json' object - gather_array %>% # Gather (stack) the array by index - spread_values( # Spread (widen) values to widen the data.frame - user.name = jstring("name"), # Extract the "name" object as a character column "user.name" - user.age = jnumber("age") # Extract the "age" object as a numeric column "user.age" +people %>% # %>% is the magrittr pipeline operator + as.tbl_json %>% # parse the JSON and setup a 'tbl_json' object + gather_array %>% # gather (stack) the array by index + spread_values( # spread (widen) values to widen the data.frame + name = jstring("name"), # value of "name" becomes a character column + age = jnumber("age") # value of "age" becomes a numeric column ) ``` In such a simple example, we can use `fromJSON` in the jsonlite package to do this much faster: -```{r} +```{r, message = FALSE} library(jsonlite) -jsonlite::fromJSON(people) +jsonlite::fromJSON(people, simplifyDataFrame = TRUE) ``` -However, if the structure of the data changed, so would the output from `fromJSON`. -So even in this simple example there is value in the explicit structure defined -in the tidyjson pipeline above. +However, if the structure of the JSON data changed, so would the columns output +by `fromJSON`. 
So even in this simple example there is value in the explicit +structure defined in the tidyjson pipeline above. ## A more complex example @@ -136,9 +136,8 @@ purch_json <- ' ]' ``` -Suppose we want to find out how much each person has spent. - -Using jsonlite, we can parse the JSON: +Suppose we want to find out how much each person has spent. Using jsonlite, we +can parse the JSON: ```{r} library(jsonlite) @@ -148,7 +147,8 @@ purch_df <- jsonlite::fromJSON(purch_json) purch_df ``` -However, the resulting data structure is a complex nested data.frame: +This looks deceptively simple, but the resulting data structure is actually a +complex nested data.frame: ```{r} str(purch_df) @@ -157,27 +157,44 @@ str(purch_df) This is difficult to work with, and we end up writing code like this: ```{r} -lapply(lapply(purch_df$purchases, `[[`, "items"), lapply, `[[`, "price") +items <- lapply(purch_df$purchases, `[[`, "items") +prices <- lapply(items, lapply, `[[`, "price") +vapply(lapply(prices, unlist), sum, integer(1)) ``` Reasoning about code like this is nearly impossible, and further, the relational structure of the data is lost (we no longer have the name of the user). 
+We can instead try to use dplyr and the `do()` function to get at the +data in the nested data.frames, but this is equally challenging: + +```{r} +purch_df %>% group_by(name) %>% + do( + data.frame( + name = .$name, + items = .$purchases[[1]] %>% rowwise %>% do({.$items}), + stringsAsFactors = FALSE + ) + ) %>% + summarize(price = sum(items.price)) +``` + Using tidyjson, we can build a pipeline to turn this JSON into a tidy data.frame where each row corresponds to a purchased item: ```{r} -purch_items <- purch_json %>% - as.tbl_json %>% gather_array %>% - spread_values(person = jstring("name")) %>% - enter_object("purchases") %>% gather_array %>% - spread_values(purchase.date = jstring("date")) %>% - enter_object("items") %>% gather_array %>% - spread_values( +purch_items <- purch_json %>% as.tbl_json %>% + gather_array %>% # stack the users + spread_values(person = jstring("name")) %>% # extract the user name + enter_object("purchases") %>% gather_array %>% # stack the purchases + spread_values(purchase.date = jstring("date")) %>% # extract the purchase date + enter_object("items") %>% gather_array %>% # stack the items + spread_values( # extract item name and price item.name = jstring("name"), item.price = jnumber("price") ) %>% - select(person, purchase.date, item.name, item.price) + select(person, purchase.date, item.name, item.price) # select only what is needed ``` The resulting data.frame is exactly what we want