diff --git a/NAMESPACE b/NAMESPACE index 8b64268..f973d54 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,6 +17,7 @@ export(gather_keys) export(is.tbl_json) export(jlogical) export(jnumber) +export(json_complexity) export(json_lengths) export(json_structure) export(json_types) diff --git a/R/json_complexity.r b/R/json_complexity.r new file mode 100644 index 0000000..ee34418 --- /dev/null +++ b/R/json_complexity.r @@ -0,0 +1,33 @@ +#' Add a column that contains the complexity (recursively unlisted length) of the JSON data +#' +#' When investigating complex JSON data it can be helpful to identify the +#' complexity of deeply nested documents. The json_complexity() function adds a +#' column (default name "complexity") that contains the 'complexity' of the JSON +#' associated with each row. Essentially, every non-null scalar value is found in the +#' object by recursively stripping away all objects or arrays, and the complexity +#' is the count of these scalar values. Note that 'null' has complexity 0.
+#' +#' @param x a tbl_json object +#' @param column.name the name to specify for the complexity column +#' @return a tbl_json object with column.name column that tells the complexity +#' @export +#' @examples +#' library(magrittr) # for %>% +#' c('[1, 2, [3, 4]]', '{"k1": 1, "k2": [2, [3, 4]]}', '1', '{}') %>% +#' json_lengths %>% json_complexity +json_complexity <- function(x, column.name = "complexity") { + + if (!is.tbl_json(x)) x <- as.tbl_json(x) + + # Extract json + json <- attr(x, "JSON") + + # Determine lengths + lengths <- json %>% map(unlist, recursive = TRUE) %>% map_int(length) + + # Add as a column to x + x[column.name] <- lengths + + tbl_json(x, json) + +} diff --git a/man/json_complexity.Rd b/man/json_complexity.Rd new file mode 100644 index 0000000..9efed61 --- /dev/null +++ b/man/json_complexity.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/json_complexity.r +\name{json_complexity} +\alias{json_complexity} +\title{Add a column that contains the complexity (recursively unlisted length) of the JSON data} +\usage{ +json_complexity(x, column.name = "complexity") +} +\arguments{ +\item{x}{a tbl_json object} + +\item{column.name}{the name to specify for the complexity column} +} +\value{ +a tbl_json object with column.name column that tells the complexity +} +\description{ +When investigating complex JSON data it can be helpful to identify the +complexity of deeply nested documents. The json_complexity() function adds a +column (default name "complexity") that contains the 'complexity' of the JSON +associated with each row. Essentially, every non-null scalar value is found in the +object by recursively stripping away all objects or arrays, and the complexity +is the count of these scalar values. Note that 'null' has complexity 0.
+} +\examples{ +library(magrittr) # for \%>\% +c('[1, 2, [3, 4]]', '{"k1": 1, "k2": [2, [3, 4]]}', '1', '{}') \%>\% + json_lengths \%>\% json_complexity +} + diff --git a/tests/testthat/test-json_complexity.r b/tests/testthat/test-json_complexity.r new file mode 100644 index 0000000..73cd911 --- /dev/null +++ b/tests/testthat/test-json_complexity.r @@ -0,0 +1,63 @@ +context("json_complexity") + +test_that("works for arrays", { + + json <- c('[]', '[1]', '[1, 2]') + expect_identical( + json %>% json_complexity %>% `$`(complexity), + c(0L, 1L, 2L) + ) + +} +) + +test_that("works for objects", { + + json <- c('{}', '{"k":"v"}', '{"k1":"v1", "k2":"v2"}') + expect_identical( + json %>% json_complexity %>% `$`(complexity), + c(0L, 1L, 2L) + ) + +} +) + +test_that("works for scalars", { + + json <- c('[1, "a", true]') + expect_identical( + json %>% gather_array %>% json_complexity %>% `$`(complexity), + rep(1L, 3) + ) + +} +) + +test_that("works for empty objects", { + + json <- character(0) + expect_identical( + json %>% json_complexity %>% `$`(complexity), + integer(0) + ) + + json <- c('[null, [], {}]') + expect_identical( + json %>% gather_array %>% json_complexity %>% `$`(complexity), + rep(0L, 3) + ) + +} +) + +test_that("works for nested JSON", { + + json <- c('{"key": [1, 2]}', '{"key1": [1], "key2": [1, 2]}', + '{"key1": [1, 2], "key2": true, "key3": false}') + expect_identical( + json %>% json_complexity %>% `$`(complexity), + c(2L, 3L, 4L) + ) + +} +) diff --git a/vignettes/visualizing-json.Rmd b/vignettes/visualizing-json.Rmd index 7d658ce..2b019b9 100644 --- a/vignettes/visualizing-json.Rmd +++ b/vignettes/visualizing-json.Rmd @@ -88,7 +88,7 @@ We can then find out how complex each record is by recursively unlisting it and computing the length: ```{r} -co_length <- co_list %>% map(unlist, recursive = TRUE) %>% map_int(length) +co_length <- companies %>% json_complexity %>% extract2("complexity") ``` Then we can visualize the distribution of lengths on a
log-scale: