#5 implement json_complexity

colearendt · Aug 26, 2016 · 9b50ac2 · 9b50ac2
1 parent 73d4336
commit 9b50ac2
Show file tree

Hide file tree

Showing 5 changed files with 128 additions and 1 deletion.
diff --git a/NAMESPACE b/NAMESPACE
@@ -17,6 +17,7 @@ export(gather_keys)
 export(is.tbl_json)
 export(jlogical)
 export(jnumber)
+export(json_complexity)
 export(json_lengths)
 export(json_structure)
 export(json_types)

diff --git a/R/json_complexity.r b/R/json_complexity.r
@@ -0,0 +1,33 @@
+#' Add a column that contains the complexity (recursively unlisted length) of the JSON data
+#'
+#' When investigating complex JSON data it can be helpful to identify the
+#' complexity of deeply nested documents. The json_complexity() function adds a
+#' column (default name "complexity") that contains the 'complexity' of the JSON
+#' associated with each row. Essentially, every non-null scalar value is found in
+#' the object by recursively stripping away all objects or arrays, and the
+#' complexity is the count of these scalar values. Note that 'null' has complexity 0.
+#'
+#' @param x a tbl_json object (other input is converted with as.tbl_json())
+#' @param column.name the name to specify for the complexity column
+#' @return a tbl_json object with an integer column.name column of complexities
+#' @export
+#' @examples
+#' library(magrittr)  # for %>%
+#' c('[1, 2, [3, 4]]', '{"k1": 1, "k2": [2, [3, 4]]}', '1', '{}') %>%
+#'   json_lengths %>% json_complexity
+json_complexity <- function(x, column.name = "complexity") {
+
+  if (!is.tbl_json(x)) x <- as.tbl_json(x)
+
+  # Extract json
+  json <- attr(x, "JSON")
+
+  # Count scalar leaves per document (local name avoids masking base::lengths)
+  complexity <- map_int(json, function(doc) length(unlist(doc, recursive = TRUE)))
+
+  # Add as a column to x
+  x[column.name] <- complexity
+
+  tbl_json(x, json)
+
+}
diff --git a/man/json_complexity.Rd b/man/json_complexity.Rd
diff --git a/tests/testthat/test-json_complexity.r b/tests/testthat/test-json_complexity.r
@@ -0,0 +1,63 @@
+context("json_complexity")
+
+# Flat arrays: expect one unit of complexity per element, zero when empty.
+test_that("works for arrays", {
+
+  documents <- c('[]', '[1]', '[1, 2]')
+  expected <- c(0L, 1L, 2L)
+
+  observed <- json_complexity(documents)$complexity
+  expect_identical(observed, expected)
+
+})
+
+# Flat objects: expect one unit of complexity per key/value pair.
+test_that("works for objects", {
+
+  documents <- c('{}', '{"k":"v"}', '{"k1":"v1", "k2":"v2"}')
+  expected <- c(0L, 1L, 2L)
+
+  observed <- json_complexity(documents)$complexity
+  expect_identical(observed, expected)
+
+})
+
+# Scalar rows produced by gather_array() from one array: complexity 1 each.
+test_that("works for scalars", {
+
+  document <- '[1, "a", true]'
+  elements <- gather_array(document)
+
+  observed <- json_complexity(elements)$complexity
+  expect_identical(observed, rep(1L, 3))
+
+})
+
+# Inputs that contain no scalar values at all.
+test_that("works for emtpy objects", {
+
+  # No documents: the complexity column is a zero-length integer vector
+  no_documents <- character(0)
+  expect_identical(
+    json_complexity(no_documents)$complexity,
+    integer(0)
+  )
+
+  # null, [] and {} hold no scalar values, so every row has complexity 0
+  empty_values <- gather_array('[null, [], {}]')
+  observed <- json_complexity(empty_values)$complexity
+  expect_identical(observed, rep(0L, 3))
+
+})
+
+# Nested documents: scalars are counted at every depth, not just the top level.
+test_that("works for nested JSON", {
+
+  json <- c('{"key": [1, 2]}', '{"key1": [1], "key2": [1, 2]}',
+            '{"key1": [1, 2], "key2": true, "key3": false}')
+  expect_identical(
+    json %>% json_complexity %>% `$`(complexity),
+    c(2L, 3L, 4L)
+  )
+
+})
diff --git a/vignettes/visualizing-json.Rmd b/vignettes/visualizing-json.Rmd
@@ -88,7 +88,7 @@ We can then find out how complex each record is by recursively unlisting it
 and computing the length:
 
 ```{r}
-co_length <- co_list %>% map(unlist, recursive = TRUE) %>% map_int(length)
+co_length <- companies %>% json_complexity %>% extract2("complexity")
 ```
 
 Then we can visualize the distribution of lengths on a log-scale: