Merge pull request #578 from lorenzwalthert/caching-top-level-expr

- Caching top level expressions (#578).
r-lib · Feb 5, 2020 · a87198f · a87198f
2 parents b7c4d9b + 4f6a890
commit a87198f
Show file tree

Hide file tree

Showing 192 changed files with 6,193 additions and 5,029 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: styler
 Title: Non-Invasive Pretty Printing of R Code
-Version: 1.2.0.9000
+Version: 1.2.0.9001
 Authors@R: 
     c(person(given = "Kirill",
              family = "Müller",
@@ -81,6 +81,7 @@ Collate:
     'testing-public-api.R'
     'testing.R'
     'token-create.R'
+    'transform-block.R'
     'transform-code.R'
     'transform-files.R'
     'ui-caching.R'

diff --git a/NEWS.md b/NEWS.md
@@ -20,8 +20,10 @@
   before will be instantaneous. This brings large speed boosts in many
   situations, e.g. when `style_pkg()` is run but only a few files have changed
   since the last styling or when using the [styler pre-commit
-  hook](https://github.com/lorenzwalthert/precommit). See `help("caching")`
-  for details (#538).
+  hook](https://github.com/lorenzwalthert/precommit). Because styler caches
+  by expression, you will also get speed boosts in large files with many 
+  expressions when you only change a few o them. See `help("caching")` for 
+  details (#538, #578).
 
 * `create_style_guide()` gains two arguments `style_guide_name` and 
   `style_guide_version` that are carried as meta data, in particular to version 

diff --git a/R/initialize.R b/R/initialize.R
@@ -20,6 +20,8 @@ default_style_guide_attributes <- function(pd_flat) {
     validate_parse_data()
 }
 
+
+
 #' Initialize attributes
 #'
 #' @name initialize_attributes

diff --git a/R/nest.R b/R/nest.R
@@ -7,23 +7,112 @@
 #'   of the parse table.
 #' @importFrom purrr when
 #' @keywords internal
-compute_parse_data_nested <- function(text) {
+compute_parse_data_nested <- function(text,
+                                      transformers) {
   parse_data <- tokenize(text) %>%
     add_terminal_token_before() %>%
     add_terminal_token_after() %>%
-    add_stylerignore()
+    add_stylerignore() %>%
+    add_attributes_caching(transformers) %>%
+    drop_cached_children()
 
   env_add_stylerignore(parse_data)
 
   parse_data$child <- rep(list(NULL), length(parse_data$text))
   pd_nested <- parse_data %>%
     nest_parse_data() %>%
     flatten_operators() %>%
-    when(any(parse_data$token == "EQ_ASSIGN") ~ relocate_eq_assign(.), ~.)
+    when(any(parse_data$token == "EQ_ASSIGN") ~ relocate_eq_assign(.), ~.) %>%
+    add_cache_block()
 
   pd_nested
 }
 
+#' Add the block id to a parse table
+#'
+#' Must be after [nest_parse_data()] because requires a nested parse table as
+#' input.
+#' @param pd_nested A top level nest.
+#' @keywords internal
+#' @importFrom rlang seq2
+add_cache_block <- function(pd_nested) {
+  if (cache_is_activated()) {
+    pd_nested$block <- cache_find_block(pd_nested)
+  } else {
+    pd_nested$block <- rep(1, nrow(pd_nested))
+  }
+  pd_nested
+}
+
+#' Drop all children of a top level expression that are cached
+#'
+#' Note that we do cache top-level comments. Because package code has a lot of
+#' roxygen comments and each of them is a top level expresion, checking is
+#' very expensive.
+#' @param pd A top-level nest.
+#' @details
+#' Because we process in blocks of expressions for speed, a cached expression
+#' will always end up in a block that won't be styled again (usual case), unless
+#' it's on a line where multiple expressions sit and at least one is not styled
+#' (exception).
+#'
+#' **usual case: All other expressions in a block are cached**
+#'
+#' Cached expressiond don't need to be transformed with `transformers` in
+#' [parse_transform_serialize_r_block()], we simply return `text` for the top
+#' level token. For that
+#' reason, the nested parse table can, at the rows where these expressions are
+#' located, be shallow, i.e. it does not have to contain a child, because it
+#' will neither be transformed nor serialized anytime. This function drops all
+#' associated tokens except the top-level token for such expressions, which will
+#' result in large speed improvements in [compute_parse_data_nested()] because
+#' nesting is expensive and will not be done for cached expressions.
+#'
+#' **exception: Not all other expressions in a block are cached**
+#'
+#' As described in [cache_find_block()], expressions on the same line are always
+#' put into one block. If any element of a block is not cached, the block will
+#' be styled as a whole. If the parse table was made shallow (and the top level)
+#' expresion is still marked as non-terminal, `text` will never be used in the
+#' transformation process and eventually lost. Hence, we must change the top
+#' level expression to a terminal. It will act like a comment in the sense that
+#' it is a fixed `text`.
+#'
+#' Because for the usual case, it does not even matter if the cached expression
+#' is a terminal or not (because it is not processed), we can safely set
+#' `terminal = TRUE` in general.
+#' @section Implementation:
+#' Because the structure of the parse table is not always "top-level expression
+#' first, then children", this function creates a temporary parse table that has
+#' this property and then extract the ids and subset the original parse table so
+#' it is shallow in the right places.
+#' @keywords internal
+drop_cached_children <- function(pd) {
+
+  if (cache_is_activated()) {
+
+    pd_parent_first <- pd[order(pd$line1, pd$col1, -pd$line2, -pd$col2, as.integer(pd$terminal)),]
+    pos_ids_to_keep <- pd_parent_first %>%
+      split(cumsum(pd_parent_first$parent == 0)) %>%
+      map(find_pos_id_to_keep) %>%
+      unlist() %>%
+      unname()
+    pd[pd$pos_id %in% pos_ids_to_keep,]
+  } else {
+    pd
+  }
+
+}
+
+find_pos_id_to_keep <- function(pd) {
+    if (pd$is_cached[1]) {
+      pd$pos_id[1]
+    } else {
+      pd$pos_id
+    }
+  }
+
+
 #' Turn off styling for parts of the code
 #'
 #' Using stylerignore markers, you can temporarily turn off styler. See a
@@ -137,6 +226,25 @@ add_terminal_token_before <- function(pd_flat) {
     left_join(pd_flat, ., by = "id")
 }
 
+#' Initialise variables related to caching
+#'
+#' @param transformers A list with transformer functions, used to check if
+#'   the code is cached.
+#' @describeIn add_token_terminal Initializes `newlines` and `lag_newlines`.
+#' @keywords internal
+add_attributes_caching <- function(pd_flat, transformers) {
+  pd_flat$block <- pd_flat$is_cached <- rep(NA, nrow(pd_flat))
+  if (cache_is_activated()) {
+    pd_flat$is_cached[pd_flat$parent == 0] <- map_lgl(
+      pd_flat$text[pd_flat$parent == 0],
+      is_cached, transformers, cache_dir_default()
+    )
+    is_comment <- pd_flat$token == "COMMENT"
+    pd_flat$is_cached[is_comment] <- rep(FALSE, sum(is_comment))
+  }
+  pd_flat
+}
+
 #' @describeIn add_token_terminal Removes column `terimnal_token_before`. Might
 #'   be used to prevent the use of invalidated information, e.g. if tokens were
 #'   added to the nested parse table.
@@ -220,13 +328,3 @@ combine_children <- function(child, internal_child) {
   }
   bound[order(bound$pos_id), ]
 }
-
-#' Get the start right
-#'
-#' On what line does the first token occur?
-#' @param pd_nested A nested parse table.
-#' @return The line number on which the first token occurs.
-#' @keywords internal
-find_start_line <- function(pd_nested) {
-  pd_nested$line1[1]
-}
diff --git a/R/nested-to-tree.R b/R/nested-to-tree.R
@@ -30,6 +30,7 @@ create_tree_from_pd_with_default_style_attributes <- function(pd, structure_only
 #' @return An object of class "Node" and "R6".
 #' @examples
 #' if (rlang::is_installed("data.tree")) {
+#'   cache_deactivate() # keep things simple
 #'   code <- "a <- function(x) { if(x > 1) { 1+1 } else {x} }"
 #'   nested_pd <- styler:::compute_parse_data_nested(code)
 #'   initialized <- styler:::pre_visit(nested_pd, c(default_style_guide_attributes))

diff --git a/R/parse.R b/R/parse.R
@@ -71,19 +71,22 @@ has_crlf_as_first_line_sep <- function(message, initial_text) {
 #'   * A column "child" that contains *nest*s.
 #'
 #' @param text A character vector.
+#' @inheritParams get_parse_data
 #' @return A flat parse table
 #' @importFrom rlang seq2
 #' @keywords internal
 tokenize <- function(text) {
-  get_parse_data(text, include_text = NA) %>%
+  get_parse_data(text, include_text = TRUE) %>%
     ensure_correct_str_txt(text) %>%
     enhance_mapping_special()
 }
 
 #' Obtain robust parse data
 #'
 #' Wrapper around `utils::getParseData(parse(text = text))` that returns a flat
-#' parse table.
+#' parse table. When caching information should be added, make sure that
+#' the cache is activated with `cache_activate()` and both `transformers` and
+#' `cache_dir` are non-`NULL`.
 #' @param text The text to parse.
 #' @param include_text Passed to [utils::getParseData()] as `includeText`.
 #' @param ... Other arguments passed to [utils::getParseData()].
@@ -96,6 +99,7 @@ get_parse_data <- function(text, include_text = TRUE, ...) {
     utils::getParseData(parsed, includeText = include_text),
     .name_repair = "minimal") %>%
     add_id_and_short()
+
   parser_version_set(parser_version_find(pd))
   pd
 }

diff --git a/R/relevel.R b/R/relevel.R
@@ -183,7 +183,10 @@ relocate_eq_assign_nest <- function(pd) {
 #' Two assignment tokens `EQ_ASSIGN` belong to the same block if they are not
 #' separated by more than one token. Token between `EQ_ASSIGN` tokens belong
 #' to the `EQ_ASSIGN` token occurring before them, except the token right before
-#' `EQ_ASSING` already belongs to the `EQ_ASSING` after it.
+#' `EQ_ASSING` already belongs to the `EQ_ASSING` after it. Note that this
+#' notion is unrelated to the column *block* in the parse table, which is used
+#' to [parse_transform_serialize_r()] code blocks and leave out the ones that
+#' are cached.
 #' @param pd A parse table.
 #' @keywords internal
 find_block_id <- function(pd) {

diff --git a/R/serialize.R b/R/serialize.R
@@ -2,11 +2,10 @@
 #'
 #' Collapses a flattened parse table into character vector representation.
 #' @inheritParams apply_stylerignore
-#' @param start_line The line number on which the code starts.
 #' @keywords internal
-serialize_parse_data_flattened <- function(flattened_pd, start_line = 1) {
-  flattened_pd$lag_newlines[1] <- start_line - 1
+serialize_parse_data_flattened <- function(flattened_pd) {
   flattened_pd <- apply_stylerignore(flattened_pd)
+  flattened_pd$lag_newlines[1] <- 0 # resolve start_line elsewhere
   res <- with(
     flattened_pd,
     paste0(

diff --git a/R/token-create.R b/R/token-create.R
@@ -18,6 +18,10 @@
 #' @param terminal Boolean vector indicating whether a token is a terminal or
 #'   not.
 #' @param child The children of the tokens.
+#' @param stylerignore Boolean to indicate if the line should be ignored by
+#'   styler.
+#' @param block The block (of caching) to which the token belongs. An integer.
+#' @param is_cached Whether the token is cached already.
 #' @family token creators
 #' @keywords internal
 create_tokens <- function(tokens,
@@ -31,7 +35,9 @@ create_tokens <- function(tokens,
                           indents = 0,
                           terminal = TRUE,
                           child = NULL,
-                          stylerignore = FALSE) {
+                          stylerignore = FALSE,
+                          block = NA,
+                          is_cached = NA) {
   len_text <- length(texts)
   new_tibble(
     list(
@@ -50,7 +56,9 @@ create_tokens <- function(tokens,
     indention_ref_pos_id = indention_ref_pos_ids,
     indent = indents,
     child = rep(list(child), len_text),
-    stylerignore = stylerignore
+    stylerignore = stylerignore,
+    block = block,
+    is_cached = is_cached
   ),
     nrow = len_text
   )

diff --git a/R/transform-block.R b/R/transform-block.R
@@ -0,0 +1,96 @@
+#' Parse, transform and serialize a nested parse table
+#'
+#' We process blocks of nested parse tables for speed. See [cache_find_block()]
+#' for details on how a top level nest is split into blocks.
+#' @param pd_nested A block of the nested parse table.
+#' @param start_line The line number on which the code starts.
+#' @inheritParams apply_transformers
+#' @keywords internal
+parse_transform_serialize_r_block <- function(pd_nested,
+                                              start_line,
+                                              transformers) {
+  if (!all(pd_nested$is_cached, na.rm = TRUE) || !cache_is_activated()) {
+    transformed_pd <- apply_transformers(pd_nested, transformers)
+    flattened_pd <- post_visit(transformed_pd, list(extract_terminals)) %>%
+      enrich_terminals(transformers$use_raw_indention) %>%
+      apply_ref_indention() %>%
+      set_regex_indention(
+        pattern = transformers$reindention$regex_pattern,
+        target_indention = transformers$reindention$indention,
+        comments_only = transformers$reindention$comments_only
+      )
+    serialized_transformed_text <- serialize_parse_data_flattened(flattened_pd)
+  } else {
+    serialized_transformed_text <- map2(
+      c(0, find_blank_lines_to_next_expr(pd_nested)[-1] - 1L),
+      pd_nested$text,
+      ~ c(rep("", .x), .y)
+    ) %>%
+      unlist()
+  }
+  c(rep("", start_line - 1), serialized_transformed_text)
+}
+
+#' Find the groups of expressions that should be processed together
+#'
+#' Every expression is an expression itself, Expressions on same line are in
+#' same block.
+#' Multiple expressions can sit on one row, e.g. in line comment and commands
+#' separated with ";". This creates a problem when processing each expression
+#' separately because when putting them together, we need complicated handling
+#' of line breaks between them, as it is not apriory clear that there is a line
+#' break separating them. To avoid this, we put top level expressions that sit
+#' on the same line into one block, so the assumption that there is a line break
+#' between each block of expressions holds.
+#' @param pd A top level parse table.
+#' @details
+#' we want to for turning points:
+#' - change in cache state is a turning point
+#' - expressions that are not on a new line cannot be a turning point. In this
+#'   case, the turning point is moved to the first expression on the line
+#' @param pd A top level nest.
+#' @keywords internal
+cache_find_block <- function(pd) {
+
+  first_after_cache_state_switch <- pd$is_cached != lag(pd$is_cached, default = !pd$is_cached[1])
+
+  not_first_on_line <- find_blank_lines_to_next_expr(pd) == 0
+  invalid_turning_point_idx <- which(
+    not_first_on_line & first_after_cache_state_switch
+  )
+
+  first_on_line_idx <- which(!not_first_on_line)
+  valid_replacements <- map_int(invalid_turning_point_idx, function(x) {
+    last(which(x > first_on_line_idx))
+  })
+  sort(unique(c(
+    setdiff(which(first_after_cache_state_switch), invalid_turning_point_idx),
+    valid_replacements
+  ))) %>%
+    unwhich(nrow(pd)) %>%
+    cumsum()
+}
+
+
+#' Find blank lines
+#'
+#' What number of line breaks lay between the expressions?
+#' @param pd_nested A nested parse table.
+#' @return The line number on which the first token occurs.
+#' @keywords internal
+find_blank_lines_to_next_expr <- function(pd_nested) {
+  pd_nested$line1 - lag(pd_nested$line2, default = 0)
+}
+
+#' Number of lines between cache blocks
+#'
+#' This is relevant when putting expressions together into a block and preserve
+#' blank lines between them. Note that because code does not need to start on
+#' line 1, the first element of the output is the number of lines until the
+#' first block.
+#' @param pd A top level nest.
+find_blank_lines_to_next_block <- function(pd) {
+  block_boundary <- pd$block != lag(pd$block, default = 0)
+  find_blank_lines_to_next_expr(pd)[block_boundary]
+}
+
-Original file line number
+Diff line change
@@ Expand Up / @@ -20,6 +20,8 @@ default_style_guide_attributes <- function(pd_flat) { @@
         validate_parse_data()
     }
     #' Initialize attributes
     #'
     #' @name initialize_attributes
@@ Expand Down @@