-
Notifications
You must be signed in to change notification settings - Fork 72
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Caching top level expressions #578
Changes from 35 commits
b640559
8d2221d
8ea1205
8881173
d569b5c
aa7aaae
2b82c61
784d7a2
97be061
6e10ea4
e9bba1b
bc83cd5
85dc320
9750da5
ec84800
037aabf
aa59e41
43f7676
644d0ed
2921c7e
1147d7f
a5f6dab
7c6e7c6
b3ce4e9
9e6a99f
8564e74
7679c2a
fc22511
e9e7f9d
b3c5ba7
4ae8fe5
bcbfe2d
c5bd163
f1b3e4a
824e2a8
cdbe1bf
4f6a890
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,23 +7,112 @@ | |
#' of the parse table. | ||
#' @importFrom purrr when | ||
#' @keywords internal | ||
compute_parse_data_nested <- function(text) { | ||
compute_parse_data_nested <- function(text, | ||
transformers) { | ||
parse_data <- tokenize(text) %>% | ||
add_terminal_token_before() %>% | ||
add_terminal_token_after() %>% | ||
add_stylerignore() | ||
add_stylerignore() %>% | ||
add_attributes_caching(transformers) %>% | ||
drop_cached_children() | ||
|
||
env_add_stylerignore(parse_data) | ||
|
||
parse_data$child <- rep(list(NULL), length(parse_data$text)) | ||
pd_nested <- parse_data %>% | ||
nest_parse_data() %>% | ||
flatten_operators() %>% | ||
when(any(parse_data$token == "EQ_ASSIGN") ~ relocate_eq_assign(.), ~.) | ||
when(any(parse_data$token == "EQ_ASSIGN") ~ relocate_eq_assign(.), ~.) %>% | ||
add_cache_block() | ||
|
||
pd_nested | ||
} | ||
|
||
#' Add the block id to a parse table | ||
#' | ||
#' Must be after [nest_parse_data()] because requires a nested parse table as | ||
#' input. | ||
#' @param pd_nested A top level nest. | ||
#' @keywords internal | ||
#' @importFrom rlang seq2 | ||
add_cache_block <- function(pd_nested) {
  # With an active cache the ids come from cache_find_block(); otherwise every
  # row of the top-level nest is assigned to block 1.
  block_ids <- if (cache_is_activated()) {
    cache_find_block(pd_nested)
  } else {
    rep(1, nrow(pd_nested))
  }
  pd_nested$block <- block_ids
  pd_nested
}
|
||
#' Drop all children of a top level expression that are cached | ||
#' | ||
#' Note that we do not cache top-level comments. Because package code has a lot | ||
#' of roxygen comments and each of them is a top-level expression, checking is | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The word "so" is wrong since the sentence starts with "Because". |
||
#' very expensive. | ||
#' @param pd A top-level nest. | ||
#' @details | ||
#' Because we process in blocks of expressions for speed, a cached expression | ||
#' will always end up in a block that won't be styled again (usual case), unless | ||
#' it's on a line where multiple expressions sit and at least one is not styled | ||
#' (exception). | ||
#' | ||
#' **usual case: All other expressions in a block are cached** | ||
#' | ||
#' Cached expressions don't need to be transformed with `transformers` in | ||
#' [parse_transform_serialize_r_block()], we simply return `text` for the top | ||
#' level token. For that | ||
#' reason, the nested parse table can, at the rows where these expressions are | ||
#' located, be shallow, i.e. it does not have to contain children, because it | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. typo: "a children" -> "children" |
||
#' will neither be transformed nor serialized at any point. This function drops all | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. typo: "neighter" -> "neither" |
||
#' associated tokens except the top-level token for such expressions, which will | ||
#' result in large speed improvements in [compute_parse_data_nested()] because | ||
#' nesting is expensive and will not be done for cached expressions. | ||
#' | ||
#' **exception: Not all other expressions in a block are cached** | ||
#' | ||
#' As described in [cache_find_block()], expressions on the same line are always | ||
#' put into one block. If any element of a block is not cached, the block will | ||
#' be styled as a whole. If the parse table was made shallow (and the top-level | ||
#' expression is still marked as non-terminal), `text` will never be used in the | ||
#' transformation process and eventually lost. Hence, we must change the top | ||
#' level expression to a terminal. It will act like a comment in the sense that | ||
#' it is a fixed `text`. | ||
#' | ||
#' Because for the usual case, it does not even matter if the cached expression | ||
#' is a terminal or not (because it is not processed), we can safely set | ||
#' `terminal = TRUE` in general. | ||
#' @section Implementation: | ||
#' Because the structure of the parse table is not always "top-level expression | ||
#' first, then children", this function creates a temporary parse table that has | ||
#' this property and then extracts the ids and subsets the original parse table so | ||
#' it is shallow in the right places. | ||
#' @keywords internal | ||
drop_cached_children <- function(pd) {
  # Nothing is dropped unless caching is switched on.
  if (!cache_is_activated()) {
    return(pd)
  }

  # Re-order a temporary copy by start position, then by descending end
  # position, with non-terminals ahead of terminals on ties.
  parent_first_order <- order(
    pd$line1, pd$col1, -pd$line2, -pd$col2, as.integer(pd$terminal)
  )
  pd_parent_first <- pd[parent_first_order, ]

  # A new group starts at every row whose parent is 0 (a top-level token).
  top_level_group <- cumsum(pd_parent_first$parent == 0)
  ids_per_group <- map(
    split(pd_parent_first, top_level_group),
    find_pos_id_to_keep
  )
  pos_ids_to_keep <- unname(unlist(ids_per_group))

  # Subset the original (not the re-ordered) table so row order is preserved.
  pd[pd$pos_id %in% pos_ids_to_keep, ]
}
|
||
# Select the `pos_id`s to retain for one group of rows: all of them when the
# first row is not cached, only the first row's id when it is.
find_pos_id_to_keep <- function(pd) {
  first_row_is_cached <- pd$is_cached[1]
  if (!first_row_is_cached) {
    return(pd$pos_id)
  }
  pd$pos_id[1]
}
|
||
|
||
#' Turn off styling for parts of the code | ||
#' | ||
#' Using stylerignore markers, you can temporarily turn off styler. See a | ||
|
@@ -137,6 +226,25 @@ add_terminal_token_before <- function(pd_flat) { | |
left_join(pd_flat, ., by = "id") | ||
} | ||
|
||
#' Initialise variables related to caching | ||
#' | ||
#' @param transformers A list with transformer functions, used to check if | ||
#' the code is cached. | ||
#' @describeIn add_token_terminal Initializes `block` and `is_cached`. | ||
#' @keywords internal | ||
add_attributes_caching <- function(pd_flat, transformers) {
  # Both columns start out as NA for every token; `is_cached` is created
  # before `block`.
  n_tokens <- nrow(pd_flat)
  pd_flat$is_cached <- rep(NA, n_tokens)
  pd_flat$block <- rep(NA, n_tokens)
  if (!cache_is_activated()) {
    return(pd_flat)
  }

  # Only tokens with parent 0 are looked up in the cache.
  is_top_level <- pd_flat$parent == 0
  pd_flat$is_cached[is_top_level] <- map_lgl(
    pd_flat$text[is_top_level],
    is_cached, transformers, cache_dir_default()
  )

  # Comments are always flagged as not cached, overriding the lookup above.
  pd_flat$is_cached[pd_flat$token == "COMMENT"] <- FALSE
  pd_flat
}
|
||
#' @describeIn add_token_terminal Removes column `terminal_token_before`. Might | ||
#' be used to prevent the use of invalidated information, e.g. if tokens were | ||
#' added to the nested parse table. | ||
|
@@ -220,13 +328,3 @@ combine_children <- function(child, internal_child) { | |
} | ||
bound[order(bound$pos_id), ] | ||
} | ||
|
||
#' Get the start right | ||
#' | ||
#' On what line does the first token occur? | ||
#' @param pd_nested A nested parse table. | ||
#' @return The line number on which the first token occurs. | ||
#' @keywords internal | ||
find_start_line <- function(pd_nested) {
  # `line1` of the first row is the line on which the first token starts.
  start_lines <- pd_nested$line1
  start_lines[1]
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
#' Parse, transform and serialize a nested parse table | ||
#' | ||
#' We process blocks of nested parse tables for speed. See [cache_find_block()] | ||
#' for details on how a top level nest is split into blocks. | ||
#' @param pd_nested A block of the nested parse table. | ||
#' @param start_line The line number on which the code starts. | ||
#' @inheritParams apply_transformers | ||
#' @keywords internal | ||
parse_transform_serialize_r_block <- function(pd_nested,
                                              start_line,
                                              transformers) {
  # The original text is reused only when every non-NA `is_cached` flag in the
  # block is TRUE and the cache is active; otherwise the block is styled.
  reuse_cached_text <- all(pd_nested$is_cached, na.rm = TRUE) &&
    cache_is_activated()

  if (reuse_cached_text) {
    # Emit `text` unchanged, each expression preceded by the blank lines that
    # separated it from the previous one (none before the first expression).
    blank_lines_before <- c(
      0,
      find_blank_lines_to_next_expr(pd_nested)[-1] - 1L
    )
    serialized_transformed_text <- unlist(map2(
      blank_lines_before,
      pd_nested$text,
      ~ c(rep("", .x), .y)
    ))
  } else {
    transformed_pd <- apply_transformers(pd_nested, transformers)
    terminals <- post_visit(transformed_pd, list(extract_terminals))
    enriched <- enrich_terminals(terminals, transformers$use_raw_indention)
    flattened_pd <- set_regex_indention(
      apply_ref_indention(enriched),
      pattern = transformers$reindention$regex_pattern,
      target_indention = transformers$reindention$indention,
      comments_only = transformers$reindention$comments_only
    )
    serialized_transformed_text <- serialize_parse_data_flattened(flattened_pd)
  }

  # Pad with empty strings so the block's text starts on line `start_line`.
  c(rep("", start_line - 1), serialized_transformed_text)
}
|
||
#' Find the groups of expressions that should be processed together | ||
#' | ||
#' Every expression is an expression itself. Expressions on the same line are in | ||
#' the same block. | ||
#' Multiple expressions can sit on one row, e.g. in-line comments and commands | ||
#' separated with ";". This creates a problem when processing each expression | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. typo: "seperated" -> "separated" |
||
#' separately because when putting them together, we need complicated handling | ||
#' of line breaks between them, as it is not a priori clear that there is a line | ||
#' break separating them. To avoid this, we put top level expressions that sit | ||
#' on the same line into one block, so the assumption that there is a line break | ||
#' between each block of expressions holds. | ||
#' @param pd A top level parse table. | ||
#' @details | ||
#' We want to look for turning points: | ||
#' - change in cache state is a turning point | ||
#' - expressions that are not on a new line cannot be a turning point. In this | ||
#' case, the turning point is moved to the first expression on the line | ||
#' @param pd A top level nest. | ||
#' @keywords internal | ||
cache_find_block <- function(pd) {
  # A row is a candidate turning point when its cache state differs from the
  # previous row's. The default is the negation of the first state, so the
  # first row always counts as a switch.
  first_after_cache_state_switch <- pd$is_cached != lag(
    pd$is_cached,
    default = !pd$is_cached[1]
  )

  # An expression that starts on the line where the previous one ended cannot
  # open a new block.
  not_first_on_line <- find_blank_lines_to_next_expr(pd) == 0
  invalid_turning_point_idx <- which(
    not_first_on_line & first_after_cache_state_switch
  )

  # Move every invalid turning point back to the closest preceding row that is
  # first on its line. `which(x > first_on_line_idx)` yields positions *within*
  # `first_on_line_idx`, so the last position has to be mapped back to the row
  # index of `pd`; using the position itself selects the wrong row whenever
  # the first-on-line rows are not exactly 1, 2, ..., k.
  first_on_line_idx <- which(!not_first_on_line)
  valid_replacements <- map_int(invalid_turning_point_idx, function(x) {
    first_on_line_idx[last(which(x > first_on_line_idx))]
  })

  # Mark the final turning points in a vector of length nrow(pd) and
  # accumulate, giving a running block id per row.
  sort(unique(c(
    setdiff(which(first_after_cache_state_switch), invalid_turning_point_idx),
    valid_replacements
  ))) %>%
    unwhich(nrow(pd)) %>%
    cumsum()
}
|
||
|
||
#' Find blank lines | ||
#' | ||
#' How many line breaks lie between the expressions? | ||
#' @param pd_nested A nested parse table. | ||
#' @return For each expression, `line1` minus `line2` of the previous expression. | ||
#' @keywords internal | ||
find_blank_lines_to_next_expr <- function(pd_nested) {
  # TODO think about naming: prefix with cache here also or just ui functions?
  # The first expression has no predecessor, so its previous end line is 0.
  end_line_of_previous <- lag(pd_nested$line2, default = 0)
  pd_nested$line1 - end_line_of_previous
}
|
||
#' Number of lines between cache blocks | ||
#' | ||
#' This is relevant when putting expressions together into a block and preserving | ||
#' blank lines between them. | ||
#' @param pd A top level nest. | ||
find_blank_lines_to_next_block <- function(pd) {
  # TODO everywhere: block is ambiguous. use cache block since we also have
  # block_id and other things in other places
  # A row opens a new block when its block id differs from the previous row's;
  # the default of 0 is compared against the first row's id.
  previous_block <- lag(pd$block, default = 0)
  starts_new_block <- pd$block != previous_block
  find_blank_lines_to_next_expr(pd)[starts_new_block]
}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's the semantic difference between 1 empty line, 3 empty lines (as in line 22) and 5 empty lines?