From ccbb163c7bf3bf022820c6b365c6698538d0dd7d Mon Sep 17 00:00:00 2001 From: Andrew Bruce Date: Sun, 28 Jul 2024 22:28:15 -0700 Subject: [PATCH] refactored construct_regex --- DESCRIPTION | 1 + NAMESPACE | 1 - R/generated-globals.R | 8 ++ R/regex.R | 251 +++++++++++++++++++++++++++------------- man/construct_regex2.Rd | 21 ---- 5 files changed, 177 insertions(+), 105 deletions(-) delete mode 100644 man/construct_regex2.Rd diff --git a/DESCRIPTION b/DESCRIPTION index d23d099..12580cb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,6 +22,7 @@ Imports: rlang, strex, stringfish, + stringi, stringr, tidyr, triebeard, diff --git a/NAMESPACE b/NAMESPACE index f971ec4..399c98a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,7 +6,6 @@ export(assign_adjustments) export(calculate_amounts) export(carc_add_dash) export(construct_regex) -export(construct_regex2) export(data_dict) export(get_example) export(get_pin) diff --git a/R/generated-globals.R b/R/generated-globals.R index 5e29233..715c2f6 100644 --- a/R/generated-globals.R +++ b/R/generated-globals.R @@ -5,6 +5,12 @@ utils::globalVariables(c( "aoc_complements", # "facility", + # + "group", + # + "group_size", + # + "key", # "nonfacility", # @@ -17,5 +23,7 @@ utils::globalVariables(c( "rbcs_procedure", # "rbcs_subcategory", + # + "value", NULL )) diff --git a/R/regex.R b/R/regex.R index dac36d4..08dcb14 100644 --- a/R/regex.R +++ b/R/regex.R @@ -9,117 +9,148 @@ #' #' @autoglobal #' -# @keywords internal -#' #' @export construct_regex <- function(x) { - # TODO: check for equal lengths - x <- collapse::funique( collapse::na_rm( - gsub(" ", "", x))) + gsub(" ", "", x) + ) + ) - x <- stringr::str_split(x, "") |> - purrr::list_transpose() |> + vecs <- stringr::str_split_fixed( + x, "", + n = max( + collapse::vlengths(x) + ) + ) |> + as.data.frame() |> + purrr::map(dplyr::na_if, y = "") + + to_brackets <- vecs |> + purrr::map(collapse::na_rm) |> purrr::map(collapse::funique) |> - purrr::map(pos_re) |> - purrr::list_c() |> - paste0(collapse = "") + purrr::map(pos_re) - paste0("^", x, "$") + qmark <- names(which(purrr::map_lgl(vecs, anyNA))) -} + if (!vctrs::vec_is_empty(qmark)) { + to_brackets[qmark] <- purrr::map(to_brackets[qmark], \(x) paste0(x, "?")) + } -#' Internal function for `construct_regex()` -#' -#' @param x `` vector -#' -#' @returns `` vector -#' -#' @autoglobal -#' -#' @noRd -pos_re <- function(x) { + to_vec <- to_brackets |> + purrr::map(id_runs) |> + purrr::list_c() - sorted <- stringr::str_sort(x, numeric = TRUE) - alphabet <- purrr::list_c(strex::str_extract_non_numerics(sorted)) - numbers <- purrr::list_c(strex::str_extract_numbers(sorted)) + if (collapse::any_duplicated(to_vec)) { - paste0("[", - fuimus::collapser(alphabet), - fuimus::collapser(numbers), - "]") + # TODO probably need to vectorize this, + # will surely have more than one unique duplicate + + dupe_idx <- which(collapse::fduplicated(to_vec, all = TRUE)) + + rp <- paste0(to_vec[dupe_idx][1], "{", length(dupe_idx), "}") + + to_vec[dupe_idx] <- rp + + to_vec <- collapse::funique(to_vec) + + } + + x <- paste0("^", fuimus::collapser(to_vec), "$") + return(x) } -#' Construct regex patterns +#' Internal function for `construct_regex()` #' #' @param x `` vector #' -#' @examples -#' construct_regex2(search_descriptions()$hcpcs_code) -#' #' @returns `` vector #' #' @autoglobal #' -# @keywords internal -#' -#' @export -construct_regex2 <- function(x) { +#' @noRd +id_runs <- function(x) { - x <- collapse::funique( - collapse::na_rm( - gsub(" ", "", x) - ) - ) + vec <- c(LETTERS, 0:9) - vecs <- stringr::str_split_fixed( - x, - "", - n = max( - collapse::vlengths(x) - ) - ) |> - as.data.frame() |> - purrr::map( - dplyr::na_if, - y = "" - ) + vec <- rlang::set_names(rep(0, length(vec)), vec) - to_brackets <- vecs |> - purrr::map(collapse::na_rm) |> - purrr::map(collapse::funique) |> - purrr::map(pos_re2) + test <- strsplit(x, "")[[1]] - qmark <- names( - which( - purrr::map_lgl(vecs, anyNA) - ) - ) + vecna <- vec[test] - if (!vctrs::vec_is_empty(qmark)) { - to_brackets[qmark] <- purrr::map( - to_brackets[qmark], - \(x) paste0(x, "?") - ) - } + vecna <- vecna[!is.na(vecna)] - to_vec <- to_brackets |> - purrr::list_c() |> - paste0(collapse = "") + vec[names(vecna)] <- 1 + + vec_group <- dplyr::tibble( + value = names(vec), + key = vec, + idx = 1:length(vec), + group = dplyr::consecutive_id(key) + ) |> + dplyr::mutate( + group_size = dplyr::n(), + .by = group + ) |> + dplyr::filter( + key == 1, + group_size >= 3 + ) |> + dplyr::select( + value, + group + ) - x <- paste0("^", to_vec, "$") + if (vctrs::vec_is_empty(vec_group)) return(x) - x <- gsub(paste0(0:9, collapse = ""), "0-9", x) + xgroups <- unname( + split( + vec_group, + vec_group$group + ) + ) |> + purrr::map( + purrr::pluck("value") + ) |> + purrr::map( + paste0, + collapse = "" + ) |> + purrr::list_c() + + replacements <- dplyr::left_join( + dplyr::slice_min( + vec_group, + by = group, + order_by = value + ) |> + dplyr::rename(start = value), + dplyr::slice_max( + vec_group, + by = group, + order_by = value + ) |> + dplyr::rename(end = value), + by = dplyr::join_by(group) + ) |> + glue::glue_data( + "{start}-{end}" + ) |> + as.vector() - x <- gsub(paste0(LETTERS, collapse = ""), "A-Z", x) + res <- stringi::stri_replace_all_regex( + x, + xgroups, + replacements, + vectorize_all = FALSE) - return(x) + paste0("[", res, "]") } -#' Internal function for `construct_regex2()` +#' Internal function for `construct_regex()` #' #' @param x `` vector #' @@ -128,17 +159,16 @@ construct_regex2 <- function(x) { #' @autoglobal #' #' @noRd -pos_re2 <- function(x) { +pos_re <- function(x) { sorted <- stringr::str_sort(x, numeric = TRUE) alphabet <- purrr::list_c(strex::str_extract_non_numerics(sorted)) numbers <- purrr::list_c(strex::str_extract_numbers(sorted)) - paste0("[", - fuimus::collapser(alphabet), - fuimus::collapser(numbers), - "]" - ) + paste0( + fuimus::collapser(alphabet), + fuimus::collapser(numbers) + ) } @@ -162,3 +192,58 @@ pos_nchar <- function(x) { ) } + +#' Construct regex patterns +#' +#' @param x `` vector +#' +#' @examples +#' construct_regex_old(search_descriptions()$hcpcs_code) +#' +#' @returns `` vector +#' +#' @autoglobal +#' +#' @keywords internal +#' +#' @noRd +construct_regex_old <- function(x) { + + # TODO: check for equal lengths + + x <- collapse::funique( + collapse::na_rm( + gsub(" ", "", x))) + + x <- stringr::str_split(x, "") |> + purrr::list_transpose() |> + purrr::map(collapse::funique) |> + purrr::map(pos_re_old) |> + purrr::list_c() |> + paste0(collapse = "") + + paste0("^", x, "$") + +} + +#' Internal function for `construct_regex_old()` +#' +#' @param x `` vector +#' +#' @returns `` vector +#' +#' @autoglobal +#' +#' @noRd +pos_re_old <- function(x) { + + sorted <- stringr::str_sort(x, numeric = TRUE) + alphabet <- purrr::list_c(strex::str_extract_non_numerics(sorted)) + numbers <- purrr::list_c(strex::str_extract_numbers(sorted)) + + paste0("[", + fuimus::collapser(alphabet), + fuimus::collapser(numbers), + "]") + +} diff --git a/man/construct_regex2.Rd b/man/construct_regex2.Rd deleted file mode 100644 index a520bf3..0000000 --- a/man/construct_regex2.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/regex.R -\name{construct_regex2} -\alias{construct_regex2} -\title{Construct regex patterns} -\usage{ -construct_regex2(x) -} -\arguments{ -\item{x}{\verb{} vector} -} -\value{ -\verb{} vector -} -\description{ -Construct regex patterns -} -\examples{ -construct_regex2(search_descriptions()$hcpcs_code) - -}