Skip to content

Commit

Permalink
refactored construct_regex
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewallenbruce committed Jul 29, 2024
1 parent c527ad1 commit ccbb163
Show file tree
Hide file tree
Showing 5 changed files with 177 additions and 105 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Imports:
rlang,
strex,
stringfish,
stringi,
stringr,
tidyr,
triebeard,
Expand Down
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ export(assign_adjustments)
export(calculate_amounts)
export(carc_add_dash)
export(construct_regex)
export(construct_regex2)
export(data_dict)
export(get_example)
export(get_pin)
Expand Down
8 changes: 8 additions & 0 deletions R/generated-globals.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ utils::globalVariables(c(
"aoc_complements",
# <calculate_amounts>
"facility",
# <id_runs>
"group",
# <id_runs>
"group_size",
# <id_runs>
"key",
# <calculate_amounts>
"nonfacility",
# <search_ptps>
Expand All @@ -17,5 +23,7 @@ utils::globalVariables(c(
"rbcs_procedure",
# <search_rbcs>
"rbcs_subcategory",
# <id_runs>
"value",
NULL
))
251 changes: 168 additions & 83 deletions R/regex.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,117 +9,148 @@
#'
#' @autoglobal
#'
# @keywords internal
#'
#' @export
construct_regex <- function(x) {

# TODO: check for equal lengths

x <- collapse::funique(
collapse::na_rm(
gsub(" ", "", x)))
gsub(" ", "", x)
)
)

x <- stringr::str_split(x, "") |>
purrr::list_transpose() |>
vecs <- stringr::str_split_fixed(
x, "",
n = max(
collapse::vlengths(x)
)
) |>
as.data.frame() |>
purrr::map(dplyr::na_if, y = "")

to_brackets <- vecs |>
purrr::map(collapse::na_rm) |>
purrr::map(collapse::funique) |>
purrr::map(pos_re) |>
purrr::list_c() |>
paste0(collapse = "")
purrr::map(pos_re)

paste0("^", x, "$")
qmark <- names(which(purrr::map_lgl(vecs, anyNA)))

}
if (!vctrs::vec_is_empty(qmark)) {
to_brackets[qmark] <- purrr::map(to_brackets[qmark], \(x) paste0(x, "?"))
}

#' Internal function for `construct_regex()`
#'
#' @param x `<chr>` vector
#'
#' @returns `<chr>` vector
#'
#' @autoglobal
#'
#' @noRd
pos_re <- function(x) {
to_vec <- to_brackets |>
purrr::map(id_runs) |>
purrr::list_c()

sorted <- stringr::str_sort(x, numeric = TRUE)
alphabet <- purrr::list_c(strex::str_extract_non_numerics(sorted))
numbers <- purrr::list_c(strex::str_extract_numbers(sorted))
if (collapse::any_duplicated(to_vec)) {

paste0("[",
fuimus::collapser(alphabet),
fuimus::collapser(numbers),
"]")
# TODO probably need to vectorize this,
# will surely have more than one unique duplicate

dupe_idx <- which(collapse::fduplicated(to_vec, all = TRUE))

rp <- paste0(to_vec[dupe_idx][1], "{", length(dupe_idx), "}")

to_vec[dupe_idx] <- rp

to_vec <- collapse::funique(to_vec)

}

x <- paste0("^", fuimus::collapser(to_vec), "$")

return(x)
}

#' Construct regex patterns
#' Internal function for `construct_regex()`
#'
#' @param x `<chr>` vector
#'
#' @examples
#' construct_regex2(search_descriptions()$hcpcs_code)
#'
#' @returns `<chr>` vector
#'
#' @autoglobal
#'
# @keywords internal
#'
#' @export
construct_regex2 <- function(x) {
#' @noRd
id_runs <- function(x) {

x <- collapse::funique(
collapse::na_rm(
gsub(" ", "", x)
)
)
vec <- c(LETTERS, 0:9)

vecs <- stringr::str_split_fixed(
x,
"",
n = max(
collapse::vlengths(x)
)
) |>
as.data.frame() |>
purrr::map(
dplyr::na_if,
y = ""
)
vec <- rlang::set_names(rep(0, length(vec)), vec)

to_brackets <- vecs |>
purrr::map(collapse::na_rm) |>
purrr::map(collapse::funique) |>
purrr::map(pos_re2)
test <- strsplit(x, "")[[1]]

qmark <- names(
which(
purrr::map_lgl(vecs, anyNA)
)
)
vecna <- vec[test]

if (!vctrs::vec_is_empty(qmark)) {
to_brackets[qmark] <- purrr::map(
to_brackets[qmark],
\(x) paste0(x, "?")
)
}
vecna <- vecna[!is.na(vecna)]

to_vec <- to_brackets |>
purrr::list_c() |>
paste0(collapse = "")
vec[names(vecna)] <- 1

vec_group <- dplyr::tibble(
value = names(vec),
key = vec,
idx = 1:length(vec),
group = dplyr::consecutive_id(key)
) |>
dplyr::mutate(
group_size = dplyr::n(),
.by = group
) |>
dplyr::filter(
key == 1,
group_size >= 3
) |>
dplyr::select(
value,
group
)

x <- paste0("^", to_vec, "$")
if (vctrs::vec_is_empty(vec_group)) return(x)

x <- gsub(paste0(0:9, collapse = ""), "0-9", x)
xgroups <- unname(
split(
vec_group,
vec_group$group
)
) |>
purrr::map(
purrr::pluck("value")
) |>
purrr::map(
paste0,
collapse = ""
) |>
purrr::list_c()

replacements <- dplyr::left_join(
dplyr::slice_min(
vec_group,
by = group,
order_by = value
) |>
dplyr::rename(start = value),
dplyr::slice_max(
vec_group,
by = group,
order_by = value
) |>
dplyr::rename(end = value),
by = dplyr::join_by(group)
) |>
glue::glue_data(
"{start}-{end}"
) |>
as.vector()

x <- gsub(paste0(LETTERS, collapse = ""), "A-Z", x)
res <- stringi::stri_replace_all_regex(
x,
xgroups,
replacements,
vectorize_all = FALSE)

return(x)
paste0("[", res, "]")
}

#' Internal function for `construct_regex2()`
#' Internal function for `construct_regex()`
#'
#' @param x `<chr>` vector
#'
Expand All @@ -128,17 +159,16 @@ construct_regex2 <- function(x) {
#' @autoglobal
#'
#' @noRd
pos_re2 <- function(x) {
pos_re <- function(x) {

sorted <- stringr::str_sort(x, numeric = TRUE)
alphabet <- purrr::list_c(strex::str_extract_non_numerics(sorted))
numbers <- purrr::list_c(strex::str_extract_numbers(sorted))

paste0("[",
fuimus::collapser(alphabet),
fuimus::collapser(numbers),
"]"
)
paste0(
fuimus::collapser(alphabet),
fuimus::collapser(numbers)
)

}

Expand All @@ -162,3 +192,58 @@ pos_nchar <- function(x) {
)

}

#' Construct regex patterns
#'
#' @param x `<chr>` vector
#'
#' @examples
#' construct_regex_old(search_descriptions()$hcpcs_code)
#'
#' @returns `<chr>` vector
#'
#' @autoglobal
#'
#' @keywords internal
#'
#' @noRd
construct_regex_old <- function(x) {

# TODO: check for equal lengths

x <- collapse::funique(
collapse::na_rm(
gsub(" ", "", x)))

x <- stringr::str_split(x, "") |>
purrr::list_transpose() |>
purrr::map(collapse::funique) |>
purrr::map(pos_re_old) |>
purrr::list_c() |>
paste0(collapse = "")

paste0("^", x, "$")

}

#' Internal function for `construct_regex_old()`
#'
#' @param x `<chr>` vector
#'
#' @returns `<chr>` vector
#'
#' @autoglobal
#'
#' @noRd
pos_re_old <- function(x) {

sorted <- stringr::str_sort(x, numeric = TRUE)
alphabet <- purrr::list_c(strex::str_extract_non_numerics(sorted))
numbers <- purrr::list_c(strex::str_extract_numbers(sorted))

paste0("[",
fuimus::collapser(alphabet),
fuimus::collapser(numbers),
"]")

}
21 changes: 0 additions & 21 deletions man/construct_regex2.Rd

This file was deleted.

0 comments on commit ccbb163

Please sign in to comment.