feat tox provide all data and tests improved [skipci]

c1au6i0 · Dec 31, 2024 · 050c765 · 050c765
1 parent f544b1d
commit 050c765
Show file tree

Hide file tree

Showing 32 changed files with 3,832 additions and 340 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -15,3 +15,4 @@ export(extr_tetramer)
 export(extr_tox)
 export(with_extr_sandbox)
 export(write_dataframes_to_excel)
+importFrom(cli,cli_warn)
diff --git a/R/cache.R b/R/cache.R
@@ -13,7 +13,6 @@
 #' The function will overwrite any existing file with the same name.
 #' @noRd
 save_to_cache <- function(obj, file_name, verbose = FALSE) {
-
   # Sys.getenv("R_USER_CACHE_DIR")
 
   cache_dir <- tools::R_user_dir("extractox", which = "cache")

diff --git a/R/checks.R b/R/checks.R
@@ -1,5 +1,5 @@
-' Check for NA values in a specific column of a dataframe
-#'
+" Check for NA values in a specific column of a dataframe
+#"
 #' Checks for NA values in a specified column of a dataframe and optionally warns if any are found.
 #'
 #' @param dat A dataframe that contains the data.
@@ -9,7 +9,6 @@
 #' @keywords internal
 #' @noRd
 check_na_warn <- function(dat, col_to_check, verbose = TRUE) {
-
   ids_not_found <- dat$query[is.na(dat[[col_to_check]])]
 
   if (all(isTRUE(verbose), length(ids_not_found) != 0)) {

diff --git a/R/extr_comptox.R b/R/extr_comptox.R
@@ -187,7 +187,7 @@ extr_comptox <- function(ids,
   sheet_names <- readxl::excel_sheets(xlsx_file)
 
   suppressMessages(
-  dat_list <- lapply(sheet_names, readxl::read_excel, path = xlsx_file)
+    dat_list <- lapply(sheet_names, readxl::read_excel, path = xlsx_file)
   )
 
   names(dat_list) <- paste0("comptox_", gsub(" ", "_", tolower(sheet_names)))

diff --git a/R/extr_ctd.R b/R/extr_ctd.R
@@ -80,8 +80,10 @@ extr_ctd <- function(
   check_internet(verbose = verbose)
 
 
-  col_names <- c("chemical_name", "chemical_id", "casrn", "gene_symbol",
-                 "gene_id",   "organism", "organism_id", "pubmed_ids", "query")
+  col_names <- c(
+    "chemical_name", "chemical_id", "casrn", "gene_symbol",
+    "gene_id", "organism", "organism_id", "pubmed_ids", "query"
+  )
 
   params <- list(
     inputType = category,
@@ -145,11 +147,10 @@ extr_ctd <- function(
   # Lets clean up
   out$query <- gsub(" \\[Object not found\\]", "", out$query)
 
-  check_na_warn(dat = out,col_to_check = "gene_id", verbose = verbose)
+  check_na_warn(dat = out, col_to_check = "gene_id", verbose = verbose)
 
   unlink(csv_file)
   out
-
 }
 
 #' Extract Tetramer Data from the CTD API
@@ -198,8 +199,6 @@ extr_tetramer <- function(
     verify_ssl = FALSE,
     verbose = TRUE,
     ...) {
-
-
   if (missing(chem)) {
     cli::cli_abort("The argument {.field {chem}} is required.")
   }
@@ -234,7 +233,7 @@ extr_tetramer <- function(
     )
   }
 
-  check_na_warn(dat = out,col_to_check = "gene_id", verbose = verbose)
+  check_na_warn(dat = out, col_to_check = "gene_id", verbose = verbose)
 
   out
 }
@@ -254,7 +253,6 @@ extr_tetramer_ <- function(
     verify_ssl = FALSE,
     verbose = verbose,
     ...) {
-
   # Define the base URL
   base_url <- "https://ctdbase.org/query.go"
 
@@ -330,5 +328,3 @@ extr_tetramer_ <- function(
 
   out
 }
-
-
diff --git a/R/extr_ice.R b/R/extr_ice.R
@@ -60,11 +60,13 @@ extr_ice <- function(casrn,
   check_status_code(resp, verbose = verbose)
 
   # This is used in case no results are retrieved in next chunk
-  col_names <- c("assay", "endpoint", "substance_type", "casrn", "qsar_ready_id",
-                 "value", "unit", "species", "receptor_species", "route", "sex",
-                 "strain", "life_stage", "tissue", "lesion", "location",
-                 "assay_source", "in_vitro_assay_format", "reference",
-                 "reference_url", "dtxsid", "substance_name", "pubmed_id")
+  col_names <- c(
+    "assay", "endpoint", "substance_type", "casrn", "qsar_ready_id",
+    "value", "unit", "species", "receptor_species", "route", "sex",
+    "strain", "life_stage", "tissue", "lesion", "location",
+    "assay_source", "in_vitro_assay_format", "reference",
+    "reference_url", "dtxsid", "substance_name", "pubmed_id"
+  )
 
   out <- stats::setNames(as.data.frame(matrix(ncol = length(col_names), nrow = 0)), col_names)
 
@@ -78,7 +80,7 @@ extr_ice <- function(casrn,
     },
     error = function(e) {
       if (grepl("Unexpected content type \"text/plain\"", e$message)) {
-        if(isTRUE(verbose)){
+        if (isTRUE(verbose)) {
           cli::cli_warn("It seems that the ids were not found in ICE:
                         {conditionMessage(e)}")
         }
@@ -112,14 +114,16 @@ extr_ice <- function(casrn,
   ids_founds <- casrn[casrn %in% out$casrn]
 
   if (nrow(out) > 0) {
-
     out[, "query"] <- paste(ids_founds, collapse = ", ")
-    to_add <- stats::setNames(data.frame(matrix(ncol = ncol(out),
-                                                nrow = length(ids_not_found))),
-                              names(out))
+    to_add <- stats::setNames(
+      data.frame(matrix(
+        ncol = ncol(out),
+        nrow = length(ids_not_found)
+      )),
+      names(out)
+    )
     to_add$query <- ids_not_found
     out <- rbind(to_add, out)
-
   } else {
     out[1:length(casrn), "query"] <- casrn
   }

diff --git a/R/extr_iris.R b/R/extr_iris.R
@@ -15,12 +15,11 @@
 #' extr_iris(c("1332-21-4", "50-00-0"))
 #' }
 extr_iris <- function(casrn = NULL, verbose = TRUE) {
+  cancer_types <- c("non_cancer", "cancer")
 
-   cancer_types <-  c("non_cancer", "cancer")
-
-   if (missing(casrn)) {
-     cli::cli_abort("The argument {.field {casrn}} is required.")
-   }
+  if (missing(casrn)) {
+    cli::cli_abort("The argument {.field {casrn}} is required.")
+  }
 
   # Check if online
   check_internet(verbose = verbose)

diff --git a/R/extr_monograph.R b/R/extr_monograph.R
@@ -5,19 +5,19 @@
 #'
 #' @param search_type A character string specifying the type of search to
 #'   perform. Valid options are "cas_rn" (CAS Registry Number) and "name"
-#'.  (name of the chemical). If `search_type` is "casrn", the function filters
-#'.  by the CAS Registry Number.
+#' .  (name of the chemical). If `search_type` is "casrn", the function filters
+#' .  by the CAS Registry Number.
 #'   If `search_type` is "name", the function performs a partial match search
 #'   for the chemical name.
 #' @param ids A character vector of IDs to search for.
 #' @param verbose A logical value indicating whether to print detailed messages.
-#'.   Default is TRUE.
+#' .   Default is TRUE.
 #' @param get_all Logical. If TRUE ignore all the other ignore `ids`,
 #'   `search_type`, set  `force = TRUE` and get the all dataset.
 #'   This is was introduced for debugging purposes.
 #' @return A data frame containing the relevant information from the WHO IARC,
-#'.  including Monograph `volume`, `volume_publication_year`, `evaluation_year`,
-#'.  and `additional_information` where the chemical was described.
+#' .  including Monograph `volume`, `volume_publication_year`, `evaluation_year`,
+#' .  and `additional_information` where the chemical was described.
 #' @seealso \url{https://monographs.iarc.who.int/list-of-classifications/}
 #' @export
 #' @examples
@@ -26,17 +26,17 @@
 #'   str(dat)
 #'
 #'   # Example usage for name search
-#'   dat2 <- extr_monograph(search_type = "name",
-#'                           ids = c("Aloe", "Schistosoma",
-#'.                         "Styrene")
-#'.  )
+#'   dat2 <- extr_monograph(
+#'     search_type = "name",
+#'     ids = c("Aloe", "Schistosoma", "Styrene")
+#'   )
 #'   str(dat2)
 #' }
 extr_monograph <- function(ids,
-                           search_type ="casrn",
+                           search_type = "casrn",
                            verbose = TRUE,
                            get_all = FALSE) {
-  if(isTRUE(get_all)) {
+  if (isTRUE(get_all)) {
     return(who_iarc_monographs)
   }
 
@@ -54,16 +54,17 @@ extr_monograph <- function(ids,
     cli::cli_alert_info("Extracting WHO IARC monographs...\nLast updated: 2024-11-29 5:08pm (CET)")
   }
 
-  col_names <-  c(names(who_iarc_monographs), "query")
+  col_names <- c(names(who_iarc_monographs), "query")
 
-  out <- search_and_match(dat = who_iarc_monographs,
-                   ids = ids,
-                   search_type = search_type,
-                   col_names = col_names,
-                   chemical_col = "agent"
-                    )
+  out <- search_and_match(
+    dat = who_iarc_monographs,
+    ids = ids,
+    search_type = search_type,
+    col_names = col_names,
+    chemical_col = "agent"
+  )
 
-  check_na_warn(dat = out, col_to_check =  "agent", verbose = verbose)
+  check_na_warn(dat = out, col_to_check = "agent", verbose = verbose)
 
   out
 }
diff --git a/R/extr_pprtv.R b/R/extr_pprtv.R
@@ -31,8 +31,10 @@
 #'   extr_pprtv(ids = "107-02-8", search_type = "casrn", verbose = TRUE)
 #'
 #'   # Extract data for a chemical name
-#'   extr_pprtv(ids = "Acrolein", search_type = "name", verbose = TRUE,
-#'   force = FALSE)
+#'   extr_pprtv(
+#'     ids = "Acrolein", search_type = "name", verbose = TRUE,
+#'     force = FALSE
+#'   )
 #'
 #'   # Extract data for multiple identifiers
 #'   extr_pprtv(
@@ -45,23 +47,24 @@
 #' }
 extr_pprtv <- function(ids, search_type = "casrn", verbose = TRUE, force = TRUE,
                        get_all = FALSE) {
-
-
   if (all(missing(ids), !isTRUE(get_all))) {
     cli::cli_abort("The argument {.field ids} is required.")
   }
 
-  if (all(!search_type %in% c("casrn", "name"),!isTRUE(get_all))) {
+  if (all(!search_type %in% c("casrn", "name"), !isTRUE(get_all))) {
     cli::cli_abort("The argument {.field search_type} needs to be either `casrn`
                    or `name`.")
   }
 
   file_name <- "epa_pprtvs.rds" # Filename for caching
 
   # Check if path is present otherwise it download it again
-  full_path_cache_file <- fs::path(tools::R_user_dir("extractox",
-                                                     which = "cache"),
-                                                     file_name)
+  full_path_cache_file <- fs::path(
+    tools::R_user_dir("extractox",
+      which = "cache"
+    ),
+    file_name
+  )
 
   cache_present <- fs::file_exists(full_path_cache_file)
 
@@ -75,12 +78,14 @@ extr_pprtv <- function(ids, search_type = "casrn", verbose = TRUE, force = TRUE,
       url = "https://cfpub.epa.gov/ncea/pprtv/atoz.cfm",
       url_query_param = list(excel = "yes"),
       file_name = file_name,
-      verbose = verbose)
-
-      # get eveything
-      if (get_all == TRUE) return(dat)
-      path_cache <- save_to_cache(dat, file_name, verbose = verbose)
+      verbose = verbose
+    )
 
+    # get eveything
+    if (get_all == TRUE) {
+      return(dat)
+    }
+    path_cache <- save_to_cache(dat, file_name, verbose = verbose)
   } else {
     dat <- read_from_cache(file_name = file_name, verbose = verbose)
 
@@ -95,20 +100,21 @@ extr_pprtv <- function(ids, search_type = "casrn", verbose = TRUE, force = TRUE,
     cli::cli_alert_info("Extracting EPA PPRTVs.")
   }
 
-  col_names <- c("pprtv_substance_id", "chemical", "casrn", "last_revision",
-                 "pprtv_assessment", "iris_link", "rf_c_value", "rf_d_value",
-                 "woe", "date_downloaded", "query")
+  col_names <- c(
+    "pprtv_substance_id", "chemical", "casrn", "last_revision",
+    "pprtv_assessment", "iris_link", "rf_c_value", "rf_d_value",
+    "woe", "date_downloaded", "query"
+  )
 
   out <- search_and_match(
-                   dat = dat,
-                   ids = ids,
-                   search_type = search_type,
-                   col_names = col_names,
-                   chemical_col = "chemical"
-                   )
+    dat = dat,
+    ids = ids,
+    search_type = search_type,
+    col_names = col_names,
+    chemical_col = "chemical"
+  )
 
-  check_na_warn(dat = out, col_to_check  = "chemical", verbose = verbose)
+  check_na_warn(dat = out, col_to_check = "chemical", verbose = verbose)
 
   out
-
 }