From 51a67334e20857faf31d266d0ba80b1d5e581623 Mon Sep 17 00:00:00 2001 From: kyle-messier Date: Tue, 1 Oct 2024 23:39:06 -0400 Subject: [PATCH] _targets.R, test and run bash scripts --- _targets.R | 209 ++++++++++++++------------- container/run_container_dl_calc.sh | 10 +- container/run_dl_calc_local_tests.sh | 4 +- 3 files changed, 115 insertions(+), 108 deletions(-) diff --git a/_targets.R b/_targets.R index e90e030..1cc6de4 100755 --- a/_targets.R +++ b/_targets.R @@ -1,123 +1,80 @@ library(targets) library(tarchetypes) -library(future) -library(future.batchtools) library(dplyr) -library( - beethoven, - lib.loc = "/ddn/gs1/home/manwareme/R/x86_64-pc-linux-gnu-library/4.3" -) -library(tidymodels) -library(bonsai) -# library( -# torch, -# lib.loc = "/ddn/gs1/biotools/R/lib64/R/library" -# ) +library(crew) +library(future) +library(beethoven) +library(amadeus) -Sys.setenv("LD_LIBRARY_PATH" = paste("/ddn/gs1/biotools/R/lib64/R/customlib", Sys.getenv("LD_LIBRARY_PATH"), sep = ":")) -# replacing yaml file. +# targets store location corresponds to _targets/ in the root of the project tar_config_set( - store = "/ddn/gs1/home/manwareme/beethoven/beethoven_targets" + store = "/opt/_targets" +) + +# crew controllers +# For now, one is set, but we can explore the use of multiple controllers +# Can also explore making the workers input for bash script or Rscript +geo_controller <- crew_controller_local( + name = "geo_controller", + workers = 16L, + launch_max = 8L, + seconds_idle = 120 ) -# maximum future exportable object size is set 50GB -# TODO: the maximum size error did not appear until recently -# and suddenly appeared. Need to investigate the cause. -# Should be removed after the investigation. 
-# options(future.globals.maxSize = 50 * 2^30) -options(future.globals.maxSize = 60 * 1024^3) # 60 GiB -generate_list_download <- FALSE +# Setting up the NASA Earthdata token inside the container +# This needs to be tested +if (!nzchar(Sys.getenv("NASA_EARTHDATA_TOKEN"))){ + tar_source("/mnt/NASA_token_setup.R") + file.exists(".netrc") + file.exists(".urs_cookies") + file.exists(".dodsrc") +} + arglist_download <- set_args_download( char_period = c("2018-01-01", "2022-12-31"), - char_input_dir = "input", - nasa_earth_data_token = NULL,#Sys.getenv("NASA_EARTHDATA_TOKEN"), - mod06_filelist = "inst/targets/mod06_links_2018_2022.csv", - export = generate_list_download, - path_export = "inst/targets/download_spec.qs" + char_input_dir = "/input", + nasa_earth_data_token = Sys.getenv("NASA_EARTHDATA_TOKEN"), + mod06_filelist = "/pipeline/targets/mod06_links_2018_2022.csv", + export = TRUE, + path_export = "/pipeline/targets/download_spec.qs" ) -generate_list_calc <- FALSE -arglist_common <- - set_args_calc( - char_siteid = "site_id", - char_timeid = "time", - char_period = c("2018-01-01", "2022-12-31"), - num_extent = c(-126, -62, 22, 52), - char_user_email = paste0(Sys.getenv("USER"), "@nih.gov"), - export = generate_list_calc, - path_export = "inst/targets/calc_spec.qs", - char_input_dir = "/ddn/gs1/group/set/Projects/NRT-AP-Model/input" - ) -tar_source("inst/targets/targets_initialize.R") -tar_source("inst/targets/targets_download.R") -tar_source("inst/targets/targets_calculate_fit.R") -tar_source("inst/targets/targets_calculate_predict.R") -tar_source("inst/targets/targets_baselearner.R") -tar_source("inst/targets/targets_metalearner.R") -tar_source("inst/targets/targets_predict.R") -# bypass option -Sys.setenv("BTV_DOWNLOAD_PASS" = "TRUE") -# -# bind custom built GDAL -# Users should export the right path to the GDAL library -# by export LD_LIBRARY_PATH=.... command. 
+### NOTE: It is important to source the scripts after the global variables are defined from the set_args functions + #tar_source("/pipeline/targets/targets_aqs.R") + tar_source("/pipeline/targets/targets_download.R") -# arglist_common is generated above -plan( - list( - tweak( - future.batchtools::batchtools_slurm, - template = "inst/targets/template_slurm.tmpl", - resources = - list( - memory = 8, - log.file = "slurm_run.log", - ncpus = 1, partition = "geo", ntasks = 1, - email = arglist_common$char_user_email, - error.file = "slurm_error.log" - ) - ), - multicore - ) -) +# Toy test files - note we will not have functions defined like this directly in +# the _targets.R file +my_fun_a <- function(n) { + rnorm(n) +} + +my_fun_b <- function(x) { + x^2 +} -# # invalidate any nodes older than 180 days: force running the pipeline -# tar_invalidate(any_of(tar_older(Sys.time() - as.difftime(180, units = "days")))) -# # nullify download target if bypass option is set -if (Sys.getenv("BTV_DOWNLOAD_PASS") == "TRUE") { - target_download <- NULL -} -# targets options -# For GPU support, users should be aware of setting environment -# variables and GPU versions of the packages. 
-# TODO: check if the controller and resources setting are required tar_option_set( - packages = c( - "beethoven", "amadeus", "chopin", "targets", "tarchetypes", - "data.table", "sf", "terra", "exactextractr", - #"crew", "crew.cluster", - "tigris", "dplyr", - "future.batchtools", "qs", "collapse", "bonsai", - "tidymodels", "tune", "rsample", "torch", "brulee", - "glmnet", "xgboost", - "future", "future.apply", "future.callr", "callr", - "stars", "rlang", "parallelly" - ), - library = c("/ddn/gs1/group/set/isong-archive/r-libs"), - repository = "local", + packages = + c( "amadeus", "targets", "tarchetypes", + "data.table", "sf", "terra", "exactextractr", + "dplyr", "qs", "callr", "stars", "rlang"), + controller = crew_controller_group(geo_controller), + resources = tar_resources( + crew = tar_resources_crew(controller = "geo_controller") + ), error = "abridge", memory = "transient", format = "qs", @@ -127,15 +84,65 @@ tar_option_set( seed = 202401L ) -# should run tar_make_future() + list( + tar_target(name = A, command = my_fun_a(100)), + tar_target(name = B, command = my_fun_b(A), pattern = A), + tar_target(name = save_input, command = saveRDS(B, "/input/input.rds")), + tar_target( # Test download data with amadeus + download_test, + amadeus::download_narr( + variables = c("weasd", "omega"), + year = c(2023, 2023), + directory_to_save = "/input/narr_monolevel", + acknowledgement = TRUE, + download = TRUE, + remove_command = TRUE + ) + ), + target_download + ) + + +# Style below that uses sources scripts for targets by pipeline step +# Note that variables created in _targets.R are in the same local +# environment as the sourced scripts + +# list( +# target_init, +# target_download + # target_calculate_fit, + # target_baselearner#, + # target_metalearner, + # target_calculate_predict, + # target_predict, + # # documents and summary statistics + # targets::tar_target( + # summary_urban_rural, + # summary_prediction( + # grid_filled, + # level = "point", + # contrast 
= "urbanrural")) + # , + # targets::tar_target( + # summary_state, + # summary_prediction( + # grid_filled, + # level = "point", + # contrast = "state" + # ) + # ) +# ) + +# targets::tar_visnetwork(targets_only = TRUE) +# END OF FILE -list( - target_init, - target_download, - target_calculate_fit, - target_baselearner, - target_metalearner, - target_calculate_predict#, +# list( +# target_init, +# target_download, +# target_calculate_fit, +# target_baselearner, +# target_metalearner, +# target_calculate_predict#, # target_predict, # # documents and summary statistics # targets::tar_target( diff --git a/container/run_container_dl_calc.sh b/container/run_container_dl_calc.sh index 49e7eb0..5d0909b 100644 --- a/container/run_container_dl_calc.sh +++ b/container/run_container_dl_calc.sh @@ -1,17 +1,17 @@ #!/bin/bash -#SBATCH --job-name=beethoven_001 +#SBATCH --job-name=download_calc #SBATCH --partition=geo #SBATCH --mem=128G #SBATCH --cpus-per-task=4 #SBATCH --ntasks=16 -#SBATCH --output=slurm_messages/slurm-%j.out -#SBATCH --error=slurm_messages/slurm-%j.err +#SBATCH --output=../slurm_messages/slurm-%j.out +#SBATCH --error=../slurm_messages/slurm-%j.err #SBATCH --mail-user=kyle.messier@nih.gov #SBATCH --mail-type=ALL - - +# Run the container +# .sif file sites in "root/container", thus we need to go up one level with bind mounts apptainer exec \ --bind $PWD/inst:/pipeline \ --bind $PWD/input:/input \ diff --git a/container/run_dl_calc_local_tests.sh b/container/run_dl_calc_local_tests.sh index 9404e8a..72c81a2 100644 --- a/container/run_dl_calc_local_tests.sh +++ b/container/run_dl_calc_local_tests.sh @@ -4,8 +4,8 @@ #SBATCH --mem=128G #SBATCH --cpus-per-task=4 #SBATCH --ntasks=16 -#SBATCH --output=slurm_messages/slurm-%j.out -#SBATCH --error=slurm_messages/slurm-%j.err +#SBATCH --output=../slurm_messages/slurm-%j.out +#SBATCH --error=../slurm_messages/slurm-%j.err #SBATCH --mail-user=kyle.messier@nih.gov #SBATCH --mail-type=ALL