Skip to content

Commit

Permalink
Merge pull request #20 from Robinlovelace/get-v1-data
Browse files Browse the repository at this point in the history
Get v1 data - the magic get functions for od
  • Loading branch information
Robinlovelace authored Aug 12, 2024
2 parents 2ce6f68 + 82d6001 commit 16bcf7b
Show file tree
Hide file tree
Showing 32 changed files with 1,512 additions and 262 deletions.
3 changes: 3 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,6 @@ README.qmd
^.*\.Rproj$
^\.Rproj\.user$
^private$
^doc$
^Meta$
^vignettes/*_files$
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,7 @@ private

/.quarto/
.Rproj.user
inst/doc
.Renviron
/doc/
/Meta/
7 changes: 6 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -23,20 +23,25 @@ Depends:
Imports:
curl,
DBI,
dplyr,
duckdb,
fs,
glue,
lubridate,
parallelly,
purrr,
readr,
rlang (>= 1.1.0),
sf,
stats,
stringr,
tibble,
xml2
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.2
Suggests:
quarto,
testthat (>= 3.0.0)
Config/testthat/edition: 3
VignetteBuilder:
quarto
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
# Generated by roxygen2: do not edit by hand

export(spod_available_data_v1)
export(spod_convert_od_v1_to_duckdb)
export(spod_download_data)
export(spod_get)
export(spod_get_data_dir)
export(spod_get_latest_v1_file_list)
export(spod_get_latest_v2_xml)
export(spod_get_metadata)
export(spod_get_od_v1)
export(spod_get_zones)
export(spod_get_zones_v1)
92 changes: 92 additions & 0 deletions R/convert_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#' Convert all downloaded v1 origin-destination data to duckdb
#'
#' @param save_dir The path to the directory where the duckdb files will be saved. If `NULL`, uses the default location in `data_dir` (set by the `SPANISH_OD_DATA_DIR` environment variable). Therefore, the default relative path is `<data_dir>/clean_data/v1/tabular/duckdb/od_<zones>.duckdb`.
#' @inheritParams spod_get_zones_v1
#' @inheritParams spod_duckdb_limit_resources
#' @param overwrite Logical. If `TRUE`, overwrites existing duckdb files. Defaults to `FALSE`.
#' @return Path to saved DuckDB file.
#' @export
spod_convert_od_v1_to_duckdb <- function(
    zones = c(
      "districts", "dist", "distr", "distritos",
      "municipalities", "muni", "municip", "municipios"
    ),
    data_dir = spod_get_data_dir(),
    save_dir = NULL,
    quiet = FALSE,
    duck_max_mem = 3,
    duck_max_threads = parallelly::availableCores(),
    overwrite = FALSE) {
  zones <- match.arg(zones)
  zones <- spod_zone_names_en2es(zones)

  # if save_dir is NULL, use default location in data_dir
  if (is.null(save_dir)) {
    save_dir <- fs::path(
      data_dir,
      spod_subfolder_clean_data_cache(ver = 1),
      "tabular/duckdb/"
    )
  }

  # ensure save_dir exists
  if (!fs::dir_exists(save_dir)) fs::dir_create(save_dir, recurse = TRUE)

  # create duckdb save path
  duckdb_save_path <- glue::glue("{save_dir}/od_{zones}.duckdb")

  # If a duckdb file is already present, either confirm interactively
  # (overwrite = FALSE) or delete it outright (overwrite = TRUE). The file
  # must be deleted before connecting, otherwise `CREATE TABLE od` below
  # would fail on the pre-existing table.
  # NOTE: use scalar `&&` in `if` conditions, not vectorized `&`.
  if (fs::file_exists(duckdb_save_path)) {
    if (isFALSE(overwrite)) {
      message("Duckdb file already exists: ", duckdb_save_path)
      # in future, perhaps add code that provides a summary of what's inside that file
      # ask user if they want to overwrite
      response <- readline(prompt = "Overwrite existing duckdb file? (yes/no) ")
      # in non-interactive sessions readline() returns "", which safely
      # resolves to "do not overwrite"
      overwrite <- any(tolower(response) %in% c("y", "yes", "yes."))
      if (!overwrite) {
        message(glue::glue("Exiting without overwriting existing duckdb file. You may delete it from {duckdb_save_path} manually and rerun the function. Or rerun it with `overwrite = TRUE`."))
        return(invisible(NULL))
      }
    }
    if (isFALSE(quiet)) message(glue::glue("Overwriting existing duckdb file: {duckdb_save_path}"))
    fs::file_delete(duckdb_save_path)
  }

  if (isFALSE(quiet)) message(glue::glue("Using {duck_max_mem} GB of memory and {duck_max_threads} threads. You may adjust this using the function arguments `duck_max_mem` and `duck_max_threads`."))
  if (isFALSE(quiet)) message(glue::glue("Converting cached v1 od data for {zones} to DuckDB: {duckdb_save_path}... This may take a while."))
  # add some indication on how long it may take from empirical experimentation
  # hopefully, the progress_bar feature will be implemented in duckdb R package soon, bug filed here https://github.com/duckdb/duckdb-r/issues/199

  # get dates of cached data
  # v1_meta <- spod_available_data_v1(check_local_files = TRUE)

  # v1_meta <- v1_meta[v1_meta$downloaded == TRUE,]
  # v1_meta <- v1_meta[grepl("maestra1", v1_meta$local_path),]
  # v1_meta <- v1_meta[grepl(zones, v1_meta$local_path),]

  # dates <- v1_meta$data_ymd

  # create duckdb connection
  drv <- duckdb::duckdb()
  con <- DBI::dbConnect(drv, dbdir = duckdb_save_path, read_only = FALSE)

  # define memory and threads limits
  con <- spod_duckdb_limit_resources(
    con = con,
    duck_max_mem = duck_max_mem,
    duck_max_threads = duck_max_threads
  )

  # connect to folder of CSVs with v1 od data
  con <- spod_duckdb_od_v1(con = con, zones = zones)
  # DBI::dbListTables(con)

  # import view of CSV files into duckdb
  DBI::dbExecute(con, "CREATE TABLE od AS SELECT * FROM od_csv_clean ;")

  DBI::dbDisconnect(con, shutdown = TRUE)
  duckdb::duckdb_shutdown(drv)

  message("Cached v1 origin-destination data imported to DuckDB at: ", duckdb_save_path)

  return(duckdb_save_path)
}
98 changes: 54 additions & 44 deletions R/download_data.R
Original file line number Diff line number Diff line change
@@ -1,91 +1,101 @@
#' Download the data files of specified type, zones, and dates
#'
#'
#' This function downloads the data files of the specified type, zones, dates and data version.
#' @param type The type of data to download. Can be `"origin-destination"` (or just `"od"`), or `"trips_per_person"` (or just `"tpp"`) for v1 data. For v2 data `"overnight_stays"` (or just `"os"`) is also available. More data types to be supported in the future. See respective codebooks for more information. **ADD CODEBOOKS! to the package**
#' @param zones The zones for which to download the data. Can be `"districts"` (or `"dist"`, `"distr"`) or `"municipalities"` (or `"muni"`, `"municip"`) for v1 data. Additionaly, these can be `"large_urban_areas"` (or `"lau"`) for v2 data.
#' @param zones The zones for which to download the data. Can be `"districts"` (or `"dist"`, `"distr"`, or the original Spanish `"distritos"`) or `"municipalities"` (or `"muni"`, `"municip"`, or the original Spanish `"municipios"`). Additionally, these can be `"large_urban_areas"` (or `"lau"`, or the original Spanish `"grandes_areas_urbanas"`, or `"gau"`) for v2 data.
#' @inheritParams spod_dates_argument_to_dates_seq
#' @param data_dir The directory where the data is stored. Defaults to the value returned by `spod_get_data_dir()` which returns the value of the environment variable `SPANISH_OD_DATA_DIR` or a temporary directory if the variable is not set.
#' @param quiet Logical. If `TRUE`, the function does not print messages to the console. Defaults to `FALSE`.
#' @param return_output Logical. If `TRUE`, the function returns a character vector of the paths to the downloaded files. If `FALSE`, the function returns `NULL`.
#'
#'
#' @return A character vector of the paths to the downloaded files. Unless `return_output = FALSE`, in which case the function returns `NULL`.
#'
#' @export
#'
#' @export
#' @examples
#' \dontrun{
#' # Download the origin-destination on district level for the a date range in March 2020
#' spod_download_data(type = "od", zones = "districts",
#' date_range = c("2020-03-20", "2020-03-24"))
#'
#' spod_download_data(
#' type = "od", zones = "districts",
#' date_range = c("2020-03-20", "2020-03-24")
#' )
#'
#' # Download the origin-destination on district level for select dates in 2020 and 2021
#' spod_download_data(type = "od", zones = "dist",
#' dates_list = c("2020-03-20", "2020-03-24", "2021-03-20", "2021-03-24"))
#'
#' # Download the origin-destination on municipality level using regex for a date range in March 2020
#' spod_download_data(
#' type = "od", zones = "dist",
#' dates_list = c("2020-03-20", "2020-03-24", "2021-03-20", "2021-03-24")
#' )
#'
#' # Download the origin-destination on municipality level using regex for a date range in March 2020
#' # (the regex will capture the dates 2020-03-20 to 2020-03-24)
#' spod_download_data(type = "od", zones = "municip",
#' date_regex = "2020032[0-4]")
#' spod_download_data(
#' type = "od", zones = "municip",
#' date_regex = "2020032[0-4]"
#' )
#' }
spod_download_data <- function(
type = c(
"od", "origin-destination",
"os", "overnight_stays",
"tpp", "trips_per_person"),
zones = c("districts", "dist", "distr",
"municipalities", "muni", "municip",
"lau", "large_urban_areas"), # implement "urban_areas" for v2 data
dates = NULL,
data_dir = spod_get_data_dir(),
quiet = FALSE,
return_output = TRUE
) {
type = c(
"od", "origin-destination",
"os", "overnight_stays",
"tpp", "trips_per_person"
),
zones = c(
"districts", "dist", "distr", "distritos",
"municipalities", "muni", "municip", "municipios",
"lau", "large_urban_areas", "gau", "grandes_areas_urbanas"
), # implement "urban_areas" for v2 data
dates = NULL,
data_dir = spod_get_data_dir(),
quiet = FALSE,
return_output = TRUE) {
# convert english zone names to spanish words used in the default data paths
zones <- match.arg(zones)
zones <- spod_zone_names_en2es(zones)

# this is where the date arguments are processed
# for all the wrapper functions that use the spod_download_data() function the dates are also processed here

dates_to_use <- spod_dates_argument_to_dates_seq(dates = dates)


# check version
# replace this argument with automatic version detection based on the dates requested?
ver <- spod_infer_data_v_from_dates(dates_to_use) # this leads to a second call to an internal spod_get_valid_dates() which in turn causes a second call to spod_available_data_v1() or spod_get_metadata(). This results in reading the xml files with metadata for the second time. This is not optimal and should be fixed.
ver <- spod_infer_data_v_from_dates(dates_to_use) # this leads to a second call to an internal spod_get_valid_dates() which in turn causes a second call to spod_available_data_v1() or spod_get_metadata(). This results in reading the xml files with metadata for the second time. This is not optimal and should be fixed.
if (isFALSE(quiet)) message("Data version detected from dates: ", ver)

# convert english data type names to spanish words used in the default data paths
type <- match.arg(type)
type <- spod_match_data_type(type = type, ver = ver)



# get the available data list while checking for files already cached on disk
if( ver == 1) {
metadata <- spod_available_data_v1(data_dir = data_dir,
check_local_files = TRUE)
if (ver == 1) {
metadata <- spod_available_data_v1(
data_dir = data_dir,
check_local_files = TRUE
)
} else if (ver == 2) {
metadata <- spod_get_metadata(data_dir = data_dir)
# replace with spod_available_data_v2() when available, spod_get_metadata can become a wrapper with v1/v2 argument. Potentially we can even automatically detect the data version based on the time intervals that user requests, but this is a bit controversial, as the methodology behind v1 and v2 data generation is not the same and Nommon+MITMA do not recommend mixing those together and comparing absolute numbers of trips.
}

# match the metadata to type, zones, version and dates
if(ver == 1){
if (ver == 1) {
requested_files <- metadata[
grepl(glue::glue("v{ver}.*{type}.*{zones}"), metadata$local_path) &
metadata$data_ymd %in% dates_to_use,
metadata$data_ymd %in% dates_to_use,
]
} else if(ver == 2){
} else if (ver == 2) {
requested_files <- metadata[
grepl(glue::glue("v{ver}.*{zones}.*{type}"), metadata$local_path) &
metadata$data_ymd %in% dates_to_use,
metadata$data_ymd %in% dates_to_use,
]
}

files_to_download <- requested_files[!requested_files$downloaded, ]

# pre-generate target paths for the files to download
fs::dir_create(
unique(fs::path_dir(files_to_download$local_path)),
recurse = TRUE)
recurse = TRUE
)

# download the missing files
downloaded_files <- curl::multi_download(
Expand Down
Loading

0 comments on commit 16bcf7b

Please sign in to comment.