Skip to content

Commit

Permalink
Merge pull request #20 from Robinlovelace/get-v1-data
Browse files Browse the repository at this point in the history
Get v1 data - the magic get functions for od
  • Loading branch information
Robinlovelace authored Aug 12, 2024
2 parents 2ce6f68 + 82d6001 commit 16bcf7b
Show file tree
Hide file tree
Showing 32 changed files with 1,512 additions and 262 deletions.
3 changes: 3 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,6 @@ README.qmd
^.*\.Rproj$
^\.Rproj\.user$
^private$
^doc$
^Meta$
^vignettes/*_files$
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,7 @@ private

/.quarto/
.Rproj.user
inst/doc
.Renviron
/doc/
/Meta/
7 changes: 6 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -23,20 +23,25 @@ Depends:
Imports:
curl,
DBI,
dplyr,
duckdb,
fs,
glue,
lubridate,
parallelly,
purrr,
readr,
rlang (>= 1.1.0),
sf,
stats,
stringr,
tibble,
xml2
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.2
Suggests:
quarto,
testthat (>= 3.0.0)
Config/testthat/edition: 3
VignetteBuilder:
quarto
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
# Generated by roxygen2: do not edit by hand

export(spod_available_data_v1)
export(spod_convert_od_v1_to_duckdb)
export(spod_download_data)
export(spod_get)
export(spod_get_data_dir)
export(spod_get_latest_v1_file_list)
export(spod_get_latest_v2_xml)
export(spod_get_metadata)
export(spod_get_od_v1)
export(spod_get_zones)
export(spod_get_zones_v1)
92 changes: 92 additions & 0 deletions R/convert_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#' Convert all downloaded v1 origin-destination data to duckdb
#'
#' @param save_dir The path to the directory where the duckdb files will be saved. If `NULL`, uses the default location in `data_dir` (set by the `SPANISH_OD_DATA_DIR` environment variable). Therefore, the default relative path is `<data_dir>/clean_data/v1/tabular/duckdb/od_<zones>.duckdb`.
#' @inheritParams spod_get_zones_v1
#' @inheritParams spod_duckdb_limit_resources
#' @param overwrite Logical. If `TRUE`, overwrites existing duckdb files. Defaults to `FALSE`.
#' @return Path to saved DuckDB file.
#' @export
spod_convert_od_v1_to_duckdb <- function(
    zones = c(
      "districts", "dist", "distr", "distritos",
      "municipalities", "muni", "municip", "municipios"
    ),
    data_dir = spod_get_data_dir(),
    save_dir = NULL,
    quiet = FALSE,
    duck_max_mem = 3,
    duck_max_threads = parallelly::availableCores(),
    overwrite = FALSE) {
  zones <- match.arg(zones)
  zones <- spod_zone_names_en2es(zones)

  # if save_dir is NULL, use default location in data_dir
  if (is.null(save_dir)) {
    save_dir <- fs::path(
      data_dir,
      spod_subfolder_clean_data_cache(ver = 1),
      "tabular/duckdb/"
    )
  }

  # ensure save_dir exists
  if (!fs::dir_exists(save_dir)) fs::dir_create(save_dir, recurse = TRUE)

  # create duckdb save path
  duckdb_save_path <- glue::glue("{save_dir}/od_{zones}.duckdb")

  # If a duckdb file is already present, either confirm interactively
  # (overwrite = FALSE) or delete it outright (overwrite = TRUE). The file
  # must be deleted before connecting, otherwise `CREATE TABLE od` below
  # would fail on the pre-existing table.
  # NOTE: use scalar `&&` in `if` conditions, not vectorized `&`.
  if (fs::file_exists(duckdb_save_path)) {
    if (isFALSE(overwrite)) {
      message("Duckdb file already exists: ", duckdb_save_path)
      # in future, perhaps add code that provides a summary of what's inside that file
      # ask user if they want to overwrite
      response <- readline(prompt = "Overwrite existing duckdb file? (yes/no) ")
      # in non-interactive sessions readline() returns "", which safely
      # resolves to "do not overwrite"
      overwrite <- any(tolower(response) %in% c("y", "yes", "yes."))
      if (!overwrite) {
        message(glue::glue("Exiting without overwriting existing duckdb file. You may delete it from {duckdb_save_path} manually and rerun the function. Or rerun it with `overwrite = TRUE`."))
        return(invisible(NULL))
      }
    }
    if (isFALSE(quiet)) message(glue::glue("Overwriting existing duckdb file: {duckdb_save_path}"))
    fs::file_delete(duckdb_save_path)
  }

  if (isFALSE(quiet)) message(glue::glue("Using {duck_max_mem} GB of memory and {duck_max_threads} threads. You may adjust this using the function arguments `duck_max_mem` and `duck_max_threads`."))
  if (isFALSE(quiet)) message(glue::glue("Converting cached v1 od data for {zones} to DuckDB: {duckdb_save_path}... This may take a while."))
  # add some indication on how long it may take from empirical experimentation
  # hopefully, the progress_bar feature will be implemented in duckdb R package soon, bug filed here https://github.com/duckdb/duckdb-r/issues/199

  # get dates of cached data
  # v1_meta <- spod_available_data_v1(check_local_files = TRUE)

  # v1_meta <- v1_meta[v1_meta$downloaded == TRUE,]
  # v1_meta <- v1_meta[grepl("maestra1", v1_meta$local_path),]
  # v1_meta <- v1_meta[grepl(zones, v1_meta$local_path),]

  # dates <- v1_meta$data_ymd

  # create duckdb connection
  drv <- duckdb::duckdb()
  con <- DBI::dbConnect(drv, dbdir = duckdb_save_path, read_only = FALSE)

  # define memory and threads limits
  con <- spod_duckdb_limit_resources(
    con = con,
    duck_max_mem = duck_max_mem,
    duck_max_threads = duck_max_threads
  )

  # connect to folder of CSVs with v1 od data
  con <- spod_duckdb_od_v1(con = con, zones = zones)
  # DBI::dbListTables(con)

  # import view of CSV files into duckdb
  DBI::dbExecute(con, "CREATE TABLE od AS SELECT * FROM od_csv_clean ;")

  DBI::dbDisconnect(con, shutdown = TRUE)
  duckdb::duckdb_shutdown(drv)

  message("Cached v1 origin-destination data imported to DuckDB at: ", duckdb_save_path)

  return(duckdb_save_path)
}
98 changes: 54 additions & 44 deletions R/download_data.R
Original file line number Diff line number Diff line change
@@ -1,91 +1,101 @@
#' Download the data files of specified type, zones, and dates
#'
#'
#' This function downloads the data files of the specified type, zones, dates and data version.
#' @param type The type of data to download. Can be `"origin-destination"` (or just `"od"`), or `"trips_per_person"` (or just `"tpp"`) for v1 data. For v2 data `"overnight_stays"` (or just `"os"`) is also available. More data types to be supported in the future. See respective codebooks for more information. **ADD CODEBOOKS! to the package**
#' @param zones The zones for which to download the data. Can be `"districts"` (or `"dist"`, `"distr"`) or `"municipalities"` (or `"muni"`, `"municip"`) for v1 data. Additionaly, these can be `"large_urban_areas"` (or `"lau"`) for v2 data.
#' @param zones The zones for which to download the data. Can be `"districts"` (or `"dist"`, `"distr"`, or the original Spanish `"distritos"`) or `"municipalities"` (or `"muni"`, `"municip"`, or the original Spanish `"municipios"`). Additionally, these can be `"large_urban_areas"` (or `"lau"`, or the original Spanish `"grandes_areas_urbanas"`, or `"gau"`) for v2 data.
#' @inheritParams spod_dates_argument_to_dates_seq
#' @param data_dir The directory where the data is stored. Defaults to the value returned by `spod_get_data_dir()` which returns the value of the environment variable `SPANISH_OD_DATA_DIR` or a temporary directory if the variable is not set.
#' @param quiet Logical. If `TRUE`, the function does not print messages to the console. Defaults to `FALSE`.
#' @param return_output Logical. If `TRUE`, the function returns a character vector of the paths to the downloaded files. If `FALSE`, the function returns `NULL`.
#'
#'
#' @return A character vector of the paths to the downloaded files. Unless `return_output = FALSE`, in which case the function returns `NULL`.
#'
#' @export
#'
#' @export
#' @examples
#' \dontrun{
#' # Download the origin-destination on district level for the a date range in March 2020
#' spod_download_data(type = "od", zones = "districts",
#' date_range = c("2020-03-20", "2020-03-24"))
#'
#' spod_download_data(
#' type = "od", zones = "districts",
#' date_range = c("2020-03-20", "2020-03-24")
#' )
#'
#' # Download the origin-destination on district level for select dates in 2020 and 2021
#' spod_download_data(type = "od", zones = "dist",
#' dates_list = c("2020-03-20", "2020-03-24", "2021-03-20", "2021-03-24"))
#'
#' # Download the origin-destination on municipality level using regex for a date range in March 2020
#' spod_download_data(
#' type = "od", zones = "dist",
#' dates_list = c("2020-03-20", "2020-03-24", "2021-03-20", "2021-03-24")
#' )
#'
#' # Download the origin-destination on municipality level using regex for a date range in March 2020
#' # (the regex will capture the dates 2020-03-20 to 2020-03-24)
#' spod_download_data(type = "od", zones = "municip",
#' date_regex = "2020032[0-4]")
#' spod_download_data(
#' type = "od", zones = "municip",
#' date_regex = "2020032[0-4]"
#' )
#' }
spod_download_data <- function(
type = c(
"od", "origin-destination",
"os", "overnight_stays",
"tpp", "trips_per_person"),
zones = c("districts", "dist", "distr",
"municipalities", "muni", "municip",
"lau", "large_urban_areas"), # implement "urban_areas" for v2 data
dates = NULL,
data_dir = spod_get_data_dir(),
quiet = FALSE,
return_output = TRUE
) {
type = c(
"od", "origin-destination",
"os", "overnight_stays",
"tpp", "trips_per_person"
),
zones = c(
"districts", "dist", "distr", "distritos",
"municipalities", "muni", "municip", "municipios",
"lau", "large_urban_areas", "gau", "grandes_areas_urbanas"
), # implement "urban_areas" for v2 data
dates = NULL,
data_dir = spod_get_data_dir(),
quiet = FALSE,
return_output = TRUE) {
# convert english zone names to spanish words used in the default data paths
zones <- match.arg(zones)
zones <- spod_zone_names_en2es(zones)

# this is where the date arguments are processed
# for all the wrapper functions that use the spod_download_data() function the dates are also processed here

dates_to_use <- spod_dates_argument_to_dates_seq(dates = dates)


# check version
# replace this argument with automatic version detection based on the dates requested?
ver <- spod_infer_data_v_from_dates(dates_to_use) # this leads to a second call to an internal spod_get_valid_dates() which in turn causes a second call to spod_available_data_v1() or spod_get_metadata(). This results in reading the xml files with metadata for the second time. This is not optimal and should be fixed.
ver <- spod_infer_data_v_from_dates(dates_to_use) # this leads to a second call to an internal spod_get_valid_dates() which in turn causes a second call to spod_available_data_v1() or spod_get_metadata(). This results in reading the xml files with metadata for the second time. This is not optimal and should be fixed.
if (isFALSE(quiet)) message("Data version detected from dates: ", ver)

# convert english data type names to spanish words used in the default data paths
type <- match.arg(type)
type <- spod_match_data_type(type = type, ver = ver)



# get the available data list while checking for files already cached on disk
if( ver == 1) {
metadata <- spod_available_data_v1(data_dir = data_dir,
check_local_files = TRUE)
if (ver == 1) {
metadata <- spod_available_data_v1(
data_dir = data_dir,
check_local_files = TRUE
)
} else if (ver == 2) {
metadata <- spod_get_metadata(data_dir = data_dir)
# replace with spod_available_data_v2() when available, spod_get_metadata can become a wrapper with v1/v2 argument. Potentially we can even automatically detect the data version based on the time intervals that user requests, but this is a bit controversial, as the methodology behind v1 and v2 data generation is not the same and Nommon+MITMA do not recommend mixing those together and comparing absolute numbers of trips.
}

# match the metadata to type, zones, version and dates
if(ver == 1){
if (ver == 1) {
requested_files <- metadata[
grepl(glue::glue("v{ver}.*{type}.*{zones}"), metadata$local_path) &
metadata$data_ymd %in% dates_to_use,
metadata$data_ymd %in% dates_to_use,
]
} else if(ver == 2){
} else if (ver == 2) {
requested_files <- metadata[
grepl(glue::glue("v{ver}.*{zones}.*{type}"), metadata$local_path) &
metadata$data_ymd %in% dates_to_use,
metadata$data_ymd %in% dates_to_use,
]
}

files_to_download <- requested_files[!requested_files$downloaded, ]

# pre-generate target paths for the files to download
fs::dir_create(
unique(fs::path_dir(files_to_download$local_path)),
recurse = TRUE)
recurse = TRUE
)

# download the missing files
downloaded_files <- curl::multi_download(
Expand Down
Loading

0 comments on commit 16bcf7b

Please sign in to comment.