diff --git a/.Rbuildignore b/.Rbuildignore index d69ac4d..d7e1207 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -11,4 +11,4 @@ README.qmd ^LICENSE\.md$ ^.*\.Rproj$ ^\.Rproj\.user$ -private \ No newline at end of file +^private$ diff --git a/.gitignore b/.gitignore index 47a82d5..1fceb1f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,9 @@ +# Ignore all gz files *.gz + +# Exceptions for gz files in inst/extdata +!inst/extdata/*.gz + movilidad.duckdb .Rhistory zonificacion_distritos* diff --git a/DESCRIPTION b/DESCRIPTION index 289a7a2..142e58a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,6 +29,7 @@ Imports: lubridate, purrr, readr, + rlang (>= 1.1.0), sf, stringr, tibble, @@ -36,3 +37,6 @@ Imports: Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 +Suggests: + testthat (>= 3.0.0) +Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index 20f352d..f2616bc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,10 @@ # Generated by roxygen2: do not edit by hand +export(spod_available_data_v1) +export(spod_download_data) export(spod_get) +export(spod_get_latest_v1_file_list) export(spod_get_latest_v2_xml) export(spod_get_metadata) export(spod_get_zones) +export(spod_get_zones_v1) diff --git a/R/download_data.R b/R/download_data.R new file mode 100644 index 0000000..10101b3 --- /dev/null +++ b/R/download_data.R @@ -0,0 +1,106 @@ +#' Download the data files of specified type, zones, and dates +#' +#' This function downloads the data files of the specified type, zones, dates and data version. +#' @param type The type of data to download. Can be `"origin-destination"` (or just `"od"`), or `"trips_per_person"` (or just `"tpp"`) for v1 data. For v2 data `"overnight_stays"` (or just `"os"`) is also available. More data types to be supported in the future. See respective codebooks for more information. **ADD CODEBOOKS! to the package** +#' @param zones The zones for which to download the data. Can be `"districts"` (or `"dist"`, `"distr"`) or `"municipalities"` (or `"muni"`, `"municip"`) for v1 data. Additionally, these can be `"large_urban_areas"` (or `"lau"`) for v2 data. +#' @inheritParams spod_dates_argument_to_dates_seq +#' @param data_dir The directory where the data is stored. Defaults to the value returned by `spod_get_data_dir()` which returns the value of the environment variable `SPANISH_OD_DATA_DIR` or a temporary directory if the variable is not set. +#' @param quiet Logical. If `TRUE`, the function does not print messages to the console. Defaults to `FALSE`. +#' @param return_output Logical. If `TRUE`, the function returns a character vector of the paths to the downloaded files. If `FALSE`, the function returns `NULL`. +#' +#' @return A character vector of the paths to the downloaded files. Unless `return_output = FALSE`, in which case the function returns `NULL`. 
+#' +#' @export +#' @examples +#' \dontrun{ +#' # Download the origin-destination data on the district level for a date range in March 2020 +#' spod_download_data(type = "od", zones = "districts", +#' dates = c(start = "2020-03-20", end = "2020-03-24")) +#' +#' # Download the origin-destination data on the district level for select dates in 2020 and 2021 +#' spod_download_data(type = "od", zones = "dist", +#' dates = c("2020-03-20", "2020-03-24", "2021-03-20", "2021-03-24")) +#' +#' # Download the origin-destination data on the municipality level using a regex for a date range in March 2020 +#' # (the regex will capture the dates 2020-03-20 to 2020-03-24) +#' spod_download_data(type = "od", zones = "municip", +#' dates = "2020032[0-4]") +#' } +spod_download_data <- function( + type = c( + "od", "origin-destination", + "os", "overnight_stays", + "tpp", "trips_per_person"), + zones = c("districts", "dist", "distr", + "municipalities", "muni", "municip", + "lau", "large_urban_areas"), # implement "urban_areas" for v2 data + dates = NULL, + data_dir = spod_get_data_dir(), + quiet = FALSE, + return_output = TRUE +) { + # convert English zone names to Spanish words used in the default data paths + zones <- match.arg(zones) + zones <- spod_zone_names_en2es(zones) + + # this is where the date arguments are processed + # for all the wrapper functions that use the spod_download_data() function the dates are also processed here + dates_to_use <- spod_dates_argument_to_dates_seq(dates = dates) + + # check version + # replace this argument with automatic version detection based on the dates requested? + ver <- spod_infer_data_v_from_dates(dates_to_use) # this leads to a second call to an internal spod_get_valid_dates() which in turn causes a second call to spod_available_data_v1() or spod_get_metadata(). This results in reading the xml files with metadata for the second time. This is not optimal and should be fixed. + if (isFALSE(quiet)) message("Data version detected from dates: ", ver) + + # convert English data type names to Spanish words used in the default data paths + type <- match.arg(type) + type <- spod_match_data_type(type = type, ver = ver) + + + + # get the available data list while checking for files already cached on disk + if( ver == 1) { + metadata <- spod_available_data_v1(data_dir = data_dir, + check_local_files = TRUE) + } else if (ver == 2) { + metadata <- spod_get_metadata(data_dir = data_dir) + # replace with spod_available_data_v2() when available, spod_get_metadata can become a wrapper with v1/v2 argument. Potentially we can even automatically detect the data version based on the time intervals that the user requests, but this is a bit controversial, as the methodology behind v1 and v2 data generation is not the same and Nommon+MITMA do not recommend mixing those together and comparing absolute numbers of trips. 
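+ # Illustrative note on the matching step below (values resolved earlier in this function): `metadata` lists every remote file with its target_url, local_path, data_ymd and downloaded status. + # For v1 origin-destination district data, for example, `type` is "maestra1" and `zones` is "distritos", so glue::glue("v{ver}.*{type}.*{zones}") builds the regex "v1.*maestra1.*distritos", + # which is matched against `metadata$local_path` under the "raw_data_cache/v1/" cache subfolder.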
+ } + + # match the metadata to type, zones, version and dates + if(ver == 1){ + requested_files <- metadata[ + grepl(glue::glue("v{ver}.*{type}.*{zones}"), metadata$local_path) & + metadata$data_ymd %in% dates_to_use, + ] + } else if(ver == 2){ + requested_files <- metadata[ + grepl(glue::glue("v{ver}.*{zones}.*{type}"), metadata$local_path) & + metadata$data_ymd %in% dates_to_use, + ] + } + + files_to_download <- requested_files[!requested_files$downloaded, ] + + # pre-generate target paths for the files to download + fs::dir_create( + unique(fs::path_dir(files_to_download$local_path)), + recurse = TRUE) + + # download the missing files + downloaded_files <- curl::multi_download( + urls = files_to_download$target_url, + destfiles = files_to_download$local_path, + progress = TRUE, + resume = TRUE + ) + + # set download status for downloaded files as TRUE in requested_files + requested_files$downloaded[requested_files$local_path %in% downloaded_files$destfile] <- TRUE + + message("Retrieved data for requested dates: ", paste(dates_to_use, collapse = ", ")) # this may output too many dates, shoudl be fixed when we create a flexible date argument processing function. Keeping for now. + + if (return_output) { + return(requested_files$local_path) + } +} diff --git a/R/folders.R b/R/folders.R new file mode 100644 index 0000000..795a68e --- /dev/null +++ b/R/folders.R @@ -0,0 +1,9 @@ +# change subfolder name for raw data cache here to apply globally +spod_subfolder_raw_data_cache <- function(ver = 1) { + rlang:::check_number_whole(ver) + if (!ver %in% c(1, 2)) { + stop("Invalid version number. Must be 1 or 2.") + } + base_subdir_name <- "raw_data_cache" + return(paste0(base_subdir_name, "/v", ver, "/")) +} diff --git a/R/get.R b/R/get.R index ad112b3..2bfe88f 100644 --- a/R/get.R +++ b/R/get.R @@ -2,7 +2,6 @@ #' #' @param data_dir The directory where the data is stored. Defaults to the value returned by `spod_get_data_dir()`. #' @param xml_url The URL of the XML file to download. Defaults to "https://movilidad-opendata.mitma.es/RSS.xml". -#' @param current_timestamp The current timestamp to keep track of the version of the remote file list. Defaults to the current date. #' #' @return The path to the downloaded XML file. #' @export @@ -12,13 +11,14 @@ #' } spod_get_latest_v2_xml = function( data_dir = spod_get_data_dir(), - xml_url = "https://movilidad-opendata.mitma.es/RSS.xml", - current_timestamp = format(Sys.time(), format = "%Y-%m-%d", usetz = FALSE, tz = "UTC")) { + xml_url = "https://movilidad-opendata.mitma.es/RSS.xml" +) { if (!fs::dir_exists(data_dir)) { fs::dir_create(data_dir) } - current_filename = glue::glue("{data_dir}/data_links_{current_timestamp}.xml") + current_timestamp = format(Sys.time(), format = "%Y-%m-%d", usetz = FALSE, tz = "UTC") + current_filename = glue::glue("{data_dir}/data_links_v2_{current_timestamp}.xml") message("Saving the file to: ", current_filename) xml_requested = curl::curl_download(url = xml_url, destfile = current_filename, quiet = FALSE) @@ -30,6 +30,7 @@ spod_get_latest_v2_xml = function( #' This function retrieves the data dictionary for the specified data directory. #' #' @param data_dir The directory where the data is stored. Defaults to the value returned by `spod_get_data_dir()`. +#' @param quiet Whether to suppress messages. Defaults to `FALSE`. #' @return The data dictionary. 
#' @export #' @examples @@ -39,14 +40,14 @@ spod_get_latest_v2_xml = function( #' names(metadata) #' head(metadata) #' } -spod_get_metadata = function(data_dir = spod_get_data_dir()) { - xml_files_list = fs::dir_ls(data_dir, type = "file", regexp = "data_links_") |> sort() +spod_get_metadata = function(data_dir = spod_get_data_dir(), quiet = FALSE) { + xml_files_list = fs::dir_ls(data_dir, type = "file", regexp = "data_links_v2") |> sort() latest_data_links_xml_path = utils::tail(xml_files_list, 1) if (length(latest_data_links_xml_path) == 0) { - message("Getting latest data links xml") + if(isFALSE(quiet)) message("Getting latest data links xml") latest_data_links_xml_path = spod_get_latest_v2_xml(data_dir = data_dir) } else { - message("Using existing data links xml: ", latest_data_links_xml_path) + if(isFALSE(quiet)) message("Using existing data links xml: ", latest_data_links_xml_path) } x_xml = xml2::read_xml(latest_data_links_xml_path) @@ -74,12 +75,24 @@ spod_get_metadata = function(data_dir = spod_get_data_dir()) { return(download_dt) } -spod_get_data_dir = function() { +#' Get the data directory +#' +#' This function retrieves the data directory from the environment variable SPANISH_OD_DATA_DIR. +#' If the environment variable is not set, it returns the temporary directory. +#' +#' @return The data directory. +#' @keywords internal +spod_get_data_dir = function(quiet = FALSE) { data_dir_env = Sys.getenv("SPANISH_OD_DATA_DIR") - if (data_dir_env == "") { - data_dir_env = tempdir() + if( data_dir_env == "" ) { + if (isFALSE(quiet)) warning("Warning: SPANISH_OD_DATA_DIR is not set. Using the temporary directory, which is not recommended, as the data will be deleted when the session ends.\n\n To set the data directory, use `Sys.setenv(SPANISH_OD_DATA_DIR = '/path/to/data')` or set SPANISH_OD_DATA_DIR permanently in the environment by editing the `.Renviron` file locally for current project with `usethis::edit_r_environ('project')` or `file.edit('.Renviron')` or globally for all projects with `usethis::edit_r_environ('user')` or `file.edit('~/.Renviron')`.") + data_dir_env = tempdir() # if not set, use the temp directory + } + # check if dir exists and create it if it doesn't + if (!fs::dir_exists(data_dir_env)) { + fs::dir_create(data_dir_env) } - return(data_dir_env) + return(fs::path_real(data_dir_env)) } #' Retrieves the zones data @@ -104,7 +117,7 @@ spod_get_zones = function( metadata_distritos = metadata[sel_distritos, ] dir_name = dirname(metadata_distritos$local_path[1]) if (!fs::dir_exists(dir_name)) { - fs::dir_create(dir_name) + fs::dir_create(dir_name, recurse = TRUE) } for (i in 1:nrow(metadata_distritos)) { if (!fs::file_exists(metadata_distritos$local_path[i])) { diff --git a/R/get_v1_data.R b/R/get_v1_data.R new file mode 100644 index 0000000..1da532c --- /dev/null +++ b/R/get_v1_data.R @@ -0,0 +1,243 @@ +#' Get latest file list from the XML for MITMA open mobiltiy data v1 (2020-2021) +#' +#' @param data_dir The directory where the data is stored. Defaults to the value returned by `spod_get_data_dir()`. +#' @param xml_url The URL of the XML file to download. Defaults to "https://opendata-movilidad.mitma.es/RSS.xml". +#' +#' @return The path to the downloaded XML file. 
+#' @export +#' @examples +#' if (FALSE) { +#' spod_get_latest_v1_file_list() +#' } +spod_get_latest_v1_file_list <- function( + data_dir = spod_get_data_dir(), + xml_url = "https://opendata-movilidad.mitma.es/RSS.xml") { + if (!fs::dir_exists(data_dir)) { + fs::dir_create(data_dir) + } + + current_timestamp <- format(Sys.time(), format = "%Y-%m-%d", usetz = FALSE, tz = "UTC") + current_filename <- glue::glue("{data_dir}/data_links_v1_{current_timestamp}.xml") + + message("Saving the file to: ", current_filename) + xml_requested <- curl::curl_download( + url = xml_url, + destfile = current_filename, + quiet = FALSE + ) + return(current_filename) +} + +#' Get the available v1 data list +#' +#' This function provides a table of the available data list of MITMA v1 (2020-2021), both remote and local. +#' +#' @param data_dir The directory where the data is stored. Defaults to the value returned by `spod_get_data_dir()`. +#' @param check_local_files Whether to check if the local files exist. Defaults to `FALSE`. +#' @param quiet Whether to suppress messages. Defaults to `FALSE`. +#' @return A tibble with links, release dates of files in the data, dates of data coverage, local paths to files, and the download status. +#' \describe{ +#' \item{target_url}{\code{character}. The URL link to the data file.} +#' \item{pub_ts}{\code{POSIXct}. The timestamp of when the file was published.} +#' \item{file_extension}{\code{character}. The file extension of the data file (e.g., 'tar', 'gz').} +#' \item{data_ym}{\code{Date}. The year and month of the data coverage, if available.} +#' \item{data_ymd}{\code{Date}. The specific date of the data coverage, if available.} +#' \item{local_path}{\code{character}. The local file path where the data is stored.} +#' \item{downloaded}{\code{logical}. Indicator of whether the data file has been downloaded locally.} +#' } +#' @export +#' @examples +#' # Get the available v1 data list for the default data directory +#' if (FALSE) { +#' metadata <- spod_available_data_v1() +#' names(metadata) +#' head(metadata) +#' } +spod_available_data_v1 <- function(data_dir = spod_get_data_dir(), + # check_local_files (below) is FALSE by default to avoid excessive filesystem access, perhaps should be TRUE. Download functions use it to load the xml file, but we probably do not want the script to check all local cache directories every time we run a get data function. Perhaps it is better to offload this check to a separate function and have a csv file or some other way to keep track of the files that were downloaded and cached. An output of curl::multi_download() could be used for this purpose. 
+ check_local_files = FALSE, + quiet = FALSE +) { + xml_files_list <- fs::dir_ls(data_dir, type = "file", regexp = "data_links_v1") |> sort() + if(length(xml_files_list) == 0) { + if(isFALSE(quiet)) message("No data links xml files found, getting latest data links xml") + latest_data_links_xml_path <- spod_get_latest_v1_file_list(data_dir = data_dir) + } else { + latest_data_links_xml_path <- utils::tail(xml_files_list, 1) + } + + # Check if the XML file is 1 day old or older from its name + file_date <- stringr::str_extract(latest_data_links_xml_path, "[0-9]{4}-[0-9]{2}-[0-9]{2}") + + if (file_date < format(Sys.Date(), format = "%Y-%m-%d")) { + if(isFALSE(quiet)) message("File list xml is 1 day old or older, getting latest data links xml") + latest_data_links_xml_path <- spod_get_latest_v1_file_list(data_dir = data_dir) + } else { + if(isFALSE(quiet)) message("Using existing data links xml: ", latest_data_links_xml_path) + } + + if (length(latest_data_links_xml_path) == 0) { + if(isFALSE(quiet)) message("Getting latest data links xml") + latest_data_links_xml_path <- spod_get_latest_v1_file_list(data_dir = data_dir) + } + + x_xml <- xml2::read_xml(latest_data_links_xml_path) + + files_table <- tibble::tibble( + target_url = xml2::xml_find_all(x = x_xml, xpath = "//link") |> xml2::xml_text(), + pub_date = xml2::xml_find_all(x = x_xml, xpath = "//pubDate") |> xml2::xml_text() + ) + + files_table$pub_ts <- lubridate::dmy_hms(files_table$pub_date) + files_table$file_extension <- tools::file_ext(files_table$target_url) + files_table <- files_table[files_table$file_extension != "", ] + files_table$pub_date <- NULL + + files_table$data_ym <- lubridate::ym(stringr::str_extract(files_table$target_url, "[0-9]{4}-[0-9]{2}")) + files_table$data_ymd <- lubridate::ymd(stringr::str_extract(files_table$target_url, "[0-9]{8}")) + # order by pub_ts + files_table <- files_table[order(files_table$pub_ts, decreasing = TRUE), ] + files_table$local_path <- file.path( + data_dir, + stringr::str_replace(files_table$target_url, ".*mitma.es/", spod_subfolder_raw_data_cache(ver = 1)) + ) + + files_table$local_path <- stringr::str_replace_all(files_table$local_path, "\\/\\/\\/|\\/\\/", "/") + + # change path for daily data files to be in hive-style format + files_table$local_path <- gsub("([0-9]{4})-([0-9]{2})\\/[0-9]{6}([0-9]{2})_", "year=\\1\\/month=\\2\\/day=\\3\\/", files_table$local_path) + + # fix paths for files that are in '0000-referencia' folder + files_table$local_path <- gsub("0000-referencia\\/([0-9]{4})([0-9]{2})([0-9]{2})_", "year=\\1\\/month=\\2\\/day=\\3\\/", files_table$local_path) + + # replace 2 digit month with 1 digit month + files_table$local_path <- gsub("month=0([1-9])", "month=\\1", files_table$local_path) + + # replace 2 digit day with 1 digit day + files_table$local_path <- gsub("day=0([1-9])", "day=\\1", files_table$local_path) + + # now check if any of local files exist + files_table$downloaded <- fs::file_exists(files_table$local_path) + + return(files_table) +} + +#' Retrieves the zones for v1 data +#' +#' This function retrieves the zones data from the specified data directory. +#' It can retrieve either "distritos" or "municipios" zones data. +#' +#' @param data_dir The directory where the data is stored. +#' @param zones The zones for which to download the data. Can be `"districts"` (or `"dist"`, `"distr"`) or `"municipalities"` (or `"muni"`, `"municip"`). +#' @return A spatial object containing the zones data. 
+#' @export +#' @examples +#' if (FALSE) { +#' zones <- spod_get_zones() +#' } +spod_get_zones_v1 <- function( + zones = c("districts", "dist", "distr", + "municipalities", "muni", "municip"), + data_dir = spod_get_data_dir() +) { + zones <- match.arg(zones) + zones <- spod_zone_names_en2es(zones) + + # check if shp files are already extracted + expected_gpkg_path <- fs::path(data_dir, glue::glue("clean_data/v1//zones/{zones}_mitma.gpkg")) + if (fs::file_exists(expected_gpkg_path)) { + message("Loading .gpkg file that already exists in data dir: ", expected_gpkg_path) + return(sf::read_sf(expected_gpkg_path)) + } + + # if data is not available, download, extract, clean and save it to gpkg + + metadata <- spod_available_data_v1(data_dir, check_local_files = FALSE) + regex <- glue::glue("zonificacion_{zones}\\.") + sel_zones <- stringr::str_detect(metadata$target_url, regex) + metadata_zones <- metadata[sel_zones, ] + dir_name <- fs::path_dir(metadata_zones$local_path[1]) + if (!fs::dir_exists(dir_name)) { + fs::dir_create(dir_name, recurse = TRUE) + } + + if (!fs::file_exists(metadata_zones$local_path)) { + message("Downloading the file to: ", metadata_zones$local_path) + downloaded_file <- curl::curl_download(metadata_zones$target_url, destfile = metadata_zones$local_path, mode = "wb", quiet = FALSE) + } else { + message("File already exists: ", metadata_zones$local_path) + downloaded_file <- metadata_zones$local_path + } + + message("Unzipping the file: ", downloaded_file) + utils::unzip(downloaded_file, + exdir = fs::path_dir(downloaded_file) + ) + + # remove artifacts (remove __MACOSX if exists) + junk_path <- paste0(fs::path_dir(downloaded_file), "/__MACOSX") + if (fs::dir_exists(junk_path)) fs::dir_delete(junk_path) + + zones_path <- fs::dir_ls(data_dir, glob = glue::glue("**{zones}/*.shp"), recurse = TRUE) + + zones <- spod_clean_zones_v1(zones_path) + fs::dir_create(fs::path_dir(expected_gpkg_path), recurse = TRUE) + sf::st_write(zones, expected_gpkg_path, delete_dsn = TRUE, delete_layer = TRUE) + + return(zones) +} + +#' Fixes common issues in the zones data and cleans up variable names +#' +#' This function fixes any invalid geometries in the zones data and renames the "ID" column to "id". +#' +#' @param zones_path The path to the zones spatial data file. +#' @return A spatial object of class `sf`. +#' @keywords internal +#' +spod_clean_zones_v1 <- function(zones_path) { + suppressWarnings({ + zones <- sf::read_sf(zones_path) + }) + invalid_geometries <- !sf::st_is_valid(zones) + if (sum(invalid_geometries) > 0) { + fixed_zones <- sf::st_make_valid(zones[invalid_geometries, ]) + zones <- rbind(zones[!invalid_geometries, ], fixed_zones) + } + names(zones)[names(zones) == "ID"] <- "id" + return(zones) +} + + +#' Retrieve the origin-destination v1 data (2020-2021) +#' +#' This function retrieves the v1 (2020-2021) origin-destination data from the specified data directory. +#' @param read_fun The function to read the data. Defaults to `duckdb::tbl_file`. +#' @inheritParams spod_download_data +#' @return A tibble with the origin-destination data. 
+spod_get_od <- function( + zones = c("districts", "dist", "distr", + "municipalities", "muni", "municip"), # add "urban_areas" for v2 data + dates = NULL, + data_dir = spod_get_data_dir(), + quiet = FALSE, + read_fun = duckdb::tbl_file ) { + # Processing of the date arguments is performed in `spod_download_data()` + + zones <- match.arg(zones) + + # use the spod_download_data() function to download any missing data + downloaded_files <- spod_download_data( + type = "od", + zones = zones, + dates = dates, + data_dir = data_dir + ) + + + # read data from cached files + + + +} diff --git a/R/internal_utils.R b/R/internal_utils.R new file mode 100644 index 0000000..7eca22d --- /dev/null +++ b/R/internal_utils.R @@ -0,0 +1,228 @@ +#' Convert multiple formats of date arguments to a sequence of dates +#' +#' This function processes the date arguments provided to various functions in the package. It can handle single dates and arbitrary sequences (vectors) of dates in ISO (YYYY-MM-DD) and YYYYMMDD format. It can also handle date ranges in the format 'YYYY-MM-DD_YYYY-MM-DD' (or 'YYYYMMDD_YYYYMMDD'), date ranges in a named vector, and regular expressions to match dates in the format `YYYYMMDD`. +#' +#' @param dates A `character` or `Date` vector of dates to process. Kindly keep in mind that v1 and v2 data follow different data collection methodologies and may not be directly comparable. Therefore, do not try to request data from both versions for the same date range. If you need to compare data from both versions, please refer to the respective codebooks and methodology documents. The v1 data covers the period from 2020-02-14 to 2021-05-09, and the v2 data covers the period from 2022-01-01 to the present until further notice. The true dates range is checked against the available data for each version on every function run. +#' +#' The possible values can be any of the following: +#' +#' * A single date in ISO (YYYY-MM-DD) or YYYYMMDD format. `character` or `Date` object. +#' +#' * A vector of dates in ISO (YYYY-MM-DD) or YYYYMMDD format. `character` or `Date` object. Can be any non-consecutive sequence of dates. +#' +#' * A date range +#' +#' * either a `character` or `Date` object of length 2 with clearly named elements `start` and `end` in ISO (YYYY-MM-DD) or YYYYMMDD format. E.g. `c(start = "2020-02-15", end = "2020-02-17")`; +#' +#' * or a `character` object of the form `YYYY-MM-DD_YYYY-MM-DD` or `YYYYMMDD_YYYYMMDD`. For example, `2020-02-15_2020-02-17` or `20200215_20200217`. +#' +#' * A regular expression to match dates in the format `YYYYMMDD`. `character` object. For example, `^202002` will match all dates in February 2020. +#' +#' +#' @return A vector of dates as `Date` objects. +#' @keywords internal +spod_dates_argument_to_dates_seq <- function(dates) { + if (is.null(dates) || (!is.character(dates) && !inherits(dates, "Date"))) { + stop("Invalid date input format. 
Please provide a character vector or Date object.") + } + + range_regex <- "^\\d{4}(-\\d{2}){2}_\\d{4}(-\\d{2}){2}$|^\\d{8}_\\d{8}$" + single_date_regex <- "^(\\d{4}-\\d{2}-\\d{2}|\\d{8})$" + # If dates is a vector of length one + # Check if is single date, date range, or regex pattern + if (length(dates) == 1){ + + # Check if date range + # match both YYYY-MM-DD_YYYY-MM-DD and YYYYMMDD_YYYYMMDD + if (grepl(range_regex, dates)){ + date_parts <- strsplit(dates, "_")[[1]] + date_parts <- lubridate::ymd(date_parts) + dates <- seq.Date(date_parts[1], date_parts[2], by = "day") + + # if dates does not match the date range pattern + # check if it is just a single day in YYYY-MM-DD or YYYYMMDD format + } else if(grepl(single_date_regex, dates)) { + dates <- lubridate::ymd(dates) + + # assume it is a regex pattern + } else { + dates <- spod_expand_dates_from_regex(dates) + # since spod_expand_dates_from_regex already uses the metadata to generate valid dates we can skip any checks that are required for other date formats and only check for datte overlap + if( isFALSE(spod_is_data_version_overlaps(dates)) ){ + return(dates) + } + } + + # If dates if a vector of multiple values + } else if (length(dates) > 1){ + + # Check if it is of length 2, then it may be a date range + if (length(dates) == 2 & !is.null(names(dates))) { + # if the vector is named with 'start' and 'end', we can assume it is a date range + if(all(names(dates) %in% c("start", "end"))){ + date_parts <- lubridate::ymd(dates) + dates <- seq.Date(date_parts[1], date_parts[2], by = "day") + } + } else { + # this is apparantly a sequence of dates + dates <- lubridate::ymd(dates) + } + } + + # now that we have a clean sequence of dates, we can check for overlaps between data versions + if (isFALSE(spod_is_data_version_overlaps(dates)) & + spod_infer_data_v_from_dates(dates) %in% c(1, 2) + ) { + return(dates) + } +} + + + +#' Check if specified dates span both data versions +#' +#' This function checks if the specified dates or date ranges span both v1 and v2 data versions. +#' +#' @param dates A Dates vector of dates to check. +#' @return TRUE if the dates span both data versions, FALSE otherwise. +#' @keywords internal +spod_is_data_version_overlaps <- function(dates){ + + all_dates_v1 <- spod_get_valid_dates(ver = 1) + all_dates_v2 <- spod_get_valid_dates(ver = 2) + + if (any(dates %in% all_dates_v1) && any(dates %in% all_dates_v2)) { + stop(paste0("Dates found in both v1 and v2 data. The v1 and v2 data sets may not be comparable. Please see the respective codebooks and methodology documents.\nThe valid dates range for v1 is: ", paste0(min(all_dates_v1), " to ", max(all_dates_v1)), " and for v2 is: ", paste0(min(all_dates_v2), " to ", max(all_dates_v2)))) + } + return(FALSE) +} + +spod_infer_data_v_from_dates <- function(dates) { + # in case of overlap + # will throw an error from the spod_is_data_version_overlaps + if (spod_is_data_version_overlaps(dates)) { + invisible(return(NULL)) + } + + # of no overlap, compare with date ranges + v1_dates <- spod_get_valid_dates(ver = 1) + v2_dates <- spod_get_valid_dates(ver = 2) + + if (all(dates %in% v1_dates)) { + return(1) + } else if (all(dates %in% v2_dates)) { + return(2) + } else { + # if some dates did not match stop with a message showing which dates are missing + missing_dates <- dates[!dates %in% c(v1_dates, v2_dates)] + stop(paste0("Some dates do not match the available data. 
The valid dates range for v1 is: ", paste0(min(v1_dates), " to ", max(v1_dates)), " and for v2 is: ", paste0(min(v2_dates), " to ", max(v2_dates), ".\nMissing dates: ", paste0(missing_dates, collapse = ", ")))) + } +} + +#' Function to expand dates from a regex +#' +#' This function generates a sequence of dates from a regular expression pattern. +#' based on the provided regular expression. +#' +#' @param date_regex A regular expression to match dates in the format yyyymmdd. +#' @return A character vector of dates matching the regex. +#' @keywords internal +spod_expand_dates_from_regex <- function(date_regex) { + + all_dates_v1 <- spod_get_valid_dates(ver = 1) + all_dates_v2 <- spod_get_valid_dates(ver = 2) + + # Filter dates matching the regex for both versions + matching_dates_v1 <- all_dates_v1[grepl(date_regex, format(all_dates_v1, "%Y%m%d"))] + matching_dates_v2 <- all_dates_v2[grepl(date_regex, format(all_dates_v2, "%Y%m%d"))] + + # if both vectors are empty, throw an error + if (length(matching_dates_v1) == 0 && length(matching_dates_v2) == 0) { + stop(paste0("No matching dates found in the available data.", + "The valid dates range for v1 is: ", paste0(min(all_dates_v1), " to ", max(all_dates_v1)), " and for v2 is: ", paste0(min(all_dates_v2), " to ", max(all_dates_v2)))) + } + # If checks above have passed, we can combine the matching dates as only one contains dates and the other is empty + matching_dates <- sort(c(matching_dates_v1, matching_dates_v2)) + + return(matching_dates) +} + + +spod_get_valid_dates <- function(ver = 1) { + rlang:::check_number_whole(ver) + if (!ver %in% c(1, 2)) { + stop("Invalid version number. Must be 1 or 2.") + } + + + if(ver == 1) { + # available_data <- spod_available_data_v1(check_local_files = FALSE, quiet = TRUE) + # all_dates <- unique(available_data[grepl("maestra1.*diarios", available_data$target_url),]$data_ymd, na.rm = TRUE) + # perahps it is worth hardcoding at lest the v1 data range as it is unlikely to change at this point + all_dates <- seq.Date(from = as.Date("2020-02-14"), to = as.Date("2021-05-09"), by = "day") + } else if (ver == 2) { + available_data <- spod_get_metadata(quiet = TRUE) # replace with spod_available_data_v2() when available + all_dates <- unique(available_data[grepl("viajes.*diarios", available_data$target_url),]$data_ymd, na.rm = TRUE) + } + + return(all_dates) +} +# currently checks for date range for od data only. not all datasets may be available for all dates, so this function may need to be updated to check for the availability of the specific for the requested dates. spod_match_data_type() helper in the same file may be useful here. + + +# replace with spod_available_data_v2() when available, spod_get_metadata can become a wrapper with v1/v2 argument. Potentially we can even automaticaly detect the data version based on the time intervals that user requests, but this is a bit controversial, as the methodology behind v1 and v2 data generation is not the same and Nommon+MITMA do not recommend mixing those together and comparing absoloute numbers of trips. + + +spod_zone_names_en2es <- function( + zones = c("districts", "dist", "distr", + "municipalities", "muni", "municip") +) { + zones <- tolower(zones) + zones <- match.arg(zones) + if(zones %in% c("districts", "dist", "distr")) { + return("distritos") + } else if(zones %in% c("municipalities", "muni", "municip")) { + return("municipios") + } +} + +#' Match data types to folders +#' @param type The type of data to match. 
Can be "od", "origin-destination", "os", "overnight_stays", or "tpp", "trips_per_person". +#' @param ver The version of the data to use. Defaults to 1. Can be 1 or 2. +#' @keywords internal +spod_match_data_type <- function( + type = c( + "od", "origin-destination", + "os", "overnight_stays", + "tpp", "trips_per_person"), + ver = c(1, 2) +){ + rlang:::check_number_whole(ver) + if (!ver %in% c(1, 2)) { + stop("Invalid version number. Must be 1 or 2.") + } + + type <- tolower(type) + type <- match.arg(type) + + if(ver == 1) { + if (type %in% c("od", "origin-destination")) { + return("maestra1") + } else if(type %in% c("tpp", "trips_per_person")) { + return("maestra2") + } + } + + if(ver == 2) { + if (type %in% c("od", "origin-destination")) { + return("viajes") + } else if(type %in% c("os", "overnight_stays")) { + return("pernoctaciones") + } else if(type %in% c("tpp", "trips_per_person")) { + return("personas") + } + } + + # need to add a warning here that the type is not recognized + return(NULL) +} diff --git a/inst/extdata/data_links_v1_2024-08-07.xml.gz b/inst/extdata/data_links_v1_2024-08-07.xml.gz new file mode 100644 index 0000000..1fd4912 Binary files /dev/null and b/inst/extdata/data_links_v1_2024-08-07.xml.gz differ diff --git a/inst/extdata/data_links_v2_2024-08-07.xml.gz b/inst/extdata/data_links_v2_2024-08-07.xml.gz new file mode 100644 index 0000000..fe90e84 Binary files /dev/null and b/inst/extdata/data_links_v2_2024-08-07.xml.gz differ diff --git a/man/spod_available_data_v1.Rd b/man/spod_available_data_v1.Rd new file mode 100644 index 0000000..74c67b9 --- /dev/null +++ b/man/spod_available_data_v1.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_v1_data.R +\name{spod_available_data_v1} +\alias{spod_available_data_v1} +\title{Get the available v1 data list} +\usage{ +spod_available_data_v1( + data_dir = spod_get_data_dir(), + check_local_files = FALSE, + quiet = FALSE +) +} +\arguments{ +\item{data_dir}{The directory where the data is stored. Defaults to the value returned by \code{spod_get_data_dir()}.} + +\item{check_local_files}{Whether to check if the local files exist. Defaults to \code{FALSE}.} + +\item{quiet}{Whether to suppress messages. Defaults to \code{FALSE}.} +} +\value{ +A tibble with links, release dates of files in the data, dates of data coverage, local paths to files, and the download status. +\describe{ +\item{target_url}{\code{character}. The URL link to the data file.} +\item{pub_ts}{\code{POSIXct}. The timestamp of when the file was published.} +\item{file_extension}{\code{character}. The file extension of the data file (e.g., 'tar', 'gz').} +\item{data_ym}{\code{Date}. The year and month of the data coverage, if available.} +\item{data_ymd}{\code{Date}. The specific date of the data coverage, if available.} +\item{local_path}{\code{character}. The local file path where the data is stored.} +\item{downloaded}{\code{logical}. Indicator of whether the data file has been downloaded locally.} +} +} +\description{ +This function provides a table of the available data list of MITMA v1 (2020-2021), both remote and local. 
+} +\examples{ +# Get the available v1 data list for the default data directory +if (FALSE) { + metadata <- spod_available_data_v1() + names(metadata) + head(metadata) +} +} diff --git a/man/spod_clean_zones_v1.Rd b/man/spod_clean_zones_v1.Rd new file mode 100644 index 0000000..81fbadc --- /dev/null +++ b/man/spod_clean_zones_v1.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_v1_data.R +\name{spod_clean_zones_v1} +\alias{spod_clean_zones_v1} +\title{Fixes common issues in the zones data and cleans up variable names} +\usage{ +spod_clean_zones_v1(zones_path) +} +\arguments{ +\item{zones_path}{The path to the zones spatial data file.} +} +\value{ +A spatial object of class \code{sf}. +} +\description{ +This function fixes any invalid geometries in the zones data and renames the "ID" column to "id". +} +\keyword{internal} diff --git a/man/spod_dates_argument_to_dates_seq.Rd b/man/spod_dates_argument_to_dates_seq.Rd new file mode 100644 index 0000000..d9f5b21 --- /dev/null +++ b/man/spod_dates_argument_to_dates_seq.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/internal_utils.R +\name{spod_dates_argument_to_dates_seq} +\alias{spod_dates_argument_to_dates_seq} +\title{Convert multiple formates of date arguments to a sequence of dates} +\usage{ +spod_dates_argument_to_dates_seq(dates) +} +\arguments{ +\item{dates}{A \code{character} or \code{Date} vector of dates to process. Kindly keep in mind that v1 and v2 data follow different data collection methodologies and may not be directly comparable. Therefore, do not try to request data from both versions for the same date range. If you need to compare data from both versions, please refer to the respective codebooks and methodology documents. The v1 data covers the period from 2020-02-14 to 2021-05-09, and the v2 data covers the period from 2022-01-01 to the present until further notice. The true dates range is checked against the available data for each version on every function run. + +The possible values can be any of the following: +\itemize{ +\item A single date in ISO (YYYY-MM-DD) or YYYYMMDD format. \code{character} or \code{Date} object. +\item A vector of dates in ISO (YYYY-MM-DD) or YYYYMMDD format. \code{character} or \code{Date} object. Can be any non-consecutive sequence of dates. +\item A date range +\item eigher a \code{character} or \code{Date} object of length 2 with clearly named elements \code{start} and \code{end} in ISO (YYYY-MM-DD) or YYYYMMDD format. E.g. \code{c(start = "2020-02-15", end = "2020-02-17")}; +\item or a \code{character} object of the form \code{YYYY-MM-DD_YYYY-MM-DD} or \code{YYYYMMDD_YYYYMMDD}. For example, \verb{2020-02-15_2020-02-17} or \verb{20200215_20200217}. +\item A regular expression to match dates in the format \code{YYYYMMDD}. \code{character} object. For example, \verb{^202002} will match all dates in February 2020. +}} +} +\value{ +A character vector of dates in ISO format (YYYY-MM-DD). +} +\description{ +This function processes the date arguments provided to various functions in the package. It can handle single dates and arbitratry sequences (vectors) of dates in ISO (YYYY-MM-DD) and YYYYMMDD format. It can also handle date ranges in the format 'YYYY-MM-DD_YYYY-MM-DD' (or 'YYYYMMDD_YYYYMMDD'), date ranges in named vec and regular expressions to match dates in the format \code{YYYYMMDD}. 
+} +\keyword{internal} diff --git a/man/spod_download_data.Rd b/man/spod_download_data.Rd new file mode 100644 index 0000000..3639bc5 --- /dev/null +++ b/man/spod_download_data.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/download_data.R +\name{spod_download_data} +\alias{spod_download_data} +\title{Download the data files of specified type, zones, and dates} +\usage{ +spod_download_data( + type = c("od", "origin-destination", "os", "overnight_stays", "tpp", + "trips_per_person"), + zones = c("districts", "dist", "distr", "municipalities", "muni", "municip", "lau", + "large_urban_areas"), + dates = NULL, + data_dir = spod_get_data_dir(), + quiet = FALSE, + return_output = TRUE +) +} +\arguments{ +\item{type}{The type of data to download. Can be \code{"origin-destination"} (or ust \code{"od"}), or \code{"trips_per_person"} (or just \code{"tpp"}) for v1 data. For v2 data \code{"overnight_stays"} (or just \code{"os"}) is also available. More data types to be supported in the future. See respective codebooks for more information. \strong{ADD CODEBOOKS! to the package}} + +\item{zones}{The zones for which to download the data. Can be \code{"districts"} (or \code{"dist"}, \code{"distr"}) or \code{"municipalities"} (or \code{"muni"}, \code{"municip"}) for v1 data. Additionaly, these can be \code{"large_urban_areas"} (or \code{"lau"}) for v2 data.} + +\item{dates}{A \code{character} or \code{Date} vector of dates to process. Kindly keep in mind that v1 and v2 data follow different data collection methodologies and may not be directly comparable. Therefore, do not try to request data from both versions for the same date range. If you need to compare data from both versions, please refer to the respective codebooks and methodology documents. The v1 data covers the period from 2020-02-14 to 2021-05-09, and the v2 data covers the period from 2022-01-01 to the present until further notice. The true dates range is checked against the available data for each version on every function run. + +The possible values can be any of the following: +\itemize{ +\item A single date in ISO (YYYY-MM-DD) or YYYYMMDD format. \code{character} or \code{Date} object. +\item A vector of dates in ISO (YYYY-MM-DD) or YYYYMMDD format. \code{character} or \code{Date} object. Can be any non-consecutive sequence of dates. +\item A date range +\item eigher a \code{character} or \code{Date} object of length 2 with clearly named elements \code{start} and \code{end} in ISO (YYYY-MM-DD) or YYYYMMDD format. E.g. \code{c(start = "2020-02-15", end = "2020-02-17")}; +\item or a \code{character} object of the form \code{YYYY-MM-DD_YYYY-MM-DD} or \code{YYYYMMDD_YYYYMMDD}. For example, \verb{2020-02-15_2020-02-17} or \verb{20200215_20200217}. +\item A regular expression to match dates in the format \code{YYYYMMDD}. \code{character} object. For example, \verb{^202002} will match all dates in February 2020. +}} + +\item{data_dir}{The directory where the data is stored. Defaults to the value returned by \code{spod_get_data_dir()} which returns the value of the environment variable \code{SPANISH_OD_DATA_DIR} or a temporary directory if the variable is not set.} + +\item{quiet}{Logical. If \code{TRUE}, the function does not print messages to the console. Defaults to \code{FALSE}.} + +\item{return_output}{Logical. If \code{TRUE}, the function returns a character vector of the paths to the downloaded files. 
If \code{FALSE}, the function returns \code{NULL}.} +} +\value{ +A character vector of the paths to the downloaded files. Unless \code{return_output = FALSE}, in which case the function returns \code{NULL}. +} +\description{ +This function downloads the data files of the specified type, zones, dates and data version. +} +\examples{ +\dontrun{ +# Download the origin-destination on district level for the a date range in March 2020 +spod_download_data(type = "od", zones = "districts", + date_range = c("2020-03-20", "2020-03-24")) + +# Download the origin-destination on district level for select dates in 2020 and 2021 +spod_download_data(type = "od", zones = "dist", + dates_list = c("2020-03-20", "2020-03-24", "2021-03-20", "2021-03-24")) + +# Download the origin-destination on municipality level using regex for a date range in March 2020 +# (the regex will capture the dates 2020-03-20 to 2020-03-24) +spod_download_data(type = "od", zones = "municip", + date_regex = "2020032[0-4]") +} +} diff --git a/man/spod_expand_dates_from_regex.Rd b/man/spod_expand_dates_from_regex.Rd new file mode 100644 index 0000000..ed24db9 --- /dev/null +++ b/man/spod_expand_dates_from_regex.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/internal_utils.R +\name{spod_expand_dates_from_regex} +\alias{spod_expand_dates_from_regex} +\title{Function to expand dates from a regex} +\usage{ +spod_expand_dates_from_regex(date_regex) +} +\arguments{ +\item{date_regex}{A regular expression to match dates in the format yyyymmdd.} +} +\value{ +A character vector of dates matching the regex. +} +\description{ +This function generates a sequence of dates from a regular expression pattern. +based on the provided regular expression. +} +\keyword{internal} diff --git a/man/spod_get_data_dir.Rd b/man/spod_get_data_dir.Rd new file mode 100644 index 0000000..f291402 --- /dev/null +++ b/man/spod_get_data_dir.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get.R +\name{spod_get_data_dir} +\alias{spod_get_data_dir} +\title{Get the data directory} +\usage{ +spod_get_data_dir(quiet = FALSE) +} +\value{ +The data directory. +} +\description{ +This function retrieves the data directory from the environment variable SPANISH_OD_DATA_DIR. +If the environment variable is not set, it returns the temporary directory. +} +\keyword{internal} diff --git a/man/spod_get_latest_v1_file_list.Rd b/man/spod_get_latest_v1_file_list.Rd new file mode 100644 index 0000000..3b6a8ff --- /dev/null +++ b/man/spod_get_latest_v1_file_list.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_v1_data.R +\name{spod_get_latest_v1_file_list} +\alias{spod_get_latest_v1_file_list} +\title{Get latest file list from the XML for MITMA open mobiltiy data v1 (2020-2021)} +\usage{ +spod_get_latest_v1_file_list( + data_dir = spod_get_data_dir(), + xml_url = "https://opendata-movilidad.mitma.es/RSS.xml" +) +} +\arguments{ +\item{data_dir}{The directory where the data is stored. Defaults to the value returned by \code{spod_get_data_dir()}.} + +\item{xml_url}{The URL of the XML file to download. Defaults to "https://opendata-movilidad.mitma.es/RSS.xml".} +} +\value{ +The path to the downloaded XML file. 
+} +\description{ +Get latest file list from the XML for MITMA open mobiltiy data v1 (2020-2021) +} +\examples{ +if (FALSE) { + spod_get_latest_v1_file_list() +} +} diff --git a/man/spod_get_latest_v2_xml.Rd b/man/spod_get_latest_v2_xml.Rd index 7b9643e..2674530 100644 --- a/man/spod_get_latest_v2_xml.Rd +++ b/man/spod_get_latest_v2_xml.Rd @@ -6,17 +6,13 @@ \usage{ spod_get_latest_v2_xml( data_dir = spod_get_data_dir(), - xml_url = "https://movilidad-opendata.mitma.es/RSS.xml", - current_timestamp = format(Sys.time(), format = "\%Y-\%m-\%d", usetz = FALSE, tz = - "UTC") + xml_url = "https://movilidad-opendata.mitma.es/RSS.xml" ) } \arguments{ \item{data_dir}{The directory where the data is stored. Defaults to the value returned by \code{spod_get_data_dir()}.} \item{xml_url}{The URL of the XML file to download. Defaults to "https://movilidad-opendata.mitma.es/RSS.xml".} - -\item{current_timestamp}{The current timestamp to keep track of the version of the remote file list. Defaults to the current date.} } \value{ The path to the downloaded XML file. diff --git a/man/spod_get_metadata.Rd b/man/spod_get_metadata.Rd index 93c4bef..81e79fb 100644 --- a/man/spod_get_metadata.Rd +++ b/man/spod_get_metadata.Rd @@ -4,10 +4,12 @@ \alias{spod_get_metadata} \title{Get the data dictionary} \usage{ -spod_get_metadata(data_dir = spod_get_data_dir()) +spod_get_metadata(data_dir = spod_get_data_dir(), quiet = FALSE) } \arguments{ \item{data_dir}{The directory where the data is stored. Defaults to the value returned by \code{spod_get_data_dir()}.} + +\item{quiet}{Whether to suppress messages. Defaults to \code{FALSE}.} } \value{ The data dictionary. diff --git a/man/spod_get_od.Rd b/man/spod_get_od.Rd new file mode 100644 index 0000000..939a4d7 --- /dev/null +++ b/man/spod_get_od.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_v1_data.R +\name{spod_get_od} +\alias{spod_get_od} +\title{Retrieve the origin-destination v1 data (2020-2021)} +\usage{ +spod_get_od( + zones = c("districts", "dist", "distr", "municipalities", "muni", "municip"), + dates = NULL, + data_dir = spod_get_data_dir(), + quiet = FALSE, + read_fun = duckdb::tbl_file +) +} +\arguments{ +\item{zones}{The zones for which to download the data. Can be \code{"districts"} (or \code{"dist"}, \code{"distr"}) or \code{"municipalities"} (or \code{"muni"}, \code{"municip"}) for v1 data. Additionaly, these can be \code{"large_urban_areas"} (or \code{"lau"}) for v2 data.} + +\item{dates}{A \code{character} or \code{Date} vector of dates to process. Kindly keep in mind that v1 and v2 data follow different data collection methodologies and may not be directly comparable. Therefore, do not try to request data from both versions for the same date range. If you need to compare data from both versions, please refer to the respective codebooks and methodology documents. The v1 data covers the period from 2020-02-14 to 2021-05-09, and the v2 data covers the period from 2022-01-01 to the present until further notice. The true dates range is checked against the available data for each version on every function run. + +The possible values can be any of the following: +\itemize{ +\item A single date in ISO (YYYY-MM-DD) or YYYYMMDD format. \code{character} or \code{Date} object. +\item A vector of dates in ISO (YYYY-MM-DD) or YYYYMMDD format. \code{character} or \code{Date} object. Can be any non-consecutive sequence of dates. 
+\item A date range +\item eigher a \code{character} or \code{Date} object of length 2 with clearly named elements \code{start} and \code{end} in ISO (YYYY-MM-DD) or YYYYMMDD format. E.g. \code{c(start = "2020-02-15", end = "2020-02-17")}; +\item or a \code{character} object of the form \code{YYYY-MM-DD_YYYY-MM-DD} or \code{YYYYMMDD_YYYYMMDD}. For example, \verb{2020-02-15_2020-02-17} or \verb{20200215_20200217}. +\item A regular expression to match dates in the format \code{YYYYMMDD}. \code{character} object. For example, \verb{^202002} will match all dates in February 2020. +}} + +\item{data_dir}{The directory where the data is stored. Defaults to the value returned by \code{spod_get_data_dir()} which returns the value of the environment variable \code{SPANISH_OD_DATA_DIR} or a temporary directory if the variable is not set.} + +\item{quiet}{Logical. If \code{TRUE}, the function does not print messages to the console. Defaults to \code{FALSE}.} + +\item{read_fun}{The function to read the data. Defaults to \code{duckdb::tbl_file}.} +} +\value{ +A tibble with the origin-destination data. +} +\description{ +This function retrieves the v1 (2020-2021) origin-destination data from the specified data directory. +} diff --git a/man/spod_get_zones_v1.Rd b/man/spod_get_zones_v1.Rd new file mode 100644 index 0000000..79ec9ff --- /dev/null +++ b/man/spod_get_zones_v1.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_v1_data.R +\name{spod_get_zones_v1} +\alias{spod_get_zones_v1} +\title{Retrieves the zones for v1 data} +\usage{ +spod_get_zones_v1( + zones = c("districts", "dist", "distr", "municipalities", "muni", "municip"), + data_dir = spod_get_data_dir() +) +} +\arguments{ +\item{zones}{The zones for which to download the data. Can be \code{"districts"} (or \code{"dist"}, \code{"distr"}) or \code{"municipalities"} (or \code{"muni"}, \code{"municip"}).} + +\item{data_dir}{The directory where the data is stored.} +} +\value{ +A spatial object containing the zones data. +} +\description{ +This function retrieves the zones data from the specified data directory. +It can retrieve either "distritos" or "municipios" zones data. +} +\examples{ +if (FALSE) { + zones <- spod_get_zones() +} +} diff --git a/man/spod_is_data_version_overlaps.Rd b/man/spod_is_data_version_overlaps.Rd new file mode 100644 index 0000000..59e28ca --- /dev/null +++ b/man/spod_is_data_version_overlaps.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/internal_utils.R +\name{spod_is_data_version_overlaps} +\alias{spod_is_data_version_overlaps} +\title{Check if specified dates span both data versions} +\usage{ +spod_is_data_version_overlaps(dates) +} +\arguments{ +\item{dates}{A Dates vector of dates to check.} +} +\value{ +TRUE if the dates span both data versions, FALSE otherwise. +} +\description{ +This function checks if the specified dates or date ranges span both v1 and v2 data versions. 
+} +\keyword{internal} diff --git a/man/spod_match_data_type.Rd b/man/spod_match_data_type.Rd new file mode 100644 index 0000000..4bf6f37 --- /dev/null +++ b/man/spod_match_data_type.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/internal_utils.R +\name{spod_match_data_type} +\alias{spod_match_data_type} +\title{Match data types to folders} +\usage{ +spod_match_data_type( + type = c("od", "origin-destination", "os", "overnight_stays", "tpp", + "trips_per_person"), + ver = c(1, 2) +) +} +\arguments{ +\item{type}{The type of data to match. Can be "od", "origin-destination", "os", "overnight_stays", or "tpp", "trips_per_person".} + +\item{ver}{The version of the data to use. Defaults to 1. Can be 1 or 2.} +} +\description{ +Match data types to folders +} +\keyword{internal} diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..fe6ac28 --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,12 @@ +# This file is part of the standard setup for testthat. +# It is recommended that you do not modify it. +# +# Where should you do additional test configuration? +# Learn more about the roles of various files in: +# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview +# * https://testthat.r-lib.org/articles/special-files.html + +library(testthat) +library(spanishoddata) + +test_check("spanishoddata") diff --git a/tests/testthat/test-internal_utils.R b/tests/testthat/test-internal_utils.R new file mode 100644 index 0000000..dd6ef6a --- /dev/null +++ b/tests/testthat/test-internal_utils.R @@ -0,0 +1,100 @@ +# Prepare the testing environment using bundled xml files to avoid downloading data from the internet + +extdata_path <- system.file("extdata", package = "spanishoddata") +gz_files <- list.files(extdata_path, pattern = "data_links_.*\\.xml\\.gz", full.names = TRUE) + +if (length(gz_files) == 0) stop("No gzipped XML files found.") + +# Create a temporary directory +test_data_dir <- tempfile() +dir.create(test_data_dir, recursive = TRUE) + +current_date <- format(Sys.time(), format = "%Y-%m-%d", usetz = FALSE) + +# Copy and rename gzipped XML files to the temporary directory +for (gz_file in gz_files) { + if (grepl("v1", gz_file)) { + file.copy(gz_file, file.path(test_data_dir, paste0("data_links_v1_", current_date, ".xml.gz"))) + } else if (grepl("v2", gz_file)) { + file.copy(gz_file, file.path(test_data_dir, paste0("data_links_v2_", current_date, ".xml.gz"))) + } +} + +# Set the environment variable to the test directory +Sys.setenv(SPANISH_OD_DATA_DIR = test_data_dir) + + +test_that("single ISO date input", { + dates <- "2023-07-01" + result <- spod_dates_argument_to_dates_seq(dates) + expect_equal(result, as.Date("2023-07-01")) +}) + +test_that("single YYYYMMDD date input", { + dates <- "20230701" + result <- spod_dates_argument_to_dates_seq(dates) + expect_equal(result, as.Date("2023-07-01")) +}) + +test_that("vector of ISO dates", { + dates <- c("2023-07-01", "2023-07-03", "2023-07-05") + result <- spod_dates_argument_to_dates_seq(dates) + expect_equal(result, as.Date(c("2023-07-01", "2023-07-03", "2023-07-05"))) +}) + +test_that("vector of YYYYMMDD dates", { + dates <- c("20230701", "20230703", "20230705") + result <- spod_dates_argument_to_dates_seq(dates) + expect_equal(result, as.Date(c("2023-07-01", "2023-07-03", "2023-07-05"))) +}) + +test_that("date range in ISO format", { + dates <- "2023-07-01_2023-07-05" + result <- spod_dates_argument_to_dates_seq(dates) + expect_equal(result, seq.Date(from = 
as.Date("2023-07-01"), to = as.Date("2023-07-05"), by = "day")) +}) + +test_that("date range in YYYYMMDD format", { + dates <- "20230701_20230705" + result <- spod_dates_argument_to_dates_seq(dates) + expect_equal(result, seq.Date(from = as.Date("2023-07-01"), to = as.Date("2023-07-05"), by = "day")) +}) + +test_that("named vector date range in ISO format", { + dates <- c(start = "2023-07-01", end = "2023-07-05") + result <- spod_dates_argument_to_dates_seq(dates) + expect_equal(result, seq.Date(from = as.Date("2023-07-01"), to = as.Date("2023-07-05"), by = "day")) +}) + +test_that("named vector date range in YYYYMMDD format", { + dates <- c(start = "20230701", end = "20230705") + result <- spod_dates_argument_to_dates_seq(dates) + expect_equal(result, seq.Date(from = as.Date("2023-07-01"), to = as.Date("2023-07-05"), by = "day")) +}) + +test_that("regex pattern matching dates", { + dates <- "^202307" + result <- spod_dates_argument_to_dates_seq(dates) + expected_dates <- seq.Date(from = as.Date("2023-07-01"), to = as.Date("2023-07-31"), by = "day") + expect_equal(result, expected_dates) +}) + +test_that("invalid input type", { + dates <- 20230701 + expect_error(spod_dates_argument_to_dates_seq(dates), "Invalid date input format. Please provide a character vector or Date object.") +}) + +test_that("dates span both v1 and v2 data", { + dates <- c("2021-05-09", "2022-01-01") + expect_error(spod_dates_argument_to_dates_seq(dates), + "Dates found in both v1 and v2 data.") +}) + +test_that("dates that are out of availabe range of v1 data", { + dates <- c("2020-01-01", "2021-01-01") + expect_error(spod_dates_argument_to_dates_seq(dates), + "Some dates do not match the available data.") +}) + +# clean up +unlink(test_data_dir, recursive = TRUE)
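A minimal end-to-end usage sketch of the exported functions introduced in this changeset (illustrative only, assuming the package is attached as `spanishoddata` and that `SPANISH_OD_DATA_DIR` points to a writable cache directory):

library(spanishoddata)
Sys.setenv(SPANISH_OD_DATA_DIR = "/path/to/data") # replace with a persistent local directory

# list remote and locally cached v1 (2020-2021) files
metadata <- spod_available_data_v1(check_local_files = TRUE)
head(metadata[, c("target_url", "data_ymd", "downloaded")])

# district boundaries used by the v1 zoning
districts <- spod_get_zones_v1("districts")

# download origin-destination files for a short date range in March 2020
od_files <- spod_download_data(
  type = "od",
  zones = "districts",
  dates = c(start = "2020-03-20", end = "2020-03-24")
)
od_files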