Skip to content

Commit

Permalink
Merge pull request #18 from Robinlovelace/get-v1-data
Browse files Browse the repository at this point in the history
Get v1 data, partial implementation
  • Loading branch information
e-kotov authored Aug 9, 2024
2 parents c511dc9 + d1a1e52 commit 2ce6f68
Show file tree
Hide file tree
Showing 26 changed files with 1,062 additions and 20 deletions.
2 changes: 1 addition & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ README.qmd
^LICENSE\.md$
^.*\.Rproj$
^\.Rproj\.user$
private
^private$
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
# Ignore all gz files
*.gz

# Exceptions for gz files in inst/extdata
!inst/extdata/*.gz

movilidad.duckdb
.Rhistory
zonificacion_distritos*
Expand Down
4 changes: 4 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,14 @@ Imports:
lubridate,
purrr,
readr,
rlang (>= 1.1.0),
sf,
stringr,
tibble,
xml2
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.2
Suggests:
testthat (>= 3.0.0)
Config/testthat/edition: 3
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# Generated by roxygen2: do not edit by hand

export(spod_available_data_v1)
export(spod_download_data)
export(spod_get)
export(spod_get_latest_v1_file_list)
export(spod_get_latest_v2_xml)
export(spod_get_metadata)
export(spod_get_zones)
export(spod_get_zones_v1)
106 changes: 106 additions & 0 deletions R/download_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#' Download the data files of specified type, zones, and dates
#'
#' This function downloads the data files of the specified type, zones and dates. The data version (v1 or v2) is detected automatically from the requested dates.
#' @param type The type of data to download. Can be `"origin-destination"` (or just `"od"`), or `"trips_per_person"` (or just `"tpp"`) for v1 data. For v2 data `"overnight_stays"` (or just `"os"`) is also available. More data types to be supported in the future. See respective codebooks for more information. **ADD CODEBOOKS! to the package**
#' @param zones The zones for which to download the data. Can be `"districts"` (or `"dist"`, `"distr"`) or `"municipalities"` (or `"muni"`, `"municip"`) for v1 data. Additionally, these can be `"large_urban_areas"` (or `"lau"`) for v2 data.
#' @inheritParams spod_dates_argument_to_dates_seq
#' @param data_dir The directory where the data is stored. Defaults to the value returned by `spod_get_data_dir()` which returns the value of the environment variable `SPANISH_OD_DATA_DIR` or a temporary directory if the variable is not set.
#' @param quiet Logical. If `TRUE`, the function does not print messages or download progress to the console. Defaults to `FALSE`.
#' @param return_output Logical. If `TRUE`, the function returns a character vector of the paths to the requested files. If `FALSE`, the function returns `NULL`.
#'
#' @return A character vector of the local paths to the requested files (both the ones downloaded now and the ones already cached). Unless `return_output = FALSE`, in which case the function invisibly returns `NULL`.
#'
#' @export
#' @examples
#' \dontrun{
#' # NOTE: the accepted forms of `dates` are defined by
#' # `spod_dates_argument_to_dates_seq()`; check its documentation if any of
#' # the forms below is rejected.
#'
#' # Download the origin-destination on district level for a date range in March 2020
#' spod_download_data(type = "od", zones = "districts",
#'   dates = c(start = "2020-03-20", end = "2020-03-24"))
#'
#' # Download the origin-destination on district level for select dates in 2020 and 2021
#' spod_download_data(type = "od", zones = "dist",
#'   dates = c("2020-03-20", "2020-03-24", "2021-03-20", "2021-03-24"))
#'
#' # Download the origin-destination on municipality level using regex for a date range in March 2020
#' # (the regex will capture the dates 2020-03-20 to 2020-03-24)
#' spod_download_data(type = "od", zones = "municip",
#'   dates = "2020032[0-4]")
#' }
spod_download_data <- function(
    type = c(
      "od", "origin-destination",
      "os", "overnight_stays",
      "tpp", "trips_per_person"
    ),
    zones = c(
      "districts", "dist", "distr",
      "municipalities", "muni", "municip",
      "lau", "large_urban_areas"
    ), # implement "urban_areas" for v2 data
    dates = NULL,
    data_dir = spod_get_data_dir(),
    quiet = FALSE,
    return_output = TRUE) {
  # convert english zone names to spanish words used in the default data paths
  zones <- match.arg(zones)
  zones <- spod_zone_names_en2es(zones)

  # this is where the date arguments are processed
  # for all the wrapper functions that use the spod_download_data() function
  # the dates are also processed here
  dates_to_use <- spod_dates_argument_to_dates_seq(dates = dates)

  # detect the data version from the requested dates
  # TODO: this leads to a second call to an internal spod_get_valid_dates(),
  # which in turn causes a second call to spod_available_data_v1() or
  # spod_get_metadata(), i.e. the xml files with metadata are read twice.
  # This is not optimal and should be fixed.
  ver <- spod_infer_data_v_from_dates(dates_to_use)
  if (isFALSE(quiet)) message("Data version detected from dates: ", ver)

  # convert english data type names to spanish words used in the default data paths
  type <- match.arg(type)
  type <- spod_match_data_type(type = type, ver = ver)

  # get the available data list and build the path pattern that selects the
  # requested type and zones; the order of type/zones in the path differs
  # between v1 and v2
  if (ver == 1) {
    # also checks for files already cached on disk
    metadata <- spod_available_data_v1(
      data_dir = data_dir,
      check_local_files = TRUE
    )
    path_pattern <- glue::glue("v{ver}.*{type}.*{zones}")
  } else if (ver == 2) {
    # TODO: replace with spod_available_data_v2() when available;
    # spod_get_metadata() can become a wrapper with a v1/v2 argument.
    # Potentially we can even automatically detect the data version based on
    # the time intervals that the user requests, but this is a bit
    # controversial, as the methodology behind v1 and v2 data generation is
    # not the same and Nommon+MITMA do not recommend mixing those together
    # and comparing absolute numbers of trips.
    metadata <- spod_get_metadata(data_dir = data_dir, quiet = quiet)
    path_pattern <- glue::glue("v{ver}.*{zones}.*{type}")
  } else {
    # fail early with a clear message instead of "object 'metadata' not found"
    stop("Unsupported data version: ", ver, ". Must be 1 or 2.", call. = FALSE)
  }

  # match the metadata to type, zones, version and dates
  requested_files <- metadata[
    grepl(path_pattern, metadata$local_path) &
      metadata$data_ymd %in% dates_to_use,
  ]

  files_to_download <- requested_files[!requested_files$downloaded, ]

  # only touch the file system and the network when something is missing
  if (nrow(files_to_download) > 0) {
    # pre-generate target directories for the files to download
    fs::dir_create(
      unique(fs::path_dir(files_to_download$local_path)),
      recurse = TRUE
    )

    # download the missing files
    downloaded_files <- curl::multi_download(
      urls = files_to_download$target_url,
      destfiles = files_to_download$local_path,
      progress = isFALSE(quiet),
      resume = TRUE
    )

    # set download status for downloaded files as TRUE in requested_files
    requested_files$downloaded[
      requested_files$local_path %in% downloaded_files$destfile
    ] <- TRUE
  }

  # this may output too many dates, should be fixed when we create a flexible
  # date argument processing function. Keeping for now.
  if (isFALSE(quiet)) {
    message(
      "Retrieved data for requested dates: ",
      paste(dates_to_use, collapse = ", ")
    )
  }

  if (return_output) {
    return(requested_files$local_path)
  }
  invisible(NULL)
}
9 changes: 9 additions & 0 deletions R/folders.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Build the relative subfolder path of the raw data cache for a data version.
# Change the subfolder name for the raw data cache here to apply it globally.
#
# `ver` must be a single number, either 1 or 2; anything else is an error.
# Returns a character string such as "raw_data_cache/v1/" (with a trailing
# slash).
spod_subfolder_raw_data_cache <- function(ver = 1) {
  # Validate with base R only: the previously used
  # rlang:::check_number_whole() is an unexported rlang internal, so relying
  # on it via `:::` may break with any rlang release.
  # `||` short-circuits, so `%in%` only ever sees a non-missing numeric scalar.
  is_valid_ver <- is.numeric(ver) && length(ver) == 1L && !is.na(ver)
  if (!is_valid_ver || !ver %in% c(1, 2)) {
    stop("Invalid version number. Must be 1 or 2.", call. = FALSE)
  }
  base_subdir_name <- "raw_data_cache"
  paste0(base_subdir_name, "/v", ver, "/")
}
39 changes: 26 additions & 13 deletions R/get.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
#'
#' @param data_dir The directory where the data is stored. Defaults to the value returned by `spod_get_data_dir()`.
#' @param xml_url The URL of the XML file to download. Defaults to "https://movilidad-opendata.mitma.es/RSS.xml".
#' @param current_timestamp The current timestamp to keep track of the version of the remote file list. Defaults to the current date.
#'
#' @return The path to the downloaded XML file.
#' @export
Expand All @@ -12,13 +11,14 @@
#' }
spod_get_latest_v2_xml = function(
data_dir = spod_get_data_dir(),
xml_url = "https://movilidad-opendata.mitma.es/RSS.xml",
current_timestamp = format(Sys.time(), format = "%Y-%m-%d", usetz = FALSE, tz = "UTC")) {
xml_url = "https://movilidad-opendata.mitma.es/RSS.xml"
) {
if (!fs::dir_exists(data_dir)) {
fs::dir_create(data_dir)
}

current_filename = glue::glue("{data_dir}/data_links_{current_timestamp}.xml")
current_timestamp = format(Sys.time(), format = "%Y-%m-%d", usetz = FALSE, tz = "UTC")
current_filename = glue::glue("{data_dir}/data_links_v2_{current_timestamp}.xml")

message("Saving the file to: ", current_filename)
xml_requested = curl::curl_download(url = xml_url, destfile = current_filename, quiet = FALSE)
Expand All @@ -30,6 +30,7 @@ spod_get_latest_v2_xml = function(
#' This function retrieves the data dictionary for the specified data directory.
#'
#' @param data_dir The directory where the data is stored. Defaults to the value returned by `spod_get_data_dir()`.
#' @param quiet Whether to suppress messages. Defaults to `FALSE`.
#' @return The data dictionary.
#' @export
#' @examples
Expand All @@ -39,14 +40,14 @@ spod_get_latest_v2_xml = function(
#' names(metadata)
#' head(metadata)
#' }
spod_get_metadata = function(data_dir = spod_get_data_dir()) {
xml_files_list = fs::dir_ls(data_dir, type = "file", regexp = "data_links_") |> sort()
spod_get_metadata = function(data_dir = spod_get_data_dir(), quiet = FALSE) {
xml_files_list = fs::dir_ls(data_dir, type = "file", regexp = "data_links_v2") |> sort()
latest_data_links_xml_path = utils::tail(xml_files_list, 1)
if (length(latest_data_links_xml_path) == 0) {
message("Getting latest data links xml")
if(isFALSE(quiet)) message("Getting latest data links xml")
latest_data_links_xml_path = spod_get_latest_v2_xml(data_dir = data_dir)
} else {
message("Using existing data links xml: ", latest_data_links_xml_path)
if(isFALSE(quiet)) message("Using existing data links xml: ", latest_data_links_xml_path)
}

x_xml = xml2::read_xml(latest_data_links_xml_path)
Expand Down Expand Up @@ -74,12 +75,24 @@ spod_get_metadata = function(data_dir = spod_get_data_dir()) {
return(download_dt)
}

spod_get_data_dir = function() {
#' Get the data directory
#'
#' This function retrieves the data directory from the environment variable SPANISH_OD_DATA_DIR.
#' If the environment variable is not set, it returns the temporary directory.
#'
#' @return The data directory.
#' @keywords internal
spod_get_data_dir = function(quiet = FALSE) {
data_dir_env = Sys.getenv("SPANISH_OD_DATA_DIR")
if (data_dir_env == "") {
data_dir_env = tempdir()
if( data_dir_env == "" ) {
if (isFALSE(quiet)) warning("Warning: SPANISH_OD_DATA_DIR is not set. Using the temporary directory, which is not recommended, as the data will be deleted when the session ends.\n\n To set the data directory, use `Sys.setenv(SPANISH_OD_DATA_DIR = '/path/to/data')` or set SPANISH_OD_DATA_DIR permanently in the environment by editing the `.Renviron` file locally for current project with `usethis::edit_r_environ('project')` or `file.edit('.Renviron')` or globally for all projects with `usethis::edit_r_environ('user')` or `file.edit('~/.Renviron')`.")
data_dir_env = tempdir() # if not set, use the temp directory
}
# check if dir exists and create it if it doesn't
if (!fs::dir_exists(data_dir_env)) {
fs::dir_create(data_dir_env)
}
return(data_dir_env)
return(fs::path_real(data_dir_env))
}

#' Retrieves the zones data
Expand All @@ -104,7 +117,7 @@ spod_get_zones = function(
metadata_distritos = metadata[sel_distritos, ]
dir_name = dirname(metadata_distritos$local_path[1])
if (!fs::dir_exists(dir_name)) {
fs::dir_create(dir_name)
fs::dir_create(dir_name, recurse = TRUE)
}
for (i in 1:nrow(metadata_distritos)) {
if (!fs::file_exists(metadata_distritos$local_path[i])) {
Expand Down
Loading

0 comments on commit 2ce6f68

Please sign in to comment.