diff --git a/.Rbuildignore b/.Rbuildignore
index d7e1207..6b54f13 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -12,3 +12,6 @@ README.qmd
 ^.*\.Rproj$
 ^\.Rproj\.user$
 ^private$
+^doc$
+^Meta$
+^vignettes/.*_files$
diff --git a/.gitignore b/.gitignore
index 1fceb1f..bad9e2f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,7 @@ private
 /.quarto/
 .Rproj.user
+inst/doc
+.Renviron
+/doc/
+/Meta/
diff --git a/DESCRIPTION b/DESCRIPTION
index 142e58a..f083911 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -23,14 +23,16 @@ Depends:
 Imports:
     curl,
     DBI,
+    dplyr,
     duckdb,
     fs,
     glue,
     lubridate,
+    parallelly,
     purrr,
     readr,
-    rlang (>= 1.1.0),
     sf,
+    stats,
     stringr,
     tibble,
     xml2
@@ -38,5 +40,8 @@ Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.2
 Suggests:
+    quarto,
     testthat (>= 3.0.0)
 Config/testthat/edition: 3
+VignetteBuilder:
+    quarto
diff --git a/NAMESPACE b/NAMESPACE
index f2616bc..909bb95 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,10 +1,13 @@
 # Generated by roxygen2: do not edit by hand
 
 export(spod_available_data_v1)
+export(spod_convert_od_v1_to_duckdb)
 export(spod_download_data)
 export(spod_get)
+export(spod_get_data_dir)
 export(spod_get_latest_v1_file_list)
 export(spod_get_latest_v2_xml)
 export(spod_get_metadata)
+export(spod_get_od_v1)
 export(spod_get_zones)
 export(spod_get_zones_v1)
diff --git a/R/convert_data.R b/R/convert_data.R
new file mode 100644
index 0000000..cf4a963
--- /dev/null
+++ b/R/convert_data.R
@@ -0,0 +1,92 @@
+#' Convert all downloaded v1 origin-destination data to DuckDB
+#'
+#' @param save_dir The path to the directory where the DuckDB files will be saved. If `NULL`, uses the default location in `data_dir` (set by the `SPANISH_OD_DATA_DIR` environment variable). Therefore, the default relative path is `/clean_data/v1/tabular/duckdb/od_{zones}.duckdb`.
+#' @inheritParams spod_get_zones_v1
+#' @inheritParams spod_duckdb_limit_resources
+#' @param overwrite Logical. If `TRUE`, overwrites existing DuckDB files. Defaults to `FALSE`.
+#' @return Path to the saved DuckDB file.
+#' @export
+spod_convert_od_v1_to_duckdb <- function(
+    zones = c(
+      "districts", "dist", "distr", "distritos",
+      "municipalities", "muni", "municip", "municipios"
+    ),
+    data_dir = spod_get_data_dir(),
+    save_dir = NULL,
+    quiet = FALSE,
+    duck_max_mem = 3,
+    duck_max_threads = parallelly::availableCores(),
+    overwrite = FALSE) {
+  zones <- match.arg(zones)
+  zones <- spod_zone_names_en2es(zones)
+
+  # if save_dir is NULL, use the default location in data_dir
+  if (is.null(save_dir)) {
+    save_dir <- fs::path(
+      data_dir,
+      spod_subfolder_clean_data_cache(ver = 1),
+      "tabular/duckdb/"
+    )
+  }
+
+  # ensure save_dir exists
+  if (!fs::dir_exists(save_dir)) fs::dir_create(save_dir, recurse = TRUE)
+
+  # create the duckdb save path
+  duckdb_save_path <- glue::glue("{save_dir}/od_{zones}.duckdb")
+
+  # check if the duckdb file already exists
+  if (fs::file_exists(duckdb_save_path) && !overwrite) {
+    message("DuckDB file already exists: ", duckdb_save_path)
+    # in future, perhaps add code that provides a summary of what's inside that file
+    # ask the user if they want to overwrite
+    response <- readline(prompt = "Overwrite existing duckdb file? (yes/no) ")
+    overwrite_duckdb <- any(tolower(response) %in% c("y", "yes", "yes."))
+    if (!overwrite_duckdb) {
+      message(glue::glue("Exiting without overwriting the existing DuckDB file. You may delete it from {duckdb_save_path} manually and rerun the function, or rerun it with `overwrite = TRUE`."))
+      return(invisible(NULL))
+    } else {
+      if (isFALSE(quiet)) message(glue::glue("Overwriting existing DuckDB file: ", duckdb_save_path))
+      fs::file_delete(duckdb_save_path)
+    }
+  }
+
+  if (isFALSE(quiet)) message(glue::glue("Using {duck_max_mem} GB of memory and {duck_max_threads} threads. You may adjust this using the function arguments `duck_max_mem` and `duck_max_threads`."))
+  if (isFALSE(quiet)) message(glue::glue("Converting cached v1 od data for {zones} to DuckDB: ", duckdb_save_path, "... This may take a while."))
+  # add some indication of how long it may take from empirical experimentation
+  # hopefully, the progress_bar feature will be implemented in the duckdb R package soon, bug filed here https://github.com/duckdb/duckdb-r/issues/199
+
+  # get dates of cached data
+  # v1_meta <- spod_available_data_v1(check_local_files = TRUE)
+  # v1_meta <- v1_meta[v1_meta$downloaded == TRUE,]
+  # v1_meta <- v1_meta[grepl("maestra1", v1_meta$local_path),]
+  # v1_meta <- v1_meta[grepl(zones, v1_meta$local_path),]
+  # dates <- v1_meta$data_ymd
+
+  # create duckdb connection
+  drv <- duckdb::duckdb()
+  con <- DBI::dbConnect(drv, dbdir = duckdb_save_path, read_only = FALSE)
+
+  # define memory and threads limits
+  con <- spod_duckdb_limit_resources(
+    con = con,
+    duck_max_mem = duck_max_mem,
+    duck_max_threads = duck_max_threads
+  )
+
+  # connect to the folder of CSVs with v1 od data
+  con <- spod_duckdb_od_v1(con = con, zones = zones)
+  # DBI::dbListTables(con)
+
+  # import the view of CSV files into duckdb as a materialised table
+  DBI::dbExecute(con, "CREATE TABLE od AS SELECT * FROM od_csv_clean ;")
+
+  DBI::dbDisconnect(con, shutdown = TRUE)
+  duckdb::duckdb_shutdown(drv)
+
+  message("Cached v1 origin-destination data imported to DuckDB at: ", duckdb_save_path)
+
+  return(duckdb_save_path)
+}
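Review note: a minimal usage sketch for the new converter, assuming `SPANISH_OD_DATA_DIR` is set and district-level v1 data has already been downloaded with `spod_download_data()`; the row-count query is purely illustrative:

```r
library(spanishoddata)

# convert the cached district-level CSVs to a single DuckDB file
db_path <- spod_convert_od_v1_to_duckdb(zones = "districts")

# reconnect to the resulting file and query the materialised `od` table
con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path, read_only = TRUE)
DBI::dbGetQuery(con, "SELECT COUNT(*) AS n_rows FROM od;")
DBI::dbDisconnect(con, shutdown = TRUE)
```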
diff --git a/R/download_data.R b/R/download_data.R
index 10101b3..272ed96 100644
--- a/R/download_data.R
+++ b/R/download_data.R
@@ -1,91 +1,101 @@
 #' Download the data files of specified type, zones, and dates
-#' 
+#'
 #' This function downloads the data files of the specified type, zones, dates and data version.
 #' @param type The type of data to download. Can be `"origin-destination"` (or just `"od"`), or `"trips_per_person"` (or just `"tpp"`) for v1 data. For v2 data `"overnight_stays"` (or just `"os"`) is also available. More data types to be supported in the future. See the respective codebooks for more information. **ADD CODEBOOKS! to the package**
-#' @param zones The zones for which to download the data. Can be `"districts"` (or `"dist"`, `"distr"`) or `"municipalities"` (or `"muni"`, `"municip"`) for v1 data. Additionaly, these can be `"large_urban_areas"` (or `"lau"`) for v2 data.
+#' @param zones The zones for which to download the data. Can be `"districts"` (or `"dist"`, `"distr"`, or the original Spanish `"distritos"`) or `"municipalities"` (or `"muni"`, `"municip"`, or the original Spanish `"municipios"`). Additionally, these can be `"large_urban_areas"` (or `"lau"`, or the original Spanish `"grandes_areas_urbanas"`, or `"gau"`) for v2 data.
 #' @inheritParams spod_dates_argument_to_dates_seq
 #' @param data_dir The directory where the data is stored. Defaults to the value returned by `spod_get_data_dir()` which returns the value of the environment variable `SPANISH_OD_DATA_DIR` or a temporary directory if the variable is not set.
 #' @param quiet Logical. If `TRUE`, the function does not print messages to the console. Defaults to `FALSE`.
 #' @param return_output Logical. If `TRUE`, the function returns a character vector of the paths to the downloaded files. If `FALSE`, the function returns `NULL`.
-#'
+#'
 #' @return A character vector of the paths to the downloaded files. Unless `return_output = FALSE`, in which case the function returns `NULL`.
-#'
-#' @export
+#'
+#' @export
 #' @examples
 #' \dontrun{
 #' # Download the origin-destination data on district level for a date range in March 2020
-#' spod_download_data(type = "od", zones = "districts",
-#'   date_range = c("2020-03-20", "2020-03-24"))
-#'
+#' spod_download_data(
+#'   type = "od", zones = "districts",
+#'   dates = c(start = "2020-03-20", end = "2020-03-24")
+#' )
+#'
 #' # Download the origin-destination data on district level for select dates in 2020 and 2021
-#' spod_download_data(type = "od", zones = "dist",
-#'   dates_list = c("2020-03-20", "2020-03-24", "2021-03-20", "2021-03-24"))
-#'
-#' # Download the origin-destination data on municipality level using a regex for a date range in March 2020
+#' spod_download_data(
+#'   type = "od", zones = "dist",
+#'   dates = c("2020-03-20", "2020-03-24", "2021-03-20", "2021-03-24")
+#' )
+#'
+#' # Download the origin-destination data on municipality level using a regex for a date range in March 2020
 #' # (the regex will capture the dates 2020-03-20 to 2020-03-24)
-#' spod_download_data(type = "od", zones = "municip",
-#'   date_regex = "2020032[0-4]")
+#' spod_download_data(
+#'   type = "od", zones = "municip",
+#'   dates = "2020032[0-4]"
+#' )
 #' }
 spod_download_data <- function(
-    type = c(
-      "od", "origin-destination",
-      "os", "overnight_stays",
-      "tpp", "trips_per_person"),
-    zones = c("districts", "dist", "distr",
-      "municipalities", "muni", "municip",
-      "lau", "large_urban_areas"), # implement "urban_areas" for v2 data
-    dates = NULL,
-    data_dir = spod_get_data_dir(),
-    quiet = FALSE,
-    return_output = TRUE
-) {
+    type = c(
+      "od", "origin-destination",
+      "os", "overnight_stays",
+      "tpp", "trips_per_person"
+    ),
+    zones = c(
+      "districts", "dist", "distr", "distritos",
+      "municipalities", "muni", "municip", "municipios",
+      "lau", "large_urban_areas", "gau", "grandes_areas_urbanas"
+    ), # implement "urban_areas" for v2 data
+    dates = NULL,
+    data_dir = spod_get_data_dir(),
+    quiet = FALSE,
+    return_output = TRUE) {
   # convert english zone names to spanish words used in the default data paths
   zones <- match.arg(zones)
   zones <- spod_zone_names_en2es(zones)
-
-  # this is where the date arguments are processed
-  # for all the wrapper functions that use the spod_download_data() function the dates are also processed here
+
   dates_to_use <- spod_dates_argument_to_dates_seq(dates = dates)
+
   # check version
   # replace this argument with automatic version detection based on the dates requested?
   ver <- spod_infer_data_v_from_dates(dates_to_use) # this leads to a second call to an internal spod_get_valid_dates() which in turn causes a second call to spod_available_data_v1() or spod_get_metadata(). This results in reading the xml files with metadata for the second time. This is not optimal and should be fixed.
   if (isFALSE(quiet)) message("Data version detected from dates: ", ver)
-
+
   # convert english data type names to spanish words used in the default data paths
   type <- match.arg(type)
   type <- spod_match_data_type(type = type, ver = ver)
-
-
-
+
+
   # get the available data list while checking for files already cached on disk
-  if( ver == 1) {
-    metadata <- spod_available_data_v1(data_dir = data_dir,
-      check_local_files = TRUE)
+  if (ver == 1) {
+    metadata <- spod_available_data_v1(
+      data_dir = data_dir,
+      check_local_files = TRUE
+    )
   } else if (ver == 2) {
     metadata <- spod_get_metadata(data_dir = data_dir) # replace with spod_available_data_v2() when available, spod_get_metadata can become a wrapper with v1/v2 argument. Potentially we can even automatically detect the data version based on the time intervals that the user requests, but this is a bit controversial, as the methodology behind v1 and v2 data generation is not the same and Nommon+MITMA do not recommend mixing those together and comparing absolute numbers of trips.
   }
-
+
   # match the metadata to type, zones, version and dates
-  if(ver == 1){
+  if (ver == 1) {
     requested_files <- metadata[
       grepl(glue::glue("v{ver}.*{type}.*{zones}"), metadata$local_path) &
-      metadata$data_ymd %in% dates_to_use,
+        metadata$data_ymd %in% dates_to_use,
     ]
-  } else if(ver == 2){
+  } else if (ver == 2) {
     requested_files <- metadata[
       grepl(glue::glue("v{ver}.*{zones}.*{type}"), metadata$local_path) &
-      metadata$data_ymd %in% dates_to_use,
+        metadata$data_ymd %in% dates_to_use,
     ]
   }
 
   files_to_download <- requested_files[!requested_files$downloaded, ]
-
+
   # pre-generate target paths for the files to download
   fs::dir_create(
     unique(fs::path_dir(files_to_download$local_path)),
-    recurse = TRUE)
+    recurse = TRUE
+  )
 
   # download the missing files
   downloaded_files <- curl::multi_download(
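Review note: for reference, all of the following forms of the `dates` argument are accepted; they are normalised by `spod_dates_argument_to_dates_seq()` in `R/internal_utils.R` below. A sketch, assuming the requested dates fall inside the v1 window:

```r
# single day, ISO or YYYYMMDD
spod_download_data(type = "od", zones = "districts", dates = "2020-03-20")

# named date range
spod_download_data(type = "od", zones = "districts",
                   dates = c(start = "2020-03-20", end = "2020-03-24"))

# compact range string
spod_download_data(type = "od", zones = "districts",
                   dates = "2020-03-20_2020-03-24")

# regular expression over dates in YYYYMMDD format
spod_download_data(type = "od", zones = "districts", dates = "2020032[0-4]")
```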
diff --git a/R/duckdb_helpers.R b/R/duckdb_helpers.R
new file mode 100644
index 0000000..8d02874
--- /dev/null
+++ b/R/duckdb_helpers.R
@@ -0,0 +1,337 @@
+#' Create a duckdb connection to v1 OD data
+#'
+#' This function creates a duckdb connection to the v1 OD data.
+#'
+#' @param con A duckdb connection object. If not specified, a new in-memory connection will be created.
+#' @inheritParams spod_download_data
+#' @return A duckdb connection object with 2 views:
+#'
+#' * `od_csv_raw` - a raw table view of all cached CSV files with the origin-destination data that has been previously cached in $SPANISH_OD_DATA_DIR
+#'
+#' * `od_csv_clean` - a cleaned-up table view of `od_csv_raw` with column names and values translated and mapped to English. This still includes all cached data.
+#'
+#' The structure of the cleaned-up view `od_csv_clean` is as follows:
+#'
+#' \describe{
+#'   \item{full_date}{\code{Date}. The full date of the trip, including year, month, and day.}
+#'   \item{id_origin}{\code{factor}. The identifier for the origin location of the trip, formatted as a code (e.g., '01001_AM').}
+#'   \item{id_destination}{\code{factor}. The identifier for the destination location of the trip, formatted as a code (e.g., '01001_AM').}
+#'   \item{activity_origin}{\code{factor}. The type of activity at the origin location (e.g., 'home', 'work'). \strong{Note:} Only available for district level data.}
+#'   \item{activity_destination}{\code{factor}. The type of activity at the destination location (e.g., 'home', 'other'). \strong{Note:} Only available for district level data.}
+#'   \item{residence_province}{\code{factor}. The province of residence for the individual making the trip (e.g., 'Cuenca', 'Girona'). \strong{Note:} Only available for district level data.}
+#'   \item{time_slot}{\code{integer}. The time slot during which the trip started, represented as an integer (e.g., 0, 1, 2).}
+#'   \item{distance}{\code{factor}. The distance category of the trip, represented as a code (e.g., '002-005' for 2-5 km).}
+#'   \item{n_trips}{\code{double}. The number of trips taken within the specified time slot and distance.}
+#'   \item{trips_total_length_km}{\code{double}. The total length of all trips in kilometers for the specified time slot and distance.}
+#'   \item{year}{\code{double}. The year of the trip.}
+#'   \item{month}{\code{double}. The month of the trip.}
+#'   \item{day}{\code{double}. The day of the trip.}
+#' }
+#'
+#' The structure of the original data in `od_csv_raw` is as follows:
+#'
+#' \describe{
+#'   \item{fecha}{\code{Date}. The date of the trip, including year, month, and day.}
+#'   \item{origen}{\code{character}. The identifier for the origin location of the trip, formatted as a character string (e.g., '01001_AM').}
+#'   \item{destino}{\code{character}. The identifier for the destination location of the trip, formatted as a character string (e.g., '01001_AM').}
+#'   \item{actividad_origen}{\code{character}. The type of activity at the origin location (e.g., 'casa', 'trabajo').}
+#'   \item{actividad_destino}{\code{character}. The type of activity at the destination location (e.g., 'otros', 'trabajo').}
+#'   \item{residencia}{\code{character}. The code representing the residence of the individual making the trip (e.g., '01') according to the official INE classification.}
+#'   \item{edad}{\code{character}. The age of the individual making the trip. This data is actually filled with 'NA' values, which is why this column is removed in the cleaned-up and translated view described above.}
+#'   \item{periodo}{\code{integer}. The time period during which the trip started, represented as an integer (e.g., 0, 1, 2).}
+#'   \item{distancia}{\code{character}. The distance category of the trip, represented as a character string (e.g., '002-005' for 2-5 km).}
+#'   \item{viajes}{\code{double}. The number of trips taken within the specified time period and distance.}
+#'   \item{viajes_km}{\code{double}. The total length of all trips in kilometers for the specified time period and distance.}
+#'   \item{day}{\code{double}. The day of the trip.}
+#'   \item{month}{\code{double}. The month of the trip.}
+#'   \item{year}{\code{double}. The year of the trip.}
+#' }
+#' @keywords internal
+spod_duckdb_od_v1 <- function(
+    con = DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:", read_only = FALSE),
+    zones = c(
+      "districts", "dist", "distr", "distritos",
+      "municipalities", "muni", "municip", "municipios",
+      "lau", "large_urban_areas", "gau", "grandes_areas_urbanas"
+    ),
+    data_dir = spod_get_data_dir()) {
+  ver <- 1
+
+  zones <- match.arg(zones)
+  zones <- spod_zone_names_en2es(zones)
+
+  csv_folder <- paste0(
+    data_dir, "/",
+    spod_subfolder_raw_data_cache(ver = ver),
+    "/maestra1-mitma-", spod_zone_names_en2es(zones),
+    "/ficheros-diarios/"
+  )
+
+  # create a view of the csv files and preset the variable types
+  if (zones == "distritos") {
+    DBI::dbSendStatement(
+      con,
+      dplyr::sql(
+        glue::glue(
+          "CREATE VIEW od_csv_raw AS SELECT *
+          FROM read_csv_auto('{csv_folder}**/*.txt.gz', delim='|', header=TRUE, hive_partitioning=TRUE,
+          columns={{
+            'fecha': 'DATE',
+            'origen': 'VARCHAR',
+            'destino': 'VARCHAR',
+            'actividad_origen': 'VARCHAR',
+            'actividad_destino': 'VARCHAR',
+            'residencia': 'VARCHAR',
+            'edad': 'VARCHAR',
+            'periodo': 'INTEGER',
+            'distancia': 'VARCHAR',
+            'viajes': 'DOUBLE',
+            'viajes_km': 'DOUBLE'
+          }},
+          dateformat='%Y%m%d');"
+        )
+      )
+    )
+  } else if (zones == "municipios") {
+    DBI::dbSendStatement(
+      con,
+      dplyr::sql(
+        glue::glue(
+          "CREATE VIEW od_csv_raw AS SELECT *
+          FROM read_csv_auto('{csv_folder}**/*.txt.gz', delim='|', header=TRUE, hive_partitioning=TRUE,
+          columns={{
+            'fecha': 'DATE',
+            'origen': 'VARCHAR',
+            'destino': 'VARCHAR',
+            'periodo': 'INTEGER',
+            'distancia': 'VARCHAR',
+            'viajes': 'DOUBLE',
+            'viajes_km': 'DOUBLE'
+          }},
+          dateformat='%Y%m%d');"
+        )
+      )
+    )
+  }
+
+  # preview table
+  # DBI::dbGetQuery(con, "SELECT * FROM od_csv_raw LIMIT 10") |> dplyr::glimpse() # for debugging
+
+  # create ENUMs
+
+  # zones ENUMs
+  zones <- spod_zone_names_en2es(zones)
+  spatial_data <- spod_get_zones_v1(zones, quiet = TRUE)
+  unique_ids <- unique(spatial_data$id)
+  DBI::dbSendStatement(
+    con,
+    dplyr::sql(
+      paste0(
+        "CREATE TYPE ZONES_ENUM AS ENUM ('",
+        paste0(unique_ids, collapse = "','"),
+        "');"
+      )
+    )
+  )
+
+  # create ACTIV_ENUM
+  if (zones == "distritos") {
+    DBI::dbSendStatement(
+      con,
+      dplyr::sql("CREATE TYPE ACTIV_ENUM AS ENUM ('home', 'work', 'other')")
+    )
+  }
+
+  # create DISTANCE_ENUM
+  DBI::dbSendStatement(
+    con,
+    dplyr::sql("CREATE TYPE DISTANCE_ENUM AS ENUM ('002-005', '005-010', '010-050', '0005-002', '050-100', '100+');")
+  )
+
+  # create INE province ENUM
+  if (zones == "distritos") {
+    spod_duckdb_create_province_enum(con)
+    # DBI::dbGetQuery(con, "SELECT enum_range(NULL::INE_PROV_ENUM)") # check that it was created, remove this line when the package is stable
+    # for debugging
+    # DBI::dbSendStatement(con, "DROP TYPE INE_PROV_ENUM") # remove this line when the package is stable
+
+    # create a second view with the desired data types, including ENUMs
+    # create a view to fix variable types and recode values to English
+    # NOTE: this raises a non-ASCII character WARNING on R CMD check, so we will need to store this query in a text file
+    # load when_then_provinces from a system file in inst/extdata/sql-queries/when-recode-provinces.txt
+    when_then_provinces <- readLines(
+      system.file(
+        "extdata/sql-queries/when-recode-provinces.txt",
+        package = "spanishoddata"
+      )
+    ) |>
+      paste(collapse = "\n")
+
+    # now execute the query, pasting in the contents of when_then_provinces
+    DBI::dbSendStatement(
+      con,
+      dplyr::sql(
+        glue::glue(
+          "CREATE VIEW od_csv_clean AS SELECT
+          fecha AS full_date,
+          CAST(origen AS ZONES_ENUM) AS id_origin,
+          CAST(destino AS ZONES_ENUM) AS id_destination,
+          CAST(CASE actividad_origen
+            WHEN 'casa' THEN 'home'
+            WHEN 'otros' THEN 'other'
+            WHEN 'trabajo_estudio' THEN 'work'
+          END AS ACTIV_ENUM) AS activity_origin,
+          CAST(CASE actividad_destino
+            WHEN 'casa' THEN 'home'
+            WHEN 'otros' THEN 'other'
+            WHEN 'trabajo_estudio' THEN 'work'
+          END AS ACTIV_ENUM) AS activity_destination,
+          CAST(CASE residencia
+            {when_then_provinces}
+          END AS INE_PROV_ENUM) AS residence_province,
+          periodo AS time_slot,
+          CAST(distancia AS DISTANCE_ENUM) AS distance,
+          viajes AS n_trips,
+          viajes_km AS trips_total_length_km,
+          year AS year,
+          month AS month,
+          day AS day
+          FROM od_csv_raw;"
+        )
+      )
+    )
+  } else if (zones == "municipios") {
+    DBI::dbSendStatement(
+      con,
+      dplyr::sql(
+        "CREATE VIEW od_csv_clean AS SELECT
+        fecha AS full_date,
+        CAST(origen AS ZONES_ENUM) AS id_origin,
+        CAST(destino AS ZONES_ENUM) AS id_destination,
+        periodo AS time_slot,
+        CAST(distancia AS DISTANCE_ENUM) AS distance,
+        viajes AS n_trips,
+        viajes_km AS trips_total_length_km,
+        year AS year,
+        month AS month,
+        day AS day
+        FROM od_csv_raw;"
+      )
+    )
+  }
+
+  # preview the result for debugging
+  # DBI::dbGetQuery(con, "SELECT * FROM od_csv_clean LIMIT 10") |> dplyr::glimpse()
+
+  # return the connection as a duckdb object
+  return(con)
+}
+
+#' Filter a duckdb connection by dates
+#' @param con A duckdb connection
+#' @param source_view_name The name of the source duckdb "view" (the virtual table, in the context of the current package likely connected to a folder of CSV files).
+#' @param new_view_name The name of the new duckdb "view" to create, filtered to the requested dates.
+#' @inheritParams spod_dates_argument_to_dates_seq
+#' @return The same duckdb connection with the new filtered view added.
+#' @keywords internal
+spod_duckdb_filter_by_dates <- function(con, source_view_name, new_view_name, dates) {
+  # prepare the query to filter by dates
+  query <- dplyr::sql(
+    glue::glue(
+      "CREATE VIEW {new_view_name} AS SELECT * FROM {source_view_name} ",
+      spod_sql_where_dates(dates)
+    )
+  )
+
+  # create a view with a filter to the desired dates
+  DBI::dbSendStatement(con, query)
+
+  return(con)
+}
+
+#' Create an ENUM type of INE province names on a duckdb connection
+#' @param con A duckdb connection
+#' @return The same duckdb connection with the INE_PROV_ENUM type created.
+#' @keywords internal
+spod_duckdb_create_province_enum <- function(con) {
+  # load provinces with non-ASCII names
+  provinces_enum <- readLines(
+    system.file("extdata/sql-queries/provinces-enum.txt",
+      package = "spanishoddata"
+    )
+  ) |>
+    paste(collapse = "\n")
+
+  # create INE_PROV_ENUM
+  DBI::dbSendStatement(
+    con,
+    dplyr::sql(
+      glue::glue(
+        "CREATE TYPE INE_PROV_ENUM AS ENUM (
+        {provinces_enum}
+        );"
+      )
+    )
+  )
+
+  # for debugging
+  # DBI::dbGetQuery(con, "SELECT enum_range(NULL::INE_PROV_ENUM)") # check that it was created, remove this line when the package is stable
+  # DBI::dbSendStatement(con, "DROP TYPE INE_PROV_ENUM") # remove this line when the package is stable
+
+  return(con)
+}
+
+#' Generate the WHERE part of an SQL query from a sequence of dates
+#' @param dates A Dates vector of dates to process.
+#' @return A character string with the SQL WHERE clause.
+#' @keywords internal
+spod_sql_where_dates <- function(dates) {
+  # Extract unique year, month, and day combinations from the dates
+  date_parts <- data.frame(
+    year = format(dates, "%Y"),
+    month = format(dates, "%m"),
+    day = format(dates, "%d")
+  )
+
+  # Get distinct rows and sort them by year, month, and day
+  date_parts <- date_parts[!duplicated(date_parts), ]
+  date_parts <- date_parts[order(date_parts$year, date_parts$month, date_parts$day), ]
+
+  # Create the WHERE conditions for each unique date
+  where_conditions <- stats::aggregate(day ~ year + month, data = date_parts, FUN = function(x) paste(x, collapse = ", "))
+  where_conditions$condition <- paste0(
+    "(year = ", where_conditions$year,
+    " AND month = ", where_conditions$month,
+    " AND day IN (", where_conditions$day, "))"
+  )
+
+  # Combine all conditions into a single WHERE clause
+  sql_query <- paste0(
+    "WHERE ",
+    paste(where_conditions$condition, collapse = " OR ")
+  )
+
+  return(sql_query)
+}
+
+#' Set maximum memory and number of threads for a DuckDB connection
+#' @param con A duckdb connection
+#' @param duck_max_mem The maximum memory to use in GB. A conservative default is 3 GB, which should be enough for resaving the data to DuckDB from a folder of CSV.gz files, while being small enough to fit in the memory of even most old computers. For data analysis using the already converted data (in DuckDB or Parquet format) or with the raw CSV.gz data, it is recommended to increase it according to the available resources.
+#' @param duck_max_threads The maximum number of threads to use. Defaults to the number of available cores minus 1.
+#' @return The same duckdb connection with the limits applied.
+#' @keywords internal
+spod_duckdb_limit_resources <- function(
+    con,
+    duck_max_mem = 3, # in GB; defaults to 3 GB, which should be enough to resave the data while being small enough to fit in the memory of even most old computers
+    duck_max_threads = parallelly::availableCores() - 1 # leave one core for other tasks by default
+    ) {
+  DBI::dbExecute(
+    con,
+    dplyr::sql(
+      glue::glue("SET max_memory='{duck_max_mem}GB';")
+    )
+  )
+
+  DBI::dbExecute(
+    con,
+    dplyr::sql(
+      glue::glue("SET threads='{duck_max_threads}';")
+    )
+  )
+
+  return(con)
+}
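Review note: a quick illustration of the clause `spod_sql_where_dates()` produces. A sketch; the order of the OR conditions depends on how `stats::aggregate()` groups the year/month pairs:

```r
dates <- seq.Date(as.Date("2020-02-28"), as.Date("2020-03-02"), by = "day")
spod_sql_where_dates(dates)
#> "WHERE (year = 2020 AND month = 02 AND day IN (28, 29)) OR (year = 2020 AND month = 03 AND day IN (01, 02))"
```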
diff --git a/R/folders.R b/R/folders.R
index 795a68e..468c236 100644
--- a/R/folders.R
+++ b/R/folders.R
@@ -1,9 +1,29 @@
-# change subfolder name for raw data cache here to apply globally
+#' Get raw data cache subfolder name
+#'
+#' Change the raw data cache subfolder name in the code of this function to apply it globally, as all functions in the package should use this function to get the raw data cache path.
+#' @param ver Integer. The version of the data. Must be 1 or 2.
+#' @return Character string with the subfolder name for the raw data cache.
+#' @keywords internal
 spod_subfolder_raw_data_cache <- function(ver = 1) {
-  rlang:::check_number_whole(ver)
+  ver <- as.integer(ver)
   if (!ver %in% c(1, 2)) {
     stop("Invalid version number. Must be 1 or 2.")
   }
   base_subdir_name <- "raw_data_cache"
   return(paste0(base_subdir_name, "/v", ver, "/"))
 }
+
+#' Get clean data subfolder name
+#'
+#' Change the clean data cache subfolder name in the code of this function to apply it globally, as all functions in the package should use this function to get the clean data cache path.
+#' @param ver Integer. The version of the data. Must be 1 or 2.
+#' @return Character string with the subfolder name for the clean data cache.
+#' @keywords internal
+spod_subfolder_clean_data_cache <- function(ver = 1) {
+  ver <- as.integer(ver)
+  if (!ver %in% c(1, 2)) {
+    stop("Invalid version number. Must be 1 or 2.")
+  }
+  base_subdir_name <- "clean_data"
+  return(paste0(base_subdir_name, "/v", ver, "/"))
+}
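Review note: for reference, the cache layout implied by these two helpers:

```r
spod_subfolder_raw_data_cache(ver = 1)   # "raw_data_cache/v1/"
spod_subfolder_clean_data_cache(ver = 2) # "clean_data/v2/"
```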
diff --git a/R/get.R b/R/get.R
index 2bfe88f..f162ddf 100644
--- a/R/get.R
+++ b/R/get.R
@@ -7,21 +7,20 @@
 #' @export
 #' @examples
 #' if (FALSE) {
-#'   spod_get_latest_v2_xml()
+#'   spod_get_latest_v2_xml()
 #' }
-spod_get_latest_v2_xml = function(
+spod_get_latest_v2_xml <- function(
     data_dir = spod_get_data_dir(),
-    xml_url = "https://movilidad-opendata.mitma.es/RSS.xml"
-) {
+    xml_url = "https://movilidad-opendata.mitma.es/RSS.xml") {
   if (!fs::dir_exists(data_dir)) {
     fs::dir_create(data_dir)
   }
 
-  current_timestamp = format(Sys.time(), format = "%Y-%m-%d", usetz = FALSE, tz = "UTC")
-  current_filename = glue::glue("{data_dir}/data_links_v2_{current_timestamp}.xml")
+  current_timestamp <- format(Sys.time(), format = "%Y-%m-%d", usetz = FALSE, tz = "UTC")
+  current_filename <- glue::glue("{data_dir}/data_links_v2_{current_timestamp}.xml")
 
   message("Saving the file to: ", current_filename)
-  xml_requested = curl::curl_download(url = xml_url, destfile = current_filename, quiet = FALSE)
+  xml_requested <- curl::curl_download(url = xml_url, destfile = current_filename, quiet = FALSE)
   return(current_filename)
 }
 
@@ -36,57 +35,58 @@ spod_get_latest_v2_xml = function(
 #' @examples
 #' # Get the data dictionary for the default data directory
 #' if (FALSE) {
-#'   metadata = spod_get_metadata()
-#'   names(metadata)
-#'   head(metadata)
+#'   metadata <- spod_get_metadata()
+#'   names(metadata)
+#'   head(metadata)
 #' }
-spod_get_metadata = function(data_dir = spod_get_data_dir(), quiet = FALSE) {
-  xml_files_list = fs::dir_ls(data_dir, type = "file", regexp = "data_links_v2") |> sort()
-  latest_data_links_xml_path = utils::tail(xml_files_list, 1)
+spod_get_metadata <- function(data_dir = spod_get_data_dir(), quiet = FALSE) {
+  xml_files_list <- fs::dir_ls(data_dir, type = "file", regexp = "data_links_v2") |> sort()
+  latest_data_links_xml_path <- utils::tail(xml_files_list, 1)
   if (length(latest_data_links_xml_path) == 0) {
-    if(isFALSE(quiet)) message("Getting latest data links xml")
-    latest_data_links_xml_path = spod_get_latest_v2_xml(data_dir = data_dir)
+    if (isFALSE(quiet)) message("Getting latest data links xml")
+    latest_data_links_xml_path <- spod_get_latest_v2_xml(data_dir = data_dir)
   } else {
-    if(isFALSE(quiet)) message("Using existing data links xml: ", latest_data_links_xml_path)
+    if (isFALSE(quiet)) message("Using existing data links xml: ", latest_data_links_xml_path)
   }
 
-  x_xml = xml2::read_xml(latest_data_links_xml_path)
+  x_xml <- xml2::read_xml(latest_data_links_xml_path)
 
-  download_dt = tibble::tibble(
+  download_dt <- tibble::tibble(
     target_url = xml2::xml_find_all(x = x_xml, xpath = "//link") |> xml2::xml_text(),
     pub_date = xml2::xml_find_all(x = x_xml, xpath = "//pubDate") |> xml2::xml_text()
   )
-  download_dt$pub_ts = lubridate::dmy_hms(download_dt$pub_date)
-  download_dt$file_extension = tools::file_ext(download_dt$target_url)
-  download_dt = download_dt[download_dt$file_extension != "", ]
-  download_dt$pub_date = NULL
+  download_dt$pub_ts <- lubridate::dmy_hms(download_dt$pub_date)
+  download_dt$file_extension <- tools::file_ext(download_dt$target_url)
+  download_dt <- download_dt[download_dt$file_extension != "", ]
+  download_dt$pub_date <- NULL
 
-  download_dt$data_ym = lubridate::ym(stringr::str_extract(download_dt$target_url, "[0-9]{4}-[0-9]{2}"))
-  download_dt$data_ymd = lubridate::ymd(stringr::str_extract(download_dt$target_url, "[0-9]{8}"))
+  download_dt$data_ym <-
lubridate::ym(stringr::str_extract(download_dt$target_url, "[0-9]{4}-[0-9]{2}"))
+  download_dt$data_ymd <- lubridate::ymd(stringr::str_extract(download_dt$target_url, "[0-9]{8}"))
   # order by pub_ts
-  download_dt = download_dt[order(download_dt$pub_ts, decreasing = TRUE), ]
-  download_dt$local_path = file.path(
+  download_dt <- download_dt[order(download_dt$pub_ts, decreasing = TRUE), ]
+  download_dt$local_path <- file.path(
     data_dir,
     stringr::str_replace(download_dt$target_url, "https://movilidad-opendata.mitma.es/", "")
   )
-  download_dt$local_path = stringr::str_replace_all(download_dt$local_path, "\\/\\/\\/|\\/\\/", "/")
+  download_dt$local_path <- stringr::str_replace_all(download_dt$local_path, "\\/\\/\\/|\\/\\/", "/")
 
   return(download_dt)
 }
 
 #' Get the data directory
-#' 
+#'
 #' This function retrieves the data directory from the environment variable SPANISH_OD_DATA_DIR.
 #' If the environment variable is not set, it returns the temporary directory.
-#' 
+#' @param quiet Logical. If `TRUE`, the function does not print messages to the console. Defaults to `FALSE`.
 #' @return The data directory.
+#' @export
 #' @keywords internal
-spod_get_data_dir = function(quiet = FALSE) {
-  data_dir_env = Sys.getenv("SPANISH_OD_DATA_DIR")
-  if( data_dir_env == "" ) {
+spod_get_data_dir <- function(quiet = FALSE) {
+  data_dir_env <- Sys.getenv("SPANISH_OD_DATA_DIR")
+  if (data_dir_env == "") {
     if (isFALSE(quiet)) warning("Warning: SPANISH_OD_DATA_DIR is not set. Using the temporary directory, which is not recommended, as the data will be deleted when the session ends.\n\n To set the data directory, use `Sys.setenv(SPANISH_OD_DATA_DIR = '/path/to/data')` or set SPANISH_OD_DATA_DIR permanently in the environment by editing the `.Renviron` file locally for the current project with `usethis::edit_r_environ('project')` or `file.edit('.Renviron')` or globally for all projects with `usethis::edit_r_environ('user')` or `file.edit('~/.Renviron')`.")
-    data_dir_env = tempdir() # if not set, use the temp directory
+    data_dir_env <- tempdir() # if not set, use the temp directory
   }
   # check if dir exists and create it if it doesn't
   if (!fs::dir_exists(data_dir_env)) {
@@ -106,16 +106,16 @@ spod_get_data_dir = function(quiet = FALSE) {
 #' @export
 #' @examples
 #' if (FALSE) {
-#'   zones = spod_get_zones()
+#'   zones <- spod_get_zones()
 #' }
-spod_get_zones = function(
-    data_dir = spod_get_data_dir(),
-    type = "distritos") {
-  metadata = spod_get_metadata(data_dir)
-  regex = glue::glue("zonificacion_{type}\\.")
-  sel_distritos = stringr::str_detect(metadata$target_url, regex)
-  metadata_distritos = metadata[sel_distritos, ]
-  dir_name = dirname(metadata_distritos$local_path[1])
+spod_get_zones <- function(
+    data_dir = spod_get_data_dir(),
+    type = "distritos") {
+  metadata <- spod_get_metadata(data_dir)
+  regex <- glue::glue("zonificacion_{type}\\.")
+  sel_distritos <- stringr::str_detect(metadata$target_url, regex)
+  metadata_distritos <- metadata[sel_distritos, ]
+  dir_name <- dirname(metadata_distritos$local_path[1])
   if (!fs::dir_exists(dir_name)) {
     fs::dir_create(dir_name, recurse = TRUE)
   }
@@ -125,8 +125,8 @@ spod_get_zones = function(
       curl::curl_download(url = metadata_distritos$target_url[i], destfile = metadata_distritos$local_path[i], quiet = FALSE)
     }
   }
-  sel_shp = stringr::str_detect(metadata_distritos$local_path, "\\.shp$")
-  shp_file = metadata_distritos$local_path[sel_shp]
+  sel_shp <- stringr::str_detect(metadata_distritos$local_path, "\\.shp$")
+  shp_file <- metadata_distritos$local_path[sel_shp]
   suppressWarnings({
     return(sf::read_sf(shp_file))
   })
@@ -147,35 +147,33 @@ spod_get_zones = function(
 #' @examples
 #' # Download the origin-destination data for the first two days of March 2024
 #' if (FALSE) {
-#'   od_20240301_20240302 = spod_get(date_regex = "2024-03-0[1-2]")
+#'   od_20240301_20240302 <- spod_get(date_regex = "2024-03-0[1-2]")
 #' }
-spod_get = function(
-    data_dir = spod_get_data_dir(),
-    subdir = "estudios_basicos/por-distritos/viajes/ficheros-diarios",
-    date_regex = "2024030[1-2]",
-    read_fun = duckdb::tbl_file
-) {
-  file_paths = download_od(data_dir = data_dir, subdir = subdir, date_regex = date_regex)
+spod_get <- function(
+    data_dir = spod_get_data_dir(),
+    subdir = "estudios_basicos/por-distritos/viajes/ficheros-diarios",
+    date_regex = "2024030[1-2]",
+    read_fun = duckdb::tbl_file) {
+  file_paths <- download_od(data_dir = data_dir, subdir = subdir, date_regex = date_regex)
   if (identical(read_fun, readr::read_csv)) {
     return(purrr::map_dfr(file_paths, read_fun))
   }
-  drv = duckdb::duckdb()
-  con = DBI::dbConnect(drv)
+  drv <- duckdb::duckdb()
+  con <- DBI::dbConnect(drv)
   # file.exists(file_paths[1])
   # od1 = duckdb::tbl_file(con, file_paths[2])
-  od_list = purrr::map(file_paths, ~duckdb::tbl_file(con, .))
+  od_list <- purrr::map(file_paths, ~ duckdb::tbl_file(con, .))
 }
 
-download_od = function(
-    data_dir = spod_get_data_dir(),
-    subdir = "estudios_basicos/por-distritos/viajes/ficheros-diarios",
-    date_regex = "2024030[1-2]"
-) {
-  regex = glue::glue("{subdir}*.+{date_regex}_Viajes_distritos.csv.gz")
-  metadata = spod_get_metadata(data_dir)
-  sel_od = stringr::str_detect(metadata$target_url, regex)
-  metadata_od = metadata[sel_od, ]
+download_od <- function(
+    data_dir = spod_get_data_dir(),
+    subdir = "estudios_basicos/por-distritos/viajes/ficheros-diarios",
+    date_regex = "2024030[1-2]") {
+  regex <- glue::glue("{subdir}*.+{date_regex}_Viajes_distritos.csv.gz")
+  metadata <- spod_get_metadata(data_dir)
+  sel_od <- stringr::str_detect(metadata$target_url, regex)
+  metadata_od <- metadata[sel_od, ]
   metadata_od[[1]]
-  dir_name = dirname(metadata_od$local_path[1])
+  dir_name <- dirname(metadata_od$local_path[1])
   if (!fs::dir_exists(dir_name)) {
     fs::dir_create(dir_name)
   }
diff --git a/R/get_v1_data.R b/R/get_v1_data.R
index 1da532c..8be5a48 100644
--- a/R/get_v1_data.R
+++ b/R/get_v1_data.R
@@ -53,14 +53,14 @@ spod_get_latest_v1_file_list <- function(
 #'   names(metadata)
 #'   head(metadata)
 #' }
-spod_available_data_v1 <- function(data_dir = spod_get_data_dir(),
-  # check_local_files (below) is FALSE by default to avoid excessive filesystem access, perhaps should be TRUE. Download functions use it to load the xml file, but we probably do not want the script to check all local cache directories every time we run a get data function. Perhaps it is better to offload this check to a separate function and have a csv file or some other way to keep track of the files that were downloaded and cached. An output of curl::multi_download() could be used for this purpose.
-  check_local_files = FALSE,
-  quiet = FALSE
-) {
+spod_available_data_v1 <- function(
+    data_dir = spod_get_data_dir(),
+    # check_local_files (below) is FALSE by default to avoid excessive filesystem access, perhaps it should be TRUE. Download functions use it to load the xml file, but we probably do not want the script to check all local cache directories every time we run a get data function. Perhaps it is better to offload this check to a separate function and have a csv file or some other way to keep track of the files that were downloaded and cached. An output of curl::multi_download() could be used for this purpose.
+    check_local_files = FALSE,
+    quiet = FALSE) {
   xml_files_list <- fs::dir_ls(data_dir, type = "file", regexp = "data_links_v1") |> sort()
-  if(length(xml_files_list) == 0) {
-    if(isFALSE(quiet)) message("No data links xml files found, getting latest data links xml")
+  if (length(xml_files_list) == 0) {
+    if (isFALSE(quiet)) message("No data links xml files found, getting latest data links xml")
     latest_data_links_xml_path <- spod_get_latest_v1_file_list(data_dir = data_dir)
   } else {
     latest_data_links_xml_path <- utils::tail(xml_files_list, 1)
@@ -70,14 +70,14 @@ spod_available_data_v1 <- function(
   file_date <- stringr::str_extract(latest_data_links_xml_path, "[0-9]{4}-[0-9]{2}-[0-9]{2}")
 
   if (file_date < format(Sys.Date(), format = "%Y-%m-%d")) {
-    if(isFALSE(quiet)) message("File list xml is 1 day old or older, getting latest data links xml")
+    if (isFALSE(quiet)) message("File list xml is 1 day old or older, getting latest data links xml")
     latest_data_links_xml_path <- spod_get_latest_v1_file_list(data_dir = data_dir)
   } else {
-    if(isFALSE(quiet)) message("Using existing data links xml: ", latest_data_links_xml_path)
+    if (isFALSE(quiet)) message("Using existing data links xml: ", latest_data_links_xml_path)
   }
 
   if (length(latest_data_links_xml_path) == 0) {
-    if(isFALSE(quiet)) message("Getting latest data links xml")
+    if (isFALSE(quiet)) message("Getting latest data links xml")
     latest_data_links_xml_path <- spod_get_latest_v1_file_list(data_dir = data_dir)
   }
 
@@ -109,10 +109,10 @@ spod_available_data_v1 <- function(
   # fix paths for files that are in the '0000-referencia' folder
   files_table$local_path <- gsub("0000-referencia\\/([0-9]{4})([0-9]{2})([0-9]{2})_", "year=\\1\\/month=\\2\\/day=\\3\\/", files_table$local_path)
-  
+
   # replace 2 digit month with 1 digit month
   files_table$local_path <- gsub("month=0([1-9])", "month=\\1", files_table$local_path)
-  
+
   # replace 2 digit day with 1 digit day
   files_table$local_path <- gsub("day=0([1-9])", "day=\\1", files_table$local_path)
 
@@ -128,7 +128,8 @@ spod_available_data_v1 <- function(
 #' It can retrieve either "distritos" or "municipios" zones data.
 #'
 #' @param data_dir The directory where the data is stored.
-#' @param zones The zones for which to download the data. Can be `"districts"` (or `"dist"`, `"distr"`) or `"municipalities"` (or `"muni"`, `"municip"`).
+#' @param zones The zones for which to download the data. Can be `"districts"` (or `"dist"`, `"distr"`, or the original Spanish `"distritos"`) or `"municipalities"` (or `"muni"`, `"municip"`, or the original Spanish `"municipios"`).
+#' @param quiet Whether to suppress messages. Defaults to `FALSE`.
 #' @return A spatial object containing the zones data.
 #' @export
 #' @examples
@@ -136,17 +137,25 @@ spod_available_data_v1 <- function(
 #' if (FALSE) {
-#'   zones <- spod_get_zones()
+#'   zones <- spod_get_zones_v1()
 #' }
 spod_get_zones_v1 <- function(
-    zones = c("districts", "dist", "distr",
-      "municipalities", "muni", "municip"),
-    data_dir = spod_get_data_dir()
-) {
+    zones = c(
+      "districts", "dist", "distr", "distritos",
+      "municipalities", "muni", "municip", "municipios"
+    ),
+    data_dir = spod_get_data_dir(),
+    quiet = FALSE) {
   zones <- match.arg(zones)
   zones <- spod_zone_names_en2es(zones)
 
   # check if shp files are already extracted
-  expected_gpkg_path <- fs::path(data_dir, glue::glue("clean_data/v1//zones/{zones}_mitma.gpkg"))
+  expected_gpkg_path <- fs::path(
+    data_dir,
+    glue::glue(spod_subfolder_clean_data_cache(), "/zones/{zones}_mitma.gpkg")
+  )
   if (fs::file_exists(expected_gpkg_path)) {
-    message("Loading .gpkg file that already exists in data dir: ", expected_gpkg_path)
+    if (isFALSE(quiet)) {
+      message("Loading .gpkg file that already exists in data dir: ", expected_gpkg_path)
+    }
     return(sf::read_sf(expected_gpkg_path))
   }
 
@@ -162,14 +171,14 @@ spod_get_zones_v1 <- function(
   }
 
   if (!fs::file_exists(metadata_zones$local_path)) {
-    message("Downloading the file to: ", metadata_zones$local_path)
+    if (isFALSE(quiet)) message("Downloading the file to: ", metadata_zones$local_path)
     downloaded_file <- curl::curl_download(metadata_zones$target_url, destfile = metadata_zones$local_path, mode = "wb", quiet = FALSE)
   } else {
-    message("File already exists: ", metadata_zones$local_path)
+    if (isFALSE(quiet)) message("File already exists: ", metadata_zones$local_path)
     downloaded_file <- metadata_zones$local_path
   }
 
-  message("Unzipping the file: ", downloaded_file)
+  if (isFALSE(quiet)) message("Unzipping the file: ", downloaded_file)
   utils::unzip(downloaded_file,
     exdir = fs::path_dir(downloaded_file)
   )
@@ -178,8 +187,7 @@ spod_get_zones_v1 <- function(
   junk_path <- paste0(fs::path_dir(downloaded_file), "/__MACOSX")
   if (fs::dir_exists(junk_path)) fs::dir_delete(junk_path)
 
-  zones_path <- fs::dir_ls(data_dir, glob = glue::glue("**{zones}/*.shp"), recurse = TRUE)
-
+  zones_path <- fs::dir_ls(data_dir, glob = glue::glue("*v1**{zones}/*.shp"), recurse = TRUE)
   zones <- spod_clean_zones_v1(zones_path)
   fs::dir_create(fs::path_dir(expected_gpkg_path), recurse = TRUE)
   sf::st_write(zones, expected_gpkg_path, delete_dsn = TRUE, delete_layer = TRUE)
@@ -188,13 +196,13 @@ spod_get_zones_v1 <- function(
 }
 
 #' Fixes common issues in the zones data and cleans up variable names
-#' 
+#'
 #' This function fixes any invalid geometries in the zones data and renames the "ID" column to "id".
-#' 
+#'
 #' @param zones_path The path to the zones spatial data file.
 #' @return A spatial object of class `sf`.
 #' @keywords internal
-#' 
+#'
 spod_clean_zones_v1 <- function(zones_path) {
   suppressWarnings({
     zones <- sf::read_sf(zones_path)
@@ -209,35 +217,159 @@ spod_clean_zones_v1 <- function(zones_path) {
 }
 
-#' Retrieve the origin-destination v1 data (2020-2021)
-#' 
-#' This function retrieves the v1 (2020-2021) origin-destination data from the specified data directory.
-#' @param read_fun The function to read the data. Defaults to `duckdb::tbl_file`.
+#' Load the origin-destination v1 data (2020-2021) for specified dates
+#'
+#' This function retrieves the v1 (2020-2021) origin-destination data for the specified dates. It checks if the requested data is already cached locally and downloads it if it is not. When all the requested data is cached, it creates a `DuckDB` connection to the cached data folder and provides a table view of it.
+#'
 #' @inheritParams spod_download_data
-#' @return A tibble with the origin-destination data.
-spod_get_od <- function(
-    zones = c("districts", "dist", "distr",
-      "municipalities", "muni", "municip"), # add "urban_areas" for v2 data
-    dates = NULL,
-    data_dir = spod_get_data_dir(),
-    quiet = FALSE,
-    read_fun = duckdb::tbl_file
-) {
-  # Processing of the date arguments is performed in `spod_download_data()`
-
+#' @inheritParams spod_duckdb_limit_resources
+#' @return A DuckDB table connection object. It can be manipulated using `dplyr` verbs, or can be loaded into memory using `dplyr::collect()`. The structure of the object is as follows:
+#'
+#' \describe{
+#'   \item{full_date}{\code{Date}. The full date of the trip, including year, month, and day.}
+#'   \item{id_origin}{\code{factor}. The identifier for the origin location of the trip, formatted as a code (e.g., '01001_AM').}
+#'   \item{id_destination}{\code{factor}. The identifier for the destination location of the trip, formatted as a code (e.g., '01001_AM').}
+#'   \item{activity_origin}{\code{factor}. The type of activity at the origin location (e.g., 'home', 'work').}
+#'   \item{activity_destination}{\code{factor}. The type of activity at the destination location (e.g., 'home', 'other').}
+#'   \item{residence_province}{\code{factor}. The province of residence for the individual making the trip (e.g., 'Cuenca', 'Girona').}
+#'   \item{time_slot}{\code{integer}. The time slot during which the trip started, represented as an integer (e.g., 0, 1, 2).}
+#'   \item{distance}{\code{factor}. The distance category of the trip, represented as a code (e.g., '002-005' for 2-5 km).}
+#'   \item{n_trips}{\code{double}. The number of trips taken within the specified time slot and distance.}
+#'   \item{trips_total_length_km}{\code{double}. The total length of all trips in kilometers for the specified time slot and distance.}
+#'   \item{year}{\code{double}. The year of the trip.}
+#'   \item{month}{\code{double}. The month of the trip.}
+#'   \item{day}{\code{double}. The day of the trip.}
+#' }
+#'
+#' This object also contains a reference to the source DuckDB connection with the full view of the cached data. It can be accessed using `od_table$src$con`. See the examples below. The connection includes two views:
+#'
+#' * `od_csv_raw` - a raw table view of all cached CSV files with the origin-destination data that has been previously cached in $SPANISH_OD_DATA_DIR
+#'
+#' * `od_csv_clean` - a cleaned-up table view of `od_csv_raw` with column names and values translated and mapped to English. This still includes all cached data.
+#'
+#' The view `od_csv_clean` has the same structure as the filtered view `od_csv_clean_filtered`, which is returned by `spod_get_od_v1()` as a DuckDB table connection object. The view `od_csv_raw` has the original Spanish column names and values and has the following structure:
+#' \describe{
+#'   \item{fecha}{\code{Date}. The date of the trip, including year, month, and day.}
+#'   \item{origen}{\code{character}. The identifier for the origin location of the trip, formatted as a character string (e.g., '01001_AM').}
+#'   \item{destino}{\code{character}. The identifier for the destination location of the trip, formatted as a character string (e.g., '01001_AM').}
+#'   \item{actividad_origen}{\code{character}. The type of activity at the origin location (e.g., 'casa', 'trabajo').}
+#'   \item{actividad_destino}{\code{character}. The type of activity at the destination location (e.g., 'otros', 'trabajo').}
+#'   \item{residencia}{\code{character}. The code representing the residence of the individual making the trip (e.g., '01') according to the official INE classification.}
+#'   \item{edad}{\code{character}. The age of the individual making the trip. This data is actually filled with 'NA' values, which is why this column is removed in the cleaned-up and translated view described above.}
+#'   \item{periodo}{\code{integer}. The time period during which the trip started, represented as an integer (e.g., 0, 1, 2).}
+#'   \item{distancia}{\code{character}. The distance category of the trip, represented as a character string (e.g., '002-005' for 2-5 km).}
+#'   \item{viajes}{\code{double}. The number of trips taken within the specified time period and distance.}
+#'   \item{viajes_km}{\code{double}. The total length of all trips in kilometers for the specified time period and distance.}
+#'   \item{day}{\code{double}. The day of the trip.}
+#'   \item{month}{\code{double}. The month of the trip.}
+#'   \item{year}{\code{double}. The year of the trip.}
+#' }
+#'
+#' @export
+#' @examples
+#' \dontrun{
+#'
+#' # create a connection to the v1 data
+#' Sys.setenv(SPANISH_OD_DATA_DIR = "~/home/nosync/cache/mitma")
+#' dates <- c("2020-02-14", "2020-03-14", "2021-02-14", "2021-02-14", "2021-02-15")
+#' od_dist <- spod_get_od_v1(zones = "distr", dates = dates)
+#'
+#' # od_dist is a table view filtered to the specified dates
+#'
+#' # access the source connection with all dates
+#' # list tables
+#' DBI::dbListTables(od_dist$src$con)
+#' }
+spod_get_od_v1 <- function(
+    zones = c(
+      "districts", "dist", "distr", "distritos",
+      "municipalities", "muni", "municip", "municipios"
+    ),
+    dates = NULL,
+    data_dir = spod_get_data_dir(),
+    quiet = FALSE,
+    duck_max_mem = 2,
+    duck_max_threads = parallelly::availableCores()) {
+  # hardcode od as this is a wrapper to get origin-destination data
+  type <- "od"
+
+  zones <- match.arg(zones)
+  zones <- spod_zone_names_en2es(zones)
+
+  # "cached" is a special value: skip downloading and expose all cached data
+  cached_only <- is.character(dates) && all(dates == "cached")
+
+  if (!cached_only) {
+    dates <- spod_dates_argument_to_dates_seq(dates = dates)
+    # use the spod_download_data() function to download any missing data
+    spod_download_data(
+      type = type,
+      zones = zones,
+      dates = dates,
+      data_dir = data_dir,
+      return_output = FALSE
+    )
+  }
+
+  # create an in-memory duckdb connection
+  drv <- duckdb::duckdb()
+  con <- DBI::dbConnect(drv, dbdir = ":memory:", read_only = FALSE)
+
+  # define memory and threads limits
+  con <- spod_duckdb_limit_resources(
+    con = con,
+    duck_max_mem = duck_max_mem,
+    duck_max_threads = duck_max_threads
+  )
+
+  # attach the od folder of csv.gz files with predefined and cleaned up data types
+  con <- spod_duckdb_od_v1(
+    con = con,
+    zones = zones,
+    data_dir = data_dir
+  )
+
+  # filter by date
+  # actually, it seems like this works even if we do not return the 'con' from the function below, but it is safer to return the 'con' and resave it into the 'con' of the environment/scope of this function
+  if (!cached_only) {
+    con <- spod_duckdb_filter_by_dates(con, "od_csv_clean", "od_csv_clean_filtered", dates)
+  }
+
+  # DBI::dbListTables(con) # for debugging only
+  # dplyr::tbl(con, "od_csv_clean") |> dplyr::glimpse() # for debugging only
+  # DBI::dbDisconnect(con) # for debugging only
+
+  # speed comparison, REMOVE a bit later AFTER TESTING
+  # b1 <- bench::mark(iterations = 5, check = FALSE,
+  #   hive_date = {dplyr::tbl(con, "od_csv_clean") |>
+  #     dplyr::distinct(full_date) |>
+  #     dplyr::collect()}, # this is prefiltered using a custom SQL query using only the columns (year, month, day) that we know are constructed from the hive-style partitioning
+  #   full_date = {dplyr::tbl(con, "od_csv_clean") |>
+  #     dplyr::filter(full_date %in% dates) |>
+  #     dplyr::distinct(full_date) |>
+  #     dplyr::collect()} # this is causing DuckDB to scan ALL csv.gz files in the folder because it has to match the desired dates with the full_date column
+  # )
+  # bench:::plot.bench_mark(b1, type = "violin") + ggpubr::theme_pubclean(base_size = 24)
+
+  # perhaps let's not confuse the user with the duckdb connection, see help for the @return of the spod_duckdb_od_v1() function
+  # return(con)
+
+  # return the tbl connection for user-friendly data manipulation
+  # this may have an implication that there is no way for the user to properly disconnect the db connection, should think how this can be addressed
+  # not a problem! can be done with:
+  # DBI::dbDisconnect(od$src$con)
+
+  if (!cached_only) {
+    return(dplyr::tbl(con, "od_csv_clean_filtered"))
+  } else {
+    return(dplyr::tbl(con, "od_csv_clean"))
+  }
+}
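Review note: a short sketch of how the returned table can be used with `dplyr` verbs, assuming the dates below are cached or downloadable; dbplyr translates the pipeline to SQL that DuckDB evaluates lazily until `collect()`:

```r
library(dplyr)

od <- spod_get_od_v1(zones = "districts", dates = "2020-03-01_2020-03-07")

# aggregate total daily trips inside DuckDB, then pull the small result into R
daily_trips <- od |>
  group_by(full_date) |>
  summarise(total_trips = sum(n_trips, na.rm = TRUE)) |>
  collect()

# close the underlying DuckDB connection when done
DBI::dbDisconnect(od$src$con, shutdown = TRUE)
```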
diff --git a/R/internal_utils.R b/R/internal_utils.R
index 7eca22d..043a02c 100644
--- a/R/internal_utils.R
+++ b/R/internal_utils.R
@@ -1,65 +1,63 @@
-#' Convert multiple formats of date arguments to a sequence of dates
-#' 
+#' Convert multiple formats of date arguments to a sequence of dates
+#'
 #' This function processes the date arguments provided to various functions in the package. It can handle single dates and arbitrary sequences (vectors) of dates in ISO (YYYY-MM-DD) and YYYYMMDD format. It can also handle date ranges in the format 'YYYY-MM-DD_YYYY-MM-DD' (or 'YYYYMMDD_YYYYMMDD'), date ranges in a named vector, and regular expressions to match dates in the format `YYYYMMDD`.
-#' 
+#'
 #' @param dates A `character` or `Date` vector of dates to process. Kindly keep in mind that v1 and v2 data follow different data collection methodologies and may not be directly comparable. Therefore, do not try to request data from both versions for the same date range. If you need to compare data from both versions, please refer to the respective codebooks and methodology documents. The v1 data covers the period from 2020-02-14 to 2021-05-09, and the v2 data covers the period from 2022-01-01 to the present until further notice. The true dates range is checked against the available data for each version on every function run.
-#' 
+#'
 #' The possible values can be any of the following:
-#' 
+#'
 #' * A single date in ISO (YYYY-MM-DD) or YYYYMMDD format. `character` or `Date` object.
-#' 
+#'
 #' * A vector of dates in ISO (YYYY-MM-DD) or YYYYMMDD format. `character` or `Date` object. Can be any non-consecutive sequence of dates.
-#' 
+#'
 #' * A date range
-#'
+#'
+#'   * either a `character` or `Date` object of length 2 with clearly named elements `start` and `end` in ISO (YYYY-MM-DD) or YYYYMMDD format. E.g. `c(start = "2020-02-15", end = "2020-02-17")`;
+#'
+#'   * or a `character` object of the form `YYYY-MM-DD_YYYY-MM-DD` or `YYYYMMDD_YYYYMMDD`. For example, `2020-02-15_2020-02-17` or `20200215_20200217`.
+#'
 #' * A regular expression to match dates in the format `YYYYMMDD`. `character` object. For example, `^202002` will match all dates in February 2020.
-#'
-#'
+#'
+#'
 #' @return A `Date` vector of dates in ISO format (YYYY-MM-DD).
 #' @keywords internal
 spod_dates_argument_to_dates_seq <- function(dates) {
   if (is.null(dates) || (!is.character(dates) && !inherits(dates, "Date"))) {
     stop("Invalid date input format. Please provide a character vector or Date object.")
   }
-  
+
   range_regex <- "^\\d{4}(-\\d{2}){2}_\\d{4}(-\\d{2}){2}$|^\\d{8}_\\d{8}$"
   single_date_regex <- "^(\\d{4}-\\d{2}-\\d{2}|\\d{8})$"
 
   # If dates is a vector of length one
   # Check if it is a single date, a date range, or a regex pattern
-  if (length(dates) == 1){
-
+  if (length(dates) == 1) {
     # Check if date range
     # match both YYYY-MM-DD_YYYY-MM-DD and YYYYMMDD_YYYYMMDD
-    if (grepl(range_regex, dates)){
+    if (grepl(range_regex, dates)) {
       date_parts <- strsplit(dates, "_")[[1]]
       date_parts <- lubridate::ymd(date_parts)
       dates <- seq.Date(date_parts[1], date_parts[2], by = "day")
       # if dates does not match the date range pattern
       # check if it is just a single day in YYYY-MM-DD or YYYYMMDD format
-    } else if(grepl(single_date_regex, dates)) {
+    } else if (grepl(single_date_regex, dates)) {
       dates <- lubridate::ymd(dates)
-      
+
       # assume it is a regex pattern
     } else {
       dates <- spod_expand_dates_from_regex(dates)
       # since spod_expand_dates_from_regex already uses the metadata to generate valid dates, we can skip any checks that are required for other date formats and only check for date overlap
-      if( isFALSE(spod_is_data_version_overlaps(dates)) ){
+      if (isFALSE(spod_is_data_version_overlaps(dates))) {
         return(dates)
       }
     }
 
     # If dates is a vector of multiple values
-  } else if (length(dates) > 1){
-
+  } else if (length(dates) > 1) {
     # Check if it is of length 2, then it may be a date range
     if (length(dates) == 2 & !is.null(names(dates))) {
       # if the vector is named with 'start' and 'end', we can assume it is a date range
-      if(all(names(dates) %in% c("start", "end"))){
+      if (all(names(dates) %in% c("start", "end"))) {
         date_parts <- lubridate::ymd(dates)
         dates <- seq.Date(date_parts[1], date_parts[2], by = "day")
       }
@@ -71,7 +69,7 @@ spod_dates_argument_to_dates_seq <- function(dates) {
 
   # now that we have a clean sequence of dates, we can check for overlaps between data versions
   if (isFALSE(spod_is_data_version_overlaps(dates)) &
-    spod_infer_data_v_from_dates(dates) %in% c(1, 2)
+      spod_infer_data_v_from_dates(dates) %in% c(1, 2)
   ) {
     return(dates)
   }
 }
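Review note: the same range expressed three ways; each call below should expand to the dates 2020-02-15 through 2020-02-17 (a sketch; the regex form additionally validates the result against the list of available dates):

```r
spod_dates_argument_to_dates_seq("2020-02-15_2020-02-17")
spod_dates_argument_to_dates_seq(c(start = "2020-02-15", end = "2020-02-17"))
spod_dates_argument_to_dates_seq("2020021[5-7]")
```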
 #' @keywords internal
-spod_is_data_version_overlaps <- function(dates){
-  
+spod_is_data_version_overlaps <- function(dates) {
   all_dates_v1 <- spod_get_valid_dates(ver = 1)
   all_dates_v2 <- spod_get_valid_dates(ver = 2)
@@ -103,7 +100,7 @@ spod_infer_data_v_from_dates <- function(dates) {
   if (spod_is_data_version_overlaps(dates)) {
     invisible(return(NULL))
   }
-  
+
   # if no overlap, compare with date ranges
   v1_dates <- spod_get_valid_dates(ver = 1)
   v2_dates <- spod_get_valid_dates(ver = 2)
@@ -120,15 +117,14 @@ }
 
 #' Function to expand dates from a regex
-#' 
+#'
 #' This function generates a sequence of dates matching the provided regular expression.
-#' 
+#'
 #' @param date_regex A regular expression to match dates in the format YYYYMMDD.
 #' @return A character vector of dates matching the regex.
 #' @keywords internal
 spod_expand_dates_from_regex <- function(date_regex) {
-
   all_dates_v1 <- spod_get_valid_dates(ver = 1)
   all_dates_v2 <- spod_get_valid_dates(ver = 2)
@@ -138,8 +134,10 @@
 
   # if both vectors are empty, throw an error
   if (length(matching_dates_v1) == 0 && length(matching_dates_v2) == 0) {
-    stop(paste0("No matching dates found in the available data.",
-      "The valid dates range for v1 is: ", paste0(min(all_dates_v1), " to ", max(all_dates_v1)), " and for v2 is: ", paste0(min(all_dates_v2), " to ", max(all_dates_v2))))
+    stop(paste0(
+      "No matching dates found in the available data. ",
+      "The valid date range for v1 is: ", paste0(min(all_dates_v1), " to ", max(all_dates_v1)), " and for v2 is: ", paste0(min(all_dates_v2), " to ", max(all_dates_v2))
+    ))
   }
   # If the checks above have passed, we can combine the matching dates, as only one vector contains dates and the other is empty
   matching_dates <- sort(c(matching_dates_v1, matching_dates_v2))
@@ -147,22 +145,25 @@
   return(matching_dates)
 }
 
-
+#' Get valid dates for the specified data version
+#' @param ver The version of the data to use. Defaults to 1. Can be 1 or 2.
+#' @return A `Date` vector of valid dates for the specified data version.
+#' @keywords internal
 spod_get_valid_dates <- function(ver = 1) {
-  rlang:::check_number_whole(ver)
+  ver <- as.integer(ver)
   if (!ver %in% c(1, 2)) {
     stop("Invalid version number. Must be 1 or 2.")
-  }  
+  }
 
-  if(ver == 1) {
+  if (ver == 1) {
     # available_data <- spod_available_data_v1(check_local_files = FALSE, quiet = TRUE)
     # all_dates <- unique(available_data[grepl("maestra1.*diarios", available_data$target_url),]$data_ymd, na.rm = TRUE)
     # perhaps it is worth hardcoding at least the v1 data range, as it is unlikely to change at this point
     all_dates <- seq.Date(from = as.Date("2020-02-14"), to = as.Date("2021-05-09"), by = "day")
   } else if (ver == 2) {
     available_data <- spod_get_metadata(quiet = TRUE) # replace with spod_available_data_v2() when available
-    all_dates <- unique(available_data[grepl("viajes.*diarios", available_data$target_url),]$data_ymd, na.rm = TRUE)
+    all_dates <- unique(available_data[grepl("viajes.*diarios", available_data$target_url), ]$data_ymd, na.rm = TRUE)
   }
 
   return(all_dates)
@@ -174,15 +175,19 @@
 
 spod_zone_names_en2es <- function(
-  zones = c("districts", "dist", "distr",
-            "municipalities", "muni", "municip")
-) {
+    zones = c(
+      "districts", "dist", "distr", "distritos",
+      "municipalities", "muni", "municip", "municipios",
+      "lau", "large_urban_areas", "gau", "grandes_areas_urbanas"
+    )) {
   zones <- tolower(zones)
   zones <- match.arg(zones)
-  if(zones %in% c("districts", "dist", "distr")) {
+  if (zones %in% c("districts", "dist", "distr", "distritos")) {
     return("distritos")
-  } else if(zones %in% c("municipalities", "muni", "municip")) {
+  } else if (zones %in% c("municipalities", "muni", "municip", "municipios")) {
     return("municipios")
+  } else if (zones %in% c("lau", "large_urban_areas", "gau", "grandes_areas_urbanas")) {
+    return("GAU")
   }
 }
@@ -191,13 +196,12 @@
 #' @param ver The version of the data to use. Defaults to 1. Can be 1 or 2.
 #' @keywords internal
 spod_match_data_type <- function(
-  type = c(
-    "od", "origin-destination",
-    "os", "overnight_stays",
-    "tpp", "trips_per_person"),
-  ver = c(1, 2)
-){
-  rlang:::check_number_whole(ver)
+    type = c(
+      "od", "origin-destination",
+      "os", "overnight_stays",
+      "tpp", "trips_per_person"
+    ),
+    ver = c(1, 2)) {
   if (!ver %in% c(1, 2)) {
     stop("Invalid version number. Must be 1 or 2.")
   }
@@ -205,20 +209,20 @@
   type <- tolower(type)
   type <- match.arg(type)
 
-  if(ver == 1) {
+  if (ver == 1) {
     if (type %in% c("od", "origin-destination")) {
       return("maestra1")
-    } else if(type %in% c("tpp", "trips_per_person")) {
+    } else if (type %in% c("tpp", "trips_per_person")) {
       return("maestra2")
     }
   }
 
-  if(ver == 2) {
+  if (ver == 2) {
     if (type %in% c("od", "origin-destination")) {
       return("viajes")
-    } else if(type %in% c("os", "overnight_stays")) {
+    } else if (type %in% c("os", "overnight_stays")) {
       return("pernoctaciones")
-    } else if(type %in% c("tpp", "trips_per_person")) {
+    } else if (type %in% c("tpp", "trips_per_person")) {
       return("personas")
     }
   }
diff --git a/README.md b/README.md
index 4e98533..68d5025 100644
--- a/README.md
+++ b/README.md
@@ -45,6 +45,20 @@ Load it as follows:
 library(spanishoddata)
 ```
 
+Local development: to load the package locally, clone it and navigate to
+the root of the package in the terminal, e.g. with the following:
+
+``` bash
+gh repo clone Robinlovelace/spanishoddata
+code spanishoddata
+```
+
+then run the following command from the R console:
+
+``` r
+devtools::load_all()
+```
+
 # Setting the data directory
 
 You can specify the data directory globally by setting the
@@ -93,16 +107,16 @@ metadata
 
     # A tibble: 9,442 × 6
        target_url           pub_ts              file_extension data_ym data_ymd  
-     1 https://movilidad-o… 2024-07-16 11:22:03 gz             NA      2022-10-23
-     2 https://movilidad-o… 2024-07-16 11:18:41 gz             NA      2022-10-22
-     3 https://movilidad-o… 2024-07-16 11:15:06 gz             NA      2022-10-20
-     4 https://movilidad-o… 2024-07-16 11:11:35 gz             NA      2022-10-18
-     5 https://movilidad-o… 2024-07-16 11:07:58 gz             NA      2022-10-17
-     6 https://movilidad-o… 2024-07-16 11:04:18 gz             NA      2022-10-12
-     7 https://movilidad-o… 2024-07-16 11:00:20 gz             NA      2022-10-07
-     8 https://movilidad-o… 2024-07-16 10:56:03 gz             NA      2022-08-07
-     9 https://movilidad-o… 2024-07-16 10:51:05 gz             NA      2022-08-06
-    10 https://movilidad-o… 2024-07-16 10:46:24 gz             NA      2022-08-05
+     1 https://movilidad-o… 2024-07-30 10:54:08 gz             NA      2022-10-23
+     2 https://movilidad-o… 2024-07-30 10:51:07 gz             NA      2022-10-22
+     3 https://movilidad-o… 2024-07-30 10:47:52 gz             NA      2022-10-20
+     4 https://movilidad-o… 2024-07-30 10:14:55 gz             NA      2022-10-18
+     5 https://movilidad-o… 2024-07-30 10:11:58 gz             NA      2022-10-17
+     6 https://movilidad-o… 2024-07-30 10:09:03 gz             NA      2022-10-12
+     7 https://movilidad-o… 2024-07-30 10:05:57 gz             NA      2022-10-07
+     8 https://movilidad-o… 2024-07-30 10:02:12 gz             NA      2022-08-07
+     9 https://movilidad-o… 2024-07-30 09:58:34 gz             NA      2022-08-06
+    10 https://movilidad-o… 2024-07-30 09:54:30 gz             NA      2022-08-05
     # ℹ 9,432 more rows
     # ℹ 1 more variable: local_path
@@ -174,7 +188,7 @@ od_multi_list[[1]]
 ```
 
     # Source:   SQL [?? x 15]
-    # Database: DuckDB v0.10.2 [robin@Linux 6.5.0-45-generic:R 4.4.1/:memory:]
+    # Database: DuckDB v1.0.0 [robin@Linux 6.5.0-45-generic:R 4.4.1/:memory:]
       fecha periodo origen  destino distancia actividad_origen actividad_destino
     1 20240307 00      01009_… 01001   0.5-2     frecuente        casa
diff --git a/README.qmd b/README.qmd
index f3d7d0d..3f1effd 100644
--- a/README.qmd
+++ b/README.qmd
@@ -45,6 +45,20 @@ Load it as follows:
 library(spanishoddata)
 ```
 
+Local development: to load the package locally, clone it and navigate to the root of the package in the terminal, e.g. with the following:
+
+```bash
+gh repo clone Robinlovelace/spanishoddata
+code spanishoddata
+```
+
+then run the following command from the R console:
+
+```{r}
+#| eval: false
+devtools::load_all()
+```
+
 # Setting the data directory
 
 You can specify the data directory globally by setting the `SPANISH_OD_DATA_DIR` environment variable, e.g. with the following command:
@@ -335,6 +349,8 @@
 usethis::use_pkgdown()
 usethis::use_github_action("pkgdown")
 # Setup gh pages:
 usethis::use_github_pages()
+# Auto-style with styler
+styler::style_pkg()
 ```
 
 ```{r}
diff --git a/inst/extdata/sql-queries/provinces-enum.txt b/inst/extdata/sql-queries/provinces-enum.txt
new file mode 100644
index 0000000..3c2fd6f
--- /dev/null
+++ b/inst/extdata/sql-queries/provinces-enum.txt
@@ -0,0 +1,53 @@
+'UNDEFINED',
+'Araba/Álava',
+'Albacete',
+'Alicante/Alacant',
+'Almería',
+'Ávila',
+'Badajoz',
+'Balears, Illes',
+'Barcelona',
+'Burgos',
+'Cáceres',
+'Cádiz',
+'Castellón/Castelló',
+'Ciudad Real',
+'Córdoba',
+'Coruña, A',
+'Cuenca',
+'Girona',
+'Granada',
+'Guadalajara',
+'Gipuzkoa',
+'Huelva',
+'Huesca',
+'Jaén',
+'León',
+'Lleida',
+'Rioja, La',
+'Lugo',
+'Madrid',
+'Málaga',
+'Murcia',
+'Navarra',
+'Ourense',
+'Asturias',
+'Palencia',
+'Palmas, Las',
+'Pontevedra',
+'Salamanca',
+'Santa Cruz de Tenerife',
+'Cantabria',
+'Segovia',
+'Sevilla',
+'Soria',
+'Tarragona',
+'Teruel',
+'Toledo',
+'Valencia/València',
+'Valladolid',
+'Bizkaia',
+'Zamora',
+'Zaragoza',
+'Ceuta',
+'Melilla'
diff --git a/inst/extdata/sql-queries/when-recode-provinces.txt b/inst/extdata/sql-queries/when-recode-provinces.txt
new file mode 100644
index 0000000..05344cb
--- /dev/null
+++ b/inst/extdata/sql-queries/when-recode-provinces.txt
@@ -0,0 +1,52 @@
+WHEN '01' THEN 'Araba/Álava'
+WHEN '02' THEN 'Albacete'
+WHEN '03' THEN 'Alicante/Alacant'
+WHEN '04' THEN 'Almería'
+WHEN '05' THEN 'Ávila'
+WHEN '06' THEN 'Badajoz'
+WHEN '07' THEN 'Balears, Illes'
+WHEN '08' THEN 'Barcelona'
+WHEN '09' THEN 'Burgos'
+WHEN '10' THEN 'Cáceres'
+WHEN '11' THEN 'Cádiz'
+WHEN '12' THEN 'Castellón/Castelló'
+WHEN '13' THEN 'Ciudad Real'
+WHEN '14' THEN 'Córdoba'
+WHEN '15' THEN 'Coruña, A'
+WHEN '16' THEN 'Cuenca'
+WHEN '17' THEN 'Girona'
+WHEN '18' THEN 'Granada'
+WHEN '19' THEN 'Guadalajara'
+WHEN '20' THEN 'Gipuzkoa'
+WHEN '21' THEN 'Huelva'
+WHEN '22' THEN 'Huesca'
+WHEN '23' THEN 'Jaén'
+WHEN '24' THEN 'León'
+WHEN '25' THEN 'Lleida'
+WHEN '26' THEN 'Rioja, La'
+WHEN '27' THEN 'Lugo'
+WHEN '28' THEN 'Madrid'
+WHEN '29' THEN 'Málaga'
+WHEN '30' THEN 'Murcia'
+WHEN '31' THEN 'Navarra'
+WHEN '32' THEN 'Ourense'
+WHEN '33' THEN 'Asturias'
+WHEN '34' THEN 'Palencia'
+WHEN '35' THEN 'Palmas, Las'
+WHEN '36' THEN 'Pontevedra'
+WHEN '37' THEN 'Salamanca'
+WHEN '38' THEN 'Santa Cruz de Tenerife'
+WHEN '39' THEN 'Cantabria'
+WHEN '40' THEN 'Segovia'
+WHEN '41' THEN 'Sevilla'
+WHEN '42' THEN 'Soria'
+WHEN '43' THEN 'Tarragona'
+WHEN '44' THEN 'Teruel'
+WHEN '45' THEN 'Toledo'
+WHEN '46' THEN 'Valencia/València'
+WHEN '47' THEN 'Valladolid'
+WHEN '48' THEN 'Bizkaia'
+WHEN '49' THEN 'Zamora'
+WHEN '50' THEN 'Zaragoza'
+WHEN '51' THEN 'Ceuta'
+WHEN '52' THEN 'Melilla'
diff --git a/man/figures/README-disaggregated-1.png b/man/figures/README-disaggregated-1.png
index 8f2a045..7317026 100644
Binary files a/man/figures/README-disaggregated-1.png and b/man/figures/README-disaggregated-1.png differ
diff --git a/man/spod_convert_od_v1_to_duckdb.Rd b/man/spod_convert_od_v1_to_duckdb.Rd
new file mode 100644
index 0000000..25c52da
--- /dev/null
+++ b/man/spod_convert_od_v1_to_duckdb.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/convert_data.R
+\name{spod_convert_od_v1_to_duckdb}
+\alias{spod_convert_od_v1_to_duckdb}
+\title{Convert all downloaded v1 origin-destination data to duckdb}
+\usage{
+spod_convert_od_v1_to_duckdb(
+  zones = c("districts", "dist", "distr", "distritos", "municipalities", "muni",
+    "municip", "municipios"),
+  data_dir = spod_get_data_dir(),
+  save_dir = NULL,
+  quiet = FALSE,
+  duck_max_mem = 3,
+  duck_max_threads = parallelly::availableCores(),
+  overwrite = FALSE
+)
+}
+\arguments{
+\item{zones}{The zones for which to download the data. Can be \code{"districts"} (or \code{"dist"}, \code{"distr"}, or the original Spanish \code{"distritos"}) or \code{"municipalities"} (or \code{"muni"}, \code{"municip"}, or the original Spanish \code{"municipios"}).}
+
+\item{data_dir}{The directory where the data is stored.}
+
+\item{save_dir}{The path to the directory where the duckdb files will be saved. If \code{NULL}, uses the default location in \code{data_dir} (set by the \code{SPANISH_OD_DATA_DIR} environment variable). Therefore, the default relative path is \verb{/clean_data/v1/tabular/duckdb/od_.duckdb}.}
+
+\item{quiet}{Whether to suppress messages. Defaults to \code{FALSE}.}
+
+\item{duck_max_mem}{The maximum memory to use in GB. A conservative default is 3 GB, which should be enough for resaving the data to DuckDB from a folder of CSV.gz files while being small enough to fit in the memory of even older computers. For data analysis using the already converted data (in DuckDB or Parquet format) or with the raw CSV.gz data, it is recommended to increase it according to available resources.}
+
+\item{duck_max_threads}{The maximum number of threads to use. Defaults to the number of available cores.}
+
+\item{overwrite}{Logical. If \code{TRUE}, overwrites existing duckdb files. Defaults to \code{FALSE}.}
+}
+\value{
+Path to saved DuckDB file.
+}
+\description{
+Convert all downloaded v1 origin-destination data to duckdb
+}
diff --git a/man/spod_dates_argument_to_dates_seq.Rd b/man/spod_dates_argument_to_dates_seq.Rd
index d9f5b21..54ecc41 100644
--- a/man/spod_dates_argument_to_dates_seq.Rd
+++ b/man/spod_dates_argument_to_dates_seq.Rd
@@ -14,8 +14,10 @@ The possible values can be any of the following:
 \item A single date in ISO (YYYY-MM-DD) or YYYYMMDD format. \code{character} or \code{Date} object.
 \item A vector of dates in ISO (YYYY-MM-DD) or YYYYMMDD format. \code{character} or \code{Date} object. Can be any non-consecutive sequence of dates.
 \item A date range
+\itemize{
 \item either a \code{character} or \code{Date} object of length 2 with clearly named elements \code{start} and \code{end} in ISO (YYYY-MM-DD) or YYYYMMDD format. E.g. \code{c(start = "2020-02-15", end = "2020-02-17")};
 \item or a \code{character} object of the form \code{YYYY-MM-DD_YYYY-MM-DD} or \code{YYYYMMDD_YYYYMMDD}. For example, \verb{2020-02-15_2020-02-17} or \verb{20200215_20200217}.
+}
 \item A regular expression to match dates in the format \code{YYYYMMDD}. \code{character} object. For example, \verb{^202002} will match all dates in February 2020.
 }}
 }
diff --git a/man/spod_download_data.Rd b/man/spod_download_data.Rd
index 3639bc5..68ef4d4 100644
--- a/man/spod_download_data.Rd
+++ b/man/spod_download_data.Rd
@@ -7,8 +7,8 @@
 spod_download_data(
   type = c("od", "origin-destination", "os", "overnight_stays", "tpp",
     "trips_per_person"),
-  zones = c("districts", "dist", "distr", "municipalities", "muni", "municip", "lau",
-    "large_urban_areas"),
+  zones = c("districts", "dist", "distr", "distritos", "municipalities", "muni",
+    "municip", "municipios", "lau", "large_urban_areas", "gau", "grandes_areas_urbanas"),
   dates = NULL,
   data_dir = spod_get_data_dir(),
   quiet = FALSE,
@@ -18,7 +18,7 @@
 \arguments{
 \item{type}{The type of data to download. Can be \code{"origin-destination"} (or just \code{"od"}), or \code{"trips_per_person"} (or just \code{"tpp"}) for v1 data. For v2 data \code{"overnight_stays"} (or just \code{"os"}) is also available. More data types to be supported in the future. See respective codebooks for more information. \strong{ADD CODEBOOKS! to the package}}
 
-\item{zones}{The zones for which to download the data. Can be \code{"districts"} (or \code{"dist"}, \code{"distr"}) or \code{"municipalities"} (or \code{"muni"}, \code{"municip"}) for v1 data. Additionaly, these can be \code{"large_urban_areas"} (or \code{"lau"}) for v2 data.}
+\item{zones}{The zones for which to download the data. Can be \code{"districts"} (or \code{"dist"}, \code{"distr"}, or the original Spanish \code{"distritos"}) or \code{"municipalities"} (or \code{"muni"}, \code{"municip"}, or the original Spanish \code{"municipios"}). Additionally, these can be \code{"large_urban_areas"} (or \code{"lau"}, or the original Spanish \code{"grandes_areas_urbanas"}, or \code{"gau"}) for v2 data.}
 
 \item{dates}{A \code{character} or \code{Date} vector of dates to process. Kindly keep in mind that v1 and v2 data follow different data collection methodologies and may not be directly comparable. Therefore, do not try to request data from both versions for the same date range. If you need to compare data from both versions, please refer to the respective codebooks and methodology documents. The v1 data covers the period from 2020-02-14 to 2021-05-09, and the v2 data covers the period from 2022-01-01 to the present until further notice. The true date range is checked against the available data for each version on every function run.
@@ -27,8 +27,10 @@
 The possible values can be any of the following:
 \itemize{
 \item A single date in ISO (YYYY-MM-DD) or YYYYMMDD format. \code{character} or \code{Date} object.
 \item A vector of dates in ISO (YYYY-MM-DD) or YYYYMMDD format. \code{character} or \code{Date} object. Can be any non-consecutive sequence of dates.
 \item A date range
+\itemize{
 \item either a \code{character} or \code{Date} object of length 2 with clearly named elements \code{start} and \code{end} in ISO (YYYY-MM-DD) or YYYYMMDD format. E.g. \code{c(start = "2020-02-15", end = "2020-02-17")};
 \item or a \code{character} object of the form \code{YYYY-MM-DD_YYYY-MM-DD} or \code{YYYYMMDD_YYYYMMDD}. For example, \verb{2020-02-15_2020-02-17} or \verb{20200215_20200217}.
+}
 \item A regular expression to match dates in the format \code{YYYYMMDD}. \code{character} object. For example, \verb{^202002} will match all dates in February 2020.
 }}
diff --git a/man/spod_get_od.Rd b/man/spod_duckdb_filter_by_dates.Rd
similarity index 57%
rename from man/spod_get_od.Rd
rename to man/spod_duckdb_filter_by_dates.Rd
index 939a4d7..0d924f5 100644
--- a/man/spod_get_od.Rd
+++ b/man/spod_duckdb_filter_by_dates.Rd
@@ -1,19 +1,17 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/get_v1_data.R
-\name{spod_get_od}
-\alias{spod_get_od}
-\title{Retrieve the origin-destination v1 data (2020-2021)}
+% Please edit documentation in R/duckdb_helpers.R
+\name{spod_duckdb_filter_by_dates}
+\alias{spod_duckdb_filter_by_dates}
+\title{Filter a duckdb connection by dates}
 \usage{
-spod_get_od(
-  zones = c("districts", "dist", "distr", "municipalities", "muni", "municip"),
-  dates = NULL,
-  data_dir = spod_get_data_dir(),
-  quiet = FALSE,
-  read_fun = duckdb::tbl_file
-)
+spod_duckdb_filter_by_dates(con, source_view_name, new_view_name, dates)
 }
 \arguments{
-\item{zones}{The zones for which to download the data. Can be \code{"districts"} (or \code{"dist"}, \code{"distr"}) or \code{"municipalities"} (or \code{"muni"}, \code{"municip"}) for v1 data. Additionaly, these can be \code{"large_urban_areas"} (or \code{"lau"}) for v2 data.}
+\item{con}{A duckdb connection.}
+
+\item{source_view_name}{The name of the source duckdb "view" (the virtual table, in the context of the current package likely connected to a folder of CSV files).}
+
+\item{new_view_name}{The name of the new duckdb "view" (the virtual table, in the context of the current package likely connected to a folder of CSV files).}
 
 \item{dates}{A \code{character} or \code{Date} vector of dates to process. Kindly keep in mind that v1 and v2 data follow different data collection methodologies and may not be directly comparable. Therefore, do not try to request data from both versions for the same date range. If you need to compare data from both versions, please refer to the respective codebooks and methodology documents. The v1 data covers the period from 2020-02-14 to 2021-05-09, and the v2 data covers the period from 2022-01-01 to the present until further notice. The true date range is checked against the available data for each version on every function run.
@@ -22,20 +20,13 @@
 The possible values can be any of the following:
 \itemize{
 \item A single date in ISO (YYYY-MM-DD) or YYYYMMDD format. \code{character} or \code{Date} object.
 \item A vector of dates in ISO (YYYY-MM-DD) or YYYYMMDD format. \code{character} or \code{Date} object. Can be any non-consecutive sequence of dates.
 \item A date range
+\itemize{
 \item either a \code{character} or \code{Date} object of length 2 with clearly named elements \code{start} and \code{end} in ISO (YYYY-MM-DD) or YYYYMMDD format. E.g. \code{c(start = "2020-02-15", end = "2020-02-17")};
 \item or a \code{character} object of the form \code{YYYY-MM-DD_YYYY-MM-DD} or \code{YYYYMMDD_YYYYMMDD}. For example, \verb{2020-02-15_2020-02-17} or \verb{20200215_20200217}.
+}
 \item A regular expression to match dates in the format \code{YYYYMMDD}. \code{character} object. For example, \verb{^202002} will match all dates in February 2020.
 }}
-
-\item{data_dir}{The directory where the data is stored. Defaults to the value returned by \code{spod_get_data_dir()} which returns the value of the environment variable \code{SPANISH_OD_DATA_DIR} or a temporary directory if the variable is not set.}
-
-\item{quiet}{Logical. If \code{TRUE}, the function does not print messages to the console. Defaults to \code{FALSE}.}
-
-\item{read_fun}{The function to read the data. Defaults to \code{duckdb::tbl_file}.}
-}
-\value{
-A tibble with the origin-destination data.
 }
 \description{
-This function retrieves the v1 (2020-2021) origin-destination data from the specified data directory.
+Filter a duckdb connection by dates
 }
diff --git a/man/spod_duckdb_limit_resources.Rd b/man/spod_duckdb_limit_resources.Rd
new file mode 100644
index 0000000..d0122b9
--- /dev/null
+++ b/man/spod_duckdb_limit_resources.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/duckdb_helpers.R
+\name{spod_duckdb_limit_resources}
+\alias{spod_duckdb_limit_resources}
+\title{Set maximum memory and number of threads for a DuckDB connection}
+\usage{
+spod_duckdb_limit_resources(
+  con,
+  duck_max_mem = 3,
+  duck_max_threads = parallelly::availableCores() - 1
+)
+}
+\arguments{
+\item{con}{A duckdb connection.}
+
+\item{duck_max_mem}{The maximum memory to use in GB. A conservative default is 3 GB, which should be enough for resaving the data to DuckDB from a folder of CSV.gz files while being small enough to fit in the memory of even older computers. For data analysis using the already converted data (in DuckDB or Parquet format) or with the raw CSV.gz data, it is recommended to increase it according to available resources.}
+
+\item{duck_max_threads}{The maximum number of threads to use. Defaults to the number of available cores minus 1.}
+}
+\description{
+Set maximum memory and number of threads for a DuckDB connection
+}
diff --git a/man/spod_duckdb_od_v1.Rd b/man/spod_duckdb_od_v1.Rd
new file mode 100644
index 0000000..f2e59bb
--- /dev/null
+++ b/man/spod_duckdb_od_v1.Rd
@@ -0,0 +1,68 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/duckdb_helpers.R
+\name{spod_duckdb_od_v1}
+\alias{spod_duckdb_od_v1}
+\title{Creates a duckdb connection to v1 OD data}
+\usage{
+spod_duckdb_od_v1(
+  con = DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:", read_only = FALSE),
+  zones = c("districts", "dist", "distr", "distritos", "municipalities", "muni",
+    "municip", "municipios", "lau", "large_urban_areas", "gau", "grandes_areas_urbanas"),
+  data_dir = spod_get_data_dir()
+)
+}
+\arguments{
+\item{con}{A duckdb connection object. If not specified, a new in-memory connection will be created.}
+
+\item{zones}{The zones for which to download the data. Can be \code{"districts"} (or \code{"dist"}, \code{"distr"}, or the original Spanish \code{"distritos"}) or \code{"municipalities"} (or \code{"muni"}, \code{"municip"}, or the original Spanish \code{"municipios"}). Additionally, these can be \code{"large_urban_areas"} (or \code{"lau"}, or the original Spanish \code{"grandes_areas_urbanas"}, or \code{"gau"}) for v2 data.}
+
+\item{data_dir}{The directory where the data is stored. Defaults to the value returned by \code{spod_get_data_dir()} which returns the value of the environment variable \code{SPANISH_OD_DATA_DIR} or a temporary directory if the variable is not set.}
+}
+\value{
+A duckdb connection object with 2 views:
+\itemize{
+\item \code{od_csv_raw} - a raw table view of all CSV files with the origin-destination data that have been previously cached in $SPANISH_OD_DATA_DIR
+\item \code{od_csv_clean} - a cleaned-up table view of \code{od_csv_raw} with column names and values translated and mapped to English. This still includes all cached data.
+}
+
+The structure of the cleaned-up view \code{od_csv_clean} is as follows:
+
+\describe{
+\item{full_date}{\code{Date}. The full date of the trip, including year, month, and day.}
+\item{id_origin}{\code{factor}. The identifier for the origin location of the trip, formatted as a code (e.g., '01001_AM').}
+\item{id_destination}{\code{factor}. The identifier for the destination location of the trip, formatted as a code (e.g., '01001_AM').}
+\item{activity_origin}{\code{factor}. The type of activity at the origin location (e.g., 'home', 'work'). \strong{Note:} Only available for district level data.}
+\item{activity_destination}{\code{factor}. The type of activity at the destination location (e.g., 'home', 'other'). \strong{Note:} Only available for district level data.}
+\item{residence_province}{\code{factor}. The province of residence for the individual making the trip (e.g., 'Cuenca', 'Girona'). Provinces are stored as factors, and are encoded in a way that the province code can be used for queries. \strong{Note:} Only available for district level data.}
+\item{time_slot}{\code{integer}. The time slot during which the trip started, represented as an integer (e.g., 0, 1, 2).}
+\item{distance}{\code{factor}. The distance category of the trip, represented as a code (e.g., '002-005' for 2-5 km).}
+\item{n_trips}{\code{double}. The number of trips taken within the specified time slot and distance.}
+\item{trips_total_length_km}{\code{double}. The total length of all trips in kilometers for the specified time slot and distance.}
+\item{year}{\code{double}. The year of the trip.}
+\item{month}{\code{double}. The month of the trip.}
+\item{day}{\code{double}. The day of the trip.}
+}
+
+The structure of the original data in \code{od_csv_raw} is as follows:
+
+\describe{
+\item{fecha}{\code{Date}. The date of the trip, including year, month, and day.}
+\item{origen}{\code{character}. The identifier for the origin location of the trip, formatted as a character string (e.g., '01001_AM').}
+\item{destino}{\code{character}. The identifier for the destination location of the trip, formatted as a character string (e.g., '01001_AM').}
+\item{actividad_origen}{\code{character}. The type of activity at the origin location (e.g., 'casa', 'trabajo').}
+\item{actividad_destino}{\code{character}. The type of activity at the destination location (e.g., 'otros', 'trabajo').}
+\item{residencia}{\code{character}. The code representing the residence of the individual making the trip (e.g., '01') according to the official INE classification.}
+\item{edad}{\code{character}. The age of the individual making the trip. This column is actually filled with 'NA' values, which is why it is removed in the cleaned-up and translated view described above.}
+\item{periodo}{\code{integer}. The time period during which the trip started, represented as an integer (e.g., 0, 1, 2).}
+\item{distancia}{\code{character}. The distance category of the trip, represented as a character string (e.g., '002-005' for 2-5 km).}
+\item{viajes}{\code{double}. The number of trips taken within the specified time period and distance.}
+\item{viajes_km}{\code{double}. The total length of all trips in kilometers for the specified time period and distance.}
+\item{day}{\code{double}. The day of the trip.}
+\item{month}{\code{double}. The month of the trip.}
+\item{year}{\code{double}. The year of the trip.}
+}
+}
+\description{
+This function creates a duckdb connection to the v1 OD data.
+}
+\keyword{internal}
diff --git a/man/spod_get_data_dir.Rd b/man/spod_get_data_dir.Rd
index f291402..52e0b8b 100644
--- a/man/spod_get_data_dir.Rd
+++ b/man/spod_get_data_dir.Rd
@@ -6,6 +6,9 @@
 \usage{
 spod_get_data_dir(quiet = FALSE)
 }
+\arguments{
+\item{quiet}{Logical. If \code{TRUE}, the function does not print messages to the console. Defaults to \code{FALSE}.}
+}
 \value{
 The data directory.
 }
diff --git a/man/spod_get_od_v1.Rd b/man/spod_get_od_v1.Rd
new file mode 100644
index 0000000..0c2d3e4
--- /dev/null
+++ b/man/spod_get_od_v1.Rd
@@ -0,0 +1,102 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_v1_data.R
+\name{spod_get_od_v1}
+\alias{spod_get_od_v1}
+\title{Load the origin-destination v1 data (2020-2021) for specified dates}
+\usage{
+spod_get_od_v1(
+  zones = c("districts", "dist", "distr", "distritos", "municipalities", "muni",
+    "municip", "municipios"),
+  dates = NULL,
+  data_dir = spod_get_data_dir(),
+  quiet = FALSE,
+  duck_max_mem = 2,
+  duck_max_threads = parallelly::availableCores()
+)
+}
+\arguments{
+\item{zones}{The zones for which to download the data. Can be \code{"districts"} (or \code{"dist"}, \code{"distr"}, or the original Spanish \code{"distritos"}) or \code{"municipalities"} (or \code{"muni"}, \code{"municip"}, or the original Spanish \code{"municipios"}). Additionally, these can be \code{"large_urban_areas"} (or \code{"lau"}, or the original Spanish \code{"grandes_areas_urbanas"}, or \code{"gau"}) for v2 data.}
+
+\item{dates}{A \code{character} or \code{Date} vector of dates to process. Kindly keep in mind that v1 and v2 data follow different data collection methodologies and may not be directly comparable. Therefore, do not try to request data from both versions for the same date range. If you need to compare data from both versions, please refer to the respective codebooks and methodology documents. The v1 data covers the period from 2020-02-14 to 2021-05-09, and the v2 data covers the period from 2022-01-01 to the present until further notice. The true date range is checked against the available data for each version on every function run.
+
+The possible values can be any of the following:
+\itemize{
+\item A single date in ISO (YYYY-MM-DD) or YYYYMMDD format. \code{character} or \code{Date} object.
+\item A vector of dates in ISO (YYYY-MM-DD) or YYYYMMDD format. \code{character} or \code{Date} object. Can be any non-consecutive sequence of dates.
+\item A date range
+\itemize{
+\item either a \code{character} or \code{Date} object of length 2 with clearly named elements \code{start} and \code{end} in ISO (YYYY-MM-DD) or YYYYMMDD format. E.g. \code{c(start = "2020-02-15", end = "2020-02-17")};
+\item or a \code{character} object of the form \code{YYYY-MM-DD_YYYY-MM-DD} or \code{YYYYMMDD_YYYYMMDD}. For example, \verb{2020-02-15_2020-02-17} or \verb{20200215_20200217}.
+}
+\item A regular expression to match dates in the format \code{YYYYMMDD}. \code{character} object. For example, \verb{^202002} will match all dates in February 2020.
+}}
+
+\item{data_dir}{The directory where the data is stored. Defaults to the value returned by \code{spod_get_data_dir()} which returns the value of the environment variable \code{SPANISH_OD_DATA_DIR} or a temporary directory if the variable is not set.}
+
+\item{quiet}{Logical. If \code{TRUE}, the function does not print messages to the console. Defaults to \code{FALSE}.}
+
+\item{duck_max_mem}{The maximum memory to use in GB. A conservative default of 2 GB should be enough for resaving the data to DuckDB from a folder of CSV.gz files while being small enough to fit in the memory of even older computers. For data analysis using the already converted data (in DuckDB or Parquet format) or with the raw CSV.gz data, it is recommended to increase it according to available resources.}
+
+\item{duck_max_threads}{The maximum number of threads to use. Defaults to the number of available cores.}
+}
+\value{
+A DuckDB table connection object. It can be manipulated using \code{dplyr} verbs, or can be loaded into memory using \code{dplyr::collect()}. The structure of the object is as follows:
+
+\describe{
+\item{full_date}{\code{Date}. The full date of the trip, including year, month, and day.}
+\item{id_origin}{\code{factor}. The identifier for the origin location of the trip, formatted as a code (e.g., '01001_AM').}
+\item{id_destination}{\code{factor}. The identifier for the destination location of the trip, formatted as a code (e.g., '01001_AM').}
+\item{activity_origin}{\code{factor}. The type of activity at the origin location (e.g., 'home', 'work').}
+\item{activity_destination}{\code{factor}. The type of activity at the destination location (e.g., 'home', 'other').}
+\item{residence_province}{\code{factor}. The province of residence for the individual making the trip (e.g., 'Cuenca', 'Girona').}
+\item{time_slot}{\code{integer}. The time slot during which the trip started, represented as an integer (e.g., 0, 1, 2).}
+\item{distance}{\code{factor}. The distance category of the trip, represented as a code (e.g., '002-005' for 2-5 km).}
+\item{n_trips}{\code{double}. The number of trips taken within the specified time slot and distance.}
+\item{trips_total_length_km}{\code{double}. The total length of all trips in kilometers for the specified time slot and distance.}
+\item{year}{\code{double}. The year of the trip.}
+\item{month}{\code{double}. The month of the trip.}
+\item{day}{\code{double}. The day of the trip.}
+}
+
+This object also contains the reference to the source DuckDB connection with the full view of the cached data. It can be accessed using \code{od_table$src$con}. See examples below. The connection includes two views:
+\itemize{
+\item \code{od_csv_raw} - a raw table view of all CSV files with the origin-destination data that have been previously cached in $SPANISH_OD_DATA_DIR
+\item \code{od_csv_clean} - a cleaned-up table view of \code{od_csv_raw} with column names and values translated and mapped to English. This still includes all cached data.
+}
+
+The view \code{od_csv_clean} has the same structure as the filtered view 'od_filtered', which is returned by \code{spod_get_od_v1()} as a DuckDB table connection object. The view \code{od_csv_raw} has the original Spanish column names and values and has the following structure:
+\describe{
+\item{fecha}{\code{Date}. The date of the trip, including year, month, and day.}
+\item{origen}{\code{character}. The identifier for the origin location of the trip, formatted as a character string (e.g., '01001_AM').}
+\item{destino}{\code{character}. The identifier for the destination location of the trip, formatted as a character string (e.g., '01001_AM').}
+\item{actividad_origen}{\code{character}. The type of activity at the origin location (e.g., 'casa', 'trabajo').}
+\item{actividad_destino}{\code{character}. The type of activity at the destination location (e.g., 'otros', 'trabajo').}
+\item{residencia}{\code{character}. The code representing the residence of the individual making the trip (e.g., '01') according to the official INE classification.}
+\item{edad}{\code{character}. The age of the individual making the trip. This column is actually filled with 'NA' values, which is why it is removed in the cleaned-up and translated view described above.}
+\item{periodo}{\code{integer}. The time period during which the trip started, represented as an integer (e.g., 0, 1, 2).}
+\item{distancia}{\code{character}. The distance category of the trip, represented as a character string (e.g., '002-005' for 2-5 km).}
+\item{viajes}{\code{double}. The number of trips taken within the specified time period and distance.}
+\item{viajes_km}{\code{double}. The total length of all trips in kilometers for the specified time period and distance.}
+\item{day}{\code{double}. The day of the trip.}
+\item{month}{\code{double}. The month of the trip.}
+\item{year}{\code{double}. The year of the trip.}
+}
+}
+\description{
+This function retrieves the v1 (2020-2021) origin-destination data for the specified dates. It checks if the requested data is already cached locally and downloads it if it is not. When all the requested data is cached, it creates a \code{DuckDB} connection to the cached data folder and provides a table view filtered to the requested dates.
+}
+\examples{
+\dontrun{
+
+# create a connection to the v1 data
+Sys.setenv(SPANISH_OD_DATA_DIR = "~/home/nosync/cache/mitma")
+dates <- c("2020-02-14", "2020-03-14", "2021-02-14", "2021-02-14", "2021-02-15")
+od_dist <- spod_get_od_v1(zones = "distr", dates = dates)
+
+# od_dist is a table view filtered to the specified dates
+
+# access the source connection with all dates
+# list tables
+DBI::dbListTables(od_dist$src$con)
+}
+}
diff --git a/man/spod_get_valid_dates.Rd b/man/spod_get_valid_dates.Rd
new file mode 100644
index 0000000..d4c8367
--- /dev/null
+++ b/man/spod_get_valid_dates.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/internal_utils.R
+\name{spod_get_valid_dates}
+\alias{spod_get_valid_dates}
+\title{Get valid dates for the specified data version}
+\usage{
+spod_get_valid_dates(ver = 1)
+}
+\arguments{
+\item{ver}{The version of the data to use. Defaults to 1. Can be 1 or 2.}
+}
+\value{
+A \code{Date} vector of valid dates for the specified data version.
+}
+\description{
+Get valid dates for the specified data version
+}
+\keyword{internal}
diff --git a/man/spod_get_zones_v1.Rd b/man/spod_get_zones_v1.Rd
index 79ec9ff..b99d0ef 100644
--- a/man/spod_get_zones_v1.Rd
+++ b/man/spod_get_zones_v1.Rd
@@ -5,14 +5,18 @@
 \title{Retrieves the zones for v1 data}
 \usage{
 spod_get_zones_v1(
-  zones = c("districts", "dist", "distr", "municipalities", "muni", "municip"),
-  data_dir = spod_get_data_dir()
+  zones = c("districts", "dist", "distr", "distritos", "municipalities", "muni",
+    "municip", "municipios"),
+  data_dir = spod_get_data_dir(),
+  quiet = FALSE
 )
 }
 \arguments{
-\item{zones}{The zones for which to download the data. Can be \code{"districts"} (or \code{"dist"}, \code{"distr"}) or \code{"municipalities"} (or \code{"muni"}, \code{"municip"}).}
+\item{zones}{The zones for which to download the data. Can be \code{"districts"} (or \code{"dist"}, \code{"distr"}, or the original Spanish \code{"distritos"}) or \code{"municipalities"} (or \code{"muni"}, \code{"municip"}, or the original Spanish \code{"municipios"}).}
 
 \item{data_dir}{The directory where the data is stored.}
+
+\item{quiet}{Whether to suppress messages. Defaults to \code{FALSE}.}
 }
 \value{
 A spatial object containing the zones data.
diff --git a/man/spod_sql_where_dates.Rd b/man/spod_sql_where_dates.Rd
new file mode 100644
index 0000000..b0fc43e
--- /dev/null
+++ b/man/spod_sql_where_dates.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/duckdb_helpers.R
+\name{spod_sql_where_dates}
+\alias{spod_sql_where_dates}
+\title{Generate a WHERE part of an SQL query from a sequence of dates}
+\usage{
+spod_sql_where_dates(dates)
+}
+\arguments{
+\item{dates}{A \code{Date} vector of dates to process.}
+}
+\value{
+A character vector of the SQL query.
+}
+\description{
+Generate a WHERE part of an SQL query from a sequence of dates
+}
+\keyword{internal}
diff --git a/man/spod_subfolder_clean_data_cache.Rd b/man/spod_subfolder_clean_data_cache.Rd
new file mode 100644
index 0000000..1faecde
--- /dev/null
+++ b/man/spod_subfolder_clean_data_cache.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/folders.R
+\name{spod_subfolder_clean_data_cache}
+\alias{spod_subfolder_clean_data_cache}
+\title{Get clean data subfolder name}
+\usage{
+spod_subfolder_clean_data_cache(ver = 1)
+}
+\arguments{
+\item{ver}{Integer. The version of the data. Must be 1 or 2.}
+}
+\value{
+Character string with the subfolder name for the clean data cache.
+}
+\description{
+Change the subfolder name for the clean data cache in the code of this function to apply it globally, as all functions in the package should use this function to get the clean data cache path.
+}
+\keyword{internal}
diff --git a/man/spod_subfolder_raw_data_cache.Rd b/man/spod_subfolder_raw_data_cache.Rd
new file mode 100644
index 0000000..e962f59
--- /dev/null
+++ b/man/spod_subfolder_raw_data_cache.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/folders.R
+\name{spod_subfolder_raw_data_cache}
+\alias{spod_subfolder_raw_data_cache}
+\title{Get raw data cache subfolder name}
+\usage{
+spod_subfolder_raw_data_cache(ver = 1)
+}
+\arguments{
+\item{ver}{Integer. The version of the data. Must be 1 or 2.}
+}
+\value{
+Character string with the subfolder name for the raw data cache.
+}
+\description{
+Change the subfolder name for the raw data cache in the code of this function to apply it globally, as all functions in the package should use this function to get the raw data cache path.
+}
+\keyword{internal}
diff --git a/tests/testthat/test-internal_utils.R b/tests/testthat/test-internal_utils.R
index dd6ef6a..e453ea4 100644
--- a/tests/testthat/test-internal_utils.R
+++ b/tests/testthat/test-internal_utils.R
@@ -86,14 +86,18 @@ test_that("invalid input type", {
 
 test_that("dates span both v1 and v2 data", {
   dates <- c("2021-05-09", "2022-01-01")
-  expect_error(spod_dates_argument_to_dates_seq(dates),
-    "Dates found in both v1 and v2 data.")
+  expect_error(
+    spod_dates_argument_to_dates_seq(dates),
+    "Dates found in both v1 and v2 data."
+  )
 })
 
 test_that("dates that are out of available range of v1 data", {
   dates <- c("2020-01-01", "2021-01-01")
-  expect_error(spod_dates_argument_to_dates_seq(dates),
-    "Some dates do not match the available data.")
+  expect_error(
+    spod_dates_argument_to_dates_seq(dates),
+    "Some dates do not match the available data."
+  )
 })
 
 # clean up
diff --git a/vignettes/.gitignore b/vignettes/.gitignore
new file mode 100644
index 0000000..ba7abf3
--- /dev/null
+++ b/vignettes/.gitignore
@@ -0,0 +1,3 @@
+*.html
+*.R
+*_files
diff --git a/vignettes/work-with-v1-data.qmd b/vignettes/work-with-v1-data.qmd
new file mode 100644
index 0000000..50c1a23
--- /dev/null
+++ b/vignettes/work-with-v1-data.qmd
@@ -0,0 +1,196 @@
+---
+title: "Working with v1 MITMA data"
+vignette: >
+  %\VignetteIndexEntry{Working with v1 MITMA data}
+  %\VignetteEngine{quarto::html}
+  %\VignetteEncoding{UTF-8}
+execute:
+  eval: false
+---
+
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+```
+
+```{r}
+#| eval: false
+if (!requireNamespace("pak", quietly = TRUE)) {
+  install.packages("pak")
+}
+packages <- c("sf", "dplyr", "DBI")
+pak::pkg_install(packages, ask = FALSE, upgrade = FALSE)
+
+pak::pkg_install("Robinlovelace/spanishoddata@1a53ed9f834f720f47cb016769464acfdad599a8", ask = FALSE, upgrade = TRUE)
+devtools::load_all()
+```
+
+```{r setup}
+#| include: false
+library(sf)
+library(dplyr)
+library(DBI)
+library(spanishoddata)
+```
+
+# Introduction
+
+The v1 data, from the Ministerio de Transportes, Movilidad y Agenda Urbana ([MITMA](https://sede.mitma.gob.es/sede_electronica/lang_castellano/)), covers the period from 2020-02-14 to 2021-05-09.
+
+# Set a directory to store the data
+
+```{r}
+#| eval: false
+Sys.setenv(SPANISH_OD_DATA_DIR = "path/to/store/data")
+spod_get_data_dir() # you may want to check that it is set correctly
+```
+
+
+# Get the zones
+
+Load the spatial data for districts.
+
+```{r}
+districts <- spod_get_zones_v1(zones = "dist", quiet = TRUE)
+```
+
+```{r}
+glimpse(districts)
+plot(st_geometry(districts))
+```
+
+Load the spatial data for municipalities.
+
+```{r}
+municipalities <- spod_get_zones_v1(zones = "muni", quiet = TRUE)
+```
+
+```{r}
+glimpse(municipalities)
+plot(st_geometry(municipalities))
+```
+
+# Get the origin-destination data for an interval of dates
+
+Load the data for the period from 2020-02-14 to 2020-02-17.
+
+The data will be cached in the directory set in the SPANISH_OD_DATA_DIR environment variable. If it has already been downloaded, it will not be downloaded again.
+
+```{r}
+od_dist_1 <- spod_get_od_v1(
+  zones = "dist",
+  dates = c(start = "2020-02-14", end = "2020-02-17"),
+  quiet = TRUE
+)
+```
+
+
+Look at the data structure. This is a lazy table with a DuckDB backend. That is, the files on disk are still raw gzipped CSV files, but they are cleverly connected to a dynamic view in an in-memory DuckDB database.
+
+```{r}
+od_dist_1 |>
+  glimpse()
+```
+
+You can work with it using dplyr verbs as if it were a regular data frame, but if you want to load the results into memory, you can use the `collect()` function.
+
+For example, the code below will not execute the query; it only creates another "lazy" object, so it runs instantly.
+
+```{r}
+od_dist_1_lazy <- od_dist_1 |>
+  group_by(id_origin, id_destination, full_date) |>
+  summarise(
+    total_day_trips = sum(n_trips, na.rm = TRUE),
+    .groups = "drop"
+  ) |>
+  group_by(id_origin, id_destination) |>
+  summarise(
+    mean_daily_trips = mean(total_day_trips, na.rm = TRUE),
+    .groups = "drop"
+  )
+```
+
+In fact, this is a "lazy" object with an SQL query attached to it. You can see the query with the `show_query()` function.
+
+```{r}
+od_dist_1_lazy |>
+  show_query()
+```
+
+No data has been loaded into memory yet:
+
+```{r}
+format(object.size(od_dist_1_lazy), units = "Mb")
+class(od_dist_1_lazy)
+```
+
+Use the `collect()` function to import the object into memory (your global environment).
+It can be added either at the end of the original pipeline, like so:
+
+```{r}
+#| eval: false
+od_dist_1_data <- od_dist_1 |>
+  group_by(id_origin, id_destination, full_date) |>
+  summarise(
+    total_day_trips = sum(n_trips, na.rm = TRUE),
+    .groups = "drop"
+  ) |>
+  group_by(id_origin, id_destination) |>
+  summarise(
+    mean_daily_trips = mean(total_day_trips, na.rm = TRUE),
+    .groups = "drop"
+  ) |>
+  collect()
+```
+
+Or you can just add `collect()` to the "lazy" object that you created before, like so:
+
+```{r}
+od_dist_1_data <- od_dist_1_lazy |>
+  collect()
+```
+
+Now the data is in memory and is consuming computational resources (around 10 MB in this case, because we have aggregated the data, removing the hour-by-hour detail):
+
+```{r}
+format(object.size(od_dist_1_data), units = "Mb")
+class(od_dist_1_data)
+```
+
+To disconnect the in-memory database, you can use the `DBI::dbDisconnect()` function.
+
+```{r}
+DBI::dbDisconnect(od_dist_1$src$con)
+```
+
+# Get the origin-destination data for several non-consecutive dates
+
+This time, let's get the data for 2020-02-14, 2020-02-17, 2021-05-07, and 2021-05-09. We will use municipalities as zones.
+
+```{r}
+od_muni_1 <- spod_get_od_v1(
+  zones = "muni",
+  dates = c("2020-02-14", "2020-02-17", "2021-05-07", "2021-05-09"),
+  quiet = TRUE
+)
+```
+
+
+Look at the data structure.
+
+```{r}
+od_muni_1 |>
+  glimpse()
+```
+
+```{r}
+format(object.size(od_muni_1), units = "Mb")
+class(od_muni_1)
+```
+
+```{r}
+DBI::dbDisconnect(od_muni_1$src$con)
+```
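+
+# Reusing the cached data
+
+Because all the downloaded files stay cached in the directory set by `SPANISH_OD_DATA_DIR`, you can re-create the connection at any time without re-downloading anything. Below is a minimal sketch of this workflow (the `full_date` and `n_trips` columns come from the cleaned-up view documented in `?spod_get_od_v1`; the object names are just examples): it re-opens the municipality-level data for the same dates, computes the total number of trips per day, loads that small result into memory, and disconnects again.
+
+```{r}
+# re-create the lazy table from the cached CSV files (no new downloads)
+od_muni_2 <- spod_get_od_v1(
+  zones = "muni",
+  dates = c("2020-02-14", "2020-02-17", "2021-05-07", "2021-05-09"),
+  quiet = TRUE
+)
+
+# aggregate to one row per date; collect() pulls only the small result into memory
+daily_totals <- od_muni_2 |>
+  group_by(full_date) |>
+  summarise(total_trips = sum(n_trips, na.rm = TRUE), .groups = "drop") |>
+  arrange(full_date) |>
+  collect()
+
+daily_totals
+
+# disconnect from the source DuckDB connection, as before
+DBI::dbDisconnect(od_muni_2$src$con)
+```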