From e53a6164c52084c71179a5194c0390d2a7c4a8e1 Mon Sep 17 00:00:00 2001 From: Joan Maspons Date: Wed, 24 Jul 2024 07:59:38 +0200 Subject: [PATCH 1/4] Set encoding to UTF-8 for tags and user names FIX #346 --- DESCRIPTION | 2 +- NEWS.md | 1 + R/get-osmdata-df.R | 7 +++++-- R/get-osmdata-sc.R | 6 ++++++ R/get-osmdata-sf.R | 5 +++-- R/get-osmdata-sp.R | 6 +++++- R/get-osmdata.R | 17 +++++++++++++++++ codemeta.json | 4 ++-- 8 files changed, 40 insertions(+), 8 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3e9925371..6c5ec3ee 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: osmdata Title: Import 'OpenStreetMap' Data as Simple Features or Spatial Objects -Version: 0.2.5.018 +Version: 0.2.5.019 Authors@R: c( person("Mark", "Padgham", , "mark.padgham@email.com", role = c("aut", "cre")), person("Bob", "Rudis", role = "aut"), diff --git a/NEWS.md b/NEWS.md index 56ddfdaa..1192d3d4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,6 +11,7 @@ - Improved `get_bb(..., format_out = "sf_polygon")` to return full metadata along with geometries (#338 thanks to @RegularnaMatrica) - Mention key-only feature requests in README (#342 thanks to @joostschouppe) +- Set encoding to UTF-8 for tags and user names (#347) 0.2.5 diff --git a/R/get-osmdata-df.R b/R/get-osmdata-df.R index 12ed530b..6f0409a6 100644 --- a/R/get-osmdata-df.R +++ b/R/get-osmdata-df.R @@ -75,7 +75,8 @@ osmdata_data_frame <- function (q, colClasses = "character", # osm_id doesn't fit in integer check.names = FALSE, comment.char = "", - stringsAsFactors = stringsAsFactors + stringsAsFactors = stringsAsFactors, + encoding = "UTF-8" ) } else if (isTRUE (obj$meta$query_type == "adiff")) { datetime_from <- obj$meta$datetime_from @@ -162,7 +163,7 @@ xml_to_df <- function (doc, stringsAsFactors = FALSE) { osm_id = rownames (res [[i]]), center [[i]], meta [[i]], - tags [[i]], + setenc_utf8(tags [[i]]), stringsAsFactors = stringsAsFactors, check.names = FALSE ) @@ -225,6 +226,7 @@ xml_adiff_to_df <- function (doc, tagV <- vapply (tag, function (x) x, FUN.VALUE = character (2)) m [i, tagV [1, ]] <- tagV [2, ] } + Encoding(m) <- "UTF-8" osm_type <- xml2::xml_name (osm_obj) osm_id <- xml2::xml_attr (osm_obj, "id") @@ -325,6 +327,7 @@ get_meta_from_xml <- function (osm_obj) { osm_uid = xml2::xml_attr (osm_obj, attr = "uid"), osm_user = xml2::xml_attr (osm_obj, attr = "user") ) + Encoding(out$osm_user) <- "UTF-8" } else { out <- matrix (nrow = length (osm_obj), ncol = 0) diff --git a/R/get-osmdata-sc.R b/R/get-osmdata-sc.R index 935c4489..8aeb1a3a 100644 --- a/R/get-osmdata-sc.R +++ b/R/get-osmdata-sc.R @@ -75,6 +75,12 @@ osmdata_sc <- function (q, doc, quiet = TRUE) { overpass_version = temp$obj$meta$overpass_version ) + has_tags <- c ("nodes", "relation_properties", "object") + obj [has_tags] <- lapply(obj [has_tags], function (x) { + x [, c ("key", "value")] <- setenc_utf8 (x [, c ("key", "value")]) + x + }) + if (!missing (q)) { if (!is.character (q)) { obj$meta$bbox <- q$bbox diff --git a/R/get-osmdata-sf.R b/R/get-osmdata-sf.R index fed33c7b..a0d9612d 100644 --- a/R/get-osmdata-sf.R +++ b/R/get-osmdata-sf.R @@ -62,8 +62,9 @@ osmdata_sf <- function (q, doc, quiet = TRUE, stringsAsFactors = FALSE) { # noli if (!"osm_id" %in% names (res$polygons_kv)[1]) { res <- fill_kv (res, "polygons_kv", "polygons", stringsAsFactors) } - kv_df <- grep ("_kv$", names (res)) - res[kv_df] <- fix_columns_list (res[kv_df]) + kv_df <- grep ("_kv$", names (res)) # objects with tags + res [kv_df] <- fix_columns_list (res[kv_df]) + res [kv_df] <- lapply (res [kv_df], setenc_utf8) if (missing (q)) { obj$bbox <- paste (res$bbox, collapse = " ") diff --git a/R/get-osmdata-sp.R b/R/get-osmdata-sp.R index c14c1d66..3eec7e14 100644 --- a/R/get-osmdata-sp.R +++ b/R/get-osmdata-sp.R @@ -71,7 +71,11 @@ osmdata_sp <- function (q, doc, quiet = TRUE) { obj$osm_multipolygons <- res$multipolygons osm_items <- grep ("^osm_", names (obj)) - obj[osm_items] <- fix_columns_list (obj[osm_items]) + obj [osm_items] <- fix_columns_list (obj [osm_items]) + obj [osm_items] <- lapply (obj [osm_items], function (x) { + x@data <- setenc_utf8 (x@data) + x + }) class (obj) <- c (class (obj), "osmdata_sp") return (obj) diff --git a/R/get-osmdata.R b/R/get-osmdata.R index 32822165..0624576e 100644 --- a/R/get-osmdata.R +++ b/R/get-osmdata.R @@ -344,3 +344,20 @@ get_center_from_cpp_output <- function (res, what = "points") { return (as.data.frame (this)) } + + +#' Set encoding to UTF-8 +#' +#' @param x a data.frame or a list. +#' +#' @return `x` with all the columns or items of type character with UTF-8 encoding set. +#' @noRd +setenc_utf8 <- function(x) { + char_cols <- vapply (x, is.character, FUN.VALUE = logical (1)) + x [char_cols] <- lapply (x [char_cols], function (y) { + Encoding (y) <- "UTF-8" + y + }) + + return(x) +} diff --git a/codemeta.json b/codemeta.json index 24d33347..b9c22425 100644 --- a/codemeta.json +++ b/codemeta.json @@ -11,13 +11,13 @@ "codeRepository": "https://github.com/ropensci/osmdata/", "issueTracker": "https://github.com/ropensci/osmdata/issues", "license": "https://spdx.org/licenses/GPL-3.0", - "version": "0.2.5.018", + "version": "0.2.5.19", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R", "url": "https://r-project.org" }, - "runtimePlatform": "R version 4.3.1 (2023-06-16)", + "runtimePlatform": "R version 4.4.1 (2024-06-14)", "provider": { "@id": "https://cran.r-project.org", "@type": "Organization", From a81f652bdd44d422722ef04286f6278ac3767db9 Mon Sep 17 00:00:00 2001 From: Joan Maspons Date: Wed, 24 Jul 2024 09:44:48 +0000 Subject: [PATCH 2/4] More explicit and efficient Co-authored-by: mark padgham --- R/get-osmdata.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/get-osmdata.R b/R/get-osmdata.R index 0624576e..65d28721 100644 --- a/R/get-osmdata.R +++ b/R/get-osmdata.R @@ -353,7 +353,7 @@ get_center_from_cpp_output <- function (res, what = "points") { #' @return `x` with all the columns or items of type character with UTF-8 encoding set. #' @noRd setenc_utf8 <- function(x) { - char_cols <- vapply (x, is.character, FUN.VALUE = logical (1)) + char_cols <- which (vapply (x, is.character, FUN.VALUE = logical (1))) x [char_cols] <- lapply (x [char_cols], function (y) { Encoding (y) <- "UTF-8" y From 6e7336ee30914c38a1fdf1907d0c8e40e66ce9f5 Mon Sep 17 00:00:00 2001 From: Joan Maspons Date: Thu, 25 Jul 2024 09:24:22 +0200 Subject: [PATCH 3/4] Use enc2utf8() instead of Encoding() <- "UTF-8" Tests show better performance, see #346 enc2utf8() in matrices where possible to avoid calls for each df column --- R/get-osmdata-df.R | 21 ++++++++++++--------- R/get-osmdata.R | 7 +++---- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/R/get-osmdata-df.R b/R/get-osmdata-df.R index 6f0409a6..8d019796 100644 --- a/R/get-osmdata-df.R +++ b/R/get-osmdata-df.R @@ -123,13 +123,15 @@ xml_to_df <- function (doc, stringsAsFactors = FALSE) { tags <- mapply (function (i, k) { i <- i [, k, drop = FALSE] # remove osm_id column if exists + out <- matrix ( + NA_character_, + nrow = nrow (i), ncol = length (keys), + dimnames = list (NULL, keys) + ) + out <- enc2utf8 (out) out <- data.frame ( - matrix ( - nrow = nrow (i), ncol = length (keys), - dimnames = list (NULL, keys) - ), - stringsAsFactors = stringsAsFactors, - check.names = FALSE + out, + stringsAsFactors = stringsAsFactors, check.names = FALSE ) out [, names (i)] <- i return (out) @@ -163,7 +165,7 @@ xml_to_df <- function (doc, stringsAsFactors = FALSE) { osm_id = rownames (res [[i]]), center [[i]], meta [[i]], - setenc_utf8(tags [[i]]), + tags [[i]], stringsAsFactors = stringsAsFactors, check.names = FALSE ) @@ -215,6 +217,7 @@ xml_adiff_to_df <- function (doc, tags_u <- xml2::xml_find_all (osm_actions, xpath = ".//tag") col_names <- sort (unique (xml2::xml_attr (tags_u, attr = "k"))) m <- matrix ( + NA_character_, nrow = length (osm_obj), ncol = length (col_names), dimnames = list (NULL, col_names) ) @@ -226,7 +229,7 @@ xml_adiff_to_df <- function (doc, tagV <- vapply (tag, function (x) x, FUN.VALUE = character (2)) m [i, tagV [1, ]] <- tagV [2, ] } - Encoding(m) <- "UTF-8" + m <- enc2utf8 (m) osm_type <- xml2::xml_name (osm_obj) osm_id <- xml2::xml_attr (osm_obj, "id") @@ -327,7 +330,7 @@ get_meta_from_xml <- function (osm_obj) { osm_uid = xml2::xml_attr (osm_obj, attr = "uid"), osm_user = xml2::xml_attr (osm_obj, attr = "user") ) - Encoding(out$osm_user) <- "UTF-8" + out$osm_user <- enc2utf8 (out$osm_user) } else { out <- matrix (nrow = length (osm_obj), ncol = 0) diff --git a/R/get-osmdata.R b/R/get-osmdata.R index 65d28721..e575d51c 100644 --- a/R/get-osmdata.R +++ b/R/get-osmdata.R @@ -352,12 +352,11 @@ get_center_from_cpp_output <- function (res, what = "points") { #' #' @return `x` with all the columns or items of type character with UTF-8 encoding set. #' @noRd -setenc_utf8 <- function(x) { +setenc_utf8 <- function (x) { char_cols <- which (vapply (x, is.character, FUN.VALUE = logical (1))) x [char_cols] <- lapply (x [char_cols], function (y) { - Encoding (y) <- "UTF-8" - y + enc2utf8 (y) }) - return(x) + return (x) } From 2ff8526c36e037708ad7179bc1a1c03f68c6998f Mon Sep 17 00:00:00 2001 From: Joan Maspons Date: Thu, 25 Jul 2024 09:26:32 +0200 Subject: [PATCH 4/4] Set encoding in the getbb() result --- R/getbb.R | 2 ++ codemeta.json | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/R/getbb.R b/R/getbb.R index a786836a..6be594bf 100644 --- a/R/getbb.R +++ b/R/getbb.R @@ -229,6 +229,8 @@ getbb <- function (place_name, ) if (format_out == "data.frame") { + utf8cols <- c ("licence", "name", "display_name") + obj [, utf8cols] <- setenc_utf8 (obj [, utf8cols]) return (obj) } diff --git a/codemeta.json b/codemeta.json index b9c22425..1f0c8894 100644 --- a/codemeta.json +++ b/codemeta.json @@ -11,7 +11,7 @@ "codeRepository": "https://github.com/ropensci/osmdata/", "issueTracker": "https://github.com/ropensci/osmdata/issues", "license": "https://spdx.org/licenses/GPL-3.0", - "version": "0.2.5.19", + "version": "0.2.5.019", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R",