emlab-ucsb · jflowernet · Jan 15, 2025 · Jan 15, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: oceandatr
 Type: Package
 Title: Ocean Data Access
-Version: 0.2.1.1
+Version: 0.2.1.2
 Authors@R: person(given = "Jason", family = "Flower", email = "[email protected]", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-6731-8182"))
 Description: For retrieving and gridding ocean related data.
 License: GPL (>= 3) + file LICENSE

diff --git a/NAMESPACE b/NAMESPACE
@@ -7,7 +7,7 @@ export(get_coral_habitat)
 export(get_data_in_grid)
 export(get_dist)
 export(get_ecoregion)
-export(get_enviro_regions)
+export(get_enviro_zones)
 export(get_features)
 export(get_geomorphology)
 export(get_gfw)

diff --git a/R/get_enviro_regions.R → R/get_enviro_zones.R b/R/get_enviro_regions.R → R/get_enviro_zones.R
@@ -1,10 +1,10 @@
-#' Create environmental regions for area of interest
+#' Create environmental zones for area of interest
 #'
 #' @description This function gets [Bio-Oracle](https://bio-oracle.org/)
 #'   environmental data for the spatial grid and can then create environmental
-#'   regions using k-means clustering. The idea for the clustering comes from
+#'   zones, using k-means clustering. The idea for the clustering comes from
 #'   Magris et al. [2020](https://doi.org/10.1111/ddi.13183). The number of
-#'   environmental regions can be specified directly, using `num_clusters`, but
+#'   environmental zones can be specified directly, using `num_clusters`, but
 #'   the function can also find the 'optimal' number of clusters using the
 #'   `NbClust()` from the `NbClust` package.
 #'
@@ -42,57 +42,60 @@
 #' @inheritParams get_bathymetry
 #' @param raw `logical` if TRUE, `spatial_grid` should be an `sf` polygon, and
 #'   the raw Bio-Oracle environmental data in that polygon(s) will be returned,
-#'   unless `enviro_regions = TRUE`, in which case the raw data will be
-#'   classified into environmental regions
-#' @param enviro_regions `logical` if TRUE, environmental regions will be
+#'   unless `enviro_zones = TRUE`, in which case the raw data will be
+#'   classified into environmental zones
+#' @param enviro_zones `logical` if TRUE, environmental zones will be
 #'   created. If FALSE the gridded Bio-Oracle data will be returned
 #' @param show_plots `logical`; whether to show boxplots for each environmental
-#'   variable in each environmental region (default is FALSE)
-#' @param num_clusters `numeric`; the number of environmental regions to cluster
+#'   variable in each environmental zone (default is FALSE)
+#' @param num_clusters `numeric`; the number of environmental zones to cluster
 #'   the data into - to be used when a clustering algorithm is not necessary
 #'   (default is NULL)
 #' @param max_num_clusters `numeric`; the maximum number of environmental
-#'   regions to try when using the clustering algorithm (default is 6)
+#'   zones to try when using the clustering algorithm (default is 6)
 #' @param sample_size `numeric`; default is 5000. Larger sample sizes will
 #'   quickly consume memory (>10GB) so should be used with caution.
 #' @param num_samples `numeric`; default is 5, which resulted in good consensus
 #'   on the optimal number of clusters in testing.
 #' @param num_cores `numeric`; default 1. Multi-core sampling is supported if
-#'   the package `parallel` is installed, but be aware than increasing the
+#'   the package `parallel` is installed, but be aware that increasing the
 #'   number of cores will also increase the memory required.
-#' @param custom_seed `numeric`; default `1234`, but a custom seed can be
-#'   supplied if desired.
 #'
-#' @return If `enviro_regions = FALSE`, Bio-Oracle data in the `spatial_grid`
-#'   supplied, or in the original raster file resolution if `raw = TRUE`. If
-#'   `enviro_regions = TRUE` a multi-layer raster or an `sf` object with one
-#'   environmental region in each column/ layer is returned, depending on the
-#'   `spatial_grid` format. If `enviro_regions = TRUE` and `raw = TRUE` (in
-#'   which case `spatial_grid` should be an `sf` polygon), the raw Bio-Oracle
-#'   data is classified into environmental zones.
+#' @return If `enviro_zones = FALSE`, Bio-Oracle data in the `spatial_grid`
+#'   supplied, or the original Bio-Oracle data cropped and masked to the grid if
+#'   `raw = TRUE`. If `enviro_zones = TRUE` a multi-layer raster or an `sf`
+#'   object with one environmental zone in each column/ layer is returned,
+#'   depending on the `spatial_grid` format. If `enviro_zones = TRUE` and `raw
+#'   = TRUE` (in which case `spatial_grid` should be an `sf` polygon), the raw
+#'   Bio-Oracle data is classified into environmental zones.
 #'
 #' @export
 #' 
 #' @examples
 #' # Get EEZ data first 
 #' bermuda_eez <- get_boundary(name = "Bermuda")
 #' # Get raw Bio-Oracle environmental data for Bermuda
-#' enviro_data <- get_enviro_regions(spatial_grid = bermuda_eez, raw = TRUE, enviro_regions = FALSE)
+#' enviro_data <- get_enviro_zones(spatial_grid = bermuda_eez, raw = TRUE, enviro_zones = FALSE)
 #' terra::plot(enviro_data)
 #' # Get gridded Bio-Oracle data for Bermuda:
 #' bermuda_grid <- get_grid(boundary = bermuda_eez, crs = '+proj=laea +lon_0=-64.8108333 +lat_0=32.3571917 +datum=WGS84 +units=m +no_defs', resolution = 20000)
-#' enviro_data_gridded <- get_enviro_regions(spatial_grid = bermuda_grid, raw = FALSE, enviro_regions = FALSE)
+#' 
+#' enviro_data_gridded <- get_enviro_zones(spatial_grid = bermuda_grid, raw = FALSE, enviro_zones = FALSE)
 #' terra::plot(enviro_data_gridded)
-#' # Get 3 environmental regions for Bermuda
-#' bermuda_enviro_regions <- get_enviro_regions(spatial_grid = bermuda_grid, raw = FALSE, enviro_regions = TRUE, num_clusters = 3)
-#' terra::plot(bermuda_enviro_regions)
-#' # Can also create environmental regions from the raw Bio-Oracle data using setting raw = TRUE and enviro_regions = TRUE. In this case, the `spatial_grid` should be a polygon of the area you want the data for
-#' bermuda_enviro_regions2 <- get_enviro_regions(spatial_grid = bermuda_eez, raw = TRUE, enviro_regions = TRUE, num_clusters = 3)
-#' terra::plot(bermuda_enviro_regions2)
+#' 
+#' # Get 3 environmental zones for Bermuda
+#' 
+#' #set seed for reproducibility in the sampling to find optimal number of clusters
+#' set.seed(500)
+#' bermuda_enviro_zones <- get_enviro_zones(spatial_grid = bermuda_grid, raw = FALSE, enviro_zones = TRUE, num_clusters = 3)
+#' terra::plot(bermuda_enviro_zones)
+#' # Can also create environmental zones from the raw Bio-Oracle data using setting raw = TRUE and enviro_zones = TRUE. In this case, the `spatial_grid` should be a polygon of the area you want the data for
+#' bermuda_enviro_zones2 <- get_enviro_zones(spatial_grid = bermuda_eez, raw = TRUE, enviro_zones = TRUE, num_clusters = 3)
+#' terra::plot(bermuda_enviro_zones2)
 
-get_enviro_regions <- function(spatial_grid = NULL, raw = FALSE, enviro_regions = TRUE, show_plots = FALSE, num_clusters = NULL, max_num_clusters = 6, antimeridian = NULL, sample_size = 5000, num_samples = 5, num_cores = 1, custom_seed = 1234){
+get_enviro_zones <- function(spatial_grid = NULL, raw = FALSE, enviro_zones = TRUE, show_plots = FALSE, num_clusters = NULL, max_num_clusters = 6, antimeridian = NULL, sample_size = 5000, num_samples = 5, num_cores = 1){
 
-  rlang::check_installed("biooracler", reason = "to get Bio-Oracle data using `get_enviro_regions()`", action = \(pkg, ...) remotes::install_github("bio-oracle/biooracler"))
+  rlang::check_installed("biooracler", reason = "to get Bio-Oracle data using `get_enviro_zones()`", action = \(pkg, ...) remotes::install_github("bio-oracle/biooracler"))
 
   check_grid(spatial_grid)
 
@@ -125,7 +128,7 @@ get_enviro_regions <- function(spatial_grid = NULL, raw = FALSE, enviro_regions
   enviro_data <- get_enviro_data(spatial_grid = spatial_grid) %>% 
     get_data_in_grid(spatial_grid = spatial_grid, dat = ., raw = raw, meth = meth)
 
- if(!enviro_regions){
+ if(!enviro_zones){
    return(enviro_data)
   }else{
 
@@ -136,7 +139,6 @@ get_enviro_regions <- function(spatial_grid = NULL, raw = FALSE, enviro_regions
    if(is.null(num_clusters)){
       message("This could take several minutes")
       #setting index = "all" results in large memory usage and long runtime (I haven't run to completion after >1hr), for the moment, setting the index to "hartigan" which is the same algorithm (Hartigan-Wong) used by the kmeans() function used below
-     set.seed(custom_seed)
 
      n_df_rows <- nrow(df_for_clustering)
 
@@ -166,33 +168,33 @@ get_enviro_regions <- function(spatial_grid = NULL, raw = FALSE, enviro_regions
     clust_partition <- clust_result$cluster
 
     if(show_plots) {
-      enviro_regions_boxplot(clust_partition, df_for_clustering)
-      enviro_regions_pca(clust_partition, df_for_clustering)
+      enviro_zones_boxplot(clust_partition, df_for_clustering)
+      enviro_zones_pca(clust_partition, df_for_clustering)
     }
 
     if(check_sf(enviro_data)){
-      enviro_region_cols <- stats::model.matrix(~ as.factor(clust_partition) - 1) %>% 
+      enviro_zone_cols <- stats::model.matrix(~ as.factor(clust_partition) - 1) %>% 
         as.data.frame() %>%   
-        stats::setNames(paste0("enviro_region_", 1:ncol(.))) %>% 
+        stats::setNames(paste0("enviro_zone_", 1:ncol(.))) %>% 
         dplyr::mutate(row_id = as.numeric(names(clust_partition)))
 
       sf::st_geometry(enviro_data) %>% 
         sf::st_sf() %>% 
         dplyr::mutate(row_id = 1:nrow(.)) %>% 
-        dplyr::left_join(enviro_region_cols, by = dplyr::join_by(row_id)) %>% 
+        dplyr::left_join(enviro_zone_cols, by = dplyr::join_by(row_id)) %>% 
         dplyr::select(-row_id) %>% 
         {if(grid_has_extra_cols) cbind(., extra_cols) %>% dplyr::relocate(colnames(extra_cols), .before = 1) else .}
 
     }else{
-      #create environmental regions raster, filled with NAs to start with
-      enviro_regions <- terra::rast(enviro_data, nlyrs=1, vals = NA, names = "enviro_region")
+      #create environmental zones raster, filled with NAs to start with
+      enviro_zones <- terra::rast(enviro_data, nlyrs=1, vals = NA, names = "enviro_zone")
 
       #set cluster ids in raster - subset for only raster values that are non-NA
-      enviro_regions[as.numeric(names(clust_partition))] <- clust_partition
+      enviro_zones[as.numeric(names(clust_partition))] <- clust_partition
 
-      enviro_regions %>% 
+      enviro_zones %>% 
         terra::segregate() %>% 
-        stats::setNames(paste0("enviro_region_", names(.)))
+        stats::setNames(paste0("enviro_zone_", names(.)))
     }
   }
 }
@@ -259,24 +261,24 @@ get_enviro_data <- function(spatial_grid = NULL){
     stats::setNames(c("Chlorophyll", "Dissolved_oxygen", "Nitrate", "Minimum_temp", "Mean_temp", "Max_temp", "pH", "Phosphate", "Salinity", "Silicate", "Phytoplankton"))
 }
 
-enviro_regions_boxplot <- function(enviro_region, enviro_data){
-  #compare values in each environmental region
-  enviro_regions_df <- cbind(enviro_region, enviro_data) 
+enviro_zones_boxplot <- function(enviro_zone, enviro_data){
+  #compare values in each environmental zone
+  enviro_zones_df <- cbind(enviro_zone, enviro_data) 
 
   graphics::par(mfrow = c(3,4))
-  for (i in 2:ncol(enviro_regions_df)) {
-    eval(parse(text = paste0("boxplot(`", colnames(enviro_regions_df[i]), "` ~ enviro_region, data = enviro_regions_df, col = palette.colors(n = ", max(enviro_region), ", palette = 'Dark2'))")))
+  for (i in 2:ncol(enviro_zones_df)) {
+    eval(parse(text = paste0("boxplot(`", colnames(enviro_zones_df[i]), "` ~ enviro_zone, data = enviro_zones_df, col = palette.colors(n = ", max(enviro_zone), ", palette = 'Dark2'))")))
   }
   graphics::par(mfrow = c(1,1))
 }
 
-enviro_regions_pca <- function(enviro_region, enviro_data){
+enviro_zones_pca <- function(enviro_zone, enviro_data){
   pca_df <- stats::prcomp(enviro_data, scale. = TRUE, center = TRUE) %>% 
     .[["x"]] %>% 
     as.data.frame()
 
-  pca_df$enviro_region <- enviro_region
+  pca_df$enviro_zone <- enviro_zone
 
-  plot(x = pca_df$PC1, y = pca_df$PC2, col = pca_df$enviro_region, xlab = "PC1", ylab = "PC2", pch = 4, cex = 0.6)
-  graphics::legend("bottomright", legend = unique(pca_df$enviro_region), col = unique(pca_df$enviro_region), pch = 4, cex = 1, title = "Enviro region")
+  plot(x = pca_df$PC1, y = pca_df$PC2, col = pca_df$enviro_zone, xlab = "PC1", ylab = "PC2", pch = 4, cex = 0.6)
+  graphics::legend("bottomright", legend = unique(pca_df$enviro_zone), col = unique(pca_df$enviro_zone), pch = 4, cex = 1, title = "Enviro zone")
 }
diff --git a/R/get_features.R b/R/get_features.R
@@ -2,15 +2,15 @@
 #'
 #' @description This is a wrapper of `get_bathymetry()`,
 #'   `get_seamounts_buffered()`, `get_knolls()`, `get_geomorphology()`,
-#'   `get_coral_habitat()`, and `get_enviro_regions()`. See the individual
+#'   `get_coral_habitat()`, and `get_enviro_zones()`. See the individual
 #'   functions for details.
 #'
 #' @inheritParams get_bathymetry
 #' @param raw `logical` if TRUE, `spatial_grid` should be an `sf` polygon, and
 #'   the raw feature data in that polygon(s) will be returned. Note that this
 #'   will be a list object, since raster and `sf` data may be returned.
 #' @param features a vector of feature names, can include: "bathymetry",
-#'   "seamounts", "knolls", "geomorphology", "corals", "enviro_regions"
+#'   "seamounts", "knolls", "geomorphology", "corals", "enviro_zones"
 #' @param bathy_resolution `numeric`; the resolution (in minutes) of data to
 #'   pull from the ETOPO 2022 Global Relief model. Values less than 1 can only
 #'   be 0.5 (30 arc seconds) and 0.25 (15 arc seconds)
@@ -25,11 +25,11 @@
 #' @param octocoral_threshold `numeric` between 0 and 7; the threshold value for
 #'   how many species (of 7) should be predicted present in an area for
 #'   octocorals to be considered present (default is 2)
-#' @param enviro_clusters `numeric`; the number of environmental regions to
+#' @param enviro_clusters `numeric`; the number of environmental zones to
 #'   cluster the data into - to be used when a clustering algorithm is not
 #'   necessary (default is NULL)
 #' @param max_enviro_clusters `numeric`; the maximum number of environmental
-#'   regions to try when using the clustering algorithm (default is 8)
+#'   zones to try when using the clustering algorithm (default is 8)
 #'
 #' @return If `raw = TRUE`, a list of feature data is returned (mixed raster and
 #'   `sf` objects). If a `spatial_grid` is supplied, a multi-layer raster or
@@ -44,10 +44,12 @@
 #' raw_data <- get_features(spatial_grid = bermuda_eez, raw = TRUE)
 #' # Get feature data in a spatial grid
 #' bermuda_grid <- get_grid(boundary = bermuda_eez, crs = '+proj=laea +lon_0=-64.8108333 +lat_0=32.3571917 +datum=WGS84 +units=m +no_defs', resolution = 20000)
+#' #set seed for reproducibility in the get_enviro_zones() function
+#' set.seed(500)
 #' features_gridded <- get_features(spatial_grid = bermuda_grid)
 #' terra::plot(features_gridded)
 
-get_features <- function(spatial_grid = NULL, raw = FALSE, features = c("bathymetry", "seamounts", "knolls", "geomorphology", "corals", "enviro_regions"), bathy_resolution = 1, seamount_buffer = 30000, antipatharia_threshold = 22, octocoral_threshold = 2, enviro_clusters = NULL, max_enviro_clusters = 6, antimeridian = NULL){
+get_features <- function(spatial_grid = NULL, raw = FALSE, features = c("bathymetry", "seamounts", "knolls", "geomorphology", "corals", "enviro_zones"), bathy_resolution = 1, seamount_buffer = 30000, antipatharia_threshold = 22, octocoral_threshold = 2, enviro_clusters = NULL, max_enviro_clusters = 6, antimeridian = NULL){
 
   #set extra columns aside - only need this is it a spatial grid, so added
   #nrow() check to remove the need for this step if only raw data is required
@@ -92,10 +94,10 @@ get_features <- function(spatial_grid = NULL, raw = FALSE, features = c("bathyme
     })
   }
 
-  if("enviro_regions" %in% features) { 
-    message("Getting environmental regions data... This could take several minutes")
+  if("enviro_zones" %in% features) { 
+    message("Getting environmental zones data... This could take several minutes")
     suppressMessages({
-      enviro_regions <- get_enviro_regions(spatial_grid = spatial_grid, raw = raw, show_plots = FALSE, num_clusters = enviro_clusters, max_num_clusters = max_enviro_clusters, antimeridian = antimeridian)
+      enviro_zones <- get_enviro_zones(spatial_grid = spatial_grid, raw = raw, show_plots = FALSE, num_clusters = enviro_clusters, max_num_clusters = max_enviro_clusters, antimeridian = antimeridian)
     })
   }
 

diff --git a/README.Rmd b/README.Rmd
@@ -180,9 +180,9 @@ coral_habitat <- get_coral_habitat(spatial_grid = bermuda_grid)
 terra::plot(coral_habitat, col = c("grey60", "coral"), axes = FALSE, fun = function()terra::lines(terra::as.polygons(seamounts, dissolve = TRUE), col = "orangered4"))
 ```
 
-## Environmental Regions
+## Environmental Zones
 
-Bioregions are often included in spatial planning, but available bioregional classifications are either too coarse or too detailed to be useful for planning at the EEZ level. Borrowing methods from [Magris et al. 2020](https://doi.org/10.1111/ddi.13183), we use spatial clustering of biophysical environmental data from [Bio-Oracle](https://bio-oracle.org/), to create 'environmental regions'. Biophysical conditions within a environmental region are more similar than areas outside that region, though the differences may be small. Diagnostic boxplots and a PCA will be shown if `show_plots = TRUE`. All the biophysical data are ocean surface data for the period 2010 - 2020:
+Bioregions are often included in spatial planning, but available bioregional classifications are either too coarse or too detailed to be useful for planning at the EEZ level. Borrowing methods from [Magris et al. 2020](https://doi.org/10.1111/ddi.13183), we use spatial clustering of biophysical environmental data from [Bio-Oracle](https://bio-oracle.org/), to create 'environmental zones'. Biophysical conditions within a environmental zone are more similar than areas outside that zone, though the differences may be small. Diagnostic boxplots and a PCA will be shown if `show_plots = TRUE`. All the biophysical data are ocean surface data for the period 2010 - 2020:
 
 * Chlorophyll concentration (mean, mg/ m3)
 * Dissolved oxygen concentration (mean)
@@ -196,14 +196,13 @@ Bioregions are often included in spatial planning, but available bioregional cla
 * Sea surface temperature (min, degree C)
 * Silicate concentration (mean, mmol/ m3)
 
-```{r environmental_regions, warning=FALSE, message=FALSE}
-
+```{r environmental_zones, warning=FALSE, message=FALSE}
 #set number of clusters to 3 to reduce runtime and memory usage
-enviro_regions <- get_enviro_regions(spatial_grid = bermuda_grid, show_plots = TRUE, num_clusters = 3)
+enviro_zones <- get_enviro_zones(spatial_grid = bermuda_grid, show_plots = TRUE, num_clusters = 3)
 ```
 
 
-```{r enviro_regions_maps, warning=FALSE, message=FALSE}
-#value of 1 indicates that environmental region is present
-terra::plot(enviro_regions, col = c("grey60", "forestgreen"), axes = FALSE, fun = function(){terra::lines(terra::vect(bermuda_eez_projected))})
+```{r enviro_zones_maps, warning=FALSE, message=FALSE}
+#value of 1 indicates that environmental zone is present
+terra::plot(enviro_zones, col = c("grey60", "forestgreen"), axes = FALSE, fun = function(){terra::lines(terra::vect(bermuda_eez_projected))})
 ```