tidyverse · hadley · Aug 18, 2022 · Jul 5, 2022 · Jul 6, 2022 · Jul 6, 2022
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,6 @@
 # dplyr (development version)
 
+* `storms` has been updated to include 2021 data, data prior to 1975, and some missing storms that were omitted due to an error.
 * `coalesce()` now more fully embraces the principles of vctrs (#6265).
 
   * `.ptype` and `.size` arguments have been added to allow you to explicitly

diff --git a/R/data-storms.R b/R/data-storms.R
@@ -1,33 +1,41 @@
 #' Storm tracks data
 #'
-#' This data is a subset of the NOAA Atlantic hurricane database best track
-#' data, \url{https://www.nhc.noaa.gov/data/#hurdat}. The data includes the
-#' positions and attributes of storms from 1975-2020, measured every six hours
-#' during the lifetime of a storm.
+#' This dataset is the NOAA Atlantic hurricane database best track data, <https://www.nhc.noaa.gov/data/#hurdat>.
+#' The data includes the positions and attributes of storms from 1852-2021.
+#' The modern storms are measured every six hours during the lifetime of the storm.
 #'
 #' @seealso The script to create the storms data set: \url{https://github.com/tidyverse/dplyr/blob/main/data-raw/storms.R}
 #'
-#' @format A tibble with 11,859 observations and 13 variables:
+#' @format A tibble with 22,184 observations and 13 variables:
 #' \describe{
 #' \item{name}{Storm Name}
 #' \item{year,month,day}{Date of report}
 #' \item{hour}{Hour of report (in UTC)}
 #' \item{lat,long}{Location of storm center}
 #' \item{status}{Storm classification (Tropical Depression, Tropical Storm,
 #'   or Hurricane)}
-#' \item{category}{Saffir-Simpson storm category (estimated from wind speed.
-#' -1 = Tropical Depression, 0 = Tropical Storm)}
+#' \item{category}{Saffir-Simpson hurricane category calculated from wind speed. Only applies to hurricanes.
+#'   \itemize{
+#'     \item 64+ knots = Category 1
+#'     \item 83+ knots = Category 2
+#'     \item 96+ knots = Category 3
+#'     \item 113+ knots = Category 4
+#'     \item 137+ knots = Category 5
+#'   }
+#' }
 #' \item{wind}{storm's maximum sustained wind speed (in knots)}
 #' \item{pressure}{Air pressure at the storm's center (in millibars)}
-#' \item{tropicalstorm_force_diameter}{Diameter (in nautical miles) of the area experiencing tropical storm strength winds (34 knots or above)}
-#' \item{hurricane_force_diameter}{Diameter (in nautical miles) of the area experiencing hurricane strength winds (64 knots or above)}
+#' \item{tropicalstorm_force_diameter}{Diameter (in nautical miles) of the area experiencing tropical storm strength winds (34 knots or above). Only available starting in 2004.}
+#' \item{hurricane_force_diameter}{Diameter (in nautical miles) of the area experiencing hurricane strength winds (64 knots or above). Only available starting in 2004.}
 #' }
 #' @examples
 #'
-#' # show a plot of the storm paths
+#' # show a plot of the storm paths in 1975 or later
 #' if (requireNamespace("ggplot2", quietly = TRUE)) {
 #'   library(ggplot2)
-#'   ggplot(storms) +
+#'   storms %>%
+#'     filter(year >= 1975) %>%
+#'     ggplot() +
 #'     aes(x=long, y=lat, color=paste(year, name)) +
 #'     geom_path() +
 #'     guides(color='none') +

diff --git a/data-raw/storms.R b/data-raw/storms.R
@@ -1,26 +1,27 @@
 library(tidyverse)
 
 # Creates storms data set from NOAA Atlantic Hurricane data, which is provided
-# in an unorthodox format: a csv that alternates between header/identifier rows
+# in an unorthodox format: a csv that alternates between metadata/identifier rows
 # and data rows.
 
 # TO UPDATE: get the latest URL from https://www.nhc.noaa.gov/data/#hurdat, and rerun this code
 
 # Read in data set so each line is a character string
-storm_file_complete <- read_file("https://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2020-052921.txt")
+storm_file_complete <- read_file("https://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2021-041922.txt")
 storm_strings <- read_lines(storm_file_complete)
 
 # Identify the header lines that have three commas
-header_locations <- (1:length(storm_strings))[str_count(storm_strings, "\\,") == 3]
+is_header_line <- str_count(storm_strings, "\\,") == 3
+header_locations <- which(is_header_line) # line numbers of the header rows
 
 # Extract length of each sub-dataset
 headers <- as.list(storm_strings[header_locations])
 headers_df <- headers %>%
   map(str_sub, start = 1, end = -2) %>% # to remove trailing comma
   map(paste0, "\n") %>%                 # to trigger literal read
-  map_df(read_csv, col_names = c("id", "name", "n_obs")) %>%
+  map_df(read_csv, col_names = c("id", "name", "n_obs"), col_types = "cci") %>%
   mutate(name = recode(name, "UNNAMED" = id), skip = header_locations) %>%
-  select(name, skip, n_obs)
+  select(id, name, skip, n_obs)
 
 column_types <- list(
   date = col_character(),
@@ -58,24 +59,37 @@ for (i in 1:nrow(headers_df)) {
   data_subset = storm_strings[row_start:row_end] %>%
     paste(collapse = "\n") %>%
     paste0("\n")
+  # read it as a csv
   data_subset = read_csv(
     data_subset,
     col_names = column_names,
     col_types = column_types,
     na = c("", "-99", "-999")
   )
-  # name at the front
+  if (nrow(problems(data_subset)) > 0) print(problems(data_subset)) # a bare problems() inspects .Last.value and is never printed inside a for loop
+  # name and id at the front
   data_subset$name = headers_df[i,]$name
   data_subset = data_subset %>% relocate(name)
+  data_subset$id = headers_df[i,]$id
+  data_subset = data_subset %>% relocate(id)
   # add to list of storms
   storm_dataframes[[i]] = data_subset
 }
 
 # Combine and clean the data sets
 library(lubridate)
 
+# combine the storms into one dataframe
 storms <- storm_dataframes %>%
-  bind_rows() %>%
+  bind_rows()
+
+
+#####################
+# format and cleanup
+
+
+# format the columns
+storms <- storms %>%
   mutate(
     date = ymd(date),
     year = year(date),
@@ -88,35 +102,67 @@ storms <- storm_dataframes %>%
     long_hemisphere = str_sub(long, -1),
     long_sign = if_else(long_hemisphere == "E", 1, -1),
     long = as.numeric(str_sub(long, 1, -2)) * long_sign,
-    category = cut(wind,
-      breaks = c(0, 34, 64, 83, 96, 113, 137, 500),
-      labels = c(-1, 0, 1, 2, 3, 4, 5),
-      include.lowest = TRUE, ordered = TRUE
-    ),
     # wind = wind * 1.15078, # transforms knots to mph,
     TSradius1 = extent_34_NE + extent_34_SW,
     TSradius2 = extent_34_NW + extent_34_SE,
     tropicalstorm_force_diameter = pmax(TSradius1, TSradius2),
     HUradius1 = extent_64_NE + extent_64_SW,
     HUradius2 = extent_64_NW + extent_64_SE,
-    hurricane_force_diameter = pmax(HUradius1, HUradius2),
-    status = recode(status, "HU" = "hurricane", "TS" = "tropical storm", "TD" = "tropical depression")
+    hurricane_force_diameter = pmax(HUradius1, HUradius2)
   ) %>%
-  select(name, year, month, day, hour, lat, long, status, category, wind, pressure, tropicalstorm_force_diameter, hurricane_force_diameter)
+  select(name, year, month, day, hour, lat, long, status, wind, pressure, tropicalstorm_force_diameter, hurricane_force_diameter)
+
+# drop rows with missing pressure record
+storms <- storms %>%
+  filter(!is.na(pressure))
+
+# expand the two-letter status codes into full, human-readable labels
+storms <- storms %>% mutate(
+  status = factor(recode(status,
+    "HU" = "hurricane",
+    "TS" = "tropical storm",
+    "TD" = "tropical depression",
+    "EX" = "extratropical",
+    "SD" = "subtropical depression",
+    "SS" = "subtropical storm",
+    "LO" = "other low",
+    "WV" = "tropical wave",
+    "DB" = "disturbance"
+  ))
+)
 
-# Narrow to storms that have complete pressure record
-completeish <- storms %>%
-  group_by(name) %>%
-  summarise(n_pressure = sum(!is.na(pressure)), p_pressure = mean(!is.na(pressure))) %>%
-  filter(p_pressure == 1) %>%
-  .[["name"]]
+# hurricane category
+storms <- storms %>%
+  mutate(category = case_when(
+    status != "hurricane" ~ NA,
+    wind >= 137 ~ 5,
+    wind >= 113 ~ 4,
+    wind >= 96 ~ 3,
+    wind >= 83 ~ 2,
+    wind >= 64 ~ 1,
+    .default = NA
+  )) %>%
+  relocate(category, .after = status)
 
+# keep a storm (all of its rows) only if it was ever a tropical depression or stronger
+storms <- storms %>%
+  group_by(year, name) %>%
+  filter(any(
+    status %in% c("hurricane", "tropical storm", "tropical depression")
+  )) %>%
+  ungroup()
+
+# drop all rows that are not at least a depression
+# might want to use this filter if the file size is an issue
+# storms <- storms %>% filter(status %in% c("hurricane", "tropical storm", "tropical depression"))
+
+# convert storm names to title case; unnamed storms keep their upper-case ID as the name
 storms <- storms %>%
-  filter(
-    status %in% c("hurricane", "tropical storm", "tropical depression"),
-    name %in% completeish
-  ) %>%
   mutate(name = if_else(str_sub(name, 1, 3) %in% c("AL0", "AL1"), name, str_to_title(name)))
 
+# drop a bad data point (add more if found)
+storms <- storms %>%
+  filter( !((year == 1969) & (name == "Debbie") & (long < -350)) )
+
 # output for the package
 usethis::use_data(storms, overwrite = TRUE)
diff --git a/data/storms.rda b/data/storms.rda
diff --git a/man/storms.Rd b/man/storms.Rd