-
Notifications
You must be signed in to change notification settings - Fork 3
/
utils.r
323 lines (312 loc) · 12.8 KB
/
utils.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
#' Download EA Catchment Data
#' @description Downloads classification data from EA Catchment Data
#' Explorer site. Data can be downloaded by specifying waterbody id
#' (\code{WBID}), Management Catchment (\code{MC}), Operational
#' Catchment (\code{OC}) or River Basin District (\code{RBD}).
#' Start year (\code{startyr}) and end year (\code{endyr}) allow
#' specific timeranges to be downloaded. This is not exported as it is
#' called by \code{\link{get_status}}.
#' For Management Catchment (\code{MC}), Operational
#' Catchment (\code{OC}) or River Basin District (\code{RBD}) level
#' downloads, waterbody \code{Type} can also be specified to allow
#' extraction of specific waterbody types (River, Lake etc).
#
#' @param ea_name A string representing the description (name) of the
#' features to be extracted. For example to extract data for the whole of
#' the Humber RBD, this would be "Humber"; also see examples. Must be an
#' exact match to the values used in the EA database.
#' Use the \code{\link{search_names}} function to search for specific values.
#'
#' @param column The column to be searched. Possible options are
#' \code{WBID} (waterbody id), \code{OC} (Operational Catchment), \code{MC}
#' (Management Catchment) and \code{RBD} (River Basin District)
#'
#' @param data_type The type of data to be retrieved, either status
#' classification ("class") or Reasons of Not Acheving Good ("rnag").
#'
#' @return A data frame containing the classifcation or RNAG details for the
#' specified combination of column and value.
#'
#' @noRd
download_cde <- function(ea_name = NULL, column = NULL, data_type=NULL) {
# set up url components
base_url<-"http://environment.data.gov.uk/catchment-planning/"
# need to set the end URL differently depending on data_type
if (data_type=="class"){
end_url<-"/classification?item=all&status=all&format=csv"
}
if (data_type=="rnag"){
end_url<-"/ReasonsForNotAchievingGood?item=all&format=csv"
}
if(data_type=="measures"){
end_url<-"/Action?format=csv"
}
if(data_type=="pa"){
end_url<-"/pa/csv"
}
if (data_type=="objectives"){
end_url<-"/outcome?item=all&status=all&format=csv"
}
if (column == "RBD") {
# rbd level extraction
index_num <- ea_wbids$RBD.num[which(ea_wbids[, column] == ea_name)][1]
if (is.na(index_num)) {
stop("River Basin District name specified not found.")
} else {
downloadurl <- paste0(base_url, "RiverBasinDistrict/", index_num,
end_url)
cde_data <- zip_download(downloadurl)
}
} # end of rbd extraction
if (column == "MC") {
# mc level extraction
index_num <- ea_wbids$MC.num[which(ea_wbids[, column] == ea_name)][1]
if (is.na(index_num)) {
stop("Management Catchment name specified not found.")
} else {
downloadurl <- paste0(base_url, "ManagementCatchment/", index_num,
end_url)
cde_data <- zip_download(downloadurl)
}
} # end of mc extraction
# oc next
if (column == "OC") {
# oc level extraction - works
index_num <- ea_wbids$OC.num[which(ea_wbids[, column] == ea_name)][1]
if (is.na(index_num)) {
stop("Operational catchment name specified not found.")
} else {
cde_data <- data.table::fread(paste0(base_url, "OperationalCatchment/",
index_num, end_url), showProgress = FALSE, header = TRUE,
stringsAsFactors = FALSE, check.names=TRUE, data.table=FALSE)
}
} # end of oc extraction
# finally wbid
if (column == "WBID") {
# wbid level extraction
if (ea_name %in% ea_wbids[, "WBID"]) {
if (data_type=="rnag"){
# have to add supress warnings as data.table does not like empty
# RNAG data (bad download format on the part of EA)
suppressWarnings(cde_data <- data.table::fread(paste0(base_url,
"data/reason-for-failure.csv?waterBody=", ea_name, "&_view=csv"),
showProgress = FALSE, header = TRUE, stringsAsFactors = FALSE,
check.names=TRUE, data.table=FALSE))
}
if (data_type=="objectives"){
cde_data <- data.table::fread(paste0(base_url, "so/WaterBody/",
ea_name, "/objective-outcomes.csv?_view=csv"),
showProgress = FALSE, header = TRUE, stringsAsFactors = FALSE,
check.names=TRUE, data.table=FALSE)
}
if (data_type=="pa"){
cde_data <- data.table::fread(paste0(base_url, "WaterBody/",
ea_name, "/pa/csv"), showProgress = FALSE, header = TRUE,
stringsAsFactors = FALSE, check.names=TRUE, data.table=FALSE)
}
if (data_type=="class"){
cde_data <- data.table::fread(paste0(base_url,
"data/classification.csv?waterBody=", ea_name, "&_view=csv"),
showProgress = FALSE, header = TRUE, stringsAsFactors = FALSE,
check.names=TRUE, data.table=FALSE)
}
}
else {
stop("WBID value specified not found.")
}
}
# end of wbid extraction
return(cde_data)
} # end of function
#' Download Zipfile and extract csv
#' @description Downloads zipfile from specified url, unzips to
#' csv file and reads csv into dataframe.
#
#' @param download_url A string representing the url to download the
#' zip file from.
#'
#' @noRd
zip_download <- function(download_url) {
temp <- tempfile()
curl::curl_download(download_url, temp, mode = "wb")
# extract data from zipfile to df using data.table to speed things up
csvfile <- utils::unzip(temp, junkpaths = TRUE)
catchment_data <- data.table::fread(csvfile, stringsAsFactors = FALSE,
check.names = TRUE, data.table = FALSE, showProgress = FALSE)
# delete the intermediate files
unlink(temp)
unlink(csvfile)
return(catchment_data)
} # end of function
#' Check common arguments to functions
#' @description Checks the ea_name, year ranges and waterbody type
#' for all functions
#
#' @param ea_name A string representing the description (name) of the
#' features to be extracted. For example to extract data for the whole of
#' the Humber RBD, this would be "Humber"; also see examples. Must be an
#' exact match to the values used in the EA database.
#' Use the \code{\link{search_names}} function to search for specific values.
#'
#' @param column The column to be searched. Possible options are
#' \code{WBID} (waterbody id), \code{OC} (Operational Catchment), \code{MC}
#' (Management Catchment) and \code{RBD} (River Basin District)
#'
#' @param startyr The data can be extracted for specific years using the
#' \code{startyr} and \code{endyr} arguments. If only \code{startyr} is
#' specified this extracts for a particular year. If no years are specified
#' all years are returned.
#'
#' @param endyr The data can be extracted for specific years using the
#' \code{startyr} and \code{endyr} arguments. The \code{endyr} should
#' only be specified if \code{startyr} is also included, otherwise an
#' error is returned.
#'
#' @param type Type of waterbody to be extracted. For Operational/Management
#' catchment level or RBD level queries, the data can also be subset by
#' waterbody type. Possible values are \code{River}, \code{Lake},
#' \code{GroundWaterBody}, \code{TransitionalWater} or \code{CoastalWater}.
#'
#' @noRd
check_args <- function(ea_name = NULL, column = NULL, startyr = NULL,
endyr = NULL, type = NULL) {
# check that both ea_name and column are present
if (is.null(ea_name) | is.null(column)) {
stop("Both ea_name (name) and column (\"WBID\", \"MC\", \"OC\",
or \"RBD\") should be specified", "\n")
}
# are years, if present, numeric?
if (!is.null(startyr) & !is.null(endyr)) {
if (!is.numeric(startyr) | !is.numeric(endyr)) {
stop("Please enter numeric values for the starting and ending years")
}
}
# if there is a startyr set
if (!is.null(startyr)) {
if (startyr < 2009) {
stop("Starting year cannot be before 2009")
}
# if there is an end year alsoset
if (!is.null(endyr)) {
# check values make sense
if (!endyr >= startyr) {
stop("End year is before start year: please correct.")
}
}
}
# catch when only endyr is set
if (is.null(startyr) & !is.null(endyr)){
stop("Only end year specified, also needs start year.")
}
# check that the waterbody type is a valid choice
if (!is.null(type)) {
types <- c("River", "Lake", "TransitionalWater", "GroundWaterBody",
"CoastalWater")
if (!type %in% types) {
stop("Type specified is not a valid choice (\"River\", \"Lake\",
\"CoastalWater\", \"TransitionalWater\" or \"GroundWaterBody\"")
}
}
}
# end of function
#' Subset data as required
#' @description Subsets data by year, year range, classification level
#' and waterbody type as required
#'
#' @param full_data The dataframe to be subset
#'
#' @param ea_name A string representing the description (name) of the
#' features to be extracted. For example to extract data for the whole of
#' the Humber RBD, this would be "Humber"; also see examples. Must be an
#' exact match to the values used in the EA database.
#' Use the \code{\link{search_names}} function to search for specific values.
#'
#' @param column The column to be searched. Possible options are
#' \code{WBID} (waterbody id), \code{OC} (Operational Catchment), \code{MC}
#' (Management Catchment) and \code{RBD} (River Basin District)
#'
#' @param level The level within the WFD quality status classification to be
#' extracted. Defaults to 'Overall Water Body'. See docs for possible values.
#'
#' @param startyr The data can be extracted for specific years using the
#' \code{startyr} and \code{endyr} arguments. If only \code{startyr} is
#' specified this extracts for a particular year. If no years are specified
#' all years are returned.
#'
#' @param endyr The data can be extracted for specific years using the
#' \code{startyr} and \code{endyr} arguments. The \code{endyr} should
#' only be specified if \code{startyr} is also included.
#'
#' @param type Type of waterbody to be extracted. For Operational/Management
#' catchment level or RBD level queries, the data can also be subset by
#' waterbody type. Possible values are \code{River}, \code{Lake},
#' \code{GroundWaterBody}, \code{TransitionalWater} or \code{CoastalWater}.
#'
#' @return A data frame that has been subsetted by the
#' specified combination of column, value, level and dates.
#'
#' @noRd
#'
subset_data <- function(full_data, ea_name = NULL, column = NULL,
level = "Overall Water Body", startyr = NULL, endyr = NULL, type = NULL) {
# if only start year is set, is it beyond the data range?
if (!is.null(startyr) & is.null(endyr)){
if (startyr>max(full_data$Year)){
message(paste0("Start year is beyond the most recent year of data (",
max(full_data$Year),")"))
message("Just outputting most recent year")
startyr<-max(full_data$Year)
}
}
# if endyr is set, is it beyond the data range?
if (!is.null(endyr)){
if (endyr>max(full_data$Year)){
message(paste0("End year is beyond the most recent year of data (",
max(full_data$Year),")"))
message("Subsetting to most recent year")
endyr<-max(full_data$Year)
}
}
# if they are both set, check the endyr
if (!is.null(startyr) & !is.null(endyr)) {
if (endyr>max(full_data$Year)){
message(paste0("End year is beyond the most recent year of data (",
max(full_data$Year),")"))
message("Subsetting to most recent year")
endyr<-max(full_data$Year)
}
# if both years are specified, subset by range
full_data <- full_data[full_data$Year >= startyr
& full_data$Year <= endyr, ]
}
else if (!is.null(startyr)) {
full_data <- full_data[full_data$Year == startyr, ]
}
# level subsetting, defaults to "Overall Water Body"
# for Chemical and Supporting Elements levels, need to deal with options for
# surface waters and groundwaters
if (!is.null(level)){
if (level == "Chemical") {
full_data <- full_data[full_data$Classification.Item == "Chemical" |
full_data$Classification.Item == "Chemical (GW)", ]
}
else if (level == "Supporting elements") {
full_data <- full_data[full_data$Classification.Item ==
"Supporting elements (Surface Water)" |
full_data$Classification.Item ==
"Supporting elements (Groundwater)", ]
}
else {
full_data <- full_data[full_data$Classification.Item == level, ]
}
}
# now Water.body.type
if (!is.null(type)) {
full_data <- full_data[full_data$Water.body.type == type, ]
}
# if year range covers 2013 and 2014, subset to just include cycle 2 data
# avoids double counting of waterbodies
full_data <- full_data[!(full_data$Year == 2013 & full_data$Cycle == 1 |
full_data$Year == 2014 & full_data$Cycle == 1), ]
return(full_data)
} # end of function