-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathsegment.R
354 lines (308 loc) · 12.1 KB
/
segment.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
## TODO: tests https://github.com/ncss-tech/aqp/issues/180
#' Segmenting of Soil Horizon Data by Depth Interval
#'
#' This function segments or subdivides horizon data from a `SoilProfileCollection` or `data.frame` by depth interval (e.g. `c(0, 10)`, `c(0, 50)`, or `25:100`). This results in horizon records being split at the specified depth intervals, which duplicates the original horizon data but also adds new horizon depths. In addition, labels (i.e. `"segment_id"`) are added to each horizon record that correspond with their depth interval (e.g. `025-100`). This function is intended to harmonize horizons to a common support (i.e. depth interval) for further aggregation or summary. See the examples.
#' @param object either a `SoilProfileCollection` or `data.frame`
#' @param intervals a vector of integers over which to slice the horizon data (e.g. `c(25, 100)` or `25:100`)
#' @param trim logical, when `TRUE` horizons in `object` are truncated to the min/max specified in `intervals`. When `FALSE`, those horizons overlapping an interval are marked as such. Care should be taken when specifying more than one depth interval and `trim = FALSE`.
#' @param hzdepcols a character vector of length 2 specifying the names of the horizon depths (e.g. `c("hzdept", "hzdepb")`), only necessary if `object` is a `data.frame`.
#'
#' @details `segment()` performs no aggregation or resampling of the source data, rather, labels are added to horizon records for subsequent aggregation or summary. This makes it possible to process a very large number of records outside of the constraints associated with e.g. `slice()` or `slab()`.
#'
#' @return Either a `SoilProfileCollection` or `data.frame` with the original horizon data segmented by depth intervals. There are usually more records in the resulting object, one for each time a segment interval partially overlaps with a horizon. A new column called `segment_id` identifying the depth interval is added.
#'
#' @author Stephen Roecker
#'
#' @seealso [dice()], [glom()]
#'
#' @export
#'
#' @examples
#'
#' # example data
#' data(sp1)
#'
#' # upgrade to SPC
#' depths(sp1) <- id ~ top + bottom
#'
#' # segment and trim
#' z <- segment(sp1, intervals = c(0, 10, 20, 30), trim = TRUE)
#'
#' # display segment labels
#' # note that there are new horizon boundaries at segments
#' par(mar = c(0, 0, 3, 1))
#' plotSPC(z, color = 'segment_id', width = 0.3)
#'
#' # highlight new horizon records
#' par(mar = c(0, 0, 2, 1))
#' plotSPC(z, color = NA, default.color = NA, width = 0.3, lwd = 1)
#' plotSPC(sp1, color = NA, default.color = NA,
#' width = 0.3, lwd = 3, add = TRUE, name = NA, print.id = FALSE)
#' legend('top', horiz = TRUE,
#' legend = c('original', 'segmented'),
#' lwd = c(1, 3), cex = 0.85, bty = 'n')
#'
#' \donttest{
#' # same results as slab()
#' # 10 random profiles
#' s <- lapply(1:10, random_profile, n_prop = 1, SPC = TRUE, method = 'random_walk')
#' s <- combine(s)
#'
#' a.slab <- slab(s, fm = ~ p1, slab.structure = c(0, 10, 20, 30), slab.fun = mean, na.rm = TRUE)
#'
#' z <- segment(s, intervals = c(0, 10, 20, 30), trim = TRUE)
#' z <- horizons(z)
#' z$thick <- z$bottom - z$top
#'
#' a.segment <- sapply(split(z, z$segment_id), function(i) {
#' weighted.mean(i$p1, i$thick)
#' })
#'
#'
#' res <- data.frame(
#' slab = a.slab$value,
#' segment = a.segment,
#' diff = a.slab$value - a.segment
#' )
#'
#' print(res)
#' res$diff < 0.001
#'}
#'
#'
#' data(sp5)
#'
#' # segment by upper 25-cm
#' test1 <- segment(sp5, intervals = c(0, 100))
#' print(test1)
#' nrow(test1)
#' print(object.size(test1), units = "Mb")
#'
#' # segment by 1-cm increments
#' test2 <- segment(sp5, intervals = 0:100)
#' print(test2)
#' nrow(test2)
#' print(object.size(test2), units = "Mb")
#'
#'
#' # segment and aggregate
#' test3 <- segment(horizons(sp5),
#' intervals = c(0, 5, 15, 30, 60, 100, 200),
#' hzdepcols = c("top", "bottom")
#' )
#' test3$hzthk <- test3$bottom - test3$top
#' test3_agg <- by(test3, test3$segment_id, function(x) {
#' data.frame(
#' hzID = x$hzID[1],
#' segment_id = x$segment_id[1],
#' average = weighted.mean(x$clay, w = x$hzthk)
#' )
#' })
#' test3_agg <- do.call("rbind", test3_agg)
#'
#' head(test3_agg)
#'
segment <- function(object, intervals, trim = TRUE, hzdepcols = NULL) {
# depth interval rules
dep <- data.frame(
top = intervals[-length(intervals)],
bot = intervals[-1],
stringsAsFactors = FALSE
)
n <- max(nchar(intervals))
dep$id <- paste0(
formatC(dep$top, width = n, flag = 0),
"-",
formatC(dep$bot, width = n, flag = 0)
)
# argument sanity check
test_spc <- inherits(object, 'SoilProfileCollection')
test_df <- inherits(object, 'data.frame')
test_hd <- !is.null(hzdepcols) & length(hzdepcols) == 2
test_dep <- is.numeric(dep$top) & is.numeric(dep$bot) & all(dep$top < dep$bot)
if (!any(test_spc, test_df)) {
stop("the input must be either a SoilProfileCollection or data.frame")
}
if (!test_spc & (!test_df | !test_hd)) {
stop("if the input is a data.frame then hzdepcols must not be NULL and length(hzdepcols) == 2")
}
if (!test_dep) {
stop("intervals should be numeric and sequential (e.g. c(0, 1, 2, 3) or 0:100)")
}
# standardize inputs
if (test_spc) {
peid <- idname(object)
hzid <- hzidname(object)
hzdepcols <- horizonDepths(object)
h <- horizons(object)
names(h)[names(h) %in% c(peid, hzid)] <- c("peid", "hzid")
} else {
h <- object
}
names(h)[names(h) %in% hzdepcols] <- c("hzdept", "hzdepb")
## TODO: consider using dice()
# filter horizons and trim
.slice <- function(h, top = NULL, bot = NULL) {
idx <- h$hzdept <= bot & h$hzdepb >= top
h <- h[idx, ]
# causing errors when FALSE
if (trim == TRUE) {
h <- within(h, {
hzdept = ifelse(hzdept < top, top, hzdept)
hzdepb = ifelse(hzdepb > bot, bot, hzdepb)
})
h <- h[(h$hzdepb - h$hzdept) > 0, ]
}
return(h)
}
# slice spc by intervals
# dep$df <- lapply(1:nrow(dep), function(x) h[0, ]) # pre-allocate memory
dep$df <- list(h[0, ])[rep(1, nrow(dep))] # pre-allocate memory faster
h <- {
split(dep, dep$id) ->.;
lapply(., function(x) {
x$df[[1]] <- cbind(.slice(h, top = x$top, bot = x$bot), segment_id = x$id)
return(x$df[[1]])
}) ->.;
do.call("rbind", .) ->.;
}
names(h)[names(h) %in% c("hzdept", "hzdepb")] <- hzdepcols
if (test_spc) {
h <- h[order(h$peid, h[[hzdepcols[1]]]), ]
# merge to re-add spc with NA
h_orig <- data.frame(peid = names(table(horizons(object)[peid])), stringsAsFactors = FALSE)
h <- merge(h_orig, h, by = "peid", all.x = TRUE, sort = FALSE)
rm(h_orig)
## TODO: consider adding a flag to indicate "new" horizon records that have been added
# rebuild SPC
names(h)[names(h) == "peid"] <- peid
names(h)[names(h) == "hzid"] <- hzid
h$hzID <- 1:nrow(h)
replaceHorizons(object) <- h
suppressMessages(hzidname(object) <- "hzID")
}
# return
if(test_spc){
return(suppressMessages(rebuildSPC(object))) # TODO: this is protection from missing-data/ID offset
} else {
return(h)
}
}
#' @title Dissolving horizon boundaries by grouping variables
#'
#' @description This function dissolves or combines horizons that have a common set of grouping variables. It only combines those horizon records that are sequential (e.g. share a horizon boundary). Thus, it can be used to identify discontinuities in the grouping variables along a profile and their unique depths. It is particularly useful for determining the depth to the top or bottom of horizons with a specific category, and should be simpler than previous methods that require aggregating over profiles.
#'
#' @param object a \code{data.frame}
#' @param by character: column names, to be used as grouping variables, within the object.
#' @param id character: column name of the pedon ID within the object.
#' @param hztop character: column name of the horizon top depth within the object.
#' @param hzbot character: column name of the horizon bottom depth in the object.
#' @param collapse logical: indicating whether to not combine grouping variables before dissolving.
#' @param order logical: indicating whether or not to order the object by the id, hztop, and hzbot columns.
#' #'
#' @details This function assumes the profiles and horizons within the object follow the logic defined by \code{checkHzDepthLogic} (e.g. records are ordered sequentially by id, hztop, and hzbot and without gaps). If the records are not ordered, set the \code{order = TRUE}.
#'
#' @return A \code{data.frame} with the original id, by grouping variables, and non-consecutive horizon depths.
#'
#' @author Stephen Roecker
#'
#' @seealso \code{\link{checkHzDepthLogic}}
#'
#' @export
#'
#' @examples
#'
#' # example 1
#' data(jacobs2000)
#' spc <- jacobs2000
#'
#' spc$dep_5 <- spc$depletion_pct >=5
#' spc$genhz <- generalize.hz(spc$name, c("A", "E", "B", "C"), c("A", "E", "B", "C"))
#' h <- horizons(spc)
#'
#' test <- dissolve_hz(h, by = c("genhz", "dep_5"), id = "id", hztop = "top", hzbot = "bottom")
#'
#' vars <- c("id", "top", "bottom", "genhz", "dep_5")
#' h[h$id == "92-1", vars]
#' test[test$id == "92-1", ]
#'
#'
#' # example 2
#' df <- data.frame(
#' peiid = 1,
#' hzdept = c(0, 5, 10, 15, 25, 50),
#' hzdepb = c(5, 10, 15, 25, 50, 100),
#' hzname = c("A1", "A2", "E/A", "2Bt1", "2Bt2", "2C"),
#' genhz = c("A", "A", "E", "2Bt", "2Bt", "2C"),
#' texcl = c("sil", "sil", "sil", "sl", "sl", "s")
#' )
#'
#' df
#'
#' dissolve_hz(df, c("genhz", "texcl"))
#' dissolve_hz(df, c("genhz", "texcl"), collapse = TRUE)
#'
#' test <- dissolve_hz(df, "genhz")
#' subset(test, value == "2Bt")
#'
dissolve_hz <- function(object, by, id = "peiid", hztop = "hzdept", hzbot = "hzdepb", collapse = FALSE, order = FALSE) {
# id = "peiid"; hztop = "hzdept"; hzbot = "hzdepb", collapse = FALSE, order = FALSE
# test inputs ----
# argument sanity check
# test_spc <- inherits(object, 'SoilProfileCollection')
# check that object & by are the right class
test_object <- inherits(object, "data.frame")
test_by <- inherits(by, "character")
if (!any(test_object | test_by)) {
stop("the object argument must be a data.frame, and by a character", call. = FALSE)
}
# check that by is not NULL
if (is.null(by)) stop("the by argument must not be NULL")
# check that collapse is a logical of length 1
if (!inherits(collapse, "logical") || length(collapse) != 1) {
stop("the collapse argument must be logical and a length of one", call. = FALSE)
}
# check that the column names exisit within the object
var_names <- c(id = id, top = hztop, bot = hzbot, by)
if (!all(var_names %in% names(object))) {
stop("all arguments must match object names")
}
# check that "by" are characters or convert
if (any(!"character" %in% sapply(object[by], class))) {
message("non-character grouping variables are being converted to characters")
object[by] <- lapply(object[by], as.character)
}
# standardize inputs ----
df <- object
idx_names <- sapply(var_names[1:3], function(x) which(names(df) == x))
names(df)[idx_names] <- names(var_names)[1:3]
# valid
# vd_idx <- validate_depths(df, id = "id", hztop = "hzdept", bot = "hzdepb")
if (order == TRUE) {
df <- df[order(df$id, df$top, df$bot), ]
}
if (collapse == TRUE) {
by_co <- paste(by, collapse = " & ")
df[by_co] <- apply(df[by], 1, paste, collapse = " & ")
by <- by_co
}
# var thickness ----
var_dep <- lapply(by, function(x) {
con_bot <- rle( paste(df$id, df[, x]))$length
con_top <- rle(rev(paste(df$id, df[, x])))$length
bot_idx <- cumsum(con_bot)
top_idx <- cumsum(con_top)
vd <- data.frame(
id = df[bot_idx, "id"],
top = rev(rev(df$top)[top_idx]),
bot = df[bot_idx, "bot"],
variable = x,
value = df[bot_idx, x]
)
return(vd)
})
var_dep <- do.call("rbind", var_dep)
# undo standardization ----
names(var_dep)[1:3] <- var_names[1:3]
return(var_dep)
}