-
Notifications
You must be signed in to change notification settings - Fork 1
/
age_string_parser.R
165 lines (148 loc) · 5.92 KB
/
age_string_parser.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
library(magrittr)
split_age_string <- function(x) {
#### modify input (fixing small details) ####
x <- gsub("\\+", "\u00B1", x) # replace + with \u00B1
#x <- gsub("(.*)(?=\\()", "\\1 ", x, perl = T) # missing space before parentheses
x <- gsub("AA-R-", "AAR- ", x) # fix wrong lab code (http://www.radiocarbon.org/Info/labcodes.html)
x <- gsub("Wk - ", "Wk-", x) # fix wrong lab code
x <- gsub("\u{00a0}", " ", x) # replace wrong space characters
x <- gsub(" ", " ", x)
#### construct result table ####
res <- tibble::tibble(
Date_C14_Labnr = rep(NA, length(x)),
Date_C14_Uncal_BP = NA,
Date_C14_Uncal_BP_Err = NA,
Date_BC_AD_Start = NA,
Date_BC_AD_Stop = NA,
Date_Type = NA
)
#### first rough determination of dating type info ####
none_ids <- which(is.na(x))
res$Date_Type[none_ids] <- "none"
present_ids <- grep("present", x)
res$Date_Type[present_ids] <- "modern"
c14_age_ids <- grep("±", x) # indizes of suspected radiocarbon dates
#### parse nice, full uncalibrated c14 dates ####
# extract real, nice radiocarbon dates
full_radiocarbon_dates <- stringr::str_extract_all(
x[c14_age_ids],
paste0(
"[0-9]{1,5}(\\s+)*\u00B1(\\s+)*[0-9]{1,4}( BP){0,1},{0,1}\\s{0,1}(", # pattern for age +/- std
paste(
c( # patterns for labnrs
"CNA4579.1.1",
"Beta [0-9]+",
"COL3897.1.1",
"A-",
"D-AMS-[0-9]*",
"AA84155",
"KIA44691",
"MAMS 21972",
"R-EVA1606/MAMS-[0-9]*",
"TÜBİTAK-[0-9]*",
"AAR-\\s[0-9]*",
"OxA-X-[0-9]*-[0-9]*",
"CIRCE-DSH-[0-9]*",
"ISGS-A[0-9]*",
"CEDAD-LTL[0-9]*A",
"[A-Za-z]{2,7}(\\s|-)[0-9]*" # that's the normal pattern, the others are deviating from that
),
collapse = "|"
),
")"
)
)
# if there is no real, nice date, then it can't be a proper C14 date at all
full_radiocarbon_date_consumed <- purrr::map_lgl(full_radiocarbon_dates, function(x) {length(x) > 0})
c14_age_ids_true <- c14_age_ids[full_radiocarbon_date_consumed]
# split date and labnr
full_radiocarbon_split <- purrr::map(full_radiocarbon_dates[full_radiocarbon_date_consumed], function(y) {
#if (length(stringr::str_split(y, c("\u00B1|( BP){0,1}, "))[[1]])<3) {print(y)}
stringr::str_split(y, c("\u00B1|( BP )|( BP){0,1},(\\s){0,1}")) %>%
purrr::transpose(c("uncal_age", "uncal_std", "labnr")) %>%
purrr::map(unlist)
}) %>% purrr::transpose()
# write uncalibrated dates into the result table
res$Date_C14_Uncal_BP[c14_age_ids_true] <- full_radiocarbon_split$uncal_age
res$Date_C14_Uncal_BP_Err[c14_age_ids_true] <- full_radiocarbon_split$uncal_std
res$Date_C14_Labnr[c14_age_ids_true] <- full_radiocarbon_split$labnr
#### parse unnamed radiocarbon dates ####
c14_age_ids_false <- c14_age_ids[!full_radiocarbon_date_consumed]
unnamed_radiocarbon_dates <- stringr::str_extract_all(
x[c14_age_ids_false],
"[0-9]{1,5}(\\s+)*\u00B1(\\s+)*[0-9]{1,4}"
) %>%
# if more than 2 dates are listed, then the first value is (usually?) not a real date
purrr::map(function(x) { if (length(x)>2) {x[-1]} else x })
unnamed_radiocarbon_split <- purrr::map(unnamed_radiocarbon_dates, function(y) {
stringr::str_split(y, c("\\s*\u00B1")) %>%
purrr::transpose(c("uncal_age", "uncal_std")) %>%
purrr::map(unlist)
}) %>% purrr::transpose()
# write uncalibrated dates into the result table
res$Date_C14_Uncal_BP[c14_age_ids_false] <- unnamed_radiocarbon_split$uncal_age
res$Date_C14_Uncal_BP_Err[c14_age_ids_false] <- unnamed_radiocarbon_split$uncal_std
#### finally fill Date_Type column ####
res$Date_Type[!is.na(res$Date_C14_Uncal_BP)] <- "C14"
res$Date_Type[is.na(res$Date_Type)] <- "contextual"
#### parse contextual (and simplified) ages ####
# remove parenthesis and split at space and minus
simple_age_split <- x %>%
stringr::str_replace_all("\\(", "") %>%
stringr::str_replace_all("\\)", "") %>%
stringr::str_split("\\s*-\\s*|-|\\s+")
# translate first elements of the vector to meaningful start and stop ages
stop <- start <- rep(NA, length(simple_age_split))
for (i in 1:length(simple_age_split)) {
# no age info
if (is.na(simple_age_split[[i]][1])) {
start[i] <- NA
stop[i] <- NA
next
}
# age below calibration range, e.g. >45000
if (grepl("^>", simple_age_split[[i]][1])) {
start[i] <- -Inf
stop[i] <- -as.numeric(gsub(">", "", simple_age_split[[i]][1]))
next
}
# no range: only one value e.g. 5000 BCE
if (length(simple_age_split[[i]]) == 2) {
if (simple_age_split[[i]][2] == "BCE") {
start[i] <- -as.numeric(simple_age_split[[i]][1])
stop[i] <- -as.numeric(simple_age_split[[i]][1])
next
}
if (simple_age_split[[i]][2] == "CE") {
start[i] <- as.numeric(simple_age_split[[i]][1])
stop[i] <- as.numeric(simple_age_split[[i]][1])
next
}
if (all(grepl("^[0-9]+$", simple_age_split[[i]]))) {
start[i] <- -as.numeric(simple_age_split[[i]][1])
stop[i] <- -as.numeric(simple_age_split[[i]][2])
next
}
}
# normal range 5000-4700 BCE
if (simple_age_split[[i]][3] %in% c("BCE", "calBCE")) {
start[i] <- -as.numeric(simple_age_split[[i]][1])
stop[i] <- -as.numeric(simple_age_split[[i]][2])
next
}
if (simple_age_split[[i]][3] %in% c("CE", "calCE")) {
start[i] <- as.numeric(simple_age_split[[i]][1])
stop[i] <- as.numeric(simple_age_split[[i]][2])
next
}
if (simple_age_split[[i]][2] %in% c("BCE", "calBCE") & simple_age_split[[i]][4] %in% c("CE", "calCE")) {
start[i] <- -as.numeric(simple_age_split[[i]][1])
stop[i] <- as.numeric(simple_age_split[[i]][3])
next
}
}
# write start and to the columns
res$Date_BC_AD_Start <- start
res$Date_BC_AD_Stop <- stop
return(res)
}