#!/usr/bin/env Rscript
# Adapted for this jhudsl repository by Candace Savonen Mar 2022
# Summarize url checks
library(magrittr)
# Find .git root directory
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
output_file <- file.path(root_dir, 'check_reports', 'url_checks.tsv')
if (!dir.exists(file.path(root_dir, 'check_reports'))) {
  dir.create(file.path(root_dir, 'check_reports'))
}
# Declare ignore_urls file
ignore_urls_file <- file.path(root_dir, 'resources', 'ignore-urls.txt')
# Declare exclude_files.txt
exclude_file <- file.path(root_dir, 'resources', 'exclude_files.txt')
# Read in ignore urls file if it exists
if (file.exists(ignore_urls_file)) {
  ignore_urls <- readLines(ignore_urls_file)
} else {
  ignore_urls <- ""
}
# Read in exclude file if it exists
if (file.exists(exclude_file)) {
  exclude_file <- readLines(exclude_file)
} else {
  exclude_file <- ""
}
# Find all `.md` files, then drop any that match the patterns in exclude_files.txt
files <- list.files(path = root_dir, pattern = 'md$', full.names = TRUE, recursive = TRUE)
if (exclude_file[1] != "") files <- grep(paste0(exclude_file, collapse = "|"), files, invert = TRUE, value = TRUE)
test_url <- function(url) {
  if (url %in% ignore_urls) {
    message(paste0("Ignoring: ", url))
    return("ignored")
  }
  message(paste0("Testing: ", url))
  url_status <- try(httr::GET(url), silent = TRUE)
  # Fails if host can't be resolved
  status <- ifelse(suppressMessages(grepl("Could not resolve host", url_status)), "failed", "success")
  if (status == "success") {
    # Fails if 404'ed
    status <- ifelse(try(url_status$status_code, silent = TRUE) == 404, "failed", "success")
  }
  return(status)
}
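# Example (hypothetical URL, for illustration only):
# test_url("https://example.com")  # "success" unless the host fails to resolve or returns a 404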
get_urls <- function(file) {
  message(paste("##### Testing URLs from file:", file))
  # Read in a file and return the urls from it
  content <- readLines(file)
  # Set up the possible tags
  html_tag <- "<a href="
  include_url_tag <- "include_url\\("
  include_slide_tag <- "include_slide\\("
  markdown_tag <- "\\[.*\\]\\(http[s]?.*\\)"
  markdown_tag_bracket <- "\\[.*\\]: http[s]?"
  http_gen <- "http[s]?"
  url_pattern <- "[(|<]?http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
  # Other patterns
  nested_parens <- "\\((.*)\\((.*)\\)(.*)\\)"
  outermost_parens <- "^\\((.*)\\)(.*)$"
  # Collect the different kinds of tags in a named vector
  all_tags <- c(html = html_tag,
                knitr = include_url_tag,
                ottrpal = include_slide_tag,
                markdown = markdown_tag,
                markdown_bracket = markdown_tag_bracket,
                other_http = http_gen)
  url_list <- sapply(all_tags, grep, content, value = TRUE)
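  # Keep only generic http matches that were not already captured by one of the other tag types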
  url_list$other_http <- setdiff(url_list$other_http, unlist(url_list[-6]))
  # Extract the urls only of each type
  if (length(url_list$html) > 0) {
    url_list$html <- sapply(url_list$html, function(html_line) {
      head(rvest::html_attr(rvest::html_nodes(rvest::read_html(html_line), "a"), "href"))
    })
    url_list$html <- unlist(url_list$html)
  }
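  # Pull the quoted URL out of include_url("...") and include_slide("...") calls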
  url_list$knitr <- stringr::word(url_list$knitr, sep = "include_url\\(\"|\"\\)", 2)
  url_list$ottrpal <- stringr::word(url_list$ottrpal, sep = "include_slide\\(\"|\"\\)", 2)
  # Check markdown for parentheticals outside of [ ]( )
  parens_index <- sapply(url_list$markdown, stringr::str_detect, nested_parens)
  if (length(parens_index) >= 1) {
    # Break down to parenthetical only
    url_list$markdown[parens_index] <- stringr::str_extract(url_list$markdown[parens_index], nested_parens)
    # Remove parentheticals outside [ ]( )
    url_list$markdown[parens_index] <- stringr::word(stringr::str_replace(url_list$markdown[parens_index], outermost_parens, "\\1"), sep = "\\]", 2)
    url_list$markdown[!parens_index] <- stringr::word(url_list$markdown[!parens_index], sep = "\\]", 2)
    url_list$markdown <- grep("http", url_list$markdown, value = TRUE)
  }
  if (length(url_list$markdown_bracket) > 0) {
    url_list$markdown_bracket <- paste0("http", stringr::word(url_list$markdown_bracket, sep = "\\]: http", 2))
  }
  url_list$other_http <- stringr::word(stringr::str_extract(url_list$other_http, url_pattern), sep = "\\]", 1)
  # Remove parentheses only if they are on the outside
  url_list$other_http <- stringr::word(stringr::str_replace(url_list$other_http, outermost_parens, "\\1"), sep = "\\]", 1)
  url_list$markdown <- stringr::word(stringr::str_replace(url_list$markdown, outermost_parens, "\\1"), sep = "\\]", 1)
  # Remove `< >`
  url_list$other_http <- stringr::word(stringr::str_replace(url_list$other_http, "^<(.*)>(.*)$", "\\1"), sep = "\\]", 1)
  # If after the manipulations there's not actually a URL, remove it
  url_list <- lapply(url_list, na.omit)
  # Collapse the list into a single character vector
  urls <- unlist(url_list)
  # Remove trailing characters
  urls <- gsub("\\'\\:$|\\'|\\:$|\\.$|\\)$|\\,$", "", urls)
  # Remove URLs that are in the ignore list
  if (ignore_urls[1] != "") urls <- grep(paste0(ignore_urls, collapse = "|"), urls, invert = TRUE, value = TRUE)
  if (length(urls) > 0) {
    # Test each URL
    urls_status <- sapply(urls, test_url)
    url_df <- data.frame(urls, urls_status, file)
    return(url_df)
  } else {
    message("No URLs found")
  }
}
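# Example (hypothetical file name, for illustration only):
# get_urls("intro.md") returns a data frame with columns `urls`, `urls_status`, and `file`,
# or returns NULL (after a "No URLs found" message) when the file contains no URLs.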
# Run this for all markdown files
all_urls <- lapply(files, get_urls)
# Combine results; keep only the failed URLs and write them out
all_urls_df <- dplyr::bind_rows(all_urls)
if (nrow(all_urls_df) > 0) {
  all_urls_df <- all_urls_df %>%
    dplyr::filter(urls_status == "failed") %>%
    readr::write_tsv(output_file)
} else {
  all_urls_df <- data.frame(errors = NA)
}
# Print out how many URL check errors were found
write(nrow(all_urls_df), stdout())
# Save the failed URLs to file
readr::write_tsv(all_urls_df, output_file)
message(paste0("Saved to: ", output_file))