#!/bin/bash
# An incomplete script, more like notes really, for grabbing a
# copy of wiki.bash-hackers.org from archive.org
# This works by targeting pages captured by the Wayback Machine whose URLs
# specifically end in '?do=edit'.
# These pages present the original Dokuwiki Markup source of the respective page.
# So with a little massaging, we should be able to extract said Dokuwiki Markup.
# Ensure we have our required commands, otherwise fail early
cmd_err=0
for cmd in grep sed tr sort uniq mkdir curl jq pandoc git; do
  if ! command -v "${cmd}" >/dev/null 2>&1; then
    printf -- '%s\n' "This script requires ${cmd} but it was not found in PATH." >&2
    (( cmd_err++ ))
  fi
done
(( cmd_err > 0 )) && exit 1
# Where are we playing on the local file system?
basedir="$(git rev-parse --show-toplevel)"
# Prepend a string e.g.
# cmd: prepend foo bar
# out: foobar
# Also has a delimiter option e.g.
# cmd: prepend -d ';' foo bar
# out: foo;bar
prepend() {
  local _prepend_delimiter
  case "${1}" in
    (-d|--delimiter)
      _prepend_delimiter="${2}"
      shift 2
    ;;
  esac
  printf -- '%s\n' "${1}${_prepend_delimiter:-}${2}"
}
# Call archive.org's 'available' API to see if a site is available
# This will either return the URL of the most current snapshot, or null
# e.g.
# cmd: check_wayback_availability https://wiki.bash-hackers.org/howto/mutex?do=edit
# out: http://web.archive.org/web/20220615023742/https://wiki.bash-hackers.org/howto/mutex?do=edit
# vs
# cmd: check_wayback_availability https://contoso.com/pantsmcgee
# out: null
check_wayback_availability() {
  local remote_target
  remote_target="https://archive.org/wayback/available?url=${1:?No target specified}"
  curl -s -X GET "${remote_target}" | jq -r '.archived_snapshots.closest.url'
}
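# For reference, the 'available' API hands back JSON roughly shaped like this
# (illustrative, trimmed to the one field we actually use):
# cmd: curl -s 'https://archive.org/wayback/available?url=https://wiki.bash-hackers.org/howto/mutex?do=edit'
# out: {"archived_snapshots": {"closest": {"available": true, "url": "http://web.archive.org/web/20220615023742/...", ...}}}
# so jq's '.archived_snapshots.closest.url' gives us the snapshot URL, or 'null'
# when there's no capture at all.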
# Download the given target into a local file structure e.g.
# http://web.archive.org/web/20220706170849/https://wiki.bash-hackers.org/scripting/bashbehaviour?do=edit
# Will download into: "${basedir}/scripting/bashbehaviour.markup"
get_wayback_target() {
  local remote_target target_path target_dirname
  remote_target="${1:?No target specified}"
  # Strip out everything after 'bash-hackers.org' and '?do=edit' e.g.
  # http://web.archive.org/web/20220615023742/https://wiki.bash-hackers.org/howto/mutex?do=edit -> /howto/mutex
  target_path="$(sed -n 's/.*bash-hackers.org//p' <<< "${remote_target}" | sed -e 's/?do=edit//')"
  # If we already have it, bounce out
  if [[ -f "./${target_path}.markup" ]]; then
    printf -- '%s\n' "./${target_path}.markup appears to already exist. Remove it and re-run this script to force a fresh download."
    return 0
  fi
  # Get the dirname e.g. /howto/mutex -> /howto
  target_dirname="$(dirname "${target_path}")"
  # Create the path, ensuring that we strip the leading slash just-in-case e.g. /howto -> howto
  mkdir -p "${basedir:?FATAL - basedir unset}/${target_dirname/\//}"
  # Download the remote target to the local path
  printf -- '%s\n' "Retrieving into ./${target_path}.markup"
  curl -s -X GET "${remote_target}" | extract_markup - > "./${target_path}.markup"
}
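# e.g. (illustrative)
# cmd: get_wayback_target http://web.archive.org/web/20220615023742/https://wiki.bash-hackers.org/howto/mutex?do=edit
# out: the extracted markup lands in "${basedir}/howto/mutex.markup"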
# We want to pull internal-wiki links out of the markup fragments. These look like
# [[path:to:file]]
# But can be given friendly names like
# [[path:to:file | friendly name]]
# The friendly name might have leading spaces around the pipe char or not
# We start by grep'ing for '[['
# Then we filter out external links e.g. '[[http://contoso.com]]'
# Then we grep out just the link substrings
# Then we filter out friendly names (this could be brutally tidied up, it's late and I'm lazy right now)
# Then we turn the leading '[[' into '/' and the trailing ']]' into '?do=edit'
# Then we filter out unwanted garbage one last time
scrape_targets() {
  local source_file
  source_file="${1:?No target specified}"
  for source_file in "${@}"; do
    grep "\[\[" "${source_file}" |
      grep -v "\[\[http" |
      grep -o "\[\[.*\]\]" |
      sed -e 's/ | .*\]\]/]]/g' -e 's/| .*\]\]/]]/g' -e 's/|.*\]\]/]]/g' -e 's/\[\[/\//g' -e 's/\]\]/?do=edit/g' |
      tr ':' '/' |
      grep -Ev '==|!=|\$|#|/ |nowiki|ftp'
  done
}
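# e.g. (illustrative) a markup line containing
#   [[howto:mutex | A friendly name]]
# is emitted as
#   /howto/mutex?do=edit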
# Because of the structure of the downloaded files,
# we should be able to reliably extract our target Dokuwiki Markup.
# First, remove everything between the first line and 'name="sectok"'
# Next, remove everything after '</textarea>'
# This should remove everything above and below our desired Dokuwiki Markup
# We also take the opportunity to convert some HTML chars
extract_markup() {
  sed -e '1,/name="sectok"/d' \
    -e '/<\/textarea>/,$d' \
    -e 's/&gt;/>/g' \
    -e 's/&lt;/</g' \
    -e 's/&amp;/\&/g' \
    -e 's/&quot;/"/g' "${1:-/dev/stdin}"
}
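# For reference, the snapshotted edit pages look roughly like this (illustrative,
# and only as far as the sed expressions above assume):
#   ...page chrome... <input type="hidden" name="sectok" value="..." /><textarea ...>
#   ====== The page title ======
#   ...the Dokuwiki Markup we want, with &lt; &gt; &amp; &quot; entity-encoded...
#   </textarea> ...more page chrome...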
###### Beyond this point things get a little wishy-washy ######
(
cd "${basedir}" || exit 1
# If it's not already here, get the start page
if [[ ! -f start.markup ]]; then
get_wayback_target https://web.archive.org/web/20220930131429/https://wiki.bash-hackers.org/start?do=edit
fi
# Extract a list of targets from the start page
while read -r; do
prepend "https://wiki.bash-hackers.org" "${REPLY}"
done < <(scrape_targets start.markup) > raw_targets
# For each scraped target, validate that they're available from archive.org
# and in doing so, generate a list of the urls for the latest captures of each
while read -r; do
check_wayback_availability "${REPLY}"
done < raw_targets > waybacktargets
# Work through each of the above urls
while read -r; do
get_wayback_target "${REPLY}"
done < waybacktargets
  # Now we parse through the .markup files and try to generate a fresh list of raw_targets
  # With globstar enabled, this would simply be
  # scrape_targets **/*.markup | sort | uniq
  while read -r; do
    scrape_targets "${REPLY}"
  done < <(find . -name "*.markup") | sort | uniq > fresh_targets
  while read -r; do
    prepend "https://wiki.bash-hackers.org" "${REPLY}"
  done < fresh_targets > raw_targets
  # And as before, we generate a list of targets and retrieve them
  while read -r; do
    check_wayback_availability "${REPLY}"
  done < raw_targets > waybacktargets
  while read -r; do
    get_wayback_target "${REPLY}"
  done < waybacktargets
  # Next, we convert from dokuwiki markup to github markdown
  while read -r; do
    pandoc --from dokuwiki --to gfm --toc --no-highlight "${REPLY}" > "${REPLY/.markup/}.md"
  done < <(find . -name "*.markup")
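  # e.g. (illustrative)
  # cmd: pandoc --from dokuwiki --to gfm --toc --no-highlight ./howto/mutex.markup > ./howto/mutex.md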
)
# Ugh, screw the constant while read loops. Let's use globstar globbing from here.
# (The subshell above did its work from within "${basedir}"; hop there ourselves so the
# relative globs keep lining up, and enable globstar so '**' actually recurses.)
cd "${basedir}" || exit 1
shopt -s globstar
# Replace <note>
sed -i -e 's/\\<note\\>/\| :shell: /' ./**/*.md
# Replace <note important>
sed -i -e 's/\\<note important\\>/\| :loudspeaker: /' ./**/*.md
# Replace <note info>
sed -i -e 's/\\<note info\\>/\| :memo: /' ./**/*.md
# Replace <note tip>
sed -i -e 's/\\<note tip\\>/\| :bulb: /' ./**/*.md
# Replace <note warning>
sed -i -e 's/\\<note warning\\>/\| :warning: /' ./**/*.md
# Replace </note>
sed -i -e 's/\\<\/note\\>/ |\n| --- |/' ./**/*.md
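# e.g. (illustrative) after the above, a pandoc-escaped line like
#   \<note warning\>Don't do this\</note\>
# ends up as a one-cell table:
#   | :warning: Don't do this |
#   | --- |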
# Let's correct any internal links so that they point to their new .md homes
mapfile -t mdlist < <(grep -o "\[.*\]\(.*\)" start.md | grep -v http | cut -d '(' -f2 | cut -d ')' -f1 | cut -d '#' -f1 | grep '/' | sort | uniq)
while read -r; do
  for target in "${mdlist[@]}"; do
    sed -i -e "s|${target}|${target}.md|g" "${REPLY}"
  done
done < <(find . -name "*.md")
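# e.g. (illustrative) a link like [mutex](/howto/mutex) becomes [mutex](/howto/mutex.md)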
# Because the pandoc output has a mix of markdown tables and html tables
# the above does not capture links within html tables.
# We're getting to the point that the bulk of this capture is done
# and we're over to manual corrections. But something like this may help:
# grep -R "href=\"/" * | grep -v ".*/.*\.md"