-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwebsite_to_pdf
executable file
·201 lines (158 loc) · 4.99 KB
/
website_to_pdf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
#!/bin/zsh
# This program accepts a url seed for a website and generates a combined PDF output.
# Requirements:
# - a Bourne-compatible shell (only tested for zsh)
# - Homebrew package manager
# - enquirer
# - linkchecker
# - imagemagick
# - ocrmypdf
# - img2pdf
source .env
apiflash_endpoint="https://api.apiflash.com/v1/urltoimage"
## Check if homebrew package manager is available
echo 'Checking dependencies...'
echo 'Brew binary exists?'
if command -v brew > /dev/null 2>&1; then
echo OK
else
echo 'Brew not found. Please go to https://brew.sh/ and follow the installation instructions, then run this script again.'
exit 1
fi
## Check for linkchecker
echo 'LinkChecker exists?'
if command -v linkchecker > /dev/null 2>&1; then
echo OK
else
echo 'LinkChecker not installed. Please go to https://github.com/linkchecker/linkchecker and follow the installation instructions, then run this script again.'
exit 1
fi
## Taken from https://gist.github.com/cdown/1163649
urlencode() {
# urlencode <string>
old_lc_collate=$LC_COLLATE
LC_COLLATE=C
local length="${#1}"
for (( i = 0; i < length; i++ )); do
local c="${1:$i:1}"
case $c in
[a-zA-Z0-9.~_-]) printf '%s' "$c" ;;
*) printf '%%%02X' "'$c" ;;
esac
done
LC_COLLATE=$old_lc_collate
}
## Check for required homebrew formulas
brew_formulas=()
echo 'Imagemagick exists?'
if brew ls --versions imagemagick > /dev/null; then
echo OK
else
echo 'imagemagick not installed. Queueing for installation...'
brew_formulas+=('imagemagick')
fi
echo 'OCRmyPDF exists?'
if brew ls --versions ocrmypdf > /dev/null; then
echo OK
else
echo 'OCRmyPDF not installed. Queueing for installation...'
brew_formulas+=('ocrmypdf')
fi
echo 'Enquirer exists?'
if brew ls --versions enquirer > /dev/null; then
echo OK
else
echo 'Enquirer not installed. Installing tap formula...'
brew tap termapps/tap
brew install enquirer
fi
echo 'img2pdf exists?'
if command -v img2pdf > /dev/null 2>&1; then
echo OK
else
echo 'img2pdf not installed. Please go to https://github.com/josch/img2pdf and follow the installation instructions, then run this script again.'
exit 1
fi
## Install missing homebrew formulas
for i in "${brew_formulas[@]}"
do
brew install $i
done
url=$(enquirer input -m "Enter URL seed: ")
echo "You entered ${url}"
# url_root=${url:0:18}
url_root=${url:0:18}
# trip_id=${url:8:11}
trip_id=${url:45:20}
fresh=$(enquirer confirm -m "Get fresh screenshots? (y/N)")
outdir="website-to-pdf-${trip_id}"
dry_run=$(enquirer confirm -m "Dry run? (y/N)")
preserve_page_breaks=$(enquirer confirm -m "Preserve original page breaks? (y/N) Selecting 'y' may result in suboptimal print layout.")
echo 'Gathering individual URLs...'
urls=($(linkchecker ${url} --verbose -r=1 --file-output=csv | grep '^Real URL' | sed 's/Real URL //g' | grep ${url_root}))
## Uncomment to echo linkchecker results
# for i in "${urls[@]}"
# do
# echo "Link ${i}"
# done
urls_sorted=($(enquirer sort -m "Sort order of pages:" ${urls[@]}))
## Uncomment to echo sorted list
# for i in "${urls_sorted[@]}"
# do
# echo "Sorted ${i}"
# done
## Create a place for the screenshots
if [ "$dry_run" = "false" ]; then
mkdir -p ${outdir}
if [ "$preserve_page_breaks" = "false" ]; then
mkdir -p ${outdir}/cropped
fi
fi
## Loop through the URLS array and fetch screenshots of each
j=0
for i in "${urls_sorted[@]}"
do
if [[ ! $i =~ .*\.(svg|png|js|css|xml)$ ]]; then
echo "Generating screenshot of $i..."
u=$(urlencode "${i}")
request=("${apiflash_endpoint}?access_key=${access_key}&url=${u}&width=1280&full_page=true&fresh=${fresh}&format=jpeg&quality=100")
echo "Generating screenshot of request $request..."
if [ "$dry_run" = "false" ]; then
curl --request GET --url "$request" > ${outdir}/apiflash-${j}.jpg
fi
echo DONE
j=$((j + 1))
fi
done
## Combine all screenshots into single image
if [ "$preserve_page_breaks" = "false" ]; then
echo 'Combining screenshots...'
if [ "$dry_run" = "false" ]; then
convert -append ${outdir}/*.jpg ${outdir}/full.jpg
fi
echo DONE
fi
## Chop the aggregate image into equal-sized images for better print results
if [ "$preserve_page_breaks" = "false" ]; then
echo 'Normalizing sizes...'
if [ "$dry_run" = "false" ]; then
convert -crop 1280x1920 +repage ${outdir}/full.jpg ${outdir}/cropped/cropped_%d.jpg
fi
echo DONE
fi
## Combine images into a letter-sized PDF file, then run the output PDF through OCR
echo 'Building PDF...'
if [ "$dry_run" = "false" ]; then
if [ "$preserve_page_breaks" = "false" ]; then
img2pdf ${outdir}/cropped/*.jpg --pagesize Letter --imgsize 8inx10.5in --border 1cm:0 | ocrmypdf --output-type pdf - ${outdir}/complete.pdf
else
img2pdf ${outdir}/*.jpg --pagesize Letter --imgsize 8inx10.5in --border 1cm:0 | ocrmypdf --output-type pdf - ${outdir}/complete.pdf
fi
fi
echo DONE
## Report the final result
if [ "$dry_run" = "false" ]; then
echo "Find the complete PDF at ${outdir}/complete.pdf"
else
echo "Dry run complete."
fi