website_to_pdf

#!/bin/zsh
# This program accepts a url seed for a website and generates a combined PDF output.
# Requirements:
#  - a Bourne-compatible shell (only tested for zsh)
#  - Homebrew package manager
#  - enquirer
#  - linkchecker
#  - imagemagick
#  - ocrmypdf
#  - img2pdf


source .env
apiflash_endpoint="https://api.apiflash.com/v1/urltoimage"


## Check if homebrew package manager is available
echo 'Checking dependencies...'
echo 'Brew binary exists?'
if command -v brew > /dev/null 2>&1; then
  echo OK
else
  echo 'Brew not found. Please go to https://brew.sh/ and follow the installation instructions, then run this script again.'
  exit 1
fi


## Check for linkchecker
echo 'LinkChecker exists?'
if command -v linkchecker > /dev/null 2>&1; then
  echo OK
else
  echo 'LinkChecker not installed. Please go to https://github.com/linkchecker/linkchecker and follow the installation instructions, then run this script again.'
  exit 1
fi


## Taken from https://gist.github.com/cdown/1163649
urlencode() {
  # urlencode <string>

  old_lc_collate=$LC_COLLATE
  LC_COLLATE=C

  local length="${#1}"
  for (( i = 0; i < length; i++ )); do
      local c="${1:$i:1}"
      case $c in
          [a-zA-Z0-9.~_-]) printf '%s' "$c" ;;
          *) printf '%%%02X' "'$c" ;;
      esac
  done

  LC_COLLATE=$old_lc_collate
}


## Check for required homebrew formulas
brew_formulas=()
echo 'Imagemagick exists?'
if brew ls --versions imagemagick > /dev/null; then
  echo OK
else
  echo 'imagemagick not installed. Queueing for installation...'
  brew_formulas+=('imagemagick')
fi


echo 'OCRmyPDF exists?'
if brew ls --versions ocrmypdf > /dev/null; then
  echo OK
else
  echo 'OCRmyPDF not installed. Queueing for installation...'
  brew_formulas+=('ocrmypdf')
fi


echo 'Enquirer exists?'
if brew ls --versions enquirer > /dev/null; then
  echo OK
else
  echo 'Enquirer not installed. Installing tap formula...'
  brew tap termapps/tap
  brew install enquirer
fi


echo 'img2pdf exists?'
if command -v img2pdf > /dev/null 2>&1; then
  echo OK
else
  echo 'img2pdf not installed. Please go to https://github.com/josch/img2pdf and follow the installation instructions, then run this script again.'
  exit 1
fi


## Install missing homebrew formulas
for i in "${brew_formulas[@]}"
do
  brew install $i
done


url=$(enquirer input -m "Enter URL seed: ")
echo "You entered ${url}"


# url_root=${url:0:18}
url_root=${url:0:18}
# trip_id=${url:8:11}
trip_id=${url:45:20}
fresh=$(enquirer confirm -m "Get fresh screenshots? (y/N)")
outdir="website-to-pdf-${trip_id}"
dry_run=$(enquirer confirm -m "Dry run? (y/N)")
preserve_page_breaks=$(enquirer confirm -m "Preserve original page breaks? (y/N) Selecting 'y' may result in suboptimal print layout.")


echo 'Gathering individual URLs...'
urls=($(linkchecker ${url} --verbose -r=1 --file-output=csv | grep '^Real URL' | sed 's/Real URL   //g' | grep ${url_root}))

## Uncomment to echo linkchecker results
# for i in "${urls[@]}"
# do
# echo "Link ${i}"
# done


urls_sorted=($(enquirer sort -m "Sort order of pages:" ${urls[@]}))

## Uncomment to echo sorted list
# for i in "${urls_sorted[@]}"
# do
# echo "Sorted ${i}"
# done


## Create a place for the screenshots
if [ "$dry_run" = "false" ]; then
  mkdir -p ${outdir}
  if [ "$preserve_page_breaks" = "false" ]; then
    mkdir -p ${outdir}/cropped
  fi
fi


## Loop through the URLS array and fetch screenshots of each
j=0
for i in "${urls_sorted[@]}"
do
   if [[ ! $i =~ .*\.(svg|png|js|css|xml)$ ]]; then
    echo "Generating screenshot of $i..."
    u=$(urlencode "${i}")
    request=("${apiflash_endpoint}?access_key=${access_key}&url=${u}&width=1280&full_page=true&fresh=${fresh}&format=jpeg&quality=100")
    echo "Generating screenshot of request $request..."
    if [ "$dry_run" = "false" ]; then
      curl --request GET --url "$request" > ${outdir}/apiflash-${j}.jpg
    fi
    echo DONE
    j=$((j + 1))
  fi
done


## Combine all screenshots into single image
if [ "$preserve_page_breaks" = "false" ]; then
  echo 'Combining screenshots...'
  if [ "$dry_run" = "false" ]; then
    convert -append ${outdir}/*.jpg ${outdir}/full.jpg
  fi
  echo DONE
fi


## Chop the aggregate image into equal-sized images for better print results
if [ "$preserve_page_breaks" = "false" ]; then
  echo 'Normalizing sizes...'
  if [ "$dry_run" = "false" ]; then
    convert -crop 1280x1920 +repage ${outdir}/full.jpg ${outdir}/cropped/cropped_%d.jpg
  fi
  echo DONE
fi


## Combine images into a letter-sized PDF file, then run the output PDF through OCR
echo 'Building PDF...'
if [ "$dry_run" = "false" ]; then
  if [ "$preserve_page_breaks" = "false" ]; then
    img2pdf ${outdir}/cropped/*.jpg --pagesize Letter --imgsize 8inx10.5in --border 1cm:0 | ocrmypdf --output-type pdf - ${outdir}/complete.pdf
  else
    img2pdf ${outdir}/*.jpg --pagesize Letter --imgsize 8inx10.5in --border 1cm:0 | ocrmypdf --output-type pdf - ${outdir}/complete.pdf
  fi
fi
echo DONE


## Report the final result
if [ "$dry_run" = "false" ]; then
  echo "Find the complete PDF at ${outdir}/complete.pdf"
else
  echo "Dry run complete."
fi