From a84a215ff7d4e6d7e450143da5ef780336da8bf0 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sun, 22 Mar 2020 17:02:02 -0400 Subject: [PATCH 01/92] readme: Remove extra parenthesis --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d71f294..2b52be2 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Disposable VMs to perform conversion of potentially untrusted (e.g. maliciously malformed) PDF files into safe-to-view PDF files. This is done by having the Disposable VM perform the complex (and potentially -buggy) rendering of the PDF in question) and sending the resulting RGB bitmap +buggy) rendering of the PDF in question and sending the resulting RGB bitmap (simple representation) to the client AppVM. The client AppVM can _trivially_ verify the received data are indeed the simple representation, and then construct a new PDF out of the received bitmap. Of course the price we pay for From 4e0d63566613c1c7acdbaee5478947553d3cf90e Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Mon, 23 Mar 2020 22:39:12 -0400 Subject: [PATCH 02/92] wrapper: Update qvm-convert-pdf into Python 3 This commit also adds more robust argument parsing in anticipation of future options and filepath existence checks to avoid potentially wasteful qrexec-client-vm runs. 
--- qvm-convert-pdf | 70 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 64 insertions(+), 6 deletions(-) diff --git a/qvm-convert-pdf b/qvm-convert-pdf index 3469e34..e2c13d2 100755 --- a/qvm-convert-pdf +++ b/qvm-convert-pdf @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env python3 # # The Qubes OS Project, http://www.qubes-os.org # @@ -20,9 +20,67 @@ # # -if [ $# -ne 1 ] ; then - echo usage: $0 '' - exit 1 -fi +import argparse +import os +import sys -exec /usr/bin/qrexec-client-vm '$dispvm' qubes.PdfConvert /usr/lib/qubes/qpdf-convert-client "$@" +PROG_NAME = os.path.basename(sys.argv[0]) +QREXEC_CLIENT = '/usr/bin/qrexec-client-vm' + +def version(): + print(f'{PROG_NAME} version 0.1') + sys.exit(0) + +class ArgumentParser(argparse.ArgumentParser): + '''Overriding class for custom help message.''' + def print_help(self): + print(f'''usage: {PROG_NAME} [OPTIONS ...] [FILE ...] + +Options: + --help Show this help message and exit. + --version Show version information and exit.''') + sys.exit(0) + +def parser_new(): + '''Create a command-line parser + + :rtype: tuple + ''' + parser = ArgumentParser() + + if len(sys.argv) == 1: + parser.print_help() + + parser.add_argument('--version', action='store_true', default=False) + + return parser.parse_known_args() + +def parse_args(args): + if args.version: + version() + +def check_pdf_paths(untrusted_pdfs): + for untrusted_pdf in untrusted_pdfs: + if not os.path.exists(untrusted_pdf): + print(f'"{untrusted_pdf}": No such file') + sys.exit(1) + +def main(): + args, untrusted_pdfs = parser_new() + parse_args(args) + check_pdf_paths(untrusted_pdfs) + + # ? 
Flush since we're not coming back from os.execvp() + # sys.stdin.flush() + # sys.stdout.flush() + # sys.stderr.flush() + + # TODO: Pass in our multiple files + os.execvp(QREXEC_CLIENT, (QREXEC_CLIENT, '$dispvm', 'qubes.PdfConvert', + '/usr/lib/qubes/qpdf-convert-client', sys.argv[1])) + +if __name__ == '__main__': + try: + main() + except KeyboardInterrupt as e: + sys.exit(0) From 7ef5b33f97c915019f9ce19d8d808b3202e4f9be Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Wed, 25 Mar 2020 18:22:23 -0400 Subject: [PATCH 03/92] wrapper: Add logging and trim options --- qvm-convert-pdf | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/qvm-convert-pdf b/qvm-convert-pdf index e2c13d2..b60ef2c 100755 --- a/qvm-convert-pdf +++ b/qvm-convert-pdf @@ -21,24 +21,23 @@ # import argparse +import logging import os import sys PROG_NAME = os.path.basename(sys.argv[0]) QREXEC_CLIENT = '/usr/bin/qrexec-client-vm' -def version(): - print(f'{PROG_NAME} version 0.1') - sys.exit(0) +logging.basicConfig(level=logging.INFO, stream=sys.stderr, + format='%(module)s (%(levelname)s): %(message)s') class ArgumentParser(argparse.ArgumentParser): '''Overriding class for custom help message.''' def print_help(self): - print(f'''usage: {PROG_NAME} [OPTIONS ...] [FILE ...] + print(f'''usage: {PROG_NAME} [OPTIONS ...] FILE Options: - --help Show this help message and exit. 
- --version Show version information and exit.''') + --help Show this help message and exit.''') sys.exit(0) def parser_new(): @@ -51,18 +50,19 @@ def parser_new(): if len(sys.argv) == 1: parser.print_help() - parser.add_argument('--version', action='store_true', default=False) + # parser.add_argument('-v', '--verbose', action='count', default=0) return parser.parse_known_args() def parse_args(args): - if args.version: - version() + # if args.version: + # version() + return def check_pdf_paths(untrusted_pdfs): for untrusted_pdf in untrusted_pdfs: if not os.path.exists(untrusted_pdf): - print(f'"{untrusted_pdf}": No such file') + logging.error(f'"{untrusted_pdf}": No such file') sys.exit(1) def main(): From 9668bfba214a374c689197e54c66896bba2daf86 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Wed, 25 Mar 2020 18:28:00 -0400 Subject: [PATCH 04/92] wrapper: Prepare for multiple file support --- qvm-convert-pdf | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/qvm-convert-pdf b/qvm-convert-pdf index b60ef2c..640b798 100755 --- a/qvm-convert-pdf +++ b/qvm-convert-pdf @@ -70,14 +70,10 @@ def main(): parse_args(args) check_pdf_paths(untrusted_pdfs) - # ? 
Flush since we're not coming back from os.execvp() - # sys.stdin.flush() - # sys.stdout.flush() - # sys.stderr.flush() - - # TODO: Pass in our multiple files - os.execvp(QREXEC_CLIENT, (QREXEC_CLIENT, '$dispvm', 'qubes.PdfConvert', - '/usr/lib/qubes/qpdf-convert-client', sys.argv[1])) + # TODO: Handle os.execl() error (maybe with os._exit(127) + os.execvp(QREXEC_CLIENT, [QREXEC_CLIENT, '$dispvm', 'qubes.PdfConvert', + '/usr/lib/qubes/qpdf-convert-client', + *untrusted_pdfs]) if __name__ == '__main__': try: From a014fb317014fa14df07df16d68b3f53722e967a Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Wed, 25 Mar 2020 18:28:56 -0400 Subject: [PATCH 05/92] wrapper: Remove unneeded main() try block --- qvm-convert-pdf | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/qvm-convert-pdf b/qvm-convert-pdf index 640b798..6ce873f 100755 --- a/qvm-convert-pdf +++ b/qvm-convert-pdf @@ -76,7 +76,6 @@ def main(): *untrusted_pdfs]) if __name__ == '__main__': - try: - main() - except KeyboardInterrupt as e: - sys.exit(0) + # No need to wrap this in a try block since we're + # never returning from execl() + main() From fd9175ff1d4c9a7ea017b588de30296baf83e680 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Thu, 26 Mar 2020 17:07:29 -0400 Subject: [PATCH 06/92] wrapper: Remove logging --- qvm-convert-pdf | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/qvm-convert-pdf b/qvm-convert-pdf index 6ce873f..b144110 100755 --- a/qvm-convert-pdf +++ b/qvm-convert-pdf @@ -21,16 +21,12 @@ # import argparse -import logging import os import sys PROG_NAME = os.path.basename(sys.argv[0]) QREXEC_CLIENT = '/usr/bin/qrexec-client-vm' -logging.basicConfig(level=logging.INFO, stream=sys.stderr, - format='%(module)s (%(levelname)s): %(message)s') - class ArgumentParser(argparse.ArgumentParser): '''Overriding class for custom help message.''' def print_help(self): @@ -62,7 +58,7 @@ def parse_args(args): def check_pdf_paths(untrusted_pdfs): for untrusted_pdf in 
untrusted_pdfs: if not os.path.exists(untrusted_pdf): - logging.error(f'"{untrusted_pdf}": No such file') + print(f'"{untrusted_pdf}": No such file') sys.exit(1) def main(): From 0a7cfae503ba12a93d6c9ec6a239ec65820db12e Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Thu, 26 Mar 2020 17:11:05 -0400 Subject: [PATCH 07/92] client: Update to Python 3 --- qpdf-convert-client | 321 ++++++++++++++++++++++++++------------------ 1 file changed, 187 insertions(+), 134 deletions(-) diff --git a/qpdf-convert-client b/qpdf-convert-client index 2fdb2ce..50f54c9 100755 --- a/qpdf-convert-client +++ b/qpdf-convert-client @@ -1,8 +1,9 @@ -#!/bin/bash -# +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + # The Qubes OS Project, http://www.qubes-os.org # -# Copyright (C) 2013 Joanna Rutkowska +# Copyright (C) 2013 Joanna Rutkowska # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -17,134 +18,186 @@ # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -# -# Requires: -# - ImageMagick (convert) - -INPUT_FILE="$1" -RCVD_FILE=$(mktemp --tmpdir qpdf-conversion-XXXXXXXX) -CONVERTED_FILE="$(dirname "$1")/$(basename "$1" .pdf).trusted.pdf" -CONVERTED_FILE_PARTIAL="$CONVERTED_FILE".part.pdf - - -MAX_PAGES=10000 -MAX_IMG_WIDTH=10000 -MAX_IMG_HEIGHT=10000 -IMG_DEPTH=8 -MAX_IMG_SIZE=$((MAX_IMG_WIDTH * MAX_IMG_HEIGHT * 3)) - -VERBOSE=1 -if [ -n "$PROGRESS_FOR_GUI" ]; then - VERBOSE=0; -fi - -die() { - reason="$1" - if [ -n "$PROGRESS_FOR_GUI" ]; then - zenity --error --title="PDF conversion error" --text="$reason" - else - echo "$reason" >&2 - fi - exit 1 -} - - -# Send the input (untrusted) file to the server... -[ $VERBOSE -ge 1 ] && echo "-> Sending file to a Disposable VM..." >&2 -cat "$INPUT_FILE" -exec >&- - -# ... 
and get the recvd *simple* representation: - -# Note: the server might be compromised at this point so, it can very well send -# us something else than the simple representation. Thus we explicitly specify -# input format to ImageMagick's convert via "rgb:" prefix, forcing it to -# interpret whatever stream of bytes it gets on input as a simple RGB array. We -# hope that when using this RGB format explicitly (which is the simplest format -# for bitmaps in the known universe), there is no space for offending bug in -# image parsing... - -# First, get the no of pages: -read -r NO_PAGES -if [[ ! "$NO_PAGES" =~ ^[1-9][0-9]*$ ]] || [[ $NO_PAGES -le 0 ]] || [[ $NO_PAGES -gt $MAX_PAGES ]] ; then - die "The remote party return invalid no of pages, aborting!" -fi - -[ $VERBOSE -ge 1 ] && echo "-> Waiting for converted samples..." >&2 - -PAGE=1 -while [ $PAGE -le "$NO_PAGES" ]; do - read -r IMG_WIDTH IMG_HEIGHT - if [ $VERBOSE -eq 1 ]; then - echo -n "-> Receiving page $PAGE out of $NO_PAGES..." >&2 - printf "\r" >&2 - elif [ $VERBOSE -gt 1 ]; then - echo "-> Receiving page $PAGE out of $NO_PAGES..." >&2 - fi - if [[ ! "$IMG_WIDTH" =~ ^[1-9][0-9]*$ ]] || [ "$IMG_WIDTH" -le 0 ] || [ "$IMG_WIDTH" -gt $MAX_IMG_WIDTH ] || \ - [[ ! "$IMG_HEIGHT" =~ ^[1-9][0-9]*$ ]] || [ "$IMG_HEIGHT" -le 0 ] || [ "$IMG_HEIGHT" -gt $MAX_IMG_HEIGHT ]; then - die "The remote party return invalid image geometry info, aborting!" - fi - [ $VERBOSE -ge 2 ] && echo "--> page geometry: $IMG_WIDTH x $IMG_HEIGHT x $IMG_DEPTH" >&2 - IMG_SIZE=$((IMG_WIDTH*IMG_HEIGHT*3)) - if [ $IMG_SIZE -le 0 ] || [ $IMG_SIZE -gt $MAX_IMG_SIZE ]; then - die "Calculated image size is invalid, aborting!" 
- fi - # save the simplified RGB image into a temp PDF file: - RGB_FILE=$RCVD_FILE-$PAGE.rgb - PNG_FILE=$RCVD_FILE-$PAGE.png - PDF_FILE=$RCVD_FILE-$PAGE.pdf - head -c $IMG_SIZE > "$RGB_FILE" - RCVD_IMG_SIZE=$(stat -c %s "$RGB_FILE") - if [ "$RCVD_IMG_SIZE" -ne $IMG_SIZE ]; then - die "The remote party return invalid no of bytes of the RGB file, aborting!" - fi - # here, the important part is that we *explicitly* specify RGB as the input format via "rgb:" - # We first convert to a (compressed) PNG to create smaller output files - convert_msgs=$(convert -size "${IMG_WIDTH}x${IMG_HEIGHT}" -depth ${IMG_DEPTH} rgb:"$RGB_FILE" png:"$PNG_FILE" 2>&1) - # shellcheck disable=SC2181 - if [ $? -ne 0 ]; then - die "Page $PAGE conversion failed (RGB->PNG): $convert_msgs" - fi - rm -f "$RGB_FILE" - - # now convert the (trusted but compressed) PNG into PDF for easy assembly... - convert_msgs=$(convert "$PNG_FILE" "$PDF_FILE" 2>&1) - # shellcheck disable=SC2181 - if [ $? -ne 0 ]; then - die "Page $PAGE conversion failed (PNG->PDF): $convert_msgs" - fi - rm -f "$PNG_FILE" - - if [ $PAGE -gt 1 ]; then - convert_msgs=$(pdfunite "$CONVERTED_FILE" "$PDF_FILE" "$CONVERTED_FILE_PARTIAL" 2>&1) - # shellcheck disable=SC2181 - if [ $? 
-ne 0 ]; then - die "Error merging converted page: $convert_msgs" - fi - mv "$CONVERTED_FILE_PARTIAL" "$CONVERTED_FILE" || die - else - mv "$PDF_FILE" "$CONVERTED_FILE" || die - fi - rm -f "$PDF_FILE" || die - - PAGE=$((PAGE+1)) - - [ -n "$PROGRESS_FOR_GUI" ] && echo $(((PAGE - 1) * 90 / NO_PAGES)) >& "$SAVED_FD_1" -done - -if [ $VERBOSE -eq 1 ]; then - echo >&2 -fi - -[ $VERBOSE -ge 1 ] && echo "-> Converted PDF saved as: $CONVERTED_FILE" >&2 - -mkdir -p "$HOME/QubesUntrustedPDFs" -ORIG_FILE="$HOME/QubesUntrustedPDFs/$(basename "$INPUT_FILE")" -mv "$INPUT_FILE" "${ORIG_FILE}" || die "Moving original file failed" -[ $VERBOSE -ge 1 ] && echo "-> Original file saved as $ORIG_FILE" >&2 - -# Cleanup -rm -f "$RCVD_FILE"* -[ -n "$PROGRESS_FOR_GUI" ] && echo "100" >& "$SAVED_FD_1" -exit 0 + +from collections import namedtuple +import os +from PIL import Image +import subprocess +import sys +import tempfile + +PROG_NAME = os.path.basename(sys.argv[0]) +ARCHIVE_PATH = f"{os.path.expanduser('~')}/QubesUntrustedPDFs" + +MAX_PAGES = 10000 +MAX_IMG_WIDTH = 10000 +MAX_IMG_HEIGHT = 10000 +MAX_IMG_SIZE = MAX_IMG_WIDTH * MAX_IMG_HEIGHT * 3 + + +############################### +# Utilities +############################### + +def check_range(val, upper): + if (not 1 <= val <= upper) or (not 1 <= val <= upper): + raise ValueError + + +############################### +# Files & Images +############################### + +def receive_page_count(): + # TODO: Can't the # of pages be modified by a malicious server? + try: + page_count = int(input()) + check_range(page_count, MAX_PAGES) + except ValueError as err: + print("Invalid number of pages returned... aborting!", file=sys.stderr) + sys.exit(1) + + return page_count + +def receive_img_measurements(): + return (int(val) for val in input().split(' ', 2)) + +def get_img_size(width, height): + size = width * height * 3 + + if size > MAX_IMG_SIZE: + print("Calculated image size is too large... 
aborting!", + file=sys.stderr) + sys.exit(1) + + return size + +def get_img_dimensions(): + depth = 8 + + try: + width, height = receive_img_measurements() + check_range(width, MAX_IMG_WIDTH) + check_range(height, MAX_IMG_HEIGHT) + except ValueError as err: + print("Invalid image geometry returned... aborting!", file=sys.stderr) + sys.exit(1) + + size = get_img_size(width, height) + + Dimensions = namedtuple('Dimensions', 'width height depth size') + return Dimensions(width=width, height=height, size=size, depth=depth) + +def get_tmp_rgb(rgb_path, size): + # XXX: For some reason, this leaves us missing alot of bytes + # rcvd_bytes = input().encode('utf-8', 'surrogateescape') + # rcvd_bytes = rcvd_bytes[:dimensions.size] + + # XXX: Example of using PIL for performant PNG -> JPG. Maybe use this? + # png = Image.open(object.logo.path) + # png.load() # required for png.split() + # background = Image.new("RGB", png.size, (255, 255, 255)) + # background.paste(png, mask=png.split()[3]) # 3 is the alpha channel + # background.save('foo.jpg', 'JPEG', quality=80) + + with open(rgb_path, 'wb') as rgb_f: + # FIXME: Why can't we get this in pure Python????? This isn't + # Windows-compatible + subprocess.run(['head', '-c', f'{size}'], stdout=rgb_f, check=True) + + if os.path.getsize(rgb_f.name) != size: + print('Invalid number of bytes in RGB file... aborting!', + file=sys.stderr) + os.remove(rgb_path) + sys.exit(1) + +def rgb_to_png(rgb_path, png_path, dimensions, page): + try: + subprocess.run(['convert', + '-size' , f'{dimensions.width}x{dimensions.height}', + '-depth', f'{dimensions.depth}', + f'rgb:{rgb_path}', f'png:{png_path}'], check=True) + except subprocess.CalledProcessError as err: + print(f'Page {page} conversion failed (RGB->PNG): {err}', + file=sys.stderr) + sys.exit(1) + else: + os.remove(rgb_path) + +def convert_received_rgb_file(dimensions, page): + # TODO: Does the number of X's matter? 
+ with tempfile.NamedTemporaryFile(prefix='qpdf-conversion-') as rcvd_f: + rgb_path = f'{rcvd_f.name}-{page}.rgb' + png_path = f'{rcvd_f.name}-{page}.png' + get_tmp_rgb(rgb_path, dimensions.size) + rgb_to_png(rgb_path, png_path, dimensions, page) + + return png_path + +def process_pdf(untrusted_pdf, page_count): + page = 1 + images = [] + trusted_pdf = f'{os.path.splitext(untrusted_pdf)[0]}.trusted.pdf' + + print("Waiting for converted sample...", file=sys.stderr) + while page <= page_count: + dimensions = get_img_dimensions() + + # TODO: There's some weird verbose condition here in the og script + print(f'Receiving page {page}/{page_count}...', end='\r', + file=sys.stderr) + png_path = convert_received_rgb_file(dimensions, page) + images.append(Image.open(png_path)) + + page += 1 + else: + print('', file=sys.stderr) + + # TODO: Maybe it'd be better to save->delete png over and over again to + # avoid storing all PNGs in memory + images[0].save(trusted_pdf, 'PDF', resolution=100.0, save_all=True, + append_images=images[1:]) + + for img in images: + img.close() + os.remove(img.filename) + + print(f'Converted PDF saved as: {trusted_pdf}', file=sys.stderr) + +def archive_pdf(untrusted_pdf): + archived_pdf = f'{ARCHIVE_PATH}/{os.path.basename(untrusted_pdf)}' + os.rename(untrusted_pdf, archived_pdf) + print(f'Original PDF saved as: {archived_pdf}', file=sys.stderr) + +def mkdir_archive(): + if not os.path.exists(ARCHIVE_PATH): + os.mkdir(ARCHIVE_PATH) + +############################### +# qrexec-related +############################### + +def send_pdf_file(untrusted_pdf): + print('Sending file to a Disposable VM...', file=sys.stderr) + with open(untrusted_pdf, 'rb') as f: + sys.stdout.buffer.write(f.read()) + os.close(sys.stdout.fileno()) + +def process_pdfs(untrusted_pdfs): + # TODO: Remove [0] when support for multiple PDFs is available + # for untrusted_pdf in untrusted_pdfs: + send_pdf_file(untrusted_pdfs[0]) + page_count = receive_page_count() + 
process_pdf(untrusted_pdfs[0], page_count) + archive_pdf(untrusted_pdfs[0]) + +def main(): + untrusted_pdfs = sys.argv[1:] + mkdir_archive() + process_pdfs(untrusted_pdfs) + +if __name__ == '__main__': + try: + main() + except KeyboardInterrupt as e: + sys.exit(0) From 4d88ecfb503608582393d3b7680ea4af148a62e2 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sun, 29 Mar 2020 15:15:12 -0400 Subject: [PATCH 08/92] wrapper, client: Re-add logging Using the logging module makes more sense than using print() in Qubes since it just implicitly has more meaning. --- qpdf-convert-client | 3 +++ qvm-convert-pdf | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/qpdf-convert-client b/qpdf-convert-client index 50f54c9..f850326 100755 --- a/qpdf-convert-client +++ b/qpdf-convert-client @@ -20,6 +20,7 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. from collections import namedtuple +import logging import os from PIL import Image import subprocess @@ -34,6 +35,8 @@ MAX_IMG_WIDTH = 10000 MAX_IMG_HEIGHT = 10000 MAX_IMG_SIZE = MAX_IMG_WIDTH * MAX_IMG_HEIGHT * 3 +logging.basicConfig(format='%(message)s', stream=sys.stderr) + ############################### # Utilities diff --git a/qvm-convert-pdf b/qvm-convert-pdf index b144110..650625f 100755 --- a/qvm-convert-pdf +++ b/qvm-convert-pdf @@ -21,12 +21,28 @@ # import argparse +import logging import os import sys PROG_NAME = os.path.basename(sys.argv[0]) QREXEC_CLIENT = '/usr/bin/qrexec-client-vm' + +############################### +# Utilities +############################### + +def die(msg): + logging.basicConfig(format='%(message)s', stream=sys.stderr) + logging.error(msg) + sys.exit(1) + + +############################### +# Parsing +############################### + class ArgumentParser(argparse.ArgumentParser): '''Overriding class for custom help message.''' def print_help(self): From 588a9e2ae14de22c00056f69d58fff0d24a61635 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sun, 
29 Mar 2020 15:16:21 -0400 Subject: [PATCH 09/92] wrapper: PEP 8 --- qvm-convert-pdf | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/qvm-convert-pdf b/qvm-convert-pdf index 650625f..83e6301 100755 --- a/qvm-convert-pdf +++ b/qvm-convert-pdf @@ -74,20 +74,24 @@ def parse_args(args): def check_pdf_paths(untrusted_pdfs): for untrusted_pdf in untrusted_pdfs: if not os.path.exists(untrusted_pdf): - print(f'"{untrusted_pdf}": No such file') - sys.exit(1) + die(f'{untrusted_pdf}: No such file') + + +############################### +# Main +############################### def main(): + # TODO: Move parsing into qpdf-convert-client args, untrusted_pdfs = parser_new() parse_args(args) check_pdf_paths(untrusted_pdfs) # TODO: Handle os.execl() error (maybe with os._exit(127) - os.execvp(QREXEC_CLIENT, [QREXEC_CLIENT, '$dispvm', 'qubes.PdfConvert', - '/usr/lib/qubes/qpdf-convert-client', - *untrusted_pdfs]) + cmd = [QREXEC_CLIENT, 'disp8051', 'qubes.PdfConvert', + '/usr/lib/qubes/qpdf-convert-client', *untrusted_pdfs] + os.execvp(QREXEC_CLIENT, cmd) if __name__ == '__main__': - # No need to wrap this in a try block since we're - # never returning from execl() + # No need to wrap this in a try block since we never return from execl() main() From 516cd211ea3ffbe7fd362ad55fe1cd3f9d0ec5ba Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sun, 29 Mar 2020 15:19:17 -0400 Subject: [PATCH 10/92] client: Add qrexec wrapper functions send() makes more sense to someone unfamiliar with how qrexec works than a normal print() does. 
--- qpdf-convert-client | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/qpdf-convert-client b/qpdf-convert-client index f850326..0571541 100755 --- a/qpdf-convert-client +++ b/qpdf-convert-client @@ -42,9 +42,28 @@ logging.basicConfig(format='%(message)s', stream=sys.stderr) # Utilities ############################### +def info(msg, suffix=None): + print(msg, end=suffix, flush=True, file=sys.stderr) + +def die(msg): + logging.error(msg) + sys.exit(1) + +def send(data): + print(data, flush=True) + +def send_b(data): + sys.stdout.buffer.write(data) + +def recv(): + return input() + +def recv_b(): + return sys.stdin.buffer.read() + def check_range(val, upper): if (not 1 <= val <= upper) or (not 1 <= val <= upper): - raise ValueError + raise ValueError ############################### From 552d2f49257fd4c34cd2dc22d92af8c1af60620f Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sun, 29 Mar 2020 15:20:00 -0400 Subject: [PATCH 11/92] client: PEP 8 --- qpdf-convert-client | 154 ++++++++++++++++++++++---------------------- 1 file changed, 76 insertions(+), 78 deletions(-) diff --git a/qpdf-convert-client b/qpdf-convert-client index 0571541..34a9032 100755 --- a/qpdf-convert-client +++ b/qpdf-convert-client @@ -25,7 +25,7 @@ import os from PIL import Image import subprocess import sys -import tempfile +from tempfile import NamedTemporaryFile PROG_NAME = os.path.basename(sys.argv[0]) ARCHIVE_PATH = f"{os.path.expanduser('~')}/QubesUntrustedPDFs" @@ -67,50 +67,36 @@ def check_range(val, upper): ############################### -# Files & Images +# Image-related ############################### -def receive_page_count(): - # TODO: Can't the # of pages be modified by a malicious server? - try: - page_count = int(input()) - check_range(page_count, MAX_PAGES) - except ValueError as err: - print("Invalid number of pages returned... 
aborting!", file=sys.stderr) - sys.exit(1) - - return page_count +def recv_measurements(): + return (int(val) for val in recv().split(' ', 2)) -def receive_img_measurements(): - return (int(val) for val in input().split(' ', 2)) - -def get_img_size(width, height): +def get_size(width, height): size = width * height * 3 if size > MAX_IMG_SIZE: - print("Calculated image size is too large... aborting!", - file=sys.stderr) - sys.exit(1) + die("Calculated image size is too large... aborting!") return size -def get_img_dimensions(): +def get_dimensions(): depth = 8 try: - width, height = receive_img_measurements() + width, height = recv_measurements() check_range(width, MAX_IMG_WIDTH) check_range(height, MAX_IMG_HEIGHT) - except ValueError as err: - print("Invalid image geometry returned... aborting!", file=sys.stderr) - sys.exit(1) + except ValueError: + die("Invalid image geometry returned... aborting!") - size = get_img_size(width, height) + size = get_size(width, height) + Dimensions = namedtuple('Dimensions', ['width', 'height', 'depth', 'size']) - Dimensions = namedtuple('Dimensions', 'width height depth size') return Dimensions(width=width, height=height, size=size, depth=depth) -def get_tmp_rgb(rgb_path, size): +def recv_rgb(rgb_path, size): # XXX: For some reason, this leaves us missing alot of bytes # rcvd_bytes = input().encode('utf-8', 'surrogateescape') # rcvd_bytes = rcvd_bytes[:dimensions.size] @@ -125,55 +111,78 @@ def get_tmp_rgb(rgb_path, size): with open(rgb_path, 'wb') as rgb_f: # FIXME: Why can't we get this in pure Python????? This isn't # Windows-compatible - subprocess.run(['head', '-c', f'{size}'], stdout=rgb_f, check=True) + cmd = ['head', '-c', str(size)] + subprocess.run(cmd, stdout=rgb_f, check=True) if os.path.getsize(rgb_f.name) != size: - print('Invalid number of bytes in RGB file... aborting!', - file=sys.stderr) os.remove(rgb_path) - sys.exit(1) + die('Invalid number of bytes in RGB file... 
aborting!') def rgb_to_png(rgb_path, png_path, dimensions, page): - try: - subprocess.run(['convert', - '-size' , f'{dimensions.width}x{dimensions.height}', - '-depth', f'{dimensions.depth}', - f'rgb:{rgb_path}', f'png:{png_path}'], check=True) - except subprocess.CalledProcessError as err: - print(f'Page {page} conversion failed (RGB->PNG): {err}', - file=sys.stderr) - sys.exit(1) - else: - os.remove(rgb_path) + cmd = ['convert', '-size', f'{dimensions.width}x{dimensions.height}', + '-depth', str(dimensions.depth), f'rgb:{rgb_path}', + f'png:{png_path}'] -def convert_received_rgb_file(dimensions, page): - # TODO: Does the number of X's matter? - with tempfile.NamedTemporaryFile(prefix='qpdf-conversion-') as rcvd_f: + try: + subprocess.run(cmd, check=True) + except subprocess.CalledProcessError: + die(f'Page {page} conversion failed (RGB->PNG)... aborting!') + else: + os.remove(rgb_path) + +def convert_rgb(dimensions, page): + with NamedTemporaryFile(prefix='qpdf-conversion-') as rcvd_f: rgb_path = f'{rcvd_f.name}-{page}.rgb' png_path = f'{rcvd_f.name}-{page}.png' - get_tmp_rgb(rgb_path, dimensions.size) + recv_rgb(rgb_path, dimensions.size) rgb_to_png(rgb_path, png_path, dimensions, page) return png_path -def process_pdf(untrusted_pdf, page_count): + +############################### +# PDF-related +############################### + +def recv_page_count(): + try: + page_count = int(recv()) + check_range(page_count, MAX_PAGES) + except ValueError: + die("Invalid number of pages returned... 
aborting!") + + return page_count + +def archive_pdf(untrusted_pdf): + archived_pdf = f'{ARCHIVE_PATH}/{os.path.basename(untrusted_pdf)}' + os.rename(untrusted_pdf, archived_pdf) + info(f'Original PDF saved as: {archived_pdf}') + +def send_pdf_file(untrusted_pdf): + info('Sending file to a Disposable VM...') + with open(untrusted_pdf, 'rb') as f: + send_b(f.read()) + os.close(sys.stdout.fileno()) + +def process_pdf_file(untrusted_pdf, page_count): page = 1 images = [] trusted_pdf = f'{os.path.splitext(untrusted_pdf)[0]}.trusted.pdf' - print("Waiting for converted sample...", file=sys.stderr) + info("Waiting for converted sample...") + while page <= page_count: - dimensions = get_img_dimensions() + dimensions = get_dimensions() + + info(f'Receiving page {page}/{page_count}...', '\r') # TODO: There's some weird verbose condition here in the og script - print(f'Receiving page {page}/{page_count}...', end='\r', - file=sys.stderr) - png_path = convert_received_rgb_file(dimensions, page) + png_path = convert_rgb(dimensions, page) images.append(Image.open(png_path)) page += 1 else: - print('', file=sys.stderr) + info('') # TODO: Maybe it'd be better to save->delete png over and over again to # avoid storing all PNGs in memory @@ -184,42 +193,31 @@ def process_pdf(untrusted_pdf, page_count): img.close() os.remove(img.filename) - print(f'Converted PDF saved as: {trusted_pdf}', file=sys.stderr) - -def archive_pdf(untrusted_pdf): - archived_pdf = f'{ARCHIVE_PATH}/{os.path.basename(untrusted_pdf)}' - os.rename(untrusted_pdf, archived_pdf) - print(f'Original PDF saved as: {archived_pdf}', file=sys.stderr) - -def mkdir_archive(): - if not os.path.exists(ARCHIVE_PATH): - os.mkdir(ARCHIVE_PATH) - -############################### -# qrexec-related -############################### - -def send_pdf_file(untrusted_pdf): - print('Sending file to a Disposable VM...', file=sys.stderr) - with open(untrusted_pdf, 'rb') as f: - sys.stdout.buffer.write(f.read()) - 
os.close(sys.stdout.fileno()) + info(f'Converted PDF saved as: {trusted_pdf}') def process_pdfs(untrusted_pdfs): # TODO: Remove [0] when support for multiple PDFs is available # for untrusted_pdf in untrusted_pdfs: send_pdf_file(untrusted_pdfs[0]) - page_count = receive_page_count() - process_pdf(untrusted_pdfs[0], page_count) + page_count = recv_page_count() + process_pdf_file(untrusted_pdfs[0], page_count) archive_pdf(untrusted_pdfs[0]) + +############################### +# Main +############################### + def main(): untrusted_pdfs = sys.argv[1:] - mkdir_archive() + + if not os.path.exists(ARCHIVE_PATH): + os.mkdir(ARCHIVE_PATH) + process_pdfs(untrusted_pdfs) if __name__ == '__main__': try: main() - except KeyboardInterrupt as e: - sys.exit(0) + except KeyboardInterrupt: + die("KeyboardInterrupt... aborting!") From f6c90fb6de93b4199da2c9e0dbcb507fbabde67a Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sun, 29 Mar 2020 15:21:20 -0400 Subject: [PATCH 12/92] server: Refactor to Python 3 --- qpdf-convert-server | 208 +++++++++++++++++++++++++++++++++----------- 1 file changed, 156 insertions(+), 52 deletions(-) diff --git a/qpdf-convert-server b/qpdf-convert-server index 8997090..1e2e162 100755 --- a/qpdf-convert-server +++ b/qpdf-convert-server @@ -1,8 +1,9 @@ -#!/bin/bash -# +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + # The Qubes OS Project, http://www.qubes-os.org # -# Copyright (C) 2013 Joanna Rutkowska +# Copyright (C) 2013 Joanna Rutkowska # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -17,52 +18,155 @@ # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
-# -# Requires: -# - poppler-utils (pdftocairo, pdfinfo) -# - ImageMagick (convert) - -INPUT_FILE=$(mktemp --tmpdir qpdf-conversion-XXXXXXXX) -TEMP_PNG_FILE=$(mktemp --tmpdir qpdf-conversion-XXXXXXXX.png) -TEMP_RGB_FILE=$(mktemp --tmpdir qpdf-conversion-XXXXXXXX.pdf) -IMG_DEPTH=8 - -# Get the original (untrusted) PDF file... -cat > "$INPUT_FILE" - -# now, let's convert it into a simple representation, -# and send back to the client. - -# Note that we might be compromised at this point (due to exploitation of PDF -# parsing code) and so what we're sending back might very well be something -# totally different than a decent simple representation -- the client should -# never trust what we're sending back, and should discard anything that doesn't -# look like the simple representation! - -NO_PAGES=$(pdfinfo "$INPUT_FILE" | grep -a "^Pages:" | sed -e "s/^Pages:[^0-9]*//") -if [ -z "$NO_PAGES" ]; then - # Perhaps this is not a PDF, only some JPG/PNG/etc? Let's try it anyway... - NO_PAGES=1 -fi -echo $NO_PAGES - -cd /tmp || exit 1 -PAGE=1 -while [ $PAGE -le $NO_PAGES ]; do - # if pdftocairo fails, lets try the ImageMagick's convert -- perhaps this is just some img file? - pdftocairo "$INPUT_FILE" -png -f $PAGE -l $PAGE -singlefile "$(basename "$TEMP_PNG_FILE" .png)" || \ - convert "$INPUT_FILE" png:"$TEMP_PNG_FILE" - IMG_WIDTH=$(identify -format "%w" "$TEMP_PNG_FILE") - IMG_HEIGHT=$(identify -format "%h" "$TEMP_PNG_FILE") - convert "$TEMP_PNG_FILE" -depth $IMG_DEPTH rgb:"$TEMP_RGB_FILE" - echo "$IMG_WIDTH $IMG_HEIGHT" - cat "$TEMP_RGB_FILE" - PAGE=$((PAGE + 1)) -done - -# Cleanup tmp files... -# Note: our DispVM might get destroyed before the commands below -# complete, but that doesn't hurt us, because this is... well a DispVM. 
-rm -f "$INPUT_FILE" -rm -f "$TEMP_PNG_FILE" -rm -f "$TEMP_RGB_FILE" + +from collections import namedtuple +import logging +import os +import subprocess +import sys +from tempfile import NamedTemporaryFile + +logging.basicConfig(format='%(message)s', stream=sys.stderr) + + +############################### +# Utilities +############################### + +def info(msg, suffix=None): + print(msg, end=suffix, flush=True, file=sys.stderr) + +def die(msg): + logging.error(msg) + sys.exit(1) + +def send(data): + print(data, flush=True) + +def send_b(data): + sys.stdout.buffer.write(data) + +def recv_b(): + return sys.stdin.buffer.read() + + +############################### +# Image-related +############################### + +def send_dimensions(png_path): + cmd1 = ['identify', '-format', '%w', png_path] + cmd2 = ['identify', '-format', '%h', png_path] + + try: + width = subprocess.check_output(cmd1, text=True) + height = subprocess.check_output(cmd2, text=True) + except CalledProcessError: + die("Failed to gather dimensions... 
Aborting") + + send(f'{width} {height}') + +def send_rgb_file(rgb_path): + with open(rgb_path, 'rb') as f: + data = f.read() + send_b(data) + +def pdf_to_png(page, pdf_path, png_path): + png_filename = os.path.splitext(png_path)[0] + cmd = ['pdftocairo', pdf_path, '-png', '-f', str(page), '-l', str(page), + '-singlefile', png_filename] + + try: + subprocess.run(cmd, check=True) + except subprocess.CalledProcessError: + cmd = ['convert', pdf_path, f'png:{png_path}'] + try: + subprocess.run(cmd, check=True) + except subprocess.CalledProcessError: + die(f'Page {page} conversion failed (PDF->PNG): {err}') + +def png_to_rgb(png_path, rgb_path): + depth = 8 + cmd = ['convert', png_path, '-depth', str(depth), f'rgb:{rgb_path}'] + + try: + subprocess.run(cmd, check=True) + except subprocess.CalledProcessError: + die(f'Page {page} conversion failed (PNG->RGB): {err}') + + +############################### +# File-related +############################### + +def get_tmp_files(): + Files = namedtuple('Files', ['pdf', 'png', 'rgb']) + suffixes = ('', '.png', '.rgb') + paths = [] + + for suffix in suffixes: + with NamedTemporaryFile(prefix='qpdf-conversion-', suffix=suffix) as f: + paths.append(f.name) + + return Files(pdf=paths[0], png=paths[1], rgb=paths[2]) + +def make_tmp_files(paths): + for path in paths: + with open(path, 'wb') as f: + f.write(b'') + + +############################### +# PDF-related +############################### + +def recv_pdf(pdf_path): + with open(pdf_path, 'wb') as f: + data = recv_b() + f.write(data) + +def get_page_count(pdf_path): + pages = 0 + output = None + cmd = ['pdfinfo', pdf_path] + + try: + output = subprocess.check_output(cmd) + except subprocess.CalledProcessError: + info(f'Probably not a PDF...') + else: + for line in output.decode().splitlines(): + if 'Pages:' in line: + pages = int(line.split(':')[1]) + + return pages + +def process_pdf(paths): + page = 1 + + pages = get_page_count(paths.pdf) + send(pages) + + while (page <= pages): + 
pdf_to_png(page, paths.pdf, paths.png) + send_dimensions(paths.png) + png_to_rgb(paths.png, paths.rgb) + send_rgb_file(paths.rgb) + page += 1 + + +############################### +# Main +############################### + +def main(): + paths = get_tmp_files() + make_tmp_files(paths) + recv_pdf(paths.pdf) + process_pdf(paths) + +if __name__ == '__main__': + try: + main() + except KeyboardInterrupt: + die("KeyboardInterrupt... aborting!") From 8a8283d884e32210c5a9a8d0835f6e3fd1d7a2db Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sun, 29 Mar 2020 17:34:13 -0400 Subject: [PATCH 13/92] client, server: Add documentation --- qpdf-convert-client | 33 ++++++++++++++++++++++++++++----- qpdf-convert-server | 19 +++++++++++++++++++ 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/qpdf-convert-client b/qpdf-convert-client index 34a9032..405ca66 100755 --- a/qpdf-convert-client +++ b/qpdf-convert-client @@ -43,25 +43,37 @@ logging.basicConfig(format='%(message)s', stream=sys.stderr) ############################### def info(msg, suffix=None): + '''Qrexec wrapper for displaying information + + `suffix` is typically only ever used when `msg` needs to overwrite + the line of the previous message (so as to imitate an updating + line). This is done by setting `suffix` to '\r'. 
+ ''' print(msg, end=suffix, flush=True, file=sys.stderr) def die(msg): + '''Qrexec wrapper for displaying error messages''' logging.error(msg) sys.exit(1) def send(data): + '''Qrexec wrapper for sending text data to the client's STDOUT''' print(data, flush=True) def send_b(data): + '''Qrexec wrapper for sending binary data to the client's STDOUT''' sys.stdout.buffer.write(data) def recv(): + '''Qrexec wrapper for receiving text data from a client''' return input() def recv_b(): + '''Qrexec wrapper for receiving binary data from a client''' return sys.stdin.buffer.read() def check_range(val, upper): + '''Raises a ValueError if the value isn't between 1 and some upper bound''' if (not 1 <= val <= upper) or (not 1 <= val <= upper): raise ValueError @@ -71,9 +83,11 @@ def check_range(val, upper): ############################### def recv_measurements(): + '''Receive image measurements for a PDF page from server''' return (int(val) for val in recv().split(' ', 2)) def get_size(width, height): + '''Compute image size based on the received measurements''' size = width * height * 3 if size > MAX_IMG_SIZE: @@ -82,6 +96,7 @@ def get_size(width, height): return size def get_dimensions(): + '''Compute image dimensions based on received measurements''' depth = 8 try: @@ -97,6 +112,7 @@ def get_dimensions(): return Dimensions(width=width, height=height, size=size, depth=depth) def recv_rgb(rgb_path, size): + '''Receive a presumably clean RGB file of a PDF page from server''' # XXX: For some reason, this leaves us missing alot of bytes # rcvd_bytes = input().encode('utf-8', 'surrogateescape') # rcvd_bytes = rcvd_bytes[:dimensions.size] @@ -119,6 +135,7 @@ def recv_rgb(rgb_path, size): die('Invalid number of bytes in RGB file... 
aborting!') def rgb_to_png(rgb_path, png_path, dimensions, page): + '''Convert an RGB file to a PNG file''' cmd = ['convert', '-size', f'{dimensions.width}x{dimensions.height}', '-depth', str(dimensions.depth), f'rgb:{rgb_path}', f'png:{png_path}'] @@ -131,6 +148,7 @@ def rgb_to_png(rgb_path, png_path, dimensions, page): os.remove(rgb_path) def convert_rgb(dimensions, page): + '''Driver for receiving and converting RGB files''' with NamedTemporaryFile(prefix='qpdf-conversion-') as rcvd_f: rgb_path = f'{rcvd_f.name}-{page}.rgb' png_path = f'{rcvd_f.name}-{page}.png' @@ -145,6 +163,7 @@ def convert_rgb(dimensions, page): ############################### def recv_page_count(): + '''Receive number of pages in PDF file from server''' try: page_count = int(recv()) check_range(page_count, MAX_PAGES) @@ -153,18 +172,21 @@ def recv_page_count(): return page_count -def archive_pdf(untrusted_pdf): - archived_pdf = f'{ARCHIVE_PATH}/{os.path.basename(untrusted_pdf)}' - os.rename(untrusted_pdf, archived_pdf) - info(f'Original PDF saved as: {archived_pdf}') - def send_pdf_file(untrusted_pdf): + '''Send untrusted PDF file to server''' info('Sending file to a Disposable VM...') with open(untrusted_pdf, 'rb') as f: send_b(f.read()) os.close(sys.stdout.fileno()) +def archive_pdf(untrusted_pdf): + '''Move original untrusted PDF to an archive''' + archived_pdf = f'{ARCHIVE_PATH}/{os.path.basename(untrusted_pdf)}' + os.rename(untrusted_pdf, archived_pdf) + info(f'Original PDF saved as: {archived_pdf}') + def process_pdf_file(untrusted_pdf, page_count): + '''Process an untrusted PDF page and save the trusted result''' page = 1 images = [] trusted_pdf = f'{os.path.splitext(untrusted_pdf)[0]}.trusted.pdf' @@ -196,6 +218,7 @@ def process_pdf_file(untrusted_pdf, page_count): info(f'Converted PDF saved as: {trusted_pdf}') def process_pdfs(untrusted_pdfs): + '''Wrapper for PDF processing''' # TODO: Remove [0] when support for multiple PDFs is available # for untrusted_pdf in untrusted_pdfs: 
send_pdf_file(untrusted_pdfs[0]) diff --git a/qpdf-convert-server b/qpdf-convert-server index 1e2e162..805a7ab 100755 --- a/qpdf-convert-server +++ b/qpdf-convert-server @@ -34,19 +34,29 @@ logging.basicConfig(format='%(message)s', stream=sys.stderr) ############################### def info(msg, suffix=None): + '''Qrexec wrapper for displaying information + + `suffix` is typically only ever used when `msg` needs to overwrite + the line of the previous message (so as to imitate an updating + line). This is done by setting `suffix` to '\r'. + ''' print(msg, end=suffix, flush=True, file=sys.stderr) def die(msg): + '''Qrexec wrapper for displaying error messages''' logging.error(msg) sys.exit(1) def send(data): + '''Qrexec wrapper for sending text data to the client's STDOUT''' print(data, flush=True) def send_b(data): + '''Qrexec wrapper for sending binary data to the client's STDOUT''' sys.stdout.buffer.write(data) def recv_b(): + '''Qrexec wrapper for receiving binary data from a client''' return sys.stdin.buffer.read() @@ -55,6 +65,7 @@ def recv_b(): ############################### def send_dimensions(png_path): + '''Send dimensions of untrusted PNG file to client for conversion''' cmd1 = ['identify', '-format', '%w', png_path] cmd2 = ['identify', '-format', '%h', png_path] @@ -67,11 +78,13 @@ def send_dimensions(png_path): send(f'{width} {height}') def send_rgb_file(rgb_path): + '''Send presumably clean RGB file to client''' with open(rgb_path, 'rb') as f: data = f.read() send_b(data) def pdf_to_png(page, pdf_path, png_path): + '''Convert an untrusted PDF page into an intermediate PNG file''' png_filename = os.path.splitext(png_path)[0] cmd = ['pdftocairo', pdf_path, '-png', '-f', str(page), '-l', str(page), '-singlefile', png_filename] @@ -86,6 +99,7 @@ def pdf_to_png(page, pdf_path, png_path): die(f'Page {page} conversion failed (PDF->PNG): {err}') def png_to_rgb(png_path, rgb_path): + '''Convert PNG file into a presumably clean RGB file''' depth = 8 cmd = 
['convert', png_path, '-depth', str(depth), f'rgb:{rgb_path}'] @@ -100,6 +114,7 @@ def png_to_rgb(png_path, rgb_path): ############################### def get_tmp_files(): + '''Return random temporary file names for images and the untrusted PDF''' Files = namedtuple('Files', ['pdf', 'png', 'rgb']) suffixes = ('', '.png', '.rgb') paths = [] @@ -111,6 +126,7 @@ def get_tmp_files(): return Files(pdf=paths[0], png=paths[1], rgb=paths[2]) def make_tmp_files(paths): + '''Create temporary files to store images and the untrusted PDF''' for path in paths: with open(path, 'wb') as f: f.write(b'') @@ -121,11 +137,13 @@ def make_tmp_files(paths): ############################### def recv_pdf(pdf_path): + '''Receive untrusted PDF file from client''' with open(pdf_path, 'wb') as f: data = recv_b() f.write(data) def get_page_count(pdf_path): + '''Get number of pages in the untrusted PDF file''' pages = 0 output = None cmd = ['pdfinfo', pdf_path] @@ -142,6 +160,7 @@ def get_page_count(pdf_path): return pages def process_pdf(paths): + '''Process pages of the untrusted PDF file''' page = 1 pages = get_page_count(paths.pdf) From 5cfaab7aa9d134f589b0f7b132139bd43dfb9244 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sun, 29 Mar 2020 17:35:53 -0400 Subject: [PATCH 14/92] server: Make check_output() return a String instead of a bytestring --- qpdf-convert-server | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qpdf-convert-server b/qpdf-convert-server index 805a7ab..9b381b8 100755 --- a/qpdf-convert-server +++ b/qpdf-convert-server @@ -149,11 +149,11 @@ def get_page_count(pdf_path): cmd = ['pdfinfo', pdf_path] try: - output = subprocess.check_output(cmd) + output = subprocess.check_output(cmd, text=True) except subprocess.CalledProcessError: info(f'Probably not a PDF...') else: - for line in output.decode().splitlines(): + for line in output.splitlines(): if 'Pages:' in line: pages = int(line.split(':')[1]) From ec754fc9a20c757c61e3e11c1d09084dae51e367 Mon Sep 17 
00:00:00 2001 From: Jason Phan Date: Sun, 29 Mar 2020 17:36:54 -0400 Subject: [PATCH 15/92] wrapper: Add documentation --- qvm-convert-pdf | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/qvm-convert-pdf b/qvm-convert-pdf index 83e6301..15954c5 100755 --- a/qvm-convert-pdf +++ b/qvm-convert-pdf @@ -34,6 +34,7 @@ QREXEC_CLIENT = '/usr/bin/qrexec-client-vm' ############################### def die(msg): + '''Qrexec wrapper for displaying error messages''' logging.basicConfig(format='%(message)s', stream=sys.stderr) logging.error(msg) sys.exit(1) @@ -46,17 +47,14 @@ def die(msg): class ArgumentParser(argparse.ArgumentParser): '''Overriding class for custom help message.''' def print_help(self): - print(f'''usage: {PROG_NAME} [OPTIONS ...] FILE + print(f'''\ +usage: {PROG_NAME} [OPTIONS ...] FILE Options: --help Show this help message and exit.''') sys.exit(0) def parser_new(): - '''Create a command-line parser - - :rtype: tuple - ''' parser = ArgumentParser() if len(sys.argv) == 1: From b8b41da0496b55e7dad669dcabac82804c2eddf7 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sun, 29 Mar 2020 20:38:07 -0400 Subject: [PATCH 16/92] Add .py extensions and update install commands --- Makefile | 6 +++--- qpdf-convert-client => qpdf-convert-client.py | 0 qpdf-convert-server => qpdf-convert-server.py | 0 qvm-convert-pdf => qvm-convert-pdf.py | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename qpdf-convert-client => qpdf-convert-client.py (100%) rename qpdf-convert-server => qpdf-convert-server.py (100%) rename qvm-convert-pdf => qvm-convert-pdf.py (100%) diff --git a/Makefile b/Makefile index 44f07f4..2e2b8dc 100644 --- a/Makefile +++ b/Makefile @@ -78,9 +78,9 @@ build: install-vm: make install -C doc - install -D qvm-convert-pdf $(DESTDIR)/usr/bin/qvm-convert-pdf - install -D qpdf-convert-client $(DESTDIR)/usr/lib/qubes/qpdf-convert-client - install -D qpdf-convert-server $(DESTDIR)/usr/lib/qubes/qpdf-convert-server + install -D 
qvm-convert-pdf.py $(DESTDIR)/usr/bin/qvm-convert-pdf + install -D qpdf-convert-client.py $(DESTDIR)/usr/lib/qubes/qpdf-convert-client + install -D qpdf-convert-server.py $(DESTDIR)/usr/lib/qubes/qpdf-convert-server install -d $(DESTDIR)/etc/qubes-rpc ln -s ../../usr/lib/qubes/qpdf-convert-server $(DESTDIR)/etc/qubes-rpc/qubes.PdfConvert install -D qvm-convert-pdf.gnome $(DESTDIR)/usr/lib/qubes/qvm-convert-pdf.gnome diff --git a/qpdf-convert-client b/qpdf-convert-client.py similarity index 100% rename from qpdf-convert-client rename to qpdf-convert-client.py diff --git a/qpdf-convert-server b/qpdf-convert-server.py similarity index 100% rename from qpdf-convert-server rename to qpdf-convert-server.py diff --git a/qvm-convert-pdf b/qvm-convert-pdf.py similarity index 100% rename from qvm-convert-pdf rename to qvm-convert-pdf.py From 150793cfb843af6daefdd8ad694d47f93eee349f Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 31 Mar 2020 15:48:52 -0400 Subject: [PATCH 17/92] readme: Update usage example --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 2b52be2..94a5538 100644 --- a/README.md +++ b/README.md @@ -22,11 +22,11 @@ Usage ------ [user@varia ~]$ qvm-convert-pdf test.pdf - -> Sending file to remote VM... - -> Waiting for converted samples... - -> Receving page 8 out of 8... - -> Converted PDF saved as: ./test.trusted.pdf - -> Original file saved as /home/user/QubesUntrustedPDFs/test.pdf + Sending file to a Disposable VM... + Waiting for converted samples... + Receving page 8/8... 
+ Converted PDF saved as: /home/user/test.trusted.pdf + Original file saved as /home/user/QubesUntrustedPDFs/test.pdf Authors --------- From 21407f109f0ba08eb8d3d531b8fae0a1f72680c2 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 31 Mar 2020 15:51:27 -0400 Subject: [PATCH 18/92] wrapper: Fix leftover debugging command --- qvm-convert-pdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qvm-convert-pdf.py b/qvm-convert-pdf.py index 15954c5..3d15edf 100755 --- a/qvm-convert-pdf.py +++ b/qvm-convert-pdf.py @@ -86,7 +86,7 @@ def main(): check_pdf_paths(untrusted_pdfs) # TODO: Handle os.execl() error (maybe with os._exit(127) - cmd = [QREXEC_CLIENT, 'disp8051', 'qubes.PdfConvert', + cmd = [QREXEC_CLIENT, '$dispvm', 'qubes.PdfConvert', '/usr/lib/qubes/qpdf-convert-client', *untrusted_pdfs] os.execvp(QREXEC_CLIENT, cmd) From 3d941e8539f0cdbe77f54e6b27d3b664394cdd10 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 31 Mar 2020 19:39:23 -0400 Subject: [PATCH 19/92] wrapper: Trim down argument parsing Can't think of any useful options so only files can be passed to qvm-convert-pdf now. --- qvm-convert-pdf.py | 51 ++++++++++------------------------------------ 1 file changed, 11 insertions(+), 40 deletions(-) diff --git a/qvm-convert-pdf.py b/qvm-convert-pdf.py index 3d15edf..9470dcd 100755 --- a/qvm-convert-pdf.py +++ b/qvm-convert-pdf.py @@ -20,7 +20,6 @@ # # -import argparse import logging import os import sys @@ -39,51 +38,23 @@ def die(msg): logging.error(msg) sys.exit(1) - -############################### -# Parsing -############################### - -class ArgumentParser(argparse.ArgumentParser): - '''Overriding class for custom help message.''' - def print_help(self): - print(f'''\ -usage: {PROG_NAME} [OPTIONS ...] 
FILE - -Options: - --help Show this help message and exit.''') - sys.exit(0) - -def parser_new(): - parser = ArgumentParser() - - if len(sys.argv) == 1: - parser.print_help() - - # parser.add_argument('-v', '--verbose', action='count', default=0) - - return parser.parse_known_args() +def usage(): + print(f'usage: {PROG_NAME} [FILE ...]') + sys.exit(0) def parse_args(args): - # if args.version: - # version() - return - -def check_pdf_paths(untrusted_pdfs): - for untrusted_pdf in untrusted_pdfs: - if not os.path.exists(untrusted_pdf): - die(f'{untrusted_pdf}: No such file') + if len(args) == 1: + usage() - -############################### -# Main -############################### +def check_pdf_paths(pdfs): + for pdf in pdfs: + if not os.path.exists(pdf): + die(f'{pdf}: No such file') def main(): # TODO: Move parsing into qpdf-convert-client - args, untrusted_pdfs = parser_new() - parse_args(args) - check_pdf_paths(untrusted_pdfs) + parse_args(sys.argv) + check_pdf_paths(sys.argv[1:]) # TODO: Handle os.execl() error (maybe with os._exit(127) cmd = [QREXEC_CLIENT, '$dispvm', 'qubes.PdfConvert', From c6831f5764f4880f768d2bbc92638922f0c8854e Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 31 Mar 2020 19:40:30 -0400 Subject: [PATCH 20/92] wrapper: Move logging config to global namespace --- qvm-convert-pdf.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/qvm-convert-pdf.py b/qvm-convert-pdf.py index 9470dcd..30eb548 100755 --- a/qvm-convert-pdf.py +++ b/qvm-convert-pdf.py @@ -27,14 +27,10 @@ PROG_NAME = os.path.basename(sys.argv[0]) QREXEC_CLIENT = '/usr/bin/qrexec-client-vm' - -############################### -# Utilities -############################### +logging.basicConfig(format='%(message)s', stream=sys.stderr) def die(msg): '''Qrexec wrapper for displaying error messages''' - logging.basicConfig(format='%(message)s', stream=sys.stderr) logging.error(msg) sys.exit(1) From 2857aa4dbc35b1ab00308543173c31f7466a13da Mon Sep 17 00:00:00 2001 
From: Jason Phan Date: Tue, 31 Mar 2020 19:41:40 -0400 Subject: [PATCH 21/92] client, server: Fix stream flushing Without this change, if an error occurred, the Python interpreter would have encountered an error in its stream cleanup after it caught a SystemExit. To fix that, we properly flush STDOUT after writing to it. --- qpdf-convert-client.py | 1 + qpdf-convert-server.py | 1 + 2 files changed, 2 insertions(+) diff --git a/qpdf-convert-client.py b/qpdf-convert-client.py index 405ca66..e073634 100755 --- a/qpdf-convert-client.py +++ b/qpdf-convert-client.py @@ -63,6 +63,7 @@ def send(data): def send_b(data): '''Qrexec wrapper for sending binary data to the client's STDOUT''' sys.stdout.buffer.write(data) + sys.stdout.buffer.flush() def recv(): '''Qrexec wrapper for receiving text data from a client''' diff --git a/qpdf-convert-server.py b/qpdf-convert-server.py index 9b381b8..e154d57 100755 --- a/qpdf-convert-server.py +++ b/qpdf-convert-server.py @@ -50,6 +50,7 @@ def die(msg): def send(data): '''Qrexec wrapper for sending text data to the client's STDOUT''' print(data, flush=True) + sys.stdout.buffer.flush() def send_b(data): '''Qrexec wrapper for sending binary data to the client's STDOUT''' From 8d5bc1604e8c0e743b2db079cdcddde563300187 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 31 Mar 2020 19:44:24 -0400 Subject: [PATCH 22/92] client: Handle EOFErrors if server suddenly dies For now, not too sure how recv_b() should handle these types of situations... 
--- qpdf-convert-client.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/qpdf-convert-client.py b/qpdf-convert-client.py index e073634..b5121d1 100755 --- a/qpdf-convert-client.py +++ b/qpdf-convert-client.py @@ -67,10 +67,16 @@ def send_b(data): def recv(): '''Qrexec wrapper for receiving text data from a client''' - return input() + try: + data = input() + except EOFError: + sys.exit(1) + else: + return data def recv_b(): '''Qrexec wrapper for receiving binary data from a client''' + # TODO: Does this raise EOFError like in recv()? return sys.stdin.buffer.read() def check_range(val, upper): From f9374802ab5b16d73ae00d13e054a7bc8baade7e Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 31 Mar 2020 19:46:29 -0400 Subject: [PATCH 23/92] server: Avoid errors if subprocess command fails Before, subprocess.check_output() would print out STDERR messages if the command failed. To avoid this, we use subprocess.run() instead. --- qpdf-convert-server.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/qpdf-convert-server.py b/qpdf-convert-server.py index e154d57..1612093 100755 --- a/qpdf-convert-server.py +++ b/qpdf-convert-server.py @@ -71,9 +71,11 @@ def send_dimensions(png_path): cmd2 = ['identify', '-format', '%h', png_path] try: - width = subprocess.check_output(cmd1, text=True) - height = subprocess.check_output(cmd2, text=True) - except CalledProcessError: + width = subprocess.run(cmd1, capture_output=True, + check=True).stdout.decode() + height = subprocess.run(cmd2, capture_output=True, + check=True).stdout.decode() + except subprocess.CalledProcessError: die("Failed to gather dimensions... 
Aborting") send(f'{width} {height}') @@ -150,11 +152,11 @@ def get_page_count(pdf_path): cmd = ['pdfinfo', pdf_path] try: - output = subprocess.check_output(cmd, text=True) + output = subprocess.run(cmd, capture_output=True, check=True) except subprocess.CalledProcessError: info(f'Probably not a PDF...') else: - for line in output.splitlines(): + for line in output.stdout.decode().splitlines(): if 'Pages:' in line: pages = int(line.split(':')[1]) From dd4abde6f98161dc7e28ee12c5c73198cd6c421f Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 31 Mar 2020 19:49:44 -0400 Subject: [PATCH 24/92] server, client: Grammar --- qpdf-convert-client.py | 2 +- qpdf-convert-server.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/qpdf-convert-client.py b/qpdf-convert-client.py index b5121d1..cb14fc4 100755 --- a/qpdf-convert-client.py +++ b/qpdf-convert-client.py @@ -250,4 +250,4 @@ def main(): try: main() except KeyboardInterrupt: - die("KeyboardInterrupt... aborting!") + die("KeyboardInterrupt... Aborting!") diff --git a/qpdf-convert-server.py b/qpdf-convert-server.py index 1612093..25f7a9b 100755 --- a/qpdf-convert-server.py +++ b/qpdf-convert-server.py @@ -191,4 +191,4 @@ def main(): try: main() except KeyboardInterrupt: - die("KeyboardInterrupt... aborting!") + die("KeyboardInterrupt... 
Aborting!") From 82b7e73395ae6cac3eb02437ce8a84be390fed2d Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 31 Mar 2020 19:50:13 -0400 Subject: [PATCH 25/92] client: Use OG stdout file descriptor --- qpdf-convert-client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qpdf-convert-client.py b/qpdf-convert-client.py index cb14fc4..55236a6 100755 --- a/qpdf-convert-client.py +++ b/qpdf-convert-client.py @@ -184,7 +184,7 @@ def send_pdf_file(untrusted_pdf): info('Sending file to a Disposable VM...') with open(untrusted_pdf, 'rb') as f: send_b(f.read()) - os.close(sys.stdout.fileno()) + os.close(sys.__stdout__.fileno()) def archive_pdf(untrusted_pdf): '''Move original untrusted PDF to an archive''' From e43c071cadba20e17499e5523379cd96a4153ec9 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 31 Mar 2020 19:52:03 -0400 Subject: [PATCH 26/92] server: Fix location of STDOUT flush call --- qpdf-convert-server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qpdf-convert-server.py b/qpdf-convert-server.py index 25f7a9b..06e5700 100755 --- a/qpdf-convert-server.py +++ b/qpdf-convert-server.py @@ -50,11 +50,11 @@ def die(msg): def send(data): '''Qrexec wrapper for sending text data to the client's STDOUT''' print(data, flush=True) - sys.stdout.buffer.flush() def send_b(data): '''Qrexec wrapper for sending binary data to the client's STDOUT''' sys.stdout.buffer.write(data) + sys.stdout.buffer.flush() def recv_b(): '''Qrexec wrapper for receiving binary data from a client''' From 098ef252b64e442877c87660fe860dd4ad9dcb9b Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Wed, 1 Apr 2020 15:17:47 -0400 Subject: [PATCH 27/92] wrapper: Simplify argument parsing --- qvm-convert-pdf.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/qvm-convert-pdf.py b/qvm-convert-pdf.py index 30eb548..2f02bbc 100755 --- a/qvm-convert-pdf.py +++ b/qvm-convert-pdf.py @@ -34,22 +34,15 @@ def die(msg): logging.error(msg) 
sys.exit(1) -def usage(): - print(f'usage: {PROG_NAME} [FILE ...]') - sys.exit(0) - -def parse_args(args): - if len(args) == 1: - usage() - def check_pdf_paths(pdfs): for pdf in pdfs: if not os.path.exists(pdf): die(f'{pdf}: No such file') def main(): - # TODO: Move parsing into qpdf-convert-client - parse_args(sys.argv) + if len(sys.argv) == 1: + die(f'usage: {PROG_NAME} [FILE ...]') + check_pdf_paths(sys.argv[1:]) # TODO: Handle os.execl() error (maybe with os._exit(127) From 88acf011f9d48a29e2834c03ac938780fefd3b3c Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Wed, 1 Apr 2020 15:18:12 -0400 Subject: [PATCH 28/92] wrapper: Add non-file check --- qvm-convert-pdf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/qvm-convert-pdf.py b/qvm-convert-pdf.py index 2f02bbc..67bd330 100755 --- a/qvm-convert-pdf.py +++ b/qvm-convert-pdf.py @@ -38,6 +38,8 @@ def check_pdf_paths(pdfs): for pdf in pdfs: if not os.path.exists(pdf): die(f'{pdf}: No such file') + elif not os.path.isfile(pdf): + die(f'{pdf}: Not a regular file') def main(): if len(sys.argv) == 1: From 7279415cf26395e94a22056fce777aebe08cd0a0 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Wed, 1 Apr 2020 23:14:41 -0400 Subject: [PATCH 29/92] wrapper, client, server: Qualify variable names --- qpdf-convert-client.py | 121 +++++++++++++++++++++-------------------- qpdf-convert-server.py | 48 ++++++++-------- qvm-convert-pdf.py | 21 +++---- 3 files changed, 98 insertions(+), 92 deletions(-) diff --git a/qpdf-convert-client.py b/qpdf-convert-client.py index 55236a6..c54432c 100755 --- a/qpdf-convert-client.py +++ b/qpdf-convert-client.py @@ -68,22 +68,27 @@ def send_b(data): def recv(): '''Qrexec wrapper for receiving text data from a client''' try: - data = input() + untrusted_data = input() except EOFError: sys.exit(1) - else: - return data + + return untrusted_data def recv_b(): '''Qrexec wrapper for receiving binary data from a client''' # TODO: Does this raise EOFError like in recv()? 
- return sys.stdin.buffer.read() + untrusted_data = sys.stdin.buffer.read() + return untrusted_data def check_range(val, upper): '''Raises a ValueError if the value isn't between 1 and some upper bound''' if (not 1 <= val <= upper) or (not 1 <= val <= upper): raise ValueError +def mkdir_archive(): + if not os.path.exists(ARCHIVE_PATH): + os.mkdir(ARCHIVE_PATH) + ############################### # Image-related @@ -91,34 +96,36 @@ def check_range(val, upper): def recv_measurements(): '''Receive image measurements for a PDF page from server''' - return (int(val) for val in recv().split(' ', 2)) + untrusted_measurements = recv().split(' ', 2) + return [int(untrusted_value) for untrusted_value in untrusted_measurements] -def get_size(width, height): +def get_size(untrusted_width, untrusted_height): '''Compute image size based on the received measurements''' - size = width * height * 3 + untrusted_size = untrusted_width * untrusted_height * 3 - if size > MAX_IMG_SIZE: + if untrusted_size > MAX_IMG_SIZE: die("Calculated image size is too large... aborting!") - return size + return untrusted_size def get_dimensions(): '''Compute image dimensions based on received measurements''' depth = 8 try: - width, height = recv_measurements() - check_range(width, MAX_IMG_WIDTH) - check_range(height, MAX_IMG_HEIGHT) + untrusted_width, untrusted_height = recv_measurements() + check_range(untrusted_width, MAX_IMG_WIDTH) + check_range(untrusted_height, MAX_IMG_HEIGHT) except ValueError: die("Invalid image geometry returned... 
aborting!") - size = get_size(width, height) + untrusted_size = get_size(untrusted_width, untrusted_height) Dimensions = namedtuple('Dimensions', ['width', 'height', 'depth', 'size']) - return Dimensions(width=width, height=height, size=size, depth=depth) + return Dimensions(width=untrusted_width, height=untrusted_height, + size=untrusted_size, depth=depth) -def recv_rgb(rgb_path, size): +def recv_rgb(rgb_path, untrusted_size): '''Receive a presumably clean RGB file of a PDF page from server''' # XXX: For some reason, this leaves us missing alot of bytes # rcvd_bytes = input().encode('utf-8', 'surrogateescape') @@ -131,20 +138,21 @@ def recv_rgb(rgb_path, size): # background.paste(png, mask=png.split()[3]) # 3 is the alpha channel # background.save('foo.jpg', 'JPEG', quality=80) - with open(rgb_path, 'wb') as rgb_f: + with open(rgb_path, 'wb') as f: # FIXME: Why can't we get this in pure Python????? This isn't # Windows-compatible - cmd = ['head', '-c', str(size)] - subprocess.run(cmd, stdout=rgb_f, check=True) + cmd = ['head', '-c', str(untrusted_size)] + subprocess.run(cmd, stdout=f, check=True) - if os.path.getsize(rgb_f.name) != size: + if os.path.getsize(f.name) != untrusted_size: os.remove(rgb_path) die('Invalid number of bytes in RGB file... 
aborting!') -def rgb_to_png(rgb_path, png_path, dimensions, page): +def rgb_to_png(rgb_path, png_path, untrusted_dimensions, page): '''Convert an RGB file to a PNG file''' - cmd = ['convert', '-size', f'{dimensions.width}x{dimensions.height}', - '-depth', str(dimensions.depth), f'rgb:{rgb_path}', + cmd = ['convert', '-size', + f'{untrusted_dimensions.width}x{untrusted_dimensions.height}', + '-depth', str(untrusted_dimensions.depth), f'rgb:{rgb_path}', f'png:{png_path}'] try: @@ -154,13 +162,13 @@ def rgb_to_png(rgb_path, png_path, dimensions, page): else: os.remove(rgb_path) -def convert_rgb(dimensions, page): +def convert_rgb(untrusted_dimensions, page): '''Driver for receiving and converting RGB files''' - with NamedTemporaryFile(prefix='qpdf-conversion-') as rcvd_f: - rgb_path = f'{rcvd_f.name}-{page}.rgb' - png_path = f'{rcvd_f.name}-{page}.png' - recv_rgb(rgb_path, dimensions.size) - rgb_to_png(rgb_path, png_path, dimensions, page) + with NamedTemporaryFile(prefix='qpdf-conversion-') as f: + rgb_path = f'{f.name}-{page}.rgb' + png_path = f'{f.name}-{page}.png' + recv_rgb(rgb_path, untrusted_dimensions.size) + rgb_to_png(rgb_path, png_path, untrusted_dimensions, page) return png_path @@ -172,41 +180,41 @@ def convert_rgb(dimensions, page): def recv_page_count(): '''Receive number of pages in PDF file from server''' try: - page_count = int(recv()) - check_range(page_count, MAX_PAGES) + untrusted_page_count = int(recv()) + check_range(untrusted_page_count, MAX_PAGES) except ValueError: die("Invalid number of pages returned... 
aborting!") - return page_count + return untrusted_page_count -def send_pdf_file(untrusted_pdf): +def send_pdf_file(untrusted_pdf_path): '''Send untrusted PDF file to server''' info('Sending file to a Disposable VM...') - with open(untrusted_pdf, 'rb') as f: + with open(untrusted_pdf_path, 'rb') as f: send_b(f.read()) os.close(sys.__stdout__.fileno()) -def archive_pdf(untrusted_pdf): - '''Move original untrusted PDF to an archive''' - archived_pdf = f'{ARCHIVE_PATH}/{os.path.basename(untrusted_pdf)}' - os.rename(untrusted_pdf, archived_pdf) - info(f'Original PDF saved as: {archived_pdf}') +def archive_pdf(untrusted_pdf_path): + '''Move original untrusted PDF to an archive directory''' + archived_pdf_path = f'{ARCHIVE_PATH}/{os.path.basename(untrusted_pdf_path)}' + os.rename(untrusted_pdf_path, archived_pdf_path) + info(f'Original PDF saved as: {archived_pdf_path}') -def process_pdf_file(untrusted_pdf, page_count): +def process_pdf_file(untrusted_pdf_path, untrusted_page_count): '''Process an untrusted PDF page and save the trusted result''' page = 1 images = [] - trusted_pdf = f'{os.path.splitext(untrusted_pdf)[0]}.trusted.pdf' + pdf_path = f'{os.path.splitext(untrusted_pdf_path)[0]}.trusted.pdf' info("Waiting for converted sample...") - while page <= page_count: - dimensions = get_dimensions() + while page <= untrusted_page_count: + untrusted_dimensions = get_dimensions() - info(f'Receiving page {page}/{page_count}...', '\r') + info(f'Receiving page {page}/{untrusted_page_count}...', '\r') # TODO: There's some weird verbose condition here in the og script - png_path = convert_rgb(dimensions, page) + png_path = convert_rgb(untrusted_dimensions, page) images.append(Image.open(png_path)) page += 1 @@ -215,23 +223,23 @@ def process_pdf_file(untrusted_pdf, page_count): # TODO: Maybe it'd be better to save->delete png over and over again to # avoid storing all PNGs in memory - images[0].save(trusted_pdf, 'PDF', resolution=100.0, save_all=True, + images[0].save(pdf_path, 
'PDF', resolution=100.0, save_all=True, append_images=images[1:]) for img in images: img.close() os.remove(img.filename) - info(f'Converted PDF saved as: {trusted_pdf}') + info(f'Converted PDF saved as: {pdf_path}') -def process_pdfs(untrusted_pdfs): +def process_pdfs(untrusted_pdf_paths): '''Wrapper for PDF processing''' # TODO: Remove [0] when support for multiple PDFs is available - # for untrusted_pdf in untrusted_pdfs: - send_pdf_file(untrusted_pdfs[0]) - page_count = recv_page_count() - process_pdf_file(untrusted_pdfs[0], page_count) - archive_pdf(untrusted_pdfs[0]) + # for untrusted_pdf_path in untrusted_pdf_paths: + send_pdf_file(untrusted_pdf_paths[0]) + untrusted_page_count = recv_page_count() + process_pdf_file(untrusted_pdf_paths[0], untrusted_page_count) + archive_pdf(untrusted_pdf_paths[0]) ############################### @@ -239,12 +247,9 @@ def process_pdfs(untrusted_pdfs): ############################### def main(): - untrusted_pdfs = sys.argv[1:] - - if not os.path.exists(ARCHIVE_PATH): - os.mkdir(ARCHIVE_PATH) - - process_pdfs(untrusted_pdfs) + untrusted_pdf_paths = sys.argv[1:] + mkdir_archive() + process_pdfs(untrusted_pdf_paths) if __name__ == '__main__': try: diff --git a/qpdf-convert-server.py b/qpdf-convert-server.py index 06e5700..2a1b8d2 100755 --- a/qpdf-convert-server.py +++ b/qpdf-convert-server.py @@ -58,7 +58,8 @@ def send_b(data): def recv_b(): '''Qrexec wrapper for receiving binary data from a client''' - return sys.stdin.buffer.read() + untrusted_data = sys.stdin.buffer.read() + return untrusted_data ############################### @@ -67,18 +68,17 @@ def recv_b(): def send_dimensions(png_path): '''Send dimensions of untrusted PNG file to client for conversion''' - cmd1 = ['identify', '-format', '%w', png_path] - cmd2 = ['identify', '-format', '%h', png_path] - + cmd_width = ['identify', '-format', '%w', png_path] + cmd_height = ['identify', '-format', '%w', png_path] try: - width = subprocess.run(cmd1, capture_output=True, - 
check=True).stdout.decode() - height = subprocess.run(cmd2, capture_output=True, - check=True).stdout.decode() + untrusted_width = subprocess.run(cmd_width, capture_output=True, + check=True).stdout.decode() + untrusted_height = subprocess.run(cmd_height, capture_output=True, + check=True).stdout.decode() except subprocess.CalledProcessError: die("Failed to gather dimensions... Aborting") - send(f'{width} {height}') + send(f'{untrusted_width} {untrusted_height}') def send_rgb_file(rgb_path): '''Send presumably clean RGB file to client''' @@ -86,11 +86,11 @@ def send_rgb_file(rgb_path): data = f.read() send_b(data) -def pdf_to_png(page, pdf_path, png_path): +def pdf_to_png(pagenum, pdf_path, png_path): '''Convert an untrusted PDF page into an intermediate PNG file''' png_filename = os.path.splitext(png_path)[0] - cmd = ['pdftocairo', pdf_path, '-png', '-f', str(page), '-l', str(page), - '-singlefile', png_filename] + cmd = ['pdftocairo', pdf_path, '-png', '-f', str(pagenum), '-l', + str(pagenum), '-singlefile', png_filename] try: subprocess.run(cmd, check=True) @@ -99,9 +99,9 @@ def pdf_to_png(page, pdf_path, png_path): try: subprocess.run(cmd, check=True) except subprocess.CalledProcessError: - die(f'Page {page} conversion failed (PDF->PNG): {err}') + die(f'Page {pagenum} conversion failed (PDF->PNG): {err}') -def png_to_rgb(png_path, rgb_path): +def png_to_rgb(pagenum, png_path, rgb_path): '''Convert PNG file into a presumably clean RGB file''' depth = 8 cmd = ['convert', png_path, '-depth', str(depth), f'rgb:{rgb_path}'] @@ -109,7 +109,7 @@ def png_to_rgb(png_path, rgb_path): try: subprocess.run(cmd, check=True) except subprocess.CalledProcessError: - die(f'Page {page} conversion failed (PNG->RGB): {err}') + die(f'Page {pagenum} conversion failed (PNG->RGB): {err}') ############################### @@ -142,12 +142,12 @@ def make_tmp_files(paths): def recv_pdf(pdf_path): '''Receive untrusted PDF file from client''' with open(pdf_path, 'wb') as f: - data = recv_b() 
- f.write(data) + untrusted_data = recv_b() + f.write(untrusted_data) def get_page_count(pdf_path): '''Get number of pages in the untrusted PDF file''' - pages = 0 + untrusted_pages = 0 output = None cmd = ['pdfinfo', pdf_path] @@ -158,21 +158,21 @@ def get_page_count(pdf_path): else: for line in output.stdout.decode().splitlines(): if 'Pages:' in line: - pages = int(line.split(':')[1]) + untrusted_pages = int(line.split(':')[1]) - return pages + return untrusted_pages def process_pdf(paths): '''Process pages of the untrusted PDF file''' page = 1 - pages = get_page_count(paths.pdf) - send(pages) + untrusted_pages = get_page_count(paths.pdf) + send(untrusted_pages) - while (page <= pages): + while (page <= untrusted_pages): pdf_to_png(page, paths.pdf, paths.png) send_dimensions(paths.png) - png_to_rgb(paths.png, paths.rgb) + png_to_rgb(page, paths.png, paths.rgb) send_rgb_file(paths.rgb) page += 1 diff --git a/qvm-convert-pdf.py b/qvm-convert-pdf.py index 67bd330..3402aff 100755 --- a/qvm-convert-pdf.py +++ b/qvm-convert-pdf.py @@ -34,24 +34,25 @@ def die(msg): logging.error(msg) sys.exit(1) -def check_pdf_paths(pdfs): - for pdf in pdfs: - if not os.path.exists(pdf): - die(f'{pdf}: No such file') - elif not os.path.isfile(pdf): - die(f'{pdf}: Not a regular file') +def check_pdf_paths(untrusted_paths): + for untrusted_path in untrusted_paths: + if not os.path.exists(untrusted_path): + die(f'{untrusted_path}: No such file') + elif not os.path.isfile(untrusted_path): + die(f'{untrusted_path}: Not a regular file') def main(): if len(sys.argv) == 1: die(f'usage: {PROG_NAME} [FILE ...]') - check_pdf_paths(sys.argv[1:]) + untrusted_pdf_paths = sys.argv[1:] + check_pdf_paths(untrusted_pdf_paths) - # TODO: Handle os.execl() error (maybe with os._exit(127) + # TODO: Handle os.execl() error (maybe with os._exit(127)) cmd = [QREXEC_CLIENT, '$dispvm', 'qubes.PdfConvert', - '/usr/lib/qubes/qpdf-convert-client', *untrusted_pdfs] + '/usr/lib/qubes/qpdf-convert-client', 
*untrusted_pdf_paths] os.execvp(QREXEC_CLIENT, cmd) if __name__ == '__main__': - # No need to wrap this in a try block since we never return from execl() + # No need to wrap this in a try block since we never return from execvp() main() From a4b70cbf5b554f17f90228add6080a3a8eefbbb4 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Wed, 1 Apr 2020 23:32:35 -0400 Subject: [PATCH 30/92] server, client: Remove unecessary documentation --- qpdf-convert-client.py | 39 ++++++++++++++------------------------- qpdf-convert-server.py | 27 +++++++++------------------ 2 files changed, 23 insertions(+), 43 deletions(-) diff --git a/qpdf-convert-client.py b/qpdf-convert-client.py index c54432c..96b51fb 100755 --- a/qpdf-convert-client.py +++ b/qpdf-convert-client.py @@ -81,7 +81,6 @@ def recv_b(): return untrusted_data def check_range(val, upper): - '''Raises a ValueError if the value isn't between 1 and some upper bound''' if (not 1 <= val <= upper) or (not 1 <= val <= upper): raise ValueError @@ -94,13 +93,12 @@ def mkdir_archive(): # Image-related ############################### -def recv_measurements(): +def recv_img_measurements(): '''Receive image measurements for a PDF page from server''' untrusted_measurements = recv().split(' ', 2) return [int(untrusted_value) for untrusted_value in untrusted_measurements] -def get_size(untrusted_width, untrusted_height): - '''Compute image size based on the received measurements''' +def get_img_size(untrusted_width, untrusted_height): untrusted_size = untrusted_width * untrusted_height * 3 if untrusted_size > MAX_IMG_SIZE: @@ -108,25 +106,23 @@ def get_size(untrusted_width, untrusted_height): return untrusted_size -def get_dimensions(): - '''Compute image dimensions based on received measurements''' +def get_img_dimensions(): depth = 8 try: - untrusted_width, untrusted_height = recv_measurements() + untrusted_width, untrusted_height = recv_img_measurements() check_range(untrusted_width, MAX_IMG_WIDTH) check_range(untrusted_height, 
MAX_IMG_HEIGHT) except ValueError: die("Invalid image geometry returned... aborting!") - untrusted_size = get_size(untrusted_width, untrusted_height) + untrusted_size = get_img_size(untrusted_width, untrusted_height) Dimensions = namedtuple('Dimensions', ['width', 'height', 'depth', 'size']) return Dimensions(width=untrusted_width, height=untrusted_height, size=untrusted_size, depth=depth) -def recv_rgb(rgb_path, untrusted_size): - '''Receive a presumably clean RGB file of a PDF page from server''' +def recv_rgb_file(rgb_path, untrusted_size): # XXX: For some reason, this leaves us missing alot of bytes # rcvd_bytes = input().encode('utf-8', 'surrogateescape') # rcvd_bytes = rcvd_bytes[:dimensions.size] @@ -149,7 +145,6 @@ def recv_rgb(rgb_path, untrusted_size): die('Invalid number of bytes in RGB file... aborting!') def rgb_to_png(rgb_path, png_path, untrusted_dimensions, page): - '''Convert an RGB file to a PNG file''' cmd = ['convert', '-size', f'{untrusted_dimensions.width}x{untrusted_dimensions.height}', '-depth', str(untrusted_dimensions.depth), f'rgb:{rgb_path}', @@ -162,12 +157,11 @@ def rgb_to_png(rgb_path, png_path, untrusted_dimensions, page): else: os.remove(rgb_path) -def convert_rgb(untrusted_dimensions, page): - '''Driver for receiving and converting RGB files''' +def convert_rgb_file(untrusted_dimensions, page): with NamedTemporaryFile(prefix='qpdf-conversion-') as f: rgb_path = f'{f.name}-{page}.rgb' png_path = f'{f.name}-{page}.png' - recv_rgb(rgb_path, untrusted_dimensions.size) + recv_rgb_file(rgb_path, untrusted_dimensions.size) rgb_to_png(rgb_path, png_path, untrusted_dimensions, page) return png_path @@ -178,7 +172,6 @@ def convert_rgb(untrusted_dimensions, page): ############################### def recv_page_count(): - '''Receive number of pages in PDF file from server''' try: untrusted_page_count = int(recv()) check_range(untrusted_page_count, MAX_PAGES) @@ -187,21 +180,18 @@ def recv_page_count(): return untrusted_page_count -def 
send_pdf_file(untrusted_pdf_path): - '''Send untrusted PDF file to server''' +def send_pdf(untrusted_pdf_path): info('Sending file to a Disposable VM...') with open(untrusted_pdf_path, 'rb') as f: send_b(f.read()) os.close(sys.__stdout__.fileno()) def archive_pdf(untrusted_pdf_path): - '''Move original untrusted PDF to an archive directory''' archived_pdf_path = f'{ARCHIVE_PATH}/{os.path.basename(untrusted_pdf_path)}' os.rename(untrusted_pdf_path, archived_pdf_path) info(f'Original PDF saved as: {archived_pdf_path}') -def process_pdf_file(untrusted_pdf_path, untrusted_page_count): - '''Process an untrusted PDF page and save the trusted result''' +def process_pdf(untrusted_pdf_path, untrusted_page_count): page = 1 images = [] pdf_path = f'{os.path.splitext(untrusted_pdf_path)[0]}.trusted.pdf' @@ -209,12 +199,12 @@ def process_pdf_file(untrusted_pdf_path, untrusted_page_count): info("Waiting for converted sample...") while page <= untrusted_page_count: - untrusted_dimensions = get_dimensions() + untrusted_dimensions = get_img_dimensions() info(f'Receiving page {page}/{untrusted_page_count}...', '\r') # TODO: There's some weird verbose condition here in the og script - png_path = convert_rgb(untrusted_dimensions, page) + png_path = convert_rgb_file(untrusted_dimensions, page) images.append(Image.open(png_path)) page += 1 @@ -233,12 +223,11 @@ def process_pdf_file(untrusted_pdf_path, untrusted_page_count): info(f'Converted PDF saved as: {pdf_path}') def process_pdfs(untrusted_pdf_paths): - '''Wrapper for PDF processing''' # TODO: Remove [0] when support for multiple PDFs is available # for untrusted_pdf_path in untrusted_pdf_paths: - send_pdf_file(untrusted_pdf_paths[0]) + send_pdf(untrusted_pdf_paths[0]) untrusted_page_count = recv_page_count() - process_pdf_file(untrusted_pdf_paths[0], untrusted_page_count) + process_pdf(untrusted_pdf_paths[0], untrusted_page_count) archive_pdf(untrusted_pdf_paths[0]) diff --git a/qpdf-convert-server.py b/qpdf-convert-server.py index 
2a1b8d2..1fde28c 100755 --- a/qpdf-convert-server.py +++ b/qpdf-convert-server.py @@ -66,10 +66,10 @@ def recv_b(): # Image-related ############################### -def send_dimensions(png_path): - '''Send dimensions of untrusted PNG file to client for conversion''' +def send_img_dimensions(png_path): cmd_width = ['identify', '-format', '%w', png_path] - cmd_height = ['identify', '-format', '%w', png_path] + cmd_height = ['identify', '-format', '%h', png_path] + try: untrusted_width = subprocess.run(cmd_width, capture_output=True, check=True).stdout.decode() @@ -81,13 +81,11 @@ def send_dimensions(png_path): send(f'{untrusted_width} {untrusted_height}') def send_rgb_file(rgb_path): - '''Send presumably clean RGB file to client''' with open(rgb_path, 'rb') as f: data = f.read() send_b(data) def pdf_to_png(pagenum, pdf_path, png_path): - '''Convert an untrusted PDF page into an intermediate PNG file''' png_filename = os.path.splitext(png_path)[0] cmd = ['pdftocairo', pdf_path, '-png', '-f', str(pagenum), '-l', str(pagenum), '-singlefile', png_filename] @@ -102,7 +100,6 @@ def pdf_to_png(pagenum, pdf_path, png_path): die(f'Page {pagenum} conversion failed (PDF->PNG): {err}') def png_to_rgb(pagenum, png_path, rgb_path): - '''Convert PNG file into a presumably clean RGB file''' depth = 8 cmd = ['convert', png_path, '-depth', str(depth), f'rgb:{rgb_path}'] @@ -116,8 +113,8 @@ def png_to_rgb(pagenum, png_path, rgb_path): # File-related ############################### -def get_tmp_files(): - '''Return random temporary file names for images and the untrusted PDF''' +def create_tmp_files(): + '''Create temporary file for storing page images and the untrusted PDF''' Files = namedtuple('Files', ['pdf', 'png', 'rgb']) suffixes = ('', '.png', '.rgb') paths = [] @@ -126,27 +123,23 @@ def get_tmp_files(): with NamedTemporaryFile(prefix='qpdf-conversion-', suffix=suffix) as f: paths.append(f.name) - return Files(pdf=paths[0], png=paths[1], rgb=paths[2]) - -def 
make_tmp_files(paths): - '''Create temporary files to store images and the untrusted PDF''' for path in paths: with open(path, 'wb') as f: f.write(b'') + return Files(pdf=paths[0], png=paths[1], rgb=paths[2]) + ############################### # PDF-related ############################### def recv_pdf(pdf_path): - '''Receive untrusted PDF file from client''' with open(pdf_path, 'wb') as f: untrusted_data = recv_b() f.write(untrusted_data) def get_page_count(pdf_path): - '''Get number of pages in the untrusted PDF file''' untrusted_pages = 0 output = None cmd = ['pdfinfo', pdf_path] @@ -163,7 +156,6 @@ def get_page_count(pdf_path): return untrusted_pages def process_pdf(paths): - '''Process pages of the untrusted PDF file''' page = 1 untrusted_pages = get_page_count(paths.pdf) @@ -171,7 +163,7 @@ def process_pdf(paths): while (page <= untrusted_pages): pdf_to_png(page, paths.pdf, paths.png) - send_dimensions(paths.png) + send_img_dimensions(paths.png) png_to_rgb(page, paths.png, paths.rgb) send_rgb_file(paths.rgb) page += 1 @@ -182,8 +174,7 @@ def process_pdf(paths): ############################### def main(): - paths = get_tmp_files() - make_tmp_files(paths) + paths = create_tmp_files() recv_pdf(paths.pdf) process_pdf(paths) From 574e63c1c8c8c241baf87bc89a69cc6af5548206 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Wed, 1 Apr 2020 23:54:21 -0400 Subject: [PATCH 31/92] readme: Update language --- README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 94a5538..62bd97f 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,21 @@ Qubes PDF Converter ==================== -Qubes PDF converter is a [Qubes](https://qubes-os.org) Application, which -utilizes Qubes flexible qrexec (inter-VM communication) infrastructure and -Disposable VMs to perform conversion of potentially untrusted (e.g. 
maliciously +Qubes PDF converter is a [Qubes](https://qubes-os.org) Application that +utilizes Qubes' flexible qrexec (inter-VM communication) infrastructure and +Disposable VMs to securely convert potentially untrusted (e.g. maliciously malformed) PDF files into safe-to-view PDF files. -This is done by having the Disposable VM perform the complex (and potentially -buggy) rendering of the PDF in question and sending the resulting RGB bitmap -(simple representation) to the client AppVM. The client AppVM can _trivially_ -verify the received data are indeed the simple representation, and then -construct a new PDF out of the received bitmap. Of course the price we pay for -this conversion is loosing any structural information and text-based search in -the converted PDF. +This is done by having a Disposable VM render each page of a PDF file into a +very simple representation (RGB bitmap) that (presumably) leaves no room for +malicious code. This representation is then sent back to the client AppVM which +then constructs an entirely new PDF file out of the received bitmaps. -More discussion and introduction of the concept has been described in the -original article +Of course, the price we pay for this conversion is an increase in file size and +the loss of any structural information or text-based search in the converted +PDF. + +More discussion of the concept has been described in the original article [here](http://blog.invisiblethings.org/2013/02/21/converting-untrusted-pdfs-into-trusted.html). 
Usage From 54e25feefe2e06b622f7ca289c111f4d6fc3f199 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Thu, 2 Apr 2020 15:12:18 -0400 Subject: [PATCH 32/92] Add support for multiple files --- qpdf-convert-client.py | 44 ++++++++++++++++++++++++++++-------------- qpdf-convert-server.py | 33 ++++++++++++++++++++++++++----- qvm-convert-pdf.py | 2 +- 3 files changed, 58 insertions(+), 21 deletions(-) diff --git a/qpdf-convert-client.py b/qpdf-convert-client.py index 96b51fb..7fc103a 100755 --- a/qpdf-convert-client.py +++ b/qpdf-convert-client.py @@ -25,6 +25,7 @@ from PIL import Image import subprocess import sys +import time from tempfile import NamedTemporaryFile PROG_NAME = os.path.basename(sys.argv[0]) @@ -74,10 +75,9 @@ def recv(): return untrusted_data -def recv_b(): +def recv_b(size=None): '''Qrexec wrapper for receiving binary data from a client''' - # TODO: Does this raise EOFError like in recv()? - untrusted_data = sys.stdin.buffer.read() + untrusted_data = sys.stdin.buffer.read(size) return untrusted_data def check_range(val, upper): @@ -135,8 +135,7 @@ def recv_rgb_file(rgb_path, untrusted_size): # background.save('foo.jpg', 'JPEG', quality=80) with open(rgb_path, 'wb') as f: - # FIXME: Why can't we get this in pure Python????? This isn't - # Windows-compatible + # FIXME: Why doesn't this work in pure Python? cmd = ['head', '-c', str(untrusted_size)] subprocess.run(cmd, stdout=f, check=True) @@ -181,10 +180,23 @@ def recv_page_count(): return untrusted_page_count def send_pdf(untrusted_pdf_path): - info('Sending file to a Disposable VM...') + info(f'Sending {untrusted_pdf_path} to a Disposable VM...') + + # To process multiple files, we have to avoid closing STDIN since we can't + # reopen it afterwards without duplicating it to some new fd which doesn't + # seem ideal. Unfortunately, unless STDIN is being read from a terminal, I + # couldn't find a way to indicate to the server that we were done sending + # stuff. 
+ # + # So, the current solution is to send file's size in advance so that the + # server can know when to stop reading from STDIN. The problem then becomes + # that the server may start its read after we send the PDF file. Thus, we + # make the client sleep so that the server can start its read beforehand. + send(os.path.getsize(untrusted_pdf_path)) + time.sleep(0.1) + with open(untrusted_pdf_path, 'rb') as f: send_b(f.read()) - os.close(sys.__stdout__.fileno()) def archive_pdf(untrusted_pdf_path): archived_pdf_path = f'{ARCHIVE_PATH}/{os.path.basename(untrusted_pdf_path)}' @@ -211,8 +223,7 @@ def process_pdf(untrusted_pdf_path, untrusted_page_count): else: info('') - # TODO: Maybe it'd be better to save->delete png over and over again to - # avoid storing all PNGs in memory + # TODO (?): Save->delete PNGs in a loop to avoid storing all PNGs in memory. images[0].save(pdf_path, 'PDF', resolution=100.0, save_all=True, append_images=images[1:]) @@ -223,12 +234,15 @@ def process_pdf(untrusted_pdf_path, untrusted_page_count): info(f'Converted PDF saved as: {pdf_path}') def process_pdfs(untrusted_pdf_paths): - # TODO: Remove [0] when support for multiple PDFs is available - # for untrusted_pdf_path in untrusted_pdf_paths: - send_pdf(untrusted_pdf_paths[0]) - untrusted_page_count = recv_page_count() - process_pdf(untrusted_pdf_paths[0], untrusted_page_count) - archive_pdf(untrusted_pdf_paths[0]) + # TODO (?): Add check for duplicate filenames + for untrusted_pdf_path in untrusted_pdf_paths: + send_pdf(untrusted_pdf_path) + untrusted_page_count = recv_page_count() + process_pdf(untrusted_pdf_path, untrusted_page_count) + archive_pdf(untrusted_pdf_path) + + if untrusted_pdf_path != untrusted_pdf_paths[-1]: + info('') ############################### diff --git a/qpdf-convert-server.py b/qpdf-convert-server.py index 1fde28c..2bcb279 100755 --- a/qpdf-convert-server.py +++ b/qpdf-convert-server.py @@ -56,9 +56,18 @@ def send_b(data): sys.stdout.buffer.write(data) 
sys.stdout.buffer.flush() -def recv_b(): +def recv(): + '''Qrexec wrapper for receiving text data from a client''' + try: + untrusted_data = input() + except EOFError: + sys.exit(1) + + return untrusted_data + +def recv_b(size=None): '''Qrexec wrapper for receiving binary data from a client''' - untrusted_data = sys.stdin.buffer.read() + untrusted_data = sys.stdin.buffer.read(size) return untrusted_data @@ -135,8 +144,10 @@ def create_tmp_files(): ############################### def recv_pdf(pdf_path): + filesize = int(recv()) + untrusted_data = recv_b(filesize) + with open(pdf_path, 'wb') as f: - untrusted_data = recv_b() f.write(untrusted_data) def get_page_count(pdf_path): @@ -175,8 +186,20 @@ def process_pdf(paths): def main(): paths = create_tmp_files() - recv_pdf(paths.pdf) - process_pdf(paths) + + # FIXME: + # When no more PDFs are available to process, the server will exit in + # recv() (called in recv_pdf()) with an EOFError. While this works + # perfectly fine, it is kinda ugly; successful runs shouldn't exit with an + # error, no? + # + # One solution would be to have the client initially send a + # space-delimited string containing the sizes of each file. Then, the + # server can turn that into an array and use the array's length as the + # number of times to loop. 
+ while True: + recv_pdf(paths.pdf) + process_pdf(paths) if __name__ == '__main__': try: diff --git a/qvm-convert-pdf.py b/qvm-convert-pdf.py index 3402aff..dc34f81 100755 --- a/qvm-convert-pdf.py +++ b/qvm-convert-pdf.py @@ -45,7 +45,7 @@ def main(): if len(sys.argv) == 1: die(f'usage: {PROG_NAME} [FILE ...]') - untrusted_pdf_paths = sys.argv[1:] + untrusted_pdf_paths = [os.path.abspath(path) for path in sys.argv[1:]] check_pdf_paths(untrusted_pdf_paths) # TODO: Handle os.execl() error (maybe with os._exit(127)) From 3fd5912b261bb232dd542f9e91de7e09cdbd3116 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Thu, 2 Apr 2020 15:37:58 -0400 Subject: [PATCH 33/92] Update docs --- doc/qvm-convert-pdf.rst | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/doc/qvm-convert-pdf.rst b/doc/qvm-convert-pdf.rst index 8a89d9c..894b3a4 100644 --- a/doc/qvm-convert-pdf.rst +++ b/doc/qvm-convert-pdf.rst @@ -4,27 +4,28 @@ QVM-CONVERT-PDF(1) NAME ==== -qvm-convert-pdf - converts a potentially untrusted pdf to a safe-to-view pdf +qvm-convert-pdf - converts potentially untrusted PDFs to a safe-to-view PDF SYNOPSIS ======== -| qvm-convert-pdf +| qvm-convert-pdf DESCRIPTION =========== -Qubes PDF converter is a Qubes Application, which utilizes Qubes flexible qrexec -(inter-VM communication) infrastructure and Disposable VMs to perform conversion -of potentially untrusted (e.g. maliciously malformed) PDF files into -safe-to-view PDF files. +Qubes PDF converter is a Qubes Application that utilizes Qubes' flexible qrexec +(inter-VM communication) infrastructure and Disposable VMs to securely convert +potentially untrusted (e.g. maliciously malformed) PDF files into safe-to-view +PDF files. -This is done by having the Disposable VM perform the complex (and potentially -buggy) rendering of the PDF in question) and sending the resulting RGB bitmap -(simple representation) to the client AppVM. 
The client AppVM can _trivially_ -verify the received data are indeed the simple representation, and then -construct a new PDF out of the received bitmap. Of course the price we pay for -this conversion is loosing any structural information and text-based search in -the converted PDF. +This is done by having a Disposable VM render each page of a PDF file into a +very simple representation (RGB bitmap) that (presumably) leaves no room for +malicious code. This representation is then sent back to the client AppVM which +then constructs an entirely new PDF file out of the received bitmaps. + +Of course, the price we pay for this conversion is an increase in file size and +the loss of any structural information or text-based search in the converted +PDF. AUTHORS ======= From 672ad1a700187c748da72154ba844131fb2cd030 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Thu, 2 Apr 2020 22:20:32 -0400 Subject: [PATCH 34/92] client, server: Replace recv() with recvline_b() recvline_b() uses sys.stdout.buffer.readline() to handle binary inputs properly, which recv() couldn't do with it's input() call. A check was added in recv_img_measurements() to ensure that only 2 values were sent from the server (width, height). 
--- qpdf-convert-client.py | 32 ++++++++++++++------------------ qpdf-convert-server.py | 18 +++++++----------- 2 files changed, 21 insertions(+), 29 deletions(-) diff --git a/qpdf-convert-client.py b/qpdf-convert-client.py index 7fc103a..04aab88 100755 --- a/qpdf-convert-client.py +++ b/qpdf-convert-client.py @@ -66,20 +66,16 @@ def send_b(data): sys.stdout.buffer.write(data) sys.stdout.buffer.flush() -def recv(): - '''Qrexec wrapper for receiving text data from a client''' - try: - untrusted_data = input() - except EOFError: - sys.exit(1) - - return untrusted_data - def recv_b(size=None): - '''Qrexec wrapper for receiving binary data from a client''' + '''Qrexec wrapper for receiving data from the server''' untrusted_data = sys.stdin.buffer.read(size) return untrusted_data +def recvline_b(): + '''Qrexec wrapper for receiving a line of data from the server''' + untrusted_data = sys.stdin.buffer.readline() + return untrusted_data + def check_range(val, upper): if (not 1 <= val <= upper) or (not 1 <= val <= upper): raise ValueError @@ -94,8 +90,12 @@ def mkdir_archive(): ############################### def recv_img_measurements(): - '''Receive image measurements for a PDF page from server''' - untrusted_measurements = recv().split(' ', 2) + '''Receive the width and height of a PDF page from server''' + untrusted_measurements = recvline_b().decode().split(' ', 2) + + if len(untrusted_measurements) != 2: + raise ValueError + return [int(untrusted_value) for untrusted_value in untrusted_measurements] def get_img_size(untrusted_width, untrusted_height): @@ -172,7 +172,7 @@ def convert_rgb_file(untrusted_dimensions, page): def recv_page_count(): try: - untrusted_page_count = int(recv()) + untrusted_page_count = int(recvline_b().decode()) check_range(untrusted_page_count, MAX_PAGES) except ValueError: die("Invalid number of pages returned... 
aborting!") @@ -211,14 +211,10 @@ def process_pdf(untrusted_pdf_path, untrusted_page_count): info("Waiting for converted sample...") while page <= untrusted_page_count: - untrusted_dimensions = get_img_dimensions() - info(f'Receiving page {page}/{untrusted_page_count}...', '\r') - - # TODO: There's some weird verbose condition here in the og script + untrusted_dimensions = get_img_dimensions() png_path = convert_rgb_file(untrusted_dimensions, page) images.append(Image.open(png_path)) - page += 1 else: info('') diff --git a/qpdf-convert-server.py b/qpdf-convert-server.py index 2bcb279..66bf55a 100755 --- a/qpdf-convert-server.py +++ b/qpdf-convert-server.py @@ -56,20 +56,16 @@ def send_b(data): sys.stdout.buffer.write(data) sys.stdout.buffer.flush() -def recv(): - '''Qrexec wrapper for receiving text data from a client''' - try: - untrusted_data = input() - except EOFError: - sys.exit(1) - - return untrusted_data - def recv_b(size=None): - '''Qrexec wrapper for receiving binary data from a client''' + '''Qrexec wrapper for receiving data from a client''' untrusted_data = sys.stdin.buffer.read(size) return untrusted_data +def recvline_b(): + '''Qrexec wrapper for receiving a line of data from a client''' + untrusted_data = sys.stdin.buffer.readline() + return untrusted_data + ############################### # Image-related @@ -144,7 +140,7 @@ def create_tmp_files(): ############################### def recv_pdf(pdf_path): - filesize = int(recv()) + filesize = int(recvline_b().decode()) untrusted_data = recv_b(filesize) with open(pdf_path, 'wb') as f: From 7913920ca73fbc11bc1e227ec92d35bbac40b762 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 18 Apr 2020 10:51:13 -0500 Subject: [PATCH 35/92] client: Remove redundant check condition --- qpdf-convert-client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qpdf-convert-client.py b/qpdf-convert-client.py index 04aab88..e207c83 100755 --- a/qpdf-convert-client.py +++ b/qpdf-convert-client.py @@ 
-77,7 +77,7 @@ def recvline_b(): return untrusted_data def check_range(val, upper): - if (not 1 <= val <= upper) or (not 1 <= val <= upper): + if not 1 <= val <= upper: raise ValueError def mkdir_archive(): From c6f6be48dbeef977ae221d46b8af42af567bc5fb Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 18 Apr 2020 11:49:44 -0500 Subject: [PATCH 36/92] client, server: Add custom exceptions --- qpdf-convert-client.py | 31 +++++++++++++++++++++++++++---- qpdf-convert-server.py | 17 +++++++++++++++-- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/qpdf-convert-client.py b/qpdf-convert-client.py index e207c83..a227794 100755 --- a/qpdf-convert-client.py +++ b/qpdf-convert-client.py @@ -25,8 +25,10 @@ from PIL import Image import subprocess import sys -import time -from tempfile import NamedTemporaryFile +from collections import namedtuple +from pathlib import Path +from PIL import Image +from tempfile import TemporaryDirectory PROG_NAME = os.path.basename(sys.argv[0]) ARCHIVE_PATH = f"{os.path.expanduser('~')}/QubesUntrustedPDFs" @@ -34,9 +36,30 @@ MAX_PAGES = 10000 MAX_IMG_WIDTH = 10000 MAX_IMG_HEIGHT = 10000 -MAX_IMG_SIZE = MAX_IMG_WIDTH * MAX_IMG_HEIGHT * 3 +DEPTH = 8 + +Dimensions = namedtuple("Dimensions", ["width", "height", "size", "depth"]) +Representation = namedtuple("Representations", ["initial", "final"]) + + +class DimensionError(ValueError): + """ + """ + + +class PageError(ValueError): + """ + """ + + +class ReceiveError(Exception): + """ + """ + -logging.basicConfig(format='%(message)s', stream=sys.stderr) +class RepresentationError(ValueError): + """ + """ ############################### diff --git a/qpdf-convert-server.py b/qpdf-convert-server.py index 66bf55a..03afed3 100755 --- a/qpdf-convert-server.py +++ b/qpdf-convert-server.py @@ -24,9 +24,22 @@ import os import subprocess import sys -from tempfile import NamedTemporaryFile +from collections import namedtuple +from pathlib import Path +from tempfile import TemporaryDirectory + 
+DEPTH = 8 + +Representation = namedtuple("Representation", ["initial", "final"]) + + +class ConversionError(Exception): + """ + """ -logging.basicConfig(format='%(message)s', stream=sys.stderr) +class ReceiveError(Exception): + """ + """ ############################### From 8178f1d08231744c71c5c922025494302e9394e2 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 18 Apr 2020 11:53:40 -0500 Subject: [PATCH 37/92] client, server: Update utility functions Mainly, more proper error handling is in place, recv_b() takes a size, and unused functions have been removed. Convenience functions for asyncio control behavior have also been added. --- qpdf-convert-client.py | 148 ++++++++++++++++++++++++++++------------- qpdf-convert-server.py | 87 ++++++++++++++++++------ 2 files changed, 169 insertions(+), 66 deletions(-) diff --git a/qpdf-convert-client.py b/qpdf-convert-client.py index a227794..54e3e08 100755 --- a/qpdf-convert-client.py +++ b/qpdf-convert-client.py @@ -19,10 +19,9 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -from collections import namedtuple +import asyncio import logging import os -from PIL import Image import subprocess import sys from collections import namedtuple @@ -66,71 +65,130 @@ class RepresentationError(ValueError): # Utilities ############################### -def info(msg, suffix=None): - '''Qrexec wrapper for displaying information - `suffix` is typically only ever used when `msg` needs to overwrite - the line of the previous message (so as to imitate an updating - line). This is done by setting `suffix` to '\r'. 
- ''' - print(msg, end=suffix, flush=True, file=sys.stderr) +def check_paths(paths): + abs_paths = [] -def die(msg): - '''Qrexec wrapper for displaying error messages''' - logging.error(msg) - sys.exit(1) + for path in [Path(path) for path in paths]: + abspath = Path.resolve(path) -def send(data): - '''Qrexec wrapper for sending text data to the client's STDOUT''' - print(data, flush=True) + if not abspath.exists(): + logging.error(f"No such file: \"{path}\"") + sys.exit(1) + elif not abspath.is_file(): + logging.error(f"Not a regular file: \"{path}\"") + sys.exit(1) -def send_b(data): - '''Qrexec wrapper for sending binary data to the client's STDOUT''' - sys.stdout.buffer.write(data) - sys.stdout.buffer.flush() + abs_paths.append(abspath) -def recv_b(size=None): - '''Qrexec wrapper for receiving data from the server''' - untrusted_data = sys.stdin.buffer.read(size) - return untrusted_data + return abs_paths -def recvline_b(): - '''Qrexec wrapper for receiving a line of data from the server''' - untrusted_data = sys.stdin.buffer.readline() - return untrusted_data def check_range(val, upper): if not 1 <= val <= upper: raise ValueError -def mkdir_archive(): - if not os.path.exists(ARCHIVE_PATH): - os.mkdir(ARCHIVE_PATH) + +def unlink(path): + """Wrapper for Path.unlink(path, missing_ok=True)""" + try: + path.unlink() + except FileNotFoundError: + pass + + +async def cancel_task(task): + if not task.done(): + task.cancel() + await task + + +async def wait_proc(proc): + await proc.wait() + if proc.returncode: + raise subprocess.CalledProcessError + + +async def terminate_proc(proc): + if proc.returncode is None: + proc.terminate() + await proc.wait() ############################### -# Image-related +# Qrexec-related ############################### -def recv_img_measurements(): - '''Receive the width and height of a PDF page from server''' - untrusted_measurements = recvline_b().decode().split(' ', 2) - if len(untrusted_measurements) != 2: - raise ValueError +async def 
recv_b(proc, size): + """Qrexec wrapper for receiving binary data from the server""" + try: + untrusted_data = await proc.stdout.readexactly(size) + except asyncio.IncompleteReadError: + logging.error("server may have died...") + raise - return [int(untrusted_value) for untrusted_value in untrusted_measurements] + if not untrusted_data: + raise EOFError -def get_img_size(untrusted_width, untrusted_height): - untrusted_size = untrusted_width * untrusted_height * 3 + return untrusted_data + + +# TODO (?): Size limit for readline() +async def recvline_b(proc): + """Qrexec wrapper for receiving a line of binary data from the server""" + untrusted_data = await proc.stdout.readline() - if untrusted_size > MAX_IMG_SIZE: - die("Calculated image size is too large... aborting!") + if not untrusted_data: + raise EOFError + + return untrusted_data - return untrusted_size -def get_img_dimensions(): - depth = 8 +async def recv(proc, size): + """Convenience wrapper for receiving text data from the server""" + try: + untrusted_data = (await recv_b(proc, size)).decode() + except EOFError: + raise + except (AttributeError, UnicodeError): + logging.error("failed to decode received data!") + raise + + return untrusted_data + + +async def recvline(proc): + """Convenience wrapper for receiving a line of text data from the server""" + try: + untrusted_data = (await recvline_b(proc)).decode("ascii").rstrip() + except EOFError: + raise + except (AttributeError, UnicodeError): + logging.error("failed to decode received data!") + raise + + return untrusted_data + + +async def send(proc, data): + """Qrexec wrapper for sending data to the server""" + if isinstance(data, (str, int)): + data = str(data).encode() + + proc.stdin.write(data + b"\n") + try: + await proc.stdin.drain() + except BrokenPipeError: + # logging.error("server may have died") + raise + + + +############################### +# Image-related +############################### + try: untrusted_width, untrusted_height = 
recv_img_measurements() diff --git a/qpdf-convert-server.py b/qpdf-convert-server.py index 03afed3..7ef94b1 100755 --- a/qpdf-convert-server.py +++ b/qpdf-convert-server.py @@ -46,38 +46,83 @@ class ReceiveError(Exception): # Utilities ############################### + def info(msg, suffix=None): - '''Qrexec wrapper for displaying information + """Qrexec wrapper for displaying information on the client - `suffix` is typically only ever used when `msg` needs to overwrite - the line of the previous message (so as to imitate an updating - line). This is done by setting `suffix` to '\r'. - ''' + @suffix is really only ever used when @msg needs to overwrite the line of + the previous message (imitating an updating line). This is done by setting + @suffix to "\r". + """ print(msg, end=suffix, flush=True, file=sys.stderr) -def die(msg): - '''Qrexec wrapper for displaying error messages''' - logging.error(msg) - sys.exit(1) -def send(data): - '''Qrexec wrapper for sending text data to the client's STDOUT''' - print(data, flush=True) +def unlink(path): + """Wrapper for Path.unlink(path, missing_ok=True)""" + try: + path.unlink() + except FileNotFoundError: + pass + + +async def cancel_task(task): + if not task.done(): + task.cancel() + await task + + +async def wait_proc(proc): + await proc.wait() + if proc.returncode: + raise subprocess.CalledProcessError + + +async def terminate_proc(proc): + if proc.returncode is None: + proc.terminate() + await proc.wait() + + +############################### +# Qrexec-related +############################### + + +def recv_b(size): + """Qrexec wrapper for receiving binary data from the client""" + try: + untrusted_data = sys.stdin.buffer.read(size) + except EOFError as e: + raise ReceiveError from e + + if not untrusted_data: + raise ReceiveError + + return untrusted_data + + +def recvline(): + """Qrexec wrapper for receiving a line of text data from the client""" + try: + untrusted_data = sys.stdin.buffer.readline().decode("ascii") + 
except (AttributeError, EOFError, UnicodeError) as e: + raise ReceiveError from e + + return untrusted_data + def send_b(data): - '''Qrexec wrapper for sending binary data to the client's STDOUT''' + """Qrexec wrapper for sending binary data to the client's STDOUT""" + if isinstance(data, (str, int)): + data = str(data).encode() + sys.stdout.buffer.write(data) sys.stdout.buffer.flush() -def recv_b(size=None): - '''Qrexec wrapper for receiving data from a client''' - untrusted_data = sys.stdin.buffer.read(size) - return untrusted_data -def recvline_b(): - '''Qrexec wrapper for receiving a line of data from a client''' - untrusted_data = sys.stdin.buffer.readline() - return untrusted_data +def send(data): + """Qrexec wrapper for sending text data to the client's STDOUT""" + print(data, flush=True) ############################### From f66da6ef3a375be4eed08d0a992aa3a2a6ea821c Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 18 Apr 2020 12:53:17 -0500 Subject: [PATCH 38/92] client: Combine image dimension functions --- qpdf-convert-client.py | 62 +++++++++--------------------------------- 1 file changed, 13 insertions(+), 49 deletions(-) diff --git a/qpdf-convert-client.py b/qpdf-convert-client.py index 54e3e08..e38661a 100755 --- a/qpdf-convert-client.py +++ b/qpdf-convert-client.py @@ -190,61 +190,25 @@ async def send(proc, data): ############################### +async def get_img_dim(proc): try: - untrusted_width, untrusted_height = recv_img_measurements() - check_range(untrusted_width, MAX_IMG_WIDTH) - check_range(untrusted_height, MAX_IMG_HEIGHT) - except ValueError: - die("Invalid image geometry returned... 
aborting!") - - untrusted_size = get_img_size(untrusted_width, untrusted_height) - Dimensions = namedtuple('Dimensions', ['width', 'height', 'depth', 'size']) - - return Dimensions(width=untrusted_width, height=untrusted_height, - size=untrusted_size, depth=depth) - -def recv_rgb_file(rgb_path, untrusted_size): - # XXX: For some reason, this leaves us missing alot of bytes - # rcvd_bytes = input().encode('utf-8', 'surrogateescape') - # rcvd_bytes = rcvd_bytes[:dimensions.size] - - # XXX: Example of using PIL for performant PNG -> JPG. Maybe use this? - # png = Image.open(object.logo.path) - # png.load() # required for png.split() - # background = Image.new("RGB", png.size, (255, 255, 255)) - # background.paste(png, mask=png.split()[3]) # 3 is the alpha channel - # background.save('foo.jpg', 'JPEG', quality=80) - - with open(rgb_path, 'wb') as f: - # FIXME: Why doesn't this work in pure Python? - cmd = ['head', '-c', str(untrusted_size)] - subprocess.run(cmd, stdout=f, check=True) - - if os.path.getsize(f.name) != untrusted_size: - os.remove(rgb_path) - die('Invalid number of bytes in RGB file... aborting!') - -def rgb_to_png(rgb_path, png_path, untrusted_dimensions, page): - cmd = ['convert', '-size', - f'{untrusted_dimensions.width}x{untrusted_dimensions.height}', - '-depth', str(untrusted_dimensions.depth), f'rgb:{rgb_path}', - f'png:{png_path}'] + untrusted_w, untrusted_h = map(int, (await recvline(proc)).split(" ", 1)) + except (AttributeError, EOFError, UnicodeError) as e: + raise ReceiveError from e try: - subprocess.run(cmd, check=True) - except subprocess.CalledProcessError: - die(f'Page {page} conversion failed (RGB->PNG)... 
aborting!') + check_range(untrusted_w, MAX_IMG_WIDTH) + check_range(untrusted_h, MAX_IMG_HEIGHT) + except ValueError as e: + logging.error(f"invalid image measurements received {e}") + raise DimensionError from e else: - os.remove(rgb_path) + width = untrusted_w + height = untrusted_h -def convert_rgb_file(untrusted_dimensions, page): - with NamedTemporaryFile(prefix='qpdf-conversion-') as f: - rgb_path = f'{f.name}-{page}.rgb' - png_path = f'{f.name}-{page}.png' - recv_rgb_file(rgb_path, untrusted_dimensions.size) - rgb_to_png(rgb_path, png_path, untrusted_dimensions, page) + size = width * height * 3 - return png_path + return Dimensions(width=width, height=height, size=size, depth=DEPTH) ############################### From 4d1ac9ecca737e361a2f896467a8badab573fce1 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 18 Apr 2020 13:01:00 -0500 Subject: [PATCH 39/92] client, server: Update initial phase Before sending the PDF at the start, the client now sends a file size beforehand to avoid indicating it's done sending by closing its STDOUT. The initial phase is now asynchronous. --- qpdf-convert-client.py | 76 +++++++++++++----------------------------- qpdf-convert-server.py | 49 ++++++++++----------------- 2 files changed, 42 insertions(+), 83 deletions(-) diff --git a/qpdf-convert-client.py b/qpdf-convert-client.py index e38661a..5dfa519 100755 --- a/qpdf-convert-client.py +++ b/qpdf-convert-client.py @@ -215,62 +215,34 @@ async def get_img_dim(proc): # PDF-related ############################### -def recv_page_count(): + +async def send_pdf(loop, proc, path): try: - untrusted_page_count = int(recvline_b().decode()) - check_range(untrusted_page_count, MAX_PAGES) - except ValueError: - die("Invalid number of pages returned... 
aborting!") - - return untrusted_page_count - -def send_pdf(untrusted_pdf_path): - info(f'Sending {untrusted_pdf_path} to a Disposable VM...') - - # To process multiple files, we have to avoid closing STDIN since we can't - # reopen it afterwards without duplicating it to some new fd which doesn't - # seem ideal. Unfortunately, unless STDIN is being read from a terminal, I - # couldn't find a way to indicate to the server that we were done sending - # stuff. - # - # So, the current solution is to send file's size in advance so that the - # server can know when to stop reading from STDIN. The problem then becomes - # that the server may start its read after we send the PDF file. Thus, we - # make the client sleep so that the server can start its read beforehand. - send(os.path.getsize(untrusted_pdf_path)) - time.sleep(0.1) - - with open(untrusted_pdf_path, 'rb') as f: - send_b(f.read()) - -def archive_pdf(untrusted_pdf_path): - archived_pdf_path = f'{ARCHIVE_PATH}/{os.path.basename(untrusted_pdf_path)}' - os.rename(untrusted_pdf_path, archived_pdf_path) - info(f'Original PDF saved as: {archived_pdf_path}') - -def process_pdf(untrusted_pdf_path, untrusted_page_count): - page = 1 - images = [] - pdf_path = f'{os.path.splitext(untrusted_pdf_path)[0]}.trusted.pdf' - - info("Waiting for converted sample...") - - while page <= untrusted_page_count: - info(f'Receiving page {page}/{untrusted_page_count}...', '\r') - untrusted_dimensions = get_img_dimensions() - png_path = convert_rgb_file(untrusted_dimensions, page) - images.append(Image.open(png_path)) - page += 1 + filesize = (await loop.run_in_executor(None, path.stat)).st_size + await send(proc, filesize) + + data = await loop.run_in_executor(None, path.read_bytes) + await send(proc, data) + except BrokenPipeError: + raise + + +async def recv_pagenums(loop, proc): + try: + untrusted_pagenums = int(await recvline(proc)) + except (AttributeError, EOFError, UnicodeError, ValueError) as e: + raise ReceiveError from e + + 
try: + check_range(untrusted_pagenums, MAX_PAGES) + except ValueError as e: + logging.error("invalid number of pages received") + raise PageError from e else: - info('') + pagenums = untrusted_pagenums - # TODO (?): Save->delete PNGs in a loop to avoid storing all PNGs in memory. - images[0].save(pdf_path, 'PDF', resolution=100.0, save_all=True, - append_images=images[1:]) + return pagenums - for img in images: - img.close() - os.remove(img.filename) info(f'Converted PDF saved as: {pdf_path}') diff --git a/qpdf-convert-server.py b/qpdf-convert-server.py index 7ef94b1..7bb1e29 100755 --- a/qpdf-convert-server.py +++ b/qpdf-convert-server.py @@ -19,9 +19,8 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -from collections import namedtuple +import asyncio import logging -import os import subprocess import sys from collections import namedtuple @@ -197,41 +196,29 @@ def create_tmp_files(): # PDF-related ############################### -def recv_pdf(pdf_path): - filesize = int(recvline_b().decode()) - untrusted_data = recv_b(filesize) - with open(pdf_path, 'wb') as f: - f.write(untrusted_data) +def recv_pdf(): + try: + filesize = int(recvline()) + data = recv_b(filesize) + except (ReceiveError, ValueError): + raise + + return data -def get_page_count(pdf_path): - untrusted_pages = 0 - output = None - cmd = ['pdfinfo', pdf_path] +def get_pagenums(pdfpath): + cmd = ["pdfinfo", f"{pdfpath}"] try: output = subprocess.run(cmd, capture_output=True, check=True) except subprocess.CalledProcessError: - info(f'Probably not a PDF...') - else: - for line in output.stdout.decode().splitlines(): - if 'Pages:' in line: - untrusted_pages = int(line.split(':')[1]) - - return untrusted_pages - -def process_pdf(paths): - page = 1 - - untrusted_pages = get_page_count(paths.pdf) - send(untrusted_pages) - - while (page <= untrusted_pages): - pdf_to_png(page, paths.pdf, paths.png) - 
send_img_dimensions(paths.png) - png_to_rgb(page, paths.png, paths.rgb) - send_rgb_file(paths.rgb) - page += 1 + # TODO: Support converting JPGs and PNGs like the OG script + logging.error("file is probably not a PDF") + raise + + for line in output.stdout.decode("ascii").splitlines(): + if "Pages:" in line: + return int(line.split(":")[1]) ############################### From f4291c309b8cf1646b43c57dfa14fe8e9dbc1155 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 18 Apr 2020 13:07:40 -0500 Subject: [PATCH 40/92] client, server: Update to asyncio and new model To prepare for a potential future move to another format besides RGB or PNG, the intermediate files are now called Representations. Switching over to another format simply consists of changing the name of a parameter and adding/updating the convert command. Intermediate files are now stored in a temporary directory for easier cleanup. Instead of a wrapper creating a client-vm subprocess which will execute the client, we now have the client directly make the client-vm subprocesses. This simplifies quite a bit as we can easily terminate them if they act up, which becomes important since multiple files are now supported. 
--- qpdf-convert-client.py | 189 +++++++++++++++++++++++++++++++++++++---- qpdf-convert-server.py | 179 ++++++++++++++++++++++++-------------- 2 files changed, 284 insertions(+), 84 deletions(-) diff --git a/qpdf-convert-client.py b/qpdf-convert-client.py index 5dfa519..a127e76 100755 --- a/qpdf-convert-client.py +++ b/qpdf-convert-client.py @@ -29,8 +29,8 @@ from PIL import Image from tempfile import TemporaryDirectory -PROG_NAME = os.path.basename(sys.argv[0]) -ARCHIVE_PATH = f"{os.path.expanduser('~')}/QubesUntrustedPDFs" +PROG_NAME = Path(sys.argv[0]).name +ARCHIVE_PATH = Path(Path.home(), "QubesUntrustedPDFs") MAX_PAGES = 10000 MAX_IMG_WIDTH = 10000 @@ -244,31 +244,184 @@ async def recv_pagenums(loop, proc): return pagenums - info(f'Converted PDF saved as: {pdf_path}') +def archive(path): + archive_path = Path(ARCHIVE_PATH, path.name) + path.rename(archive_path) + print(f"Original PDF saved as: {archive_path}") -def process_pdfs(untrusted_pdf_paths): - # TODO (?): Add check for duplicate filenames - for untrusted_pdf_path in untrusted_pdf_paths: - send_pdf(untrusted_pdf_path) - untrusted_page_count = recv_page_count() - process_pdf(untrusted_pdf_path, untrusted_page_count) - archive_pdf(untrusted_pdf_path) - if untrusted_pdf_path != untrusted_pdf_paths[-1]: - info('') +############################### +# Representation-related +############################### + + +def get_rep(tmpdir, page, initial, final): + name = Path(tmpdir, str(page)) + return Representation(initial=name.with_suffix(f".{initial}"), + final=name.with_suffix(f".{final}")) + + +def save_rep(rep, data, size): + if rep.initial.write_bytes(data) != size: + logging.error("inconsistent initial representation file size") + raise RepresentationError + + +async def recv_rep(loop, proc, tmpdir, page): + """Receive initial representation from the server + + :param proc: Qrexec process to read STDIN from + :param path: File path which will store the initial representation + """ + rep = get_rep(tmpdir, 
page, "rgb", "png") + + try: + dim = await get_img_dim(proc) + data = await recv_b(proc, dim.size) + await loop.run_in_executor(None, save_rep, rep, data, dim.size) + except EOFError as e: + raise ReceiveError from e + except (DimensionError, ReceiveError, RepresentationError): + raise + + return rep, dim + + +async def start_convert(rep, dim): + cmd = ["convert", "-size", f"{dim.width}x{dim.height}", "-depth", + f"{dim.depth}", f"rgb:{rep.initial}", f"png:{rep.final}"] + + try: + proc = await asyncio.create_subprocess_exec(*cmd) + await wait_proc(proc) + except asyncio.CancelledError: + terminate_proc(proc) + + return proc + + +# TODO (?): Save->delete PNGs in loop to avoid storing all PNGs in memory. +# TODO: Add error handling +def combine_reps(save_path, reps): + with Image.open(reps[0].final) as first: + remaining = [Image.open(rep.final) for rep in reps[1:]] + first.save(save_path, "PDF", resolution=100, append_images=remaining, + save_all=True) + + for img in remaining: + img.close() ############################### # Main ############################### +async def receive(loop, proc, tmpdir): + procs = [] + reps = [] + + try: + pagenums = await recv_pagenums(loop, proc) + except (PageError, ReceiveError): + raise + + for page in range(1, pagenums + 1): + try: + rep, dim = await recv_rep(loop, proc, tmpdir, page) + except (DimensionError, ReceiveError, RepresentationError): + term_tasks = [terminate_proc(p) for p in procs] + await asyncio.gather(*term_tasks) + raise + + reps.append(rep) + procs.append(await start_convert(rep, dim)) + + return procs, reps + + +async def convert(loop, path, procs, reps): + for proc, rep in zip(procs, reps): + try: + await wait_proc(proc) + except subprocess.CalledProcessError: + logging.error("page conversion failed") + raise + + await loop.run_in_executor(None, unlink, rep.initial) + + save_path = path.with_suffix(".trusted.pdf") + await loop.run_in_executor(None, combine_reps, save_path, reps) + + return save_path + + +async 
def sanitize(loop, proc, path): + with TemporaryDirectory(prefix=f"qvm-sanitize-{proc.pid}-") as tmpdir: + try: + convert_procs, reps = await receive(loop, proc, tmpdir) + except (DimensionError, PageError, ReceiveError, RepresentationError): + raise + + try: + pdf = await convert(loop, path, convert_procs, reps) + except (asyncio.CancelledError, subprocess.CalledProcessError): + for proc in convert_procs: + terminate_proc(proc) + raise + + print(f"\nConverted PDF saved as: {pdf}") + + +# TODO: KeyboardInterrupt +async def run(loop, paths): + cmd = ["/usr/bin/qrexec-client-vm", "@dispvm", "qubes.PdfConvert"] + procs = [] + send_tasks = [] + sanitize_tasks = [] + + print("Sending files to Disposable VMs...") + + for path in paths: + proc = await asyncio.create_subprocess_exec(*cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE) + procs.append(proc) + send_tasks.append(asyncio.create_task(send_pdf(loop, proc, path))) + sanitize_tasks.append(asyncio.create_task(sanitize(loop, proc, path))) + + for proc, path, send_task, sanitize_task in zip(procs, paths, send_tasks, + sanitize_tasks): + try: + await asyncio.gather(send_task, + sanitize_task, + wait_proc(proc)) + except (BrokenPipeError, DimensionError, PageError, ReceiveError, + RepresentationError, subprocess.CalledProcessError): + await asyncio.gather(cancel_task(send_task), + cancel_task(sanitize_task)) + await terminate_proc(proc) + else: + await loop.run_in_executor(None, archive, path) + + def main(): - untrusted_pdf_paths = sys.argv[1:] - mkdir_archive() - process_pdfs(untrusted_pdf_paths) + logging.basicConfig(level=logging.INFO, format="%(message)s") + + if len(sys.argv) == 1: + print(f"usage: {PROG_NAME} [FILE ...]", file=sys.stderr) + sys.exit(1) + + paths = check_paths(sys.argv[1:]) + Path.mkdir(ARCHIVE_PATH, exist_ok=True) -if __name__ == '__main__': try: - main() + loop = asyncio.get_event_loop() + loop.run_until_complete(run(loop, paths)) except KeyboardInterrupt: - die("KeyboardInterrupt... 
Aborting!") + logging.error("Original file untouched.") + finally: + loop.run_until_complete(loop.shutdown_asyncgens()) + + +if __name__ == "__main__": + main() diff --git a/qpdf-convert-server.py b/qpdf-convert-server.py index 7bb1e29..2d041e8 100755 --- a/qpdf-convert-server.py +++ b/qpdf-convert-server.py @@ -125,71 +125,82 @@ def send(data): ############################### -# Image-related +# Rep-related ############################### -def send_img_dimensions(png_path): - cmd_width = ['identify', '-format', '%w', png_path] - cmd_height = ['identify', '-format', '%h', png_path] - try: - untrusted_width = subprocess.run(cmd_width, capture_output=True, - check=True).stdout.decode() - untrusted_height = subprocess.run(cmd_height, capture_output=True, - check=True).stdout.decode() - except subprocess.CalledProcessError: - die("Failed to gather dimensions... Aborting") +def get_rep(tmpdir, page, initial, final): + """Create temporary file for page representations""" + name = Path(tmpdir, f"{page}") + return Representation(initial=name.with_suffix(f".{initial}"), + final=name.with_suffix(f".{final}")) - send(f'{untrusted_width} {untrusted_height}') -def send_rgb_file(rgb_path): - with open(rgb_path, 'rb') as f: - data = f.read() - send_b(data) +############################### +# Image-related +############################### + -def pdf_to_png(pagenum, pdf_path, png_path): - png_filename = os.path.splitext(png_path)[0] - cmd = ['pdftocairo', pdf_path, '-png', '-f', str(pagenum), '-l', - str(pagenum), '-singlefile', png_filename] +async def get_irep(pdfpath, irep, page): + cmd = ["pdftocairo", f"{pdfpath}", "-png", "-f", f"{page}", "-l", + f"{page}", "-singlefile", f"{Path(irep.parent, irep.stem)}"] try: - subprocess.run(cmd, check=True) + proc = await asyncio.create_subprocess_exec(*cmd) + await wait_proc(proc) + except asyncio.CancelledError: + terminate_proc(proc) + raise except subprocess.CalledProcessError: - cmd = ['convert', pdf_path, f'png:{png_path}'] - try: - 
subprocess.run(cmd, check=True) - except subprocess.CalledProcessError: - die(f'Page {pagenum} conversion failed (PDF->PNG): {err}') + raise + -def png_to_rgb(pagenum, png_path, rgb_path): - depth = 8 - cmd = ['convert', png_path, '-depth', str(depth), f'rgb:{rgb_path}'] +async def get_img_dim(irep): + cmd = ["identify", "-format", "'%w %h'", f"{irep}"] try: - subprocess.run(cmd, check=True) + proc = await asyncio.create_subprocess_exec(*cmd, stdout=subprocess.PIPE) + output, _ = await proc.communicate() + except asyncio.CancelledError: + terminate_proc(proc) + raise except subprocess.CalledProcessError: - die(f'Page {pagenum} conversion failed (PNG->RGB): {err}') + raise + return output.decode("ascii").replace("'", "") -############################### -# File-related -############################### -def create_tmp_files(): - '''Create temporary file for storing page images and the untrusted PDF''' - Files = namedtuple('Files', ['pdf', 'png', 'rgb']) - suffixes = ('', '.png', '.rgb') - paths = [] +async def convert_rep(irep, frep): + cmd = ["convert", f"{irep}", "-depth", f"{DEPTH}", f"rgb:{frep}"] - for suffix in suffixes: - with NamedTemporaryFile(prefix='qpdf-conversion-', suffix=suffix) as f: - paths.append(f.name) + try: + proc = await asyncio.create_subprocess_exec(*cmd) + await wait_proc(proc) + except asyncio.CancelledError: + terminate_proc(proc) + raise + except subprocess.CalledProcessError: + raise - for path in paths: - with open(path, 'wb') as f: - f.write(b'') - return Files(pdf=paths[0], png=paths[1], rgb=paths[2]) +async def render(loop, page, pdfpath, rep): + try: + irep_task = asyncio.create_task(get_irep(pdfpath, rep.initial, page)) + await irep_task + + dim_task = asyncio.create_task(get_img_dim(rep.initial)) + convert_task = asyncio.create_task(convert_rep(rep.initial, rep.final)) + dim, _ = await asyncio.gather(dim_task, convert_task) + except subprocess.CalledProcessError: + raise + except asyncio.CancelledError: + cancel_task(irep_task) + 
cancel_task(dim_task) + cancel_task(convert_task) + finally: + await loop.run_in_executor(None, unlink, rep.initial) + + return (dim, rep.final) ############################### @@ -225,25 +236,61 @@ def get_pagenums(pdfpath): # Main ############################### + +async def run(loop, tmpdir, pdfpath, pagenums, max_tasks=0): + results = [] + tasks = [] + limit = max_tasks if max_tasks > 0 else pagenums + + for page in range(1, limit + 1): + rep = get_rep(tmpdir, page, "png", "rgb") + tasks.append(asyncio.create_task(render(loop, page, pdfpath, rep))) + + for task in tasks: + try: + results.append(await task) + except subprocess.CalledProcessError: + for task in tasks: + task.cancel() + + try: + await task + except asyncio.CancelledError: + pass + raise + + for dim, frep in results: + send(dim) + send_b(await loop.run_in_executor(None, frep.read_bytes)) + await loop.run_in_executor(None, unlink, frep) + + def main(): - paths = create_tmp_files() - - # FIXME: - # When no more PDFs are available to process, the server will exit in - # recv() (called in recv_pdf()) with an EOFError. While this works - # perfectly fine, it is kinda ugly; successful runs shouldn't exit with an - # error, no? - # - # One solution would be to have the client initially send a - # space-delimited string containing the sizes of each file. Then, the - # server can turn that into an array and use the array's length as the - # number of times to loop. - while True: - recv_pdf(paths.pdf) - process_pdf(paths) - -if __name__ == '__main__': + logging.basicConfig(format="DispVM: %(message)s") + try: - main() - except KeyboardInterrupt: - die("KeyboardInterrupt... 
Aborting!") + data = recv_pdf() + except (ReceiveError, ValueError): + sys.exit(1) + + with TemporaryDirectory(prefix="qvm-sanitize-") as tmpdir: + pdfpath = Path(tmpdir, "original") + pdfpath.write_bytes(data) + + try: + pagenums = get_pagenums(pdfpath) + send(pagenums) + except subprocess.CalledProcessError: + sys.exit(1) + + try: + loop = asyncio.get_event_loop() + loop.run_until_complete(run(loop, tmpdir, pdfpath, pagenums, 0)) + except subprocess.CalledProcessError: + sys.exit(1) + finally: + loop.run_until_complete(loop.shutdown_asyncgens()) + + +if __name__ == "__main__": + main() From 5979f9f13c4a07c5329b825327ba8bd1e9c68bed Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 18 Apr 2020 13:18:45 -0500 Subject: [PATCH 41/92] client, server: Move to pdf-converter directory --- Makefile | 20 +-- debian/qubes-pdf-converter.install | 1 - .../client.py | 0 .../server.py | 0 pdf-converter/test.py | 20 +++ pdf-converter/tests/__init__.py | 1 + pdf-converter/tests/tests.py | 141 ++++++++++++++++++ qvm-convert-pdf.gnome | 2 +- rpm_spec/qpdf-converter.spec.in | 1 - test.py | 9 ++ 10 files changed, 180 insertions(+), 15 deletions(-) rename qpdf-convert-client.py => pdf-converter/client.py (100%) rename qpdf-convert-server.py => pdf-converter/server.py (100%) create mode 100644 pdf-converter/test.py create mode 100644 pdf-converter/tests/__init__.py create mode 100644 pdf-converter/tests/tests.py create mode 100644 test.py diff --git a/Makefile b/Makefile index 2e2b8dc..76c3c04 100644 --- a/Makefile +++ b/Makefile @@ -23,15 +23,12 @@ RPMS_DIR=rpm/ VERSION := $(shell cat version) help: - @echo "Qubes addons main Makefile:" ;\ - echo "make rpms <--- make rpms and sign them";\ - echo; \ - echo "make clean <--- clean all the binary files";\ - echo "make update-repo-current <-- copy newly generated rpms to qubes yum repo";\ - echo "make update-repo-current-testing <-- same, but for -current-testing repo";\ - echo "make update-repo-unstable <-- same, but to -testing repo";\ 
- echo "make update-repo-installer -- copy dom0 rpms to installer repo" - @exit 0; + @echo "make rpms -- generate signed rpm packages" + @echo "make update-repo-current -- copy newly generated rpms to qubes yum repo" + @echo "make update-repo-current-testing -- same, but for -current-testing repo" + @echo "make update-repo-unstable -- same, but to -testing repo" + @echo "make update-repo-installer -- copy dom0 rpms to installer repo" + @echo "make clean -- clean up binary files" rpms: rpms-vm @@ -78,9 +75,8 @@ build: install-vm: make install -C doc - install -D qvm-convert-pdf.py $(DESTDIR)/usr/bin/qvm-convert-pdf - install -D qpdf-convert-client.py $(DESTDIR)/usr/lib/qubes/qpdf-convert-client - install -D qpdf-convert-server.py $(DESTDIR)/usr/lib/qubes/qpdf-convert-server + install -D pdf-converter/client.py $(DESTDIR)/usr/bin/qvm-convert-pdf + install -D pdf-converter/server.py $(DESTDIR)/usr/lib/qubes/qpdf-convert-server install -d $(DESTDIR)/etc/qubes-rpc ln -s ../../usr/lib/qubes/qpdf-convert-server $(DESTDIR)/etc/qubes-rpc/qubes.PdfConvert install -D qvm-convert-pdf.gnome $(DESTDIR)/usr/lib/qubes/qvm-convert-pdf.gnome diff --git a/debian/qubes-pdf-converter.install b/debian/qubes-pdf-converter.install index 7a6a94b..3b02fd9 100644 --- a/debian/qubes-pdf-converter.install +++ b/debian/qubes-pdf-converter.install @@ -1,4 +1,3 @@ -usr/lib/qubes/qpdf-convert-client usr/lib/qubes/qpdf-convert-server etc/qubes-rpc/qubes.PdfConvert usr/bin/qvm-convert-pdf diff --git a/qpdf-convert-client.py b/pdf-converter/client.py similarity index 100% rename from qpdf-convert-client.py rename to pdf-converter/client.py diff --git a/qpdf-convert-server.py b/pdf-converter/server.py similarity index 100% rename from qpdf-convert-server.py rename to pdf-converter/server.py diff --git a/pdf-converter/test.py b/pdf-converter/test.py new file mode 100644 index 0000000..c821229 --- /dev/null +++ b/pdf-converter/test.py @@ -0,0 +1,20 @@ +import asyncio + +async def foobar(): + try: + 
await asyncio.sleep(5) + except asyncio.CancelledError: + print("foobar() got cancelled") + +async def run(): + task = asyncio.create_task(foobar()) + await asyncio.sleep(1) + task.cancel() + await task + print("done") + +def main(): + loop = asyncio.get_event_loop() + loop.run_until_complete(run()) + +main() diff --git a/pdf-converter/tests/__init__.py b/pdf-converter/tests/__init__.py new file mode 100644 index 0000000..a178c84 --- /dev/null +++ b/pdf-converter/tests/__init__.py @@ -0,0 +1 @@ +# pylint: no-file diff --git a/pdf-converter/tests/tests.py b/pdf-converter/tests/tests.py new file mode 100644 index 0000000..729db20 --- /dev/null +++ b/pdf-converter/tests/tests.py @@ -0,0 +1,141 @@ +# vim: fileencoding=utf-8 + +# +# The Qubes OS Project, https://www.qubes-os.org/ +# +# Copyright (C) 2016 +# Marek Marczykowski-Górecki +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+# +import os +import unittest + +import itertools +import qubes.tests.extra + + +# noinspection PyPep8Naming +class TC_00_PDFConverter(qubes.tests.extra.ExtraTestCase): + circle_svg = """ + + + + + + + + {text} + + + """ + + def setUp(self): + if 'whonix' in self.template: + self.skipTest('whonix do not have pdf converter installed') + super(TC_00_PDFConverter, self).setUp() + # noinspection PyAttributeOutsideInit + self.vm = self.create_vms(["vm"])[0] + self.vm.start() + + def create_pdf(self, filename, content): + '''Create PDF file with given (textual) content + + :param filename: output filename + :param content: content to be placed on each page (list of str) + ''' + for (page_content, page_no) in zip(content, itertools.count()): + p = self.vm.run( + 'cat > /tmp/page{no:04}.svg && ' + 'convert /tmp/page{no:04}.svg /tmp/page{no:04}.pdf 2>&1'.format( + no=page_no), passio_popen=True) + (stdout, _) = p.communicate(self.circle_svg.format( + text=page_content).encode()) + if p.returncode != 0: + self.skipTest('failed to create test page: {}'.format(stdout)) + + p = self.vm.run('pdfunite /tmp/page*.pdf "{}" 2>&1'.format(filename), + passio_popen=True) + (stdout, _) = p.communicate() + if p.returncode != 0: + self.skipTest('failed to create test pdf: {}'.format(stdout)) + + def get_pdfinfo(self, filename): + p = self.vm.run('pdfinfo "{}"'.format(filename), passio_popen=True) + (stdout, _) = p.communicate() + self.assertEquals(p.returncode, 0, + "Failed to get pdfinfo of {}".format(filename)) + pdfinfo = {} + for line in stdout.decode().splitlines(): + k, v = str(line).split(':', 1) + pdfinfo[k] = v.strip() + return pdfinfo + + def assertCorrectlyTransformed(self, orig_filename, trusted_filename): + self.assertEquals( + self.vm.run('test -r "{}"'.format(trusted_filename), wait=True), 0) + # TODO: somehow verify content of generated file, for now perform + # some heuristics + orig_info = self.get_pdfinfo(orig_filename) + trusted_info = 
self.get_pdfinfo(trusted_filename) + # 1. check number of pages + self.assertEqual(trusted_info['Pages'], orig_info['Pages']) + + untrusted_backup = 'QubesUntrustedPDFs/{}'.format(os.path.basename( + trusted_filename.replace('.trusted', ''))) + self.assertEquals( + self.vm.run('test -r "{}"'.format(untrusted_backup), wait=True), 0) + self.assertEquals(self.vm.run( + 'diff "{}" "{}"'.format(orig_filename, untrusted_backup), wait=True), 0) + + def test_000_one_page(self): + self.create_pdf('test.pdf', ['This is test']) + p = self.vm.run('cp test.pdf orig.pdf; qvm-convert-pdf test.pdf 2>&1', + passio_popen=True) + (stdout, _) = p.communicate() + self.assertEquals(p.returncode, 0, 'qvm-convert-pdf failed: {}'.format(stdout)) + self.assertCorrectlyTransformed('orig.pdf', 'test.trusted.pdf') + + def test_001_two_pages(self): + self.create_pdf('test.pdf', ['This is test', 'Second page']) + p = self.vm.run('cp test.pdf orig.pdf; qvm-convert-pdf test.pdf 2>&1', + passio_popen=True) + (stdout, _) = p.communicate() + self.assertEquals(p.returncode, 0, 'qvm-convert-pdf failed: {}'.format(stdout)) + self.assertCorrectlyTransformed('orig.pdf', 'test.trusted.pdf') + + def test_002_500_pages(self): + self.create_pdf('test.pdf', ['This is test'] * 500) + p = self.vm.run('cp test.pdf orig.pdf; qvm-convert-pdf test.pdf 2>&1', + passio_popen=True) + (stdout, _) = p.communicate() + self.assertEquals(p.returncode, 0, 'qvm-convert-pdf failed: {}'.format(stdout)) + self.assertCorrectlyTransformed('orig.pdf', 'test.trusted.pdf') + + def test_003_filename_with_spaces(self): + self.create_pdf('test with spaces.pdf', ['This is test']) + p = self.vm.run( + 'cp "test with spaces.pdf" orig.pdf; ' + 'qvm-convert-pdf "test with spaces.pdf" 2>&1', + passio_popen=True) + (stdout, _) = p.communicate() + self.assertEquals(p.returncode, 0, 'qvm-convert-pdf failed: {}'.format(stdout)) + self.assertCorrectlyTransformed('orig.pdf', + 'test with spaces.trusted.pdf') + + +def list_tests(): + tests = 
[TC_00_PDFConverter] + return tests diff --git a/qvm-convert-pdf.gnome b/qvm-convert-pdf.gnome index c1e8561..f1801d0 100755 --- a/qvm-convert-pdf.gnome +++ b/qvm-convert-pdf.gnome @@ -26,4 +26,4 @@ fi export PROGRESS_FOR_GUI="yes" -/usr/lib/qubes/qrexec_client_vm '$dispvm' qubes.PdfConvert /usr/lib/qubes/qpdf-convert-client "$@" | zenity --progress --text="Converting PDF using Disposable VM..." --auto-close --auto-kill +/usr/bin/qvm-convert-pdf "$@" | zenity --progress --text="Converting PDF using Disposable VM..." --auto-close --auto-kill diff --git a/rpm_spec/qpdf-converter.spec.in b/rpm_spec/qpdf-converter.spec.in index f69eab2..0ae7fca 100644 --- a/rpm_spec/qpdf-converter.spec.in +++ b/rpm_spec/qpdf-converter.spec.in @@ -56,7 +56,6 @@ rm -rf $RPM_BUILD_ROOT %files %defattr(-,root,root,-) /etc/qubes-rpc/qubes.PdfConvert -/usr/lib/qubes/qpdf-convert-client /usr/lib/qubes/qpdf-convert-server /usr/lib/qubes/qvm-convert-pdf.gnome /usr/bin/qvm-convert-pdf diff --git a/test.py b/test.py new file mode 100644 index 0000000..a49db51 --- /dev/null +++ b/test.py @@ -0,0 +1,9 @@ +import asyncio + +async def run(): + +def main(): + loop = asyncio.get_event_loop() + loop.run_until_complete(run()) + +main() From 23a45e535ed3237d11d0d0f16abe3296b5095e36 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 18 Apr 2020 13:21:18 -0500 Subject: [PATCH 42/92] meta: Remove extra file --- pdf-converter/test.py | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 pdf-converter/test.py diff --git a/pdf-converter/test.py b/pdf-converter/test.py deleted file mode 100644 index c821229..0000000 --- a/pdf-converter/test.py +++ /dev/null @@ -1,20 +0,0 @@ -import asyncio - -async def foobar(): - try: - await asyncio.sleep(5) - except asyncio.CancelledError: - print("foobar() got cancelled") - -async def run(): - task = asyncio.create_task(foobar()) - await asyncio.sleep(1) - task.cancel() - await task - print("done") - -def main(): - loop = asyncio.get_event_loop() - 
loop.run_until_complete(run()) - -main() From 456ac9a819c39022fbff17c41c64506e70812cde Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 18 Apr 2020 13:21:55 -0500 Subject: [PATCH 43/92] tests: Moved to pdf-converter/tests --- qubespdfconverter/__init__.py | 1 - qubespdfconverter/tests.py | 141 ---------------------------------- 2 files changed, 142 deletions(-) delete mode 100644 qubespdfconverter/__init__.py delete mode 100644 qubespdfconverter/tests.py diff --git a/qubespdfconverter/__init__.py b/qubespdfconverter/__init__.py deleted file mode 100644 index a178c84..0000000 --- a/qubespdfconverter/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# pylint: no-file diff --git a/qubespdfconverter/tests.py b/qubespdfconverter/tests.py deleted file mode 100644 index 729db20..0000000 --- a/qubespdfconverter/tests.py +++ /dev/null @@ -1,141 +0,0 @@ -# vim: fileencoding=utf-8 - -# -# The Qubes OS Project, https://www.qubes-os.org/ -# -# Copyright (C) 2016 -# Marek Marczykowski-Górecki -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
-# -import os -import unittest - -import itertools -import qubes.tests.extra - - -# noinspection PyPep8Naming -class TC_00_PDFConverter(qubes.tests.extra.ExtraTestCase): - circle_svg = """ - - - - - - - - {text} - - - """ - - def setUp(self): - if 'whonix' in self.template: - self.skipTest('whonix do not have pdf converter installed') - super(TC_00_PDFConverter, self).setUp() - # noinspection PyAttributeOutsideInit - self.vm = self.create_vms(["vm"])[0] - self.vm.start() - - def create_pdf(self, filename, content): - '''Create PDF file with given (textual) content - - :param filename: output filename - :param content: content to be placed on each page (list of str) - ''' - for (page_content, page_no) in zip(content, itertools.count()): - p = self.vm.run( - 'cat > /tmp/page{no:04}.svg && ' - 'convert /tmp/page{no:04}.svg /tmp/page{no:04}.pdf 2>&1'.format( - no=page_no), passio_popen=True) - (stdout, _) = p.communicate(self.circle_svg.format( - text=page_content).encode()) - if p.returncode != 0: - self.skipTest('failed to create test page: {}'.format(stdout)) - - p = self.vm.run('pdfunite /tmp/page*.pdf "{}" 2>&1'.format(filename), - passio_popen=True) - (stdout, _) = p.communicate() - if p.returncode != 0: - self.skipTest('failed to create test pdf: {}'.format(stdout)) - - def get_pdfinfo(self, filename): - p = self.vm.run('pdfinfo "{}"'.format(filename), passio_popen=True) - (stdout, _) = p.communicate() - self.assertEquals(p.returncode, 0, - "Failed to get pdfinfo of {}".format(filename)) - pdfinfo = {} - for line in stdout.decode().splitlines(): - k, v = str(line).split(':', 1) - pdfinfo[k] = v.strip() - return pdfinfo - - def assertCorrectlyTransformed(self, orig_filename, trusted_filename): - self.assertEquals( - self.vm.run('test -r "{}"'.format(trusted_filename), wait=True), 0) - # TODO: somehow verify content of generated file, for now perform - # some heuristics - orig_info = self.get_pdfinfo(orig_filename) - trusted_info = 
self.get_pdfinfo(trusted_filename) - # 1. check number of pages - self.assertEqual(trusted_info['Pages'], orig_info['Pages']) - - untrusted_backup = 'QubesUntrustedPDFs/{}'.format(os.path.basename( - trusted_filename.replace('.trusted', ''))) - self.assertEquals( - self.vm.run('test -r "{}"'.format(untrusted_backup), wait=True), 0) - self.assertEquals(self.vm.run( - 'diff "{}" "{}"'.format(orig_filename, untrusted_backup), wait=True), 0) - - def test_000_one_page(self): - self.create_pdf('test.pdf', ['This is test']) - p = self.vm.run('cp test.pdf orig.pdf; qvm-convert-pdf test.pdf 2>&1', - passio_popen=True) - (stdout, _) = p.communicate() - self.assertEquals(p.returncode, 0, 'qvm-convert-pdf failed: {}'.format(stdout)) - self.assertCorrectlyTransformed('orig.pdf', 'test.trusted.pdf') - - def test_001_two_pages(self): - self.create_pdf('test.pdf', ['This is test', 'Second page']) - p = self.vm.run('cp test.pdf orig.pdf; qvm-convert-pdf test.pdf 2>&1', - passio_popen=True) - (stdout, _) = p.communicate() - self.assertEquals(p.returncode, 0, 'qvm-convert-pdf failed: {}'.format(stdout)) - self.assertCorrectlyTransformed('orig.pdf', 'test.trusted.pdf') - - def test_002_500_pages(self): - self.create_pdf('test.pdf', ['This is test'] * 500) - p = self.vm.run('cp test.pdf orig.pdf; qvm-convert-pdf test.pdf 2>&1', - passio_popen=True) - (stdout, _) = p.communicate() - self.assertEquals(p.returncode, 0, 'qvm-convert-pdf failed: {}'.format(stdout)) - self.assertCorrectlyTransformed('orig.pdf', 'test.trusted.pdf') - - def test_003_filename_with_spaces(self): - self.create_pdf('test with spaces.pdf', ['This is test']) - p = self.vm.run( - 'cp "test with spaces.pdf" orig.pdf; ' - 'qvm-convert-pdf "test with spaces.pdf" 2>&1', - passio_popen=True) - (stdout, _) = p.communicate() - self.assertEquals(p.returncode, 0, 'qvm-convert-pdf failed: {}'.format(stdout)) - self.assertCorrectlyTransformed('orig.pdf', - 'test with spaces.trusted.pdf') - - -def list_tests(): - tests = 
[TC_00_PDFConverter] - return tests From a0cffae09543984a9d35ee8b19b60b179bf4ba5d Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 18 Apr 2020 13:25:41 -0500 Subject: [PATCH 44/92] meta: Remove extra file --- test.py | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index a49db51..0000000 --- a/test.py +++ /dev/null @@ -1,9 +0,0 @@ -import asyncio - -async def run(): - -def main(): - loop = asyncio.get_event_loop() - loop.run_until_complete(run()) - -main() From d7dade56f06b002e819d3303b4280bce24bad5a6 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 18 Apr 2020 13:47:33 -0500 Subject: [PATCH 45/92] meta: Remove wrapper --- qvm-convert-pdf.py | 58 ---------------------------------------------- 1 file changed, 58 deletions(-) delete mode 100755 qvm-convert-pdf.py diff --git a/qvm-convert-pdf.py b/qvm-convert-pdf.py deleted file mode 100755 index dc34f81..0000000 --- a/qvm-convert-pdf.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python3 -# -# The Qubes OS Project, http://www.qubes-os.org -# -# Copyright (C) 2013 Joanna Rutkowska -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
-# -# - -import logging -import os -import sys - -PROG_NAME = os.path.basename(sys.argv[0]) -QREXEC_CLIENT = '/usr/bin/qrexec-client-vm' - -logging.basicConfig(format='%(message)s', stream=sys.stderr) - -def die(msg): - '''Qrexec wrapper for displaying error messages''' - logging.error(msg) - sys.exit(1) - -def check_pdf_paths(untrusted_paths): - for untrusted_path in untrusted_paths: - if not os.path.exists(untrusted_path): - die(f'{untrusted_path}: No such file') - elif not os.path.isfile(untrusted_path): - die(f'{untrusted_path}: Not a regular file') - -def main(): - if len(sys.argv) == 1: - die(f'usage: {PROG_NAME} [FILE ...]') - - untrusted_pdf_paths = [os.path.abspath(path) for path in sys.argv[1:]] - check_pdf_paths(untrusted_pdf_paths) - - # TODO: Handle os.execl() error (maybe with os._exit(127)) - cmd = [QREXEC_CLIENT, '$dispvm', 'qubes.PdfConvert', - '/usr/lib/qubes/qpdf-convert-client', *untrusted_pdf_paths] - os.execvp(QREXEC_CLIENT, cmd) - -if __name__ == '__main__': - # No need to wrap this in a try block since we never return from execvp() - main() From ee7420d431277a007f2ffbfaed52b9c3070a0fa0 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 18 Apr 2020 21:42:01 -0500 Subject: [PATCH 46/92] client: Fix error handling for readexactly() --- pdf-converter/client.py | 64 ++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/pdf-converter/client.py b/pdf-converter/client.py index a127e76..7865257 100755 --- a/pdf-converter/client.py +++ b/pdf-converter/client.py @@ -124,12 +124,9 @@ async def recv_b(proc, size): """Qrexec wrapper for receiving binary data from the server""" try: untrusted_data = await proc.stdout.readexactly(size) - except asyncio.IncompleteReadError: - logging.error("server may have died...") - raise - - if not untrusted_data: - raise EOFError + except asyncio.IncompleteReadError as e: + # got EOF before @size bytes received + raise ReceiveError from e return untrusted_data @@ 
-140,30 +137,32 @@ async def recvline_b(proc): untrusted_data = await proc.stdout.readline() if not untrusted_data: - raise EOFError + logging.error("server may have died...") + raise ReceiveError return untrusted_data -async def recv(proc, size): - """Convenience wrapper for receiving text data from the server""" - try: - untrusted_data = (await recv_b(proc, size)).decode() - except EOFError: - raise - except (AttributeError, UnicodeError): - logging.error("failed to decode received data!") - raise +# async def recv(proc, size): + # """Convenience wrapper for receiving text data from the server""" + # try: + # untrusted_data = (await recv_b(proc, size)).decode() + # except ReceiveError + # raise + # except (AttributeError, UnicodeError): + # logging.error("failed to decode received data!") + # raise - return untrusted_data + # return untrusted_data async def recvline(proc): """Convenience wrapper for receiving a line of text data from the server""" try: untrusted_data = (await recvline_b(proc)).decode("ascii").rstrip() - except EOFError: - raise + except EOFError as e: + logging.error("server may have died...") + raise ReceiveError from e except (AttributeError, UnicodeError): logging.error("failed to decode received data!") raise @@ -193,7 +192,7 @@ async def send(proc, data): async def get_img_dim(proc): try: untrusted_w, untrusted_h = map(int, (await recvline(proc)).split(" ", 1)) - except (AttributeError, EOFError, UnicodeError) as e: + except (AttributeError, EOFError, UnicodeError, ValueError) as e: raise ReceiveError from e try: @@ -261,12 +260,6 @@ def get_rep(tmpdir, page, initial, final): final=name.with_suffix(f".{final}")) -def save_rep(rep, data, size): - if rep.initial.write_bytes(data) != size: - logging.error("inconsistent initial representation file size") - raise RepresentationError - - async def recv_rep(loop, proc, tmpdir, page): """Receive initial representation from the server @@ -278,12 +271,25 @@ async def recv_rep(loop, proc, tmpdir, 
page): try: dim = await get_img_dim(proc) data = await recv_b(proc, dim.size) - await loop.run_in_executor(None, save_rep, rep, data, dim.size) - except EOFError as e: - raise ReceiveError from e except (DimensionError, ReceiveError, RepresentationError): raise + # @size bytes must have been received if we're here, so a check on how much + # is written to @rep.initial isn't needed. + # + # Also, since the server sends the dimensions and contents of each page in a + # simple loop, if the server only sends @size - N bytes for a particular + # page, either: + # + # 1. We'll eventually get @size bytes later on as recv_b() will mistake the + # other pages' dimensions and contents as part of the current page's + # contents and we end up with a malformed irep, which we'll handle later + # during representation's conversion. + # + # 2. The server exits (the loop is the last thing it does) and we get an + # EOF, causing a ReceiveError. + await loop.run_in_executor(None, rep.initial.write_bytes, data) + return rep, dim From e057643e2239c430d68e3bc6ca78d3ee92806c70 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 18 Apr 2020 21:50:32 -0500 Subject: [PATCH 47/92] client: Remove extra byte for send() --- pdf-converter/client.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pdf-converter/client.py b/pdf-converter/client.py index 7865257..69de5c0 100755 --- a/pdf-converter/client.py +++ b/pdf-converter/client.py @@ -175,11 +175,10 @@ async def send(proc, data): if isinstance(data, (str, int)): data = str(data).encode() - proc.stdin.write(data + b"\n") + proc.stdin.write(data) try: await proc.stdin.drain() except BrokenPipeError: - # logging.error("server may have died") raise @@ -218,7 +217,7 @@ async def get_img_dim(proc): async def send_pdf(loop, proc, path): try: filesize = (await loop.run_in_executor(None, path.stat)).st_size - await send(proc, filesize) + await send(proc, f"{filesize}\n") data = await loop.run_in_executor(None, path.read_bytes) 
await send(proc, data) From 1c4f61dd0d51336413aa9c1c2999c6ece60e41ce Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 18 Apr 2020 21:51:19 -0500 Subject: [PATCH 48/92] client: Remove unneeded PID prefix for tmpdir --- pdf-converter/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdf-converter/client.py b/pdf-converter/client.py index 69de5c0..962d6cc 100755 --- a/pdf-converter/client.py +++ b/pdf-converter/client.py @@ -361,7 +361,7 @@ async def convert(loop, path, procs, reps): async def sanitize(loop, proc, path): - with TemporaryDirectory(prefix=f"qvm-sanitize-{proc.pid}-") as tmpdir: + with TemporaryDirectory(prefix=f"qvm-sanitize-") as tmpdir: try: convert_procs, reps = await receive(loop, proc, tmpdir) except (DimensionError, PageError, ReceiveError, RepresentationError): From 523aab84f730b9200773352bb1d54b4507e596ac Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 18 Apr 2020 21:54:15 -0500 Subject: [PATCH 49/92] client: Remove unused recv() --- pdf-converter/client.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/pdf-converter/client.py b/pdf-converter/client.py index 962d6cc..f8863b3 100755 --- a/pdf-converter/client.py +++ b/pdf-converter/client.py @@ -131,7 +131,6 @@ async def recv_b(proc, size): return untrusted_data -# TODO (?): Size limit for readline() async def recvline_b(proc): """Qrexec wrapper for receiving a line of binary data from the server""" untrusted_data = await proc.stdout.readline() @@ -143,19 +142,6 @@ async def recvline_b(proc): return untrusted_data -# async def recv(proc, size): - # """Convenience wrapper for receiving text data from the server""" - # try: - # untrusted_data = (await recv_b(proc, size)).decode() - # except ReceiveError - # raise - # except (AttributeError, UnicodeError): - # logging.error("failed to decode received data!") - # raise - - # return untrusted_data - - async def recvline(proc): """Convenience wrapper for receiving a line of text data from the 
server""" try: From 8dd8d6f1c4d71760ade67f33a06ae05c91c1f070 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 18 Apr 2020 22:07:22 -0500 Subject: [PATCH 50/92] server: Remove uneeded quoting in identify command --- pdf-converter/server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pdf-converter/server.py b/pdf-converter/server.py index 2d041e8..11c8acd 100755 --- a/pdf-converter/server.py +++ b/pdf-converter/server.py @@ -156,7 +156,7 @@ async def get_irep(pdfpath, irep, page): async def get_img_dim(irep): - cmd = ["identify", "-format", "'%w %h'", f"{irep}"] + cmd = ["identify", "-format", "%w %h", f"{irep}"] try: proc = await asyncio.create_subprocess_exec(*cmd, stdout=subprocess.PIPE) @@ -167,7 +167,7 @@ async def get_img_dim(irep): except subprocess.CalledProcessError: raise - return output.decode("ascii").replace("'", "") + return output.decode("ascii") async def convert_rep(irep, frep): From 11d610a40ca67a12b478bd28fa7c4847fdaf66df Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Mon, 20 Apr 2020 14:25:36 -0500 Subject: [PATCH 51/92] Rename source directory --- Makefile | 4 +- .../client.py | 45 +++++++++---------- .../server.py | 12 ++--- .../tests/__init__.py | 0 .../tests/tests.py | 0 5 files changed, 30 insertions(+), 31 deletions(-) rename {pdf-converter => qubespdfconverter}/client.py (91%) rename {pdf-converter => qubespdfconverter}/server.py (97%) rename {pdf-converter => qubespdfconverter}/tests/__init__.py (100%) rename {pdf-converter => qubespdfconverter}/tests/tests.py (100%) diff --git a/Makefile b/Makefile index 76c3c04..05bc7fe 100644 --- a/Makefile +++ b/Makefile @@ -75,8 +75,8 @@ build: install-vm: make install -C doc - install -D pdf-converter/client.py $(DESTDIR)/usr/bin/qvm-convert-pdf - install -D pdf-converter/server.py $(DESTDIR)/usr/lib/qubes/qpdf-convert-server + install -D qubespdfconverter/client.py $(DESTDIR)/usr/bin/qvm-convert-pdf + install -D qubespdfconverter/server.py 
$(DESTDIR)/usr/lib/qubes/qpdf-convert-server install -d $(DESTDIR)/etc/qubes-rpc ln -s ../../usr/lib/qubes/qpdf-convert-server $(DESTDIR)/etc/qubes-rpc/qubes.PdfConvert install -D qvm-convert-pdf.gnome $(DESTDIR)/usr/lib/qubes/qvm-convert-pdf.gnome diff --git a/pdf-converter/client.py b/qubespdfconverter/client.py similarity index 91% rename from pdf-converter/client.py rename to qubespdfconverter/client.py index f8863b3..e1230a2 100755 --- a/pdf-converter/client.py +++ b/qubespdfconverter/client.py @@ -52,8 +52,7 @@ class PageError(ValueError): class ReceiveError(Exception): - """ - """ + """Raise if an error occurs when reading from STDOUT""" class RepresentationError(ValueError): @@ -202,11 +201,9 @@ async def get_img_dim(proc): async def send_pdf(loop, proc, path): try: - filesize = (await loop.run_in_executor(None, path.stat)).st_size - await send(proc, f"{filesize}\n") - data = await loop.run_in_executor(None, path.read_bytes) await send(proc, data) + proc.stdin.write_eof() except BrokenPipeError: raise @@ -248,6 +245,21 @@ def get_rep(tmpdir, page, initial, final): async def recv_rep(loop, proc, tmpdir, page): """Receive initial representation from the server + @size bytes is guaranteed to have ben received by the time we write @data to + @rep.initial, so a check on how much is written to isn't needed. + + Also, since the server sends the dimensions and contents of each page in a + simple loop, if the server only sends @size - N bytes for a particular + page, either: + + 1. We'll eventually get @size bytes later on as recv_b() will mistake the + other pages' dimensions and contents as part of the current page's + contents and we end up with a malformed irep, which we'll handle later + during representation's conversion. + + 2. The server exits (the loop is the last thing it does) and we get an + EOF, causing a ReceiveError. 
+ :param proc: Qrexec process to read STDIN from :param path: File path which will store the initial representation """ @@ -259,20 +271,6 @@ async def recv_rep(loop, proc, tmpdir, page): except (DimensionError, ReceiveError, RepresentationError): raise - # @size bytes must have been received if we're here, so a check on how much - # is written to @rep.initial isn't needed. - # - # Also, since the server sends the dimensions and contents of each page in a - # simple loop, if the server only sends @size - N bytes for a particular - # page, either: - # - # 1. We'll eventually get @size bytes later on as recv_b() will mistake the - # other pages' dimensions and contents as part of the current page's - # contents and we end up with a malformed irep, which we'll handle later - # during representation's conversion. - # - # 2. The server exits (the loop is the last thing it does) and we get an - # EOF, causing a ReceiveError. await loop.run_in_executor(None, rep.initial.write_bytes, data) return rep, dim @@ -280,7 +278,7 @@ async def recv_rep(loop, proc, tmpdir, page): async def start_convert(rep, dim): cmd = ["convert", "-size", f"{dim.width}x{dim.height}", "-depth", - f"{dim.depth}", f"rgb:{rep.initial}", f"png:{rep.final}"] + f"{dim.depth}", f"rgb:{rep.initial}", f"png:{rep.final}"] try: proc = await asyncio.create_subprocess_exec(*cmd) @@ -363,9 +361,9 @@ async def sanitize(loop, proc, path): print(f"\nConverted PDF saved as: {pdf}") -# TODO: KeyboardInterrupt async def run(loop, paths): - cmd = ["/usr/bin/qrexec-client-vm", "@dispvm", "qubes.PdfConvert"] + # cmd = ["/usr/bin/qrexec-client-vm", "@dispvm", "qubes.PdfConvert"] + cmd = ["/usr/bin/qrexec-client-vm", "disp4978", "qubes.PdfConvert"] procs = [] send_tasks = [] sanitize_tasks = [] @@ -387,7 +385,8 @@ async def run(loop, paths): sanitize_task, wait_proc(proc)) except (BrokenPipeError, DimensionError, PageError, ReceiveError, - RepresentationError, subprocess.CalledProcessError): + RepresentationError, 
subprocess.CalledProcessError) as e: + print(type(e).__name__) await asyncio.gather(cancel_task(send_task), cancel_task(sanitize_task)) await terminate_proc(proc) diff --git a/pdf-converter/server.py b/qubespdfconverter/server.py similarity index 97% rename from pdf-converter/server.py rename to qubespdfconverter/server.py index 11c8acd..e84a4dc 100755 --- a/pdf-converter/server.py +++ b/qubespdfconverter/server.py @@ -28,6 +28,7 @@ from tempfile import TemporaryDirectory DEPTH = 8 +STDIN_READ_SIZE = 65536 Representation = namedtuple("Representation", ["initial", "final"]) @@ -87,10 +88,10 @@ async def terminate_proc(proc): ############################### -def recv_b(size): +def recv_b(): """Qrexec wrapper for receiving binary data from the client""" try: - untrusted_data = sys.stdin.buffer.read(size) + untrusted_data = sys.stdin.buffer.read() except EOFError as e: raise ReceiveError from e @@ -210,9 +211,8 @@ async def render(loop, page, pdfpath, rep): def recv_pdf(): try: - filesize = int(recvline()) - data = recv_b(filesize) - except (ReceiveError, ValueError): + data = recv_b() + except ReceiveError: raise return data @@ -270,7 +270,7 @@ def main(): try: data = recv_pdf() - except (ReceiveError, ValueError): + except ReceiveError: sys.exit(1) with TemporaryDirectory(prefix="qvm-sanitize-") as tmpdir: diff --git a/pdf-converter/tests/__init__.py b/qubespdfconverter/tests/__init__.py similarity index 100% rename from pdf-converter/tests/__init__.py rename to qubespdfconverter/tests/__init__.py diff --git a/pdf-converter/tests/tests.py b/qubespdfconverter/tests/tests.py similarity index 100% rename from pdf-converter/tests/tests.py rename to qubespdfconverter/tests/tests.py From 89055506282b58de2f54b2e9a6ffff579a55aa41 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Wed, 22 Apr 2020 22:00:59 -0500 Subject: [PATCH 52/92] client: Start batch processing code --- qubespdfconverter/client.py | 173 ++++++++++++++++++++---------------- 1 file changed, 98 insertions(+), 
75 deletions(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index e1230a2..3d3ad47 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -20,6 +20,7 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. import asyncio +import functools import logging import os import subprocess @@ -55,11 +56,6 @@ class ReceiveError(Exception): """Raise if an error occurs when reading from STDOUT""" -class RepresentationError(ValueError): - """ - """ - - ############################### # Utilities ############################### @@ -245,38 +241,22 @@ def get_rep(tmpdir, page, initial, final): async def recv_rep(loop, proc, tmpdir, page): """Receive initial representation from the server - @size bytes is guaranteed to have ben received by the time we write @data to - @rep.initial, so a check on how much is written to isn't needed. - - Also, since the server sends the dimensions and contents of each page in a - simple loop, if the server only sends @size - N bytes for a particular - page, either: - - 1. We'll eventually get @size bytes later on as recv_b() will mistake the - other pages' dimensions and contents as part of the current page's - contents and we end up with a malformed irep, which we'll handle later - during representation's conversion. - - 2. The server exits (the loop is the last thing it does) and we get an - EOF, causing a ReceiveError. 
- - :param proc: Qrexec process to read STDIN from - :param path: File path which will store the initial representation + :param loop: Event loop """ - rep = get_rep(tmpdir, page, "rgb", "png") - try: dim = await get_img_dim(proc) data = await recv_b(proc, dim.size) - except (DimensionError, ReceiveError, RepresentationError): + except (DimensionError, ReceiveError): raise + rep = get_rep(tmpdir, page, "rgb", "png") await loop.run_in_executor(None, rep.initial.write_bytes, data) return rep, dim -async def start_convert(rep, dim): +async def convert_rep(loop, rep, dim): + """Convert initial representation into final representation""" cmd = ["convert", "-size", f"{dim.width}x{dim.height}", "-depth", f"{dim.depth}", f"rgb:{rep.initial}", f"png:{rep.final}"] @@ -284,86 +264,132 @@ async def start_convert(rep, dim): proc = await asyncio.create_subprocess_exec(*cmd) await wait_proc(proc) except asyncio.CancelledError: - terminate_proc(proc) + await terminate_proc(proc) + except subprocess.CalledProcessError: + logging.error(f"Conversion failed for page {rep.final.with_suffix('')}") + raise - return proc + await loop.run_in_executor(None, unlink, rep.initial) -# TODO (?): Save->delete PNGs in loop to avoid storing all PNGs in memory. 
-# TODO: Add error handling -def combine_reps(save_path, reps): - with Image.open(reps[0].final) as first: - remaining = [Image.open(rep.final) for rep in reps[1:]] - first.save(save_path, "PDF", resolution=100, append_images=remaining, - save_all=True) +async def combine_reps(loop, save_path, freps): + images = [] - for img in remaining: - img.close() + try: + tasks = [loop.run_in_executor(None, Image.open, frep) for frep in freps] + images = await asyncio.gather(*tasks) + except IOError: + logging.error("Cannot identify image") + await asyncio.gather(*[loop.run_in_executor(None, img.close) + for img in images]) + raise + + try: + await loop.run_in_executor(None, functools.partial( + images[0].save, + save_path, + "PDF", + resolution=100, + append=save_path.exists(), + append_images=images[1:], + save_all=True)) + except IOError: + logging.error(f"Could not write to {save_path}") + await loop.run_in_executor(None, unlink, save_path) + raise + finally: + await asyncio.gather(*[loop.run_in_executor(None, img.close) + for img in images]) ############################### # Main ############################### -async def receive(loop, proc, tmpdir): - procs = [] - reps = [] - - try: - pagenums = await recv_pagenums(loop, proc) - except (PageError, ReceiveError): - raise +async def receive(loop, proc, pagenums, tmpdir, rep_q): for page in range(1, pagenums + 1): try: rep, dim = await recv_rep(loop, proc, tmpdir, page) - except (DimensionError, ReceiveError, RepresentationError): - term_tasks = [terminate_proc(p) for p in procs] - await asyncio.gather(*term_tasks) + except (DimensionError, ReceiveError): raise - reps.append(rep) - procs.append(await start_convert(rep, dim)) + await rep_q.put((rep, dim)) - return procs, reps +async def convert_batch(loop, convert_q, save_path): + convert_tasks = [] + freps = [] + + try: + while not convert_q.empty(): + convert_task, frep = await convert_q.get() + convert_tasks.append(convert_task) + freps.append(frep) + 
convert_q.task_done() -async def convert(loop, path, procs, reps): - for proc, rep in zip(procs, reps): try: - await wait_proc(proc) + await asyncio.gather(*convert_tasks) + await combine_reps(loop, save_path, freps) + except IOError: + raise except subprocess.CalledProcessError: - logging.error("page conversion failed") + await asyncio.gather(*[cancel_task(task) for task in convert_tasks]) raise - await loop.run_in_executor(None, unlink, rep.initial) + await asyncio.gather(*[loop.run_in_executor(None, unlink, frep) + for frep in freps]) + except asyncio.CancelledError: + await asyncio.gather(*[cancel_task(task) for task in convert_tasks]) + +async def convert(loop, path, pagenums, rep_q, convert_q): save_path = path.with_suffix(".trusted.pdf") - await loop.run_in_executor(None, combine_reps, save_path, reps) + + for page in range(1, pagenums + 1): + rep, dim = await rep_q.get() + convert_task = asyncio.create_task(convert_rep(loop, rep, dim)) + await convert_q.put((convert_task, rep.final)) + rep_q.task_done() + + if convert_q.full() or page == pagenums: + try: + await convert_batch(loop, convert_q, save_path) + except (IOError, subprocess.CalledProcessError): + raise return save_path async def sanitize(loop, proc, path): - with TemporaryDirectory(prefix=f"qvm-sanitize-") as tmpdir: - try: - convert_procs, reps = await receive(loop, proc, tmpdir) - except (DimensionError, PageError, ReceiveError, RepresentationError): - raise + rep_q = asyncio.Queue(50) + convert_q = asyncio.Queue(50) + + try: + pagenums = await recv_pagenums(loop, proc) + except (PageError, ReceiveError): + raise + + with TemporaryDirectory(prefix="qvm-sanitize-") as tmpdir: + receive_task = asyncio.create_task(receive(loop, proc, pagenums, tmpdir, + rep_q)) + convert_task = asyncio.create_task(convert(loop, path, pagenums, rep_q, + convert_q)) try: - pdf = await convert(loop, path, convert_procs, reps) - except (asyncio.CancelledError, subprocess.CalledProcessError): - for proc in convert_procs: 
- terminate_proc(proc) + _, save_path = await asyncio.gather(receive_task, convert_task) + except (DimensionError, ReceiveError): + cancel_task(convert_task) + raise + except (IOError, subprocess.CalledProcessError): + cancel_task(receive_task) raise - print(f"\nConverted PDF saved as: {pdf}") + print(f"\nConverted PDF saved as: {save_path}") async def run(loop, paths): - # cmd = ["/usr/bin/qrexec-client-vm", "@dispvm", "qubes.PdfConvert"] - cmd = ["/usr/bin/qrexec-client-vm", "disp4978", "qubes.PdfConvert"] + cmd = ["/usr/bin/qrexec-client-vm", "@dispvm", "qubes.PdfConvert"] procs = [] send_tasks = [] sanitize_tasks = [] @@ -381,12 +407,9 @@ async def run(loop, paths): for proc, path, send_task, sanitize_task in zip(procs, paths, send_tasks, sanitize_tasks): try: - await asyncio.gather(send_task, - sanitize_task, - wait_proc(proc)) + await asyncio.gather(send_task, sanitize_task, wait_proc(proc)) except (BrokenPipeError, DimensionError, PageError, ReceiveError, - RepresentationError, subprocess.CalledProcessError) as e: - print(type(e).__name__) + subprocess.CalledProcessError) as e: await asyncio.gather(cancel_task(send_task), cancel_task(sanitize_task)) await terminate_proc(proc) @@ -408,7 +431,7 @@ def main(): loop = asyncio.get_event_loop() loop.run_until_complete(run(loop, paths)) except KeyboardInterrupt: - logging.error("Original file untouched.") + logging.error("Original files untouched.") finally: loop.run_until_complete(loop.shutdown_asyncgens()) From 8c0054ceb366b45d604f95170543a984c3eb07af Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 25 Apr 2020 18:32:53 -0500 Subject: [PATCH 53/92] client: Add CLI parameter handling --- qubespdfconverter/client.py | 119 +++++++++++++++++++++++++----------- setup.py | 4 ++ 2 files changed, 89 insertions(+), 34 deletions(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index 3d3ad47..ace24bf 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -20,28 +20,37 @@ # 
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. import asyncio +import click +from contextlib import contextmanager import functools import logging import os import subprocess import sys +from click._compat import get_text_stderr from collections import namedtuple from pathlib import Path from PIL import Image from tempfile import TemporaryDirectory -PROG_NAME = Path(sys.argv[0]).name -ARCHIVE_PATH = Path(Path.home(), "QubesUntrustedPDFs") +PROG = Path(sys.argv[0]).name MAX_PAGES = 10000 MAX_IMG_WIDTH = 10000 MAX_IMG_HEIGHT = 10000 DEPTH = 8 +ARCHIVE_PATH = Path(Path.home(), "QubesUntrustedPDFs") + Dimensions = namedtuple("Dimensions", ["width", "height", "size", "depth"]) Representation = namedtuple("Representations", ["initial", "final"]) +############################### +# Exceptions +############################### + + class DimensionError(ValueError): """ """ @@ -56,27 +65,66 @@ class ReceiveError(Exception): """Raise if an error occurs when reading from STDOUT""" -############################### -# Utilities -############################### +class BadPath(click.BadParameter): + """Raised if a Path object parsed by Click is invalid.""" + def __init__(self, path, message): + super().__init__(message, param_hint=f'"{path}"') -def check_paths(paths): - abs_paths = [] +def modify_click_errors(func): + """Decorator for replacing Click behavior on errors""" + def show(self, file=None): + """Removes usage message in UsageError messages""" + color = None - for path in [Path(path) for path in paths]: - abspath = Path.resolve(path) + if file is None: + file = get_text_stderr() - if not abspath.exists(): - logging.error(f"No such file: \"{path}\"") - sys.exit(1) - elif not abspath.is_file(): - logging.error(f"Not a regular file: \"{path}\"") - sys.exit(1) + if self.ctx is not None: + color = self.ctx.color + + click.echo(f"{self.format_message()}", file=file, color=color) + + def format_message(self): + """Removes 'Invalid value' prefix in 
BadParameter messages""" + if self.param_hint is not None: + prefix = self.param_hint + elif self.param is not None: + prefix = self.param.get_error_hint(self.ctx) + else: + return self.message + prefix = click.exceptions._join_param_hints(prefix) - abs_paths.append(abspath) + return f"{prefix}: {self.message}" - return abs_paths + click.exceptions.BadParameter.format_message = format_message + click.exceptions.UsageError.show = show + + return func + + +def validate_paths(ctx, param, untrusted_paths): + """Callback for validating file paths parsed by Click""" + for untrusted_p in untrusted_paths: + if not untrusted_p.resolve().exists(): + raise BadPath(untrusted_p, "No such file or directory") + elif not untrusted_p.resolve().is_file(): + raise BadPath(untrusted_p, "Not a regular file") + + try: + with untrusted_p.resolve().open("rb") as f: + pass + except PermissionError: + raise BadPath(untrusted_p, "Not readable") + else: + paths = untrusted_paths + + return paths + + +############################### +# Utilities +############################### def check_range(val, upper): @@ -361,9 +409,9 @@ async def convert(loop, path, pagenums, rep_q, convert_q): return save_path -async def sanitize(loop, proc, path): - rep_q = asyncio.Queue(50) - convert_q = asyncio.Queue(50) +async def sanitize(loop, proc, path, batch_size): + rep_q = asyncio.Queue(batch_size) + convert_q = asyncio.Queue(batch_size) try: pagenums = await recv_pagenums(loop, proc) @@ -388,24 +436,24 @@ async def sanitize(loop, proc, path): print(f"\nConverted PDF saved as: {save_path}") -async def run(loop, paths): +async def run(loop, params): cmd = ["/usr/bin/qrexec-client-vm", "@dispvm", "qubes.PdfConvert"] procs = [] send_tasks = [] sanitize_tasks = [] - print("Sending files to Disposable VMs...") + click.echo("Sending files to Disposable VMs...") - for path in paths: - proc = await asyncio.create_subprocess_exec(*cmd, - stdin=subprocess.PIPE, + for path in params["files"]: + proc = await 
asyncio.create_subprocess_exec(*cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) procs.append(proc) send_tasks.append(asyncio.create_task(send_pdf(loop, proc, path))) - sanitize_tasks.append(asyncio.create_task(sanitize(loop, proc, path))) + sanitize_tasks.append(asyncio.create_task(sanitize(loop, proc, path, + params["batch"]))) - for proc, path, send_task, sanitize_task in zip(procs, paths, send_tasks, - sanitize_tasks): + for proc, path, send_task, sanitize_task in zip(procs, params["files"], + send_tasks, sanitize_tasks): try: await asyncio.gather(send_task, sanitize_task, wait_proc(proc)) except (BrokenPipeError, DimensionError, PageError, ReceiveError, @@ -417,19 +465,22 @@ async def run(loop, paths): await loop.run_in_executor(None, archive, path) -def main(): +@click.command() +@click.option("-b", "--batch", type=click.IntRange(0), default=50) +@click.argument("files", type=Path, nargs=-1, callback=validate_paths) +@modify_click_errors +def main(**params): logging.basicConfig(level=logging.INFO, format="%(message)s") - if len(sys.argv) == 1: - print(f"usage: {PROG_NAME} [FILE ...]", file=sys.stderr) - sys.exit(1) + if not params["files"]: + logging.info("No files to sanitize.") + sys.exit(0) - paths = check_paths(sys.argv[1:]) Path.mkdir(ARCHIVE_PATH, exist_ok=True) try: loop = asyncio.get_event_loop() - loop.run_until_complete(run(loop, paths)) + loop.run_until_complete(run(loop, params)) except KeyboardInterrupt: logging.error("Original files untouched.") finally: diff --git a/setup.py b/setup.py index f2766c0..8869c0b 100644 --- a/setup.py +++ b/setup.py @@ -28,6 +28,10 @@ name='qubespdfconverter', version=open('version').read().strip(), packages=['qubespdfconverter'], + install_requires=[ + 'Click', + 'Pillow' + ], entry_points={ 'qubes.tests.extra.for_template': 'qubespdfconverter = qubespdfconverter.tests:list_tests', From 89ebda3dc75410d661e40e47cb36b0875780186b Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 25 Apr 2020 18:35:36 -0500 
Subject: [PATCH 54/92] client: Update error handling To start, cancel_task() has been updated to properly cancel running tasks only. Before, with task.done(), it attempted to cancel done or error'd tasks as well. Next, wait_proc() is properly cancelled in the event of an error and it now terminates the process being waited on if, when running as a task, it gets cancelled. Also in wait_proc(), a new parameter @cmd is added in order to properly raise subprocess.CalledProcessError. Curse asyncio for no proc.cmd attribute! Whenever asyncio.CancelledError is caught, it's now reraised. No idea why I wasn't doing it before. Conversion tasks are now all properly cancelled (and their associated subprocesses, if any, terminated) if either a conversion or receive throws an error. All cancel_task() calls are now properly awaited... Sorry. --- qubespdfconverter/client.py | 128 ++++++++++++++++++++++-------------- qubespdfconverter/server.py | 22 ++++--- 2 files changed, 89 insertions(+), 61 deletions(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index ace24bf..69196e5 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -141,15 +141,16 @@ def unlink(path): async def cancel_task(task): - if not task.done(): - task.cancel() - await task - + """ -async def wait_proc(proc): - await proc.wait() - if proc.returncode: - raise subprocess.CalledProcessError + We might be cancelling finished tasks or tasks that exited with an error. + In those cases, we don't care about the Exceptions so we can ignore them. 
+ """ + task.cancel() + try: + await task + except: + pass async def terminate_proc(proc): @@ -158,6 +159,15 @@ async def terminate_proc(proc): await proc.wait() +async def wait_proc(proc, cmd): + try: + await proc.wait() + if proc.returncode: + raise subprocess.CalledProcessError(proc.returncode, cmd) + except asyncio.CancelledError: + await terminate_proc(proc) + + ############################### # Qrexec-related ############################### @@ -168,7 +178,6 @@ async def recv_b(proc, size): try: untrusted_data = await proc.stdout.readexactly(size) except asyncio.IncompleteReadError as e: - # got EOF before @size bytes received raise ReceiveError from e return untrusted_data @@ -310,11 +319,13 @@ async def convert_rep(loop, rep, dim): try: proc = await asyncio.create_subprocess_exec(*cmd) - await wait_proc(proc) + await wait_proc(proc, cmd) except asyncio.CancelledError: await terminate_proc(proc) + raise except subprocess.CalledProcessError: - logging.error(f"Conversion failed for page {rep.final.with_suffix('')}") + logging.error("Conversion failed for page %s" + % rep.final.with_suffix("").name) raise await loop.run_in_executor(None, unlink, rep.initial) @@ -369,42 +380,49 @@ async def convert_batch(loop, convert_q, save_path): convert_tasks = [] freps = [] + while not convert_q.empty(): + convert_task, frep = await convert_q.get() + convert_tasks.append(convert_task) + freps.append(frep) + convert_q.task_done() + try: - while not convert_q.empty(): - convert_task, frep = await convert_q.get() - convert_tasks.append(convert_task) - freps.append(frep) - convert_q.task_done() + await asyncio.gather(*convert_tasks) + except subprocess.CalledProcessError: + for task in convert_tasks: + await cancel_task(task) + raise - try: - await asyncio.gather(*convert_tasks) - await combine_reps(loop, save_path, freps) - except IOError: - raise - except subprocess.CalledProcessError: - await asyncio.gather(*[cancel_task(task) for task in convert_tasks]) - raise + try: + 
await combine_reps(loop, save_path, freps) + except IOError: + raise - await asyncio.gather(*[loop.run_in_executor(None, unlink, frep) - for frep in freps]) - except asyncio.CancelledError: - await asyncio.gather(*[cancel_task(task) for task in convert_tasks]) + await asyncio.gather(*[loop.run_in_executor(None, unlink, frep) + for frep in freps]) async def convert(loop, path, pagenums, rep_q, convert_q): save_path = path.with_suffix(".trusted.pdf") - for page in range(1, pagenums + 1): - rep, dim = await rep_q.get() - convert_task = asyncio.create_task(convert_rep(loop, rep, dim)) - await convert_q.put((convert_task, rep.final)) - rep_q.task_done() - - if convert_q.full() or page == pagenums: - try: - await convert_batch(loop, convert_q, save_path) - except (IOError, subprocess.CalledProcessError): - raise + try: + for page in range(1, pagenums + 1): + rep, dim = await rep_q.get() + convert_task = asyncio.create_task(convert_rep(loop, rep, dim)) + await convert_q.put((convert_task, rep.final)) + rep_q.task_done() + + if convert_q.full() or page == pagenums: + try: + await convert_batch(loop, convert_q, save_path) + except (IOError, subprocess.CalledProcessError): + raise + except asyncio.CancelledError: + while not convert_q.empty(): + task, _ = await convert_q.get() + await cancel_task(task) + convert_q.task_done() + raise return save_path @@ -427,10 +445,10 @@ async def sanitize(loop, proc, path, batch_size): try: _, save_path = await asyncio.gather(receive_task, convert_task) except (DimensionError, ReceiveError): - cancel_task(convert_task) + await cancel_task(convert_task) raise except (IOError, subprocess.CalledProcessError): - cancel_task(receive_task) + await cancel_task(receive_task) raise print(f"\nConverted PDF saved as: {save_path}") @@ -438,29 +456,37 @@ async def sanitize(loop, proc, path, batch_size): async def run(loop, params): cmd = ["/usr/bin/qrexec-client-vm", "@dispvm", "qubes.PdfConvert"] - procs = [] + proc_tasks = [] send_tasks = [] 
sanitize_tasks = [] click.echo("Sending files to Disposable VMs...") for path in params["files"]: - proc = await asyncio.create_subprocess_exec(*cmd, stdin=subprocess.PIPE, + proc = await asyncio.create_subprocess_exec(*cmd, + stdin=subprocess.PIPE, stdout=subprocess.PIPE) - procs.append(proc) + proc_tasks.append(asyncio.create_task(wait_proc(proc, cmd))) send_tasks.append(asyncio.create_task(send_pdf(loop, proc, path))) - sanitize_tasks.append(asyncio.create_task(sanitize(loop, proc, path, + sanitize_tasks.append(asyncio.create_task(sanitize(loop, + proc, + path, params["batch"]))) - for proc, path, send_task, sanitize_task in zip(procs, params["files"], - send_tasks, sanitize_tasks): + for path, proc_task, send_task, sanitize_task in zip(params["files"], + proc_tasks, + send_tasks, + sanitize_tasks): try: - await asyncio.gather(send_task, sanitize_task, wait_proc(proc)) - except (BrokenPipeError, DimensionError, PageError, ReceiveError, + await asyncio.gather(proc_task, send_task, sanitize_task) + except (BrokenPipeError, + DimensionError, + PageError, + ReceiveError, subprocess.CalledProcessError) as e: await asyncio.gather(cancel_task(send_task), - cancel_task(sanitize_task)) - await terminate_proc(proc) + cancel_task(sanitize_task), + cancel_task(proc_task)) else: await loop.run_in_executor(None, archive, path) diff --git a/qubespdfconverter/server.py b/qubespdfconverter/server.py index e84a4dc..26f362f 100755 --- a/qubespdfconverter/server.py +++ b/qubespdfconverter/server.py @@ -66,15 +66,17 @@ def unlink(path): async def cancel_task(task): - if not task.done(): - task.cancel() + task.cancel() + try: await task + except: + pass -async def wait_proc(proc): +async def wait_proc(proc, cmd): await proc.wait() if proc.returncode: - raise subprocess.CalledProcessError + raise subprocess.CalledProcessError(proc, returncode, cmd) async def terminate_proc(proc): @@ -148,9 +150,9 @@ async def get_irep(pdfpath, irep, page): try: proc = await 
asyncio.create_subprocess_exec(*cmd) - await wait_proc(proc) + await wait_proc(proc, cmd) except asyncio.CancelledError: - terminate_proc(proc) + await terminate_proc(proc) raise except subprocess.CalledProcessError: raise @@ -163,7 +165,7 @@ async def get_img_dim(irep): proc = await asyncio.create_subprocess_exec(*cmd, stdout=subprocess.PIPE) output, _ = await proc.communicate() except asyncio.CancelledError: - terminate_proc(proc) + await terminate_proc(proc) raise except subprocess.CalledProcessError: raise @@ -175,10 +177,10 @@ async def convert_rep(irep, frep): cmd = ["convert", f"{irep}", "-depth", f"{DEPTH}", f"rgb:{frep}"] try: - proc = await asyncio.create_subprocess_exec(*cmd) - await wait_proc(proc) + proc = await asyncio.create_subprocess_exec(*cmd) + await wait_proc(proc, cmd) except asyncio.CancelledError: - terminate_proc(proc) + await terminate_proc(proc) raise except subprocess.CalledProcessError: raise From 06bafaff6bf11dab55b6fe4df04d07de1ba6976a Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 25 Apr 2020 23:54:32 -0500 Subject: [PATCH 55/92] server: Remove recv_pdf() --- qubespdfconverter/server.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/qubespdfconverter/server.py b/qubespdfconverter/server.py index 26f362f..22c2ba3 100755 --- a/qubespdfconverter/server.py +++ b/qubespdfconverter/server.py @@ -211,14 +211,6 @@ async def render(loop, page, pdfpath, rep): ############################### -def recv_pdf(): - try: - data = recv_b() - except ReceiveError: - raise - - return data - def get_pagenums(pdfpath): cmd = ["pdfinfo", f"{pdfpath}"] @@ -271,13 +263,13 @@ def main(): logging.basicConfig(format="DispVM: %(message)s") try: - data = recv_pdf() + pdf_data = recv_b() except ReceiveError: sys.exit(1) with TemporaryDirectory(prefix="qvm-sanitize-") as tmpdir: pdfpath = Path(tmpdir, "original") - pdfpath.write_bytes(data) + pdfpath.write_bytes(pdf_data) try: pagenums = get_pagenums(pdfpath) From 
1fa078b09453bc600ccd7e6022fd7bd827504f6d Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 25 Apr 2020 23:56:13 -0500 Subject: [PATCH 56/92] server: Update error handling --- qubespdfconverter/server.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/qubespdfconverter/server.py b/qubespdfconverter/server.py index 22c2ba3..e356159 100755 --- a/qubespdfconverter/server.py +++ b/qubespdfconverter/server.py @@ -188,22 +188,23 @@ async def convert_rep(irep, frep): async def render(loop, page, pdfpath, rep): try: - irep_task = asyncio.create_task(get_irep(pdfpath, rep.initial, page)) - await irep_task + try: + irep_task = asyncio.create_task(get_irep(pdfpath, rep.initial, page)) + await irep_task - dim_task = asyncio.create_task(get_img_dim(rep.initial)) - convert_task = asyncio.create_task(convert_rep(rep.initial, rep.final)) - dim, _ = await asyncio.gather(dim_task, convert_task) - except subprocess.CalledProcessError: - raise - except asyncio.CancelledError: - cancel_task(irep_task) - cancel_task(dim_task) - cancel_task(convert_task) - finally: - await loop.run_in_executor(None, unlink, rep.initial) + dim_task = asyncio.create_task(get_img_dim(rep.initial)) + convert_task = asyncio.create_task(convert_rep(rep.initial, rep.final)) + dim, _ = await asyncio.gather(dim_task, convert_task) + except subprocess.CalledProcessError: + raise + finally: + await loop.run_in_executor(None, unlink, rep.initial) - return (dim, rep.final) + return (dim, rep.final) + except asyncio.CancelledError: + await asyncio.gather(cancel_task(irep_task), cancel_task(dim_task), + cancel_task(convert_task)) + raise ############################### From 7c9a21f81eca75cf0607a711c8df5ae02cbb2f72 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 25 Apr 2020 23:57:30 -0500 Subject: [PATCH 57/92] server: Update sending process Instead of waiting on N renders to start before sending a single page, we take a page from the client's playbook and use 
queues so that we can start our renders without blocking but also send the pages in the correct order. For what it's worth, some early testing shows this method having a consistent runtime of around 9-10.5 seconds + DispVM boot time instead of 11+ seconds. Woo! --- qubespdfconverter/server.py | 57 ++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/qubespdfconverter/server.py b/qubespdfconverter/server.py index e356159..3d67b96 100755 --- a/qubespdfconverter/server.py +++ b/qubespdfconverter/server.py @@ -232,36 +232,55 @@ def get_pagenums(pdfpath): ############################### -async def run(loop, tmpdir, pdfpath, pagenums, max_tasks=0): - results = [] - tasks = [] - limit = max_tasks if max_tasks > 0 else pagenums - - for page in range(1, limit + 1): +async def recv_pages(loop, queue, path, tmpdir, pagenums): + for page in range(1, pagenums + 1): rep = get_rep(tmpdir, page, "png", "rgb") - tasks.append(asyncio.create_task(render(loop, page, pdfpath, rep))) + task = asyncio.create_task(render(loop, page, path, rep)) - for task in tasks: try: - results.append(await task) - except subprocess.CalledProcessError: - for task in tasks: - task.cancel() + await queue.put(task) + except asyncio.CancelledError: + await cancel_task(task) + queue.task_done() + raise + - try: - await task - except asyncio.CancelledError: - pass +async def send_pages(loop, queue, pagenums): + for page in range(1, pagenums + 1): + task = await queue.get() + + try: + dim, frep = await task + except subprocess.CalledProcessError: raise + else: + queue.task_done() - for dim, frep in results: send(dim) send_b(await loop.run_in_executor(None, frep.read_bytes)) await loop.run_in_executor(None, unlink, frep) +async def run(loop, path, tmpdir, pagenums): + queue = asyncio.Queue(pagenums) + recv_task = asyncio.create_task(recv_pages(loop, queue, path, tmpdir, pagenums)) + send_task = asyncio.create_task(send_pages(loop, queue, pagenums)) + + try: + 
await asyncio.gather(recv_task, send_task) + except subprocess.CalledProcessError: + await cancel_task(recv_task) + + while not queue.empty(): + task = await queue.get() + await cancel_task(task) + queue.task_done() + + raise + + def main(): - logging.basicConfig(format="DispVM: %(message)s") + logging.basicConfig(level=logging.INFO, format="DispVM: %(message)s") try: pdf_data = recv_b() @@ -280,7 +299,7 @@ def main(): try: loop = asyncio.get_event_loop() - loop.run_until_complete(run(loop, tmpdir, pdfpath, pagenums, 0)) + loop.run_until_complete(run(loop, pdfpath, tmpdir, pagenums)) except subprocess.CalledProcessError: sys.exit(1) finally: From dc4742babc8698cab1b228b4838cd7e7f9f9b1c7 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sun, 26 Apr 2020 13:31:38 -0500 Subject: [PATCH 58/92] client: Replace click.echo() --- qubespdfconverter/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index 69196e5..f74ce17 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -460,7 +460,7 @@ async def run(loop, params): send_tasks = [] sanitize_tasks = [] - click.echo("Sending files to Disposable VMs...") + print("Sending files to Disposable VMs...") for path in params["files"]: proc = await asyncio.create_subprocess_exec(*cmd, From 4e5e87ee1dcf1677359afa652cc3ec62a8ac9050 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Mon, 27 Apr 2020 10:12:41 -0500 Subject: [PATCH 59/92] client, server: Add Representation & *File objects Documentation has also been updated. 
--- qubespdfconverter/client.py | 635 +++++++++++++++++++----------------- qubespdfconverter/server.py | 366 ++++++++++++--------- 2 files changed, 537 insertions(+), 464 deletions(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index f74ce17..75600b9 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -34,6 +34,7 @@ from tempfile import TemporaryDirectory PROG = Path(sys.argv[0]).name +CLIENT_VM_CMD = ["/usr/bin/qrexec-client-vm", "@dispvm", "qubes.PdfConvert"] MAX_PAGES = 10000 MAX_IMG_WIDTH = 10000 @@ -43,26 +44,18 @@ ARCHIVE_PATH = Path(Path.home(), "QubesUntrustedPDFs") Dimensions = namedtuple("Dimensions", ["width", "height", "size", "depth"]) -Representation = namedtuple("Representations", ["initial", "final"]) - - -############################### -# Exceptions -############################### class DimensionError(ValueError): - """ - """ + """Raised if invalid image dimensions were received""" class PageError(ValueError): - """ - """ + """Raised if an invalid number of pages was received""" class ReceiveError(Exception): - """Raise if an error occurs when reading from STDOUT""" + """Raised if a STDOUT read failed in a qrexec-client-vm subprocess""" class BadPath(click.BadParameter): @@ -71,69 +64,11 @@ def __init__(self, path, message): super().__init__(message, param_hint=f'"{path}"') -def modify_click_errors(func): - """Decorator for replacing Click behavior on errors""" - def show(self, file=None): - """Removes usage message in UsageError messages""" - color = None - - if file is None: - file = get_text_stderr() - - if self.ctx is not None: - color = self.ctx.color - - click.echo(f"{self.format_message()}", file=file, color=color) - - def format_message(self): - """Removes 'Invalid value' prefix in BadParameter messages""" - if self.param_hint is not None: - prefix = self.param_hint - elif self.param is not None: - prefix = self.param.get_error_hint(self.ctx) - else: - return self.message - prefix = 
click.exceptions._join_param_hints(prefix) - - return f"{prefix}: {self.message}" - - click.exceptions.BadParameter.format_message = format_message - click.exceptions.UsageError.show = show - - return func - - -def validate_paths(ctx, param, untrusted_paths): - """Callback for validating file paths parsed by Click""" - for untrusted_p in untrusted_paths: - if not untrusted_p.resolve().exists(): - raise BadPath(untrusted_p, "No such file or directory") - elif not untrusted_p.resolve().is_file(): - raise BadPath(untrusted_p, "Not a regular file") - - try: - with untrusted_p.resolve().open("rb") as f: - pass - except PermissionError: - raise BadPath(untrusted_p, "Not readable") - else: - paths = untrusted_paths - - return paths - - -############################### -# Utilities -############################### - - -def check_range(val, upper): - if not 1 <= val <= upper: - raise ValueError - - def unlink(path): - """Wrapper for Path.unlink(path, missing_ok=True)""" + """Wrapper for pathlib.Path.unlink(path, missing_ok=True) + + :param Path: File path to delete + """ try: path.unlink() except FileNotFoundError: @@ -141,10 +76,12 @@ def unlink(path): async def cancel_task(task): - """ + """Convenience wrapper for cancelling an asyncio Task - We might be cancelling finished tasks or tasks that exited with an error. - In those cases, we don't care about the Exceptions so we can ignore them. + Presumably, since we're cancelling tasks, we don't care what they returned + with or raised. 
+ + :param task: Task to cancel """ task.cancel() try: @@ -154,12 +91,21 @@ async def cancel_task(task): async def terminate_proc(proc): + """Convenience wrapper for terminating a process + + :param proc: Process to terminate + """ if proc.returncode is None: proc.terminate() await proc.wait() async def wait_proc(proc, cmd): + """Convenience wrapper for waiting on a process + + :param proc: Process to wait on + :param cmd: Command executed by @proc (Exception handling purposes) + """ try: await proc.wait() if proc.returncode: @@ -168,13 +114,28 @@ async def wait_proc(proc, cmd): await terminate_proc(proc) -############################### -# Qrexec-related -############################### +async def send(proc, data): + """Qrexec wrapper for sending data to the server + + :param proc: qrexec-client-vm process + :param data: Data to send (bytes, String, or int) + """ + if isinstance(data, (int, str)): + data = str(data).encode() + + proc.stdin.write(data) + try: + await proc.stdin.drain() + except BrokenPipeError: + raise async def recv_b(proc, size): - """Qrexec wrapper for receiving binary data from the server""" + """Qrexec wrapper for receiving binary data from the server + + :param proc: qrexec-client-vm process + :param size: Number of bytes to receive + """ try: untrusted_data = await proc.stdout.readexactly(size) except asyncio.IncompleteReadError as e: @@ -184,311 +145,381 @@ async def recv_b(proc, size): async def recvline_b(proc): - """Qrexec wrapper for receiving a line of binary data from the server""" + """Qrexec wrapper for receiving a line of binary data from the server + + :param proc: qrexec-client-vm process + """ untrusted_data = await proc.stdout.readline() if not untrusted_data: - logging.error("server may have died...") + logging.error("Server may have died...") raise ReceiveError return untrusted_data async def recvline(proc): - """Convenience wrapper for receiving a line of text data from the server""" + """Convenience wrapper for receiving a 
line of text data from the server + + :param proc: qrexec-client-vm process + """ try: untrusted_data = (await recvline_b(proc)).decode("ascii").rstrip() except EOFError as e: - logging.error("server may have died...") + logging.error("Server may have died...") raise ReceiveError from e except (AttributeError, UnicodeError): - logging.error("failed to decode received data!") + logging.error("Failed to decode received data!") raise return untrusted_data -async def send(proc, data): - """Qrexec wrapper for sending data to the server""" - if isinstance(data, (str, int)): - data = str(data).encode() +class Representation(object): + """Umbrella object for the initial & final representations of a file - proc.stdin.write(data) - try: - await proc.stdin.drain() - except BrokenPipeError: - raise + The initial representation must be of a format such that if it contains + malicious code/data, such code/data is excluded from the final + representation upon conversion. Generally, this makes the initial + representation a relatively simple format (e.g., RGB bitmap). + The final representation can be of any format you'd like, provided that + the initial representation's format was properly selected and you are + able to combine them later on into a PDF. 
+ :param loop: Main event loop + :param prefix: Path prefixes of the representations + :param initial: Format of the initial representation + :param final: Format of the final representation + """ -############################### -# Image-related -############################### + def __init__(self, loop, prefix, initial, final): + self.loop = loop + self.initial = prefix.with_suffix(f".{initial}") + self.final = prefix.with_suffix(f".{final}") + self.dim = None + + + async def convert(self): + """Convert initial representation into final representation""" + cmd = [ + "convert", + "-size", + f"{self.dim.width}x{self.dim.height}", + "-depth", + f"{self.dim.depth}", + f"rgb:{self.initial}", + f"png:{self.final}" + ] + try: + proc = await asyncio.create_subprocess_exec(*cmd) + await wait_proc(proc, cmd) + except asyncio.CancelledError: + await terminate_proc(proc) + raise + except subprocess.CalledProcessError: + logging.error(f"Page conversion failed") + raise -async def get_img_dim(proc): - try: - untrusted_w, untrusted_h = map(int, (await recvline(proc)).split(" ", 1)) - except (AttributeError, EOFError, UnicodeError, ValueError) as e: - raise ReceiveError from e + await self.loop.run_in_executor(None, unlink, self.initial) - try: - check_range(untrusted_w, MAX_IMG_WIDTH) - check_range(untrusted_h, MAX_IMG_HEIGHT) - except ValueError as e: - logging.error(f"invalid image measurements received {e}") - raise DimensionError from e - else: - width = untrusted_w - height = untrusted_h - size = width * height * 3 + async def receive(self, proc): + """Receive initial representation from the server - return Dimensions(width=width, height=height, size=size, depth=DEPTH) + :param proc: qrexec-client-vm process + """ + try: + self.dim = await self._dim(proc) + data = await recv_b(proc, self.dim.size) + except (DimensionError, ReceiveError): + raise + await self.loop.run_in_executor(None, self.initial.write_bytes, data) -############################### -# PDF-related 
-############################### + async def _dim(self, proc): + """Receive and compute image dimensions for initial representation -async def send_pdf(loop, proc, path): - try: - data = await loop.run_in_executor(None, path.read_bytes) - await send(proc, data) - proc.stdin.write_eof() - except BrokenPipeError: - raise + :param proc: qrexec-client-vm process + """ + try: + untrusted_w, untrusted_h = map(int, (await recvline(proc)).split(" ", 1)) + except (AttributeError, EOFError, UnicodeError, ValueError) as e: + raise ReceiveError from e + + if 1 <= untrusted_w <= MAX_IMG_WIDTH and \ + 1 <= untrusted_h <= MAX_IMG_HEIGHT: + width = untrusted_w + height = untrusted_h + else: + logging.error(f"invalid image measurements received") + raise DimensionError + size = width * height * 3 + return Dimensions(width=width, height=height, size=size, depth=DEPTH) -async def recv_pagenums(loop, proc): - try: - untrusted_pagenums = int(await recvline(proc)) - except (AttributeError, EOFError, UnicodeError, ValueError) as e: - raise ReceiveError from e - try: - check_range(untrusted_pagenums, MAX_PAGES) - except ValueError as e: - logging.error("invalid number of pages received") - raise PageError from e - else: - pagenums = untrusted_pagenums +class UnsanitizedFile(object): + """A file not yet cleansed by holy bathwater - return pagenums + :param loop: Main event loop + :param path: Path to original, unsanitized file + """ + def __init__(self, loop, path): + self.loop = loop + self.proc = None + self.rep_q = None + self.convert_q = None -def archive(path): - archive_path = Path(ARCHIVE_PATH, path.name) - path.rename(archive_path) - print(f"Original PDF saved as: {archive_path}") + self.orig_path = path + self.save_path = path.with_suffix(".trusted.pdf") + self.dir = None + self.pagenums = None -############################### -# Representation-related -############################### + async def sanitize(self, size): + """Start Qubes RPC session and sanitization tasks + :param 
size: Batch size for queues + """ + self.rep_q = asyncio.Queue(size) + self.convert_q = asyncio.Queue(size) + self.proc = await asyncio.create_subprocess_exec(*CLIENT_VM_CMD, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE) + proc_task = asyncio.create_task(wait_proc(self.proc, CLIENT_VM_CMD)) + send_task = asyncio.create_task(self._send()) + sanitize_task = asyncio.create_task(self._sanitize()) -def get_rep(tmpdir, page, initial, final): - name = Path(tmpdir, str(page)) - return Representation(initial=name.with_suffix(f".{initial}"), - final=name.with_suffix(f".{final}")) + try: + await asyncio.gather(proc_task, send_task, sanitize_task) + except (BrokenPipeError, + DimensionError, + PageError, + ReceiveError, + subprocess.CalledProcessError) as e: + await asyncio.gather(cancel_task(send_task), + cancel_task(sanitize_task), + cancel_task(proc_task)) -async def recv_rep(loop, proc, tmpdir, page): - """Receive initial representation from the server + async def _sanitize(self): + """Receive and convert representation files""" + try: + self.pagenums = await self._pagenums() + except (PageError, ReceiveError): + raise - :param loop: Event loop - """ - try: - dim = await get_img_dim(proc) - data = await recv_b(proc, dim.size) - except (DimensionError, ReceiveError): - raise + with TemporaryDirectory(prefix="qvm-sanitize-") as d: + self.dir = d + receive_task = asyncio.create_task(self._receive()) + convert_task = asyncio.create_task(self._convert()) - rep = get_rep(tmpdir, page, "rgb", "png") - await loop.run_in_executor(None, rep.initial.write_bytes, data) + try: + await asyncio.gather(receive_task, convert_task) + except (DimensionError, ReceiveError): + await cancel_task(convert_task) + raise + except (IOError, subprocess.CalledProcessError): + await cancel_task(receive_task) + raise - return rep, dim + await self.loop.run_in_executor(None, self._archive) + print(f"Converted PDF saved as: {self.save_path}") -async def convert_rep(loop, rep, dim): - """Convert 
initial representation into final representation""" - cmd = ["convert", "-size", f"{dim.width}x{dim.height}", "-depth", - f"{dim.depth}", f"rgb:{rep.initial}", f"png:{rep.final}"] + async def _receive(self): + """Receive initial representations""" + for page in range(1, self.pagenums + 1): + try: + rep = Representation(self.loop, Path(self.dir, str(page)), + "rgb", "png") + await rep.receive(self.proc) + except (DimensionError, ReceiveError): + raise - try: - proc = await asyncio.create_subprocess_exec(*cmd) - await wait_proc(proc, cmd) - except asyncio.CancelledError: - await terminate_proc(proc) - raise - except subprocess.CalledProcessError: - logging.error("Conversion failed for page %s" - % rep.final.with_suffix("").name) - raise + await self.rep_q.put(rep) - await loop.run_in_executor(None, unlink, rep.initial) + async def _convert(self): + """Convert initial representations to final representations""" + try: + for page in range(1, self.pagenums + 1): + rep = await self.rep_q.get() + convert_task = asyncio.create_task(rep.convert()) + await self.convert_q.put((convert_task, rep.final)) + self.rep_q.task_done() + + if self.convert_q.full() or page == self.pagenums: + try: + await self._complete_batch() + except (IOError, subprocess.CalledProcessError): + raise + except asyncio.CancelledError: + while not self.convert_q.empty(): + convert_task, _ = await self.convert_q.get() + await cancel_task(convert_task) + self.convert_q.task_done() + raise -async def combine_reps(loop, save_path, freps): - images = [] - try: - tasks = [loop.run_in_executor(None, Image.open, frep) for frep in freps] - images = await asyncio.gather(*tasks) - except IOError: - logging.error("Cannot identify image") - await asyncio.gather(*[loop.run_in_executor(None, img.close) - for img in images]) - raise + async def _complete_batch(self): + """Wait on current batch of final representations to be combined""" + convert_tasks = [] + freps = [] - try: - await loop.run_in_executor(None, 
functools.partial( - images[0].save, - save_path, - "PDF", - resolution=100, - append=save_path.exists(), - append_images=images[1:], - save_all=True)) - except IOError: - logging.error(f"Could not write to {save_path}") - await loop.run_in_executor(None, unlink, save_path) - raise - finally: - await asyncio.gather(*[loop.run_in_executor(None, img.close) - for img in images]) + while not self.convert_q.empty(): + convert_task, frep = await self.convert_q.get() + convert_tasks.append(convert_task) + freps.append(frep) + self.convert_q.task_done() + try: + await asyncio.gather(*convert_tasks) + except subprocess.CalledProcessError: + for convert_task in convert_tasks: + await cancel_task(convert_task) + raise -############################### -# Main -############################### + try: + await self._combine_reps(freps) + except IOError: + raise + await asyncio.gather(*[self.loop.run_in_executor(None, unlink, frep) + for frep in freps]) -async def receive(loop, proc, pagenums, tmpdir, rep_q): - for page in range(1, pagenums + 1): + + async def _combine_reps(self, freps): + """Combine final representations into a sanitized PDF file + + :param freps: List of final representations + """ try: - rep, dim = await recv_rep(loop, proc, tmpdir, page) - except (DimensionError, ReceiveError): + img_tasks = [self.loop.run_in_executor(None, Image.open, frep) + for frep in freps] + images = await asyncio.gather(*img_tasks) + except IOError: + logging.error("Cannot identify image") + await asyncio.gather(*[self.loop.run_in_executor(None, img.close) + for img in images]) raise - await rep_q.put((rep, dim)) + try: + await self.loop.run_in_executor(None, + functools.partial( + images[0].save, + self.save_path, + "PDF", + resolution=100, + append=self.save_path.exists(), + append_images=images[1:], + save_all=True)) + except IOError: + logging.error(f"Could not write to {self.save_path}") + await self.loop.run_in_executor(None, unlink, self.save_path) + raise + finally: + await 
asyncio.gather(*[self.loop.run_in_executor(None, img.close) + for img in images]) -async def convert_batch(loop, convert_q, save_path): - convert_tasks = [] - freps = [] + async def _send(self): + """Send original document to server""" + try: + data = await self.loop.run_in_executor(None, + self.orig_path.read_bytes) + await send(self.proc, data) + self.proc.stdin.write_eof() + except BrokenPipeError: + raise - while not convert_q.empty(): - convert_task, frep = await convert_q.get() - convert_tasks.append(convert_task) - freps.append(frep) - convert_q.task_done() - try: - await asyncio.gather(*convert_tasks) - except subprocess.CalledProcessError: - for task in convert_tasks: - await cancel_task(task) - raise + async def _pagenums(self): + """Receive number of pages in original document from server""" + try: + untrusted_pagenums = int(await recvline(self.proc)) + except (AttributeError, EOFError, UnicodeError, ValueError) as e: + raise ReceiveError from e - try: - await combine_reps(loop, save_path, freps) - except IOError: - raise + try: + if not 1 <= untrusted_pagenums <= MAX_PAGES: + raise ValueError + except ValueError as e: + logging.error("Invalid number of pages received") + raise PageError from e + else: + pagenums = untrusted_pagenums - await asyncio.gather(*[loop.run_in_executor(None, unlink, frep) - for frep in freps]) + return pagenums -async def convert(loop, path, pagenums, rep_q, convert_q): - save_path = path.with_suffix(".trusted.pdf") + def _archive(self): + """Move original file into an archival directory""" + archive_path = Path(ARCHIVE_PATH, self.orig_path.name) + self.orig_path.rename(archive_path) + print(f"\nOriginal PDF saved as: {archive_path}") - try: - for page in range(1, pagenums + 1): - rep, dim = await rep_q.get() - convert_task = asyncio.create_task(convert_rep(loop, rep, dim)) - await convert_q.put((convert_task, rep.final)) - rep_q.task_done() - - if convert_q.full() or page == pagenums: - try: - await convert_batch(loop, 
convert_q, save_path) - except (IOError, subprocess.CalledProcessError): - raise - except asyncio.CancelledError: - while not convert_q.empty(): - task, _ = await convert_q.get() - await cancel_task(task) - convert_q.task_done() - raise - return save_path +def modify_click_errors(func): + """Decorator for replacing Click behavior on errors""" + def show(self, file=None): + """Removes usage message from UsageError error messages""" + color = None + if file is None: + file = get_text_stderr() -async def sanitize(loop, proc, path, batch_size): - rep_q = asyncio.Queue(batch_size) - convert_q = asyncio.Queue(batch_size) + if self.ctx is not None: + color = self.ctx.color - try: - pagenums = await recv_pagenums(loop, proc) - except (PageError, ReceiveError): - raise + click.echo(f"{self.format_message()}", file=file, color=color) - with TemporaryDirectory(prefix="qvm-sanitize-") as tmpdir: - receive_task = asyncio.create_task(receive(loop, proc, pagenums, tmpdir, - rep_q)) - convert_task = asyncio.create_task(convert(loop, path, pagenums, rep_q, - convert_q)) + def format_message(self): + """Removes 'Invalid value' from BadParameter error messages""" + if self.param_hint is not None: + prefix = self.param_hint + elif self.param is not None: + prefix = self.param.get_error_hint(self.ctx) + else: + return self.message + prefix = click.exceptions._join_param_hints(prefix) - try: - _, save_path = await asyncio.gather(receive_task, convert_task) - except (DimensionError, ReceiveError): - await cancel_task(convert_task) - raise - except (IOError, subprocess.CalledProcessError): - await cancel_task(receive_task) - raise + return f"{prefix}: {self.message}" - print(f"\nConverted PDF saved as: {save_path}") + click.exceptions.BadParameter.format_message = format_message + click.exceptions.UsageError.show = show + return func -async def run(loop, params): - cmd = ["/usr/bin/qrexec-client-vm", "@dispvm", "qubes.PdfConvert"] - proc_tasks = [] - send_tasks = [] - sanitize_tasks = [] 
- print("Sending files to Disposable VMs...") +def validate_paths(ctx, param, untrusted_paths): + """Callback for validating file paths parsed by Click""" + for untrusted_path in untrusted_paths: + if not untrusted_path.resolve().exists(): + raise BadPath(untrusted_path, "No such file or directory") + elif not untrusted_path.resolve().is_file(): + raise BadPath(untrusted_path, "Not a regular file") - for path in params["files"]: - proc = await asyncio.create_subprocess_exec(*cmd, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE) - proc_tasks.append(asyncio.create_task(wait_proc(proc, cmd))) - send_tasks.append(asyncio.create_task(send_pdf(loop, proc, path))) - sanitize_tasks.append(asyncio.create_task(sanitize(loop, - proc, - path, - params["batch"]))) - - for path, proc_task, send_task, sanitize_task in zip(params["files"], - proc_tasks, - send_tasks, - sanitize_tasks): try: - await asyncio.gather(proc_task, send_task, sanitize_task) - except (BrokenPipeError, - DimensionError, - PageError, - ReceiveError, - subprocess.CalledProcessError) as e: - await asyncio.gather(cancel_task(send_task), - cancel_task(sanitize_task), - cancel_task(proc_task)) - else: - await loop.run_in_executor(None, archive, path) + with untrusted_path.resolve().open("rb"): + pass + except PermissionError: + raise BadPath(untrusted_path, "Not readable") + else: + paths = untrusted_paths + + return paths + + +async def run(loop, params): + print("Sending files to Disposable VMs...") + files = [UnsanitizedFile(loop, f) for f in params["files"]] + await asyncio.gather(*[f.sanitize(params["batch"]) for f in files], + return_exceptions=True) @click.command() diff --git a/qubespdfconverter/server.py b/qubespdfconverter/server.py index 3d67b96..dbecd52 100755 --- a/qubespdfconverter/server.py +++ b/qubespdfconverter/server.py @@ -30,35 +30,16 @@ DEPTH = 8 STDIN_READ_SIZE = 65536 -Representation = namedtuple("Representation", ["initial", "final"]) - - -class ConversionError(Exception): - """ - """ 
class ReceiveError(Exception): - """ - """ - + """Raised if a STDOUT read failed in a qrexec-client-vm subprocess""" -############################### -# Utilities -############################### +def unlink(path): + """Wrapper for pathlib.Path.unlink(path, missing_ok=True) -def info(msg, suffix=None): - """Qrexec wrapper for displaying information on the client - - @suffix is really only ever used when @msg needs to overwrite the line of - the previous message (imitating an updating line). This is done by setting - @suffix to "\r". + :param Path: File path to delete """ - print(msg, end=suffix, flush=True, file=sys.stderr) - - -def unlink(path): - """Wrapper for Path.unlink(path, missing_ok=True)""" try: path.unlink() except FileNotFoundError: @@ -66,6 +47,13 @@ def unlink(path): async def cancel_task(task): + """Convenience wrapper for cancelling an asyncio Task + + Presumably, since we're cancelling tasks, we don't care what they returned + with or raised. + + :param task: Task to cancel + """ task.cancel() try: await task @@ -73,21 +61,28 @@ async def cancel_task(task): pass -async def wait_proc(proc, cmd): - await proc.wait() - if proc.returncode: - raise subprocess.CalledProcessError(proc, returncode, cmd) - - async def terminate_proc(proc): + """Convenience wrapper for terminating a process + + :param proc: Process to terminate + """ if proc.returncode is None: proc.terminate() await proc.wait() -############################### -# Qrexec-related -############################### +async def wait_proc(proc, cmd): + """Convenience wrapper for waiting on a process + + :param proc: Process to wait on + :param cmd: Command executed by @proc (Exception handling purposes) + """ + try: + await proc.wait() + if proc.returncode: + raise subprocess.CalledProcessError(proc.returncode, cmd) + except asyncio.CancelledError: + await terminate_proc(proc) def recv_b(): @@ -114,7 +109,10 @@ def recvline(): def send_b(data): - """Qrexec wrapper for sending binary data to the 
client's STDOUT""" + """Qrexec wrapper for sending binary data to the client + + :param data: Data to send (bytes, String, or int) + """ if isinstance(data, (str, int)): data = str(data).encode() @@ -123,184 +121,228 @@ def send_b(data): def send(data): - """Qrexec wrapper for sending text data to the client's STDOUT""" - print(data, flush=True) - - -############################### -# Rep-related -############################### + """Qrexec wrapper for sending text data to the client - -def get_rep(tmpdir, page, initial, final): - """Create temporary file for page representations""" - name = Path(tmpdir, f"{page}") - return Representation(initial=name.with_suffix(f".{initial}"), - final=name.with_suffix(f".{final}")) + :param data: Data to send + """ + print(data, flush=True) -############################### -# Image-related -############################### +class Representation(object): + """Umbrella object for the initial & final representations of a file + The initial representation must be of a format from which we can derive + the final representation without breaking any of its requirements. + Generally, this makes the initial representation some sort of image file + (e.g. PNG, JPEG). -async def get_irep(pdfpath, irep, page): - cmd = ["pdftocairo", f"{pdfpath}", "-png", "-f", f"{page}", "-l", - f"{page}", "-singlefile", f"{Path(irep.parent, irep.stem)}"] + The final representation must be of a format such that if the initial + representation contains malicious code/data, such code/data is excluded + from the final representation upon conversion. Generally, this makes the + final representation a relatively simple format (e.g., RGB bitmap). 
- try: - proc = await asyncio.create_subprocess_exec(*cmd) - await wait_proc(proc, cmd) - except asyncio.CancelledError: - await terminate_proc(proc) - raise - except subprocess.CalledProcessError: - raise - - -async def get_img_dim(irep): - cmd = ["identify", "-format", "%w %h", f"{irep}"] - - try: - proc = await asyncio.create_subprocess_exec(*cmd, stdout=subprocess.PIPE) - output, _ = await proc.communicate() - except asyncio.CancelledError: - await terminate_proc(proc) - raise - except subprocess.CalledProcessError: - raise + :param loop: Main event loop + :param path: Path to original, unsanitized file + :param prefix: Path prefixes of the representations + :param initial: The format of the initial representation + :param final: The format of the final representation + """ - return output.decode("ascii") + def __init__(self, loop, path, prefix, initial, final): + self.loop = loop + self.path = path + self.page = prefix.name + self.initial = prefix.with_suffix(f".{initial}") + self.final = prefix.with_suffix(f".{final}") + self.dim = None -async def convert_rep(irep, frep): - cmd = ["convert", f"{irep}", "-depth", f"{DEPTH}", f"rgb:{frep}"] + async def convert(self): + """Convert initial representation to final representation""" + try: + irep_task = asyncio.create_task(self._irep()) + await irep_task + except asyncio.CancelledError: + await cancel_task(irep_task) + raise + except subprocess.CalledProcessError: + raise - try: - proc = await asyncio.create_subprocess_exec(*cmd) - await wait_proc(proc, cmd) - except asyncio.CancelledError: - await terminate_proc(proc) - raise - except subprocess.CalledProcessError: - raise + try: + dim_task = asyncio.create_task(self._dim()) + self.dim = await dim_task + except asyncio.CancelledError: + await cancel_task(dim_task) + raise + except subprocess.CalledProcessError: + raise + cmd = [ + "convert", + f"{self.initial}", + "-depth", + f"{DEPTH}", + f"rgb:{self.final}" + ] -async def render(loop, page, pdfpath, rep): - try: 
try: - irep_task = asyncio.create_task(get_irep(pdfpath, rep.initial, page)) - await irep_task - - dim_task = asyncio.create_task(get_img_dim(rep.initial)) - convert_task = asyncio.create_task(convert_rep(rep.initial, rep.final)) - dim, _ = await asyncio.gather(dim_task, convert_task) + proc = await asyncio.create_subprocess_exec(*cmd) + await wait_proc(proc, cmd) + except asyncio.CancelledError: + await terminate_proc(proc) + raise except subprocess.CalledProcessError: raise finally: - await loop.run_in_executor(None, unlink, rep.initial) - - return (dim, rep.final) - except asyncio.CancelledError: - await asyncio.gather(cancel_task(irep_task), cancel_task(dim_task), - cancel_task(convert_task)) - raise + await self.loop.run_in_executor(None, unlink, self.initial) + + + async def _irep(self): + """Create initial representation""" + cmd = [ + "pdftocairo", + f"{self.path}", + "-png", + "-f", + f"{self.page}", + "-l", + f"{self.page}", + "-singlefile", + f"{Path(self.initial.parent, self.initial.stem)}" + ] + try: + proc = await asyncio.create_subprocess_exec(*cmd) + await wait_proc(proc, cmd) + except asyncio.CancelledError: + await terminate_proc(proc) + raise + except subprocess.CalledProcessError: + raise -############################### -# PDF-related -############################### + async def _dim(self): + """Identify image dimensions of initial representation""" + cmd = ["identify", "-format", "%w %h", f"{self.initial}"] -def get_pagenums(pdfpath): - cmd = ["pdfinfo", f"{pdfpath}"] + try: + proc = await asyncio.create_subprocess_exec(*cmd, + stdout=subprocess.PIPE) + output, _ = await proc.communicate() + except asyncio.CancelledError: + await terminate_proc(proc) + raise + except subprocess.CalledProcessError: + raise - try: - output = subprocess.run(cmd, capture_output=True, check=True) - except subprocess.CalledProcessError: - # TODO: Support converting JPGs and PNGs like the OG script - logging.error("file is probably not a PDF") - raise + return 
output.decode("ascii") - for line in output.stdout.decode("ascii").splitlines(): - if "Pages:" in line: - return int(line.split(":")[1]) +class SuspectFile(object): + """A potentially malicious file which needs sanitization -############################### -# Main -############################### + :param loop: Main event loop + :param path: Path to file + """ + def __init__(self, loop, path): + self.path = path + self.dir = path.parent + self.pagenums = None -async def recv_pages(loop, queue, path, tmpdir, pagenums): - for page in range(1, pagenums + 1): - rep = get_rep(tmpdir, page, "png", "rgb") - task = asyncio.create_task(render(loop, page, path, rep)) + self.loop = loop + self.queue = None try: - await queue.put(task) - except asyncio.CancelledError: - await cancel_task(task) - queue.task_done() + data = recv_b() + self.path.write_bytes(data) + except ReceiveError: raise -async def send_pages(loop, queue, pagenums): - for page in range(1, pagenums + 1): - task = await queue.get() - + async def sanitize(self): + """Start sanitization tasks""" try: - dim, frep = await task + self.pagenums = await self.loop.run_in_executor(None, self._pagenums) + send(self.pagenums) + self.queue = asyncio.Queue(self.pagenums) except subprocess.CalledProcessError: raise - else: - queue.task_done() - send(dim) - send_b(await loop.run_in_executor(None, frep.read_bytes)) - await loop.run_in_executor(None, unlink, frep) + publish_task = asyncio.create_task(self._publish()) + consume_task = asyncio.create_task(self._consume()) + try: + await asyncio.gather(publish_task, consume_task) + except subprocess.CalledProcessError: + await cancel_task(publish_task) -async def run(loop, path, tmpdir, pagenums): - queue = asyncio.Queue(pagenums) - recv_task = asyncio.create_task(recv_pages(loop, queue, path, tmpdir, pagenums)) - send_task = asyncio.create_task(send_pages(loop, queue, pagenums)) + while not self.queue.empty(): + convert_task = await self.queue.get() + await 
cancel_task(convert_task) + self.queue.task_done() - try: - await asyncio.gather(recv_task, send_task) - except subprocess.CalledProcessError: - await cancel_task(recv_task) + raise - while not queue.empty(): - task = await queue.get() - await cancel_task(task) - queue.task_done() - raise + async def _publish(self): + """Extract initial representations and enqueue conversion tasks""" + for page in range(1, self.pagenums + 1): + rep = Representation(self.loop, self.path, Path(self.dir, str(page)), + "png", "rgb") + try: + convert_task = asyncio.create_task(rep.convert()) + await self.queue.put((rep, convert_task)) + except asyncio.CancelledError: + await cancel_task(convert_task) + self.queue.task_done() + raise + except subprocess.CalledProcessError: + raise -def main(): - logging.basicConfig(level=logging.INFO, format="DispVM: %(message)s") - try: - pdf_data = recv_b() - except ReceiveError: - sys.exit(1) + async def _consume(self): + """Await conversion tasks and send final representation to client""" + for page in range(1, self.pagenums + 1): + rep, convert_task = await self.queue.get() - with TemporaryDirectory(prefix="qvm-sanitize-") as tmpdir: - pdfpath = Path(tmpdir, "original") - pdfpath.write_bytes(pdf_data) + try: + await convert_task + except subprocess.CalledProcessError: + raise + else: + self.queue.task_done() + + send(rep.dim) + send_b(await self.loop.run_in_executor(None, rep.final.read_bytes)) + await self.loop.run_in_executor(None, unlink, rep.final) + + + def _pagenums(self): + """Return the number of pages in the suspect file""" + cmd = ["pdfinfo", f"{self.path}"] try: - pagenums = get_pagenums(pdfpath) - send(pagenums) + output = subprocess.run(cmd, capture_output=True, check=True) except subprocess.CalledProcessError: - sys.exit(1) + logging.error("File is probably not a PDF") + raise + + for line in output.stdout.decode("ascii").splitlines(): + if "Pages:" in line: + return int(line.split(":")[1]) + +def main(): + 
logging.basicConfig(level=logging.INFO, format="DispVM: %(message)s") + loop = asyncio.get_event_loop() + + with TemporaryDirectory(prefix="qvm-sanitize-") as tmpdir: try: - loop = asyncio.get_event_loop() - loop.run_until_complete(run(loop, pdfpath, tmpdir, pagenums)) - except subprocess.CalledProcessError: + f = SuspectFile(loop, Path(tmpdir, "original")) + loop.run_until_complete(f.sanitize()) + except (PageError, ReceiveError, subprocess.CalledProcessError): sys.exit(1) finally: loop.run_until_complete(loop.shutdown_asyncgens()) From 3fd46364974296b11c12e14d6c38f014b76f62d4 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Mon, 27 Apr 2020 13:34:49 -0500 Subject: [PATCH 60/92] server: Remove nonexistent PageError --- qubespdfconverter/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qubespdfconverter/server.py b/qubespdfconverter/server.py index dbecd52..a52fe92 100755 --- a/qubespdfconverter/server.py +++ b/qubespdfconverter/server.py @@ -342,7 +342,7 @@ def main(): try: f = SuspectFile(loop, Path(tmpdir, "original")) loop.run_until_complete(f.sanitize()) - except (PageError, ReceiveError, subprocess.CalledProcessError): + except (ReceiveError, subprocess.CalledProcessError): sys.exit(1) finally: loop.run_until_complete(loop.shutdown_asyncgens()) From 4e6b286ff08473c14880785249dfde0bfe192e9c Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Mon, 27 Apr 2020 21:33:47 -0500 Subject: [PATCH 61/92] client, server: Remove unused imports --- qubespdfconverter/client.py | 1 - qubespdfconverter/server.py | 1 - 2 files changed, 2 deletions(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index 75600b9..9ae864c 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -21,7 +21,6 @@ import asyncio import click -from contextlib import contextmanager import functools import logging import os diff --git a/qubespdfconverter/server.py b/qubespdfconverter/server.py index a52fe92..c1c95ff 100755 --- 
a/qubespdfconverter/server.py +++ b/qubespdfconverter/server.py @@ -23,7 +23,6 @@ import logging import subprocess import sys -from collections import namedtuple from pathlib import Path from tempfile import TemporaryDirectory From 24dc64fad30d23a9360f5bbf6edb4b61187a7ac7 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Mon, 27 Apr 2020 21:34:52 -0500 Subject: [PATCH 62/92] client, server: Rename unsanitized file class --- qubespdfconverter/client.py | 7 ++++--- qubespdfconverter/server.py | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index 9ae864c..7955c30 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -260,8 +260,8 @@ async def _dim(self, proc): return Dimensions(width=width, height=height, size=size, depth=DEPTH) -class UnsanitizedFile(object): - """A file not yet cleansed by holy bathwater +class BaseFile(object): + """Unsanitized file :param loop: Main event loop :param path: Path to original, unsanitized file @@ -289,6 +289,7 @@ async def sanitize(self, size): self.proc = await asyncio.create_subprocess_exec(*CLIENT_VM_CMD, stdin=subprocess.PIPE, stdout=subprocess.PIPE) + proc_task = asyncio.create_task(wait_proc(self.proc, CLIENT_VM_CMD)) send_task = asyncio.create_task(self._send()) sanitize_task = asyncio.create_task(self._sanitize()) @@ -516,7 +517,7 @@ def validate_paths(ctx, param, untrusted_paths): async def run(loop, params): print("Sending files to Disposable VMs...") - files = [UnsanitizedFile(loop, f) for f in params["files"]] + files = [BaseFile(loop, f) for f in params["files"]] await asyncio.gather(*[f.sanitize(params["batch"]) for f in files], return_exceptions=True) diff --git a/qubespdfconverter/server.py b/qubespdfconverter/server.py index c1c95ff..470c900 100755 --- a/qubespdfconverter/server.py +++ b/qubespdfconverter/server.py @@ -237,8 +237,8 @@ async def _dim(self): return output.decode("ascii") -class SuspectFile(object): 
- """A potentially malicious file which needs sanitization +class BaseFile(object): + """Unsanitized file :param loop: Main event loop :param path: Path to file @@ -339,7 +339,7 @@ def main(): with TemporaryDirectory(prefix="qvm-sanitize-") as tmpdir: try: - f = SuspectFile(loop, Path(tmpdir, "original")) + f = BaseFile(loop, Path(tmpdir, "original")) loop.run_until_complete(f.sanitize()) except (ReceiveError, subprocess.CalledProcessError): sys.exit(1) From 225780e5d0384b1eb3c606fe926f3b1fcdf28771 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 2 May 2020 10:12:03 -0500 Subject: [PATCH 63/92] client: Add --archive, --dry-run, and --in-place --- qubespdfconverter/client.py | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index 7955c30..26443a4 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -522,9 +522,42 @@ async def run(loop, params): return_exceptions=True) +# @click.option("-v", "--verbose", is_flag=True) @click.command() -@click.option("-b", "--batch", type=click.IntRange(0), default=50) -@click.argument("files", type=Path, nargs=-1, callback=validate_paths) +@click.option( + "-b", + "--batch", + type=click.IntRange(0), + default=50, + metavar="SIZE", + help="Maximum number of conversion tasks" +) +@click.option( + "-a", + "--archive", + default="~/QubesUntrustedPDFs", + metavar="PATH", + help="Directory for storing archived files" +) +@click.option( + "-d", + "--dry-run", + is_flag=True, + help="Perform only server-side checks and conversions" +) +@click.option( + "-i", + "--in-place", + is_flag=True, + help="Replace original files instead of archiving them" +) +@click.argument( + "files", + type=Path, + nargs=-1, + callback=validate_paths, + metavar="[FILES ...]" +) @modify_click_errors def main(**params): logging.basicConfig(level=logging.INFO, format="%(message)s") From 9a61502e3e2fdb4df3123dec8893f90b32ae032b 
Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sat, 2 May 2020 10:12:45 -0500 Subject: [PATCH 64/92] client: Implement --in-place Also, the trusted file is now stored in /tmp/qvm-sanitize-XXX/FILENAME.trusted.pdf during all of its batch conversions. Once they're all done, it's moved to its final save path. The archive directory is no longer created if -i is specified. The ending prompts are removed to prepare for tqdm. --- qubespdfconverter/client.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index 26443a4..ad6f301 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -24,6 +24,7 @@ import functools import logging import os +import shutil import subprocess import sys from click._compat import get_text_stderr @@ -274,7 +275,7 @@ def __init__(self, loop, path): self.convert_q = None self.orig_path = path - self.save_path = path.with_suffix(".trusted.pdf") + self.save_path = None self.dir = None self.pagenums = None @@ -315,6 +316,8 @@ async def _sanitize(self): with TemporaryDirectory(prefix="qvm-sanitize-") as d: self.dir = d + self.save_path = Path(d, self.orig_path.with_suffix(".trusted.pdf").name) + receive_task = asyncio.create_task(self._receive()) convert_task = asyncio.create_task(self._convert()) @@ -327,16 +330,23 @@ async def _sanitize(self): await cancel_task(receive_task) raise - await self.loop.run_in_executor(None, self._archive) - print(f"Converted PDF saved as: {self.save_path}") + await self.loop.run_in_executor( + None, + shutil.move, + self.save_path, Path(self.orig_path.parent, self.save_path.name) + ) + + if click.get_current_context().params["in_place"]: + await self.loop.run_in_executor(None, unlink, self.orig_path) + else: + await self.loop.run_in_executor(None, self._archive) async def _receive(self): """Receive initial representations""" for page in range(1, self.pagenums + 1): try: - rep = 
Representation(self.loop, Path(self.dir, str(page)), - "rgb", "png") + rep = Representation(self.loop, Path(self.dir, str(page)), "rgb", "png") await rep.receive(self.proc) except (DimensionError, ReceiveError): raise @@ -461,7 +471,6 @@ def _archive(self): """Move original file into an archival directory""" archive_path = Path(ARCHIVE_PATH, self.orig_path.name) self.orig_path.rename(archive_path) - print(f"\nOriginal PDF saved as: {archive_path}") def modify_click_errors(func): @@ -566,7 +575,8 @@ def main(**params): logging.info("No files to sanitize.") sys.exit(0) - Path.mkdir(ARCHIVE_PATH, exist_ok=True) + if not params["in_place"]: + Path.mkdir(ARCHIVE_PATH, exist_ok=True) try: loop = asyncio.get_event_loop() From 2a5d8b96b45fbae2b94c552a322d9bf77bffb728 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Mon, 4 May 2020 00:38:59 -0500 Subject: [PATCH 65/92] client: Add initial version of tqdm output --- qubespdfconverter/client.py | 79 ++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 6 deletions(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index ad6f301..816c3aa 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -32,6 +32,7 @@ from pathlib import Path from PIL import Image from tempfile import TemporaryDirectory +from tqdm import tqdm PROG = Path(sys.argv[0]).name CLIENT_VM_CMD = ["/usr/bin/qrexec-client-vm", "@dispvm", "qubes.PdfConvert"] @@ -114,6 +115,17 @@ async def wait_proc(proc, cmd): await terminate_proc(proc) +async def add_success_cb(fut, cb, *args, is_async=False): + result = await fut + + if is_async: + await cb(*args) + else: + cb(*args) + + return result + + async def send(proc, data): """Qrexec wrapper for sending data to the server @@ -261,6 +273,22 @@ async def _dim(self, proc): return Dimensions(width=width, height=height, size=size, depth=DEPTH) +class TqdmExtraFormat(tqdm): + """Provides a `pages` format parameter""" + @property + def format_dict(self): + d = 
super(TqdmExtraFormat, self).format_dict + + if self.total == 0: + pages = "n/a" + else: + pages = str(d["n"]) + "/" + str(d["total"]) + + d.update(pages=pages) + + return d + + class BaseFile(object): """Unsanitized file @@ -277,10 +305,14 @@ def __init__(self, loop, path): self.orig_path = path self.save_path = None self.dir = None - self.pagenums = None + self.pagenums = 0 + self.bar = None + self.size = None + self.converted = False - async def sanitize(self, size): + + async def sanitize(self, size, pos): """Start Qubes RPC session and sanitization tasks :param size: Batch size for queues @@ -291,6 +323,11 @@ async def sanitize(self, size): stdin=subprocess.PIPE, stdout=subprocess.PIPE) + self.bar = TqdmExtraFormat(total=self.pagenums, + position=pos, + desc=f"{self.orig_path}...", + bar_format=" {desc} ({pages})") + proc_task = asyncio.create_task(wait_proc(self.proc, CLIENT_VM_CMD)) send_task = asyncio.create_task(self._send()) sanitize_task = asyncio.create_task(self._sanitize()) @@ -305,6 +342,11 @@ async def sanitize(self, size): await asyncio.gather(cancel_task(send_task), cancel_task(sanitize_task), cancel_task(proc_task)) + raise + finally: + self.bar.set_description_str(str(self.orig_path)) + + self.converted = True async def _sanitize(self): @@ -314,6 +356,8 @@ async def _sanitize(self): except (PageError, ReceiveError): raise + self.bar.reset(total=self.pagenums) + with TemporaryDirectory(prefix="qvm-sanitize-") as d: self.dir = d self.save_path = Path(d, self.orig_path.with_suffix(".trusted.pdf").name) @@ -359,7 +403,10 @@ async def _convert(self): try: for page in range(1, self.pagenums + 1): rep = await self.rep_q.get() - convert_task = asyncio.create_task(rep.convert()) + convert_fut = asyncio.ensure_future(rep.convert()) + convert_task = asyncio.create_task(add_success_cb(convert_fut, + self.bar.update, + 1)) await self.convert_q.put((convert_task, rep.final)) self.rep_q.task_done() @@ -525,11 +572,31 @@ def validate_paths(ctx, param, 
untrusted_paths): async def run(loop, params): - print("Sending files to Disposable VMs...") files = [BaseFile(loop, f) for f in params["files"]] - await asyncio.gather(*[f.sanitize(params["batch"]) for f in files], + suffix = "s" if len(files) > 1 else "" + logging.info(f":: Sending file{suffix} to Disposable VM{suffix}...\n") + + # FIXME: Hides exceptions with return_exceptions=True + await asyncio.gather(*[f.sanitize(params["batch"], i) for i, f in enumerate(files)], return_exceptions=True) + # FIXME: Why does last close mess up the output w/out hardcoded positions? + n = 0 + net_size = 0 + + for f in files: + if f.converted: + n += 1 + net_size += Path(f.orig_path.parent, f.save_path.name).stat().st_size + f.bar.close() + + frac = f"{n}/{len(files)}" + netsize = net_size / (1024 * 1024) + spacing = len(f"{netsize:.2f} MiB") + 2 # 2 = 2 leading spaces + + print(f"\nTotal Sanitized Files:{frac:>{spacing}}") + print(f"Net Sanitized Size: {netsize:.2f} MiB") + # @click.option("-v", "--verbose", is_flag=True) @click.command() @@ -582,7 +649,7 @@ def main(**params): loop = asyncio.get_event_loop() loop.run_until_complete(run(loop, params)) except KeyboardInterrupt: - logging.error("Original files untouched.") + logging.error("Unsanitized files untouched.") finally: loop.run_until_complete(loop.shutdown_asyncgens()) From 924824f7dc28505aae34d701ba37976cad9fde7b Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 19 May 2020 14:15:51 -0500 Subject: [PATCH 66/92] client: Replace namedtuple with dataclass --- qubespdfconverter/client.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index 816c3aa..24c5044 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -27,8 +27,10 @@ import shutil import subprocess import sys + from click._compat import get_text_stderr -from collections import namedtuple +from enum import IntFlag +from dataclasses import 
dataclass from pathlib import Path from PIL import Image from tempfile import TemporaryDirectory @@ -44,7 +46,21 @@ ARCHIVE_PATH = Path(Path.home(), "QubesUntrustedPDFs") -Dimensions = namedtuple("Dimensions", ["width", "height", "size", "depth"]) + +class Status(IntFlag): + """Sanitization job status""" + PENDING = 1 + DONE = 2 + FAIL = 4 + CANCELLED = 8 + + +@dataclass(frozen=True) +class ImageDimensions: + width: int + height: int + size: int + depth: int = DEPTH class DimensionError(ValueError): From 7243999801c2c78c4a6c971b371e039c74265672 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 19 May 2020 14:29:11 -0500 Subject: [PATCH 67/92] client, server: Update UI, design, and error handling The client finally has a UI that I'm satisfied with: progress "bars", status reports, and nice little error messages. For overall design changes, a lot of the non-BaseFile related attributes and methods were taken out and put into a Job class. This just makes more intuitive sense since a BaseFile shouldn't really have a progress bar or qrexec subprocess. In addition, queue items are no longer arbitrary tuples but entries with an associated data type. Error handling saw some major improvements. Most notably, explicit raises in areas where we don't do any other cleanup code have been removed. Also, signal handling (which was a major pain) is finally solved (at least, for SIGINT) and task cancellation logic has been updated. Finally, batch processing is now done properly by having the publishing task join on the queue until all the running tasks are done.
--- README.md | 30 +- qubespdfconverter/client.py | 743 ++++++++++++++++++------------------ qubespdfconverter/server.py | 302 ++++++--------- 3 files changed, 503 insertions(+), 572 deletions(-) diff --git a/README.md b/README.md index 62bd97f..25da71e 100644 --- a/README.md +++ b/README.md @@ -2,31 +2,29 @@ Qubes PDF Converter ==================== Qubes PDF converter is a [Qubes](https://qubes-os.org) Application that -utilizes Qubes' flexible qrexec (inter-VM communication) infrastructure and -Disposable VMs to securely convert potentially untrusted (e.g. maliciously -malformed) PDF files into safe-to-view PDF files. +utilizes Disposable VMs and Qubes' flexible qrexec (inter-VM communication) +infrastructure to securely convert potentially untrusted PDF files into +safe-to-view PDF files. -This is done by having a Disposable VM render each page of a PDF file into a -very simple representation (RGB bitmap) that (presumably) leaves no room for -malicious code. This representation is then sent back to the client AppVM which +This is done by having a Disposable VM render each page of a PDF file into a +very simple representation (RGB bitmap) that (presumably) leaves no room for +malicious code. This representation is then sent back to the client AppVM which then constructs an entirely new PDF file out of the received bitmaps. -Of course, the price we pay for this conversion is an increase in file size and -the loss of any structural information or text-based search in the converted -PDF. - More discussion of the concept has been described in the original article [here](http://blog.invisiblethings.org/2013/02/21/converting-untrusted-pdfs-into-trusted.html). Usage ------ - [user@varia ~]$ qvm-convert-pdf test.pdf - Sending file to a Disposable VM... - Waiting for converted samples... - Receving page 8/8... 
- Converted PDF saved as: /home/user/test.trusted.pdf - Original file saved as /home/user/QubesUntrustedPDFs/test.pdf + [user@domU ~]$ qvm-convert-pdf file1.pdf file2.pdf file3.pdf + :: Sending files to Disposable VMs... + + file1.pdf...done + file2.pdf...fail + file3.pdf...done + + Total Sanitized Files: 2/3 Authors --------- diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index 24c5044..bfd1cd2 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -25,6 +25,7 @@ import logging import os import shutil +import signal import subprocess import sys @@ -36,7 +37,6 @@ from tempfile import TemporaryDirectory from tqdm import tqdm -PROG = Path(sys.argv[0]).name CLIENT_VM_CMD = ["/usr/bin/qrexec-client-vm", "@dispvm", "qubes.PdfConvert"] MAX_PAGES = 10000 @@ -44,7 +44,7 @@ MAX_IMG_HEIGHT = 10000 DEPTH = 8 -ARCHIVE_PATH = Path(Path.home(), "QubesUntrustedPDFs") +ERROR_LOGS = asyncio.Queue() class Status(IntFlag): @@ -71,35 +71,78 @@ class PageError(ValueError): """Raised if an invalid number of pages was received""" -class ReceiveError(Exception): - """Raised if a STDOUT read failed in a qrexec-client-vm subprocess""" +class QrexecError(Exception): + """Raised if a qrexec-related error occurred""" + + +class RepresentationError(Exception): + """Raised if a representation-related error occurred""" class BadPath(click.BadParameter): - """Raised if a Path object parsed by Click is invalid.""" + """Raised if a Path object parsed by Click is invalid""" def __init__(self, path, message): super().__init__(message, param_hint=f'"{path}"') -def unlink(path): - """Wrapper for pathlib.Path.unlink(path, missing_ok=True) +async def sigint_handler(tasks): + await asyncio.gather(*[cancel_task(t) for t in tasks]) - :param Path: File path to delete - """ - try: - path.unlink() - except FileNotFoundError: - pass +def modify_click_errors(func): + """Decorator for replacing Click behavior on errors""" -async def cancel_task(task): - """Convenience
wrapper for cancelling an asyncio Task + def show(self, file=None): + """Removes usage message from UsageError error messages""" + color = None - Presumably, since we're cancelling tasks, we don't care what they returned - with or raised. + if file is None: + file = get_text_stderr() - :param task: Task to cancel - """ + if self.ctx is not None: + color = self.ctx.color + + click.echo(f"{self.format_message()}", file=file, color=color) + + + def format_message(self): + """Removes 'Invalid value' from BadParameter error messages""" + if self.param_hint is not None: + prefix = self.param_hint + elif self.param is not None: + prefix = self.param.get_error_hint(self.ctx) + else: + return self.message + prefix = click.exceptions._join_param_hints(prefix) + + return f"{prefix}: {self.message}" + + click.exceptions.BadParameter.format_message = format_message + click.exceptions.UsageError.show = show + + return func + + +def validate_paths(ctx, param, untrusted_paths): + """Callback for validating file paths parsed by Click""" + for untrusted_path in untrusted_paths: + if not untrusted_path.resolve().exists(): + raise BadPath(untrusted_path, "No such file or directory") + elif not untrusted_path.resolve().is_file(): + raise BadPath(untrusted_path, "Not a regular file") + + try: + with untrusted_path.resolve().open("rb"): + pass + except PermissionError as e: + raise BadPath(untrusted_path, "Not readable") from e + else: + paths = untrusted_paths + + return paths + + +async def cancel_task(task): task.cancel() try: await task @@ -108,50 +151,29 @@ async def cancel_task(task): async def terminate_proc(proc): - """Convenience wrapper for terminating a process - - :param proc: Process to terminate - """ if proc.returncode is None: proc.terminate() await proc.wait() async def wait_proc(proc, cmd): - """Convenience wrapper for waiting on a process - - :param proc: Process to wait on - :param cmd: Command executed by @proc (Exception handling purposes) - """ try: await 
proc.wait() - if proc.returncode: - raise subprocess.CalledProcessError(proc.returncode, cmd) except asyncio.CancelledError: await terminate_proc(proc) + raise - -async def add_success_cb(fut, cb, *args, is_async=False): - result = await fut - - if is_async: - await cb(*args) - else: - cb(*args) - - return result + if proc.returncode: + raise subprocess.CalledProcessError(proc.returncode, cmd) async def send(proc, data): - """Qrexec wrapper for sending data to the server - - :param proc: qrexec-client-vm process - :param data: Data to send (bytes, String, or int) - """ + """Qrexec wrapper for sending data to the server""" if isinstance(data, (int, str)): data = str(data).encode() proc.stdin.write(data) + try: await proc.stdin.drain() except BrokenPipeError: @@ -159,77 +181,84 @@ async def send(proc, data): async def recv_b(proc, size): - """Qrexec wrapper for receiving binary data from the server + """Qrexec wrapper for receiving binary data from the server""" + return await proc.stdout.readexactly(size) - :param proc: qrexec-client-vm process - :param size: Number of bytes to receive - """ - try: - untrusted_data = await proc.stdout.readexactly(size) - except asyncio.IncompleteReadError as e: - raise ReceiveError from e - return untrusted_data +async def recvline(proc): + """Qrexec wrapper for receiving a line of text data from the server""" + untrusted_data = await proc.stdout.readline() + if not untrusted_data: + raise EOFError + return untrusted_data.decode("ascii").rstrip() -async def recvline_b(proc): - """Qrexec wrapper for receiving a line of binary data from the server - :param proc: qrexec-client-vm process - """ - untrusted_data = await proc.stdout.readline() +class Tqdm(tqdm): + """Adds @pages and @status attributes""" - if not untrusted_data: - logging.error("Server may have died...") - raise ReceiveError + def __init__(self, *args, **kwargs): + self.pages = "0/?" 
+ self.status = Status.PENDING + super().__init__(*args, **kwargs) - return untrusted_data + @property + def format_dict(self): + d = super().format_dict -async def recvline(proc): - """Convenience wrapper for receiving a line of text data from the server + if ( + self.status & Status.PENDING + and d["total"] != 0 + and d["n"] != d["total"] + ): + self.pages = f"{d['n']}/{d['total']}" + + d.update(pages=self.pages) + + return d - :param proc: qrexec-client-vm process - """ - try: - untrusted_data = (await recvline_b(proc)).decode("ascii").rstrip() - except EOFError as e: - logging.error("Server may have died...") - raise ReceiveError from e - except (AttributeError, UnicodeError): - logging.error("Failed to decode received data!") - raise - return untrusted_data + def set_status(self, flag, refresh=True): + self.status = flag + self.pages = self.status.name.lower() + if refresh: + self.refresh() -class Representation(object): - """Umbrella object for the initial & final representations of a file + +class Representation: + """Umbrella object for a file's initial and final representations The initial representation must be of a format such that if it contains malicious code/data, such code/data is excluded from the final - representation upon conversion. Generally, this makes the initial - representation a relatively simple format (e.g., RGB bitmap). + representation upon conversion. Generally, this restricts the initial + representation to a relatively simple format (e.g., RGB bitmap). The final representation can be of any format you'd like, provided that - the initial representation's format was properly selected and you are - able to combine them later on into a PDF. + the initial representation's format was properly selected (e.g., PNG). 
- :param loop: Main event loop - :param prefix: Path prefixes of the representations - :param initial: Format of the initial representation - :param final: Format of the final representation + :param prefix: Path prefixes for representations + :param i_suffix: File extension of initial representation (without .) + :param f_suffix: File extension of final representation (without .) """ - def __init__(self, loop, prefix, initial, final): - self.loop = loop - self.initial = prefix.with_suffix(f".{initial}") - self.final = prefix.with_suffix(f".{final}") + def __init__(self, prefix, i_suffix, f_suffix): + """ + :param initial: File path to initial representation + :param final: File path to final representation + :param dim: Image dimensions received from the server + """ + self.initial = prefix.with_suffix(f".{i_suffix}") + self.final = prefix.with_suffix(f".{f_suffix}") self.dim = None - async def convert(self): - """Convert initial representation into final representation""" + async def convert(self, bar): + """Convert initial representation into final representation + + :param bar: Progress bar to update upon completion + """ cmd = [ "convert", "-size", @@ -240,17 +269,14 @@ async def convert(self): f"png:{self.final}" ] + proc = await asyncio.create_subprocess_exec(*cmd) + try: - proc = await asyncio.create_subprocess_exec(*cmd) await wait_proc(proc, cmd) - except asyncio.CancelledError: - await terminate_proc(proc) - raise - except subprocess.CalledProcessError: - logging.error(f"Page conversion failed") - raise + except subprocess.CalledProcessError as e: + raise RepresentationError("Failed to convert representation") from e - await self.loop.run_in_executor(None, unlink, self.initial) + bar.update(1) async def receive(self, proc): @@ -260,11 +286,21 @@ async def receive(self, proc): """ try: self.dim = await self._dim(proc) + except EOFError as e: + raise QrexecError("Failed to receive image dimensions") from e + except (AttributeError, UnicodeError, ValueError)
as e: + raise DimensionError("Invalid image dimensions") from e + + try: data = await recv_b(proc, self.dim.size) - except (DimensionError, ReceiveError): - raise + except asyncio.IncompleteReadError as e: + raise QrexecError("Received inconsistent number of bytes") from e - await self.loop.run_in_executor(None, self.initial.write_bytes, data) + await asyncio.get_running_loop().run_in_executor( + None, + self.initial.write_bytes, + data + ) async def _dim(self, proc): @@ -272,243 +308,254 @@ async def _dim(self, proc): :param proc: qrexec-client-vm process """ - try: - untrusted_w, untrusted_h = map(int, (await recvline(proc)).split(" ", 1)) - except (AttributeError, EOFError, UnicodeError, ValueError) as e: - raise ReceiveError from e + untrusted_w, untrusted_h = map(int, (await recvline(proc)).split(" ", 1)) - if 1 <= untrusted_w <= MAX_IMG_WIDTH and \ - 1 <= untrusted_h <= MAX_IMG_HEIGHT: + if 1 <= untrusted_w <= MAX_IMG_WIDTH and 1 <= untrusted_h <= MAX_IMG_HEIGHT: width = untrusted_w height = untrusted_h + size = width * height * 3 else: - logging.error(f"invalid image measurements received") - raise DimensionError + raise ValueError - size = width * height * 3 - return Dimensions(width=width, height=height, size=size, depth=DEPTH) + return ImageDimensions(width, height, size) -class TqdmExtraFormat(tqdm): - """Provides a `pages` format parameter""" - @property - def format_dict(self): - d = super(TqdmExtraFormat, self).format_dict - - if self.total == 0: - pages = "n/a" - else: - pages = str(d["n"]) + "/" + str(d["total"]) - - d.update(pages=pages) - - return d +@dataclass(frozen=True) +class BatchEntry: + task: asyncio.Task + rep: Representation -class BaseFile(object): +class BaseFile: """Unsanitized file - :param loop: Main event loop :param path: Path to original, unsanitized file + :param pagenums: Number of pages in original file """ - def __init__(self, loop, path): - self.loop = loop - self.proc = None - self.rep_q = None - self.convert_q = None - - 
self.orig_path = path - self.save_path = None - self.dir = None - self.pagenums = 0 - - self.bar = None - self.size = None - self.converted = False + def __init__(self, path, pagenums, pdf): + """ + :param path: @path + :param pagenums: @pagenums + :param batch: Conversion queue + """ + self.path = path + self.pagenums = pagenums + self.pdf = pdf + self.batch = None - async def sanitize(self, size, pos): - """Start Qubes RPC session and sanitization tasks + async def sanitize(self, proc, bar, archive, depth, in_place): + """Receive and convert representation files - :param size: Batch size for queues + :param archive: Path to archive directory + :param depth: Conversion queue size + :param in_place: Value of --in-place flag """ - self.rep_q = asyncio.Queue(size) - self.convert_q = asyncio.Queue(size) - self.proc = await asyncio.create_subprocess_exec(*CLIENT_VM_CMD, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE) + self.batch = asyncio.Queue(depth) - self.bar = TqdmExtraFormat(total=self.pagenums, - position=pos, - desc=f"{self.orig_path}...", - bar_format=" {desc} ({pages})") - - proc_task = asyncio.create_task(wait_proc(self.proc, CLIENT_VM_CMD)) - send_task = asyncio.create_task(self._send()) - sanitize_task = asyncio.create_task(self._sanitize()) + publish_task = asyncio.create_task(self._publish(proc, bar)) + consume_task = asyncio.create_task(self._consume()) try: - await asyncio.gather(proc_task, send_task, sanitize_task) - except (BrokenPipeError, - DimensionError, - PageError, - ReceiveError, - subprocess.CalledProcessError) as e: - await asyncio.gather(cancel_task(send_task), - cancel_task(sanitize_task), - cancel_task(proc_task)) - raise + await asyncio.gather(publish_task, consume_task) finally: - self.bar.set_description_str(str(self.orig_path)) + if not publish_task.done(): + await cancel_task(publish_task) - self.converted = True + if not consume_task.done(): + await cancel_task(consume_task) + while not self.batch.empty(): + batch_e = await 
self.batch.get() + await cancel_task(batch_e.task) + self.batch.task_done() - async def _sanitize(self): - """Receive and convert representation files""" - try: - self.pagenums = await self._pagenums() - except (PageError, ReceiveError): - raise - self.bar.reset(total=self.pagenums) + async def _publish(self, proc, bar): + """Receive initial representations and start their conversions""" + for page in range(1, self.pagenums + 1): + rep = Representation(Path(self.pdf.parent, str(page)), "rgb", "png") + await rep.receive(proc) - with TemporaryDirectory(prefix="qvm-sanitize-") as d: - self.dir = d - self.save_path = Path(d, self.orig_path.with_suffix(".trusted.pdf").name) + if page % self.batch.maxsize == 0: + await self.batch.join() - receive_task = asyncio.create_task(self._receive()) - convert_task = asyncio.create_task(self._convert()) + task = asyncio.create_task(rep.convert(bar)) + batch_e = BatchEntry(task, rep) try: - await asyncio.gather(receive_task, convert_task) - except (DimensionError, ReceiveError): - await cancel_task(convert_task) - raise - except (IOError, subprocess.CalledProcessError): - await cancel_task(receive_task) + await self.batch.put(batch_e) + except asyncio.CancelledError: + await cancel_task(task) raise - await self.loop.run_in_executor( - None, - shutil.move, - self.save_path, Path(self.orig_path.parent, self.save_path.name) - ) - - if click.get_current_context().params["in_place"]: - await self.loop.run_in_executor(None, unlink, self.orig_path) - else: - await self.loop.run_in_executor(None, self._archive) - - async def _receive(self): - """Receive initial representations""" + async def _consume(self): + """Convert initial representations to final form and save as PDF""" for page in range(1, self.pagenums + 1): - try: - rep = Representation(self.loop, Path(self.dir, str(page)), "rgb", "png") - await rep.receive(self.proc) - except (DimensionError, ReceiveError): - raise + batch_e = await self.batch.get() + await batch_e.task + await 
self._save_rep(batch_e.rep) + self.batch.task_done() - await self.rep_q.put(rep) + async def _save_rep(self, rep): + """Save final representations to a PDF file""" + try: + image = await asyncio.get_running_loop().run_in_executor( + None, + Image.open, + rep.final + ) + except IOError as e: + raise RepresentationError("Failed to open representation") from e - async def _convert(self): - """Convert initial representations to final representations""" try: - for page in range(1, self.pagenums + 1): - rep = await self.rep_q.get() - convert_fut = asyncio.ensure_future(rep.convert()) - convert_task = asyncio.create_task(add_success_cb(convert_fut, - self.bar.update, - 1)) - await self.convert_q.put((convert_task, rep.final)) - self.rep_q.task_done() - - if self.convert_q.full() or page == self.pagenums: - try: - await self._complete_batch() - except (IOError, subprocess.CalledProcessError): - raise - except asyncio.CancelledError: - while not self.convert_q.empty(): - convert_task, _ = await self.convert_q.get() - await cancel_task(convert_task) - self.convert_q.task_done() - raise + await asyncio.get_running_loop().run_in_executor( + None, + functools.partial(image.save, + self.pdf, + "PDF", + resolution=100, + append=self.pdf.exists(), + save_all=True) + ) + except IOError as e: + raise RepresentationError("Failed to save representation") from e + finally: + await asyncio.get_running_loop().run_in_executor( + None, + image.close + ) + +class Job: + """ - async def _complete_batch(self): - """Wait on current batch of final representations to be combined""" - convert_tasks = [] - freps = [] + :param path: Path to original, unsanitized file + :param pos: Bar position + """ - while not self.convert_q.empty(): - convert_task, frep = await self.convert_q.get() - convert_tasks.append(convert_task) - freps.append(frep) - self.convert_q.task_done() + def __init__(self, path, pos): + """ - try: - await asyncio.gather(*convert_tasks) - except subprocess.CalledProcessError: - for 
convert_task in convert_tasks: - await cancel_task(convert_task) - raise + :param file: Base file + :param bar: Progress bar + :param proc: qrexec-client-vm process + :param pdf: Path to temporary PDF for appending representations + """ + self.path = path + self.bar = Tqdm(total=0, + position=pos, + desc=str(path), + bar_format=" {desc}...{pages}") + self.base = None + self.proc = None + self.pdf = None - try: - await self._combine_reps(freps) - except IOError: - raise - await asyncio.gather(*[self.loop.run_in_executor(None, unlink, frep) - for frep in freps]) + async def run(self, archive, depth, in_place): + self.proc = await asyncio.create_subprocess_exec( + *CLIENT_VM_CMD, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE + ) + with TemporaryDirectory(prefix="qvm-sanitize-") as tmpdir: + try: + await self._setup(tmpdir) + await self._start(archive, depth, in_place) + except (PageError, + QrexecError, + DimensionError, + RepresentationError, + subprocess.CalledProcessError) as e: + # Since the qrexec-client-vm subprocesses belong to the same + # process group, when a SIGINT is issued, it's sent to each one. + # Consequently, there's a race between the signal and our + # cleanup code. Occasionally, the signal wins and causes some + # qrexec-client-vm subprocesses to exit, potentially during an + # operation (e.g., a STDOUT read), thereby raising an exception + # not expected by the cleanup code. 
+ if self.proc.returncode == -signal.SIGINT: + self.bar.set_status(Status.CANCELLED) + raise asyncio.CancelledError + + self.bar.set_status(Status.FAIL) + await ERROR_LOGS.put(f"{self.path.name}: {str(e)}") + if self.proc.returncode is not None: + await terminate_proc(self.proc) + raise + except asyncio.CancelledError: + self.bar.set_status(Status.CANCELLED) + raise - async def _combine_reps(self, freps): - """Combine final representations into a sanitized PDF file + self.bar.set_status(Status.DONE) - :param freps: List of final representations - """ - try: - img_tasks = [self.loop.run_in_executor(None, Image.open, frep) - for frep in freps] - images = await asyncio.gather(*img_tasks) - except IOError: - logging.error("Cannot identify image") - await asyncio.gather(*[self.loop.run_in_executor(None, img.close) - for img in images]) - raise + + async def _setup(self, tmpdir): + send_task = asyncio.create_task(self._send()) + page_task = asyncio.create_task(self._pagenums()) try: - await self.loop.run_in_executor(None, - functools.partial( - images[0].save, - self.save_path, - "PDF", - resolution=100, - append=self.save_path.exists(), - append_images=images[1:], - save_all=True)) - except IOError: - logging.error(f"Could not write to {self.save_path}") - await self.loop.run_in_executor(None, unlink, self.save_path) + _, pagenums = await asyncio.gather(send_task, page_task) + except QrexecError: + await cancel_task(page_task) raise - finally: - await asyncio.gather(*[self.loop.run_in_executor(None, img.close) - for img in images]) + else: + self.bar.reset(total=pagenums) + + self.pdf = Path(tmpdir, self.path.with_suffix(".trusted.pdf").name) + self.base = BaseFile(self.path, pagenums, self.pdf) + + + async def _start(self, archive, depth, in_place): + await self.base.sanitize( + self.proc, + self.bar, + archive, + depth, + in_place + ) + await wait_proc(self.proc, CLIENT_VM_CMD) + + await asyncio.get_running_loop().run_in_executor( + None, + shutil.move, + self.pdf, + 
Path(self.path.parent, self.pdf.name) + ) + + if in_place: + try: + await asyncio.get_running_loop().run_in_executor( + None, + self.path.unlink + ) + except FileNotFoundError: + pass + else: + await asyncio.get_running_loop().run_in_executor( + None, + self._archive, + archive + ) async def _send(self): """Send original document to server""" + data = await asyncio.get_running_loop().run_in_executor( + None, + self.path.read_bytes + ) + try: - data = await self.loop.run_in_executor(None, - self.orig_path.read_bytes) await send(self.proc, data) + except BrokenPipeError as e: + raise QrexecError("Failed to send PDF") from e + else: self.proc.stdin.write_eof() - except BrokenPipeError: - raise async def _pagenums(self): @@ -516,102 +563,57 @@ async def _pagenums(self): try: untrusted_pagenums = int(await recvline(self.proc)) except (AttributeError, EOFError, UnicodeError, ValueError) as e: - raise ReceiveError from e + raise QrexecError("Failed to receive page count") from e - try: - if not 1 <= untrusted_pagenums <= MAX_PAGES: - raise ValueError - except ValueError as e: - logging.error("Invalid number of pages received") - raise PageError from e - else: + if 1 <= untrusted_pagenums <= MAX_PAGES: pagenums = untrusted_pagenums + else: + raise PageError("Invalid page count") return pagenums - def _archive(self): + def _archive(self, archive): """Move original file into an archival directory""" - archive_path = Path(ARCHIVE_PATH, self.orig_path.name) - self.orig_path.rename(archive_path) + Path.mkdir(archive, exist_ok=True) + self.path.rename(Path(archive, self.path.name)) -def modify_click_errors(func): - """Decorator for replacing Click behavior on errors""" - def show(self, file=None): - """Removes usage message from UsageError error messages""" - color = None +async def output_logs(): + while not ERROR_LOGS.empty(): + err_msg = await ERROR_LOGS.get() + logging.error(err_msg) + ERROR_LOGS.task_done() - if file is None: - file = get_text_stderr() - if self.ctx is not 
None: - color = self.ctx.color +def output_statistics(results): + completed = [res for res in results].count(None) + print(f"\nTotal Sanitized Files: {completed}/{len(results)}") - click.echo(f"{self.format_message()}", file=file, color=color) - def format_message(self): - """Removes 'Invalid value' from BadParameter error messages""" - if self.param_hint is not None: - prefix = self.param_hint - elif self.param is not None: - prefix = self.param.get_error_hint(self.ctx) - else: - return self.message - prefix = click.exceptions._join_param_hints(prefix) +async def run(params): + suffix = "s" if len(params["files"]) > 1 else "" + print(f":: Sending file{suffix} to Disposable VM{suffix}...\n") - return f"{prefix}: {self.message}" + tasks = [] + jobs = [Job(f, i) for i, f in enumerate(params["files"])] + for job in jobs: + tasks.append(asyncio.create_task(job.run(params["archive"], + params["batch"], + params["in_place"]))) - click.exceptions.BadParameter.format_message = format_message - click.exceptions.UsageError.show = show + asyncio.get_running_loop().add_signal_handler( + signal.SIGINT, + lambda: asyncio.ensure_future(sigint_handler(tasks)) + ) - return func + results = await asyncio.gather(*tasks, return_exceptions=True) + for job in jobs: + job.bar.close() -def validate_paths(ctx, param, untrusted_paths): - """Callback for validating file paths parsed by Click""" - for untrusted_path in untrusted_paths: - if not untrusted_path.resolve().exists(): - raise BadPath(untrusted_path, "No such file or directory") - elif not untrusted_path.resolve().is_file(): - raise BadPath(untrusted_path, "Not a regular file") - - try: - with untrusted_path.resolve().open("rb"): - pass - except PermissionError: - raise BadPath(untrusted_path, "Not readable") - else: - paths = untrusted_paths - - return paths - - -async def run(loop, params): - files = [BaseFile(loop, f) for f in params["files"]] - suffix = "s" if len(files) > 1 else "" - logging.info(f":: Sending file{suffix} to 
Disposable VM{suffix}...\n") - - # FIXME: Hides exceptions with return_exceptions=True - await asyncio.gather(*[f.sanitize(params["batch"], i) for i, f in enumerate(files)], - return_exceptions=True) - - # FIXME: Why does last close mess up the output w/out hardcoded positions? - n = 0 - net_size = 0 - - for f in files: - if f.converted: - n += 1 - net_size += Path(f.orig_path.parent, f.save_path.name).stat().st_size - f.bar.close() - - frac = f"{n}/{len(files)}" - netsize = net_size / (1024 * 1024) - spacing = len(f"{netsize:.2f} MiB") + 2 # 2 = 2 leading spaces - - print(f"\nTotal Sanitized Files:{frac:>{spacing}}") - print(f"Net Sanitized Size: {netsize:.2f} MiB") + await output_logs() + output_statistics(results) # @click.option("-v", "--verbose", is_flag=True) @@ -652,22 +654,13 @@ async def run(loop, params): ) @modify_click_errors def main(**params): - logging.basicConfig(level=logging.INFO, format="%(message)s") - - if not params["files"]: - logging.info("No files to sanitize.") - sys.exit(0) + logging.basicConfig(format="error: %(message)s") - if not params["in_place"]: - Path.mkdir(ARCHIVE_PATH, exist_ok=True) - - try: + if params["files"]: loop = asyncio.get_event_loop() - loop.run_until_complete(run(loop, params)) - except KeyboardInterrupt: - logging.error("Unsanitized files untouched.") - finally: - loop.run_until_complete(loop.shutdown_asyncgens()) + loop.run_until_complete(run(params)) + else: + print("No files to sanitize.") if __name__ == "__main__": diff --git a/qubespdfconverter/server.py b/qubespdfconverter/server.py index 470c900..eadafc7 100755 --- a/qubespdfconverter/server.py +++ b/qubespdfconverter/server.py @@ -20,9 +20,10 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
import asyncio -import logging import subprocess import sys + +from dataclasses import dataclass from pathlib import Path from tempfile import TemporaryDirectory @@ -30,15 +31,8 @@ STDIN_READ_SIZE = 65536 -class ReceiveError(Exception): - """Raised if a STDOUT read failed in a qrexec-client-vm subprocess""" - - def unlink(path): - """Wrapper for pathlib.Path.unlink(path, missing_ok=True) - - :param Path: File path to delete - """ + """Wrapper for pathlib.Path.unlink(path, missing_ok=True)""" try: path.unlink() except FileNotFoundError: @@ -46,13 +40,6 @@ def unlink(path): async def cancel_task(task): - """Convenience wrapper for cancelling an asyncio Task - - Presumably, since we're cancelling tasks, we don't care what they returned - with or raised. - - :param task: Task to cancel - """ task.cancel() try: await task @@ -61,57 +48,24 @@ async def cancel_task(task): async def terminate_proc(proc): - """Convenience wrapper for terminating a process - - :param proc: Process to terminate - """ if proc.returncode is None: proc.terminate() await proc.wait() async def wait_proc(proc, cmd): - """Convenience wrapper for waiting on a process - - :param proc: Process to wait on - :param cmd: Command executed by @proc (Exception handling purposes) - """ try: await proc.wait() - if proc.returncode: - raise subprocess.CalledProcessError(proc.returncode, cmd) except asyncio.CancelledError: await terminate_proc(proc) + raise - -def recv_b(): - """Qrexec wrapper for receiving binary data from the client""" - try: - untrusted_data = sys.stdin.buffer.read() - except EOFError as e: - raise ReceiveError from e - - if not untrusted_data: - raise ReceiveError - - return untrusted_data - - -def recvline(): - """Qrexec wrapper for receiving a line of text data from the client""" - try: - untrusted_data = sys.stdin.buffer.readline().decode("ascii") - except (AttributeError, EOFError, UnicodeError) as e: - raise ReceiveError from e - - return untrusted_data + if proc.returncode: + raise 
subprocess.CalledProcessError(proc.returncode, cmd) def send_b(data): - """Qrexec wrapper for sending binary data to the client - - :param data: Data to send (bytes, String, or int) - """ + """Qrexec wrapper for sending binary data to the client""" if isinstance(data, (str, int)): data = str(data).encode() @@ -120,15 +74,20 @@ def send_b(data): def send(data): - """Qrexec wrapper for sending text data to the client - - :param data: Data to send - """ + """Qrexec wrapper for sending text data to the client""" print(data, flush=True) -class Representation(object): - """Umbrella object for the initial & final representations of a file +def recv_b(): + """Qrexec wrapper for receiving binary data from the client""" + untrusted_data = sys.stdin.buffer.read() + if not untrusted_data: + raise EOFError + return untrusted_data + + +class Representation: + """Umbrella object for a file's initial and final representations The initial representation must be of a format from which we can derive the final representation without breaking any of its requirements. @@ -140,133 +99,99 @@ class Representation(object): from the final representation upon conversion. Generally, this makes the final representation a relatively simple format (e.g., RGB bitmap). - :param loop: Main event loop :param path: Path to original, unsanitized file - :param prefix: Path prefixes of the representations - :param initial: The format of the initial representation - :param final: The format of the final representation + :param prefix: Path prefix for representations + :param f_suffix: File extension of initial representation (without .) + :param i_suffix: File extension of final representation (without .) 
""" - def __init__(self, loop, path, prefix, initial, final): - self.loop = loop + def __init__(self, path, prefix, i_suffix, f_suffix): self.path = path self.page = prefix.name - self.initial = prefix.with_suffix(f".{initial}") - self.final = prefix.with_suffix(f".{final}") + self.initial = prefix.with_suffix(f".{i_suffix}") + self.final = prefix.with_suffix(f".{f_suffix}") self.dim = None async def convert(self): """Convert initial representation to final representation""" - try: - irep_task = asyncio.create_task(self._irep()) - await irep_task - except asyncio.CancelledError: - await cancel_task(irep_task) - raise - except subprocess.CalledProcessError: - raise - - try: - dim_task = asyncio.create_task(self._dim()) - self.dim = await dim_task - except asyncio.CancelledError: - await cancel_task(dim_task) - raise - except subprocess.CalledProcessError: - raise - cmd = [ "convert", - f"{self.initial}", + str(self.initial), "-depth", - f"{DEPTH}", + str(DEPTH), f"rgb:{self.final}" ] + await self.create_irep() + self.dim = await self._dim() + + proc = await asyncio.create_subprocess_exec(*cmd) try: - proc = await asyncio.create_subprocess_exec(*cmd) await wait_proc(proc, cmd) - except asyncio.CancelledError: - await terminate_proc(proc) - raise - except subprocess.CalledProcessError: - raise finally: - await self.loop.run_in_executor(None, unlink, self.initial) + await asyncio.get_running_loop().run_in_executor( + None, + unlink, + self.initial + ) - async def _irep(self): + async def create_irep(self): """Create initial representation""" cmd = [ "pdftocairo", - f"{self.path}", + str(self.path), "-png", "-f", - f"{self.page}", + str(self.page), "-l", - f"{self.page}", + str(self.page), "-singlefile", - f"{Path(self.initial.parent, self.initial.stem)}" + str(Path(self.initial.parent, self.initial.stem)) ] - try: - proc = await asyncio.create_subprocess_exec(*cmd) - await wait_proc(proc, cmd) - except asyncio.CancelledError: - await terminate_proc(proc) - raise - 
except subprocess.CalledProcessError: - raise + proc = await asyncio.create_subprocess_exec(*cmd) + await wait_proc(proc, cmd) async def _dim(self): """Identify image dimensions of initial representation""" - cmd = ["identify", "-format", "%w %h", f"{self.initial}"] + cmd = ["identify", "-format", "%w %h", str(self.initial)] + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=subprocess.PIPE + ) try: - proc = await asyncio.create_subprocess_exec(*cmd, - stdout=subprocess.PIPE) output, _ = await proc.communicate() except asyncio.CancelledError: await terminate_proc(proc) raise - except subprocess.CalledProcessError: - raise return output.decode("ascii") -class BaseFile(object): - """Unsanitized file +@dataclass(frozen=True) +class BatchEntry: + task: asyncio.Task + rep: Representation - :param loop: Main event loop - :param path: Path to file - """ - def __init__(self, loop, path): +class BaseFile: + """Unsanitized file""" + def __init__(self, path): self.path = path - self.dir = path.parent - self.pagenums = None - - self.loop = loop - self.queue = None - - try: - data = recv_b() - self.path.write_bytes(data) - except ReceiveError: - raise + self.pagenums = 0 + self.batch = None async def sanitize(self): """Start sanitization tasks""" - try: - self.pagenums = await self.loop.run_in_executor(None, self._pagenums) - send(self.pagenums) - self.queue = asyncio.Queue(self.pagenums) - except subprocess.CalledProcessError: - raise + self.pagenums = self._pagenums() + self.batch = asyncio.Queue(self.pagenums) + + send(self.pagenums) publish_task = asyncio.create_task(self._publish()) consume_task = asyncio.create_task(self._consume()) @@ -276,75 +201,90 @@ async def sanitize(self): except subprocess.CalledProcessError: await cancel_task(publish_task) - while not self.queue.empty(): - convert_task = await self.queue.get() + while not self.batch.empty(): + convert_task = await self.batch.get() await cancel_task(convert_task) - self.queue.task_done() + 
self.batch.task_done() raise + def _pagenums(self): + """Return the number of pages in the suspect file""" + cmd = ["pdfinfo", str(self.path)] + output = subprocess.run(cmd, capture_output=True, check=True) + + for line in output.stdout.decode("ascii").splitlines(): + if "Pages:" in line: + return int(line.split(":")[1]) + + async def _publish(self): """Extract initial representations and enqueue conversion tasks""" for page in range(1, self.pagenums + 1): - rep = Representation(self.loop, self.path, Path(self.dir, str(page)), - "png", "rgb") + rep = Representation( + self.path, + Path(self.path.parent, str(page)), + "png", + "rgb" + ) + task = asyncio.create_task(rep.convert()) + entry = BatchEntry(task, rep) try: - convert_task = asyncio.create_task(rep.convert()) - await self.queue.put((rep, convert_task)) + await self.batch.put(entry) except asyncio.CancelledError: - await cancel_task(convert_task) - self.queue.task_done() - raise - except subprocess.CalledProcessError: + await cancel_task(task) raise async def _consume(self): """Await conversion tasks and send final representation to client""" for page in range(1, self.pagenums + 1): - rep, convert_task = await self.queue.get() - - try: - await convert_task - except subprocess.CalledProcessError: - raise - else: - self.queue.task_done() - - send(rep.dim) - send_b(await self.loop.run_in_executor(None, rep.final.read_bytes)) - await self.loop.run_in_executor(None, unlink, rep.final) - - - def _pagenums(self): - """Return the number of pages in the suspect file""" - cmd = ["pdfinfo", f"{self.path}"] - - try: - output = subprocess.run(cmd, capture_output=True, check=True) - except subprocess.CalledProcessError: - logging.error("File is probably not a PDF") - raise - - for line in output.stdout.decode("ascii").splitlines(): - if "Pages:" in line: - return int(line.split(":")[1]) + # Get RGB data + entry = await self.batch.get() + await entry.task + + # Read RGB data + rgb_data = await 
asyncio.get_running_loop().run_in_executor( + None, + entry.rep.final.read_bytes + ) + + # Clean up RGB data + await asyncio.get_running_loop().run_in_executor( + None, + unlink, + entry.rep.final + ) + + # Send dimensions and RGB data + await asyncio.get_running_loop().run_in_executor( + None, + send, + entry.rep.dim + ) + send_b(rgb_data) + + self.batch.task_done() def main(): - logging.basicConfig(level=logging.INFO, format="DispVM: %(message)s") - loop = asyncio.get_event_loop() + try: + data = recv_b() + except EOFError: + sys.exit(1) + + with TemporaryDirectory(prefix="qvm-sanitize") as tmpdir: + pdf_path = Path(tmpdir, "original") + pdf_path.write_bytes(data) + base = BaseFile(pdf_path) - with TemporaryDirectory(prefix="qvm-sanitize-") as tmpdir: + loop = asyncio.get_event_loop() try: - f = BaseFile(loop, Path(tmpdir, "original")) - loop.run_until_complete(f.sanitize()) - except (ReceiveError, subprocess.CalledProcessError): + loop.run_until_complete(base.sanitize()) + except subprocess.CalledProcessError as e: sys.exit(1) - finally: - loop.run_until_complete(loop.shutdown_asyncgens()) if __name__ == "__main__": From f05f4d57b722bbf07495a47bd0e0c0a81025f1fc Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 19 May 2020 14:46:18 -0500 Subject: [PATCH 68/92] client: Fix lower bound for batch size --- qubespdfconverter/client.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index bfd1cd2..695cfe5 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -616,12 +616,11 @@ async def run(params): output_statistics(results) -# @click.option("-v", "--verbose", is_flag=True) @click.command() @click.option( "-b", "--batch", - type=click.IntRange(0), + type=click.IntRange(1), default=50, metavar="SIZE", help="Maximum number of conversion tasks" @@ -629,16 +628,11 @@ async def run(params): @click.option( "-a", "--archive", - default="~/QubesUntrustedPDFs", + 
type=Path, + default=Path(Path.home(), "QubesUntrustedPDFs"), metavar="PATH", help="Directory for storing archived files" ) -@click.option( - "-d", - "--dry-run", - is_flag=True, - help="Perform only server-side checks and conversions" -) @click.option( "-i", "--in-place", From 21a6314c69addda66adbdadc55ab2d02360d52b4 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 19 May 2020 14:46:54 -0500 Subject: [PATCH 69/92] client: Update shebang to 3.7 --- qubespdfconverter/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index 695cfe5..07805ef 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python3.7 # -*- coding: utf-8 -*- # The Qubes OS Project, http://www.qubes-os.org @@ -17,7 +17,7 @@ # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
import asyncio import click From 5ac1bf09b65406915291eac449c94a4cec78354a Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 19 May 2020 14:47:30 -0500 Subject: [PATCH 70/92] setup: Add tqdm dependency --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8869c0b..10c633b 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,8 @@ packages=['qubespdfconverter'], install_requires=[ 'Click', - 'Pillow' + 'Pillow', + 'tqdm' ], entry_points={ 'qubes.tests.extra.for_template': From 4cfb5e975afbe45a848aacf16dae20149c020645 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 19 May 2020 14:54:55 -0500 Subject: [PATCH 71/92] client: Update docstring --- qubespdfconverter/client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index 07805ef..449a4dc 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -331,6 +331,7 @@ class BaseFile: :param path: Path to original, unsanitized file :param pagenums: Number of pages in original file + :param pdf: Path to temporary final PDf """ def __init__(self, path, pagenums, pdf): From 35524b7d3f550aea7becbe26e9922856fbd79ae8 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Wed, 27 May 2020 22:57:39 -0400 Subject: [PATCH 72/92] client, server: Replace shellcheck with pylint --- .travis.yml | 2 +- qubespdfconverter/client.py | 42 +++++++++++++++---------------------- qubespdfconverter/server.py | 9 +++++--- 3 files changed, 24 insertions(+), 29 deletions(-) diff --git a/.travis.yml b/.travis.yml index d8f4a0a..c50f598 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,4 +15,4 @@ env: jobs: include: - script: - - shellcheck qpdf-convert-client qpdf-convert-server + - pylint --exit-zero --disable=C0411,C0111,C0103,C0102 client.py server.py diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index 449a4dc..265efb7 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ 
-23,11 +23,9 @@ import click import functools import logging -import os import shutil import signal import subprocess -import sys from click._compat import get_text_stderr from enum import IntFlag @@ -128,7 +126,8 @@ def validate_paths(ctx, param, untrusted_paths): for untrusted_path in untrusted_paths: if not untrusted_path.resolve().exists(): raise BadPath(untrusted_path, "No such file or directory") - elif not untrusted_path.resolve().is_file(): + + if not untrusted_path.resolve().is_file(): raise BadPath(untrusted_path, "Not a regular file") try: @@ -136,9 +135,8 @@ def validate_paths(ctx, param, untrusted_paths): pass except PermissionError as e: raise BadPath(untrusted_path, "Not readable") from e - else: - paths = untrusted_paths + paths = untrusted_paths return paths @@ -173,11 +171,7 @@ async def send(proc, data): data = str(data).encode() proc.stdin.write(data) - - try: - await proc.stdin.drain() - except BrokenPipeError: - raise + await proc.stdin.drain() async def recv_b(proc, size): @@ -208,9 +202,9 @@ def format_dict(self): d = super().format_dict if ( - self.status & Status.PENDING - and d["total"] != 0 - and d["n"] != d["total"] + self.status & Status.PENDING + and d["total"] != 0 + and d["n"] != d["total"] ): self.pages = f"{d['n']}/{d['total']}" @@ -346,7 +340,7 @@ def __init__(self, path, pagenums, pdf): self.batch = None - async def sanitize(self, proc, bar, archive, depth, in_place): + async def sanitize(self, proc, bar, depth): """Receive and convert representation files :param archive: Path to archive directory @@ -394,7 +388,7 @@ async def _publish(self, proc, bar): async def _consume(self): """Convert initial representations to final form and save as PDF""" - for page in range(1, self.pagenums + 1): + for _ in range(1, self.pagenums + 1): batch_e = await self.batch.get() await batch_e.task await self._save_rep(batch_e.rep) @@ -515,9 +509,7 @@ async def _start(self, archive, depth, in_place): await self.base.sanitize( self.proc, self.bar, - 
archive, - depth, - in_place + depth ) await wait_proc(self.proc, CLIENT_VM_CMD) @@ -537,11 +529,11 @@ async def _start(self, archive, depth, in_place): except FileNotFoundError: pass else: - await asyncio.get_running_loop().run_in_executor( - None, - self._archive, - archive - ) + await asyncio.get_running_loop().run_in_executor( + None, + self._archive, + archive + ) async def _send(self): @@ -588,13 +580,13 @@ async def output_logs(): def output_statistics(results): - completed = [res for res in results].count(None) + completed = results.count(None) print(f"\nTotal Sanitized Files: {completed}/{len(results)}") async def run(params): suffix = "s" if len(params["files"]) > 1 else "" - print(f":: Sending file{suffix} to Disposable VM{suffix}...\n") + print(f"Sending file{suffix} to Disposable VM{suffix}...\n") tasks = [] jobs = [Job(f, i) for i, f in enumerate(params["files"])] diff --git a/qubespdfconverter/server.py b/qubespdfconverter/server.py index eadafc7..31b8665 100755 --- a/qubespdfconverter/server.py +++ b/qubespdfconverter/server.py @@ -213,10 +213,13 @@ def _pagenums(self): """Return the number of pages in the suspect file""" cmd = ["pdfinfo", str(self.path)] output = subprocess.run(cmd, capture_output=True, check=True) + pages = 0 for line in output.stdout.decode("ascii").splitlines(): if "Pages:" in line: - return int(line.split(":")[1]) + pages = int(line.split(":")[1]) + + return pages async def _publish(self): @@ -240,7 +243,7 @@ async def _publish(self): async def _consume(self): """Await conversion tasks and send final representation to client""" - for page in range(1, self.pagenums + 1): + for _ in range(1, self.pagenums + 1): # Get RGB data entry = await self.batch.get() await entry.task @@ -283,7 +286,7 @@ def main(): loop = asyncio.get_event_loop() try: loop.run_until_complete(base.sanitize()) - except subprocess.CalledProcessError as e: + except subprocess.CalledProcessError: sys.exit(1) From 32dd0d29665a108345a716274356db74dee4aed5 Mon Sep 
17 00:00:00 2001 From: Jason Phan Date: Wed, 27 May 2020 23:20:16 -0400 Subject: [PATCH 73/92] travis: Add pylint dependency --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index c50f598..89620a8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,8 @@ sudo: required dist: bionic language: generic +before_install: + - sudo apt-get -y install pylint install: git clone https://github.com/QubesOS/qubes-builder ~/qubes-builder script: ~/qubes-builder/scripts/travis-build env: From 271865341b877c5767c28478cc94213e009fc29b Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Wed, 27 May 2020 23:35:55 -0400 Subject: [PATCH 74/92] travis: fix paths and try without --exit-code --exit-code is apparently not available on one of the VMs --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 89620a8..ecd2f65 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,4 +17,4 @@ env: jobs: include: - script: - - pylint --exit-zero --disable=C0411,C0111,C0103,C0102 client.py server.py + - pylint --disable=C0411,C0111,C0103,C0102 qubespdfconverter/client.py qubespdfconverter/server.py From 4e3fee33f7ac403ebb02fb8ecad08465fb0864db Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Wed, 27 May 2020 23:46:43 -0400 Subject: [PATCH 75/92] travis: Re-add --exit-code because i was right the first time --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index ecd2f65..6b74bcf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,4 +17,4 @@ env: jobs: include: - script: - - pylint --disable=C0411,C0111,C0103,C0102 qubespdfconverter/client.py qubespdfconverter/server.py + - pylint --exit-code --disable=C0411,C0111,C0103,C0102 qubespdfconverter/client.py qubespdfconverter/server.py From 0b1e80c6f97076a22f8315d5aa6397e84bb336b0 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sun, 31 May 2020 15:28:03 -0500 Subject: [PATCH 76/92] travis: Attempt to fix 
pylint --- .pylintrc | 18 ++++++++++++++++++ .travis.yml | 19 +++++++++---------- ci/requirements.txt | 3 +++ 3 files changed, 30 insertions(+), 10 deletions(-) create mode 100644 .pylintrc create mode 100644 ci/requirements.txt diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..38a2bd8 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,18 @@ +[MASTER] +persistent=no +ignore=qubespdfconverter/tests + +[MESSAGES CONTROL] +disable= + bad-continuation, + blacklisted-name, + deprecated-method, + duplicate-code, + file-ignored, + fixme, + invalid-name, + locally-disabled, + locally-enabled, + missing-docstring, + too-few-public-methods, + wrong-import-order diff --git a/.travis.yml b/.travis.yml index 6b74bcf..877f894 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,10 +1,14 @@ sudo: required dist: bionic -language: generic -before_install: - - sudo apt-get -y install pylint -install: git clone https://github.com/QubesOS/qubes-builder ~/qubes-builder -script: ~/qubes-builder/scripts/travis-build +language: python +python: + - '3.7' +install: + - git clone https://github.com/QubesOS/qubes-builder ~/qubes-builder + - pip install --quiet -r ci/requirements.txt +script: + - python3 -m pylint --exit-code --rcfile=.pylintrc qubespdfconverter + - ~/qubes-builder/scripts/travis-build env: - DIST_DOM0=fc25 USE_QUBES_REPO_VERSION=4.0 USE_QUBES_REPO_TESTING=1 - DISTS_VM=fc30 USE_QUBES_REPO_VERSION=4.0 USE_QUBES_REPO_TESTING=1 @@ -13,8 +17,3 @@ env: - DISTS_VM=buster USE_QUBES_REPO_VERSION=4.0 USE_QUBES_REPO_TESTING=1 - DIST_DOM0=fc31 USE_QUBES_REPO_VERSION=4.1 USE_QUBES_REPO_TESTING=1 - DISTS_VM=bullseye USE_QUBES_REPO_VERSION=4.1 USE_QUBES_REPO_TESTING=1 - -jobs: - include: - - script: - - pylint --exit-code --disable=C0411,C0111,C0103,C0102 qubespdfconverter/client.py qubespdfconverter/server.py diff --git a/ci/requirements.txt b/ci/requirements.txt new file mode 100644 index 0000000..62d3cdd --- /dev/null +++ b/ci/requirements.txt @@ -0,0 +1,3 @@ +# WARNING: those 
requirements are used only for travis-ci.org +# they SHOULD NOT be used under normal conditions; use system package manager +pylint From 467a7c3aa26d8cfff43567def54e207a57df4889 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sun, 31 May 2020 15:34:48 -0500 Subject: [PATCH 77/92] travis: remove --exit-code --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 877f894..935b3d1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ install: - git clone https://github.com/QubesOS/qubes-builder ~/qubes-builder - pip install --quiet -r ci/requirements.txt script: - - python3 -m pylint --exit-code --rcfile=.pylintrc qubespdfconverter + - python3 -m pylint --rcfile=.pylintrc qubespdfconverter - ~/qubes-builder/scripts/travis-build env: - DIST_DOM0=fc25 USE_QUBES_REPO_VERSION=4.0 USE_QUBES_REPO_TESTING=1 From deb6c87b3ed99ea85823bfb1609ea817b37cc3b3 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sun, 31 May 2020 15:45:05 -0500 Subject: [PATCH 78/92] travis: Update imports and pylint exceptions --- .pylintrc | 4 +++- ci/requirements.txt | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.pylintrc b/.pylintrc index 38a2bd8..346ec75 100644 --- a/.pylintrc +++ b/.pylintrc @@ -4,7 +4,7 @@ ignore=qubespdfconverter/tests [MESSAGES CONTROL] disable= - bad-continuation, + bare-except, blacklisted-name, deprecated-method, duplicate-code, @@ -14,5 +14,7 @@ disable= locally-disabled, locally-enabled, missing-docstring, + protected-access, too-few-public-methods, + unused-argument, wrong-import-order diff --git a/ci/requirements.txt b/ci/requirements.txt index 62d3cdd..f3aeabd 100644 --- a/ci/requirements.txt +++ b/ci/requirements.txt @@ -1,3 +1,6 @@ # WARNING: those requirements are used only for travis-ci.org # they SHOULD NOT be used under normal conditions; use system package manager +click +pillow pylint +tqdm From 61437ca9becf1d70d50207d0aae95fa7dc41ac1c Mon Sep 17 00:00:00 2001 From: Jason Phan 
Date: Sun, 14 Jun 2020 16:16:23 -0400 Subject: [PATCH 79/92] client: Use python3 shebang --- qubespdfconverter/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index 265efb7..439306c 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3.7 +#!/usr/bin/env python3 # -*- coding: utf-8 -*- # The Qubes OS Project, http://www.qubes-os.org From 5772fdfb1e905640af30f9cdad0cdae015a92e6c Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sun, 14 Jun 2020 16:39:21 -0400 Subject: [PATCH 80/92] server: Handle non-ASCII output from pdfinfo(1) --- qubespdfconverter/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qubespdfconverter/server.py b/qubespdfconverter/server.py index 31b8665..5d0058a 100755 --- a/qubespdfconverter/server.py +++ b/qubespdfconverter/server.py @@ -215,7 +215,7 @@ def _pagenums(self): output = subprocess.run(cmd, capture_output=True, check=True) pages = 0 - for line in output.stdout.decode("ascii").splitlines(): + for line in output.stdout.decode().splitlines(): if "Pages:" in line: pages = int(line.split(":")[1]) From c00e7a12c1ce6ddc60eea5ea8d78f0948d850780 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 16 Jun 2020 15:32:58 -0400 Subject: [PATCH 81/92] server: Prevent CPU hogging by PNG tasks PNG tasks were being enqueued too quickly, leaving no time for RGB conversions or PNG deletions. This meant that the server would create PNGs for every single page of a PDF before any conversions started, which is clearly not ideal. After experimenting with limits on the number of PNGs created before forcing the PNG creation task to join on the queue, I found that a limit of 1 gave the best performance. Technically, it's a limit of 2 since we start a new task before we await the previous one. In any case, the server is quite a bit faster now and won't run out of space easily. 
--- qubespdfconverter/server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/qubespdfconverter/server.py b/qubespdfconverter/server.py index 5d0058a..7663013 100755 --- a/qubespdfconverter/server.py +++ b/qubespdfconverter/server.py @@ -233,6 +233,7 @@ async def _publish(self): ) task = asyncio.create_task(rep.convert()) entry = BatchEntry(task, rep) + await self.batch.join() try: await self.batch.put(entry) From 8e32abb2f90cd13433d7d50a39f3f615032f59d5 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 16 Jun 2020 15:34:36 -0400 Subject: [PATCH 82/92] server: Rename batch entry variables --- qubespdfconverter/server.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/qubespdfconverter/server.py b/qubespdfconverter/server.py index 7663013..63101a5 100755 --- a/qubespdfconverter/server.py +++ b/qubespdfconverter/server.py @@ -232,11 +232,11 @@ async def _publish(self): "rgb" ) task = asyncio.create_task(rep.convert()) - entry = BatchEntry(task, rep) + batch_e = BatchEntry(task, rep) await self.batch.join() try: - await self.batch.put(entry) + await self.batch.put(batch_e) except asyncio.CancelledError: await cancel_task(task) raise @@ -244,29 +244,25 @@ async def _publish(self): async def _consume(self): """Await conversion tasks and send final representation to client""" - for _ in range(1, self.pagenums + 1): - # Get RGB data - entry = await self.batch.get() - await entry.task + for _ in range(self.pagenums): + batch_e = await self.batch.get() + await batch_e.task - # Read RGB data rgb_data = await asyncio.get_running_loop().run_in_executor( None, - entry.rep.final.read_bytes + batch_e.rep.final.read_bytes ) - # Clean up RGB data await asyncio.get_running_loop().run_in_executor( None, unlink, - entry.rep.final + batch_e.rep.final ) - # Send dimensions and RGB data await asyncio.get_running_loop().run_in_executor( None, send, - entry.rep.dim + batch_e.rep.dim ) send_b(rgb_data) From 
bcbaf9ed4109e03b90766def0d42c13b4441676d Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 16 Jun 2020 15:48:33 -0400 Subject: [PATCH 83/92] client: Implement bulk saves and remove reps appropriately --- qubespdfconverter/client.py | 60 ++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index 439306c..c33d890 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -270,6 +270,11 @@ async def convert(self, bar): except subprocess.CalledProcessError as e: raise RepresentationError("Failed to convert representation") from e + await asyncio.get_running_loop().run_in_executor( + None, + self.initial.unlink + ) + bar.update(1) @@ -369,13 +374,12 @@ async def sanitize(self, proc, bar, depth): async def _publish(self, proc, bar): """Receive initial representations and start their conversions""" + pages = [] + for page in range(1, self.pagenums + 1): rep = Representation(Path(self.pdf.parent, str(page)), "rgb", "png") await rep.receive(proc) - if page % self.batch.maxsize == 0: - await self.batch.join() - task = asyncio.create_task(rep.convert(bar)) batch_e = BatchEntry(task, rep) @@ -385,44 +389,64 @@ async def _publish(self, proc, bar): await cancel_task(task) raise + pages.append(page) + + if page % self.batch.maxsize == 0 or page == self.pagenums: + await self.batch.join() + await self._save_reps(pages) + pages = [] + async def _consume(self): """Convert initial representations to final form and save as PDF""" for _ in range(1, self.pagenums + 1): batch_e = await self.batch.get() await batch_e.task - await self._save_rep(batch_e.rep) self.batch.task_done() - async def _save_rep(self, rep): + async def _save_reps(self, pages): """Save final representations to a PDF file""" - try: - image = await asyncio.get_running_loop().run_in_executor( - None, - Image.open, - rep.final - ) - except IOError as e: - raise RepresentationError("Failed to 
open representation") from e + images = [] + + for page in pages: + try: + images.append(await asyncio.get_running_loop().run_in_executor( + None, + Image.open, + Path(self.pdf.parent, f"{page}.png")) + ) + except IOError as e: + for image in images: + await asyncio.get_running_loop().run_in_executor( + None, + image.close + ) + raise RepresentationError("Failed to open representation") from e try: await asyncio.get_running_loop().run_in_executor( None, - functools.partial(image.save, + functools.partial(images[0].save, self.pdf, "PDF", resolution=100, append=self.pdf.exists(), + append_images=[] if len(images) == 1 else images[1:], save_all=True) ) except IOError as e: raise RepresentationError("Failed to save representation") from e finally: - await asyncio.get_running_loop().run_in_executor( - None, - image.close - ) + for image, page in zip(images, pages): + await asyncio.get_running_loop().run_in_executor( + None, + image.close + ) + await asyncio.get_running_loop().run_in_executor( + None, + Path(self.pdf.parent, f"{page}.png").unlink + ) class Job: From ef4191378e3a1327dd402d9815e7e690dbf58de2 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 16 Jun 2020 15:49:14 -0400 Subject: [PATCH 84/92] client: Exit with 1 on error --- qubespdfconverter/client.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index c33d890..4251307 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -26,6 +26,7 @@ import shutil import signal import subprocess +import sys from click._compat import get_text_stderr from enum import IntFlag @@ -596,18 +597,6 @@ def _archive(self, archive): self.path.rename(Path(archive, self.path.name)) -async def output_logs(): - while not ERROR_LOGS.empty(): - err_msg = await ERROR_LOGS.get() - logging.error(err_msg) - ERROR_LOGS.task_done() - - -def output_statistics(results): - completed = results.count(None) - 
print(f"\nTotal Sanitized Files: {completed}/{len(results)}") - - async def run(params): suffix = "s" if len(params["files"]) > 1 else "" print(f"Sending file{suffix} to Disposable VM{suffix}...\n") @@ -625,12 +614,19 @@ async def run(params): ) results = await asyncio.gather(*tasks, return_exceptions=True) + completed = results.count(None) for job in jobs: job.bar.close() - await output_logs() - output_statistics(results) + while not ERROR_LOGS.empty(): + err_msg = await ERROR_LOGS.get() + logging.error(err_msg) + ERROR_LOGS.task_done() + + print(f"\nTotal Sanitized Files: {completed}/{len(results)}") + + return completed != len(results) @click.command() @@ -669,7 +665,7 @@ def main(**params): if params["files"]: loop = asyncio.get_event_loop() - loop.run_until_complete(run(params)) + sys.exit(loop.run_until_complete(run(params))) else: print("No files to sanitize.") From 2abacda0ed99170b7b3f0227230d378f9a0f69d9 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Tue, 16 Jun 2020 15:59:43 -0400 Subject: [PATCH 85/92] meta: Copyright info --- qubespdfconverter/client.py | 1 + qubespdfconverter/server.py | 1 + 2 files changed, 2 insertions(+) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index 4251307..17c9610 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -4,6 +4,7 @@ # The Qubes OS Project, http://www.qubes-os.org # # Copyright (C) 2013 Joanna Rutkowska +# Copyright (C) 2020 Jason Phan # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License diff --git a/qubespdfconverter/server.py b/qubespdfconverter/server.py index 63101a5..47b0b89 100755 --- a/qubespdfconverter/server.py +++ b/qubespdfconverter/server.py @@ -4,6 +4,7 @@ # The Qubes OS Project, http://www.qubes-os.org # # Copyright (C) 2013 Joanna Rutkowska +# Copyright (C) 2020 Jason Phan # # This program is free software; you can redistribute it and/or # modify it under the terms of the 
GNU General Public License From 9b655afb621ad43823b21093fe9302b77e6d1ae6 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Fri, 19 Jun 2020 15:32:45 -0400 Subject: [PATCH 86/92] pylint: Add bad-continuation to .pylintrc --- .pylintrc | 1 + 1 file changed, 1 insertion(+) diff --git a/.pylintrc b/.pylintrc index 346ec75..3075b0d 100644 --- a/.pylintrc +++ b/.pylintrc @@ -5,6 +5,7 @@ ignore=qubespdfconverter/tests [MESSAGES CONTROL] disable= bare-except, + bad-continuation, blacklisted-name, deprecated-method, duplicate-code, From c87e61b42e62e90883a9cea424ebad55e79edb94 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Fri, 19 Jun 2020 18:35:15 -0400 Subject: [PATCH 87/92] client: Simplify image appending --- qubespdfconverter/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index 17c9610..be963a2 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -434,7 +434,7 @@ async def _save_reps(self, pages): "PDF", resolution=100, append=self.pdf.exists(), - append_images=[] if len(images) == 1 else images[1:], + append_images=images[1:], save_all=True) ) except IOError as e: From 5eab363e8efe2cad88a5dd67d4533d6156867c0f Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Fri, 19 Jun 2020 18:36:41 -0400 Subject: [PATCH 88/92] client: Fix output spacing --- qubespdfconverter/client.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index be963a2..0693ac2 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -620,6 +620,9 @@ async def run(params): for job in jobs: job.bar.close() + if not ERROR_LOGS.empty(): + print() + while not ERROR_LOGS.empty(): err_msg = await ERROR_LOGS.get() logging.error(err_msg) From f1d35f2ed0e2a6ddb952ee89a395d60201d033e5 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Fri, 19 Jun 2020 21:31:31 -0400 Subject: [PATCH 89/92] client: Handle out of space error --- 
qubespdfconverter/client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index 0693ac2..0c6bcfd 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -487,7 +487,8 @@ async def run(self, archive, depth, in_place): try: await self._setup(tmpdir) await self._start(archive, depth, in_place) - except (PageError, + except (OSError, + PageError, QrexecError, DimensionError, RepresentationError, From 379659b6ebd1127bedb0dc2e00fbd57d1f39bbf7 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sun, 21 Jun 2020 11:15:36 -0400 Subject: [PATCH 90/92] client: Add support for older tqdm versions --- qubespdfconverter/client.py | 104 ++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 57 deletions(-) diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py index 0c6bcfd..c601e02 100755 --- a/qubespdfconverter/client.py +++ b/qubespdfconverter/client.py @@ -28,14 +28,13 @@ import signal import subprocess import sys +import tqdm -from click._compat import get_text_stderr -from enum import IntFlag +from enum import Enum, auto from dataclasses import dataclass from pathlib import Path from PIL import Image from tempfile import TemporaryDirectory -from tqdm import tqdm CLIENT_VM_CMD = ["/usr/bin/qrexec-client-vm", "@dispvm", "qubes.PdfConvert"] @@ -47,12 +46,11 @@ ERROR_LOGS = asyncio.Queue() -class Status(IntFlag): +class Status(Enum): """Sanitization job status""" - PENDING = 1 - DONE = 2 - FAIL = 4 - CANCELLED = 8 + DONE = auto() + FAIL = auto() + CANCELLED = auto() @dataclass(frozen=True) @@ -97,7 +95,7 @@ def show(self, file=None): color = None if file is None: - file = get_text_stderr() + file = click._compat.get_text_stderr() if self.ctx is not None: color = self.ctx.color @@ -190,37 +188,15 @@ async def recvline(proc): return untrusted_data.decode("ascii").rstrip() -class Tqdm(tqdm): - """Adds @pages and @status attributes""" +class 
Tqdm(tqdm.tqdm): + def set_status(self, status): + prefix = self.desc[:self.desc.rfind('.') + 1] + self.set_description_str(prefix + status) + self.refresh() - def __init__(self, *args, **kwargs): - self.pages = "0/?" - self.status = Status.PENDING - super().__init__(*args, **kwargs) - - @property - def format_dict(self): - d = super().format_dict - - if ( - self.status & Status.PENDING - and d["total"] != 0 - and d["n"] != d["total"] - ): - self.pages = f"{d['n']}/{d['total']}" - - d.update(pages=self.pages) - - return d - - - def set_status(self, flag, refresh=True): - self.status = flag - self.pages = self.status.name.lower() - - if refresh: - self.refresh() + def set_job_status(self, status): + self.set_status(status.name.lower()) class Representation: @@ -278,6 +254,7 @@ async def convert(self, bar): ) bar.update(1) + bar.set_status(f"{bar.n}/{bar.total}") async def receive(self, proc): @@ -328,7 +305,7 @@ class BatchEntry: class BaseFile: - """Unsanitized file + """An unsanitized file :param path: Path to original, unsanitized file :param pagenums: Number of pages in original file @@ -452,7 +429,7 @@ async def _save_reps(self, pages): class Job: - """ + """A sanitization job :param path: Path to original, unsanitized file :param pos: Bar position @@ -467,10 +444,9 @@ def __init__(self, path, pos): :param pdf: Path to temporary PDF for appending representations """ self.path = path - self.bar = Tqdm(total=0, - position=pos, - desc=str(path), - bar_format=" {desc}...{pages}") + self.bar = Tqdm(desc=f"{path}...0/?", + bar_format=" {desc}", + position=pos) self.base = None self.proc = None self.pdf = None @@ -501,19 +477,19 @@ async def run(self, archive, depth, in_place): # operation (e.g., a STDOUT read), thereby raising an exception # not expected by the cleanup code. 
if self.proc.returncode == -signal.SIGINT: - self.bar.set_status(Status.CANCELLED) + self.bar.set_job_status(Status.CANCELLED) raise asyncio.CancelledError - self.bar.set_status(Status.FAIL) - await ERROR_LOGS.put(f"{self.path.name}: {str(e)}") + self.bar.set_job_status(Status.FAIL) + await ERROR_LOGS.put(f"{self.path.name}: {e}") if self.proc.returncode is not None: await terminate_proc(self.proc) raise except asyncio.CancelledError: - self.bar.set_status(Status.CANCELLED) + self.bar.set_job_status(Status.CANCELLED) raise - self.bar.set_status(Status.DONE) + self.bar.set_job_status(Status.DONE) async def _setup(self, tmpdir): @@ -526,7 +502,11 @@ async def _setup(self, tmpdir): await cancel_task(page_task) raise else: - self.bar.reset(total=pagenums) + try: + self.bar.reset(total=pagenums) + except AttributeError: + self.bar.total = pagenums + self.bar.refresh() self.pdf = Path(tmpdir, self.path.with_suffix(".trusted.pdf").name) self.base = BaseFile(self.path, pagenums, self.pdf) @@ -621,15 +601,25 @@ async def run(params): for job in jobs: job.bar.close() - if not ERROR_LOGS.empty(): - print() + if ERROR_LOGS.empty(): + if tqdm.__version__ >= "4.34.0": + newlines = "\n" + else: + newlines = "\n" if len(jobs) == 1 else "\n" * (len(jobs) + 1) + else: + newlines = "\n" + + if tqdm.__version__ >= "4.34.0": + print() + else: + print() if len(jobs) == 1 else print("\n" * len(jobs)) - while not ERROR_LOGS.empty(): - err_msg = await ERROR_LOGS.get() - logging.error(err_msg) - ERROR_LOGS.task_done() + while not ERROR_LOGS.empty(): + err_msg = await ERROR_LOGS.get() + logging.error(err_msg) + ERROR_LOGS.task_done() - print(f"\nTotal Sanitized Files: {completed}/{len(results)}") + print(f"{newlines}Total Sanitized Files: {completed}/{len(results)}") return completed != len(results) From 64cc14f2765f2717e961f5155d87688456236c06 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sun, 21 Jun 2020 11:22:12 -0400 Subject: [PATCH 91/92] pylint: Add expression-not-assigned --- 
.pylintrc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pylintrc b/.pylintrc index 3075b0d..f21db66 100644 --- a/.pylintrc +++ b/.pylintrc @@ -4,11 +4,12 @@ ignore=qubespdfconverter/tests [MESSAGES CONTROL] disable= - bare-except, bad-continuation, + bare-except, blacklisted-name, deprecated-method, duplicate-code, + expression-not-assigned, file-ignored, fixme, invalid-name, From 1ee08f7d10df625d7103e3d8e1f78fdef6932037 Mon Sep 17 00:00:00 2001 From: Jason Phan Date: Sun, 21 Jun 2020 13:36:16 -0400 Subject: [PATCH 92/92] makefile: Resolve makefile conflict --- Makefile | 54 ++---------------------------------------------------- 1 file changed, 2 insertions(+), 52 deletions(-) diff --git a/Makefile b/Makefile index 05bc7fe..33dfc60 100644 --- a/Makefile +++ b/Makefile @@ -19,57 +19,6 @@ # # -RPMS_DIR=rpm/ -VERSION := $(shell cat version) - -help: - @echo "make rpms -- generate signed rpm packages" - @echo "make update-repo-current -- copy newly generated rpms to qubes yum repo" - @echo "make update-repo-current-testing -- same, but for -current-testing repo" - @echo "make update-repo-unstable -- same, but to -testing repo" - @echo "make update-repo-installer -- copy dom0 rpms to installer repo" - @echo "make clean -- clean up binary files" - -rpms: rpms-vm - -rpms-dom0: - rpmbuild --define "_rpmdir rpm/" -bb rpm_spec/qpdf-converter-dom0.spec - rpm --addsign rpm/x86_64/qubes-pdf-converter-dom0*$(VERSION)*.rpm - -rpms-vm: - rpmbuild --define "_rpmdir rpm/" -bb rpm_spec/qpdf-converter.spec - rpm --addsign rpm/x86_64/qubes-pdf-converter*$(VERSION)*.rpm - -update-repo-current: - for vmrepo in ../yum/current-release/current/vm/* ; do \ - dist=$$(basename $$vmrepo) ;\ - ln -f $(RPMS_DIR)/x86_64/qubes-pdf-converter*$(VERSION)*$$dist*.rpm $$vmrepo/rpm/ ;\ - done - ln -f $(RPMS_DIR)/x86_64/qubes-pdf-converter-dom0-*$(VERSION)*.rpm ../yum/current-release/current/dom0/rpm/ - -update-repo-current-testing: - for vmrepo in 
../yum/current-release/current-testing/vm/* ; do \ - dist=$$(basename $$vmrepo) ;\ - ln -f $(RPMS_DIR)/x86_64/qubes-pdf-converter*$(VERSION)*$$dist*.rpm $$vmrepo/rpm/ ;\ - done - ln -f $(RPMS_DIR)/x86_64/qubes-pdf-converter-dom0-*$(VERSION)*.rpm ../yum/current-release/current-testing/dom0/rpm/ - -update-repo-unstable: - for vmrepo in ../yum/current-release/unstable/vm/* ; do \ - dist=$$(basename $$vmrepo) ;\ - ln -f $(RPMS_DIR)/x86_64/qubes-pdf-converter*$(VERSION)*$$dist*.rpm $$vmrepo/rpm/ ;\ - done - ln -f $(RPMS_DIR)/x86_64/qubes-pdf-converter-dom0-*$(VERSION)*.rpm ../yum/current-release/unstable/dom0/rpm/ - -update-repo-template: - for vmrepo in ../template-builder/yum_repo_qubes/* ; do \ - dist=$$(basename $$vmrepo) ;\ - ln -f $(RPMS_DIR)/x86_64/qubes-pdf-converter*$(VERSION)*$$dist*.rpm $$vmrepo/rpm/ ;\ - done - -update-repo-installer: - ln -f $(RPMS_DIR)/x86_64/qubes-pdf-converter-dom0-*$(VERSION)*.rpm ../installer/yum/qubes-dom0/rpm/ - build: make manpages -C doc @@ -86,7 +35,8 @@ install-vm: install -m 0644 qvm-convert-pdf.desktop $(DESTDIR)/usr/share/kde4/services install-dom0: - python2 setup.py install -O1 --root $(DESTDIR) python3 setup.py install -O1 --root $(DESTDIR) clean: + rm -rf debian/changelog.* + rm -rf pkgs