diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..f21db66 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,22 @@ +[MASTER] +persistent=no +ignore=qubespdfconverter/tests + +[MESSAGES CONTROL] +disable= + bad-continuation, + bare-except, + blacklisted-name, + deprecated-method, + duplicate-code, + expression-not-assigned, + file-ignored, + fixme, + invalid-name, + locally-disabled, + locally-enabled, + missing-docstring, + protected-access, + too-few-public-methods, + unused-argument, + wrong-import-order diff --git a/.travis.yml b/.travis.yml index 211db67..540e990 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,5 +5,10 @@ import: jobs: include: - - script: - - shellcheck qpdf-convert-client qpdf-convert-server + - language: python + python: + - '3.7' + install: + - pip install --quiet -r ci/requirements.txt + script: + - python3 -m pylint --rcfile=.pylintrc qubespdfconverter diff --git a/Makefile b/Makefile index 4a8fe43..33dfc60 100644 --- a/Makefile +++ b/Makefile @@ -24,9 +24,8 @@ build: install-vm: make install -C doc - install -D qvm-convert-pdf $(DESTDIR)/usr/bin/qvm-convert-pdf - install -D qpdf-convert-client $(DESTDIR)/usr/lib/qubes/qpdf-convert-client - install -D qpdf-convert-server $(DESTDIR)/usr/lib/qubes/qpdf-convert-server + install -D qubespdfconverter/client.py $(DESTDIR)/usr/bin/qvm-convert-pdf + install -D qubespdfconverter/server.py $(DESTDIR)/usr/lib/qubes/qpdf-convert-server install -d $(DESTDIR)/etc/qubes-rpc ln -s ../../usr/lib/qubes/qpdf-convert-server $(DESTDIR)/etc/qubes-rpc/qubes.PdfConvert install -D qvm-convert-pdf.gnome $(DESTDIR)/usr/lib/qubes/qvm-convert-pdf.gnome diff --git a/README.md b/README.md index d71f294..25da71e 100644 --- a/README.md +++ b/README.md @@ -1,32 +1,30 @@ Qubes PDF Converter ==================== -Qubes PDF converter is a [Qubes](https://qubes-os.org) Application, which -utilizes Qubes flexible qrexec (inter-VM communication) infrastructure and -Disposable VMs to perform conversion of potentially 
untrusted (e.g. maliciously -malformed) PDF files into safe-to-view PDF files. - -This is done by having the Disposable VM perform the complex (and potentially -buggy) rendering of the PDF in question) and sending the resulting RGB bitmap -(simple representation) to the client AppVM. The client AppVM can _trivially_ -verify the received data are indeed the simple representation, and then -construct a new PDF out of the received bitmap. Of course the price we pay for -this conversion is loosing any structural information and text-based search in -the converted PDF. - -More discussion and introduction of the concept has been described in the -original article +Qubes PDF converter is a [Qubes](https://qubes-os.org) Application that +utilizes Disposable VMs and Qubes' flexible qrexec (inter-VM communication) +infrastructure to securely convert potentially untrusted PDF files into +safe-to-view PDF files. + +This is done by having a Disposable VM render each page of a PDF file into a +very simple representation (RGB bitmap) that (presumably) leaves no room for +malicious code. This representation is then sent back to the client AppVM which +then constructs an entirely new PDF file out of the received bitmaps. + +More discussion of the concept has been described in the original article [here](http://blog.invisiblethings.org/2013/02/21/converting-untrusted-pdfs-into-trusted.html). Usage ------ - [user@varia ~]$ qvm-convert-pdf test.pdf - -> Sending file to remote VM... - -> Waiting for converted samples... - -> Receving page 8 out of 8... - -> Converted PDF saved as: ./test.trusted.pdf - -> Original file saved as /home/user/QubesUntrustedPDFs/test.pdf + [user@domU ~]$ qvm-convert-pdf file1.pdf file2.pdf file3.pdf + :: Sending files to Disposable VMs... 
+ + file1.pdf...done + file2.pdf...fail + file3.pdf...done + + Total Sanitized Files: 2/3 Authors --------- diff --git a/ci/requirements.txt b/ci/requirements.txt new file mode 100644 index 0000000..f3aeabd --- /dev/null +++ b/ci/requirements.txt @@ -0,0 +1,6 @@ +# WARNING: those requirements are used only for travis-ci.org +# they SHOULD NOT be used under normal conditions; use system package manager +click +pillow +pylint +tqdm diff --git a/debian/qubes-pdf-converter.install b/debian/qubes-pdf-converter.install index 7a6a94b..3b02fd9 100644 --- a/debian/qubes-pdf-converter.install +++ b/debian/qubes-pdf-converter.install @@ -1,4 +1,3 @@ -usr/lib/qubes/qpdf-convert-client usr/lib/qubes/qpdf-convert-server etc/qubes-rpc/qubes.PdfConvert usr/bin/qvm-convert-pdf diff --git a/doc/qvm-convert-pdf.rst b/doc/qvm-convert-pdf.rst index 8a89d9c..894b3a4 100644 --- a/doc/qvm-convert-pdf.rst +++ b/doc/qvm-convert-pdf.rst @@ -4,27 +4,28 @@ QVM-CONVERT-PDF(1) NAME ==== -qvm-convert-pdf - converts a potentially untrusted pdf to a safe-to-view pdf +qvm-convert-pdf - converts potentially untrusted PDFs to a safe-to-view PDF SYNOPSIS ======== -| qvm-convert-pdf +| qvm-convert-pdf DESCRIPTION =========== -Qubes PDF converter is a Qubes Application, which utilizes Qubes flexible qrexec -(inter-VM communication) infrastructure and Disposable VMs to perform conversion -of potentially untrusted (e.g. maliciously malformed) PDF files into -safe-to-view PDF files. +Qubes PDF converter is a Qubes Application that utilizes Qubes' flexible qrexec +(inter-VM communication) infrastructure and Disposable VMs to securely convert +potentially untrusted (e.g. maliciously malformed) PDF files into safe-to-view +PDF files. -This is done by having the Disposable VM perform the complex (and potentially -buggy) rendering of the PDF in question) and sending the resulting RGB bitmap -(simple representation) to the client AppVM. 
The client AppVM can _trivially_ -verify the received data are indeed the simple representation, and then -construct a new PDF out of the received bitmap. Of course the price we pay for -this conversion is loosing any structural information and text-based search in -the converted PDF. +This is done by having a Disposable VM render each page of a PDF file into a +very simple representation (RGB bitmap) that (presumably) leaves no room for +malicious code. This representation is then sent back to the client AppVM which +then constructs an entirely new PDF file out of the received bitmaps. + +Of course, the price we pay for this conversion is an increase in file size and +the loss of any structural information or text-based search in the converted +PDF. AUTHORS ======= diff --git a/qpdf-convert-client b/qpdf-convert-client deleted file mode 100755 index 2fdb2ce..0000000 --- a/qpdf-convert-client +++ /dev/null @@ -1,150 +0,0 @@ -#!/bin/bash -# -# The Qubes OS Project, http://www.qubes-os.org -# -# Copyright (C) 2013 Joanna Rutkowska -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
-# -# Requires: -# - ImageMagick (convert) - -INPUT_FILE="$1" -RCVD_FILE=$(mktemp --tmpdir qpdf-conversion-XXXXXXXX) -CONVERTED_FILE="$(dirname "$1")/$(basename "$1" .pdf).trusted.pdf" -CONVERTED_FILE_PARTIAL="$CONVERTED_FILE".part.pdf - - -MAX_PAGES=10000 -MAX_IMG_WIDTH=10000 -MAX_IMG_HEIGHT=10000 -IMG_DEPTH=8 -MAX_IMG_SIZE=$((MAX_IMG_WIDTH * MAX_IMG_HEIGHT * 3)) - -VERBOSE=1 -if [ -n "$PROGRESS_FOR_GUI" ]; then - VERBOSE=0; -fi - -die() { - reason="$1" - if [ -n "$PROGRESS_FOR_GUI" ]; then - zenity --error --title="PDF conversion error" --text="$reason" - else - echo "$reason" >&2 - fi - exit 1 -} - - -# Send the input (untrusted) file to the server... -[ $VERBOSE -ge 1 ] && echo "-> Sending file to a Disposable VM..." >&2 -cat "$INPUT_FILE" -exec >&- - -# ... and get the recvd *simple* representation: - -# Note: the server might be compromised at this point so, it can very well send -# us something else than the simple representation. Thus we explicitly specify -# input format to ImageMagick's convert via "rgb:" prefix, forcing it to -# interpret whatever stream of bytes it gets on input as a simple RGB array. We -# hope that when using this RGB format explicitly (which is the simplest format -# for bitmaps in the known universe), there is no space for offending bug in -# image parsing... - -# First, get the no of pages: -read -r NO_PAGES -if [[ ! "$NO_PAGES" =~ ^[1-9][0-9]*$ ]] || [[ $NO_PAGES -le 0 ]] || [[ $NO_PAGES -gt $MAX_PAGES ]] ; then - die "The remote party return invalid no of pages, aborting!" -fi - -[ $VERBOSE -ge 1 ] && echo "-> Waiting for converted samples..." >&2 - -PAGE=1 -while [ $PAGE -le "$NO_PAGES" ]; do - read -r IMG_WIDTH IMG_HEIGHT - if [ $VERBOSE -eq 1 ]; then - echo -n "-> Receiving page $PAGE out of $NO_PAGES..." >&2 - printf "\r" >&2 - elif [ $VERBOSE -gt 1 ]; then - echo "-> Receiving page $PAGE out of $NO_PAGES..." >&2 - fi - if [[ ! 
"$IMG_WIDTH" =~ ^[1-9][0-9]*$ ]] || [ "$IMG_WIDTH" -le 0 ] || [ "$IMG_WIDTH" -gt $MAX_IMG_WIDTH ] || \ - [[ ! "$IMG_HEIGHT" =~ ^[1-9][0-9]*$ ]] || [ "$IMG_HEIGHT" -le 0 ] || [ "$IMG_HEIGHT" -gt $MAX_IMG_HEIGHT ]; then - die "The remote party return invalid image geometry info, aborting!" - fi - [ $VERBOSE -ge 2 ] && echo "--> page geometry: $IMG_WIDTH x $IMG_HEIGHT x $IMG_DEPTH" >&2 - IMG_SIZE=$((IMG_WIDTH*IMG_HEIGHT*3)) - if [ $IMG_SIZE -le 0 ] || [ $IMG_SIZE -gt $MAX_IMG_SIZE ]; then - die "Calculated image size is invalid, aborting!" - fi - # save the simplified RGB image into a temp PDF file: - RGB_FILE=$RCVD_FILE-$PAGE.rgb - PNG_FILE=$RCVD_FILE-$PAGE.png - PDF_FILE=$RCVD_FILE-$PAGE.pdf - head -c $IMG_SIZE > "$RGB_FILE" - RCVD_IMG_SIZE=$(stat -c %s "$RGB_FILE") - if [ "$RCVD_IMG_SIZE" -ne $IMG_SIZE ]; then - die "The remote party return invalid no of bytes of the RGB file, aborting!" - fi - # here, the important part is that we *explicitly* specify RGB as the input format via "rgb:" - # We first convert to a (compressed) PNG to create smaller output files - convert_msgs=$(convert -size "${IMG_WIDTH}x${IMG_HEIGHT}" -depth ${IMG_DEPTH} rgb:"$RGB_FILE" png:"$PNG_FILE" 2>&1) - # shellcheck disable=SC2181 - if [ $? -ne 0 ]; then - die "Page $PAGE conversion failed (RGB->PNG): $convert_msgs" - fi - rm -f "$RGB_FILE" - - # now convert the (trusted but compressed) PNG into PDF for easy assembly... - convert_msgs=$(convert "$PNG_FILE" "$PDF_FILE" 2>&1) - # shellcheck disable=SC2181 - if [ $? -ne 0 ]; then - die "Page $PAGE conversion failed (PNG->PDF): $convert_msgs" - fi - rm -f "$PNG_FILE" - - if [ $PAGE -gt 1 ]; then - convert_msgs=$(pdfunite "$CONVERTED_FILE" "$PDF_FILE" "$CONVERTED_FILE_PARTIAL" 2>&1) - # shellcheck disable=SC2181 - if [ $? 
-ne 0 ]; then - die "Error merging converted page: $convert_msgs" - fi - mv "$CONVERTED_FILE_PARTIAL" "$CONVERTED_FILE" || die - else - mv "$PDF_FILE" "$CONVERTED_FILE" || die - fi - rm -f "$PDF_FILE" || die - - PAGE=$((PAGE+1)) - - [ -n "$PROGRESS_FOR_GUI" ] && echo $(((PAGE - 1) * 90 / NO_PAGES)) >& "$SAVED_FD_1" -done - -if [ $VERBOSE -eq 1 ]; then - echo >&2 -fi - -[ $VERBOSE -ge 1 ] && echo "-> Converted PDF saved as: $CONVERTED_FILE" >&2 - -mkdir -p "$HOME/QubesUntrustedPDFs" -ORIG_FILE="$HOME/QubesUntrustedPDFs/$(basename "$INPUT_FILE")" -mv "$INPUT_FILE" "${ORIG_FILE}" || die "Moving original file failed" -[ $VERBOSE -ge 1 ] && echo "-> Original file saved as $ORIG_FILE" >&2 - -# Cleanup -rm -f "$RCVD_FILE"* -[ -n "$PROGRESS_FOR_GUI" ] && echo "100" >& "$SAVED_FD_1" -exit 0 diff --git a/qpdf-convert-server b/qpdf-convert-server deleted file mode 100755 index 8997090..0000000 --- a/qpdf-convert-server +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/bash -# -# The Qubes OS Project, http://www.qubes-os.org -# -# Copyright (C) 2013 Joanna Rutkowska -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
-# -# Requires: -# - poppler-utils (pdftocairo, pdfinfo) -# - ImageMagick (convert) - -INPUT_FILE=$(mktemp --tmpdir qpdf-conversion-XXXXXXXX) -TEMP_PNG_FILE=$(mktemp --tmpdir qpdf-conversion-XXXXXXXX.png) -TEMP_RGB_FILE=$(mktemp --tmpdir qpdf-conversion-XXXXXXXX.pdf) -IMG_DEPTH=8 - -# Get the original (untrusted) PDF file... -cat > "$INPUT_FILE" - -# now, let's convert it into a simple representation, -# and send back to the client. - -# Note that we might be compromised at this point (due to exploitation of PDF -# parsing code) and so what we're sending back might very well be something -# totally different than a decent simple representation -- the client should -# never trust what we're sending back, and should discard anything that doesn't -# look like the simple representation! - -NO_PAGES=$(pdfinfo "$INPUT_FILE" | grep -a "^Pages:" | sed -e "s/^Pages:[^0-9]*//") -if [ -z "$NO_PAGES" ]; then - # Perhaps this is not a PDF, only some JPG/PNG/etc? Let's try it anyway... - NO_PAGES=1 -fi -echo $NO_PAGES - -cd /tmp || exit 1 -PAGE=1 -while [ $PAGE -le $NO_PAGES ]; do - # if pdftocairo fails, lets try the ImageMagick's convert -- perhaps this is just some img file? - pdftocairo "$INPUT_FILE" -png -f $PAGE -l $PAGE -singlefile "$(basename "$TEMP_PNG_FILE" .png)" || \ - convert "$INPUT_FILE" png:"$TEMP_PNG_FILE" - IMG_WIDTH=$(identify -format "%w" "$TEMP_PNG_FILE") - IMG_HEIGHT=$(identify -format "%h" "$TEMP_PNG_FILE") - convert "$TEMP_PNG_FILE" -depth $IMG_DEPTH rgb:"$TEMP_RGB_FILE" - echo "$IMG_WIDTH $IMG_HEIGHT" - cat "$TEMP_RGB_FILE" - PAGE=$((PAGE + 1)) -done - -# Cleanup tmp files... -# Note: our DispVM might get destroyed before the commands below -# complete, but that doesn't hurt us, because this is... well a DispVM. 
-rm -f "$INPUT_FILE" -rm -f "$TEMP_PNG_FILE" -rm -f "$TEMP_RGB_FILE" diff --git a/qubespdfconverter/client.py b/qubespdfconverter/client.py new file mode 100755 index 0000000..c601e02 --- /dev/null +++ b/qubespdfconverter/client.py @@ -0,0 +1,669 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# The Qubes OS Project, http://www.qubes-os.org +# +# Copyright (C) 2013 Joanna Rutkowska +# Copyright (C) 2020 Jason Phan +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
class QrexecError(Exception):
    """Raised if a qrexec-related error occurred"""
async def cancel_task(task):
    """Cancel ``task`` and wait until it has actually finished.

    The outcome of the task is deliberately discarded: callers use this for
    best-effort cleanup and only need to know that the task is no longer
    running once this coroutine returns.

    :param task: Task (or future) to cancel and await
    """
    task.cancel()
    try:
        await task
    except (asyncio.CancelledError, Exception):
        # CancelledError is listed explicitly because it stopped being an
        # Exception subclass in Python 3.8. Unlike a bare ``except:``, this
        # lets KeyboardInterrupt and SystemExit propagate.
        pass
class Representation:
    """Umbrella object for a file's initial and final representations

    The initial representation must be of a format such that if it contains
    malicious code/data, such code/data is excluded from the final
    representation upon conversion. Generally, this restricts the initial
    representation to a relatively simple format (e.g., RGB bitmap).

    The final representation can be of any format you'd like, provided that
    the initial representation's format was properly selected (e.g., PNG).

    :param prefix: Path prefixes for representations
    :param i_suffix: File extension of initial representation (without .)
    :param f_suffix: File extension of final representation (without .)
    """

    def __init__(self, prefix, i_suffix, f_suffix):
        """Derive both representation paths from ``prefix``.

        Attributes set here:

        * ``initial``: File path to initial representation
        * ``final``: File path to final representation
        * ``dim``: Image dimensions received from the server (``None`` until
          :meth:`receive` has run)
        """
        self.initial = prefix.with_suffix(f".{i_suffix}")
        self.final = prefix.with_suffix(f".{f_suffix}")
        self.dim = None


    async def convert(self, bar):
        """Convert initial representation into final representation

        Requires ``self.dim`` to have been set by :meth:`receive`.

        :param bar: Progress bar to update upon completion
        :raises RepresentationError: If the ``convert`` subprocess fails
        """
        # The explicit "rgb:" prefix together with -size/-depth tells
        # ImageMagick to read the input as a raw RGB array instead of
        # detecting the format from the (untrusted) file content.
        cmd = [
            "convert",
            "-size",
            f"{self.dim.width}x{self.dim.height}",
            "-depth",
            f"{self.dim.depth}",
            f"rgb:{self.initial}",
            f"png:{self.final}"
        ]

        proc = await asyncio.create_subprocess_exec(*cmd)

        try:
            await wait_proc(proc, cmd)
        except subprocess.CalledProcessError as e:
            raise RepresentationError("Failed to convert representation") from e

        # The initial representation is removed only after a successful
        # conversion; the unlink runs in the default executor so the blocking
        # filesystem call stays off the event loop.
        await asyncio.get_running_loop().run_in_executor(
            None,
            self.initial.unlink
        )

        bar.update(1)
        bar.set_status(f"{bar.n}/{bar.total}")


    async def receive(self, proc):
        """Receive initial representation from the server

        Reads the image dimensions first, then exactly ``self.dim.size`` bytes
        of image data, and writes those bytes to ``self.initial``.

        :param proc: qrexec-client-vm process
        :raises QrexecError: If the stream ends before the dimensions or the
            full image data were received
        :raises DimensionError: If the received dimensions cannot be parsed or
            are out of range
        """
        try:
            self.dim = await self._dim(proc)
        except EOFError as e:
            raise QrexecError("Failed to receive image dimensions") from e
        except (AttributeError, UnicodeError, ValueError) as e:
            raise DimensionError("Invalid image dimensions") from e

        try:
            data = await recv_b(proc, self.dim.size)
        except asyncio.IncompleteReadError as e:
            raise QrexecError("Received inconsistent number of bytes") from e

        await asyncio.get_running_loop().run_in_executor(
            None,
            self.initial.write_bytes,
            data
        )


    async def _dim(self, proc):
        """Receive and compute image dimensions for initial representation

        :param proc: qrexec-client-vm process
        :returns: ImageDimensions holding the validated width, height and the
            number of bytes to expect for the image
        :raises ValueError: If the line is not two integers separated by a
            space, or either value lies outside 1..MAX_IMG_WIDTH /
            1..MAX_IMG_HEIGHT
        """
        untrusted_w, untrusted_h = map(int, (await recvline(proc)).split(" ", 1))

        # Values only lose their "untrusted_" prefix after the range check.
        if 1 <= untrusted_w <= MAX_IMG_WIDTH and 1 <= untrusted_h <= MAX_IMG_HEIGHT:
            width = untrusted_w
            height = untrusted_h
            # presumably 3 bytes per pixel (one per RGB channel at the
            # default 8-bit depth) -- confirm if DEPTH ever changes
            size = width * height * 3
        else:
            raise ValueError

        return ImageDimensions(width, height, size)
async def _publish(self, proc, bar):
    """Receive initial representations and start their conversions

    Pages are received one at a time, in order. Each received page gets its
    own conversion task, which is queued on ``self.batch``. Whenever a full
    batch has been queued (or the last page was reached), this waits for the
    queue to be drained and appends the converted pages to the PDF.

    :param proc: qrexec-client-vm process to receive pages from
    :param bar: Progress bar handed to each conversion task
    """
    pages = []

    for page in range(1, self.pagenums + 1):
        # Representation files are named after the page number and live next
        # to the temporary PDF.
        rep = Representation(Path(self.pdf.parent, str(page)), "rgb", "png")
        await rep.receive(proc)

        # Conversion starts immediately and runs concurrently with the
        # reception of the following pages.
        task = asyncio.create_task(rep.convert(bar))
        batch_e = BatchEntry(task, rep)

        try:
            await self.batch.put(batch_e)
        except asyncio.CancelledError:
            # The entry never reached the queue, so nobody else would clean
            # up its conversion task.
            await cancel_task(task)
            raise

        pages.append(page)

        # Flush once the number of pages reaches a multiple of the queue size
        # or the document ends.
        if page % self.batch.maxsize == 0 or page == self.pagenums:
            # join() returns once every queued entry has been marked done
            # via task_done().
            await self.batch.join()
            await self._save_reps(pages)
            pages = []
async def run(self, archive, depth, in_place):
    """Run the sanitization job from start to finish.

    Spawns the qrexec-client-vm process, sanitizes the file inside a
    temporary working directory and keeps the job's progress bar status
    (done / fail / cancelled) up to date.

    :param archive: Path to archive directory
    :param depth: Conversion queue size
    :param in_place: Value of --in-place flag
    :raises asyncio.CancelledError: If the job was cancelled, or the qrexec
        process was killed by SIGINT
    :raises OSError, PageError, QrexecError, DimensionError,
        RepresentationError, subprocess.CalledProcessError: Re-raised after
        the failure has been recorded in ERROR_LOGS
    """
    self.proc = await asyncio.create_subprocess_exec(
        *CLIENT_VM_CMD,
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE
    )

    with TemporaryDirectory(prefix="qvm-sanitize-") as tmpdir:
        try:
            await self._setup(tmpdir)
            await self._start(archive, depth, in_place)
        except (OSError,
                PageError,
                QrexecError,
                DimensionError,
                RepresentationError,
                subprocess.CalledProcessError) as e:
            # Since the qrexec-client-vm subprocesses belong to the same
            # process group, when a SIGINT is issued, it's sent to each one.
            # Consequently, there's a race between the signal and our
            # cleanup code. Occasionally, the signal wins and causes some
            # qrexec-client-vm subprocesses to exit, potentially during an
            # operation (e.g., a STDOUT read), thereby raising an exception
            # not expected by the cleanup code.
            if self.proc.returncode == -signal.SIGINT:
                self.bar.set_job_status(Status.CANCELLED)
                raise asyncio.CancelledError

            self.bar.set_job_status(Status.FAIL)
            await ERROR_LOGS.put(f"{self.path.name}: {e}")
            # terminate_proc() only acts on a process that is still running
            # (returncode is None), so it is safe to call unconditionally.
            # The previous guard (`returncode is not None`) skipped exactly
            # the running case and left failed jobs' processes alive.
            await terminate_proc(self.proc)
            raise
        except asyncio.CancelledError:
            self.bar.set_job_status(Status.CANCELLED)
            raise

    self.bar.set_job_status(Status.DONE)
def _version_tuple(version):
    """Return the leading numeric components of a dotted version as ints."""
    parts = []
    for piece in version.split("."):
        digits = ""
        for char in piece:
            if not char.isdigit():
                break
            digits += char
        if not digits:
            break
        parts.append(int(digits))
        if len(digits) != len(piece):
            # Stop at the first component with a non-numeric tail ("0rc1").
            break
    return tuple(parts)


async def run(params):
    """Sanitize every requested file concurrently and print a summary.

    One Job (and one asyncio task) is created per file. SIGINT is rerouted
    to cancel all of those tasks instead of killing the program outright.

    :param params: Parsed command-line parameters; the keys "files",
        "archive", "batch" and "in_place" are read
    :returns: True if at least one file was not sanitized, False otherwise
        (the caller passes this to sys.exit())
    """
    suffix = "s" if len(params["files"]) > 1 else ""
    print(f"Sending file{suffix} to Disposable VM{suffix}...\n")

    tasks = []
    jobs = [Job(f, i) for i, f in enumerate(params["files"])]
    for job in jobs:
        tasks.append(asyncio.create_task(job.run(params["archive"],
                                                 params["batch"],
                                                 params["in_place"])))

    asyncio.get_running_loop().add_signal_handler(
        signal.SIGINT,
        lambda: asyncio.ensure_future(sigint_handler(tasks))
    )

    # With return_exceptions=True a successful job shows up as None (the
    # return value of Job.run) and a failed or cancelled one as its exception.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    completed = results.count(None)

    for job in jobs:
        job.bar.close()

    # Compare versions numerically: as plain strings "4.9.0" would sort
    # above "4.34.0" and select the wrong spacing.
    modern_tqdm = _version_tuple(tqdm.__version__) >= (4, 34, 0)

    if ERROR_LOGS.empty():
        if modern_tqdm:
            newlines = "\n"
        else:
            newlines = "\n" if len(jobs) == 1 else "\n" * (len(jobs) + 1)
    else:
        newlines = "\n"

        # Errors are printed before the summary, so the spacing needed to
        # get past the progress bars is emitted here instead.
        if modern_tqdm or len(jobs) == 1:
            print()
        else:
            print("\n" * len(jobs))

    while not ERROR_LOGS.empty():
        err_msg = await ERROR_LOGS.get()
        logging.error(err_msg)
        ERROR_LOGS.task_done()

    print(f"{newlines}Total Sanitized Files: {completed}/{len(results)}")

    return completed != len(results)
+@click.argument( + "files", + type=Path, + nargs=-1, + callback=validate_paths, + metavar="[FILES ...]" +) +@modify_click_errors +def main(**params): + logging.basicConfig(format="error: %(message)s") + + if params["files"]: + loop = asyncio.get_event_loop() + sys.exit(loop.run_until_complete(run(params))) + else: + print("No files to sanitize.") + + +if __name__ == "__main__": + main() diff --git a/qubespdfconverter/server.py b/qubespdfconverter/server.py new file mode 100755 index 0000000..47b0b89 --- /dev/null +++ b/qubespdfconverter/server.py @@ -0,0 +1,292 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# The Qubes OS Project, http://www.qubes-os.org +# +# Copyright (C) 2013 Joanna Rutkowska +# Copyright (C) 2020 Jason Phan +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
import asyncio
import subprocess
import sys

from dataclasses import dataclass
from pathlib import Path
from tempfile import TemporaryDirectory

DEPTH = 8
STDIN_READ_SIZE = 65536


def unlink(path):
    """Wrapper for pathlib.Path.unlink(path, missing_ok=True)"""
    try:
        path.unlink()
    except FileNotFoundError:
        pass


async def cancel_task(task):
    """Cancel a task and wait for it to finish, discarding its outcome.

    This is best-effort cleanup: both the resulting cancellation and any
    error the task had already failed with are ignored.
    """
    task.cancel()
    try:
        await task
    except (asyncio.CancelledError, Exception):
        # CancelledError is listed explicitly because it no longer derives
        # from Exception on Python 3.8+. KeyboardInterrupt and SystemExit
        # are deliberately not swallowed.
        pass


async def terminate_proc(proc):
    """Terminate a subprocess that is still running and reap it"""
    if proc.returncode is None:
        proc.terminate()
        await proc.wait()


async def wait_proc(proc, cmd):
    """Wait for a subprocess to exit and fail on a non-zero exit status

    :param proc: Process returned by asyncio.create_subprocess_exec()
    :param cmd: Command line of the process (used in the error)
    :raises subprocess.CalledProcessError: the process exited non-zero
    """
    try:
        await proc.wait()
    except asyncio.CancelledError:
        # Do not leave the child running when the waiting task is cancelled
        await terminate_proc(proc)
        raise

    if proc.returncode:
        raise subprocess.CalledProcessError(proc.returncode, cmd)


def send_b(data):
    """Qrexec wrapper for sending binary data to the client"""
    if isinstance(data, (str, int)):
        data = str(data).encode()

    sys.stdout.buffer.write(data)
    sys.stdout.buffer.flush()


def send(data):
    """Qrexec wrapper for sending text data to the client"""
    print(data, flush=True)


def recv_b():
    """Qrexec wrapper for receiving binary data from the client

    Reads standard input until EOF.

    :raises EOFError: nothing was received
    """
    untrusted_data = sys.stdin.buffer.read()
    if not untrusted_data:
        raise EOFError
    return untrusted_data


class Representation:
    """Umbrella object for a file's initial and final representations

    The initial representation must be of a format from which we can derive
    the final representation without breaking any of its requirements.
    Generally, this makes the initial representation some sort of image file
    (e.g. PNG, JPEG).

    The final representation must be of a format such that if the initial
    representation contains malicious code/data, such code/data is excluded
    from the final representation upon conversion. Generally, this makes the
    final representation a relatively simple format (e.g., RGB bitmap).

    :param path: Path to original, unsanitized file
    :param prefix: Path prefix for representations
    :param i_suffix: File extension of initial representation (without .)
    :param f_suffix: File extension of final representation (without .)
    """

    def __init__(self, path, prefix, i_suffix, f_suffix):
        self.path = path
        self.page = prefix.name
        self.initial = prefix.with_suffix(f".{i_suffix}")
        self.final = prefix.with_suffix(f".{f_suffix}")
        self.dim = None


    async def convert(self):
        """Convert initial representation to final representation

        :raises subprocess.CalledProcessError: a conversion step failed
        """
        cmd = [
            "convert",
            str(self.initial),
            "-depth",
            str(DEPTH),
            f"rgb:{self.final}"
        ]

        try:
            await self.create_irep()
            self.dim = await self._dim()

            proc = await asyncio.create_subprocess_exec(*cmd)
            await wait_proc(proc, cmd)
        finally:
            # The initial representation is never needed again, whether
            # the conversion succeeded, failed early or was cancelled.
            await asyncio.get_running_loop().run_in_executor(
                None,
                unlink,
                self.initial
            )


    async def create_irep(self):
        """Create initial representation"""
        cmd = [
            "pdftocairo",
            str(self.path),
            "-png",
            "-f",
            str(self.page),
            "-l",
            str(self.page),
            "-singlefile",
            str(Path(self.initial.parent, self.initial.stem))
        ]

        proc = await asyncio.create_subprocess_exec(*cmd)
        await wait_proc(proc, cmd)


    async def _dim(self):
        """Identify image dimensions of initial representation

        :return: "<width> <height>" as printed by identify
        :raises subprocess.CalledProcessError: identify exited non-zero
        """
        cmd = ["identify", "-format", "%w %h", str(self.initial)]
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=subprocess.PIPE
        )

        try:
            output, _ = await proc.communicate()
        except asyncio.CancelledError:
            await terminate_proc(proc)
            raise

        # Fail here instead of handing empty dimensions to the client
        if proc.returncode:
            raise subprocess.CalledProcessError(proc.returncode, cmd)

        return output.decode("ascii")


@dataclass(frozen=True)
class BatchEntry:
    # Conversion task paired with the representation it is producing
    task: asyncio.Task
    rep: Representation


class BaseFile:
    """Unsanitized file"""
    def __init__(self, path):
        self.path = path
        self.pagenums = 0
        self.batch = None


    async def sanitize(self):
        """Start sanitization tasks

        :raises subprocess.CalledProcessError: a page failed to convert
        """
        self.pagenums = self._pagenums()
        self.batch = asyncio.Queue(self.pagenums)

        send(self.pagenums)

        publish_task = asyncio.create_task(self._publish())
        consume_task = asyncio.create_task(self._consume())

        try:
            await asyncio.gather(publish_task, consume_task)
        except subprocess.CalledProcessError:
            await cancel_task(publish_task)
            await cancel_task(consume_task)

            # The queue holds BatchEntry objects; it is the conversion task
            # inside each entry that has to be cancelled.
            while not self.batch.empty():
                batch_e = self.batch.get_nowait()
                await cancel_task(batch_e.task)
                self.batch.task_done()

            raise


    @staticmethod
    def _parse_pagenums(info):
        """Extract the page count from pdfinfo's textual output

        Only a line whose field name is exactly "Pages" is considered, so
        document metadata that merely contains "Pages:" (e.g. in the title)
        cannot be mistaken for the page count.

        :param info: Output of pdfinfo as text
        :return: Number of pages, or 0 if no "Pages" field is present
        """
        for line in info.splitlines():
            field, sep, value = line.partition(":")
            if sep and field == "Pages":
                return int(value)

        return 0


    def _pagenums(self):
        """Return the number of pages in the suspect file"""
        cmd = ["pdfinfo", str(self.path)]
        output = subprocess.run(cmd, capture_output=True, check=True)

        # The metadata comes from an untrusted file; do not let undecodable
        # bytes abort the page count.
        return self._parse_pagenums(output.stdout.decode(errors="replace"))


    async def _publish(self):
        """Extract initial representations and enqueue conversion tasks"""
        for page in range(1, self.pagenums + 1):
            rep = Representation(
                self.path,
                Path(self.path.parent, str(page)),
                "png",
                "rgb"
            )
            task = asyncio.create_task(rep.convert())

            try:
                # Wait until every previously queued entry has been
                # processed before queueing the next one.
                await self.batch.join()
                await self.batch.put(BatchEntry(task, rep))
            except asyncio.CancelledError:
                # The task is not reachable through the queue yet, so it
                # has to be cancelled here.
                await cancel_task(task)
                raise


    async def _consume(self):
        """Await conversion tasks and send final representation to client"""
        loop = asyncio.get_running_loop()

        for _ in range(self.pagenums):
            batch_e = await self.batch.get()
            await batch_e.task

            rgb_data = await loop.run_in_executor(
                None,
                batch_e.rep.final.read_bytes
            )
            await loop.run_in_executor(None, unlink, batch_e.rep.final)

            # Dimensions first, then the raw pixel data
            await loop.run_in_executor(None, send, batch_e.rep.dim)
            send_b(rgb_data)

            self.batch.task_done()


def main():
    """Receive a file on stdin and send back its sanitized pages

    Exits with status 1 if nothing was received or a conversion failed.
    """
    try:
        data = recv_b()
    except EOFError:
        sys.exit(1)

    with TemporaryDirectory(prefix="qvm-sanitize") as tmpdir:
        pdf_path = Path(tmpdir, "original")
        pdf_path.write_bytes(data)
        base = BaseFile(pdf_path)

        try:
            # asyncio.run() owns the loop's lifecycle and cancels leftover
            # tasks on exit; every asyncio object used here is created
            # inside the coroutine.
            asyncio.run(base.sanitize())
        except subprocess.CalledProcessError:
            sys.exit(1)


if __name__ == "__main__":
    main()
-# -# - -if [ $# -ne 1 ] ; then - echo usage: $0 '' - exit 1 -fi - -exec /usr/bin/qrexec-client-vm '$dispvm' qubes.PdfConvert /usr/lib/qubes/qpdf-convert-client "$@" diff --git a/qvm-convert-pdf.gnome b/qvm-convert-pdf.gnome index c1e8561..f1801d0 100755 --- a/qvm-convert-pdf.gnome +++ b/qvm-convert-pdf.gnome @@ -26,4 +26,4 @@ fi export PROGRESS_FOR_GUI="yes" -/usr/lib/qubes/qrexec_client_vm '$dispvm' qubes.PdfConvert /usr/lib/qubes/qpdf-convert-client "$@" | zenity --progress --text="Converting PDF using Disposable VM..." --auto-close --auto-kill +/usr/bin/qvm-convert-pdf "$@" | zenity --progress --text="Converting PDF using Disposable VM..." --auto-close --auto-kill diff --git a/rpm_spec/qpdf-converter.spec.in b/rpm_spec/qpdf-converter.spec.in index f69eab2..0ae7fca 100644 --- a/rpm_spec/qpdf-converter.spec.in +++ b/rpm_spec/qpdf-converter.spec.in @@ -56,7 +56,6 @@ rm -rf $RPM_BUILD_ROOT %files %defattr(-,root,root,-) /etc/qubes-rpc/qubes.PdfConvert -/usr/lib/qubes/qpdf-convert-client /usr/lib/qubes/qpdf-convert-server /usr/lib/qubes/qvm-convert-pdf.gnome /usr/bin/qvm-convert-pdf diff --git a/setup.py b/setup.py index f2766c0..10c633b 100644 --- a/setup.py +++ b/setup.py @@ -28,6 +28,11 @@ name='qubespdfconverter', version=open('version').read().strip(), packages=['qubespdfconverter'], + install_requires=[ + 'Click', + 'Pillow', + 'tqdm' + ], entry_points={ 'qubes.tests.extra.for_template': 'qubespdfconverter = qubespdfconverter.tests:list_tests',