From f1a5c4305ea5a2926303bb8ddbfb8856d39e5322 Mon Sep 17 00:00:00 2001 From: deeplow Date: Wed, 22 Nov 2023 11:12:47 +0000 Subject: [PATCH] Stream pages in containers (merge isolation providers) Merge Qubes and Containers isolation providers into a superclass called "ProcessBasedIsolationProviders" by streaming pages in containers for exclusively in first conversion process. The commit is rather large due to the multiple interdependencies of the code, making it difficult to split into various commits. The main conversion method (_convert) now in the superclass simply calls two methods: - doc_to_pixels() - pixels_to_pdf() Critically, doc_to_pixels is implemented in the superclass, diverging only in a specialized method called "get_doc_to_pixels_proc()". This method obtains the process responsible that communicates with the isolation provider (container / disp VM) via `podman/docker` and qrexec on Containers and Qubes respectively. Known regressions: - progress reports stopped working on containers Fixes #443 --- Dockerfile | 9 +- dangerzone/conversion/common.py | 56 ++++-- dangerzone/conversion/doc_to_pixels.py | 57 +++--- .../conversion/doc_to_pixels_qubes_wrapper.py | 108 ----------- dangerzone/conversion/pixels_to_pdf.py | 18 +- dangerzone/isolation_provider/base.py | 144 +++++++++++++- dangerzone/isolation_provider/container.py | 178 ++++++------------ dangerzone/isolation_provider/qubes.py | 146 +------------- qubes/dz.Convert | 2 +- qubes/dz.ConvertDev | 2 +- tests/isolation_provider/test_qubes.py | 4 +- 11 files changed, 295 insertions(+), 429 deletions(-) delete mode 100644 dangerzone/conversion/doc_to_pixels_qubes_wrapper.py diff --git a/Dockerfile b/Dockerfile index 5f551ff8c..4b794b2dd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -77,10 +77,5 @@ COPY conversion /opt/dangerzone/dangerzone/conversion RUN adduser -s /bin/sh -D dangerzone USER dangerzone -# /tmp/input_file is where the first convert expects the input file to be, and -# /tmp where it will write the pixel files -# -# /dangerzone is where the second script expects files to be put by the first one -# -# /safezone is where the wrapper eventually moves the sanitized files. -VOLUME /dangerzone /tmp/input_file /safezone +# /safezone is a directory through which Pixels to PDF receives files +VOLUME /safezone diff --git a/dangerzone/conversion/common.py b/dangerzone/conversion/common.py index 21122065d..b367bc062 100644 --- a/dangerzone/conversion/common.py +++ b/dangerzone/conversion/common.py @@ -10,7 +10,7 @@ import sys import time from abc import abstractmethod -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import Callable, Dict, List, Optional, TextIO, Tuple, Union TIMEOUT_PER_PAGE: float = 30 # (seconds) TIMEOUT_PER_MB: float = 30 # (seconds) @@ -58,6 +58,49 @@ def __init__(self, progress_callback: Optional[Callable] = None) -> None: self.progress_callback = progress_callback self.captured_output: bytes = b"" + @classmethod + def _read_bytes(cls) -> bytes: + """Read bytes from the stdin.""" + data = sys.stdin.buffer.read() + if data is None: + raise EOFError + return data + + @classmethod + def _write_bytes(cls, data: bytes, file: TextIO = sys.stdout) -> None: + file.buffer.write(data) + + @classmethod + def _write_text(cls, text: str, file: TextIO = sys.stdout) -> None: + cls._write_bytes(text.encode(), file=file) + + @classmethod + def _write_int(cls, num: int, file: TextIO = sys.stdout) -> None: + cls._write_bytes(num.to_bytes(2, signed=False), file=file) + + # ==== ASYNC METHODS ==== + # We run sync methods in async wrappers, because pure async methods are more difficult: + # https://stackoverflow.com/a/52702646 + # + # In practice, because they are I/O bound and we don't have many running concurrently, + # they shouldn't cause a problem. + + @classmethod + async def read_bytes(cls) -> bytes: + return await asyncio.to_thread(cls._read_bytes) + + @classmethod + async def write_bytes(cls, data: bytes, file: TextIO = sys.stdout) -> None: + return await asyncio.to_thread(cls._write_bytes, data, file=file) + + @classmethod + async def write_text(cls, text: str, file: TextIO = sys.stdout) -> None: + return await asyncio.to_thread(cls._write_text, text, file=file) + + @classmethod + async def write_int(cls, num: int, file: TextIO = sys.stdout) -> None: + return await asyncio.to_thread(cls._write_int, num, file=file) + async def read_stream( self, sr: asyncio.StreamReader, callback: Optional[Callable] = None ) -> bytes: @@ -150,13 +193,4 @@ async def convert(self) -> None: pass def update_progress(self, text: str, *, error: bool = False) -> None: - if running_on_qubes(): - if self.progress_callback: - self.progress_callback(error, text, int(self.percentage)) - else: - print( - json.dumps( - {"error": error, "text": text, "percentage": int(self.percentage)} - ) - ) - sys.stdout.flush() + pass diff --git a/dangerzone/conversion/doc_to_pixels.py b/dangerzone/conversion/doc_to_pixels.py index 786f25f1a..e27232e30 100644 --- a/dangerzone/conversion/doc_to_pixels.py +++ b/dangerzone/conversion/doc_to_pixels.py @@ -13,7 +13,7 @@ import re import shutil import sys -from typing import Dict, List, Optional +from typing import Dict, List, Optional, TextIO import fitz import magic @@ -23,26 +23,17 @@ class DocumentToPixels(DangerzoneConverter): - # XXX: These functions write page data and metadata to a separate file. For now, - # they act as an anchor point for Qubes to stream back page data/metadata in - # real time. In the future, they will be completely replaced by their streaming - # counterparts. See: - # - # https://github.com/freedomofpress/dangerzone/issues/443 async def write_page_count(self, count: int) -> None: - pass + return await self.write_int(count) async def write_page_width(self, width: int, filename: str) -> None: - with open(filename, "w") as f: - f.write(str(width)) + return await self.write_int(width) async def write_page_height(self, height: int, filename: str) -> None: - with open(filename, "w") as f: - f.write(str(height)) + return await self.write_int(height) async def write_page_data(self, data: bytes, filename: str) -> None: - with open(filename, "wb") as f: - f.write(data) + return await self.write_bytes(data) async def convert(self) -> None: conversions: Dict[str, Dict[str, Optional[str]]] = { @@ -255,20 +246,6 @@ async def convert(self) -> None: await self.write_page_height(pix.height, height_filename) await self.write_page_data(rgb_buf, rgb_filename) - final_files = ( - glob.glob("/tmp/page-*.rgb") - + glob.glob("/tmp/page-*.width") - + glob.glob("/tmp/page-*.height") - ) - - # XXX: Sanity check to avoid situations like #560. - if not running_on_qubes() and len(final_files) != 3 * doc.page_count: - raise errors.PageCountMismatch() - - # Move converted files into /tmp/dangerzone - for filename in final_files: - shutil.move(filename, "/tmp/dangerzone") - self.update_progress("Converted document to pixels") async def install_libreoffice_ext(self, libreoffice_ext: str) -> None: @@ -298,18 +275,28 @@ def detect_mime_type(self, path: str) -> str: return mime_type -async def main() -> int: - converter = DocumentToPixels() +async def main() -> None: + try: + data = await DocumentToPixels.read_bytes() + except EOFError: + sys.exit(1) + + with open("/tmp/input_file", "wb") as f: + f.write(data) try: + converter = DocumentToPixels() await converter.convert() - error_code = 0 # Success! - except errors.ConversionException as e: # Expected Errors - error_code = e.error_code + except errors.ConversionException as e: + await DocumentToPixels.write_bytes(str(e).encode(), file=sys.stderr) + sys.exit(e.error_code) except Exception as e: - converter.update_progress(str(e), error=True) + await DocumentToPixels.write_bytes(str(e).encode(), file=sys.stderr) error_code = errors.UnexpectedConversionError.error_code - return error_code + sys.exit(error_code) + + # Write debug information + await DocumentToPixels.write_bytes(converter.captured_output, file=sys.stderr) if __name__ == "__main__": diff --git a/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py b/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py deleted file mode 100644 index b94c51f44..000000000 --- a/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py +++ /dev/null @@ -1,108 +0,0 @@ -import asyncio -import os -import shutil -import sys -import tempfile -from pathlib import Path -from typing import Optional, TextIO - -from . import errors -from .doc_to_pixels import DocumentToPixels - - -def _read_bytes() -> bytes: - """Read bytes from the stdin.""" - data = sys.stdin.buffer.read() - if data is None: - raise EOFError - return data - - -def _write_bytes(data: bytes, file: TextIO = sys.stdout) -> None: - file.buffer.write(data) - - -def _write_text(text: str, file: TextIO = sys.stdout) -> None: - _write_bytes(text.encode(), file=file) - - -def _write_int(num: int, file: TextIO = sys.stdout) -> None: - _write_bytes(num.to_bytes(2, signed=False), file=file) - - -# ==== ASYNC METHODS ==== -# We run sync methods in async wrappers, because pure async methods are more difficult: -# https://stackoverflow.com/a/52702646 -# -# In practice, because they are I/O bound and we don't have many running concurrently, -# they shouldn't cause a problem. - - -async def read_bytes() -> bytes: - return await asyncio.to_thread(_read_bytes) - - -async def write_bytes(data: bytes, file: TextIO = sys.stdout) -> None: - return await asyncio.to_thread(_write_bytes, data, file=file) - - -async def write_text(text: str, file: TextIO = sys.stdout) -> None: - return await asyncio.to_thread(_write_text, text, file=file) - - -async def write_int(num: int, file: TextIO = sys.stdout) -> None: - return await asyncio.to_thread(_write_int, num, file=file) - - -class QubesDocumentToPixels(DocumentToPixels): - # Override the write_page_* functions to stream data back to the caller, instead of - # writing it to separate files. This way, we have more accurate progress reports and - # client-side timeouts. See also: - # - # https://github.com/freedomofpress/dangerzone/issues/443 - # https://github.com/freedomofpress/dangerzone/issues/557 - - async def write_page_count(self, count: int) -> None: - return await write_int(count) - - async def write_page_width(self, width: int, filename: str) -> None: - return await write_int(width) - - async def write_page_height(self, height: int, filename: str) -> None: - return await write_int(height) - - async def write_page_data(self, data: bytes, filename: str) -> None: - return await write_bytes(data) - - -async def main() -> None: - out_dir = Path("/tmp/dangerzone") - if out_dir.exists(): - shutil.rmtree(out_dir) - out_dir.mkdir() - - try: - data = await read_bytes() - except EOFError: - sys.exit(1) - - with open("/tmp/input_file", "wb") as f: - f.write(data) - - try: - converter = QubesDocumentToPixels() - await converter.convert() - except errors.ConversionException as e: - await write_bytes(str(e).encode(), file=sys.stderr) - sys.exit(e.error_code) - except Exception as e: - await write_bytes(str(e).encode(), file=sys.stderr) - error_code = errors.UnexpectedConversionError.error_code - sys.exit(error_code) - - # Write debug information - await write_bytes(converter.captured_output, file=sys.stderr) - - -if __name__ == "__main__": - sys.exit(asyncio.run(main())) diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py index 2217e0055..0ff492cf7 100644 --- a/dangerzone/conversion/pixels_to_pdf.py +++ b/dangerzone/conversion/pixels_to_pdf.py @@ -22,12 +22,12 @@ async def convert( ) -> None: self.percentage = 50.0 if tempdir is None: - tempdir = "/tmp" + tempdir = "/safezone" # XXX lazy loading of fitz module to avoid import issues on non-Qubes systems import fitz - num_pages = len(glob.glob(f"{tempdir}/dangerzone/page-*.rgb")) + num_pages = len(glob.glob(f"{tempdir}/pixels/page-*.rgb")) total_size = 0.0 safe_doc = fitz.Document() @@ -35,7 +35,7 @@ async def convert( # Convert RGB files to PDF files percentage_per_page = 45.0 / num_pages for page_num in range(1, num_pages + 1): - filename_base = f"{tempdir}/dangerzone/page-{page_num}" + filename_base = f"{tempdir}/pixels/page-{page_num}" rgb_filename = f"{filename_base}.rgb" width_filename = f"{filename_base}.width" height_filename = f"{filename_base}.height" @@ -90,6 +90,18 @@ async def convert( safe_doc.save(safe_pdf_path, deflate_images=True) + def update_progress(self, text: str, *, error: bool = False) -> None: + if running_on_qubes(): + if self.progress_callback: + self.progress_callback(error, text, int(self.percentage)) + else: + print( + json.dumps( + {"error": error, "text": text, "percentage": int(self.percentage)} + ) + ) + sys.stdout.flush() + async def main() -> int: ocr_lang = os.environ.get("OCR_LANGUAGE") if os.environ.get("OCR") == "1" else None diff --git a/dangerzone/isolation_provider/base.py b/dangerzone/isolation_provider/base.py index 6265146f7..afca95583 100644 --- a/dangerzone/isolation_provider/base.py +++ b/dangerzone/isolation_provider/base.py @@ -1,13 +1,18 @@ import logging +import os import subprocess +import sys +import tempfile from abc import ABC, abstractmethod -from typing import Callable, Optional +from pathlib import Path +from typing import IO, Callable, Optional from colorama import Fore, Style -from ..conversion.errors import ConversionException +from ..conversion import errors +from ..conversion.common import calculate_timeout from ..document import Document -from ..util import replace_control_chars +from ..util import Stopwatch, nonblocking_read, replace_control_chars log = logging.getLogger(__name__) @@ -18,11 +23,35 @@ PIXELS_TO_PDF_LOG_END = "----- PIXELS TO PDF LOG END -----" +def read_bytes(f: IO[bytes], size: int, timeout: float, exact: bool = True) -> bytes: + """Read bytes from a file-like object.""" + buf = nonblocking_read(f, size, timeout) + if exact and len(buf) != size: + raise errors.InterruptedConversion + return buf + + +def read_int(f: IO[bytes], timeout: float) -> int: + """Read 2 bytes from a file-like object, and decode them as int.""" + untrusted_int = read_bytes(f, 2, timeout) + return int.from_bytes(untrusted_int, signed=False) + + +def read_debug_text(f: IO[bytes], size: int) -> str: + """Read arbitrarily long text (for debug purposes)""" + timeout = calculate_timeout(size) + untrusted_text = read_bytes(f, size, timeout, exact=False) + return untrusted_text.decode("ascii", errors="replace") + + class IsolationProvider(ABC): """ Abstracts an isolation provider """ + def __init__(self) -> None: + self.percentage = 0.0 + @abstractmethod def install(self) -> bool: pass @@ -37,7 +66,7 @@ def convert( document.mark_as_converting() try: success = self._convert(document, ocr_lang) - except ConversionException as e: + except errors.ConversionException as e: success = False self.print_progress_trusted(document, True, str(e), 0) except Exception as e: @@ -102,6 +131,113 @@ def sanitize_conversion_str(self, untrusted_conversion_str: str) -> str: return armor_start + conversion_string + armor_end +class ProcessBasedIsolationProvider(IsolationProvider): + # The maximum time it takes a the provider to start up. + STARTUP_TIME_SECONDS = 0 + + def __init__(self) -> None: + self.proc: Optional[subprocess.Popen] = None + super().__init__() + + @abstractmethod + def get_doc_to_pixels_proc(self) -> subprocess.Popen: + pass + + def _convert( + self, + document: Document, + ocr_lang: Optional[str] = None, + ) -> bool: + try: + with tempfile.TemporaryDirectory() as t: + Path(f"{t}/pixels").mkdir() + self.doc_to_pixels(document, t) + # TODO: validate convert to pixels output + self.pixels_to_pdf(document, t, ocr_lang) + return True + except errors.InterruptedConversion: + assert self.proc is not None + error_code = self.proc.wait(3) + # XXX Reconstruct exception from error code + raise errors.exception_from_error_code(error_code) # type: ignore [misc] + + def doc_to_pixels(self, document: Document, tempdir: str) -> None: + with open(document.input_filename, "rb") as f: + self.proc = self.get_doc_to_pixels_proc() + try: + assert self.proc.stdin is not None + self.proc.stdin.write(f.read()) + self.proc.stdin.close() + except BrokenPipeError as e: + raise errors.InterruptedConversion() + + # Get file size (in MiB) + size = os.path.getsize(document.input_filename) / 1024**2 + timeout = calculate_timeout(size) + self.STARTUP_TIME_SECONDS + + assert self.proc is not None + assert self.proc.stdout is not None + os.set_blocking(self.proc.stdout.fileno(), False) + + n_pages = read_int(self.proc.stdout, timeout) + if n_pages == 0 or n_pages > errors.MAX_PAGES: + raise errors.MaxPagesException() + percentage_per_page = 50.0 / n_pages + + timeout = calculate_timeout(size, n_pages) + sw = Stopwatch(timeout) + sw.start() + for page in range(1, n_pages + 1): + text = f"Converting page {page}/{n_pages} to pixels" + self.print_progress_trusted(document, False, text, self.percentage) + + width = read_int(self.proc.stdout, timeout=sw.remaining) + height = read_int(self.proc.stdout, timeout=sw.remaining) + if not (1 <= width <= errors.MAX_PAGE_WIDTH): + raise errors.MaxPageWidthException() + if not (1 <= height <= errors.MAX_PAGE_HEIGHT): + raise errors.MaxPageHeightException() + + num_pixels = width * height * 3 # three color channels + untrusted_pixels = read_bytes( + self.proc.stdout, + num_pixels, + timeout=sw.remaining, + ) + + # Wrapper code + with open(f"{tempdir}/pixels/page-{page}.width", "w") as f_width: + f_width.write(str(width)) + with open(f"{tempdir}/pixels/page-{page}.height", "w") as f_height: + f_height.write(str(height)) + with open(f"{tempdir}/pixels/page-{page}.rgb", "wb") as f_rgb: + f_rgb.write(untrusted_pixels) + + self.percentage += percentage_per_page + + # Ensure nothing else is read after all bitmaps are obtained + self.proc.stdout.close() + + # TODO handle leftover code input + text = "Converted document to pixels" + self.print_progress_trusted(document, False, text, self.percentage) + + if getattr(sys, "dangerzone_dev", False): + assert self.proc.stderr is not None + os.set_blocking(self.proc.stderr.fileno(), False) + untrusted_log = read_debug_text(self.proc.stderr, MAX_CONVERSION_LOG_CHARS) + self.proc.stderr.close() + log.info( + f"Conversion output (doc to pixels)\n{self.sanitize_conversion_str(untrusted_log)}" + ) + + @abstractmethod + def pixels_to_pdf( + self, document: Document, tempdir: str, ocr_lang: Optional[str] + ) -> None: + pass + + # From global_common: # def validate_convert_to_pixel_output(self, common, output): diff --git a/dangerzone/isolation_provider/container.py b/dangerzone/isolation_provider/container.py index 27bd98836..6a634bf0a 100644 --- a/dangerzone/isolation_provider/container.py +++ b/dangerzone/isolation_provider/container.py @@ -2,16 +2,14 @@ import json import logging import os -import pathlib import platform import shlex import shutil import subprocess import sys -import tempfile -from typing import Any, Callable, List, Optional, Tuple +from typing import Any, List, Optional -from ..conversion.errors import exception_from_error_code +from ..conversion import errors from ..document import Document from ..util import ( get_resource_path, @@ -19,12 +17,7 @@ get_tmp_dir, replace_control_chars, ) -from .base import ( - MAX_CONVERSION_LOG_CHARS, - PIXELS_TO_PDF_LOG_END, - PIXELS_TO_PDF_LOG_START, - IsolationProvider, -) +from .base import ProcessBasedIsolationProvider # Define startupinfo for subprocesses if platform.system() == "Windows": @@ -42,9 +35,10 @@ def __init__(self, container_tech: str) -> None: super().__init__(f"{container_tech} is not installed") -class Container(IsolationProvider): +class Container(ProcessBasedIsolationProvider): # Name of the dangerzone container CONTAINER_NAME = "dangerzone.rocks/dangerzone" + STARTUP_TIME_SECONDS = 5 def __init__(self, enable_timeouts: bool) -> None: self.enable_timeouts = 1 if enable_timeouts else 0 @@ -179,34 +173,30 @@ def parse_progress(self, document: Document, untrusted_line: str) -> None: def exec( self, - document: Document, args: List[str], - ) -> int: + ) -> subprocess.Popen: args_str = " ".join(shlex.quote(s) for s in args) log.info("> " + args_str) - with subprocess.Popen( + dev_mode = getattr(sys, "dangerzone_dev", False) == True + if dev_mode: + stderr = subprocess.PIPE + else: + stderr = subprocess.DEVNULL + + return subprocess.Popen( args, - stdin=None, + stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - bufsize=1, - universal_newlines=True, + stderr=stderr, startupinfo=startupinfo, - ) as p: - if p.stdout is not None: - for untrusted_line in p.stdout: - self.parse_progress(document, untrusted_line) - - p.communicate() - return p.returncode + ) def exec_container( self, - document: Document, command: List[str], extra_args: List[str] = [], - ) -> int: + ) -> subprocess.Popen: container_runtime = self.get_runtime() if self.get_runtime_name() == "podman": @@ -218,6 +208,7 @@ def exec_container( # drop all linux kernel capabilities security_args += ["--cap-drop", "all"] user_args = ["-u", "dangerzone"] + enable_stdin = ["-i"] prevent_leakage_args = ["--rm"] @@ -226,60 +217,55 @@ def exec_container( + user_args + security_args + prevent_leakage_args + + enable_stdin + extra_args + [self.CONTAINER_NAME] + command ) args = [container_runtime] + args - return self.exec(document, args) + return self.exec(args) - def _convert( - self, - document: Document, - ocr_lang: Optional[str], - ) -> bool: - # Create a temporary directory inside the cache directory for this run. Then, - # create some subdirectories for the various stages of the file conversion: - # - # * unsafe: Where the input file will be copied - # * pixel: Where the RGB data will be stored - # * safe: Where the final PDF file will be stored - with tempfile.TemporaryDirectory(dir=get_tmp_dir()) as t: - tmp_dir = pathlib.Path(t) - unsafe_dir = tmp_dir / "unsafe" - unsafe_dir.mkdir() - pixel_dir = tmp_dir / "pixels" - pixel_dir.mkdir() - safe_dir = tmp_dir / "safe" - safe_dir.mkdir() - - return self._convert_with_tmpdirs( - document=document, - unsafe_dir=unsafe_dir, - pixel_dir=pixel_dir, - safe_dir=safe_dir, - ocr_lang=ocr_lang, - ) + def pixels_to_pdf( + self, document: Document, tempdir: str, ocr_lang: Optional[str] + ) -> None: + # Convert pixels to safe PDF + command = [ + "/usr/bin/python3", + "-m", + "dangerzone.conversion.pixels_to_pdf", + ] + extra_args = [ + "-v", + f"{tempdir}:/safezone:Z", + "-e", + "TESSDATA_PREFIX=/usr/share/tessdata", + "-e", + f"OCR={0 if ocr_lang is None else 1}", + "-e", + f"OCR_LANGUAGE={ocr_lang}", + "-e", + f"ENABLE_TIMEOUTS={self.enable_timeouts}", + ] - def _convert_with_tmpdirs( - self, - document: Document, - unsafe_dir: pathlib.Path, - pixel_dir: pathlib.Path, - safe_dir: pathlib.Path, - ocr_lang: Optional[str], - ) -> bool: - success = False - - if ocr_lang: - ocr = "1" + pixels_to_pdf_proc = self.exec_container(command, extra_args) + for line in pixels_to_pdf_proc.stdout: + self.parse_progress(document, line) + error_code = pixels_to_pdf_proc.wait() + if error_code != 0: + log.error("pixels-to-pdf failed") + raise errors.exception_from_error_code(error_code) # type: ignore [misc] else: - ocr = "0" + # Move the final file to the right place + if os.path.exists(document.output_filename): + os.remove(document.output_filename) - copied_file = unsafe_dir / "input_file" - shutil.copyfile(f"{document.input_filename}", copied_file) + container_output_filename = os.path.join( + tempdir, "safe-output-compressed.pdf" + ) + shutil.move(container_output_filename, document.output_filename) + def get_doc_to_pixels_proc(self) -> subprocess.Popen: # Convert document to pixels command = [ "/usr/bin/python3", @@ -287,60 +273,10 @@ def _convert_with_tmpdirs( "dangerzone.conversion.doc_to_pixels", ] extra_args = [ - "-v", - f"{copied_file}:/tmp/input_file:Z", - "-v", - f"{pixel_dir}:/tmp/dangerzone:Z", "-e", f"ENABLE_TIMEOUTS={self.enable_timeouts}", ] - ret = self.exec_container(document, command, extra_args) - - if ret != 0: - log.error("documents-to-pixels failed") - - # XXX Reconstruct exception from error code - raise exception_from_error_code(ret) # type: ignore [misc] - else: - # TODO: validate convert to pixels output - - # Convert pixels to safe PDF - command = [ - "/usr/bin/python3", - "-m", - "dangerzone.conversion.pixels_to_pdf", - ] - extra_args = [ - "-v", - f"{pixel_dir}:/tmp/dangerzone:Z", - "-v", - f"{safe_dir}:/safezone:Z", - "-e", - "TESSDATA_PREFIX=/usr/share/tessdata", - "-e", - f"OCR={ocr}", - "-e", - f"OCR_LANGUAGE={ocr_lang}", - "-e", - f"ENABLE_TIMEOUTS={self.enable_timeouts}", - ] - ret = self.exec_container(document, command, extra_args) - if ret != 0: - log.error("pixels-to-pdf failed") - else: - # Move the final file to the right place - if os.path.exists(document.output_filename): - os.remove(document.output_filename) - - container_output_filename = os.path.join( - safe_dir, "safe-output-compressed.pdf" - ) - shutil.move(container_output_filename, document.output_filename) - - # We did it - success = True - - return success + return self.exec_container(command, extra_args) def get_max_parallel_conversions(self) -> int: # FIXME hardcoded 1 until timeouts are more limited and better handled diff --git a/dangerzone/isolation_provider/qubes.py b/dangerzone/isolation_provider/qubes.py index b9f1fcd3d..c9ebd5cab 100644 --- a/dangerzone/isolation_provider/qubes.py +++ b/dangerzone/isolation_provider/qubes.py @@ -1,5 +1,4 @@ import asyncio -import glob import inspect import io import logging @@ -7,60 +6,29 @@ import shutil import subprocess import sys -import tempfile -import time import zipfile from pathlib import Path from typing import IO, Callable, Optional from ..conversion import errors -from ..conversion.common import calculate_timeout, running_on_qubes +from ..conversion.common import running_on_qubes from ..conversion.pixels_to_pdf import PixelsToPDF from ..document import Document -from ..util import ( - Stopwatch, - get_resource_path, - get_subprocess_startupinfo, - get_tmp_dir, - nonblocking_read, -) +from ..util import get_resource_path from .base import ( - MAX_CONVERSION_LOG_CHARS, PIXELS_TO_PDF_LOG_END, PIXELS_TO_PDF_LOG_START, - IsolationProvider, + ProcessBasedIsolationProvider, ) log = logging.getLogger(__name__) -# The maximum time a qube takes to start up. -STARTUP_TIME_SECONDS = 5 * 60 # 5 minutes - - -def read_bytes(f: IO[bytes], size: int, timeout: float, exact: bool = True) -> bytes: - """Read bytes from a file-like object.""" - buf = nonblocking_read(f, size, timeout) - if exact and len(buf) != size: - raise errors.InterruptedConversion - return buf - - -def read_int(f: IO[bytes], timeout: float) -> int: - """Read 2 bytes from a file-like object, and decode them as int.""" - untrusted_int = read_bytes(f, 2, timeout) - return int.from_bytes(untrusted_int, signed=False) - -def read_debug_text(f: IO[bytes], size: int) -> str: - """Read arbitrarily long text (for debug purposes)""" - timeout = calculate_timeout(size) - untrusted_text = read_bytes(f, size, timeout, exact=False) - return untrusted_text.decode("ascii", errors="replace") - - -class Qubes(IsolationProvider): +class Qubes(ProcessBasedIsolationProvider): """Uses a disposable qube for performing the conversion""" + STARTUP_TIME_SECONDS = 5 * 60 # 5 minutes + def __init__(self) -> None: self.proc: Optional[subprocess.Popen] = None super().__init__() @@ -68,86 +36,9 @@ def __init__(self) -> None: def install(self) -> bool: return True - def _convert_with_tmpdirs( - self, - document: Document, - tempdir: str, - ocr_lang: Optional[str] = None, - ) -> bool: - success = False - - Path(f"{tempdir}/dangerzone").mkdir() - percentage = 0.0 - - with open(document.input_filename, "rb") as f: - self.proc = self.qrexec_subprocess() - try: - assert self.proc.stdin is not None - self.proc.stdin.write(f.read()) - self.proc.stdin.close() - except BrokenPipeError as e: - raise errors.InterruptedConversion() - - # Get file size (in MiB) - size = os.path.getsize(document.input_filename) / 1024**2 - timeout = calculate_timeout(size) + STARTUP_TIME_SECONDS - - assert self.proc is not None - assert self.proc.stdout is not None - os.set_blocking(self.proc.stdout.fileno(), False) - - n_pages = read_int(self.proc.stdout, timeout) - if n_pages == 0 or n_pages > errors.MAX_PAGES: - raise errors.MaxPagesException() - percentage_per_page = 50.0 / n_pages - - timeout = calculate_timeout(size, n_pages) - sw = Stopwatch(timeout) - sw.start() - for page in range(1, n_pages + 1): - text = f"Converting page {page}/{n_pages} to pixels" - self.print_progress_trusted(document, False, text, percentage) - - width = read_int(self.proc.stdout, timeout=sw.remaining) - height = read_int(self.proc.stdout, timeout=sw.remaining) - if not (1 <= width <= errors.MAX_PAGE_WIDTH): - raise errors.MaxPageWidthException() - if not (1 <= height <= errors.MAX_PAGE_HEIGHT): - raise errors.MaxPageHeightException() - - num_pixels = width * height * 3 # three color channels - untrusted_pixels = read_bytes( - self.proc.stdout, - num_pixels, - timeout=sw.remaining, - ) - - # Wrapper code - with open(f"{tempdir}/dangerzone/page-{page}.width", "w") as f_width: - f_width.write(str(width)) - with open(f"{tempdir}/dangerzone/page-{page}.height", "w") as f_height: - f_height.write(str(height)) - with open(f"{tempdir}/dangerzone/page-{page}.rgb", "wb") as f_rgb: - f_rgb.write(untrusted_pixels) - - percentage += percentage_per_page - - # Ensure nothing else is read after all bitmaps are obtained - self.proc.stdout.close() - - # TODO handle leftover code input - text = "Converted document to pixels" - self.print_progress_trusted(document, False, text, percentage) - - if getattr(sys, "dangerzone_dev", False): - assert self.proc.stderr is not None - os.set_blocking(self.proc.stderr.fileno(), False) - untrusted_log = read_debug_text(self.proc.stderr, MAX_CONVERSION_LOG_CHARS) - self.proc.stderr.close() - log.info( - f"Conversion output (doc to pixels)\n{self.sanitize_conversion_str(untrusted_log)}" - ) - + def pixels_to_pdf( + self, document: Document, tempdir: str, ocr_lang: Optional[str] + ) -> None: def print_progress_wrapper(error: bool, text: str, percentage: float) -> None: self.print_progress_trusted(document, error, text, percentage) @@ -166,28 +57,11 @@ def print_progress_wrapper(error: bool, text: str, percentage: float) -> None: log.info(text) shutil.move(f"{tempdir}/safe-output-compressed.pdf", document.output_filename) - success = True - - return success - - def _convert( - self, - document: Document, - ocr_lang: Optional[str] = None, - ) -> bool: - try: - with tempfile.TemporaryDirectory() as t: - return self._convert_with_tmpdirs(document, t, ocr_lang) - except errors.InterruptedConversion: - assert self.proc is not None - error_code = self.proc.wait(3) - # XXX Reconstruct exception from error code - raise errors.exception_from_error_code(error_code) # type: ignore [misc] def get_max_parallel_conversions(self) -> int: return 1 - def qrexec_subprocess(self) -> subprocess.Popen: + def get_doc_to_pixels_proc(self) -> subprocess.Popen: dev_mode = getattr(sys, "dangerzone_dev", False) == True if dev_mode: # Use dz.ConvertDev RPC call instead, if we are in development mode. diff --git a/qubes/dz.Convert b/qubes/dz.Convert index d6b2e3e23..b9dded3cf 100755 --- a/qubes/dz.Convert +++ b/qubes/dz.Convert @@ -1,2 +1,2 @@ #!/bin/sh -python -m dangerzone.conversion.doc_to_pixels_qubes_wrapper +python -m dangerzone.conversion.doc_to_pixels diff --git a/qubes/dz.ConvertDev b/qubes/dz.ConvertDev index cb93b799e..74a984540 100755 --- a/qubes/dz.ConvertDev +++ b/qubes/dz.ConvertDev @@ -34,7 +34,7 @@ def main(): say(f"Importing the conversion module") sys.path.insert(0, t.name) - from dangerzone.conversion.doc_to_pixels_qubes_wrapper import main + from dangerzone.conversion.doc_to_pixels import main return asyncio.run(main()) diff --git a/tests/isolation_provider/test_qubes.py b/tests/isolation_provider/test_qubes.py index 78ce9e21f..022ea2a46 100644 --- a/tests/isolation_provider/test_qubes.py +++ b/tests/isolation_provider/test_qubes.py @@ -69,7 +69,7 @@ def test_out_of_ram( ) -> None: provider.progress_callback = mocker.MagicMock() - def qrexec_subprocess() -> subprocess.Popen: + def get_doc_to_pixels_proc() -> subprocess.Popen: p = subprocess.Popen( # XXX error 126 simulates a qrexec-policy failure. Source: # https://github.com/QubesOS/qubes-core-qrexec/blob/fdcbfd7/daemon/qrexec-daemon.c#L1022 @@ -81,7 +81,7 @@ def qrexec_subprocess() -> subprocess.Popen: ) return p - monkeypatch.setattr(provider, "qrexec_subprocess", qrexec_subprocess) + monkeypatch.setattr(provider, "get_doc_to_pixels_proc", get_doc_to_pixels_proc) with pytest.raises(errors.QubesQrexecFailed) as e: doc = Document(sample_doc)