diff --git a/Dockerfile b/Dockerfile index 5f551ff8c..4b794b2dd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -77,10 +77,5 @@ COPY conversion /opt/dangerzone/dangerzone/conversion RUN adduser -s /bin/sh -D dangerzone USER dangerzone -# /tmp/input_file is where the first convert expects the input file to be, and -# /tmp where it will write the pixel files -# -# /dangerzone is where the second script expects files to be put by the first one -# -# /safezone is where the wrapper eventually moves the sanitized files. -VOLUME /dangerzone /tmp/input_file /safezone +# /safezone is a directory through which Pixels to PDF receives files +VOLUME /safezone diff --git a/dangerzone/conversion/common.py b/dangerzone/conversion/common.py index 21122065d..b367bc062 100644 --- a/dangerzone/conversion/common.py +++ b/dangerzone/conversion/common.py @@ -10,7 +10,7 @@ import sys import time from abc import abstractmethod -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import Callable, Dict, List, Optional, TextIO, Tuple, Union TIMEOUT_PER_PAGE: float = 30 # (seconds) TIMEOUT_PER_MB: float = 30 # (seconds) @@ -58,6 +58,49 @@ def __init__(self, progress_callback: Optional[Callable] = None) -> None: self.progress_callback = progress_callback self.captured_output: bytes = b"" + @classmethod + def _read_bytes(cls) -> bytes: + """Read bytes from the stdin.""" + data = sys.stdin.buffer.read() + if data is None: + raise EOFError + return data + + @classmethod + def _write_bytes(cls, data: bytes, file: TextIO = sys.stdout) -> None: + file.buffer.write(data) + + @classmethod + def _write_text(cls, text: str, file: TextIO = sys.stdout) -> None: + cls._write_bytes(text.encode(), file=file) + + @classmethod + def _write_int(cls, num: int, file: TextIO = sys.stdout) -> None: + cls._write_bytes(num.to_bytes(2, signed=False), file=file) + + # ==== ASYNC METHODS ==== + # We run sync methods in async wrappers, because pure async methods are more difficult: + # https://stackoverflow.com/a/52702646 + # + # In practice, because they are I/O bound and we don't have many running concurrently, + # they shouldn't cause a problem. + + @classmethod + async def read_bytes(cls) -> bytes: + return await asyncio.to_thread(cls._read_bytes) + + @classmethod + async def write_bytes(cls, data: bytes, file: TextIO = sys.stdout) -> None: + return await asyncio.to_thread(cls._write_bytes, data, file=file) + + @classmethod + async def write_text(cls, text: str, file: TextIO = sys.stdout) -> None: + return await asyncio.to_thread(cls._write_text, text, file=file) + + @classmethod + async def write_int(cls, num: int, file: TextIO = sys.stdout) -> None: + return await asyncio.to_thread(cls._write_int, num, file=file) + async def read_stream( self, sr: asyncio.StreamReader, callback: Optional[Callable] = None ) -> bytes: @@ -150,13 +193,4 @@ async def convert(self) -> None: pass def update_progress(self, text: str, *, error: bool = False) -> None: - if running_on_qubes(): - if self.progress_callback: - self.progress_callback(error, text, int(self.percentage)) - else: - print( - json.dumps( - {"error": error, "text": text, "percentage": int(self.percentage)} - ) - ) - sys.stdout.flush() + pass diff --git a/dangerzone/conversion/doc_to_pixels.py b/dangerzone/conversion/doc_to_pixels.py index ac00c65d2..26640bdab 100644 --- a/dangerzone/conversion/doc_to_pixels.py +++ b/dangerzone/conversion/doc_to_pixels.py @@ -13,7 +13,7 @@ import re import shutil import sys -from typing import Dict, List, Optional +from typing import Dict, List, Optional, TextIO import fitz import magic @@ -23,26 +23,17 @@ class DocumentToPixels(DangerzoneConverter): - # XXX: These functions write page data and metadata to a separate file. For now, - # they act as an anchor point for Qubes to stream back page data/metadata in - # real time. In the future, they will be completely replaced by their streaming - # counterparts. See: - # - # https://github.com/freedomofpress/dangerzone/issues/443 async def write_page_count(self, count: int) -> None: - pass + return await self.write_int(count) async def write_page_width(self, width: int, filename: str) -> None: - with open(filename, "w") as f: - f.write(str(width)) + return await self.write_int(width) async def write_page_height(self, height: int, filename: str) -> None: - with open(filename, "w") as f: - f.write(str(height)) + return await self.write_int(height) async def write_page_data(self, data: bytes, filename: str) -> None: - with open(filename, "wb") as f: - f.write(data) + return await self.write_bytes(data) async def convert(self) -> None: conversions: Dict[str, Dict[str, Optional[str]]] = { @@ -243,20 +234,6 @@ async def convert(self) -> None: await self.write_page_height(pix.height, height_filename) await self.write_page_data(rgb_buf, rgb_filename) - final_files = ( - glob.glob("/tmp/page-*.rgb") - + glob.glob("/tmp/page-*.width") - + glob.glob("/tmp/page-*.height") - ) - - # XXX: Sanity check to avoid situations like #560. - if not running_on_qubes() and len(final_files) != 3 * doc.page_count: - raise errors.PageCountMismatch() - - # Move converted files into /tmp/dangerzone - for filename in final_files: - shutil.move(filename, "/tmp/dangerzone") - self.update_progress("Converted document to pixels") async def install_libreoffice_ext(self, libreoffice_ext: str) -> None: @@ -286,18 +263,28 @@ def detect_mime_type(self, path: str) -> str: return mime_type -async def main() -> int: - converter = DocumentToPixels() +async def main() -> None: + try: + data = await DocumentToPixels.read_bytes() + except EOFError: + sys.exit(1) + + with open("/tmp/input_file", "wb") as f: + f.write(data) try: + converter = DocumentToPixels() await converter.convert() - error_code = 0 # Success! - except errors.ConversionException as e: # Expected Errors - error_code = e.error_code + except errors.ConversionException as e: + await DocumentToPixels.write_bytes(str(e).encode(), file=sys.stderr) + sys.exit(e.error_code) except Exception as e: - converter.update_progress(str(e), error=True) + await DocumentToPixels.write_bytes(str(e).encode(), file=sys.stderr) error_code = errors.UnexpectedConversionError.error_code - return error_code + sys.exit(error_code) + + # Write debug information + await DocumentToPixels.write_bytes(converter.captured_output, file=sys.stderr) if __name__ == "__main__": diff --git a/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py b/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py deleted file mode 100644 index b94c51f44..000000000 --- a/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py +++ /dev/null @@ -1,108 +0,0 @@ -import asyncio -import os -import shutil -import sys -import tempfile -from pathlib import Path -from typing import Optional, TextIO - -from . import errors -from .doc_to_pixels import DocumentToPixels - - -def _read_bytes() -> bytes: - """Read bytes from the stdin.""" - data = sys.stdin.buffer.read() - if data is None: - raise EOFError - return data - - -def _write_bytes(data: bytes, file: TextIO = sys.stdout) -> None: - file.buffer.write(data) - - -def _write_text(text: str, file: TextIO = sys.stdout) -> None: - _write_bytes(text.encode(), file=file) - - -def _write_int(num: int, file: TextIO = sys.stdout) -> None: - _write_bytes(num.to_bytes(2, signed=False), file=file) - - -# ==== ASYNC METHODS ==== -# We run sync methods in async wrappers, because pure async methods are more difficult: -# https://stackoverflow.com/a/52702646 -# -# In practice, because they are I/O bound and we don't have many running concurrently, -# they shouldn't cause a problem. - - -async def read_bytes() -> bytes: - return await asyncio.to_thread(_read_bytes) - - -async def write_bytes(data: bytes, file: TextIO = sys.stdout) -> None: - return await asyncio.to_thread(_write_bytes, data, file=file) - - -async def write_text(text: str, file: TextIO = sys.stdout) -> None: - return await asyncio.to_thread(_write_text, text, file=file) - - -async def write_int(num: int, file: TextIO = sys.stdout) -> None: - return await asyncio.to_thread(_write_int, num, file=file) - - -class QubesDocumentToPixels(DocumentToPixels): - # Override the write_page_* functions to stream data back to the caller, instead of - # writing it to separate files. This way, we have more accurate progress reports and - # client-side timeouts. See also: - # - # https://github.com/freedomofpress/dangerzone/issues/443 - # https://github.com/freedomofpress/dangerzone/issues/557 - - async def write_page_count(self, count: int) -> None: - return await write_int(count) - - async def write_page_width(self, width: int, filename: str) -> None: - return await write_int(width) - - async def write_page_height(self, height: int, filename: str) -> None: - return await write_int(height) - - async def write_page_data(self, data: bytes, filename: str) -> None: - return await write_bytes(data) - - -async def main() -> None: - out_dir = Path("/tmp/dangerzone") - if out_dir.exists(): - shutil.rmtree(out_dir) - out_dir.mkdir() - - try: - data = await read_bytes() - except EOFError: - sys.exit(1) - - with open("/tmp/input_file", "wb") as f: - f.write(data) - - try: - converter = QubesDocumentToPixels() - await converter.convert() - except errors.ConversionException as e: - await write_bytes(str(e).encode(), file=sys.stderr) - sys.exit(e.error_code) - except Exception as e: - await write_bytes(str(e).encode(), file=sys.stderr) - error_code = errors.UnexpectedConversionError.error_code - sys.exit(error_code) - - # Write debug information - await write_bytes(converter.captured_output, file=sys.stderr) - - -if __name__ == "__main__": - sys.exit(asyncio.run(main())) diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py index 2217e0055..0ff492cf7 100644 --- a/dangerzone/conversion/pixels_to_pdf.py +++ b/dangerzone/conversion/pixels_to_pdf.py @@ -22,12 +22,12 @@ async def convert( ) -> None: self.percentage = 50.0 if tempdir is None: - tempdir = "/tmp" + tempdir = "/safezone" # XXX lazy loading of fitz module to avoid import issues on non-Qubes systems import fitz - num_pages = len(glob.glob(f"{tempdir}/dangerzone/page-*.rgb")) + num_pages = len(glob.glob(f"{tempdir}/pixels/page-*.rgb")) total_size = 0.0 safe_doc = fitz.Document() @@ -35,7 +35,7 @@ async def convert( # Convert RGB files to PDF files percentage_per_page = 45.0 / num_pages for page_num in range(1, num_pages + 1): - filename_base = f"{tempdir}/dangerzone/page-{page_num}" + filename_base = f"{tempdir}/pixels/page-{page_num}" rgb_filename = f"{filename_base}.rgb" width_filename = f"{filename_base}.width" height_filename = f"{filename_base}.height" @@ -90,6 +90,18 @@ async def convert( safe_doc.save(safe_pdf_path, deflate_images=True) + def update_progress(self, text: str, *, error: bool = False) -> None: + if running_on_qubes(): + if self.progress_callback: + self.progress_callback(error, text, int(self.percentage)) + else: + print( + json.dumps( + {"error": error, "text": text, "percentage": int(self.percentage)} + ) + ) + sys.stdout.flush() + async def main() -> int: ocr_lang = os.environ.get("OCR_LANGUAGE") if os.environ.get("OCR") == "1" else None diff --git a/dangerzone/isolation_provider/base.py b/dangerzone/isolation_provider/base.py index 6265146f7..afca95583 100644 --- a/dangerzone/isolation_provider/base.py +++ b/dangerzone/isolation_provider/base.py @@ -1,13 +1,18 @@ import logging +import os import subprocess +import sys +import tempfile from abc import ABC, abstractmethod -from typing import Callable, Optional +from pathlib import Path +from typing import IO, Callable, Optional from colorama import Fore, Style -from ..conversion.errors import ConversionException +from ..conversion import errors +from ..conversion.common import calculate_timeout from ..document import Document -from ..util import replace_control_chars +from ..util import Stopwatch, nonblocking_read, replace_control_chars log = logging.getLogger(__name__) @@ -18,11 +23,35 @@ PIXELS_TO_PDF_LOG_END = "----- PIXELS TO PDF LOG END -----" +def read_bytes(f: IO[bytes], size: int, timeout: float, exact: bool = True) -> bytes: + """Read bytes from a file-like object.""" + buf = nonblocking_read(f, size, timeout) + if exact and len(buf) != size: + raise errors.InterruptedConversion + return buf + + +def read_int(f: IO[bytes], timeout: float) -> int: + """Read 2 bytes from a file-like object, and decode them as int.""" + untrusted_int = read_bytes(f, 2, timeout) + return int.from_bytes(untrusted_int, signed=False) + + +def read_debug_text(f: IO[bytes], size: int) -> str: + """Read arbitrarily long text (for debug purposes)""" + timeout = calculate_timeout(size) + untrusted_text = read_bytes(f, size, timeout, exact=False) + return untrusted_text.decode("ascii", errors="replace") + + class IsolationProvider(ABC): """ Abstracts an isolation provider """ + def __init__(self) -> None: + self.percentage = 0.0 + @abstractmethod def install(self) -> bool: pass @@ -37,7 +66,7 @@ def convert( document.mark_as_converting() try: success = self._convert(document, ocr_lang) - except ConversionException as e: + except errors.ConversionException as e: success = False self.print_progress_trusted(document, True, str(e), 0) except Exception as e: @@ -102,6 +131,113 @@ def sanitize_conversion_str(self, untrusted_conversion_str: str) -> str: return armor_start + conversion_string + armor_end +class ProcessBasedIsolationProvider(IsolationProvider): + # The maximum time it takes a the provider to start up. + STARTUP_TIME_SECONDS = 0 + + def __init__(self) -> None: + self.proc: Optional[subprocess.Popen] = None + super().__init__() + + @abstractmethod + def get_doc_to_pixels_proc(self) -> subprocess.Popen: + pass + + def _convert( + self, + document: Document, + ocr_lang: Optional[str] = None, + ) -> bool: + try: + with tempfile.TemporaryDirectory() as t: + Path(f"{t}/pixels").mkdir() + self.doc_to_pixels(document, t) + # TODO: validate convert to pixels output + self.pixels_to_pdf(document, t, ocr_lang) + return True + except errors.InterruptedConversion: + assert self.proc is not None + error_code = self.proc.wait(3) + # XXX Reconstruct exception from error code + raise errors.exception_from_error_code(error_code) # type: ignore [misc] + + def doc_to_pixels(self, document: Document, tempdir: str) -> None: + with open(document.input_filename, "rb") as f: + self.proc = self.get_doc_to_pixels_proc() + try: + assert self.proc.stdin is not None + self.proc.stdin.write(f.read()) + self.proc.stdin.close() + except BrokenPipeError as e: + raise errors.InterruptedConversion() + + # Get file size (in MiB) + size = os.path.getsize(document.input_filename) / 1024**2 + timeout = calculate_timeout(size) + self.STARTUP_TIME_SECONDS + + assert self.proc is not None + assert self.proc.stdout is not None + os.set_blocking(self.proc.stdout.fileno(), False) + + n_pages = read_int(self.proc.stdout, timeout) + if n_pages == 0 or n_pages > errors.MAX_PAGES: + raise errors.MaxPagesException() + percentage_per_page = 50.0 / n_pages + + timeout = calculate_timeout(size, n_pages) + sw = Stopwatch(timeout) + sw.start() + for page in range(1, n_pages + 1): + text = f"Converting page {page}/{n_pages} to pixels" + self.print_progress_trusted(document, False, text, self.percentage) + + width = read_int(self.proc.stdout, timeout=sw.remaining) + height = read_int(self.proc.stdout, timeout=sw.remaining) + if not (1 <= width <= errors.MAX_PAGE_WIDTH): + raise errors.MaxPageWidthException() + if not (1 <= height <= errors.MAX_PAGE_HEIGHT): + raise errors.MaxPageHeightException() + + num_pixels = width * height * 3 # three color channels + untrusted_pixels = read_bytes( + self.proc.stdout, + num_pixels, + timeout=sw.remaining, + ) + + # Wrapper code + with open(f"{tempdir}/pixels/page-{page}.width", "w") as f_width: + f_width.write(str(width)) + with open(f"{tempdir}/pixels/page-{page}.height", "w") as f_height: + f_height.write(str(height)) + with open(f"{tempdir}/pixels/page-{page}.rgb", "wb") as f_rgb: + f_rgb.write(untrusted_pixels) + + self.percentage += percentage_per_page + + # Ensure nothing else is read after all bitmaps are obtained + self.proc.stdout.close() + + # TODO handle leftover code input + text = "Converted document to pixels" + self.print_progress_trusted(document, False, text, self.percentage) + + if getattr(sys, "dangerzone_dev", False): + assert self.proc.stderr is not None + os.set_blocking(self.proc.stderr.fileno(), False) + untrusted_log = read_debug_text(self.proc.stderr, MAX_CONVERSION_LOG_CHARS) + self.proc.stderr.close() + log.info( + f"Conversion output (doc to pixels)\n{self.sanitize_conversion_str(untrusted_log)}" + ) + + @abstractmethod + def pixels_to_pdf( + self, document: Document, tempdir: str, ocr_lang: Optional[str] + ) -> None: + pass + + # From global_common: # def validate_convert_to_pixel_output(self, common, output): diff --git a/dangerzone/isolation_provider/container.py b/dangerzone/isolation_provider/container.py index 27bd98836..6a634bf0a 100644 --- a/dangerzone/isolation_provider/container.py +++ b/dangerzone/isolation_provider/container.py @@ -2,16 +2,14 @@ import json import logging import os -import pathlib import platform import shlex import shutil import subprocess import sys -import tempfile -from typing import Any, Callable, List, Optional, Tuple +from typing import Any, List, Optional -from ..conversion.errors import exception_from_error_code +from ..conversion import errors from ..document import Document from ..util import ( get_resource_path, @@ -19,12 +17,7 @@ get_tmp_dir, replace_control_chars, ) -from .base import ( - MAX_CONVERSION_LOG_CHARS, - PIXELS_TO_PDF_LOG_END, - PIXELS_TO_PDF_LOG_START, - IsolationProvider, -) +from .base import ProcessBasedIsolationProvider # Define startupinfo for subprocesses if platform.system() == "Windows": @@ -42,9 +35,10 @@ def __init__(self, container_tech: str) -> None: super().__init__(f"{container_tech} is not installed") -class Container(IsolationProvider): +class Container(ProcessBasedIsolationProvider): # Name of the dangerzone container CONTAINER_NAME = "dangerzone.rocks/dangerzone" + STARTUP_TIME_SECONDS = 5 def __init__(self, enable_timeouts: bool) -> None: self.enable_timeouts = 1 if enable_timeouts else 0 @@ -179,34 +173,30 @@ def parse_progress(self, document: Document, untrusted_line: str) -> None: def exec( self, - document: Document, args: List[str], - ) -> int: + ) -> subprocess.Popen: args_str = " ".join(shlex.quote(s) for s in args) log.info("> " + args_str) - with subprocess.Popen( + dev_mode = getattr(sys, "dangerzone_dev", False) == True + if dev_mode: + stderr = subprocess.PIPE + else: + stderr = subprocess.DEVNULL + + return subprocess.Popen( args, - stdin=None, + stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - bufsize=1, - universal_newlines=True, + stderr=stderr, startupinfo=startupinfo, - ) as p: - if p.stdout is not None: - for untrusted_line in p.stdout: - self.parse_progress(document, untrusted_line) - - p.communicate() - return p.returncode + ) def exec_container( self, - document: Document, command: List[str], extra_args: List[str] = [], - ) -> int: + ) -> subprocess.Popen: container_runtime = self.get_runtime() if self.get_runtime_name() == "podman": @@ -218,6 +208,7 @@ def exec_container( # drop all linux kernel capabilities security_args += ["--cap-drop", "all"] user_args = ["-u", "dangerzone"] + enable_stdin = ["-i"] prevent_leakage_args = ["--rm"] @@ -226,60 +217,55 @@ def exec_container( + user_args + security_args + prevent_leakage_args + + enable_stdin + extra_args + [self.CONTAINER_NAME] + command ) args = [container_runtime] + args - return self.exec(document, args) + return self.exec(args) - def _convert( - self, - document: Document, - ocr_lang: Optional[str], - ) -> bool: - # Create a temporary directory inside the cache directory for this run. Then, - # create some subdirectories for the various stages of the file conversion: - # - # * unsafe: Where the input file will be copied - # * pixel: Where the RGB data will be stored - # * safe: Where the final PDF file will be stored - with tempfile.TemporaryDirectory(dir=get_tmp_dir()) as t: - tmp_dir = pathlib.Path(t) - unsafe_dir = tmp_dir / "unsafe" - unsafe_dir.mkdir() - pixel_dir = tmp_dir / "pixels" - pixel_dir.mkdir() - safe_dir = tmp_dir / "safe" - safe_dir.mkdir() - - return self._convert_with_tmpdirs( - document=document, - unsafe_dir=unsafe_dir, - pixel_dir=pixel_dir, - safe_dir=safe_dir, - ocr_lang=ocr_lang, - ) + def pixels_to_pdf( + self, document: Document, tempdir: str, ocr_lang: Optional[str] + ) -> None: + # Convert pixels to safe PDF + command = [ + "/usr/bin/python3", + "-m", + "dangerzone.conversion.pixels_to_pdf", + ] + extra_args = [ + "-v", + f"{tempdir}:/safezone:Z", + "-e", + "TESSDATA_PREFIX=/usr/share/tessdata", + "-e", + f"OCR={0 if ocr_lang is None else 1}", + "-e", + f"OCR_LANGUAGE={ocr_lang}", + "-e", + f"ENABLE_TIMEOUTS={self.enable_timeouts}", + ] - def _convert_with_tmpdirs( - self, - document: Document, - unsafe_dir: pathlib.Path, - pixel_dir: pathlib.Path, - safe_dir: pathlib.Path, - ocr_lang: Optional[str], - ) -> bool: - success = False - - if ocr_lang: - ocr = "1" + pixels_to_pdf_proc = self.exec_container(command, extra_args) + for line in pixels_to_pdf_proc.stdout: + self.parse_progress(document, line) + error_code = pixels_to_pdf_proc.wait() + if error_code != 0: + log.error("pixels-to-pdf failed") + raise errors.exception_from_error_code(error_code) # type: ignore [misc] else: - ocr = "0" + # Move the final file to the right place + if os.path.exists(document.output_filename): + os.remove(document.output_filename) - copied_file = unsafe_dir / "input_file" - shutil.copyfile(f"{document.input_filename}", copied_file) + container_output_filename = os.path.join( + tempdir, "safe-output-compressed.pdf" + ) + shutil.move(container_output_filename, document.output_filename) + def get_doc_to_pixels_proc(self) -> subprocess.Popen: # Convert document to pixels command = [ "/usr/bin/python3", @@ -287,60 +273,10 @@ def _convert_with_tmpdirs( "dangerzone.conversion.doc_to_pixels", ] extra_args = [ - "-v", - f"{copied_file}:/tmp/input_file:Z", - "-v", - f"{pixel_dir}:/tmp/dangerzone:Z", "-e", f"ENABLE_TIMEOUTS={self.enable_timeouts}", ] - ret = self.exec_container(document, command, extra_args) - - if ret != 0: - log.error("documents-to-pixels failed") - - # XXX Reconstruct exception from error code - raise exception_from_error_code(ret) # type: ignore [misc] - else: - # TODO: validate convert to pixels output - - # Convert pixels to safe PDF - command = [ - "/usr/bin/python3", - "-m", - "dangerzone.conversion.pixels_to_pdf", - ] - extra_args = [ - "-v", - f"{pixel_dir}:/tmp/dangerzone:Z", - "-v", - f"{safe_dir}:/safezone:Z", - "-e", - "TESSDATA_PREFIX=/usr/share/tessdata", - "-e", - f"OCR={ocr}", - "-e", - f"OCR_LANGUAGE={ocr_lang}", - "-e", - f"ENABLE_TIMEOUTS={self.enable_timeouts}", - ] - ret = self.exec_container(document, command, extra_args) - if ret != 0: - log.error("pixels-to-pdf failed") - else: - # Move the final file to the right place - if os.path.exists(document.output_filename): - os.remove(document.output_filename) - - container_output_filename = os.path.join( - safe_dir, "safe-output-compressed.pdf" - ) - shutil.move(container_output_filename, document.output_filename) - - # We did it - success = True - - return success + return self.exec_container(command, extra_args) def get_max_parallel_conversions(self) -> int: # FIXME hardcoded 1 until timeouts are more limited and better handled diff --git a/dangerzone/isolation_provider/qubes.py b/dangerzone/isolation_provider/qubes.py index b9f1fcd3d..c9ebd5cab 100644 --- a/dangerzone/isolation_provider/qubes.py +++ b/dangerzone/isolation_provider/qubes.py @@ -1,5 +1,4 @@ import asyncio -import glob import inspect import io import logging @@ -7,60 +6,29 @@ import shutil import subprocess import sys -import tempfile -import time import zipfile from pathlib import Path from typing import IO, Callable, Optional from ..conversion import errors -from ..conversion.common import calculate_timeout, running_on_qubes +from ..conversion.common import running_on_qubes from ..conversion.pixels_to_pdf import PixelsToPDF from ..document import Document -from ..util import ( - Stopwatch, - get_resource_path, - get_subprocess_startupinfo, - get_tmp_dir, - nonblocking_read, -) +from ..util import get_resource_path from .base import ( - MAX_CONVERSION_LOG_CHARS, PIXELS_TO_PDF_LOG_END, PIXELS_TO_PDF_LOG_START, - IsolationProvider, + ProcessBasedIsolationProvider, ) log = logging.getLogger(__name__) -# The maximum time a qube takes to start up. -STARTUP_TIME_SECONDS = 5 * 60 # 5 minutes - - -def read_bytes(f: IO[bytes], size: int, timeout: float, exact: bool = True) -> bytes: - """Read bytes from a file-like object.""" - buf = nonblocking_read(f, size, timeout) - if exact and len(buf) != size: - raise errors.InterruptedConversion - return buf - - -def read_int(f: IO[bytes], timeout: float) -> int: - """Read 2 bytes from a file-like object, and decode them as int.""" - untrusted_int = read_bytes(f, 2, timeout) - return int.from_bytes(untrusted_int, signed=False) - -def read_debug_text(f: IO[bytes], size: int) -> str: - """Read arbitrarily long text (for debug purposes)""" - timeout = calculate_timeout(size) - untrusted_text = read_bytes(f, size, timeout, exact=False) - return untrusted_text.decode("ascii", errors="replace") - - -class Qubes(IsolationProvider): +class Qubes(ProcessBasedIsolationProvider): """Uses a disposable qube for performing the conversion""" + STARTUP_TIME_SECONDS = 5 * 60 # 5 minutes + def __init__(self) -> None: self.proc: Optional[subprocess.Popen] = None super().__init__() @@ -68,86 +36,9 @@ def __init__(self) -> None: def install(self) -> bool: return True - def _convert_with_tmpdirs( - self, - document: Document, - tempdir: str, - ocr_lang: Optional[str] = None, - ) -> bool: - success = False - - Path(f"{tempdir}/dangerzone").mkdir() - percentage = 0.0 - - with open(document.input_filename, "rb") as f: - self.proc = self.qrexec_subprocess() - try: - assert self.proc.stdin is not None - self.proc.stdin.write(f.read()) - self.proc.stdin.close() - except BrokenPipeError as e: - raise errors.InterruptedConversion() - - # Get file size (in MiB) - size = os.path.getsize(document.input_filename) / 1024**2 - timeout = calculate_timeout(size) + STARTUP_TIME_SECONDS - - assert self.proc is not None - assert self.proc.stdout is not None - os.set_blocking(self.proc.stdout.fileno(), False) - - n_pages = read_int(self.proc.stdout, timeout) - if n_pages == 0 or n_pages > errors.MAX_PAGES: - raise errors.MaxPagesException() - percentage_per_page = 50.0 / n_pages - - timeout = calculate_timeout(size, n_pages) - sw = Stopwatch(timeout) - sw.start() - for page in range(1, n_pages + 1): - text = f"Converting page {page}/{n_pages} to pixels" - self.print_progress_trusted(document, False, text, percentage) - - width = read_int(self.proc.stdout, timeout=sw.remaining) - height = read_int(self.proc.stdout, timeout=sw.remaining) - if not (1 <= width <= errors.MAX_PAGE_WIDTH): - raise errors.MaxPageWidthException() - if not (1 <= height <= errors.MAX_PAGE_HEIGHT): - raise errors.MaxPageHeightException() - - num_pixels = width * height * 3 # three color channels - untrusted_pixels = read_bytes( - self.proc.stdout, - num_pixels, - timeout=sw.remaining, - ) - - # Wrapper code - with open(f"{tempdir}/dangerzone/page-{page}.width", "w") as f_width: - f_width.write(str(width)) - with open(f"{tempdir}/dangerzone/page-{page}.height", "w") as f_height: - f_height.write(str(height)) - with open(f"{tempdir}/dangerzone/page-{page}.rgb", "wb") as f_rgb: - f_rgb.write(untrusted_pixels) - - percentage += percentage_per_page - - # Ensure nothing else is read after all bitmaps are obtained - self.proc.stdout.close() - - # TODO handle leftover code input - text = "Converted document to pixels" - self.print_progress_trusted(document, False, text, percentage) - - if getattr(sys, "dangerzone_dev", False): - assert self.proc.stderr is not None - os.set_blocking(self.proc.stderr.fileno(), False) - untrusted_log = read_debug_text(self.proc.stderr, MAX_CONVERSION_LOG_CHARS) - self.proc.stderr.close() - log.info( - f"Conversion output (doc to pixels)\n{self.sanitize_conversion_str(untrusted_log)}" - ) - + def pixels_to_pdf( + self, document: Document, tempdir: str, ocr_lang: Optional[str] + ) -> None: def print_progress_wrapper(error: bool, text: str, percentage: float) -> None: self.print_progress_trusted(document, error, text, percentage) @@ -166,28 +57,11 @@ def print_progress_wrapper(error: bool, text: str, percentage: float) -> None: log.info(text) shutil.move(f"{tempdir}/safe-output-compressed.pdf", document.output_filename) - success = True - - return success - - def _convert( - self, - document: Document, - ocr_lang: Optional[str] = None, - ) -> bool: - try: - with tempfile.TemporaryDirectory() as t: - return self._convert_with_tmpdirs(document, t, ocr_lang) - except errors.InterruptedConversion: - assert self.proc is not None - error_code = self.proc.wait(3) - # XXX Reconstruct exception from error code - raise errors.exception_from_error_code(error_code) # type: ignore [misc] def get_max_parallel_conversions(self) -> int: return 1 - def qrexec_subprocess(self) -> subprocess.Popen: + def get_doc_to_pixels_proc(self) -> subprocess.Popen: dev_mode = getattr(sys, "dangerzone_dev", False) == True if dev_mode: # Use dz.ConvertDev RPC call instead, if we are in development mode. diff --git a/qubes/dz.Convert b/qubes/dz.Convert index d6b2e3e23..b9dded3cf 100755 --- a/qubes/dz.Convert +++ b/qubes/dz.Convert @@ -1,2 +1,2 @@ #!/bin/sh -python -m dangerzone.conversion.doc_to_pixels_qubes_wrapper +python -m dangerzone.conversion.doc_to_pixels diff --git a/qubes/dz.ConvertDev b/qubes/dz.ConvertDev index cb93b799e..74a984540 100755 --- a/qubes/dz.ConvertDev +++ b/qubes/dz.ConvertDev @@ -34,7 +34,7 @@ def main(): say(f"Importing the conversion module") sys.path.insert(0, t.name) - from dangerzone.conversion.doc_to_pixels_qubes_wrapper import main + from dangerzone.conversion.doc_to_pixels import main return asyncio.run(main()) diff --git a/tests/isolation_provider/test_qubes.py b/tests/isolation_provider/test_qubes.py index 78ce9e21f..022ea2a46 100644 --- a/tests/isolation_provider/test_qubes.py +++ b/tests/isolation_provider/test_qubes.py @@ -69,7 +69,7 @@ def test_out_of_ram( ) -> None: provider.progress_callback = mocker.MagicMock() - def qrexec_subprocess() -> subprocess.Popen: + def get_doc_to_pixels_proc() -> subprocess.Popen: p = subprocess.Popen( # XXX error 126 simulates a qrexec-policy failure. Source: # https://github.com/QubesOS/qubes-core-qrexec/blob/fdcbfd7/daemon/qrexec-daemon.c#L1022 @@ -81,7 +81,7 @@ def qrexec_subprocess() -> subprocess.Popen: ) return p - monkeypatch.setattr(provider, "qrexec_subprocess", qrexec_subprocess) + monkeypatch.setattr(provider, "get_doc_to_pixels_proc", get_doc_to_pixels_proc) with pytest.raises(errors.QubesQrexecFailed) as e: doc = Document(sample_doc)