From 814d533c3b1cc5896db1badd9ac47dc641db3fda Mon Sep 17 00:00:00 2001 From: deeplow Date: Tue, 20 Jun 2023 19:33:48 +0300 Subject: [PATCH 01/10] Restructure container code The files in `container/` no longer make sense to have that name since the "document to pixels" part will run in Qubes OS in its own virtual machine. To adapt to this, this PR does the following: - Moves all the files in `container` to `dangerzone/conversion` - Splits the old `container/dangerzone.py` into its two components `dangerzone/conversion/{doc_to_pixels,pixels_to_pdf}.py` with a `common.py` file for shared functions - Moves the Dockerfile to the project root and adapts it to the new container code location - Updates the CircleCI config to properly cache Docker images. - Updates our install scripts to properly build Docker images. - Adds the new conversion module to the container image, so that it can be imported as a package. - Adapts the container isolation provider to use the new way of calling the code. NOTE: We have made zero changes to the conversion code in this commit, except for necessary imports in order to factor out some common parts. Any changes necessary for Qubes integration follow in the subsequent commits. 
--- .circleci/config.yml | 37 ++- .github/workflows/scan.yml | 2 +- container/Dockerfile => Dockerfile | 7 +- Makefile | 5 +- dangerzone/conversion/__init__.py | 0 dangerzone/conversion/common.py | 134 +++++++++ .../conversion/doc_to_pixels.py | 279 +----------------- dangerzone/conversion/pixels_to_pdf.py | 166 +++++++++++ dangerzone/isolation_provider/container.py | 8 +- install/linux/build-image.sh | 2 +- install/macos/build-image.sh | 2 +- install/windows/build-image.py | 4 +- setup.py | 7 +- 13 files changed, 362 insertions(+), 291 deletions(-) rename container/Dockerfile => Dockerfile (90%) create mode 100644 dangerzone/conversion/__init__.py create mode 100644 dangerzone/conversion/common.py rename container/dangerzone.py => dangerzone/conversion/doc_to_pixels.py (50%) create mode 100644 dangerzone/conversion/pixels_to_pdf.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 208e49d51..0215cfc7d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -42,8 +42,16 @@ aliases: ./install/linux/build-rpm.py ls -lh dist/ + - &calculate-cache-key + name: Caculating container cache key + command: | + mkdir -p /caches/ + cd dangerzone/conversion/ + cat common.py doc_to_pixels.py pixels_to_pdf.py | sha1sum | cut -d' ' -f1 > /caches/cache-id.txt + cd ../../ + - &restore-cache - key: v1-{{ checksum "container/Dockerfile" }}-{{ checksum "container/dangerzone.py" }} + key: v1-{{ checksum "Dockerfile" }}-{{ checksum "/caches/cache-id.txt" }} paths: - /caches/container.tar.gz - /caches/image-id.txt @@ -85,9 +93,8 @@ jobs: - image: docker:dind steps: - checkout - - restore_cache: - keys: - - v1-{{ checksum "container/Dockerfile" }}-{{ checksum "container/dangerzone.py" }} + - run: *calculate-cache-key + - restore_cache: *restore-cache - setup_remote_docker - run: name: Build Dangerzone image @@ -95,7 +102,9 @@ jobs: if [ -f "/caches/container.tar.gz" ]; then echo "Already cached, skipping" else - docker build --cache-from=dangerzone.rocks/dangerzone 
--tag dangerzone.rocks/dangerzone container + docker build dangerzone/ -f Dockerfile \ + --cache-from=dangerzone.rocks/dangerzone \ + --tag dangerzone.rocks/dangerzone fi - run: name: Save Dangerzone image and image-id.txt to cache @@ -108,8 +117,9 @@ jobs: gzip -f /caches/container.tar docker image ls dangerzone.rocks/dangerzone | grep "dangerzone.rocks/dangerzone" | tr -s ' ' | cut -d' ' -f3 > /caches/image-id.txt fi + - run: *calculate-cache-key - save_cache: - key: v1-{{ checksum "container/Dockerfile" }}-{{ checksum "container/dangerzone.py" }} + key: v1-{{ checksum "Dockerfile" }}-{{ checksum "/caches/cache-id.txt" }} paths: - /caches/container.tar.gz - /caches/image-id.txt @@ -136,6 +146,7 @@ jobs: command: | sudo mkdir -p /caches sudo chown -R $USER:$USER /caches + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image - run: @@ -155,6 +166,7 @@ jobs: command: | sudo mkdir -p /caches sudo chown -R $USER:$USER /caches + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image @@ -181,6 +193,7 @@ jobs: command: | sudo mkdir -p /caches sudo chown -R $USER:$USER /caches + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image @@ -207,6 +220,7 @@ jobs: command: | sudo mkdir -p /caches sudo chown -R $USER:$USER /caches + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image @@ -233,6 +247,7 @@ jobs: command: | sudo mkdir -p /caches sudo chown -R $USER:$USER /caches + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image @@ -259,6 +274,7 @@ jobs: command: | sudo mkdir -p /caches sudo chown -R $USER:$USER /caches + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image @@ -285,6 +301,7 @@ jobs: command: | sudo mkdir -p /caches sudo chown -R $USER:$USER /caches + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image @@ -328,6 +345,7 @@ jobs: command: | sudo mkdir -p /caches sudo chown -R $USER:$USER 
/caches + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image @@ -365,6 +383,7 @@ jobs: steps: - run: *install-dependencies-deb - checkout + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image - run: *build-deb @@ -376,6 +395,7 @@ jobs: steps: - run: *install-dependencies-deb - checkout + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image - run: *build-deb @@ -388,6 +408,7 @@ jobs: - run: *install-dependencies-deb - run: *install-python-all - checkout + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image - run: *build-deb @@ -399,6 +420,7 @@ jobs: steps: - run: *install-dependencies-deb - checkout + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image - run: *build-deb @@ -410,6 +432,7 @@ jobs: steps: - run: *install-dependencies-deb - checkout + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image - run: *build-deb @@ -421,6 +444,7 @@ jobs: steps: - run: *install-dependencies-rpm - checkout + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image - run: *build-rpm @@ -432,6 +456,7 @@ jobs: steps: - run: *install-dependencies-rpm - checkout + - run: *calculate-cache-key - restore_cache: *restore-cache - run: *copy-image - run: *build-rpm diff --git a/.github/workflows/scan.yml b/.github/workflows/scan.yml index 992292119..4c885bf5c 100644 --- a/.github/workflows/scan.yml +++ b/.github/workflows/scan.yml @@ -13,7 +13,7 @@ jobs: - name: Checkout uses: actions/checkout@v3 - name: Build container image - run: docker build container --tag dangerzone.rocks/dangerzone:latest + run: docker build dangerzone/ -f Dockerfile --tag dangerzone.rocks/dangerzone:latest # NOTE: Scan first without failing, else we won't be able to read the scan # report. 
- name: Scan container image (no fail) diff --git a/container/Dockerfile b/Dockerfile similarity index 90% rename from container/Dockerfile rename to Dockerfile index 14e05c61e..77bcbced4 100644 --- a/container/Dockerfile +++ b/Dockerfile @@ -33,8 +33,11 @@ RUN mkdir tessdata && cd tessdata \ && find . -name '*.traineddata' -maxdepth 2 -exec cp {} /usr/share/tessdata \; \ && cd .. && rm -r tessdata -COPY dangerzone.py /usr/local/bin/ -RUN chmod +x /usr/local/bin/dangerzone.py +ENV PYTHONPATH=/opt/dangerzone + +RUN mkdir -p /opt/dangerzone/dangerzone +RUN touch /opt/dangerzone/dangerzone/__init__.py +COPY conversion /opt/dangerzone/dangerzone/conversion # Add the unprivileged user RUN adduser -s /bin/sh -D dangerzone diff --git a/Makefile b/Makefile index 05c64e568..900d5a8f3 100644 --- a/Makefile +++ b/Makefile @@ -24,13 +24,10 @@ MYPY_ARGS := --ignore-missing-imports \ mypy-host: mypy $(MYPY_ARGS) dangerzone -mypy-container: - mypy $(MYPY_ARGS) container - mypy-tests: mypy $(MYPY_ARGS) tests -mypy: mypy-host mypy-container mypy-tests ## check type hints with mypy +mypy: mypy-host mypy-tests ## check type hints with mypy .PHONY: lint lint: lint-black lint-isort mypy ## check the code with various linters diff --git a/dangerzone/conversion/__init__.py b/dangerzone/conversion/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dangerzone/conversion/common.py b/dangerzone/conversion/common.py new file mode 100644 index 000000000..45629e19a --- /dev/null +++ b/dangerzone/conversion/common.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 + +import asyncio +import glob +import json +import os +import re +import shutil +import subprocess +import sys +import time +from abc import abstractmethod +from typing import Callable, Dict, List, Optional, Tuple, Union + +TIMEOUT_PER_PAGE: float = 30 # (seconds) +TIMEOUT_PER_MB: float = 30 # (seconds) +TIMEOUT_MIN: float = 60 # (seconds) + + +async def read_stream( + sr: asyncio.StreamReader, callback: 
Optional[Callable] = None +) -> bytes: + """Consume a byte stream line-by-line. + + Read all lines in a stream until EOF. If a user has passed a callback, call it for + each line. + + Note that the lines are in bytes, since we can't assume that all command output will + be UTF-8 encoded. Higher level commands are advised to decode the output to Unicode, + if they know its encoding. + """ + buf = b"" + while True: + line = await sr.readline() + if sr.at_eof(): + break + if callback is not None: + callback(line) + # TODO: This would be a good place to log the received line, mostly for debug + # logging. + buf += line + return buf + + +async def run_command( + args: List[str], + *, + error_message: str, + timeout_message: str, + timeout: Optional[float], + stdout_callback: Optional[Callable] = None, + stderr_callback: Optional[Callable] = None, +) -> Tuple[bytes, bytes]: + """Run a command and get its output. + + Run a command using asyncio.subprocess, consume its standard streams, and return its + output in bytes. + + :raises RuntimeError: if the process returns a non-zero exit status + :raises TimeoutError: if the process times out + """ + # Start the provided command, and return a handle. The command will run in the + # background. + proc = await asyncio.subprocess.create_subprocess_exec( + *args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + + assert proc.stdout is not None + assert proc.stderr is not None + + # Create asynchronous tasks that will consume the standard streams of the command, + # and call callbacks if necessary. + stdout_task = asyncio.create_task(read_stream(proc.stdout, stdout_callback)) + stderr_task = asyncio.create_task(read_stream(proc.stderr, stderr_callback)) + + # Wait until the command has finished, for a specific timeout. Then, verify that the + # command has completed successfully. In any other case, raise an exception. 
+ try: + ret = await asyncio.wait_for(proc.wait(), timeout=timeout) + except asyncio.exceptions.TimeoutError: + raise TimeoutError(timeout_message) + if ret != 0: + raise RuntimeError(error_message) + + # Wait until the tasks that consume the command's standard streams have exited as + # well, and return their output. + stdout = await stdout_task + stderr = await stderr_task + return (stdout, stderr) + + +class DangerzoneConverter: + def __init__(self) -> None: + self.percentage: float = 0.0 + + def calculate_timeout( + self, size: float, pages: Optional[float] = None + ) -> Optional[float]: + """Calculate the timeout for a command. + + The timeout calculation takes two factors in mind: + + 1. The size (in MiBs) of the dataset (document, multiple pages). + 2. The number of pages in the dataset. + + It then calculates proportional timeout values based on the above, and keeps the + large one. This way, we can handle several corner cases: + + * Documents with lots of pages, but small file size. + * Single images with large file size. + """ + if not int(os.environ.get("ENABLE_TIMEOUTS", 1)): + return None + + # Do not have timeouts lower than 10 seconds, if the file size is small, since + # we need to take into account the program's startup time as well. 
+ timeout = max(TIMEOUT_PER_MB * size, TIMEOUT_MIN) + if pages: + timeout = max(timeout, TIMEOUT_PER_PAGE * pages) + return timeout + + @abstractmethod + async def convert(self) -> None: + pass + + def update_progress(self, text: str, *, error: bool = False) -> None: + print( + json.dumps( + {"error": error, "text": text, "percentage": int(self.percentage)} + ) + ) + sys.stdout.flush() diff --git a/container/dangerzone.py b/dangerzone/conversion/doc_to_pixels.py similarity index 50% rename from container/dangerzone.py rename to dangerzone/conversion/doc_to_pixels.py index 360552e8c..23db83ad6 100644 --- a/container/dangerzone.py +++ b/dangerzone/conversion/doc_to_pixels.py @@ -1,140 +1,27 @@ #!/usr/bin/env python3 """ -Here are the steps, with progress bar percentages for each step: +Here are the steps, with progress bar percentages: -document_to_pixels - 0%-3%: Convert document into a PDF (skipped if the input file is a PDF) - 3%-5%: Split PDF into individual pages, and count those pages - 5%-50%: Convert each page into pixels (each page takes 45/n%, where n is the number of pages) - -pixels_to_pdf: -- 50%-95%: Convert each page of pixels into a PDF (each page takes 45/n%, where n is the number of pages) -- 95%-100%: Compress the final PDF """ import asyncio import glob -import json import os import re import shutil -import subprocess import sys -import time -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import Dict, Optional import magic -TIMEOUT_PER_PAGE: float = 30 # (seconds) -TIMEOUT_PER_MB: float = 30 # (seconds) -TIMEOUT_MIN: float = 60 # (seconds) - - -async def read_stream( - sr: asyncio.StreamReader, callback: Optional[Callable] = None -) -> bytes: - """Consume a byte stream line-by-line. - - Read all lines in a stream until EOF. If a user has passed a callback, call it for - each line. - - Note that the lines are in bytes, since we can't assume that all command output will - be UTF-8 encoded. 
Higher level commands are advised to decode the output to Unicode, - if they know its encoding. - """ - buf = b"" - while True: - line = await sr.readline() - if sr.at_eof(): - break - if callback is not None: - callback(line) - # TODO: This would be a good place to log the received line, mostly for debug - # logging. - buf += line - return buf - - -async def run_command( - args: List[str], - *, - error_message: str, - timeout_message: str, - timeout: Optional[float], - stdout_callback: Optional[Callable] = None, - stderr_callback: Optional[Callable] = None, -) -> Tuple[bytes, bytes]: - """Run a command and get its output. - - Run a command using asyncio.subprocess, consume its standard streams, and return its - output in bytes. - - :raises RuntimeError: if the process returns a non-zero exit status - :raises TimeoutError: if the process times out - """ - # Start the provided command, and return a handle. The command will run in the - # background. - proc = await asyncio.subprocess.create_subprocess_exec( - *args, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - - assert proc.stdout is not None - assert proc.stderr is not None - - # Create asynchronous tasks that will consume the standard streams of the command, - # and call callbacks if necessary. - stdout_task = asyncio.create_task(read_stream(proc.stdout, stdout_callback)) - stderr_task = asyncio.create_task(read_stream(proc.stderr, stderr_callback)) - - # Wait until the command has finished, for a specific timeout. Then, verify that the - # command has completed successfully. In any other case, raise an exception. - try: - ret = await asyncio.wait_for(proc.wait(), timeout=timeout) - except asyncio.exceptions.TimeoutError: - raise TimeoutError(timeout_message) - if ret != 0: - raise RuntimeError(error_message) - - # Wait until the tasks that consume the command's standard streams have exited as - # well, and return their output. 
- stdout = await stdout_task - stderr = await stderr_task - return (stdout, stderr) - - -class DangerzoneConverter: - def __init__(self) -> None: - self.percentage: float = 0.0 - - def calculate_timeout( - self, size: float, pages: Optional[float] = None - ) -> Optional[float]: - """Calculate the timeout for a command. - - The timeout calculation takes two factors in mind: - - 1. The size (in MiBs) of the dataset (document, multiple pages). - 2. The number of pages in the dataset. - - It then calculates proportional timeout values based on the above, and keeps the - large one. This way, we can handle several corner cases: - - * Documents with lots of pages, but small file size. - * Single images with large file size. - """ - if not int(os.environ.get("ENABLE_TIMEOUTS", 1)): - return None - - # Do not have timeouts lower than 10 seconds, if the file size is small, since - # we need to take into account the program's startup time as well. - timeout = max(TIMEOUT_PER_MB * size, TIMEOUT_MIN) - if pages: - timeout = max(timeout, TIMEOUT_PER_PAGE * pages) - return timeout - - async def document_to_pixels(self) -> None: +from .common import DangerzoneConverter, run_command + + +class DocumentToPixels(DangerzoneConverter): + async def convert(self) -> None: conversions: Dict[str, Dict[str, Optional[str]]] = { # .pdf "application/pdf": {"type": None}, @@ -393,160 +280,12 @@ def pdftoppm_progress_callback(line: bytes) -> None: ): shutil.move(filename, "/dangerzone") - async def pixels_to_pdf(self) -> None: - self.percentage = 50.0 - - num_pages = len(glob.glob("/dangerzone/page-*.rgb")) - total_size = 0.0 - - # Convert RGB files to PDF files - percentage_per_page = 45.0 / num_pages - for page in range(1, num_pages + 1): - filename_base = f"/dangerzone/page-{page}" - rgb_filename = f"{filename_base}.rgb" - width_filename = f"{filename_base}.width" - height_filename = f"{filename_base}.height" - png_filename = f"/tmp/page-{page}.png" - ocr_filename = f"/tmp/page-{page}" - 
pdf_filename = f"/tmp/page-{page}.pdf" - - with open(width_filename) as f: - width = f.read().strip() - with open(height_filename) as f: - height = f.read().strip() - - # The first few operations happen on a per-page basis. - page_size = os.path.getsize(filename_base + ".rgb") / 1024**2 - total_size += page_size - timeout = self.calculate_timeout(page_size, 1) - - if os.environ.get("OCR") == "1": # OCR the document - self.update_progress( - f"Converting page {page}/{num_pages} from pixels to searchable PDF" - ) - await run_command( - [ - "gm", - "convert", - "-size", - f"{width}x{height}", - "-depth", - "8", - f"rgb:{rgb_filename}", - f"png:{png_filename}", - ], - error_message=f"Page {page}/{num_pages} conversion to PNG failed", - timeout_message=( - "Error converting pixels to PNG, convert timed out after" - f" {timeout} seconds" - ), - timeout=timeout, - ) - await run_command( - [ - "tesseract", - png_filename, - ocr_filename, - "-l", - os.environ.get("OCR_LANGUAGE"), # type: ignore - "--dpi", - "70", - "pdf", - ], - error_message=f"Page {page}/{num_pages} OCR failed", - timeout_message=( - "Error converting PNG to searchable PDF, tesseract timed out" - f" after {timeout} seconds" - ), - timeout=timeout, - ) - - else: # Don't OCR - self.update_progress( - f"Converting page {page}/{num_pages} from pixels to PDF" - ) - await run_command( - [ - "gm", - "convert", - "-size", - f"{width}x{height}", - "-depth", - "8", - f"rgb:{rgb_filename}", - f"pdf:{pdf_filename}", - ], - error_message=f"Page {page}/{num_pages} conversion to PDF failed", - timeout_message=( - "Error converting RGB to PDF, convert timed out after" - f" {timeout} seconds" - ), - timeout=timeout, - ) - - self.percentage += percentage_per_page - - # Next operations apply to the all the pages, so we need to recalculate the - # timeout. 
- timeout = self.calculate_timeout(total_size, num_pages) - - # Merge pages into a single PDF - self.update_progress(f"Merging {num_pages} pages into a single PDF") - args = ["pdfunite"] - for page in range(1, num_pages + 1): - args.append(f"/tmp/page-{page}.pdf") - args.append(f"/tmp/safe-output.pdf") - await run_command( - args, - error_message="Merging pages into a single PDF failed", - timeout_message=( - "Error merging pages into a single PDF, pdfunite timed out after" - f" {timeout} seconds" - ), - timeout=timeout, - ) - - self.percentage += 2 - - # Compress - self.update_progress("Compressing PDF") - await run_command( - ["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"], - error_message="Compressing PDF failed", - timeout_message=( - f"Error compressing PDF, ps2pdf timed out after {timeout} seconds" - ), - timeout=timeout, - ) - - self.percentage = 100.0 - self.update_progress("Safe PDF created") - - # Move converted files into /safezone - shutil.move("/tmp/safe-output.pdf", "/safezone") - shutil.move("/tmp/safe-output-compressed.pdf", "/safezone") - - def update_progress(self, text: str, *, error: bool = False) -> None: - print( - json.dumps( - {"error": error, "text": text, "percentage": int(self.percentage)} - ) - ) - sys.stdout.flush() - async def main() -> int: - if len(sys.argv) != 2: - print(f"Usage: {sys.argv[0]} [document-to-pixels]|[pixels-to-pdf]") - return -1 - - converter = DangerzoneConverter() + converter = DocumentToPixels() try: - if sys.argv[1] == "document-to-pixels": - await converter.document_to_pixels() - elif sys.argv[1] == "pixels-to-pdf": - await converter.pixels_to_pdf() + await converter.convert() except (RuntimeError, TimeoutError, ValueError) as e: converter.update_progress(str(e), error=True) return 1 diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py new file mode 100644 index 000000000..2e97de5fc --- /dev/null +++ b/dangerzone/conversion/pixels_to_pdf.py @@ -0,0 
+1,166 @@ +#!/usr/bin/env python3 +""" +Here are the steps, with progress bar percentages: + +- 50%-95%: Convert each page of pixels into a PDF (each page takes 45/n%, where n is the number of pages) +- 95%-100%: Compress the final PDF +""" +import asyncio +import glob +import json +import os +import shutil +import sys + +from .common import DangerzoneConverter, run_command + + +class PixelsToPDF(DangerzoneConverter): + async def convert(self) -> None: + self.percentage = 50.0 + + num_pages = len(glob.glob("/tmp/dangerzone/page-*.rgb")) + total_size = 0.0 + + # Convert RGB files to PDF files + percentage_per_page = 45.0 / num_pages + for page in range(1, num_pages + 1): + filename_base = f"/tmp/dangerzone/page-{page}" + rgb_filename = f"{filename_base}.rgb" + width_filename = f"{filename_base}.width" + height_filename = f"{filename_base}.height" + png_filename = f"/tmp/page-{page}.png" + ocr_filename = f"/tmp/page-{page}" + pdf_filename = f"/tmp/page-{page}.pdf" + + with open(width_filename) as f: + width = f.read().strip() + with open(height_filename) as f: + height = f.read().strip() + + # The first few operations happen on a per-page basis. 
+ page_size = os.path.getsize(filename_base + ".rgb") / 1024**2 + total_size += page_size + timeout = self.calculate_timeout(page_size, 1) + + if os.environ.get("OCR") == "1": # OCR the document + self.update_progress( + f"Converting page {page}/{num_pages} from pixels to searchable PDF" + ) + await run_command( + [ + "gm", + "convert", + "-size", + f"{width}x{height}", + "-depth", + "8", + f"rgb:{rgb_filename}", + f"png:{png_filename}", + ], + error_message=f"Page {page}/{num_pages} conversion to PNG failed", + timeout_message=( + "Error converting pixels to PNG, convert timed out after" + f" {timeout} seconds" + ), + timeout=timeout, + ) + await run_command( + [ + "tesseract", + png_filename, + ocr_filename, + "-l", + os.environ.get("OCR_LANGUAGE"), # type: ignore + "--dpi", + "70", + "pdf", + ], + error_message=f"Page {page}/{num_pages} OCR failed", + timeout_message=( + "Error converting PNG to searchable PDF, tesseract timed out" + f" after {timeout} seconds" + ), + timeout=timeout, + ) + + else: # Don't OCR + self.update_progress( + f"Converting page {page}/{num_pages} from pixels to PDF" + ) + await run_command( + [ + "gm", + "convert", + "-size", + f"{width}x{height}", + "-depth", + "8", + f"rgb:{rgb_filename}", + f"pdf:{pdf_filename}", + ], + error_message=f"Page {page}/{num_pages} conversion to PDF failed", + timeout_message=( + "Error converting RGB to PDF, convert timed out after" + f" {timeout} seconds" + ), + timeout=timeout, + ) + + self.percentage += percentage_per_page + + # Next operations apply to the all the pages, so we need to recalculate the + # timeout. 
+ timeout = self.calculate_timeout(total_size, num_pages) + + # Merge pages into a single PDF + self.update_progress(f"Merging {num_pages} pages into a single PDF") + args = ["pdfunite"] + for page in range(1, num_pages + 1): + args.append(f"/tmp/page-{page}.pdf") + args.append(f"/tmp/safe-output.pdf") + await run_command( + args, + error_message="Merging pages into a single PDF failed", + timeout_message=( + "Error merging pages into a single PDF, pdfunite timed out after" + f" {timeout} seconds" + ), + timeout=timeout, + ) + + self.percentage += 2 + + # Compress + self.update_progress("Compressing PDF") + await run_command( + ["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"], + error_message="Compressing PDF failed", + timeout_message=( + f"Error compressing PDF, ps2pdf timed out after {timeout} seconds" + ), + timeout=timeout, + ) + + self.percentage = 100.0 + self.update_progress("Safe PDF created") + + # Move converted files into /safezone + shutil.move("/tmp/safe-output.pdf", "/safezone") + shutil.move("/tmp/safe-output-compressed.pdf", "/safezone") + + +async def main() -> int: + converter = PixelsToPDF() + + try: + await converter.convert() + except (RuntimeError, TimeoutError, ValueError) as e: + converter.update_progress(str(e), error=True) + return 1 + else: + return 0 # Success! 
+ + +if __name__ == "__main__": + sys.exit(asyncio.run(main())) diff --git a/dangerzone/isolation_provider/container.py b/dangerzone/isolation_provider/container.py index 4113317f8..6b71b63cf 100644 --- a/dangerzone/isolation_provider/container.py +++ b/dangerzone/isolation_provider/container.py @@ -262,8 +262,8 @@ def _convert_with_tmpdirs( # Convert document to pixels command = [ "/usr/bin/python3", - "/usr/local/bin/dangerzone.py", - "document-to-pixels", + "-m", + "dangerzone.conversion.doc_to_pixels", ] extra_args = [ "-v", @@ -282,8 +282,8 @@ def _convert_with_tmpdirs( # Convert pixels to safe PDF command = [ "/usr/bin/python3", - "/usr/local/bin/dangerzone.py", - "pixels-to-pdf", + "-m", + "dangerzone.conversion.pixels_to_pdf", ] extra_args = [ "-v", diff --git a/install/linux/build-image.sh b/install/linux/build-image.sh index fc662ecd7..ad573c79a 100755 --- a/install/linux/build-image.sh +++ b/install/linux/build-image.sh @@ -5,7 +5,7 @@ set -e TAG=dangerzone.rocks/dangerzone:latest echo "Building container image" -podman build container --tag $TAG +podman build dangerzone/ -f Dockerfile --tag $TAG echo "Saving and compressing container image" podman save $TAG | gzip > share/container.tar.gz diff --git a/install/macos/build-image.sh b/install/macos/build-image.sh index ab4fd9793..eafb1c6ed 100755 --- a/install/macos/build-image.sh +++ b/install/macos/build-image.sh @@ -5,7 +5,7 @@ set -e TAG=dangerzone.rocks/dangerzone:latest echo "Building container image" -docker build container --tag $TAG +docker build dangerzone/ -f Dockerfile --tag $TAG echo "Saving and compressing container image" docker save $TAG | gzip > share/container.tar.gz diff --git a/install/windows/build-image.py b/install/windows/build-image.py index cf1d578cc..23fd2d590 100644 --- a/install/windows/build-image.py +++ b/install/windows/build-image.py @@ -9,7 +9,9 @@ def main(): [ "docker", "build", - "container", + "dangerzone/", + "-f", + "Dockerfile", "--tag", 
"dangerzone.rocks/dangerzone:latest", ] diff --git a/setup.py b/setup.py index 33f5e07c2..98656fd21 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,12 @@ def file_list(path): It uses container technology to convert the documents within a secure sandbox.\ """, url="https://github.com/freedomofpress/dangerzone", - packages=["dangerzone", "dangerzone.gui", "dangerzone.isolation_provider"], + packages=[ + "dangerzone", + "dangerzone.conversion", + "dangerzone.gui", + "dangerzone.isolation_provider", + ], data_files=[ ( "share/applications", From a0d1a6830244679340f60ac4815540d852827edd Mon Sep 17 00:00:00 2001 From: deeplow Date: Tue, 20 Jun 2023 19:38:01 +0300 Subject: [PATCH 02/10] Use /tmp/dangerzone for Qubes compatibility For using in containers, creating a /dangerzone directory is fine but it is more standard to do this in /tmp. --- dangerzone/conversion/doc_to_pixels.py | 4 ++-- dangerzone/isolation_provider/container.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dangerzone/conversion/doc_to_pixels.py b/dangerzone/conversion/doc_to_pixels.py index 23db83ad6..aa9208818 100644 --- a/dangerzone/conversion/doc_to_pixels.py +++ b/dangerzone/conversion/doc_to_pixels.py @@ -272,13 +272,13 @@ def pdftoppm_progress_callback(line: bytes) -> None: self.update_progress("Converted document to pixels") - # Move converted files into /dangerzone + # Move converted files into /tmp/dangerzone for filename in ( glob.glob("/tmp/page-*.rgb") + glob.glob("/tmp/page-*.width") + glob.glob("/tmp/page-*.height") ): - shutil.move(filename, "/dangerzone") + shutil.move(filename, "/tmp/dangerzone") async def main() -> int: diff --git a/dangerzone/isolation_provider/container.py b/dangerzone/isolation_provider/container.py index 6b71b63cf..ed5f38f22 100644 --- a/dangerzone/isolation_provider/container.py +++ b/dangerzone/isolation_provider/container.py @@ -269,7 +269,7 @@ def _convert_with_tmpdirs( "-v", f"{copied_file}:/tmp/input_file:Z", "-v", - 
f"{pixel_dir}:/dangerzone:Z", + f"{pixel_dir}:/tmp/dangerzone:Z", "-e", f"ENABLE_TIMEOUTS={self.enable_timeouts}", ] @@ -287,7 +287,7 @@ def _convert_with_tmpdirs( ] extra_args = [ "-v", - f"{pixel_dir}:/dangerzone:Z", + f"{pixel_dir}:/tmp/dangerzone:Z", "-v", f"{safe_dir}:/safezone:Z", "-e", From 9410da762c3e712cb59e664612984e07cf7e860d Mon Sep 17 00:00:00 2001 From: deeplow Date: Tue, 20 Jun 2023 19:46:11 +0300 Subject: [PATCH 03/10] Check if conversion code runs on Qubes Add a way to check if the code runs (or should run) on Qubes. Refs #451 --- dangerzone/conversion/common.py | 19 ++++++++++++++----- dangerzone/conversion/pixels_to_pdf.py | 7 ++++--- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/dangerzone/conversion/common.py b/dangerzone/conversion/common.py index 45629e19a..d0760aa50 100644 --- a/dangerzone/conversion/common.py +++ b/dangerzone/conversion/common.py @@ -17,6 +17,14 @@ TIMEOUT_MIN: float = 60 # (seconds) +def running_on_qubes() -> bool: + # https://www.qubes-os.org/faq/#what-is-the-canonical-way-to-detect-qubes-vm + if os.environ.get("DZ_USE_CONTAINERS", "0") == "0": + return os.path.exists("/usr/share/qubes/marker-vm") + else: + return False + + async def read_stream( sr: asyncio.StreamReader, callback: Optional[Callable] = None ) -> bytes: @@ -126,9 +134,10 @@ async def convert(self) -> None: pass def update_progress(self, text: str, *, error: bool = False) -> None: - print( - json.dumps( - {"error": error, "text": text, "percentage": int(self.percentage)} + if not running_on_qubes(): + print( + json.dumps( + {"error": error, "text": text, "percentage": int(self.percentage)} + ) ) - ) - sys.stdout.flush() + sys.stdout.flush() diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py index 2e97de5fc..905e8e841 100644 --- a/dangerzone/conversion/pixels_to_pdf.py +++ b/dangerzone/conversion/pixels_to_pdf.py @@ -12,7 +12,7 @@ import shutil import sys -from .common import DangerzoneConverter, 
run_command +from .common import DangerzoneConverter, run_command, running_on_qubes class PixelsToPDF(DangerzoneConverter): @@ -146,8 +146,9 @@ async def convert(self) -> None: self.update_progress("Safe PDF created") # Move converted files into /safezone - shutil.move("/tmp/safe-output.pdf", "/safezone") - shutil.move("/tmp/safe-output-compressed.pdf", "/safezone") + if not running_on_qubes(): + shutil.move("/tmp/safe-output.pdf", "/safezone") + shutil.move("/tmp/safe-output-compressed.pdf", "/safezone") async def main() -> int: From cfdaec23c5896df56a29094f984142942ce09dcf Mon Sep 17 00:00:00 2001 From: Alex Pyrgiotis Date: Tue, 20 Jun 2023 19:46:33 +0300 Subject: [PATCH 04/10] Support multiple Python libraries for libmagic It seems that there are at least two Python libraries with libmagic support: * PyPI: python-magic (https://pypi.org/project/python-magic/) On Fedora it's `python3-magic` * PyPI: filemagic (https://pypi.org/project/filemagic/) On Fedora it's `python3-file-magic` The first package corresponds to the `py3-magic` package on Alpine Linux, and it's the one we install in the container. The second package uses a different API, and it's the only one we can use on Qubes. To make matters worse, we: * Can't install the first package on Fedora, because it installs the second under the hood: https://bugzilla.redhat.com/show_bug.cgi?id=1899279 * Can't install the second package on Alpine Linux (untested), due to Musl being used instead of libC: https://stackoverflow.com/a/53936722 Ultimately, we need to support both, by trying the first API, and on failure using the other API. 
--- dangerzone/conversion/doc_to_pixels.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dangerzone/conversion/doc_to_pixels.py b/dangerzone/conversion/doc_to_pixels.py index aa9208818..215150042 100644 --- a/dangerzone/conversion/doc_to_pixels.py +++ b/dangerzone/conversion/doc_to_pixels.py @@ -101,8 +101,11 @@ async def convert(self) -> None: } # Detect MIME type - mime = magic.Magic(mime=True) - mime_type = mime.from_file("/tmp/input_file") + try: + mime = magic.Magic(mime=True) + mime_type = mime.from_file("/tmp/input_file") + except TypeError: + mime_type = magic.detect_from_filename("/tmp/input_file").mime_type # Validate MIME type if mime_type not in conversions: From a83f5dfc7a8c52082edd240f996bacbc610b6d1c Mon Sep 17 00:00:00 2001 From: deeplow Date: Tue, 20 Jun 2023 19:49:53 +0300 Subject: [PATCH 05/10] Add Qubes-specific code for disposable VMs The "document to pixels" code assumes that the client has called it with some mount points in which it can write files. This is true for the container isolation provider, but not for Qubes, who can communicate with the client only via stdin/stdout. Add a Qubes wrapper for this code that reads the suspicious document from stdin and writes the pages to stdout. The on-wire format is the same as the one that TrustedPDF uses. 
--- .../conversion/doc_to_pixels_qubes_wrapper.py | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 dangerzone/conversion/doc_to_pixels_qubes_wrapper.py diff --git a/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py b/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py new file mode 100644 index 000000000..82c0feb35 --- /dev/null +++ b/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py @@ -0,0 +1,89 @@ +import asyncio +import os +import shutil +import sys +import tempfile +from pathlib import Path +from typing import Optional, TextIO + +from .doc_to_pixels import DocumentToPixels + + +def _read_bytes() -> bytes: + """Read bytes from the stdin.""" + data = sys.stdin.buffer.read() + if data is None: + raise EOFError + return data + + +def _write_bytes(data: bytes, file: TextIO = sys.stdout) -> None: + file.buffer.write(data) + + +def _write_text(text: str, file: TextIO = sys.stdout) -> None: + _write_bytes(text.encode(), file=file) + + +def _write_int(num: int, file: TextIO = sys.stdout) -> None: + _write_bytes(num.to_bytes(2, signed=False), file=file) + + +# ==== ASYNC METHODS ==== +# We run sync methods in async wrappers, because pure async methods are more difficult: +# https://stackoverflow.com/a/52702646 +# +# In practice, because they are I/O bound and we don't have many running concurrently, +# they shouldn't cause a problem. 
+ + +async def read_bytes() -> bytes: + return await asyncio.to_thread(_read_bytes) + + +async def write_bytes(data: bytes, file: TextIO = sys.stdout) -> None: + return await asyncio.to_thread(_write_bytes, data, file=file) + + +async def write_text(text: str, file: TextIO = sys.stdout) -> None: + return await asyncio.to_thread(_write_text, text, file=file) + + +async def write_int(num: int, file: TextIO = sys.stdout) -> None: + return await asyncio.to_thread(_write_int, num, file=file) + + +async def main() -> None: + out_dir = Path("/tmp/dangerzone") + if out_dir.exists(): + shutil.rmtree(out_dir) + out_dir.mkdir() + + try: + data = await read_bytes() + except EOFError: + sys.exit(1) + + with open("/tmp/input_file", "wb") as f: + f.write(data) + + converter = DocumentToPixels() + await converter.convert() + + num_pages = len(list(out_dir.glob("*.rgb"))) + await write_int(num_pages) + for num_page in range(1, num_pages + 1): + page_base = out_dir / f"page-{num_page}" + with open(f"{page_base}.width", "r") as width_file: + width = int(width_file.read()) + with open(f"{page_base}.height", "r") as height_file: + height = int(height_file.read()) + await write_int(width) + await write_int(height) + with open(f"{page_base}.rgb", "rb") as rgb_file: + rgb_data = rgb_file.read() + await write_bytes(rgb_data) + + +if __name__ == "__main__": + sys.exit(asyncio.run(main())) From c194606550f95b22dabe729e3ac96e4ffe7d0d40 Mon Sep 17 00:00:00 2001 From: Alex Pyrgiotis Date: Tue, 20 Jun 2023 19:47:54 +0300 Subject: [PATCH 06/10] Add Qubes RPC calls Add two RPC calls that can run on disposable VMs: * dz.Convert: This call simply imports the dangerzone package and runs the Qubes wrapper for the "document to pixels" code. This call is similar to the way we run the conversion part in a container. * dz.ConvertDev: This call is for development purposes, and does the following: - First it receives the `dangerzone.conversion` module as Python zipfile. 
This way, we can quickly iterate on changes on the server-side part of Qubes, without altering the templates. - Second, it calls the Qubes wrapper for the "document to pixels" code, as dz.Convert does. --- qubes/dz.Convert | 2 ++ qubes/dz.ConvertDev | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100755 qubes/dz.Convert create mode 100755 qubes/dz.ConvertDev diff --git a/qubes/dz.Convert b/qubes/dz.Convert new file mode 100755 index 000000000..d6b2e3e23 --- /dev/null +++ b/qubes/dz.Convert @@ -0,0 +1,2 @@ +#!/bin/sh +python -m dangerzone.conversion.doc_to_pixels_qubes_wrapper diff --git a/qubes/dz.ConvertDev b/qubes/dz.ConvertDev new file mode 100755 index 000000000..a66308189 --- /dev/null +++ b/qubes/dz.ConvertDev @@ -0,0 +1,42 @@ +#!/usr/bin/env python + +import asyncio +import glob +import io +import os +import sys +import tempfile +import zipfile + + +def say(msg): + print(msg, file=sys.stderr, flush=True) + + +def main(): + say("Debugging mode enabled") + + # Get the size of the zipfile + size = int.from_bytes(sys.stdin.buffer.read(4)) + say(f"Reading {size} bytes of Python zipfile") + + # Read the zipfile from stdin + zf = sys.stdin.buffer.read(size) + if len(zf) < size: + say(f"Client closed the connection early") + return 1 + + with tempfile.NamedTemporaryFile(suffix=".zip") as t: + say(f"Storing the Python zipfile to {t.name}") + t.write(zf) + t.flush() + + say(f"Importing the conversion module") + sys.path.insert(0, t.name) + + from dangerzone.conversion.doc_to_pixels_qubes_wrapper import main + return asyncio.run(main()) + + +if __name__ == "__main__": + sys.exit(main()) From baeab9d7ebea0a70922276586a15bb8c649b8f93 Mon Sep 17 00:00:00 2001 From: deeplow Date: Tue, 20 Jun 2023 19:50:07 +0300 Subject: [PATCH 07/10] Add Qubes isolation provider Add an isolation provider for Qubes, that performs the document conversion as follows: Document to pixels phase ------------------------ 1. 
Starts a disposable qube by calling either the dz.Convert or the dz.ConvertDev RPC call, depending on the execution context. 2. Sends the file to disposable qube through its stdin. * If we call the conversion from the development environment, also pass the conversion module as a Python zipfile, before the suspicious document. 3. Reads the number of pages, their dimensions, and the page data. Pixels to PDF phase ------------------- 1. Writes the page data under /tmp/dangerzone, so that the `pixels_to_pdf` module can read them. 2. Pass OCR parameters as envvars. 3. Call the `pixels_to_pdf` main function, as if it was running within a container. Wait until the PDF gets created. 4. Move the resulting PDF to the proper directory. Fixes #414 --- dangerzone/isolation_provider/qubes.py | 181 +++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 dangerzone/isolation_provider/qubes.py diff --git a/dangerzone/isolation_provider/qubes.py b/dangerzone/isolation_provider/qubes.py new file mode 100644 index 000000000..10a976807 --- /dev/null +++ b/dangerzone/isolation_provider/qubes.py @@ -0,0 +1,181 @@ +import asyncio +import glob +import inspect +import io +import logging +import os +import shutil +import subprocess +import sys +import tempfile +import time +import zipfile +from pathlib import Path +from typing import IO, Callable, Optional + +from ..document import Document +from ..util import get_resource_path +from .base import IsolationProvider + +log = logging.getLogger(__name__) + +from ..conversion.pixels_to_pdf import PixelsToPDF +from ..util import get_subprocess_startupinfo, get_tmp_dir + +CONVERTED_FILE_PATH = ( + # FIXME won't work for parallel conversions (see #454) + "/tmp/safe-output-compressed.pdf" +) + + +def read_bytes(p: subprocess.Popen, buff_size: int) -> bytes: + """Read bytes from stdout.""" + return p.stdout.read(buff_size) # type: ignore [union-attr] + + +def read_int(p: subprocess.Popen) -> int: + """Read 2 bytes from stdout, 
and decode them as int.""" + untrusted_int = p.stdout.read(2) # type: ignore [union-attr] + return int.from_bytes(untrusted_int, signed=False) + + +class Qubes(IsolationProvider): + """Uses a disposable qube for performing the conversion""" + + def install(self) -> bool: + return True + + def _convert( + self, + document: Document, + ocr_lang: Optional[str], + stdout_callback: Optional[Callable] = None, + ) -> bool: + success = False + + # FIXME won't work on windows, nor with multi-conversion + out_dir = Path("/tmp/dangerzone") + if out_dir.exists(): + shutil.rmtree(out_dir) + out_dir.mkdir() + + # Reset hard-coded state + if os.path.exists(CONVERTED_FILE_PATH): + os.remove(CONVERTED_FILE_PATH) + + percentage = 0.0 + + with open(document.input_filename, "rb") as f: + # TODO handle lack of memory to start qube + if getattr(sys, "dangerzone_dev", False): + # Use dz.ConvertDev RPC call instead, if we are in development mode. + # Basically, the change is that we also transfer the necessary Python + # code as a zipfile, before sending the doc that the user requested. + p = subprocess.Popen( + ["/usr/bin/qrexec-client-vm", "@dispvm:dz-dvm", "dz.ConvertDev"], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + ) + assert p.stdin is not None + + # Send the dangerzone module first. + self.teleport_dz_module(p.stdin) + + # Finally, send the document, as in the normal case. 
+ p.stdin.write(f.read()) + p.stdin.close() + else: + p = subprocess.Popen( + ["/usr/bin/qrexec-client-vm", "@dispvm:dz-dvm", "dz.Convert"], + stdin=f, + stdout=subprocess.PIPE, + ) + + n_pages = read_int(p) + if n_pages == 0: + # FIXME: Fail loudly in that case + return False + if ocr_lang: + percentage_per_page = 50.0 / n_pages + else: + percentage_per_page = 100.0 / n_pages + for page in range(1, n_pages + 1): + # TODO handle too width > MAX_PAGE_WIDTH + # TODO handle too big height > MAX_PAGE_HEIGHT + + width = read_int(p) + height = read_int(p) + untrusted_pixels = read_bytes( + p, width * height * 3 + ) # three color channels + + # Wrapper code + with open(f"/tmp/dangerzone/page-{page}.width", "w") as f_width: + f_width.write(str(width)) + with open(f"/tmp/dangerzone/page-{page}.height", "w") as f_height: + f_height.write(str(height)) + with open(f"/tmp/dangerzone/page-{page}.rgb", "wb") as f_rgb: + f_rgb.write(untrusted_pixels) + + percentage += percentage_per_page + + text = f"Converting page {page}/{n_pages} to pixels" + self.print_progress(document, False, text, percentage) + if stdout_callback: + stdout_callback(False, text, percentage) + + # TODO handle leftover code input + text = "Converted document to pixels" + self.print_progress(document, False, text, percentage) + if stdout_callback: + stdout_callback(False, text, percentage) + + # FIXME pass OCR stuff properly (see #455) + old_environ = dict(os.environ) + if ocr_lang: + os.environ["OCR"] = "1" + os.environ["OCR_LANGUAGE"] = ocr_lang + + asyncio.run( + PixelsToPDF().convert() + ) # TODO add progress updates on second stage + + percentage = 100.0 + text = "Safe PDF created" + self.print_progress(document, False, text, percentage) + if stdout_callback: + stdout_callback(False, text, percentage) + + # FIXME remove once the OCR args are no longer passed with env vars + os.environ.clear() + os.environ.update(old_environ) + + shutil.move(CONVERTED_FILE_PATH, document.output_filename) + success = True + 
+ return success + + def get_max_parallel_conversions(self) -> int: + return 1 + + def teleport_dz_module(self, wpipe: IO[bytes]) -> None: + """Send the dangerzone module to another qube, as a zipfile.""" + # Grab the absolute file path of the dangerzone module. + import dangerzone.conversion as _conv + + _conv_path = Path(inspect.getfile(_conv)).parent + temp_file = io.BytesIO() + + # Create a Python zipfile that contains all the files of the dangerzone module. + with zipfile.PyZipFile(temp_file, "w") as z: + z.mkdir("dangerzone/") + z.writestr("dangerzone/__init__.py", "") + z.writepy(str(_conv_path), basename="dangerzone/") + + # Send the following data: + # 1. The size of the Python zipfile, so that the server can know when to + # stop. + # 2. The Python zipfile itself. + bufsize_bytes = len(temp_file.getvalue()).to_bytes(4) + wpipe.write(bufsize_bytes) + wpipe.write(temp_file.getvalue()) From 5191556dcd4bdd3771da7b3c77cad42df9b31e9f Mon Sep 17 00:00:00 2001 From: deeplow Date: Tue, 20 Jun 2023 19:52:01 +0300 Subject: [PATCH 08/10] Use the Qubes isolation provider from CLI/GUI Autodetect in the CLI/GUI if we should run the conversion in disposable qubes. --- dangerzone/cli.py | 4 ++++ dangerzone/gui/__init__.py | 5 +++++ dangerzone/gui/main_window.py | 7 +++++-- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/dangerzone/cli.py b/dangerzone/cli.py index 60926931e..1d31c05e2 100644 --- a/dangerzone/cli.py +++ b/dangerzone/cli.py @@ -6,9 +6,11 @@ from colorama import Back, Fore, Style from . 
import args, errors +from .conversion.common import running_on_qubes from .document import ARCHIVE_SUBDIR, SAFE_EXTENSION from .isolation_provider.container import Container from .isolation_provider.dummy import Dummy +from .isolation_provider.qubes import Qubes from .logic import DangerzoneCore from .util import get_version @@ -63,6 +65,8 @@ def cli_main( if getattr(sys, "dangerzone_dev", False) and dummy_conversion: dangerzone = DangerzoneCore(Dummy()) + elif running_on_qubes(): + dangerzone = DangerzoneCore(Qubes()) else: dangerzone = DangerzoneCore(Container(enable_timeouts=enable_timeouts)) diff --git a/dangerzone/gui/__init__.py b/dangerzone/gui/__init__.py index a40a7e4de..255f119e7 100644 --- a/dangerzone/gui/__init__.py +++ b/dangerzone/gui/__init__.py @@ -21,9 +21,11 @@ from PySide2 import QtCore, QtGui, QtWidgets from .. import args, errors +from ..conversion.common import running_on_qubes from ..document import Document from ..isolation_provider.container import Container from ..isolation_provider.dummy import Dummy +from ..isolation_provider.qubes import Qubes from ..util import get_resource_path, get_version from .logic import DangerzoneGui from .main_window import MainWindow @@ -100,6 +102,9 @@ def gui_main( if getattr(sys, "dangerzone_dev", False) and dummy_conversion: dummy = Dummy() dangerzone = DangerzoneGui(app, isolation_provider=dummy) + elif running_on_qubes(): + qubes = Qubes() + dangerzone = DangerzoneGui(app, isolation_provider=qubes) else: container = Container(enable_timeouts=enable_timeouts) dangerzone = DangerzoneGui(app, isolation_provider=container) diff --git a/dangerzone/gui/main_window.py b/dangerzone/gui/main_window.py index 101f32d08..a4b17866b 100644 --- a/dangerzone/gui/main_window.py +++ b/dangerzone/gui/main_window.py @@ -24,6 +24,7 @@ from ..document import SAFE_EXTENSION, Document from ..isolation_provider.container import Container, NoContainerTechException from ..isolation_provider.dummy import Dummy +from 
..isolation_provider.qubes import Qubes from ..util import get_resource_path, get_subprocess_startupinfo, get_version from .logic import Alert, DangerzoneGui @@ -71,8 +72,10 @@ def __init__(self, dangerzone: DangerzoneGui) -> None: self.waiting_widget: WaitingWidget = WaitingWidgetContainer(self.dangerzone) self.waiting_widget.finished.connect(self.waiting_finished) - elif isinstance(self.dangerzone.isolation_provider, Dummy): - # Don't wait with dummy converter + elif isinstance(self.dangerzone.isolation_provider, Dummy) or isinstance( + self.dangerzone.isolation_provider, Qubes + ): + # Don't wait with dummy converter and on Qubes. self.waiting_widget = WaitingWidget() self.dangerzone.is_waiting_finished = True From a1d40fde7894cbdb212c2a3e2f27ace25c62c3c9 Mon Sep 17 00:00:00 2001 From: deeplow Date: Wed, 21 Jun 2023 11:09:18 +0300 Subject: [PATCH 09/10] Create an RPM for Qubes Allow creating an RPM package that is to be installed specifically on Qubes. This package has the following extra properties from our regular RPM packages: 1. Make `python3-magic`, `libreoffice` and `tesseract` requirements for installing Dangerzone, since the conversion takes place in a disposable qube that needs these packages. 2. Ignore the container.tar.gz file, if it exists. 3. 
Add our RPC calls under `/etc/qubes-rpc` --- install/linux/build-rpm.py | 28 +++++++++++++++++++++++++++- setup.py | 36 +++++++++++++++++++++++++----------- 2 files changed, 52 insertions(+), 12 deletions(-) diff --git a/install/linux/build-rpm.py b/install/linux/build-rpm.py index 5d7cba4bd..a7f423b31 100755 --- a/install/linux/build-rpm.py +++ b/install/linux/build-rpm.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +import argparse import inspect import os import shutil @@ -17,6 +18,12 @@ def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--qubes", action="store_true", help="Build RPM package for a Qubes OS system" + ) + args = parser.parse_args() + build_path = os.path.join(root, "build") dist_path = os.path.join(root, "dist") @@ -26,9 +33,28 @@ def main(): if os.path.exists(dist_path): shutil.rmtree(dist_path) + if args.qubes: + print("> Building for a Qubes system") + os.environ["QUBES_TARGET"] = "1" + + # Server and Client package requirements are bundled together since + # we assume the server and client qubes are installed on the same + # template + platform_dependant_packages = ",".join( + [ + # Server package requirements + "python3-magic", + "libreoffice", + # Client package requirements + "tesseract", # FIXME add other languages + ] + ) + else: + platform_dependant_packages = "podman" + print("* Building RPM package") subprocess.run( - "python3 setup.py bdist_rpm --requires='podman,python3-pyside2,python3-appdirs,python3-click,python3-pyxdg,python3-colorama'", + f"python3 setup.py bdist_rpm --requires='{platform_dependant_packages},python3-pyside2,python3-appdirs,python3-click,python3-pyxdg,python3-colorama'", shell=True, cwd=root, check=True, diff --git a/setup.py b/setup.py index 98656fd21..9634ddfe9 100644 --- a/setup.py +++ b/setup.py @@ -7,15 +7,39 @@ with open("share/version.txt") as f: version = f.read().strip() +qubes_target = os.environ.get("QUBES_TARGET") == "1" +if qubes_target: + print("Target: 
Qubes OS") + def file_list(path): files = [] for filename in os.listdir(path): if os.path.isfile(os.path.join(path, filename)): + if qubes_target and filename.endswith("container.tar.gz"): + continue # ignore container when building a Qubes package files.append(os.path.join(path, filename)) return files +def data_files_list(): + data_files = [ + ( + "share/applications", + ["install/linux/press.freedom.dangerzone.desktop"], + ), + ( + "share/icons/hicolor/64x64/apps", + ["install/linux/press.freedom.dangerzone.png"], + ), + ("share/dangerzone", file_list("share")), + ] + if qubes_target: + # Qubes RPC policy + data_files.append(("/etc/qubes-rpc/", ["qubes/dz.Convert"])) + return data_files + + setuptools.setup( name="dangerzone", version=version, @@ -35,17 +59,7 @@ def file_list(path): "dangerzone.gui", "dangerzone.isolation_provider", ], - data_files=[ - ( - "share/applications", - ["install/linux/press.freedom.dangerzone.desktop"], - ), - ( - "share/icons/hicolor/64x64/apps", - ["install/linux/press.freedom.dangerzone.png"], - ), - ("share/dangerzone", file_list("share")), - ], + data_files=data_files_list(), classifiers=[ "Programming Language :: Python", "Intended Audience :: End Users/Desktop", From 20b24a6c714e6c2b8b418508ded0da8fe82aceaf Mon Sep 17 00:00:00 2001 From: Alex Pyrgiotis Date: Tue, 20 Jun 2023 19:52:43 +0300 Subject: [PATCH 10/10] Add development instructions for Qubes integration Add instructions aimed at developers who want to try out Qubes integration. 
Fixes #411 --- BUILD.md | 188 ++++++++++++++++++++++++++++++++++++++++++++++++++- CHANGELOG.md | 5 +- INSTALL.md | 19 +++++- 3 files changed, 207 insertions(+), 5 deletions(-) diff --git a/BUILD.md b/BUILD.md index aeaa8288b..3e465f0c2 100644 --- a/BUILD.md +++ b/BUILD.md @@ -102,11 +102,193 @@ Create a .rpm: ./install/linux/build-rpm.py ``` -## QubesOS +## Qubes OS -Create a Debian- or Fedora-based standalone VM with at least 8GB of private storage space, and follow the relevant instructions above. +
+ :memo: Expand this section if you want to use containers instead of disposable qubes. +
-Over time, you may need to increase disk space or prune outdated Docker images if you run into build issues on this VM. + Create a Debian or Fedora-based development standalone qube with at least + 8GB of private storage space, and follow the relevant instructions above for + the respective template. + + Remember to set the environment variable `DZ_USE_CONTAINERS=1`, before executing + Dangerzone. + + Over time, you may need to increase disk space or prune outdated container + images if you run into build issues on this VM. +
+ +> :warning: Native Qubes support is in alpha stage, so the instructions below +> require switching between qubes, and are subject to change. + +### Initial Setup + +The following steps must be completed once. Make sure you run them in the +specified qubes. + +#### In `dom0` + +1. Create a new Fedora **template** (`fedora-37-dz`) for Dangerzone development: + + ``` + qvm-clone fedora-37 fedora-37-dz + ``` + + > :bulb: Alternatively, you can use your base Fedora 37 template in the + > following instructions. In that case, replace `fedora-37-dz` with + > `fedora-37` in the steps below. + +2. Create a **disposable**, offline app qube (`dz-dvm`), based on the + `fedora-37-dz` template. This will be the qube where the documents will be + sanitized: + + ``` + qvm-create --class AppVM --label red --template fedora-37-dz \ + --prop netvm="" --prop template_for_dispvms=True \ + dz-dvm + ``` + +3. Create an **app** qube (`dz`) that will be used for Dangerzone development + and initiating the sanitization process: + + ``` + qvm-create --class AppVM --label red --template fedora-37-dz dz + ``` + + > :bulb: Alternatively, you can use a different app qube for Dangerzone + > development. In that case, replace `dz` with the qube of your choice in the + > steps below. + +4. Add an RPC policy (`/etc/qubes/policy.d/50-dangerzone.policy`) that will + allow launching a disposable qube (`dz-dvm`) when Dangerzone converts a + document, with the following contents: + + ``` + dz.Convert * @anyvm @dispvm:dz-dvm allow + dz.ConvertDev * @anyvm @dispvm:dz-dvm allow + ``` + +#### In the `fedora-37-dz` template + +1. Install dependencies: + + ``` + sudo dnf install -y rpm-build pipx qt5-qtbase-gui libreoffice python3-magic \ + tesseract* + ``` + +2. Shutdown the `fedora-37-dz` template: + + ``` + shutdown -h now + ``` + +#### In the `dz` app qube + +1. Clone the Dangerzone project: + + ``` + git clone https://github.com/freedomofpress/dangerzone + ``` + +2. 
Install Poetry using `pipx`: + + ```sh + pipx install poetry + ``` + +3. Change to the `dangerzone` folder, and install the poetry dependencies: + + ``` + poetry install + ``` + + > **Note**: due to an issue with + > [poetry](https://github.com/python-poetry/poetry/issues/1917), if it + > prompts for your keyring, disable the keyring with `keyring --disable` and + > run the command again. + +4. Change to the `dangerzone` folder and copy the Qubes RPC calls into the + template for the **disposable** qube that will be used for document + sanitization (`dz-dvm`): + + ``` + qvm-copy-to-vm dz-dvm qubes/ + ``` + +#### In the `dz-dvm` template + +1. Create the directory that will contain the Dangerzone RPC calls, if it does + not exist already: + + ``` + sudo mkdir -p /rw/usrlocal/etc/qubes-rpc/ + ``` + +2. Move the files we copied in the previous step to their proper place: + + ``` + sudo cp ~/QubesIncoming/dz/qubes/* /rw/usrlocal/etc/qubes-rpc/ + ``` + +3. Shutdown the `dz-dvm` template: + + ``` + shutdown -h now + ``` + +### Developing Dangerzone + +From here on, developing Dangerzone is similar to other Linux platforms. You +can run the following commands in the `dz` app qube: + +```sh +# start a shell in the virtual environment +poetry shell + +# run the CLI +./dev_scripts/dangerzone-cli --help + +# run the GUI +./dev_scripts/dangerzone +``` + +Create a .rpm: + +```sh +./install/linux/build-rpm.py --qubes +``` + +For changes in the server-side components, you can simply edit them locally, +and they will be mirrored to the disposable qube through the `dz.ConvertDev` +RPC call. + +The only reason to update the `fedora-37-dz` template from there on is if: +1. The project requires new server-side components. +2. The code for `dz.ConvertDev` needs to be updated. Copy the updated file + as we've shown in the steps above. 
+ +### Installing Dangerzone system-wide + +If you want to test the .rpm you just created, you can do the following: + +On the `dz` app qube, copy the built `dangerzone.rpm` to `fedora-37-dz` +template: + +``` +qvm-copy-to-vm fedora-37-dz dist/dangerzone*.noarch.rpm +``` + +On the `fedora-37-dz` template, install the copied .rpm: + +``` +sudo dnf install -y ~/QubesIncoming/dz/dangerzone-*.rpm +``` + +Shutdown the `fedora-37-dz` template and the `dz` app qube, and then you can +refresh the applications on the `dz` qube, find Dangerzone in the list, and use +it to convert a document. ## macOS diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d2b6fcd3..5918c50b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,8 +9,11 @@ since 0.4.1, and this project adheres to [Semantic Versioning](https://semver.or ### Added +- Platform support: Alpha integration with Qubes OS ([issue #411](https://github.com/freedomofpress/dangerzone/issues/411)) + ### Removed -- Platform support: Drop Fedora 36, since it's end-of-life ([issues #420](https://github.com/freedomofpress/dangerzone/issues/420)) + +- Platform support: Drop Fedora 36, since it's end-of-life ([issue #420](https://github.com/freedomofpress/dangerzone/issues/420)) ### Fixed diff --git a/INSTALL.md b/INSTALL.md index 26efb2fde..de505d624 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -8,6 +8,7 @@ Dangerzone is available for: - Fedora 38 - Fedora 37 - Fedora 36 +- Qubes OS (alpha support) ### Ubuntu, Debian @@ -117,8 +118,24 @@ After confirming that it matches, type `y` (for yes) and the installation should +### Qubes OS +
+ :memo: Expand this section if you want to use containers instead of disposable qubes. +
+ + Create a Debian or Fedora-based development standalone qube with at least + 8GB of private storage space, and follow the relevant instructions above for + the respective template. + + Remember to set the environment variable `DZ_USE_CONTAINERS=1`, before + executing Dangerzone. +
+ +> :warning: Native Qubes support is in alpha stage, so we don't have official +> installation instructions yet. If you want to try out Dangerzone with native +> Qubes support, check out our [build instructions](BUILD.md#qubes-os) instead. ## Build from source -If you'd like to build from source, follow the [build instructions](https://github.com/freedomofpress/dangerzone/blob/master/BUILD.md). +If you'd like to build from source, follow the [build instructions](BUILD.md).