Skip to content

Commit

Permalink
Stream pages in containers (merge isolation providers)
Browse files Browse the repository at this point in the history
Merge the Qubes and Containers isolation providers into a superclass called
"ProcessBasedIsolationProviders" by streaming pages in containers,
exclusively in the first conversion process. The commit is rather large due
to the multiple interdependencies of the code, which make it difficult to
split into several commits.

The main conversion method (_convert) now in the superclass simply calls
two methods:
  - doc_to_pixels()
  - pixels_to_pdf()

Critically, doc_to_pixels is implemented in the superclass, diverging
only in a specialized method called "get_doc_to_pixels_proc()". This
method obtains the process that communicates with the isolation provider
(container / disp VM): via `podman/docker` on Containers and via qrexec
on Qubes.

Known regressions:
  - progress reports stopped working on containers

Fixes #443
  • Loading branch information
deeplow authored and apyrgio committed Feb 6, 2024
1 parent 37fbc79 commit f1a5c43
Show file tree
Hide file tree
Showing 11 changed files with 295 additions and 429 deletions.
9 changes: 2 additions & 7 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,5 @@ COPY conversion /opt/dangerzone/dangerzone/conversion
RUN adduser -s /bin/sh -D dangerzone
USER dangerzone

# /tmp/input_file is where the first convert expects the input file to be, and
# /tmp where it will write the pixel files
#
# /dangerzone is where the second script expects files to be put by the first one
#
# /safezone is where the wrapper eventually moves the sanitized files.
VOLUME /dangerzone /tmp/input_file /safezone
# /safezone is a directory through which Pixels to PDF receives files
VOLUME /safezone
56 changes: 45 additions & 11 deletions dangerzone/conversion/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import sys
import time
from abc import abstractmethod
from typing import Callable, Dict, List, Optional, Tuple, Union
from typing import Callable, Dict, List, Optional, TextIO, Tuple, Union

TIMEOUT_PER_PAGE: float = 30 # (seconds)
TIMEOUT_PER_MB: float = 30 # (seconds)
Expand Down Expand Up @@ -58,6 +58,49 @@ def __init__(self, progress_callback: Optional[Callable] = None) -> None:
self.progress_callback = progress_callback
self.captured_output: bytes = b""

@classmethod
def _read_bytes(cls) -> bytes:
    """Return everything read from the process's binary standard input.

    Raises:
        EOFError: if the underlying ``read()`` call yields ``None`` instead
            of a bytes object.
    """
    payload = sys.stdin.buffer.read()
    if payload is not None:
        return payload
    raise EOFError

@classmethod
def _write_bytes(cls, data: bytes, file: TextIO = sys.stdout) -> None:
    """Write raw *data* to the binary buffer underlying the text stream *file*.

    The stream is not flushed here.
    """
    raw_stream = file.buffer
    raw_stream.write(data)

@classmethod
def _write_text(cls, text: str, file: TextIO = sys.stdout) -> None:
    """Encode *text* with ``str.encode()`` defaults and write the bytes to *file*."""
    encoded = text.encode()
    cls._write_bytes(encoded, file=file)

@classmethod
def _write_int(cls, num: int, file: TextIO = sys.stdout) -> None:
    """Write *num* to *file* as a 2-byte unsigned big-endian integer.

    Raises:
        OverflowError: if *num* is negative or does not fit in two bytes.
    """
    # Pass the byte order explicitly: `byteorder` only gained its "big"
    # default in Python 3.11, so omitting it raises TypeError on older
    # interpreters. "big" keeps the wire format identical on 3.11+.
    encoded = num.to_bytes(2, "big", signed=False)
    cls._write_bytes(encoded, file=file)

# ==== ASYNC METHODS ====
# We run sync methods in async wrappers, because pure async methods are more difficult:
# https://stackoverflow.com/a/52702646
#
# In practice, because they are I/O bound and we don't have many running concurrently,
# they shouldn't cause a problem.

@classmethod
async def read_bytes(cls) -> bytes:
    """Async wrapper that runs the blocking ``_read_bytes`` in a worker thread."""
    payload = await asyncio.to_thread(cls._read_bytes)
    return payload

@classmethod
async def write_bytes(cls, data: bytes, file: TextIO = sys.stdout) -> None:
    """Async wrapper that runs the blocking ``_write_bytes`` in a worker thread."""
    pending_write = asyncio.to_thread(cls._write_bytes, data, file=file)
    return await pending_write

@classmethod
async def write_text(cls, text: str, file: TextIO = sys.stdout) -> None:
    """Async wrapper that runs the blocking ``_write_text`` in a worker thread."""
    pending_write = asyncio.to_thread(cls._write_text, text, file=file)
    return await pending_write

@classmethod
async def write_int(cls, num: int, file: TextIO = sys.stdout) -> None:
    """Async wrapper that runs the blocking ``_write_int`` in a worker thread."""
    pending_write = asyncio.to_thread(cls._write_int, num, file=file)
    return await pending_write

async def read_stream(
self, sr: asyncio.StreamReader, callback: Optional[Callable] = None
) -> bytes:
Expand Down Expand Up @@ -150,13 +193,4 @@ async def convert(self) -> None:
pass

def update_progress(self, text: str, *, error: bool = False) -> None:
if running_on_qubes():
if self.progress_callback:
self.progress_callback(error, text, int(self.percentage))
else:
print(
json.dumps(
{"error": error, "text": text, "percentage": int(self.percentage)}
)
)
sys.stdout.flush()
pass
57 changes: 22 additions & 35 deletions dangerzone/conversion/doc_to_pixels.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import re
import shutil
import sys
from typing import Dict, List, Optional
from typing import Dict, List, Optional, TextIO

import fitz
import magic
Expand All @@ -23,26 +23,17 @@


class DocumentToPixels(DangerzoneConverter):
# XXX: These functions write page data and metadata to a separate file. For now,
# they act as an anchor point for Qubes to stream back page data/metadata in
# real time. In the future, they will be completely replaced by their streaming
# counterparts. See:
#
# https://github.com/freedomofpress/dangerzone/issues/443
async def write_page_count(self, count: int) -> None:
pass
return await self.write_int(count)

async def write_page_width(self, width: int, filename: str) -> None:
with open(filename, "w") as f:
f.write(str(width))
return await self.write_int(width)

async def write_page_height(self, height: int, filename: str) -> None:
with open(filename, "w") as f:
f.write(str(height))
return await self.write_int(height)

async def write_page_data(self, data: bytes, filename: str) -> None:
with open(filename, "wb") as f:
f.write(data)
return await self.write_bytes(data)

async def convert(self) -> None:
conversions: Dict[str, Dict[str, Optional[str]]] = {
Expand Down Expand Up @@ -255,20 +246,6 @@ async def convert(self) -> None:
await self.write_page_height(pix.height, height_filename)
await self.write_page_data(rgb_buf, rgb_filename)

final_files = (
glob.glob("/tmp/page-*.rgb")
+ glob.glob("/tmp/page-*.width")
+ glob.glob("/tmp/page-*.height")
)

# XXX: Sanity check to avoid situations like #560.
if not running_on_qubes() and len(final_files) != 3 * doc.page_count:
raise errors.PageCountMismatch()

# Move converted files into /tmp/dangerzone
for filename in final_files:
shutil.move(filename, "/tmp/dangerzone")

self.update_progress("Converted document to pixels")

async def install_libreoffice_ext(self, libreoffice_ext: str) -> None:
Expand Down Expand Up @@ -298,18 +275,28 @@ def detect_mime_type(self, path: str) -> str:
return mime_type


async def main() -> int:
converter = DocumentToPixels()
async def main() -> None:
try:
data = await DocumentToPixels.read_bytes()
except EOFError:
sys.exit(1)

with open("/tmp/input_file", "wb") as f:
f.write(data)

try:
converter = DocumentToPixels()
await converter.convert()
error_code = 0 # Success!
except errors.ConversionException as e: # Expected Errors
error_code = e.error_code
except errors.ConversionException as e:
await DocumentToPixels.write_bytes(str(e).encode(), file=sys.stderr)
sys.exit(e.error_code)
except Exception as e:
converter.update_progress(str(e), error=True)
await DocumentToPixels.write_bytes(str(e).encode(), file=sys.stderr)
error_code = errors.UnexpectedConversionError.error_code
return error_code
sys.exit(error_code)

# Write debug information
await DocumentToPixels.write_bytes(converter.captured_output, file=sys.stderr)


if __name__ == "__main__":
Expand Down
108 changes: 0 additions & 108 deletions dangerzone/conversion/doc_to_pixels_qubes_wrapper.py

This file was deleted.

18 changes: 15 additions & 3 deletions dangerzone/conversion/pixels_to_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,20 @@ async def convert(
) -> None:
self.percentage = 50.0
if tempdir is None:
tempdir = "/tmp"
tempdir = "/safezone"

# XXX lazy loading of fitz module to avoid import issues on non-Qubes systems
import fitz

num_pages = len(glob.glob(f"{tempdir}/dangerzone/page-*.rgb"))
num_pages = len(glob.glob(f"{tempdir}/pixels/page-*.rgb"))
total_size = 0.0

safe_doc = fitz.Document()

# Convert RGB files to PDF files
percentage_per_page = 45.0 / num_pages
for page_num in range(1, num_pages + 1):
filename_base = f"{tempdir}/dangerzone/page-{page_num}"
filename_base = f"{tempdir}/pixels/page-{page_num}"
rgb_filename = f"{filename_base}.rgb"
width_filename = f"{filename_base}.width"
height_filename = f"{filename_base}.height"
Expand Down Expand Up @@ -90,6 +90,18 @@ async def convert(

safe_doc.save(safe_pdf_path, deflate_images=True)

def update_progress(self, text: str, *, error: bool = False) -> None:
    """Report conversion progress.

    When ``running_on_qubes()`` is false, print a JSON object with the keys
    ``error``, ``text`` and ``percentage`` to stdout and flush it. Otherwise
    hand the same three values to ``self.progress_callback``, if one is set.
    """
    if not running_on_qubes():
        report = {"error": error, "text": text, "percentage": int(self.percentage)}
        print(json.dumps(report))
        sys.stdout.flush()
        return

    # On Qubes nothing is printed; the report goes to the callback only.
    if self.progress_callback:
        self.progress_callback(error, text, int(self.percentage))


async def main() -> int:
ocr_lang = os.environ.get("OCR_LANGUAGE") if os.environ.get("OCR") == "1" else None
Expand Down
Loading

0 comments on commit f1a5c43

Please sign in to comment.