Skip to content

Commit

Permalink
container: Introduce proportional timeouts
Browse files Browse the repository at this point in the history
Introduce proportional timeouts in the container code, where the
conversion logic runs.

Previously, we had a single timeout for each command (120 seconds),
which didn't scale well either with the number of pages in a document,
or with the size of the document.

In this commit, we look into each operation and try to figure out the
following:

1. What's the number of pages we will operate on?
2. How large is the document?

Knowing the above, we can break down a command into multiple operations,
at least conceptually. Having a number of operations and a sane timeout
value per operation (10 seconds), we can multiply those and arrive at a
timeout that fits the command better.

Refs #327
  • Loading branch information
apyrgio committed Feb 15, 2023
1 parent c263264 commit f1f8212
Showing 1 changed file with 89 additions and 22 deletions.
111 changes: 89 additions & 22 deletions container/dangerzone.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,8 @@

import magic

# timeout in seconds for any single subprocess
DEFAULT_TIMEOUT: float = 120

# timeout in seconds for compressing a single page of the final document
COMPRESSION_TIMEOUT: float = 10
TIMEOUT_PER_PAGE: float = 10 # (seconds)
TIMEOUT_PER_MB: float = 10 # (seconds)


async def read_stream(sr: asyncio.StreamReader, callback: Callable = None) -> bytes:
Expand Down Expand Up @@ -60,7 +57,7 @@ async def run_command(
*,
error_message: str,
timeout_message: str,
timeout: float = DEFAULT_TIMEOUT,
timeout: Optional[float],
stdout_callback: Callable = None,
stderr_callback: Callable = None,
) -> Tuple[bytes, bytes]:
Expand Down Expand Up @@ -108,6 +105,27 @@ class DangerzoneConverter:
def __init__(self) -> None:
    """Create a converter whose progress starts at zero."""
    # Conversion progress, expressed as a percentage (starts at 0.0).
    self.percentage: float = 0.0

def calculate_timeout(
    self, size: float, pages: Optional[float] = None
) -> Optional[float]:
    """Calculate a proportional timeout (in seconds) for a command.

    The calculation takes two factors into account:

    1. The size (in MiB) of the dataset (a document, or one or more pages).
    2. The number of pages in the dataset, when it is known.

    A proportional timeout is computed for each factor and the larger one is
    kept. This way, we can handle several corner cases:

    * Documents with lots of pages, but small file size.
    * Single images with large file size.

    The result is never lower than the allowance for a single page, so that
    very small (or empty) inputs do not end up with a timeout close to zero.

    Args:
        size: Size of the dataset, in MiB.
        pages: Number of pages in the dataset. ``None`` (or 0) means the page
            count is not known, in which case only the size is considered.

    Returns:
        The timeout to use for the command, in seconds.
    """
    # Any command works on at least one page, so grant at least one page's
    # worth of time. Without this floor, a sub-MiB input would get a fraction
    # of a second (and an empty file exactly 0) when the page count is unknown.
    timeout = max(TIMEOUT_PER_MB * size, TIMEOUT_PER_PAGE)
    if pages:
        timeout = max(timeout, TIMEOUT_PER_PAGE * pages)
    return timeout

async def document_to_pixels(self) -> None:

conversions: Dict[str, Dict[str, Optional[str]]] = {
Expand Down Expand Up @@ -187,6 +205,14 @@ async def document_to_pixels(self) -> None:
if mime_type not in conversions:
raise ValueError("The document format is not supported")

# Get file size (in MiB)
size = os.path.getsize("/tmp/input_file") / 1024**2

# Calculate timeout for the first few file operations. Unlike the subsequent
# ones, we don't know the number of pages until we have a PDF at hand, so we
# rely on size heuristics.
timeout = self.calculate_timeout(size)

# Convert input document to PDF
conversion = conversions[mime_type]
if conversion["type"] is None:
Expand All @@ -205,7 +231,11 @@ async def document_to_pixels(self) -> None:
await run_command(
args,
error_message="Conversion to PDF with LibreOffice failed",
timeout_message=f"Error converting document to PDF, LibreOffice timed out after {DEFAULT_TIMEOUT} seconds",
timeout_message=(
"Error converting document to PDF, LibreOffice timed out after"
f" {timeout} seconds"
),
timeout=timeout,
)
pdf_filename = "/tmp/input_file.pdf"
elif conversion["type"] == "convert":
Expand All @@ -219,7 +249,11 @@ async def document_to_pixels(self) -> None:
await run_command(
args,
error_message="Conversion to PDF with GraphicsMagick failed",
timeout_message=f"Error converting document to PDF, GraphicsMagick timed out after {DEFAULT_TIMEOUT} seconds",
timeout_message=(
"Error converting document to PDF, GraphicsMagick timed out after"
f" {timeout} seconds"
),
timeout=timeout,
)
pdf_filename = "/tmp/input_file.pdf"
else:
Expand All @@ -233,16 +267,21 @@ async def document_to_pixels(self) -> None:
stdout, _ = await run_command(
["pdfinfo", pdf_filename],
error_message="PDF file is corrupted",
timeout_message=f"Extracting metadata from PDF timed out after 1 second",
timeout=1,
timeout_message=(
f"Extracting metadata from PDF timed out after {timeout} second"
),
timeout=timeout,
)

search = re.search(r"Pages:\s*(\d+)\s*\n", stdout.decode())
if search is not None:
self.num_pages: int = int(search.group(1))
num_pages: int = int(search.group(1))
else:
raise ValueError("Number of pages could not be extracted from PDF")

# Get a more precise timeout, based on the number of pages
timeout = self.calculate_timeout(size, num_pages)

def pdftoppm_progress_callback(line: bytes) -> None:
"""Function called for every line the 'pdftoppm' command outputs
Expand Down Expand Up @@ -308,8 +347,6 @@ def pdftoppm_progress_callback(line: bytes) -> None:

page_base = "/tmp/page"

# Convert to PPM, which is essentially an RGB format
pdftoppm_timeout = 1.0 * self.num_pages
await run_command(
[
"pdftoppm",
Expand All @@ -318,9 +355,12 @@ def pdftoppm_progress_callback(line: bytes) -> None:
"-progress",
],
error_message="Conversion from PDF to PPM failed",
timeout_message=f"Error converting from PDF to PPM, pdftoppm timed out after {pdftoppm_timeout} seconds",
timeout_message=(
f"Error converting from PDF to PPM, pdftoppm timed out after {timeout}"
" seconds"
),
stderr_callback=pdftoppm_progress_callback,
timeout=pdftoppm_timeout,
timeout=timeout,
)

self.update_progress("Converted document to pixels")
Expand All @@ -337,6 +377,7 @@ async def pixels_to_pdf(self) -> None:
self.percentage = 50.0

num_pages = len(glob.glob("/dangerzone/page-*.rgb"))
total_size = 0.0

# Convert RGB files to PDF files
percentage_per_page = 45.0 / num_pages
Expand All @@ -354,6 +395,11 @@ async def pixels_to_pdf(self) -> None:
with open(height_filename) as f:
height = f.read().strip()

# The first few operations happen on a per-page basis.
page_size = os.path.getsize(filename_base + ".rgb") / 1024**2
total_size += page_size
timeout = self.calculate_timeout(page_size, 1)

if os.environ.get("OCR") == "1": # OCR the document
self.update_progress(
f"Converting page {page}/{num_pages} from pixels to searchable PDF"
Expand All @@ -370,7 +416,11 @@ async def pixels_to_pdf(self) -> None:
f"png:{png_filename}",
],
error_message=f"Page {page}/{num_pages} conversion to PNG failed",
timeout_message=f"Error converting pixels to PNG, convert timed out after {DEFAULT_TIMEOUT} seconds",
timeout_message=(
"Error converting pixels to PNG, convert timed out after"
f" {timeout} seconds"
),
timeout=timeout,
)
await run_command(
[
Expand All @@ -384,7 +434,11 @@ async def pixels_to_pdf(self) -> None:
"pdf",
],
error_message=f"Page {page}/{num_pages} OCR failed",
timeout_message=f"Error converting PNG to searchable PDF, tesseract timed out after {DEFAULT_TIMEOUT} seconds",
timeout_message=(
"Error converting PNG to searchable PDF, tesseract timed out"
f" after {timeout} seconds"
),
timeout=timeout,
)

else: # Don't OCR
Expand All @@ -403,11 +457,19 @@ async def pixels_to_pdf(self) -> None:
f"pdf:{pdf_filename}",
],
error_message=f"Page {page}/{num_pages} conversion to PDF failed",
timeout_message=f"Error converting RGB to PDF, convert timed out after {DEFAULT_TIMEOUT} seconds",
timeout_message=(
"Error converting RGB to PDF, convert timed out after"
f" {timeout} seconds"
),
timeout=timeout,
)

self.percentage += percentage_per_page

# The next operations apply to all the pages, so we need to recalculate the
# timeout.
timeout = self.calculate_timeout(total_size, num_pages)

# Merge pages into a single PDF
self.update_progress(f"Merging {num_pages} pages into a single PDF")
args = ["pdfunite"]
Expand All @@ -417,19 +479,24 @@ async def pixels_to_pdf(self) -> None:
await run_command(
args,
error_message="Merging pages into a single PDF failed",
timeout_message=f"Error merging pages into a single PDF, pdfunite timed out after {DEFAULT_TIMEOUT} seconds",
timeout_message=(
"Error merging pages into a single PDF, pdfunite timed out after"
f" {timeout} seconds"
),
timeout=timeout,
)

self.percentage += 2

# Compress
self.update_progress("Compressing PDF")
compress_timeout = num_pages * COMPRESSION_TIMEOUT
await run_command(
["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"],
timeout_message=f"Error compressing PDF, ps2pdf timed out after {compress_timeout} seconds",
error_message="Compressing PDF failed",
timeout=compress_timeout,
timeout_message=(
f"Error compressing PDF, ps2pdf timed out after {timeout} seconds"
),
timeout=timeout,
)

self.percentage = 100.0
Expand Down

0 comments on commit f1f8212

Please sign in to comment.