container: Introduce proportional timeouts

Introduce proportional timeouts in the container code, where the conversion logic runs. Previously, we had a single timeout for each command (120 seconds), which didn't scale well either with the number of pages in a document, or with the size of the document. In this commit, we look into each operation, and we're trying to figure out the following: 1. What's the number of pages we will operate on? 2. How large is the document? Knowing the above, we can break down a command into multiple operations, at least conceptually. Having a number of operations and a sane timeout value per operation (10 seconds), we can multiply those and reach to a timeout that fits the command better. Refs #327
freedomofpress · Feb 15, 2023 · f1f8212 · f1f8212
1 parent c263264
commit f1f8212
Showing 1 changed file with 89 additions and 22 deletions.
diff --git a/container/dangerzone.py b/container/dangerzone.py
@@ -25,11 +25,8 @@
 
 import magic
 
-# timeout in seconds for any single subprocess
-DEFAULT_TIMEOUT: float = 120
-
-# timeout in seconds for compressing a single page of the final document
-COMPRESSION_TIMEOUT: float = 10
+TIMEOUT_PER_PAGE: float = 10  # (seconds)
+TIMEOUT_PER_MB: float = 10  # (seconds)
 
 
 async def read_stream(sr: asyncio.StreamReader, callback: Callable = None) -> bytes:
@@ -60,7 +57,7 @@ async def run_command(
     *,
     error_message: str,
     timeout_message: str,
-    timeout: float = DEFAULT_TIMEOUT,
+    timeout: Optional[float],
     stdout_callback: Callable = None,
     stderr_callback: Callable = None,
 ) -> Tuple[bytes, bytes]:
@@ -108,6 +105,27 @@ class DangerzoneConverter:
     def __init__(self) -> None:
         self.percentage: float = 0.0
 
+    def calculate_timeout(
+        self, size: float, pages: Optional[float] = None
+    ) -> Optional[float]:
+        """Calculate the timeout for a command.
+
+        The timeout calculation takes two factors in mind:
+
+        1. The size (in MiBs) of the dataset (document, multiple pages).
+        2. The number of pages in the dataset.
+
+        It then calculates proportional timeout values based on the above, and keeps the
+        large one.  This way, we can handle several corner cases:
+
+        * Documents with lots of pages, but small file size.
+        * Single images with large file size.
+        """
+        timeout = TIMEOUT_PER_MB * size
+        if pages:
+            timeout = max(timeout, TIMEOUT_PER_PAGE * pages)
+        return timeout
+
     async def document_to_pixels(self) -> None:
 
         conversions: Dict[str, Dict[str, Optional[str]]] = {
@@ -187,6 +205,14 @@ async def document_to_pixels(self) -> None:
         if mime_type not in conversions:
             raise ValueError("The document format is not supported")
 
+        # Get file size (in MiB)
+        size = os.path.getsize("/tmp/input_file") / 1024**2
+
+        # Calculate timeout for the first few file operations. The difference with the
+        # subsequent ones is that we don't know the number of pages, before we have a
+        # PDF at hand, so we rely on size heuristics.
+        timeout = self.calculate_timeout(size)
+
         # Convert input document to PDF
         conversion = conversions[mime_type]
         if conversion["type"] is None:
@@ -205,7 +231,11 @@ async def document_to_pixels(self) -> None:
             await run_command(
                 args,
                 error_message="Conversion to PDF with LibreOffice failed",
-                timeout_message=f"Error converting document to PDF, LibreOffice timed out after {DEFAULT_TIMEOUT} seconds",
+                timeout_message=(
+                    "Error converting document to PDF, LibreOffice timed out after"
+                    f" {timeout} seconds"
+                ),
+                timeout=timeout,
             )
             pdf_filename = "/tmp/input_file.pdf"
         elif conversion["type"] == "convert":
@@ -219,7 +249,11 @@ async def document_to_pixels(self) -> None:
             await run_command(
                 args,
                 error_message="Conversion to PDF with GraphicsMagick failed",
-                timeout_message=f"Error converting document to PDF, GraphicsMagick timed out after {DEFAULT_TIMEOUT} seconds",
+                timeout_message=(
+                    "Error converting document to PDF, GraphicsMagick timed out after"
+                    f" {timeout} seconds"
+                ),
+                timeout=timeout,
             )
             pdf_filename = "/tmp/input_file.pdf"
         else:
@@ -233,16 +267,21 @@ async def document_to_pixels(self) -> None:
         stdout, _ = await run_command(
             ["pdfinfo", pdf_filename],
             error_message="PDF file is corrupted",
-            timeout_message=f"Extracting metadata from PDF timed out after 1 second",
-            timeout=1,
+            timeout_message=(
+                f"Extracting metadata from PDF timed out after {timeout} second"
+            ),
+            timeout=timeout,
         )
 
         search = re.search(r"Pages:\s*(\d+)\s*\n", stdout.decode())
         if search is not None:
-            self.num_pages: int = int(search.group(1))
+            num_pages: int = int(search.group(1))
         else:
             raise ValueError("Number of pages could not be extracted from PDF")
 
+        # Get a more precise timeout, based on the number of pages
+        timeout = self.calculate_timeout(size, num_pages)
+
         def pdftoppm_progress_callback(line: bytes) -> None:
             """Function called for every line the 'pdftoppm' command outputs
 
@@ -308,8 +347,6 @@ def pdftoppm_progress_callback(line: bytes) -> None:
 
         page_base = "/tmp/page"
 
-        # Convert to PPM, which is essentially an RGB format
-        pdftoppm_timeout = 1.0 * self.num_pages
         await run_command(
             [
                 "pdftoppm",
@@ -318,9 +355,12 @@ def pdftoppm_progress_callback(line: bytes) -> None:
                 "-progress",
             ],
             error_message="Conversion from PDF to PPM failed",
-            timeout_message=f"Error converting from PDF to PPM, pdftoppm timed out after {pdftoppm_timeout} seconds",
+            timeout_message=(
+                f"Error converting from PDF to PPM, pdftoppm timed out after {timeout}"
+                " seconds"
+            ),
             stderr_callback=pdftoppm_progress_callback,
-            timeout=pdftoppm_timeout,
+            timeout=timeout,
         )
 
         self.update_progress("Converted document to pixels")
@@ -337,6 +377,7 @@ async def pixels_to_pdf(self) -> None:
         self.percentage = 50.0
 
         num_pages = len(glob.glob("/dangerzone/page-*.rgb"))
+        total_size = 0.0
 
         # Convert RGB files to PDF files
         percentage_per_page = 45.0 / num_pages
@@ -354,6 +395,11 @@ async def pixels_to_pdf(self) -> None:
             with open(height_filename) as f:
                 height = f.read().strip()
 
+            # The first few operations happen on a per-page basis.
+            page_size = os.path.getsize(filename_base + ".rgb") / 1024**2
+            total_size += page_size
+            timeout = self.calculate_timeout(page_size, 1)
+
             if os.environ.get("OCR") == "1":  # OCR the document
                 self.update_progress(
                     f"Converting page {page}/{num_pages} from pixels to searchable PDF"
@@ -370,7 +416,11 @@ async def pixels_to_pdf(self) -> None:
                         f"png:{png_filename}",
                     ],
                     error_message=f"Page {page}/{num_pages} conversion to PNG failed",
-                    timeout_message=f"Error converting pixels to PNG, convert timed out after {DEFAULT_TIMEOUT} seconds",
+                    timeout_message=(
+                        "Error converting pixels to PNG, convert timed out after"
+                        f" {timeout} seconds"
+                    ),
+                    timeout=timeout,
                 )
                 await run_command(
                     [
@@ -384,7 +434,11 @@ async def pixels_to_pdf(self) -> None:
                         "pdf",
                     ],
                     error_message=f"Page {page}/{num_pages} OCR failed",
-                    timeout_message=f"Error converting PNG to searchable PDF, tesseract timed out after {DEFAULT_TIMEOUT} seconds",
+                    timeout_message=(
+                        "Error converting PNG to searchable PDF, tesseract timed out"
+                        f" after {timeout} seconds"
+                    ),
+                    timeout=timeout,
                 )
 
             else:  # Don't OCR
@@ -403,11 +457,19 @@ async def pixels_to_pdf(self) -> None:
                         f"pdf:{pdf_filename}",
                     ],
                     error_message=f"Page {page}/{num_pages} conversion to PDF failed",
-                    timeout_message=f"Error converting RGB to PDF, convert timed out after {DEFAULT_TIMEOUT} seconds",
+                    timeout_message=(
+                        "Error converting RGB to PDF, convert timed out after"
+                        f" {timeout} seconds"
+                    ),
+                    timeout=timeout,
                 )
 
             self.percentage += percentage_per_page
 
+        # Next operations apply to the all the pages, so we need to recalculate the
+        # timeout.
+        timeout = self.calculate_timeout(total_size, num_pages)
+
         # Merge pages into a single PDF
         self.update_progress(f"Merging {num_pages} pages into a single PDF")
         args = ["pdfunite"]
@@ -417,19 +479,24 @@ async def pixels_to_pdf(self) -> None:
         await run_command(
             args,
             error_message="Merging pages into a single PDF failed",
-            timeout_message=f"Error merging pages into a single PDF, pdfunite timed out after {DEFAULT_TIMEOUT} seconds",
+            timeout_message=(
+                "Error merging pages into a single PDF, pdfunite timed out after"
+                f" {timeout} seconds"
+            ),
+            timeout=timeout,
         )
 
         self.percentage += 2
 
         # Compress
         self.update_progress("Compressing PDF")
-        compress_timeout = num_pages * COMPRESSION_TIMEOUT
         await run_command(
             ["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"],
-            timeout_message=f"Error compressing PDF, ps2pdf timed out after {compress_timeout} seconds",
             error_message="Compressing PDF failed",
-            timeout=compress_timeout,
+            timeout_message=(
+                f"Error compressing PDF, ps2pdf timed out after {timeout} seconds"
+            ),
+            timeout=timeout,
         )
 
         self.percentage = 100.0