freedomofpress · deeplow · Aug 22, 2023 · Jun 22, 2023 · Aug 21, 2023 · Aug 8, 2023
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "tests/test_docs_large"]
+	path = tests/test_docs_large
+	url = https://github.com/freedomofpress/dangerzone-test-set
diff --git a/Makefile b/Makefile
@@ -1,25 +1,30 @@
+LARGE_TEST_REPO_DIR:=tests/test_docs_large
+GIT_DESC=$$(git describe)
+JUNIT_FLAGS := --capture=sys -o junit_logging=all
+
 .PHONY: lint-black
 lint-black: ## check python source code formatting issues, with black
-	black --check --diff --exclude dev_scripts/envs ./
+	black --check --diff --exclude "dev_scripts/envs|$(LARGE_TEST_REPO_DIR)" ./
 
 .PHONY: lint-black-apply
 lint-black-apply: ## apply black's source code formatting suggestions
-	black --exclude dev_scripts/envs ./
+	black --exclude "dev_scripts/envs|$(LARGE_TEST_REPO_DIR)" ./
 
 .PHONY: lint-isort
 lint-isort: ## check imports are organized, with isort
-	isort --check-only --skip dev_scripts/envs ./
+	isort --check-only --skip dev_scripts/envs --skip $(LARGE_TEST_REPO_DIR) ./
 
 .PHONY: lint-isort-apply
 lint-isort-apply: ## apply isort's imports organization suggestions
-	isort --skip dev_scripts/envs ./
+	isort --skip dev_scripts/envs --skip $(LARGE_TEST_REPO_DIR) ./
 
 MYPY_ARGS := --ignore-missing-imports \
 			 --disallow-incomplete-defs \
 			 --disallow-untyped-defs \
 			 --show-error-codes \
 			 --warn-unreachable \
-			 --warn-unused-ignores
+			 --warn-unused-ignores \
+			 --exclude $(LARGE_TEST_REPO_DIR)/
 
 mypy-host:
 	mypy $(MYPY_ARGS) dangerzone
@@ -41,8 +46,25 @@ test:
 	# shared state.
 	# See more in https://github.com/freedomofpress/dangerzone/issues/493
 	pytest --co -q tests/gui | grep -v ' collected' | xargs -n 1 pytest -v
-	pytest -v --cov --ignore dev_scripts --ignore tests/gui
+	pytest -v --cov --ignore dev_scripts --ignore tests/gui --ignore tests/test_large_set.py
+
+
+.PHONY: test-large-requirements test-large-init
+test-large-requirements: ## check that git-lfs and xmllint are installed
+	@git-lfs --version || (echo "ERROR: you need to install 'git-lfs'" && false)
+	@xmllint --version || (echo "ERROR: you need to install 'xmllint'" && false)
+
+test-large-init: test-large-requirements ## init the large test submodule and run its 'clone-docs' target
+	@echo "initializing 'test_docs_large' submodule"
+	git submodule init $(LARGE_TEST_REPO_DIR)
+	git submodule update $(LARGE_TEST_REPO_DIR)
+	cd $(LARGE_TEST_REPO_DIR) && $(MAKE) clone-docs
 
+TEST_LARGE_RESULTS:=$(LARGE_TEST_REPO_DIR)/results/junit/commit_$(GIT_DESC).junit.xml
+.PHONY: test-large
+test-large: test-large-init  ## Run large test set
+	python -m pytest --tb=no tests/test_large_set.py::TestLargeSet -v $(JUNIT_FLAGS) --junitxml=$(TEST_LARGE_RESULTS)
+	python $(LARGE_TEST_REPO_DIR)/report.py $(TEST_LARGE_RESULTS)
 
 # Makefile self-help borrowed from the securedrop-client project
 # Explaination of the below shell command should it ever break.

diff --git a/RELEASE.md b/RELEASE.md
@@ -2,6 +2,14 @@
 
 This section documents the release process. Unless you're a dangerzone developer making a release, you'll probably never need to follow it.
 
+## Large document testing
+
+In parallel with the QA process, the release candidate should be put through the large document tests on a dedicated machine, where they can run overnight.
+
+Follow the instructions in `docs/developer/TESTING.md` to run the tests.
+
+These tests will identify any regressions or improvements in terms of document coverage.
+
 ## QA
 
 To ensure that new releases do not introduce regressions, and support existing

diff --git a/dangerzone/conversion/common.py b/dangerzone/conversion/common.py
@@ -22,84 +22,91 @@ def running_on_qubes() -> bool:
     return os.path.exists("/usr/share/qubes/marker-vm")
 
 
-async def read_stream(
-    sr: asyncio.StreamReader, callback: Optional[Callable] = None
-) -> bytes:
-    """Consume a byte stream line-by-line.
-
-    Read all lines in a stream until EOF. If a user has passed a callback, call it for
-    each line.
-
-    Note that the lines are in bytes, since we can't assume that all command output will
-    be UTF-8 encoded. Higher level commands are advised to decode the output to Unicode,
-    if they know its encoding.
-    """
-    buf = b""
-    while True:
-        line = await sr.readline()
-        if sr.at_eof():
-            break
-        if callback is not None:
-            callback(line)
-        # TODO: This would be a good place to log the received line, mostly for debug
-        # logging.
-        buf += line
-    return buf
-
-
-async def run_command(
-    args: List[str],
-    *,
-    error_message: str,
-    timeout_message: str,
-    timeout: Optional[float],
-    stdout_callback: Optional[Callable] = None,
-    stderr_callback: Optional[Callable] = None,
-) -> Tuple[bytes, bytes]:
-    """Run a command and get its output.
-
-    Run a command using asyncio.subprocess, consume its standard streams, and return its
-    output in bytes.
-
-    :raises RuntimeError: if the process returns a non-zero exit status
-    :raises TimeoutError: if the process times out
-    """
-    # Start the provided command, and return a handle. The command will run in the
-    # background.
-    proc = await asyncio.subprocess.create_subprocess_exec(
-        *args,
-        stdout=asyncio.subprocess.PIPE,
-        stderr=asyncio.subprocess.PIPE,
-    )
-
-    assert proc.stdout is not None
-    assert proc.stderr is not None
-
-    # Create asynchronous tasks that will consume the standard streams of the command,
-    # and call callbacks if necessary.
-    stdout_task = asyncio.create_task(read_stream(proc.stdout, stdout_callback))
-    stderr_task = asyncio.create_task(read_stream(proc.stderr, stderr_callback))
-
-    # Wait until the command has finished, for a specific timeout. Then, verify that the
-    # command has completed successfully. In any other case, raise an exception.
-    try:
-        ret = await asyncio.wait_for(proc.wait(), timeout=timeout)
-    except asyncio.exceptions.TimeoutError:
-        raise TimeoutError(timeout_message)
-    if ret != 0:
-        raise RuntimeError(error_message)
-
-    # Wait until the tasks that consume the command's standard streams have exited as
-    # well, and return their output.
-    stdout = await stdout_task
-    stderr = await stderr_task
-    return (stdout, stderr)
-
-
 class DangerzoneConverter:
     def __init__(self, progress_callback: Optional[Callable] = None) -> None:
         self.percentage: float = 0.0
         self.progress_callback = progress_callback
+        self.captured_output: bytes = b""
+
+    async def read_stream(
+        self, sr: asyncio.StreamReader, callback: Optional[Callable] = None
+    ) -> bytes:
+        """Consume a byte stream line-by-line until EOF and return everything read.
+
+        Each line is appended to `self.captured_output` and, if `callback` is given,
+        passed to it. Lines stay as bytes, since we can't assume that all command output
+        is UTF-8 encoded; callers that know the encoding should decode the result.
+
+        The loop ends on the empty read that `readline()` returns at EOF, rather than on
+        `at_eof()`, which can already be true once the final line has been consumed.
+        """
+        buf = b""
+        while True:
+            line = await sr.readline()
+            if not line:
+                break
+            self.captured_output += line
+            if callback is not None:
+                callback(line)
+            buf += line
+        return buf
+
+    async def run_command(
+        self,
+        args: List[str],
+        *,
+        error_message: str,
+        timeout_message: str,
+        timeout: Optional[float],
+        stdout_callback: Optional[Callable] = None,
+        stderr_callback: Optional[Callable] = None,
+    ) -> Tuple[bytes, bytes]:
+        """Run a command and return its (stdout, stderr) output in bytes.
+
+        The command is started with asyncio.subprocess, its standard streams are consumed
+        by background tasks, and the command line is recorded in `self.captured_output`.
+
+        :raises RuntimeError: if the process returns a non-zero exit status
+        :raises TimeoutError: if the process times out
+        """
+        # Start the provided command, and return a handle. The command will run in the
+        # background.
+        proc = await asyncio.subprocess.create_subprocess_exec(
+            *args,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+
+        # Record the command line in the captured output, so that the stream lines that
+        # follow it can be traced back to the command that produced them.
+        self.captured_output += f"[COMMAND] {' '.join(args)}\n".encode()
+
+        assert proc.stdout is not None
+        assert proc.stderr is not None
+
+        # Create asynchronous tasks that will consume the standard streams of the command,
+        # and call callbacks if necessary.
+        stdout_task = asyncio.create_task(
+            self.read_stream(proc.stdout, stdout_callback)
+        )
+        stderr_task = asyncio.create_task(
+            self.read_stream(proc.stderr, stderr_callback)
+        )
+
+        # Wait for the command to exit, for at most `timeout` seconds, then check its status.
+        # NOTE(review): on timeout or failure the child and reader tasks are not cleaned up.
+        try:
+            ret = await asyncio.wait_for(proc.wait(), timeout=timeout)
+        except asyncio.exceptions.TimeoutError:
+            raise TimeoutError(timeout_message)
+        if ret != 0:
+            raise RuntimeError(error_message)
+
+        # Wait until the tasks that consume the command's standard streams have exited as
+        # well, and return their output.
+        stdout = await stdout_task
+        stderr = await stderr_task
+        return (stdout, stderr)
 
     def calculate_timeout(
         self, size: float, pages: Optional[float] = None

diff --git a/dangerzone/conversion/doc_to_pixels.py b/dangerzone/conversion/doc_to_pixels.py
@@ -18,7 +18,7 @@
 
 import magic
 
-from .common import DangerzoneConverter, run_command, running_on_qubes
+from .common import DangerzoneConverter, running_on_qubes
 
 
 class DocumentToPixels(DangerzoneConverter):
@@ -189,7 +189,7 @@ async def convert(self) -> None:
                 "/tmp",
                 "/tmp/input_file",
             ]
-            await run_command(
+            await self.run_command(
                 args,
                 error_message="Conversion to PDF with LibreOffice failed",
                 timeout_message=(
@@ -213,7 +213,7 @@ async def convert(self) -> None:
                 "/tmp/input_file",
                 "/tmp/input_file.pdf",
             ]
-            await run_command(
+            await self.run_command(
                 args,
                 error_message="Conversion to PDF with GraphicsMagick failed",
                 timeout_message=(
@@ -231,7 +231,7 @@ async def convert(self) -> None:
 
         # Obtain number of pages
         self.update_progress("Calculating number of pages")
-        stdout, _ = await run_command(
+        stdout, _ = await self.run_command(
             ["pdfinfo", pdf_filename],
             error_message="PDF file is corrupted",
             timeout_message=(
@@ -317,7 +317,7 @@ def pdftoppm_progress_callback(line: bytes) -> None:
 
         page_base = "/tmp/page"
 
-        await run_command(
+        await self.run_command(
             [
                 "pdftoppm",
                 pdf_filename,
@@ -351,7 +351,7 @@ async def install_libreoffice_ext(self, libreoffice_ext: str) -> None:
             f"/usr/lib/libreoffice/share/extensions/{libreoffice_ext}/",
             f"/libreoffice_ext/{libreoffice_ext}",
         ]
-        await run_command(
+        await self.run_command(
             unzip_args,
             error_message="LibreOffice extension installation failed (unzipping)",
             timeout_message="unzipping LibreOffice extension timed out 5 seconds",
@@ -377,11 +377,16 @@ async def main() -> int:
 
     try:
         await converter.convert()
+        error_code = 0  # Success!
     except (RuntimeError, TimeoutError, ValueError) as e:
         converter.update_progress(str(e), error=True)
-        return 1
-    else:
-        return 0  # Success!
+        error_code = 1
+
+    if not running_on_qubes():
+        # Write debug information (containers version)
+        with open("/tmp/dangerzone/captured_output.txt", "wb") as container_log:
+            container_log.write(converter.captured_output)
+    return error_code
 
 
 if __name__ == "__main__":

diff --git a/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py b/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py
@@ -84,6 +84,9 @@ async def main() -> None:
             rgb_data = rgb_file.read()
             await write_bytes(rgb_data)
 
+    # Write debug information
+    await write_bytes(converter.captured_output, file=sys.stderr)
+
 
 if __name__ == "__main__":
     sys.exit(asyncio.run(main()))