feat: add public layout-based extraction support on PDFToTextConverter (#3137)

* feat(PDFToTextConverter): add option to get text in physical layout order
* test: add physical layout extraction test to PDFToTextConverter
* refactor: change layout parameter attribution places
* docs: manually trigger pre-commits
* docs: generate new docs to comply with pydoc-markdown style
deepset-ai · Sep 21, 2022 · 73af416 · 73af416
1 parent 7dfc5ea
commit 73af416
Show file tree

Hide file tree

Showing 5 changed files with 29 additions and 8 deletions.
diff --git a/docs/_src/api/api/file_converter.md b/docs/_src/api/api/file_converter.md
@@ -337,7 +337,8 @@ class PDFToTextConverter(BaseConverter)
 def __init__(remove_numeric_tables: bool = False,
              valid_languages: Optional[List[str]] = None,
              id_hash_keys: Optional[List[str]] = None,
-             encoding: Optional[str] = "UTF-8")
+             encoding: Optional[str] = "UTF-8",
+             keep_physical_layout: bool = False)
 ```
 
 **Arguments**:
@@ -359,6 +360,8 @@ In this case the id will be generated by using the content and the defined metad
 - `encoding`: Encoding that will be passed as `-enc` parameter to `pdftotext`.
 Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...).
 (See list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal)
+- `keep_physical_layout`: This option will maintain the original physical layout of the extracted text.
+It works by passing the `-layout` parameter to `pdftotext`. When disabled, the PDF is read in stream order.
 
 <a id="pdf.PDFToTextConverter.convert"></a>
 
@@ -392,6 +395,8 @@ not one of the valid languages, then it might likely be encoding error resulting
 in garbled text.
 - `encoding`: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
 (See list of available encodings by running `pdftotext -listenc` in the terminal)
+- `keep_physical_layout`: This option will maintain the original physical layout of the extracted text.
+It works by passing the `-layout` parameter to `pdftotext`. When disabled, the PDF is read in stream order.
 - `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
 attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
 not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).

diff --git a/haystack/json-schemas/haystack-pipeline-main.schema.json b/haystack/json-schemas/haystack-pipeline-main.schema.json
@@ -4047,6 +4047,11 @@
                   "type": "null"
                 }
               ]
+            },
+            "keep_physical_layout": {
+              "title": "Keep Physical Layout",
+              "default": false,
+              "type": "boolean"
             }
           },
           "additionalProperties": false,

diff --git a/haystack/nodes/file_converter/pdf.py b/haystack/nodes/file_converter/pdf.py
@@ -28,6 +28,7 @@ def __init__(
         valid_languages: Optional[List[str]] = None,
         id_hash_keys: Optional[List[str]] = None,
         encoding: Optional[str] = "UTF-8",
+        keep_physical_layout: bool = False,
     ):
         """
         :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
@@ -47,14 +48,16 @@ def __init__(
         :param encoding: Encoding that will be passed as `-enc` parameter to `pdftotext`.
                          Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...).
                          (See list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal)
+        :param keep_physical_layout: This option will maintain the original physical layout of the extracted text.
+            It works by passing the `-layout` parameter to `pdftotext`. When disabled, the PDF is read in stream order.
         """
         super().__init__(
             remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
         )
 
         verify_installation = subprocess.run(["pdftotext -v"], shell=True)
         if verify_installation.returncode == 127:
-            raise Exception(
+            raise FileNotFoundError(
                 """pdftotext is not installed. It is part of xpdf or poppler-utils software suite.
                 
                    Installation on Linux:
@@ -68,8 +71,8 @@ def __init__(
                 """
             )
 
-        super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
         self.encoding = encoding
+        self.keep_physical_layout = keep_physical_layout
 
     def convert(
         self,
@@ -98,6 +101,8 @@ def convert(
                                 in garbled text.
         :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
                          (See list of available encodings by running `pdftotext -listenc` in the terminal)
+        :param keep_physical_layout: This option will maintain the original physical layout of the extracted text.
+            It works by passing the `-layout` parameter to `pdftotext`. When disabled, the PDF is read in stream order.
         :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
@@ -110,7 +115,9 @@ def convert(
         if id_hash_keys is None:
             id_hash_keys = self.id_hash_keys
 
-        pages = self._read_pdf(file_path, layout=False, encoding=encoding)
+        keep_physical_layout = self.keep_physical_layout
+
+        pages = self._read_pdf(file_path, layout=keep_physical_layout, encoding=encoding)
 
         cleaned_pages = []
         for page in pages:
@@ -162,10 +169,6 @@ def _read_pdf(self, file_path: Path, layout: bool, encoding: Optional[str] = Non
         :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
                          (See list of available encodings by running `pdftotext -listenc` in the terminal)
         """
-        # if layout:
-        #     command = ["pdftotext", "-enc", encoding, "-layout", str(file_path), "-"]
-        # else:
-        #     command = ["pdftotext", "-enc", encoding, str(file_path), "-"]
         if not encoding:
             encoding = self.encoding
 

diff --git a/test/nodes/test_file_converter.py b/test/nodes/test_file_converter.py
@@ -77,6 +77,14 @@ def test_pdf_encoding(Converter):
     assert "ɪ" not in document.content
 
 
+@pytest.mark.parametrize("Converter", [PDFToTextConverter])
+def test_pdf_layout(Converter):
+    converter = Converter(keep_physical_layout=True)
+
+    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_3.pdf")[0]
+    assert str(document.content).startswith("This is the second test sentence.")
+
+
 @pytest.mark.parametrize("Converter", [PDFToTextConverter])
 def test_pdf_ligatures(Converter):
     converter = Converter()

diff --git a/test/samples/pdf/sample_pdf_3.pdf b/test/samples/pdf/sample_pdf_3.pdf