deepset-ai · masci · Sep 13, 2022 · Sep 1, 2022 · Sep 1, 2022 · Sep 1, 2022
@@ -301,7 +301,7 @@ class PDFToTextConverter(BaseConverter)
 #### PDFToTextConverter.\_\_init\_\_
 
 ```python
-def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8")
+def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", keep_physical_layout: bool = False)
 ```
 
 **Arguments**:
@@ -323,6 +323,8 @@ In this case the id will be generated by using the content and the defined metad
 - `encoding`: Encoding that will be passed as `-enc` parameter to `pdftotext`.
 Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...).
 (See list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal)
+- `keep_physical_layout`: This option will maintain the original physical layout of the extracted text.
+It works by passing the `-layout` parameter to `pdftotext`. When disabled, the text is extracted in reading order.
 
 <a id="pdf.PDFToTextConverter.convert"></a>
 
@@ -351,6 +353,8 @@ not one of the valid languages, then it might likely be encoding error resulting
 in garbled text.
 - `encoding`: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
 (See list of available encodings by running `pdftotext -listenc` in the terminal)
+- `keep_physical_layout`: This option will maintain the original physical layout of the extracted text.
+It works by passing the `-layout` parameter to `pdftotext`. When disabled, the text is extracted in reading order.
 - `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
 attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
 not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).

@@ -4036,6 +4036,11 @@
                   "type": "null"
                 }
               ]
+            },
+            "keep_physical_layout": {
+              "title": "Keep Physical Layout",
+              "default": false,
+              "type": "boolean"
             }
           },
           "additionalProperties": false,

@@ -28,6 +28,7 @@ def __init__(
         valid_languages: Optional[List[str]] = None,
         id_hash_keys: Optional[List[str]] = None,
         encoding: Optional[str] = "UTF-8",
+        keep_physical_layout: bool = False,
     ):
         """
         :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
@@ -47,14 +48,16 @@ def __init__(
         :param encoding: Encoding that will be passed as `-enc` parameter to `pdftotext`.
                          Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...).
                          (See list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal)
+        :param keep_physical_layout: This option will maintain the original physical layout of the extracted text.
+            It works by passing the `-layout` parameter to `pdftotext`. When disabled, the text is extracted in reading order.
         """
         super().__init__(
             remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
         )
 
         verify_installation = subprocess.run(["pdftotext -v"], shell=True)
         if verify_installation.returncode == 127:
-            raise Exception(
+            raise FileNotFoundError(
                 """pdftotext is not installed. It is part of xpdf or poppler-utils software suite.
 
                    Installation on Linux:
@@ -68,8 +71,8 @@ def __init__(
                 """
             )
 
-        super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
         self.encoding = encoding
+        self.keep_physical_layout = keep_physical_layout
 
     def convert(
         self,
@@ -98,6 +101,8 @@ def convert(
                                 in garbled text.
         :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
                          (See list of available encodings by running `pdftotext -listenc` in the terminal)
+        :param keep_physical_layout: This option will maintain the original physical layout of the extracted text.
+            It works by passing the `-layout` parameter to `pdftotext`. When disabled, the text is extracted in reading order.
         :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
@@ -110,7 +115,9 @@ def convert(
         if id_hash_keys is None:
             id_hash_keys = self.id_hash_keys
 
-        pages = self._read_pdf(file_path, layout=False, encoding=encoding)
+        keep_physical_layout = self.keep_physical_layout
+
+        pages = self._read_pdf(file_path, layout=keep_physical_layout, encoding=encoding)
 
         cleaned_pages = []
         for page in pages:
@@ -162,10 +169,6 @@ def _read_pdf(self, file_path: Path, layout: bool, encoding: Optional[str] = Non
         :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
                          (See list of available encodings by running `pdftotext -listenc` in the terminal)
         """
-        # if layout:
-        #     command = ["pdftotext", "-enc", encoding, "-layout", str(file_path), "-"]
-        # else:
-        #     command = ["pdftotext", "-enc", encoding, str(file_path), "-"]
         if not encoding:
             encoding = self.encoding
 

@@ -77,6 +77,14 @@ def test_pdf_encoding(Converter):
     assert "ɪ" not in document.content
 
 
+@pytest.mark.parametrize("Converter", [PDFToTextConverter])
+def test_pdf_layout(Converter):
+    converter = Converter(keep_physical_layout=True)
+
+    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_3.pdf")[0]
+    assert str(document.content).startswith("This is the second test sentence.")
+
+
 @pytest.mark.parametrize("Converter", [PDFToTextConverter])
 def test_pdf_ligatures(Converter):
     converter = Converter()