diff --git a/docs/_src/api/api/file_converter.md b/docs/_src/api/api/file_converter.md index 43cd2437ce..6ab0608362 100644 --- a/docs/_src/api/api/file_converter.md +++ b/docs/_src/api/api/file_converter.md @@ -337,7 +337,8 @@ class PDFToTextConverter(BaseConverter) def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, - encoding: Optional[str] = "UTF-8") + encoding: Optional[str] = "UTF-8", + keep_physical_layout: bool = False) ``` **Arguments**: @@ -359,6 +360,8 @@ In this case the id will be generated by using the content and the defined metad - `encoding`: Encoding that will be passed as `-enc` parameter to `pdftotext`. Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...). (See list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal) +- `keep_physical_layout`: This option will maintain original physical layout on the extracted text. +It works by passing the `-layout` parameter to `pdftotext`. When disabled, PDF is read in the stream order. @@ -392,6 +395,8 @@ not one of the valid languages, then it might likely be encoding error resulting in garbled text. - `encoding`: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`. (See list of available encodings by running `pdftotext -listenc` in the terminal) +- `keep_physical_layout`: This option will maintain original physical layout on the extracted text. +It works by passing the `-layout` parameter to `pdftotext`. When disabled, PDF is read in the stream order. - `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). diff --git a/haystack/json-schemas/haystack-pipeline-main.schema.json b/haystack/json-schemas/haystack-pipeline-main.schema.json index 90b14f50c4..54c0779159 100644 --- a/haystack/json-schemas/haystack-pipeline-main.schema.json +++ b/haystack/json-schemas/haystack-pipeline-main.schema.json @@ -4047,6 +4047,11 @@ "type": "null" } ] + }, + "keep_physical_layout": { + "title": "Keep Physical Layout", + "default": false, + "type": "boolean" } }, "additionalProperties": false, diff --git a/haystack/nodes/file_converter/pdf.py b/haystack/nodes/file_converter/pdf.py index 25899fb5e6..88d1288240 100644 --- a/haystack/nodes/file_converter/pdf.py +++ b/haystack/nodes/file_converter/pdf.py @@ -28,6 +28,7 @@ def __init__( valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", + keep_physical_layout: bool = False, ): """ :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables. @@ -47,6 +48,8 @@ def __init__( :param encoding: Encoding that will be passed as `-enc` parameter to `pdftotext`. Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...). (See list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal) + :param keep_physical_layout: This option will maintain original physical layout on the extracted text. + It works by passing the `-layout` parameter to `pdftotext`. When disabled, PDF is read in the stream order. """ super().__init__( remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys @@ -54,7 +57,7 @@ def __init__( verify_installation = subprocess.run(["pdftotext -v"], shell=True) if verify_installation.returncode == 127: - raise Exception( + raise FileNotFoundError( """pdftotext is not installed. It is part of xpdf or poppler-utils software suite. Installation on Linux: @@ -68,8 +71,8 @@ def __init__( """ ) - super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages) self.encoding = encoding + self.keep_physical_layout = keep_physical_layout def convert( self, @@ -98,6 +101,8 @@ def convert( in garbled text. :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`. (See list of available encodings by running `pdftotext -listenc` in the terminal) + :param keep_physical_layout: This option will maintain original physical layout on the extracted text. + It works by passing the `-layout` parameter to `pdftotext`. When disabled, PDF is read in the stream order. :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). @@ -110,7 +115,9 @@ def convert( if id_hash_keys is None: id_hash_keys = self.id_hash_keys - pages = self._read_pdf(file_path, layout=False, encoding=encoding) + keep_physical_layout = self.keep_physical_layout + + pages = self._read_pdf(file_path, layout=keep_physical_layout, encoding=encoding) cleaned_pages = [] for page in pages: @@ -162,10 +169,6 @@ def _read_pdf(self, file_path: Path, layout: bool, encoding: Optional[str] = Non :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`. (See list of available encodings by running `pdftotext -listenc` in the terminal) """ - # if layout: - # command = ["pdftotext", "-enc", encoding, "-layout", str(file_path), "-"] - # else: - # command = ["pdftotext", "-enc", encoding, str(file_path), "-"] if not encoding: encoding = self.encoding diff --git a/test/nodes/test_file_converter.py b/test/nodes/test_file_converter.py index 34c64ac097..0f6d095c3f 100644 --- a/test/nodes/test_file_converter.py +++ b/test/nodes/test_file_converter.py @@ -77,6 +77,14 @@ def test_pdf_encoding(Converter): assert "ɪ" not in document.content +@pytest.mark.parametrize("Converter", [PDFToTextConverter]) +def test_pdf_layout(Converter): + converter = Converter(keep_physical_layout=True) + + document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_3.pdf")[0] + assert str(document.content).startswith("This is the second test sentence.") + + @pytest.mark.parametrize("Converter", [PDFToTextConverter]) def test_pdf_ligatures(Converter): converter = Converter() diff --git a/test/samples/pdf/sample_pdf_3.pdf b/test/samples/pdf/sample_pdf_3.pdf new file mode 100644 index 0000000000..d3900c155d Binary files /dev/null and b/test/samples/pdf/sample_pdf_3.pdf differ