Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add public layout-based extraction support to PDFToTextConverter #3137

Merged
6 changes: 5 additions & 1 deletion docs/_src/api/api/file_converter.md
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ class PDFToTextConverter(BaseConverter)
#### PDFToTextConverter.\_\_init\_\_

```python
def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8")
def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", keep_physical_layout: bool = False)
```

**Arguments**:
Expand All @@ -323,6 +323,8 @@ In this case the id will be generated by using the content and the defined metad
- `encoding`: Encoding that will be passed as `-enc` parameter to `pdftotext`.
Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...).
(See list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal)
- `keep_physical_layout`: If enabled, the extracted text maintains the original physical layout of the PDF.
It works by passing the `-layout` parameter to `pdftotext`. When disabled, the PDF is read in stream order.

<a id="pdf.PDFToTextConverter.convert"></a>

Expand Down Expand Up @@ -351,6 +353,8 @@ not one of the valid languages, then it might likely be encoding error resulting
in garbled text.
- `encoding`: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
(See list of available encodings by running `pdftotext -listenc` in the terminal)
- `keep_physical_layout`: If enabled, the extracted text maintains the original physical layout of the PDF.
It works by passing the `-layout` parameter to `pdftotext`. When disabled, the PDF is read in stream order.
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
Expand Down
5 changes: 5 additions & 0 deletions haystack/json-schemas/haystack-pipeline-main.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -4036,6 +4036,11 @@
"type": "null"
}
]
},
"keep_physical_layout": {
"title": "Keep Physical Layout",
"default": false,
"type": "boolean"
}
},
"additionalProperties": false,
Expand Down
17 changes: 10 additions & 7 deletions haystack/nodes/file_converter/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def __init__(
valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None,
encoding: Optional[str] = "UTF-8",
keep_physical_layout: bool = False,
):
"""
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
Expand All @@ -47,14 +48,16 @@ def __init__(
:param encoding: Encoding that will be passed as `-enc` parameter to `pdftotext`.
Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...).
(See list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal)
:param keep_physical_layout: This option will maintain original physical layout on the extracted text.
It works by passing the `-layout` parameter to `pdftotext`. When disabled, PDF is read in the stream order.
"""
super().__init__(
remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
)

verify_installation = subprocess.run(["pdftotext -v"], shell=True)
if verify_installation.returncode == 127:
raise Exception(
raise FileNotFoundError(
"""pdftotext is not installed. It is part of xpdf or poppler-utils software suite.

Installation on Linux:
Expand All @@ -68,8 +71,8 @@ def __init__(
"""
)

super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
self.encoding = encoding
self.keep_physical_layout = keep_physical_layout

def convert(
self,
Expand Down Expand Up @@ -98,6 +101,8 @@ def convert(
in garbled text.
:param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
(See list of available encodings by running `pdftotext -listenc` in the terminal)
:param keep_physical_layout: This option will maintain original physical layout on the extracted text.
It works by passing the `-layout` parameter to `pdftotext`. When disabled, PDF is read in the stream order.
:param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
Expand All @@ -110,7 +115,9 @@ def convert(
if id_hash_keys is None:
id_hash_keys = self.id_hash_keys

pages = self._read_pdf(file_path, layout=False, encoding=encoding)
keep_physical_layout = self.keep_physical_layout

pages = self._read_pdf(file_path, layout=keep_physical_layout, encoding=encoding)

cleaned_pages = []
for page in pages:
Expand Down Expand Up @@ -162,10 +169,6 @@ def _read_pdf(self, file_path: Path, layout: bool, encoding: Optional[str] = Non
:param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
(See list of available encodings by running `pdftotext -listenc` in the terminal)
"""
# if layout:
# command = ["pdftotext", "-enc", encoding, "-layout", str(file_path), "-"]
# else:
# command = ["pdftotext", "-enc", encoding, str(file_path), "-"]
if not encoding:
encoding = self.encoding

Expand Down
8 changes: 8 additions & 0 deletions test/nodes/test_file_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ def test_pdf_encoding(Converter):
assert "ɪ" not in document.content


@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_layout(Converter):
    """Convert a sample PDF with ``keep_physical_layout=True`` and check its first sentence.

    The converter is built with the layout option enabled, ``sample_pdf_3.pdf``
    is converted, and the content of the first returned document must start
    with the expected sentence.
    """
    layout_converter = Converter(keep_physical_layout=True)
    sample_file = SAMPLES_PATH / "pdf" / "sample_pdf_3.pdf"

    converted_documents = layout_converter.convert(file_path=sample_file)
    first_document = converted_documents[0]

    extracted_text = str(first_document.content)
    assert extracted_text.startswith("This is the second test sentence.")


@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_ligatures(Converter):
converter = Converter()
Expand Down
Binary file added test/samples/pdf/sample_pdf_3.pdf
Binary file not shown.