Skip to content

Commit

Permalink
feat: add public layout-based extraction support on PDFToTextConverter (
Browse files Browse the repository at this point in the history
…#3137)

* feat(PDFToTextConverter): add option to get text in physical layout order

* test: add physical layout extraction test to PDFToTextConverter

* refactor: change layout parameter attribution places

* docs: manually trigger pre-commits

* docs: generate new docs to comply with pydoc-markdown style
  • Loading branch information
danielbichuetti authored and brandenchan committed Sep 21, 2022
1 parent 7dfc5ea commit 73af416
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 8 deletions.
7 changes: 6 additions & 1 deletion docs/_src/api/api/file_converter.md
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,8 @@ class PDFToTextConverter(BaseConverter)
def __init__(remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None,
encoding: Optional[str] = "UTF-8")
encoding: Optional[str] = "UTF-8",
keep_physical_layout: bool = False)
```

**Arguments**:
Expand All @@ -359,6 +360,8 @@ In this case the id will be generated by using the content and the defined metad
- `encoding`: Encoding that will be passed as `-enc` parameter to `pdftotext`.
Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...).
(See list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal)
- `keep_physical_layout`: This option maintains the original physical layout in the extracted text.
It works by passing the `-layout` parameter to `pdftotext`. When disabled, the PDF is read in stream order.

<a id="pdf.PDFToTextConverter.convert"></a>

Expand Down Expand Up @@ -392,6 +395,8 @@ not one of the valid languages, then it might likely be encoding error resulting
in garbled text.
- `encoding`: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
(See list of available encodings by running `pdftotext -listenc` in the terminal)
- `keep_physical_layout`: This option maintains the original physical layout in the extracted text.
It works by passing the `-layout` parameter to `pdftotext`. When disabled, the PDF is read in stream order.
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
Expand Down
5 changes: 5 additions & 0 deletions haystack/json-schemas/haystack-pipeline-main.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -4047,6 +4047,11 @@
"type": "null"
}
]
},
"keep_physical_layout": {
"title": "Keep Physical Layout",
"default": false,
"type": "boolean"
}
},
"additionalProperties": false,
Expand Down
17 changes: 10 additions & 7 deletions haystack/nodes/file_converter/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def __init__(
valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None,
encoding: Optional[str] = "UTF-8",
keep_physical_layout: bool = False,
):
"""
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
Expand All @@ -47,14 +48,16 @@ def __init__(
:param encoding: Encoding that will be passed as `-enc` parameter to `pdftotext`.
Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...).
(See list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal)
:param keep_physical_layout: This option will maintain original physical layout on the extracted text.
It works by passing the `-layout` parameter to `pdftotext`. When disabled, PDF is read in the stream order.
"""
super().__init__(
remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
)

verify_installation = subprocess.run(["pdftotext -v"], shell=True)
if verify_installation.returncode == 127:
raise Exception(
raise FileNotFoundError(
"""pdftotext is not installed. It is part of xpdf or poppler-utils software suite.
Installation on Linux:
Expand All @@ -68,8 +71,8 @@ def __init__(
"""
)

super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
self.encoding = encoding
self.keep_physical_layout = keep_physical_layout

def convert(
self,
Expand Down Expand Up @@ -98,6 +101,8 @@ def convert(
in garbled text.
:param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
(See list of available encodings by running `pdftotext -listenc` in the terminal)
:param keep_physical_layout: This option will maintain original physical layout on the extracted text.
It works by passing the `-layout` parameter to `pdftotext`. When disabled, PDF is read in the stream order.
:param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
Expand All @@ -110,7 +115,9 @@ def convert(
if id_hash_keys is None:
id_hash_keys = self.id_hash_keys

pages = self._read_pdf(file_path, layout=False, encoding=encoding)
keep_physical_layout = self.keep_physical_layout

pages = self._read_pdf(file_path, layout=keep_physical_layout, encoding=encoding)

cleaned_pages = []
for page in pages:
Expand Down Expand Up @@ -162,10 +169,6 @@ def _read_pdf(self, file_path: Path, layout: bool, encoding: Optional[str] = Non
:param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
(See list of available encodings by running `pdftotext -listenc` in the terminal)
"""
# if layout:
# command = ["pdftotext", "-enc", encoding, "-layout", str(file_path), "-"]
# else:
# command = ["pdftotext", "-enc", encoding, str(file_path), "-"]
if not encoding:
encoding = self.encoding

Expand Down
8 changes: 8 additions & 0 deletions test/nodes/test_file_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ def test_pdf_encoding(Converter):
assert "ɪ" not in document.content


@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_layout(Converter):
    """
    Convert `sample_pdf_3.pdf` with `keep_physical_layout=True` and check that
    the first returned document's text begins with the expected sentence.
    """
    layout_converter = Converter(keep_physical_layout=True)
    sample_path = SAMPLES_PATH / "pdf" / "sample_pdf_3.pdf"

    converted_documents = layout_converter.convert(file_path=sample_path)
    extracted_text = str(converted_documents[0].content)

    assert extracted_text.startswith("This is the second test sentence.")


@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_ligatures(Converter):
converter = Converter()
Expand Down
Binary file added test/samples/pdf/sample_pdf_3.pdf
Binary file not shown.

0 comments on commit 73af416

Please sign in to comment.