Skip to content

Commit

Permalink
fix: PyPDFToDocument initializes documents with content and meta (#8698)
Browse files Browse the repository at this point in the history
* initialize document with content and meta

* update test

* add test checking that not only content is used for id generation
  • Loading branch information
julian-risch authored Jan 9, 2025
1 parent fe9b1e2 commit dd9660f
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 7 deletions.
10 changes: 5 additions & 5 deletions haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def from_dict(cls, data):
"""
return default_from_dict(cls, data)

def _default_convert(self, reader: "PdfReader") -> Document:
def _default_convert(self, reader: "PdfReader") -> str:
texts = []
for page in reader.pages:
texts.append(
Expand All @@ -170,7 +170,7 @@ def _default_convert(self, reader: "PdfReader") -> Document:
)
)
text = "\f".join(texts)
return Document(content=text)
return text

@component.output_types(documents=List[Document])
def run(
Expand Down Expand Up @@ -205,14 +205,14 @@ def run(
continue
try:
pdf_reader = PdfReader(io.BytesIO(bytestream.data))
document = self._default_convert(pdf_reader)
text = self._default_convert(pdf_reader)
except Exception as e:
logger.warning(
"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
)
continue

if document.content is None or document.content.strip() == "":
if text is None or text.strip() == "":
logger.warning(
"PyPDFToDocument could not extract text from the file {source}. Returning an empty document.",
source=source,
Expand All @@ -222,7 +222,7 @@ def run(

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
document.meta = merged_metadata
document = Document(content=text, meta=merged_metadata)
documents.append(document)

return {"documents": documents}
4 changes: 4 additions & 0 deletions releasenotes/notes/pypdf-docid-293dac08ea5f8491.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
fixes:
- |
PyPDFToDocument now creates documents with an ID based on both the converted text and the metadata. Previously, the metadata was not taken into account when the ID was generated.
8 changes: 6 additions & 2 deletions test/components/converters/test_pypdf_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ def test_default_convert(self):
layout_mode_font_height_weight=1.5,
)

doc = converter._default_convert(mock_reader)
assert doc.content == "Page 1 content\fPage 2 content"
text = converter._default_convert(mock_reader)
assert text == "Page 1 content\fPage 2 content"

expected_params = {
"extraction_mode": "layout",
Expand Down Expand Up @@ -209,3 +209,7 @@ def test_run_empty_document(self, caplog, test_files_path):
output = PyPDFToDocument().run(sources=paths)
assert "PyPDFToDocument could not extract text from the file" in caplog.text
assert output["documents"][0].content == ""

# Check that meta is used when the returned document is initialized and thus when doc id is generated
assert output["documents"][0].meta["file_path"] == "non_text_searchable.pdf"
assert output["documents"][0].id != Document(content="").id

0 comments on commit dd9660f

Please sign in to comment.