From dd9660f90d8cd074ac420139e0f78fa3970b162e Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Thu, 9 Jan 2025 20:12:10 +0100 Subject: [PATCH] fix: PyPDFToDocument initializes documents with content and meta (#8698) * initialize document with content and meta * update test * add test checking that not only content is used for id generation --- haystack/components/converters/pypdf.py | 10 +++++----- releasenotes/notes/pypdf-docid-293dac08ea5f8491.yaml | 4 ++++ test/components/converters/test_pypdf_to_document.py | 8 ++++++-- 3 files changed, 15 insertions(+), 7 deletions(-) create mode 100644 releasenotes/notes/pypdf-docid-293dac08ea5f8491.yaml diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py index 19a4e2e453..334ef097d7 100644 --- a/haystack/components/converters/pypdf.py +++ b/haystack/components/converters/pypdf.py @@ -155,7 +155,7 @@ def from_dict(cls, data): """ return default_from_dict(cls, data) - def _default_convert(self, reader: "PdfReader") -> Document: + def _default_convert(self, reader: "PdfReader") -> str: texts = [] for page in reader.pages: texts.append( @@ -170,7 +170,7 @@ def _default_convert(self, reader: "PdfReader") -> Document: ) ) text = "\f".join(texts) - return Document(content=text) + return text @component.output_types(documents=List[Document]) def run( @@ -205,14 +205,14 @@ def run( continue try: pdf_reader = PdfReader(io.BytesIO(bytestream.data)) - document = self._default_convert(pdf_reader) + text = self._default_convert(pdf_reader) except Exception as e: logger.warning( "Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e ) continue - if document.content is None or document.content.strip() == "": + if text is None or text.strip() == "": logger.warning( "PyPDFToDocument could not extract text from the file {source}. Returning an empty document.", source=source, @@ -222,7 +222,7 @@ def run( if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): merged_metadata["file_path"] = os.path.basename(file_path) - document.meta = merged_metadata + document = Document(content=text, meta=merged_metadata) documents.append(document) return {"documents": documents} diff --git a/releasenotes/notes/pypdf-docid-293dac08ea5f8491.yaml b/releasenotes/notes/pypdf-docid-293dac08ea5f8491.yaml new file mode 100644 index 0000000000..f077d8b4ee --- /dev/null +++ b/releasenotes/notes/pypdf-docid-293dac08ea5f8491.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + PyPDFToDocument now creates documents with id based on converted text and meta data. Before it didn't take the meta data into account. diff --git a/test/components/converters/test_pypdf_to_document.py b/test/components/converters/test_pypdf_to_document.py index fa8f295db7..916bb771ee 100644 --- a/test/components/converters/test_pypdf_to_document.py +++ b/test/components/converters/test_pypdf_to_document.py @@ -113,8 +113,8 @@ def test_default_convert(self): layout_mode_font_height_weight=1.5, ) - doc = converter._default_convert(mock_reader) - assert doc.content == "Page 1 content\fPage 2 content" + text = converter._default_convert(mock_reader) + assert text == "Page 1 content\fPage 2 content" expected_params = { "extraction_mode": "layout", @@ -209,3 +209,7 @@ def test_run_empty_document(self, caplog, test_files_path): output = PyPDFToDocument().run(sources=paths) assert "PyPDFToDocument could not extract text from the file" in caplog.text assert output["documents"][0].content == "" + + # Check that meta is used when the returned document is initialized and thus when doc id is generated + assert output["documents"][0].meta["file_path"] == "non_text_searchable.pdf" + assert output["documents"][0].id != Document(content="").id