fix: meta from ByteStream input for AzureOCRDocumentConverter (#7955)

* fix: meta from ByteStream input for AzureOCRDocumentConverter
* add test
* add reno
* fix test
deepset-ai · Jul 4, 2024 · ce8af3f
1 parent 2be576f
commit ce8af3f
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 1 deletion.
diff --git a/haystack/components/converters/azure.py b/haystack/components/converters/azure.py
@@ -135,7 +135,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
             result = poller.result()
             azure_output.append(result.to_dict())
 
-            docs = self._convert_tables_and_text(result=result, meta=metadata)
+            merged_metadata = {**bytestream.meta, **metadata}
+            docs = self._convert_tables_and_text(result=result, meta=merged_metadata)
             documents.extend(docs)
 
         return {"documents": documents, "raw_azure_response": azure_output}

diff --git a/releasenotes/notes/fix-azure-ocr-bytestream-meta-0d2c8e6ea761b791.yaml b/releasenotes/notes/fix-azure-ocr-bytestream-meta-0d2c8e6ea761b791.yaml
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
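+    Fixed `AzureOCRDocumentConverter` ignoring the `meta` of `ByteStream` sources: it is now merged into the resulting documents' metadata, with the `meta` passed to `run` taking precedence on key conflicts.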
diff --git a/test/components/converters/test_azure_ocr_doc_converter.py b/test/components/converters/test_azure_ocr_doc_converter.py
@@ -13,6 +13,7 @@
 from azure.ai.formrecognizer import AnalyzeResult
 
 from haystack.components.converters.azure import AzureOCRDocumentConverter
+from haystack.dataclasses.byte_stream import ByteStream
 from haystack.utils import Secret
 
 
@@ -306,3 +307,24 @@ def test_hashing_dataframe(self, mock_resolve_value):
 
         # doesn't mean much, more for sanity check
         assert hash_string_1 != hash_string_2 != hash_string_3
+
+    @patch("haystack.utils.auth.EnvVarSecret.resolve_value")
+    def test_meta_from_byte_stream(self, mock_resolve_value, test_files_path) -> None:
+        mock_resolve_value.return_value = "test_api_key"
+
+        class MockPoller:
+            def result(self) -> AnalyzeResult:
+                with open(test_files_path / "json" / "azure_sample_pdf_1.json", encoding="utf-8") as azure_file:
+                    result = json.load(azure_file)
+                return AnalyzeResult.from_dict(result)
+
+        with patch("azure.ai.formrecognizer.DocumentAnalysisClient.begin_analyze_document") as azure_mock:
+            azure_mock.return_value = MockPoller()
+            ocr_node = AzureOCRDocumentConverter(endpoint="")
+            bytes = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
+            byte_stream = ByteStream(data=bytes, meta={"test_from": "byte_stream"})
+            out = ocr_node.run(sources=[byte_stream], meta=[{"test": "value_1"}])
+
+        docs = out["documents"]
+        assert docs[1].meta["test"] == "value_1"
+        assert docs[1].meta["test_from"] == "byte_stream"