Skip to content

Commit

Permalink
fix: meta from ByteStream input for AzureOCRDocumentConverter (#7955)
Browse files Browse the repository at this point in the history
* fix: meta from ByteStream input for AzureOCRDocumentConverter

* add test

* add reno

* fix test
  • Loading branch information
tstadel authored and vblagoje committed Jul 4, 2024
1 parent 2be576f commit ce8af3f
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 1 deletion.
3 changes: 2 additions & 1 deletion haystack/components/converters/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
result = poller.result()
azure_output.append(result.to_dict())

docs = self._convert_tables_and_text(result=result, meta=metadata)
merged_metadata = {**bytestream.meta, **metadata}
docs = self._convert_tables_and_text(result=result, meta=merged_metadata)
documents.extend(docs)

return {"documents": documents, "raw_azure_response": azure_output}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
fixes:
- |
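Fixed `AzureOCRDocumentConverter` ignoring the `meta` of `ByteStream` inputs. The ByteStream's meta is now merged into the metadata of the converted documents; on conflicting keys, the per-source metadata passed to `run` takes precedence.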
22 changes: 22 additions & 0 deletions test/components/converters/test_azure_ocr_doc_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from azure.ai.formrecognizer import AnalyzeResult

from haystack.components.converters.azure import AzureOCRDocumentConverter
from haystack.dataclasses.byte_stream import ByteStream
from haystack.utils import Secret


Expand Down Expand Up @@ -306,3 +307,24 @@ def test_hashing_dataframe(self, mock_resolve_value):

# doesn't mean much, more for sanity check
assert hash_string_1 != hash_string_2 != hash_string_3

@patch("haystack.utils.auth.EnvVarSecret.resolve_value")
def test_meta_from_byte_stream(self, mock_resolve_value, test_files_path) -> None:
    """Check that ByteStream meta and run-time meta both reach the output documents.

    The Azure client call is patched so that no network request is made; the
    poller returned by the patched call replays a recorded JSON response from
    the test files directory.
    """
    mock_resolve_value.return_value = "test_api_key"

    class MockPoller:
        """Stand-in for the poller returned by `begin_analyze_document`."""

        def result(self) -> AnalyzeResult:
            # Rebuild the AnalyzeResult from the recorded response on disk.
            with open(test_files_path / "json" / "azure_sample_pdf_1.json", encoding="utf-8") as azure_file:
                result = json.load(azure_file)
            return AnalyzeResult.from_dict(result)

    with patch("azure.ai.formrecognizer.DocumentAnalysisClient.begin_analyze_document") as azure_mock:
        azure_mock.return_value = MockPoller()
        ocr_node = AzureOCRDocumentConverter(endpoint="")
        # Named `pdf_bytes` rather than `bytes` so the builtin type is not shadowed.
        pdf_bytes = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
        byte_stream = ByteStream(data=pdf_bytes, meta={"test_from": "byte_stream"})
        out = ocr_node.run(sources=[byte_stream], meta=[{"test": "value_1"}])

    docs = out["documents"]
    # The document at index 1 must carry both the meta passed to `run` and
    # the meta attached to the ByteStream itself.
    assert docs[1].meta["test"] == "value_1"
    assert docs[1].meta["test_from"] == "byte_stream"

0 comments on commit ce8af3f

Please sign in to comment.