From 49f96f59592654037921194c59614df41026aa19 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 7 Jan 2025 14:19:08 +0100 Subject: [PATCH] fix: Let BeautifulSoup detect the HTML encoding Signed-off-by: Christoph Auer --- docling/backend/html_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 9cd1e29b..ae478885 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -37,10 +37,10 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] try: if isinstance(self.path_or_stream, BytesIO): - text_stream = self.path_or_stream.getvalue().decode("utf-8") + text_stream = self.path_or_stream.getvalue() self.soup = BeautifulSoup(text_stream, "html.parser") if isinstance(self.path_or_stream, Path): - with open(self.path_or_stream, "r", encoding="utf-8") as f: + with open(self.path_or_stream, "rb") as f: html_content = f.read() self.soup = BeautifulSoup(html_content, "html.parser") except Exception as e: