Skip to content

Commit

Permalink
chore: add safe initialization of PatentUsptoDocumentBackend
Browse files (browse the repository at this point in the history)
Signed-off-by: Cesar Berrospi Ramis <[email protected]>
  • Loading branch information
ceberam committed Dec 17, 2024
1 parent 99e25c3 commit dde5bdc
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 8 deletions.
Empty file added docling/backend/xml/__init__.py
Empty file.
21 changes: 13 additions & 8 deletions docling/backend/xml/uspto_backend.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -67,17 +67,22 @@ def __init__(
         self.patent_content: str = ""
         self.parser: Optional[PatentUspto] = None

-        if isinstance(self.path_or_stream, BytesIO):
-            while line := self.path_or_stream.readline().decode("utf-8"):
-                if line.startswith("<!DOCTYPE") or line == "PATN\n":
-                    self._set_parser(line)
-                self.patent_content += line
-        elif isinstance(self.path_or_stream, Path):
-            with open(self.path_or_stream, encoding="utf-8") as file_obj:
-                while line := file_obj.readline():
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                while line := self.path_or_stream.readline().decode("utf-8"):
                     if line.startswith("<!DOCTYPE") or line == "PATN\n":
                         self._set_parser(line)
                     self.patent_content += line
+            elif isinstance(self.path_or_stream, Path):
+                with open(self.path_or_stream, encoding="utf-8") as file_obj:
+                    while line := file_obj.readline():
+                        if line.startswith("<!DOCTYPE") or line == "PATN\n":
+                            self._set_parser(line)
+                        self.patent_content += line
+        except Exception as exc:
+            raise RuntimeError(
+                f"Could not initialize USPTO backend for file with hash {self.document_hash}."
+            ) from exc

     def _set_parser(self, doctype: str) -> None:
         doctype_line = doctype.lower()
Expand Down

0 comments on commit dde5bdc

Please sign in to comment.