diff --git a/docling/backend/patent_uspto_backend.py b/docling/backend/patent_uspto_backend.py
new file mode 100644
index 00000000..4ad7fd48
--- /dev/null
+++ b/docling/backend/patent_uspto_backend.py
@@ -0,0 +1,60 @@
+"""Backend to parse patents from the United States Patent Office (USPTO).
+
+The parsers included in this module can handle patent grants published since 1976 and
+patent applications since 2001.
+The original files can be found in https://bulkdata.uspto.gov.
+"""
+
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Union
+
+from docling_core.types.doc import DoclingDocument
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
+    """Backend for USPTO patent documents.
+
+    No parsing is performed in this class: ``is_valid`` always returns ``False``
+    and ``convert`` returns a document that is given only a name.
+    """
+
+    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
+        """Forward the input document and its source to the base backend."""
+        super().__init__(in_doc, path_or_stream)
+
+        return
+
+    def is_valid(self) -> bool:
+        """Always return ``False``, regardless of the input."""
+        return False
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """Return ``False``: pagination is reported as unsupported."""
+        return False
+
+    def unload(self) -> None:
+        """Do nothing; this class sets up no state of its own to release."""
+        return
+
+    @classmethod
+    def supported_formats(cls) -> set[InputFormat]:
+        """Return the set of input formats this backend declares support for."""
+        return {InputFormat.PATENT_USPTO}
+
+    def convert(self) -> DoclingDocument:
+        """Return a ``DoclingDocument`` named after the input file stem.
+
+        The name falls back to ``"file"`` when the stem is empty.
+        """
+        # NOTE(review): self.file is presumably set by the base class -- confirm.
+        doc = DoclingDocument(name=self.file.stem or "file")
+
+        return doc
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index dd6291ab..6a95fdcd 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -36,6 +36,7 @@ class InputFormat(str, Enum):
     ASCIIDOC = "asciidoc"
     MD = "md"
     XLSX = "xlsx"
+    PATENT_USPTO = "uspto"
 
 
 class OutputFormat(str, Enum):
@@ -55,6 +56,7 @@ class OutputFormat(str, Enum):
     InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
     InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
     InputFormat.XLSX: ["xlsx"],
+    InputFormat.PATENT_USPTO: ["xml", "txt"],
 }
 
 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -81,6 +83,7 @@ class OutputFormat(str, Enum):
     InputFormat.XLSX: [
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
     ],
+    InputFormat.PATENT_USPTO: ["application/xml", "text/plain"],
 }
 
 MimeTypeToFormat = {
diff --git a/docling/document_converter.py b/docling/document_converter.py
index 61608c0f..123051c8 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -15,6 +15,7 @@
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.backend.patent_uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
     ConversionStatus,
     DoclingComponentType,
@@ -82,12 +83,17 @@ class HTMLFormatOption(FormatOption):
     backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
 
 
-class PdfFormatOption(FormatOption):
+class PatentUsptoFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
+
+
+class ImageFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
     backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
 
 
-class ImageFormatOption(FormatOption):
+class PdfFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
     backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
 
@@ -112,6 +118,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.HTML: FormatOption(
             pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
         ),
+        InputFormat.PATENT_USPTO: FormatOption(
+            pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
+        ),
         InputFormat.IMAGE: FormatOption(
             pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
         ),