Skip to content

Commit

Permalink
feat: add PATENT_USPTO as input format
Browse files Browse the repository at this point in the history
Signed-off-by: Cesar Berrospi Ramis <[email protected]>
  • Loading branch information
ceberam committed Dec 11, 2024
1 parent 3da166e commit 458df06
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 2 deletions.
45 changes: 45 additions & 0 deletions docling/backend/patent_uspto_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""Backend to parse patents from the United States Patent Office (USPTO).
The parsers included in this module can handle patent grants published since 1976 and
patent applications since 2001.
The original files can be found at https://bulkdata.uspto.gov.
"""

import logging
from io import BytesIO
from pathlib import Path
from typing import Union

from docling_core.types.doc import DoclingDocument

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument

_log = logging.getLogger(__name__)


class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend registered for the ``PATENT_USPTO`` input format.

    This is a skeleton: no patent content is parsed here. ``is_valid`` always
    reports ``False`` and ``convert`` returns a ``DoclingDocument`` that was
    constructed with nothing but a name.
    """

    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
        """Hand the input document and its source over to the base backend.

        Args:
            in_doc: The input document this backend is attached to.
            path_or_stream: Either a filesystem path or an in-memory byte
                stream holding the source file.
        """
        super().__init__(in_doc, path_or_stream)

    def is_valid(self) -> bool:
        """Return ``False`` unconditionally; no validation logic exists yet."""
        return False

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return ``False``: this backend does not declare pagination support."""
        return False

    def unload(self):
        """Do nothing; this class itself holds no resources to release."""
        return None

    @classmethod
    def supported_formats(cls) -> set[InputFormat]:
        """Return the single-element set ``{InputFormat.PATENT_USPTO}``."""
        return {InputFormat.PATENT_USPTO}

    def convert(self) -> DoclingDocument:
        """Build and return a ``DoclingDocument`` carrying only a name.

        The name is the stem of ``self.file``, or the literal ``"file"`` when
        that stem is empty.
        """
        # NOTE(review): ``self.file`` is not assigned in this class; presumably
        # the base backend sets it during ``__init__`` -- confirm.
        doc_name = self.file.stem or "file"
        return DoclingDocument(name=doc_name)
3 changes: 3 additions & 0 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class InputFormat(str, Enum):
ASCIIDOC = "asciidoc"
MD = "md"
XLSX = "xlsx"
PATENT_USPTO = "uspto"


class OutputFormat(str, Enum):
Expand All @@ -55,6 +56,7 @@ class OutputFormat(str, Enum):
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.XLSX: ["xlsx"],
InputFormat.PATENT_USPTO: ["xml", "txt"],
}

FormatToMimeType: Dict[InputFormat, List[str]] = {
Expand All @@ -81,6 +83,7 @@ class OutputFormat(str, Enum):
InputFormat.XLSX: [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
],
InputFormat.PATENT_USPTO: ["application/xml", "text/plain"],
}

MimeTypeToFormat = {
Expand Down
13 changes: 11 additions & 2 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.patent_uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
DoclingComponentType,
Expand Down Expand Up @@ -82,12 +83,17 @@ class HTMLFormatOption(FormatOption):
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend


class PdfFormatOption(FormatOption):
class PatentUsptoFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend


class ImageFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend


class ImageFormatOption(FormatOption):
class PdfFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend

Expand All @@ -112,6 +118,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.HTML: FormatOption(
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
),
InputFormat.PATENT_USPTO: FormatOption(
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
),
InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
),
Expand Down

0 comments on commit 458df06

Please sign in to comment.