Skip to content

Commit

Permalink
refactor: simplify handling multiple InputFormats per mime type
Browse files Browse the repository at this point in the history
Signed-off-by: Cesar Berrospi Ramis <[email protected]>
  • Loading branch information
ceberam committed Dec 17, 2024
1 parent 058b18f commit dcfb2ae
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 29 deletions.
39 changes: 12 additions & 27 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from enum import Enum, auto
from enum import Enum
from typing import TYPE_CHECKING, Dict, List, Optional, Union

from docling_core.types.doc import (
Expand All @@ -13,7 +13,6 @@
)
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict
from typing_extensions import Self, override

if TYPE_CHECKING:
from docling.backend.pdf_backend import PdfPageBackend
Expand All @@ -29,31 +28,17 @@ class ConversionStatus(str, Enum):


class InputFormat(str, Enum):
"""A document format supported by document backend parsers.
The field `is_custom` indicates whether the document format is more specific than
the standard and content formats, typically defined by MIME types.
"""

DOCX = "docx", False
PPTX = "pptx", False
HTML = "html", False
IMAGE = "image", False
PDF = "pdf", False
ASCIIDOC = "asciidoc", False
MD = "md", False
XLSX = "xlsx", False
XML_USPTO = "uspto", True

@override
def __new__(cls, value: str, _) -> Self:
obj = str.__new__(cls, [value])
obj._value_ = value
return obj

@override
def __init__(self, _, is_custom: bool) -> None:
self.is_custom: bool = is_custom
"""A document format supported by document backend parsers."""

DOCX = "docx"
PPTX = "pptx"
HTML = "html"
IMAGE = "image"
PDF = "pdf"
ASCIIDOC = "asciidoc"
MD = "md"
XLSX = "xlsx"
XML_USPTO = "uspto"


class OutputFormat(str, Enum):
Expand Down
5 changes: 3 additions & 2 deletions docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,9 +290,10 @@ def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputForma
mime = mime or "text/plain"
formats = MimeTypeToFormat.get(mime, [])
if formats:
if len(formats) == 1 and not formats[0].is_custom:
# TODO: remove application/xml case after adding another XML parser
if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
return formats[0]
else: # ambiguity or custom cases
else: # ambiguity in formats
return _DocumentConversionInput._guess_from_content(
content, mime, formats
)
Expand Down

0 comments on commit dcfb2ae

Please sign in to comment.