Skip to content

Commit

Permalink
feat: Create a backend to transform PubMed XML files to DoclingDocument
Browse files Browse the repository at this point in the history
Signed-off-by: lucas-morin <[email protected]>
  • Loading branch information
lucas-morin committed Dec 13, 2024
1 parent 365a1e7 commit 8b68d01
Show file tree
Hide file tree
Showing 18 changed files with 30,558 additions and 42 deletions.
539 changes: 539 additions & 0 deletions docling/backend/pubmed_backend.py

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class InputFormat(str, Enum):
DOCX = "docx"
PPTX = "pptx"
HTML = "html"
XML_PUBMED = "xml_pubmed"
IMAGE = "image"
PDF = "pdf"
ASCIIDOC = "asciidoc"
Expand All @@ -52,6 +53,7 @@ class OutputFormat(str, Enum):
InputFormat.PDF: ["pdf"],
InputFormat.MD: ["md"],
InputFormat.HTML: ["html", "htm", "xhtml"],
InputFormat.XML_PUBMED: ["xml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.XLSX: ["xlsx"],
Expand All @@ -68,6 +70,7 @@ class OutputFormat(str, Enum):
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
],
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
InputFormat.XML_PUBMED: ["application/xml"],
InputFormat.IMAGE: [
"image/png",
"image/jpeg",
Expand Down
5 changes: 5 additions & 0 deletions docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,11 @@ def _guess_format(self, obj: Union[Path, DocumentStream]):
with obj.open("rb") as f:
content = f.read(1024) # Read first 1KB

# Detect PubMed XML documents
xml_doctype = re.search(r"<!DOCTYPE [^>]+>", content.decode("utf-8"))
if xml_doctype and ("/NLM//DTD JATS" in xml_doctype.group()):
return InputFormat.XML_PUBMED

elif isinstance(obj, DocumentStream):
content = obj.stream.read(8192)
obj.stream.seek(0)
Expand Down
10 changes: 9 additions & 1 deletion docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pubmed_backend import PubMedDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
DoclingComponentType,
Expand Down Expand Up @@ -82,6 +83,11 @@ class HTMLFormatOption(FormatOption):
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend


# Format option for PubMed XML input: a FormatOption subclass whose defaults
# pair SimplePipeline with PubMedDocumentBackend (the same pairing that
# _get_default_option registers for InputFormat.XML_PUBMED).
# NOTE(review): leading indentation appears to have been stripped from this
# listing; the two annotated assignments below belong to the class body.
class XMLPubMedFormatOption(FormatOption):
# Default pipeline class for this format.
pipeline_cls: Type = SimplePipeline
# Default backend class for this format (imported from docling.backend.pubmed_backend).
backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend


class PdfFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
Expand Down Expand Up @@ -118,6 +124,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.PDF: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
),
InputFormat.XML_PUBMED: FormatOption(
pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
),
}
if (options := format_to_default_options.get(format)) is not None:
return options
Expand Down Expand Up @@ -162,7 +171,6 @@ def convert(
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,
) -> ConversionResult:

all_res = self.convert_all(
source=[source],
raises_on_error=raises_on_error,
Expand Down
86 changes: 45 additions & 41 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 8b68d01

Please sign in to comment.