From 4338dea17b31c982155c80f515d8a3ca1b79dfcb Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Mon, 12 Aug 2024 11:29:27 +0200 Subject: [PATCH 1/2] Add assemble options and example saving pages and figures Signed-off-by: Michele Dolfi --- docling/datamodel/base_models.py | 6 +++ docling/document_converter.py | 17 +++++-- examples/export_figures.py | 87 ++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 4 deletions(-) create mode 100644 examples/export_figures.py diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 8b6796d6..28207793 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -265,3 +265,9 @@ class PipelineOptions(BaseModel): do_ocr: bool = False # True: perform OCR, replace programmatic PDF text table_structure_options: TableStructureOptions = TableStructureOptions() + + +class AssembleOptions(BaseModel): + remove_page_images: bool = ( + True # True: page images are removed in the assemble step + ) diff --git a/docling/document_converter.py b/docling/document_converter.py index 95b30a06..fb6381f2 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -14,6 +14,7 @@ from docling.backend.abstract_backend import PdfDocumentBackend from docling.datamodel.base_models import ( AssembledUnit, + AssembleOptions, ConversionStatus, Page, PipelineOptions, @@ -44,6 +45,7 @@ def __init__( pipeline_options: PipelineOptions = PipelineOptions(), pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND, pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline, + assemble_options: AssembleOptions = AssembleOptions(), ): if not artifacts_path: artifacts_path = self.download_models_hf() @@ -57,6 +59,7 @@ def __init__( self.page_assemble_model = PageAssembleModel(config={}) self.glm_model = GlmModel(config={}) self.pdf_backend = pdf_backend + self.assemble_options = assemble_options @staticmethod def download_models_hf( @@ -174,17 +177,23 @@ def process_document(self, in_doc: InputDocument) -> ConvertedDocument: pages_with_images, ) + # 4. Run pipeline stages pipeline_pages = self.model_pipeline.apply(pages_with_cells) - # 7. Assemble page elements (per page) + # 5. Assemble page elements (per page) assembled_pages = self.page_assemble_model(pipeline_pages) # exhaust assembled_pages for assembled_page in assembled_pages: # Free up mem resources before moving on with next batch - assembled_page.image = ( - None # Comment this if you want to visualize page images - ) + + # Remove page images (can be disabled) + if self.assemble_options.remove_page_images: + assembled_page.image = ( + None # Comment this if you want to visualize page images + ) + + # Unload backend assembled_page._backend.unload() all_assembled_pages.append(assembled_page) diff --git a/examples/export_figures.py b/examples/export_figures.py new file mode 100644 index 00000000..80a567ec --- /dev/null +++ b/examples/export_figures.py @@ -0,0 +1,87 @@ +import json +import logging +import time +from pathlib import Path +from typing import Iterable + +from docling.datamodel.base_models import ( + AssembleOptions, + BoundingBox, + ConversionStatus, + CoordOrigin, + PipelineOptions, +) +from docling.datamodel.document import ConvertedDocument, DocumentConversionInput +from docling.document_converter import DocumentConverter + +_log = logging.getLogger(__name__) + + +def export_figures( + converted_docs: Iterable[ConvertedDocument], + output_dir: Path, +): + output_dir.mkdir(parents=True, exist_ok=True) + + success_count = 0 + failure_count = 0 + + for doc in converted_docs: + if doc.status == ConversionStatus.SUCCESS: + success_count += 1 + doc_filename = doc.input.file.stem + + for page in doc.pages: + page_no = page.page_no + 1 + page_image_filename = output_dir / f"{doc_filename}-{page_no}.png" + with page_image_filename.open("wb") as fp: + page.image.save(fp, format="PNG") + + for fig_ix, fig in enumerate(doc.output.figures): + page_no = fig.prov[0].page + page_ix = page_no - 1 + x0, y0, x1, y1 = fig.prov[0].bbox + crop_bbox = BoundingBox( + l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT + ).to_top_left_origin(page_height=doc.pages[page_ix].size.height) + + cropped_im = doc.pages[page_ix].image.crop(crop_bbox.as_tuple()) + fig_image_filename = output_dir / f"{doc_filename}-fig{fig_ix+1}.png" + with fig_image_filename.open("wb") as fp: + cropped_im.save(fp, "PNG") + + else: + _log.info(f"Document {doc.input.file} failed to convert.") + failure_count += 1 + + _log.info( + f"Processed {success_count + failure_count} docs, of which {failure_count} failed" + ) + + +def main(): + logging.basicConfig(level=logging.INFO) + + input_doc_paths = [ + Path("./test/data/2206.01062.pdf"), + ] + + input_files = DocumentConversionInput.from_paths(input_doc_paths) + + assemble_options = AssembleOptions() + assemble_options.remove_page_images = False + + doc_converter = DocumentConverter(assemble_options=assemble_options) + + start_time = time.time() + + converted_docs = doc_converter.convert(input_files) + export_figures(converted_docs, output_dir=Path("./scratch")) + + end_time = time.time() - start_time + + _log.info(f"All documents were converted in {end_time:.2f} seconds.") + + +if __name__ == "__main__": + main() From 4b9aff5fc6a5015a13492c00d9344966ce1b4d55 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Mon, 12 Aug 2024 16:46:16 +0200 Subject: [PATCH 2/2] add options for different page elements, improve example and flip name of assemble_options Signed-off-by: Michele Dolfi --- docling/datamodel/base_models.py | 4 +- docling/document_converter.py | 2 +- examples/export_figures.py | 97 ++++++++++++++++++-------------- 3 files changed, 59 insertions(+), 44 deletions(-) diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 28207793..10086917 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -268,6 +268,6 @@ class PipelineOptions(BaseModel): class AssembleOptions(BaseModel): - remove_page_images: bool = ( - True # True: page images are removed in the assemble step + keep_page_images: bool = ( + False # False: page images are removed in the assemble step ) diff --git a/docling/document_converter.py b/docling/document_converter.py index fb6381f2..9954bc9b 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -188,7 +188,7 @@ def process_document(self, in_doc: InputDocument) -> ConvertedDocument: # Free up mem resources before moving on with next batch # Remove page images (can be disabled) - if self.assemble_options.remove_page_images: + if not self.assemble_options.keep_page_images: assembled_page.image = ( None # Comment this if you want to visualize page images ) diff --git a/examples/export_figures.py b/examples/export_figures.py index 80a567ec..6cd98430 100644 --- a/examples/export_figures.py +++ b/examples/export_figures.py @@ -1,15 +1,14 @@ -import json import logging import time from pathlib import Path -from typing import Iterable +from typing import Tuple from docling.datamodel.base_models import ( AssembleOptions, - BoundingBox, ConversionStatus, - CoordOrigin, - PipelineOptions, + FigureElement, + PageElement, + TableElement, ) from docling.datamodel.document import ConvertedDocument, DocumentConversionInput from docling.document_converter import DocumentConverter @@ -17,46 +16,43 @@ _log = logging.getLogger(__name__) -def export_figures( - converted_docs: Iterable[ConvertedDocument], +def export_page_images( + doc: ConvertedDocument, output_dir: Path, ): output_dir.mkdir(parents=True, exist_ok=True) - success_count = 0 - failure_count = 0 + doc_filename = doc.input.file.stem - for doc in converted_docs: - if doc.status == ConversionStatus.SUCCESS: - success_count += 1 - doc_filename = doc.input.file.stem - - for page in doc.pages: - page_no = page.page_no + 1 - page_image_filename = output_dir / f"{doc_filename}-{page_no}.png" - with page_image_filename.open("wb") as fp: - page.image.save(fp, format="PNG") - - for fig_ix, fig in enumerate(doc.output.figures): - page_no = fig.prov[0].page - page_ix = page_no - 1 - x0, y0, x1, y1 = fig.prov[0].bbox - crop_bbox = BoundingBox( - l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT - ).to_top_left_origin(page_height=doc.pages[page_ix].size.height) - - cropped_im = doc.pages[page_ix].image.crop(crop_bbox.as_tuple()) - fig_image_filename = output_dir / f"{doc_filename}-fig{fig_ix+1}.png" - with fig_image_filename.open("wb") as fp: - cropped_im.save(fp, "PNG") - - else: - _log.info(f"Document {doc.input.file} failed to convert.") - failure_count += 1 + for page in doc.pages: + page_no = page.page_no + 1 + page_image_filename = output_dir / f"{doc_filename}-{page_no}.png" + with page_image_filename.open("wb") as fp: + page.image.save(fp, format="PNG") + + +def export_element_images( + doc: ConvertedDocument, + output_dir: Path, + allowed_element_types: Tuple[PageElement] = (FigureElement,), +): + output_dir.mkdir(parents=True, exist_ok=True) + + doc_filename = doc.input.file.stem - _log.info( - f"Processed {success_count + failure_count} docs, of which {failure_count} failed" - ) + for element_ix, element in enumerate(doc.assembled.elements): + if isinstance(element, allowed_element_types): + page_ix = element.page_no + crop_bbox = element.cluster.bbox.to_top_left_origin( + page_height=doc.pages[page_ix].size.height + ) + + cropped_im = doc.pages[page_ix].image.crop(crop_bbox.as_tuple()) + element_image_filename = ( + output_dir / f"{doc_filename}-element-{element_ix}.png" + ) + with element_image_filename.open("wb") as fp: + cropped_im.save(fp, "PNG") def main(): @@ -68,15 +64,34 @@ def main(): input_files = DocumentConversionInput.from_paths(input_doc_paths) + # Important: For operating with page images, we must keep them, otherwise the DocumentConverter + # will destroy them for cleaning up memory. assemble_options = AssembleOptions() - assemble_options.remove_page_images = False + assemble_options.keep_page_images = True doc_converter = DocumentConverter(assemble_options=assemble_options) start_time = time.time() converted_docs = doc_converter.convert(input_files) - export_figures(converted_docs, output_dir=Path("./scratch")) + + for doc in converted_docs: + if doc.status != ConversionStatus.SUCCESS: + _log.info(f"Document {doc.input.file} failed to convert.") + continue + + # Export page images + export_page_images(doc, output_dir=Path("./scratch")) + + # Export figures + # export_element_images(doc, output_dir=Path("./scratch"), allowed_element_types=(FigureElement,)) + + # Export figures and tables + export_element_images( + doc, + output_dir=Path("./scratch"), + allowed_element_types=(FigureElement, TableElement), + ) end_time = time.time() - start_time