Skip to content

Commit

Permalink
feat: expose scale for export of page images and document elements
Browse files Browse the repository at this point in the history
Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm committed Aug 20, 2024
1 parent 35e90b6 commit 909e883
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 57 deletions.
28 changes: 22 additions & 6 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import copy
import warnings
from enum import Enum, auto
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union

from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, model_validator
from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self

from docling.backend.abstract_backend import PdfPageBackend

Expand Down Expand Up @@ -243,6 +245,7 @@ class Page(BaseModel):
_backend: Optional[PdfPageBackend] = (
None # Internal PDF backend. By default it is cleared during assembling.
)
_default_image_scale: float = 1.0 # Default image scale for external usage.
_image_cache: Dict[float, Image] = (
{}
) # Cache of images in different scales. By default it is cleared during assembling.
Expand All @@ -256,7 +259,7 @@ def get_image(self, scale: float = 1.0) -> Optional[Image]:

@property
def image(self) -> Optional[Image]:
    """Return the page image at the page's default image scale.

    Delegates to ``get_image`` using ``_default_image_scale``, so callers
    get the image at the externally configured scale rather than always at
    the ``get_image`` default.
    """
    return self.get_image(scale=self._default_image_scale)


class DocumentStream(BaseModel):
Expand All @@ -283,6 +286,19 @@ class PipelineOptions(BaseModel):


class AssembleOptions(BaseModel):
    """Options for the assemble step, including whether page images are kept."""

    # Deprecated switch; when True and no explicit `images_scale` is given,
    # the validator below turns it into `images_scale = 1.0`.
    keep_page_images: Annotated[
        bool,
        Field(
            deprecated="`keep_page_images` is deprecated, set the value of `images_scale` instead"
        ),
    ] = False  # False: page images are removed in the assemble step
    images_scale: Optional[float] = None  # if set, the scale for generated images

    @model_validator(mode="after")
    def set_page_images_from_deprecated(self) -> Self:
        """Map the deprecated ``keep_page_images`` flag onto ``images_scale``.

        Returns:
            The validated instance; ``images_scale`` is set to ``1.0`` when
            ``keep_page_images`` is true and no scale was given explicitly.
        """
        default_scale = 1.0
        with warnings.catch_warnings():
            # Reading the deprecated field here must not emit a
            # DeprecationWarning of its own.
            warnings.simplefilter("ignore", DeprecationWarning)
            if self.keep_page_images and self.images_scale is None:
                self.images_scale = default_scale
        return self
17 changes: 16 additions & 1 deletion docling/datamodel/document.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
from io import BytesIO
from pathlib import Path, PurePath
from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union

from docling_core.types import BaseCell, BaseText
from docling_core.types import BoundingBox as DsBoundingBox
Expand All @@ -21,6 +21,7 @@
DocumentStream,
FigureElement,
Page,
PageElement,
TableElement,
TextElement,
)
Expand Down Expand Up @@ -302,6 +303,20 @@ def render_as_markdown(self):
else:
return ""

def render_element_images(
    self, element_types: Tuple[Type[PageElement], ...] = (FigureElement,)
):
    """Yield ``(element, image)`` pairs for assembled elements of given types.

    Each element's bounding box is scaled by its page's default image scale,
    converted to a top-left origin and used to crop the page image.

    Args:
        element_types: Element classes to render; passed to ``isinstance``.

    Yields:
        Tuples of the matching element and its cropped image.

    NOTE(review): this presumably needs the page images to have been kept
    during conversion (``AssembleOptions.images_scale`` set); confirm what
    ``Page.image`` returns otherwise.
    """
    for element in self.assembled.elements:
        if not isinstance(element, element_types):
            continue

        page = self.pages[element.page_no]
        scale = page._default_image_scale
        # The page image is rendered at `scale`, so the crop box and the
        # page height used for the origin flip are scaled the same way.
        crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
            page_height=page.size.height * scale
        )

        cropped_im = page.image.crop(crop_bbox.as_tuple())
        yield element, cropped_im


class DocumentConversionInput(BaseModel):

Expand Down
12 changes: 10 additions & 2 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
# Free up mem resources before moving on with next batch

# Remove page images (can be disabled)
if not self.assemble_options.keep_page_images:
if self.assemble_options.images_scale is None:
assembled_page._image_cache = {}

# Unload backend
Expand Down Expand Up @@ -229,7 +229,15 @@ def initialize_page(self, doc: InputDocument, page: Page) -> Page:

# Generate the page image and store it in the page object
def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
    """Render the page image(s) and store them on the page object.

    The default scale (1.0) is always rendered. If
    ``assemble_options.images_scale`` is set, it becomes the page's default
    image scale and an image at that scale is rendered as well.

    Args:
        doc: The input document the page belongs to (not used here).
        page: The page to populate.

    Returns:
        The same page object.
    """
    # default scale; get_image stores the image in the page's internal cache
    page.get_image(scale=1.0)

    # user requested scale
    requested_scale = self.assemble_options.images_scale
    if requested_scale is not None:
        page._default_image_scale = requested_scale
        page.get_image(scale=requested_scale)

    return page

Expand Down
69 changes: 21 additions & 48 deletions examples/export_figures.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,44 +15,7 @@

_log = logging.getLogger(__name__)


def export_page_images(
    doc: ConvertedDocument,
    output_dir: Path,
):
    """Save every page image of ``doc`` as a PNG file in ``output_dir``.

    The directory is created if missing. Files are named
    ``<input file stem>-<page number>.png`` with one-based page numbers.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    stem = doc.input.file.stem

    for page in doc.pages:
        # page.page_no is shifted by one so file names start at 1
        target = output_dir / f"{stem}-{page.page_no + 1}.png"
        with target.open("wb") as fp:
            page.image.save(fp, format="PNG")


def export_element_images(
    doc: ConvertedDocument,
    output_dir: Path,
    allowed_element_types: Tuple[PageElement] = (FigureElement,),
):
    """Crop matching elements out of their page images and save them as PNGs.

    The directory is created if missing. Each assembled element that is an
    instance of ``allowed_element_types`` is written to
    ``<input file stem>-element-<index>.png``, where the index is the
    element's position among all assembled elements.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    stem = doc.input.file.stem

    for index, item in enumerate(doc.assembled.elements):
        if not isinstance(item, allowed_element_types):
            continue

        page = doc.pages[item.page_no]
        # Convert the cluster box to a top-left origin before cropping
        box = item.cluster.bbox.to_top_left_origin(page_height=page.size.height)
        crop = page.image.crop(box.as_tuple())

        target = output_dir / f"{stem}-element-{index}.png"
        with target.open("wb") as fp:
            crop.save(fp, "PNG")
# Scale factor for exported images; main() assigns it to AssembleOptions.images_scale.
IMAGE_RESOLUTION_SCALE = 2.0


def main():
Expand All @@ -61,37 +24,47 @@ def main():
input_doc_paths = [
Path("./test/data/2206.01062.pdf"),
]
output_dir = Path("./scratch")

input_files = DocumentConversionInput.from_paths(input_doc_paths)

# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
# scale=1 corresponds to a standard 72 DPI image
assemble_options = AssembleOptions()
assemble_options.keep_page_images = True
assemble_options.images_scale = IMAGE_RESOLUTION_SCALE

doc_converter = DocumentConverter(assemble_options=assemble_options)

start_time = time.time()

converted_docs = doc_converter.convert(input_files)

output_dir.mkdir(parents=True, exist_ok=True)
for doc in converted_docs:
if doc.status != ConversionStatus.SUCCESS:
_log.info(f"Document {doc.input.file} failed to convert.")
continue

# Export page images
export_page_images(doc, output_dir=Path("./scratch"))
doc_filename = doc.input.file.stem

# Export figures
# export_element_images(doc, output_dir=Path("./scratch"), allowed_element_types=(FigureElement,))
# Export page images
for page in doc.pages:
page_no = page.page_no + 1
page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
with page_image_filename.open("wb") as fp:
page.image.save(fp, format="PNG")

# Export figures and tables
export_element_images(
doc,
output_dir=Path("./scratch"),
allowed_element_types=(FigureElement, TableElement),
)
for element, image in doc.render_element_images(
element_types=(FigureElement, TableElement)
):
element_image_filename = (
output_dir / f"{doc_filename}-element-{element.id}.png"
)
with element_image_filename.open("wb") as fp:
image.save(fp, "PNG")

end_time = time.time() - start_time

Expand Down

0 comments on commit 909e883

Please sign in to comment.