Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow computing page images on-demand with scale and cache them #36

Merged
merged 3 commits into from
Aug 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion docling/backend/docling_parse_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,9 @@ def get_text_cells(self) -> Iterable[Cell]:
cell_counter += 1

def draw_clusters_and_cells():
image = self.get_page_image()
image = (
self.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
Expand Down
4 changes: 3 additions & 1 deletion docling/backend/pypdfium2_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,9 @@ def merge_group(group: List[Cell]) -> Cell:
return merged_cells

def draw_clusters_and_cells():
image = self.get_page_image()
image = (
self.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
Expand Down
51 changes: 41 additions & 10 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import copy
import warnings
from enum import Enum, auto
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union

from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, model_validator
from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self

from docling.backend.abstract_backend import PdfPageBackend

Expand Down Expand Up @@ -234,14 +236,30 @@ class Page(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)

page_no: int
page_hash: str = None
size: PageSize = None
image: Image = None
page_hash: Optional[str] = None
size: Optional[PageSize] = None
cells: List[Cell] = None
predictions: PagePredictions = PagePredictions()
assembled: AssembledUnit = None
assembled: Optional[AssembledUnit] = None

_backend: PdfPageBackend = None # Internal PDF backend
_backend: Optional[PdfPageBackend] = (
None # Internal PDF backend. By default it is cleared during assembling.
)
_default_image_scale: float = 1.0 # Default image scale for external usage.
_image_cache: Dict[float, Image] = (
{}
) # Cache of images in different scales. By default it is cleared during assembling.

def get_image(self, scale: float = 1.0) -> Optional[Image]:
    """Return the page image at ``scale``, rendering and caching it if needed.

    While a backend is attached, the first request for a given scale renders
    the image via ``self._backend.get_page_image(scale=scale)`` and stores it
    in ``self._image_cache``; later requests for the same scale return that
    stored object itself (not a copy). Without a backend, only the cache is
    consulted.

    Args:
        scale: Rendering scale; it is also the cache key.

    Returns:
        The image for ``scale``, or ``None`` when no backend is attached and
        nothing has been cached for that scale.
    """
    if self._backend is None:
        # Nothing can be rendered any more; serve only what is cached.
        return self._image_cache.get(scale)
    if scale not in self._image_cache:
        self._image_cache[scale] = self._backend.get_page_image(scale=scale)
    return self._image_cache[scale]

@property
def image(self) -> Optional[Image]:
    """Page image at the page's default scale.

    Shorthand for ``self.get_image(scale=self._default_image_scale)``.
    """
    default_scale = self._default_image_scale
    return self.get_image(scale=default_scale)


class DocumentStream(BaseModel):
Expand All @@ -268,6 +286,19 @@ class PipelineOptions(BaseModel):


class AssembleOptions(BaseModel):
    """Options for the assemble step."""

    # Deprecated switch, kept for backward compatibility.
    # False: page images are removed in the assemble step.
    keep_page_images: Annotated[
        bool,
        Field(
            deprecated="`keep_page_images` is deprecated, set the value of `images_scale` instead"
        ),
    ] = False
    # If set, the scale for generated images.
    images_scale: Optional[float] = None

    @model_validator(mode="after")
    def set_page_images_from_deprecated(self) -> Self:
        """Map the deprecated ``keep_page_images`` flag onto ``images_scale``.

        When ``keep_page_images`` is true and ``images_scale`` was not given,
        ``images_scale`` is set to 1.0. An explicitly provided
        ``images_scale`` is never overwritten.

        Returns:
            The validated model instance.
        """
        default_scale = 1.0
        with warnings.catch_warnings():
            # The deprecated field is read on purpose here; presumably that
            # read would otherwise emit a DeprecationWarning, so it is
            # silenced for this check only.
            warnings.simplefilter("ignore", DeprecationWarning)
            if self.keep_page_images and self.images_scale is None:
                self.images_scale = default_scale
        return self
17 changes: 16 additions & 1 deletion docling/datamodel/document.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
from io import BytesIO
from pathlib import Path, PurePath
from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union

from docling_core.types import BaseCell, BaseText
from docling_core.types import BoundingBox as DsBoundingBox
Expand All @@ -21,6 +21,7 @@
DocumentStream,
FigureElement,
Page,
PageElement,
TableElement,
TextElement,
)
Expand Down Expand Up @@ -302,6 +303,20 @@ def render_as_markdown(self):
else:
return ""

def render_element_images(
    self, element_types: Tuple[Type[PageElement], ...] = (FigureElement,)
):
    """Yield ``(element, cropped_image)`` for assembled elements of given types.

    For every element in ``self.assembled.elements`` that is an instance of
    one of ``element_types``, the element's cluster bounding box is scaled by
    the page's ``_default_image_scale``, converted to a top-left origin, and
    used to crop that page's ``image``.

    Args:
        element_types: Tuple of element classes to render; it is passed
            directly to ``isinstance``.

    Yields:
        Tuples of the matching element and the cropped page image.

    Note:
        ``page.image`` must not be ``None`` for the pages involved, since
        ``.crop`` is called on it directly.
    """
    for element in self.assembled.elements:
        if not isinstance(element, element_types):
            continue

        # element.page_no is used as an index into self.pages.
        page = self.pages[element.page_no]
        scale = page._default_image_scale
        # The bbox and the page height are both scaled so the crop region
        # matches the image at the page's default scale.
        crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
            page_height=page.size.height * scale
        )

        yield element, page.image.crop(crop_bbox.as_tuple())


class DocumentConversionInput(BaseModel):

Expand Down
18 changes: 12 additions & 6 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,10 +188,8 @@ def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
# Free up mem resources before moving on with next batch

# Remove page images (can be disabled)
if not self.assemble_options.keep_page_images:
assembled_page.image = (
None # Comment this if you want to visualize page images
)
if self.assemble_options.images_scale is None:
assembled_page._image_cache = {}

# Unload backend
assembled_page._backend.unload()
Expand Down Expand Up @@ -231,7 +229,15 @@ def initialize_page(self, doc: InputDocument, page: Page) -> Page:

# Generate the page image and store it in the page object
def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
    """Render the page image(s) so that they are held by ``page``.

    The image at scale 1.0 is always requested. If
    ``self.assemble_options.images_scale`` is set, it also becomes the page's
    ``_default_image_scale`` and the image at that scale is requested too.
    Images are obtained only through ``page.get_image``; ``page.image`` is
    never assigned.

    Args:
        doc: The input document (not used by this step).
        page: The page whose images are requested.

    Returns:
        The same ``page`` object.
    """
    # Default scale.
    page.get_image(scale=1.0)

    # User-requested scale, if any.
    images_scale = self.assemble_options.images_scale
    if images_scale is not None:
        page._default_image_scale = images_scale
        # Requesting the image triggers storing it in the internal cache.
        page.get_image(scale=images_scale)

    return page

Expand All @@ -247,7 +253,7 @@ def draw_text_boxes(image, cells):
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
image.show()

# draw_text_boxes(page.image, cells)
# draw_text_boxes(page.get_image(scale=1.0), cells)

return page

Expand Down
2 changes: 1 addition & 1 deletion docling/models/easyocr_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:

for page in page_batch:
# rects = page._fpage.
high_res_image = page._backend.get_page_image(scale=self.scale)
high_res_image = page.get_image(scale=self.scale)
im = numpy.array(high_res_image)
result = self.reader.readtext(im)

Expand Down
4 changes: 3 additions & 1 deletion docling/models/layout_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,9 @@ def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for page in page_batch:
clusters = []
for ix, pred_item in enumerate(self.layout_predictor.predict(page.image)):
for ix, pred_item in enumerate(
self.layout_predictor.predict(page.get_image(scale=1.0))
):
cluster = Cluster(
id=ix,
label=pred_item["label"],
Expand Down
12 changes: 4 additions & 8 deletions docling/models/table_structure_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ def __init__(self, config):
self.scale = 2.0 # Scale up table input images to 144 dpi

def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
image = page._backend.get_page_image()
image = (
page._backend.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)

for table_element in tbl_list:
Expand Down Expand Up @@ -94,13 +96,7 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
"width": page.size.width * self.scale,
"height": page.size.height * self.scale,
}
# add image to page input.
if self.scale == 1.0:
page_input["image"] = numpy.asarray(page.image)
else: # render new page image on the fly at desired scale
page_input["image"] = numpy.asarray(
page._backend.get_page_image(scale=self.scale)
)
page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))

table_clusters, table_bboxes = zip(*in_tables)

Expand Down
69 changes: 21 additions & 48 deletions examples/export_figures.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,44 +15,7 @@

_log = logging.getLogger(__name__)


def export_page_images(
    doc: ConvertedDocument,
    output_dir: Path,
):
    """Save every page image of ``doc`` as a PNG file inside ``output_dir``.

    The directory is created if missing. Files are named
    ``<input file stem>-<page_no + 1>.png``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    stem = doc.input.file.stem

    for pg in doc.pages:
        target = output_dir / f"{stem}-{pg.page_no + 1}.png"
        with target.open("wb") as out:
            pg.image.save(out, format="PNG")


def export_element_images(
    doc: ConvertedDocument,
    output_dir: Path,
    allowed_element_types: Tuple[PageElement] = (FigureElement,),
):
    """Crop assembled elements of the allowed types and save them as PNGs.

    Each matching element's cluster bounding box is converted to a top-left
    origin using the page height and used to crop the page image. Files are
    written to ``output_dir`` (created if missing) as
    ``<input file stem>-element-<index>.png``, where the index is the
    element's position in ``doc.assembled.elements``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    stem = doc.input.file.stem

    for idx, elem in enumerate(doc.assembled.elements):
        if not isinstance(elem, allowed_element_types):
            continue

        page = doc.pages[elem.page_no]
        region = elem.cluster.bbox.to_top_left_origin(
            page_height=page.size.height
        )
        crop = page.image.crop(region.as_tuple())

        target = output_dir / f"{stem}-element-{idx}.png"
        with target.open("wb") as out:
            crop.save(out, "PNG")
IMAGE_RESOLUTION_SCALE = 2.0


def main():
Expand All @@ -61,37 +24,47 @@ def main():
input_doc_paths = [
Path("./test/data/2206.01062.pdf"),
]
output_dir = Path("./scratch")

input_files = DocumentConversionInput.from_paths(input_doc_paths)

# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
# scale=1 corresponds to a standard 72 DPI image
assemble_options = AssembleOptions()
assemble_options.keep_page_images = True
assemble_options.images_scale = IMAGE_RESOLUTION_SCALE

doc_converter = DocumentConverter(assemble_options=assemble_options)

start_time = time.time()

converted_docs = doc_converter.convert(input_files)

output_dir.mkdir(parents=True, exist_ok=True)
for doc in converted_docs:
if doc.status != ConversionStatus.SUCCESS:
_log.info(f"Document {doc.input.file} failed to convert.")
continue

# Export page images
export_page_images(doc, output_dir=Path("./scratch"))
doc_filename = doc.input.file.stem

# Export figures
# export_element_images(doc, output_dir=Path("./scratch"), allowed_element_types=(FigureElement,))
# Export page images
for page in doc.pages:
page_no = page.page_no + 1
page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
with page_image_filename.open("wb") as fp:
page.image.save(fp, format="PNG")

# Export figures and tables
export_element_images(
doc,
output_dir=Path("./scratch"),
allowed_element_types=(FigureElement, TableElement),
)
for element, image in doc.render_element_images(
element_types=(FigureElement, TableElement)
):
element_image_filename = (
output_dir / f"{doc_filename}-element-{element.id}.png"
)
with element_image_filename.open("wb") as fp:
image.save(fp, "PNG")

end_time = time.time() - start_time

Expand Down
Loading