Skip to content

Commit

Permalink
docs: update custom convert and dockerfile (#226)
Browse files (browse the repository at this point in the history)
* docs: remove old code from custom_convert.py

Signed-off-by: Michele Dolfi <[email protected]>

* docs: update example Dockerfile

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm authored Nov 4, 2024
1 parent 41acaa9 commit 5f5fea9
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 30 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number | Diff line number | Diff line change
Expand Up @@ -17,7 +17,7 @@ ENV TORCH_HOME=/tmp/
COPY examples/minimal.py /root/minimal.py

RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);'

# On container environments, always set a thread budget to avoid undesired thread congestion.
ENV OMP_NUM_THREADS=4
Expand Down
70 changes: 41 additions & 29 deletions docs/examples/custom_convert.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,9 +3,12 @@
import time
from pathlib import Path

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
from docling.models.tesseract_ocr_model import TesseractOcrOptions

_log = logging.getLogger(__name__)

Expand All @@ -23,32 +26,51 @@ def main():

# PyPdfium without EasyOCR
# --------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=False
# pipeline_options.do_table_structure=True
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = False
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = False

# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=PyPdfiumDocumentBackend,
# format_options={
# InputFormat.PDF: PdfFormatOption(
# pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
# )
# }
# )

# PyPdfium with EasyOCR
# -----------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True
# pipeline_options.do_table_structure=True
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True

# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=PyPdfiumDocumentBackend,
# format_options={
# InputFormat.PDF: PdfFormatOption(
# pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
# )
# }
# )

# Docling Parse without EasyOCR
# -------------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = False
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True

# doc_converter = DocumentConverter(
# format_options={
# InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
# }
# )

# Docling Parse with EasyOCR
# ----------------------
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

Expand All @@ -58,42 +80,32 @@ def main():
}
)

# Docling Parse with EasyOCR
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = True

# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=DoclingParseDocumentBackend,
# )

# Docling Parse with Tesseract
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = TesseractOcrOptions()

# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=DoclingParseDocumentBackend,
# format_options={
# InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
# }
# )

# Docling Parse with Tesseract CLI
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = TesseractCliOcrOptions()

# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=DoclingParseDocumentBackend,
# format_options={
# InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
# }
# )

###########################################################################
Expand Down

0 comments on commit 5f5fea9

Please sign in to comment.