Create OCRmyPDF_DjVu_Optimize

rmast · Jun 23, 2022 · 7c9adf9 · 7c9adf9
1 parent ff296b1
commit 7c9adf9
Showing 1 changed file with 313 additions and 0 deletions.
diff --git a/OCRmyPDF_DjVu_Optimize b/OCRmyPDF_DjVu_Optimize
@@ -0,0 +1,313 @@
+#!/usr/bin/python3
+# OCRmyPDF optimization plugin by Robert Mast, inspired by James R. Barlow: github.com/jbarlow83, Merlijn Wajer <[email protected]>
+# enabling OCRmyPDF to optimize PDFs as well as the open-source parts of DjVu can.
+# As some DjVu software patents have expired, there might even be additional room for improvement for anyone who is able to understand them deeply.
+
+"""Built-in plugin to implement PDF page optimization."""
+
+import argparse
+import logging
+from pathlib import Path
+from typing import (
+    Callable,
+    Dict,
+    Iterator,
+    List,
+    MutableSet,
+    NamedTuple,
+    NewType,
+    Optional,
+    Sequence,
+    Tuple,
+)
+from ocrmypdf import PdfContext, hookimpl
+from ocrmypdf._concurrent import Executor, SerialExecutor
+from ocrmypdf._exec import jbig2enc, pngquant
+from ocrmypdf._pipeline import get_pdf_save_settings
+from ocrmypdf.cli import numeric
+from ocrmypdf.optimize import optimize
+from ocrmypdf.subprocess import check_external_program
+from subprocess import check_call, check_output
+import sys
+import os.path
+import tempfile
+import shutil
+import threading
+from collections import defaultdict
+from os import fspath
+from zlib import compress
+
+import img2pdf
+from pikepdf import (
+    Dictionary,
+    Name,
+    Object,
+    ObjectStreamMode,
+    Pdf,
+    PdfError,
+    PdfImage,
+    Stream,
+    UnsupportedImageTypeError,
+)
+from PIL import Image
+
+from ocrmypdf._jobcontext import PdfContext
+from ocrmypdf.exceptions import OutputFileAccessError
+from ocrmypdf.helpers import IMG2PDF_KWARGS, safe_symlink
+#from subprocess import check_call, check_output
+
+import os
+import io
+
+import fitz
+
+from hocr.parse import hocr_page_iterator, hocr_page_to_word_data
+from internetarchivepdf.const import DENOISE_FAST, JPEG2000_IMPL_KAKADU, \
+    JPEG2000_IMPL_PILLOW, COMPRESSOR_JPEG, COMPRESSOR_JPEG2000
+from internetarchivepdf.mrc import create_mrc_hocr_components, encode_mrc_images
+
+# TODO:
+# - Deal with arbitrary rotation and matrix sizes when placing the image back
+# - Decide if we want to ignore transparent images altogether
+# - Give black/white images (1bit images) special treatment
+# - Stabilise and clean up the code, the many clean_contents
+# - 
+
+
+#from pymupdf examples
+def remove_images(doc, page, unwanted):
+    un_list = [b"/%s Do" % u.encode() for u in unwanted]
+    #page.clean_contents()  # unify / format the commands
+    xref=page.get_contents()[0]  # get its XREF
+    cont=page.read_contents().splitlines()  # read commands as list of lines
+    for i in range(len(cont)):  # walk thru the lines
+        if cont[i] in un_list:  # invokes an unwanted image
+            cont[i] = b""  # remove command
+    doc.update_stream(xref, b"\n".join(cont))  # replace cleaned command object
+    #page.clean_contents()  # removes now unreferenced images from page definition
+
+
+def compress_page_images(doc, page, hocr_word_data=[]):
+    page.clean_contents()
+    imgs = page.get_images(full=True)
+
+    to_remove_xrefs = []
+    to_insert = []
+
+    for img_data in imgs:
+        xref = img_data[0]
+        #print(img_data)
+        orig_img = doc.extract_image(xref)
+        to_remove_xrefs.append(xref)
+        bbox = page.get_image_bbox(img_data)
+        #print(bbox)
+
+        imgfd = io.BytesIO()
+        imgfd.write(orig_img["image"])
+        pil_image = Image.open(imgfd)
+        pil_image.load()
+        # TODO: if greyscale or 1bit, treat differently
+        # TODO: force 1bit mode?
+        #print('image mode', pil_image.mode)
+        #print('image size', pil_image.size)
+
+        imgfd.close()
+
+        dpi = orig_img['xres']
+
+        try:
+            mrc_gen = create_mrc_hocr_components(pil_image, hocr_word_data,
+        #mrc_gen = create_mrc_hocr_components(pil_image, [],
+                denoise_mask=DENOISE_FAST,
+                bg_downsample=3
+                )
+        except:
+            print("An exception occurredRobert")
+
+        fg_slope = 44500
+        bg_slope = 44250
+        # with pillow
+        #mask_f, bg_f, bg_s, fg_f, fg_s = encode_mrc_images(mrc_gen,
+        #        jpeg2000_implementation=JPEG2000_IMPL_PILLOW,
+        #        bg_compression_flags=['quality_mode:"rates";quality_layers:[250]'],
+        #        #fg_compression_flags=['quality_mode:"rates";quality_layers:[300]'],
+        #        fg_compression_flags=[''],
+        #        )
+
+        # with jpegoptim
+        #mask_f, bg_f, bg_s, fg_f, fg_s = encode_mrc_images(mrc_gen,
+        #        mrc_image_format=COMPRESSOR_JPEG,
+        #        bg_compression_flags=['-S30'],
+        #        fg_compression_flags=['-S20'],
+        #        )
+
+        mask_f, bg_f, bg_s, fg_f, fg_s = encode_mrc_images(mrc_gen,
+                jpeg2000_implementation=JPEG2000_IMPL_KAKADU,
+                bg_compression_flags=['-slope', str(bg_slope)],
+                #fg_compression_flags=['-slope', str(fg_slope)],
+                fg_compression_flags=['-com','hoi'],
+                )
+
+        # TODO: maybe we can replace the existing image with the background image
+        # here
+        bg_contents = open(bg_f, 'rb').read()
+        fg_contents = open(fg_f, 'rb').read()
+        mask_contents = open(mask_f, 'rb').read()
+
+        os.remove(mask_f)
+        os.remove(bg_f)
+        os.remove(fg_f)
+
+        to_insert.append([
+            {'bbox': bbox, 'stream': bg_contents, 'mask': None, 'overlay': False},
+            {'bbox': bbox, 'stream': fg_contents, 'mask': mask_contents, 'overlay': True}
+        ])
+
+
+    page.clean_contents()
+    for xref in to_remove_xrefs:
+        imgs = page.get_images(full=True)
+        for img_data in imgs:
+            if img_data[0] == xref:
+                remove_images(doc, page, [img_data[7]])
+    page.clean_contents()
+
+    for insert in to_insert:
+        img1 = insert[0]
+        img2 = insert[1]
+        page.insert_image(img1['bbox'], stream=img1['stream'],
+                mask=img1['mask'], overlay=img1['overlay'], alpha=0)
+        page.insert_image(img2['bbox'], stream=img2['stream'],
+                mask=img2['mask'], overlay=img2['overlay'], alpha=0)
+        #page.clean_contents()
+
+    page.clean_contents()
+
+# Module-level logger named after this module.
+log = logging.getLogger(__name__)
+# When True, optimizeR forwards the stderr output of the external helper
+# programs to sys.stderr instead of discarding it.
+DEBUG = False
+
+@hookimpl
+def add_options(parser):
+    pass
+
+@hookimpl
+def check_options(options):
+    pass
+
+@hookimpl
+def optimize_pdf(
+    input_pdf: Path,
+    output_pdf: Path,
+    context: PdfContext,
+    executor: Executor,
+    linearize: bool,
+) -> Tuple[Path, Sequence[str]]:
+    save_settings = dict(
+        linearize=linearize,
+        **get_pdf_save_settings(context.options.output_type),
+    )
+    result_path = optimizeR(input_pdf, output_pdf, context, save_settings, executor)
+    messages = []
+    if context.options.optimize == 0:
+        messages.append("Optimization was disabled.")
+    else:
+        image_optimizers = {
+            'jbig2': jbig2enc.available(),
+            'pngquant': pngquant.available(),
+        }
+        for name, available in image_optimizers.items():
+            if not available:
+                messages.append(
+                    f"The optional dependency '{name}' was not found, so some image "
+                    f"optimizations could not be attempted."
+                )
+    return result_path, messages
+
+
+@hookimpl
+def is_optimization_enabled(context: PdfContext) -> bool:
+    return True
+
+# © 2018 James R. Barlow: github.com/jbarlow83
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""Post-processing image optimization of OCR PDFs."""
+
+
+
+# NOTE: rebinds the module-level `log` that is already assigned earlier in
+# this file; logging.getLogger(__name__) yields the same logger object.
+log = logging.getLogger(__name__)
+
+
+
+# Default value for the `executor` parameter of optimizeR.
+DEFAULT_EXECUTOR = SerialExecutor()
+
+
+def optimizeR(
+    input_file: Path,
+    output_file: Path,
+    context,
+    save_settings,
+    executor: Executor = DEFAULT_EXECUTOR,
+) -> Path:
+    options = context.options
+    log.info(
+        "In OptimizeR aanbeland"
+    )
+
+    tmpd = tempfile.mkdtemp()
+    pdfmeta = os.path.join(tmpd, 'pdfmeta.json')
+    pdfhocr = os.path.join(tmpd, 'pdfhocr.html')
+
+    if DEBUG:
+        stde = sys.stderr
+    else:
+        stde = open(os.devnull, 'wb')
+
+    out = check_output(['pdf-metadata-json', input_file], stderr=stde)
+    with open(pdfmeta, 'wb+') as fd:
+        fd.write(out)
+
+    out = check_output(['pdf-to-hocr', '-f', input_file, '-J', pdfmeta], stderr=stde)
+    with open(pdfhocr, 'wb+') as fd:
+        fd.write(out)
+
+    doc = fitz.open(input_file)
+    hocr_iter = pdfhocr
+    outfile = output_file
+
+    for page in doc:
+        hocr_page = next(hocr_iter)
+        hocr_word_data = hocr_page_to_word_data(hocr_page)
+
+        compress_page_images(doc, page, hocr_word_data=hocr_word_data)
+
+        page.clean_contents()
+
+    doc.save(outfile, deflate=True, pretty=True, garbage=2)
+
+    oldsize = os.path.getsize(input_file)
+    newsize = os.path.getsize(output_file)
+    compression_ratio  = oldsize / newsize
+    print('Compression factor:', compression_ratio, file=sys.stderr)
+
+    input_size = input_file.stat().st_size
+    output_size = output_file.stat().st_size
+    if output_size == 0:
+        raise OutputFileAccessError(
+            f"Output file not created after optimizing. We probably ran "
+            f"out of disk space in the temporary folder: {tempfile.gettempdir()}."
+        )
+    savings = 1 - output_size / input_size
+
+    if savings < 0:
+        log.info(
+            "Image optimization did not improve the file - "
+            "optimizations will not be used"
+        )
+        return input_file
+    else:
+        return output_file
+