Skip to content

Commit

Permalink
imports nog niet goed
Browse files Browse the repository at this point in the history
  • Loading branch information
rmast committed Jun 24, 2022
1 parent 20d1a47 commit 129b657
Showing 1 changed file with 105 additions and 7 deletions.
112 changes: 105 additions & 7 deletions OCRmyPDF_DjVu_Optimize
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,24 @@

"""Built-in plugin to implement PDF page optimization."""

import argparse
import itertools
import logging
import os
import sys


from .didjvu.lib import djvu_support as djvu
from .didjvu.lib import filetype
from .didjvu.lib import fs
from .didjvu.lib import gamera_support as gamera
from .didjvu.lib import ipc
from .didjvu.lib import templates
from .didjvu.lib import temporary
from .didjvu.lib import utils
from .didjvu.lib import xmp
from .didjvu.lib import didjvu

import argparse
from pathlib import Path
from typing import (
Callable,
Expand All @@ -28,7 +44,6 @@ from ocrmypdf.cli import numeric
from ocrmypdf.optimize import optimize
from ocrmypdf.subprocess import check_external_program
from subprocess import check_call, check_output
import sys
import os.path
import tempfile
import shutil
Expand All @@ -54,17 +69,16 @@ from PIL import Image
from ocrmypdf._jobcontext import PdfContext
from ocrmypdf.exceptions import OutputFileAccessError
from ocrmypdf.helpers import IMG2PDF_KWARGS, safe_symlink
#from subprocess import check_call, check_output

import os
import io

import fitz

from hocr.parse import hocr_page_iterator, hocr_page_to_word_data
from internetarchivepdf.const import DENOISE_FAST, JPEG2000_IMPL_KAKADU, \
JPEG2000_IMPL_PILLOW, COMPRESSOR_JPEG, COMPRESSOR_JPEG2000
from internetarchivepdf.mrc import create_mrc_hocr_components, encode_mrc_images
#from internetarchivepdf.mrc import create_mrc_hocr_components
from internetarchivepdf.mrc import encode_mrc_images

# TODO:
# - Deal with arbitrary rotation and matrix sizes when placing the image back
Expand All @@ -73,8 +87,6 @@ from internetarchivepdf.mrc import create_mrc_hocr_components, encode_mrc_images
# - Stabilise and clean up the code, the many clean_contents
# -


#from pymupdf examples
def remove_images(doc, page, unwanted):
un_list = [b"/%s Do" % u.encode() for u in unwanted]
#page.clean_contents() # unify / format the commands
Expand Down Expand Up @@ -116,6 +128,8 @@ def compress_page_images(doc, page, hocr_word_data=[]):
dpi = orig_img['xres']

try:
djvu.require_cli()
gamera.init()
mrc_gen = create_mrc_hocr_components(pil_image, hocr_word_data,
#mrc_gen = create_mrc_hocr_components(pil_image, [],
denoise_mask=DENOISE_FAST,
Expand Down Expand Up @@ -311,3 +325,87 @@ def optimizeR(
else:
return output_file

# TODO: Reduce amount of memory active at one given point (keep less images in
# memory, write to disk sooner, etc), careful with numpy <-> PIL conversions
def create_mrc_hocr_components(image, hocr_word_data,
                               dpi=None,
                               downsample=None,
                               bg_downsample=None,
                               fg_downsample=None,
                               denoise_mask=None, timing_data=None,
                               errors=None):
    """
    Generate the MRC components of a page image: mask, foreground, background.

    This function is a generator, not a tuple-returning function: it yields
    three arrays one after another, in this order:

    1. the inverted mask (``np.invert`` of the 1bpp mask array),
    2. the foreground layer,
    3. the background layer.

    The mask is produced through the ``gamera`` support module (its
    ``'djvu'`` method); the foreground and background layers are produced
    with ``didjvu.make_layer`` and converted to PPM with ``djvu_to_ppm``
    before being loaded through ``Image.open``.

    Args:
    * image (PIL.Image): Image to be decomposed
    * hocr_word_data: OCR data about found text on the page
    * downsample (int): factor by which the OCR data is to be downsampled
    * bg_downsample (int): if the background image should be downscaled
    * denoise_mask (bool): Whether to denoise the image if it is deemed too
    noisy
    * timing_data: Optional timing data to log individual timing data to.
    * errors: Optional argument (of type set) with encountered runtime errors

    NOTE(review): apart from ``image``, none of the parameters above (nor
    ``dpi`` / ``fg_downsample``) is referenced in this body; they appear to be
    kept only so the signature stays call-compatible with existing callers.

    Yields the components one at a time (mask, foreground, background), each
    built with ``np.array`` / ``np.invert``.
    """

    # NOTE(review): width_ and height_ are not used anywhere below.
    width_, height_ = image.size

    # Convert the PIL image for gamera and run the method registered under
    # the 'djvu' key to obtain the mask.
    gamera_image = gamera._from_pil(image)
    mask2 = gamera.methods['djvu'](gamera_image)
    # NOTE(review): `_image_conversion` is not defined or imported in the
    # visible part of this file -- confirm where it is supposed to come from.
    mask3 = _image_conversion.to_greyscale(mask2)
    mask = gamera.to_pil_1bpp(mask3)
    mask_arr = np.array(mask)

    mask_inv = np.invert(mask_arr)

    yield mask_inv

    # NOTE(review): `subsample_fg` is not defined in the visible part of this
    # file -- confirm the intended subsampler and make_layer() arguments.
    fg_djvu = didjvu.make_layer(gamera_image, mask3, subsample_fg)
    fg_ppm = djvu_to_ppm(fg_djvu)
    foreground_arr = np.array(Image.open(fg_ppm))

    yield foreground_arr
    # Drop this function's reference to the foreground array before the
    # background layer is built.
    foreground_arr = None

    # NOTE(review): `subsample_bg` is not defined in the visible part of this
    # file either -- see the note on `subsample_fg` above.
    bg_djvu = didjvu.make_layer(gamera_image, mask3, subsample_bg)
    bg_ppm = djvu_to_ppm(bg_djvu)
    background_arr = np.array(Image.open(bg_ppm))

    yield background_arr
    # Explicit end of the generator; nothing further is produced.
    return

def djvu_to_ppm(djvu_file):
    """Start a ``ddjvu`` run that renders *djvu_file* to a temporary PPM file.

    A temporary file with the ``.ppm`` suffix is created and ``ddjvu`` is
    launched with ``-format=ppm`` to write into it; the child's stderr is
    sent to ``os.devnull``.

    Args:
        djvu_file: object exposing a ``name`` attribute holding the path of
            the DjVu file to convert.

    Returns:
        A ``utils.Proxy`` built from the temporary PPM file, the subprocess'
        ``wait`` method and a list holding *djvu_file*.
        (NOTE(review): presumably the proxy waits for ``ddjvu`` on first use
        and keeps *djvu_file* alive until then -- confirm against
        ``utils.Proxy``.)
    """
    # TODO: Use Multichunk.
    target = temporary.file(suffix='.ppm')
    command = ['ddjvu', '-format=ppm', djvu_file.name, target.name]
    with open(os.devnull, 'wb') as sink:
        # The process is started while the devnull handle is still open.
        converter = ipc.Subprocess(command, stderr=sink)
        return utils.Proxy(target, converter.wait, [djvu_file])

def main(infile, outfile):
    """Run ``optimizeR`` on *infile* and copy the result to *outfile*.

    The optimized PDF is first written to ``out.pdf`` inside a temporary
    directory and then copied to *outfile*; the temporary directory is
    removed when the ``with`` block exits.

    Args:
        infile: path of the input file (converted to ``Path``).
        outfile: path the optimized file is copied to.

    NOTE(review): ``options``, ``fspath`` and ``ObjectStreamMode`` are not
    defined in this function; they must be provided at module level --
    confirm, as that part of the file is not visible here.
    """
    from shutil import copy  # pylint: disable=import-outside-toplevel
    from tempfile import TemporaryDirectory  # pylint: disable=import-outside-toplevel

    source_pdf = Path(infile)

    with TemporaryDirectory() as workdir:
        pdf_context = PdfContext(options, workdir, source_pdf, None, None)
        staged_pdf = Path(workdir) / 'out.pdf'
        save_settings = {
            'compress_streams': True,
            'preserve_pdfa': True,
            'object_stream_mode': ObjectStreamMode.generate,
        }
        optimizeR(source_pdf, staged_pdf, pdf_context, save_settings)
        # Copy while the temporary directory still exists.
        copy(fspath(staged_pdf), fspath(outfile))


# Script entry point: the first command-line argument is passed to main() as
# the input file and the second as the output file.
if __name__ == '__main__':
    main(sys.argv[1], sys.argv[2])

0 comments on commit 129b657

Please sign in to comment.