Skip to content

Commit

Permalink
Implement --pdfa-image-compression to control Ghostscript’s compression
Browse files Browse the repository at this point in the history
Fixes #163
  • Loading branch information
James R. Barlow committed May 9, 2017
1 parent 63a4a76 commit 01a1c2b
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 15 deletions.
18 changes: 18 additions & 0 deletions ocrmypdf/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,18 @@ def complain(message):
'--rotate-pages-threshold', default=14.0, type=float, metavar='CONFIDENCE',
help="only rotate pages when confidence is above this value (arbitrary "
"units reported by tesseract)")
advanced.add_argument(
'--pdfa-image-compression', choices=['auto', 'jpeg', 'lossless'],
default='auto',
help="specify how to compress images in the output PDF/A. 'auto' lets "
"OCRmyPDF decide. 'jpeg' changes all grayscale and color images to "
"JPEG compression. 'lossless' uses PNG-style lossless compression "
"for all images. Monochrome images are always compressed using a "
"lossless codec. In 'auto' mode OCRmyPDF lets Ghostscript's auto "
"filter decide how to compress the image. Compression settings "
"are applied to all pages, including those for which OCR was "
"skipped. Not supported for --output-type=pdf ; that setting "
"preserves the original compression of all images.")

debugging = parser.add_argument_group(
"Debugging",
Expand Down Expand Up @@ -346,6 +358,12 @@ def check_options_advanced(options, log):
raise MissingDependencyError(
"--pdf-renderer tess4 requires Tesseract 4.x "
"commit 3d9fb3b or later")
if options.pdfa_image_compression != 'auto' and \
options.output_type != 'pdfa':
log.warning(
"--pdfa-image-compression argument has no effect when "
"--output-type is not 'pdfa'"
)


def check_options_metadata(options, log):
Expand Down
30 changes: 24 additions & 6 deletions ocrmypdf/exec/ghostscript.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,28 @@ def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log,
raise SubprocessOutputError()


def generate_pdfa(pdf_pages, output_file, log, threads=1):
def generate_pdfa(pdf_pages, output_file, compression, log, threads=1):
compression_args = []
if compression == 'jpeg':
compression_args = [
"-dAutoFilterColorImages=false",
"-dColorImageFilter=/DCTEncode",
"-dAutoFilterGrayImages=false",
"-dGrayImageFilter=/DCTEncode",
]
elif compression == 'lossless':
compression_args = [
"-dAutoFilterColorImages=false",
"-dColorImageFilter=/FlateEncode",
"-dAutoFilterGrayImages=false",
"-dGrayImageFilter=/FlateEncode",
]
else:
compression_args = [
"-dAutoFilterColorImages=true",
"-dAutoFilterGrayImages=true",
]

with NamedTemporaryFile(delete=True) as gs_pdf:
args_gs = [
get_program("gs"),
Expand All @@ -76,11 +97,8 @@ def generate_pdfa(pdf_pages, output_file, log, threads=1):
"-sDEVICE=pdfwrite",
"-dAutoRotatePages=/None",
"-sColorConversionStrategy=/RGB",
"-sProcessColorModel=DeviceRGB",
"-dAutoFilterColorImages=false",
"-sColorImageFilter=",
"-dAutoFilterGrayImages=false",
"-sGrayImageFilter=",
"-sProcessColorModel=DeviceRGB"
] + compression_args + [
"-dJPEGQ=95",
"-dPDFA=2",
"-dPDFACompatibilityPolicy=1",
Expand Down
6 changes: 4 additions & 2 deletions ocrmypdf/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ def select_visible_page_image(
if pageinfo['images'] and \
all(im['enc'] == 'jpeg' for im in pageinfo['images']):
log.debug('{:4d}: JPEG input -> JPEG output'.format(
page_number(page_pdf)))
page_number(image)))
# If all images were JPEGs originally, produce a JPEG as output
im = Image.open(image)

Expand Down Expand Up @@ -807,7 +807,9 @@ def input_file_order(s):

pdf_pages = sorted(input_files, key=input_file_order)
log.debug("Final pages: " + "\n".join(pdf_pages))
ghostscript.generate_pdfa(pdf_pages, output_file, log, options.jobs or 1)
ghostscript.generate_pdfa(
pdf_pages, output_file, options.pdfa_image_compression,
log, options.jobs or 1)


def merge_pages_qpdf(
Expand Down
67 changes: 60 additions & 7 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -885,9 +885,10 @@ def test_no_contents(spoof_tesseract_noop, resources, outpdf):
@pytest.mark.parametrize('image', [
'baiona.png',
'baiona_gray.png',
'congress.jpg'
])
def test_lossless_to_lossless(spoof_tesseract_noop, ocrmypdf_exec,
resources, image, outpdf):
def test_compression_preserved(spoof_tesseract_noop, ocrmypdf_exec,
resources, image, outpdf):
from PIL import Image

input_file = str(resources / image)
Expand All @@ -897,7 +898,8 @@ def test_lossless_to_lossless(spoof_tesseract_noop, ocrmypdf_exec,

# Runs: ocrmypdf - output.pdf < testfile
with open(input_file, 'rb') as input_stream:
p_args = ocrmypdf_exec + ['--image-dpi', '150', '-', output_file]
p_args = ocrmypdf_exec + [
'--image-dpi', '150', '--output-type', 'pdf', '-', output_file]
p = Popen(
p_args, close_fds=True, stdout=PIPE, stderr=PIPE,
stdin=input_stream, env=spoof_tesseract_noop)
Expand All @@ -906,11 +908,62 @@ def test_lossless_to_lossless(spoof_tesseract_noop, ocrmypdf_exec,
assert p.returncode == ExitCode.ok

pdfinfo = pdf_get_all_pageinfo(output_file)
assert pdfinfo[0]['images'][0]['enc'] != 'jpeg', \
"Lossless compression changed to lossy!"

pdfimage = pdfinfo[0]['images'][0]

if input_file.endswith('.png'):
assert pdfimage['enc'] != 'jpeg', \
"Lossless compression changed to lossy!"
elif input_file.endswith('.jpg'):
assert pdfimage['enc'] == 'jpeg', \
"Lossy compression changed to lossless!"
if im.mode.startswith('RGB') or im.mode.startswith('BGR'):
assert pdfimage['color'] == 'rgb', \
"Colorspace changed"
elif im.mode.startswith('L'):
assert pdfimage['color'] == 'gray', \
"Colorspace changed"


@pytest.mark.parametrize('image,compression', [
('baiona.png', 'jpeg'),
('baiona_gray.png', 'lossless'),
('congress.jpg', 'lossless')
])
def test_compression_changed(spoof_tesseract_noop, ocrmypdf_exec,
resources, image, compression, outpdf):
from PIL import Image

input_file = str(resources / image)
output_file = str(outpdf)

im = Image.open(input_file)

# Runs: ocrmypdf - output.pdf < testfile
with open(input_file, 'rb') as input_stream:
p_args = ocrmypdf_exec + [
'--image-dpi', '150', '--output-type', 'pdfa',
'--pdfa-image-compression', compression,
'-', output_file]
p = Popen(
p_args, close_fds=True, stdout=PIPE, stderr=PIPE,
stdin=input_stream, env=spoof_tesseract_noop)
out, err = p.communicate()

assert p.returncode == ExitCode.ok

pdfinfo = pdf_get_all_pageinfo(output_file)

pdfimage = pdfinfo[0]['images'][0]

if compression == 'jpeg':
assert pdfimage['enc'] == 'jpeg'
elif compression == 'lossless':
assert pdfimage['enc'] == 'image'

if im.mode.startswith('RGB') or im.mode.startswith('BGR'):
assert pdfinfo[0]['images'][0]['color'] == 'rgb', \
assert pdfimage['color'] == 'rgb', \
"Colorspace changed"
elif im.mode.startswith('L'):
assert pdfinfo[0]['images'][0]['color'] == 'gray', \
assert pdfimage['color'] == 'gray', \
"Colorspace changed"

0 comments on commit 01a1c2b

Please sign in to comment.