diff --git a/ocrmypdf/exec/ghostscript.py b/ocrmypdf/exec/ghostscript.py index eb8eb4517..8c87186a1 100644 --- a/ocrmypdf/exec/ghostscript.py +++ b/ocrmypdf/exec/ghostscript.py @@ -77,6 +77,10 @@ def generate_pdfa(pdf_pages, output_file, log, threads=1): "-dAutoRotatePages=/None", "-sColorConversionStrategy=/RGB", "-sProcessColorModel=DeviceRGB", + "-dAutoFilterColorImages=false", + "-sColorImageFilter=", + "-dAutoFilterGrayImages=false", + "-sGrayImageFilter=", "-dJPEGQ=95", "-dPDFA=2", "-dPDFACompatibilityPolicy=1", diff --git a/ocrmypdf/pipeline.py b/ocrmypdf/pipeline.py index 6303b9104..1ad88dd92 100644 --- a/ocrmypdf/pipeline.py +++ b/ocrmypdf/pipeline.py @@ -520,6 +520,8 @@ def select_visible_page_image( pageinfo = get_pageinfo(image, context) if pageinfo['images'] and \ all(im['enc'] == 'jpeg' for im in pageinfo['images']): + log.debug('{:4d}: JPEG input -> JPEG output'.format( + page_number(page_pdf))) # If all images were JPEGs originally, produce a JPEG as output im = Image.open(image) diff --git a/tests/resources/README.rst b/tests/resources/README.rst index 12e057005..29a803e85 100644 --- a/tests/resources/README.rst +++ b/tests/resources/README.rst @@ -34,6 +34,9 @@ In some cases they were converted from one image format to another without other * - typewriter.png, 2400dpi.pdf - `Wikimedia: Triumph typewrtier text Linzensoep`_ * Creative Commons BY-SA 2.5 + * - baiona.png + - `Wikimedia: Baionako udalerri mugakideak`_ + - Creative Commons BY-SA 4.0 Files generated for this project @@ -118,4 +121,6 @@ These test resources are assemblies from other previously mentioned files, relea .. _`Linux (Wikipedia Article)`: https://de.wikipedia.org/wiki/Linux -.. _`Wikimedia: Triumph typewrtier text Linzensoep`: https://commons.wikimedia.org/wiki/File:Triumph.typewriter_text_Linzensoep.gif \ No newline at end of file +.. _`Wikimedia: Triumph typewrtier text Linzensoep`: https://commons.wikimedia.org/wiki/File:Triumph.typewriter_text_Linzensoep.gif + +.. _`Wikimedia: Baionako udalerri mugakideak`: https://commons.wikimedia.org/wiki/File:Baionako_udalerri_mugakideak.png \ No newline at end of file diff --git a/tests/resources/baiona.png b/tests/resources/baiona.png new file mode 100644 index 000000000..02318fc0b Binary files /dev/null and b/tests/resources/baiona.png differ diff --git a/tests/resources/baiona_gray.png b/tests/resources/baiona_gray.png new file mode 100644 index 000000000..0aed02044 Binary files /dev/null and b/tests/resources/baiona_gray.png differ diff --git a/tests/test_main.py b/tests/test_main.py index 0c8daf9d5..3cc873350 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -874,4 +874,38 @@ def test_gs_raster_failure(spoof_no_tess_gs_raster_fail, resources, outpdf): def test_no_contents(spoof_tesseract_noop, resources, outpdf): check_ocrmypdf(resources / 'no_contents.pdf', outpdf, '--force-ocr', - env=spoof_tesseract_noop) \ No newline at end of file + env=spoof_tesseract_noop) + + +@pytest.mark.parametrize('image', [ + 'baiona.png', + 'baiona_gray.png', + ]) +def test_lossless_to_lossless(spoof_tesseract_noop, ocrmypdf_exec, + resources, image, outpdf): + from PIL import Image + + input_file = str(resources / image) + output_file = str(outpdf) + + im = Image.open(input_file) + + # Runs: ocrmypdf - output.pdf < testfile + with open(input_file, 'rb') as input_stream: + p_args = ocrmypdf_exec + ['--image-dpi', '150', '-', output_file] + p = Popen( + p_args, close_fds=True, stdout=PIPE, stderr=PIPE, + stdin=input_stream, env=spoof_tesseract_noop) + out, err = p.communicate() + + assert p.returncode == ExitCode.ok + + pdfinfo = pdf_get_all_pageinfo(output_file) + assert pdfinfo[0]['images'][0]['enc'] != 'jpeg', \ + "Lossless compression changed to lossy!" + if im.mode.startswith('RGB') or im.mode.startswith('BGR'): + assert pdfinfo[0]['images'][0]['color'] == 'rgb', \ + "Colorspace changed" + elif im.mode.startswith('L'): + assert pdfinfo[0]['images'][0]['color'] == 'gray', \ + "Colorspace changed" \ No newline at end of file