Skip to content

Commit

Permalink
Merge branch 'main' of github.com:ocrmypdf/OCRmyPDF
Browse files Browse the repository at this point in the history
  • Loading branch information
jbarlow83 committed Jun 20, 2023
2 parents f80dd0d + 050dd1f commit cf33095
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 9 deletions.
37 changes: 28 additions & 9 deletions src/ocrmypdf/_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -852,24 +852,43 @@ def report_on_metadata(missing):

with pikepdf.open(context.origin) as original, pikepdf.open(working_file) as pdf:
docinfo = get_docinfo(original, context)
with pdf.open_metadata() as meta:
meta.load_from_docinfo(docinfo, delete_missing=False, raise_failure=False)
with pdf.open_metadata() as meta_pdf:
meta_pdf.load_from_docinfo(docinfo, delete_missing=False, raise_failure=False)
# If xmp:CreateDate is missing, set it to the modify date to
# match Ghostscript, for consistency
if 'xmp:CreateDate' not in meta:
meta['xmp:CreateDate'] = meta.get('xmp:ModifyDate', '')
# ensure consistency with Ghostscript.
if 'xmp:CreateDate' not in meta_pdf:
meta_pdf['xmp:CreateDate'] = meta_pdf.get('xmp:ModifyDate', '')

with original.open_metadata(
set_pikepdf_as_editor=False, update_docinfo=False, strict=False
) as meta_original:
if meta.get('dc:title') == 'Untitled':
if meta_pdf.get('dc:title') == 'Untitled':
# Ghostscript likes to set title to Untitled if omitted from input.
# Reverse this, because PDF/A TechNote 0003:Metadata in PDF/A-1
# and the XMP Spec do not make this recommendation.
if 'dc:title' not in meta_original:
del meta['dc:title']
missing = set(meta_original.keys()) - set(meta.keys())
report_on_metadata(missing)
del meta_pdf['dc:title']
# If the user explicitly specified an empty string for any of the
# following, they should be unset and not reported as missing in
# the output pdf. Note that some metadata fields use differing names
# between PDF-A and PDF.
for meta in [meta_pdf, meta_original]:
if options.title == '' and 'dc:title' in meta:
del meta['dc:title'] # PDF-A and PDF
if options.author == '':
if 'dc:creator' in meta:
del meta['dc:creator'] # PDF-A (Not xmp:CreatorTool)
if 'pdf:Author' in meta:
del meta['pdf:Author'] # PDF
if options.subject == '':
if 'dc:description' in meta:
del meta['dc:description'] # PDF-A
if 'dc:subject' in meta:
del meta['dc:subject'] # PDF
if options.keywords == '' and 'pdf:Keywords' in meta:
del meta['pdf:Keywords'] # PDF-A and PDF
meta_missing = set(meta_original.keys()) - set(meta_pdf.keys())
report_on_metadata(meta_missing)

optimizing = context.plugin_manager.hook.is_optimization_enabled(
context=context
Expand Down
Binary file added tests/resources/meta.pdf
Binary file not shown.
44 changes: 44 additions & 0 deletions tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,50 @@ def test_override_metadata(output_type, resources, outpdf):
assert pdfa_info['output'] == output_type


@pytest.mark.parametrize('output_type', ['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'])
@pytest.mark.parametrize('field', ['title', 'author', 'subject', 'keywords'])
def test_unset_metadata(output_type, field, resources, outpdf):
    """Check that passing '' for a metadata option unsets it in the output.

    The input PDF carries one unique marker string per metadata field.
    Because metadata is not compressed, searching the raw output bytes
    for a marker is the most reliable way to prove that the field did
    not carry forward anywhere in the file. The test also confirms that
    every marker is present in the input, and that the markers for the
    fields that were *not* unset are still present in the output.
    """
    source_pdf = resources / 'meta.pdf'

    # Marker strings stored in the input PDF's metadata, keyed by the
    # command line option that controls the corresponding field.
    markers = {
        'title': b'NFY5f7Ft2DWMkxLhXwxvFf7eWR2KeK3vEDcd',
        'author': b'yXaryipxyRk9dVjWjSSaVaNCKeLRgEVzPRMp',
        'subject': b't49vimctvnuH7ZeAjAkv52ACvWFjcnm5MPJr',
        'keywords': b's9EeALwUg7urA7fnnhm5EtUyC54sW2WPUzqh',
    }

    # An explicit empty string for the option under test requests that
    # the field be removed rather than copied from the input.
    cli_args = [
        f'--{field}',
        '',
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    result = run_ocrmypdf(source_pdf, outpdf, *cli_args)

    assert result.returncode == ExitCode.ok, result.stderr

    with open(source_pdf, 'rb') as src, open(outpdf, 'rb') as dst:
        input_bytes = src.read()
        output_bytes = dst.read()

    for name, marker in markers.items():
        # Sanity check: the fixture really contains every marker.
        assert marker in input_bytes
        # Only the marker of the unset field may be absent from the output;
        # all others must survive.
        should_survive = name != field
        assert (marker in output_bytes) == should_survive


def test_high_unicode(resources, no_outpdf):
# Ghostscript doesn't support high Unicode, so neither do we, to be
# safe
Expand Down

0 comments on commit cf33095

Please sign in to comment.