Skip to content

Commit

Permalink
Merge pull request #1139 from OCR-D/bagger-filegrp-filter
Browse files Browse the repository at this point in the history
Generic support for fileGrp whitelist/blacklist
  • Loading branch information
kba authored Nov 23, 2023
2 parents ef70d11 + ab1ab4f commit b32f776
Show file tree
Hide file tree
Showing 19 changed files with 119 additions and 78 deletions.
18 changes: 14 additions & 4 deletions ocrd/ocrd/cli/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,11 @@ def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency
@click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True)
@click.option('-a', '--download', is_flag=True, help="Download all files and change location in METS file after cloning")
@click.argument('mets_url')
@mets_find_options
# XXX deprecated
@click.argument('workspace_dir', default=None, required=False)
@pass_workspace
def workspace_clone(ctx, clobber_mets, download, mets_url, workspace_dir):
def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_url, workspace_dir):
"""
Create a workspace from METS_URL and return the directory
Expand All @@ -141,6 +142,11 @@ def workspace_clone(ctx, clobber_mets, download, mets_url, workspace_dir):
mets_basename=ctx.mets_basename,
clobber_mets=clobber_mets,
download=download,
ID=file_id,
pageId=page_id,
mimetype=mimetype,
include_fileGrp=include_fileGrp,
exclude_fileGrp=exclude_fileGrp,
)
workspace.save_mets()
print(workspace.directory)
Expand Down Expand Up @@ -432,7 +438,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi
@click.option('--undo-download', is_flag=True, help="Remove all downloaded files from the METS")
@click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests")
@pass_workspace
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download, undo_download, wait):
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, include_fileGrp, exclude_fileGrp, download, undo_download, wait):
"""
Find files.
Expand All @@ -454,6 +460,8 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, down
file_grp=file_grp,
mimetype=mimetype,
page_id=page_id,
include_fileGrp=include_fileGrp,
exclude_fileGrp=exclude_fileGrp,
):
ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field]
if download and not f.local_filename:
Expand Down Expand Up @@ -679,7 +687,7 @@ def _handle_json_option(ctx, param, value):
@click.option('--pageId-mapping', help="JSON object mapping src to dest page ID", callback=_handle_json_option)
@mets_find_options
@pass_workspace
def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pageid_mapping, file_grp, file_id, page_id, mimetype, mets_path): # pylint: disable=redefined-builtin
def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pageid_mapping, file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_path): # pylint: disable=redefined-builtin
"""
Merges this workspace with the workspace that contains ``METS_PATH``
Expand All @@ -706,7 +714,9 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa
file_grp=file_grp,
file_id=file_id,
page_id=page_id,
mimetype=mimetype
mimetype=mimetype,
include_fileGrp=include_fileGrp,
exclude_fileGrp=exclude_fileGrp,
)
workspace.save_mets()

Expand Down
8 changes: 6 additions & 2 deletions ocrd/ocrd/cli/zip.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,15 @@ def zip_cli():
default="mets.xml",
help='Basename of the METS file.',
show_default=True)
@click.option('-q', '--include-file-grps', 'include_fileGrp', help="fileGrps to include", default=[], multiple=True)
@click.option('-Q', '--exclude-file-grps', 'exclude_fileGrp', help="fileGrps to exclude", default=[], multiple=True)
@click.option('-i', '--identifier', '--id', help="Ocrd-Identifier", required=True)
@click.option('-m', '--mets', help="location of mets.xml in the bag's data dir", default="mets.xml")
@click.option('-b', '--base-version-checksum', help="Ocrd-Base-Version-Checksum")
@click.option('-t', '--tag-file', help="Add a non-payload file to bag", type=click.Path(file_okay=True, dir_okay=False, readable=True, resolve_path=True), multiple=True)
@click.option('-Z', '--skip-zip', help="Create a directory but do not ZIP it", is_flag=True, default=False)
@click.option('-j', '--processes', help="Number of parallel processes", type=int, default=1)
def bag(directory, mets_basename, dest, identifier, mets, base_version_checksum, tag_file, skip_zip, processes):
def bag(directory, mets_basename, dest, include_fileGrp, exclude_fileGrp, identifier, mets, base_version_checksum, tag_file, skip_zip, processes):
"""
Bag workspace as OCRD-ZIP at DEST
"""
Expand All @@ -59,7 +61,9 @@ def bag(directory, mets_basename, dest, identifier, mets, base_version_checksum,
ocrd_base_version_checksum=base_version_checksum,
processes=processes,
tag_files=tag_file,
skip_zip=skip_zip
skip_zip=skip_zip,
include_fileGrp=include_fileGrp,
exclude_fileGrp=exclude_fileGrp,
)

# ----------------------------------------------------------------------
Expand Down
2 changes: 2 additions & 0 deletions ocrd/ocrd/decorators/mets_find_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ def mets_find_options(f):
option('-m', '--mimetype', help="Media type to look for", metavar='FILTER'),
option('-g', '--page-id', help="Page ID", metavar='FILTER'),
option('-i', '--file-id', help="ID", metavar='FILTER'),
option('-q', '--include-file-grps', 'include_fileGrp', help="fileGrps to include", default=[], multiple=True),
option('-Q', '--exclude-file-grps', 'exclude_fileGrp', help="fileGrps to exclude", default=[], multiple=True),
]:
opt(f)
return f
4 changes: 3 additions & 1 deletion ocrd/ocrd/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ def workspace_from_url(
download=False,
src_baseurl=None,
mets_server_url=None,
**kwargs
):
"""
Create a workspace from a METS by URL (i.e. clone if :py:attr:`mets_url` is remote or :py:attr:`dst_dir` is given).
Expand All @@ -172,6 +173,7 @@ def workspace_from_url(
By default existing ``mets.xml`` will raise an exception.
download (boolean, False): Whether to also download all the files referenced by the METS
src_baseurl (string, None): Base URL for resolving relative file locations
**kwargs (dict): Additional keyword arguments passed on to ``OcrdMets.find_files`` if download == True
Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless
the former is already local and the latter is ``None`` or already identical to its directory name.
Expand Down Expand Up @@ -217,7 +219,7 @@ def workspace_from_url(
workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url)

if download:
for f in workspace.mets.find_files():
for f in workspace.mets.find_files(**kwargs):
workspace.download_file(f)

return workspace
Expand Down
21 changes: 15 additions & 6 deletions ocrd/ocrd/workspace_bagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
pushd_popd,
getLogger,
MIME_TO_EXT,
is_local_filename,
unzip_file_to_dir,

MIMETYPE_PAGE,
Expand Down Expand Up @@ -56,7 +55,15 @@ def _log_or_raise(self, msg):
else:
log.info(msg)

def _bag_mets_files(self, workspace, bagdir, ocrd_mets, processes):
def _bag_mets_files(
self,
workspace,
bagdir,
ocrd_mets,
processes,
include_fileGrp=None,
exclude_fileGrp=None,
):
mets = workspace.mets
changed_local_filenames = {}

Expand All @@ -65,8 +72,8 @@ def _bag_mets_files(self, workspace, bagdir, ocrd_mets, processes):

with pushd_popd(workspace.directory):
# local_filenames of the files before changing
for f in mets.find_files():
log.info("Handling OcrdFile %s", f)
for f in mets.find_files(include_fileGrp=include_fileGrp, exclude_fileGrp=exclude_fileGrp):
log.info("Bagging OcrdFile %s", f)

file_grp_dir = Path(bagdir, 'data', f.fileGrp)
if not file_grp_dir.is_dir():
Expand Down Expand Up @@ -130,7 +137,9 @@ def bag(self,
ocrd_base_version_checksum=None,
processes=1,
skip_zip=False,
tag_files=None
tag_files=None,
include_fileGrp=None,
exclude_fileGrp=None,
):
"""
Bag a workspace
Expand Down Expand Up @@ -170,7 +179,7 @@ def bag(self,
f.write(BAGIT_TXT.encode('utf-8'))

# create manifests
total_bytes, total_files = self._bag_mets_files(workspace, bagdir, ocrd_mets, processes)
total_bytes, total_files = self._bag_mets_files(workspace, bagdir, ocrd_mets, processes, include_fileGrp, exclude_fileGrp)

# create bag-info.txt
bag = Bag(bagdir)
Expand Down
25 changes: 23 additions & 2 deletions ocrd_models/ocrd_models/ocrd_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,18 @@ def find_all_files(self, *args, **kwargs):
return list(self.find_files(*args, **kwargs))

# pylint: disable=multiple-statements
def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None, local_filename=None, local_only=False):
def find_files(
self,
ID=None,
fileGrp=None,
pageId=None,
mimetype=None,
url=None,
local_filename=None,
local_only=False,
include_fileGrp=None,
exclude_fileGrp=None,
):
"""
Search ``mets:file`` entries in this METS document and yield results.
The :py:attr:`ID`, :py:attr:`pageId`, :py:attr:`fileGrp`,
Expand All @@ -257,6 +268,8 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None
local_filename (string) : ``@xlink:href`` local/cached filename of ``mets:Flocat`` of ``mets:file``
mimetype (string) : ``@MIMETYPE`` of ``mets:file``
local_only (boolean) : Whether to restrict results to local files in the filesystem
include_fileGrp (list[str]) : Whitelist of allowed file groups
exclude_fileGrp (list[str]) : Blacklist of disallowed file groups
Yields:
:py:class:`ocrd_models:ocrd_file:OcrdFile` instantiations
"""
Expand Down Expand Up @@ -351,7 +364,15 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None
if is_local is None:
continue

yield OcrdFile(cand, mets=self)
ret = OcrdFile(cand, mets=self)

# XXX include_fileGrp is redundant with fileGrp but is supported for completeness
if exclude_fileGrp and ret.fileGrp in exclude_fileGrp:
continue
if include_fileGrp and ret.fileGrp not in include_fileGrp:
continue

yield ret

def add_file_group(self, fileGrp):
"""
Expand Down
25 changes: 15 additions & 10 deletions ocrd_validators/ocrd_validators/workspace_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ def check_file_grp(workspace, input_file_grp=None, output_file_grp=None, page_id
return report

def __init__(self, resolver, mets_url, src_dir=None, skip=None, download=False,
page_strictness='strict', page_coordinate_consistency='poly'):
page_strictness='strict', page_coordinate_consistency='poly',
include_fileGrp=None, exclude_fileGrp=None
):
"""
Construct a new WorkspaceValidator.
Expand All @@ -80,6 +82,8 @@ def __init__(self, resolver, mets_url, src_dir=None, skip=None, download=False,
* `"baseline"`: Baseline in TextLine
* `"both"`: both `poly` and `baseline` checks
* `"off"`: no coordinate checks
include_fileGrp (list[str]): filegrp whitelist
exclude_fileGrp (list[str]): filegrp blacklist
"""
self.report = ValidationReport()
self.skip = skip if skip else []
Expand All @@ -97,6 +101,7 @@ def __init__(self, resolver, mets_url, src_dir=None, skip=None, download=False,
if 'mets_fileid_page_pcgtsid' not in self.skip:
self.page_checks.append('pcgtsid')

self.find_kwargs = dict(include_fileGrp=include_fileGrp, exclude_fileGrp=exclude_fileGrp)
self.src_dir = src_dir
self.workspace = None
self.mets = None
Expand Down Expand Up @@ -184,14 +189,14 @@ def _validate_imagefilename(self):
Validate that the imageFilename is correctly set to a filename relative to the workspace
"""
self.log.debug('_validate_imagefilename')
for f in self.mets.find_files(mimetype=MIMETYPE_PAGE):
for f in self.mets.find_files(mimetype=MIMETYPE_PAGE, **self.find_kwargs):
if not f.local_filename and not self.download:
self.log.warning("Not available locally and 'download' is not set: %s", f)
continue
self.workspace.download_file(f)
page = page_from_file(f).get_Page()
imageFilename = page.imageFilename
if not self.mets.find_files(url=imageFilename):
if not self.mets.find_files(url=imageFilename, **self.find_kwargs):
self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (f.local_filename, imageFilename))
if is_local_filename(imageFilename) and not Path(imageFilename).exists():
self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (f.local_filename, imageFilename))
Expand All @@ -201,7 +206,7 @@ def _validate_dimension(self):
Validate image height and PAGE imageHeight match
"""
self.log.info('_validate_dimension')
for f in self.mets.find_files(mimetype=MIMETYPE_PAGE):
for f in self.mets.find_files(mimetype=MIMETYPE_PAGE, **self.find_kwargs):
if not f.local_filename and not self.download:
self.log.warning("Not available locally and 'download' is not set: %s", f)
continue
Expand All @@ -220,7 +225,7 @@ def _validate_multipage(self):
See `spec <https://ocr-d.github.io/mets#no-multi-page-images>`_.
"""
self.log.debug('_validate_multipage')
for f in self.mets.find_files(mimetype='//image/.*'):
for f in self.mets.find_files(mimetype='//image/.*', **self.find_kwargs):
if not f.local_filename and not self.download:
self.log.warning("Not available locally and 'download' is not set: %s", f)
continue
Expand All @@ -240,7 +245,7 @@ def _validate_pixel_density(self):
See `spec <https://ocr-d.github.io/mets#pixel-density-of-images-must-be-explicit-and-high-enough>`_.
"""
self.log.debug('_validate_pixel_density')
for f in self.mets.find_files(mimetype='//image/.*'):
for f in self.mets.find_files(mimetype='//image/.*', **self.find_kwargs):
if not f.local_filename and not self.download:
self.log.warning("Not available locally and 'download' is not set: %s", f)
continue
Expand Down Expand Up @@ -282,10 +287,10 @@ def _validate_mets_files(self):
"""
self.log.debug('_validate_mets_files')
try:
next(self.mets.find_files())
next(self.mets.find_files(**self.find_kwargs))
except StopIteration:
self.report.add_error("No files")
for f in self.mets.find_files():
for f in self.mets.find_files(**self.find_kwargs):
if f._el.get('GROUPID'): # pylint: disable=protected-access
self.report.add_notice("File '%s' has GROUPID attribute - document might need an update" % f.ID)
if not (f.url or f.local_filename):
Expand All @@ -303,7 +308,7 @@ def _validate_page(self):
Run PageValidator on the PAGE-XML documents referenced in the METS.
"""
self.log.debug('_validate_page')
for f in self.mets.find_files(mimetype=MIMETYPE_PAGE):
for f in self.mets.find_files(mimetype=MIMETYPE_PAGE, **self.find_kwargs):
if not f.local_filename and not self.download:
self.log.warning("Not available locally and 'download' is not set: %s", f)
continue
Expand All @@ -322,7 +327,7 @@ def _validate_page_xsd(self):
Validate all PAGE-XML files against PAGE XSD schema
"""
self.log.debug('_validate_page_xsd')
for f in self.mets.find_files(mimetype=MIMETYPE_PAGE):
for f in self.mets.find_files(mimetype=MIMETYPE_PAGE, **self.find_kwargs):
if not f.local_filename and not self.download:
self.log.warning("Not available locally and 'download' is not set: %s", f)
continue
Expand Down
5 changes: 0 additions & 5 deletions tests/data/bagger-conflict-workspace.ocrd/bag-info.txt

This file was deleted.

2 changes: 0 additions & 2 deletions tests/data/bagger-conflict-workspace.ocrd/bagit.txt

This file was deleted.

Empty file.
34 changes: 0 additions & 34 deletions tests/data/bagger-conflict-workspace.ocrd/data/mets.xml

This file was deleted.

2 changes: 0 additions & 2 deletions tests/data/bagger-conflict-workspace.ocrd/manifest-sha512.txt

This file was deleted.

This file was deleted.

6 changes: 3 additions & 3 deletions tests/data/bagger-conflict-workspace/mets.xml
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@
<mets:fileSec>
<mets:fileGrp USE="A">
<mets:file ID="file1" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="../bagger-conflict-source/A/name.ext" LOCTYPE="URL"/>
<mets:FLocat xlink:href="../bagger-conflict-source/A/name.xml" LOCTYPE="URL"/>
</mets:file>
<mets:file ID="file2" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="../bagger-conflict-source/B/name.ext" LOCTYPE="URL"/>
<mets:FLocat xlink:href="../bagger-conflict-source/B/name.xml" LOCTYPE="URL"/>
</mets:file>
<mets:file ID="file3" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="../bagger-conflict-source/C/name.ext" LOCTYPE="URL"/>
<mets:FLocat xlink:href="../bagger-conflict-source/C/name.xml" LOCTYPE="URL"/>
</mets:file>
</mets:fileGrp>
</mets:fileSec>
Expand Down
Loading

0 comments on commit b32f776

Please sign in to comment.