Skip to content

Commit

Permalink
Merge branch 'list-page-extended'
Browse files Browse the repository at this point in the history
  • Loading branch information
kba committed Nov 23, 2023
2 parents a2259fa + 09290a4 commit 30e3763
Show file tree
Hide file tree
Showing 6 changed files with 533 additions and 281 deletions.
29 changes: 24 additions & 5 deletions ocrd/ocrd/cli/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,18 @@
from os import getcwd
from os.path import relpath, exists, join, isabs
from pathlib import Path
from json import loads
from json import loads, dumps
import sys
from glob import glob # XXX pathlib.Path.glob does not support absolute globs
import re
import time
import numpy as np

import click

from ocrd import Resolver, Workspace, WorkspaceValidator, WorkspaceBackupManager
from ocrd.mets_server import OcrdMetsServer
from ocrd_utils import getLogger, initLogging, pushd_popd, EXT_TO_MIME, safe_filename, parse_json_string_or_file
from ocrd_utils import getLogger, initLogging, pushd_popd, EXT_TO_MIME, safe_filename, parse_json_string_or_file, partition_list
from ocrd.decorators import mets_find_options
from . import command_with_replaced_help

Expand Down Expand Up @@ -583,17 +584,35 @@ def list_groups(ctx):
print("\n".join(workspace.mets.file_groups))

# ----------------------------------------------------------------------
# ocrd workspace list-pages
# ocrd workspace list-page
# ----------------------------------------------------------------------

# NOTE: the docstring of ``list_pages`` is deliberately kept short because click
# uses it as the help text of the ``list-page`` command.
#
# Processing order: optionally restrict pages by ``--page-id-range`` (passed on
# as the ``pageId`` filter of ``find_files``), sort the distinct page IDs,
# optionally slice them with the 1-based inclusive ``--numeric-range``, split
# the result into ``--chunk-number`` chunks and print all chunks or only the
# one selected with ``--chunk-index``.
@workspace_cli.command('list-page')
@click.option('-f', '--output-format', help="Output format", type=click.Choice(['one-per-line', 'comma-separated', 'json']), default='one-per-line')
@click.option('-D', '--chunk-number', help="Partition the return value into n roughly equally sized chunks", default=1, type=int)
@click.option('-C', '--chunk-index', help="Output the nth chunk of results, -1 for all of them.", default=None, type=int)
@click.option('-r', '--page-id-range', help="Restrict the pages to those matching the provided range, based on the @ID attribute. Separate start/end with ..")
@click.option('-R', '--numeric-range', help="Restrict the pages to those in the range, in numerical document order. Separate start/end with ..")
@pass_workspace
def list_pages(ctx, output_format, chunk_number, chunk_index, page_id_range, numeric_range):
    """
    List physical page IDs
    """
    # Reject unusable option values up front with a proper usage error
    # instead of a ZeroDivisionError traceback from the partitioning step.
    if chunk_number < 1:
        raise click.BadParameter("must be at least 1", param_hint='--chunk-number')
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    find_kwargs = {}
    if page_id_range:
        find_kwargs['pageId'] = page_id_range
    # A set removes duplicates when several files share a page.
    ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs)})
    if numeric_range:
        try:
            # Unpacking also raises ValueError when there are not exactly two parts
            start, end = map(int, numeric_range.split('..'))
        except ValueError:
            raise click.BadParameter(
                "expected two integers separated by '..', got %r" % (numeric_range,),
                param_hint='--numeric-range') from None
        # Positions are 1-based; 0 or a negative start would silently wrap
        # around to the end of the list when used as a slice index.
        if start < 1:
            raise click.BadParameter("start of range must be at least 1", param_hint='--numeric-range')
        ids = ids[start-1:end]
    # The help text documents -1 as "all chunks"; without this it would be
    # used as a list index and select only the last chunk.
    if chunk_index == -1:
        chunk_index = None
    try:
        chunks = partition_list(ids, chunk_number, chunk_index)
    except IndexError:
        raise click.BadParameter(
            "no chunk with index %d when partitioning %d page(s) into %d chunk(s)" % (
                chunk_index, len(ids), chunk_number),
            param_hint='--chunk-index') from None
    if output_format == 'one-per-line':
        print("\n".join(["\n".join(chunk) for chunk in chunks]))
    elif output_format == 'comma-separated':
        print("\n".join([",".join(chunk) for chunk in chunks]))
    elif output_format == 'json':
        print(dumps(chunks))

# ----------------------------------------------------------------------
# ocrd workspace get-id
Expand Down
1 change: 1 addition & 0 deletions ocrd_utils/ocrd_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@
is_string,
make_file_id,
nth_url_segment,
partition_list,
parse_json_string_or_file,
parse_json_string_with_comments,
remove_non_path_from_url,
Expand Down
38 changes: 38 additions & 0 deletions ocrd_utils/ocrd_utils/str.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,28 @@
from .constants import REGEX_FILE_ID
from .deprecate import deprecation_warning
from warnings import warn
from math import ceil
import sys
from itertools import islice

if sys.version_info >= (3, 12):
    from itertools import batched
else:
    def batched(iterable, chunk_size):
        """
        Backport of :py:func:`itertools.batched` for Python < 3.12.

        Args:
            iterable: any iterable to split up
            chunk_size (int): maximum number of items per batch, at least 1
        Returns:
            an iterator over tuples of up to ``chunk_size`` consecutive items;
            only the last tuple may be shorter
        Raises:
            ValueError: if ``chunk_size`` is less than 1, as the standard
                library implementation does
        """
        # Validate eagerly (not inside the generator) so that the error is
        # raised by the call itself, the same way on every Python version.
        if chunk_size < 1:
            raise ValueError('n must be at least one')
        iterator = iter(iterable)

        def _generate():
            while True:
                chunk = tuple(islice(iterator, chunk_size))
                # An empty tuple means the underlying iterator is exhausted
                if not chunk:
                    return
                yield chunk

        return _generate()

__all__ = [
'assert_file_grp_cardinality',
'concat_padded',
'get_local_filename',
'is_local_filename',
'partition_list',
'is_string',
'make_file_id',
'nth_url_segment',
Expand Down Expand Up @@ -207,3 +223,25 @@ def generate_range(start, end):
for i in range(int(start_num), int(end_num) + 1):
ret.append(start.replace(start_num, str(i).zfill(len(start_num))))
return ret

def partition_list(lst, chunks, chunk_index=None):
    """
    Partition a list into roughly equally-sized chunks

    The items keep their order and are spread as evenly as possible: chunk
    sizes differ by at most one, with the larger chunks coming first. Exactly
    ``chunks`` chunks are generated, unless there are fewer items than
    requested chunks, in which case every item gets a chunk of its own (no
    empty chunks are ever generated).

    Args:
        lst (list): list to partition
        chunks (int): number of chunks to generate (not per chunk!)
    Keyword Args:
        chunk_index (None|int): If provided, return only a list consisting of this chunk
    Returns:
        list(list())
    Raises:
        ValueError: if ``lst`` is not empty and ``chunks`` is less than 1
        IndexError: if ``chunk_index`` does not address a generated chunk
    """
    if not lst:
        return []
    if chunks < 1:
        raise ValueError("chunks must be at least 1, got %r" % (chunks,))
    items = list(lst)
    # Cap the number of chunks so that none of them ends up empty
    n_chunks = min(chunks, len(items))
    # The first `remainder` chunks take one extra item each
    base_size, remainder = divmod(len(items), n_chunks)
    ret = []
    start = 0
    for i in range(n_chunks):
        end = start + base_size + (1 if i < remainder else 0)
        ret.append(items[start:end])
        start = end
    if chunk_index is not None:
        try:
            return [ret[chunk_index]]
        except IndexError:
            raise IndexError("chunk_index %r out of range for %d chunk(s)" % (
                chunk_index, len(ret))) from None
    return ret
16 changes: 16 additions & 0 deletions tests/cli/test_workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,5 +553,21 @@ def test_bulk_add_stdin(self):
assert f.local_filename == Path('BIN/FILE_0001_BIN.IMG-wolf.png')
assert f.url == 'https://host/FILE_0001_BIN.IMG-wolf/BIN/FILE_0001_BIN.IMG-wolf.png'

def test_list_page(self):
    """
    Check the output of ``list-page`` for every output format, with and
    without range restriction and chunking, against the fixture workspace
    (which has no PHYS_0007 and no PHYS_0021).
    """
    from json import loads

    def _run(cli_args):
        # Only stdout is of interest; drop the trailing newline added by print
        _, stdout, _ = self.invoke_cli(workspace_cli, ['list-page', *cli_args])
        return stdout.rstrip('\n')

    fixture_dir = Path(__file__).parent.parent / 'data/list-page-workspace'
    # (CLI arguments, expected stdout) pairs, checked in this order
    expectations = [
        ([], 'PHYS_0001\nPHYS_0002\nPHYS_0003\nPHYS_0004\nPHYS_0005\nPHYS_0006\nPHYS_0008\nPHYS_0009\nPHYS_0010\nPHYS_0011\nPHYS_0012\nPHYS_0013\nPHYS_0014\nPHYS_0015\nPHYS_0016\nPHYS_0017\nPHYS_0018\nPHYS_0019\nPHYS_0020\nPHYS_0022\nPHYS_0023\nPHYS_0024\nPHYS_0025\nPHYS_0026\nPHYS_0027\nPHYS_0028\nPHYS_0029'),
        (['-f', 'comma-separated'], 'PHYS_0001,PHYS_0002,PHYS_0003,PHYS_0004,PHYS_0005,PHYS_0006,PHYS_0008,PHYS_0009,PHYS_0010,PHYS_0011,PHYS_0012,PHYS_0013,PHYS_0014,PHYS_0015,PHYS_0016,PHYS_0017,PHYS_0018,PHYS_0019,PHYS_0020,PHYS_0022,PHYS_0023,PHYS_0024,PHYS_0025,PHYS_0026,PHYS_0027,PHYS_0028,PHYS_0029'),
        (['-f', 'json'], '[["PHYS_0001", "PHYS_0002", "PHYS_0003", "PHYS_0004", "PHYS_0005", "PHYS_0006", "PHYS_0008", "PHYS_0009", "PHYS_0010", "PHYS_0011", "PHYS_0012", "PHYS_0013", "PHYS_0014", "PHYS_0015", "PHYS_0016", "PHYS_0017", "PHYS_0018", "PHYS_0019", "PHYS_0020", "PHYS_0022", "PHYS_0023", "PHYS_0024", "PHYS_0025", "PHYS_0026", "PHYS_0027", "PHYS_0028", "PHYS_0029"]]'),
        (['-f', 'comma-separated', '-R', '5..5'], 'PHYS_0005'),
        (['-f', 'comma-separated', '-R', '6..8'], 'PHYS_0006,PHYS_0008,PHYS_0009'),
        (['-f', 'comma-separated', '-r', 'PHYS_0006..PHYS_0009'], 'PHYS_0006,PHYS_0008,PHYS_0009'),
        (['-f', 'comma-separated', '-r', 'PHYS_0001..PHYS_0010', '-D', '3'], 'PHYS_0001,PHYS_0002,PHYS_0003\nPHYS_0004,PHYS_0005,PHYS_0006\nPHYS_0008,PHYS_0009,PHYS_0010'),
        (['-f', 'comma-separated', '-r', 'PHYS_0001..PHYS_0010', '-D', '3', '-C', '2'], 'PHYS_0008,PHYS_0009,PHYS_0010'),
    ]
    with pushd_popd(fixture_dir):
        for cli_args, expected in expectations:
            assert _run(cli_args) == expected
        # JSON output is compared structurally rather than as a string
        json_out = _run(['-f', 'json', '-r', 'PHYS_0001..PHYS_0010', '-D', '3', '-C', '2'])
        assert loads(json_out) == [['PHYS_0008', 'PHYS_0009', 'PHYS_0010']]

if __name__ == '__main__':
main(__file__)
188 changes: 188 additions & 0 deletions tests/data/list-page-workspace/mets.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
<?xml version="1.0" encoding="UTF-8"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="info:lc/xmlns/premis-v2 http://www.loc.gov/standards/premis/v2/premis-v2-0.xsd http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/mets.xsd http://www.loc.gov/mix/v10 http://www.loc.gov/standards/mix/mix10/mix10.xsd">
<mets:metsHdr CREATEDATE="2023-11-20T19:35:02.939335">
<mets:agent TYPE="OTHER" OTHERTYPE="SOFTWARE" ROLE="CREATOR">
<mets:name>ocrd/core v2.58.1</mets:name>
</mets:agent>
</mets:metsHdr>
<mets:dmdSec ID="DMDLOG_0001">
<mets:mdWrap MDTYPE="MODS">
<mets:xmlData>
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
</mods:mods>
</mets:xmlData>
</mets:mdWrap>
</mets:dmdSec>
<mets:amdSec ID="AMD">
</mets:amdSec>
<mets:fileSec>
<mets:fileGrp USE="FOO">
<mets:file ID="FOO_1" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_2" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_3" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_4" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_5" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_6" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_8" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_9" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_10" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_11" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_12" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_13" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_14" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_15" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_16" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_17" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_18" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_19" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_20" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_22" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_23" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_24" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_25" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_26" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_27" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_28" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_29" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
</mets:fileGrp>
</mets:fileSec>
<mets:structMap TYPE="PHYSICAL">
<mets:div TYPE="physSequence">
<mets:div TYPE="page" ID="PHYS_0001">
<mets:fptr FILEID="FOO_1"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0002">
<mets:fptr FILEID="FOO_2"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0003">
<mets:fptr FILEID="FOO_3"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0004">
<mets:fptr FILEID="FOO_4"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0005">
<mets:fptr FILEID="FOO_5"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0006">
<mets:fptr FILEID="FOO_6"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0008">
<mets:fptr FILEID="FOO_8"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0009">
<mets:fptr FILEID="FOO_9"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0010">
<mets:fptr FILEID="FOO_10"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0011">
<mets:fptr FILEID="FOO_11"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0012">
<mets:fptr FILEID="FOO_12"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0013">
<mets:fptr FILEID="FOO_13"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0014">
<mets:fptr FILEID="FOO_14"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0015">
<mets:fptr FILEID="FOO_15"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0016">
<mets:fptr FILEID="FOO_16"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0017">
<mets:fptr FILEID="FOO_17"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0018">
<mets:fptr FILEID="FOO_18"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0019">
<mets:fptr FILEID="FOO_19"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0020">
<mets:fptr FILEID="FOO_20"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0022">
<mets:fptr FILEID="FOO_22"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0023">
<mets:fptr FILEID="FOO_23"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0024">
<mets:fptr FILEID="FOO_24"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0025">
<mets:fptr FILEID="FOO_25"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0026">
<mets:fptr FILEID="FOO_26"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0027">
<mets:fptr FILEID="FOO_27"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0028">
<mets:fptr FILEID="FOO_28"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0029">
<mets:fptr FILEID="FOO_29"/>
</mets:div>
</mets:div>
</mets:structMap>
</mets:mets>
Loading

0 comments on commit 30e3763

Please sign in to comment.