Skip to content

Commit

Permalink
5.x dataset from object regression
Browse files Browse the repository at this point in the history
and some basic tests so this doesn't happen again
  • Loading branch information
mittagessen committed May 9, 2024
1 parent 6b9f7d4 commit ac5e7b5
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 13 deletions.
9 changes: 5 additions & 4 deletions kraken/lib/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,7 @@ def _extract_line(xml_record, skip_empty_lines: bool = True, legacy_polygons: bo
return lines, None, None
if is_bitonal(im):
im = im.convert('1')
recs = xml_record.lines.values()
for idx, rec in enumerate(recs):
for idx, rec in enumerate(xml_record.lines):
seg = Segmentation(text_direction='horizontal-lr',
imagename=xml_record.imagename,
type=xml_record.type,
Expand Down Expand Up @@ -167,6 +166,8 @@ def build_binary_dataset(files: Optional[List[Union[str, 'PathLike', 'Segmentati
for doc in files:
try:
data = parse_fn(doc)
if format_type in ['xml', 'alto', 'page']:
data = data.to_container()
except (FileNotFoundError, KrakenInputException, ValueError):
logger.warning(f'Invalid input file {doc}')
continue
Expand All @@ -191,12 +192,12 @@ def build_binary_dataset(files: Optional[List[Union[str, 'PathLike', 'Segmentati
num_lines = 0
for doc in docs:
if format_type in ['xml', 'alto', 'page', None]:
lines = doc.lines.values()
lines = doc.lines
elif format_type == 'path':
lines = doc['lines']
for line in lines:
num_lines += 1
alphabet.update(line.text if format_type in ['xml', 'alto', 'page'] else line['text'])
alphabet.update(line.text if format_type in ['xml', 'alto', 'page', None] else line['text'])

callback(0, num_lines)

Expand Down
2 changes: 1 addition & 1 deletion tests/resources/170025120000003,0074-lite.xml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
<Coords points="790,224 2398,259 2397,309 789,274"/>
<Baseline points="789,269 2397,304"/>
<TextEquiv>
<Unicode>$-nor su hijo, De todos sus bienes, con los pactos</Unicode>
<Unicode></Unicode>
</TextEquiv>
</TextLine>
<TextLine id="line_1469098653078_465" custom="readingOrder {index:1;} structure {type:$pac;}">
Expand Down
72 changes: 64 additions & 8 deletions tests/test_arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

import json
import unittest
from pathlib import Path
import tempfile
import pyarrow as pa

from pathlib import Path
from pytest import raises

import kraken
Expand All @@ -13,23 +15,77 @@
thisfile = Path(__file__).resolve().parent
resources = thisfile / 'resources'

def _validate_ds(self, path, num_lines, num_empty_lines, ds_type):
    """
    Validate a compiled binary (arrow) dataset.

    Opens the arrow file at `path`, checks that it carries a `lines`
    metadata record, and asserts that the dataset type, the metadata line
    count, the actual number of table rows, and the number of empty text
    lines match the expected values.

    Args:
        self: the calling TestCase (used for its assert helpers).
        path: filesystem path of the arrow dataset file.
        num_lines: expected total number of lines.
        num_empty_lines: expected number of lines with empty text.
        ds_type: expected dataset type string, e.g. 'kraken_recognition_baseline'.

    Raises:
        ValueError: if the file has no valid `lines` metadata record.
    """
    with pa.memory_map(path, 'rb') as source:
        ds_table = pa.ipc.open_file(source).read_all()
        raw_metadata = ds_table.schema.metadata
        if not raw_metadata or b'lines' not in raw_metadata:
            # `file` was an undefined name here; report the actual path.
            raise ValueError(f'{path} does not contain a valid metadata record.')
        metadata = json.loads(raw_metadata[b'lines'])
        self.assertEqual(metadata['type'],
                         ds_type,
                         f'Unexpected dataset type (expected: {ds_type}, found: {metadata["type"]})')
        self.assertEqual(metadata['counts']['all'],
                         num_lines,
                         'Unexpected number of lines in dataset metadata '
                         f'(expected: {num_lines}, found: {metadata["counts"]["all"]})')
        # Report the row count actually found, not the metadata count.
        self.assertEqual(len(ds_table),
                         num_lines,
                         'Unexpected number of rows in arrow table '
                         f'(expected: {num_lines}, found: {len(ds_table)})')

        # A line's text is the first field of each struct in the 'lines' column.
        real_empty_lines = len([line for line in ds_table.column('lines') if not str(line[0])])
        self.assertEqual(real_empty_lines,
                         num_empty_lines,
                         'Unexpected number of empty lines in dataset '
                         f'(expected: {num_empty_lines}, found: {real_empty_lines})')


class TestKrakenArrowCompilation(unittest.TestCase):
    """
    Tests for compilation of binary (arrow) recognition datasets.
    """
    def setUp(self):
        # Lite page with one intentionally empty TextLine (4 non-empty + 1 empty).
        self.xml = resources / '170025120000003,0074-lite.xml'
        self.seg = xml.XMLPage(self.xml).to_container()
        self.box_lines = [resources / '000236.png']

    def test_build_path_dataset(self):
        """
        Test compilation of legacy bounding-box line images.
        """
        with tempfile.NamedTemporaryFile() as tmp_file:
            build_binary_dataset(files=4*self.box_lines,
                                 output_file=tmp_file.name,
                                 format_type='path')
            _validate_ds(self, tmp_file.name, 4, 0, 'kraken_recognition_bbox')

    def test_build_xml_dataset(self):
        """
        Test compilation from a PageXML file path.
        """
        with tempfile.NamedTemporaryFile() as tmp_file:
            build_binary_dataset(files=[self.xml],
                                 output_file=tmp_file.name,
                                 format_type='xml')
            _validate_ds(self, tmp_file.name, 4, 0, 'kraken_recognition_baseline')

    def test_build_seg_dataset(self):
        """
        Test compilation from a pre-parsed Segmentation container object.
        """
        with tempfile.NamedTemporaryFile() as tmp_file:
            build_binary_dataset(files=[self.seg],
                                 output_file=tmp_file.name,
                                 format_type=None)
            _validate_ds(self, tmp_file.name, 4, 0, 'kraken_recognition_baseline')

    def test_forced_type_dataset(self):
        """
        Test that force_type overrides the dataset type inferred from input.
        """
        with tempfile.NamedTemporaryFile() as tmp_file:
            build_binary_dataset(files=4*self.box_lines,
                                 output_file=tmp_file.name,
                                 format_type='path',
                                 force_type='kraken_recognition_baseline')
            _validate_ds(self, tmp_file.name, 4, 0, 'kraken_recognition_baseline')

    def test_build_empty_dataset(self):
        """
        Test that empty lines are retained in compiled dataset.
        """
        with tempfile.NamedTemporaryFile() as tmp_file:
            build_binary_dataset(files=[self.xml],
                                 output_file=tmp_file.name,
                                 format_type='xml',
                                 skip_empty_lines=False)
            _validate_ds(self, tmp_file.name, 5, 1, 'kraken_recognition_baseline')

0 comments on commit ac5e7b5

Please sign in to comment.