Skip to content

Commit

Permalink
Improve kraken list robustness
Browse files Browse the repository at this point in the history
Call would crash in the presence of invalid records in the repository.
Fixes #561.
  • Loading branch information
mittagessen committed Jan 4, 2024
1 parent 992fb0b commit a21f1e3
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 10 deletions.
2 changes: 1 addition & 1 deletion kraken/ketos/recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,7 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers,
for line in test_set:
try:
ds.add(**line)
except KrakenInputException as e:
except ValueError as e:
logger.info(e)
# don't encode validation set as the alphabets may not match causing encoding failures
ds.no_encode()
Expand Down
20 changes: 11 additions & 9 deletions kraken/lib/dataset/recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,7 @@ def add(self,
self.add_line(line)
if page:
self.add_page(page)
if not (line and page):
if not (line or page):
raise ValueError('Neither line nor page data provided in dataset builder')

def add_page(self, page: Segmentation):
Expand Down Expand Up @@ -567,12 +567,10 @@ def add_line(self, line: BBoxLine):
text = func(text)
if not text and self.skip_empty_lines:
raise ValueError(f'Text line "{line.text}" is empty after transformations')
if not line.baseline:
raise ValueError('No baseline given for line')
if not line.boundary:
raise ValueError('No boundary given for line')
if not line.bbox:
raise ValueError('No bounding box given for line')

self._images.append(line.image)
self._images.append((line.imagename, line.bbox))
self._gt.append(text)
self.alphabet.update(text)

Expand Down Expand Up @@ -602,9 +600,12 @@ def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]:
item = self.training_set[index]
try:
logger.debug(f'Attempting to load {item[0]}')
im = item[0]
if not isinstance(im, Image.Image):
im = Image.open(im)
im, bbox = item[0]
flat_box = [x for point in bbox for x in point]
xmin, xmax = min(flat_box[::2]), max(flat_box[::2])
ymin, ymax = min(flat_box[1::2]), max(flat_box[1::2])
im = Image.open(im)
im = im.crop((xmin, ymin, xmax, ymax))
im = self.transforms(im)
if im.shape[0] == 3:
im_mode = 'RGB'
Expand All @@ -621,6 +622,7 @@ def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]:
im = torch.tensor(o['image'].transpose(2, 0, 1))
return {'image': im, 'target': item[1]}
except Exception:
raise
self.failed_samples.add(index)
idx = np.random.randint(0, len(self.training_set))
logger.debug(traceback.format_exc())
Expand Down
4 changes: 4 additions & 0 deletions kraken/repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ def get_listing(callback: Callable[[int, int], Any] = lambda total, advance: Non
model_type = SUPPORTED_MODELS.intersection(record['metadata']['keywords'])
if not model_type:
continue
metadata = None
for file in record['files']:
if file['key'] == 'metadata.json':
callback(total, 1)
Expand All @@ -259,6 +260,9 @@ def get_listing(callback: Callable[[int, int], Any] = lambda total, advance: Non
msg = f'Metadata for \'{record["metadata"]["title"]}\' ({record["metadata"]["doi"]}) not in JSON format'
logger.error(msg)
raise KrakenRepoException(msg)
if not metadata:
logger.warning(f"No metadata found for record '{record['doi']}'.")
continue
# merge metadata.json into DataCite
key = record['metadata']['doi']
models[key] = record['metadata']
Expand Down

0 comments on commit a21f1e3

Please sign in to comment.