diff --git a/kraken/ketos/recognition.py b/kraken/ketos/recognition.py index 19ade5c0e..ec17372c6 100644 --- a/kraken/ketos/recognition.py +++ b/kraken/ketos/recognition.py @@ -472,7 +472,7 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers, for line in test_set: try: ds.add(**line) - except KrakenInputException as e: + except ValueError as e: logger.info(e) # don't encode validation set as the alphabets may not match causing encoding failures ds.no_encode() diff --git a/kraken/lib/dataset/recognition.py b/kraken/lib/dataset/recognition.py index 2fda9896c..0dfca6779 100644 --- a/kraken/lib/dataset/recognition.py +++ b/kraken/lib/dataset/recognition.py @@ -528,7 +528,7 @@ def add(self, self.add_line(line) if page: self.add_page(page) - if not (line and page): + if not (line or page): raise ValueError('Neither line nor page data provided in dataset builder') def add_page(self, page: Segmentation): @@ -567,12 +567,10 @@ def add_line(self, line: BBoxLine): text = func(text) if not text and self.skip_empty_lines: raise ValueError(f'Text line "{line.text}" is empty after transformations') - if not line.baseline: - raise ValueError('No baseline given for line') - if not line.boundary: - raise ValueError('No boundary given for line') + if not line.bbox: + raise ValueError('No bounding box given for line') - self._images.append(line.image) + self._images.append((line.imagename, line.bbox)) self._gt.append(text) self.alphabet.update(text) @@ -602,9 +600,12 @@ def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]: item = self.training_set[index] try: logger.debug(f'Attempting to load {item[0]}') - im = item[0] - if not isinstance(im, Image.Image): - im = Image.open(im) + im, bbox = item[0] + flat_box = [x for point in bbox for x in point] + xmin, xmax = min(flat_box[::2]), max(flat_box[::2]) + ymin, ymax = min(flat_box[1::2]), max(flat_box[1::2]) + im = Image.open(im) + im = im.crop((xmin, ymin, xmax, ymax)) im = self.transforms(im) if im.shape[0] == 3: im_mode = 'RGB' @@ -621,6 +622,7 @@ def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]: im = torch.tensor(o['image'].transpose(2, 0, 1)) return {'image': im, 'target': item[1]} except Exception: + raise self.failed_samples.add(index) idx = np.random.randint(0, len(self.training_set)) logger.debug(traceback.format_exc()) diff --git a/kraken/repo.py b/kraken/repo.py index af22f1ac9..828ca14c2 100644 --- a/kraken/repo.py +++ b/kraken/repo.py @@ -248,6 +248,7 @@ def get_listing(callback: Callable[[int, int], Any] = lambda total, advance: Non model_type = SUPPORTED_MODELS.intersection(record['metadata']['keywords']) if not model_type: continue + metadata = None for file in record['files']: if file['key'] == 'metadata.json': callback(total, 1) @@ -259,6 +260,9 @@ def get_listing(callback: Callable[[int, int], Any] = lambda total, advance: Non msg = f'Metadata for \'{record["metadata"]["title"]}\' ({record["metadata"]["doi"]}) not in JSON format' logger.error(msg) raise KrakenRepoException(msg) + if not metadata: + logger.warning(f"No metadata found for record '{record['doi']}'.") + continue # merge metadata.jsn into DataCite key = record['metadata']['doi'] models[key] = record['metadata']