
Commit

Merge pull request #29 from AjaxMultiCommentary/ocr_debug
Ocr debug
sven-nm authored Apr 19, 2024
2 parents 83bf96f + 1d08446 commit 9ecd570
Showing 39 changed files with 1,100 additions and 538 deletions.
14 changes: 9 additions & 5 deletions ajmc/commons/geometry.py
@@ -264,7 +264,8 @@ def are_bboxes_overlapping_with_threshold(bbox1: variables.BoxType,

@docstring_formatter(**docstrings)
def adjust_bbox_to_included_contours(bbox: variables.BoxType,
contours: List[Shape]) -> Shape:
contours: List[Shape],
exclude_vertically_expanding_contours: bool = True) -> Shape:
"""Finds the contours included in ``bbox`` and returns a shape objects that minimally contains them.
Note:
@@ -276,10 +277,13 @@ def adjust_bbox_to_included_contours(bbox: variables.BoxType,
contours: A list of included contours
"""

included_contours = [c for c in contours
if are_bboxes_overlapping(c.bbox, bbox)
and not (c.bbox[0][1] < bbox[0][1]
or c.bbox[1][1] > bbox[1][1])]
if exclude_vertically_expanding_contours:
included_contours = [c for c in contours
if are_bboxes_overlapping(c.bbox, bbox)
and not (c.bbox[0][1] < bbox[0][1]
or c.bbox[1][1] > bbox[1][1])]
else:
included_contours = [c for c in contours if are_bboxes_overlapping(c.bbox, bbox)]

if included_contours: # If we find included contours, readjust the bounding box
return Shape([xy for c in included_contours for xy in c.bbox])
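
A minimal usage sketch for the new ``exclude_vertically_expanding_contours`` flag; the coordinates and contours below are hypothetical and not part of the commit:

from ajmc.commons.geometry import Shape, adjust_bbox_to_included_contours

# Hypothetical line bbox and contours, in ((x_min, y_min), (x_max, y_max)) form.
line_bbox = ((100, 200), (900, 240))
contours = [Shape([(120, 205), (150, 235)]),   # glyph fully inside the line
            Shape([(400, 150), (430, 300)])]   # tall contour crossing the line vertically

# Default behaviour: the tall contour is excluded, so the box cannot grow vertically.
tight = adjust_bbox_to_included_contours(line_bbox, contours)

# New in this commit: pass False to keep vertically expanding contours as well.
loose = adjust_bbox_to_included_contours(line_bbox, contours,
                                         exclude_vertically_expanding_contours=False)
print(tight.bbox, loose.bbox)
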
78 changes: 56 additions & 22 deletions ajmc/commons/image.py
@@ -2,7 +2,7 @@

import random
from pathlib import Path
from typing import List, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union, Callable

import cv2
import numpy as np
@@ -12,6 +12,8 @@
from ajmc.commons.docstrings import docstring_formatter, docstrings
from ajmc.commons.geometry import Shape
from ajmc.commons.miscellaneous import get_ajmc_logger
from ajmc.ocr.data_processing import font_utils
from ajmc.ocr.data_processing.data_generation import draw_textline

logger = get_ajmc_logger(__name__)

@@ -147,31 +149,63 @@ def draw_box(box: variables.BoxType,
thickness=stroke_thickness)

if text is not None:
try:
text_img = draw_textline(text,
fonts=font_utils.get_default_fonts(),
fallback_fonts=font_utils.get_fallback_fonts(),
target_height=max((box[1][1] - box[0][1]) // 3, 10),
raise_if_unprintable_char=False)

# Convert the pillow image to a numpy array
text_img = np.array(text_img)
# Colorize the text image (it is black by default)
text_img = cv2.cvtColor(text_img, cv2.COLOR_GRAY2BGR)

# Include the image in the original image
img_matrix[box[0][1] - text_img.shape[0]:box[0][1], box[1][0] - text_img.shape[1]: box[1][0]:] = text_img
except Exception as e:
print(f'Error: {e}')
pass

# Start by getting the actual size of the text_box
(text_width, text_height), _ = cv2.getTextSize(text, fontFace=cv2.FONT_HERSHEY_SIMPLEX,
fontScale=text_size,
thickness=text_thickness)

# Draw a rectangle around the text
img_matrix = cv2.rectangle(img_matrix,
pt1=(box[1][0] - text_width - 4, box[0][1] - text_height - 4),
pt2=(box[1][0], box[0][1]),
color=rgb_to_bgr(stroke_color),
thickness=-1) # -1 means that the rectangle will be filled

img_matrix = cv2.putText(img_matrix, text,
org=(box[1][0] - text_width, box[0][1] - 2),
fontFace=cv2.FONT_HERSHEY_SIMPLEX,
fontScale=text_size,
color=(255, 255, 255),
thickness=text_thickness)
# (text_width, text_height), _ = cv2.getTextSize(text, fontFace=cv2.FONT_HERSHEY_TRIPLEX,
# fontScale=text_size,
# thickness=text_thickness)
#
# # Draw a rectangle around the text
# img_matrix = cv2.rectangle(img_matrix,
# pt1=(box[1][0] - text_width - 4, box[0][1] - text_height - 4),
# pt2=(box[1][0], box[0][1]),
# color=rgb_to_bgr(stroke_color),
# thickness=-1) # -1 means that the rectangle will be filled
#
# img_matrix = cv2.putText(img_matrix, text,
# org=(box[1][0] - text_width, box[0][1] - 2),
# fontFace=cv2.FONT_HERSHEY_TRIPLEX,
# fontScale=text_size,
# color=(255, 255, 255),
# thickness=text_thickness)

return img_matrix


def draw_textcontainers(img_matrix: np.ndarray, output_path: Optional[Union[str, Path]] = None, *textcontainers):
def draw_textcontainers(img_matrix: np.ndarray,
output_path: Optional[Union[str, Path]] = None,
text_getter: Optional[Callable] = None,
*textcontainers, ):
"""Draws a list of ``TextContainer``s on ``img_matrix``."""

def _text_getter(tc):
if text_getter is None:
if tc.type == 'region':
return tc.region_type
elif tc.type in ['entity', 'sentence', 'hyphenation', 'lemma']:
return tc.label if tc.type == 'entity' else tc.type
else:
return tc.type
else:
return text_getter(tc)

# Get the set of textcontainer types
for tc in textcontainers:
if tc.type == 'region':
@@ -181,7 +215,7 @@ def draw_textcontainers(img_matrix: np.ndarray, output_path: Optional[Union[str,
stroke_thickness=2,
fill_color=variables.REGION_TYPES_TO_COLORS[tc.region_type],
fill_opacity=.3,
text=tc.region_type)
text=_text_getter(tc))

elif tc.type in ['entity', 'sentence', 'hyphenation', 'lemma']:
for i, bbox in enumerate(tc.bboxes):
@@ -193,7 +227,7 @@ def draw_textcontainers(img_matrix: np.ndarray, output_path: Optional[Union[str,
stroke_thickness=2,
fill_color=variables.TEXTCONTAINERS_TYPES_TO_COLORS[tc.type],
fill_opacity=.3,
text=tc.label if tc.type == 'entity' else tc.type)
text=_text_getter(tc))
else:
img_matrix = draw_box(box=bbox.bbox,
img_matrix=img_matrix,
@@ -211,7 +245,7 @@ def draw_textcontainers(img_matrix: np.ndarray, output_path: Optional[Union[str,
stroke_thickness=1,
fill_color=variables.TEXTCONTAINERS_TYPES_TO_COLORS[tc.type],
fill_opacity=.2,
text=tc.type)
text=_text_getter(tc))

if output_path is not None:
cv2.imwrite(str(output_path), img_matrix)
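
A minimal usage sketch for the new ``text_getter`` hook on ``draw_textcontainers``; the image path and the empty ``regions`` list are placeholders (in practice the regions would come from a canonical page object):

import cv2
from ajmc.commons.image import draw_textcontainers

img_matrix = cv2.imread('page_0012.png')  # hypothetical image path

# ``*textcontainers`` follows the optional arguments, so ``output_path`` and
# ``text_getter`` are passed positionally before the containers themselves.
regions = []  # placeholder: in practice, the region TextContainers of a canonical page
img_matrix = draw_textcontainers(img_matrix,
                                 'page_0012_olr_debug.png',                  # output_path
                                 lambda tc: f'{tc.type}: {tc.region_type}',  # text_getter
                                 *regions)
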
26 changes: 20 additions & 6 deletions ajmc/commons/variables.py
@@ -34,6 +34,7 @@
NE_CORPUS_DIR = AJMC_DATA_DIR / 'AjMC-NE-corpus'
LEMLINK_CORPUS_DIR = AJMC_DATA_DIR / 'lemma-linkage-corpus'
LEMLINK_XMI_DIR = LEMLINK_CORPUS_DIR / 'data/preparation/corpus/annotated'
FONTS_DIR = PACKAGE_DIR / 'data/fonts/fonts'

# RELATIVE PATHS
COMM_IMG_REL_DIR = Path('images/png')
@@ -107,8 +108,13 @@ def get_comm_ocr_outputs_dir(comm_id: str, ocr_run_id: str) -> Path:
return get_comm_ocr_runs_dir(comm_id) / get_ocr_run_id_from_pattern(comm_id, ocr_run_id) / 'outputs'


def get_comm_canonical_path_from_ocr_run_id(comm_id: str, ocr_run_pattern: str) -> Path:
return get_comm_canonical_dir(comm_id) / f'{get_ocr_run_id_from_pattern(comm_id, ocr_run_pattern)}.json'
def get_comm_canonical_path_from_ocr_run_pattern(comm_id: str, ocr_run_pattern: str) -> Path:
if not ocr_run_pattern.endswith('.json'):
ocr_run_pattern += '.json'
try:
return next(get_comm_canonical_dir(comm_id).glob(ocr_run_pattern))
except StopIteration:
raise FileNotFoundError(f'No canonical found for comm_id={comm_id} and ocr_run_pattern={ocr_run_pattern}')


def get_comm_sections_path(comm_id: str) -> Path:
@@ -126,7 +132,8 @@ def get_comm_sections_path(comm_id: str) -> Path:
# FORMATS, EXTENSIONS AND PATTERNS
# ======================================================================================================================

OCR_OUTPUT_EXTENSIONS = ['.xml', '.hocr', '.html']
OCR_OUTPUTS_EXTENSIONS = ['.hocr', '.xml', '.html', '.json']

DEFAULT_IMG_EXTENSION = '.png'
OLR_PREFIX = '_OLR_'
OCR_GT_PREFIX = 'OCRGT_'
@@ -164,7 +171,13 @@ def get_comm_sections_path(comm_id: str) -> Path:
'sophoclesplaysa05campgoog',
'sophokle1v3soph',
'Wecklein1894',
'SchneidewinNauckRadermacher1913', 'Hermann1851', 'lestragdiesdeso00tourgoog']
'SchneidewinNauckRadermacher1913',
'Hermann1851',
'lestragdiesdeso00tourgoog',
'thukydides02thuc',
'pvergiliusmaroa00virggoog',
'annalsoftacitusp00taci',
]

COPYRIGHT_COMM_IDS = list(set(ALL_COMM_IDS) - set(EXTERNAL_COMM_IDS) - set(PD_COMM_IDS))

@@ -446,13 +459,14 @@ def get_comm_sections_path(comm_id: str) -> Path:
'red': (178, 0, 30),
'pink': (240, 34, 130),
'blue': (59, 159, 241),
'green': (152, 229, 135),
'green': (99, 163, 103),
'yellow': (255, 200, 2),
'brown': (175, 113, 89),
'dark_green': (18, 91, 79),
'purple': (70, 30, 68),
'dark_blue': (55, 80, 125),
'ecru': (181, 162, 103),
'grey': (136, 136, 136),
},
# https://coolors.co/f72585-b5179e-7209b7-560bad-480ca8-3a0ca3-3f37c9-4361ee-4895ef-4cc9f0
'hues': {
@@ -496,5 +510,5 @@ def get_comm_sections_path(comm_id: str) -> Path:
PARAMETERS = {
'ocr_region_inclusion_threshold': 0.7,
'words_line_inclusion_threshold': 0.7,
'entity_inclusion_threshold': 0.8,
'word_annotation_inclusion_threshold': 0.80,
}
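
A minimal usage sketch for the renamed ``get_comm_canonical_path_from_ocr_run_pattern`` helper; the commentary id is taken from the id list in the diff above and the run pattern is a placeholder:

from ajmc.commons import variables as vs

try:
    # '.json' is appended automatically if the pattern does not already end with it.
    canonical_path = vs.get_comm_canonical_path_from_ocr_run_pattern('sophoclesplaysa05campgoog',
                                                                     '*tess_base*')
    print(canonical_path)
except FileNotFoundError as e:
    # Raised when no canonical JSON matches the glob pattern.
    print(e)
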
34 changes: 34 additions & 0 deletions ajmc/corpora/_scripts/get_corpora_stats.py
@@ -0,0 +1,34 @@
from ajmc.commons.file_management import walk_dirs
from ajmc.corpora import variables as vs
from ajmc.corpora.corpora_classes import Corpus


DONE = [
'forum_romanum',
'corpus_scriptorum_latinorum',
'canonical-latinLit',
'canonical-greekLit',
'perseus_secondary',
'perseus_legacy',
'First1KGreek',
'propylaeum_BOOKS',
'propylaeum_DOK',
'agoraclass',
]

corpora_stats = {}

for corpus_id in walk_dirs(vs.ROOT_STORING_DIR):
corpus_id = corpus_id.stem
corpus_id = 'EpibauCorpus'
if corpus_id in DONE:
continue
print('---------------------------------')
print(corpus_id)
try:
corpus = Corpus.auto_init(corpus_id)
corpora_stats[corpus_id] = len(corpus.get_plain_text())
print(corpora_stats[corpus_id])
except Exception as e:
print('Skipping corpus:', corpus_id, e)
break
20 changes: 20 additions & 0 deletions ajmc/corpora/_scripts/read_vicipaedia.py
@@ -0,0 +1,20 @@
from pathlib import Path

# Read an XML file with bs4
from bs4 import BeautifulSoup

file_path = Path('/Users/sven/Desktop/lawiki-20240320-pages-articles-multistream.xml')
soup = BeautifulSoup(file_path.read_text('utf-8'), features='xml')

# find all the element named 'page' in the soup
pages = soup.find_all('page')

print(len(pages))

print(pages[0].prettify())

# We now get the text of the first page
text = pages[0].text

# Rough size estimate of the text in GB (character count / 1e9)
size = len(text) / 1e9
24 changes: 24 additions & 0 deletions ajmc/corpora/corpora_classes.py
@@ -210,3 +210,27 @@ def get_lexica(self) -> typing.Dict[str, typing.Dict[str, str]]:
dict_ = json.loads(file.read_text(encoding='utf-8'))
lexica[file.stem] = dict_
return lexica


class EpibauCorpus(Corpus):

def __init__(self, corpus_id: str = 'EpibauCorpus'):
super().__init__(corpus_id)

@property
def data_dir(self) -> Path:
return self.root_dir / 'data/release/v0.3/'

@property
def files(self) -> typing.List[Path]:
return [p for p in self.data_dir.rglob('*.tsv') if 'masked' not in p.name]


def get_plain_text(self) -> str:
if (self.root_dir / 'plaintext.txt').exists():
return (self.root_dir / 'plaintext.txt').read_text(encoding='utf-8')
text = ''
for file in self.files:
text += ' '.join([l.split('\t')[0] for l in file.read_text(encoding='utf-8').splitlines()[1:]])
(self.root_dir / 'plaintext.txt').write_text(text, encoding='utf-8')
return text
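
A minimal usage sketch for the new ``EpibauCorpus`` class, assuming the corpus data is present under the configured corpora root directory:

from ajmc.corpora.corpora_classes import EpibauCorpus

corpus = EpibauCorpus()          # corpus_id defaults to 'EpibauCorpus'
text = corpus.get_plain_text()   # built from the v0.3 release TSVs, then cached as plaintext.txt
print(len(text), 'characters')
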