
Commit

Merge pull request #29 from AjaxMultiCommentary/ocr_debug
Ocr debug
sven-nm authored Apr 19, 2024
2 parents 83bf96f + 1d08446 commit 9ecd570
Showing 39 changed files with 1,100 additions and 538 deletions.
14 changes: 9 additions & 5 deletions ajmc/commons/geometry.py
@@ -264,7 +264,8 @@ def are_bboxes_overlapping_with_threshold(bbox1: variables.BoxType,

@docstring_formatter(**docstrings)
def adjust_bbox_to_included_contours(bbox: variables.BoxType,
contours: List[Shape]) -> Shape:
contours: List[Shape],
exclude_vertically_expanding_contours: bool = True) -> Shape:
"""Finds the contours included in ``bbox`` and returns a shape objects that minimally contains them.
Note:
@@ -276,10 +277,13 @@ def adjust_bbox_to_included_contours(bbox: variables.BoxType,
contours: A list of included contours
"""

included_contours = [c for c in contours
if are_bboxes_overlapping(c.bbox, bbox)
and not (c.bbox[0][1] < bbox[0][1]
or c.bbox[1][1] > bbox[1][1])]
if exclude_vertically_expanding_contours:
included_contours = [c for c in contours
if are_bboxes_overlapping(c.bbox, bbox)
and not (c.bbox[0][1] < bbox[0][1]
or c.bbox[1][1] > bbox[1][1])]
else:
included_contours = [c for c in contours if are_bboxes_overlapping(c.bbox, bbox)]

if included_contours: # If we find included contours, readjust the bounding box
return Shape([xy for c in included_contours for xy in c.bbox])
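
A minimal usage sketch for the new ``exclude_vertically_expanding_contours`` flag; the coordinates and contours below are hypothetical and not part of the commit:

from ajmc.commons.geometry import Shape, adjust_bbox_to_included_contours

# Hypothetical line bbox and contours, in ((x_min, y_min), (x_max, y_max)) form.
line_bbox = ((100, 200), (900, 240))
contours = [Shape([(120, 205), (150, 235)]),   # glyph fully inside the line
            Shape([(400, 150), (430, 300)])]   # tall contour crossing the line vertically

# Default behaviour: the tall contour is excluded, so the box cannot grow vertically.
tight = adjust_bbox_to_included_contours(line_bbox, contours)

# New in this commit: pass False to keep vertically expanding contours as well.
loose = adjust_bbox_to_included_contours(line_bbox, contours,
                                         exclude_vertically_expanding_contours=False)
print(tight.bbox, loose.bbox)
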
78 changes: 56 additions & 22 deletions ajmc/commons/image.py
@@ -2,7 +2,7 @@

import random
from pathlib import Path
from typing import List, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union, Callable

import cv2
import numpy as np
@@ -12,6 +12,8 @@
from ajmc.commons.docstrings import docstring_formatter, docstrings
from ajmc.commons.geometry import Shape
from ajmc.commons.miscellaneous import get_ajmc_logger
from ajmc.ocr.data_processing import font_utils
from ajmc.ocr.data_processing.data_generation import draw_textline

logger = get_ajmc_logger(__name__)

@@ -147,31 +149,63 @@ def draw_box(box: variables.BoxType,
thickness=stroke_thickness)

if text is not None:
try:
text_img = draw_textline(text,
fonts=font_utils.get_default_fonts(),
fallback_fonts=font_utils.get_fallback_fonts(),
target_height=max((box[1][1] - box[0][1]) // 3, 10),
raise_if_unprintable_char=False)

# Convert the pillow image to a numpy array
text_img = np.array(text_img)
# Colorize the text image (it is black by default)
text_img = cv2.cvtColor(text_img, cv2.COLOR_GRAY2BGR)

# Include the image in the original image
img_matrix[box[0][1] - text_img.shape[0]:box[0][1], box[1][0] - text_img.shape[1]: box[1][0]:] = text_img
except Exception as e:
print(f'Error: {e}')
pass

# Start by getting the actual size of the text_box
(text_width, text_height), _ = cv2.getTextSize(text, fontFace=cv2.FONT_HERSHEY_SIMPLEX,
fontScale=text_size,
thickness=text_thickness)

# Draw a rectangle around the text
img_matrix = cv2.rectangle(img_matrix,
pt1=(box[1][0] - text_width - 4, box[0][1] - text_height - 4),
pt2=(box[1][0], box[0][1]),
color=rgb_to_bgr(stroke_color),
thickness=-1) # -1 means that the rectangle will be filled

img_matrix = cv2.putText(img_matrix, text,
org=(box[1][0] - text_width, box[0][1] - 2),
fontFace=cv2.FONT_HERSHEY_SIMPLEX,
fontScale=text_size,
color=(255, 255, 255),
thickness=text_thickness)
# (text_width, text_height), _ = cv2.getTextSize(text, fontFace=cv2.FONT_HERSHEY_TRIPLEX,
# fontScale=text_size,
# thickness=text_thickness)
#
# # Draw a rectangle around the text
# img_matrix = cv2.rectangle(img_matrix,
# pt1=(box[1][0] - text_width - 4, box[0][1] - text_height - 4),
# pt2=(box[1][0], box[0][1]),
# color=rgb_to_bgr(stroke_color),
# thickness=-1) # -1 means that the rectangle will be filled
#
# img_matrix = cv2.putText(img_matrix, text,
# org=(box[1][0] - text_width, box[0][1] - 2),
# fontFace=cv2.FONT_HERSHEY_TRIPLEX,
# fontScale=text_size,
# color=(255, 255, 255),
# thickness=text_thickness)

return img_matrix


def draw_textcontainers(img_matrix: np.ndarray, output_path: Optional[Union[str, Path]] = None, *textcontainers):
def draw_textcontainers(img_matrix: np.ndarray,
output_path: Optional[Union[str, Path]] = None,
text_getter: Optional[Callable] = None,
*textcontainers, ):
"""Draws a list of ``TextContainer``s on ``img_matrix``."""

def _text_getter(tc):
if text_getter is None:
if tc.type == 'region':
return tc.region_type
elif tc.type in ['entity', 'sentence', 'hyphenation', 'lemma']:
return tc.label if tc.type == 'entity' else tc.type
else:
return tc.type
else:
return text_getter(tc)

# Get the set of textcontainer types
for tc in textcontainers:
if tc.type == 'region':
@@ -181,7 +215,7 @@ def draw_textcontainers(img_matrix: np.ndarray, output_path: Optional[Union[str,
stroke_thickness=2,
fill_color=variables.REGION_TYPES_TO_COLORS[tc.region_type],
fill_opacity=.3,
text=tc.region_type)
text=_text_getter(tc))

elif tc.type in ['entity', 'sentence', 'hyphenation', 'lemma']:
for i, bbox in enumerate(tc.bboxes):
@@ -193,7 +227,7 @@ def draw_textcontainers(img_matrix: np.ndarray, output_path: Optional[Union[str,
stroke_thickness=2,
fill_color=variables.TEXTCONTAINERS_TYPES_TO_COLORS[tc.type],
fill_opacity=.3,
text=tc.label if tc.type == 'entity' else tc.type)
text=_text_getter(tc))
else:
img_matrix = draw_box(box=bbox.bbox,
img_matrix=img_matrix,
@@ -211,7 +245,7 @@ def draw_textcontainers(img_matrix: np.ndarray, output_path: Optional[Union[str,
stroke_thickness=1,
fill_color=variables.TEXTCONTAINERS_TYPES_TO_COLORS[tc.type],
fill_opacity=.2,
text=tc.type)
text=_text_getter(tc))

if output_path is not None:
cv2.imwrite(str(output_path), img_matrix)
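
A minimal usage sketch for the new ``text_getter`` hook on ``draw_textcontainers``; the image path and the empty ``regions`` list are placeholders (in practice the regions would come from a canonical page object):

import cv2
from ajmc.commons.image import draw_textcontainers

img_matrix = cv2.imread('page_0012.png')  # hypothetical image path

# ``*textcontainers`` follows the optional arguments, so ``output_path`` and
# ``text_getter`` are passed positionally before the containers themselves.
regions = []  # placeholder: in practice, the region TextContainers of a canonical page
img_matrix = draw_textcontainers(img_matrix,
                                 'page_0012_olr_debug.png',                  # output_path
                                 lambda tc: f'{tc.type}: {tc.region_type}',  # text_getter
                                 *regions)
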
26 changes: 20 additions & 6 deletions ajmc/commons/variables.py
@@ -34,6 +34,7 @@
NE_CORPUS_DIR = AJMC_DATA_DIR / 'AjMC-NE-corpus'
LEMLINK_CORPUS_DIR = AJMC_DATA_DIR / 'lemma-linkage-corpus'
LEMLINK_XMI_DIR = LEMLINK_CORPUS_DIR / 'data/preparation/corpus/annotated'
FONTS_DIR = PACKAGE_DIR / 'data/fonts/fonts'

# RELATIVE PATHS
COMM_IMG_REL_DIR = Path('images/png')
@@ -107,8 +108,13 @@ def get_comm_ocr_outputs_dir(comm_id: str, ocr_run_id: str) -> Path:
return get_comm_ocr_runs_dir(comm_id) / get_ocr_run_id_from_pattern(comm_id, ocr_run_id) / 'outputs'


def get_comm_canonical_path_from_ocr_run_id(comm_id: str, ocr_run_pattern: str) -> Path:
return get_comm_canonical_dir(comm_id) / f'{get_ocr_run_id_from_pattern(comm_id, ocr_run_pattern)}.json'
def get_comm_canonical_path_from_ocr_run_pattern(comm_id: str, ocr_run_pattern: str) -> Path:
if not ocr_run_pattern.endswith('.json'):
ocr_run_pattern += '.json'
try:
return next(get_comm_canonical_dir(comm_id).glob(ocr_run_pattern))
except StopIteration:
raise FileNotFoundError(f'No canonical found for comm_id={comm_id} and ocr_run_pattern={ocr_run_pattern}')


def get_comm_sections_path(comm_id: str) -> Path:
@@ -126,7 +132,8 @@ def get_comm_sections_path(comm_id: str) -> Path:
# FORMATS, EXTENSIONS AND PATTERNS
# ======================================================================================================================

OCR_OUTPUT_EXTENSIONS = ['.xml', '.hocr', '.html']
OCR_OUTPUTS_EXTENSIONS = ['.hocr', '.xml', '.html', '.json']

DEFAULT_IMG_EXTENSION = '.png'
OLR_PREFIX = '_OLR_'
OCR_GT_PREFIX = 'OCRGT_'
@@ -164,7 +171,13 @@ def get_comm_sections_path(comm_id: str) -> Path:
'sophoclesplaysa05campgoog',
'sophokle1v3soph',
'Wecklein1894',
'SchneidewinNauckRadermacher1913', 'Hermann1851', 'lestragdiesdeso00tourgoog']
'SchneidewinNauckRadermacher1913',
'Hermann1851',
'lestragdiesdeso00tourgoog',
'thukydides02thuc',
'pvergiliusmaroa00virggoog',
'annalsoftacitusp00taci',
]

COPYRIGHT_COMM_IDS = list(set(ALL_COMM_IDS) - set(EXTERNAL_COMM_IDS) - set(PD_COMM_IDS))

@@ -446,13 +459,14 @@ def get_comm_sections_path(comm_id: str) -> Path:
'red': (178, 0, 30),
'pink': (240, 34, 130),
'blue': (59, 159, 241),
'green': (152, 229, 135),
'green': (99, 163, 103),
'yellow': (255, 200, 2),
'brown': (175, 113, 89),
'dark_green': (18, 91, 79),
'purple': (70, 30, 68),
'dark_blue': (55, 80, 125),
'ecru': (181, 162, 103),
'grey': (136, 136, 136),
},
# https://coolors.co/f72585-b5179e-7209b7-560bad-480ca8-3a0ca3-3f37c9-4361ee-4895ef-4cc9f0
'hues': {
@@ -496,5 +510,5 @@ def get_comm_sections_path(comm_id: str) -> Path:
PARAMETERS = {
'ocr_region_inclusion_threshold': 0.7,
'words_line_inclusion_threshold': 0.7,
'entity_inclusion_threshold': 0.8,
'word_annotation_inclusion_threshold': 0.80,
}
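
A minimal usage sketch for the renamed ``get_comm_canonical_path_from_ocr_run_pattern`` helper; the commentary id is taken from the id list in the diff above and the run pattern is a placeholder:

from ajmc.commons import variables as vs

try:
    # '.json' is appended automatically if the pattern does not already end with it.
    canonical_path = vs.get_comm_canonical_path_from_ocr_run_pattern('sophoclesplaysa05campgoog',
                                                                     '*tess_base*')
    print(canonical_path)
except FileNotFoundError as e:
    # Raised when no canonical JSON matches the glob pattern.
    print(e)
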
34 changes: 34 additions & 0 deletions ajmc/corpora/_scripts/get_corpora_stats.py
@@ -0,0 +1,34 @@
from ajmc.commons.file_management import walk_dirs
from ajmc.corpora import variables as vs
from ajmc.corpora.corpora_classes import Corpus


DONE = [
'forum_romanum',
'corpus_scriptorum_latinorum',
'canonical-latinLit',
'canonical-greekLit',
'perseus_secondary',
'perseus_legacy',
'First1KGreek',
'propylaeum_BOOKS',
'propylaeum_DOK',
'agoraclass',
]

corpora_stats = {}

for corpus_id in walk_dirs(vs.ROOT_STORING_DIR):
corpus_id = corpus_id.stem
corpus_id = 'EpibauCorpus'
if corpus_id in DONE:
continue
print('---------------------------------')
print(corpus_id)
try:
corpus = Corpus.auto_init(corpus_id)
corpora_stats[corpus_id] = len(corpus.get_plain_text())
print(corpora_stats[corpus_id])
except Exception as e:
print('Skipping corpus:', corpus_id, e)
break
20 changes: 20 additions & 0 deletions ajmc/corpora/_scripts/read_vicipaedia.py
@@ -0,0 +1,20 @@
from pathlib import Path

# Read an XML file with bs4
from bs4 import BeautifulSoup

file_path = Path('/Users/sven/Desktop/lawiki-20240320-pages-articles-multistream.xml')
soup = BeautifulSoup(file_path.read_text('utf-8'), features='xml')

# find all the element named 'page' in the soup
pages = soup.find_all('page')

print(len(pages))

print(pages[0].prettify())

# We now get the text of the first page
text = pages[0].text

# Rough size estimate of the text in GB (character count / 1e9)
size = len(text) / 1e9
24 changes: 24 additions & 0 deletions ajmc/corpora/corpora_classes.py
@@ -210,3 +210,27 @@ def get_lexica(self) -> typing.Dict[str, typing.Dict[str, str]]:
dict_ = json.loads(file.read_text(encoding='utf-8'))
lexica[file.stem] = dict_
return lexica


class EpibauCorpus(Corpus):

def __init__(self, corpus_id: str = 'EpibauCorpus'):
super().__init__(corpus_id)

@property
def data_dir(self) -> Path:
return self.root_dir / 'data/release/v0.3/'

@property
def files(self) -> typing.List[Path]:
return [p for p in self.data_dir.rglob('*.tsv') if 'masked' not in p.name]


def get_plain_text(self) -> str:
if (self.root_dir / 'plaintext.txt').exists():
return (self.root_dir / 'plaintext.txt').read_text(encoding='utf-8')
text = ''
for file in self.files:
text += ' '.join([l.split('\t')[0] for l in file.read_text(encoding='utf-8').splitlines()[1:]])
(self.root_dir / 'plaintext.txt').write_text(text, encoding='utf-8')
return text
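
A minimal usage sketch for the new ``EpibauCorpus`` class, assuming the corpus data is present under the configured corpora root directory:

from ajmc.corpora.corpora_classes import EpibauCorpus

corpus = EpibauCorpus()          # corpus_id defaults to 'EpibauCorpus'
text = corpus.get_plain_text()   # built from the v0.3 release TSVs, then cached as plaintext.txt
print(len(text), 'characters')
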