Skip to content

Commit

Permalink
Merge pull request #17 from Yinwhe/main
Browse files Browse the repository at this point in the history
FEAT add livetext support
  • Loading branch information
straussmaximilian authored Nov 7, 2024
2 parents 9a24407 + e9ffcdf commit cbfc416
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 5 deletions.
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,19 @@ MacBook Pro (14-inch, 2021):
- `fast`: 200 ms ± 4.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## About LiveText
Since macOS Sonoma, `LiveText` is supported, which is generally more accurate than the `Vision` framework OCR. You can try this feature as follows:
```python
# Use the OCR class
from ocrmac import ocrmac
annotations = ocrmac.OCR('test.png', framework="livetext").recognize()
print(annotations)

# Or use the helper directly
annotations = ocrmac.livetext_from_image('test.png')
```
Note that when using this feature, the `recognition_level` and `confidence_threshold` parameters are not available. The `confidence` output will always be 1.

## Technical Background & Motivation
If you want to do Optical character recognition (OCR) with Python, widely used tools are [`pytesseract`](https://github.com/madmaze/pytesseract) or [`EasyOCR`](https://github.com/JaidedAI/EasyOCR). For me, tesseract never did give great results. EasyOCR did, but it is slow on CPU. While there is GPU acceleration with CUDA, this does not work for Mac. *(Update from 9/2023: Apparently EasyOCR now has mps support for Mac.)*
In any case, as a Mac user you might notice that you can, with newer versions, directly copy and paste from images. The built-in OCR functionality is quite good. The underlying functionality for this is [`VNRecognizeTextRequest`](https://developer.apple.com/documentation/vision/vnrecognizetextrequest) from Apple's Vision Framework. Unfortunately it is in Swift; luckily, a wrapper for this exists. [`pyobjc-framework-Vision`](https://github.com/ronaldoussoren/pyobjc). `ocrmac` utilizes this wrapper and provides an easy interface to use this for OCR.
Expand Down
152 changes: 147 additions & 5 deletions ocrmac/ocrmac.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,48 @@
MATPLOTLIB_AVAILABLE = False


# Probe LiveText availability at import time: VKCImageAnalyzer is a
# VisionKit class used by livetext_from_image() below.  If the pyobjc
# AppKit / CoreFoundation bindings are missing, LIVETEXT_AVAILABLE is set
# to False and livetext_from_image() will refuse to run.
try:
    from AppKit import NSData, NSImage
    from CoreFoundation import (
        CFRunLoopRunInMode,
        kCFRunLoopDefaultMode,
        CFRunLoopStop,
        CFRunLoopGetCurrent,
    )

    # Register block-signature metadata for the selector
    # processRequest:progressHandler:completionHandler: so pyobjc can
    # marshal its two Objective-C block arguments correctly.
    objc.registerMetaDataForSelector(
        b"VKCImageAnalyzer",
        b"processRequest:progressHandler:completionHandler:",
        {
            "arguments": {
                # Argument 3: progress handler block, void (^)(double).
                3: {
                    "callable": {
                        "retval": {"type": b"v"},
                        "arguments": {
                            0: {"type": b"^v"},  # implicit block pointer
                            1: {"type": b"d"},   # progress value (double)
                        },
                    }
                },
                # Argument 4: completion handler block, void (^)(id, id) —
                # called with (analysis, error); see process_handler below.
                4: {
                    "callable": {
                        "retval": {"type": b"v"},
                        "arguments": {
                            0: {"type": b"^v"},  # implicit block pointer
                            1: {"type": b"@"},   # analysis result object
                            2: {"type": b"@"},   # error object
                        },
                    }
                },
            }
        },
    )

    LIVETEXT_AVAILABLE = True
except ImportError:
    # pyobjc bindings (or the frameworks themselves) are unavailable,
    # e.g. when not running on macOS.
    LIVETEXT_AVAILABLE = False


def pil2buf(pil_image: Image.Image):
"""Convert PIL image to buffer"""
buffer = io.BytesIO()
Expand Down Expand Up @@ -131,12 +173,103 @@ def text_from_image(
return res


def livetext_from_image(image, language_preference=None, detail=True):
    """
    Helper function to call VKCImageAnalyzer from Apple's LiveText framework.

    :param image: Path to image (str) or PIL Image.Image.
    :param language_preference: Language preference as a list of locale
        strings (e.g. ``['en-US']``). Defaults to None.
    :param detail: Whether to return the bounding box or not. Defaults to True.
    :returns: List of tuples of the form (text, confidence, [x, y, width, height]).
        The confidence is always 1.0, since LiveText does not report one.
        The bounding box (x, y, width, height) is composed of numbers between 0 and 1,
        that represent a percentage from total image (width, height) accordingly.
        You can use the `convert_coordinates_*` functions to convert them to pixels.
        If ``detail`` is False, only the recognized strings are returned.
        For more info, see https://developer.apple.com/documentation/vision/vndetectedobjectobservation/2867227-boundingbox?language=objc
        and https://developer.apple.com/documentation/vision/vnrectangleobservation?language=objc
    :raises ImportError: if LiveText support is not available on this system.
    :raises ValueError: if ``image`` or ``language_preference`` has an invalid type.
    :raises RuntimeError: if the analysis reports an error.
    """

    if not LIVETEXT_AVAILABLE:
        raise ImportError(
            "Invalid framework selected, LiveText is not available. "
            "Please make sure your system is running macOS Sonoma or later, "
            "and essential packages are installed."
        )

    if isinstance(image, str):
        image = Image.open(image)
    elif not isinstance(image, Image.Image):
        raise ValueError("Invalid image format. Image must be a path or a PIL image.")

    if language_preference is not None and not isinstance(language_preference, list):
        raise ValueError(
            "Invalid language preference format. Language preference must be a list."
        )

    def pil2nsimage(pil_image: Image.Image):
        # Round-trip through TIFF, which NSImage understands natively.
        image_bytes = io.BytesIO()
        pil_image.save(image_bytes, format="TIFF")
        data = image_bytes.getvalue()
        ns_data = NSData.dataWithBytes_length_(data, len(data))
        return NSImage.alloc().initWithData_(ns_data)

    ns_image = pil2nsimage(image)

    # Initialize the image analyzer and a text-analysis request.
    analyzer = objc.lookUpClass("VKCImageAnalyzer").alloc().init()
    request = (
        objc.lookUpClass("VKCImageAnalyzerRequest")
        .alloc()
        .initWithImage_requestType_(ns_image, 1)  # VKAnalysisTypeText
    )

    # Set the language preference
    if language_preference is not None:
        request.setLocales_(language_preference)

    result = []
    # Errors are collected here instead of being raised inside the
    # Objective-C callback: an exception raised there never reaches the
    # caller, and would previously leave the run loop blocked until the
    # 10 s timeout, silently returning an empty result.
    errors = []

    # Analysis completion callback (argument 4 of the registered selector).
    def process_handler(analysis, error):
        try:
            if error:
                errors.append(error)
            else:
                lines = analysis.allLines()
                if lines:
                    for line in lines:
                        for char in line.children():
                            char_text = char.string()
                            if detail:
                                bounding_box = char.quad().boundingBox()
                                x, y = bounding_box.origin.x, bounding_box.origin.y
                                w, h = bounding_box.size.width, bounding_box.size.height
                                # More process on y, it differs from the
                                # vision framework: flip so both agree.
                                y = 1 - y - h
                                result.append((char_text, 1.0, [x, y, w, h]))
                            else:
                                result.append(char_text)
        finally:
            # Always stop the run loop — even on error — so the caller
            # does not block for the full timeout below.
            CFRunLoopStop(CFRunLoopGetCurrent())

    # Kick off the asynchronous analysis.
    analyzer.processRequest_progressHandler_completionHandler_(
        request, lambda progress: None, process_handler
    )

    # Block until the completion handler stops the loop (10 s safety timeout).
    CFRunLoopRunInMode(kCFRunLoopDefaultMode, 10.0, False)

    if errors:
        raise RuntimeError("Error during analysis: " + str(errors[0]))

    return result


class OCR:
def __init__(self, image, recognition_level="accurate", language_preference=None, confidence_threshold=0.0, detail=True):
def __init__(self, image, framework="vision", recognition_level="accurate", language_preference=None, confidence_threshold=0.0, detail=True):
"""OCR class to extract text from images.
Args:
image (str or PIL image): Path to image or PIL image.
framework (str, optional): Framework to use. Defaults to 'vision'.
recognition_level (str, optional): Recognition level. Defaults to 'accurate'.
language_preference (list, optional): Language preference. Defaults to None.
param confidence_threshold: Confidence threshold. Defaults to 0.0.
Expand All @@ -149,8 +282,12 @@ def __init__(self, image, recognition_level="accurate", language_preference=None
raise ValueError(
"Invalid image format. Image must be a path or a PIL image."
)

if framework not in {"vision", "livetext"}:
raise ValueError("Invalid framework selected. Framework must be 'vision' or 'livetext'.")

self.image = image
self.framework = framework
self.recognition_level = recognition_level
self.language_preference = language_preference
self.confidence_threshold = confidence_threshold
Expand All @@ -160,9 +297,14 @@ def __init__(self, image, recognition_level="accurate", language_preference=None
def recognize(
self, px=False
) -> List[Tuple[str, float, Tuple[float, float, float, float]]]:
res = text_from_image(
self.image, self.recognition_level, self.language_preference, self.confidence_threshold, detail=self.detail
)
if self.framework == "vision":
res = text_from_image(
self.image, self.recognition_level, self.language_preference, self.confidence_threshold, detail=self.detail
)
else:
res = livetext_from_image(
self.image, self.language_preference, detail=self.detail
)
self.res = res

if px:
Expand Down Expand Up @@ -241,4 +383,4 @@ def annotate_PIL(self, color="red", fontsize=12) -> Image.Image:
draw.rectangle((x1, y1, x2, y2), outline=color)
draw.text((x1, y2), text, font=font, align="left", fill=color)

return annotated_image
return annotated_image
7 changes: 7 additions & 0 deletions tests/test_ocrmac.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,4 +115,11 @@ def test_accurate(self):
ref_image = Image.open(os.path.join(THIS_FOLDER, "test_output_accurate.png"))
rms = rms_difference(annotated, ref_image)

assert rms < 5.0

def test_livetext(self):
    """The LiveText-annotated image should closely match the stored reference."""
    source_path = os.path.join(THIS_FOLDER, "test.png")
    annotated = ocrmac.OCR(
        source_path, framework="livetext", language_preference=["en-US"]
    ).annotate_PIL()
    reference = Image.open(os.path.join(THIS_FOLDER, "test_output_livetext.png"))
    assert rms_difference(annotated, reference) < 5.0
Binary file added tests/test_output_livetext.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit cbfc416

Please sign in to comment.