Skip to content

Commit

Permalink
Merge pull request #17 from Yinwhe/main
Browse files Browse the repository at this point in the history
FEAT add livetext support
  • Loading branch information
straussmaximilian authored Nov 7, 2024
2 parents 9a24407 + e9ffcdf commit cbfc416
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 5 deletions.
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,19 @@ MacBook Pro (14-inch, 2021):
- `fast`: 200 ms ± 4.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## About LiveText
Since macOS Sonoma, `LiveText` is supported, which is generally more accurate than the `Vision` framework OCR. You can try this feature as follows:
```python
# Use the OCR class
from ocrmac import ocrmac
annotations = ocrmac.OCR('test.png', framework="livetext").recognize()
print(annotations)

# Or use the helper directly
annotations = ocrmac.livetext_from_image('test.png')
```
Note that when using this feature, the `recognition_level` and `confidence_threshold` parameters are not available. The `confidence` output will always be 1.

## Technical Background & Motivation
If you want to do Optical character recognition (OCR) with Python, widely used tools are [`pytesseract`](https://github.com/madmaze/pytesseract) or [`EasyOCR`](https://github.com/JaidedAI/EasyOCR). For me, tesseract never did give great results. EasyOCR did, but it is slow on CPU. While there is GPU acceleration with CUDA, this does not work for Mac. *(Update from 9/2023: Apparently EasyOCR now has mps support for Mac.)*
In any case, as a Mac user you might notice that you can, with newer versions, directly copy and paste from images. The built-in OCR functionality is quite good. The underlying functionality for this is [`VNRecognizeTextRequest`](https://developer.apple.com/documentation/vision/vnrecognizetextrequest) from Apple's Vision Framework. Unfortunately it is in Swift; luckily, a wrapper for this exists. [`pyobjc-framework-Vision`](https://github.com/ronaldoussoren/pyobjc). `ocrmac` utilizes this wrapper and provides an easy interface to use this for OCR.
Expand Down
152 changes: 147 additions & 5 deletions ocrmac/ocrmac.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,48 @@
MATPLOTLIB_AVAILABLE = False


# Probe LiveText availability at import time: VKCImageAnalyzer is a
# VisionKit class used by livetext_from_image() below.  If the pyobjc
# AppKit / CoreFoundation bindings are missing, LIVETEXT_AVAILABLE is set
# to False and livetext_from_image() will refuse to run.
try:
    from AppKit import NSData, NSImage
    from CoreFoundation import (
        CFRunLoopRunInMode,
        kCFRunLoopDefaultMode,
        CFRunLoopStop,
        CFRunLoopGetCurrent,
    )

    # Register block-signature metadata for the selector
    # processRequest:progressHandler:completionHandler: so pyobjc can
    # marshal its two Objective-C block arguments correctly.
    objc.registerMetaDataForSelector(
        b"VKCImageAnalyzer",
        b"processRequest:progressHandler:completionHandler:",
        {
            "arguments": {
                # Argument 3: progress handler block, void (^)(double).
                3: {
                    "callable": {
                        "retval": {"type": b"v"},
                        "arguments": {
                            0: {"type": b"^v"},  # implicit block pointer
                            1: {"type": b"d"},   # progress value (double)
                        },
                    }
                },
                # Argument 4: completion handler block, void (^)(id, id) —
                # called with (analysis, error); see process_handler below.
                4: {
                    "callable": {
                        "retval": {"type": b"v"},
                        "arguments": {
                            0: {"type": b"^v"},  # implicit block pointer
                            1: {"type": b"@"},   # analysis result object
                            2: {"type": b"@"},   # error object
                        },
                    }
                },
            }
        },
    )

    LIVETEXT_AVAILABLE = True
except ImportError:
    # pyobjc bindings (or the frameworks themselves) are unavailable,
    # e.g. when not running on macOS.
    LIVETEXT_AVAILABLE = False


def pil2buf(pil_image: Image.Image):
"""Convert PIL image to buffer"""
buffer = io.BytesIO()
Expand Down Expand Up @@ -131,12 +173,103 @@ def text_from_image(
return res


def livetext_from_image(image, language_preference=None, detail=True):
    """
    Helper function to call VKCImageAnalyzer from Apple's LiveText framework.

    :param image: Path to image (str) or PIL Image.Image.
    :param language_preference: Language preference as a list of locale
        strings (e.g. ``['en-US']``). Defaults to None.
    :param detail: Whether to return the bounding box or not. Defaults to True.
    :returns: List of tuples of the form (text, confidence, [x, y, width, height]).
        The confidence is always 1.0, since LiveText does not report one.
        The bounding box (x, y, width, height) is composed of numbers between 0 and 1,
        that represent a percentage from total image (width, height) accordingly.
        You can use the `convert_coordinates_*` functions to convert them to pixels.
        If ``detail`` is False, only the recognized strings are returned.
        For more info, see https://developer.apple.com/documentation/vision/vndetectedobjectobservation/2867227-boundingbox?language=objc
        and https://developer.apple.com/documentation/vision/vnrectangleobservation?language=objc
    :raises ImportError: if LiveText support is not available on this system.
    :raises ValueError: if ``image`` or ``language_preference`` has an invalid type.
    :raises RuntimeError: if the analysis reports an error.
    """

    if not LIVETEXT_AVAILABLE:
        raise ImportError(
            "Invalid framework selected, LiveText is not available. "
            "Please make sure your system is running macOS Sonoma or later, "
            "and essential packages are installed."
        )

    if isinstance(image, str):
        image = Image.open(image)
    elif not isinstance(image, Image.Image):
        raise ValueError("Invalid image format. Image must be a path or a PIL image.")

    if language_preference is not None and not isinstance(language_preference, list):
        raise ValueError(
            "Invalid language preference format. Language preference must be a list."
        )

    def pil2nsimage(pil_image: Image.Image):
        # Round-trip through TIFF, which NSImage understands natively.
        image_bytes = io.BytesIO()
        pil_image.save(image_bytes, format="TIFF")
        data = image_bytes.getvalue()
        ns_data = NSData.dataWithBytes_length_(data, len(data))
        return NSImage.alloc().initWithData_(ns_data)

    ns_image = pil2nsimage(image)

    # Initialize the image analyzer and a text-analysis request.
    analyzer = objc.lookUpClass("VKCImageAnalyzer").alloc().init()
    request = (
        objc.lookUpClass("VKCImageAnalyzerRequest")
        .alloc()
        .initWithImage_requestType_(ns_image, 1)  # VKAnalysisTypeText
    )

    # Set the language preference
    if language_preference is not None:
        request.setLocales_(language_preference)

    result = []
    # Errors are collected here instead of being raised inside the
    # Objective-C callback: an exception raised there never reaches the
    # caller, and would previously leave the run loop blocked until the
    # 10 s timeout, silently returning an empty result.
    errors = []

    # Analysis completion callback (argument 4 of the registered selector).
    def process_handler(analysis, error):
        try:
            if error:
                errors.append(error)
            else:
                lines = analysis.allLines()
                if lines:
                    for line in lines:
                        for char in line.children():
                            char_text = char.string()
                            if detail:
                                bounding_box = char.quad().boundingBox()
                                x, y = bounding_box.origin.x, bounding_box.origin.y
                                w, h = bounding_box.size.width, bounding_box.size.height
                                # More process on y, it differs from the
                                # vision framework: flip so both agree.
                                y = 1 - y - h
                                result.append((char_text, 1.0, [x, y, w, h]))
                            else:
                                result.append(char_text)
        finally:
            # Always stop the run loop — even on error — so the caller
            # does not block for the full timeout below.
            CFRunLoopStop(CFRunLoopGetCurrent())

    # Kick off the asynchronous analysis.
    analyzer.processRequest_progressHandler_completionHandler_(
        request, lambda progress: None, process_handler
    )

    # Block until the completion handler stops the loop (10 s safety timeout).
    CFRunLoopRunInMode(kCFRunLoopDefaultMode, 10.0, False)

    if errors:
        raise RuntimeError("Error during analysis: " + str(errors[0]))

    return result


class OCR:
def __init__(self, image, recognition_level="accurate", language_preference=None, confidence_threshold=0.0, detail=True):
def __init__(self, image, framework="vision", recognition_level="accurate", language_preference=None, confidence_threshold=0.0, detail=True):
"""OCR class to extract text from images.
Args:
image (str or PIL image): Path to image or PIL image.
framework (str, optional): Framework to use. Defaults to 'vision'.
recognition_level (str, optional): Recognition level. Defaults to 'accurate'.
language_preference (list, optional): Language preference. Defaults to None.
param confidence_threshold: Confidence threshold. Defaults to 0.0.
Expand All @@ -149,8 +282,12 @@ def __init__(self, image, recognition_level="accurate", language_preference=None
raise ValueError(
"Invalid image format. Image must be a path or a PIL image."
)

if framework not in {"vision", "livetext"}:
raise ValueError("Invalid framework selected. Framework must be 'vision' or 'livetext'.")

self.image = image
self.framework = framework
self.recognition_level = recognition_level
self.language_preference = language_preference
self.confidence_threshold = confidence_threshold
Expand All @@ -160,9 +297,14 @@ def __init__(self, image, recognition_level="accurate", language_preference=None
def recognize(
self, px=False
) -> List[Tuple[str, float, Tuple[float, float, float, float]]]:
res = text_from_image(
self.image, self.recognition_level, self.language_preference, self.confidence_threshold, detail=self.detail
)
if self.framework == "vision":
res = text_from_image(
self.image, self.recognition_level, self.language_preference, self.confidence_threshold, detail=self.detail
)
else:
res = livetext_from_image(
self.image, self.language_preference, detail=self.detail
)
self.res = res

if px:
Expand Down Expand Up @@ -241,4 +383,4 @@ def annotate_PIL(self, color="red", fontsize=12) -> Image.Image:
draw.rectangle((x1, y1, x2, y2), outline=color)
draw.text((x1, y2), text, font=font, align="left", fill=color)

return annotated_image
return annotated_image
7 changes: 7 additions & 0 deletions tests/test_ocrmac.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,4 +115,11 @@ def test_accurate(self):
ref_image = Image.open(os.path.join(THIS_FOLDER, "test_output_accurate.png"))
rms = rms_difference(annotated, ref_image)

assert rms < 5.0

def test_livetext(self):
    """The LiveText-annotated image should closely match the stored reference."""
    source_path = os.path.join(THIS_FOLDER, "test.png")
    annotated = ocrmac.OCR(
        source_path, framework="livetext", language_preference=["en-US"]
    ).annotate_PIL()
    reference = Image.open(os.path.join(THIS_FOLDER, "test_output_livetext.png"))
    assert rms_difference(annotated, reference) < 5.0
Binary file added tests/test_output_livetext.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit cbfc416

Please sign in to comment.