-
Notifications
You must be signed in to change notification settings - Fork 31
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add support for set_page_seg_mode (#36)
* Add support for set_page_seg_mode #33 * Add final new line to python file
- Loading branch information
Showing
6 changed files
with
167 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[package] | ||
name = "tesseract" | ||
version = "0.12.1" | ||
version = "0.13.0" | ||
authors = ["Kevin Kwok <[email protected]>", "Chris Couzens <[email protected]>"] | ||
documentation = "https://docs.rs/tesseract" | ||
repository = "https://github.com/antimatter15/tesseract-rs" | ||
|
@@ -11,5 +11,5 @@ categories = ["api-bindings", "multimedia::images"] | |
|
||
[dependencies] | ||
tesseract-sys = "~0.5" | ||
tesseract-plumbing = "~0.8" | ||
tesseract-plumbing = "~0.9" | ||
thiserror = "1.0" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
src/page_seg_mode.rs: page-seg-modes.txt build_page_seg_modes.py | ||
python build_page_seg_modes.py < page-seg-modes.txt | rustfmt > $@ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
import sys | ||
from itertools import islice | ||
|
||
class PageSegMode:
    """One Tesseract page segmentation mode read from the input listing.

    Attributes:
        name: The upstream constant name, e.g. ``PSM_SINGLE_BLOCK``.
        comment: The one-line description emitted as the Rust doc comment.
    """

    def __init__(self, name, comment):
        self.name = name
        self.comment = comment

    def name_as_enum(self):
        """Return the name as a Rust enum variant (``PSM_AUTO_OSD`` -> ``PsmAutoOsd``)."""
        return ''.join(n.capitalize() for n in self.name.split("_"))

    def name_as_variable(self):
        """Return the path of the matching ``tesseract_sys`` constant."""
        return 'tesseract_sys::TessPageSegMode_' + self.name
|
||
# Parse the listing on stdin. The first line (the "Copied from ..." header) is
# skipped; after that the input is read as repeating 3-line records:
#   line 0: constant name, line 1: description, line 2: separator (ignored).
name = None
page_seg_modes = []
i = 0

for line in islice(sys.stdin, 1, None):
    if i == 0:
        name = line.rstrip('\n')
    elif i == 1:
        comment = line.rstrip('\n')
        page_seg_modes.append(PageSegMode(name, comment))
    i = (i + 1) % 3

# Emit the Rust module on stdout. Indentation of the emitted lines is not
# significant: the Makefile recipe pipes this output through rustfmt.
print("// ⚠️ This file is generated")
print("// ⚠️ Regenerate with `make src/page_seg_mode.rs`")
print()
print("use tesseract_sys::TessPageSegMode;")
print()
print("/// Enum representing different PageSegMode options accepted by Tesseract")
print("#[derive(Debug, Clone, Copy, PartialEq, Eq)]")
print("pub enum PageSegMode {")

# One documented variant per parsed mode, in input order.
for page_seg_mode in page_seg_modes:
    print(f" /// {page_seg_mode.comment}")
    print(f" {page_seg_mode.name_as_enum()},")

print("}")
print()
print("impl PageSegMode {")
print(" /// Get the page-seg-mode's value as used by Tesseract")
print(" pub fn as_tess_page_seg_mode(&self) -> TessPageSegMode {")
print(" match self {")

# One match arm per mode, mapping the variant to its tesseract_sys constant.
for page_seg_mode in page_seg_modes:
    print(f" PageSegMode::{page_seg_mode.name_as_enum()} => {page_seg_mode.name_as_variable()},")

print(" }")
print(" }")
print("}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# Copied from https://tesseract-ocr.github.io/tessapi/5.x/a01818.html#a338d4c8b5d497b5ec3e6e4269d8ac66aab76fe3ca390d99e070ea60b892ee18ef | ||
PSM_OSD_ONLY | ||
Orientation and script detection only. | ||
|
||
PSM_AUTO_OSD | ||
Automatic page segmentation with orientation and script detection. (OSD) | ||
|
||
PSM_AUTO_ONLY | ||
Automatic page segmentation, but no OSD, or OCR. | ||
|
||
PSM_AUTO | ||
Fully automatic page segmentation, but no OSD. | ||
|
||
PSM_SINGLE_COLUMN | ||
Assume a single column of text of variable sizes. | ||
|
||
PSM_SINGLE_BLOCK_VERT_TEXT | ||
Assume a single uniform block of vertically aligned text. | ||
|
||
PSM_SINGLE_BLOCK | ||
Assume a single uniform block of text. (Default.) | ||
|
||
PSM_SINGLE_LINE | ||
Treat the image as a single text line. | ||
|
||
PSM_SINGLE_WORD | ||
Treat the image as a single word. | ||
|
||
PSM_CIRCLE_WORD | ||
Treat the image as a single word in a circle. | ||
|
||
PSM_SINGLE_CHAR | ||
Treat the image as a single character. | ||
|
||
PSM_SPARSE_TEXT | ||
Find as much text as possible in no particular order. | ||
|
||
PSM_SPARSE_TEXT_OSD | ||
Sparse text with orientation and script det. | ||
|
||
PSM_RAW_LINE | ||
Treat the image as a single text line, bypassing hacks that are Tesseract-specific. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
// ⚠️ This file is generated | ||
// ⚠️ Regenerate with `make src/page_seg_mode.rs` | ||
|
||
use tesseract_sys::TessPageSegMode; | ||
|
||
/// Enum representing different PageSegMode options accepted by Tesseract
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PageSegMode {
    /// Orientation and script detection only.
    PsmOsdOnly,
    /// Automatic page segmentation with orientation and script detection. (OSD)
    PsmAutoOsd,
    /// Automatic page segmentation, but no OSD, or OCR.
    PsmAutoOnly,
    /// Fully automatic page segmentation, but no OSD.
    PsmAuto,
    /// Assume a single column of text of variable sizes.
    PsmSingleColumn,
    /// Assume a single uniform block of vertically aligned text.
    PsmSingleBlockVertText,
    /// Assume a single uniform block of text. (Default.)
    PsmSingleBlock,
    /// Treat the image as a single text line.
    PsmSingleLine,
    /// Treat the image as a single word.
    PsmSingleWord,
    /// Treat the image as a single word in a circle.
    PsmCircleWord,
    /// Treat the image as a single character.
    PsmSingleChar,
    /// Find as much text as possible in no particular order.
    PsmSparseText,
    /// Sparse text with orientation and script det.
    PsmSparseTextOsd,
    /// Treat the image as a single text line, bypassing hacks that are Tesseract-specific.
    PsmRawLine,
}
|
||
impl PageSegMode { | ||
/// Get the page-seg-mode's value as used by Tesseract | ||
pub fn as_tess_page_seg_mode(&self) -> TessPageSegMode { | ||
match self { | ||
PageSegMode::PsmOsdOnly => tesseract_sys::TessPageSegMode_PSM_OSD_ONLY, | ||
PageSegMode::PsmAutoOsd => tesseract_sys::TessPageSegMode_PSM_AUTO_OSD, | ||
PageSegMode::PsmAutoOnly => tesseract_sys::TessPageSegMode_PSM_AUTO_ONLY, | ||
PageSegMode::PsmAuto => tesseract_sys::TessPageSegMode_PSM_AUTO, | ||
PageSegMode::PsmSingleColumn => tesseract_sys::TessPageSegMode_PSM_SINGLE_COLUMN, | ||
PageSegMode::PsmSingleBlockVertText => { | ||
tesseract_sys::TessPageSegMode_PSM_SINGLE_BLOCK_VERT_TEXT | ||
} | ||
PageSegMode::PsmSingleBlock => tesseract_sys::TessPageSegMode_PSM_SINGLE_BLOCK, | ||
PageSegMode::PsmSingleLine => tesseract_sys::TessPageSegMode_PSM_SINGLE_LINE, | ||
PageSegMode::PsmSingleWord => tesseract_sys::TessPageSegMode_PSM_SINGLE_WORD, | ||
PageSegMode::PsmCircleWord => tesseract_sys::TessPageSegMode_PSM_CIRCLE_WORD, | ||
PageSegMode::PsmSingleChar => tesseract_sys::TessPageSegMode_PSM_SINGLE_CHAR, | ||
PageSegMode::PsmSparseText => tesseract_sys::TessPageSegMode_PSM_SPARSE_TEXT, | ||
PageSegMode::PsmSparseTextOsd => tesseract_sys::TessPageSegMode_PSM_SPARSE_TEXT_OSD, | ||
PageSegMode::PsmRawLine => tesseract_sys::TessPageSegMode_PSM_RAW_LINE, | ||
} | ||
} | ||
} |