forked from sirfz/tesserocr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tesserocr.pyx
2536 lines (2060 loc) · 89.9 KB
/
tesserocr.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!python
#cython: c_string_type=unicode, c_string_encoding=utf-8
"""Python wrapper around the Tesseract-OCR C++ API
This module provides a wrapper class :class:`PyTessBaseAPI` to call
Tesseract API methods. See :class:`PyTessBaseAPI` for details.
In addition, helper functions are provided for ocr operations:
>>> text = image_to_text(Image.open('./image.jpg').convert('L'), lang='eng')
>>> text = file_to_text('./image.jpg', psm=PSM.AUTO)
>>> print tesseract_version()
tesseract 3.04.00
leptonica-1.72
libjpeg 8d (libjpeg-turbo 1.3.0) : libpng 1.2.51 : libtiff 4.0.3 : zlib 1.2.8
>>> get_languages()
('/usr/share/tesseract-ocr/tessdata/',
['eng', 'osd', 'equ'])
"""
__version__ = '2.4.0'
import os
from io import BytesIO
from os.path import abspath, join
try:
from PIL import Image
except ImportError:
# PIL.Image won't be supported
pass
from tesseract cimport *
from libc.stdlib cimport malloc, free
from cpython.version cimport PY_MAJOR_VERSION
cdef bytes _b(s):
    """Return ``s`` encoded as UTF-8 bytes when it is a text string.

    Args:
        s: Value about to be handed to a C API expecting ``char *``.
            Text strings (``str`` on Python 3, ``unicode`` on Python 2)
            are encoded; any other value is returned unchanged and must
            already satisfy the declared ``bytes`` return type.

    Returns:
        bytes: UTF-8 encoding of ``s``, or ``s`` itself if it is not text.
    """
    # The major version is exactly 3 on Python 3, so the comparison has to be
    # inclusive; with a strict ``> 3`` this branch could never be taken.
    if PY_MAJOR_VERSION >= 3:
        if isinstance(s, str):
            return s.encode('UTF-8')
    elif isinstance(s, unicode):
        return s.encode('UTF-8')
    return s
# default parameters
setMsgSeverity(L_SEVERITY_NONE) # suppress leptonica error messages

# A throw-away API instance is created at import time only to discover the
# default data path and default language; it is ended again below.
cdef TessBaseAPI _api = TessBaseAPI()
# NOTE(review): '/dev/null' is a POSIX path - confirm behaviour on other platforms.
_api.SetVariable('debug_file', '/dev/null') # suppress tesseract debug messages
# No datapath and no language are passed; the values Tesseract ends up with
# are read back below.
_api.Init(NULL, NULL)
IF TESSERACT_VERSION >= 0x3999800:
    cdef _DEFAULT_PATH = _api.GetDatapath() # "tessdata/" is not appended by tesseract since commit dba13db
ELSE:
    # Older versions: use the parent directory of the reported path,
    # terminated with a path separator.
    cdef _DEFAULT_PATH = abspath(join(_api.GetDatapath(), os.pardir)) + os.sep
_init_lang = _api.GetInitLanguagesAsString()
# Fall back to English when no initialised language is reported.
if _init_lang == '':
    _init_lang = 'eng'
cdef _DEFAULT_LANG = _init_lang
# Release the probe instance and the cache it may have populated.
_api.End()
TessBaseAPI.ClearPersistentCache()
cdef class _Enum:
    """Base class for the constant namespaces defined in this module.

    Subclasses only carry class-level constants; creating an instance
    raises :exc:`TypeError`.
    """

    def __init__(self):
        enum_name = type(self).__name__
        raise TypeError('{} is an enum and cannot be instantiated'.format(enum_name))
cdef class OEM(_Enum):
    """An enum that defines available OCR engine modes.

    Which members exist depends on the Tesseract version this module was
    compiled against (see the compile-time ``IF`` below).

    Attributes:
        TESSERACT_ONLY: Run Tesseract only - fastest
        LSTM_ONLY: Run just the LSTM line recognizer. (>=v4.00)
        TESSERACT_LSTM_COMBINED: Run the LSTM recognizer, but allow fallback
            to Tesseract when things get difficult. (>=v4.00)
        CUBE_ONLY: Run Cube only - better accuracy, but slower. (<v4.00)
        TESSERACT_CUBE_COMBINED: Run both and combine results - best
            accuracy. (<v4.00)
        DEFAULT: Specify this mode when calling Init*(), to indicate that
            any of the above modes should be automatically inferred from the
            variables in the language-specific config, command-line configs, or
            if not specified in any of the above should be set to the default
            `OEM.TESSERACT_ONLY`.
    """
    TESSERACT_ONLY = OEM_TESSERACT_ONLY
    IF TESSERACT_VERSION >= 0x3999800:
        LSTM_ONLY = OEM_LSTM_ONLY
        TESSERACT_LSTM_COMBINED = OEM_TESSERACT_LSTM_COMBINED
    ELSE:
        CUBE_ONLY = OEM_CUBE_ONLY
        TESSERACT_CUBE_COMBINED = OEM_TESSERACT_CUBE_COMBINED
    DEFAULT = OEM_DEFAULT
cdef class PSM(_Enum):
    """Page segmentation modes.

    Attributes:
        OSD_ONLY: Orientation and script detection only.
        AUTO_OSD: Automatic page segmentation with orientation and script
            detection. (OSD)
        AUTO_ONLY: Automatic page segmentation, but no OSD, or OCR.
        AUTO: Fully automatic page segmentation, but no OSD.
            (:mod:`tesserocr` default)
        SINGLE_COLUMN: Assume a single column of text of variable sizes.
        SINGLE_BLOCK_VERT_TEXT: Assume a single uniform block of vertically
            aligned text.
        SINGLE_BLOCK: Assume a single uniform block of text. (Default.)
        SINGLE_LINE: Treat the image as a single text line.
        SINGLE_WORD: Treat the image as a single word.
        CIRCLE_WORD: Treat the image as a single word in a circle.
        SINGLE_CHAR: Treat the image as a single character.
        SPARSE_TEXT: Find as much text as possible in no particular order.
        SPARSE_TEXT_OSD: Sparse text with orientation and script det.
        RAW_LINE: Treat the image as a single text line, bypassing hacks
            that are Tesseract-specific.
        COUNT: Number of enum entries.
    """

    #: Orientation and script detection only.
    OSD_ONLY = PSM_OSD_ONLY
    #: Automatic page segmentation with orientation and script detection. (OSD)
    AUTO_OSD = PSM_AUTO_OSD
    #: Automatic page segmentation, but no OSD, or OCR.
    AUTO_ONLY = PSM_AUTO_ONLY
    #: Fully automatic page segmentation, but no OSD. (tesserocr default)
    AUTO = PSM_AUTO
    #: Assume a single column of text of variable sizes.
    SINGLE_COLUMN = PSM_SINGLE_COLUMN
    #: Assume a single uniform block of vertically aligned text.
    SINGLE_BLOCK_VERT_TEXT = PSM_SINGLE_BLOCK_VERT_TEXT
    #: Assume a single uniform block of text. (Default.)
    SINGLE_BLOCK = PSM_SINGLE_BLOCK
    #: Treat the image as a single text line.
    SINGLE_LINE = PSM_SINGLE_LINE
    #: Treat the image as a single word.
    SINGLE_WORD = PSM_SINGLE_WORD
    #: Treat the image as a single word in a circle.
    CIRCLE_WORD = PSM_CIRCLE_WORD
    #: Treat the image as a single character.
    SINGLE_CHAR = PSM_SINGLE_CHAR
    #: Find as much text as possible in no particular order.
    SPARSE_TEXT = PSM_SPARSE_TEXT
    #: Sparse text with orientation and script det.
    SPARSE_TEXT_OSD = PSM_SPARSE_TEXT_OSD
    #: Treat the image as a single text line, bypassing hacks that are Tesseract-specific.
    RAW_LINE = PSM_RAW_LINE
    #: Number of enum entries.
    COUNT = PSM_COUNT
cdef class RIL(_Enum):
    """Page iterator levels, from coarsest to finest.

    Attributes:
        BLOCK: of text/image/separator line.
        PARA: within a block.
        TEXTLINE: within a paragraph.
        WORD: within a textline.
        SYMBOL: character within a word.
    """

    #: of text/image/separator line.
    BLOCK = RIL_BLOCK
    #: within a block.
    PARA = RIL_PARA
    #: within a paragraph.
    TEXTLINE = RIL_TEXTLINE
    #: within a textline.
    WORD = RIL_WORD
    #: character within a word.
    SYMBOL = RIL_SYMBOL
cdef class PT(_Enum):
    """Poly block types, as reported for layout blocks.

    Attributes:
        UNKNOWN: Type is not yet known. Keep as the first element.
        FLOWING_TEXT: Text that lives inside a column.
        HEADING_TEXT: Text that spans more than one column.
        PULLOUT_TEXT: Text that is in a cross-column pull-out region.
        EQUATION: Partition belonging to an equation region.
        INLINE_EQUATION: Partition has inline equation.
        TABLE: Partition belonging to a table region.
        VERTICAL_TEXT: Text-line runs vertically.
        CAPTION_TEXT: Text that belongs to an image.
        FLOWING_IMAGE: Image that lives inside a column.
        HEADING_IMAGE: Image that spans more than one column.
        PULLOUT_IMAGE: Image that is in a cross-column pull-out region.
        HORZ_LINE: Horizontal Line.
        VERT_LINE: Vertical Line.
        NOISE: Lies outside of any column.
        COUNT: Count
    """

    #: Type is not yet known. Keep as the first element.
    UNKNOWN = PT_UNKNOWN
    #: Text that lives inside a column.
    FLOWING_TEXT = PT_FLOWING_TEXT
    #: Text that spans more than one column.
    HEADING_TEXT = PT_HEADING_TEXT
    #: Text that is in a cross-column pull-out region.
    PULLOUT_TEXT = PT_PULLOUT_TEXT
    #: Partition belonging to an equation region.
    EQUATION = PT_EQUATION
    #: Partition has inline equation.
    INLINE_EQUATION = PT_INLINE_EQUATION
    #: Partition belonging to a table region.
    TABLE = PT_TABLE
    #: Text-line runs vertically.
    VERTICAL_TEXT = PT_VERTICAL_TEXT
    #: Text that belongs to an image.
    CAPTION_TEXT = PT_CAPTION_TEXT
    #: Image that lives inside a column.
    FLOWING_IMAGE = PT_FLOWING_IMAGE
    #: Image that spans more than one column.
    HEADING_IMAGE = PT_HEADING_IMAGE
    #: Image that is in a cross-column pull-out region.
    PULLOUT_IMAGE = PT_PULLOUT_IMAGE
    #: Horizontal Line.
    HORZ_LINE = PT_HORZ_LINE
    #: Vertical Line.
    VERT_LINE = PT_VERT_LINE
    #: Lies outside of any column.
    NOISE = PT_NOISE
    #: Count
    COUNT = PT_COUNT
cdef class Orientation(_Enum):
    """Enum for orientation options.

    Each member re-exports the ``ORIENTATION_*`` constant of the same name.
    Values of this kind are the first element of the tuple returned by
    :meth:`PyPageIterator.Orientation`.
    """
    PAGE_UP = ORIENTATION_PAGE_UP
    PAGE_RIGHT = ORIENTATION_PAGE_RIGHT
    PAGE_DOWN = ORIENTATION_PAGE_DOWN
    PAGE_LEFT = ORIENTATION_PAGE_LEFT
cdef class WritingDirection(_Enum):
    """Enum for writing direction options.

    Each member re-exports the ``WRITING_DIRECTION_*`` constant of the same
    name. Values of this kind are the second element of the tuple returned
    by :meth:`PyPageIterator.Orientation`.
    """
    LEFT_TO_RIGHT = WRITING_DIRECTION_LEFT_TO_RIGHT
    RIGHT_TO_LEFT = WRITING_DIRECTION_RIGHT_TO_LEFT
    TOP_TO_BOTTOM = WRITING_DIRECTION_TOP_TO_BOTTOM
cdef class TextlineOrder(_Enum):
    """Enum for text line order options.

    Each member re-exports the ``TEXTLINE_ORDER_*`` constant of the same
    name. Values of this kind are the third element of the tuple returned
    by :meth:`PyPageIterator.Orientation`.
    """
    LEFT_TO_RIGHT = TEXTLINE_ORDER_LEFT_TO_RIGHT
    RIGHT_TO_LEFT = TEXTLINE_ORDER_RIGHT_TO_LEFT
    TOP_TO_BOTTOM = TEXTLINE_ORDER_TOP_TO_BOTTOM
cdef class Justification(_Enum):
    """Enum for justification options.

    Each member re-exports the ``JUSTIFICATION_*`` constant of the same
    name. Values of this kind are the first element of the tuple returned
    by :meth:`PyPageIterator.ParagraphInfo`.
    """
    UNKNOWN = JUSTIFICATION_UNKNOWN
    LEFT = JUSTIFICATION_LEFT
    CENTER = JUSTIFICATION_CENTER
    RIGHT = JUSTIFICATION_RIGHT
cdef class DIR(_Enum):
    """Strong text direction values.

    Attributes:
        NEUTRAL: Text contains only neutral characters.
        LEFT_TO_RIGHT: Text contains no Right-to-Left characters.
        RIGHT_TO_LEFT: Text contains no Left-to-Right characters.
        MIX: Text contains a mixture of left-to-right and right-to-left
            characters.
    """

    #: Text contains only neutral characters.
    NEUTRAL = DIR_NEUTRAL
    #: Text contains no Right-to-Left characters.
    LEFT_TO_RIGHT = DIR_LEFT_TO_RIGHT
    #: Text contains no Left-to-Right characters.
    RIGHT_TO_LEFT = DIR_RIGHT_TO_LEFT
    #: Text contains a mixture of left-to-right and right-to-left characters.
    MIX = DIR_MIX
cdef unicode _free_str(char *text):
    """Convert a C string to a unicode object and release the C buffer.

    The buffer is passed to ``free`` whether or not the conversion succeeds.

    Args:
        text: C string owned by the caller; it must not be used afterwards.

    Returns:
        unicode: The converted text.
    """
    cdef unicode converted
    try:
        converted = text
    finally:
        free(text)
    return converted
cdef bytes _image_buffer(image):
    """Serialise a PIL image into an in-memory byte string.

    Args:
        image: Object exposing ``save(fileobj, format)`` and ``format``
            (a :class:`PIL.Image`).

    Returns:
        bytes: The image encoded in its own ``format``, or as JPEG when the
        image has no format set.
    """
    stream = BytesIO()
    try:
        image.save(stream, image.format or 'JPEG')
        return stream.getvalue()
    finally:
        stream.close()
cdef _pix_to_image(Pix *pix):
    """Convert Pix object to PIL.Image.

    The pix is written to an in-memory buffer in its own input format (or as
    JPEG when the format is unknown) and re-opened with PIL.

    Args:
        pix: Non-NULL pointer to the pix to convert; ownership stays with
            the caller.

    Returns:
        :class:`PIL.Image`: Fully loaded image.

    Raises:
        :exc:`RuntimeError`: If the pix could not be written to memory.
    """
    cdef:
        # Start from NULL/0 so the ``free`` in the ``finally`` clause is
        # well defined even if the writer fails before assigning a buffer.
        unsigned char *buff = NULL
        size_t size = 0
        int result
        int fmt = pix.informat
    if fmt > 0:
        result = pixWriteMem(&buff, &size, pix, fmt)
    else:
        # write as JPEG if format is unknown
        result = pixWriteMemJpeg(&buff, &size, pix, 0, 0)
    try:
        if result == 1:
            raise RuntimeError("Failed to convert pix image to PIL.Image")
        with BytesIO(<bytes>buff[:size]) as f:
            image = Image.open(f)
            # Force decoding now: the backing stream is closed on exit.
            image.load()
    finally:
        free(buff)
    return image
cdef boxa_to_list(Boxa *boxa):
    """Build a Python list from the boxes stored in a ``Boxa``.

    Args:
        boxa: Boxes array; its first ``n`` entries are read.

    Returns:
        list: One converted box per entry, in array order.
    """
    return [box[0] for box in boxa.box[:boxa.n]]
cdef pixa_to_list(Pixa *pixa):
    """Convert Pixa (Array of pixes and boxes) to list of pix, box tuples.

    Args:
        pixa: Array of pixes with their boxes; its first ``n`` pixes are read.

    Returns:
        list: ``(PIL.Image, box)`` tuples, one per pix.
    """
    # Convert everything eagerly: on Python 3 ``zip`` over a generator is
    # lazy and would dereference ``pixa`` only when the result is consumed,
    # by which time the caller may already have released it.
    images = [_pix_to_image(pix) for pix in pixa.pix[:pixa.n]]
    return list(zip(images, boxa_to_list(pixa.boxa)))
cdef class PyPageIterator:
    """Wrapper around Tesseract's ``PageIterator`` class.
    Returned by :meth:`PyTessBaseAPI.AnalyseLayout`.

    Instances of this class and its subclasses cannot be instantiated from Python.

    Accessing data
    ==============

    Coordinate system:

        Integer coordinates are at the cracks between the pixels.
        The top-left corner of the top-left pixel in the image is at (0,0).
        The bottom-right corner of the bottom-right pixel in the image is at
        (width, height).

        Every bounding box goes from the top-left of the top-left contained
        pixel to the bottom-right of the bottom-right contained pixel, so
        the bounding box of the single top-left pixel in the image is:
        (0,0)->(1,1).

        If an image rectangle has been set in the API, then returned coordinates
        relate to the original (full) image, rather than the rectangle.

    .. note::

        You can iterate through the elements of a level using the :func:`iterate_level`
        helper function:

            >>> for e in iterate_level(api.AnalyseLayout(), RIL.WORD):
            ...     orientation = e.Orientation()

    .. warning::

        This class points to data held within the :class:`PyTessBaseAPI`
        instance, and therefore can only be used while the :class:`PyTessBaseAPI`
        instance still exists and has not been subjected to a call of :meth:`Init`,
        :meth:`SetImage`, :meth:`Recognize`, :meth:`Clear`, :meth:`End`,
        or anything else that changes the internal `PAGE_RES`.
    """

    # Wrapped C++ iterator; deleted in __dealloc__ unless it is NULL.
    cdef PageIterator *_piter

    @staticmethod
    cdef PyPageIterator createPageIterator(PageIterator *piter):
        """Wrap ``piter`` in a new instance, bypassing ``__init__``.

        The returned object takes ownership of ``piter``.
        """
        cdef PyPageIterator pyiter = PyPageIterator.__new__(PyPageIterator)
        pyiter._piter = piter
        return pyiter

    def __cinit__(self):
        self._piter = NULL

    def __dealloc__(self):
        if self._piter != NULL:
            del self._piter

    def __init__(self):
        raise TypeError('{} cannot be instantiated from Python'.format(type(self).__name__))

    def Begin(self):
        """Move the iterator to point to the start of the page to begin an iteration."""
        self._piter.Begin()

    def RestartParagraph(self):
        """Move the iterator to the beginning of the paragraph.

        This class implements this functionality by moving it to the zero indexed
        blob of the first (leftmost) word on the first row of the paragraph.
        """
        self._piter.RestartParagraph()

    def IsWithinFirstTextlineOfParagraph(self):
        """Return whether this iterator points anywhere in the first textline of a
        paragraph."""
        return self._piter.IsWithinFirstTextlineOfParagraph()

    def RestartRow(self):
        """Move the iterator to the beginning of the text line.

        This class implements this functionality by moving it to the zero indexed
        blob of the first (leftmost) word of the row.
        """
        # Pure repositioning call, like Begin() and RestartParagraph():
        # there is no result to hand back.
        self._piter.RestartRow()

    def Next(self, PageIteratorLevel level):
        """Move to the start of the next object at the given level in the
        page hierarchy, and returns false if the end of the page was reached.

        .. note::

            :attr:`RIL.SYMBOL` will skip non-text blocks, but all other
            :class:`RIL` level values will visit each non-text block once.
            Think of non text blocks as containing a single para, with a single line,
            with a single imaginary word.

        Calls to Next with different levels may be freely intermixed.

        This function iterates words in right-to-left scripts correctly, if
        the appropriate language has been loaded into Tesseract.

        Args:
            level (int): Iterator level. See :class:`RIL`.

        Returns:
            bool: ``False`` if the end of the page was reached.
        """
        return self._piter.Next(level)

    def IsAtBeginningOf(self, PageIteratorLevel level):
        """Return whether the iterator is at the start of an object at the given
        level.

        For instance, suppose an iterator it is pointed to the first symbol of the
        first word of the third line of the second paragraph of the first block in
        a page, then::

            it.IsAtBeginningOf(RIL.BLOCK) is False
            it.IsAtBeginningOf(RIL.PARA) is False
            it.IsAtBeginningOf(RIL.TEXTLINE) is True
            it.IsAtBeginningOf(RIL.WORD) is True
            it.IsAtBeginningOf(RIL.SYMBOL) is True

        Args:
            level (int): Iterator level. See :class:`RIL`.

        Returns:
            bool: ``True`` if the iterator is at the start of an object at the
                given level.
        """
        return self._piter.IsAtBeginningOf(level)

    def IsAtFinalElement(self, PageIteratorLevel level, PageIteratorLevel element):
        """Return whether the iterator is positioned at the last element in a
        given level. (e.g. the last word in a line, the last line in a block)

        Here's some two-paragraph example
        text:

            It starts off innocuously
            enough but quickly turns bizarre.

            The author inserts a cornucopia
            of words to guard against confused
            references.

        Now take an iterator ``it`` pointed to the start of "bizarre."::

            it.IsAtFinalElement(RIL.PARA, RIL.SYMBOL) = False
            it.IsAtFinalElement(RIL.PARA, RIL.WORD) = True
            it.IsAtFinalElement(RIL.BLOCK, RIL.WORD) = False

        Args:
            level (int): Iterator Level. See :class:`RIL`.
            element (int): Element level. See :class:`RIL`.

        Returns:
            bool: ``True`` if the iterator is positioned at the last element
                in the given level.
        """
        return self._piter.IsAtFinalElement(level, element)

    def SetBoundingBoxComponents(self, bool include_upper_dots, bool include_lower_dots):
        """Controls what to include in a bounding box. Bounding boxes of all levels
        between :attr:`RIL.WORD` and :attr:`RIL.BLOCK` can include or exclude potential diacritics.

        Between layout analysis and recognition, it isn't known where all
        diacritics belong, so this control is used to include or exclude some
        diacritics that are above or below the main body of the word. In most cases
        where the placement is obvious, and after recognition, it doesn't make as
        much difference, as the diacritics will already be included in the word.

        Args:
            include_upper_dots (bool): Include upper dots.
            include_lower_dots (bool): Include lower dots.
        """
        self._piter.SetBoundingBoxComponents(include_upper_dots, include_lower_dots)

    def BoundingBox(self, PageIteratorLevel level, const int padding=0):
        """Return the bounding rectangle of the current object at the given level.

        See comment on coordinate system above.

        Args:
            level (int): Page Iteration Level. See :class:`RIL` for available levels.

        Kwargs:
            padding (int): The padding argument to :meth:`GetImage` can be used to expand
                the image to include more foreground pixels.

        Returns:
            tuple or None if there is no such object at the current position.
                The returned bounding box (left, top, right and bottom values
                respectively) is guaranteed to match the size and position of
                the image returned by :meth:`GetBinaryImage`, but may clip
                foreground pixels from a grey image.
        """
        cdef int left, top, right, bottom
        if not self._piter.BoundingBox(level, padding, &left, &top, &right, &bottom):
            return None
        return left, top, right, bottom

    def BoundingBoxInternal(self, PageIteratorLevel level):
        """Return the bounding rectangle of the object in a coordinate system of the
        working image rectangle having its origin at (rect_left_, rect_top_) with
        respect to the original image and is scaled by a factor scale_.

        Args:
            level (int): Page Iteration Level. See :class:`RIL` for available levels.

        Returns:
            tuple or None if there is no such object at the current position.
                The returned bounding box is represented as a tuple with
                left, top, right and bottom values respectively.
        """
        cdef int left, top, right, bottom
        if not self._piter.BoundingBoxInternal(level, &left, &top, &right, &bottom):
            return None
        return left, top, right, bottom

    def Empty(self, PageIteratorLevel level):
        """Return whether there is no object of a given level.

        Args:
            level (int): Iterator level. See :class:`RIL`.

        Returns:
            bool: ``True`` if there is no object at the given level.
        """
        return self._piter.Empty(level)

    def BlockType(self):
        """Return the type of the current block. See :class:`PT` for
        possible types.
        """
        return self._piter.BlockType()

    def BlockPolygon(self):
        """Return the polygon outline of the current block.

        Returns:
            list or None: list of points (x,y tuples) which list the vertices
                of the polygon, and the last edge is the line segment between the last
                point and the first point.

                ``None`` will be returned if the iterator is
                at the end of the document or layout analysis was not used.
        """
        cdef Pta *pta = self._piter.BlockPolygon()
        if pta == NULL:
            return None
        try:
            # Copy the points out before the buffer is released. A lazy
            # zip() over generators (Python 3) would only read ``pta`` when
            # consumed, i.e. after the ``finally`` clause has freed it.
            return [(pta.x[i], pta.y[i]) for i in range(pta.n)]
        finally:
            # NOTE(review): free() releases only the Pta struct itself;
            # leptonica's ptaDestroy() would also release the coordinate
            # arrays. Confirm it is declared in tesseract.pxd and switch.
            free(pta)

    def GetBinaryImage(self, PageIteratorLevel level):
        """Return a binary image of the current object at the given level.

        The position and size match the return from :meth:`BoundingBoxInternal`, and so
        this could be upscaled with respect to the original input image.

        Args:
            level (int): Iterator level. See :class:`RIL`.

        Returns:
            :class:`PIL.Image`: Image object or None if no image is returned.
        """
        cdef Pix *pix = self._piter.GetBinaryImage(level)
        if pix == NULL:
            return None
        try:
            return _pix_to_image(pix)
        finally:
            pixDestroy(&pix)

    def GetImage(self, PageIteratorLevel level, int padding, original_image):
        """Return an image of the current object at the given level in greyscale
        if available in the input.

        To guarantee a binary image use :meth:`GetBinaryImage`.

        Args:
            level (int): Iterator level. See :class:`RIL`.
            padding (int): Padding by which to expand the returned image.

                .. note::

                    in order to give the best possible image, the bounds are
                    expanded slightly over the binary connected component, by
                    the supplied padding, so the top-left position of the returned
                    image is returned along with the image (left, top respectively).
                    These will most likely not match the coordinates returned by
                    :meth:`BoundingBox`.

            original_image (:class:`PIL.Image`): Original image.
                If you do not supply an original image (None), you will get a binary one.

        Returns:
            tuple: The image (:class:`PIL.Image`) of the current object at the given level in greyscale
                followed by its left and top positions.
                ``None`` if no image is returned for the current position.
        """
        cdef:
            Pix *pix
            Pix *opix = NULL
            size_t size
            cuchar_t *buff
            int left
            int top
        if original_image:
            # ``raw`` must stay referenced while ``buff`` points into it.
            raw = _image_buffer(original_image)
            size = len(raw)
            buff = raw
            opix = pixReadMem(buff, size)
        pix = self._piter.GetImage(level, padding, opix, &left, &top)
        try:
            # Same guard as GetBinaryImage: never hand a NULL pix to
            # _pix_to_image, which dereferences it unconditionally.
            if pix == NULL:
                return None
            return _pix_to_image(pix), left, top
        finally:
            if pix != NULL:
                pixDestroy(&pix)
            if opix != NULL:
                pixDestroy(&opix)

    def Baseline(self, PageIteratorLevel level):
        """Return the baseline of the current object at the given level.

        The baseline is the line that passes through (x1, y1) and (x2, y2).

        .. warning::

            with vertical text, baselines may be vertical!

        Args:
            level (int): Iterator level. See :class:`RIL`.

        Returns:
            tuple: Baseline points' coordinates (x1, y1), (x2, y2).
                ``None`` if there is no baseline at the current position.
        """
        cdef int x1, y1, x2, y2
        if not self._piter.Baseline(level, &x1, &y1, &x2, &y2):
            # Documented "nothing here" value, consistent with BoundingBox().
            return None
        return (x1, y1), (x2, y2)

    def Orientation(self):
        """Return the orientation for the block the iterator points to.

        Returns:
            tuple: The following values are returned respectively::

                orientation: See :class:`Orientation`
                writing_direction: See :class:`WritingDirection`
                textline_order: See :class:`TextlineOrder`
                deskew_angle: After rotating the block so the text orientation is
                    upright, how many radians does one have to rotate the
                    block anti-clockwise for it to be level?
                    -Pi/4 <= deskew_angle <= Pi/4
        """
        cdef:
            TessOrientation orientation
            TessWritingDirection writing_direction
            TessTextlineOrder textline_order
            float deskew_angle
        self._piter.Orientation(&orientation, &writing_direction, &textline_order, &deskew_angle)
        return orientation, writing_direction, textline_order, deskew_angle

    def ParagraphInfo(self):
        """Return information about the current paragraph, if available.

        Returns:
            tuple: The following values are returned respectively::

                justification:
                    LEFT if ragged right, or fully justified and script is left-to-right.
                    RIGHT if ragged left, or fully justified and script is right-to-left.
                    UNKNOWN if it looks like source code or we have very few lines.
                    See :class:`Justification`.
                is_list_item:
                    ``True`` if we believe this is a member of an ordered or unordered list.
                is_crown:
                    ``True`` if the first line of the paragraph is aligned with the other
                    lines of the paragraph even though subsequent paragraphs have first
                    line indents. This typically indicates that this is the continuation
                    of a previous paragraph or that it is the very first paragraph in
                    the chapter.
                first_line_indent:
                    For LEFT aligned paragraphs, the first text line of paragraphs of
                    this kind are indented this many pixels from the left edge of the
                    rest of the paragraph.
                    for RIGHT aligned paragraphs, the first text line of paragraphs of
                    this kind are indented this many pixels from the right edge of the
                    rest of the paragraph.
                    NOTE 1: This value may be negative.
                    NOTE 2: if ``is_crown == True``, the first line of this paragraph is
                        actually flush, and first_line_indent is set to the "common"
                        first_line_indent for subsequent paragraphs in this block
                        of text.
        """
        cdef:
            TessParagraphJustification justification
            bool is_list_item
            bool is_crown
            int first_line_indent
        self._piter.ParagraphInfo(&justification, &is_list_item, &is_crown, &first_line_indent)
        return justification, is_list_item, is_crown, first_line_indent
cdef class PyLTRResultIterator(PyPageIterator):
    """Wrapper around Tesseract's ``LTRResultIterator`` class.

    Adds access to recognition results (text, confidence, font and word
    attributes) on top of the layout accessors of :class:`PyPageIterator`.
    """

    # Wrapped C++ iterator. __dealloc__ treats it as the same object as the
    # inherited ``_piter`` (it deletes this one and clears the other).
    cdef LTRResultIterator *_ltrriter

    def __cinit__(self):
        self._ltrriter = NULL

    def __dealloc__(self):
        if self._ltrriter != NULL:
            del self._ltrriter
            # Stop the base class __dealloc__ from deleting the object again.
            self._piter = NULL

    def GetChoiceIterator(self):
        """Return `PyChoiceIterator` instance to iterate over symbol choices.

        Returns `None` on failure.
        """
        cdef:
            const LTRResultIterator *ltrriter = self._ltrriter
            ChoiceIterator *citer = new ChoiceIterator(ltrriter[0])
        if citer == NULL:
            return None
        return PyChoiceIterator.create(citer)

    def GetUTF8Text(self, PageIteratorLevel level):
        """Returns the UTF-8 encoded text string for the current
        object at the given level.

        Args:
            level (int): Iterator level. See :class:`RIL`.

        Returns:
            unicode: UTF-8 encoded text for the given level's current object.

        Raises:
            :exc:`RuntimeError`: If no text returned.
        """
        cdef char *text = self._ltrriter.GetUTF8Text(level)
        if text == NULL:
            raise RuntimeError('No text returned')
        return _free_str(text)

    def SetLineSeparator(self, separator):
        """Set the string inserted at the end of each text line. "\\n" by default."""
        cdef bytes py_sep = _b(separator)
        self._ltrriter.SetLineSeparator(py_sep)

    def SetParagraphSeparator(self, separator):
        """Set the string inserted at the end of each paragraph. "\\n" by default."""
        cdef bytes py_sep = _b(separator)
        self._ltrriter.SetParagraphSeparator(py_sep)

    def Confidence(self, PageIteratorLevel level):
        """Return the mean confidence of the current object at the given level.

        The number should be interpreted as a percent probability. (0.0-100.0)
        """
        return self._ltrriter.Confidence(level)

    def WordFontAttributes(self):
        """Return the font attributes of the current word.

        .. note::
            If iterating at a higher level object than words, eg textlines,
            then this will return the attributes of the first word in that textline.

        Returns:
            dict: `None` if nothing found or a dictionary with the font attributes::

                font_name: String representing a font name. Lifespan is the same as
                    the iterator itself, ie rendered invalid by various members of
                    :class:`PyTessBaseAPI`, including `Init`, `SetImage`, `End` or
                    deleting the :class:`PyTessBaseAPI`.
                bold (bool): ``True`` if bold.
                italic (bool): ``True`` if italic.
                underlined (bool): ``True`` if underlined.
                monospace (bool): ``True`` if monospace.
                serif (bool): ``True`` if serif.
                smallcaps (bool): ``True`` if smallcaps.
                pointsize (int): printers points (1/72 inch.)
                font_id (int): font id.
        """
        cdef:
            bool is_bold
            bool is_italic
            bool is_underlined
            bool is_monospace
            bool is_serif
            bool is_smallcaps
            int pointsize
            int font_id
            cchar_t *font_name
        font_name = self._ltrriter.WordFontAttributes(&is_bold, &is_italic, &is_underlined,
                                                      &is_monospace, &is_serif, &is_smallcaps,
                                                      &pointsize, &font_id)
        if font_name == NULL:
            return None
        return {
            'font_name': font_name,
            'bold': is_bold,
            'italic': is_italic,
            'underlined': is_underlined,
            'monospace': is_monospace,
            'serif': is_serif,
            'smallcaps': is_smallcaps,
            'pointsize': pointsize,
            'font_id': font_id
        }

    def WordRecognitionLanguage(self):
        """Return the name of the language used to recognize this word.

        Returns ``None`` on error.
        """
        cdef cchar_t *lang = self._ltrriter.WordRecognitionLanguage()
        if lang == NULL:
            return None
        return lang

    def WordDirection(self):
        """Return the overall directionality of this word.

        See :class:`DIR` for available values.
        """
        return self._ltrriter.WordDirection()

    def WordIsFromDictionary(self):
        """Return True if the current word was found in a dictionary."""
        return self._ltrriter.WordIsFromDictionary()

    IF TESSERACT_VERSION >= 0x4000000:
        def BlanksBeforeWord(self):
            """Return the number of blanks before the current word."""
            return self._ltrriter.BlanksBeforeWord()

    def WordIsNumeric(self):
        """Return True if the current word is numeric."""
        return self._ltrriter.WordIsNumeric()

    def HasBlamerInfo(self):
        """Return True if the word contains blamer information."""
        return self._ltrriter.HasBlamerInfo()

    def GetBlamerDebug(self):
        """Return a string with blamer information for this word."""
        return self._ltrriter.GetBlamerDebug()

    def GetBlamerMisadaptionDebug(self):
        """Return a string with misadaption information for this word."""
        return self._ltrriter.GetBlamerMisadaptionDebug()

    def HasTruthString(self):
        """Returns True if a truth string was recorded for the current word."""
        return self._ltrriter.HasTruthString()

    def EquivalentToTruth(self, text):
        """Return True if the given string is equivalent to the truth string for
        the current word."""
        cdef bytes py_text = _b(text)
        return self._ltrriter.EquivalentToTruth(py_text)

    def WordTruthUTF8Text(self):
        """Return a UTF-8 encoded truth string for the current word.

        Returns ``None`` if no string is returned.
        """
        cdef char *text = self._ltrriter.WordTruthUTF8Text()
        # _free_str converts the pointer unconditionally, so a NULL result
        # must be filtered out here.
        if text == NULL:
            return None
        return _free_str(text)

    def WordNormedUTF8Text(self):
        """Returns a UTF-8 encoded normalized OCR string for the
        current word.

        Returns ``None`` if no string is returned.
        """
        cdef char *text = self._ltrriter.WordNormedUTF8Text()
        # See WordTruthUTF8Text: guard against a NULL result.
        if text == NULL:
            return None
        return _free_str(text)

    def WordLattice(self):
        """Return a serialized choice lattice.

        Returns ``None`` if no lattice is returned or it is empty.
        """
        cdef:
            cchar_t *word_lattice
            # Initialised so the size check below is well defined even if
            # the call returns without writing the size.
            int lattice_size = 0
        word_lattice = self._ltrriter.WordLattice(&lattice_size)
        if word_lattice == NULL or not lattice_size:
            return None
        return word_lattice[:lattice_size]

    def SymbolIsSuperscript(self):
        """Return True if the current symbol is a superscript.

        If iterating at a higher level object than symbols, eg words, then
        this will return the attributes of the first symbol in that word.
        """
        return self._ltrriter.SymbolIsSuperscript()

    def SymbolIsSubscript(self):
        """Return True if the current symbol is a subscript.

        If iterating at a higher level object than symbols, eg words, then
        this will return the attributes of the first symbol in that word.
        """
        return self._ltrriter.SymbolIsSubscript()

    def SymbolIsDropcap(self):
        """Return True if the current symbol is a dropcap.

        If iterating at a higher level object than symbols, eg words, then
        this will return the attributes of the first symbol in that word.
        """
        return self._ltrriter.SymbolIsDropcap()
cdef class PyResultIterator(PyLTRResultIterator):
"""Wrapper around Tesseract's ``ResultIterator`` class.
.. note::
You can iterate through the elements of a level using the :func:`iterate_level`
helper function:
>>> for e in iterate_level(api.GetIterator(), RIL.WORD):
... word = e.GetUTF8Text()
See :class:`PyPageIterator` for more details.
"""
cdef ResultIterator *_riter
@staticmethod
cdef PyResultIterator createResultIterator(ResultIterator *riter):
cdef PyResultIterator pyiter = PyResultIterator.__new__(PyResultIterator)
pyiter._piter = <PageIterator *>riter
pyiter._ltrriter = <LTRResultIterator *>riter
pyiter._riter = riter
return pyiter