-
Notifications
You must be signed in to change notification settings - Fork 9.6k
/
pageres.cpp
1741 lines (1636 loc) · 62 KB
/
pageres.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**********************************************************************
* File: pageres.cpp (Formerly page_res.c)
* Description: Hierarchy of results classes from PAGE_RES to WERD_RES
* and an iterator class to iterate over the words.
* Main purposes:
* Easy way to iterate over the words without a 3-nested loop.
* Holds data used during word recognition.
* Holds information about alternative spacing paths.
* Author: Phil Cheatle
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "pageres.h"
#include "blamer.h" // for BlamerBundle
#include "blobs.h" // for TWERD, TBLOB
#include "boxword.h" // for BoxWord
#include "errcode.h" // for ASSERT_HOST
#include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)
#include "ocrrow.h" // for ROW, ROW_IT
#include "pdblock.h" // for PDBLK
#include "polyblk.h" // for POLY_BLOCK
#include "seam.h" // for SEAM, start_seam_list
#include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
#include "tprintf.h" // for tprintf
#include <tesseract/publictypes.h> // for OcrEngineMode, OEM_LSTM_ONLY
#include <cassert> // for assert
#include <cstdint> // for INT32_MAX
#include <cstring> // for strlen
struct Pix;
namespace tesseract {
// Gain factor for computing thresholds that determine the ambiguity of a
// word. StopperAmbigThreshold multiplies the difference of two adjustment
// factors by this value.
static const double kStopperAmbiguityThresholdGain = 8.0;
// Constant offset for computing thresholds that determine the ambiguity of a
// word. StopperAmbigThreshold subtracts this value from the scaled
// difference, so the threshold is -1.5 when both adjustment factors are equal.
static const double kStopperAmbiguityThresholdOffset = 1.5;
// Max number of broken pieces to associate. Passed as the second constructor
// argument of the ratings MATRIX in WERD_RES::SetupForRecognition.
const int kWordrecMaxNumJoinChunks = 4;
// Max ratio of word box height to line size to allow it to be processed as
// a line with other words. Used by the ROW_RES constructor when deciding
// whether a word may take part in a combination.
const double kMaxWordSizeRatio = 1.25;
// Max ratio of line box height to line size to allow a new word to be added.
// Applied to the union box of the words merged so far plus the next word.
const double kMaxLineSizeRatio = 1.25;
// Max ratio of word gap to line size to allow a new word to be added.
// Applied to the horizontal distance between the right edge of the words
// merged so far and the left edge of the next word.
const double kMaxWordGapRatio = 2.0;
// Returns the certainty-difference threshold used to decide which word
// choices are worth keeping. f1 and f2 are the adjustment factors of the two
// words being compared; the result is their difference (f2 - f1) scaled by
// kStopperAmbiguityThresholdGain and lowered by
// kStopperAmbiguityThresholdOffset.
// TODO(rays) This is horrible. Replace with an enhance params training model.
static double StopperAmbigThreshold(double f1, double f2) {
  const double adjust_delta = f2 - f1;
  return adjust_delta * kStopperAmbiguityThresholdGain -
         kStopperAmbiguityThresholdOffset;
}
/*************************************************************************
 * PAGE_RES::PAGE_RES
 *
 * Constructor for page results. After Init(), appends one BLOCK_RES to
 * block_res_list for every BLOCK in the_block_list (in list order) and
 * stores prev_word_best_choice_ptr in prev_word_best_choice.
 * merge_similar_words is forwarded unchanged to each BLOCK_RES.
 *************************************************************************/
PAGE_RES::PAGE_RES(bool merge_similar_words, BLOCK_LIST *the_block_list,
                   WERD_CHOICE **prev_word_best_choice_ptr) {
  Init();
  BLOCK_IT source_it(the_block_list);
  BLOCK_RES_IT results_it(&block_res_list);
  source_it.mark_cycle_pt();
  while (!source_it.cycled_list()) {
    BLOCK *current_block = source_it.data();
    results_it.add_to_end(new BLOCK_RES(merge_similar_words, current_block));
    source_it.forward();
  }
  prev_word_best_choice = prev_word_best_choice_ptr;
}
/*************************************************************************
 * BLOCK_RES::BLOCK_RES
 *
 * Constructor for BLOCK results. Resets the counters and font fields to
 * their "nothing known yet" values, remembers the_block, and appends one
 * ROW_RES to row_res_list for every ROW of the block (in list order).
 * merge_similar_words is forwarded unchanged to each ROW_RES.
 *************************************************************************/
BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) {
  // Plain bookkeeping fields first.
  block = the_block;
  row_count = 0;
  char_count = 0;
  rej_count = 0;
  font_assigned = false;
  font_class = -1; // not assigned
  x_height = -1.0;

  // Then wrap every row of the block in a ROW_RES.
  ROW_IT source_it(the_block->row_list());
  ROW_RES_IT results_it(&row_res_list);
  source_it.mark_cycle_pt();
  while (!source_it.cycled_list()) {
    ROW *current_row = source_it.data();
    results_it.add_to_end(new ROW_RES(merge_similar_words, current_row));
    source_it.forward();
  }
}
/*************************************************************************
* ROW_RES::ROW_RES
*
* Constructor for ROW results.
* Creates one WERD_RES per WERD of the_row and appends it to word_res_list.
* Consecutive words that are to be treated together are additionally
* represented by a "combination" WERD_RES, which is appended to the list
* immediately before the first of its parts.
* If merge_similar_words is true, the decision to join the next word is
* computed here from box sizes and gaps and written to the next word's
* W_FUZZY_NON flag; otherwise the existing W_FUZZY_NON flag of the next word
* is read and obeyed.
*************************************************************************/
ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {
WERD_IT word_it(the_row->word_list());
WERD_RES_IT word_res_it(&word_res_list);
WERD_RES *combo = nullptr; // current combination of fuzzies
WERD *copy_word;
char_count = 0;
rej_count = 0;
whole_word_rej_count = 0;
row = the_row;
// True when the word handled by the NEXT loop iteration must be merged into
// the current combination.
bool add_next_word = false;
// Bounding box of the words accumulated in the current combination.
TBOX union_box;
// Reference line size for the ratio tests below.
float line_height =
the_row->x_height() + the_row->ascenders() - the_row->descenders();
for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
auto *word_res = new WERD_RES(word_it.data());
word_res->x_height = the_row->x_height();
if (add_next_word) {
ASSERT_HOST(combo != nullptr);
// We are adding this word to the combination.
word_res->part_of_combo = true;
combo->copy_on(word_res);
} else if (merge_similar_words) {
// This word starts afresh: it may begin a combination only if it is not
// a repeated-char word and is not too tall for the line.
union_box = word_res->word->bounding_box();
add_next_word = !word_res->word->flag(W_REP_CHAR) &&
union_box.height() <= line_height * kMaxWordSizeRatio;
word_res->odd_size = !add_next_word;
}
// NOTE(review): data_relative(1) on the last word presumably wraps round to
// the first word of the list, so that word's W_FUZZY_NON flag is the one
// read/written on the final iteration - confirm against the list iterator.
WERD *next_word = word_it.data_relative(1);
if (merge_similar_words) {
if (add_next_word && !next_word->flag(W_REP_CHAR)) {
// Next word will be added on if all of the following are true:
// Not a rep char.
// Box height small enough.
// Union box height small enough.
// Horizontal gap small enough.
TBOX next_box = next_word->bounding_box();
int prev_right = union_box.right();
union_box += next_box;
if (next_box.height() > line_height * kMaxWordSizeRatio ||
union_box.height() > line_height * kMaxLineSizeRatio ||
next_box.left() > prev_right + line_height * kMaxWordGapRatio) {
add_next_word = false;
}
}
// Record the decision on the next word itself.
next_word->set_flag(W_FUZZY_NON, add_next_word);
} else {
// Not merging here: follow whatever the flag on the next word says.
add_next_word = next_word->flag(W_FUZZY_NON);
}
if (add_next_word) {
if (combo == nullptr) {
// First word of a new combination: the combination owns its own copy of
// the word and goes into the list ahead of its parts.
copy_word = new WERD;
*copy_word = *(word_it.data()); // deep copy
combo = new WERD_RES(copy_word);
combo->x_height = the_row->x_height();
combo->combination = true;
word_res_it.add_to_end(combo);
}
word_res->part_of_combo = true;
} else {
// The combination (if any) ends with this word.
combo = nullptr;
}
word_res_it.add_to_end(word_res);
}
}
// Copy assignment.
// Discards the current contents of *this (via Clear()) and replaces them
// with a copy of source:
//  - word is deep-copied when source is a combination, otherwise the WERD
//    pointer is shared with source;
//  - bln_boxes, chopped_word, rebuild_word, box_word, raw_choice, ep_choice,
//    blamer_bundle and every entry of best_choices are deep-copied when
//    present in source;
//  - best_choice is pointed at the first entry of the copied best_choices
//    list, or nullptr if that list is empty;
//  - the ratings matrix and the seam_array are NOT copied.
// NOTE(review): the pointer members without an else-branch below rely on
// Clear() leaving them null - confirm against the definition of Clear().
// Assigning an object to itself is a no-op.
WERD_RES &WERD_RES::operator=(const WERD_RES &source) {
  // Self-assignment guard: Clear() below destroys the data owned by *this,
  // which for `x = x` is exactly the data we would then try to copy from.
  if (this == &source) {
    return *this;
  }
  this->ELIST<WERD_RES>::LINK::operator=(source);
  Clear();
  if (source.combination) {
    word = new WERD;
    *word = *(source.word); // deep copy
  } else {
    word = source.word; // pt to same word
  }
  if (source.bln_boxes != nullptr) {
    bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
  }
  if (source.chopped_word != nullptr) {
    chopped_word = new TWERD(*source.chopped_word);
  }
  if (source.rebuild_word != nullptr) {
    rebuild_word = new TWERD(*source.rebuild_word);
  }
  // TODO(rays) Do we ever need to copy the seam_array?
  blob_row = source.blob_row;
  denorm = source.denorm;
  if (source.box_word != nullptr) {
    box_word = new tesseract::BoxWord(*source.box_word);
  }
  best_state = source.best_state;
  correct_text = source.correct_text;
  blob_widths = source.blob_widths;
  blob_gaps = source.blob_gaps;
  // None of the uses of operator= require the ratings matrix to be copied,
  // so don't as it would be really slow.
  // Copy the cooked choices, preserving their order.
  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&source.best_choices));
  WERD_CHOICE_IT wc_dest_it(&best_choices);
  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
    const WERD_CHOICE *choice = wc_it.data();
    wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice));
  }
  // best_choice must refer to our own copy, never to source's list.
  if (!wc_dest_it.empty()) {
    wc_dest_it.move_to_first();
    best_choice = wc_dest_it.data();
  } else {
    best_choice = nullptr;
  }
  if (source.raw_choice != nullptr) {
    raw_choice = new WERD_CHOICE(*source.raw_choice);
  } else {
    raw_choice = nullptr;
  }
  if (source.ep_choice != nullptr) {
    ep_choice = new WERD_CHOICE(*source.ep_choice);
  } else {
    ep_choice = nullptr;
  }
  reject_map = source.reject_map;
  combination = source.combination;
  part_of_combo = source.part_of_combo;
  CopySimpleFields(source);
  if (source.blamer_bundle != nullptr) {
    blamer_bundle = new BlamerBundle(*(source.blamer_bundle));
  }
  return *this;
}
// Copies the plain-value fields of source into *this. Only direct member
// assignments are performed: nothing is allocated or freed here, and the
// pointer-valued members that are copied (fontinfo, fontinfo2, uch_set,
// tesseract) end up referring to the same objects as in source.
void WERD_RES::CopySimpleFields(const WERD_RES &source) {
  // Recognition status flags.
  done = source.done;
  tess_failed = source.tess_failed;
  tess_accepted = source.tess_accepted;
  tess_would_adapt = source.tess_would_adapt;
  unlv_crunch_mode = source.unlv_crunch_mode;
  reject_spaces = source.reject_spaces;

  // Font information.
  fontinfo = source.fontinfo;
  fontinfo_id_count = source.fontinfo_id_count;
  fontinfo2 = source.fontinfo2;
  fontinfo_id2_count = source.fontinfo_id2_count;

  // Size and position estimates.
  small_caps = source.small_caps;
  odd_size = source.odd_size;
  x_height = source.x_height;
  caps_height = source.caps_height;
  baseline_shift = source.baseline_shift;
  guessed_x_ht = source.guessed_x_ht;
  guessed_caps_ht = source.guessed_caps_ht;

  // Shared context.
  uch_set = source.uch_set;
  tesseract = source.tesseract;
}
// Prepares a blank (default constructed) WERD_RES for another recognition
// attempt on the word held by source, which has already been recognized.
// Shares source's WERD pointer, copies the simple fields, and, if source has
// a blamer bundle, creates a new bundle carrying a copy of its truth.
// Call one of the SetupFor*Recognition functions afterwards to finish the
// setup.
void WERD_RES::InitForRetryRecognition(const WERD_RES &source) {
  word = source.word;
  CopySimpleFields(source);
  if (source.blamer_bundle == nullptr) {
    return;
  }
  blamer_bundle = new BlamerBundle();
  blamer_bundle->CopyTruth(*source.blamer_bundle);
}
// Sets up the members used in recognition: bln_boxes, chopped_word,
// seam_array, denorm. Returns false if
// the word is empty and sets up fake results. If use_body_size is
// true and row->body_size is set, then body_size will be used for
// blob normalization instead of xheight + ascrise. This flag is for
// those languages that are using CJK pitch model and thus it has to
// be true if and only if tesseract->textord_use_cjk_fp_model is
// true.
// If allow_detailed_fx is true, the feature extractor will receive fine
// precision outline information, allowing smoother features and better
// features on low resolution images.
// The norm_mode_hint sets the default mode for normalization in absence
// of any of the above flags.
// norm_box is used to override the word bounding box to determine the
// normalization scale and offset.
// Returns false if the word is empty and sets up fake results.
bool WERD_RES::SetupForRecognition(const UNICHARSET &unicharset_in,
tesseract::Tesseract *tess, Image pix,
int norm_mode, const TBOX *norm_box,
bool numeric_mode, bool use_body_size,
bool allow_detailed_fx, ROW *row,
const BLOCK *block) {
auto norm_mode_hint = static_cast<tesseract::OcrEngineMode>(norm_mode);
tesseract = tess;
POLY_BLOCK *pb = block != nullptr ? block->pdblk.poly_block() : nullptr;
if ((norm_mode_hint != tesseract::OEM_LSTM_ONLY &&
word->cblob_list()->empty()) ||
(pb != nullptr && !pb->IsText())) {
// Empty words occur when all the blobs have been moved to the rej_blobs
// list, which seems to occur frequently in junk.
SetupFake(unicharset_in);
word->set_flag(W_REP_CHAR, false);
return false;
}
ClearResults();
SetupWordScript(unicharset_in);
chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word);
float word_xheight =
use_body_size && row != nullptr && row->body_size() > 0.0f
? row->body_size()
: x_height;
chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE),
word_xheight, baseline_shift, numeric_mode,
norm_mode_hint, norm_box, &denorm);
blob_row = row;
SetupBasicsFromChoppedWord(unicharset_in);
SetupBlamerBundle();
int num_blobs = chopped_word->NumBlobs();
ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks);
tess_failed = false;
return true;
}
// Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
// accumulators from a made chopped word. We presume the fields are already
// empty.
// chopped_word must already be set: it is the input to every step below.
// unicharset_in is not used by this function.
void WERD_RES::SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in) {
// Boxes of the normalized blobs.
bln_boxes = tesseract::BoxWord::CopyFromNormalized(chopped_word);
// Initial seam list for the chopped word.
start_seam_list(chopped_word, &seam_array);
// Recompute blob_widths / blob_gaps from the chopped word.
SetupBlobWidthsAndGaps();
ClearWordChoices();
}
// Sets up the members used in recognition for an empty recognition result:
// bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
void WERD_RES::SetupFake(const UNICHARSET &unicharset_in) {
ClearResults();
SetupWordScript(unicharset_in);
chopped_word = new TWERD;
rebuild_word = new TWERD;
bln_boxes = new tesseract::BoxWord;
box_word = new tesseract::BoxWord;
int blob_count = word->cblob_list()->length();
if (blob_count > 0) {
auto **fake_choices = new BLOB_CHOICE *[blob_count];
// For non-text blocks, just pass any blobs through to the box_word
// and call the word failed with a fake classification.
C_BLOB_IT b_it(word->cblob_list());
int blob_id = 0;
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
TBOX box = b_it.data()->bounding_box();
box_word->InsertBox(box_word->length(), box);
fake_choices[blob_id++] = new BLOB_CHOICE;
}
FakeClassifyWord(blob_count, fake_choices);
delete[] fake_choices;
} else {
auto *word = new WERD_CHOICE(&unicharset_in);
word->make_bad();
LogNewRawChoice(word);
// Ownership of word is taken by *this WERD_RES in LogNewCookedChoice.
LogNewCookedChoice(1, false, word);
}
tess_failed = true;
done = true;
}
// Remembers uch as the unicharset of this word and stamps the word with the
// unicharset's default script: its script id, whether that script has an
// xheight, and whether it is the Latin script.
void WERD_RES::SetupWordScript(const UNICHARSET &uch) {
  uch_set = &uch;
  const int default_script = uch.default_sid();
  const bool is_latin = default_script == uch.latin_sid();
  word->set_script_id(default_script);
  word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight());
  word->set_flag(W_SCRIPT_IS_LATIN, is_latin);
}
// If a blamer_bundle is attached, has it set up its normalized truth word
// from the current denorm. Does nothing when there is no bundle.
void WERD_RES::SetupBlamerBundle() {
  if (blamer_bundle == nullptr) {
    return;
  }
  blamer_bundle->SetupNormTruthWord(denorm);
}
// Computes the blob_widths and blob_gaps from the chopped_word.
void WERD_RES::SetupBlobWidthsAndGaps() {
blob_widths.clear();
blob_gaps.clear();
int num_blobs = chopped_word->NumBlobs();
for (int b = 0; b < num_blobs; ++b) {
TBLOB *blob = chopped_word->blobs[b];
TBOX box = blob->bounding_box();
blob_widths.push_back(box.width());
if (b + 1 < num_blobs) {
blob_gaps.push_back(chopped_word->blobs[b + 1]->bounding_box().left() -
box.right());
}
}
}
// Updates internal data to account for a new SEAM (chop) at the given
// blob_number. Fixes the ratings matrix and states in the choices, as well
// as the blob widths and gaps.
void WERD_RES::InsertSeam(int blob_number, SEAM *seam) {
// Insert the seam into the SEAMS array.
seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);
seam_array.insert(seam_array.begin() + blob_number, seam);
if (ratings != nullptr) {
// Expand the ratings matrix.
ratings = ratings->ConsumeAndMakeBigger(blob_number);
// Fix all the segmentation states.
if (raw_choice != nullptr) {
raw_choice->UpdateStateForSplit(blob_number);
}
WERD_CHOICE_IT wc_it(&best_choices);
for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
WERD_CHOICE *choice = wc_it.data();
choice->UpdateStateForSplit(blob_number);
}
SetupBlobWidthsAndGaps();
}
}
// Returns true if all the word choices except the first have adjust_factors
// worse than the given threshold.
bool WERD_RES::AlternativeChoiceAdjustmentsWorseThan(float threshold) const {
// The choices are not changed by this iteration.
WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&best_choices));
for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {
WERD_CHOICE *choice = wc_it.data();
if (choice->adjust_factor() <= threshold) {
return false;
}
}
return true;
}
// Returns true if the current word is ambiguous: either best_choices does
// not hold exactly one entry, or the best choice reports that a dangerous
// ambiguity was found.
bool WERD_RES::IsAmbiguous() {
  if (!best_choices.singleton()) {
    return true;
  }
  return best_choice->dangerous_ambig_found();
}
// Returns true if the ratings matrix size matches the sum of each of the
// segmentation states.
bool WERD_RES::StatesAllValid() {
unsigned ratings_dim = ratings->dimension();
if (raw_choice->TotalOfStates() != ratings_dim) {
tprintf("raw_choice has total of states = %u vs ratings dim of %u\n",
raw_choice->TotalOfStates(), ratings_dim);
return false;
}
WERD_CHOICE_IT it(&best_choices);
unsigned index = 0;
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
WERD_CHOICE *choice = it.data();
if (choice->TotalOfStates() != ratings_dim) {
tprintf("Cooked #%u has total of states = %u vs ratings dim of %u\n",
index, choice->TotalOfStates(), ratings_dim);
return false;
}
}
return true;
}
// Prints a list of words found if debug is true or the word result matches
// the word_to_debug.
void WERD_RES::DebugWordChoices(bool debug, const char *word_to_debug) {
if (debug || (word_to_debug != nullptr && *word_to_debug != '\0' &&
best_choice != nullptr &&
best_choice->unichar_string() == std::string(word_to_debug))) {
if (raw_choice != nullptr) {
raw_choice->print("\nBest Raw Choice");
}
WERD_CHOICE_IT it(&best_choices);
int index = 0;
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
WERD_CHOICE *choice = it.data();
std::string label;
label += "\nCooked Choice #" + std::to_string(index);
choice->print(label.c_str());
}
}
}
// Prints the tess_accepted, tess_would_adapt and done flags, followed by the
// best choice labelled with msg, or by "<Null choice>" when there is no best
// choice.
void WERD_RES::DebugTopChoice(const char *msg) const {
  tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ", tess_accepted,
          tess_would_adapt, done);
  if (best_choice != nullptr) {
    best_choice->print(msg);
    return;
  }
  tprintf("<Null choice>\n");
}
// Removes from best_choices all choices which are not within a reasonable
// range of the best choice.
// A choice is removed (and deleted) as soon as one of its blobs disagrees
// with the overlapping blob of best_choice and its certainty is lower than
// the best choice's by more than the threshold allows.
// Does nothing if there is no best_choice or only a single choice.
// With debug_level >= 2 the best choice and every discarded choice are
// printed.
// TODO(rays) incorporate the information used here into the params training
// re-ranker, in place of this heuristic that is based on the previous
// adjustment factor.
void WERD_RES::FilterWordChoices(int debug_level) {
if (best_choice == nullptr || best_choices.singleton()) {
return;
}
if (debug_level >= 2) {
best_choice->print("\nFiltering against best choice");
}
WERD_CHOICE_IT it(&best_choices);
// index is only used in the debug message. It starts at 0 for the first
// alternative (the iteration begins after the initial forward()), so it is
// relative to the alternatives rather than to the whole list.
int index = 0;
for (it.forward(); !it.at_first(); it.forward(), ++index) {
WERD_CHOICE *choice = it.data();
// Per-choice threshold derived from the two adjustment factors.
float threshold = StopperAmbigThreshold(best_choice->adjust_factor(),
choice->adjust_factor());
// i, j index the blob choice in choice, best_choice.
// chunk is an index into the chopped_word blobs (AKA chunks).
// Since the two words may use different segmentations of the chunks, we
// iterate over the chunks to find out whether a comparable blob
// classification is much worse than the best result.
unsigned i = 0, j = 0, chunk = 0;
// Each iteration of the while deals with 1 chunk. On entry choice_chunk
// and best_chunk are the indices of the first chunk in the NEXT blob,
// i.e. we don't have to increment i, j while chunk < choice_chunk and
// best_chunk respectively.
auto choice_chunk = choice->state(0), best_chunk = best_choice->state(0);
while (i < choice->length() && j < best_choice->length()) {
// Different character AND certainty gap below the threshold: discard.
if (choice->unichar_id(i) != best_choice->unichar_id(j) &&
choice->certainty(i) - best_choice->certainty(j) < threshold) {
if (debug_level >= 2) {
choice->print("WorstCertaintyDiffWorseThan");
tprintf(
"i %u j %u Choice->Blob[i].Certainty %.4g"
" WorstOtherChoiceCertainty %g Threshold %g\n",
i, j, choice->certainty(i), best_choice->certainty(j), threshold);
tprintf("Discarding bad choice #%d\n", index);
}
// Unlink the choice from the list and free it; the outer loop's
// forward() then moves on to the next remaining choice.
delete it.extract();
break;
}
++chunk;
// If needed, advance choice_chunk to keep up with chunk.
while (choice_chunk < chunk && ++i < choice->length()) {
choice_chunk += choice->state(i);
}
// If needed, advance best_chunk to keep up with chunk.
while (best_chunk < chunk && ++j < best_choice->length()) {
best_chunk += best_choice->state(j);
}
}
}
}
// Computes one adaption threshold per blob of best_choice and writes them to
// thresholds[0 .. best_choice->length() - 1] (the caller must provide at
// least that many floats).
// For blob i of best_choice, every chunk the blob covers is compared with
// the raw_choice blob covering the same chunk. The certainties of the
// raw blobs whose unichar differs are averaged, and the threshold is
//   (average / -certainty_scale) * (1 - rating_margin).
// If no chunk disagrees the threshold is max_rating. The result is always
// clamped to [min_rating, max_rating].
// best_choice and raw_choice must be non-null, and raw_choice must have at
// least one blob.
// NOTE(review): assumes the states of best_choice and raw_choice sum to the
// same number of chunks (what StatesAllValid checks) - confirm at call sites.
void WERD_RES::ComputeAdaptionThresholds(float certainty_scale,
                                         float min_rating, float max_rating,
                                         float rating_margin,
                                         float *thresholds) {
  int chunk = 0;
  // One past the last chunk of the current best_choice blob. It has to be
  // advanced for every blob; previously it was fixed at state(0), so the
  // chunk loop below only ever ran for blob 0 and every later blob received
  // max_rating regardless of the raw choice.
  int end_chunk = 0;
  int end_raw_chunk = raw_choice->state(0);
  int raw_blob = 0;
  for (unsigned i = 0; i < best_choice->length(); i++, thresholds++) {
    end_chunk += best_choice->state(i);
    float avg_rating = 0.0f;
    int num_error_chunks = 0;
    // For each chunk in best choice blob i, count non-matching raw results.
    while (chunk < end_chunk) {
      // Move on to the raw blob that contains this chunk.
      if (chunk >= end_raw_chunk) {
        ++raw_blob;
        end_raw_chunk += raw_choice->state(raw_blob);
      }
      if (best_choice->unichar_id(i) != raw_choice->unichar_id(raw_blob)) {
        avg_rating += raw_choice->certainty(raw_blob);
        ++num_error_chunks;
      }
      ++chunk;
    }
    if (num_error_chunks > 0) {
      avg_rating /= num_error_chunks;
      *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
    } else {
      *thresholds = max_rating;
    }
    // Clamp to the permitted range.
    if (*thresholds > max_rating) {
      *thresholds = max_rating;
    }
    if (*thresholds < min_rating) {
      *thresholds = min_rating;
    }
  }
}
// Keeps a copy of word_choice as raw_choice if there is no raw_choice yet or
// if word_choice has a strictly lower rating than the current one. The
// stored copy gets the TOP_CHOICE_PERM permuter; word_choice itself is not
// modified and is not taken over.
// Returns true if the word_choice was the new best.
bool WERD_RES::LogNewRawChoice(WERD_CHOICE *word_choice) {
  const bool is_new_best =
      raw_choice == nullptr || word_choice->rating() < raw_choice->rating();
  if (!is_new_best) {
    return false;
  }
  delete raw_choice;
  raw_choice = new WERD_CHOICE(*word_choice);
  raw_choice->set_permuter(TOP_CHOICE_PERM);
  return true;
}
// Consumes word_choice by adding it to best_choices, (taking ownership) if
// the certainty for word_choice is some distance of the best choice in
// best_choices, or by deleting the word_choice and returning false.
// The best_choices list is kept in sorted order by rating. Duplicates are
// removed, and the list is kept no longer than max_num_choices in length.
// best_choice is updated whenever word_choice becomes the first list entry.
// If debug is true, the outcome (discarded, duplicate, new best, secondary
// or poor) is printed.
// Returns true if the word_choice is still a valid pointer, i.e. it was
// inserted; on a false return it has already been deleted and must not be
// used by the caller.
bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug,
WERD_CHOICE *word_choice) {
if (best_choice != nullptr) {
// Throw out obviously bad choices to save some work.
// TODO(rays) Get rid of this! This piece of code produces different
// results according to the order in which words are found, which is an
// undesirable behavior. It would be better to keep all the choices and
// prune them later when more information is available.
float max_certainty_delta = StopperAmbigThreshold(
best_choice->adjust_factor(), word_choice->adjust_factor());
// Never allow a tolerance looser than -kStopperAmbiguityThresholdOffset.
if (max_certainty_delta > -kStopperAmbiguityThresholdOffset) {
max_certainty_delta = -kStopperAmbiguityThresholdOffset;
}
if (word_choice->certainty() - best_choice->certainty() <
max_certainty_delta) {
if (debug) {
std::string bad_string;
word_choice->string_and_lengths(&bad_string, nullptr);
tprintf(
"Discarding choice \"%s\" with an overly low certainty"
" %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
bad_string.c_str(), word_choice->certainty(),
best_choice->certainty(),
max_certainty_delta + best_choice->certainty());
}
delete word_choice;
return false;
}
}
// Insert in the list in order of increasing rating, but knock out worse
// string duplicates.
WERD_CHOICE_IT it(&best_choices);
const std::string &new_str = word_choice->unichar_string();
bool inserted = false;
// Number of choices retained so far, including word_choice once inserted.
int num_choices = 0;
if (!it.empty()) {
do {
WERD_CHOICE *choice = it.data();
if (choice->rating() > word_choice->rating() && !inserted) {
// Time to insert.
it.add_before_stay_put(word_choice);
inserted = true;
if (num_choices == 0) {
best_choice = word_choice; // This is the new best.
}
++num_choices;
}
if (choice->unichar_string() == new_str) {
if (inserted) {
// New is better.
delete it.extract();
} else {
// Old is better.
if (debug) {
tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n",
new_str.c_str(), word_choice->rating(), choice->rating());
}
delete word_choice;
return false;
}
} else {
// A distinct string: keep it unless the list is already full.
++num_choices;
if (num_choices > max_num_choices) {
delete it.extract();
}
}
it.forward();
} while (!it.at_first());
}
// Not better than any existing entry: append if there is still room.
if (!inserted && num_choices < max_num_choices) {
it.add_to_end(word_choice);
inserted = true;
if (num_choices == 0) {
best_choice = word_choice; // This is the new best.
}
}
if (debug) {
if (inserted) {
tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary");
} else {
tprintf("Poor");
}
word_choice->print(" Word Choice");
}
if (!inserted) {
delete word_choice;
return false;
}
return true;
}
// Transfers ownership of the object pointed to by *src to *dest.
// Whatever *dest pointed to before is deleted first, and *src is set to
// nullptr afterwards so that only *dest refers to the moved object.
template <typename T>
static void MovePointerData(T **dest, T **src) {
  T *&target = *dest;
  T *&origin = *src;
  delete target;
  target = origin;
  origin = nullptr;
}
// Prints a brief list of all the best choices.
void WERD_RES::PrintBestChoices() const {
std::string alternates_str;
WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST *>(&best_choices));
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
if (!it.at_first()) {
alternates_str += "\", \"";
}
alternates_str += it.data()->unichar_string();
}
tprintf("Alternates for \"%s\": {\"%s\"}\n",
best_choice->unichar_string().c_str(), alternates_str.c_str());
}
// Returns the total horizontal extent of the blobs start_blob..last_blob
// (both inclusive): the sum of their widths plus the gaps between adjacent
// blobs inside the range. Returns 0 when start_blob > last_blob.
// No bounds checking is done on the indices.
int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) const {
  int total = 0;
  // Widths of every blob in the range.
  for (int blob = start_blob; blob <= last_blob; ++blob) {
    total += blob_widths[blob];
  }
  // Gaps that follow every blob except the last one of the range.
  for (int blob = start_blob; blob < last_blob; ++blob) {
    total += blob_gaps[blob];
  }
  return total;
}
// Returns the gap between blob blob_index and the blob after it, or 0 when
// blob_index has no entry in blob_gaps.
int WERD_RES::GetBlobsGap(unsigned blob_index) const {
  return blob_index < blob_gaps.size() ? blob_gaps[blob_index] : 0;
}
// Looks up, in the ratings MATRIX cell that belongs to position index of the
// best choice, the BLOB_CHOICE whose unichar_id equals the best choice's
// unichar_id at that position.
// Returns nullptr when index is beyond the end of the best choice, and may
// return nullptr if no BLOB_CHOICE matches.
// The returned pointer is borrowed, so do not delete it.
BLOB_CHOICE *WERD_RES::GetBlobChoice(unsigned index) const {
  if (index < best_choice->length()) {
    BLOB_CHOICE_LIST *cell_choices = GetBlobChoices(index);
    return FindMatchingChoice(best_choice->unichar_id(index), cell_choices);
  }
  return nullptr;
}
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the
// best choice word taken from the appropriate cell in the ratings MATRIX.
// Borrowed pointer, so do not delete.
// The lookup is delegated to best_choice, which must not be null; index is
// not range-checked here.
BLOB_CHOICE_LIST *WERD_RES::GetBlobChoices(int index) const {
return best_choice->blob_choices(index, ratings);
}
// Moves the results fields from word to this. This takes ownership of all
// the data, so src can be destructed.
// Note that the parameter is named word and therefore hides the member of
// the same name: every "word->" below refers to the source WERD_RES, and the
// WERD pointer member of *this is left untouched.
// If word has a blamer_bundle, *this must already have one as well.
void WERD_RES::ConsumeWordResults(WERD_RES *word) {
denorm = word->denorm;
blob_row = word->blob_row;
// Owned pointers: delete ours, take the source's, null the source's.
MovePointerData(&chopped_word, &word->chopped_word);
MovePointerData(&rebuild_word, &word->rebuild_word);
MovePointerData(&box_word, &word->box_word);
// Free our own seams before taking over the source's seam pointers.
for (auto data : seam_array) {
delete data;
}
seam_array = word->seam_array;
// Clearing the source's vector leaves *this as the only holder of the seams.
word->seam_array.clear();
// TODO: optimize moves.
best_state = word->best_state;
word->best_state.clear();
correct_text = word->correct_text;
word->correct_text.clear();
blob_widths = word->blob_widths;
word->blob_widths.clear();
blob_gaps = word->blob_gaps;
word->blob_gaps.clear();
// Release the contents of our old ratings matrix before it is replaced.
if (ratings != nullptr) {
ratings->delete_matrix_pointers();
}
MovePointerData(&ratings, &word->ratings);
// best_choice is a plain pointer copy; the choice objects themselves arrive
// with the best_choices list below. word->best_choice is not reset here.
best_choice = word->best_choice;
MovePointerData(&raw_choice, &word->raw_choice);
// Drop our old choices, then take the source's whole list.
best_choices.clear();
WERD_CHOICE_IT wc_it(&best_choices);
wc_it.add_list_after(&word->best_choices);
reject_map = word->reject_map;
if (word->blamer_bundle != nullptr) {
assert(blamer_bundle != nullptr);
blamer_bundle->CopyResults(*(word->blamer_bundle));
}
CopySimpleFields(*word);
}
// Makes choice the best choice and brings the dependent data in line with
// it: the best state and rebuild word, the box word, the reject map and the
// script positions. The word is then marked as done, accepted and adaptable.
// choice must be from the current best_choices list.
void WERD_RES::ReplaceBestChoice(WERD_CHOICE *choice) {
  best_choice = choice;
  RebuildBestState();
  SetupBoxWord();
  // Make up a fake reject map of the right length to keep the
  // rejection pass happy.
  reject_map.initialise(best_state.size());
  tess_would_adapt = true;
  tess_accepted = true;
  done = true;
  SetScriptPositions();
}
// Builds the rebuild_word and sets the best_state from the chopped_word and
// the best_choice->state.
void WERD_RES::RebuildBestState() {
ASSERT_HOST(best_choice != nullptr);
delete rebuild_word;
rebuild_word = new TWERD;
if (seam_array.empty()) {
start_seam_list(chopped_word, &seam_array);
}
best_state.clear();
int start = 0;
for (unsigned i = 0; i < best_choice->length(); ++i) {
int length = best_choice->state(i);
best_state.push_back(length);
if (length > 1) {
SEAM::JoinPieces(seam_array, chopped_word->blobs, start,
start + length - 1);
}
TBLOB *blob = chopped_word->blobs[start];
rebuild_word->blobs.push_back(new TBLOB(*blob));
if (length > 1) {
SEAM::BreakPieces(seam_array, chopped_word->blobs, start,
start + length - 1);
}
start += length;
}
}
// Copies the chopped_word to the rebuild_word, faking a best_state as well.
// Also sets up the output box_word.
// On return best_state holds exactly one entry of 1 per box in box_word and
// correct_text holds one empty string per box. Any previous contents of the
// two containers are discarded, so repeated calls give the same result.
void WERD_RES::CloneChoppedToRebuild() {
  delete rebuild_word;
  rebuild_word = new TWERD(*chopped_word);
  SetupBoxWord();
  auto word_len = box_word->length();
  // Start from empty containers: without this, entries left over from an
  // earlier result would make best_state/correct_text longer than the word.
  best_state.clear();
  correct_text.clear();
  best_state.reserve(word_len);
  correct_text.reserve(word_len);
  for (unsigned i = 0; i < word_len; ++i) {
    best_state.push_back(1);
    correct_text.emplace_back("");
  }
}
// Discards any existing box_word and builds a new one from rebuild_word,
// clipped using denorm.block() and word.
void WERD_RES::SetupBoxWord() {
  // Remove the old box_word first; it is replaced unconditionally below.
  delete box_word;

  // Compute the bounding boxes of rebuild_word before copying from it.
  rebuild_word->ComputeBoundingBoxes();

  box_word = tesseract::BoxWord::CopyFromNormalized(rebuild_word);
  box_word->ClipToOriginalWord(denorm.block(), word);
}
// Sets up the script positions in the output best_choice using the best_choice
// to get the unichars, and the unicharset to get the target positions.
void WERD_RES::SetScriptPositions() {
best_choice->SetScriptPositions(small_caps, chopped_word);
}
// Forces every blob of every word choice (the raw choice and all of the best
// choices) to the given script position. (When a sub/superscript is recognized
// as a separate word, it falls victim to the rule that a whole word cannot be
// sub or superscript, so this function overrides that problem.)
void WERD_RES::SetAllScriptPositions(tesseract::ScriptPos position) {
  raw_choice->SetAllScriptPositions(position);

  // Walk once around the best_choices list.
  WERD_CHOICE_IT choice_it(&best_choices);
  choice_it.mark_cycle_pt();
  while (!choice_it.cycled_list()) {
    choice_it.data()->SetAllScriptPositions(position);
    choice_it.forward();
  }
}
// Classifies the word with some already-calculated BLOB_CHOICEs.
// The choices are an array of blob_count pointers to BLOB_CHOICE,
// providing a single classifier result for each blob.
// The BLOB_CHOICEs are consumed and the word takes ownership.
// The number of blobs in the box_word must match blob_count.
void WERD_RES::FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices) {
// Setup the WERD_RES.
ASSERT_HOST(box_word != nullptr);
ASSERT_HOST(blob_count == box_word->length());
ClearWordChoices();
ClearRatings();
ratings = new MATRIX(blob_count, 1);
for (unsigned c = 0; c < blob_count; ++c) {
auto *choice_list = new BLOB_CHOICE_LIST;
BLOB_CHOICE_IT choice_it(choice_list);
choice_it.add_after_then_move(choices[c]);
ratings->put(c, c, choice_list);
}
FakeWordFromRatings(TOP_CHOICE_PERM);
reject_map.initialise(blob_count);
best_state.clear();
best_state.resize(blob_count, 1);
done = true;
}
// Builds a WERD_CHOICE from the first choice in each cell on the leading
// diagonal of the ratings matrix, tags it with the given permuter, and logs
// it as both the raw choice and a cooked choice.
void WERD_RES::FakeWordFromRatings(PermuterType permuter) {
  const int diagonal_len = ratings->dimension();
  auto *fake_choice = new WERD_CHOICE(uch_set, diagonal_len);
  fake_choice->set_permuter(permuter);

  for (int cell = 0; cell < diagonal_len; ++cell) {
    // Fallback values for a missing or empty cell: a space, with rating and
    // certainty initialized like in WERD_CHOICE::make_bad().
    UNICHAR_ID unichar_id = UNICHAR_SPACE;
    float rating = WERD_CHOICE::kBadRating;
    float certainty = -FLT_MAX;

    BLOB_CHOICE_LIST *cell_choices = ratings->get(cell, cell);
    const bool has_choice = cell_choices != nullptr && !cell_choices->empty();
    if (has_choice) {
      // Use the entry the iterator starts on.
      BLOB_CHOICE_IT cell_it(cell_choices);
      BLOB_CHOICE *first_choice = cell_it.data();
      unichar_id = first_choice->unichar_id();
      rating = first_choice->rating();
      certainty = first_choice->certainty();
    }
    fake_choice->append_unichar_id_space_allocated(unichar_id, 1, rating,
                                                   certainty);
  }

  LogNewRawChoice(fake_choice);
  // Ownership of fake_choice taken by word here.
  LogNewCookedChoice(1, false, fake_choice);
}
// Copies the best_choice strings to the correct_text for adaption/training.
void WERD_RES::BestChoiceToCorrectText() {
correct_text.clear();
ASSERT_HOST(best_choice != nullptr);
for (unsigned i = 0; i < best_choice->length(); ++i) {
UNICHAR_ID choice_id = best_choice->unichar_id(i);
const char *blob_choice = uch_set->id_to_unichar(choice_id);
correct_text.emplace_back(blob_choice);
}
}
// Merges 2 adjacent blobs in the result if the permanent callback
// class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
// callback box_cb is nullptr or returns true, setting the merged blob
// result to the class returned from class_cb.
// Returns true if anything was merged.
bool WERD_RES::ConditionalBlobMerge(
const std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb,
const std::function<bool(const TBOX &, const TBOX &)> &box_cb) {
ASSERT_HOST(best_choice->empty() || ratings != nullptr);
bool modified = false;
for (unsigned i = 0; i + 1 < best_choice->length(); ++i) {
UNICHAR_ID new_id =
class_cb(best_choice->unichar_id(i), best_choice->unichar_id(i + 1));
if (new_id != INVALID_UNICHAR_ID &&
(box_cb == nullptr ||
box_cb(box_word->BlobBox(i), box_word->BlobBox(i + 1)))) {
// Raw choice should not be fixed.
best_choice->set_unichar_id(new_id, i);
modified = true;
MergeAdjacentBlobs(i);
const MATRIX_COORD &coord = best_choice->MatrixCoord(i);
if (!coord.Valid(*ratings)) {
ratings->IncreaseBandSize(coord.row + 1 - coord.col);
}
BLOB_CHOICE_LIST *blob_choices = GetBlobChoices(i);
if (FindMatchingChoice(new_id, blob_choices) == nullptr) {
// Insert a fake result.
auto *blob_choice = new BLOB_CHOICE;
blob_choice->set_unichar_id(new_id);
BLOB_CHOICE_IT bc_it(blob_choices);
bc_it.add_before_then_move(blob_choice);
}
}
}
return modified;