diff --git a/ccmain/Makefile.am b/ccmain/Makefile.am index 9d7326cc89..8fdebedbed 100644 --- a/ccmain/Makefile.am +++ b/ccmain/Makefile.am @@ -9,10 +9,11 @@ AM_CPPFLAGS = \ -I$(top_srcdir)/textord include_HEADERS = \ - control.h cube_reco_context.h \ - docqual.h fixspace.h \ - imgscale.h osdetect.h output.h \ - paramsd.h pgedit.h reject.h scaleimg.h \ + control.h cube_reco_context.h cubeclassifier.h \ + docqual.h equationdetect.h fixspace.h \ + imgscale.h ltrresultiterator.h mutableiterator.h osdetect.h output.h \ + pageiterator.h paragraphs.h paragraphs_internal.h paramsd.h pgedit.h \ + reject.h resultiterator.h scaleimg.h \ tessbox.h tessedit.h tesseractclass.h \ tesseract_cube_combiner.h \ tessvars.h tfacep.h tfacepp.h thresholder.h \ @@ -38,11 +39,13 @@ endif libtesseract_main_la_SOURCES = \ adaptions.cpp applybox.cpp \ - control.cpp cube_control.cpp cube_reco_context.cpp \ - docqual.cpp fixspace.cpp fixxht.cpp \ - imgscale.cpp osdetect.cpp output.cpp pagesegmain.cpp \ - pagewalk.cpp paramsd.cpp pgedit.cpp reject.cpp scaleimg.cpp \ - recogtraining.cpp tesseract_cube_combiner.cpp \ + control.cpp cube_control.cpp cube_reco_context.cpp cubeclassifier.cpp \ + docqual.cpp equationdetect.cpp fixspace.cpp fixxht.cpp \ + imgscale.cpp ltrresultiterator.cpp \ + osdetect.cpp output.cpp pageiterator.cpp pagesegmain.cpp \ + pagewalk.cpp paragraphs.cpp paramsd.cpp pgedit.cpp recogtraining.cpp \ + reject.cpp resultiterator.cpp scaleimg.cpp \ + tesseract_cube_combiner.cpp \ tessbox.cpp tessedit.cpp tesseractclass.cpp tessvars.cpp \ tfacepp.cpp thresholder.cpp \ werdit.cpp diff --git a/ccmain/applybox.cpp b/ccmain/applybox.cpp index 051507f814..2d183eb3ff 100644 --- a/ccmain/applybox.cpp +++ b/ccmain/applybox.cpp @@ -35,9 +35,13 @@ #include "unichar.h" #include "unicharset.h" #include "tesseractclass.h" +#include "genericvector.h" // Max number of blobs to classify together in FindSegmentation. const int kMaxGroupSize = 4; +// Max fraction of median allowed as deviation in xheight before switching +// to median. +const double kMaxXHeightDeviationFraction = 0.125; /************************************************************************* * The box file is assumed to contain box definitions, one per line, of the @@ -107,69 +111,93 @@ static void clear_any_old_text(BLOCK_LIST *block_list) { PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname, bool find_segmentation, BLOCK_LIST *block_list) { - // In word mode, we use the boxes to make a word for each box, but - // in blob mode we use the existing words and maximally chop them first. - PAGE_RES* page_res = find_segmentation ? NULL : SetupApplyBoxes(block_list); int box_count = 0; int box_failures = 0; FILE* box_file = OpenBoxFile(fname); - clear_any_old_text(block_list); - TBOX prev_box, box, next_box; - bool found_box = false; - char text[kBoxReadBufSize]; - do { - prev_box = box; - box = next_box; + TBOX box; + GenericVector<TBOX> boxes; + GenericVector<STRING> texts, full_texts; + + bool found_box = true; + while (found_box) { int line_number = 0; // Line number of the box file. - int x_min; - int y_min; - int x_max; - int y_max; - char next_text[kBoxReadBufSize]; - // Keep a look-ahead box, so we can pass the next box into the resegment - functions.
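// A minimal standalone sketch (not part of this commit) of the box-file
// line format that ReadNextBox() and MakeBoxFileStr() handle above: each
// line is "<utf8 text> <left> <bottom> <right> <top> <page>", with the
// origin at the bottom-left of the image. BoxLine and ParseBoxLine are
// hypothetical names, for illustration only.
#include <cstdio>
#include <string>

struct BoxLine {
  std::string text;              // UTF-8 ground-truth text for the box.
  int left, bottom, right, top;  // Box corners, bottom-left origin.
  int page;                      // Page index in a multi-page image.
};

static bool ParseBoxLine(const char* line, BoxLine* out) {
  char text[256];
  // %255s stops at the first space, matching the "text, then 5 ints" layout.
  if (sscanf(line, "%255s %d %d %d %d %d", text, &out->left, &out->bottom,
             &out->right, &out->top, &out->page) != 6)
    return false;
  out->text = text;
  return true;
}

int main() {
  BoxLine box;
  if (ParseBoxLine("m 85 1005 128 1039 0", &box))
    printf("'%s' page %d: (%d,%d)-(%d,%d)\n", box.text.c_str(), box.page,
           box.left, box.bottom, box.right, box.top);
  return 0;
}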
- found_box = read_next_box(applybox_page, &line_number, box_file, next_text, - &x_min, &y_min, &x_max, &y_max); + STRING text, full_text; + found_box = ReadNextBox(applybox_page, &line_number, box_file, &text, &box); if (found_box) { - next_box = TBOX(ICOORD(x_min, y_min), ICOORD (x_max, y_max)); ++box_count; + MakeBoxFileStr(text.string(), box, applybox_page, &full_text); } else { - next_box = TBOX(); - next_text[0] = '\0'; + full_text = ""; } - if (!box.null_box()) { - bool foundit = false; - if (page_res != NULL) - foundit = ResegmentCharBox(page_res, box, next_box, text); - else - foundit = ResegmentWordBox(block_list, box, next_box, text); - if (!foundit) { - box_failures++; - ReportFailedBox(box_count, box, text, - "FAILURE! Couldn't find a matching blob"); + boxes.push_back(box); + texts.push_back(text); + full_texts.push_back(full_text); + } + + // In word mode, we use the boxes to make a word for each box, but + // in blob mode we use the existing words and maximally chop them first. + PAGE_RES* page_res = find_segmentation ? + NULL : SetupApplyBoxes(boxes, block_list); + clear_any_old_text(block_list); + + for (int i = 0; i < boxes.size() - 1; i++) { + bool foundit = false; + if (page_res != NULL) { + if (i == 0) { + foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1], + full_texts[i].string()); + } else { + foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i], + boxes[i + 1], full_texts[i].string()); } + } else { + foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1], + texts[i].string()); + } + if (!foundit) { + box_failures++; + ReportFailedBox(box_count, boxes[i], texts[i].string(), + "FAILURE! Couldn't find a matching blob"); } - strcpy(text, next_text); - } while (found_box); + } + if (page_res == NULL) { // In word/line mode, we now maximally chop all the words and resegment // them with the classifier. - page_res = SetupApplyBoxes(block_list); + page_res = SetupApplyBoxes(boxes, block_list); ReSegmentByClassification(page_res); } if (applybox_debug > 0) { tprintf("APPLY_BOXES:\n"); tprintf(" Boxes read from boxfile: %6d\n", box_count); - tprintf(" Boxes failed resegmentation: %6d\n", box_failures); + if (box_failures > 0) + tprintf(" Boxes failed resegmentation: %6d\n", box_failures); } TidyUp(page_res); return page_res; } +// Helper computes median xheight in the image. +static double MedianXHeight(BLOCK_LIST *block_list) { + BLOCK_IT block_it(block_list); + STATS xheights(0, block_it.data()->bounding_box().height()); + for (block_it.mark_cycle_pt(); + !block_it.cycled_list(); block_it.forward()) { + ROW_IT row_it(block_it.data()->row_list()); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + xheights.add(IntCastRounded(row_it.data()->x_height()), 1); + } + } + return xheights.median(); +} + // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: // All fuzzy spaces are removed, and all the words are maximally chopped. -PAGE_RES* Tesseract::SetupApplyBoxes(BLOCK_LIST *block_list) { +PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes, + BLOCK_LIST *block_list) { + double median_xheight = MedianXHeight(block_list); + double max_deviation = kMaxXHeightDeviationFraction * median_xheight; // Strip all fuzzy space markers to simplify the PAGE_RES.
BLOCK_IT b_it(block_list); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { @@ -177,6 +205,14 @@ PAGE_RES* Tesseract::SetupApplyBoxes(BLOCK_LIST *block_list) { ROW_IT r_it(block->row_list()); for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) { ROW* row = r_it.data(); + float diff = fabs(row->x_height() - median_xheight); + if (diff > max_deviation) { + if (applybox_debug) { + tprintf("row xheight=%g, but median xheight = %g\n", + row->x_height(), median_xheight); + } + row->set_x_height(static_cast<float>(median_xheight)); + } WERD_IT w_it(row->word_list()); for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); @@ -193,7 +229,8 @@ PAGE_RES* Tesseract::SetupApplyBoxes(BLOCK_LIST *block_list) { PAGE_RES_IT pr_it(page_res); WERD_RES* word_res; while ((word_res = pr_it.word()) != NULL) { - MaximallyChopWord(pr_it.block()->block, pr_it.row()->row, word_res); + MaximallyChopWord(boxes, pr_it.block()->block, + pr_it.row()->row, word_res); pr_it.forward(); } return page_res; @@ -204,6 +241,7 @@ PAGE_RES* Tesseract::SetupApplyBoxes(BLOCK_LIST *block_list) { static void MakeWordChoice(const BLOB_CHOICE_LIST_VECTOR& char_choices, const UNICHARSET& unicharset, WERD_CHOICE* word_choice) { + *word_choice = WERD_CHOICE(&unicharset); // clear the word choice. word_choice->make_bad(); for (int i = 0; i < char_choices.size(); ++i) { BLOB_CHOICE_IT it(char_choices[i]); @@ -211,15 +249,21 @@ static void MakeWordChoice(const BLOB_CHOICE_LIST_VECTOR& char_choices, word_choice->append_unichar_id(bc->unichar_id(), 1, bc->rating(), bc->certainty()); } - word_choice->populate_unichars(unicharset); + word_choice->populate_unichars(); } // Tests the chopper by exhaustively running chop_one_blob. // The word_res will contain filled chopped_word, seam_array, denorm, // box_word and best_state for the maximally chopped word. -void Tesseract::MaximallyChopWord(BLOCK* block, ROW* row, WERD_RES* word_res) { - if (!word_res->SetupForRecognition(unicharset, false, row, block)) +void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes, + BLOCK* block, ROW* row, + WERD_RES* word_res) { + if (!word_res->SetupForTessRecognition(unicharset, this, BestPix(), false, + this->textord_use_cjk_fp_model, + row, block)) { + word_res->CloneChoppedToRebuild(); return; + } if (chop_debug) { tprintf("Maximally chopping word at:"); word_res->word->bounding_box().print(); @@ -227,7 +271,6 @@ void Tesseract::MaximallyChopWord(BLOCK* block, ROW* row, WERD_RES* word_res) { blob_match_table.init_match_table(); BLOB_CHOICE_LIST *match_result; BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR(); - set_denorm(&word_res->denorm); ASSERT_HOST(word_res->chopped_word->blobs != NULL); float rating = static_cast<float>(MAX_INT8); for (TBLOB* blob = word_res->chopped_word->blobs; blob != NULL; @@ -248,8 +291,16 @@ void Tesseract::MaximallyChopWord(BLOCK* block, ROW* row, WERD_RES* word_res) { inT32 blob_number; int right_chop_index = 0; - while (chop_one_blob(word_res->chopped_word, char_choices, - &blob_number, &word_res->seam_array, &right_chop_index)); + if (!assume_fixed_pitch_char_segment) { + // We only chop if the language is not fixed pitch like CJK.
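// Standalone illustration (assumed, not from the commit) of the xheight
// clamp that SetupApplyBoxes() now applies above: a row whose xheight
// deviates from the page median by more than kMaxXHeightDeviationFraction
// (12.5%) of that median is reset to the median.
#include <cmath>
#include <cstdio>

static float ClampedXHeight(float row_xheight, double median_xheight) {
  const double kMaxDeviationFraction = 0.125;  // Mirrors the new constant.
  double max_deviation = kMaxDeviationFraction * median_xheight;
  if (fabs(row_xheight - median_xheight) > max_deviation)
    return static_cast<float>(median_xheight);
  return row_xheight;
}

int main() {
  // With a median of 32, anything more than 4.0 away gets clamped:
  printf("%g\n", ClampedXHeight(34.0f, 32.0));  // kept: 34
  printf("%g\n", ClampedXHeight(48.0f, 32.0));  // clamped: 32
  return 0;
}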
+ if (prioritize_division) { + while (chop_one_blob2(boxes, word_res, &word_res->seam_array)); + } else { + while (chop_one_blob(word_res->chopped_word, char_choices, + &blob_number, &word_res->seam_array, + &right_chop_index)); + } + } MakeWordChoice(*char_choices, unicharset, word_res->best_choice); MakeWordChoice(*char_choices, unicharset, word_res->raw_choice); word_res->CloneChoppedToRebuild(); @@ -288,7 +339,7 @@ static double BoxMissMetric(const TBOX& box1, const TBOX& box2) { // failing to find an appropriate blob for a box. // This means that occasionally, blobs may be incorrectly segmented if the // chopper fails to find a suitable chop point. -bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, +bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box, const TBOX& box, const TBOX& next_box, const char* correct_text) { if (applybox_debug > 1) { @@ -306,6 +357,7 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, } int word_len = word_res->box_word->length(); for (int i = 0; i < word_len; ++i) { + TBOX char_box = TBOX(); int blob_count = 0; for (blob_count = 0; i + blob_count < word_len; ++blob_count) { TBOX blob_box = word_res->box_word->BlobBox(i + blob_count); @@ -323,8 +375,17 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, } if (current_box_miss_metric > next_box_miss_metric) break; // Blob is a better match for next box. + char_box += blob_box; } if (blob_count > 0) { + if (applybox_debug > 1) { + tprintf("Index [%d, %d) seem good.\n", i, i + blob_count); + } + if (!char_box.almost_equal(box, 3) && + (box.x_gap(next_box) < -3 || + (prev_box != NULL && prev_box->x_gap(box) < -3))) { + return false; + } // We refine just the box_word, best_state and correct_text here. // The rebuild_word is made in TidyUp. // blob_count blobs are put together to match the box. Merge the @@ -354,11 +415,19 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, tprintf("%d ", word_res->best_state[j]); } tprintf("\n"); + tprintf("Correct text = [[ "); + for (int j = 0; j < word_res->correct_text.size(); ++j) { + tprintf("%s ", word_res->correct_text[j].string()); + } + tprintf("]]\n"); } return true; } } } + if (applybox_debug > 0) { + tprintf("FAIL!\n"); + } return false; // Failure. } @@ -433,6 +502,7 @@ bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, } } } + if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n"); return new_word != NULL; } @@ -498,8 +568,8 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text, for (int i = 0; i < word_length; ++i) { for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) { BLOB_CHOICE_LIST* match_result = classify_piece( - word_res->chopped_word->blobs, word_res->seam_array, - i, i + j - 1); + word_res->chopped_word->blobs, word_res->denorm, word_res->seam_array, + i, i + j - 1, word_res->blamer_bundle); if (applybox_debug > 2) { tprintf("%d+%d:", i, j); print_ratings_list("Segment:", match_result, unicharset); @@ -629,7 +699,7 @@ void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices, // Counts up the labelled words and the blobs within. // Deletes all unused or emptied words, counting the unused ones. // Resets W_BOL and W_EOL flags correctly. -// Builds the rebuild_word and rebuilds the box_word. +// Builds the rebuild_word and rebuilds the box_word and the best_choice.
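// The loop above decides blob ownership by comparing BoxMissMetric(blob_box,
// box) against BoxMissMetric(blob_box, next_box); the metric's body lies
// outside this hunk. The sketch below shows one plausible area-overlap form
// (the product of the fraction of each box missed by the other): 0 when one
// box covers the other, 1 when they are disjoint. Illustrative only.
#include <algorithm>
#include <cstdio>

struct Box { int left, bottom, right, top; };

static int Area(const Box& b) {
  return std::max(0, b.right - b.left) * std::max(0, b.top - b.bottom);
}

static int OverlapArea(const Box& a, const Box& b) {
  Box o = { std::max(a.left, b.left), std::max(a.bottom, b.bottom),
            std::min(a.right, b.right), std::min(a.top, b.top) };
  return Area(o);
}

static double MissMetric(const Box& a, const Box& b) {
  double overlap = OverlapArea(a, b);
  return (1.0 - overlap / Area(a)) * (1.0 - overlap / Area(b));
}

int main() {
  Box blob = {10, 0, 20, 10}, truth = {12, 0, 22, 10}, next = {22, 0, 30, 10};
  // Smaller is better: the blob matches 'truth' far better than 'next'.
  printf("truth=%g next=%g\n", MissMetric(blob, truth), MissMetric(blob, next));
  return 0;
}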
void Tesseract::TidyUp(PAGE_RES* page_res) { int ok_blob_count = 0; int bad_blob_count = 0; @@ -639,14 +709,21 @@ void Tesseract::TidyUp(PAGE_RES* page_res) { WERD_RES* word_res; for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) { int ok_in_word = 0; - for (int i = 0; i < word_res->correct_text.size(); ++i) { + BLOB_CHOICE_LIST_VECTOR char_choices; + for (int i = word_res->correct_text.size() - 1; i >= 0; i--) { if (word_res->correct_text[i].length() > 0) { ++ok_in_word; } + // Since we only need a fake word_res->best_choice, the actual + // unichar_ids do not matter. Which is fortunate, since TidyUp() + // can be called while training Tesseract, at the stage where + // unicharset is not meaningful yet. + char_choices += fake_classify_blob(INVALID_UNICHAR_ID, 1.0, -1.0); } if (ok_in_word > 0) { ok_blob_count += ok_in_word; bad_blob_count += word_res->correct_text.size() - ok_in_word; + MakeWordChoice(char_choices, unicharset, word_res->best_choice); } else { ++unlabelled_words; if (applybox_debug > 0) { @@ -655,6 +732,7 @@ void Tesseract::TidyUp(PAGE_RES* page_res) { } pr_it.DeleteCurrentWord(); } + char_choices.delete_data_pointers(); } pr_it.restart_page(); for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) { @@ -665,9 +743,13 @@ void Tesseract::TidyUp(PAGE_RES* page_res) { word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row()); } if (applybox_debug > 0) { - tprintf(" Found %d good blobs and %d unlabelled blobs in %d words.\n", - ok_blob_count, bad_blob_count, ok_word_count); - tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words); + tprintf(" Found %d good blobs.\n", ok_blob_count); + if (bad_blob_count > 0) { + tprintf(" Leaving %d unlabelled blobs in %d words.\n", + bad_blob_count, ok_word_count); + } + if (unlabelled_words > 0) + tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words); } } @@ -684,13 +766,17 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) { PAGE_RES_IT pr_it(page_res); for (WERD_RES *word_res = pr_it.word(); word_res != NULL; word_res = pr_it.forward()) { - WERD_CHOICE* choice = new WERD_CHOICE(word_res->correct_text.size()); + WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set, + word_res->correct_text.size()); for (int i = 0; i < word_res->correct_text.size(); ++i) { - UNICHAR_ID char_id = unicharset.unichar_to_id( - word_res->correct_text[i].string()); + // The part before the first space is the real ground truth, and the + // rest is the bounding box location and page number. 
+ GenericVector<STRING> tokens; + word_res->correct_text[i].split(' ', &tokens); + UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string()); choice->append_unichar_id_space_allocated(char_id, 1, 0.0f, 0.0f); } - choice->populate_unichars(unicharset); + choice->populate_unichars(); if (word_res->best_choice != NULL) delete word_res->best_choice; word_res->best_choice = choice; @@ -707,7 +793,7 @@ void Tesseract::ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res) { LearnWord(filename.string(), NULL, word_res); ++word_count; } - tprintf ("Generated training data for %d words\n", word_count); + tprintf("Generated training data for %d words\n", word_count); } diff --git a/ccmain/control.cpp b/ccmain/control.cpp index 1f12eb8d5c..8313e9d717 100644 --- a/ccmain/control.cpp +++ b/ccmain/control.cpp @@ -98,7 +98,8 @@ BOOL8 Tesseract::recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res) { inT16 char_qual; inT16 good_char_qual; - classify_word_pass2(word_res, block, row); + classify_word_and_language(&Tesseract::classify_word_pass2, + block, row, word_res); if (tessedit_debug_quality_metrics) { word_char_quality(word_res, row, &char_qual, &good_char_qual); tprintf @@ -121,6 +122,9 @@ BOOL8 Tesseract::recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res) { // Note that this function uses a fixed temporary file for storing the previous // configs, so it is neither thread-safe, nor process-safe, but the assumption // is that it will only be used for one debug window at a time. +// +// Since this function is used for debugging (and not to change OCR results) +// set only debug params from the word config file. bool Tesseract::ProcessTargetWord(const TBOX& word_box, const TBOX& target_word_box, const char* word_config, @@ -132,11 +136,15 @@ bool Tesseract::ProcessTargetWord(const TBOX& word_box, FILE* config_fp = fopen(backup_config_file_, "wb"); ParamUtils::PrintParams(config_fp, params()); fclose(config_fp); - ParamUtils::ReadParamsFile(word_config, false, params()); + ParamUtils::ReadParamsFile(word_config, + SET_PARAM_CONSTRAINT_DEBUG_ONLY, + params()); } } else { if (backup_config_file_ != NULL) { - ParamUtils::ReadParamsFile(backup_config_file_, false, params()); + ParamUtils::ReadParamsFile(backup_config_file_, + SET_PARAM_CONSTRAINT_DEBUG_ONLY, + params()); backup_config_file_ = NULL; } } @@ -158,6 +166,7 @@ bool Tesseract::ProcessTargetWord(const TBOX& word_box, * if word_config is not null, the word config file is read for just the * target word(s), otherwise, on pass 2 and beyond ONLY the target words * are processed (Jetsoft modification.) + * Returns false if we cancelled prematurely. * * @param page_res page structure * @param monitor progress monitor * * @param dopasses 0 - all, 1 just pass 1, 2 passes 2 and higher */ -void Tesseract::recog_all_words(PAGE_RES* page_res, +bool Tesseract::recog_all_words(PAGE_RES* page_res, ETEXT_DESC* monitor, const TBOX* target_word_box, const char* word_config, int dopasses) { - // TODO(rays): Normalize the "classify word" interface. For instance: - // (1) word.denorm gets set in word->SetupForRecognition() but does - // not get invoked for cube alone. Maybe it should? - // (2) run_cube() checks whether word->best_choice is NULL, and if - // so determines that "neither cube nor tess have an answer." - // However, if tess gets run at all, the first thing it does is - // call word->SetupForRecognition which inserts a poorly scoring - // best_answer.
So what is the way that an engine (tess or cube) - // says "I don't have an answer": an empty list or a single - // poorly scoring best_answer? - - // reset page iterator - // If we only intend to run cube - run it and return. - if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { - PrepareForCubeOCR(); - mutable_splitter()->Clear(); - run_cube(page_res); - return; - } - // Return if we do not want to run Tesseract. - if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY && - tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED) return; - PAGE_RES_IT page_res_it; - inT16 chars_in_word; - inT16 rejects_in_word; - inT16 blob_quality = 0; - inT16 outline_errs = 0; - inT16 all_char_quality; - inT16 accepted_all_char_quality; inT32 word_index; // current word - int i; if (tessedit_minimal_rej_pass1) { tessedit_test_adaption.set_value (TRUE); tessedit_minimal_rejection.set_value (TRUE); } + // Before the main recognition loop below, walk through the whole page and set + // up fake words. That way, if we run out of time a user will still get the + // expected best_choice and box_words out the end; they'll just be empty. + page_res_it.page_res = page_res; + for (page_res_it.restart_page(); page_res_it.word() != NULL; + page_res_it.forward()) { + page_res_it.word()->SetupFake(unicharset); + } + if (dopasses==0 || dopasses==1) { page_res_it.page_res=page_res; page_res_it.restart_page(); @@ -216,11 +204,16 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, // Clear adaptive classifier at the beginning of the page if it is full. // This is done only at the beginning of the page to ensure that the - // classifier is not reset at an arbitraty point while processing the page, + // classifier is not reset at an arbitrary point while processing the page, // which would cripple Passes 2+ if the reset happens towards the end of - // Pass 1 on a page with very difficul text. + // Pass 1 on a page with very difficult text. // TODO(daria): preemptively clear the classifier if it is almost full. - if (AdaptiveClassifierIsFull()) ResetAdaptiveClassifier(); + if (AdaptiveClassifierIsFull()) ResetAdaptiveClassifierInternal(); + // Now check the sub-langs as well. 
+ for (int i = 0; i < sub_langs_.size(); ++i) { + if (sub_langs_[i]->AdaptiveClassifierIsFull()) + sub_langs_[i]->ResetAdaptiveClassifierInternal(); + } stats_.word_count = 0; if (monitor != NULL) { @@ -243,6 +236,7 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, stats_.good_char_count = 0; stats_.doc_good_char_quality = 0; + most_recently_used_ = this; while (page_res_it.word() != NULL) { set_global_loc_code(LOC_PASS1); word_index++; @@ -252,7 +246,7 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, if (monitor->deadline_exceeded() || (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) - return; + return false; } if (target_word_box && !ProcessTargetWord(page_res_it.word()->word->bounding_box(), @@ -260,8 +254,10 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, page_res_it.forward(); continue; } - classify_word_pass1(page_res_it.word(), page_res_it.row()->row, - page_res_it.block()->block); + classify_word_and_language(&Tesseract::classify_word_pass1, + page_res_it.block()->block, + page_res_it.row()->row, + page_res_it.word()); if (page_res_it.word()->word->flag(W_REP_CHAR)) { fix_rep_char(&page_res_it); page_res_it.forward(); @@ -271,8 +267,7 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, word_dumper(NULL, page_res_it.row()->row, page_res_it.word()); tprintf("Pass1: %s [%s]\n", page_res_it.word()->best_choice->unichar_string().string(), - page_res_it.word()->best_choice-> - debug_string(unicharset).string()); + page_res_it.word()->best_choice->debug_string().string()); } // tessedit_test_adaption enables testing of the accuracy of the @@ -285,7 +280,7 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, } else { // Override rejection mechanisms for this word. UNICHAR_ID space = unicharset.unichar_to_id(" "); - for (i = 0; i < page_res_it.word()->best_choice->length(); i++) { + for (int i = 0; i < page_res_it.word()->best_choice->length(); i++) { if ((page_res_it.word()->best_choice->unichar_id(i) != space) && page_res_it.word()->reject_map[i].rejected()) page_res_it.word()->reject_map[i].setrej_minimal_rej_accept(); @@ -296,15 +291,25 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, // Count dict words. if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) ++(stats_.dict_words); + + // Update misadaption log (we only need to do it on pass 1, since + // adaption only happens on this pass). 
+ if (page_res_it.word()->blamer_bundle != NULL && + page_res_it.word()->blamer_bundle->misadaption_debug.length() > 0) { + page_res->misadaption_log.push_back( + page_res_it.word()->blamer_bundle->misadaption_debug); + } + page_res_it.forward(); } } - if (dopasses == 1) return; + if (dopasses == 1) return true; // ****************** Pass 2 ******************* page_res_it.restart_page(); word_index = 0; + most_recently_used_ = this; while (!tessedit_test_adaption && page_res_it.word() != NULL) { set_global_loc_code(LOC_PASS2); word_index++; @@ -314,7 +319,7 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, if (monitor->deadline_exceeded() || (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) - return; + return false; } // changed by jetsoft @@ -327,8 +332,10 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, } // end jetsoft - classify_word_pass2(page_res_it.word(), page_res_it.block()->block, - page_res_it.row()->row); + classify_word_and_language(&Tesseract::classify_word_pass2, + page_res_it.block()->block, + page_res_it.row()->row, + page_res_it.word()); if (page_res_it.word()->word->flag(W_REP_CHAR) && !page_res_it.word()->done) { fix_rep_char(&page_res_it); @@ -339,37 +346,279 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, word_dumper(NULL, page_res_it.row()->row, page_res_it.word()); tprintf("Pass2: %s [%s]\n", page_res_it.word()->best_choice->unichar_string().string(), - page_res_it.word()->best_choice-> - debug_string(unicharset).string()); + page_res_it.word()->best_choice->debug_string().string()); } page_res_it.forward(); } - // ****************** Pass 3 ******************* - // Fix fuzzy spaces. - set_global_loc_code(LOC_FUZZY_SPACE); + // The next passes can only be run if tesseract has been used, as cube + // doesn't set all the necessary outputs in WERD_RES. + if (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY || + tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { + // ****************** Pass 3 ******************* + // Fix fuzzy spaces. + set_global_loc_code(LOC_FUZZY_SPACE); + + if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces + && !tessedit_word_for_word && !right_to_left()) + fix_fuzzy_spaces(monitor, stats_.word_count, page_res); + + // ****************** Pass 4 ******************* + if (tessedit_enable_bigram_correction) bigram_correction_pass(page_res); + + // ****************** Pass 5,6 ******************* + rejection_passes(page_res, monitor, target_word_box, word_config); + + // ****************** Pass 7 ******************* + // Cube combiner. + // If cube is loaded and its combiner is present, run it. + if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { + run_cube_combiner(page_res); + } + + // ****************** Pass 8 ******************* + font_recognition_pass(page_res); + + // ****************** Pass 9 ******************* + // Check the correctness of the final results. + blamer_pass(page_res); + } + + if (!save_blob_choices) { + // We aren't saving the blob choices so get rid of them now. + // set_blob_choices() does a deep clear. + page_res_it.restart_page(); + while (page_res_it.word() != NULL) { + WERD_RES* word = page_res_it.word(); + word->best_choice->set_blob_choices(NULL); + page_res_it.forward(); + } + } + + // Write results pass. + set_global_loc_code(LOC_WRITE_RESULTS); + // This is now redundant, but retained commented so show how to obtain + // bounding boxes and style information. 
- if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces - && !tessedit_word_for_word && !right_to_left()) - fix_fuzzy_spaces(monitor, stats_.word_count, page_res); + // changed by jetsoft + // needed for dll to output memory structure + if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) + output_pass(page_res_it, target_word_box); + // end jetsoft + PageSegMode pageseg_mode = static_cast<PageSegMode>( + static_cast<int>(tessedit_pageseg_mode)); + textord_.CleanupSingleRowResult(pageseg_mode, page_res); - // ****************** Pass 4 ******************* + if (monitor != NULL) { + monitor->progress = 100; + } + return true; +} + +void Tesseract::bigram_correction_pass(PAGE_RES *page_res) { + PAGE_RES_IT word_it(page_res); + + WERD_RES *w_prev = NULL; + WERD_RES *w = word_it.word(); + if (w && w->best_choice) w->best_choice->populate_unichars(); + while (1) { + w_prev = w; + while (word_it.forward() != NULL && + (!word_it.word() || word_it.word()->part_of_combo)) { + // advance word_it, skipping over parts of combos + } + if (!word_it.word()) break; + w = word_it.word(); + if (w && w->best_choice) + w->best_choice->populate_unichars(); + if (!w || !w_prev || w->uch_set != w_prev->uch_set) { + continue; + } + if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) { + if (tessedit_bigram_debug) { + tprintf("Skipping because one of the words is W_REP_CHAR\n"); + } + continue; + } + // Two words sharing the same language model, excellent! + if (w->alt_choices.empty()) { + if (tessedit_bigram_debug) { + tprintf("Alt choices not set up for word choice: %s\n", + w->best_choice->unichar_string().string()); + } + continue; + } + if (w_prev->alt_choices.empty()) { + if (tessedit_bigram_debug) { + tprintf("Alt choices not set up for word choice: %s\n", + w_prev->best_choice->unichar_string().string()); + } + continue; + } + + // We saved alternate choices, excellent!
+ GenericVector<WERD_CHOICE *> overrides_word1; + GenericVector<GenericVector<int> *> overrides_word1_state; + GenericVector<WERD_CHOICE *> overrides_word2; + GenericVector<GenericVector<int> *> overrides_word2_state; + + STRING orig_w1_str = w_prev->best_choice->unichar_string(); + STRING orig_w2_str = w->best_choice->unichar_string(); + WERD_CHOICE prev_best(w->uch_set); + { + int w1start, w1end; + w_prev->WithoutFootnoteSpan(&w1start, &w1end); + prev_best = w_prev->best_choice->shallow_copy(w1start, w1end); + } + WERD_CHOICE this_best(w->uch_set); + { + int w2start, w2end; + w->WithoutFootnoteSpan(&w2start, &w2end); + this_best = w->best_choice->shallow_copy(w2start, w2end); + } + + if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) { + if (tessedit_bigram_debug) { + tprintf("Top choice \"%s %s\" verified by bigram model.\n", + orig_w1_str.string(), orig_w2_str.string()); + } + continue; + } + if (tessedit_bigram_debug > 2) { + tprintf("Examining alt choices for \"%s %s\".\n", + orig_w1_str.string(), orig_w2_str.string()); + } + if (tessedit_bigram_debug > 1) { + if (w_prev->alt_choices.size() > 1) { + print_word_alternates_list(w_prev->best_choice, &w_prev->alt_choices, + false); + } + if (w->alt_choices.size() > 1) { + print_word_alternates_list(w->best_choice, &w->alt_choices, false); + } + } + float best_rating = 0.0; + int best_idx = 0; + for (int i = 0; i < w_prev->alt_choices.size(); i++) { + WERD_CHOICE *p1 = w_prev->alt_choices.get(i); + WERD_CHOICE strip1(w->uch_set); + { + int p1start, p1end; + w_prev->WithoutFootnoteSpan(*p1, w_prev->alt_states.get(i), + &p1start, &p1end); + strip1 = p1->shallow_copy(p1start, p1end); + } + for (int j = 0; j < w->alt_choices.size(); j++) { + WERD_CHOICE *p2 = w->alt_choices.get(j); + WERD_CHOICE strip2(w->uch_set); + { + int p2start, p2end; + w->WithoutFootnoteSpan(*p2, w->alt_states.get(j), &p2start, &p2end); + strip2 = p2->shallow_copy(p2start, p2end); + } + if (w->tesseract->getDict().valid_bigram(strip1, strip2)) { + overrides_word1.push_back(p1); + overrides_word1_state.push_back(&w_prev->alt_states.get(i)); + overrides_word2.push_back(p2); + overrides_word2_state.push_back(&w->alt_states.get(j)); + if (overrides_word1.size() == 1 || + p1->rating() + p2->rating() < best_rating) { + best_rating = p1->rating() + p2->rating(); + best_idx = overrides_word1.size() - 1; + } + } + } + } + if (overrides_word1.size() >= 1) { + // Excellent, we have some bigram matches.
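// The nested search below tries every pair of saved alternate choices and,
// among the pairs the dictionary accepts as a bigram, keeps the one with the
// lowest summed rating (ratings are costs, so lower is better). A compact
// standalone rehearsal of that selection with a stubbed dictionary check
// standing in for Dict::valid_bigram():
#include <cstdio>
#include <string>
#include <vector>

struct Alt { std::string text; float rating; };  // rating is a cost.

static bool ValidBigramStub(const Alt& a, const Alt& b) {
  return a.text == "of" && b.text == "the";  // Toy dictionary.
}

int main() {
  std::vector<Alt> w1 = {{"ot", 9.0f}, {"of", 10.0f}};
  std::vector<Alt> w2 = {{"tho", 8.0f}, {"the", 8.5f}};
  int best_i = -1, best_j = -1;
  float best_rating = 0.0f;
  for (int i = 0; i < static_cast<int>(w1.size()); ++i) {
    for (int j = 0; j < static_cast<int>(w2.size()); ++j) {
      if (!ValidBigramStub(w1[i], w2[j])) continue;
      float rating = w1[i].rating + w2[j].rating;
      if (best_i < 0 || rating < best_rating) {
        best_rating = rating;
        best_i = i;
        best_j = j;
      }
    }
  }
  if (best_i >= 0)
    printf("override: \"%s %s\" (cost %g)\n",
           w1[best_i].text.c_str(), w2[best_j].text.c_str(), best_rating);
  return 0;
}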
+ if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice, + *overrides_word1[best_idx]) && + EqualIgnoringCaseAndTerminalPunct(*w->best_choice, + *overrides_word2[best_idx])) { + tprintf("Top choice \"%s %s\" verified (sans case) by bigram model.\n", + orig_w1_str.string(), orig_w2_str.string()); + continue; + } + STRING new_w1_str = overrides_word1[best_idx]->unichar_string(); + STRING new_w2_str = overrides_word2[best_idx]->unichar_string(); + if (new_w1_str != orig_w1_str) { + w_prev->ReplaceBestChoice(*overrides_word1[best_idx], + *overrides_word1_state[best_idx]); + } + if (new_w2_str != orig_w2_str) { + w->ReplaceBestChoice(*overrides_word2[best_idx], + *overrides_word2_state[best_idx]); + } + if (tessedit_bigram_debug > 0) { + STRING choices_description; + int num_bigram_choices + = overrides_word1.size() * overrides_word2.size(); + if (num_bigram_choices == 1) { + choices_description = "This was the unique bigram choice."; + } else { + if (tessedit_bigram_debug > 1) { + STRING bigrams_list; + const int kMaxChoicesToPrint = 20; + int num_choices_printed = 0; + for (int i = 0; i < overrides_word1.size() && + num_choices_printed < kMaxChoicesToPrint; i++) { + for (int j = 0; j < overrides_word2.size() && + num_choices_printed < kMaxChoicesToPrint; j++) { + if (i > 0 || j > 0) { bigrams_list += ", "; } + WERD_CHOICE *p1 = overrides_word1[i]; + WERD_CHOICE *p2 = overrides_word2[j]; + bigrams_list += + p1->unichar_string() + " " + p2->unichar_string(); + num_choices_printed++; + if (num_choices_printed == kMaxChoicesToPrint) { + bigrams_list += " ..."; + } + } + } + choices_description = "There were many choices: {"; + choices_description += bigrams_list; + choices_description += "}"; + } else { + choices_description.add_str_int("There were ", num_bigram_choices); + choices_description += " compatible bigrams."; + } + } + tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n", + orig_w1_str.string(), orig_w2_str.string(), + new_w1_str.string(), new_w2_str.string(), + choices_description.string()); + } + } + } +} + +void Tesseract::rejection_passes(PAGE_RES* page_res, + ETEXT_DESC* monitor, + const TBOX* target_word_box, + const char* word_config) { + PAGE_RES_IT page_res_it(page_res); + // ****************** Pass 5 ******************* // Gather statistics on rejects. - page_res_it.restart_page(); - word_index = 0; + int word_index = 0; while (!tessedit_test_adaption && page_res_it.word() != NULL) { set_global_loc_code(LOC_MM_ADAPT); + WERD_RES* word = page_res_it.word(); word_index++; if (monitor != NULL) { monitor->ocr_alive = TRUE; monitor->progress = 95 + 5 * word_index / stats_.word_count; } - check_debug_pt(page_res_it.word(), 70); + if (word->rebuild_word == NULL) { + // Word was not processed by tesseract. 
+ page_res_it.forward(); + continue; + } + check_debug_pt(word, 70); // changed by jetsoft // specific to its needs to extract one word when need if (target_word_box && - !ProcessTargetWord(page_res_it.word()->word->bounding_box(), + !ProcessTargetWord(word->word->bounding_box(), *target_word_box, word_config, 4)) { page_res_it.forward(); continue; @@ -377,41 +626,33 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, // end jetsoft page_res_it.rej_stat_word(); - chars_in_word = page_res_it.word()->reject_map.length(); - rejects_in_word = page_res_it.word()->reject_map.reject_count(); + int chars_in_word = word->reject_map.length(); + int rejects_in_word = word->reject_map.reject_count(); - blob_quality = word_blob_quality(page_res_it.word(), - page_res_it.row()->row); + int blob_quality = word_blob_quality(word, page_res_it.row()->row); stats_.doc_blob_quality += blob_quality; - outline_errs = word_outline_errs(page_res_it.word()); + int outline_errs = word_outline_errs(word); stats_.doc_outline_errs += outline_errs; - word_char_quality(page_res_it.word(), - page_res_it.row()->row, + inT16 all_char_quality; + inT16 accepted_all_char_quality; + word_char_quality(word, page_res_it.row()->row, &all_char_quality, &accepted_all_char_quality); stats_.doc_char_quality += all_char_quality; - uinT8 permuter_type = page_res_it.word()->best_choice->permuter(); + uinT8 permuter_type = word->best_choice->permuter(); if ((permuter_type == SYSTEM_DAWG_PERM) || (permuter_type == FREQ_DAWG_PERM) || (permuter_type == USER_DAWG_PERM)) { stats_.good_char_count += chars_in_word - rejects_in_word; stats_.doc_good_char_quality += accepted_all_char_quality; } - check_debug_pt(page_res_it.word(), 80); + check_debug_pt(word, 80); if (tessedit_reject_bad_qual_wds && (blob_quality == 0) && (outline_errs >= chars_in_word)) - page_res_it.word()->reject_map.rej_word_bad_quality(); - check_debug_pt(page_res_it.word(), 90); + word->reject_map.rej_word_bad_quality(); + check_debug_pt(word, 90); page_res_it.forward(); } - // ****************** Pass 5 ******************* - // If cube is loaded and its combiner is present, run it. - if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { - PrepareForCubeOCR(); - mutable_splitter()->Clear(); - run_cube(page_res); - } - if (tessedit_debug_quality_metrics) { tprintf ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f" @@ -445,29 +686,177 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, set_global_loc_code(LOC_DOC_BLK_REJ); quality_based_rejection(page_res_it, good_quality_doc); } +} - // ****************** Pass 7 ******************* - font_recognition_pass(page_res_it); - - // Write results pass. - set_global_loc_code(LOC_WRITE_RESULTS); - // This is now redundant, but retained commented so show how to obtain - // bounding boxes and style information. 
+void Tesseract::blamer_pass(PAGE_RES* page_res) { + if (!wordrec_run_blamer) return; + PAGE_RES_IT page_res_it(page_res); + for (page_res_it.restart_page(); page_res_it.word() != NULL; + page_res_it.forward()) { + WERD_RES *word = page_res_it.word(); + if (word->blamer_bundle == NULL) { + word->blamer_bundle = new BlamerBundle(); + word->blamer_bundle->incorrect_result_reason = IRR_PAGE_LAYOUT; + word->blamer_bundle->debug = word->blamer_bundle->IncorrectReason(); + word->blamer_bundle->debug += " to blame"; + } else if (word->blamer_bundle->incorrect_result_reason == + IRR_NO_TRUTH) { + word->blamer_bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth", + word->best_choice, wordrec_debug_blamer); + } else { + bool correct = ChoiceIsCorrect(*word->uch_set, word->best_choice, + word->blamer_bundle->truth_text); + IncorrectResultReason irr = + word->blamer_bundle->incorrect_result_reason; + if (irr == IRR_CORRECT && !correct) { + STRING debug = "Choice is incorrect after recognition"; + word->blamer_bundle->SetBlame(IRR_UNKNOWN, debug, + word->best_choice, + wordrec_debug_blamer); + } else if (irr != IRR_CORRECT && correct) { + if (wordrec_debug_blamer) { + tprintf("Corrected %s\n", word->blamer_bundle->debug.string()); + } + word->blamer_bundle->incorrect_result_reason = IRR_CORRECT; + word->blamer_bundle->debug = ""; + } + } + page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason]++; + } + tprintf("Blame reasons:\n"); + for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) { + tprintf("%s %d\n", BlamerBundle::IncorrectReasonName( + static_cast<IncorrectResultReason>(bl)), + page_res->blame_reasons[bl]); + } + if (page_res->misadaption_log.length() > 0) { + tprintf("Misadaption log:\n"); + for (int i = 0; i < page_res->misadaption_log.length(); ++i) { + tprintf("%s\n", page_res->misadaption_log[i].string()); + } + } +} - +// Helper returns true if the new_word is better than the word, using a +// simple test of better certainty AND rating (to reduce false positives +// from cube) or a dictionary vs non-dictionary word. +static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word) { + if (new_word.best_choice == NULL) { + return false; // New one no good. + } + if (word.best_choice == NULL) { + return true; // Old one no good. + } + if (new_word.best_choice->certainty() > word.best_choice->certainty() && + new_word.best_choice->rating() < word.best_choice->rating()) { + return true; // New word has better confidence. + } + if (!Dict::valid_word_permuter(word.best_choice->permuter(), false) && + Dict::valid_word_permuter(new_word.best_choice->permuter(), false)) { + return true; // New word is from a dictionary. + } + return false; // New word is no better. +} - if (monitor != NULL) { - monitor->progress = 100; +// Helper to recognize the word using the given (language-specific) tesseract. +// Returns true if the result was better than previously. +bool Tesseract::RetryWithLanguage(WERD_RES *word, BLOCK* block, ROW *row, + WordRecognizer recognizer) { + if (classify_debug_level || cube_debug_level) { + tprintf("Retrying word using lang %s, oem %d\n", + lang.string(), static_cast<int>(tessedit_ocr_engine_mode)); } + // Setup a trial WERD_RES in which to classify.
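// A short rehearsal (illustrative, not from the commit) of the acceptance
// policy NewWordBetter() implements above: a retried result replaces the
// original only if it wins on BOTH certainty (higher is better) and rating
// (a cost, lower is better), or if it is a dictionary word when the
// original is not.
#include <cstdio>

struct Result { float certainty; float rating; bool dict_word; bool valid; };

static bool NewBetter(const Result& old_r, const Result& new_r) {
  if (!new_r.valid) return false;  // New one no good.
  if (!old_r.valid) return true;   // Old one no good.
  if (new_r.certainty > old_r.certainty && new_r.rating < old_r.rating)
    return true;                   // Better confidence on both axes.
  if (!old_r.dict_word && new_r.dict_word)
    return true;                   // Dictionary beats non-dictionary.
  return false;
}

int main() {
  Result old_r = {-4.0f, 20.0f, false, true};
  Result new_r = {-2.5f, 15.0f, false, true};
  printf("replace=%d\n", NewBetter(old_r, new_r));  // 1: wins both axes.
  new_r.rating = 25.0f;  // Better certainty but worse rating, no dict help.
  printf("replace=%d\n", NewBetter(old_r, new_r));  // 0
  return 0;
}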
+ WERD_RES lang_word; + lang_word.InitForRetryRecognition(*word); + // Run the recognizer on the word. + // Initial version is a bit of a hack based on better certainty and rating + // (to reduce false positives from cube) or a dictionary vs non-dictionary + // word. + (this->*recognizer)(block, row, &lang_word); + bool new_is_better = NewWordBetter(*word, lang_word); + if (classify_debug_level || cube_debug_level) { + if (lang_word.best_choice == NULL) { + tprintf("New result %s better:%s\n", + new_is_better ? "IS" : "NOT"); + } else { + tprintf("New result %s better:%s, r=%g, c=%g\n", + new_is_better ? "IS" : "NOT", + lang_word.best_choice->unichar_string().string(), + lang_word.best_choice->rating(), + lang_word.best_choice->certainty()); + } + } + if (new_is_better) { + word->ConsumeWordResults(&lang_word); + } + return new_is_better; } +// Generic function for classifying a word. Can be used either for pass1 or +// pass2 according to the function passed to recognizer. +// word block and row are the current location in the document's PAGE_RES. +// Recognizes in the current language, and if successful that is all. +// If recognition was not successful, tries all available languages until +// it gets a successful result or runs out of languages. Keeps the best result. +void Tesseract::classify_word_and_language(WordRecognizer recognizer, + BLOCK* block, + ROW *row, + WERD_RES *word) { + if (classify_debug_level || cube_debug_level) { + tprintf("Processing word with lang %s at:", + most_recently_used_->lang.string()); + word->word->bounding_box().print(); + } + const char* result_type = "Initial"; + bool initially_done = !word->tess_failed && word->done; + if (initially_done) { + // If done on pass1, we reuse the tesseract that did it, and don't try + // any more. The only need to call the classifier at all is for the + // cube combiner and xheight fixing (which may be bogus on a done word.) + most_recently_used_ = word->tesseract; + result_type = "Already done"; + } + (most_recently_used_->*recognizer)(block, row, word); + if (!word->tess_failed && word->tess_accepted) + result_type = "Accepted"; + if (classify_debug_level || cube_debug_level) { + tprintf("%s result: %s r=%g, c=%g, accepted=%d, adaptable=%d\n", + result_type, + word->best_choice->unichar_string().string(), + word->best_choice->rating(), + word->best_choice->certainty(), + word->tess_accepted, word->tess_would_adapt); + } + if (word->tess_failed || !word->tess_accepted) { + // Try all the other languages to see if they are any better. + Tesseract* previous_used = most_recently_used_; + if (most_recently_used_ != this) { + if (classify_debug_level) { + tprintf("Retrying with main-Tesseract, lang: %s\n", lang.string()); + } + if (RetryWithLanguage(word, block, row, recognizer)) { + most_recently_used_ = this; + if (!word->tess_failed && word->tess_accepted) + return; // No need to look at the others. + } + } + + for (int i = 0; i < sub_langs_.size(); ++i) { + if (sub_langs_[i] != previous_used) { + if (classify_debug_level) { + tprintf("Retrying with sub-Tesseract[%d] lang: %s\n", + i, sub_langs_[i]->lang.string()); + } + if (sub_langs_[i]->RetryWithLanguage(word, block, row, recognizer)) { + most_recently_used_ = sub_langs_[i]; + if (!word->tess_failed && word->tess_accepted) + return; // No need to look at the others. + } + } + } + } +} /** * classify_word_pass1 @@ -475,9 +864,13 @@ void Tesseract::recog_all_words(PAGE_RES* page_res, * Baseline normalize the word and pass it to Tess. 
*/ -void Tesseract::classify_word_pass1(WERD_RES *word, // word to do - ROW *row, - BLOCK* block) { +void Tesseract::classify_word_pass1(BLOCK* block, ROW *row, WERD_RES *word) { + // If we only intend to run cube - run it and return. + if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { + cube_word_pass1(block, row, word); + return; + } + BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST(); BOOL8 adapt_ok; const char *rejmap; @@ -485,8 +878,10 @@ void Tesseract::classify_word_pass1(WERD_RES *word, // word to do STRING mapstr = ""; check_debug_pt(word, 0); - if (word->SetupForRecognition(unicharset, classify_bln_numeric_mode, - row, block)) + if (word->SetupForTessRecognition(unicharset, this, BestPix(), + classify_bln_numeric_mode, + this->textord_use_cjk_fp_model, + row, block)) tess_segment_pass1(word, blob_choices); if (!word->tess_failed) { /* @@ -502,12 +897,12 @@ void Tesseract::classify_word_pass1(WERD_RES *word, // word to do if (!word->word->flag(W_REP_CHAR)) { // TODO(daria) delete these hacks when replaced by more generic code. // Convert '' (double single) to " (single double). - fix_quotes(word, blob_choices); + word->fix_quotes(blob_choices); if (tessedit_fix_hyphens) // turn -- to - - fix_hyphens(word, blob_choices); + word->fix_hyphens(blob_choices); word->tess_accepted = tess_acceptable_word(word->best_choice, - word->raw_choice); + word->raw_choice); word->tess_would_adapt = word->best_choice && word->raw_choice && AdaptableWord(word->rebuild_word, @@ -534,9 +929,24 @@ void Tesseract::classify_word_pass1(WERD_RES *word, // word to do rejmap = mapstr.string(); } // Send word to adaptive classifier for training. - word->BestChoiceToCorrectText(unicharset); + word->BestChoiceToCorrectText(); set_word_fonts(word, blob_choices); LearnWord(NULL, rejmap, word); + // Mark misadaptions if running blamer. + if (word->blamer_bundle != NULL && + word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH && + !ChoiceIsCorrect(*word->uch_set, word->best_choice, + word->blamer_bundle->truth_text)) { + word->blamer_bundle->misadaption_debug ="misadapt to word ("; + word->blamer_bundle->misadaption_debug += + word->best_choice->permuter_name(); + word->blamer_bundle->misadaption_debug += "): "; + word->blamer_bundle->FillDebugString( + "", word->best_choice, &(word->blamer_bundle->misadaption_debug)); + if (wordrec_debug_blamer) { + tprintf("%s\n", word->blamer_bundle->misadaption_debug.string()); + } + } } if (tessedit_enable_doc_dict) @@ -548,52 +958,16 @@ void Tesseract::classify_word_pass1(WERD_RES *word, // word to do word->best_choice->set_blob_choices(blob_choices); } -// Helper to switch between the original and new xht word or to discard -// the new xht word, according to accept_new_word. -static void SwitchWordOrDiscard(bool accept_new_word, WERD_RES* word, - WERD_RES* new_word) { - if (accept_new_word) { - // The new_word is deemed superior so put the final results in the real - // word and destroy the old results. 
- word->denorm = new_word->denorm; - delete word->chopped_word; - word->chopped_word = new_word->chopped_word; - new_word->chopped_word = NULL; - delete word->rebuild_word; - word->rebuild_word = new_word->rebuild_word; - new_word->rebuild_word = NULL; - delete word->box_word; - word->box_word = new_word->box_word; - new_word->box_word = NULL; - free_seam_list(word->seam_array); - word->seam_array = new_word->seam_array; - new_word->seam_array = NULL; - word->best_state.move(&new_word->best_state); - word->correct_text.move(&new_word->correct_text); - delete word->best_choice; - word->best_choice = new_word->best_choice; - new_word->best_choice = NULL; - delete word->raw_choice; - word->raw_choice = new_word->raw_choice; - new_word->raw_choice = NULL; - word->reject_map = new_word->reject_map; - word->CopySimpleFields(*new_word); - } else { - // The new_word is no better, so destroy it and cleanup. - new_word->ClearResults(); - } -} - // Helper to report the result of the xheight fix. void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES* word, WERD_RES* new_word) { tprintf("New XHT Match:%s = %s ", word->best_choice->unichar_string().string(), - word->best_choice->debug_string(unicharset).string()); + word->best_choice->debug_string().string()); word->reject_map.print(debug_fp); tprintf(" -> %s = %s ", new_word->best_choice->unichar_string().string(), - new_word->best_choice->debug_string(unicharset).string()); + new_word->best_choice->debug_string().string()); new_word->reject_map.print(debug_fp); tprintf(" %s->%s %s %s\n", word->guessed_x_ht ? "GUESS" : "CERT", @@ -614,6 +988,10 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) { float new_x_ht = ComputeCompatibleXheight(word); if (new_x_ht > 0.0f) { WERD_RES new_x_ht_word(word->word); + if (word->blamer_bundle != NULL) { + new_x_ht_word.blamer_bundle = new BlamerBundle(); + new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle)); + } new_x_ht_word.x_height = new_x_ht; new_x_ht_word.caps_height = 0.0; match_word_pass2(&new_x_ht_word, row, block); @@ -638,9 +1016,10 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) { ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word); } } - SwitchWordOrDiscard(accept_new_x_ht, word, &new_x_ht_word); - if (accept_new_x_ht) + if (accept_new_x_ht) { + word->ConsumeWordResults(&new_x_ht_word); return true; + } } return false; } @@ -651,7 +1030,12 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) { * Control what to do with the word in pass 2 */ -void Tesseract::classify_word_pass2(WERD_RES *word, BLOCK* block, ROW *row) { +void Tesseract::classify_word_pass2(BLOCK* block, ROW *row, WERD_RES *word) { + // Return if we do not want to run Tesseract. 
+ if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY && + tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED) + return; + bool done_this_pass = false; set_global_subloc_code(SUBLOC_NORM); check_debug_pt(word, 30); @@ -691,12 +1075,12 @@ void Tesseract::classify_word_pass2(WERD_RES *word, BLOCK* block, ROW *row) { if (num_upper > 0 && num_lower == 0) word->small_caps = true; } - word->SetScriptPositions(unicharset); + word->SetScriptPositions(); set_global_subloc_code(SUBLOC_NORM); } #ifndef GRAPHICS_DISABLED - if (tessedit_draw_outwords) { + if (tessedit_display_outwords) { if (fx_win == NULL) create_fx_win(); clear_fx_win(); @@ -707,7 +1091,6 @@ void Tesseract::classify_word_pass2(WERD_RES *word, BLOCK* block, ROW *row) { ScrollView::Update(); } #endif - set_global_subloc_code(SUBLOC_NORM); check_debug_pt(word, 50); } @@ -724,21 +1107,23 @@ void Tesseract::match_word_pass2(WERD_RES *word, //word to do BLOCK* block) { BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST(); - if (word->SetupForRecognition(unicharset, classify_bln_numeric_mode, - row, block)) + if (word->SetupForTessRecognition(unicharset, this, BestPix(), + classify_bln_numeric_mode, + this->textord_use_cjk_fp_model, + row, block)) tess_segment_pass2(word, blob_choices); if (!word->tess_failed) { if (!word->word->flag (W_REP_CHAR)) { - fix_quotes(word, blob_choices); + word->fix_quotes(blob_choices); if (tessedit_fix_hyphens) - fix_hyphens(word, blob_choices); + word->fix_hyphens(blob_choices); /* Dont trust fix_quotes! - though I think I've fixed the bug */ if (word->best_choice->length() != word->box_word->length() || word->best_choice->length() != blob_choices->length()) { tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;" " #Blobs=%d; #Choices=%d\n", - word->best_choice->debug_string(unicharset).string(), + word->best_choice->debug_string().string(), word->best_choice->length(), word->box_word->length(), blob_choices->length()); @@ -752,6 +1137,7 @@ void Tesseract::match_word_pass2(WERD_RES *word, //word to do // Save best choices in the WERD_CHOICE if needed word->best_choice->set_blob_choices(blob_choices); + set_word_fonts(word, blob_choices); assert (word->raw_choice != NULL); } @@ -823,7 +1209,7 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) { const WERD_CHOICE &word = *(word_res->best_choice); // Find the frequency of each unique character in the word. - UNICHAR_ID space = unicharset.unichar_to_id(" "); + UNICHAR_ID space = word_res->uch_set->unichar_to_id(" "); SortHelper rep_ch(word.length()); for (int i = 0; i < word.length(); ++i) { if (word.unichar_id(i) != space) @@ -837,7 +1223,7 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) { BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res); if (best_choice == NULL) { tprintf("Failed to find a choice for %s, occurring %d times\n", - unicharset.debug_str(maxch_id).string(), max_count); + word_res->uch_set->debug_str(maxch_id).string(), max_count); return; } word_res->done = TRUE; @@ -862,7 +1248,7 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) { } else { // Just correct existing classification. 
CorrectRepcharChoices(best_choice, word_res); - word_res->best_choice->populate_unichars(unicharset); + word_res->best_choice->populate_unichars(); word_res->reject_map.initialise(word.length()); } } @@ -884,109 +1270,27 @@ void Tesseract::ExplodeRepeatedWord(BLOB_CHOICE* best_choice, bool last_blob = blob_it.at_last(); WERD* blob_word = werd->ConstructFromSingleBlob(first_blob, last_blob, blob_it.extract()); - WERD_RES* rep_word = page_res_it->InsertCloneWord(*word_res, blob_word); + // Note that blamer_bundle (truth information) is not copied, which is + // desirable, since the newly inserted words would not have the original + // bounding box corresponding to the one recorded in truth fields. + WERD_RES* rep_word = + page_res_it->InsertSimpleCloneWord(*word_res, blob_word); // Setup the single char WERD_RES - rep_word->SetupForRecognition(unicharset, false, page_res_it->row()->row, - page_res_it->block()->block); - rep_word->CloneChoppedToRebuild(); - BLOB_CHOICE* blob_choice = new BLOB_CHOICE(*best_choice); - rep_word->FakeClassifyWord(unicharset, 1, &blob_choice); + if (rep_word->SetupForTessRecognition(*word_res->uch_set, this, BestPix(), + false, + this->textord_use_cjk_fp_model, + page_res_it->row()->row, + page_res_it->block()->block)) { + rep_word->CloneChoppedToRebuild(); + BLOB_CHOICE* blob_choice = new BLOB_CHOICE(*best_choice); + rep_word->FakeClassifyWord(1, &blob_choice); + } } page_res_it->DeleteCurrentWord(); } -// TODO(tkielbus) Decide between keeping this behavior here or modifying the -// training data. - -// Utility function for fix_quotes -// Return true if the next character in the string (given the UTF8 length in -// bytes) is a quote character. -static int is_simple_quote(const char* signed_str, int length) { - const unsigned char* str = - reinterpret_cast<const unsigned char*>(signed_str); - //standard 1 byte quotes - return (length == 1 && (*str == '\'' || *str == '`')) || - //utf8 3 bytes curved quotes - (length == 3 && ((*str == 0xe2 && - *(str + 1) == 0x80 && - *(str + 2) == 0x98) || - (*str == 0xe2 && - *(str + 1) == 0x80 && - *(str + 2) == 0x99))); -} - -// Callback helper for fix_quotes returns a double quote if both -// arguments are quote, otherwise INVALID_UNICHAR_ID. -UNICHAR_ID Tesseract::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) { - const char *ch = unicharset.id_to_unichar(id1); - const char *next_ch = unicharset.id_to_unichar(id2); - if (is_simple_quote(ch, strlen(ch)) && - is_simple_quote(next_ch, strlen(next_ch))) - return unicharset.unichar_to_id("\""); - return INVALID_UNICHAR_ID; -} - -/** - * fix_quotes - * - * Change pairs of quotes to double quotes. - */ -void Tesseract::fix_quotes(WERD_RES* word_res, - BLOB_CHOICE_LIST_CLIST* blob_choices) { - if (!unicharset.contains_unichar("\"") || - !unicharset.get_enabled(unicharset.unichar_to_id("\""))) - return; // Don't create it if it is disallowed. - - word_res->ConditionalBlobMerge( - unicharset, - NewPermanentTessCallback(this, &Tesseract::BothQuotes), - NULL, - blob_choices); -} - -// Callback helper for fix_hyphens returns UNICHAR_ID of - if both -// arguments are hyphen, otherwise INVALID_UNICHAR_ID.
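// The removed is_simple_quote() above matches ASCII ' and ` plus the UTF-8
// curly single quotes: U+2018 and U+2019 encode to the three-byte sequences
// E2 80 98 and E2 80 99, which is exactly what the byte comparisons test.
// A short self-check of those encodings (assumes a UTF-8 execution charset):
#include <cstdio>
#include <cstring>

int main() {
  const unsigned char left[] = {0xe2, 0x80, 0x98};   // U+2018
  const unsigned char right[] = {0xe2, 0x80, 0x99};  // U+2019
  printf("left ok=%d right ok=%d\n",
         memcmp(left, "\u2018", 3) == 0, memcmp(right, "\u2019", 3) == 0);
  return 0;
}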
-UNICHAR_ID Tesseract::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) { - const char *ch = unicharset.id_to_unichar(id1); - const char *next_ch = unicharset.id_to_unichar(id2); - if (strlen(ch) == 1 && strlen(next_ch) == 1 && - (*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~')) - return unicharset.unichar_to_id("-"); - return INVALID_UNICHAR_ID; -} - -// Callback helper for fix_hyphens returns true if box1 and box2 overlap -// (assuming both on the same textline, are in order and a chopped em dash.) -bool Tesseract::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) { - return box1.right() >= box2.left(); -} - -/** - * fix_hyphens - * - * Change pairs of hyphens to a single hyphen if the bounding boxes touch - * Typically a long dash which has been segmented. - */ -void Tesseract::fix_hyphens(WERD_RES *word_res, - BLOB_CHOICE_LIST_CLIST *blob_choices) { - if (!unicharset.contains_unichar("-") || - !unicharset.get_enabled(unicharset.unichar_to_id("-"))) - return; // Don't create it if it is disallowed. - - word_res->ConditionalBlobMerge( - unicharset, - NewPermanentTessCallback(this, &Tesseract::BothHyphens), - NewPermanentTessCallback(this, &Tesseract::HyphenBoxesOverlap), - blob_choices); -} -} // namespace tesseract - - - -namespace tesseract { - -ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const char *s, - const char *lengths) { +ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string( + const UNICHARSET& char_set, const char *s, const char *lengths) { int i = 0; int offset = 0; int leading_punct_count; @@ -999,22 +1303,20 @@ ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const char *s, /* Single Leading punctuation char*/ - if ((s[offset] != '\0') && (STRING (chs_leading_punct).contains (s[offset]))) + if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset])) offset += lengths[i++]; leading_punct_count = i; /* Initial cap */ - while ((s[offset] != '\0') && - unicharset.get_isupper(s + offset, lengths[i])) { + while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) { offset += lengths[i++]; upper_count++; } - if (upper_count > 1) + if (upper_count > 1) { word_type = AC_UPPER_CASE; - else { + } else { /* Lower case word, possibly with an initial cap */ - while ((s[offset] != '\0') && - unicharset.get_islower (s + offset, lengths[i])) { + while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) { offset += lengths[i++]; } if (i - leading_punct_count < quality_min_initial_alphas_reqd) @@ -1028,14 +1330,13 @@ ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const char *s, offset += lengths[i++]; if (s[offset] != '\0') { while ((s[offset] != '\0') && - unicharset.get_islower(s + offset, lengths[i])) { + char_set.get_islower(s + offset, lengths[i])) { offset += lengths[i++]; } if (i < hyphen_pos + 3) goto not_a_word; } - } - else { + } else { /* Allow "'s" in NON hyphenated lower case words */ if (lengths[i] == 1 && (s[offset] == '\'') && lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) { @@ -1050,12 +1351,12 @@ ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const char *s, } /* Up to two different, constrained trailing punctuation chars */ - if (lengths[i] == 1 && (s[offset] != '\0') && - (STRING (chs_trailing_punct1).contains (s[offset]))) + if (lengths[i] == 1 && s[offset] != '\0' && + STRING(chs_trailing_punct1).contains(s[offset])) offset += lengths[i++]; - if (lengths[i] == 1 && (s[offset] != '\0') && i > 0 && - (s[offset - lengths[i - 1]] != s[offset]) && - (STRING 
(chs_trailing_punct2).contains (s[offset]))) + if (lengths[i] == 1 && s[offset] != '\0' && i > 0 && + s[offset - lengths[i - 1]] != s[offset] && + STRING(chs_trailing_punct2).contains (s[offset])) offset += lengths[i++]; if (s[offset] != '\0') @@ -1067,20 +1368,20 @@ ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const char *s, /* Look for abbreviation string */ i = 0; offset = 0; - if (s[0] != '\0' && unicharset.get_isupper (s, lengths[0])) { + if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) { word_type = AC_UC_ABBREV; - while ((s[offset] != '\0') && - unicharset.get_isupper(s + offset, lengths[i]) && - (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) { + while (s[offset] != '\0' && + char_set.get_isupper(s + offset, lengths[i]) && + lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') { offset += lengths[i++]; offset += lengths[i++]; } } - else if (s[0] != '\0' && unicharset.get_islower (s, lengths[0])) { + else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) { word_type = AC_LC_ABBREV; - while ((s[offset] != '\0') && - unicharset.get_islower(s + offset, lengths[i]) && - (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) { + while (s[offset] != '\0' && + char_set.get_islower(s + offset, lengths[i]) && + lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') { offset += lengths[i++]; offset += lengths[i++]; } @@ -1229,12 +1530,10 @@ void Tesseract::set_word_fonts(WERD_RES *word, for (char_it.mark_cycle_pt(), index = 0; !char_it.cycled_list(); ++index, char_it.forward()) { UNICHAR_ID word_ch_id = word->best_choice->unichar_id(index); - if (word_ch_id >= PreTrainedTemplates->NumClasses) - return; // This must be a cube word. choice_it.set_to_list(char_it.data()); if (tessedit_debug_fonts) { - tprintf("Examining fonts in %s\n", word->best_choice->debug_string( - getDict().getUnicharset()).string()); + tprintf("Examining fonts in %s\n", + word->best_choice->debug_string().string()); } for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) { @@ -1242,7 +1541,7 @@ void Tesseract::set_word_fonts(WERD_RES *word, if (blob_ch_id == word_ch_id) { if (tessedit_debug_fonts) { tprintf("%s font %s (%d) font2 %s (%d)\n", - getDict().getUnicharset().id_to_unichar(blob_ch_id), + word->uch_set->id_to_unichar(blob_ch_id), choice_it.data()->fontinfo_id() < 0 ? "unknown" : fontinfo_table_.get(choice_it.data()->fontinfo_id()).name, choice_it.data()->fontinfo_id(), @@ -1261,19 +1560,22 @@ void Tesseract::set_word_fonts(WERD_RES *word, } } } - find_modal_font(&fonts, &word->fontinfo_id, &word->fontinfo_id_count); - find_modal_font(&fonts, &word->fontinfo_id2, &word->fontinfo_id2_count); + inT16 font_id1, font_id2; + find_modal_font(&fonts, &font_id1, &word->fontinfo_id_count); + find_modal_font(&fonts, &font_id2, &word->fontinfo_id2_count); + word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : NULL; + word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL; // All the blobs get the word's best choice font. 
for (int i = 0; i < word->best_choice->length(); ++i) { - word->best_choice_fontinfo_ids.push_back(word->fontinfo_id); + word->best_choice_fontinfo_ids.push_back(font_id1); } if (word->fontinfo_id_count > 0) { - FontInfo fi = fontinfo_table_.get(word->fontinfo_id); + FontInfo fi = fontinfo_table_.get(font_id1); if (tessedit_debug_fonts) { if (word->fontinfo_id2_count > 0) { tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n", fi.name, word->fontinfo_id_count, - fontinfo_table_.get(word->fontinfo_id2).name, + fontinfo_table_.get(font_id2).name, word->fontinfo_id2_count); } else { tprintf("Word modal font=%s, score=%d. No 2nd choice\n", @@ -1293,47 +1595,58 @@ void Tesseract::set_word_fonts(WERD_RES *word, * Smooth the fonts for the document. */ -void Tesseract::font_recognition_pass( //good chars in word - PAGE_RES_IT &page_res_it) { - inT32 length; //of word - inT32 count; //of a feature - inT16 doc_font; //modal font - inT8 doc_font_count; //modal font - WERD_RES *word; //current word - STATS doc_fonts (0, get_fontinfo_table().size() ? - get_fontinfo_table().size() : 32); // font counters +void Tesseract::font_recognition_pass(PAGE_RES* page_res) { + PAGE_RES_IT page_res_it(page_res); + WERD_RES *word; // current word + STATS doc_fonts(0, font_table_size_); // font counters - page_res_it.restart_page(); - while (page_res_it.word() != NULL) { + // Gather font id statistics. + for (page_res_it.restart_page(); page_res_it.word() != NULL; + page_res_it.forward()) { word = page_res_it.word(); - set_word_fonts(word, word->best_choice->blob_choices()); - if (!save_best_choices) { // set_blob_choices() does a deep clear - word->best_choice->set_blob_choices(NULL); + if (word->fontinfo != NULL) { + doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count); + } + if (word->fontinfo2 != NULL) { + doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count); } - doc_fonts.add(word->fontinfo_id, word->fontinfo_id_count); - doc_fonts.add(word->fontinfo_id2, word->fontinfo_id2_count); - page_res_it.forward(); } + inT16 doc_font; // modal font + inT8 doc_font_count; // modal font find_modal_font(&doc_fonts, &doc_font, &doc_font_count); if (doc_font_count == 0) return; - FontInfo fi = fontinfo_table_.get(doc_font); + // Get the modal font pointer. + const FontInfo* modal_font = NULL; + for (page_res_it.restart_page(); page_res_it.word() != NULL; + page_res_it.forward()) { + word = page_res_it.word(); + if (word->fontinfo != NULL && word->fontinfo->universal_id == doc_font) { + modal_font = word->fontinfo; + break; + } + if (word->fontinfo2 != NULL && word->fontinfo2->universal_id == doc_font) { + modal_font = word->fontinfo2; + break; + } + } + ASSERT_HOST(modal_font != NULL); - page_res_it.restart_page (); - while (page_res_it.word () != NULL) { - word = page_res_it.word (); - length = word->best_choice->length(); + // Assign modal font to weak words. + for (page_res_it.restart_page(); page_res_it.word() != NULL; + page_res_it.forward()) { + word = page_res_it.word(); + int length = word->best_choice->length(); // 1st choices got 2 pts, so we need to halve the score for the mode. - count = (word->fontinfo_id_count + 1) / 2; + int count = (word->fontinfo_id_count + 1) / 2; if (!(count == length || (length > 3 && count >= length * 3 / 4))) { - word->fontinfo_id = doc_font; + word->fontinfo = modal_font; // Counts only get 1 as it came from the doc. word->fontinfo_id_count = 1; - word->italic = fi.is_italic() ? 1 : -1; - word->bold = fi.is_bold() ? 
1 : -1; + word->italic = modal_font->is_italic() ? 1 : -1; + word->bold = modal_font->is_bold() ? 1 : -1; } - page_res_it.forward(); } } diff --git a/ccmain/cube_control.cpp b/ccmain/cube_control.cpp index 305581a4c8..cd5cc74191 100644 --- a/ccmain/cube_control.cpp +++ b/ccmain/cube_control.cpp @@ -157,13 +157,16 @@ static WERD_CHOICE *create_werd_choice( CharSet* cube_char_set ) { // Insert unichar ids into WERD_CHOICE - WERD_CHOICE *werd_choice = new WERD_CHOICE(num_chars); + WERD_CHOICE *werd_choice = new WERD_CHOICE(&unicharset, num_chars); + // within a word, cube recognizes the word in reading order. + werd_choice->set_unichars_in_script_order(true); ASSERT_HOST(werd_choice != NULL); UNICHAR_ID uch_id; for (int i = 0; i < num_chars; ++i) { uch_id = cube_char_set->UnicharID(char_samples[i]->StrLabel()); if (uch_id != INVALID_UNICHAR_ID) - werd_choice->append_unichar_id_space_allocated(uch_id, 1, 0.0, certainty); + werd_choice->append_unichar_id_space_allocated( + uch_id, 1, 0.0, certainty); } BLOB_CHOICE *blob_choice; @@ -179,12 +182,12 @@ static WERD_CHOICE *create_werd_choice( choices_list_it.set_to_list(choices_list); // Add a single BLOB_CHOICE to the list blob_choice = new BLOB_CHOICE(werd_choice->unichar_id(i), - 0.0, certainty, -1, -1, 0); + 0.0, certainty, -1, -1, 0, 0, 0, false); choices_list_it.add_after_then_move(blob_choice); // Add list to the clist blob_choices_it.add_to_end(choices_list); } - werd_choice->populate_unichars(unicharset); + werd_choice->populate_unichars(); werd_choice->set_certainty(certainty); werd_choice->set_blob_choices(blob_choices); return werd_choice; @@ -231,115 +234,153 @@ bool Tesseract::init_cube_objects(bool load_combiner, } /********************************************************************** - * run_cube + * run_cube_combiner * - * Iterate through tesseract's results and call cube on each word. - * If the combiner is present, optionally run the tesseract-cube - * combiner on each word. + * Iterates through tesseract's results and calls cube on each word, + * combining the results with the existing tesseract result. **********************************************************************/ -void Tesseract::run_cube( - PAGE_RES *page_res // page structure - ) { - ASSERT_HOST(cube_cntxt_ != NULL); - if (!pix_binary_) { - if (cube_debug_level > 0) - tprintf("Tesseract::run_cube(): NULL binary image.\n"); - return; - } - if (!page_res) +void Tesseract::run_cube_combiner(PAGE_RES *page_res) { + if (page_res == NULL || tess_cube_combiner_ == NULL) return; PAGE_RES_IT page_res_it(page_res); - page_res_it.restart_page(); - // Iterate through the word results and call cube on each word. - CubeObject *cube_obj; for (page_res_it.restart_page(); page_res_it.word () != NULL; page_res_it.forward()) { WERD_RES* word = page_res_it.word(); - TBOX word_box = word->word->bounding_box(); - // TODO(rays): Instead of page_res_it.block()->block maybe use - // word->denorm.block() once TODO in - // Tesseract::recog_all_words() is addressed. - const BLOCK* block = page_res_it.block()->block; - if (block != NULL && (block->re_rotation().x() != 1.0f || - block->re_rotation().y() != 0.0f)) { - // TODO(rays) We have to rotate the bounding box to get the true coords. - // This will be achieved in the future via DENORM. - // In the mean time, cube can't process this word. - if (cube_debug_level > 0) { - tprintf("Cube can't process rotated word at:"); - word_box.print(); - } - if (word->best_choice == NULL) - page_res_it.DeleteCurrentWord(); // Nobody has an answer. 
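The rewritten loop body that follows first gates cube on tesseract's own confidence: cube and the combiner only run when the existing result is weak. A minimal sketch of that gate, assuming a simple affine probability-to-certainty mapping (the exact form of convert_prob_to_tess_certainty is defined elsewhere in cube_control.cpp and may differ):

    // Illustrative only: the affine form and the constant 20.0 are
    // assumptions for this sketch, not values quoted from the patch.
    static float ProbToTessCertaintySketch(double prob) {
      return static_cast<float>((prob - 1.0) * 20.0);  // prob 1.0 maps to 0.0.
    }
    // Under that mapping, a CombinerRunThresh() of 0.8 becomes a gate of
    // -4.0: any word whose best_choice->certainty() is already >= -4.0
    // skips cube and the combiner entirely.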
+    // Skip cube entirely if tesseract's certainty is greater than threshold.
+    int combiner_run_thresh = convert_prob_to_tess_certainty(
+        cube_cntxt_->Params()->CombinerRunThresh());
+    if (word->best_choice->certainty() >= combiner_run_thresh) {
       continue;
     }
-    cube_obj = new tesseract::CubeObject(cube_cntxt_, pix_binary_,
-                                         word_box.left(),
-                                         pix_binary_->h - word_box.top(),
-                                         word_box.width(), word_box.height());
-    cube_recognize(cube_obj, &page_res_it);
+    // Use the same language as Tesseract used for the word.
+    Tesseract* lang_tess = word->tesseract;
+
+    // Setup a trial WERD_RES in which to classify with cube.
+    WERD_RES cube_word;
+    cube_word.InitForRetryRecognition(*word);
+    CubeObject *cube_obj = lang_tess->cube_recognize_word(
+        page_res_it.block()->block, &cube_word);
+    if (cube_obj != NULL)
+      lang_tess->cube_combine_word(cube_obj, &cube_word, word);
     delete cube_obj;
   }
 }

 /**********************************************************************
- * cube_recognize
+ * cube_word_pass1
  *
- * Call cube on the current word, optionally run the tess-cube combiner, and
- * modify the tesseract result if cube wins. If cube fails to run, or
- * if tesseract wins, leave the tesseract result unchanged. If the
- * combiner is not instantiated, always use cube's result.
+ * Recognizes a single word using (only) cube. Compatible with
+ * Tesseract's classify_word_pass1/classify_word_pass2.
+ **********************************************************************/
+void Tesseract::cube_word_pass1(BLOCK* block, ROW *row, WERD_RES *word) {
+  CubeObject *cube_obj = cube_recognize_word(block, word);
+  delete cube_obj;
+}
+
+/**********************************************************************
+ * cube_recognize_word
  *
+ * Recognizes a single word with cube, as classify_word_pass1 does, but
+ * also returns the CubeObject in case the combiner is needed.
 **********************************************************************/
-void Tesseract::cube_recognize(
-    CubeObject *cube_obj,
-    PAGE_RES_IT *page_res_it
-    ) {
-  // Retrieve tesseract's data structure for the current word.
-  WERD_RES *tess_werd_res = page_res_it->word();
-  if (!tess_werd_res->best_choice && tess_cube_combiner_ != NULL) {
-    if (cube_debug_level > 0)
-      tprintf("Cube WARNING (Tesseract::cube_recognize): Cannot run combiner "
-              "without a tess result.\n");
-    return;
+CubeObject* Tesseract::cube_recognize_word(BLOCK* block, WERD_RES* word) {
+  if (!cube_binary_ || !cube_cntxt_) {
+    if (cube_debug_level > 0 && !cube_binary_)
+      tprintf("Tesseract::cube_recognize_word(): NULL binary image.\n");
+    word->SetupFake(unicharset);
+    return NULL;
   }
+  TBOX word_box = word->word->bounding_box();
+  if (block != NULL && (block->re_rotation().x() != 1.0f ||
+                        block->re_rotation().y() != 0.0f)) {
+    // TODO(rays) We have to rotate the bounding box to get the true coords.
+    // This will be achieved in the future via DENORM.
+    // In the meantime, cube can't process this word.
+    if (cube_debug_level > 0) {
+      tprintf("Cube can't process rotated word at:");
+      word_box.print();
+    }
+    word->SetupFake(unicharset);
+    return NULL;
+  }
+  CubeObject* cube_obj = new tesseract::CubeObject(
+      cube_cntxt_, cube_binary_, word_box.left(),
+      pixGetHeight(cube_binary_) - word_box.top(),
+      word_box.width(), word_box.height());
+  if (!cube_recognize(cube_obj, block, word)) {
+    delete cube_obj;
+    return NULL;
+  }
+  return cube_obj;
+}

-  // Skip cube entirely if combiner is present but tesseract's
-  // certainty is greater than threshold.
- int combiner_run_thresh = convert_prob_to_tess_certainty( - cube_cntxt_->Params()->CombinerRunThresh()); - if (tess_cube_combiner_ != NULL && - (tess_werd_res->best_choice->certainty() >= combiner_run_thresh)) { +/********************************************************************** + * cube_combine_word + * + * Combines the cube and tesseract results for a single word, leaving the + * result in tess_word. + **********************************************************************/ +void Tesseract::cube_combine_word(CubeObject* cube_obj, WERD_RES* cube_word, + WERD_RES* tess_word) { + float combiner_prob = tess_cube_combiner_->CombineResults(tess_word, + cube_obj); + // If combiner probability is greater than tess/cube combiner + // classifier threshold, i.e. tesseract wins, then just return the + // tesseract result unchanged, as the combiner knows nothing about how + // correct the answer is. If cube and tesseract agree, then improve the + // scores before returning. + WERD_CHOICE* tess_best = tess_word->best_choice; + WERD_CHOICE* cube_best = cube_word->best_choice; + if (cube_debug_level || classify_debug_level) { + tprintf("Combiner prob = %g vs threshold %g\n", + combiner_prob, cube_cntxt_->Params()->CombinerClassifierThresh()); + } + if (combiner_prob >= + cube_cntxt_->Params()->CombinerClassifierThresh()) { + if (tess_best->unichar_string() == cube_best->unichar_string()) { + // Cube and tess agree, so improve the scores. + tess_best->set_rating(tess_best->rating() / 2); + tess_best->set_certainty(tess_best->certainty() / 2); + } return; } + // Cube wins. + // It is better for the language combiner to have all tesseract scores, + // so put them in the cube result. + cube_best->set_rating(tess_best->rating()); + cube_best->set_certainty(tess_best->certainty()); + if (cube_debug_level || classify_debug_level) { + tprintf("Cube INFO: tesseract result replaced by cube: %s -> %s\n", + tess_best->unichar_string().string(), + cube_best->unichar_string().string()); + } + tess_word->ConsumeWordResults(cube_word); +} + +/********************************************************************** + * cube_recognize + * + * Call cube on the current word, and write the result to word. + * Sets up a fake result and returns false if something goes wrong. + **********************************************************************/ +bool Tesseract::cube_recognize(CubeObject *cube_obj, BLOCK* block, + WERD_RES *word) { + if (!word->SetupForCubeRecognition(unicharset, this, block)) { + return false; // Graphics block. + } // Run cube WordAltList *cube_alt_list = cube_obj->RecognizeWord(); if (!cube_alt_list || cube_alt_list->AltCount() <= 0) { if (cube_debug_level > 0) { tprintf("Cube returned nothing for word at:"); - tess_werd_res->word->bounding_box().print(); - } - if (tess_werd_res->best_choice == NULL) { - // Nobody has recognized it, so pretend it doesn't exist. - if (cube_debug_level > 0) { - tprintf("Deleted word not recognized by cube and/or tesseract at:"); - tess_werd_res->word->bounding_box().print(); - } - page_res_it->DeleteCurrentWord(); + word->word->bounding_box().print(); } - return; + word->SetupFake(unicharset); + return false; } - // At this point we *could* run the combiner and bail out if - // Tesseract wins, but that would require instantiating a new - // CubeObject to avoid losing the original recognition results - // (e.g., beam search lattice) stored with the CubeObject. 
Instead, - // we first extract the state we need from the current recognition - // and then reuse the CubeObject so that the combiner does not need - // to recompute the image's connected components, segmentation, etc. - // Get cube's best result and its probability, mapped to tesseract's // certainty range char_32 *cube_best_32 = cube_alt_list->Alt(0); @@ -357,14 +398,15 @@ void Tesseract::cube_recognize( && cube_debug_level > 0) { tprintf("Cube WARNING (Tesseract::cube_recognize): Cannot extract " "cube state.\n"); - return; + word->SetupFake(unicharset); + return false; } // Convert cube's character bounding boxes to a BoxWord. BoxWord cube_box_word; - TBOX tess_word_box = tess_werd_res->word->bounding_box(); - if (tess_werd_res->denorm.block() != NULL) - tess_word_box.rotate(tess_werd_res->denorm.block()->re_rotation()); + TBOX tess_word_box = word->word->bounding_box(); + if (word->denorm.block() != NULL) + tess_word_box.rotate(word->denorm.block()->re_rotation()); bool box_word_success = create_cube_box_word(char_boxes, num_chars, tess_word_box, &cube_box_word); @@ -374,7 +416,8 @@ void Tesseract::cube_recognize( tprintf("Cube WARNING (Tesseract::cube_recognize): Could not " "create cube BoxWord\n"); } - return; + word->SetupFake(unicharset); + return false; } // Create cube's best choice. @@ -388,36 +431,19 @@ void Tesseract::cube_recognize( tprintf("Cube WARNING (Tesseract::cube_recognize): Could not " "create cube WERD_CHOICE\n"); } - return; + word->SetupFake(unicharset); + return false; } - - // Run combiner if present, now that we're free to reuse the CubeObject. - if (tess_cube_combiner_ != NULL) { - float combiner_prob = tess_cube_combiner_->CombineResults(tess_werd_res, - cube_obj); - // If combiner probability is greater than tess/cube combiner - // classifier threshold, i.e. tesseract wins, then reset the WERD_RES - // certainty to the combiner certainty and return. Note that when - // tesseract and cube agree, the combiner probability is 1.0, so - // the final WERD_RES certainty will be maximized to 0.0. 
- if (combiner_prob >= - cube_cntxt_->Params()->CombinerClassifierThresh()) { - float combiner_certainty = convert_prob_to_tess_certainty(combiner_prob); - tess_werd_res->best_choice->set_certainty(combiner_certainty); - delete cube_werd_choice; - return; - } - if (cube_debug_level > 5) { - tprintf("Cube INFO: tesseract result replaced by cube: " - "%s -> %s\n", - tess_werd_res->best_choice->unichar_string().string(), - cube_best_str.c_str()); - } + if (cube_debug_level || classify_debug_level) { + tprintf("Cube result: %s r=%g, c=%g\n", + cube_werd_choice->unichar_string().string(), + cube_werd_choice->rating(), + cube_werd_choice->certainty()); } // Fill tesseract result's fields with cube results - fill_werd_res(cube_box_word, cube_werd_choice, cube_best_str.c_str(), - page_res_it); + fill_werd_res(cube_box_word, cube_werd_choice, cube_best_str.c_str(), word); + return true; } /********************************************************************** @@ -429,16 +455,14 @@ void Tesseract::cube_recognize( void Tesseract::fill_werd_res(const BoxWord& cube_box_word, WERD_CHOICE* cube_werd_choice, const char* cube_best_str, - PAGE_RES_IT *page_res_it) { - WERD_RES *tess_werd_res = page_res_it->word(); - + WERD_RES* tess_werd_res) { // Replace tesseract results's best choice with cube's - delete tess_werd_res->best_choice; tess_werd_res->best_choice = cube_werd_choice; + tess_werd_res->raw_choice = new WERD_CHOICE(*cube_werd_choice); delete tess_werd_res->box_word; tess_werd_res->box_word = new BoxWord(cube_box_word); - tess_werd_res->box_word->ClipToOriginalWord(page_res_it->block()->block, + tess_werd_res->box_word->ClipToOriginalWord(tess_werd_res->denorm.block(), tess_werd_res->word); // Fill text and remaining fields tess_werd_res->word->set_text(cube_best_str); diff --git a/ccmain/cube_reco_context.cpp b/ccmain/cube_reco_context.cpp index 0f2ff63df4..f6d960bb85 100644 --- a/ccmain/cube_reco_context.cpp +++ b/ccmain/cube_reco_context.cpp @@ -109,6 +109,7 @@ bool CubeRecoContext::GetDataFilePath(string *path) const { bool CubeRecoContext::Load(TessdataManager *tessdata_manager, UNICHARSET *tess_unicharset) { ASSERT_HOST(tess_obj_ != NULL); + tess_unicharset_ = tess_unicharset; string data_file_path; // Get the data file path. diff --git a/ccmain/cube_reco_context.h b/ccmain/cube_reco_context.h index 822ef62ce7..811a6308be 100644 --- a/ccmain/cube_reco_context.h +++ b/ccmain/cube_reco_context.h @@ -56,6 +56,7 @@ class CubeRecoContext { // accessor functions inline const string & Lang() const { return lang_; } inline CharSet *CharacterSet() const { return char_set_; } + const UNICHARSET *TessUnicharset() const { return tess_unicharset_; } inline CharClassifier *Classifier() const { return char_classifier_; } inline WordSizeModel *SizeModel() const { return word_size_model_; } inline CharBigrams *Bigrams() const { return char_bigrams_; } @@ -135,6 +136,7 @@ class CubeRecoContext { bool loaded_; string lang_; CharSet *char_set_; + UNICHARSET *tess_unicharset_; WordSizeModel *word_size_model_; CharClassifier *char_classifier_; CharBigrams *char_bigrams_; diff --git a/ccmain/cubeclassifier.cpp b/ccmain/cubeclassifier.cpp new file mode 100644 index 0000000000..12d57f17d0 --- /dev/null +++ b/ccmain/cubeclassifier.cpp @@ -0,0 +1,136 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// Author: rays@google.com (Ray Smith) +/////////////////////////////////////////////////////////////////////// +// File: cubeclassifier.cpp +// Description: Cube implementation of a ShapeClassifier. 
+// Author: Ray Smith
+// Created: Wed Nov 23 10:39:45 PST 2011
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "cubeclassifier.h"
+
+#include "char_altlist.h"
+#include "char_set.h"
+#include "cube_object.h"
+#include "cube_reco_context.h"
+#include "tessclassifier.h"
+#include "tesseractclass.h"
+#include "trainingsample.h"
+#include "unicharset.h"
+
+namespace tesseract {
+
+CubeClassifier::CubeClassifier(tesseract::Tesseract* tesseract)
+    : cube_cntxt_(tesseract->GetCubeRecoContext()),
+      shape_table_(*tesseract->shape_table()) {
+}
+CubeClassifier::~CubeClassifier() {
+}
+
+// Classifies the given [training] sample, writing to results.
+// See ShapeClassifier for a full description.
+int CubeClassifier::ClassifySample(const TrainingSample& sample,
+                                   Pix* page_pix, int debug, int keep_this,
+                                   GenericVector<ShapeRating>* results) {
+  results->clear();
+  if (page_pix == NULL) return 0;
+
+  ASSERT_HOST(cube_cntxt_ != NULL);
+  const TBOX& char_box = sample.bounding_box();
+  CubeObject* cube_obj = new tesseract::CubeObject(
+      cube_cntxt_, page_pix, char_box.left(),
+      pixGetHeight(page_pix) - char_box.top(),
+      char_box.width(), char_box.height());
+  CharAltList* alt_list = cube_obj->RecognizeChar();
+  CharSet* char_set = cube_cntxt_->CharacterSet();
+  if (alt_list != NULL) {
+    // RecognizeChar can return NULL, so only sort a list we actually have.
+    alt_list->Sort();
+    for (int i = 0; i < alt_list->AltCount(); ++i) {
+      // Convert cube representation to a shape_id.
+      int alt_id = alt_list->Alt(i);
+      int unichar_id = char_set->UnicharID(char_set->ClassString(alt_id));
+      int shape_id = shape_table_.FindShape(unichar_id, -1);
+      if (shape_id >= 0)
+        results->push_back(ShapeRating(shape_id, alt_list->AltProb(i)));
+    }
+    delete alt_list;
+  }
+  delete cube_obj;
+  return results->size();
+}
+
+// Provides access to the ShapeTable that this classifier works with.
+const ShapeTable* CubeClassifier::GetShapeTable() const {
+  return &shape_table_;
+}
+
+CubeTessClassifier::CubeTessClassifier(tesseract::Tesseract* tesseract)
+    : cube_cntxt_(tesseract->GetCubeRecoContext()),
+      shape_table_(*tesseract->shape_table()),
+      pruner_(new TessClassifier(true, tesseract)) {
+}
+CubeTessClassifier::~CubeTessClassifier() {
+  delete pruner_;
+}
+
+// Classifies the given [training] sample, writing to results.
+// See ShapeClassifier for a full description.
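+// In outline: the TessClassifier pruner proposes shape candidates first;
+// each candidate's rating is then replaced by the best cube probability
+// over the unichars its shape contains, and the list is re-sorted.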
+int CubeTessClassifier::ClassifySample(const TrainingSample& sample,
+                                       Pix* page_pix, int debug, int keep_this,
+                                       GenericVector<ShapeRating>* results) {
+  int num_results = pruner_->ClassifySample(sample, page_pix, debug, keep_this,
+                                            results);
+  if (page_pix == NULL) return num_results;
+
+  ASSERT_HOST(cube_cntxt_ != NULL);
+  const TBOX& char_box = sample.bounding_box();
+  CubeObject* cube_obj = new tesseract::CubeObject(
+      cube_cntxt_, page_pix, char_box.left(),
+      pixGetHeight(page_pix) - char_box.top(),
+      char_box.width(), char_box.height());
+  CharAltList* alt_list = cube_obj->RecognizeChar();
+  CharSet* char_set = cube_cntxt_->CharacterSet();
+  if (alt_list != NULL) {
+    for (int r = 0; r < num_results; ++r) {
+      const Shape& shape = shape_table_.GetShape((*results)[r].shape_id);
+      // Get the best cube probability of all unichars in the shape.
+      double best_prob = 0.0;
+      for (int i = 0; i < alt_list->AltCount(); ++i) {
+        int alt_id = alt_list->Alt(i);
+        int unichar_id = char_set->UnicharID(char_set->ClassString(alt_id));
+        if (shape.ContainsUnichar(unichar_id) &&
+            alt_list->AltProb(i) > best_prob) {
+          best_prob = alt_list->AltProb(i);
+        }
+      }
+      (*results)[r].rating = best_prob;
+    }
+    delete alt_list;
+    // Re-sort by rating.
+    results->sort(&ShapeRating::SortDescendingRating);
+  }
+  delete cube_obj;
+  return results->size();
+}
+
+// Provides access to the ShapeTable that this classifier works with.
+const ShapeTable* CubeTessClassifier::GetShapeTable() const {
+  return &shape_table_;
+}
+
+}  // namespace tesseract
+
+
+
diff --git a/ccmain/cubeclassifier.h b/ccmain/cubeclassifier.h
new file mode 100644
index 0000000000..6359cc51f7
--- /dev/null
+++ b/ccmain/cubeclassifier.h
@@ -0,0 +1,79 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+///////////////////////////////////////////////////////////////////////
+// File: cubeclassifier.h
+// Description: Cube implementation of a ShapeClassifier.
+// Author: Ray Smith
+// Created: Wed Nov 23 10:36:32 PST 2011
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef THIRD_PARTY_TESSERACT_CCMAIN_CUBECLASSIFIER_H_
+#define THIRD_PARTY_TESSERACT_CCMAIN_CUBECLASSIFIER_H_
+
+#include "shapeclassifier.h"
+
+namespace tesseract {
+
+class Classify;
+class CubeRecoContext;
+class ShapeTable;
+class TessClassifier;
+class Tesseract;
+class TrainingSample;
+
+// Cube implementation of a ShapeClassifier.
+class CubeClassifier : public ShapeClassifier {
+ public:
+  explicit CubeClassifier(Tesseract* tesseract);
+  virtual ~CubeClassifier();
+
+  // Classifies the given [training] sample, writing to results.
+  // See ShapeClassifier for a full description.
+  virtual int ClassifySample(const TrainingSample& sample, Pix* page_pix,
+                             int debug, int keep_this,
+                             GenericVector<ShapeRating>* results);
+  // Provides access to the ShapeTable that this classifier works with.
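+  // The table is owned by the Tesseract instance passed to the
+  // constructor, so it must outlive this classifier.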
+  virtual const ShapeTable* GetShapeTable() const;
+
+ private:
+  // Cube objects.
+  CubeRecoContext* cube_cntxt_;
+  const ShapeTable& shape_table_;
+};
+
+// Combination of Tesseract class pruner with scoring by cube.
+class CubeTessClassifier : public ShapeClassifier {
+ public:
+  explicit CubeTessClassifier(Tesseract* tesseract);
+  virtual ~CubeTessClassifier();
+
+  // Classifies the given [training] sample, writing to results.
+  // See ShapeClassifier for a full description.
+  virtual int ClassifySample(const TrainingSample& sample, Pix* page_pix,
+                             int debug, int keep_this,
+                             GenericVector<ShapeRating>* results);
+  // Provides access to the ShapeTable that this classifier works with.
+  virtual const ShapeTable* GetShapeTable() const;
+
+ private:
+  // Cube objects.
+  CubeRecoContext* cube_cntxt_;
+  const ShapeTable& shape_table_;
+  TessClassifier* pruner_;
+};
+
+}  // namespace tesseract
+
+#endif  /* THIRD_PARTY_TESSERACT_CCMAIN_CUBECLASSIFIER_H_ */
diff --git a/ccmain/docqual.cpp b/ccmain/docqual.cpp
index adcb697768..c8fed20fc9 100644
--- a/ccmain/docqual.cpp
+++ b/ccmain/docqual.cpp
@@ -81,12 +81,13 @@ inT16 Tesseract::word_outline_errs(WERD_RES *word) {
   inT16 i = 0;
   inT16 err_count = 0;

-  TBLOB* blob = word->rebuild_word->blobs;
-
-  for (; blob != NULL; blob = blob->next) {
-    err_count += count_outline_errs(word->best_choice->unichar_string()[i],
-                                    blob->NumOutlines());
-    i++;
+  if (word->rebuild_word != NULL) {
+    TBLOB* blob = word->rebuild_word->blobs;
+    for (; blob != NULL; blob = blob->next) {
+      err_count += count_outline_errs(word->best_choice->unichar_string()[i],
+                                      blob->NumOutlines());
+      i++;
+    }
   }
   return err_count;
 }
@@ -185,12 +186,13 @@ void Tesseract::unrej_good_quality_words(  //unreject potential
           (float) page_res_it.row ()->char_count) <= quality_rowrej_pc)) {
       word = page_res_it.word ();
-      if (word->reject_map.quality_recoverable_rejects () &&
-        (tessedit_unrej_any_wd ||
-        acceptable_word_string (word->best_choice->unichar_string().string(),
-                                word->best_choice->unichar_lengths().string())
-         != AC_UNACCEPTABLE)) {
-        unrej_good_chs (word, page_res_it.row ()->row);
+      if (word->reject_map.quality_recoverable_rejects() &&
+          (tessedit_unrej_any_wd ||
+           acceptable_word_string(*word->uch_set,
+                                  word->best_choice->unichar_string().string(),
+                                  word->best_choice->unichar_lengths().string())
+           != AC_UNACCEPTABLE)) {
+        unrej_good_chs(word, page_res_it.row ()->row);
       }
       page_res_it.forward ();
     }
@@ -246,68 +248,57 @@ void Tesseract::doc_and_block_rejection(  //reject big chunks
   inT16 char_quality = 0;
   inT16 accepted_char_quality;

-  if ((page_res_it.page_res->rej_count * 100.0 /
-       page_res_it.page_res->char_count) > tessedit_reject_doc_percent) {
+  if (page_res_it.page_res->rej_count * 100.0 /
+      page_res_it.page_res->char_count > tessedit_reject_doc_percent) {
     reject_whole_page(page_res_it);
-    #ifndef SECURE_NAMES
     if (tessedit_debug_doc_rejection) {
-      tprintf ("REJECT ALL #chars: %d #Rejects: %d; \n",
-               page_res_it.page_res->char_count,
-               page_res_it.page_res->rej_count);
+      tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
+              page_res_it.page_res->char_count,
+              page_res_it.page_res->rej_count);
+    }
+  } else {
+    if (tessedit_debug_doc_rejection) {
+      tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
+              page_res_it.page_res->char_count,
+              page_res_it.page_res->rej_count);
     }
-    #endif
-  }
-  else {
-    #ifndef SECURE_NAMES
-    if (tessedit_debug_doc_rejection)
-      tprintf ("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
-               page_res_it.page_res->char_count,
-               page_res_it.page_res->rej_count);
-    #endif
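Stripped of the SECURE_NAMES and debug plumbing deleted here, doc_and_block_rejection applies the same ratio test at three scopes: the whole page first, then each block, then each row (a row is additionally required to have whole-word rejects below tessedit_whole_wd_rej_row_percent of its rejects). A standalone sketch of the shared test, with hypothetical counts and threshold rather than values taken from this patch:

    // One ratio test, reused at page, block and row scope with a different
    // threshold parameter each time.
    static bool ExceedsRejectPercent(int rej_count, int char_count,
                                     double threshold_percent) {
      return char_count > 0 &&
             rej_count * 100.0 / char_count > threshold_percent;
    }
    // e.g. ExceedsRejectPercent(700, 1000, 65.0) is true: 70% rejects would
    // reject the whole page before any block or row is even examined.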
/* Walk blocks testing for block rejection */ - page_res_it.restart_page (); - while (page_res_it.word () != NULL) { + page_res_it.restart_page(); + WERD_RES* word; + while ((word = page_res_it.word()) != NULL) { current_block = page_res_it.block(); block_no = current_block->block->index(); - if ((page_res_it.block ()->char_count > 0) && - ((page_res_it.block ()->rej_count * 100.0 / - page_res_it.block ()->char_count) > - tessedit_reject_block_percent)) { - #ifndef SECURE_NAMES - if (tessedit_debug_block_rejection) - tprintf ("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", - block_no, - page_res_it.block ()->char_count, - page_res_it.block ()->rej_count); - #endif + if (current_block->char_count > 0 && + (current_block->rej_count * 100.0 / current_block->char_count) > + tessedit_reject_block_percent) { + if (tessedit_debug_block_rejection) { + tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", + block_no, current_block->char_count, + current_block->rej_count); + } prev_word_rejected = FALSE; - while ((page_res_it.word () != NULL) && - (page_res_it.block () == current_block)) { + while ((word = page_res_it.word()) != NULL && + (page_res_it.block() == current_block)) { if (tessedit_preserve_blk_rej_perfect_wds) { - rej_word = - (page_res_it.word ()->reject_map.reject_count () > 0) - || (page_res_it.word ()->reject_map.length () < - tessedit_preserve_min_wd_len); - if (rej_word && tessedit_dont_blkrej_good_wds - && !(page_res_it.word ()->reject_map.length () < - tessedit_preserve_min_wd_len) - && - (acceptable_word_string - (page_res_it.word()->best_choice->unichar_string().string(), - page_res_it.word ()->best_choice->unichar_lengths().string()) != - AC_UNACCEPTABLE)) { - word_char_quality (page_res_it.word (), - page_res_it.row ()->row, - &char_quality, - &accepted_char_quality); - rej_word = char_quality != - page_res_it.word ()->reject_map.length (); + rej_word = word->reject_map.reject_count() > 0 || + word->reject_map.length () < tessedit_preserve_min_wd_len; + if (rej_word && tessedit_dont_blkrej_good_wds && + word->reject_map.length() >= tessedit_preserve_min_wd_len && + acceptable_word_string( + *word->uch_set, + word->best_choice->unichar_string().string(), + word->best_choice->unichar_lengths().string()) != + AC_UNACCEPTABLE) { + word_char_quality(word, page_res_it.row()->row, + &char_quality, + &accepted_char_quality); + rej_word = char_quality != word->reject_map.length(); } - } - else + } else { rej_word = TRUE; + } if (rej_word) { /* Reject spacing if both current and prev words are rejected. @@ -315,89 +306,70 @@ void Tesseract::doc_and_block_rejection( //reject big chunks generated more space errors. 
*/ if (tessedit_use_reject_spaces && - prev_word_rejected && - (page_res_it.prev_row () == page_res_it.row ()) && - (page_res_it.word ()->word->space () == 1)) - page_res_it.word ()->reject_spaces = TRUE; - page_res_it.word ()->reject_map.rej_word_block_rej (); + prev_word_rejected && + page_res_it.prev_row() == page_res_it.row() && + word->word->space() == 1) + word->reject_spaces = TRUE; + word->reject_map.rej_word_block_rej(); } prev_word_rejected = rej_word; - page_res_it.forward (); + page_res_it.forward(); + } + } else { + if (tessedit_debug_block_rejection) { + tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", + block_no, page_res_it.block()->char_count, + page_res_it.block()->rej_count); } - } - else { - #ifndef SECURE_NAMES - if (tessedit_debug_block_rejection) - tprintf - ("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", - block_no, page_res_it.block ()->char_count, - page_res_it.block ()->rej_count); - #endif /* Walk rows in block testing for row rejection */ row_no = 0; - while ((page_res_it.word () != NULL) && - (page_res_it.block () == current_block)) { - current_row = page_res_it.row (); + while ((word = page_res_it.word()) != NULL && + page_res_it.block() == current_block) { + current_row = page_res_it.row(); row_no++; /* Reject whole row if: fraction of chars on row which are rejected exceed a limit AND fraction rejects which occur in WHOLE WERD rejects is LESS THAN a limit */ - if ((page_res_it.row ()->char_count > 0) && - ((page_res_it.row ()->rej_count * 100.0 / - page_res_it.row ()->char_count) > - tessedit_reject_row_percent) && - ((page_res_it.row ()->whole_word_rej_count * 100.0 / - page_res_it.row ()->rej_count) < - tessedit_whole_wd_rej_row_percent)) { - #ifndef SECURE_NAMES - if (tessedit_debug_block_rejection) - tprintf - ("REJECTING ROW %d #chars: %d; #Rejects: %d\n", - row_no, page_res_it.row ()->char_count, - page_res_it.row ()->rej_count); - #endif + if (current_row->char_count > 0 && + (current_row->rej_count * 100.0 / current_row->char_count) > + tessedit_reject_row_percent && + (current_row->whole_word_rej_count * 100.0 / + current_row->rej_count) < + tessedit_whole_wd_rej_row_percent) { + if (tessedit_debug_block_rejection) { + tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n", + row_no, current_row->char_count, + current_row->rej_count); + } prev_word_rejected = FALSE; - while ((page_res_it.word () != NULL) && - (page_res_it.row () == current_row)) { + while ((word = page_res_it.word()) != NULL && + page_res_it.row () == current_row) { /* Preserve words on good docs unless they are mostly rejected*/ if (!tessedit_row_rej_good_docs && good_quality_doc) { - rej_word = - page_res_it.word ()->reject_map. - reject_count () / - (float) page_res_it.word ()->reject_map. - length () > tessedit_good_doc_still_rowrej_wd; - } - - /* Preserve perfect words anyway */ - else if (tessedit_preserve_row_rej_perfect_wds) { - rej_word = - (page_res_it.word ()->reject_map. - reject_count () > 0) - || (page_res_it.word ()->reject_map. - length () < tessedit_preserve_min_wd_len); - if (rej_word && tessedit_dont_rowrej_good_wds - && !(page_res_it.word ()->reject_map. 
- length () <
-                    tessedit_preserve_min_wd_len)
-                  &&
-                  (acceptable_word_string
-                   (page_res_it.word ()->best_choice->
-                    unichar_string().string(),
-                    page_res_it.word ()->best_choice->
-                    unichar_lengths().string()) != AC_UNACCEPTABLE)) {
-                word_char_quality (page_res_it.word (),
-                                   page_res_it.row ()->row,
-                                   &char_quality,
-                                   &accepted_char_quality);
-                rej_word = char_quality !=
-                  page_res_it.word ()->reject_map.length ();
+                rej_word = word->reject_map.reject_count() /
+                    static_cast<float>(word->reject_map.length()) >
+                    tessedit_good_doc_still_rowrej_wd;
+              } else if (tessedit_preserve_row_rej_perfect_wds) {
+                /* Preserve perfect words anyway */
+                rej_word = word->reject_map.reject_count() > 0 ||
+                    word->reject_map.length () < tessedit_preserve_min_wd_len;
+                if (rej_word && tessedit_dont_rowrej_good_wds &&
+                    word->reject_map.length() >= tessedit_preserve_min_wd_len &&
+                    acceptable_word_string(*word->uch_set,
+                        word->best_choice->unichar_string().string(),
+                        word->best_choice->unichar_lengths().string()) !=
+                            AC_UNACCEPTABLE) {
+                  word_char_quality(word, page_res_it.row()->row,
+                                    &char_quality,
+                                    &accepted_char_quality);
+                  rej_word = char_quality != word->reject_map.length();
                 }
-              }
-              else
+              } else {
                 rej_word = TRUE;
+              }
               if (rej_word) {
                 /*
                   Reject spacing if both current and prev words are rejected.
@@ -405,36 +377,30 @@ void Tesseract::doc_and_block_rejection(  //reject big chunks
                   this generated more space errors.
                 */
                 if (tessedit_use_reject_spaces &&
-                  prev_word_rejected &&
-                  (page_res_it.prev_row () ==
-                  page_res_it.row ()) &&
-                  (page_res_it.word ()->word->space () ==
-                  1))
-                  page_res_it.word ()->reject_spaces = TRUE;
-                page_res_it.word ()->reject_map.
-                  rej_word_row_rej();
+                    prev_word_rejected &&
+                    page_res_it.prev_row() == page_res_it.row() &&
+                    word->word->space () == 1)
+                  word->reject_spaces = TRUE;
+                word->reject_map.rej_word_row_rej();
               }
               prev_word_rejected = rej_word;
-              page_res_it.forward ();
+              page_res_it.forward();
             }
-          }
-          else {
-            #ifndef SECURE_NAMES
-            if (tessedit_debug_block_rejection)
-              tprintf
-                ("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
-                 row_no, page_res_it.row ()->char_count,
-                 page_res_it.row ()->rej_count);
-            #endif
-            while ((page_res_it.word () != NULL) &&
-                   (page_res_it.row () == current_row))
-              page_res_it.forward ();
+          } else {
+            if (tessedit_debug_block_rejection) {
+              tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
+                      row_no, current_row->char_count, current_row->rej_count);
+            }
+            while (page_res_it.word() != NULL &&
+                   page_res_it.row() == current_row)
+              page_res_it.forward();
           }
         }
       }
     }
   }
 }
+
 }  // namespace tesseract

@@ -463,15 +429,20 @@ void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
   BOOL8 found_terrible_word = FALSE;
   BOOL8 ok_dict_word;

-  page_res_it.restart_page ();
-  while (page_res_it.word () != NULL) {
-    word = page_res_it.word ();
+  page_res_it.restart_page();
+  while (page_res_it.word() != NULL) {
+    POLY_BLOCK* pb = page_res_it.block()->block->poly_block();
+    if (pb != NULL && !pb->IsText()) {
+      page_res_it.forward();
+      continue;
+    }
+    word = page_res_it.word();
     if (crunch_early_convert_bad_unlv_chs)
       convert_bad_unlv_chs(word);
     if (crunch_early_merge_tess_fails)
-      merge_tess_fails(word);
+      word->merge_tess_fails();
     if (word->reject_map.accept_count () != 0) {
       found_terrible_word = FALSE;
@@ -479,7 +450,7 @@ void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
       prev_potential_marked = FALSE;
     }
     else {
-      ok_dict_word = safe_dict_word(*(word->best_choice));
+      ok_dict_word = safe_dict_word(word);
       garbage_level = garbage_word (word,
ok_dict_word); if ((garbage_level != G_NEVER_CRUNCH) && @@ -584,47 +555,44 @@ BOOL8 Tesseract::potential_word_crunch(WERD_RES *word, BOOL8 word_crunchable; int poor_indicator_count = 0; - word_crunchable = - !crunch_leave_accept_strings || - (word->reject_map.length () < 3) || - ((acceptable_word_string (str, lengths) == AC_UNACCEPTABLE) && - !ok_dict_word); + word_crunchable = !crunch_leave_accept_strings || + word->reject_map.length() < 3 || + (acceptable_word_string(*word->uch_set, + str, lengths) == AC_UNACCEPTABLE && + !ok_dict_word); - adjusted_len = word->reject_map.length (); + adjusted_len = word->reject_map.length(); if (adjusted_len > 10) adjusted_len = 10; - rating_per_ch = word->best_choice->rating () / adjusted_len; + rating_per_ch = word->best_choice->rating() / adjusted_len; if (rating_per_ch > crunch_pot_poor_rate) { if (crunch_debug > 2) { - tprintf ("Potential poor rating on \"%s\"\n", - word->best_choice->unichar_string().string()); + tprintf("Potential poor rating on \"%s\"\n", + word->best_choice->unichar_string().string()); } poor_indicator_count++; } if (word_crunchable && - (word->best_choice->certainty () < crunch_pot_poor_cert)) { + word->best_choice->certainty() < crunch_pot_poor_cert) { if (crunch_debug > 2) { - tprintf ("Potential poor cert on \"%s\"\n", - word->best_choice->unichar_string().string()); + tprintf("Potential poor cert on \"%s\"\n", + word->best_choice->unichar_string().string()); } poor_indicator_count++; } if (garbage_level != G_OK) { if (crunch_debug > 2) { - tprintf ("Potential garbage on \"%s\"\n", - word->best_choice->unichar_string().string()); + tprintf("Potential garbage on \"%s\"\n", + word->best_choice->unichar_string().string()); } poor_indicator_count++; } - return (poor_indicator_count >= crunch_pot_indicators); + return poor_indicator_count >= crunch_pot_indicators; } -} // namespace tesseract - -namespace tesseract { void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) { WERD_RES *word; PAGE_RES_IT copy_it; @@ -635,9 +603,9 @@ void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) { inT16 x_debug_delete_mode; CRUNCH_MODE x_delete_mode; - page_res_it.restart_page (); - while (page_res_it.word () != NULL) { - word = page_res_it.word (); + page_res_it.restart_page(); + while (page_res_it.word() != NULL) { + word = page_res_it.word(); delete_mode = word_deletable (word, debug_delete_mode); if (delete_mode != CR_NONE) { @@ -649,10 +617,9 @@ void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) { } word->unlv_crunch_mode = delete_mode; deleting_from_bol = TRUE; - } - else if (word->word->flag (W_EOL)) { + } else if (word->word->flag(W_EOL)) { if (marked_delete_point) { - while (copy_it.word () != word) { + while (copy_it.word() != word) { x_delete_mode = word_deletable (copy_it.word (), x_debug_delete_mode); if (crunch_debug > 0) { @@ -690,7 +657,7 @@ void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) { determine if the word is deletable. 
*/ if (!crunch_early_merge_tess_fails) - merge_tess_fails(word); + word->merge_tess_fails(); page_res_it.forward (); } } @@ -698,10 +665,10 @@ void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) { void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) { int i; - UNICHAR_ID unichar_dash = unicharset.unichar_to_id("-"); - UNICHAR_ID unichar_space = unicharset.unichar_to_id(" "); - UNICHAR_ID unichar_tilde = unicharset.unichar_to_id("~"); - UNICHAR_ID unichar_pow = unicharset.unichar_to_id("^"); + UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-"); + UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" "); + UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~"); + UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^"); bool modified = false; for (i = 0; i < word_res->reject_map.length(); ++i) { if (word_res->best_choice->unichar_id(i) == unichar_tilde) { @@ -718,37 +685,7 @@ void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) { } } if (modified) { - word_res->best_choice->populate_unichars(unicharset); - } -} - -// Callback helper for merge_tess_fails returns a space if both -// arguments are space, otherwise INVALID_UNICHAR_ID. -UNICHAR_ID Tesseract::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) { - if (id1 == id2 && id1 == unicharset.unichar_to_id(" ")) - return id1; - else - return INVALID_UNICHAR_ID; -} - -// Change pairs of tess failures to a single one -void Tesseract::merge_tess_fails(WERD_RES *word_res) { - if (word_res->ConditionalBlobMerge( - unicharset, - NewPermanentTessCallback(this, &Tesseract::BothSpaces), NULL, - word_res->best_choice->blob_choices())) { - if (crunch_debug) { - tprintf("Post:bc len=%d, rejmap=%d, boxword=%d, chopword=%d," - " rebuild=%d\n", - word_res->best_choice->length(), - word_res->reject_map.length(), - word_res->box_word->length(), - word_res->chopped_word->NumBlobs(), - word_res->rebuild_word->NumBlobs()); - } - int len = word_res->best_choice->length(); - ASSERT_HOST(word_res->reject_map.length() == len); - ASSERT_HOST(word_res->box_word->length() == len); + word_res->best_choice->populate_unichars(); } } @@ -785,7 +722,7 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { for (; *str != '\0'; str += *(lengths++)) { len++; - if (unicharset.get_isupper (str, *lengths)) { + if (word->uch_set->get_isupper (str, *lengths)) { total_alpha_count++; switch (state) { case SUBSEQUENT_UPPER: @@ -794,14 +731,14 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { upper_string_count++; if (longest_upper_run_len < upper_string_count) longest_upper_run_len = upper_string_count; - if (last_char == unicharset.unichar_to_id(str, *lengths)) { + if (last_char == word->uch_set->unichar_to_id(str, *lengths)) { alpha_repetition_count++; if (longest_alpha_repetition_count < alpha_repetition_count) { longest_alpha_repetition_count = alpha_repetition_count; } } else { - last_char = unicharset.unichar_to_id(str, *lengths); + last_char = word->uch_set->unichar_to_id(str, *lengths); alpha_repetition_count = 1; } break; @@ -809,13 +746,13 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { isolated_digits++; default: state = FIRST_UPPER; - last_char = unicharset.unichar_to_id(str, *lengths); + last_char = word->uch_set->unichar_to_id(str, *lengths); alpha_repetition_count = 1; upper_string_count = 1; break; } } - else if (unicharset.get_islower (str, *lengths)) { + else if (word->uch_set->get_islower (str, *lengths)) { total_alpha_count++; switch (state) 
{ case SUBSEQUENT_LOWER: @@ -824,14 +761,14 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { lower_string_count++; if (longest_lower_run_len < lower_string_count) longest_lower_run_len = lower_string_count; - if (last_char == unicharset.unichar_to_id(str, *lengths)) { + if (last_char == word->uch_set->unichar_to_id(str, *lengths)) { alpha_repetition_count++; if (longest_alpha_repetition_count < alpha_repetition_count) { longest_alpha_repetition_count = alpha_repetition_count; } } else { - last_char = unicharset.unichar_to_id(str, *lengths); + last_char = word->uch_set->unichar_to_id(str, *lengths); alpha_repetition_count = 1; } break; @@ -839,13 +776,13 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { isolated_digits++; default: state = FIRST_LOWER; - last_char = unicharset.unichar_to_id(str, *lengths); + last_char = word->uch_set->unichar_to_id(str, *lengths); alpha_repetition_count = 1; lower_string_count = 1; break; } } - else if (unicharset.get_isdigit (str, *lengths)) { + else if (word->uch_set->get_isdigit (str, *lengths)) { total_digit_count++; switch (state) { case FIRST_NUM: @@ -894,56 +831,56 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { total_alpha_count += total_digit_count - isolated_digits; } - if (crunch_leave_ok_strings && - (len >= 4) && - (2 * (total_alpha_count - isolated_alphas) > len) && - (longest_alpha_repetition_count < crunch_long_repetitions)) { + if (crunch_leave_ok_strings && len >= 4 && + 2 * (total_alpha_count - isolated_alphas) > len && + longest_alpha_repetition_count < crunch_long_repetitions) { if ((crunch_accept_ok && - (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE)) || - (longest_lower_run_len > crunch_leave_lc_strings) || - (longest_upper_run_len > crunch_leave_uc_strings)) + acceptable_word_string(*word->uch_set, str, lengths) != + AC_UNACCEPTABLE) || + longest_lower_run_len > crunch_leave_lc_strings || + longest_upper_run_len > crunch_leave_uc_strings) return G_NEVER_CRUNCH; } - if ((word->reject_map.length () > 1) && - (strpbrk (str, " ") == NULL) && - ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) || - (word->best_choice->permuter () == FREQ_DAWG_PERM) || - (word->best_choice->permuter () == USER_DAWG_PERM) || - (word->best_choice->permuter () == NUMBER_PERM) || - (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE) || ok_dict_word)) + if (word->reject_map.length() > 1 && + strpbrk(str, " ") == NULL && + (word->best_choice->permuter() == SYSTEM_DAWG_PERM || + word->best_choice->permuter() == FREQ_DAWG_PERM || + word->best_choice->permuter() == USER_DAWG_PERM || + word->best_choice->permuter() == NUMBER_PERM || + acceptable_word_string(*word->uch_set, str, lengths) != + AC_UNACCEPTABLE || ok_dict_word)) return G_OK; ok_chars = len - bad_char_count - isolated_digits - isolated_alphas - tess_rejs; if (crunch_debug > 3) { - tprintf ("garbage_word: \"%s\"\n", - word->best_choice->unichar_string().string()); - tprintf ("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", - len, - bad_char_count, isolated_digits, isolated_alphas, tess_rejs); + tprintf("garbage_word: \"%s\"\n", + word->best_choice->unichar_string().string()); + tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", + len, + bad_char_count, isolated_digits, isolated_alphas, tess_rejs); } - if ((bad_char_count == 0) && - (tess_rejs == 0) && - ((len > isolated_digits + isolated_alphas) || (len <= 2))) + if (bad_char_count == 0 && + tess_rejs == 0 && + (len > isolated_digits + 
isolated_alphas || len <= 2)) return G_OK; - if ((tess_rejs > ok_chars) || - ((tess_rejs > 0) && ((bad_char_count + tess_rejs) * 2 > len))) + if (tess_rejs > ok_chars || + (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len)) return G_TERRIBLE; if (len > 4) { - dodgy_chars = 2 * tess_rejs + bad_char_count + - isolated_digits + isolated_alphas; - if ((dodgy_chars > 5) || ((dodgy_chars / (float) len) > 0.5)) + dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + + isolated_alphas; + if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5) return G_DODGY; else return G_OK; - } - else { + } else { dodgy_chars = 2 * tess_rejs + bad_char_count; - if (((len == 4) && (dodgy_chars > 2)) || - ((len == 3) && (dodgy_chars > 2)) || (dodgy_chars >= len)) + if ((len == 4 && dodgy_chars > 2) || + (len == 3 && dodgy_chars > 2) || dodgy_chars >= len) return G_DODGY; else return G_OK; @@ -982,15 +919,18 @@ CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, inT16 &delete_mode) { return CR_DELETE; } - box = word->rebuild_word->bounding_box(); - if (box.height () < crunch_del_min_ht * kBlnXHeight) { - delete_mode = 4; - return CR_DELETE; - } + if (word->rebuild_word != NULL) { + // Cube leaves rebuild_word NULL. + box = word->rebuild_word->bounding_box(); + if (box.height () < crunch_del_min_ht * kBlnXHeight) { + delete_mode = 4; + return CR_DELETE; + } - if (noise_outlines(word->rebuild_word)) { - delete_mode = 5; - return CR_DELETE; + if (noise_outlines(word->rebuild_word)) { + delete_mode = 5; + return CR_DELETE; + } } if ((failure_count (word) * 1.5) > word_len) { diff --git a/ccmain/fixspace.cpp b/ccmain/fixspace.cpp index 4c1adaf299..494e54d510 100644 --- a/ccmain/fixspace.cpp +++ b/ccmain/fixspace.cpp @@ -204,7 +204,8 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { word = word_it.data(); if ((!word->part_of_combo) && (word->box_word == NULL)) { - classify_word_pass2(word, block, row); + classify_word_and_language(&Tesseract::classify_word_pass2, + block, row, word); } prev_word_best_choice_ = word->best_choice; } @@ -347,7 +348,7 @@ BOOL8 Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) { for (i = 0, offset = 0; i < char_position; offset += word->best_choice->unichar_lengths()[i++]); return ( - unicharset.get_isdigit( + word->uch_set->get_isdigit( word->best_choice->unichar_string().string() + offset, word->best_choice->unichar_lengths()[i]) || (word->best_choice->permuter() == NUMBER_PERM && @@ -771,6 +772,9 @@ inT16 Tesseract::worst_noise_blob(WERD_RES *word_res, float small_limit = kBlnXHeight * fixsp_small_outlines_size; float non_noise_limit = kBlnXHeight * 0.8; + if (word_res->rebuild_word == NULL) + return -1; // Can't handle cube words. + TBLOB* blob = word_res->rebuild_word->blobs; // Normalised. int blob_count = word_res->box_word->length(); @@ -917,15 +921,17 @@ inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) { for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { word = word_it.data(); + if (word->rebuild_word == NULL) + continue; // Can't handle cube words. 
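     // Only words tesseract already trusts (done, tess_accepted, one of the
     // dawg permuters, or a safe dictionary word) get to vote in the
     // fixed-pitch spacing evaluation below.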
word_length = word->reject_map.length(); if (word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM || word->best_choice->permuter() == FREQ_DAWG_PERM || word->best_choice->permuter() == USER_DAWG_PERM || - safe_dict_word(*word->best_choice) > 0) { + safe_dict_word(word) > 0) { TBLOB* blob = word->rebuild_word->blobs; - UNICHAR_ID space = getDict().getUnicharset().unichar_to_id(" "); + UNICHAR_ID space = word->uch_set->unichar_to_id(" "); for (i = 0; i < word->best_choice->length() && blob != NULL; ++i, blob = blob->next) { if (word->best_choice->unichar_id(i) == space || diff --git a/ccmain/ltrresultiterator.cpp b/ccmain/ltrresultiterator.cpp new file mode 100644 index 0000000000..7659b7f3df --- /dev/null +++ b/ccmain/ltrresultiterator.cpp @@ -0,0 +1,369 @@ +/////////////////////////////////////////////////////////////////////// +// File: ltrresultiterator.cpp +// Description: Iterator for tesseract results in strict left-to-right +// order that avoids using tesseract internal data structures. +// Author: Ray Smith +// Created: Fri Feb 26 14:32:09 PST 2010 +// +// (C) Copyright 2010, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#include "ltrresultiterator.h" + +#include "allheaders.h" +#include "pageres.h" +#include "strngs.h" +#include "tesseractclass.h" + +namespace tesseract { + +LTRResultIterator::LTRResultIterator(PAGE_RES* page_res, Tesseract* tesseract, + int scale, int scaled_yres, + int rect_left, int rect_top, + int rect_width, int rect_height) + : PageIterator(page_res, tesseract, scale, scaled_yres, + rect_left, rect_top, rect_width, rect_height), + line_separator_("\n"), + paragraph_separator_("\n") { +} + +LTRResultIterator::~LTRResultIterator() { +} + +// Returns the null terminated UTF-8 encoded text string for the current +// object at the given level. Use delete [] to free after use. +char* LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const { + if (it_->word() == NULL) return NULL; // Already at the end! + STRING text; + PAGE_RES_IT res_it(*it_); + WERD_CHOICE* best_choice = res_it.word()->best_choice; + ASSERT_HOST(best_choice != NULL); + if (level == RIL_SYMBOL) { + text = res_it.word()->BestUTF8(blob_index_, false); + } else if (level == RIL_WORD) { + text = best_choice->unichar_string(); + } else { + bool eol = false; // end of line? + bool eop = false; // end of paragraph? 
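+    // The nested loops below accumulate words into a line, lines into a
+    // paragraph, and paragraphs into the block; eol fires when forward()
+    // crosses a row boundary, eop when the block or paragraph changes.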
+ do { // for each paragraph in a block + do { // for each text line in a paragraph + do { // for each word in a text line + best_choice = res_it.word()->best_choice; + ASSERT_HOST(best_choice != NULL); + text += best_choice->unichar_string(); + text += " "; + res_it.forward(); + eol = res_it.row() != res_it.prev_row(); + } while (!eol); + text.truncate_at(text.length() - 1); + text += line_separator_; + eop = res_it.block() != res_it.prev_block() || + res_it.row()->row->para() != res_it.prev_row()->row->para(); + } while (level != RIL_TEXTLINE && !eop); + if (eop) text += paragraph_separator_; + } while (level == RIL_BLOCK && res_it.block() == res_it.prev_block()); + } + int length = text.length() + 1; + char* result = new char[length]; + strncpy(result, text.string(), length); + return result; +} + +// Set the string inserted at the end of each text line. "\n" by default. +void LTRResultIterator::SetLineSeparator(const char *new_line) { + line_separator_ = new_line; +} + +// Set the string inserted at the end of each paragraph. "\n" by default. +void LTRResultIterator::SetParagraphSeparator(const char *new_para) { + paragraph_separator_ = new_para; +} + +// Returns the mean confidence of the current object at the given level. +// The number should be interpreted as a percent probability. (0.0f-100.0f) +float LTRResultIterator::Confidence(PageIteratorLevel level) const { + if (it_->word() == NULL) return 0.0f; // Already at the end! + float mean_certainty = 0.0f; + int certainty_count = 0; + PAGE_RES_IT res_it(*it_); + WERD_CHOICE* best_choice = res_it.word()->best_choice; + ASSERT_HOST(best_choice != NULL); + switch (level) { + case RIL_BLOCK: + do { + best_choice = res_it.word()->best_choice; + ASSERT_HOST(best_choice != NULL); + mean_certainty += best_choice->certainty(); + ++certainty_count; + res_it.forward(); + } while (res_it.block() == res_it.prev_block()); + break; + case RIL_PARA: + do { + best_choice = res_it.word()->best_choice; + ASSERT_HOST(best_choice != NULL); + mean_certainty += best_choice->certainty(); + ++certainty_count; + res_it.forward(); + } while (res_it.block() == res_it.prev_block() && + res_it.row()->row->para() == res_it.prev_row()->row->para()); + break; + case RIL_TEXTLINE: + do { + best_choice = res_it.word()->best_choice; + ASSERT_HOST(best_choice != NULL); + mean_certainty += best_choice->certainty(); + ++certainty_count; + res_it.forward(); + } while (res_it.row() == res_it.prev_row()); + break; + case RIL_WORD: + mean_certainty += best_choice->certainty(); + ++certainty_count; + break; + case RIL_SYMBOL: + BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices(); + if (choices != NULL) { + BLOB_CHOICE_LIST_C_IT blob_choices_it(choices); + for (int blob = 0; blob < blob_index_; ++blob) + blob_choices_it.forward(); + BLOB_CHOICE_IT choice_it(blob_choices_it.data()); + for (choice_it.mark_cycle_pt(); + !choice_it.cycled_list(); + choice_it.forward()) { + if (choice_it.data()->unichar_id() == + best_choice->unichar_id(blob_index_)) + break; + } + mean_certainty += choice_it.data()->certainty(); + } else { + mean_certainty += best_choice->certainty(); + } + ++certainty_count; + } + if (certainty_count > 0) { + mean_certainty /= certainty_count; + float confidence = 100 + 5 * mean_certainty; + if (confidence < 0.0f) confidence = 0.0f; + if (confidence > 100.0f) confidence = 100.0f; + return confidence; + } + return 0.0f; +} + +// Returns the font attributes of the current word. 
If iterating at a higher
+// level object than words, eg textlines, then this will return the
+// attributes of the first word in that textline.
+// The actual return value is a string representing a font name. It points
+// to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
+// the iterator itself, ie rendered invalid by various members of
+// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
+// Pointsize is returned in printers points (1/72 inch.)
+const char* LTRResultIterator::WordFontAttributes(bool* is_bold,
+                                                  bool* is_italic,
+                                                  bool* is_underlined,
+                                                  bool* is_monospace,
+                                                  bool* is_serif,
+                                                  bool* is_smallcaps,
+                                                  int* pointsize,
+                                                  int* font_id) const {
+  if (it_->word() == NULL) return NULL;  // Already at the end!
+  if (it_->word()->fontinfo == NULL) {
+    *font_id = -1;
+    return NULL;  // No font information.
+  }
+  const FontInfo& font_info = *it_->word()->fontinfo;
+  *font_id = font_info.universal_id;
+  *is_bold = font_info.is_bold();
+  *is_italic = font_info.is_italic();
+  *is_underlined = false;  // TODO(rays) fix this!
+  *is_monospace = font_info.is_fixed_pitch();
+  *is_serif = font_info.is_serif();
+  *is_smallcaps = it_->word()->small_caps;
+  float row_height = it_->row()->row->x_height() +
+      it_->row()->row->ascenders() - it_->row()->row->descenders();
+  // Convert from pixels to printers points.
+  *pointsize = scaled_yres_ > 0
+      ? static_cast<int>(row_height * kPointsPerInch / scaled_yres_ + 0.5)
+      : 0;
+
+  return font_info.name;
+}
+
+// Returns the name of the language used to recognize this word.
+const char* LTRResultIterator::WordRecognitionLanguage() const {
+  if (it_->word() == NULL || it_->word()->tesseract == NULL) return NULL;
+  return it_->word()->tesseract->lang.string();
+}
+
+// Return the overall directionality of this word.
+StrongScriptDirection LTRResultIterator::WordDirection() const {
+  if (it_->word() == NULL) return DIR_NEUTRAL;
+  bool has_rtl = it_->word()->AnyRtlCharsInWord();
+  bool has_ltr = it_->word()->AnyLtrCharsInWord();
+  if (has_rtl && !has_ltr)
+    return DIR_RIGHT_TO_LEFT;
+  if (has_ltr && !has_rtl)
+    return DIR_LEFT_TO_RIGHT;
+  if (!has_ltr && !has_rtl)
+    return DIR_NEUTRAL;
+  return DIR_MIX;
+}
+
+// Returns true if the current word was found in a dictionary.
+bool LTRResultIterator::WordIsFromDictionary() const {
+  if (it_->word() == NULL) return false;  // Already at the end!
+  int permuter = it_->word()->best_choice->permuter();
+  return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM ||
+      permuter == USER_DAWG_PERM;
+}
+
+// Returns true if the current word is numeric.
+bool LTRResultIterator::WordIsNumeric() const {
+  if (it_->word() == NULL) return false;  // Already at the end!
+  int permuter = it_->word()->best_choice->permuter();
+  return permuter == NUMBER_PERM;
+}
+
+// Returns true if the word contains blamer information.
+bool LTRResultIterator::HasBlamerInfo() const {
+  return (it_->word() != NULL && it_->word()->blamer_bundle != NULL &&
+          (it_->word()->blamer_bundle->debug.length() > 0 ||
+           it_->word()->blamer_bundle->misadaption_debug.length() > 0));
+}
+
+// Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
+// of the current word.
+void *LTRResultIterator::GetParamsTrainingBundle() const {
+  return (it_->word() != NULL && it_->word()->blamer_bundle != NULL) ?
+      &(it_->word()->blamer_bundle->params_training_bundle) : NULL;
+}
+
+// Returns the pointer to the string with blamer information for this word.
+// Assumes that the word's blamer_bundle is not NULL.
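+// Callers should check HasBlamerInfo() before calling.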
+const char *LTRResultIterator::GetBlamerDebug() const {
+  return it_->word()->blamer_bundle->debug.string();
+}
+
+// Returns the pointer to the string with misadaption information for this
+// word. Assumes that the word's blamer_bundle is not NULL.
+const char *LTRResultIterator::GetBlamerMisadaptionDebug() const {
+  return it_->word()->blamer_bundle->misadaption_debug.string();
+}
+
+// Returns the null terminated UTF-8 encoded truth string for the current
+// word. Use delete [] to free after use.
+char* LTRResultIterator::WordTruthUTF8Text() const {
+  if (it_->word() == NULL) return NULL;  // Already at the end!
+  if (it_->word()->blamer_bundle == NULL ||
+      it_->word()->blamer_bundle->incorrect_result_reason == IRR_NO_TRUTH) {
+    return NULL;  // no truth information for this word
+  }
+  const GenericVector<STRING> &truth_vec =
+      it_->word()->blamer_bundle->truth_text;
+  STRING truth_text;
+  for (int i = 0; i < truth_vec.size(); ++i) truth_text += truth_vec[i];
+  int length = truth_text.length() + 1;
+  char* result = new char[length];
+  strncpy(result, truth_text.string(), length);
+  return result;
+}
+
+// Returns a pointer to serialized choice lattice.
+// Fills lattice_size with the number of bytes in lattice data.
+const char *LTRResultIterator::WordLattice(int *lattice_size) const {
+  if (it_->word() == NULL) return NULL;  // Already at the end!
+  if (it_->word()->blamer_bundle == NULL) return NULL;
+  *lattice_size = it_->word()->blamer_bundle->lattice_size;
+  return it_->word()->blamer_bundle->lattice_data;
+}
+
+// Returns true if the current symbol is a superscript.
+// If iterating at a higher level object than symbols, eg words, then
+// this will return the attributes of the first symbol in that word.
+bool LTRResultIterator::SymbolIsSuperscript() const {
+  if (cblob_it_ == NULL && it_->word() != NULL)
+    return it_->word()->box_word->BlobPosition(blob_index_) == SP_SUPERSCRIPT;
+  return false;
+}
+
+// Returns true if the current symbol is a subscript.
+// If iterating at a higher level object than symbols, eg words, then
+// this will return the attributes of the first symbol in that word.
+bool LTRResultIterator::SymbolIsSubscript() const {
+  if (cblob_it_ == NULL && it_->word() != NULL)
+    return it_->word()->box_word->BlobPosition(blob_index_) == SP_SUBSCRIPT;
+  return false;
+}
+
+// Returns true if the current symbol is a dropcap.
+// If iterating at a higher level object than symbols, eg words, then
+// this will return the attributes of the first symbol in that word.
+bool LTRResultIterator::SymbolIsDropcap() const {
+  if (cblob_it_ == NULL && it_->word() != NULL)
+    return it_->word()->box_word->BlobPosition(blob_index_) == SP_DROPCAP;
+  return false;
+}
+
+ChoiceIterator::ChoiceIterator(const LTRResultIterator& result_it) {
+  ASSERT_HOST(result_it.it_->word() != NULL);
+  word_res_ = result_it.it_->word();
+  PAGE_RES_IT res_it(*result_it.it_);
+  WERD_CHOICE* best_choice = word_res_->best_choice;
+  BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices();
+  if (choices != NULL) {
+    BLOB_CHOICE_LIST_C_IT blob_choices_it(choices);
+    for (int blob = 0; blob < result_it.blob_index_; ++blob)
+      blob_choices_it.forward();
+    choice_it_ = new BLOB_CHOICE_IT(blob_choices_it.data());
+    choice_it_->mark_cycle_pt();
+  } else {
+    choice_it_ = NULL;
+  }
+}
+
+ChoiceIterator::~ChoiceIterator() {
+  delete choice_it_;
+}
+
+// Moves to the next choice for the symbol and returns false if there
+// are none left.
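+// Illustrative usage sketch (not part of this change): since construction
+// already points at the first choice, callers typically drain the list with
+// a do/while loop (symbol_level_result_it is a placeholder name for a
+// RIL_SYMBOL-positioned LTRResultIterator):
+//   ChoiceIterator choice_it(symbol_level_result_it);
+//   do {
+//     tprintf("%s\t%.2f\n", choice_it.GetUTF8Text(), choice_it.Confidence());
+//   } while (choice_it.Next());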
+bool ChoiceIterator::Next() {
+  if (choice_it_ == NULL)
+    return false;
+  choice_it_->forward();
+  return !choice_it_->cycled_list();
+}
+
+// Returns the null terminated UTF-8 encoded text string for the current
+// choice. The return points to an internal table and should NOT be deleted.
+const char* ChoiceIterator::GetUTF8Text() const {
+  if (choice_it_ == NULL)
+    return NULL;
+  UNICHAR_ID id = choice_it_->data()->unichar_id();
+  return word_res_->BestUTF8(id, false);
+}
+
+// Returns the confidence of the current choice.
+// The number should be interpreted as a percent probability. (0.0f-100.0f)
+float ChoiceIterator::Confidence() const {
+  if (choice_it_ == NULL)
+    return 0.0f;
+  float confidence = 100 + 5 * choice_it_->data()->certainty();
+  if (confidence < 0.0f) confidence = 0.0f;
+  if (confidence > 100.0f) confidence = 100.0f;
+  return confidence;
+}
+
+}  // namespace tesseract.
diff --git a/ccmain/ltrresultiterator.h b/ccmain/ltrresultiterator.h
new file mode 100644
index 0000000000..e563dd42ef
--- /dev/null
+++ b/ccmain/ltrresultiterator.h
@@ -0,0 +1,202 @@
+///////////////////////////////////////////////////////////////////////
+// File: ltrresultiterator.h
+// Description: Iterator for tesseract results in strict left-to-right
+// order that avoids using tesseract internal data structures.
+// Author: Ray Smith
+// Created: Fri Feb 26 11:01:06 PST 2010
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H__
+#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H__
+
+#include "pageiterator.h"
+#include "unicharset.h"
+
+class BLOB_CHOICE_IT;
+class WERD_RES;
+
+namespace tesseract {
+
+class Tesseract;
+
+// Class to iterate over tesseract results, providing access to all levels
+// of the page hierarchy, without including any tesseract headers or having
+// to handle any tesseract structures.
+// WARNING! This class points to data held within the TessBaseAPI class, and
+// therefore can only be used while the TessBaseAPI class still exists and
+// has not been subjected to a call of Init, SetImage, Recognize, Clear, End
+// DetectOS, or anything else that changes the internal PAGE_RES.
+// See apitypes.h for the definition of PageIteratorLevel.
+// See also base class PageIterator, which contains the bulk of the interface.
+// LTRResultIterator adds text-specific methods for access to OCR output.
+
+class LTRResultIterator : public PageIterator {
+  friend class ChoiceIterator;
+ public:
+  // page_res and tesseract come directly from the BaseAPI.
+  // The rectangle parameters are copied indirectly from the Thresholder,
+  // via the BaseAPI. They represent the coordinates of some rectangle in an
+  // original image (in top-left-origin coordinates) and therefore the top-left
+  // needs to be added to any output boxes in order to specify coordinates
+  // in the original image. See TessBaseAPI::SetRectangle.
+ // The scale and scaled_yres are in case the Thresholder scaled the image + // rectangle prior to thresholding. Any coordinates in tesseract's image + // must be divided by scale before adding (rect_left, rect_top). + // The scaled_yres indicates the effective resolution of the binary image + // that tesseract has been given by the Thresholder. + // After the constructor, Begin has already been called. + LTRResultIterator(PAGE_RES* page_res, Tesseract* tesseract, + int scale, int scaled_yres, + int rect_left, int rect_top, + int rect_width, int rect_height); + virtual ~LTRResultIterator(); + + // LTRResultIterators may be copied! This makes it possible to iterate over + // all the objects at a lower level, while maintaining an iterator to + // objects at a higher level. These constructors DO NOT CALL Begin, so + // iterations will continue from the location of src. + // TODO: For now the copy constructor and operator= only need the base class + // versions, but if new data members are added, don't forget to add them! + + // ============= Moving around within the page ============. + + // See PageIterator. + + // ============= Accessing data ==============. + + // Returns the null terminated UTF-8 encoded text string for the current + // object at the given level. Use delete [] to free after use. + char* GetUTF8Text(PageIteratorLevel level) const; + + // Set the string inserted at the end of each text line. "\n" by default. + void SetLineSeparator(const char *new_line); + + // Set the string inserted at the end of each paragraph. "\n" by default. + void SetParagraphSeparator(const char *new_para); + + // Returns the mean confidence of the current object at the given level. + // The number should be interpreted as a percent probability. (0.0f-100.0f) + float Confidence(PageIteratorLevel level) const; + + // ============= Functions that refer to words only ============. + + // Returns the font attributes of the current word. If iterating at a higher + // level object than words, eg textlines, then this will return the + // attributes of the first word in that textline. + // The actual return value is a string representing a font name. It points + // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as + // the iterator itself, ie rendered invalid by various members of + // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI. + // Pointsize is returned in printers points (1/72 inch.) + const char* WordFontAttributes(bool* is_bold, + bool* is_italic, + bool* is_underlined, + bool* is_monospace, + bool* is_serif, + bool* is_smallcaps, + int* pointsize, + int* font_id) const; + + // Return the name of the language used to recognize this word. + // On error, NULL. Do not delete this pointer. + const char* WordRecognitionLanguage() const; + + // Return the overall directionality of this word. + StrongScriptDirection WordDirection() const; + + // Returns true if the current word was found in a dictionary. + bool WordIsFromDictionary() const; + + // Returns true if the current word is numeric. + bool WordIsNumeric() const; + + // Returns true if the word contains blamer information. + bool HasBlamerInfo() const; + + // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle + // of the current word. + void *GetParamsTrainingBundle() const; + + // Returns a pointer to the string with blamer information for this word. + // Assumes that the word's blamer_bundle is not NULL. 
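+  // Check HasBlamerInfo() before calling.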
+  const char *GetBlamerDebug() const;
+
+  // Returns a pointer to the string with misadaption information for this
+  // word. Assumes that the word's blamer_bundle is not NULL.
+  const char *GetBlamerMisadaptionDebug() const;
+
+  // Returns a null terminated UTF-8 encoded truth string for the current word.
+  // Use delete [] to free after use.
+  char* WordTruthUTF8Text() const;
+
+  // Returns a pointer to serialized choice lattice.
+  // Fills lattice_size with the number of bytes in lattice data.
+  const char *WordLattice(int *lattice_size) const;
+
+  // ============= Functions that refer to symbols only ============.
+
+  // Returns true if the current symbol is a superscript.
+  // If iterating at a higher level object than symbols, eg words, then
+  // this will return the attributes of the first symbol in that word.
+  bool SymbolIsSuperscript() const;
+  // Returns true if the current symbol is a subscript.
+  // If iterating at a higher level object than symbols, eg words, then
+  // this will return the attributes of the first symbol in that word.
+  bool SymbolIsSubscript() const;
+  // Returns true if the current symbol is a dropcap.
+  // If iterating at a higher level object than symbols, eg words, then
+  // this will return the attributes of the first symbol in that word.
+  bool SymbolIsDropcap() const;
+
+ protected:
+  const char *line_separator_;
+  const char *paragraph_separator_;
+};
+
+// Class to iterate over the classifier choices for a single RIL_SYMBOL.
+class ChoiceIterator {
+ public:
+  // Construction is from an LTRResultIterator that points to the symbol of
+  // interest. The ChoiceIterator allows a one-shot iteration over the
+  // choices for this symbol and after that it is useless.
+  explicit ChoiceIterator(const LTRResultIterator& result_it);
+  ~ChoiceIterator();
+
+  // Moves to the next choice for the symbol and returns false if there
+  // are none left.
+  bool Next();
+
+  // ============= Accessing data ==============.
+
+  // Returns the null terminated UTF-8 encoded text string for the current
+  // choice.
+  // NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an
+  // internal structure and should NOT be freed with delete [] after use.
+  const char* GetUTF8Text() const;
+
+  // Returns the confidence of the current choice.
+  // The number should be interpreted as a percent probability. (0.0f-100.0f)
+  float Confidence() const;
+
+ private:
+  // Pointer to the WERD_RES object owned by the API.
+  WERD_RES* word_res_;
+  // Iterator over the blob choices.
+  BLOB_CHOICE_IT* choice_it_;
+};
+
+}  // namespace tesseract.
+
+#endif  // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H__
diff --git a/ccmain/mutableiterator.h b/ccmain/mutableiterator.h
new file mode 100644
index 0000000000..f097f47e2b
--- /dev/null
+++ b/ccmain/mutableiterator.h
@@ -0,0 +1,64 @@
+///////////////////////////////////////////////////////////////////////
+// File: mutableiterator.h
+// Description: Iterator for tesseract results providing access to
+// both high-level API and Tesseract internal data structures.
+// Author: David Eger
+// Created: Thu Feb 24 19:01:06 PST 2011
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_CCMAIN_MUTABLEITERATOR_H__ +#define TESSERACT_CCMAIN_MUTABLEITERATOR_H__ + +#include "resultiterator.h" + +class BLOB_CHOICE_IT; + +namespace tesseract { + +class Tesseract; + +// Class to iterate over tesseract results, providing access to all levels +// of the page hierarchy, without including any tesseract headers or having +// to handle any tesseract structures. +// WARNING! This class points to data held within the TessBaseAPI class, and +// therefore can only be used while the TessBaseAPI class still exists and +// has not been subjected to a call of Init, SetImage, Recognize, Clear, End +// DetectOS, or anything else that changes the internal PAGE_RES. +// See apitypes.h for the definition of PageIteratorLevel. +// See also base class PageIterator, which contains the bulk of the interface. +// ResultIterator adds text-specific methods for access to OCR output. +// MutableIterator adds access to internal data structures. + +class MutableIterator : public ResultIterator { + public: + // See argument descriptions in ResultIterator() + MutableIterator(PAGE_RES* page_res, Tesseract* tesseract, + int scale, int scaled_yres, + int rect_left, int rect_top, + int rect_width, int rect_height) + : ResultIterator( + LTRResultIterator(page_res, tesseract, scale, scaled_yres, rect_left, + rect_top, rect_width, rect_height)) {} + virtual ~MutableIterator() {} + + // See PageIterator and ResultIterator for most calls. + + // Return access to Tesseract internals. + const PAGE_RES_IT *PageResIt() const { return it_; } +}; + +} // namespace tesseract. + +#endif // TESSERACT_CCMAIN_MUTABLEITERATOR_H__ diff --git a/ccmain/osdetect.cpp b/ccmain/osdetect.cpp index 0052468faf..c31c9b1b68 100644 --- a/ccmain/osdetect.cpp +++ b/ccmain/osdetect.cpp @@ -22,6 +22,7 @@ #include "blobbox.h" #include "blread.h" #include "colfind.h" +#include "fontinfo.h" #include "imagefind.h" #include "linefind.h" #include "oldlist.h" @@ -113,6 +114,49 @@ void OSResults::update_best_script(int orientation) { (first / second - 1.0) / (kScriptAcceptRatio - 1.0); } +int OSResults::get_best_script(int orientation_id) const { + int max_id = -1; + for (int j = 0; j < kMaxNumberOfScripts; ++j) { + const char *script = unicharset->get_script_from_script_id(j); + if (strcmp(script, "Common") && strcmp(script, "NULL")) { + if (max_id == -1 || + scripts_na[orientation_id][j] > scripts_na[orientation_id][max_id]) + max_id = j; + } + } + return max_id; +} + +// Print the script scores for all possible orientations. +void OSResults::print_scores(void) const { + for (int i = 0; i < 4; ++i) { + printf("Orientation id #%d", i); + print_scores(i); + } +} + +// Print the script scores for the given candidate orientation. 
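+// Only scripts with a nonzero score are listed, one per line.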
+void OSResults::print_scores(int orientation_id) const { + for (int j = 0; j < kMaxNumberOfScripts; ++j) { + if (scripts_na[orientation_id][j]) { + printf("%12s\t: %f\n", unicharset->get_script_from_script_id(j), + scripts_na[orientation_id][j]); + } + } +} + +// Accumulate scores with given OSResults instance and update the best script. +void OSResults::accumulate(const OSResults& osr) { + for (int i = 0; i < 4; ++i) { + orientations[i] += osr.orientations[i]; + for (int j = 0; j < kMaxNumberOfScripts; ++j) + scripts_na[i][j] += osr.scripts_na[i][j]; + } + unicharset = osr.unicharset; + update_best_orientation(); + update_best_script(best_result.orientation_id); +} + // Detect and erase horizontal/vertical lines and picture regions from the // image, so that non-text blobs are removed from consideration. void remove_nontext_regions(tesseract::Tesseract *tess, BLOCK_LIST *blocks, @@ -123,18 +167,18 @@ void remove_nontext_regions(tesseract::Tesseract *tess, BLOCK_LIST *blocks, int vertical_y = 1; tesseract::TabVector_LIST v_lines; tesseract::TabVector_LIST h_lines; - Boxa* boxa = NULL; - Pixa* pixa = NULL; const int kMinCredibleResolution = 70; int resolution = (kMinCredibleResolution > pixGetXRes(pix)) ? kMinCredibleResolution : pixGetXRes(pix); - tesseract::LineFinder::FindVerticalLines(resolution, pix, &vertical_x, - &vertical_y, &v_lines); - tesseract::LineFinder::FindHorizontalLines(resolution, pix, &h_lines); - tesseract::ImageFinder::FindImages(pix, &boxa, &pixa); - pixaDestroy(&pixa); - boxaDestroy(&boxa); + tesseract::LineFinder::FindAndRemoveLines(resolution, false, pix, + &vertical_x, &vertical_y, + NULL, &v_lines, &h_lines); + Pix* im_pix = tesseract::ImageFind::FindImages(pix); + if (im_pix != NULL) { + pixSubtract(pix, pix, im_pix); + pixDestroy(&im_pix); + } tess->mutable_textord()->find_components(tess->pix_binary(), blocks, to_blocks); } @@ -309,8 +353,7 @@ bool os_detect_blob(BLOBNBOX* bbox, OrientationDetector* o, 0.0f, static_cast(kBlnBaselineOffset)); TBLOB* rotated_blob = new TBLOB(*tblob); rotated_blob->Normalize(denorm); - tess->set_denorm(&denorm); - tess->AdaptiveClassifier(rotated_blob, ratings + i, NULL); + tess->AdaptiveClassifier(rotated_blob, denorm, ratings + i, NULL); delete rotated_blob; current_rotation.rotate(rotation90); } @@ -452,7 +495,7 @@ void ScriptDetector::detect_blob(BLOB_CHOICE_LIST* scores) { // Workaround for Fraktur if (prev_id == latin_id_) { if (prev_fontinfo_id >= 0) { - const FontInfo &fi = + const tesseract::FontInfo &fi = tess_->get_fontinfo_table().get(prev_fontinfo_id); //printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name, // fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(), diff --git a/ccmain/osdetect.h b/ccmain/osdetect.h index f649b8a6a1..97bf599be7 100644 --- a/ccmain/osdetect.h +++ b/ccmain/osdetect.h @@ -45,7 +45,7 @@ struct OSBestResult { }; struct OSResults { - OSResults() { + OSResults() : unicharset(NULL) { for (int i = 0; i < 4; ++i) { for (int j = 0; j < kMaxNumberOfScripts; ++j) scripts_na[i][j] = 0; @@ -53,8 +53,19 @@ struct OSResults { } } void update_best_orientation(); + // Set the estimate of the orientation to the given id. void set_best_orientation(int orientation_id); + // Update/Compute the best estimate of the script assuming the given + // orientation id. void update_best_script(int orientation_id); + // Return the index of the script with the highest score for this orientation. 
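+  // The Common and NULL scripts are ignored; returns -1 if nothing scored.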
+ int get_best_script(int orientation_id) const; + // Accumulate scores with given OSResults instance and update the best script. + void accumulate(const OSResults& osr); + + // Print statistics. + void print_scores(void) const; + void print_scores(int orientation_id) const; // Array holding scores for each orientation id [0,3]. // Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the diff --git a/ccmain/output.cpp b/ccmain/output.cpp index dce3d478cd..95ed7214e0 100644 --- a/ccmain/output.cpp +++ b/ccmain/output.cpp @@ -139,6 +139,7 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it, char newline_type, // type of newline BOOL8 force_eol) { // override tilde crunch? WERD_RES *word = page_res_it.word(); + const UNICHARSET &uchset = *word->uch_set; STRING repetition_code; const STRING *wordstr; STRING wordstr_lengths; @@ -150,7 +151,7 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it, char map_chs[32]; //Only for unlv_tilde_crunch int txt_index = 0; BOOL8 need_reject = FALSE; - UNICHAR_ID space = unicharset.unichar_to_id(" "); + UNICHAR_ID space = uchset.unichar_to_id(" "); if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->length() == 0) && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) { @@ -219,7 +220,7 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it, txt_chs[txt_index] = '\0'; map_chs[txt_index] = '\0'; ep_chars[ep_chars_index] = '\0'; // terminate string - word->ep_choice = new WERD_CHOICE(ep_chars, unicharset); + word->ep_choice = new WERD_CHOICE(ep_chars, uchset); if (force_eol) stats_.write_results_empty_block = true; @@ -247,10 +248,9 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it, BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices()); if (!blob_choices_it.empty()) delete blob_choices_it.extract(); } - word->best_choice->populate_unichars(getDict().getUnicharset()); + word->best_choice->populate_unichars(); word->reject_map.remove_pos (0); - delete word->box_word; - word->box_word = new BoxWord; + word->box_word->DeleteBox(0); } if (newline_type || (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) @@ -273,14 +273,14 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it, check_debug_pt (word, 120); if (tessedit_rejection_debug) { tprintf ("Dict word: \"%s\": %d\n", - word->best_choice->debug_string(unicharset).string(), + word->best_choice->debug_string().string(), dict_word(*(word->best_choice))); } if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) { repetition_code = "|^~R"; wordstr_lengths = "\001\001\001\001"; - repetition_code += unicharset.id_to_unichar(get_rep_char (word)); - wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word))); + repetition_code += uchset.id_to_unichar(get_rep_char(word)); + wordstr_lengths += strlen(uchset.id_to_unichar(get_rep_char(word))); wordstr = &repetition_code; } else { if (tessedit_zero_rejection) { @@ -355,7 +355,7 @@ UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated? if (i < word->reject_map.length()) { return word->best_choice->unichar_id(i); } else { - return unicharset.unichar_to_id(unrecognised_char.string()); + return word->uch_set->unichar_to_id(unrecognised_char.string()); } } @@ -372,6 +372,7 @@ UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated? 
void Tesseract::set_unlv_suspects(WERD_RES *word_res) { int len = word_res->reject_map.length(); const WERD_CHOICE &word = *(word_res->best_choice); + const UNICHARSET &uchset = *word.unicharset(); int i; float rating_per_ch; @@ -388,12 +389,12 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) { /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/ - if (safe_dict_word(word) && + if (safe_dict_word(word_res) && (count_alphas(word) > suspect_short_words)) { /* Unreject alphas in dictionary words */ for (i = 0; i < len; ++i) { if (word_res->reject_map[i].rejected() && - unicharset.get_isalpha(word.unichar_id(i))) + uchset.get_isalpha(word.unichar_id(i))) word_res->reject_map[i].setrej_minimal_rej_accept(); } } @@ -407,7 +408,7 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) { /* Unreject any Tess Acceptable word - but NOT tess reject chs*/ for (i = 0; i < len; ++i) { if (word_res->reject_map[i].rejected() && - (!unicharset.eq(word.unichar_id(i), " "))) + (!uchset.eq(word.unichar_id(i), " "))) word_res->reject_map[i].setrej_minimal_rej_accept(); } } @@ -441,9 +442,10 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) { } } - if ((acceptable_word_string(word.unichar_string().string(), - word.unichar_lengths().string()) != - AC_UNACCEPTABLE) || + if (acceptable_word_string(*word_res->uch_set, + word.unichar_string().string(), + word.unichar_lengths().string()) != + AC_UNACCEPTABLE || acceptable_number_string(word.unichar_string().string(), word.unichar_lengths().string())) { if (word_res->reject_map.length() > suspect_short_words) { @@ -463,7 +465,7 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) { inT16 Tesseract::count_alphas(const WERD_CHOICE &word) { int count = 0; for (int i = 0; i < word.length(); ++i) { - if (unicharset.get_isalpha(word.unichar_id(i))) + if (word.unicharset()->get_isalpha(word.unichar_id(i))) count++; } return count; @@ -473,8 +475,8 @@ inT16 Tesseract::count_alphas(const WERD_CHOICE &word) { inT16 Tesseract::count_alphanums(const WERD_CHOICE &word) { int count = 0; for (int i = 0; i < word.length(); ++i) { - if (unicharset.get_isalpha(word.unichar_id(i)) || - unicharset.get_isdigit(word.unichar_id(i))) + if (word.unicharset()->get_isalpha(word.unichar_id(i)) || + word.unicharset()->get_isdigit(word.unichar_id(i))) count++; } return count; @@ -493,7 +495,7 @@ BOOL8 Tesseract::acceptable_number_string(const char *s, s++; for (; *s != '\0'; s += *(lengths++)) { - if (unicharset.get_isdigit (s, *lengths)) + if (unicharset.get_isdigit(s, *lengths)) prev_digit = TRUE; else if (prev_digit && (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) diff --git a/ccmain/pagesegmain.cpp b/ccmain/pagesegmain.cpp index eae958cafc..10b98924c9 100644 --- a/ccmain/pagesegmain.cpp +++ b/ccmain/pagesegmain.cpp @@ -28,28 +28,27 @@ #pragma warning(disable:4244) // Conversion warnings #endif -#include - // Include automatically generated configuration file if running autoconf. 
#ifdef HAVE_CONFIG_H #include "config_auto.h" #endif #include "allheaders.h" -#include "tesseractclass.h" -#include "img.h" #include "blobbox.h" -#include "linefind.h" -#include "imagefind.h" -#include "colfind.h" -#include "tabvector.h" #include "blread.h" -#include "wordseg.h" +#include "colfind.h" +#include "equationdetect.h" +#include "imagefind.h" +#include "img.h" +#include "linefind.h" #include "makerow.h" #include "osdetect.h" +#include "tabvector.h" +#include "tesseractclass.h" +#include "tessvars.h" #include "textord.h" #include "tordmain.h" -#include "tessvars.h" +#include "wordseg.h" namespace tesseract { @@ -110,10 +109,6 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, ASSERT_HOST(pix_binary_ != NULL); int width = pixGetWidth(pix_binary_); int height = pixGetHeight(pix_binary_); - int resolution = pixGetXRes(pix_binary_); - // Zero resolution messes up the algorithms, so make sure it is credible. - if (resolution < kMinCredibleResolution) - resolution = kDefaultResolution; // Get page segmentation mode. PageSegMode pageseg_mode = static_cast( static_cast(tessedit_pageseg_mode)); @@ -145,7 +140,7 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, TO_BLOCK_LIST to_blocks; if (osd_enabled || PSM_BLOCK_FIND_ENABLED(pageseg_mode)) { auto_page_seg_ret_val = - AutoPageSeg(resolution, single_column, osd_enabled, osd_only, + AutoPageSeg(single_column, osd_enabled, osd_only, blocks, &to_blocks, osd_tess, osr); if (osd_only) return auto_page_seg_ret_val; @@ -175,29 +170,29 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, textord_.TextordPage(pageseg_mode, width, height, pix_binary_, blocks, &to_blocks); - SetupWordScripts(blocks); return auto_page_seg_ret_val; } -// TODO(rays) This is a hack to set all the words with a default script. -// In the future this will be set by a preliminary pass over the document. -void Tesseract::SetupWordScripts(BLOCK_LIST* blocks) { - int script = unicharset.default_sid(); - bool has_x_height = unicharset.script_has_xheight(); - bool is_latin = script == unicharset.latin_sid(); - BLOCK_IT b_it(blocks); - for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { - ROW_IT r_it(b_it.data()->row_list()); - for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { - WERD_IT w_it(r_it.data()->word_list()); - for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { - WERD* word = w_it.data(); - word->set_script_id(script); - word->set_flag(W_SCRIPT_HAS_XHEIGHT, has_x_height); - word->set_flag(W_SCRIPT_IS_LATIN, is_latin); - } - } +// Helper writes a grey image to a file for use by scrollviewer. +// Normally for speed we don't display the image in the layout debug windows. +// If textord_debug_images is true, we draw the image as a background to some +// of the debug windows. printable determines whether these +// images are optimized for printing instead of screen display. +static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) { + Pix* grey_pix = pixCreate(pixGetWidth(pix_binary), + pixGetHeight(pix_binary), 8); + // Printable images are light grey on white, but for screen display + // they are black on dark grey so the other colors show up well. 
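+  // Either way the binary text mask is painted onto a uniform background
+  // with pixSetMasked before the image is written out.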
+ if (printable) { + pixSetAll(grey_pix); + pixSetMasked(grey_pix, pix_binary, 192); + } else { + pixSetAllArbitrary(grey_pix, 64); + pixSetMasked(grey_pix, pix_binary, 0); } + AlignedBlob::IncrementDebugPix(); + pixWrite(AlignedBlob::textord_debug_pix().string(), grey_pix, IFF_PNG); + pixDestroy(&grey_pix); } @@ -214,119 +209,50 @@ void Tesseract::SetupWordScripts(BLOCK_LIST* blocks) { * into columns, but multiple blocks are still made if the text is of * non-uniform linespacing. * - * If osd is true, then orientation and script detection is performed as well. - * If only_osd is true, then only orientation and script detection is - * performed. If osr is desired, the osr_tess must be another Tesseract - * that was initialized especially for osd, and the results will be output - * into osr. + * If osd (orientation and script detection) is true then that is performed + * as well. If only_osd is true, then only orientation and script detection is + * performed. If osd is desired, (osd or only_osd) then osr_tess must be + * another Tesseract that was initialized especially for osd, and the results + * will be output into osr (orientation and script result). */ -int Tesseract::AutoPageSeg(int resolution, bool single_column, - bool osd, bool only_osd, +int Tesseract::AutoPageSeg(bool single_column, bool osd, bool only_osd, BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks, Tesseract* osd_tess, OSResults* osr) { - int vertical_x = 0; - int vertical_y = 1; - TabVector_LIST v_lines; - TabVector_LIST h_lines; - ICOORD bleft(0, 0); - Boxa* boxa = NULL; - Pixa* pixa = NULL; + if (textord_debug_images) { + WriteDebugBackgroundImage(textord_debug_printable, pix_binary_); + } + Pix* photomask_pix = NULL; + Pix* musicmask_pix = NULL; // The blocks made by the ColumnFinder. Moved to blocks before return. BLOCK_LIST found_blocks; + TO_BLOCK_LIST temp_blocks; - if (pix_binary_ != NULL) { - if (textord_debug_images) { - Pix* grey_pix = pixCreate(pixGetWidth(pix_binary_), - pixGetHeight(pix_binary_), 8); - // Printable images are light grey on white, but for screen display - // they are black on dark grey so the other colors show up well. - if (textord_debug_printable) { - pixSetAll(grey_pix); - pixSetMasked(grey_pix, pix_binary_, 192); - } else { - pixSetAllArbitrary(grey_pix, 64); - pixSetMasked(grey_pix, pix_binary_, 0); - } - AlignedBlob::IncrementDebugPix(); - pixWrite(AlignedBlob::textord_debug_pix().string(), grey_pix, IFF_PNG); - pixDestroy(&grey_pix); + ColumnFinder* finder = SetupPageSegAndDetectOrientation( + single_column, osd, only_osd, blocks, osd_tess, osr, + &temp_blocks, &photomask_pix, &musicmask_pix); + if (finder != NULL) { + TO_BLOCK_IT to_block_it(&temp_blocks); + TO_BLOCK* to_block = to_block_it.data(); + if (musicmask_pix != NULL) { + // TODO(rays) pass the musicmask_pix into FindBlocks and mark music + // blocks separately. For now combine with photomask_pix. + pixOr(photomask_pix, photomask_pix, musicmask_pix); } - if (tessedit_dump_pageseg_images) { - pixWrite("tessinput.png", pix_binary_, IFF_PNG); + if (equ_detect_) { + finder->SetEquationDetect(equ_detect_); } - // Leptonica is used to find the lines and image regions in the input. 
- LineFinder::FindVerticalLines(resolution, pix_binary_, - &vertical_x, &vertical_y, &v_lines); - LineFinder::FindHorizontalLines(resolution, pix_binary_, &h_lines); - if (tessedit_dump_pageseg_images) - pixWrite("tessnolines.png", pix_binary_, IFF_PNG); - ImageFinder::FindImages(pix_binary_, &boxa, &pixa); - if (tessedit_dump_pageseg_images) - pixWrite("tessnoimages.png", pix_binary_, IFF_PNG); - if (single_column) - v_lines.clear(); - } - - TO_BLOCK_LIST port_blocks; - // The rest of the algorithm uses the usual connected components. - textord_.find_components(pix_binary_, blocks, &port_blocks); - - TO_BLOCK_IT to_block_it(&port_blocks); - ASSERT_HOST(!to_block_it.empty()); - for (to_block_it.mark_cycle_pt(); !to_block_it.cycled_list(); - to_block_it.forward()) { - TO_BLOCK* to_block = to_block_it.data(); - TBOX blkbox = to_block->block->bounding_box(); - if (to_block->line_size >= 2) { - // Note: if there are multiple blocks, then v_lines, boxa, and pixa - // are empty on the next iteration, but in this case, we assume - // that there aren't any interesting line separators or images, since - // it means that we have a pre-defined unlv zone file. - ColumnFinder finder(static_cast(to_block->line_size), - blkbox.botleft(), blkbox.topright(), resolution, - &v_lines, &h_lines, vertical_x, vertical_y); - BLOBNBOX_CLIST osd_blobs; - int osd_orientation = 0; - bool vertical_text = finder.IsVerticallyAlignedText(to_block, &osd_blobs); - if (osd && osd_tess != NULL && osr != NULL) { - os_detect_blobs(&osd_blobs, osr, osd_tess); - if (only_osd) continue; - osd_orientation = osr->best_result.orientation_id; - double osd_score = osr->orientations[osd_orientation]; - double osd_margin = min_orientation_margin * 2; - // tprintf("Orientation scores:"); - for (int i = 0; i < 4; ++i) { - if (i != osd_orientation && - osd_score - osr->orientations[i] < osd_margin) { - osd_margin = osd_score - osr->orientations[i]; - } - // tprintf(" %d:%f", i, osr->orientations[i]); - } - // tprintf("\n"); - if (osd_margin < min_orientation_margin) { - // Margin insufficient - dream up a suitable default. - if (vertical_text && (osd_orientation & 1)) - osd_orientation = 3; - else - osd_orientation = 0; - tprintf("Score margin insufficient:%.2f, using %d as a default\n", - osd_margin, osd_orientation); - } - } - osd_blobs.shallow_clear(); - finder.CorrectOrientation(to_block, vertical_text, osd_orientation); - if (finder.FindBlocks(single_column, pixGetHeight(pix_binary_), - to_block, boxa, pixa, &found_blocks, to_blocks) < 0) - return -1; - finder.GetDeskewVectors(&deskew_, &reskew_); - boxa = NULL; - pixa = NULL; + if (finder->FindBlocks(single_column, scaled_color_, scaled_factor_, + to_block, photomask_pix, + &found_blocks, to_blocks) < 0) { + pixDestroy(&photomask_pix); + pixDestroy(&musicmask_pix); + return -1; } + finder->GetDeskewVectors(&deskew_, &reskew_); + delete finder; } - boxaDestroy(&boxa); - pixaDestroy(&pixa); - if (only_osd) return 0; - + pixDestroy(&photomask_pix); + pixDestroy(&musicmask_pix); blocks->clear(); BLOCK_IT block_it(blocks); // Move the found blocks to the input/output blocks. @@ -339,4 +265,116 @@ int Tesseract::AutoPageSeg(int resolution, bool single_column, return 0; } +/** + * Sets up auto page segmentation, determines the orientation, and corrects it. + * Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to + * facilitate testing. 
+ * photo_mask_pix is a pointer to a NULL pointer that will be filled on return + * with the leptonica photo mask, which must be pixDestroyed by the caller. + * to_blocks is an empty list that will be filled with (usually a single) + * block that is used during layout analysis. This ugly API is required + * because of the possibility of a unlv zone file. + * TODO(rays) clean this up. + * See AutoPageSeg for other arguments. + * The returned ColumnFinder must be deleted after use. + */ +ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation( + bool single_column, bool osd, bool only_osd, + BLOCK_LIST* blocks, Tesseract* osd_tess, OSResults* osr, + TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix, Pix** music_mask_pix) { + int vertical_x = 0; + int vertical_y = 1; + TabVector_LIST v_lines; + TabVector_LIST h_lines; + ICOORD bleft(0, 0); + + ASSERT_HOST(pix_binary_ != NULL); + if (tessedit_dump_pageseg_images) { + pixWrite("tessinput.png", pix_binary_, IFF_PNG); + } + // Leptonica is used to find the rule/separator lines in the input. + LineFinder::FindAndRemoveLines(source_resolution_, + textord_tabfind_show_vlines, pix_binary_, + &vertical_x, &vertical_y, music_mask_pix, + &v_lines, &h_lines); + if (tessedit_dump_pageseg_images) + pixWrite("tessnolines.png", pix_binary_, IFF_PNG); + // Leptonica is used to find a mask of the photo regions in the input. + *photo_mask_pix = ImageFind::FindImages(pix_binary_); + if (tessedit_dump_pageseg_images) + pixWrite("tessnoimages.png", pix_binary_, IFF_PNG); + if (single_column) + v_lines.clear(); + + // The rest of the algorithm uses the usual connected components. + textord_.find_components(pix_binary_, blocks, to_blocks); + + TO_BLOCK_IT to_block_it(to_blocks); + // There must be exactly one input block. + // TODO(rays) handle new textline finding with a UNLV zone file. + ASSERT_HOST(to_blocks->singleton()); + TO_BLOCK* to_block = to_block_it.data(); + TBOX blkbox = to_block->block->bounding_box(); + ColumnFinder* finder = NULL; + + if (to_block->line_size >= 2) { + finder = new ColumnFinder(static_cast(to_block->line_size), + blkbox.botleft(), blkbox.topright(), + source_resolution_, + &v_lines, &h_lines, vertical_x, vertical_y); + + finder->SetupAndFilterNoise(*photo_mask_pix, to_block); + + if (equ_detect_) { + equ_detect_->LabelSpecialText(to_block); + } + + BLOBNBOX_CLIST osd_blobs; + // osd_orientation is the number of 90 degree rotations to make the + // characters upright. (See osdetect.h for precise definition.) + // We want the text lines horizontal, (vertical text indicates vertical + // textlines) which may conflict (eg vertically written CJK). + int osd_orientation = 0; + bool vertical_text = finder->IsVerticallyAlignedText(to_block, &osd_blobs); + if (osd && osd_tess != NULL && osr != NULL) { + os_detect_blobs(&osd_blobs, osr, osd_tess); + if (only_osd) { + delete finder; + return NULL; + } + osd_orientation = osr->best_result.orientation_id; + double osd_score = osr->orientations[osd_orientation]; + double osd_margin = min_orientation_margin * 2; + for (int i = 0; i < 4; ++i) { + if (i != osd_orientation && + osd_score - osr->orientations[i] < osd_margin) { + osd_margin = osd_score - osr->orientations[i]; + } + } + if (osd_margin < min_orientation_margin) { + // The margin is weak. 
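+        // Decide whether to trust the OSD result anyway: vertical text and
+        // CJK scripts make unusual orientations plausible, but a weak vote
+        // for upside-down horizontal non-CJK text is almost certainly noise.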
+        int best_script_id = osr->best_result.script_id;
+        bool cjk = (best_script_id == osd_tess->unicharset.han_sid()) ||
+            (best_script_id == osd_tess->unicharset.hiragana_sid()) ||
+            (best_script_id == osd_tess->unicharset.katakana_sid());
+
+        if (!cjk && !vertical_text && osd_orientation == 2) {
+          // Upside-down Latin text is improbable with such a weak margin.
+          tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
+                  "Don't rotate.\n", osd_margin);
+          osd_orientation = 0;
+        } else {
+          tprintf("OSD: Weak margin (%.2f) for %d blob text block, "
+                  "but using orientation anyway: %d\n",
+                  osd_margin, osd_blobs.length(), osd_orientation);
+        }
+      }
+    }
+    osd_blobs.shallow_clear();
+    finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
+  }
+
+  return finder;
+}
+
 }  // namespace tesseract.
diff --git a/ccmain/paragraphs.cpp b/ccmain/paragraphs.cpp
new file mode 100644
index 0000000000..856513087b
--- /dev/null
+++ b/ccmain/paragraphs.cpp
@@ -0,0 +1,2444 @@
+/**********************************************************************
+ * File: paragraphs.cpp
+ * Description: Paragraph detection for tesseract.
+ * Author: David Eger
+ * Created: 25 February 2011
+ *
+ * (C) Copyright 2011, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <ctype.h>
+
+#include "genericvector.h"
+#include "helpers.h"
+#include "mutableiterator.h"
+#include "ocrpara.h"
+#include "pageres.h"
+#include "paragraphs.h"
+#include "paragraphs_internal.h"
+#include "publictypes.h"
+#include "ratngs.h"
+#include "rect.h"
+#include "statistc.h"
+#include "strngs.h"
+#include "tprintf.h"
+#include "unicharset.h"
+#include "unicodes.h"
+
+namespace tesseract {
+
+// The tab vectors for a given line should be ignored if both its tab vectors
+// are infrequent, specifically, if both tab vectors appear at most once per
+// kStrayLinePer lines in a block.
+const int kStrayLinePer = 6;
+
+// Special "weak" ParagraphModels.
+const ParagraphModel *kCrownLeft
+    = reinterpret_cast<const ParagraphModel *>(0xDEAD111F);
+const ParagraphModel *kCrownRight
+    = reinterpret_cast<const ParagraphModel *>(0xDEAD888F);
+
+// Given the width of a typical space between words, what is the threshold
+// by which we think left and right alignments for paragraphs can vary and
+// still be aligned.
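+// For example, a typical interword space of 10 pixels gives Epsilon = 8,
+// so indents differing by up to about 8 pixels are treated as equal.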
+static int Epsilon(int space_pix) {
+  return space_pix * 4 / 5;
+}
+
+template <typename T>
+void SimpleSwap(T &a, T &b) {
+  T c = a;
+  a = b;
+  b = c;
+}
+
+static bool AcceptableRowArgs(
+    int debug_level, int min_num_rows, const char *function_name,
+    const GenericVector<RowScratchRegisters> *rows,
+    int row_start, int row_end) {
+  if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
+    tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n",
+            row_start, row_end, rows->size());
+    return false;
+  }
+  if (row_end - row_start < min_num_rows) {
+    if (debug_level > 1) {
+      tprintf("# Too few rows[%d, %d) for %s.\n",
+              row_start, row_end, function_name);
+    }
+    return false;
+  }
+  return true;
+}
+
+// =============================== Debug Code ================================
+
+// Convert an integer to a decimal string.
+static STRING StrOf(int num) {
+  char buffer[30];
+  snprintf(buffer, sizeof(buffer), "%d", num);
+  return STRING(buffer);
+}
+
+// Given a row-major matrix of unicode text and a column separator, print
+// a formatted table. For ASCII, we get good column alignment.
+static void PrintTable(const GenericVector<GenericVector<STRING> > &rows,
+                       const STRING &colsep) {
+  GenericVector<int> max_col_widths;
+  for (int r = 0; r < rows.size(); r++) {
+    int num_columns = rows[r].size();
+    for (int c = 0; c < num_columns; c++) {
+      int num_unicodes = 0;
+      for (int i = 0; i < rows[r][c].size(); i++) {
+        if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++;
+      }
+      if (c >= max_col_widths.size()) {
+        max_col_widths.push_back(num_unicodes);
+      } else {
+        if (num_unicodes > max_col_widths[c])
+          max_col_widths[c] = num_unicodes;
+      }
+    }
+  }
+
+  GenericVector<STRING> col_width_patterns;
+  for (int c = 0; c < max_col_widths.size(); c++) {
+    col_width_patterns.push_back(
+        STRING("%-") + StrOf(max_col_widths[c]) + "s");
+  }
+
+  for (int r = 0; r < rows.size(); r++) {
+    for (int c = 0; c < rows[r].size(); c++) {
+      if (c > 0)
+        tprintf("%s", colsep.string());
+      tprintf(col_width_patterns[c].string(), rows[r][c].string());
+    }
+    tprintf("\n");
+  }
+}
+
+STRING RtlEmbed(const STRING &word, bool rtlify) {
+  if (rtlify)
+    return STRING(kRLE) + word + STRING(kPDF);
+  return word;
+}
+
+// Print the current thoughts of the paragraph detector.
+static void PrintDetectorState(const ParagraphTheory &theory,
+                               const GenericVector<RowScratchRegisters> &rows) {
+  GenericVector<GenericVector<STRING> > output;
+  output.push_back(GenericVector<STRING>());
+  output.back().push_back("#row");
+  output.back().push_back("space");
+  output.back().push_back("..");
+  output.back().push_back("lword[widthSEL]");
+  output.back().push_back("rword[widthSEL]");
+  RowScratchRegisters::AppendDebugHeaderFields(&output.back());
+  output.back().push_back("text");
+
+  for (int i = 0; i < rows.size(); i++) {
+    output.push_back(GenericVector<STRING>());
+    GenericVector<STRING> &row = output.back();
+    const RowInfo& ri = *rows[i].ri_;
+    row.push_back(StrOf(i));
+    row.push_back(StrOf(ri.average_interword_space));
+    row.push_back(ri.has_leaders ? ".." : " ");
+    row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) +
+                  "[" + StrOf(ri.lword_box.width()) +
+                  (ri.lword_likely_starts_idea ? "S" : "s") +
+                  (ri.lword_likely_ends_idea ? "E" : "e") +
+                  (ri.lword_indicates_list_item ? "L" : "l") +
+                  "]");
+    row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) +
+                  "[" + StrOf(ri.rword_box.width()) +
+                  (ri.rword_likely_starts_idea ? "S" : "s") +
+                  (ri.rword_likely_ends_idea ? "E" : "e") +
+                  (ri.rword_indicates_list_item ? "L" : "l") +
+                  "]");
+    rows[i].AppendDebugInfo(theory, &row);
+    row.push_back(RtlEmbed(ri.text, !ri.ltr));
+  }
+  PrintTable(output, " ");
+
+  tprintf("Active Paragraph Models:\n");
+  for (int m = 0; m < theory.models().size(); m++) {
+    tprintf(" %d: %s\n", m + 1, theory.models()[m]->ToString().string());
+  }
+}
+
+static void DebugDump(
+    bool should_print,
+    const STRING &phase,
+    const ParagraphTheory &theory,
+    const GenericVector<RowScratchRegisters> &rows) {
+  if (!should_print)
+    return;
+  tprintf("# %s\n", phase.string());
+  PrintDetectorState(theory, rows);
+}
+
+// Print out the text for rows[row_start, row_end)
+static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows,
+                          int row_start, int row_end) {
+  tprintf("======================================\n");
+  for (int row = row_start; row < row_end; row++) {
+    tprintf("%s\n", rows[row].ri_->text.string());
+  }
+  tprintf("======================================\n");
+}
+
+// ============= Brain Dead Language Model (ASCII Version) ===================
+
+bool IsLatinLetter(int ch) {
+  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
+}
+
+bool IsDigitLike(int ch) {
+  return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
+}
+
+bool IsOpeningPunct(int ch) {
+  return strchr("'\"({[", ch) != NULL;
+}
+
+bool IsTerminalPunct(int ch) {
+  return strchr(":'\".?!]})", ch) != NULL;
+}
+
+// Return a pointer past any leading characters from the given set (or
+// matching the given predicate).
+const char *SkipChars(const char *str, const char *toskip) {
+  while (*str != '\0' && strchr(toskip, *str)) { str++; }
+  return str;
+}
+
+const char *SkipChars(const char *str, bool (*skip)(int)) {
+  while (*str != '\0' && skip(*str)) { str++; }
+  return str;
+}
+
+const char *SkipOne(const char *str, const char *toskip) {
+  if (*str != '\0' && strchr(toskip, *str)) return str + 1;
+  return str;
+}
+
+// Return whether it is very likely that this is a numeral marker that could
+// start a list item. Some examples include:
+//   A  I  iii.  VI  (2)  3.5.  [C-4]
bool LikelyListNumeral(const STRING &word) {
+  const char *kRomans = "ivxlmdIVXLMD";
+  const char *kDigits = "012345789";
+  const char *kOpen = "[{(";
+  const char *kSep = ":;-.,";
+  const char *kClose = "]})";
+
+  int num_segments = 0;
+  const char *pos = word.string();
+  while (*pos != '\0' && num_segments < 3) {
+    // skip up to two open parens.
+    const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
+    const char *numeral_end = SkipChars(numeral_start, kRomans);
+    if (numeral_end != numeral_start) {
+      // Got Roman Numeral. Great.
+    } else {
+      numeral_end = SkipChars(numeral_start, kDigits);
+      if (numeral_end == numeral_start) {
+        // If there's a single latin letter, we can use that.
+        numeral_end = SkipChars(numeral_start, IsLatinLetter);
+        if (numeral_end - numeral_start != 1)
+          break;
+      }
+    }
+    // We got some sort of numeral.
+    num_segments++;
+    // Skip any trailing parens or punctuation.
+    pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
+    if (pos == numeral_end)
+      break;
+  }
+  return *pos == '\0';
+}
+
+bool LikelyListMark(const STRING &word) {
+  const char *kListMarks = "0Oo*.,+.";
+  return word.size() == 1 && strchr(kListMarks, word[0]) != NULL;
+}
+
+bool AsciiLikelyListItem(const STRING &word) {
+  return LikelyListMark(word) || LikelyListNumeral(word);
+}
+
+// ========== Brain Dead Language Model (Tesseract Version) ================
+
+// Return the first Unicode Codepoint from werd[pos].
+int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) { + if (!u || !werd || pos > werd->length()) + return 0; + return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni(); +} + +// A useful helper class for finding the first j >= i so that word[j] +// does not have given character type. +class UnicodeSpanSkipper { + public: + UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word) + : u_(unicharset), word_(word) { wordlen_ = word->length(); } + + // Given an input position, return the first position >= pos not punc. + int SkipPunc(int pos); + // Given an input position, return the first position >= pos not digit. + int SkipDigits(int pos); + // Given an input position, return the first position >= pos not roman. + int SkipRomans(int pos); + // Given an input position, return the first position >= pos not alpha. + int SkipAlpha(int pos); + + private: + const UNICHARSET *u_; + const WERD_CHOICE *word_; + int wordlen_; +}; + +int UnicodeSpanSkipper::SkipPunc(int pos) { + while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) pos++; + return pos; +} + +int UnicodeSpanSkipper::SkipDigits(int pos) { + while (pos < wordlen_ && (u_->get_isdigit(word_->unichar_id(pos)) || + IsDigitLike(UnicodeFor(u_, word_, pos)))) pos++; + return pos; +} + +int UnicodeSpanSkipper::SkipRomans(int pos) { + const char *kRomans = "ivxlmdIVXLMD"; + while (pos < wordlen_) { + int ch = UnicodeFor(u_, word_, pos); + if (ch >= 0xF0 || strchr(kRomans, ch) == 0) break; + pos++; + } + return pos; +} + +int UnicodeSpanSkipper::SkipAlpha(int pos) { + while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) pos++; + return pos; +} + +bool LikelyListMarkUnicode(int ch) { + if (ch < 0x80) { + STRING single_ch; + single_ch += ch; + return LikelyListMark(single_ch); + } + switch (ch) { + // TODO(eger) expand this list of unicodes as needed. + case 0x00B0: // degree sign + case 0x2022: // bullet + case 0x25E6: // white bullet + case 0x00B7: // middle dot + case 0x25A1: // white square + case 0x25A0: // black square + case 0x25AA: // black small square + case 0x2B1D: // black very small square + case 0x25BA: // black right-pointing pointer + case 0x25CF: // black circle + case 0x25CB: // white circle + return true; + default: + break; // fall through + } + return false; +} + +// Return whether it is very likely that this is a numeral marker that could +// start a list item. Some examples include: +// A I iii. VI (2) 3.5. [C-4] +bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) { + if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0))) + return true; + + UnicodeSpanSkipper m(u, werd); + int num_segments = 0; + int pos = 0; + while (pos < werd->length() && num_segments < 3) { + int numeral_start = m.SkipPunc(pos); + if (numeral_start > pos + 1) break; + int numeral_end = m.SkipRomans(numeral_start); + if (numeral_end == numeral_start) { + numeral_end = m.SkipDigits(numeral_start); + if (numeral_end == numeral_start) { + // If there's a single latin letter, we can use that. + numeral_end = m.SkipAlpha(numeral_start); + if (numeral_end - numeral_start != 1) + break; + } + } + // We got some sort of numeral. + num_segments++; + // Skip any trailing punctuation. 
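+    // If no punctuation follows, the numeral is not terminated; stop scanning.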
+ pos = m.SkipPunc(numeral_end); + if (pos == numeral_end) + break; + } + return pos == werd->length(); +} + +// ========= Brain Dead Language Model (combined entry points) ================ + +// Given the leftmost word of a line either as a Tesseract unicharset + werd +// or a utf8 string, set the following attributes for it: +// is_list - this word might be a list number or bullet. +// starts_idea - this word is likely to start a sentence. +// ends_idea - this word is likely to end a sentence. +void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, + const STRING &utf8, + bool *is_list, bool *starts_idea, bool *ends_idea) { + *is_list = false; + *starts_idea = false; + *ends_idea = false; + if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) { // Empty + *ends_idea = true; + return; + } + + if (unicharset && werd) { // We have a proper werd and unicharset so use it. + if (UniLikelyListItem(unicharset, werd)) { + *is_list = true; + *starts_idea = true; + *ends_idea = true; + } + if (unicharset->get_isupper(werd->unichar_id(0))) { + *starts_idea = true; + } + if (unicharset->get_ispunctuation(werd->unichar_id(0))) { + *starts_idea = true; + *ends_idea = true; + } + } else { // Assume utf8 is mostly ASCII + if (AsciiLikelyListItem(utf8)) { + *is_list = true; + *starts_idea = true; + } + int start_letter = utf8[0]; + if (IsOpeningPunct(start_letter)) { + *starts_idea = true; + } + if (IsTerminalPunct(start_letter)) { + *ends_idea = true; + } + if (start_letter >= 'A' && start_letter <= 'Z') { + *starts_idea = true; + } + } +} + +// Given the rightmost word of a line either as a Tesseract unicharset + werd +// or a utf8 string, set the following attributes for it: +// is_list - this word might be a list number or bullet. +// starts_idea - this word is likely to start a sentence. +// ends_idea - this word is likely to end a sentence. +void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, + const STRING &utf8, + bool *is_list, bool *starts_idea, bool *ends_idea) { + *is_list = false; + *starts_idea = false; + *ends_idea = false; + if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) { // Empty + *ends_idea = true; + return; + } + + if (unicharset && werd) { // We have a proper werd and unicharset so use it. 
+    if (UniLikelyListItem(unicharset, werd)) {
+      *is_list = true;
+      *starts_idea = true;
+    }
+    UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
+    if (unicharset->get_ispunctuation(last_letter)) {
+      *ends_idea = true;
+    }
+  } else {  // Assume utf8 is mostly ASCII
+    if (AsciiLikelyListItem(utf8)) {
+      *is_list = true;
+      *starts_idea = true;
+    }
+    int last_letter = utf8[utf8.size() - 1];
+    if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
+      *ends_idea = true;
+    }
+  }
+}
+
+// =============== Implementation of RowScratchRegisters =====================
+/* static */
+void RowScratchRegisters::AppendDebugHeaderFields(
+    GenericVector<STRING> *header) {
+  header->push_back("[lmarg,lind;rind,rmarg]");
+  header->push_back("model");
+}
+
+void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
+                                          GenericVector<STRING> *dbg) const {
+  char s[30];
+  snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]",
+           lmargin_, lindent_, rindent_, rmargin_);
+  dbg->push_back(s);
+  STRING model_string;
+  model_string += static_cast<char>(GetLineType());
+  model_string += ":";
+
+  int model_numbers = 0;
+  for (int h = 0; h < hypotheses_.size(); h++) {
+    if (hypotheses_[h].model == NULL)
+      continue;
+    if (model_numbers > 0)
+      model_string += ",";
+    if (StrongModel(hypotheses_[h].model)) {
+      model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model));
+    } else if (hypotheses_[h].model == kCrownLeft) {
+      model_string += "CrL";
+    } else if (hypotheses_[h].model == kCrownRight) {
+      model_string += "CrR";
+    }
+    model_numbers++;
+  }
+  if (model_numbers == 0)
+    model_string += "0";
+
+  dbg->push_back(model_string);
+}
+
+void RowScratchRegisters::Init(const RowInfo &row) {
+  ri_ = &row;
+  lmargin_ = 0;
+  lindent_ = row.pix_ldistance;
+  rmargin_ = 0;
+  rindent_ = row.pix_rdistance;
+}
+
+LineType RowScratchRegisters::GetLineType() const {
+  if (hypotheses_.empty())
+    return LT_UNKNOWN;
+  bool has_start = false;
+  bool has_body = false;
+  for (int i = 0; i < hypotheses_.size(); i++) {
+    switch (hypotheses_[i].ty) {
+      case LT_START: has_start = true; break;
+      case LT_BODY: has_body = true; break;
+      default:
+        tprintf("Encountered bad value in hypothesis list: %c\n",
+                hypotheses_[i].ty);
+        break;
+    }
+  }
+  if (has_start && has_body)
+    return LT_MULTIPLE;
+  return has_start ? LT_START : LT_BODY;
+}
+
+LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const {
+  if (hypotheses_.empty())
+    return LT_UNKNOWN;
+  bool has_start = false;
+  bool has_body = false;
+  for (int i = 0; i < hypotheses_.size(); i++) {
+    if (hypotheses_[i].model != model)
+      continue;
+    switch (hypotheses_[i].ty) {
+      case LT_START: has_start = true; break;
+      case LT_BODY: has_body = true; break;
+      default:
+        tprintf("Encountered bad value in hypothesis list: %c\n",
+                hypotheses_[i].ty);
+        break;
+    }
+  }
+  if (has_start && has_body)
+    return LT_MULTIPLE;
+  return has_start ?
LT_START : LT_BODY; +} + +void RowScratchRegisters::SetStartLine() { + LineType current_lt = GetLineType(); + if (current_lt != LT_UNKNOWN && current_lt != LT_START) { + tprintf("Trying to set a line to be START when it's already BODY.\n"); + } + if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) { + hypotheses_.push_back_new(LineHypothesis(LT_START, NULL)); + } +} + +void RowScratchRegisters::SetBodyLine() { + LineType current_lt = GetLineType(); + if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) { + tprintf("Trying to set a line to be BODY when it's already START.\n"); + } + if (current_lt == LT_UNKNOWN || current_lt == LT_START) { + hypotheses_.push_back_new(LineHypothesis(LT_BODY, NULL)); + } +} + +void RowScratchRegisters::AddStartLine(const ParagraphModel *model) { + hypotheses_.push_back_new(LineHypothesis(LT_START, model)); + int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, NULL)); + if (old_idx >= 0) + hypotheses_.remove(old_idx); +} + +void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) { + hypotheses_.push_back_new(LineHypothesis(LT_BODY, model)); + int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, NULL)); + if (old_idx >= 0) + hypotheses_.remove(old_idx); +} + +void RowScratchRegisters::StartHypotheses(SetOfModels *models) const { + for (int h = 0; h < hypotheses_.size(); h++) { + if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model)) + models->push_back_new(hypotheses_[h].model); + } +} + +void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const { + for (int h = 0; h < hypotheses_.size(); h++) { + if (StrongModel(hypotheses_[h].model)) + models->push_back_new(hypotheses_[h].model); + } +} + +void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const { + for (int h = 0; h < hypotheses_.size(); h++) { + if (hypotheses_[h].model != NULL) + models->push_back_new(hypotheses_[h].model); + } +} + +const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const { + if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START) + return NULL; + return hypotheses_[0].model; +} + +const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const { + if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY) + return NULL; + return hypotheses_[0].model; +} + +// Discard any hypotheses whose model is not in the given list. +void RowScratchRegisters::DiscardNonMatchingHypotheses( + const SetOfModels &models) { + if (models.empty()) + return; + for (int h = hypotheses_.size() - 1; h >= 0; h--) { + if (!models.contains(hypotheses_[h].model)) { + hypotheses_.remove(h); + } + } +} + +// ============ Geometry based Paragraph Detection Algorithm ================= + +struct Cluster { + Cluster() : center(0), count(0) {} + Cluster(int cen, int num) : center(cen), count(num) {} + + int center; // The center of the cluster. + int count; // The number of entries within the cluster. +}; + +class SimpleClusterer { + public: + explicit SimpleClusterer(int max_cluster_width) + : max_cluster_width_(max_cluster_width) {} + void Add(int value) { values_.push_back(value); } + int size() const { return values_.size(); } + void GetClusters(GenericVector *clusters); + + private: + int max_cluster_width_; + GenericVectorEqEq values_; +}; + +// Return the index of the cluster closest to value. 
+int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
+  int best_index = 0;
+  for (int i = 0; i < clusters.size(); i++) {
+    if (abs(value - clusters[i].center) <
+        abs(value - clusters[best_index].center))
+      best_index = i;
+  }
+  return best_index;
+}
+
+void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) {
+  clusters->clear();
+  values_.sort();
+  for (int i = 0; i < values_.size();) {
+    int orig_i = i;
+    int lo = values_[i];
+    int hi = lo;
+    while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
+      hi = values_[i];
+    }
+    clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));
+  }
+}
+
+// Calculate left- and right-indent tab stop values seen in
+// rows[row_start, row_end) given a tolerance of tolerance.
+void CalculateTabStops(GenericVector<RowScratchRegisters> *rows,
+                       int row_start, int row_end,
+                       int tolerance,
+                       GenericVector<Cluster> *left_tabs,
+                       GenericVector<Cluster> *right_tabs) {
+  if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
+    return;
+  // First pass: toss all left and right indents into clusterers.
+  SimpleClusterer initial_lefts(tolerance);
+  SimpleClusterer initial_rights(tolerance);
+  GenericVector<Cluster> initial_left_tabs;
+  GenericVector<Cluster> initial_right_tabs;
+  for (int i = row_start; i < row_end; i++) {
+    initial_lefts.Add((*rows)[i].lindent_);
+    initial_rights.Add((*rows)[i].rindent_);
+  }
+  initial_lefts.GetClusters(&initial_left_tabs);
+  initial_rights.GetClusters(&initial_right_tabs);
+
+  // Second pass: cluster only lines that are not "stray"
+  //   An example of a stray line is a page number -- a line whose start
+  //   and end tab-stops are far outside the typical start and end tab-stops
+  //   for the block.
+  //   Put another way, we only cluster data from lines whose start or end
+  //   tab stop is frequent.
+  SimpleClusterer lefts(tolerance);
+  SimpleClusterer rights(tolerance);
+  int infrequent_enough_to_ignore = (row_end - row_start) / kStrayLinePer;
+  for (int i = row_start; i < row_end; i++) {
+    int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
+    int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
+    if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
+        initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
+      lefts.Add((*rows)[i].lindent_);
+      rights.Add((*rows)[i].rindent_);
+    }
+  }
+  lefts.GetClusters(left_tabs);
+  rights.GetClusters(right_tabs);
+}
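+
// Illustrative-only standalone sketch (not part of the patch) of the greedy
// clustering pass in SimpleClusterer::GetClusters() above, as used by
// CalculateTabStops(): sort the values, sweep left to right, and close a
// cluster as soon as the next value lies more than max_cluster_width past
// the cluster's lowest member. All numbers here are made up.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> values = {50, 0, 1, 49, 0, 2, 1};  // e.g. left indents, px
  const int max_cluster_width = 4;
  std::sort(values.begin(), values.end());
  for (size_t i = 0; i < values.size();) {
    size_t orig_i = i;
    int lo = values[i];
    int hi = lo;
    while (++i < values.size() && values[i] <= lo + max_cluster_width) {
      hi = values[i];
    }
    // Prints "center 1 count 5" then "center 49 count 2": two tab stops.
    printf("center %d count %d\n", (hi + lo) / 2,
           static_cast<int>(i - orig_i));
  }
  return 0;
}

+// Given a paragraph model, mark rows[row_start, row_end) as start or body
+// lines of said model.
+//
+// Case 1: model->first_indent_ != model->body_indent_
+//   Differentiating the paragraph start lines from the paragraph body lines
+//   in this case is easy: we just see how far each line is indented.
+//
+// Case 2: model->first_indent_ == model->body_indent_
+//   Here, we find end-of-paragraph lines by looking for "short lines."
+//   What constitutes a "short line" changes depending on whether the text
+//   is ragged-right[left] or fully justified (aligned left and right).
+//
+// Case 2a: Ragged Right (or Left) text.  (eop_threshold == 0)
+//   We have a new paragraph if the first word of this line would have fit
+//   at the end of the previous line.
+//
+// Case 2b: Fully Justified.  (eop_threshold > 0)
+//   We mark a line as short (end of paragraph) if the offside indent
+//   is greater than eop_threshold.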
+void MarkRowsWithModel(GenericVector *rows, + int row_start, int row_end, + const ParagraphModel *model, + bool ltr, + int eop_threshold) { + if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) + return; + for (int row = row_start; row < row_end; row++) { + bool valid_first = ValidFirstLine(rows, row, model); + bool valid_body = ValidBodyLine(rows, row, model); + if (valid_first && !valid_body) { + (*rows)[row].AddStartLine(model); + } else if (valid_body && !valid_first) { + (*rows)[row].AddBodyLine(model); + } else if (valid_body && valid_first) { + bool after_eop = (row == row_start); + if (row > row_start) { + if (eop_threshold > 0) { + if (model->justification() == JUSTIFICATION_LEFT) { + after_eop = (*rows)[row - 1].rindent_ > eop_threshold; + } else { + after_eop = (*rows)[row - 1].lindent_ > eop_threshold; + } + } else { + after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row], + model->justification()); + } + } + if (after_eop) { + (*rows)[row].AddStartLine(model); + } else { + (*rows)[row].AddBodyLine(model); + } + } else { + // Do nothing. Stray row. + } + } +} + +// GeometricClassifierState holds all of the information we'll use while +// trying to determine a paragraph model for the text lines in a block of +// text: +// + the rows under consideration [row_start, row_end) +// + the common left- and right-indent tab stops +// + does the block start out left-to-right or right-to-left +// Further, this struct holds the data we amass for the (single) ParagraphModel +// we'll assign to the text lines (assuming we get that far). +struct GeometricClassifierState { + GeometricClassifierState(int dbg_level, + GenericVector *r, + int r_start, int r_end) + : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end), + margin(0) { + tolerance = InterwordSpace(*r, r_start, r_end); + CalculateTabStops(r, r_start, r_end, tolerance, + &left_tabs, &right_tabs); + ltr = (*r)[r_start].ri_->ltr; + } + + void AssumeLeftJustification() { + just = tesseract::JUSTIFICATION_LEFT; + margin = (*rows)[row_start].lmargin_; + } + + void AssumeRightJustification() { + just = tesseract::JUSTIFICATION_RIGHT; + margin = (*rows)[row_start].rmargin_; + } + + // Align tabs are the tab stops the text is aligned to. + const GenericVector &AlignTabs() const { + if (just == tesseract::JUSTIFICATION_RIGHT) return right_tabs; + return left_tabs; + } + + // Offside tabs are the tab stops opposite the tabs used to align the text. + // + // Note that for a left-to-right text which is aligned to the right such as + // this function comment, the offside tabs are the horizontal tab stops + // marking the beginning of ("Note", "this" and "marking"). + const GenericVector &OffsideTabs() const { + if (just == tesseract::JUSTIFICATION_RIGHT) return left_tabs; + return right_tabs; + } + + // Return whether the i'th row extends from the leftmost left tab stop + // to the right most right tab stop. + bool IsFullRow(int i) const { + return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 && + ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0; + } + + int AlignsideTabIndex(int row_idx) const { + return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just)); + } + + // Given what we know about the paragraph justification (just), would the + // first word of row_b have fit at the end of row_a? 
+ bool FirstWordWouldHaveFit(int row_a, int row_b) { + return ::tesseract::FirstWordWouldHaveFit( + (*rows)[row_a], (*rows)[row_b], just); + } + + void PrintRows() const { PrintRowRange(*rows, row_start, row_end); } + + void Fail(int min_debug_level, const char *why) const { + if (debug_level < min_debug_level) return; + tprintf("# %s\n", why); + PrintRows(); + } + + ParagraphModel Model() const { + return ParagraphModel(just, margin, first_indent, body_indent, tolerance); + } + + // We print out messages with a debug level at least as great as debug_level. + int debug_level; + + // The Geometric Classifier was asked to find a single paragraph model + // to fit the text rows (*rows)[row_start, row_end) + GenericVector *rows; + int row_start; + int row_end; + + // The amount by which we expect the text edge can vary and still be aligned. + int tolerance; + + // Is the script in this text block left-to-right? + // HORRIBLE ROUGH APPROXIMATION. TODO(eger): Improve + bool ltr; + + // These left and right tab stops were determined to be the common tab + // stops for the given text. + GenericVector left_tabs; + GenericVector right_tabs; + + // These are parameters we must determine to create a ParagraphModel. + tesseract::ParagraphJustification just; + int margin; + int first_indent; + int body_indent; + + // eop_threshold > 0 if the text is fully justified. See MarkRowsWithModel() + int eop_threshold; +}; + +// Given a section of text where strong textual clues did not help identifying +// paragraph breaks, and for which the left and right indents have exactly +// three tab stops between them, attempt to find the paragraph breaks based +// solely on the outline of the text and whether the script is left-to-right. +// +// Algorithm Detail: +// The selected rows are in the form of a rectangle except +// for some number of "short lines" of the same length: +// +// (A1) xxxxxxxxxxxxx (B1) xxxxxxxxxxxx +// xxxxxxxxxxx xxxxxxxxxx # A "short" line. +// xxxxxxxxxxxxx xxxxxxxxxxxx +// xxxxxxxxxxxxx xxxxxxxxxxxx +// +// We have a slightly different situation if the only short +// line is at the end of the excerpt. +// +// (A2) xxxxxxxxxxxxx (B2) xxxxxxxxxxxx +// xxxxxxxxxxxxx xxxxxxxxxxxx +// xxxxxxxxxxxxx xxxxxxxxxxxx +// xxxxxxxxxxx xxxxxxxxxx # A "short" line. +// +// We'll interpret these as follows based on the reasoning in the comment for +// GeometricClassify(): +// [script direction: first indent, body indent] +// (A1) LtR: 2,0 RtL: 0,0 (B1) LtR: 0,0 RtL: 2,0 +// (A2) LtR: 2,0 RtL: CrR (B2) LtR: CrL RtL: 2,0 +void GeometricClassifyThreeTabStopTextBlock( + int debug_level, + GeometricClassifierState &s, + ParagraphTheory *theory) { + int num_rows = s.row_end - s.row_start; + int num_full_rows = 0; + int last_row_full = 0; + for (int i = s.row_start; i < s.row_end; i++) { + if (s.IsFullRow(i)) { + num_full_rows++; + if (i == s.row_end - 1) last_row_full++; + } + } + + if (num_full_rows < 0.7 * num_rows) { + s.Fail(1, "Not enough full lines to know which lines start paras."); + return; + } + + // eop_threshold gets set if we're fully justified; see MarkRowsWithModel() + s.eop_threshold = 0; + + if (s.ltr) { + s.AssumeLeftJustification(); + } else { + s.AssumeRightJustification(); + } + + if (debug_level > 0) { + tprintf("# Not enough variety for clear outline classification. " + "Guessing these are %s aligned based on script.\n", + s.ltr ? 
"left" : "right"); + s.PrintRows(); + } + + if (s.AlignTabs().size() == 2) { // case A1 or A2 + s.first_indent = s.AlignTabs()[1].center; + s.body_indent = s.AlignTabs()[0].center; + } else { // case B1 or B2 + if (num_rows - 1 == num_full_rows - last_row_full) { + // case B2 + const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight; + (*s.rows)[s.row_start].AddStartLine(model); + for (int i = s.row_start + 1; i < s.row_end; i++) { + (*s.rows)[i].AddBodyLine(model); + } + return; + } else { + // case B1 + s.first_indent = s.body_indent = s.AlignTabs()[0].center; + s.eop_threshold = (s.OffsideTabs()[0].center + + s.OffsideTabs()[1].center) / 2; + } + } + const ParagraphModel *model = theory->AddModel(s.Model()); + MarkRowsWithModel(s.rows, s.row_start, s.row_end, model, + s.ltr, s.eop_threshold); + return; +} + +// This function is called if strong textual clues were not available, but +// the caller hopes that the paragraph breaks will be super obvious just +// by the outline of the text. +// +// The particularly difficult case is figuring out what's going on if you +// don't have enough short paragraph end lines to tell us what's going on. +// +// For instance, let's say you have the following outline: +// +// (A1) xxxxxxxxxxxxxxxxxxxxxx +// xxxxxxxxxxxxxxxxxxxx +// xxxxxxxxxxxxxxxxxxxxxx +// xxxxxxxxxxxxxxxxxxxxxx +// +// Even if we know that the text is left-to-right and so will probably be +// left-aligned, both of the following are possible texts: +// +// (A1a) 1. Here our list item +// with two full lines. +// 2. Here a second item. +// 3. Here our third one. +// +// (A1b) so ends paragraph one. +// Here starts another +// paragraph we want to +// read. This continues +// +// These examples are obvious from the text and should have been caught +// by the StrongEvidenceClassify pass. However, for languages where we don't +// have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese), +// it's worth guessing that (A1b) is the correct interpretation if there are +// far more "full" lines than "short" lines. +void GeometricClassify(int debug_level, + GenericVector *rows, + int row_start, int row_end, + ParagraphTheory *theory) { + if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) + return; + if (debug_level > 1) { + tprintf("###############################################\n"); + tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n", + row_start, row_end); + tprintf("###############################################\n"); + } + RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10); + + GeometricClassifierState s(debug_level, rows, row_start, row_end); + if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) { + s.Fail(2, "Too much variety for simple outline classification."); + return; + } + if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) { + s.Fail(1, "Not enough variety for simple outline classification."); + return; + } + if (s.left_tabs.size() + s.right_tabs.size() == 3) { + GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory); + return; + } + + // At this point, we know that one side has at least two tab stops, and the + // other side has one or two tab stops. + // Left to determine: + // (1) Which is the body indent and which is the first line indent? + // (2) Is the text fully justified? + + // If one side happens to have three or more tab stops, assume that side + // is opposite of the aligned side. 
+ if (s.right_tabs.size() > 2) { + s.AssumeLeftJustification(); + } else if (s.left_tabs.size() > 2) { + s.AssumeRightJustification(); + } else if (s.ltr) { // guess based on script direction + s.AssumeLeftJustification(); + } else { + s.AssumeRightJustification(); + } + + if (s.AlignTabs().size() == 2) { + // For each tab stop on the aligned side, how many of them appear + // to be paragraph start lines? [first lines] + int firsts[2] = {0, 0}; + // Count the first line as a likely paragraph start line. + firsts[s.AlignsideTabIndex(s.row_start)]++; + // For each line, if the first word would have fit on the previous + // line count it as a likely paragraph start line. + for (int i = s.row_start + 1; i < s.row_end; i++) { + if (s.FirstWordWouldHaveFit(i - 1, i)) { + firsts[s.AlignsideTabIndex(i)]++; + } + } + // Make an extra accounting for the last line of the paragraph just + // in case it's the only short line in the block. That is, take its + // first word as typical and see if this looks like the *last* line + // of a paragraph. If so, mark the *other* indent as probably a first. + if (s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) { + firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++; + } + + int percent0firsts, percent1firsts; + percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count; + percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count; + + // TODO(eger): Tune these constants if necessary. + if ((percent0firsts < 20 && 30 < percent1firsts) || + percent0firsts + 30 < percent1firsts) { + s.first_indent = s.AlignTabs()[1].center; + s.body_indent = s.AlignTabs()[0].center; + } else if ((percent1firsts < 20 && 30 < percent0firsts) || + percent1firsts + 30 < percent0firsts) { + s.first_indent = s.AlignTabs()[0].center; + s.body_indent = s.AlignTabs()[1].center; + } else { + // Ambiguous! Probably lineated (poetry) + if (debug_level > 1) { + tprintf("# Cannot determine %s indent likely to start paragraphs.\n", + s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right"); + tprintf("# Indent of %d looks like a first line %d%% of the time.\n", + s.AlignTabs()[0].center, percent0firsts); + tprintf("# Indent of %d looks like a first line %d%% of the time.\n", + s.AlignTabs()[1].center, percent1firsts); + s.PrintRows(); + } + return; + } + } else { + // There's only one tab stop for the "aligned to" side. + s.first_indent = s.body_indent = s.AlignTabs()[0].center; + } + + // At this point, we have our model. + const ParagraphModel *model = theory->AddModel(s.Model()); + + // Now all we have to do is figure out if the text is fully justified or not. + // eop_threshold: default to fully justified unless we see evidence below. + // See description on MarkRowsWithModel() + s.eop_threshold = + (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2; + // If the text is not fully justified, re-set the eop_threshold to 0. + if (s.AlignTabs().size() == 2) { + // Paragraphs with a paragraph-start indent. + for (int i = s.row_start; i < s.row_end - 1; i++) { + if (ValidFirstLine(s.rows, i + 1, model) && + !NearlyEqual(s.OffsideTabs()[0].center, + (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) { + // We found a non-end-of-paragraph short line: not fully justified. + s.eop_threshold = 0; + break; + } + } + } else { + // Paragraphs with no paragraph-start indent. 
+ for (int i = s.row_start; i < s.row_end - 1; i++) { + if (!s.FirstWordWouldHaveFit(i, i + 1) && + !NearlyEqual(s.OffsideTabs()[0].center, + (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) { + // We found a non-end-of-paragraph short line: not fully justified. + s.eop_threshold = 0; + break; + } + } + } + MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold); +} + +// =============== Implementation of ParagraphTheory ===================== + +const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) { + for (int i = 0; i < models_->size(); i++) { + if ((*models_)[i]->Comparable(model)) + return (*models_)[i]; + } + ParagraphModel *m = new ParagraphModel(model); + models_->push_back(m); + models_we_added_.push_back_new(m); + return m; +} + +void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) { + for (int i = models_->size() - 1; i >= 0; i--) { + ParagraphModel *m = (*models_)[i]; + if (!used_models.contains(m) && models_we_added_.contains(m)) { + delete m; + models_->remove(i); + models_we_added_.remove(models_we_added_.get_index(m)); + } + } +} + +// Examine rows[start, end) and try to determine if an existing non-centered +// paragraph model would fit them perfectly. If so, return a pointer to it. +// If not, return NULL. +const ParagraphModel *ParagraphTheory::Fits( + const GenericVector *rows, int start, int end) const { + for (int m = 0; m < models_->size(); m++) { + const ParagraphModel *model = (*models_)[m]; + if (model->justification() != JUSTIFICATION_CENTER && + RowsFitModel(rows, start, end, model)) + return model; + } + return NULL; +} + +void ParagraphTheory::NonCenteredModels(SetOfModels *models) { + for (int m = 0; m < models_->size(); m++) { + const ParagraphModel *model = (*models_)[m]; + if (model->justification() != JUSTIFICATION_CENTER) + models->push_back_new(model); + } +} + +int ParagraphTheory::IndexOf(const ParagraphModel *model) const { + for (int i = 0; i < models_->size(); i++) { + if ((*models_)[i] == model) + return i; + } + return -1; +} + +bool ValidFirstLine(const GenericVector *rows, + int row, const ParagraphModel *model) { + if (!StrongModel(model)) { + tprintf("ValidFirstLine() should only be called with strong models!\n"); + } + return StrongModel(model) && + model->ValidFirstLine( + (*rows)[row].lmargin_, (*rows)[row].lindent_, + (*rows)[row].rindent_, (*rows)[row].rmargin_); +} + +bool ValidBodyLine(const GenericVector *rows, + int row, const ParagraphModel *model) { + if (!StrongModel(model)) { + tprintf("ValidBodyLine() should only be called with strong models!\n"); + } + return StrongModel(model) && + model->ValidBodyLine( + (*rows)[row].lmargin_, (*rows)[row].lindent_, + (*rows)[row].rindent_, (*rows)[row].rmargin_); +} + +bool CrownCompatible(const GenericVector *rows, + int a, int b, const ParagraphModel *model) { + if (model != kCrownRight && model != kCrownLeft) { + tprintf("CrownCompatible() should only be called with crown models!\n"); + return false; + } + RowScratchRegisters &row_a = (*rows)[a]; + RowScratchRegisters &row_b = (*rows)[b]; + if (model == kCrownRight) { + return NearlyEqual(row_a.rindent_ + row_a.rmargin_, + row_b.rindent_ + row_b.rmargin_, + Epsilon(row_a.ri_->average_interword_space)); + } + return NearlyEqual(row_a.lindent_ + row_a.lmargin_, + row_b.lindent_ + row_b.lmargin_, + Epsilon(row_a.ri_->average_interword_space)); +} + + +// =============== Implementation of ParagraphModelSmearer ==================== + +ParagraphModelSmearer::ParagraphModelSmearer( + 
GenericVector<RowScratchRegisters> *rows,
+    int row_start, int row_end, ParagraphTheory *theory)
+        : theory_(theory), rows_(rows), row_start_(row_start),
+          row_end_(row_end) {
+  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
+    row_start_ = 0;
+    row_end_ = 0;
+    return;
+  }
+  SetOfModels no_models;
+  for (int row = row_start - 1; row <= row_end; row++) {
+    open_models_.push_back(no_models);
+  }
+}
+
+// see paragraphs_internal.h
+void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
+  SetOfModels no_models;
+  if (row_start < row_start_) row_start = row_start_;
+  if (row_end > row_end_) row_end = row_end_;
+
+  for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end;
+       row++) {
+    if ((*rows_)[row].ri_->num_words == 0) {
+      OpenModels(row + 1) = no_models;
+    } else {
+      SetOfModels &opened = OpenModels(row);
+      (*rows_)[row].StartHypotheses(&opened);
+
+      // Which models survive the transition from row to row + 1?
+      SetOfModels still_open;
+      for (int m = 0; m < opened.size(); m++) {
+        if (ValidFirstLine(rows_, row, opened[m]) ||
+            ValidBodyLine(rows_, row, opened[m])) {
+          // This is basic filtering; we check likely paragraph starty-ness
+          // down below in Smear() -- you know, whether the first word would
+          // have fit and such.
+          still_open.push_back_new(opened[m]);
+        }
+      }
+      OpenModels(row + 1) = still_open;
+    }
+  }
+}
+
+// see paragraphs_internal.h
+void ParagraphModelSmearer::Smear() {
+  CalculateOpenModels(row_start_, row_end_);
+
+  // For each row which we're unsure about (that is, it is LT_UNKNOWN or
+  // we have multiple LT_START hypotheses), see if there's a model that
+  // was recently used (an "open" model) which might model it well.
+  for (int i = row_start_; i < row_end_; i++) {
+    RowScratchRegisters &row = (*rows_)[i];
+    if (row.ri_->num_words == 0)
+      continue;
+
+    // Step One:
+    //   Figure out if there are "open" models which are left-aligned or
+    //   right-aligned.  This is important for determining whether the
+    //   "first" word in a row would fit at the "end" of the previous row.
+    bool left_align_open = false;
+    bool right_align_open = false;
+    for (int m = 0; m < OpenModels(i).size(); m++) {
+      switch (OpenModels(i)[m]->justification()) {
+        case JUSTIFICATION_LEFT: left_align_open = true; break;
+        case JUSTIFICATION_RIGHT: right_align_open = true; break;
+        default: left_align_open = right_align_open = true;
+      }
+    }
+    // Step Two:
+    //   Use that knowledge to figure out if this row is likely to
+    //   start a paragraph.
+    bool likely_start;
+    if (i == 0) {
+      likely_start = true;
+    } else {
+      if ((left_align_open && right_align_open) ||
+          (!left_align_open && !right_align_open)) {
+        likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
+                                            JUSTIFICATION_LEFT) ||
+                       LikelyParagraphStart((*rows_)[i - 1], row,
+                                            JUSTIFICATION_RIGHT);
+      } else if (left_align_open) {
+        likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
+                                            JUSTIFICATION_LEFT);
+      } else {
+        likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
+                                            JUSTIFICATION_RIGHT);
+      }
+    }
+
+    // Step Three:
+    //   If this text line seems like an obvious first line of an
+    //   open model, or an obvious continuation of an existing
+    //   modelled paragraph, mark it up.
+    if (likely_start) {
+      // Add Start Hypotheses for all Open models that fit.
+      for (int m = 0; m < OpenModels(i).size(); m++) {
+        if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
+          row.AddStartLine(OpenModels(i)[m]);
+        }
+      }
+    } else {
+      // Add relevant body line hypotheses.
+ SetOfModels last_line_models; + if (i > 0) { + (*rows_)[i - 1].StrongHypotheses(&last_line_models); + } else { + theory_->NonCenteredModels(&last_line_models); + } + for (int m = 0; m < last_line_models.size(); m++) { + const ParagraphModel *model = last_line_models[m]; + if (ValidBodyLine(rows_, i, model)) + row.AddBodyLine(model); + } + } + + // Step Four: + // If we're still quite unsure about this line, go through all + // models in our theory and see if this row could be the start + // of any of our models. + if (row.GetLineType() == LT_UNKNOWN || + (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) { + SetOfModels all_models; + theory_->NonCenteredModels(&all_models); + for (int m = 0; m < all_models.size(); m++) { + if (ValidFirstLine(rows_, i, all_models[m])) { + row.AddStartLine(all_models[m]); + } + } + } + // Step Five: + // Since we may have updated the hypotheses about this row, we need + // to recalculate the Open models for the rest of rows[i + 1, row_end) + if (row.GetLineType() != LT_UNKNOWN) { + CalculateOpenModels(i + 1, row_end_); + } + } +} + +// ================ Main Paragraph Detection Algorithm ======================= + +// Find out what ParagraphModels are actually used, and discard any +// that are not. +void DiscardUnusedModels(const GenericVector &rows, + ParagraphTheory *theory) { + SetOfModels used_models; + for (int i = 0; i < rows.size(); i++) { + rows[i].StrongHypotheses(&used_models); + } + theory->DiscardUnusedModels(used_models); +} + +// DowngradeWeakestToCrowns: +// Forget any flush-{left, right} models unless we see two or more +// of them in sequence. +// +// In pass 3, we start to classify even flush-left paragraphs (paragraphs +// where the first line and body indent are the same) as having proper Models. +// This is generally dangerous, since if you start imagining that flush-left +// is a typical paragraph model when it is not, it will lead you to chop normal +// indented paragraphs in the middle whenever a sentence happens to start on a +// new line (see "This" above). What to do? +// What we do is to take any paragraph which is flush left and is not +// preceded by another paragraph of the same model and convert it to a "Crown" +// paragraph. This is a weak pseudo-ParagraphModel which is a placeholder +// for later. It means that the paragraph is flush, but it would be desirable +// to mark it as the same model as following text if it fits. This downgrade +// FlushLeft -> CrownLeft -> Model of following paragraph. Means that we +// avoid making flush left Paragraph Models whenever we see a top-of-the-page +// half-of-a-paragraph. and instead we mark it the same as normal body text. +// +// Implementation: +// +// Comb backwards through the row scratch registers, and turn any +// sequences of body lines of equivalent type abutted against the beginning +// or a body or start line of a different type into a crown paragraph. +void DowngradeWeakestToCrowns(int debug_level, + ParagraphTheory *theory, + GenericVector *rows) { + int start; + for (int end = rows->size(); end > 0; end = start) { + // Search back for a body line of a unique type. + const ParagraphModel *model = NULL; + while (end > 0 && + (model = (*rows)[end - 1].UniqueBodyHypothesis()) == NULL) { + end--; + } + if (end == 0) break; + start = end - 1; + while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) { + start--; // walk back to the first line that is not the same body type. 
+ } + if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model && + StrongModel(model) && + NearlyEqual(model->first_indent(), model->body_indent(), + model->tolerance())) { + start--; + } + start++; + // Now rows[start, end) is a sequence of unique body hypotheses of model. + if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER) + continue; + if (!StrongModel(model)) { + while (start > 0 && + CrownCompatible(rows, start - 1, start, model)) + start--; + } + if (start == 0 || + (!StrongModel(model)) || + (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) { + // crownify rows[start, end) + const ParagraphModel *crown_model = model; + if (StrongModel(model)) { + if (model->justification() == JUSTIFICATION_LEFT) + crown_model = kCrownLeft; + else + crown_model = kCrownRight; + } + (*rows)[start].SetUnknown(); + (*rows)[start].AddStartLine(crown_model); + for (int row = start + 1; row < end; row++) { + (*rows)[row].SetUnknown(); + (*rows)[row].AddBodyLine(crown_model); + } + } + } + DiscardUnusedModels(*rows, theory); +} + + +// Clear all hypotheses about lines [start, end) and reset margins. +// +// The empty space between the left of a row and the block boundary (and +// similarly for the right) is split into two pieces: margin and indent. +// In initial processing, we assume the block is tight and the margin for +// all lines is set to zero. However, if our first pass does not yield +// models for everything, it may be due to an inset paragraph like a +// block-quote. In that case, we make a second pass over that unmarked +// section of the page and reset the "margin" portion of the empty space +// to the common amount of space at the ends of the lines under consid- +// eration. This would be equivalent to percentile set to 0. However, +// sometimes we have a single character sticking out in the right margin +// of a text block (like the 'r' in 'for' on line 3 above), and we can +// really just ignore it as an outlier. To express this, we allow the +// user to specify the percentile (0..100) of indent values to use as +// the common margin for each row in the run of rows[start, end). +void RecomputeMarginsAndClearHypotheses( + GenericVector *rows, int start, int end, + int percentile) { + if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) + return; + + int lmin, lmax, rmin, rmax; + lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_; + rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_; + for (int i = start; i < end; i++) { + RowScratchRegisters &sr = (*rows)[i]; + sr.SetUnknown(); + if (sr.ri_->num_words == 0) + continue; + UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax); + UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax); + } + STATS lefts(lmin, lmax + 1); + STATS rights(rmin, rmax + 1); + for (int i = start; i < end; i++) { + RowScratchRegisters &sr = (*rows)[i]; + if (sr.ri_->num_words == 0) + continue; + lefts.add(sr.lmargin_ + sr.lindent_, 1); + rights.add(sr.rmargin_ + sr.rindent_, 1); + } + int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0); + int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0); + for (int i = start; i < end; i++) { + RowScratchRegisters &sr = (*rows)[i]; + int ldelta = ignorable_left - sr.lmargin_; + sr.lmargin_ += ldelta; + sr.lindent_ -= ldelta; + int rdelta = ignorable_right - sr.rmargin_; + sr.rmargin_ += rdelta; + sr.rindent_ -= rdelta; + } +} + +// Return the minimum inter-word space in rows[row_start, row_end). 
+int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
+                   int row_start, int row_end) {
+  if (row_end < row_start + 1) return 1;
+  bool legit = false;
+  int natural_space = rows[row_start].ri_->average_interword_space;
+  for (int i = row_start; i < row_end; i++) {
+    if (rows[i].ri_->num_words > 1) {
+      if (!legit) {
+        natural_space = rows[i].ri_->average_interword_space;
+        legit = true;
+      } else {
+        if (rows[i].ri_->average_interword_space < natural_space)
+          natural_space = rows[i].ri_->average_interword_space;
+      }
+    }
+  }
+  return natural_space;
+}
+
+// Return whether the first word on the after line can fit in the space at
+// the end of the before line (knowing which way the text is aligned and read).
+bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
+                           const RowScratchRegisters &after,
+                           tesseract::ParagraphJustification justification) {
+  if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
+    return true;
+
+  if (justification == JUSTIFICATION_UNKNOWN) {
+    tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
+  }
+  int available_space;
+  if (justification == JUSTIFICATION_CENTER) {
+    available_space = before.lindent_ + before.rindent_;
+  } else {
+    available_space = before.OffsideIndent(justification);
+  }
+  available_space -= before.ri_->average_interword_space;
+
+  if (before.ri_->ltr)
+    return after.ri_->lword_box.width() < available_space;
+  return after.ri_->rword_box.width() < available_space;
+}
+
+// Return whether the first word on the after line can fit in the space at
+// the end of the before line (not knowing which way the text goes) in a left
+// or right alignment.
+bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
+                           const RowScratchRegisters &after) {
+  if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
+    return true;
+
+  int available_space = before.lindent_;
+  if (before.rindent_ > available_space)
+    available_space = before.rindent_;
+  available_space -= before.ri_->average_interword_space;
+
+  if (before.ri_->ltr)
+    return after.ri_->lword_box.width() < available_space;
+  return after.ri_->rword_box.width() < available_space;
+}
+
+bool TextSupportsBreak(const RowScratchRegisters &before,
+                       const RowScratchRegisters &after) {
+  if (before.ri_->ltr) {
+    return before.ri_->rword_likely_ends_idea &&
+           after.ri_->lword_likely_starts_idea;
+  } else {
+    return before.ri_->lword_likely_ends_idea &&
+           after.ri_->rword_likely_starts_idea;
+  }
+}
+
+bool LikelyParagraphStart(const RowScratchRegisters &before,
+                          const RowScratchRegisters &after) {
+  return before.ri_->num_words == 0 ||
+      (FirstWordWouldHaveFit(before, after) &&
+       TextSupportsBreak(before, after));
+}
+
+bool LikelyParagraphStart(const RowScratchRegisters &before,
+                          const RowScratchRegisters &after,
+                          tesseract::ParagraphJustification j) {
+  return before.ri_->num_words == 0 ||
+      (FirstWordWouldHaveFit(before, after, j) &&
+       TextSupportsBreak(before, after));
+}
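+
// Illustrative-only numeric sketch (not part of the patch) of the fit test in
// FirstWordWouldHaveFit() above for left-justified LTR text: the next line's
// first word fits if it is narrower than the previous line's offside (right)
// indent minus one average inter-word space. The struct and all numbers below
// are made up for the example.
#include <cstdio>

struct LineSketch {
  int rindent;           // empty space at the line's right edge, px
  int interword_space;   // average inter-word space on the line, px
  int first_word_width;  // width of the line's first word, px
};

static bool FirstWordFits(const LineSketch &before, const LineSketch &after) {
  int available = before.rindent - before.interword_space;
  return after.first_word_width < available;
}

int main() {
  LineSketch before = {120, 12, 0};  // line ends 120px short of the edge
  LineSketch after = {0, 12, 90};    // next line starts with a 90px word
  // 90 < 120 - 12, so the word would have fit: evidence of a paragraph break.
  printf("fits: %s\n", FirstWordFits(before, after) ? "yes" : "no");
  return 0;
}

+// Examine rows[start, end) and try to determine what sort of ParagraphModel
+// would fit them as a single paragraph.
+// If we can't produce a unique model, justification_ = JUSTIFICATION_UNKNOWN.
+// If the rows given could be a consistent start to a paragraph, set
+// *consistent true.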
+ParagraphModel InternalParagraphModelByOutline( + const GenericVector *rows, + int start, int end, int tolerance, bool *consistent) { + int ltr_line_count = 0; + for (int i = start; i < end; i++) { + ltr_line_count += static_cast((*rows)[i].ri_->ltr); + } + bool ltr = (ltr_line_count >= (end - start) / 2); + + *consistent = true; + if (!AcceptableRowArgs(0, 2, __func__, rows, start, end)) + return ParagraphModel(); + + // Ensure the caller only passed us a region with a common rmargin and + // lmargin. + int lmargin = (*rows)[start].lmargin_; + int rmargin = (*rows)[start].rmargin_; + int lmin, lmax, rmin, rmax, cmin, cmax; + lmin = lmax = (*rows)[start + 1].lindent_; + rmin = rmax = (*rows)[start + 1].rindent_; + cmin = cmax = 0; + for (int i = start + 1; i < end; i++) { + if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) { + tprintf("Margins don't match! Software error.\n"); + *consistent = false; + return ParagraphModel(); + } + UpdateRange((*rows)[i].lindent_, &lmin, &lmax); + UpdateRange((*rows)[i].rindent_, &rmin, &rmax); + UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax); + } + int ldiff = lmax - lmin; + int rdiff = rmax - rmin; + int cdiff = cmax - cmin; + if (rdiff > tolerance && ldiff > tolerance) { + if (cdiff < tolerance * 2) { + if (end - start < 3) + return ParagraphModel(); + return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance); + } + *consistent = false; + return ParagraphModel(); + } + if (end - start < 3) // Don't return a model for two line paras. + return ParagraphModel(); + + // These booleans keep us from saying something is aligned left when the body + // left variance is too large. + bool body_admits_left_alignment = ldiff < tolerance; + bool body_admits_right_alignment = rdiff < tolerance; + + ParagraphModel left_model = + ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_, + (lmin + lmax) / 2, tolerance); + ParagraphModel right_model = + ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_, + (rmin + rmax) / 2, tolerance); + + // These booleans keep us from having an indent on the "wrong side" for the + // first line. + bool text_admits_left_alignment = ltr || left_model.is_flush(); + bool text_admits_right_alignment = !ltr || right_model.is_flush(); + + // At least one of the edges is less than tolerance in variance. + // If the other is obviously ragged, it can't be the one aligned to. + // [Note the last line is included in this raggedness.] + if (tolerance < rdiff) { + if (body_admits_left_alignment && text_admits_left_alignment) + return left_model; + *consistent = false; + return ParagraphModel(); + } + if (tolerance < ldiff) { + if (body_admits_right_alignment && text_admits_right_alignment) + return right_model; + *consistent = false; + return ParagraphModel(); + } + + // At this point, we know the body text doesn't vary much on either side. + + // If the first line juts out oddly in one direction or the other, + // that likely indicates the side aligned to. + int first_left = (*rows)[start].lindent_; + int first_right = (*rows)[start].rindent_; + + if (ltr && body_admits_left_alignment && + (first_left < lmin || first_left > lmax)) + return left_model; + if (!ltr && body_admits_right_alignment && + (first_right < rmin || first_right > rmax)) + return right_model; + + *consistent = false; + return ParagraphModel(); +} + +// Examine rows[start, end) and try to determine what sort of ParagraphModel +// would fit them as a single paragraph. 
If nothing fits, +// justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug +// output if we're debugging. +ParagraphModel ParagraphModelByOutline( + int debug_level, + const GenericVector *rows, + int start, int end, int tolerance) { + bool unused_consistent; + ParagraphModel retval = InternalParagraphModelByOutline( + rows, start, end, tolerance, &unused_consistent); + if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) { + tprintf("Could not determine a model for this paragraph:\n"); + PrintRowRange(*rows, start, end); + } + return retval; +} + +// Do rows[start, end) form a single instance of the given paragraph model? +bool RowsFitModel(const GenericVector *rows, + int start, int end, const ParagraphModel *model) { + if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) + return false; + if (!ValidFirstLine(rows, start, model)) return false; + for (int i = start + 1 ; i < end; i++) { + if (!ValidBodyLine(rows, i, model)) return false; + } + return true; +} + +// Examine rows[row_start, row_end) as an independent section of text, +// and mark rows that are exceptionally clear as start-of-paragraph +// and paragraph-body lines. +// +// We presume that any lines surrounding rows[row_start, row_end) may +// have wildly different paragraph models, so we don't key any data off +// of those lines. +// +// We only take the very strongest signals, as we don't want to get +// confused and marking up centered text, poetry, or source code as +// clearly part of a typical paragraph. +void MarkStrongEvidence(GenericVector *rows, + int row_start, int row_end) { + // Record patently obvious body text. + for (int i = row_start + 1; i < row_end; i++) { + const RowScratchRegisters &prev = (*rows)[i - 1]; + RowScratchRegisters &curr = (*rows)[i]; + tesseract::ParagraphJustification typical_justification = + prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; + if (!curr.ri_->rword_likely_starts_idea && + !curr.ri_->lword_likely_starts_idea && + !FirstWordWouldHaveFit(prev, curr, typical_justification)) { + curr.SetBodyLine(); + } + } + + // Record patently obvious start paragraph lines. + // + // It's an extremely good signal of the start of a paragraph that + // the first word would have fit on the end of the previous line. + // However, applying just that signal would have us mark random + // start lines of lineated text (poetry and source code) and some + // centered headings as paragraph start lines. Therefore, we use + // a second qualification for a paragraph start: Not only should + // the first word of this line have fit on the previous line, + // but also, this line should go full to the right of the block, + // disallowing a subsequent word from having fit on this line. + + // First row: + { + RowScratchRegisters &curr = (*rows)[row_start]; + RowScratchRegisters &next = (*rows)[row_start + 1]; + tesseract::ParagraphJustification j = + curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; + if (curr.GetLineType() == LT_UNKNOWN && + !FirstWordWouldHaveFit(curr, next, j) && + (curr.ri_->lword_likely_starts_idea || + curr.ri_->rword_likely_starts_idea)) { + curr.SetStartLine(); + } + } + // Middle rows + for (int i = row_start + 1; i < row_end - 1; i++) { + RowScratchRegisters &prev = (*rows)[i - 1]; + RowScratchRegisters &curr = (*rows)[i]; + RowScratchRegisters &next = (*rows)[i + 1]; + tesseract::ParagraphJustification j = + curr.ri_->ltr ? 
JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; + if (curr.GetLineType() == LT_UNKNOWN && + !FirstWordWouldHaveFit(curr, next, j) && + LikelyParagraphStart(prev, curr, j)) { + curr.SetStartLine(); + } + } + // Last row + { // the short circuit at the top means we have at least two lines. + RowScratchRegisters &prev = (*rows)[row_end - 2]; + RowScratchRegisters &curr = (*rows)[row_end - 1]; + tesseract::ParagraphJustification j = + curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; + if (curr.GetLineType() == LT_UNKNOWN && + !FirstWordWouldHaveFit(curr, curr, j) && + LikelyParagraphStart(prev, curr, j)) { + curr.SetStartLine(); + } + } +} + +// Look for sequences of a start line followed by some body lines in +// rows[row_start, row_end) and create ParagraphModels for them if +// they seem coherent. +void ModelStrongEvidence(int debug_level, + GenericVector *rows, + int row_start, int row_end, + bool allow_flush_models, + ParagraphTheory *theory) { + if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) + return; + + int start = row_start; + while (start < row_end) { + while (start < row_end && (*rows)[start].GetLineType() != LT_START) + start++; + if (start >= row_end - 1) + break; + + int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space); + int end = start; + ParagraphModel last_model; + bool next_consistent; + do { + ++end; + // rows[row, end) was consistent. + // If rows[row, end + 1) is not consistent, + // just model rows[row, end) + if (end < row_end - 1) { + RowScratchRegisters &next = (*rows)[end]; + LineType lt = next.GetLineType(); + next_consistent = lt == LT_BODY || + (lt == LT_UNKNOWN && + !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end])); + } else { + next_consistent = false; + } + if (next_consistent) { + ParagraphModel next_model = InternalParagraphModelByOutline( + rows, start, end + 1, tolerance, &next_consistent); + if (((*rows)[start].ri_->ltr && + last_model.justification() == JUSTIFICATION_LEFT && + next_model.justification() != JUSTIFICATION_LEFT) || + (!(*rows)[start].ri_->ltr && + last_model.justification() == JUSTIFICATION_RIGHT && + next_model.justification() != JUSTIFICATION_RIGHT)) { + next_consistent = false; + } + last_model = next_model; + } else { + next_consistent = false; + } + } while (next_consistent && end < row_end); + // At this point, rows[start, end) looked like it could have been a + // single paragraph. If we can make a good ParagraphModel for it, + // do so and mark this sequence with that model. + if (end > start + 1) { + // emit a new paragraph if we have more than one line. + const ParagraphModel *model = NULL; + ParagraphModel new_model = ParagraphModelByOutline( + debug_level, rows, start, end, + Epsilon(InterwordSpace(*rows, start, end))); + if (new_model.justification() == JUSTIFICATION_UNKNOWN) { + // couldn't create a good model, oh well. + } else if (new_model.is_flush()) { + if (end == start + 2) { + // It's very likely we just got two paragraph starts in a row. + end = start + 1; + } else if (start == row_start) { + // Mark this as a Crown. 
+          if (new_model.justification() == JUSTIFICATION_LEFT) {
+            model = kCrownLeft;
+          } else {
+            model = kCrownRight;
+          }
+        } else if (allow_flush_models) {
+          model = theory->AddModel(new_model);
+        }
+      } else {
+        model = theory->AddModel(new_model);
+      }
+      if (model) {
+        (*rows)[start].AddStartLine(model);
+        for (int i = start + 1; i < end; i++) {
+          (*rows)[i].AddBodyLine(model);
+        }
+      }
+    }
+    start = end;
+  }
+}
+
+// We examine rows[row_start, row_end) and do the following:
+//   (1) Clear all existing hypotheses for the rows being considered.
+//   (2) Mark up any rows as exceptionally likely to be paragraph starts
+//       or paragraph body lines as such using both geometric and textual
+//       clues.
+//   (3) Form models for any sequence of start + continuation lines.
+//   (4) Smear the paragraph models to cover surrounding text.
+void StrongEvidenceClassify(int debug_level,
+                            GenericVector<RowScratchRegisters> *rows,
+                            int row_start, int row_end,
+                            ParagraphTheory *theory) {
+  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
+    return;
+
+  if (debug_level > 1) {
+    tprintf("#############################################\n");
+    tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
+    tprintf("#############################################\n");
+  }
+
+  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
+  MarkStrongEvidence(rows, row_start, row_end);
+
+  DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);
+
+  // Create paragraph models.
+  ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
+
+  DebugDump(debug_level > 2, "Unsmeared hypotheses.", *theory, *rows);
+
+  // At this point, some rows are marked up as paragraphs with model numbers,
+  // and some rows are marked up as either LT_START or LT_BODY.  Now let's
+  // smear any good paragraph hypotheses forward and backward.
+  ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
+  smearer.Smear();
+}
+
+void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows,
+                               int row_start, int row_end,
+                               ParagraphTheory *theory) {
+  for (int i = row_start + 1; i < row_end - 1; i++) {
+    if ((*rows)[i - 1].ri_->has_leaders &&
+        (*rows)[i].ri_->has_leaders &&
+        (*rows)[i + 1].ri_->has_leaders) {
+      const ParagraphModel *model = theory->AddModel(
+          ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
+      (*rows)[i].AddStartLine(model);
+    }
+  }
+}
+
+// Collect sequences of unique hypotheses in row registers and create proper
+// paragraphs for them, referencing the paragraphs in row_owners.
+void ConvertHypothesizedModelRunsToParagraphs(
+    int debug_level,
+    const GenericVector<RowScratchRegisters> &rows,
+    GenericVector<PARA *> *row_owners,
+    ParagraphTheory *theory) {
+  int end = rows.size();
+  int start;
+  for (; end > 0; end = start) {
+    start = end - 1;
+    const ParagraphModel *model = NULL;
+    // TODO(eger): Be smarter about dealing with multiple hypotheses.
+    bool single_line_paragraph = false;
+    SetOfModels models;
+    rows[start].NonNullHypotheses(&models);
+    if (models.size() > 0) {
+      model = models[0];
+      if (rows[start].GetLineType(model) != LT_BODY)
+        single_line_paragraph = true;
+    }
+    if (model && !single_line_paragraph) {
+      // walk back looking for more body lines and then a start line.
+      while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
+        // do nothing
+      }
+      if (start < 0 || rows[start].GetLineType(model) != LT_START) {
+        model = NULL;
+      }
+    }
+    if (model == NULL) {
+      continue;
+    }
+    // rows[start, end) should be a paragraph.
+    PARA *p = new PARA();
+    if (model == kCrownLeft || model == kCrownRight) {
+      p->is_very_first_or_continuation = true;
+      // Crown paragraph.
+      //   If we can find an existing ParagraphModel that fits, use it,
+      //   else create a new one.
+      for (int row = end; row < rows.size(); row++) {
+        if ((*row_owners)[row] &&
+            (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
+             (start == 0 ||
+              ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
+          model = (*row_owners)[row]->model;
+          break;
+        }
+      }
+      if (model == kCrownLeft) {
+        // No subsequent model fits, so cons one up.
+        model = theory->AddModel(ParagraphModel(
+            JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_,
+            0, 0, Epsilon(rows[start].ri_->average_interword_space)));
+      } else if (model == kCrownRight) {
+        // No subsequent model fits, so cons one up.
+        model = theory->AddModel(ParagraphModel(
+            JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rindent_,
+            0, 0, Epsilon(rows[start].ri_->average_interword_space)));
+      }
+    }
+    rows[start].SetUnknown();
+    rows[start].AddStartLine(model);
+    for (int i = start + 1; i < end; i++) {
+      rows[i].SetUnknown();
+      rows[i].AddBodyLine(model);
+    }
+    p->model = model;
+    p->has_drop_cap = rows[start].ri_->has_drop_cap;
+    p->is_list_item =
+        model->justification() == JUSTIFICATION_RIGHT
+            ? rows[start].ri_->rword_indicates_list_item
+            : rows[start].ri_->lword_indicates_list_item;
+    for (int row = start; row < end; row++) {
+      if ((*row_owners)[row] != NULL) {
+        tprintf("Memory leak! ConvertHypothesizedModelRunsToParagraphs() "
+                "called more than once!\n");
+      }
+      (*row_owners)[row] = p;
+    }
+  }
+}
+
+struct Interval {
+  Interval() : begin(0), end(0) {}
+  Interval(int b, int e) : begin(b), end(e) {}
+
+  int begin;
+  int end;
+};
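+
// Illustrative-only standalone sketch (not part of the patch) of the run test
// in RowIsStranded() below, with a single hypothesized model encoded as a
// string of line types: 'S' start, 'B' body, 'U' unknown. A row survives only
// if its run of consecutive S/B rows is longer than 2, or is longer than 1
// and contains a body line. All names here are hypothetical.
#include <cstdio>

static bool IsStranded(const char *types, int row) {
  bool all_starts = types[row] == 'S';
  int run_length = 1;
  for (int i = row - 1; i >= 0 && types[i] != 'U'; i--) {
    run_length++;
    if (types[i] != 'S') all_starts = false;
  }
  for (int i = row + 1; types[i] != '\0' && types[i] != 'U'; i++) {
    run_length++;
    if (types[i] != 'S') all_starts = false;
  }
  return !(run_length > 2 || (!all_starts && run_length > 1));
}

int main() {
  printf("%d\n", IsStranded("USU", 1));   // 1: isolated start line is weak
  printf("%d\n", IsStranded("USSU", 1));  // 1: two starts, never continued
  printf("%d\n", IsStranded("USBU", 1));  // 0: start followed by body is fine
  return 0;
}

+// Return whether rows[row] appears to be stranded, meaning that the evidence
+// for this row is very weak due to context.  For instance, two lines of source
+// code may happen to be indented at the same tab vector as body text starts,
+// leading us to think they are two start-of-paragraph lines.  This is not
+// optimal.  However, we also don't want to mark a sequence of short dialog
+// as "weak," so our heuristic is:
+//   (1) If a line is surrounded by lines of unknown type, it's weak.
+//   (2) If two lines in a row are start lines for a given paragraph type, but
+//       after that the same paragraph type does not continue, they're weak.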
+bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows, int row) {
+  SetOfModels row_models;
+  rows[row].StrongHypotheses(&row_models);
+
+  for (int m = 0; m < row_models.size(); m++) {
+    bool all_starts = rows[row].GetLineType(row_models[m]) == LT_START;
+    int run_length = 1;
+    bool continues = true;
+    for (int i = row - 1; i >= 0 && continues; i--) {
+      switch (rows[i].GetLineType(row_models[m])) {
+        case LT_START: run_length++; break;
+        case LT_MULTIPLE:  // explicit fall-through
+        case LT_BODY: run_length++; all_starts = false; break;
+        case LT_UNKNOWN:  // explicit fall-through
+        default: continues = false;
+      }
+    }
+    continues = true;
+    for (int i = row + 1; i < rows.size() && continues; i++) {
+      switch (rows[i].GetLineType(row_models[m])) {
+        case LT_START: run_length++; break;
+        case LT_MULTIPLE:  // explicit fall-through
+        case LT_BODY: run_length++; all_starts = false; break;
+        case LT_UNKNOWN:  // explicit fall-through
+        default: continues = false;
+      }
+    }
+    if (run_length > 2 || (!all_starts && run_length > 1)) return false;
+  }
+  return true;
+}
+
+// Go through rows[row_start, row_end) and gather up sequences that need better
+// classification.
+//   + Sequences of non-empty rows without hypotheses.
+//   + Crown paragraphs not immediately followed by a strongly modeled line.
+//   + Single line paragraphs surrounded by text that doesn't match the
+//     model.
+void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows,
+                      GenericVector<Interval> *to_fix,
+                      int row_start, int row_end) {
+  to_fix->clear();
+  for (int i = row_start; i < row_end; i++) {
+    bool needs_fixing = false;
+
+    SetOfModels models;
+    SetOfModels models_w_crowns;
+    rows[i].StrongHypotheses(&models);
+    rows[i].NonNullHypotheses(&models_w_crowns);
+    if (models.empty() && models_w_crowns.size() > 0) {
+      // Crown paragraph.  Is it followed by a modeled line?
+      for (int end = i + 1; end < rows.size(); end++) {
+        SetOfModels end_models;
+        SetOfModels strong_end_models;
+        rows[end].NonNullHypotheses(&end_models);
+        rows[end].StrongHypotheses(&strong_end_models);
+        if (end_models.size() == 0) {
+          needs_fixing = true;
+          break;
+        } else if (strong_end_models.size() > 0) {
+          needs_fixing = false;
+          break;
+        }
+      }
+    } else if (models.empty() && rows[i].ri_->num_words > 0) {
+      // No models at all.
+      needs_fixing = true;
+    }
+
+    if (!needs_fixing && !models.empty()) {
+      needs_fixing = RowIsStranded(rows, i);
+    }
+
+    if (needs_fixing) {
+      if (!to_fix->empty() && to_fix->back().end == i - 1)
+        to_fix->back().end = i;
+      else
+        to_fix->push_back(Interval(i, i));
+    }
+  }
+  // Convert inclusive intervals to half-open intervals.
+  for (int i = 0; i < to_fix->size(); i++) {
+    (*to_fix)[i].end = (*to_fix)[i].end + 1;
+  }
+}
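The interval bookkeeping at the end of LeftoverSegments() is the classic run-merging pattern; a self-contained sketch: flagged row indices accumulate into inclusive [begin, end] runs, which are then widened into half-open [begin, end) segments.

// Illustrative only; not part of this change.
#include <cstdio>
#include <vector>

struct Interval {
  Interval(int b, int e) : begin(b), end(e) {}
  int begin;
  int end;
};

int main() {
  bool needs_fixing[5] = {false, true, true, false, true};
  std::vector<Interval> to_fix;
  for (int i = 0; i < 5; i++) {
    if (!needs_fixing[i]) continue;
    if (!to_fix.empty() && to_fix.back().end == i - 1)
      to_fix.back().end = i;             // extend the current run
    else
      to_fix.push_back(Interval(i, i));  // start a new run
  }
  for (size_t j = 0; j < to_fix.size(); j++)
    to_fix[j].end++;                     // inclusive -> half-open
  for (size_t j = 0; j < to_fix.size(); j++)
    std::printf("[%d, %d)\n", to_fix[j].begin, to_fix[j].end);  // [1,3) [4,5)
  return 0;
}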
+
+// Given a set of row_owners pointing to PARAs or NULL (no paragraph known),
+// normalize each row_owner to point to an actual PARA, and output the
+// paragraphs in order onto paragraphs.
+void CanonicalizeDetectionResults(
+    GenericVector<PARA *> *row_owners,
+    PARA_LIST *paragraphs) {
+  GenericVector<PARA *> &rows = *row_owners;
+  paragraphs->clear();
+  PARA_IT out(paragraphs);
+  PARA *formerly_null = NULL;
+  for (int i = 0; i < rows.size(); i++) {
+    if (rows[i] == NULL) {
+      if (i == 0 || rows[i - 1] != formerly_null) {
+        rows[i] = formerly_null = new PARA();
+      } else {
+        rows[i] = formerly_null;
+        continue;
+      }
+    } else if (i > 0 && rows[i - 1] == rows[i]) {
+      continue;
+    }
+    out.add_after_then_move(rows[i]);
+  }
+}
+
+// Main entry point for Paragraph Detection Algorithm.
+//
+// Given a set of equally spaced textlines (described by row_infos),
+// split them into paragraphs.
+//
+// Output:
+//   row_owners - one pointer for each row, to the paragraph it belongs to.
+//   paragraphs - this is the actual list of PARA objects.
+//   models - the list of paragraph models referenced by the PARA objects.
+//            caller is responsible for deleting the models.
+void DetectParagraphs(int debug_level,
+                      GenericVector<RowInfo> *row_infos,
+                      GenericVector<PARA *> *row_owners,
+                      PARA_LIST *paragraphs,
+                      GenericVector<ParagraphModel *> *models) {
+  GenericVector<RowScratchRegisters> rows;
+  ParagraphTheory theory(models);
+
+  // Initialize row_owners to be a bunch of NULL pointers.
+  row_owners->init_to_size(row_infos->size(), NULL);
+
+  // Set up row scratch registers for the main algorithm.
+  rows.init_to_size(row_infos->size(), RowScratchRegisters());
+  for (int i = 0; i < row_infos->size(); i++) {
+    rows[i].Init((*row_infos)[i]);
+  }
+
+  // Pass 1:
+  //   Detect sequences of lines that all contain leader dots (.....)
+  //   These are likely Tables of Contents.  If there are three text lines in
+  //   a row with leader dots, it's pretty safe to say the middle one should
+  //   be a paragraph of its own.
+  SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);
+
+  DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
+
+  GenericVector<Interval> leftovers;
+  LeftoverSegments(rows, &leftovers, 0, rows.size());
+  for (int i = 0; i < leftovers.size(); i++) {
+    // Pass 2a:
+    //   Find any strongly evidenced start-of-paragraph lines.  If they're
+    //   followed by two lines that look like body lines, make a paragraph
+    //   model for that and see if that model applies throughout the text
+    //   (that is, "smear" it).
+    StrongEvidenceClassify(debug_level, &rows,
+                           leftovers[i].begin, leftovers[i].end, &theory);
+
+    // Pass 2b:
+    //   If we had any luck in pass 2a, we got part of the page and didn't
+    //   know how to classify a few runs of rows.  Take the segments that
+    //   didn't find a model and reprocess them individually.
+    GenericVector<Interval> leftovers2;
+    LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
+    bool pass2a_was_useful = leftovers2.size() > 1 ||
+        (leftovers2.size() == 1 &&
+         (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
+    if (pass2a_was_useful) {
+      for (int j = 0; j < leftovers2.size(); j++) {
+        StrongEvidenceClassify(debug_level, &rows,
+                               leftovers2[j].begin, leftovers2[j].end,
+                               &theory);
+      }
+    }
+  }
+
+  DebugDump(debug_level > 1, "End of Pass 2", theory, rows);
+
+  // Pass 3:
+  //   These are the dregs for which we didn't have enough strong textual
+  //   and geometric clues to form matching models for.  Let's see if
+  //   the geometric clues are simple enough that we could just use those.
+  LeftoverSegments(rows, &leftovers, 0, rows.size());
+  for (int i = 0; i < leftovers.size(); i++) {
+    GeometricClassify(debug_level, &rows,
+                      leftovers[i].begin, leftovers[i].end, &theory);
+  }
+  // Undo any flush models for which there's little evidence.
+  DowngradeWeakestToCrowns(debug_level, &theory, &rows);
+
+  DebugDump(debug_level > 1, "End of Pass 3", theory, rows);
+
+  // Pass 4:
+  //   Take everything that's still not marked up well and clear all markings.
+  LeftoverSegments(rows, &leftovers, 0, rows.size());
+  for (int i = 0; i < leftovers.size(); i++) {
+    for (int j = leftovers[i].begin; j < leftovers[i].end; j++) {
+      rows[j].SetUnknown();
+    }
+  }
+
+  DebugDump(debug_level > 1, "End of Pass 4", theory, rows);
+
+  // Convert all of the unique hypothesis runs to PARAs.
+  ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners,
+                                           &theory);
+
+  DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);
+
+  // Finally, clean up any dangling NULL row paragraph parents.
+  CanonicalizeDetectionResults(row_owners, paragraphs);
+}
+
+// ============ Code interfacing with the rest of Tesseract ==================
+
+// Given a Tesseract Iterator pointing to a text line, fill in the paragraph
+// detector RowInfo with all relevant information from the row.
+void InitializeRowInfo(const MutableIterator &it, RowInfo *info) {
+  if (it.PageResIt()->row() != NULL) {
+    ROW *row = it.PageResIt()->row()->row;
+    info->pix_ldistance = row->lmargin();
+    info->pix_rdistance = row->rmargin();
+    info->average_interword_space =
+        row->space() > 0 ? row->space() : MAX(row->x_height(), 1);
+    info->pix_xheight = row->x_height();
+    info->has_leaders = false;
+    info->has_drop_cap = row->has_drop_cap();
+    info->ltr = true;  // set below depending on word scripts
+  } else {
+    info->pix_ldistance = info->pix_rdistance = 0;
+    info->average_interword_space = 1;
+    info->pix_xheight = 1.0;
+    info->has_leaders = false;
+    info->has_drop_cap = false;
+    info->ltr = true;
+  }
+
+  info->text = "";
+  char *text = it.GetUTF8Text(RIL_TEXTLINE);
+  int num_nonws_chars = strlen(text);
+  // Strip trailing whitespace.
+  while (num_nonws_chars > 0 &&
+         isspace(static_cast<unsigned char>(text[num_nonws_chars - 1])))
+    num_nonws_chars--;
+  if (num_nonws_chars > 0) {
+    int lspaces = info->pix_ldistance / info->average_interword_space;
+    for (int i = 0; i < lspaces; i++)
+      info->text += ' ';
+    for (int i = 0; i < num_nonws_chars; i++)
+      info->text += text[i];
+  }
+  delete []text;
+
+  info->num_words = 0;
+  info->lword_indicates_list_item = false;
+  info->lword_likely_starts_idea = false;
+  info->lword_likely_ends_idea = false;
+  info->rword_indicates_list_item = false;
+  info->rword_likely_starts_idea = false;
+  info->rword_likely_ends_idea = false;
+
+  if (info->text.size() == 0) {
+    return;
+  }
+
+  int ltr = 0;
+  int rtl = 0;
+
+  PAGE_RES_IT page_res_it = *it.PageResIt();
+  GenericVector<WERD_RES *> werds;
+  WERD_RES *word_res = page_res_it.restart_row();
+  ROW_RES *this_row = page_res_it.row();
+  int num_leaders = 0;
+  do {
+    if (word_res && word_res->best_choice->unichar_string().length() > 0) {
+      werds.push_back(word_res);
+      ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
+      rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
+      if (word_res->word->flag(W_REP_CHAR)) num_leaders++;
+    }
+    word_res = page_res_it.forward();
+  } while (page_res_it.row() == this_row);
+
+  info->has_leaders = num_leaders > 3;
+  info->num_words = werds.size();
+  if (werds.size() > 0) {
+    WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
+    info->lword_text = lword->best_choice->unichar_string().string();
+    info->rword_text = rword->best_choice->unichar_string().string();
+    info->lword_box = lword->word->bounding_box();
+    info->rword_box = rword->word->bounding_box();
+    LeftWordAttributes(lword->uch_set, lword->best_choice,
+                       info->lword_text,
+                       &info->lword_indicates_list_item,
+                       &info->lword_likely_starts_idea,
+                       &info->lword_likely_ends_idea);
+    RightWordAttributes(rword->uch_set, rword->best_choice,
+                        info->rword_text,
+                        &info->rword_indicates_list_item,
+                        &info->rword_likely_starts_idea,
+                        &info->rword_likely_ends_idea);
+  }
+  info->ltr = ltr >= rtl;
+}
+
+// This is called after rows have been identified and words are recognized.
+// Much of this could be implemented before word recognition, but text helps
+// to identify bulleted lists and gives good signals for sentence boundaries.
+void DetectParagraphs(int debug_level,
+                      const MutableIterator *block_start,
+                      GenericVector<ParagraphModel *> *models) {
+  // Clear out any preconceived notions.
+  if (block_start->Empty(RIL_TEXTLINE)) {
+    return;
+  }
+  BLOCK *block = block_start->PageResIt()->block()->block;
+  block->para_list()->clear();
+  bool is_image_block = block->poly_block() && !block->poly_block()->IsText();
+
+  // Convert the Tesseract structures to RowInfos
+  // for the paragraph detection algorithm.
+  MutableIterator row(*block_start);
+  if (row.Empty(RIL_TEXTLINE))
+    return;  // end of input already.
+
+  GenericVector<RowInfo> row_infos;
+  do {
+    if (!row.PageResIt()->row())
+      continue;  // empty row.
+    row.PageResIt()->row()->row->set_para(NULL);
+    row_infos.push_back(RowInfo());
+    RowInfo &ri = row_infos.back();
+    InitializeRowInfo(row, &ri);
+  } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
+           row.Next(RIL_TEXTLINE));
+
+  // Run the paragraph detection algorithm.
+  GenericVector<PARA *> row_owners;
+  if (!is_image_block) {
+    DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(),
+                     models);
+  } else {
+    row_owners.init_to_size(row_infos.size(), NULL);
+    CanonicalizeDetectionResults(&row_owners, block->para_list());
+  }
+
+  // Now stitch in the row_owners into the rows.
+  row = *block_start;
+  for (int i = 0; i < row_owners.size(); i++) {
+    while (!row.PageResIt()->row())
+      row.Next(RIL_TEXTLINE);
+    row.PageResIt()->row()->row->set_para(row_owners[i]);
+    row.Next(RIL_TEXTLINE);
+  }
+}
+
+}  // namespace
diff --git a/ccmain/paragraphs.h b/ccmain/paragraphs.h
new file mode 100644
index 0000000000..9c11d95c1e
--- /dev/null
+++ b/ccmain/paragraphs.h
@@ -0,0 +1,107 @@
+/**********************************************************************
+ * File: paragraphs.h
+ * Description: Paragraph Detection data structures.
+ * Author: David Eger
+ * Created: 25 February 2011
+ *
+ * (C) Copyright 2011, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_
+#define TESSERACT_CCMAIN_PARAGRAPHS_H_
+
+#include "rect.h"
+#include "ocrpara.h"
+#include "genericvector.h"
+#include "strngs.h"
+
+
+class WERD;
+class UNICHARSET;
+
+namespace tesseract {
+
+class MutableIterator;
+
+// This structure captures all information needed about a text line for the
+// purposes of paragraph detection.  It is meant to be exceedingly light-weight
+// so that we can easily test paragraph detection independent of the rest of
+// Tesseract.
+class RowInfo {
+ public:
+  // Constant data derived from Tesseract output.
+  STRING text;        // the full UTF-8 text of the line.
+  bool ltr;           // whether the majority of the text is left-to-right
+                      // TODO(eger) make this more fine-grained.
+
+  bool has_leaders;   // does the line contain leader dots (.....)?
+  bool has_drop_cap;  // does the line have a drop cap?
+  int pix_ldistance;  // distance to the left pblock boundary in pixels
+  int pix_rdistance;  // distance to the right pblock boundary in pixels
+  float pix_xheight;  // guessed xheight for the line
+  int average_interword_space;  // average space between words in pixels.
+
+  int num_words;
+  TBOX lword_box;     // in normalized (horiz text rows) space
+  TBOX rword_box;     // in normalized (horiz text rows) space
+
+  STRING lword_text;  // the UTF-8 text of the leftmost werd
+  STRING rword_text;  // the UTF-8 text of the rightmost werd
+
+  // The text of a paragraph typically starts with the start of an idea and
+  // ends with the end of an idea.  Here we define paragraph as something that
+  // may have a first line indent and a body indent which may be different.
+  // Typical words that start an idea are:
+  //   1. Words in western scripts that start with
+  //      a capital letter, for example "The"
+  //   2. Bulleted or numbered list items, for
+  //      example "2."
+  // Typical words which end an idea are words ending in punctuation marks. In
+  // this vocabulary, each list item is represented as a paragraph.
+  bool lword_indicates_list_item;
+  bool lword_likely_starts_idea;
+  bool lword_likely_ends_idea;
+
+  bool rword_indicates_list_item;
+  bool rword_likely_starts_idea;
+  bool rword_likely_ends_idea;
+};
+
+// Main entry point for Paragraph Detection Algorithm.
+//
+// Given a set of equally spaced textlines (described by row_infos),
+// split them into paragraphs.  See http://goto/paragraphstalk
+//
+// Output:
+//   row_owners - one pointer for each row, to the paragraph it belongs to.
+//   paragraphs - this is the actual list of PARA objects.
+//   models - the list of paragraph models referenced by the PARA objects.
+//            caller is responsible for deleting the models.
+void DetectParagraphs(int debug_level,
+                      GenericVector<RowInfo> *row_infos,
+                      GenericVector<PARA *> *row_owners,
+                      PARA_LIST *paragraphs,
+                      GenericVector<ParagraphModel *> *models);
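Because RowInfo carries nothing from the recognizer beyond STRING and TBOX, this entry point can be driven from synthetic rows. A hypothetical harness (field values are made up; a real caller must initialize every RowInfo member, including the lword_*/rword_* attributes omitted here for brevity):

// Hypothetical usage sketch, assuming tesseract's headers and namespace.
GenericVector<RowInfo> rows;
for (int i = 0; i < 3; i++) {
  RowInfo ri;
  ri.text = (i == 0) ? "  An indented first line." : "A body line.";
  ri.ltr = true;
  ri.has_leaders = false;
  ri.has_drop_cap = false;
  ri.pix_ldistance = (i == 0) ? 40 : 0;  // first-line indent in pixels
  ri.pix_rdistance = 10;
  ri.pix_xheight = 20.0f;
  ri.average_interword_space = 10;
  ri.num_words = 4;
  rows.push_back(ri);
}
GenericVector<PARA *> row_owners;
PARA_LIST paragraphs;
GenericVector<ParagraphModel *> models;
DetectParagraphs(0 /* debug_level */, &rows, &row_owners, &paragraphs, &models);
// Expectation: all three row_owners entries point at the same PARA.
models.delete_data_pointers();  // caller owns the models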
+
+// Given a MutableIterator to the start of a block, run DetectParagraphs on
+// that block and commit the results to the underlying ROW and BLOCK structs,
+// saving the ParagraphModels in models.  Caller owns the models.
+// We use unicharset during the function to answer questions such as "is the
+// first letter of this word upper case?"
+void DetectParagraphs(int debug_level,
+                      const MutableIterator *block_start,
+                      GenericVector<ParagraphModel *> *models);
+
+}  // namespace
+
+#endif  // TESSERACT_CCMAIN_PARAGRAPHS_H_
diff --git a/ccmain/paragraphs_internal.h b/ccmain/paragraphs_internal.h
new file mode 100644
index 0000000000..c622290f9a
--- /dev/null
+++ b/ccmain/paragraphs_internal.h
@@ -0,0 +1,308 @@
+/**********************************************************************
+ * File: paragraphs_internal.h
+ * Description: Paragraph Detection internal data structures.
+ * Author: David Eger
+ * Created: 11 March 2011
+ *
+ * (C) Copyright 2011, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
+#define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
+
+#include "paragraphs.h"
+#include "strngs.h"
+
+// NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
+// DATA STRUCTURES OR FUNCTIONS IN THIS FILE.
+
+class WERD_CHOICE;
+
+namespace tesseract {
+
+// Return whether the given word is likely to be a list item start word.
+bool AsciiLikelyListItem(const STRING &word);
+
+// Return the first Unicode Codepoint from werd[pos].
+int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
+
+// Set right word attributes given either a unicharset and werd or a utf8
+// string.
+void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
+                         const STRING &utf8,
+                         bool *is_list, bool *starts_idea, bool *ends_idea);
+
+// Set left word attributes given either a unicharset and werd or a utf8
+// string.
+void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
+                        const STRING &utf8,
+                        bool *is_list, bool *starts_idea, bool *ends_idea);
+
+enum LineType {
+  LT_START = 'S',     // First line of a paragraph.
+  LT_BODY = 'C',      // Continuation line of a paragraph.
+  LT_UNKNOWN = 'U',   // No clues.
+  LT_MULTIPLE = 'M',  // Matches for both LT_START and LT_BODY.
+};
+
+// The first paragraph in a page of body text is often un-indented.
+// This is a typographic convention which is common to indicate either that:
+//   (1) The paragraph is the continuation of a previous paragraph, or
+//   (2) The paragraph is the first paragraph in a chapter.
+//
+// I refer to such paragraphs as "crown"s, and the output of the paragraph
+// detection algorithm attempts to give them the same paragraph model as
+// the rest of the body text.
+//
+// Nonetheless, while building hypotheses, it is useful to mark the lines
+// of crown paragraphs temporarily as crowns, either aligned left or right.
+extern const ParagraphModel *kCrownLeft;
+extern const ParagraphModel *kCrownRight;
+
+inline bool StrongModel(const ParagraphModel *model) {
+  return model != NULL && model != kCrownLeft && model != kCrownRight;
+}
+
+struct LineHypothesis {
+  LineHypothesis() : ty(LT_UNKNOWN), model(NULL) {}
+  LineHypothesis(LineType line_type, const ParagraphModel *m)
+      : ty(line_type), model(m) {}
+  LineHypothesis(const LineHypothesis &other)
+      : ty(other.ty), model(other.model) {}
+
+  bool operator==(const LineHypothesis &other) const {
+    return ty == other.ty && model == other.model;
+  }
+
+  LineType ty;
+  const ParagraphModel *model;
+};
+
+class ParagraphTheory;  // Forward Declaration
+
+typedef GenericVectorEqEq<const ParagraphModel *> SetOfModels;
+
+// Row Scratch Registers are data generated by the paragraph detection
+// algorithm based on a RowInfo input.
+class RowScratchRegisters {
+ public:
+  // We presume row will outlive us.
+  void Init(const RowInfo &row);
+
+  LineType GetLineType() const;
+
+  LineType GetLineType(const ParagraphModel *model) const;
+
+  // Mark this as a start line type, sans model.  This is useful for the
+  // initial marking of probable body lines or paragraph start lines.
+  void SetStartLine();
+
+  // Mark this as a body line type, sans model.  This is useful for the
+  // initial marking of probable body lines or paragraph start lines.
+  void SetBodyLine();
+
+  // Record that this row fits as a paragraph start line in the given model.
+  void AddStartLine(const ParagraphModel *model);
+  // Record that this row fits as a paragraph body line in the given model.
+  void AddBodyLine(const ParagraphModel *model);
+
+  // Clear all hypotheses about this line.
+  void SetUnknown() { hypotheses_.truncate(0); }
+
+  // Append all hypotheses of strong models that match this row as a start.
+  void StartHypotheses(SetOfModels *models) const;
+
+  // Append all hypotheses of strong models matching this row.
+  void StrongHypotheses(SetOfModels *models) const;
+
+  // Append all hypotheses for this row.
+  void NonNullHypotheses(SetOfModels *models) const;
+
+  // Discard any hypotheses whose model is not in the given list.
+  void DiscardNonMatchingHypotheses(const SetOfModels &models);
+
+  // If we have only one hypothesis and that is that this line is a paragraph
+  // start line of a certain model, return that model.  Else return NULL.
+  const ParagraphModel *UniqueStartHypothesis() const;
+
+  // If we have only one hypothesis and that is that this line is a paragraph
+  // body line of a certain model, return that model.  Else return NULL.
+  const ParagraphModel *UniqueBodyHypothesis() const;
+
+  // Return the indentation for the side opposite of the aligned side.
+  int OffsideIndent(tesseract::ParagraphJustification just) const {
+    switch (just) {
+      case tesseract::JUSTIFICATION_RIGHT: return lindent_;
+      case tesseract::JUSTIFICATION_LEFT: return rindent_;
+      default: return lindent_ > rindent_ ? lindent_ : rindent_;
+    }
+  }
+
+  // Return the indentation for the side the text is aligned to.
+  int AlignsideIndent(tesseract::ParagraphJustification just) const {
+    switch (just) {
+      case tesseract::JUSTIFICATION_RIGHT: return rindent_;
+      case tesseract::JUSTIFICATION_LEFT: return lindent_;
+      default: return lindent_ > rindent_ ? lindent_ : rindent_;
+    }
+  }
+
+  // Append header fields to a vector of row headings.
+  static void AppendDebugHeaderFields(GenericVector<STRING> *header);
+
+  // Append data for this row to a vector of debug strings.
+  void AppendDebugInfo(const ParagraphTheory &theory,
+                       GenericVector<STRING> *dbg) const;
+
+  const RowInfo *ri_;
+
+  // These four constants form a horizontal box model for the white space
+  // on the edges of each line.  At each point in the algorithm, the following
+  // shall hold:
+  //   ri_->pix_ldistance = lmargin_ + lindent_
+  //   ri_->pix_rdistance = rindent_ + rmargin_
+  // (A sketch of how the split is chosen follows the declarations below.)
+  int lmargin_;
+  int lindent_;
+  int rindent_;
+  int rmargin_;
+
+ private:
+  // Hypotheses of either LT_START or LT_BODY
+  GenericVectorEqEq<LineHypothesis> hypotheses_;
+};
+
+// A collection of convenience functions for wrapping the set of
+// Paragraph Models we believe correctly model the paragraphs in the image.
+class ParagraphTheory {
+ public:
+  // We presume models will outlive us, and that models will take ownership
+  // of any ParagraphModel *'s we add.
+  explicit ParagraphTheory(GenericVector<ParagraphModel *> *models)
+      : models_(models) {}
+  GenericVector<ParagraphModel *> &models() { return *models_; }
+  const GenericVector<ParagraphModel *> &models() const { return *models_; }
+
+  // Return an existing model if one that is Comparable() can be found.
+  // Else, allocate a new copy of model to save and return a pointer to it.
+  const ParagraphModel *AddModel(const ParagraphModel &model);
+
+  // Discard any models we've made that are not in the list of used models.
+  void DiscardUnusedModels(const SetOfModels &used_models);
+
+  // Return the set of all non-centered models.
+  void NonCenteredModels(SetOfModels *models);
+
+  // If any of the non-centered paragraph models we know about fit
+  // rows[start, end), return it.  Else NULL.
+  const ParagraphModel *Fits(const GenericVector<RowScratchRegisters> *rows,
+                             int start, int end) const;
+
+  int IndexOf(const ParagraphModel *model) const;
+
+ private:
+  GenericVector<ParagraphModel *> *models_;
+  GenericVectorEqEq<ParagraphModel *> models_we_added_;
+};
+
+bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,
+                    int row, const ParagraphModel *model);
+bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,
+                   int row, const ParagraphModel *model);
+bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,
+                     int a, int b, const ParagraphModel *model);
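The margin/indent split referenced above deserves a worked example. RecomputeMarginsAndClearHypotheses() takes a low percentile of the raw edge distances over a run of rows as the shared margin, and the per-row remainder becomes the indent; the sketch below ignores the STATS plumbing and outlier handling of the real routine.

// Illustrative only; a simplified version of the left-side split.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Raw left-edge distances (pixels) for a five-row run; row 0 is an
  // indented first line. Using a percentile rather than the minimum keeps
  // one noisy row from dragging the shared margin around.
  int ldist[5] = {40, 2, 0, 1, 3};
  std::vector<int> sorted(ldist, ldist + 5);
  std::sort(sorted.begin(), sorted.end());
  int lmargin = sorted[sorted.size() * 10 / 100];  // 10th percentile -> 0
  for (int i = 0; i < 5; i++) {
    int lindent = ldist[i] - lmargin;  // invariant: ldist = lmargin + lindent
    std::printf("row %d: lmargin=%d lindent=%d\n", i, lmargin, lindent);
  }
  return 0;
}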
+
+// A class for smearing Paragraph Model hypotheses to surrounding rows.
+// The idea here is that StrongEvidenceClassify first marks only exceedingly
+// obvious start and body rows and constructs models of them.  Thereafter,
+// we may have left over unmarked lines (mostly end-of-paragraph lines) which
+// were too short to have much confidence about, but which fit the models we've
+// constructed perfectly and which we ought to mark.  This class is used to
+// "smear" our models over the text.
+class ParagraphModelSmearer {
+ public:
+  ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows,
+                        int row_start, int row_end,
+                        ParagraphTheory *theory);
+
+  // Smear forward paragraph models from existing row markings to subsequent
+  // text lines if they fit, and mark any thereafter still unmodeled rows
+  // with any model in the theory that fits them.
+  void Smear();
+
+ private:
+  // Record in open_models_ for rows [start_row, end_row) the list of models
+  // currently open at each row.
+  // A model is still open in a row if some previous row has said model as a
+  // start hypothesis, and all rows since (including this row) would fit as
+  // either a body or start line in that model.
+  void CalculateOpenModels(int row_start, int row_end);
+
+  SetOfModels &OpenModels(int row) {
+    return open_models_[row - row_start_ + 1];
+  }
+
+  ParagraphTheory *theory_;
+  GenericVector<RowScratchRegisters> *rows_;
+  int row_start_;
+  int row_end_;
+
+  // open_models_ corresponds to rows[start_row_ - 1, end_row_]
+  //
+  // open_models_:  Contains models which there was an active (open) paragraph
+  //                as of the previous line and for which the left and right
+  //                indents admit the possibility that this text line continues
+  //                to fit the same model.
+  // TODO(eger): Think about whether we can get rid of "Open" models and just
+  //             use the current hypotheses on RowScratchRegisters.
+  GenericVector<SetOfModels> open_models_;
+};
+
+// Clear all hypotheses about lines [start, end) and reset the margins to the
+// percentile (0..100) value of the left and right row edges for this run of
+// rows.
+void RecomputeMarginsAndClearHypotheses(
+    GenericVector<RowScratchRegisters> *rows, int start, int end,
+    int percentile);
+
+// Return the minimum inter-word space in rows[row_start, row_end).
+int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
+                   int row_start, int row_end);
+
+// Return whether the first word on the after line can fit in the space at
+// the end of the before line (knowing which way the text is aligned and read).
+bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
+                           const RowScratchRegisters &after,
+                           tesseract::ParagraphJustification justification);
+
+// Return whether the first word on the after line can fit in the space at
+// the end of the before line (not knowing the text alignment).
+bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
+                           const RowScratchRegisters &after);
+
+// Do rows[start, end) form a single instance of the given paragraph model?
+bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,
+                  int start, int end, const ParagraphModel *model);
+
+// Do the text and geometry of two rows support a paragraph break between them?
+bool LikelyParagraphStart(const RowScratchRegisters &before,
+                          const RowScratchRegisters &after,
+                          tesseract::ParagraphJustification j);
+
+// Given a set of row_owners pointing to PARAs or NULL (no paragraph known),
+// normalize each row_owner to point to an actual PARA, and output the
+// paragraphs in order onto paragraphs.
+void CanonicalizeDetectionResults(
+    GenericVector<PARA *> *row_owners,
+    PARA_LIST *paragraphs);
+
+}  // namespace
+#endif  // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
diff --git a/ccmain/pgedit.cpp b/ccmain/pgedit.cpp
index a31c57f7f6..8209366133 100755
--- a/ccmain/pgedit.cpp
+++ b/ccmain/pgedit.cpp
@@ -31,20 +31,17 @@
 #include
 #include
-#include "tordmain.h"
-#include "statistc.h"
-#include "svshowim.h"
-#include "paramsd.h"
-#include "string.h"
-
-#include "scrollview.h"
-#include "svmnode.h"
-
-#include "control.h"
+#include "blread.h"
+#include "control.h"
+#include "svshowim.h"
+#include "paramsd.h"
+#include "pageres.h"
+#include "tordmain.h"
+#include "scrollview.h"
+#include "svmnode.h"
+#include "statistc.h"
 #include "tesseractclass.h"
-#include "blread.h"
-
 #ifndef GRAPHICS_DISABLED
 #define ASC_HEIGHT (2 * kBlnBaselineOffset + kBlnXHeight)
 #define X_HEIGHT (kBlnBaselineOffset + kBlnXHeight)
@@ -62,6 +59,7 @@ enum CMD_EVENTS
   SHOW_POINT_CMD_EVENT,
   SHOW_BLN_WERD_CMD_EVENT,
   DEBUG_WERD_CMD_EVENT,
+  BLAMER_CMD_EVENT,
   BOUNDING_BOX_CMD_EVENT,
   CORRECT_TEXT_CMD_EVENT,
   POLYGONAL_CMD_EVENT,
@@ -116,6 +114,8 @@ ScrollView* bln_word_window = NULL;  // baseline norm words
 CMD_EVENTS mode = CHANGE_DISP_CMD_EVENT;  // selected words op
+bool recog_done = false;  // recog_all_words was called
+
 // These variables should remain global, since they are only used for the
 // debug mode (in which only a single Tesseract thread/instance will exist).
 BITS16 word_display_mode;
@@ -195,8 +195,8 @@ void build_image_window(int width, int height) {
                               editor_image_xpos, editor_image_ypos,
                               width + 1, height + editor_image_menuheight + 1,
-                              width + 1,
-                              height + 1,
+                              width,
+                              height,
                               true);
 }
@@ -269,6 +269,7 @@ SVMenuNode *Tesseract::build_menu_new() {
   parent_menu = root_menu_item->AddChild("DISPLAY");
+  parent_menu->AddChild("Blamer", BLAMER_CMD_EVENT, FALSE);
   parent_menu->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, FALSE);
   parent_menu->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, FALSE);
   parent_menu->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, FALSE);
@@ -333,11 +334,11 @@ void Tesseract::do_re_display(
  */
 void Tesseract::pgeditor_main(int width, int height, PAGE_RES *page_res) {
-
   current_page_res = page_res;
   if (current_page_res->block_res_list.empty())
     return;
+  recog_done = false;
   stillRunning = true;
   build_image_window(width, height);
@@ -400,6 +401,28 @@ BOOL8 Tesseract::process_cmd_win_event(  // UI command semantics
   BOOL8 exit = FALSE;
   color_mode = CM_RAINBOW;
+
+  // Run recognition on the full page if needed.
+ switch (cmd_event) { + case BLAMER_CMD_EVENT: + case SHOW_SUBSCRIPT_CMD_EVENT: + case SHOW_SUPERSCRIPT_CMD_EVENT: + case SHOW_ITALIC_CMD_EVENT: + case SHOW_BOLD_CMD_EVENT: + case SHOW_UNDERLINE_CMD_EVENT: + case SHOW_FIXEDPITCH_CMD_EVENT: + case SHOW_SERIF_CMD_EVENT: + case SHOW_SMALLCAPS_CMD_EVENT: + case SHOW_DROPCAPS_CMD_EVENT: + if (!recog_done) { + recog_all_words(current_page_res, NULL, NULL, NULL, 0); + recog_done = true; + } + break; + default: + break; + } + switch (cmd_event) { case NULL_CMD_EVENT: break; @@ -423,6 +446,14 @@ BOOL8 Tesseract::process_cmd_win_event( // UI command semantics word_display_mode.turn_off_bit(DF_BOX); mode = CHANGE_DISP_CMD_EVENT; break; + case BLAMER_CMD_EVENT: + if (new_value[0] == 'T') + word_display_mode.turn_on_bit(DF_BLAMER); + else + word_display_mode.turn_off_bit(DF_BLAMER); + do_re_display(&tesseract::Tesseract::word_display); + mode = CHANGE_DISP_CMD_EVENT; + break; case CORRECT_TEXT_CMD_EVENT: if (new_value[0] == 'T') word_display_mode.turn_on_bit(DF_TEXT); @@ -691,7 +722,9 @@ BOOL8 Tesseract:: word_blank_and_set_display(BLOCK* block, ROW* row, BOOL8 Tesseract::word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res) { TWERD *bln_word = word_res->chopped_word; if (bln_word == NULL) { - word_res->SetupForRecognition(unicharset, false, row, block); + word_res->SetupForTessRecognition(unicharset, this, BestPix(), false, + this->textord_use_cjk_fp_model, + row, block); bln_word = word_res->chopped_word; } bln_word_window_handle()->Clear(); @@ -720,10 +753,8 @@ BOOL8 Tesseract::word_display(BLOCK* block, ROW* row, WERD_RES* word_res) { if (color_mode != CM_RAINBOW && word_res->box_word != NULL) { BoxWord* box_word = word_res->box_word; int length = box_word->length(); - int font_id = word_res->fontinfo_id; - if (font_id < 0) font_id = 0; - const UnicityTable &font_table = get_fontinfo_table(); - FontInfo font_info = font_table.get(font_id); + if (word_res->fontinfo == NULL) return false; + const FontInfo& font_info = *word_res->fontinfo; for (int i = 0; i < length; ++i) { ScrollView::Color color = ScrollView::GREEN; switch (color_mode) { @@ -806,25 +837,56 @@ BOOL8 Tesseract::word_display(BLOCK* block, ROW* row, WERD_RES* word_res) { displayed_something = TRUE; } - // display correct text + // Display correct text and blamer information. + STRING text; + STRING blame; if (word->display_flag(DF_TEXT) && word->text() != NULL) { + text = word->text(); + } + if (word->display_flag(DF_BLAMER) && + !(word_res->blamer_bundle != NULL && + word_res->blamer_bundle->incorrect_result_reason == IRR_CORRECT)) { + text = ""; + const BlamerBundle *blamer_bundle = word_res->blamer_bundle; + if (blamer_bundle == NULL) { + text += "NULL"; + } else { + for (int i = 0; i < blamer_bundle->truth_text.length(); ++i) { + text += blamer_bundle->truth_text[i]; + } + } + text += " -> "; + STRING best_choice_str; + if (word_res->best_choice == NULL) { + best_choice_str = "NULL"; + } else { + word_res->best_choice->string_and_lengths(&best_choice_str, NULL); + } + text += best_choice_str; + IncorrectResultReason reason = (blamer_bundle == NULL) ? 
+ IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason; + ASSERT_HOST(reason < IRR_NUM_REASONS) + blame += " ["; + blame += BlamerBundle::IncorrectReasonName(reason); + blame += "]"; + } + if (text.length() > 0) { word_bb = word->bounding_box(); - ScrollView::Color c =(ScrollView::Color) - ((inT32) editor_image_blob_bb_color); - image_win->Pen(c); + image_win->Pen(ScrollView::RED); word_height = word_bb.height(); - image_win->TextAttributes("Times", 0.75 * word_height, - false, false, false); - if (word_height < word_bb.width()) - shift = 0.25 * word_height; - else - shift = 0.0f; - + int text_height = 0.50 * word_height; + if (text_height > 20) text_height = 20; + image_win->TextAttributes("Arial", text_height, false, false, false); + shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f; image_win->Text(word_bb.left() + shift, - word_bb.bottom() + 0.25 * word_height, word->text()); + word_bb.bottom() + 0.25 * word_height, text.string()); + if (blame.length() > 0) { + image_win->Text(word_bb.left() + shift, + word_bb.bottom() + 0.25 * word_height - text_height, + blame.string()); + } - if (strlen(word->text()) > 0) - displayed_something = TRUE; + displayed_something = TRUE; } if (!displayed_something) // display BBox anyway @@ -849,6 +911,11 @@ BOOL8 Tesseract::word_dumper(BLOCK* block, ROW* row, WERD_RES* word_res) { row->print(NULL); tprintf("\nWord data...\n"); word_res->word->print(); + if (word_res->blamer_bundle != NULL && wordrec_debug_blamer && + word_res->blamer_bundle->incorrect_result_reason != IRR_CORRECT) { + tprintf("Current blamer debug: %s\n", + word_res->blamer_bundle->debug.string()); + } return TRUE; } @@ -866,6 +933,7 @@ BOOL8 Tesseract::word_set_display(BLOCK* block, ROW* row, WERD_RES* word_res) { word->set_display_flag(DF_EDGE_STEP, word_display_mode.bit(DF_EDGE_STEP)); word->set_display_flag(DF_BN_POLYGONAL, word_display_mode.bit(DF_BN_POLYGONAL)); + word->set_display_flag(DF_BLAMER, word_display_mode.bit(DF_BLAMER)); return word_display(block, row, word_res); } } // namespace tesseract diff --git a/ccmain/recogtraining.cpp b/ccmain/recogtraining.cpp index 8c4b7e1ce3..5047ff6629 100644 --- a/ccmain/recogtraining.cpp +++ b/ccmain/recogtraining.cpp @@ -38,7 +38,7 @@ FILE *Tesseract::init_recog_training(const STRING &fname) { if (tessedit_ambigs_training) { tessedit_tess_adaption_mode.set_value(0); // turn off adaption tessedit_enable_doc_dict.set_value(0); // turn off document dictionary - save_best_choices.set_value(1); // save individual char choices + save_blob_choices.set_value(1); // save individual char choices getDict().save_raw_choices.set_value(1); // save raw choices getDict().permute_only_top.set_value(true); // use only top choice permuter tessedit_ok_mode.set_value(0); // turn off context checking @@ -56,22 +56,24 @@ FILE *Tesseract::init_recog_training(const STRING &fname) { // Copies the bounding box from page_res_it->word() to the given TBOX. bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) { + while (page_res_it->block() != NULL) { + if (page_res_it->word() != NULL) + break; + page_res_it->forward(); + } + if (page_res_it->word() != NULL) { *tbox = page_res_it->word()->word->bounding_box(); page_res_it->forward(); - return true; - } else { - return false; - } -} -// Reads the next box from the given box file into TBOX. 
-bool read_b(int applybox_page, int *line_number, FILE *box_file, - char *label, TBOX *bbox) { - int x_min, y_min, x_max, y_max; - if (read_next_box(applybox_page, line_number, box_file, label, - &x_min, &y_min, &x_max, &y_max)) { - bbox->set_to_given_coords(x_min, y_min, x_max, y_max); + // If tbox->left() is negative, the training image has vertical text and + // all the coordinates of bounding boxes of page_res are rotated by 90 + // degrees in a counterclockwise direction. We need to rotate the TBOX back + // in order to compare with the TBOXes of box files. + if (tbox->left() < 0) { + tbox->rotate(FCOORD(0.0, -1.0)); + } + return true; } else { return false; @@ -97,27 +99,29 @@ void Tesseract::recog_training_segmented(const STRING &fname, PAGE_RES_IT page_res_it; page_res_it.page_res = page_res; page_res_it.restart_page(); - char label[kBoxReadBufSize]; + STRING label; // Process all the words on this page. TBOX tbox; // tesseract-identified box TBOX bbox; // box from the box file bool keep_going; int line_number = 0; + int examined_words = 0; do { keep_going = read_t(&page_res_it, &tbox); - keep_going &= read_b(applybox_page, &line_number, box_file, label, &bbox); + keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label, + &bbox); // Align bottom left points of the TBOXes. while (keep_going && !NearlyEqual(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) { keep_going = (bbox.bottom() < tbox.bottom()) ? read_t(&page_res_it, &tbox) : - read_b(applybox_page, &line_number, box_file, label, &bbox); + ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox); } while (keep_going && !NearlyEqual(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) { keep_going = (bbox.left() > tbox.left()) ? read_t(&page_res_it, &tbox) : - read_b(applybox_page, &line_number, box_file, label, &bbox); + ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox); } // OCR the word if top right points of the TBOXes are similar. if (keep_going && @@ -126,9 +130,30 @@ void Tesseract::recog_training_segmented(const STRING &fname, ambigs_classify_and_output(page_res_it.prev_word(), page_res_it.prev_row(), page_res_it.prev_block(), - label, output_file); + label.string(), output_file); + examined_words++; } } while (keep_going); + + // Set up scripts on all of the words that did not get sent to + // ambigs_classify_and_output. They all should have, but if all the + // werd_res's don't get uch_sets, tesseract will crash when you try + // to iterate over them. :-( + int total_words = 0; + for (page_res_it.restart_page(); page_res_it.block() != NULL; + page_res_it.forward()) { + if (page_res_it.word()) { + if (page_res_it.word()->uch_set == NULL) + page_res_it.word()->SetupFake(unicharset); + total_words++; + } + } + if (examined_words < 0.85 * total_words) { + tprintf("TODO(antonova): clean up recog_training_segmented; " + " It examined only a small fraction of the ambigs image.\n"); + } + tprintf("recog_training_segmented: examined %d / %d words.\n", + examined_words, total_words); } // Runs classify_word_pass1() on the current word. Outputs Tesseract's @@ -142,7 +167,8 @@ void Tesseract::ambigs_classify_and_output(WERD_RES *werd_res, FILE *output_file) { int offset; // Classify word. 
- classify_word_pass1(werd_res, row_res->row, block_res->block); + fflush(stdout); + classify_word_pass1(block_res->block, row_res->row, werd_res); WERD_CHOICE *best_choice = werd_res->best_choice; ASSERT_HOST(best_choice != NULL); ASSERT_HOST(best_choice->blob_choices() != NULL); @@ -151,7 +177,7 @@ void Tesseract::ambigs_classify_and_output(WERD_RES *werd_res, int label_num_unichars = 0; int step = 1; // should be non-zero on the first iteration for (offset = 0; label[offset] != '\0' && step > 0; - step = getDict().getUnicharset().step(label + offset), + step = werd_res->uch_set->step(label + offset), offset += step, ++label_num_unichars); if (step == 0) { tprintf("Not outputting illegal unichar %s\n", label); diff --git a/ccmain/reject.cpp b/ccmain/reject.cpp index f3052008f7..94497bfa8a 100644 --- a/ccmain/reject.cpp +++ b/ccmain/reject.cpp @@ -202,9 +202,9 @@ void Tesseract::make_reject_map( //make rej map for wd //detailed results int offset; flip_0O(word); - check_debug_pt (word, -1); //For trap only - set_done(word, pass); //Set acceptance - word->reject_map.initialise (word->best_choice->unichar_lengths().length ()); + check_debug_pt(word, -1); // For trap only + set_done(word, pass); // Set acceptance + word->reject_map.initialise(word->best_choice->unichar_lengths().length()); reject_blanks(word); /* 0: Rays original heuristic - the baseline @@ -212,16 +212,15 @@ void Tesseract::make_reject_map( //make rej map for wd //detailed results if (tessedit_reject_mode == 0) { if (!word->done) reject_poor_matches(word, blob_choices); - } - /* - 5: Reject I/1/l from words where there is no strong contextual confirmation; - the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls); - and the whole of any words which are very small - */ - else if (tessedit_reject_mode == 5) { - if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) - word->reject_map.rej_word_small_xht (); - else { + } else if (tessedit_reject_mode == 5) { + /* + 5: Reject I/1/l from words where there is no strong contextual confirmation; + the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls); + and the whole of any words which are very small + */ + if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) { + word->reject_map.rej_word_small_xht(); + } else { one_ell_conflict(word, TRUE); /* Originally the code here just used the done flag. 
Now I have duplicated @@ -236,42 +235,38 @@ void Tesseract::make_reject_map( //make rej map for wd //detailed results (strchr (word->best_choice->unichar_string().string (), ' ') != NULL)) word->reject_map.rej_word_contains_blanks (); + WERD_CHOICE* best_choice = word->best_choice; if (rej_use_good_perm) { - if (((word->best_choice->permuter () == SYSTEM_DAWG_PERM) || - (word->best_choice->permuter () == FREQ_DAWG_PERM) || - (word->best_choice->permuter () == USER_DAWG_PERM)) && - (!rej_use_sensible_wd || - (acceptable_word_string - (word->best_choice->unichar_string().string (), - word->best_choice->unichar_lengths().string ()) != - AC_UNACCEPTABLE))) { - //PASSED TEST - } - else if (word->best_choice->permuter () == NUMBER_PERM) { + if ((best_choice->permuter() == SYSTEM_DAWG_PERM || + best_choice->permuter() == FREQ_DAWG_PERM || + best_choice->permuter() == USER_DAWG_PERM) && + (!rej_use_sensible_wd || + acceptable_word_string(*word->uch_set, + best_choice->unichar_string().string(), + best_choice->unichar_lengths().string()) != + AC_UNACCEPTABLE)) { + // PASSED TEST + } else if (best_choice->permuter() == NUMBER_PERM) { if (rej_alphas_in_number_perm) { for (i = 0, offset = 0; - word->best_choice->unichar_string()[offset] != '\0'; - offset += word->best_choice->unichar_lengths()[i++]) { - if (word->reject_map[i].accepted () && - unicharset.get_isalpha( - word->best_choice->unichar_string().string() + offset, - word->best_choice->unichar_lengths()[i])) - word->reject_map[i].setrej_bad_permuter (); - //rej alpha + best_choice->unichar_string()[offset] != '\0'; + offset += best_choice->unichar_lengths()[i++]) { + if (word->reject_map[i].accepted() && + word->uch_set->get_isalpha( + best_choice->unichar_string().string() + offset, + best_choice->unichar_lengths()[i])) + word->reject_map[i].setrej_bad_permuter(); + // rej alpha } } - } - else { - word->reject_map.rej_word_bad_permuter (); + } else { + word->reject_map.rej_word_bad_permuter(); } } - /* Ambig word rejection was here once !!*/ - } - } - else { - tprintf ("BAD tessedit_reject_mode\n"); + } else { + tprintf("BAD tessedit_reject_mode\n"); err_exit(); } @@ -280,14 +275,14 @@ void Tesseract::make_reject_map( //make rej map for wd //detailed results check_debug_pt (word, 10); if (tessedit_rejection_debug) { - tprintf ("Permuter Type = %d\n", word->best_choice->permuter ()); - tprintf ("Certainty: %f Rating: %f\n", + tprintf("Permuter Type = %d\n", word->best_choice->permuter ()); + tprintf("Certainty: %f Rating: %f\n", word->best_choice->certainty (), word->best_choice->rating ()); tprintf("Dict word: %d\n", dict_word(*(word->best_choice))); } flip_hyphens(word); - check_debug_pt (word, 20); + check_debug_pt(word, 20); } } // namespace tesseract @@ -492,8 +487,8 @@ BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) { for (i = 0, offset = 0, non_conflict_set_char = FALSE; (i < word_len) && !non_conflict_set_char; offset += lengths[i++]) non_conflict_set_char = - (unicharset.get_isalpha(word + offset, lengths[i]) || - unicharset.get_isdigit(word + offset, lengths[i])) && + (word_res->uch_set->get_isalpha(word + offset, lengths[i]) || + word_res->uch_set->get_isdigit(word + offset, lengths[i])) && !STRING (conflict_set_I_l_1).contains (word[offset]); if (!non_conflict_set_char) { if (update_map) @@ -524,7 +519,7 @@ BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) { if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') { 
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; - if (safe_dict_word(*(word_res->best_choice)) > 0) { + if (safe_dict_word(word_res) > 0) { word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; if (update_map) word_res->reject_map[first_alphanum_index_]. @@ -540,7 +535,7 @@ BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) { if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') { word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; - if (safe_dict_word(*(word_res->best_choice)) > 0) { + if (safe_dict_word(word_res) > 0) { word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; if (update_map) word_res->reject_map[first_alphanum_index_]. @@ -571,7 +566,7 @@ BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) { if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') { word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; - if (safe_dict_word(*(word_res->best_choice)) > 0) + if (safe_dict_word(word_res) > 0) return FALSE; else word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; @@ -579,7 +574,7 @@ BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) { else if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') { word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; - if (safe_dict_word(*(word_res->best_choice)) > 0) + if (safe_dict_word(word_res) > 0) return FALSE; else word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; @@ -611,7 +606,7 @@ BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) { For anything else. See if it conforms to an acceptable word type. If so, treat accordingly. */ - word_type = acceptable_word_string (word, lengths); + word_type = acceptable_word_string(*word_res->uch_set, word, lengths); if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) { first_alphanum_index_ = first_alphanum_index (word, lengths); first_alphanum_offset_ = first_alphanum_offset (word, lengths); @@ -721,8 +716,8 @@ void Tesseract::dont_allow_1Il(WERD_RES *word) { if (STRING(conflict_set_I_l_1).contains(s[offset])) { accepted_1Il = TRUE; } else { - if (unicharset.get_isalpha(s + offset, lengths[i]) || - unicharset.get_isdigit(s + offset, lengths[i])) + if (word->uch_set->get_isalpha(s + offset, lengths[i]) || + word->uch_set->get_isdigit(s + offset, lengths[i])) return; // >=1 non 1Il ch accepted } } @@ -744,8 +739,8 @@ inT16 Tesseract::count_alphanums(WERD_RES *word_res) { const WERD_CHOICE *best_choice = word_res->best_choice; for (int i = 0; i < word_res->reject_map.length(); ++i) { if ((word_res->reject_map[i].accepted()) && - (unicharset.get_isalpha(best_choice->unichar_id(i)) || - unicharset.get_isdigit(best_choice->unichar_id(i)))) { + (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) || + word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) { count++; } } @@ -788,8 +783,9 @@ BOOL8 Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) { return FALSE; } -inT16 Tesseract::safe_dict_word(const WERD_CHOICE &word) { - int dict_word_type = dict_word(word); +inT16 Tesseract::safe_dict_word(const WERD_RES *werd_res) { + const WERD_CHOICE &word = *werd_res->best_choice; + int dict_word_type = werd_res->tesseract->dict_word(word); return dict_word_type == DOC_DAWG_PERM ? 
0 : dict_word_type; } @@ -809,7 +805,7 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) { return; TBLOB* blob = word_res->rebuild_word->blobs; - UNICHAR_ID unichar_dash = unicharset.unichar_to_id("-"); + UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-"); bool modified = false; for (i = 0; i < best_choice->length() && blob != NULL; ++i, blob = blob->next) { @@ -822,10 +818,10 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) { if ((out_box.width() > 8 * word_res->denorm.x_scale()) && (out_box.left() > prev_right) && (out_box.right() < next_left)) { aspect_ratio = out_box.width() / (float) out_box.height(); - if (unicharset.eq(best_choice->unichar_id(i), ".")) { + if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) { if (aspect_ratio >= tessedit_upper_flip_hyphen && - unicharset.contains_unichar_id(unichar_dash) && - unicharset.get_enabled(unichar_dash)) { + word_res->uch_set->contains_unichar_id(unichar_dash) && + word_res->uch_set->get_enabled(unichar_dash)) { /* Certain HYPHEN */ best_choice->set_unichar_id(unichar_dash, i); modified = true; @@ -852,7 +848,7 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) { prev_right = out_box.right(); } if (modified) { - best_choice->populate_unichars(unicharset); + best_choice->populate_unichars(); } } @@ -871,18 +867,20 @@ void Tesseract::flip_0O(WERD_RES *word_res) { TBLOB* blob = word_res->rebuild_word->blobs; for (i = 0; i < best_choice->length() && blob != NULL; ++i, blob = blob->next) { - if (unicharset.get_isupper(best_choice->unichar_id(i)) || - unicharset.get_isdigit(best_choice->unichar_id(i))) { + if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) || + word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) { out_box = blob->bounding_box(); if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) || (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4)) return; //Beware words with sub/superscripts } } - UNICHAR_ID unichar_0 = unicharset.unichar_to_id("0"); - UNICHAR_ID unichar_O = unicharset.unichar_to_id("O"); - if (unichar_0 == INVALID_UNICHAR_ID || !unicharset.get_enabled(unichar_0) || - unichar_O == INVALID_UNICHAR_ID || !unicharset.get_enabled(unichar_O)) { + UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0"); + UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O"); + if (unichar_0 == INVALID_UNICHAR_ID || + !word_res->uch_set->get_enabled(unichar_0) || + unichar_O == INVALID_UNICHAR_ID || + !word_res->uch_set->get_enabled(unichar_O)) { return; // 0 or O are not present/enabled in unicharset } bool modified = false; @@ -891,43 +889,43 @@ void Tesseract::flip_0O(WERD_RES *word_res) { best_choice->unichar_id(i) == unichar_O) { /* A0A */ if ((i+1) < best_choice->length() && - non_O_upper(best_choice->unichar_id(i-1)) && - non_O_upper(best_choice->unichar_id(i+1))) { + non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) && + non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) { best_choice->set_unichar_id(unichar_O, i); modified = true; } /* A00A */ - if (non_O_upper(best_choice->unichar_id(i-1)) && + if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) && (i+1) < best_choice->length() && (best_choice->unichar_id(i+1) == unichar_0 || best_choice->unichar_id(i+1) == unichar_O) && (i+2) < best_choice->length() && - non_O_upper(best_choice->unichar_id(i+2))) { + non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) { best_choice->set_unichar_id(unichar_O, i); modified = true; i++; } /* AA0 */ if ((i > 1) && - 
non_O_upper(best_choice->unichar_id(i-2)) && - non_O_upper(best_choice->unichar_id(i-1)) && + non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) && + non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) && (((i+1) < best_choice->length() && - !unicharset.get_isdigit(best_choice->unichar_id(i+1)) && - !unicharset.eq(best_choice->unichar_id(i+1), "l") && - !unicharset.eq(best_choice->unichar_id(i+1), "I")) || + !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) && + !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") && + !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) || (i == best_choice->length() - 1))) { best_choice->set_unichar_id(unichar_O, i); modified = true; } /* 9O9 */ - if (non_0_digit(best_choice->unichar_id(i-1)) && + if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) && (i+1) < best_choice->length() && - non_0_digit(best_choice->unichar_id(i+1))) { + non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) { best_choice->set_unichar_id(unichar_0, i); modified = true; } /* 9OOO */ - if (non_0_digit(best_choice->unichar_id(i-1)) && + if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) && (i+2) < best_choice->length() && (best_choice->unichar_id(i+1) == unichar_0 || best_choice->unichar_id(i+1) == unichar_O) && @@ -940,27 +938,27 @@ void Tesseract::flip_0O(WERD_RES *word_res) { i += 2; } /* 9OO */ - if (non_0_digit(best_choice->unichar_id(i-1)) && + if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) && (i+2) < best_choice->length() && (best_choice->unichar_id(i+1) == unichar_0 || best_choice->unichar_id(i+1) == unichar_O) && - !unicharset.get_isupper(best_choice->unichar_id(i+2))) { + !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) { best_choice->set_unichar_id(unichar_0, i); best_choice->set_unichar_id(unichar_0, i+1); modified = true; i++; } /* 9O */ - if (non_0_digit(best_choice->unichar_id(i-1)) && + if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) && (i+1) < best_choice->length() && - !unicharset.get_isupper(best_choice->unichar_id(i+1))) { + !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) { best_choice->set_unichar_id(unichar_0, i); } /* 9[.,]OOO.. 
 */
       if ((i > 1) &&
-          (unicharset.eq(best_choice->unichar_id(i-1), ".") ||
-           unicharset.eq(best_choice->unichar_id(i-1), ",")) &&
-          (unicharset.get_isdigit(best_choice->unichar_id(i-2)) ||
+          (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
+           word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
+          (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
            best_choice->unichar_id(i-2) == unichar_O)) {
         if (best_choice->unichar_id(i-2) == unichar_O) {
           best_choice->set_unichar_id(unichar_0, i-2);
@@ -978,17 +976,15 @@ void Tesseract::flip_0O(WERD_RES *word_res) {
     }
   }
   if (modified) {
-    best_choice->populate_unichars(unicharset);
+    best_choice->populate_unichars();
   }
 }
 
-BOOL8 Tesseract::non_O_upper(UNICHAR_ID unichar_id) {
-  return (unicharset.get_isupper(unichar_id) &&
-          (!unicharset.eq(unichar_id, "O")));
+BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
+  return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
 }
 
-BOOL8 Tesseract::non_0_digit(UNICHAR_ID unichar_id) {
-  return (unicharset.get_isdigit(unichar_id) &&
-          (!unicharset.eq(unichar_id, "0")));
+BOOL8 Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
+  return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
 }
 
 }  // namespace tesseract
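The 0/O flips above are purely local context rules driven by the word's own unicharset. A distilled, ASCII-only version of two of them (0 between non-O uppercase letters becomes O; O between nonzero digits becomes 0) shows the shape of the logic:

// Illustrative only; the real code works on UNICHAR_IDs, not chars.
#include <cctype>
#include <cstdio>
#include <cstring>

// Simplified analogues of non_O_upper() / non_0_digit().
static bool NonOUpper(char c) {
  return std::isupper(static_cast<unsigned char>(c)) && c != 'O';
}
static bool Non0Digit(char c) {
  return std::isdigit(static_cast<unsigned char>(c)) && c != '0';
}

static void FlipZeroO(char *s) {
  for (size_t i = 1; i + 1 < std::strlen(s); i++) {
    if (s[i] == '0' || s[i] == 'O') {
      if (NonOUpper(s[i - 1]) && NonOUpper(s[i + 1])) s[i] = 'O';  // A0A
      if (Non0Digit(s[i - 1]) && Non0Digit(s[i + 1])) s[i] = '0';  // 9O9
    }
  }
}

int main() {
  char a[] = "R0AD";  // 0 between uppercase letters -> ROAD
  char b[] = "1O24";  // O between digits -> 1024
  FlipZeroO(a);
  FlipZeroO(b);
  std::printf("%s %s\n", a, b);  // prints "ROAD 1024"
  return 0;
}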
for (int i = 0; i < configs_size; ++i) { - read_config_file(configs[i], set_only_init_params); + read_config_file(configs[i], set_params_constraint); } // Set params specified in vars_vec (done after setting params from config @@ -140,7 +143,7 @@ bool Tesseract::init_tesseract_lang_data( for (int i = 0; i < vars_vec->size(); ++i) { if (!ParamUtils::SetParam((*vars_vec)[i].string(), (*vars_values)[i].string(), - set_only_init_params, this->params())) { + set_params_constraint, this->params())) { tprintf("Error setting param %s\n", (*vars_vec)[i].string()); exit(1); } @@ -169,6 +172,15 @@ bool Tesseract::init_tesseract_lang_data( static_cast<int>(tessedit_ocr_engine_mode)); } + // If we are only loading the config file (and so not planning on doing any + // recognition) then there's nothing else to do here. + if (tessedit_init_config_only) { + if (tessdata_manager_debug_level) { + tprintf("Returning after loading config file\n"); + } + return true; + } + // Load the unicharset if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) || !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) { @@ -178,8 +190,8 @@ bool Tesseract::init_tesseract_lang_data( tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n"); return false; } - right_to_left_ = unicharset.any_right_to_left(); if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n"); + right_to_left_ = unicharset.major_right_to_left(); if (!tessedit_ambigs_training && tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) { @@ -204,17 +216,151 @@ bool Tesseract::init_tesseract_lang_data( return true; } +// Helper returns true if the given string is in the vector of strings. +static bool IsStrInList(const STRING& str, + const GenericVector<STRING>& str_list) { + for (int i = 0; i < str_list.size(); ++i) { + if (str_list[i] == str) + return true; + } + return false; +} + +// Parse a string of the form [~]<lang>[+[~]<lang>]*. +// Langs with no prefix get appended to to_load, provided they +// are not in there already. +// Langs with ~ prefix get appended to not_to_load, provided they are not in +// there already. +void Tesseract::ParseLanguageString(const char* lang_str, + GenericVector<STRING>* to_load, + GenericVector<STRING>* not_to_load) { + STRING remains(lang_str); + while (remains.length() > 0) { + // Find the start of the lang code and which vector to add to. + const char* start = remains.string(); + while (*start == '+') + ++start; + GenericVector<STRING>* target = to_load; + if (*start == '~') { + target = not_to_load; + ++start; + } + // Find the index of the end of the lang code in string start. + int end = strlen(start); + const char* plus = strchr(start, '+'); + if (plus != NULL && plus - start < end) + end = plus - start; + STRING lang_code(start); + lang_code.truncate_at(end); + STRING next(start + end); + remains = next; + // Check whether lang_code is already in the target vector and add. + if (!IsStrInList(lang_code, *target)) { + if (tessdata_manager_debug_level) + tprintf("Adding language '%s' to list\n", lang_code.string()); + target->push_back(lang_code); + } + } +} + +// Initialize for potentially a set of languages defined by the language +// string and recursively any additional languages required by any language +// traineddata file (via tessedit_load_sublangs in its config) that is loaded. +// See init_tesseract_internal for args.
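// Illustrative sketch (not part of the patch) of the language-string
// semantics handled below, using the parser above:
//   GenericVector<STRING> to_load, not_to_load;
//   ParseLanguageString("eng+~fra+deu", &to_load, &not_to_load);
//   // to_load == {"eng", "deu"}, not_to_load == {"fra"}: "eng" becomes the
//   // primary language, "deu" a sub-language, and "fra" is never loaded,
//   // even if a traineddata config requests it via tessedit_load_sublangs.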
int Tesseract::init_tesseract( const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector<STRING> *vars_vec, const GenericVector<STRING> *vars_values, - bool set_only_init_params) { + bool set_only_non_debug_params) { + GenericVector<STRING> langs_to_load; + GenericVector<STRING> langs_not_to_load; + ParseLanguageString(language, &langs_to_load, &langs_not_to_load); + + sub_langs_.delete_data_pointers(); + sub_langs_.clear(); + // Find the first loadable lang and load into this. + // Add any languages that this language requires. + bool loaded_primary = false; + // Load the rest into sub_langs_. + for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) { + if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) { + const char *lang_str = langs_to_load[lang_index].string(); + Tesseract *tess_to_init; + if (!loaded_primary) { + tess_to_init = this; + } else { + tess_to_init = new Tesseract; + } + + int result = tess_to_init->init_tesseract_internal( + arg0, textbase, lang_str, oem, configs, configs_size, + vars_vec, vars_values, set_only_non_debug_params); + + if (!loaded_primary) { + if (result < 0) { + tprintf("Failed loading language '%s'\n", lang_str); + } else { + if (tessdata_manager_debug_level) + tprintf("Loaded language '%s' as main language\n", lang_str); + ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(), + &langs_to_load, &langs_not_to_load); + loaded_primary = true; + } + } else { + if (result < 0) { + tprintf("Failed loading language '%s'\n", lang_str); + delete tess_to_init; + } else { + if (tessdata_manager_debug_level) + tprintf("Loaded language '%s' as secondary language\n", lang_str); + sub_langs_.push_back(tess_to_init); + // Add any languages that this language requires. + ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(), + &langs_to_load, &langs_not_to_load); + } + } + } + } + if (!loaded_primary) { + tprintf("Tesseract couldn't load any languages!\n"); + return -1; // Couldn't load any language! + } + SetupUniversalFontIds(); + return 0; +} + +// Common initialization for a single language. +// arg0 is the datapath for the tessdata directory, which could be the +// path of the tessdata directory with no trailing /, or (if tessdata +// lives in the same directory as the executable) the path of the executable, +// hence the name arg0. +// textbase is an optional output file basename (used only for training) +// language is the language code to load. +// oem controls which engine(s) will operate on the image +// configs (argv) is an array of config filenames to load variables from. +// May be NULL. +// configs_size (argc) is the number of elements in configs. +// vars_vec is an optional vector of variables to set. +// vars_values is an optional corresponding vector of values for the variables +// in vars_vec. +// If set_only_non_debug_params is true, only non-debug parameters will be +// set.
+int Tesseract::init_tesseract_internal( + const char *arg0, const char *textbase, const char *language, + OcrEngineMode oem, char **configs, int configs_size, + const GenericVector<STRING> *vars_vec, + const GenericVector<STRING> *vars_values, + bool set_only_non_debug_params) { if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs, configs_size, vars_vec, vars_values, - set_only_init_params)) { + set_only_non_debug_params)) { return -1; } + if (tessedit_init_config_only) { + tessdata_manager.End(); + return 0; + } // If only Cube will be used, skip loading Tesseract classifier's // pre-trained templates. bool init_tesseract_classifier = @@ -230,6 +376,46 @@ int Tesseract::init_tesseract( return 0; //Normal exit } +// Helper builds the all_fonts table by adding new fonts from new_fonts. +static void CollectFonts(const UnicityTable<FontInfo>& new_fonts, + UnicityTable<FontInfo>* all_fonts) { + for (int i = 0; i < new_fonts.size(); ++i) { + // UnicityTable uniques as we go. + all_fonts->push_back(new_fonts.get(i)); + } +} + +// Helper assigns an id to lang_fonts using the index in all_fonts table. +static void AssignIds(const UnicityTable<FontInfo>& all_fonts, + UnicityTable<FontInfo>* lang_fonts) { + for (int i = 0; i < lang_fonts->size(); ++i) { + int index = all_fonts.get_id(lang_fonts->get(i)); + lang_fonts->get_mutable(i)->universal_id = index; + } +} + +// Set the universal_id member of each font to be unique among all +// instances of the same font loaded. +void Tesseract::SetupUniversalFontIds() { + // Note that we can get away with bitwise copying FontInfo in + // all_fonts, as it is a temporary structure and we avoid setting the + // delete callback. + UnicityTable<FontInfo> all_fonts; + all_fonts.set_compare_callback(NewPermanentTessCallback(CompareFontInfo)); + + // Create the universal ID table. + CollectFonts(get_fontinfo_table(), &all_fonts); + for (int i = 0; i < sub_langs_.size(); ++i) { + CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts); + } + // Assign ids from the table to each font table.
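// (Worked example, illustrative only: after CollectFonts, all_fonts holds
// each distinct FontInfo exactly once, in first-seen order, because
// UnicityTable uniques on insert, and get_id() then recovers that index.
// E.g. if the main language contributed {Arial, Times} and a sub-language
// {Times, Courier}, the combined table is {Arial, Times, Courier}, and
// Times receives universal_id 1 in both per-language font tables.)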
+ AssignIds(all_fonts, &get_fontinfo_table()); + for (int i = 0; i < sub_langs_.size(); ++i) { + AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table()); + } + font_table_size_ = all_fonts.size(); +} + // init the LM component int Tesseract::init_tesseract_lm(const char *arg0, const char *textbase, diff --git a/ccmain/tesseract_cube_combiner.cpp b/ccmain/tesseract_cube_combiner.cpp index d0524b6d61..7fd7c6b198 100644 --- a/ccmain/tesseract_cube_combiner.cpp +++ b/ccmain/tesseract_cube_combiner.cpp @@ -148,8 +148,7 @@ bool TesseractCubeCombiner::ComputeCombinerFeatures(const string &tess_str, bool cube_best_bigram_cost_valid = true; if (cube_cntxt_->Bigrams()) cube_best_bigram_cost = cube_cntxt_->Bigrams()-> - Cost(cube_best_str32, cube_cntxt_->CharacterSet(), - &cube_cntxt_->TesseractObject()->unicharset); + Cost(cube_best_str32, cube_cntxt_->CharacterSet()); else cube_best_bigram_cost_valid = false; CubeUtils::UTF32ToUTF8(cube_best_str32, &cube_best_str); @@ -191,8 +190,7 @@ bool TesseractCubeCombiner::ComputeCombinerFeatures(const string &tess_str, int tess_bigram_cost_valid = true; if (cube_cntxt_->Bigrams()) tess_bigram_cost = cube_cntxt_->Bigrams()-> - Cost(tess_str32.c_str(), cube_cntxt_->CharacterSet(), - &cube_cntxt_->TesseractObject()->unicharset); + Cost(tess_str32.c_str(), cube_cntxt_->CharacterSet()); else tess_bigram_cost_valid = false; diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp index 62e0169e08..8d1a7f0f4b 100644 --- a/ccmain/tesseractclass.cpp +++ b/ccmain/tesseractclass.cpp @@ -19,17 +19,19 @@ /////////////////////////////////////////////////////////////////////// #include "tesseractclass.h" + +#include "allheaders.h" #include "cube_reco_context.h" -#include "tesseract_cube_combiner.h" +#include "edgblob.h" +#include "equationdetect.h" #include "globals.h" +#include "tesseract_cube_combiner.h" // Include automatically generated configuration file if running autoconf. #ifdef HAVE_CONFIG_H #include "config_auto.h" #endif -#include "allheaders.h" - namespace tesseract { Tesseract::Tesseract() @@ -63,7 +65,7 @@ Tesseract::Tesseract() "Blacklist of chars not to recognize", this->params()), STRING_MEMBER(tessedit_char_whitelist, "", "Whitelist of chars to recognize", this->params()), - BOOL_INIT_MEMBER(tessedit_ambigs_training, false, + BOOL_MEMBER(tessedit_ambigs_training, false, "Perform training for ambiguities", this->params()), INT_MEMBER(pageseg_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT, @@ -80,6 +82,7 @@ Tesseract::Tesseract() " a character composed form fragments", this->params()), BOOL_MEMBER(tessedit_adaption_debug, false, "Generate and print debug" " information for adaption", this->params()), + INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()), INT_MEMBER(applybox_debug, 1, "Debug level", this->params()), INT_MEMBER(applybox_page, 0, "Page number to apply boxes from", this->params()), @@ -94,7 +97,7 @@ Tesseract::Tesseract() BOOL_MEMBER(applybox_learn_ngrams_mode, false, "Each bounding box" " is assumed to contain ngrams. 
Only learn the ngrams" " whose outlines overlap horizontally.", this->params()), - BOOL_MEMBER(tessedit_draw_outwords, false, + BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words", this->params()), BOOL_MEMBER(tessedit_training_tess, false, "Call Tess to learn blobs", this->params()), @@ -114,6 +117,12 @@ Tesseract::Tesseract() "Output font info per char", this->params()), BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats", this->params()), + BOOL_MEMBER(tessedit_enable_bigram_correction, false, + "Enable correction based on the word bigram dictionary.", + this->params()), + INT_MEMBER(tessedit_bigram_debug, 0, + "Amount of debug output for bigram correction.", + this->params()), INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()), BOOL_MEMBER(debug_acceptable_wds, false, "Dump word pass/fail chk", this->params()), @@ -145,13 +154,15 @@ Tesseract::Tesseract() "Log matcher activity", this->params()), INT_MEMBER(tessedit_test_adaption_mode, 3, "Adaptation decision algorithm for tess", this->params()), - BOOL_MEMBER(save_best_choices, false, + BOOL_MEMBER(save_blob_choices, false, "Save the results of the recognition step (blob_choices)" " within the corresponding WERD_CHOICE", this->params()), BOOL_MEMBER(test_pt, false, "Test for point", this->params()), double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()), double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()), - INT_MEMBER(cube_debug_level, 1, "Print cube debug info.", this->params()), + INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.", + this->params()), + INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()), STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines", this->params()), STRING_MEMBER(outlines_2, "ij!?%\":;", @@ -345,28 +356,49 @@ Tesseract::Tesseract() " , else specifc page to process", this->params()), BOOL_MEMBER(tessedit_write_images, false, "Capture the image from the IPE", this->params()), - BOOL_MEMBER(interactive_mode, false, "Run interactively?", this->params()), + BOOL_MEMBER(interactive_display_mode, false, "Run interactively?", + this->params()), STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()), BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word", this->params()), - INT_INIT_MEMBER(tessdata_manager_debug_level, 0, "Debug level for" - " TessdataManager functions.", this->params()), - double_MEMBER(min_orientation_margin, 12.0, + INT_MEMBER(tessdata_manager_debug_level, 0, "Debug level for" + " TessdataManager functions.", this->params()), + STRING_MEMBER(tessedit_load_sublangs, "", + "List of languages to load with this one", this->params()), + double_MEMBER(min_orientation_margin, 7.0, "Min acceptable orientation margin", this->params()), + BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", + this->params()), + BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model", + this->params()), + BOOL_INIT_MEMBER(tessedit_init_config_only, false, + "Only initialize with the config file. 
Useful if the " + "instance is not going to be used for OCR but say only " + "for layout analysis.", this->params()), + BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector", + this->params()), backup_config_file_(NULL), pix_binary_(NULL), + cube_binary_(NULL), pix_grey_(NULL), - orig_image_changed_(false), + source_resolution_(0), textord_(this), right_to_left_(false), + scaled_color_(NULL), + scaled_factor_(-1), deskew_(1.0f, 0.0f), reskew_(1.0f, 0.0f), + most_recently_used_(this), + font_table_size_(0), cube_cntxt_(NULL), - tess_cube_combiner_(NULL) { + tess_cube_combiner_(NULL), + equ_detect_(NULL) { } Tesseract::~Tesseract() { Clear(); + end_tesseract(); + sub_langs_.delete_data_pointers(); // Delete cube objects. if (cube_cntxt_ != NULL) { delete cube_cntxt_; @@ -379,77 +411,124 @@ Tesseract::~Tesseract() { } void Tesseract::Clear() { - if (pix_binary_ != NULL) - pixDestroy(&pix_binary_); - if (pix_grey_ != NULL) - pixDestroy(&pix_grey_); + pixDestroy(&pix_binary_); + pixDestroy(&cube_binary_); + pixDestroy(&pix_grey_); + pixDestroy(&scaled_color_); deskew_ = FCOORD(1.0f, 0.0f); reskew_ = FCOORD(1.0f, 0.0f); - orig_image_changed_ = false; splitter_.Clear(); + scaled_factor_ = -1; + ResetFeaturesHaveBeenExtracted(); + for (int i = 0; i < sub_langs_.size(); ++i) + sub_langs_[i]->Clear(); +} + +void Tesseract::SetEquationDetect(EquationDetect* detector) { + equ_detect_ = detector; + equ_detect_->SetLangTesseract(this); +} + +// Clear all memory of adaption for this and all subclassifiers. +void Tesseract::ResetAdaptiveClassifier() { + ResetAdaptiveClassifierInternal(); + for (int i = 0; i < sub_langs_.size(); ++i) { + sub_langs_[i]->ResetAdaptiveClassifierInternal(); + } +} + +// Clear the document dictionary for this and all subclassifiers. +void Tesseract::ResetDocumentDictionary() { + getDict().ResetDocumentDictionary(); + for (int i = 0; i < sub_langs_.size(); ++i) { + sub_langs_[i]->getDict().ResetDocumentDictionary(); + } } void Tesseract::SetBlackAndWhitelist() { // Set the white and blacklists (if any) unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(), tessedit_char_whitelist.string()); + // Black and white lists should apply to all loaded classifiers. + for (int i = 0; i < sub_langs_.size(); ++i) { + sub_langs_[i]->unicharset.set_black_and_whitelist( + tessedit_char_blacklist.string(), tessedit_char_whitelist.string()); + } } // Perform steps to prepare underlying binary image/other data structures for // page segmentation. void Tesseract::PrepareForPageseg() { + textord_.set_use_cjk_fp_model(textord_use_cjk_fp_model); + pixDestroy(&cube_binary_); + cube_binary_ = pixClone(pix_binary()); + // Find the max splitter strategy over all langs. + ShiroRekhaSplitter::SplitStrategy max_pageseg_strategy = + static_cast<ShiroRekhaSplitter::SplitStrategy>( + static_cast<inT32>(pageseg_devanagari_split_strategy)); + for (int i = 0; i < sub_langs_.size(); ++i) { + ShiroRekhaSplitter::SplitStrategy pageseg_strategy = + static_cast<ShiroRekhaSplitter::SplitStrategy>( + static_cast<inT32>(sub_langs_[i]->pageseg_devanagari_split_strategy)); + if (pageseg_strategy > max_pageseg_strategy) + max_pageseg_strategy = pageseg_strategy; + // Clone the cube image to all the sub langs too. + pixDestroy(&sub_langs_[i]->cube_binary_); + sub_langs_[i]->cube_binary_ = pixClone(pix_binary()); + pixDestroy(&sub_langs_[i]->pix_binary_); + sub_langs_[i]->pix_binary_ = pixClone(pix_binary()); + } // Perform shiro-rekha (top-line) splitting and replace the current image by // the newly splitted image.
splitter_.set_orig_pix(pix_binary()); - splitter_.set_pageseg_split_strategy( - (ShiroRekhaSplitter::SplitStrategy) - ((inT32)pageseg_devanagari_split_strategy)); + splitter_.set_pageseg_split_strategy(max_pageseg_strategy); if (splitter_.Split(true)) { ASSERT_HOST(splitter_.splitted_image()); - splitter_.CopySplittedImageTo(NULL, &pix_binary_); - orig_image_changed_ = true; + pixDestroy(&pix_binary_); + pix_binary_ = pixClone(splitter_.splitted_image()); } } // Perform steps to prepare underlying binary image/other data structures for // OCR. The current segmentation is required by this method. +// Note that this method resets pix_binary_ to the original binarized image, +// which may be different from the image actually used for OCR depending on the +// value of ocr_devanagari_split_strategy. void Tesseract::PrepareForTessOCR(BLOCK_LIST* block_list, Tesseract* osd_tess, OSResults* osr) { - // Creating blobs to OCR. + // Find the max splitter strategy over all langs. + ShiroRekhaSplitter::SplitStrategy max_ocr_strategy = + static_cast<ShiroRekhaSplitter::SplitStrategy>( + static_cast<inT32>(ocr_devanagari_split_strategy)); + for (int i = 0; i < sub_langs_.size(); ++i) { + ShiroRekhaSplitter::SplitStrategy ocr_strategy = + static_cast<ShiroRekhaSplitter::SplitStrategy>( + static_cast<inT32>(sub_langs_[i]->ocr_devanagari_split_strategy)); + if (ocr_strategy > max_ocr_strategy) + max_ocr_strategy = ocr_strategy; + } // Utilize the segmentation information available. splitter_.set_segmentation_block_list(block_list); - splitter_.set_ocr_split_strategy( - (ShiroRekhaSplitter::SplitStrategy) - ((inT32)ocr_devanagari_split_strategy)); - if (splitter_.Split(false)) { - ASSERT_HOST(splitter_.splitted_image()); - splitter_.CopySplittedImageTo(NULL, &pix_binary_); - orig_image_changed_ = true; - // If the split strategies used before pageseg and ocr are the same, the - // segmentation obtained from the second round can be used going forward. - // Otherwise, the page-segmentation (& importantly, the word segmentation) - // of first round is used. - if (splitter_.HasDifferentSplitStrategies()) { - // Refresh the segmentation with new blobs. - BLOCK_LIST new_segmentation; - SegmentPage(NULL, &new_segmentation, osd_tess, osr); - C_BLOB_LIST new_blobs; - ExtractBlobsFromSegmentation(&new_segmentation, &new_blobs); - splitter_.RefreshSegmentationWithNewBlobs(&new_blobs); - } else { - block_list->clear(); - SegmentPage(NULL, block_list, osd_tess, osr); - } + splitter_.set_ocr_split_strategy(max_ocr_strategy); + // Run the splitter for OCR. + bool split_for_ocr = splitter_.Split(false); + // Restore pix_binary to the binarized original pix for future reference. + ASSERT_HOST(splitter_.orig_pix()); + pixDestroy(&pix_binary_); + pix_binary_ = pixClone(splitter_.orig_pix()); + // If the pageseg and ocr strategies are different, refresh the block list + // (from the last SegmentImage call) with blobs from the real image to be used + // for OCR. + if (splitter_.HasDifferentSplitStrategies()) { + BLOCK block("", TRUE, 0, 0, 0, 0, pixGetWidth(pix_binary_), + pixGetHeight(pix_binary_)); + Pix* pix_for_ocr = split_for_ocr ? splitter_.splitted_image() : + splitter_.orig_pix(); + extract_edges(pix_for_ocr, &block); + splitter_.RefreshSegmentationWithNewBlobs(block.blob_list()); } + // The splitter isn't needed any more after this, so save memory by clearing. + splitter_.Clear(); } -// Perform steps to prepare underlying binary image/other data structures for -// Cube OCR. -void Tesseract::PrepareForCubeOCR() { - if (orig_image_changed_) { - // Revert to the original image as Cube likes them more.
- splitter_.CopyOriginalImageTo(NULL, &pix_binary_); - orig_image_changed_ = false; - } -} } // namespace tesseract diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h index 78ab31e006..11ff7033a5 100644 --- a/ccmain/tesseractclass.h +++ b/ccmain/tesseractclass.h @@ -22,14 +22,14 @@ #define TESSERACT_CCMAIN_TESSERACTCLASS_H__ #include "allheaders.h" -#include "genericvector.h" -#include "params.h" -#include "wordrec.h" -#include "ocrclass.h" #include "control.h" #include "docqual.h" #include "devanagari_processing.h" +#include "genericvector.h" +#include "params.h" +#include "ocrclass.h" #include "textord.h" +#include "wordrec.h" class PAGE_RES; class PAGE_RES_IT; @@ -92,11 +92,18 @@ struct OSResults; namespace tesseract { +class ColumnFinder; class CubeLineObject; class CubeObject; class CubeRecoContext; +class EquationDetect; +class Tesseract; class TesseractCubeCombiner; +typedef void (Tesseract::*WordRecognizer)(BLOCK* block, + ROW *row, + WERD_RES *word); + // A collection of various variables for statistics and debugging. struct TesseractStats { TesseractStats() @@ -134,7 +141,16 @@ class Tesseract : public Wordrec { Tesseract(); ~Tesseract(); + // Clear as much used memory as possible without resetting the adaptive + // classifier or losing any other classifier data. void Clear(); + // Clear all memory of adaption for this and all subclassifiers. + void ResetAdaptiveClassifier(); + // Clear the document dictionary for this and all subclassifiers. + void ResetDocumentDictionary(); + + // Set the equation detector. + void SetEquationDetect(EquationDetect* detector); // Simple accessors. const FCOORD& reskew() const { @@ -152,20 +168,40 @@ class Tesseract : public Wordrec { return pix_grey_; } void set_pix_grey(Pix* grey_pix) { + pixDestroy(&pix_grey_); pix_grey_ = grey_pix; } + // Returns a pointer to a Pix representing the best available image of the + // page. The image will be 8-bit grey if the input was grey or color. Note + // that in grey 0 is black and 255 is white. If the input was binary, then + // the returned Pix will be binary. Note that here black is 1 and white is 0. + // To tell the difference pixGetDepth() will return 8 or 1. + // In either case, the return value is a borrowed Pix, and should not be + // deleted or pixDestroyed. + Pix* BestPix() const { + return pix_grey_ != NULL ? 
pix_grey_ : pix_binary_; + } + int source_resolution() const { + return source_resolution_; + } + void set_source_resolution(int ppi) { + source_resolution_ = ppi; + } int ImageWidth() const { return pixGetWidth(pix_binary_); } int ImageHeight() const { return pixGetHeight(pix_binary_); } - - const ShiroRekhaSplitter& splitter() const { - return splitter_; + Pix* scaled_color() const { + return scaled_color_; } - ShiroRekhaSplitter* mutable_splitter() { - return &splitter_; + int scaled_factor() const { + return scaled_factor_; + } + void SetScaledColor(int factor, Pix* color) { + scaled_factor_ = factor; + scaled_color_ = color; } const Textord& textord() const { return textord_; @@ -177,6 +213,12 @@ class Tesseract : public Wordrec { bool right_to_left() const { return right_to_left_; } + int num_sub_langs() const { + return sub_langs_.size(); + } + Tesseract* get_sub_lang(int index) const { + return sub_langs_[index]; + } void SetBlackAndWhitelist(); @@ -194,81 +236,87 @@ class Tesseract : public Wordrec { void PrepareForTessOCR(BLOCK_LIST* block_list, Tesseract* osd_tess, OSResults* osr); - // Perform steps to prepare underlying binary image/other data structures for - // Cube OCR. - void PrepareForCubeOCR(); - int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, Tesseract* osd_tess, OSResults* osr); void SetupWordScripts(BLOCK_LIST* blocks); - int AutoPageSeg(int resolution, bool single_column, - bool osd, bool only_osd, + int AutoPageSeg(bool single_column, bool osd, bool only_osd, BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks, Tesseract* osd_tess, OSResults* osr); + ColumnFinder* SetupPageSegAndDetectOrientation( + bool single_column, bool osd, bool only_osd, + BLOCK_LIST* blocks, Tesseract* osd_tess, OSResults* osr, + TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix, Pix** music_mask_pix); //// control.h ///////////////////////////////////////////////////////// bool ProcessTargetWord(const TBOX& word_box, const TBOX& target_word_box, const char* word_config, int pass); - void recog_all_words(PAGE_RES* page_res, + bool recog_all_words(PAGE_RES* page_res, ETEXT_DESC* monitor, const TBOX* target_word_box, const char* word_config, int dopasses); - void classify_word_pass1( //recog one word - WERD_RES *word, //word to do - ROW *row, - BLOCK* block); + void rejection_passes(PAGE_RES* page_res, + ETEXT_DESC* monitor, + const TBOX* target_word_box, + const char* word_config); + void bigram_correction_pass(PAGE_RES *page_res); + void blamer_pass(PAGE_RES* page_res); + // Helper to recognize the word using the given (language-specific) tesseract. + // Returns true if the result was better than previously. + bool RetryWithLanguage(WERD_RES *word, BLOCK* block, ROW *row, + WordRecognizer recognizer); + void classify_word_and_language(WordRecognizer recognizer, + BLOCK* block, ROW *row, WERD_RES *word); + void classify_word_pass1(BLOCK* block, ROW *row, WERD_RES *word); void recog_pseudo_word(PAGE_RES* page_res, // blocks to check TBOX &selection_box); void fix_rep_char(PAGE_RES_IT* page_res_it); void ExplodeRepeatedWord(BLOB_CHOICE* best_choice, PAGE_RES_IT* page_res_it); - // Callback helper for fix_quotes returns a double quote if both - // arguments are quote, otherwise INVALID_UNICHAR_ID. 
- UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2); - void fix_quotes(WERD_RES* word_res, - BLOB_CHOICE_LIST_CLIST *blob_choices); - ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s, + ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET& char_set, + const char *s, const char *lengths); void match_word_pass2( //recog one word WERD_RES *word, //word to do ROW *row, BLOCK* block); - void classify_word_pass2( //word to do - WERD_RES *word, - BLOCK* block, - ROW *row); + void classify_word_pass2(BLOCK* block, ROW *row, WERD_RES *word); void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES* word, WERD_RES* new_word); bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row); bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row); BOOL8 recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res); - // Callback helper for fix_hyphens returns UNICHAR_ID of - if both - // arguments are hyphen, otherwise INVALID_UNICHAR_ID. - UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2); - // Callback helper for fix_hyphens returns true if box1 and box2 overlap - // (assuming both on the same textline, are in order and a chopped em dash.) - bool HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2); - void fix_hyphens(WERD_RES* word_res, - BLOB_CHOICE_LIST_CLIST *blob_choices); void set_word_fonts( WERD_RES *word, // set fonts of this word BLOB_CHOICE_LIST_CLIST *blob_choices); // detailed results - void font_recognition_pass( //good chars in word - PAGE_RES_IT &page_res_it); + void font_recognition_pass(PAGE_RES* page_res); BOOL8 check_debug_pt(WERD_RES *word, int location); //// cube_control.cpp /////////////////////////////////////////////////// bool init_cube_objects(bool load_combiner, TessdataManager *tessdata_manager); - void run_cube(PAGE_RES *page_res); - void cube_recognize(CubeObject *cube_obj, PAGE_RES_IT *page_res_it); + // Iterates through tesseract's results and calls cube on each word, + // combining the results with the existing tesseract result. + void run_cube_combiner(PAGE_RES *page_res); + // Recognizes a single word using (only) cube. Compatible with + // Tesseract's classify_word_pass1/classify_word_pass2. + void cube_word_pass1(BLOCK* block, ROW *row, WERD_RES *word); + // Cube recognizer to recognize a single word as with classify_word_pass1 + // but also returns the cube object in case the combiner is needed. + CubeObject* cube_recognize_word(BLOCK* block, WERD_RES* word); + // Combines the cube and tesseract results for a single word, leaving the + // result in tess_word. + void cube_combine_word(CubeObject* cube_obj, WERD_RES* cube_word, + WERD_RES* tess_word); + // Call cube on the current word, and write the result to word. + // Sets up a fake result and returns false if something goes wrong. 
+ bool cube_recognize(CubeObject *cube_obj, BLOCK* block, WERD_RES *word); void fill_werd_res(const BoxWord& cube_box_word, WERD_CHOICE* cube_werd_choice, const char* cube_best_str, - PAGE_RES_IT *page_res_it); + WERD_RES* tess_werd_res); bool extract_cube_state(CubeObject* cube_obj, int* num_chars, Boxa** char_boxes, CharSamp*** char_samples); bool create_cube_box_word(Boxa *char_boxes, int num_chars, @@ -287,7 +335,11 @@ class Tesseract : public Wordrec { inT16 count_alphanums(const WERD_CHOICE &word); inT16 count_alphas(const WERD_CHOICE &word); //// tessedit.h //////////////////////////////////////////////////////// - void read_config_file(const char *filename, bool init_only); + void read_config_file(const char *filename, SetParamConstraint constraint); + // Initialize for potentially a set of languages defined by the language + // string and recursively any additional languages required by any language + // traineddata file (via tessedit_load_sublangs in its config) that is loaded. + // See init_tesseract_internal for args. int init_tesseract(const char *arg0, const char *textbase, const char *language, @@ -303,6 +355,35 @@ class Tesseract : public Wordrec { return init_tesseract(datapath, NULL, language, oem, NULL, 0, NULL, NULL, false); } + // Common initialization for a single language. + // arg0 is the datapath for the tessdata directory, which could be the + // path of the tessdata directory with no trailing /, or (if tessdata + // lives in the same directory as the executable) the path of the executable, + // hence the name arg0. + // textbase is an optional output file basename (used only for training) + // language is the language code to load. + // oem controls which engine(s) will operate on the image + // configs (argv) is an array of config filenames to load variables from. + // May be NULL. + // configs_size (argc) is the number of elements in configs. + // vars_vec is an optional vector of variables to set. + // vars_values is an optional corresponding vector of values for the variables + // in vars_vec. + // If set_only_non_debug_params is true, only non-debug parameters will be + // set. + int init_tesseract_internal(const char *arg0, + const char *textbase, + const char *language, + OcrEngineMode oem, + char **configs, + int configs_size, + const GenericVector<STRING> *vars_vec, + const GenericVector<STRING> *vars_values, + bool set_only_non_debug_params); + + // Set the universal_id member of each font to be unique among all + // instances of the same font loaded.
+ void SetupUniversalFontIds(); int init_tesseract_lm(const char *arg0, const char *textbase, @@ -321,6 +402,10 @@ class Tesseract : public Wordrec { const GenericVector<STRING> *vars_values, bool set_only_init_params); + void ParseLanguageString(const char* lang_str, + GenericVector<STRING>* to_load, + GenericVector<STRING>* not_to_load); + //// pgedit.h ////////////////////////////////////////////////////////// SVMenuNode *build_menu_new(); void pgeditor_main(int width, int height, PAGE_RES* page_res); @@ -360,8 +445,8 @@ class Tesseract : public Wordrec { inT16 count_alphanums( //how many alphanums WERD_RES *word); void flip_0O(WERD_RES *word); - BOOL8 non_0_digit(UNICHAR_ID unichar_id); - BOOL8 non_O_upper(UNICHAR_ID unichar_id); + BOOL8 non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id); + BOOL8 non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id); BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row); void nn_match_word( //Match a word WERD_RES *word, @@ -372,7 +457,7 @@ class Tesseract : public Wordrec { void set_done( //set done flag WERD_RES *word, inT16 pass); - inT16 safe_dict_word(const WERD_CHOICE &word); + inT16 safe_dict_word(const WERD_RES *werd_res); // is best_choice in dict? void flip_hyphens(WERD_RES *word); void reject_I_1_L(WERD_RES *word); void reject_edge_blobs(WERD_RES *word); @@ -425,10 +510,6 @@ class Tesseract : public Wordrec { void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc); void convert_bad_unlv_chs(WERD_RES *word_res); - // Callback helper for merge_tess_fails returns a space if both - // arguments are space, otherwise INVALID_UNICHAR_ID. - UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2); - void merge_tess_fails(WERD_RES *word_res); void tilde_delete(PAGE_RES_IT &page_res_it); inT16 word_blob_quality(WERD_RES *word, ROW *row); void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, @@ -487,11 +568,13 @@ class Tesseract : public Wordrec { // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: // All fuzzy spaces are removed, and all the words are maximally chopped. - PAGE_RES* SetupApplyBoxes(BLOCK_LIST *block_list); + PAGE_RES* SetupApplyBoxes(const GenericVector<TBOX>& boxes, + BLOCK_LIST *block_list); // Tests the chopper by exhaustively running chop_one_blob. // The word_res will contain filled chopped_word, seam_array, denorm, // box_word and best_state for the maximally chopped word. - void MaximallyChopWord(BLOCK* block, ROW* row, WERD_RES* word_res); + void MaximallyChopWord(const GenericVector<TBOX>& boxes, + BLOCK* block, ROW* row, WERD_RES* word_res); // Gather consecutive blobs that match the given box into the best_state // and corresponding correct_text. // Fights over which box owns which blobs are settled by pre-chopping and // applying the blobs to box or next_box with the least non-overlap. // Returns false if the box was in error, which can only be caused by // failing to find an appropriate blob for a box. // This means that occasionally, blobs may be incorrectly segmented if the // chopper fails to find a suitable chop point.
- bool ResegmentCharBox(PAGE_RES* page_res, + bool ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box, const TBOX& box, const TBOX& next_box, const char* correct_text); // Consume all source blobs that strongly overlap the given box, @@ -602,6 +685,7 @@ class Tesseract : public Wordrec { " a character composed form fragments"); BOOL_VAR_H(tessedit_adaption_debug, false, "Generate and print debug information for adaption"); + INT_VAR_H(bidi_debug, 0, "Debug level for BiDi"); INT_VAR_H(applybox_debug, 1, "Debug level"); INT_VAR_H(applybox_page, 0, "Page number to apply boxes from"); STRING_VAR_H(applybox_exposure_pattern, ".exp", @@ -615,7 +699,7 @@ class Tesseract : public Wordrec { BOOL_VAR_H(applybox_learn_ngrams_mode, false, "Each bounding box is assumed to contain ngrams. Only" " learn the ngrams whose outlines overlap horizontally."); - BOOL_VAR_H(tessedit_draw_outwords, false, "Draw output words"); + BOOL_VAR_H(tessedit_display_outwords, false, "Draw output words"); BOOL_VAR_H(tessedit_training_tess, false, "Call Tess to learn blobs"); BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices"); BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true, @@ -628,6 +712,10 @@ class Tesseract : public Wordrec { "Add words to the document dictionary"); BOOL_VAR_H(tessedit_debug_fonts, false, "Output font info per char"); BOOL_VAR_H(tessedit_debug_block_rejection, false, "Block and Row stats"); + BOOL_VAR_H(tessedit_enable_bigram_correction, false, + "Enable correction based on the word bigram dictionary."); + INT_VAR_H(tessedit_bigram_debug, 0, "Amount of debug output for bigram " + "correction."); INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug"); BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk"); STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation"); @@ -649,12 +737,13 @@ class Tesseract : public Wordrec { BOOL_VAR_H(tessedit_matcher_log, false, "Log matcher activity"); INT_VAR_H(tessedit_test_adaption_mode, 3, "Adaptation decision algorithm for tess"); - BOOL_VAR_H(save_best_choices, false, + BOOL_VAR_H(save_blob_choices, false, "Save the results of the recognition step" " (blob_choices) within the corresponding WERD_CHOICE"); BOOL_VAR_H(test_pt, false, "Test for point"); double_VAR_H(test_pt_x, 99999.99, "xcoord"); double_VAR_H(test_pt_y, 99999.99, "ycoord"); + INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info."); INT_VAR_H(cube_debug_level, 1, "Print cube debug info."); STRING_VAR_H(outlines_odd, "%| ", "Non standard number of outlines"); STRING_VAR_H(outlines_2, "ij!?%\":;", "Non standard number of outlines"); @@ -788,15 +877,23 @@ class Tesseract : public Wordrec { INT_VAR_H(tessedit_page_number, -1, "-1 -> All pages, else specifc page to process"); BOOL_VAR_H(tessedit_write_images, false, "Capture the image from the IPE"); - BOOL_VAR_H(interactive_mode, false, "Run interactively?"); + BOOL_VAR_H(interactive_display_mode, false, "Run interactively?"); STRING_VAR_H(file_type, ".tif", "Filename extension"); BOOL_VAR_H(tessedit_override_permuter, true, "According to dict_word"); INT_VAR_H(tessdata_manager_debug_level, 0, "Debug level for TessdataManager functions."); + STRING_VAR_H(tessedit_load_sublangs, "", + "List of languages to load with this one"); // Min acceptable orientation margin (difference in scores between top and 2nd // choice in OSResults::orientations) to believe the page orientation. 
- double_VAR_H(min_orientation_margin, 12.0, + double_VAR_H(min_orientation_margin, 7.0, "Min acceptable orientation margin"); + BOOL_VAR_H(textord_tabfind_show_vlines, false, "Debug line finding"); + BOOL_VAR_H(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model"); + BOOL_VAR_H(tessedit_init_config_only, false, + "Only initialize with the config file. Useful if the instance is " + "not going to be used for OCR but say only for layout analysis."); + BOOL_VAR_H(textord_equation_detect, false, "Turn on equation detector"); //// ambigsrecog.cpp ///////////////////////////////////////////////////////// FILE *init_recog_training(const STRING &fname); @@ -819,25 +916,40 @@ class Tesseract : public Wordrec { const char* backup_config_file_; // The filename of a config file to read when processing a debug word. STRING word_config_; + // Image used for input to layout analysis and tesseract recognition. + // May be modified by the ShiroRekhaSplitter to eliminate the top-line. Pix* pix_binary_; + // Unmodified image used for input to cube. Always valid. Pix* cube_binary_; + // Grey-level input image if the input was not binary, otherwise NULL. Pix* pix_grey_; + // Input image resolution after any scaling. The resolution is not well + // transmitted by operations on Pix, so we keep an independent record here. + int source_resolution_; // The shiro-rekha splitter object which is used to split top-lines in // Devanagari words to provide a better word and grapheme segmentation. ShiroRekhaSplitter splitter_; - // The boolean records if the currently set - // pix_binary_ member has been modified due to any processing so that this - // may hurt Cube's recognition phase. - bool orig_image_changed_; // Page segmentation/layout Textord textord_; // True if the primary language uses right_to_left reading order. bool right_to_left_; + Pix* scaled_color_; + int scaled_factor_; FCOORD deskew_; FCOORD reskew_; TesseractStats stats_; + // Sub-languages to be tried in addition to this. + GenericVector<Tesseract*> sub_langs_; + // Most recently used Tesseract out of this and sub_langs_. The default + // language for the next word. + Tesseract* most_recently_used_; + // The size of the font table, i.e. max possible font id + 1. + int font_table_size_; // Cube objects. CubeRecoContext* cube_cntxt_; TesseractCubeCombiner *tess_cube_combiner_; + // Equation detector. Note: this pointer is NOT owned by the class. + EquationDetect* equ_detect_; }; } // namespace tesseract diff --git a/ccmain/tfacepp.cpp b/ccmain/tfacepp.cpp index 8f7c74f47a..d845932883 100644 --- a/ccmain/tfacepp.cpp +++ b/ccmain/tfacepp.cpp @@ -23,6 +23,8 @@ #pragma warning(disable:4800) // int/bool warnings #endif +#include + #include "mfcpch.h" #ifdef __UNIX__ #include @@ -58,7 +60,7 @@ void Tesseract::recog_word(WERD_RES *word, (word->best_choice->length() != blob_choices->length())) { tprintf("recog_word ASSERT FAIL String:\"%s\"; " "Strlen=%d; #Blobs=%d; #Choices=%d\n", - word->best_choice->debug_string(unicharset).string(), + word->best_choice->debug_string().string(), word->best_choice->length(), word->box_word->length(), blob_choices->length()); } @@ -128,7 +130,7 @@ void Tesseract::recog_word_recursive(WERD_RES *word, word->raw_choice->append_unichar_id(space_id, 1, 0.0, word->raw_choice->certainty()); } - word->raw_choice->populate_unichars(unicharset); + word->raw_choice->populate_unichars(); } // Do sanity checks and minor fixes on best_choice.
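// Context for the hunks below (illustrative aside, not part of the patch):
// split_and_recog_word() cuts an over-long word at the widest blob gap
// (best_end), recognizes the two halves recursively, and glues the results
// back together. The added ASSERT_HOST(best_end->next != NULL) guards
// against an empty second half before word2 is constructed, and the new
// BlamerBundle code divides any ground-truth boxes between the two halves.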
@@ -162,7 +164,7 @@ void Tesseract::recog_word_recursive(WERD_RES *word, word->best_choice->append_unichar_id(space_id, 1, 0.0, word->best_choice->certainty()); } - word->best_choice->populate_unichars(unicharset); + word->best_choice->populate_unichars(); } } @@ -198,6 +200,7 @@ void Tesseract::split_and_recog_word(WERD_RES *word, prev_blob = blob; } ASSERT_HOST(best_end != NULL); + ASSERT_HOST(best_end->next != NULL); // Make a copy of the word to put the 2nd half in. WERD_RES* word2 = new WERD_RES(*word); @@ -211,6 +214,67 @@ void Tesseract::split_and_recog_word(WERD_RES *word, free_seam_list(word->seam_array); word->seam_array = start_seam_list(word->chopped_word->blobs); word2->seam_array = start_seam_list(word2->chopped_word->blobs); + BlamerBundle *orig_bb = word->blamer_bundle; + STRING blamer_debug; + // Try to adjust truth information. + if (orig_bb != NULL) { + // Find truth boxes that correspond to the split in the blobs. + int b; + int begin2_truth_index = -1; + if (orig_bb->incorrect_result_reason != IRR_NO_TRUTH && + orig_bb->truth_has_char_boxes) { + int end1_x = best_end->bounding_box().right(); + int begin2_x = word2->chopped_word->blobs->bounding_box().left(); + blamer_debug = "Looking for truth split at"; + blamer_debug.add_str_int(" end1_x ", end1_x); + blamer_debug.add_str_int(" begin2_x ", begin2_x); + blamer_debug += "\nnorm_truth_word boxes:\n"; + if (orig_bb->norm_truth_word.length() > 1) { + orig_bb->norm_truth_word.BlobBox(0).append_debug(&blamer_debug); + for (b = 1; b < orig_bb->norm_truth_word.length(); ++b) { + orig_bb->norm_truth_word.BlobBox(b).append_debug(&blamer_debug); + if ((abs(end1_x - orig_bb->norm_truth_word.BlobBox(b-1).right()) < + orig_bb->norm_box_tolerance) && + (abs(begin2_x - orig_bb->norm_truth_word.BlobBox(b).left()) < + orig_bb->norm_box_tolerance)) { + begin2_truth_index = b; + blamer_debug += "Split found\n"; + break; + } + } + } + } + // Populate truth information in word and word2 with the first and second + // part of the original truth. + word->blamer_bundle = new BlamerBundle(); + word2->blamer_bundle = new BlamerBundle(); + if (begin2_truth_index > 0) { + word->blamer_bundle->truth_has_char_boxes = true; + word->blamer_bundle->norm_box_tolerance = orig_bb->norm_box_tolerance; + word2->blamer_bundle->truth_has_char_boxes = true; + word2->blamer_bundle->norm_box_tolerance = orig_bb->norm_box_tolerance; + BlamerBundle *curr_bb = word->blamer_bundle; + for (b = 0; b < orig_bb->norm_truth_word.length(); ++b) { + if (b == begin2_truth_index) curr_bb = word2->blamer_bundle; + curr_bb->norm_truth_word.InsertBox( + b, orig_bb->norm_truth_word.BlobBox(b)); + curr_bb->truth_word.InsertBox(b, orig_bb->truth_word.BlobBox(b)); + curr_bb->truth_text.push_back(orig_bb->truth_text[b]); + } + } else if (orig_bb->incorrect_result_reason == IRR_NO_TRUTH) { + word->blamer_bundle->incorrect_result_reason = IRR_NO_TRUTH; + word2->blamer_bundle->incorrect_result_reason = IRR_NO_TRUTH; + } else { + blamer_debug += "Truth split not found"; + blamer_debug += orig_bb->truth_has_char_boxes ? + "\n" : " (no truth char boxes)\n"; + word->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug, + NULL, wordrec_debug_blamer); + word2->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug, + NULL, wordrec_debug_blamer); + } + } + // Recognize the first part of the word. recog_word_recursive(word, blob_choices); // Recognize the second part of the word. @@ -239,6 +303,75 @@ void Tesseract::split_and_recog_word(WERD_RES *word, // Append the word choices. 
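// (Aside, illustrative only: the alt-choice block below bounds the cross
// product of alternatives from the two halves. Pairs with i, j <=
// kAltsPerPiece are always fused; beyond that, new combinations are added
// only while the total stays under kTooManyAltChoices, so two halves with
// many alternatives each cannot blow up the choice list.)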
*word->best_choice += *word2->best_choice; *word->raw_choice += *word2->raw_choice; + + // How many alt choices from each should we try to get? + const int kAltsPerPiece = 2; + // When do we start throwing away extra alt choices? + const int kTooManyAltChoices = 100; + + if (word->alt_choices.size() > 0 && word2->alt_choices.size() > 0) { + // Construct the Cartesian product of the alt choices of word and word2. + int num_first_alt_choices = word->alt_choices.size(); + // Nota Bene: For the main loop here, we leave in place the first piece's + // own alt_choices in + // word->alt_choices[0] .. word->alt_choices[num_first_alt_choices - 1]. + // These will get fused with the best choices for word2 below. + for (int j = 1; j < word2->alt_choices.size() && + (j <= kAltsPerPiece || word->alt_choices.size() < kTooManyAltChoices); + j++) { + for (int i = 0; i < num_first_alt_choices && + (i <= kAltsPerPiece || + word->alt_choices.size() < kTooManyAltChoices); + i++) { + WERD_CHOICE *wc = new WERD_CHOICE(*word->alt_choices[i]); + *wc += *word2->alt_choices[j]; + word->alt_choices.push_back(wc); + + word->alt_states.push_back(GenericVector<int>()); + GenericVector<int> &alt_state = word->alt_states.back(); + alt_state += word->alt_states[i]; + alt_state += word2->alt_states[j]; + } + } + // Now that we've filled in as many alternates as we want, paste the best + // choice for word2 onto the original word alt_choices. + for (int i = 0; i < num_first_alt_choices; i++) { + *word->alt_choices[i] += *word2->alt_choices[0]; + word->alt_states[i] += word2->alt_states[0]; + } + } + + // Restore the pointer to original blamer bundle and combine blamer + // information recorded in the splits. + if (orig_bb != NULL) { + IncorrectResultReason irr = orig_bb->incorrect_result_reason; + if (irr != IRR_NO_TRUTH_SPLIT) blamer_debug = ""; + if (word->blamer_bundle->incorrect_result_reason != IRR_CORRECT && + word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH && + word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH_SPLIT) { + blamer_debug += "Blame from part 1: "; + blamer_debug += word->blamer_bundle->debug; + irr = word->blamer_bundle->incorrect_result_reason; + } + if (word2->blamer_bundle->incorrect_result_reason != IRR_CORRECT && + word2->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH && + word2->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH_SPLIT) { + blamer_debug += "Blame from part 2: "; + blamer_debug += word2->blamer_bundle->debug; + if (irr == IRR_CORRECT) { + irr = word2->blamer_bundle->incorrect_result_reason; + } else if (irr != word2->blamer_bundle->incorrect_result_reason) { + irr = IRR_UNKNOWN; + } + } + delete word->blamer_bundle; + word->blamer_bundle = orig_bb; + word->blamer_bundle->incorrect_result_reason = irr; + if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) { + word->blamer_bundle->SetBlame(irr, blamer_debug, NULL, + wordrec_debug_blamer); + } + } delete word2; } diff --git a/ccmain/thresholder.cpp b/ccmain/thresholder.cpp index f8ae740d1b..c4b76d9560 100644 --- a/ccmain/thresholder.cpp +++ b/ccmain/thresholder.cpp @@ -33,7 +33,7 @@ ImageThresholder::ImageThresholder() image_data_(NULL), image_width_(0), image_height_(0), image_bytespp_(0), image_bytespl_(0), - scale_(1), yres_(300) { + scale_(1), yres_(300), estimated_res_(300) { SetRectangle(0, 0, 0, 0); } @@ -79,7 +79,7 @@ void ImageThresholder::SetImage(const unsigned char* imagedata, image_bytespp_ = bytes_per_pixel; image_bytespl_ = bytes_per_line; scale_ = 1; - yres_ = 300; + estimated_res_ = yres_ = 300; Init(); } @@ 
-129,7 +129,7 @@ void ImageThresholder::SetImage(const Pix* pix) { image_bytespp_ = depth / 8; image_bytespl_ = pixGetWpl(pix_) * sizeof(l_uint32); scale_ = 1; - yres_ = pixGetYRes(src); + estimated_res_ = yres_ = pixGetYRes(src); Init(); } diff --git a/ccmain/thresholder.h b/ccmain/thresholder.h index 7e21e259ab..a22cf0c561 100644 --- a/ccmain/thresholder.h +++ b/ccmain/thresholder.h @@ -79,12 +79,33 @@ class ImageThresholder { int GetScaleFactor() const { return scale_; } + + // Set the resolution of the source image in pixels per inch. + // This should be called right after SetImage(), and will let us return + // appropriate font sizes for the text. + void SetSourceYResolution(int ppi) { + yres_ = ppi; + estimated_res_ = ppi; + } int GetSourceYResolution() const { return yres_; } int GetScaledYResolution() const { return scale_ * yres_; } + // Set the resolution of the source image in pixels per inch, as estimated + // by the thresholder from the text size found during thresholding. + // This value will be used to set internal size thresholds during recognition + // and will not influence the output "point size." The default value is + // the same as the source resolution. (yres_) + void SetEstimatedResolution(int ppi) { + estimated_res_ = ppi; + } + // Returns the estimated resolution, including any active scaling. + // This value will be used to set internal size thresholds during recognition. + int GetScaledEstimatedResolution() const { + return scale_ * estimated_res_; + } /// Pix vs raw, which to use? /// Implementations should provide the ability to source and target Pix @@ -157,7 +178,8 @@ class ImageThresholder { int image_bytespl_; //< Bytes per line of source image/pix. // Limits of image rectangle to be processed. int scale_; //< Scale factor from original image. - int yres_; //< y pixels/inch in source image + int yres_; //< y pixels/inch in source image. + int estimated_res_; //< Resolution estimate from text size. int rect_left_; int rect_top_; int rect_width_; @@ -167,4 +189,3 @@ class ImageThresholder { } // namespace tesseract. #endif // TESSERACT_CCMAIN_THRESHOLDER_H__ -
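// A minimal usage sketch for the new resolution plumbing (illustrative,
// not part of the patch):
//   ImageThresholder thresholder;
//   thresholder.SetImage(pix);               // resets yres_ from the Pix
//   thresholder.SetSourceYResolution(300);   // caller-supplied true ppi
//   thresholder.SetEstimatedResolution(250); // refined during thresholding
//   int ppi = thresholder.GetScaledEstimatedResolution();
//   // ppi == scale_ * 250: used only for internal size thresholds; the
//   // reported "point size" still derives from yres_.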