diff --git a/api/baseapi.cpp b/api/baseapi.cpp index 53cfab3b02..2011adb0ae 100644 --- a/api/baseapi.cpp +++ b/api/baseapi.cpp @@ -41,6 +41,11 @@ #include #endif // _WIN32 +#include +#include +#include +#include + #if !defined(VERSION) #include "version.h" #endif @@ -73,6 +78,8 @@ #include "strngs.h" #include "openclwrapper.h" +BOOL_VAR(stream_filelist, FALSE, "Stream a filelist from stdin"); + namespace tesseract { /** Minimum sensible image size to be worth running tesseract. */ @@ -536,9 +543,10 @@ void TessBaseAPI::SetSourceResolution(int ppi) { * Because of that, an implementation that sources and targets Pix may end up * with less copies than an implementation that does not. */ -void TessBaseAPI::SetImage(const Pix* pix) { +void TessBaseAPI::SetImage(Pix* pix) { if (InternalSetImage()) thresholder_->SetImage(pix); + SetInputImage(pix); } /** @@ -681,7 +689,8 @@ Boxa* TessBaseAPI::GetComponentImages(PageIteratorLevel level, if (pixa != NULL) { Pix* pix = NULL; if (raw_image) { - pix = page_it->GetImage(level, raw_padding, &left, &top); + pix = page_it->GetImage(level, raw_padding, input_image_, + &left, &top); } else { pix = page_it->GetBinaryImage(level); } @@ -907,50 +916,12 @@ int TessBaseAPI::RecognizeForChopTest(ETEXT_DESC* monitor) { return 0; } -/** - * Recognizes all the pages in the named file, as a multi-page tiff or - * list of filenames, or single image, and gets the appropriate kind of text - * according to parameters: tessedit_create_boxfile, - * tessedit_make_boxes_from_boxes, tessedit_write_unlv, tessedit_create_hocr. - * Calls ProcessPage on each page in the input file, which may be a - * multi-page tiff, single-page other file format, or a plain text list of - * images to read. If tessedit_page_number is non-negative, processing begins - * at that page of a multi-page tiff file, or filelist. - * The text is returned in text_out. Returns false on error. - * If non-zero timeout_millisec terminates processing after the timeout on - * a single page. - * If non-NULL and non-empty, and some page fails for some reason, - * the page is reprocessed with the retry_config config file. Useful - * for interactively debugging a bad page. - */ -bool TessBaseAPI::ProcessPages(const char* filename, - const char* retry_config, int timeout_millisec, - STRING* text_out) { - TessResultRenderer* renderer = NewRenderer(); - - if (!ProcessPages(filename, retry_config, timeout_millisec, renderer)) { - delete renderer; - return false; - } - - const char* out_data; - inT32 out_len; - bool success = renderer->GetOutput(&out_data, &out_len); - if (success) { - // TODO(ewiseblatt): 20111103 - // if text_out->size() != out_len then we have binary data which STRING wont - // support so this should fail. Really want to eliminate this interface - // alltogether so not worrying about at this time. - text_out->assign(out_data, out_len); - } - delete renderer; - return success; -} - void TessBaseAPI::SetInputImage(Pix *pix) { if (input_image_) pixDestroy(&input_image_); - input_image_ = pixClone(pix); + input_image_ = NULL; + if (pix) + input_image_ = pixClone(pix); } Pix* TessBaseAPI::GetInputImage() { @@ -971,168 +942,213 @@ int TessBaseAPI::GetSourceYResolution() { return thresholder_->GetSourceYResolution(); } -bool TessBaseAPI::ProcessPages(const char* filename, - const char* retry_config, int timeout_millisec, - TessResultRenderer* renderer) { - PERF_COUNT_START("ProcessPages") - int page = tesseract_->tessedit_page_number; - if (page < 0) - page = 0; - FILE* fp = fopen(filename, "rb"); - if (fp == NULL) { - tprintf("Image file %s cannot be opened!\n", filename); - return false; +// If flist exists, get data from there. Otherwise get data from buf. +// Seems convoluted, but is the easiest way I know of to meet multiple +// goals. Support streaming from stdin, and also work on platforms +// lacking fmemopen. +bool TessBaseAPI::ProcessPagesFileList(FILE *flist, + STRING *buf, + const char* retry_config, + int timeout_millisec, + TessResultRenderer* renderer, + int tessedit_page_number) { + if (!flist && !buf) return false; + int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0; + char pagename[MAX_PATH]; + + GenericVector lines; + if (!flist) { + buf->split('\n', &lines); + if (lines.empty()) return false; + } + + // Skip to the requested page number. + for (int i = 0; i < page; i++) { + if (flist) { + if (fgets(pagename, sizeof(pagename), flist) == NULL) break; + } } - // Find the number of pages if a tiff file, or zero otherwise. - int npages = 0; - int format; - Pix *pix; - pix = pixRead(filename); - format = pixGetInputFormat(pix); - if (format == IFF_TIFF || format == IFF_TIFF_PACKBITS || - format == IFF_TIFF_RLE || format == IFF_TIFF_G3 || - format == IFF_TIFF_G4 || format == IFF_TIFF_LZW || - format == IFF_TIFF_ZIP) - tiffGetCount(fp, &npages); - fclose(fp); - bool success = true; + // Begin producing output const char* kUnknownTitle = ""; if (renderer && !renderer->BeginDocument(kUnknownTitle)) { - success = false; + return false; + } + + // Loop over all pages - or just the requested one + while (true) { + if (flist) { + if (fgets(pagename, sizeof(pagename), flist) == NULL) break; + } else { + if (page >= lines.size()) break; + snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str()); + } + chomp_string(pagename); + Pix *pix = pixRead(pagename); + if (pix == NULL) { + tprintf("Image file %s cannot be read!\n", pagename); + return false; + } + tprintf("Page %d : %s\n", page, pagename); + bool r = ProcessPage(pix, page, pagename, retry_config, + timeout_millisec, renderer); + pixDestroy(&pix); + if (!r) return false; + if (tessedit_page_number >= 0) break; + ++page; + } + + // Finish producing output + if (renderer && !renderer->EndDocument()) { + return false; } + return true; +} +bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, + size_t size, + const char* filename, + const char* retry_config, + int timeout_millisec, + TessResultRenderer* renderer, + int tessedit_page_number) { + Pix *pix = NULL; #ifdef USE_OPENCL OpenclDevice od; #endif - - if (npages > 0) { - pixDestroy(&pix); - for (; page < npages; ++page) { - // only use opencl if compiled w/ OpenCL and selected device is opencl + int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0; + for (; ; ++page) { + if (tessedit_page_number >= 0) + page = tessedit_page_number; #ifdef USE_OPENCL - if ( od.selectedDeviceIsOpenCL() ) { - pix = od.pixReadTiffCl(filename, page); - } else { + if ( od.selectedDeviceIsOpenCL() ) { + // FIXME(jbreiden) Not implemented. + pix = od.pixReadMemTiffCl(data, size, page); + } else { #endif - pix = pixReadTiff(filename, page); + pix = pixReadMemTiff(data, size, page); #ifdef USE_OPENCL - } + } #endif + if (pix == NULL) break; + tprintf("Page %d\n", page + 1); + char page_str[kMaxIntSize]; + snprintf(page_str, kMaxIntSize - 1, "%d", page); + SetVariable("applybox_page", page_str); + bool r = ProcessPage(pix, page, filename, retry_config, + timeout_millisec, renderer); + pixDestroy(&pix); + if (!r) return false; + if (tessedit_page_number >= 0) break; + } + return true; +} - if (pix == NULL) break; - - if ((page >= 0) && (npages > 1)) - tprintf("Page %d of %d\n", page + 1, npages); - char page_str[kMaxIntSize]; - snprintf(page_str, kMaxIntSize - 1, "%d", page); - SetVariable("applybox_page", page_str); - success &= ProcessPage(pix, page, filename, retry_config, - timeout_millisec, renderer); - pixDestroy(&pix); - if (tesseract_->tessedit_page_number >= 0 || npages == 1) { - break; - } - } +// In the ideal scenario, Tesseract will start working on data as soon +// as it can. For example, if you steam a filelist through stdin, we +// should start the OCR process as soon as the first filename is +// available. This is particularly useful when hooking Tesseract up to +// slow hardware such as a book scanning machine. +// +// Unfortunately there are tradeoffs. You can't seek on stdin. That +// makes automatic detection of datatype (TIFF? filelist? PNG?) +// impractical. So we support a command line flag to explicitly +// identify the scenario that really matters: filelists on +// stdin. We'll still do our best if the user likes pipes. That means +// piling up any data coming into stdin into a memory buffer. +bool TessBaseAPI::ProcessPages(const char* filename, + const char* retry_config, int timeout_millisec, + TessResultRenderer* renderer) { + PERF_COUNT_START("ProcessPages") + bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-"); + if (stdInput) { +#ifdef WIN32 + if (_setmode(_fileno(stdin), _O_BINARY) == -1) + tprintf("ERROR: cin to binary: %s", strerror(errno)); +#endif // WIN32 + } + + if (stream_filelist) { + return ProcessPagesFileList(stdin, NULL, retry_config, + timeout_millisec, renderer, + tesseract_->tessedit_page_number); + } + + // At this point we are officially in autodection territory. + // That means we are going to buffer stdin so that it is + // seekable. To keep code simple we will also buffer data + // coming from a file. + std::string buf; + if (stdInput) { + buf.assign((std::istreambuf_iterator(std::cin)), + (std::istreambuf_iterator())); } else { - // The file is not a tiff file. - if (pix != NULL) { - success &= ProcessPage(pix, 0, filename, retry_config, - timeout_millisec, renderer); - pixDestroy(&pix); - } else { - // The file is not an image file, so try it as a list of filenames. - FILE* fimg = fopen(filename, "rb"); - if (fimg == NULL) { - tprintf("File %s cannot be opened!\n", filename); - return false; - } - tprintf("Reading %s as a list of filenames...\n", filename); - char pagename[MAX_PATH]; - // Skip to the requested page number. - for (int i = 0; i < page && - fgets(pagename, sizeof(pagename), fimg) != NULL; - ++i); - while (fgets(pagename, sizeof(pagename), fimg) != NULL) { - chomp_string(pagename); - pix = pixRead(pagename); - if (pix == NULL) { - tprintf("Image file %s cannot be read!\n", pagename); - fclose(fimg); - return false; - } - tprintf("Page %d : %s\n", page, pagename); - success &= ProcessPage(pix, page, pagename, retry_config, - timeout_millisec, renderer); - pixDestroy(&pix); - ++page; - } - fclose(fimg); - } + std::ifstream ifs(filename); + buf.assign((std::istreambuf_iterator(ifs)), + (std::istreambuf_iterator())); } - bool all_ok = success; - if (renderer && !renderer->EndDocument()) { - all_ok = false; + // Here is our autodetection + int format; + const l_uint8 * data = reinterpret_cast(buf.c_str()); + findFileFormatBuffer(data, &format); + + // Maybe we have a filelist + if (format == IFF_UNKNOWN) { + STRING s(buf.c_str()); + return ProcessPagesFileList(NULL, &s, retry_config, + timeout_millisec, renderer, + tesseract_->tessedit_page_number); + } + + // Maybe we have a TIFF which is potentially multipage + bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS || + format == IFF_TIFF_RLE || format == IFF_TIFF_G3 || + format == IFF_TIFF_G4 || format == IFF_TIFF_LZW || + format == IFF_TIFF_ZIP); + + // Fail early if we can, before producing any output + Pix *pix = NULL; + if (!tiff) { + pix = pixReadMem(data, buf.size()); + if (pix == NULL) { + return false; + } } - PERF_COUNT_END - return all_ok; -} -/** - * Recognizes a single page for ProcessPages, appending the text to text_out. - * The pix is the image processed - filename and page_index are metadata - * used by side-effect processes, such as reading a box file or formatting - * as hOCR. - * If non-zero timeout_millisec terminates processing after the timeout. - * If non-NULL and non-empty, and some page fails for some reason, - * the page is reprocessed with the retry_config config file. Useful - * for interactively debugging a bad page. - * The text is returned in text_out. Returns false on error. - */ -bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename, - const char* retry_config, int timeout_millisec, - STRING* text_out) { - TessResultRenderer* renderer = NewRenderer(); - - if (!ProcessPage(pix, page_index, filename, retry_config, timeout_millisec, - renderer)) { + // Begin the output + const char* kUnknownTitle = ""; + if (renderer && !renderer->BeginDocument(kUnknownTitle)) { + pixDestroy(&pix); return false; } - const char* out_data; - inT32 out_len; - if (!renderer->GetOutput(&out_data, &out_len)) { - return false; + // Produce output + bool r = false; + if (tiff) { + r = ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config, + timeout_millisec, renderer, + tesseract_->tessedit_page_number); + } else { + r = ProcessPage(pix, 0, filename, retry_config, + timeout_millisec, renderer); + pixDestroy(&pix); } - // TODO(ewiseblatt): 20111103 - // if text_out->size() != out_len then we have binary data which STRING wont - // support so this should fail. Really want to eliminate this interface - // alltogether so not worrying about at this time. - text_out->assign(out_data, out_len); - + // End the output + if (!r || (renderer && !renderer->EndDocument())) { + return false; + } + PERF_COUNT_END return true; } -/** - * Recognizes a single page for ProcessPages, appending the text to text_out. - * The pix is the image processed - filename and page_index are metadata - * used by side-effect processes, such as reading a box file or formatting - * as hOCR. - * If non-zero timeout_millisec terminates processing after the timeout. - * If non-NULL and non-empty, and some page fails for some reason, - * the page is reprocessed with the retry_config config file. Useful - * for interactively debugging a bad page. - * The text is returned in renderer. Returns false on error. - */ bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename, const char* retry_config, int timeout_millisec, TessResultRenderer* renderer) { PERF_COUNT_START("ProcessPage") SetInputName(filename); SetImage(pix); - SetInputImage(pix); bool failed = false; if (timeout_millisec > 0) { // Running with a timeout. @@ -1174,12 +1190,8 @@ bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename, ReadConfigFile(kOldVarsFile); } - if (renderer) { - if (failed) { - renderer->AddError(this); - } else { - failed = !renderer->AddImage(this); - } + if (renderer && !failed) { + failed = !renderer->AddImage(this); } PERF_COUNT_END return !failed; @@ -1782,6 +1794,7 @@ void TessBaseAPI::Clear() { if (thresholder_ != NULL) thresholder_->Clear(); ClearResults(); + SetInputImage(NULL); } /** @@ -1826,6 +1839,10 @@ void TessBaseAPI::End() { delete input_file_; input_file_ = NULL; } + if (input_image_ != NULL) { + pixDestroy(&input_image_); + input_image_ = NULL; + } if (output_file_ != NULL) { delete output_file_; output_file_ = NULL; @@ -2554,22 +2571,6 @@ CubeRecoContext *TessBaseAPI::GetCubeRecoContext() const { return (tesseract_ == NULL) ? NULL : tesseract_->GetCubeRecoContext(); } -TessResultRenderer* TessBaseAPI::NewRenderer() { - if (tesseract_->tessedit_create_boxfile - || tesseract_->tessedit_make_boxes_from_boxes) { - return new TessBoxTextRenderer(); - } else if (tesseract_->tessedit_create_hocr) { - return new TessHOcrRenderer(); - } else if (tesseract_->tessedit_create_pdf) { - return new TessPDFRenderer(tesseract_->datadir.c_str()); - } else if (tesseract_->tessedit_write_unlv) { - return new TessUnlvRenderer(); - } else if (tesseract_->tessedit_create_boxfile) { - return new TessBoxTextRenderer(); - } else { - return new TessTextRenderer(); - } -} /** Escape a char string - remove <>&"' with HTML codes. */ void HOcrEscape(const char* text, STRING& ret) { diff --git a/api/baseapi.h b/api/baseapi.h index 0b26e8a18e..2d60250d9b 100644 --- a/api/baseapi.h +++ b/api/baseapi.h @@ -346,7 +346,7 @@ class TESS_API TessBaseAPI { * Because of that, an implementation that sources and targets Pix may end up * with less copies than an implementation that does not. */ - void SetImage(const Pix* pix); + void SetImage(Pix* pix); /** * Set the resolution of the source image in pixels per inch so font size @@ -505,44 +505,40 @@ class TESS_API TessBaseAPI { int RecognizeForChopTest(ETEXT_DESC* monitor); /** - * Recognizes all the pages in the named file, as a multi-page tiff or - * list of filenames, or single image, and gets the appropriate kind of text - * according to parameters: tessedit_create_boxfile, - * tessedit_make_boxes_from_boxes, tessedit_write_unlv, tessedit_create_hocr. - * Calls ProcessPage on each page in the input file, which may be a - * multi-page tiff, single-page other file format, or a plain text list of - * images to read. If tessedit_page_number is non-negative, processing begins - * at that page of a multi-page tiff file, or filelist. - * The text is returned in text_out. Returns false on error. - * If non-zero timeout_millisec terminates processing after the timeout on - * a single page. - * If non-NULL and non-empty, and some page fails for some reason, - * the page is reprocessed with the retry_config config file. Useful - * for interactively debugging a bad page. + * Turns images into symbolic text. + * + * filename can point to a single image, a multi-page TIFF, + * or a plain text list of image filenames. + * + * retry_config is useful for debugging. If not NULL, you can fall + * back to an alternate configuration if a page fails for some + * reason. + * + * timeout_millisec terminates processing if any single page + * takes too long. Set to 0 for unlimited time. + * + * renderer is responible for creating the output. For example, + * use the TessTextRenderer if you want plaintext output, or + * the TessPDFRender to produce searchable PDF. + * + * If tessedit_page_number is non-negative, will only process that + * single page. Works for multi-page tiff file, or filelist. + * + * Returns true if successful, false on error. */ - bool ProcessPages(const char* filename, - const char* retry_config, int timeout_millisec, - STRING* text_out); - bool ProcessPages(const char* filename, const char* retry_config, int timeout_millisec, TessResultRenderer* renderer); /** - * Recognizes a single page for ProcessPages, appending the text to text_out. - * The pix is the image processed - filename and page_index are metadata - * used by side-effect processes, such as reading a box file or formatting - * as hOCR. - * If non-zero timeout_millisec terminates processing after the timeout. - * If non-NULL and non-empty, and some page fails for some reason, - * the page is reprocessed with the retry_config config file. Useful - * for interactively debugging a bad page. - * The text is returned in text_out. Returns false on error. + * Turn a single image into symbolic text. + * + * The pix is the image processed. filename and page_index are + * metadata used by side-effect processes, such as reading a box + * file or formatting as hOCR. + * + * See ProcessPages for desciptions of other parameters. */ - bool ProcessPage(Pix* pix, int page_index, const char* filename, - const char* retry_config, int timeout_millisec, - STRING* text_out); - bool ProcessPage(Pix* pix, int page_index, const char* filename, const char* retry_config, int timeout_millisec, TessResultRenderer* renderer); @@ -852,16 +848,20 @@ class TESS_API TessBaseAPI { /* @} */ private: - /** - * DEPRECATED - * Returns new renderer instance based on how tesseract was configured to - * render results using old API. This should be removed along with those - * attributes so that the renderer is just passed in rather than the - * old methods taking output strings. - * - * Caller must destroy result. - */ - TessResultRenderer* NewRenderer(); + // A list of image filenames gets special consideration + bool ProcessPagesFileList(FILE *fp, + STRING *buf, + const char* retry_config, int timeout_millisec, + TessResultRenderer* renderer, + int tessedit_page_number); + // TIFF supports multipage so gets special consideration + bool ProcessPagesMultipageTiff(const unsigned char *data, + size_t size, + const char* filename, + const char* retry_config, + int timeout_millisec, + TessResultRenderer* renderer, + int tessedit_page_number); }; // class TessBaseAPI. /** Escape a char string - remove &<>"' with HTML codes. */ diff --git a/api/pdfrenderer.cpp b/api/pdfrenderer.cpp index 14df0126ce..820f856ae3 100644 --- a/api/pdfrenderer.cpp +++ b/api/pdfrenderer.cpp @@ -32,8 +32,8 @@ const int kCharWidth = 2; * PDF Renderer interface implementation **********************************************************************/ -TessPDFRenderer::TessPDFRenderer(const char *datadir) - : TessResultRenderer("PDF", "pdf") { +TessPDFRenderer::TessPDFRenderer(const char* outputbase, const char *datadir) + : TessResultRenderer(outputbase, "pdf") { obj_ = 0; datadir_ = datadir; offsets_.push_back(0); @@ -440,20 +440,26 @@ bool TessPDFRenderer::fileToPDFObj(char *filename, long int objnum, FILE *fp = fopen(filename, "rb"); if (!fp) return false; - int format; + const char *filter; + int spp, w, h; + int cmyk = false; + int format; findFileFormatStream(fp, &format); - if (format != IFF_JFIF_JPEG) { - fclose(fp); - return false; + switch(format) { + case IFF_JFIF_JPEG: + freadHeaderJpeg(fp, &w, &h, &spp, NULL, &cmyk); + filter = "/DCTDecode"; + break; + case IFF_JP2: + freadHeaderJp2k(fp, &w, &h, &spp); + filter = "/JPXDecode"; + break; + default: + fclose(fp); + return false; } - fseek(fp, 0, SEEK_END); - long int jpeg_size = ftell(fp); - fseek(fp, 0, SEEK_SET); - - int spp, cmyk, w, h; - freadHeaderJpeg(fp, &w, &h, &spp, NULL, &cmyk); const char *colorspace; switch (spp) { case 1: @@ -472,6 +478,10 @@ bool TessPDFRenderer::fileToPDFObj(char *filename, long int objnum, return false; } + fseek(fp, 0, SEEK_END); + long int file_size = ftell(fp); + fseek(fp, 0, SEEK_SET); + // IMAGE snprintf(b1, sizeof(b1), "%ld 0 obj\n" @@ -482,10 +492,10 @@ bool TessPDFRenderer::fileToPDFObj(char *filename, long int objnum, " /Width %d\n" " /Height %d\n" " /BitsPerComponent 8\n" - " /Filter /DCTDecode\n" + " /Filter %s\n" ">>\n" - "stream\n", objnum, jpeg_size, - colorspace, w, h); + "stream\n", objnum, file_size, + colorspace, w, h, filter); size_t b1_len = strlen(b1); snprintf(b2, sizeof(b2), @@ -494,17 +504,17 @@ bool TessPDFRenderer::fileToPDFObj(char *filename, long int objnum, "endobj\n"); size_t b2_len = strlen(b2); - *pdf_object_size = b1_len + jpeg_size + b2_len; + *pdf_object_size = b1_len + file_size + b2_len; *pdf_object = new char[*pdf_object_size]; if (!pdf_object) return false; memcpy(*pdf_object, b1, b1_len); - if (static_cast(fread(*pdf_object + b1_len, 1, jpeg_size, fp)) != - jpeg_size) { + if (static_cast(fread(*pdf_object + b1_len, 1, file_size, fp)) != + file_size) { delete[] pdf_object; return false; } - memcpy(*pdf_object + b1_len + jpeg_size, b2, b2_len); + memcpy(*pdf_object + b1_len + file_size, b2, b2_len); fclose(fp); return true; } diff --git a/api/renderer.cpp b/api/renderer.cpp index 54fcf5556b..37f984238f 100644 --- a/api/renderer.cpp +++ b/api/renderer.cpp @@ -14,23 +14,30 @@ namespace tesseract { -// Start with a 4K output buffer which should be pretty big for a page of text -// though might need to grow for other formats or multi-page documents. -static const int kInitialAlloc = 1 << 12; - /********************************************************************** * Base Renderer interface implementation **********************************************************************/ -TessResultRenderer::TessResultRenderer(const char* type, const char* extension) - : full_typename_(type), file_extension_(extension), +TessResultRenderer::TessResultRenderer(const char *outputbase, + const char* extension) + : file_extension_(extension), title_(""), imagenum_(-1), - output_data_(NULL), - next_(NULL) { - ResetData(); + fout_(stdout), + next_(NULL), + happy_(true) { + if (strcmp(outputbase, "-") && strcmp(outputbase, "stdout")) { + STRING outfile = STRING(outputbase) + STRING(".") + STRING(file_extension_); + fout_ = fopen(outfile.string(), "wb"); + if (fout_ == NULL) { + happy_ = false; + } + } } TessResultRenderer::~TessResultRenderer() { - delete[] output_data_; + if (fout_ != stdout) + fclose(fout_); + else + clearerr(fout_); delete next_; } @@ -48,8 +55,7 @@ void TessResultRenderer::insert(TessResultRenderer* next) { } bool TessResultRenderer::BeginDocument(const char* title) { - ResetData(); - + if (!happy_) return false; title_ = title; imagenum_ = -1; bool ok = BeginDocumentHandler(); @@ -60,6 +66,7 @@ bool TessResultRenderer::BeginDocument(const char* title) { } bool TessResultRenderer::AddImage(TessBaseAPI* api) { + if (!happy_) return false; ++imagenum_; bool ok = AddImageHandler(api); if (next_) { @@ -68,16 +75,8 @@ bool TessResultRenderer::AddImage(TessBaseAPI* api) { return ok; } -bool TessResultRenderer::AddError(TessBaseAPI* api) { - ++imagenum_; - bool ok = AddErrorHandler(api); - if (next_) { - ok = next_->AddError(api) && ok; - } - return ok; -} - bool TessResultRenderer::EndDocument() { + if (!happy_) return false; bool ok = EndDocumentHandler(); if (next_) { ok = next_->EndDocument() && ok; @@ -85,62 +84,29 @@ bool TessResultRenderer::EndDocument() { return ok; } -bool TessResultRenderer::GetOutput(const char** data, int* data_len) const { - *data = output_data_; - *data_len = output_len_; - return true; -} - -void TessResultRenderer::ResetData() { - delete[] output_data_; - output_data_ = new char[kInitialAlloc]; - output_alloc_ = kInitialAlloc; - output_len_ = 0; -} - -void TessResultRenderer::ReserveAdditionalData(int relative_len) { - int total = relative_len + output_len_; - if (total <= output_alloc_) - return; - - if (total < 2 * output_alloc_) { - total = 2 * output_alloc_; - } - - char* new_data = new char[total]; - memcpy(new_data, output_data_, output_len_); - delete[] output_data_; - output_data_ = new_data; -} - void TessResultRenderer::AppendString(const char* s) { AppendData(s, strlen(s)); } void TessResultRenderer::AppendData(const char* s, int len) { - ReserveAdditionalData(len); - memcpy(output_data_ + output_len_, s, len); - output_len_ += len; + int n = fwrite(s, 1, len, fout_); + if (n != len) happy_ = false; } bool TessResultRenderer::BeginDocumentHandler() { - return true; -} - -bool TessResultRenderer::AddErrorHandler(TessBaseAPI* api) { - return true; + return happy_; } bool TessResultRenderer::EndDocumentHandler() { - return true; + return happy_; } /********************************************************************** * UTF8 Text Renderer interface implementation **********************************************************************/ -TessTextRenderer::TessTextRenderer() - : TessResultRenderer("Text", "txt") { +TessTextRenderer::TessTextRenderer(const char *outputbase) + : TessResultRenderer(outputbase, "txt") { } bool TessTextRenderer::AddImageHandler(TessBaseAPI* api) { @@ -158,8 +124,8 @@ bool TessTextRenderer::AddImageHandler(TessBaseAPI* api) { /********************************************************************** * HOcr Text Renderer interface implementation **********************************************************************/ -TessHOcrRenderer::TessHOcrRenderer() - : TessResultRenderer("HOcr", "hocr") { +TessHOcrRenderer::TessHOcrRenderer(const char *outputbase) + : TessResultRenderer(outputbase, "hocr") { } bool TessHOcrRenderer::BeginDocumentHandler() { @@ -201,8 +167,8 @@ bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) { /********************************************************************** * UNLV Text Renderer interface implementation **********************************************************************/ -TessUnlvRenderer::TessUnlvRenderer() - : TessResultRenderer("UNLV", "unlv") { +TessUnlvRenderer::TessUnlvRenderer(const char *outputbase) + : TessResultRenderer(outputbase, "unlv") { } bool TessUnlvRenderer::AddImageHandler(TessBaseAPI* api) { @@ -218,8 +184,8 @@ bool TessUnlvRenderer::AddImageHandler(TessBaseAPI* api) { /********************************************************************** * BoxText Renderer interface implementation **********************************************************************/ -TessBoxTextRenderer::TessBoxTextRenderer() - : TessResultRenderer("Box Text", "box") { +TessBoxTextRenderer::TessBoxTextRenderer(const char *outputbase) + : TessResultRenderer(outputbase, "box") { } bool TessBoxTextRenderer::AddImageHandler(TessBaseAPI* api) { diff --git a/api/renderer.h b/api/renderer.h index 080a603f2f..39be8d3fa4 100644 --- a/api/renderer.h +++ b/api/renderer.h @@ -47,7 +47,7 @@ class TESS_API TessResultRenderer { virtual ~TessResultRenderer(); // Takes ownership of pointer so must be new'd instance. - // Renderers arent ordered, but appends the sequences of next parameter + // Renderers aren't ordered, but appends the sequences of next parameter // and existing next(). The renderers should be unique across both lists. void insert(TessResultRenderer* next); @@ -70,23 +70,17 @@ class TESS_API TessResultRenderer { */ bool AddImage(TessBaseAPI* api); - /** - * Called to inform the renderer when tesseract failed on an image. - */ - bool AddError(TessBaseAPI* api); - /** * Finishes the document and finalizes the output data * Invalid if BeginDocument not yet called. */ bool EndDocument(); - const char* full_typename() const { return full_typename_; } const char* file_extension() const { return file_extension_; } const char* title() const { return title_; } /** - * Returns the index of the last image given to AddImage or AddError + * Returns the index of the last image given to AddImage * (i.e. images are incremented whether the image succeeded or not) * * This is always defined. It means either the number of the @@ -96,20 +90,19 @@ class TESS_API TessResultRenderer { */ int imagenum() const { return imagenum_; } - /** - * The results are not defined if EndDocument has not yet been called. - * Returns the current output from the renderer. The data is owned by - * the renderer and only valid until the next call into the renderer - * that may modify document state (such as Begin/End Document - * or AddImage. - */ - virtual bool GetOutput(const char** data, int* data_len) const; - protected: /** - * Called by concrete classes + * Called by concrete classes. + * + * outputbase is the name of the output file excluding + * extension. For example, "/path/to/chocolate-chip-cookie-recipe" + * + * extension indicates the file extension to be used for output + * files. For example "pdf" will produce a .pdf file, and "hocr" + * will produce .hocr files. */ - TessResultRenderer(const char* type, const char* extension); + TessResultRenderer(const char *outputbase, + const char* extension); // Hook for specialized handling in BeginDocument() virtual bool BeginDocumentHandler(); @@ -117,22 +110,9 @@ class TESS_API TessResultRenderer { // This must be overriden to render the OCR'd results virtual bool AddImageHandler(TessBaseAPI* api) = 0; - // The default handler ignores the error and just returns true - virtual bool AddErrorHandler(TessBaseAPI* api); - // Hook for specialized handling in EndDocument() virtual bool EndDocumentHandler(); - // Clear output data. - void ResetData(); - - // Renderers can call this method to allocate data storage in advance, - // which can cut down on allocations and copying. This isnt required, - // and if used can still request less than will ultimately be used without - // worrying about data corruption. It's purely performance. - // Note that relative_len is in addition to what is already being used. - void ReserveAdditionalData(int relative_len); - // Renderers can call this to append '\0' terminated strings into // the output string returned by GetOutput. // This method will grow the output buffer if needed. @@ -145,15 +125,13 @@ class TESS_API TessResultRenderer { void AppendData(const char* s, int len); private: - const char* full_typename_; // name of renderer const char* file_extension_; // standard extension for generated output const char* title_; // title of document being renderered int imagenum_; // index of last image added - char* output_data_; // output bytes - int output_alloc_; // bytes allocated - int output_len_; // bytes actually used - TessResultRenderer* next_; // Can link multiple renderers together. + FILE* fout_; // output file pointer + TessResultRenderer* next_; // Can link multiple renderers together + bool happy_; // I get grumpy when the disk fills up, etc. }; /** @@ -161,7 +139,7 @@ class TESS_API TessResultRenderer { */ class TESS_API TessTextRenderer : public TessResultRenderer { public: - TessTextRenderer(); + explicit TessTextRenderer(const char *outputbase); protected: virtual bool AddImageHandler(TessBaseAPI* api); @@ -172,7 +150,7 @@ class TESS_API TessTextRenderer : public TessResultRenderer { */ class TESS_API TessHOcrRenderer : public TessResultRenderer { public: - TessHOcrRenderer(); + explicit TessHOcrRenderer(const char *outputbase); protected: virtual bool BeginDocumentHandler(); @@ -185,7 +163,9 @@ class TESS_API TessHOcrRenderer : public TessResultRenderer { */ class TESS_API TessPDFRenderer : public TessResultRenderer { public: - TessPDFRenderer(const char *datadir); + // datadir is the location of the TESSDATA. We need it because + // we load a custom PDF font from this location. + TessPDFRenderer(const char *outputbase, const char *datadir); protected: virtual bool BeginDocumentHandler(); @@ -224,7 +204,7 @@ class TESS_API TessPDFRenderer : public TessResultRenderer { */ class TESS_API TessUnlvRenderer : public TessResultRenderer { public: - TessUnlvRenderer(); + explicit TessUnlvRenderer(const char *outputbase); protected: virtual bool AddImageHandler(TessBaseAPI* api); @@ -235,7 +215,7 @@ class TESS_API TessUnlvRenderer : public TessResultRenderer { */ class TESS_API TessBoxTextRenderer : public TessResultRenderer { public: - TessBoxTextRenderer(); + explicit TessBoxTextRenderer(const char *outputbase); protected: virtual bool AddImageHandler(TessBaseAPI* api); diff --git a/api/tesseractmain.cpp b/api/tesseractmain.cpp index a0da91b9d7..c6c41d7775 100644 --- a/api/tesseractmain.cpp +++ b/api/tesseractmain.cpp @@ -85,7 +85,7 @@ int main(int argc, char **argv) { // Make the order of args a bit more forgiving than it used to be. const char* lang = "eng"; const char* image = NULL; - const char* output = NULL; + const char* outputbase = NULL; const char* datapath = NULL; bool noocr = false; bool list_langs = false; @@ -94,7 +94,7 @@ int main(int argc, char **argv) { tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO; int arg = 1; - while (arg < argc && (output == NULL || argv[arg][0] == '-')) { + while (arg < argc && (outputbase == NULL || argv[arg][0] == '-')) { if (strcmp(argv[arg], "-l") == 0 && arg + 1 < argc) { lang = argv[arg + 1]; ++arg; @@ -123,8 +123,8 @@ int main(int argc, char **argv) { ++arg; } else if (image == NULL) { image = argv[arg]; - } else if (output == NULL) { - output = argv[arg]; + } else if (outputbase == NULL) { + outputbase = argv[arg]; } ++arg; } @@ -134,7 +134,7 @@ int main(int argc, char **argv) { noocr = true; } - if (output == NULL && noocr == false) { + if (outputbase == NULL && noocr == false) { fprintf(stderr, "Usage:\n %s imagename|stdin outputbase|stdout " "[options...] [configfile...]\n\n", argv[0]); @@ -172,14 +172,15 @@ int main(int argc, char **argv) { exit(1); } - if (output != NULL && strcmp(output, "-") && strcmp(output, "stdout")) { + if (outputbase != NULL && strcmp(outputbase, "-") && + strcmp(outputbase, "stdout")) { tprintf("Tesseract Open Source OCR Engine v%s with Leptonica\n", tesseract::TessBaseAPI::Version()); } PERF_COUNT_START("Tesseract:main") tesseract::TessBaseAPI api; - api.SetOutputName(output); + api.SetOutputName(outputbase); int rc = api.Init(datapath, lang, tesseract::OEM_DEFAULT, &(argv[arg]), argc - arg, &vars_vec, &vars_values, false); @@ -192,7 +193,12 @@ int main(int argc, char **argv) { for (arg = 0; arg < argc; arg++) { if (strcmp(argv[arg], "-c") == 0 && arg + 1 < argc) { strncpy(opt1, argv[arg + 1], 255); - *(strchr(opt1, '=')) = 0; + char *p = strchr(opt1, '='); + if (!p) { + fprintf(stderr, "Missing = in configvar assignment\n"); + exit(1); + } + *p = 0; strncpy(opt2, strchr(argv[arg + 1], '=') + 1, 255); opt2[254] = 0; ++arg; @@ -239,32 +245,11 @@ int main(int argc, char **argv) { if (api.GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK) api.SetPageSegMode(pagesegmode); - bool stdInput = !strcmp(image, "stdin") || !strcmp(image, "-"); - Pix* pixs = NULL; - if (stdInput) { - char byt; - GenericVector ch_data; - std::istream file(std::cin.rdbuf()); - -#ifdef WIN32 - if (_setmode(_fileno(stdin), _O_BINARY) == -1) - tprintf("ERROR: cin to binary: %s", strerror(errno)); -#endif // WIN32 - - while (file.get(byt)) { - ch_data.push_back(byt); - } - std::cin.ignore(std::cin.rdbuf()->in_avail() + 1); - - pixs = pixReadMem(&ch_data[0], ch_data.size()); - } - if (pagesegmode == tesseract::PSM_AUTO_ONLY || pagesegmode == tesseract::PSM_OSD_ONLY) { int ret_val = 0; - if (!pixs) - pixs = pixRead(image); + Pix* pixs = pixRead(image); if (!pixs) { fprintf(stderr, "Cannot open input file: %s\n", image); exit(2); @@ -296,7 +281,7 @@ int main(int argc, char **argv) { it->Orientation(&orientation, &direction, &order, &deskew_angle); tprintf("Orientation: %d\nWritingDirection: %d\nTextlineOrder: %d\n" \ "Deskew angle: %.4f\n", - orientation, direction, order, deskew_angle); + orientation, direction, order, deskew_angle); } else { ret_val = 1; } @@ -309,59 +294,29 @@ int main(int argc, char **argv) { tesseract::TessResultRenderer* renderer = NULL; bool b; api.GetBoolVariable("tessedit_create_hocr", &b); - if (b && renderer == NULL) renderer = new tesseract::TessHOcrRenderer(); + if (b) + renderer = new tesseract::TessHOcrRenderer(outputbase); api.GetBoolVariable("tessedit_create_pdf", &b); if (b && renderer == NULL) - renderer = new tesseract::TessPDFRenderer(api.GetDatapath()); + renderer = new tesseract::TessPDFRenderer(outputbase, api.GetDatapath()); api.GetBoolVariable("tessedit_write_unlv", &b); - if (b && renderer == NULL) renderer = new tesseract::TessUnlvRenderer(); + if (b && renderer == NULL) + renderer = new tesseract::TessUnlvRenderer(outputbase); api.GetBoolVariable("tessedit_create_boxfile", &b); - if (b && renderer == NULL) renderer = new tesseract::TessBoxTextRenderer(); + if (b && renderer == NULL) + renderer = new tesseract::TessBoxTextRenderer(outputbase); - if (renderer == NULL) renderer = new tesseract::TessTextRenderer(); + if (renderer == NULL) + renderer = new tesseract::TessTextRenderer(outputbase); - if (pixs) { - if (renderer) renderer->BeginDocument(""); - api.ProcessPage(pixs, 0, NULL, NULL, 0, renderer); - if (renderer) renderer->EndDocument(); - pixDestroy(&pixs); - } else { - FILE* fin = fopen(image, "rb"); - if (fin == NULL) { - fprintf(stderr, "Cannot open input file: %s\n", image); - exit(2); - } - fclose(fin); - if (!api.ProcessPages(image, NULL, 0, renderer)) { - fprintf(stderr, "Error during processing.\n"); - exit(1); - } - } - - FILE* fout = stdout; - if (strcmp(output, "-") && strcmp(output, "stdout")) { - STRING outfile = STRING(output) - + STRING(".") - + STRING(renderer->file_extension()); - fout = fopen(outfile.string(), "wb"); - if (fout == NULL) { - fprintf(stderr, "Cannot create output file %s\n", outfile.string()); - exit(1); - } + if (!api.ProcessPages(image, NULL, 0, renderer)) { + fprintf(stderr, "Error during processing.\n"); + exit(1); } - const char* data; - inT32 data_len; - if (renderer->GetOutput(&data, &data_len)) { - fwrite(data, 1, data_len, fout); - if (fout != stdout) - fclose(fout); - else - clearerr(fout); - } PERF_COUNT_END return 0; // Normal exit }