diff --git a/api/capi.cpp b/api/capi.cpp
index 2979ae158f..4f69731400 100644
--- a/api/capi.cpp
+++ b/api/capi.cpp
@@ -64,9 +64,10 @@ TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outpu
     return new TessHOcrRenderer(outputbase, font_info);
 }
 
-TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir)
+TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir,
+                                                             BOOL textonly)
 {
-    return new TessPDFRenderer(outputbase, datadir);
+    return new TessPDFRenderer(outputbase, datadir, textonly);
 }
 
 TESS_API TessResultRenderer* TESS_CALL TessUnlvRendererCreate(const char* outputbase)
diff --git a/api/pdfrenderer.cpp b/api/pdfrenderer.cpp
index 001c86ce41..ebb4922d55 100644
--- a/api/pdfrenderer.cpp
+++ b/api/pdfrenderer.cpp
@@ -178,10 +178,12 @@ const int kCharWidth = 2;
  * PDF Renderer interface implementation
  **********************************************************************/
 
-TessPDFRenderer::TessPDFRenderer(const char* outputbase, const char *datadir)
+TessPDFRenderer::TessPDFRenderer(const char *outputbase, const char *datadir,
+                                 bool textonly)
     : TessResultRenderer(outputbase, "pdf") {
   obj_ = 0;
   datadir_ = datadir;
+  textonly_ = textonly;
   offsets_.push_back(0);
 }
 
@@ -326,7 +328,11 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
   pdf_str.add_str_double("", prec(width));
   pdf_str += " 0 0 ";
   pdf_str.add_str_double("", prec(height));
-  pdf_str += " 0 0 cm /Im1 Do Q\n";
+  pdf_str += " 0 0 cm";
+  if (!textonly_) {
+    pdf_str += " /Im1 Do";
+  }
+  pdf_str += " Q\n";
 
   int line_x1 = 0;
   int line_y1 = 0;
@@ -837,6 +843,7 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix,
 bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
   size_t n;
   char buf[kBasicBufSize];
+  char buf2[kBasicBufSize];
   Pix *pix = api->GetInputImage();
   char *filename = (char *)api->GetInputName();
   int ppi = api->GetSourceYResolution();
@@ -845,6 +852,11 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
   double width = pixGetWidth(pix) * 72.0 / ppi;
   double height = pixGetHeight(pix) * 72.0 / ppi;
 
+  // Resource entry for the page image. It is left empty in text-only
+  // mode because no image object is written.
+  snprintf(buf2, sizeof(buf2), "/XObject << /Im1 %ld 0 R >>\n", obj_ + 2);
+  const char *xobject = (textonly_) ? "" : buf2;
+
   // PAGE
   n = snprintf(buf, sizeof(buf),
                "%ld 0 obj\n"
@@ -855,19 +867,18 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
                " /Contents %ld 0 R\n"
                " /Resources\n"
                " <<\n"
-               " /XObject << /Im1 %ld 0 R >>\n"
+               " %s"
                " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
                " /Font << /f-0-0 %ld 0 R >>\n"
                " >>\n"
                ">>\n"
                "endobj\n",
                obj_,
-               2L,            // Pages object
-               width,
-               height,
-               obj_ + 1,      // Contents object
-               obj_ + 2,      // Image object
-               3L);           // Type0 Font
+               2L,  // Pages object
+               width, height,
+               obj_ + 1,  // Contents object
+               xobject,   // Image object
+               3L);       // Type0 Font
   if (n >= sizeof(buf)) return false;
   pages_.push_back(obj_);
   AppendPDFObject(buf);
@@ -904,13 +915,15 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
   objsize += strlen(b2);
   AppendPDFObjectDIY(objsize);
 
-  char *pdf_object;
-  if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) {
-    return false;
+  if (!textonly_) {
+    char *pdf_object = nullptr;
+    if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) {
+      return false;
+    }
+    AppendData(pdf_object, objsize);
+    AppendPDFObjectDIY(objsize);
+    delete[] pdf_object;
   }
-  AppendData(pdf_object, objsize);
-  AppendPDFObjectDIY(objsize);
-  delete[] pdf_object;
   return true;
 }
 
diff --git a/api/renderer.h b/api/renderer.h
index 97ccff2674..b23e1fff22 100644
--- a/api/renderer.h
+++ b/api/renderer.h
@@ -186,7 +186,10 @@ class TESS_API TessPDFRenderer : public TessResultRenderer {
  public:
   // datadir is the location of the TESSDATA. We need it because
   // we load a custom PDF font from this location.
-  TessPDFRenderer(const char *outputbase, const char *datadir);
+  // If textonly is true, the page images are left out of the PDF and
+  // only the text is written. It defaults to false for existing callers.
+  TessPDFRenderer(const char *outputbase, const char *datadir,
+                  bool textonly = false);
 
  protected:
   virtual bool BeginDocumentHandler();
@@ -196,20 +199,20 @@ class TESS_API TessPDFRenderer : public TessResultRenderer {
  private:
   // We don't want to have every image in memory at once,
   // so we store some metadata as we go along producing
-  // PDFs one page at a time. At the end that metadata is
+  // PDFs one page at a time. At the end, that metadata is
   // used to make everything that isn't easily handled in a
   // streaming fashion.
   long int obj_;                     // counter for PDF objects
   GenericVector<long int> offsets_;  // offset of every PDF object in bytes
   GenericVector<long int> pages_;    // object number for every /Page object
   const char *datadir_;              // where to find the custom font
+  bool textonly_;                    // skip images if set
   // Bookkeeping only. DIY = Do It Yourself.
   void AppendPDFObjectDIY(size_t objectsize);
   // Bookkeeping + emit data.
   void AppendPDFObject(const char *data);
   // Create the /Contents object for an entire page.
-  static char* GetPDFTextObjects(TessBaseAPI* api,
-                                 double width, double height);
+  char* GetPDFTextObjects(TessBaseAPI* api, double width, double height);
   // Turn an image into a PDF object. Only transcode if we have to.
   static bool imageToPDFObj(Pix *pix, char *filename, long int objnum,
                             char **pdf_object, long int *pdf_object_size);
diff --git a/api/tesseractmain.cpp b/api/tesseractmain.cpp
index 210eec61d6..7cd6ff8427 100644
--- a/api/tesseractmain.cpp
+++ b/api/tesseractmain.cpp
@@ -348,8 +348,10 @@ void PreloadRenderers(
 
     api->GetBoolVariable("tessedit_create_pdf", &b);
     if (b) {
-      renderers->push_back(
-          new tesseract::TessPDFRenderer(outputbase, api->GetDatapath()));
+      bool textonly = false;
+      api->GetBoolVariable("textonly_pdf", &textonly);
+      renderers->push_back(new tesseract::TessPDFRenderer(
+          outputbase, api->GetDatapath(), textonly));
     }
 
     api->GetBoolVariable("tessedit_write_unlv", &b);
diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp
index 156633c0fb..bd23830d28 100644
--- a/ccmain/tesseractclass.cpp
+++ b/ccmain/tesseractclass.cpp
@@ -389,6 +389,8 @@ Tesseract::Tesseract()
                   this->params()),
       BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
                   this->params()),
+      BOOL_MEMBER(textonly_pdf, false, "Invisible text only for PDF",
+                  this->params()),
       STRING_MEMBER(unrecognised_char, "|",
                     "Output char for unidentified blobs", this->params()),
       INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h
index 5c93e3bc20..472fd4318c 100644
--- a/ccmain/tesseractclass.h
+++ b/ccmain/tesseractclass.h
@@ -1026,6 +1026,7 @@ class Tesseract : public Wordrec {
   BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
   BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
   BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
+  BOOL_VAR_H(textonly_pdf, false, "Invisible text only for PDF");
   STRING_VAR_H(unrecognised_char, "|",
                "Output char for unidentified blobs");
   INT_VAR_H(suspect_level, 99, "Suspect marker level");