-
Notifications
You must be signed in to change notification settings - Fork 1
/
Tessocr.hh
120 lines (107 loc) · 3.77 KB
/
Tessocr.hh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#ifndef TESSOCR_H
#define TESSOCR_H
#include <tesseract/baseapi.h>
#include <tesseract/ocrclass.h>
#include <tesseract/strngs.h>
#include <tesseract/genericvector.h>
#include "Painter.hh"
#include "HOCRDocument.hh"
#include "Interprocess.hh"
/// Plain record describing one page handled by the OCR code
/// (returned by TessOcr::setPage and accepted by TessOcr::read).
///
/// Every scalar member carries a default initializer so that a
/// default-constructed PageData never exposes indeterminate values.
/// NOTE: with default member initializers the struct is still an aggregate
/// from C++14 onward; member order is unchanged.
struct PageData {
    bool success = false;    ///< Status flag; stays false until a producer sets it.
    QString filename;        ///< File name associated with the page.
    int page = 0;            ///< Page number (numbering base is not visible in this header).
    double angle = 0.0;      ///< Angle for the page -- presumably rotation; TODO confirm units.
    int resolution = 0;      ///< Resolution for the page -- presumably DPI; TODO confirm.
    QList<QImage> ocrAreas;  ///< Images of the areas belonging to this page.
};
/// Bundle of options handed to the OCR entry points
/// (see TessOcr::recognize and TessOcr::CheckFileStatus).
class OcrParam {
public:
    /// Creates an empty parameter set: empty strings, empty page list and a
    /// value-initialized post-processing setting.
    OcrParam() = default;

    /// Creates a parameter set from explicit values.
    /// Defined out of line; how the arguments are stored is not visible here.
    OcrParam(const QString& password, const QString& lang,
             const QList<int>& pages, const PdfPostProcess& pdfPostProcess);

    QString m_password;  ///< Password -- presumably for protected input documents; TODO confirm.
    QString m_lang;      ///< Language setting -- presumably the recognition language; TODO confirm.
    QList<int> m_pages;  ///< Page numbers selected for processing.
    /// Value-initialized so the default constructor never leaves it
    /// indeterminate should PdfPostProcess be an enum or a trivial type.
    PdfPostProcess m_pdfPostProcess{};
};
/// Base progress tracker: remembers a fixed total and counts finished pages.
/// Despite its name the class declares no pure virtual functions.
class AbstractProgressMonitor {
public:
    /// @param total  Value stored unchanged in m_total for derived classes.
    AbstractProgressMonitor(int total) : m_total{total} {}

    virtual ~AbstractProgressMonitor() = default;

    /// Advances the page counter by one.
    /// @return The counter value after the increment.
    int increaseProgress() { return ++m_pageProgress; }

protected:
    const int m_total;      ///< Total supplied at construction; never modified.
    int m_pageProgress{0};  ///< Number of increaseProgress() calls so far.
};
class ProgressMonitor : public AbstractProgressMonitor {
public:
ETEXT_DESC desc;
ProgressMonitor(int nPages, ProgressInfo* interProgressInfo)
: AbstractProgressMonitor(nPages), m_interProcessInfo(interProgressInfo) {
desc.progress = 0;
desc.cancel = CancelCallback;
desc.cancel_this = this;
}
static bool CancelCallback(void* instance, int /*words*/) {
static int lastProcess = 0;
ProgressMonitor* monitor = reinterpret_cast<ProgressMonitor*>(instance);
int progress = monitor->GetProgress();
if(lastProcess != progress) {
lastProcess = progress;
monitor->m_interProcessInfo->m_progress = progress * 0.9;
}
return monitor->Cancelled();
}
bool Cancelled() {
return m_interProcessInfo->m_errCode == ERROR_CODE::CANCLED_BY_USER;
}
private:
ProgressInfo* m_interProcessInfo;
public slots:
int GetProgress() const {
return 100.0 * ((m_pageProgress + desc.progress / 99.0) / m_total);
}
};
/// OCR front end declaring recognition, XML parsing and export entry points.
///
/// Only the inline accessors are defined in this header; every other member
/// function is implemented out of line, so the notes on them below are
/// limited to what the signatures show.
class TessOcr {
public:
    /// File kinds used for both the input and the output type settings.
    enum FILE_TYPE {
        PDF,
        XML,
        TXT,
        IMG
    };

public:
    /// @param parentOfTessdataDir  Presumably the directory that contains the
    ///                             `tessdata` folder; TODO confirm.
    TessOcr(const QString& parentOfTessdataDir);

    // Entry points. Each returns an ERROR_CODE and takes a ProgressInfo record.
    ERROR_CODE recognize(const QString& inPath, const OcrParam& pdfOcrParam, bool autodetectLayout, ProgressInfo* interProcessInfo);
    ERROR_CODE ParseXML(const QString& inPath, ProgressInfo* interProcessInfo);
    ERROR_CODE ExportPdf(const QString& outPath, ProgressInfo* interProcessInfo);
    ERROR_CODE ExporteXML(const QString& outPath, ProgressInfo* interProcessInfo);
    ERROR_CODE ExportTxt(const QString& outPath, ProgressInfo* interProcessInfo);

    /// Stores the output file type.
    void SetOutfileType(FILE_TYPE outfileType) { m_outfileType = outfileType; }
    /// @return The stored output file type (PDF until changed, unless the
    ///         constructor sets another value).
    FILE_TYPE GetOutfileType() const { return m_outfileType; }
    /// Stores the input file type.
    void SetInfileType(FILE_TYPE infileType) { m_infileType = infileType; }
    /// @return The stored input file type (PDF until changed, unless the
    ///         constructor sets another value).
    FILE_TYPE GetInfileType() const { return m_infileType; }

private:
    QList<QImage> GetOCRAreas(const QFileInfo& fileinfo, int resolution, int page);
    void read(const char* hocrtext, PageData pageData);
    QPageSize GetPdfPageSize(const HOCRDocument* hocrdocument);
    ERROR_CODE ExportResult(const QString& outPath, ProgressInfo* interProgressInfo);
    PDFSettings& GetPdfSettings();
    ERROR_CODE CheckFileStatus(const QFileInfo& fileInfo, ProgressInfo* interProcessInfo, const OcrParam& pdfOcrParam = OcrParam());

private:
    void printChildren(PDFPainter& painter, const HOCRItem* item, const PDFSettings& pdfSettings, double px2pu, double imgScale = 1.);
    PDFSettings getPdfSettings() const;
    PageData setPage(int page, bool autodetectLayout, QString filename);

    HOCRDocument m_hocrDocument;
    QString m_parentOfTessdataDir;
    QString m_utf8Text;
    // In-class defaults keep the getters well defined even if a constructor
    // does not initialize these members explicitly.
    FILE_TYPE m_outfileType = PDF;
    FILE_TYPE m_infileType = PDF;
    PDFSettings m_pdfSettings;
    PageData m_pageData;
};
#endif // TESSOCR_H