-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathvesseract.v
212 lines (168 loc) · 4.43 KB
/
vesseract.v
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
module vesseract
import os
// Tesseract_box is one entry of a Tesseract "box" file, as produced by
// `image_to_boxes`: a recognised symbol together with the two corner
// coordinates of its bounding box and the page it was found on.
// Box file line layout: `letter x1 y1 x2 y2 page`, e.g. `H 68 206 91 235 0`.
pub struct Tesseract_box {
pub:
// Recognised symbol (first field of the box line)
letter string
// First corner of the bounding box
x1 u32
y1 u32
// Second corner of the bounding box
x2 u32
y2 u32
// Page number (last field of the box line)
page u32
}
// Tesseract bundles the options passed to the OCR entry points of this
// module (`image_to_string`, `image_to_alto_xml`, `image_to_boxes`).
pub struct Tesseract {
pub:
// Path of the image to process
image string
// Extra command-line arguments forwarded to Tesseract
args string
// Language code to use; defaults to 'eng'
lang string = 'eng'
}
// Tesseract_version holds the installed Tesseract version in both numeric
// and textual form, as filled in by `get_tesseract_version`.
pub struct Tesseract_version {
pub:
// Numeric components of the version
major int
minor int
patch int
// Version token as printed by Tesseract, e.g. '4.1.1'
str string = '0.0.0'
// First line of the `tesseract --version` output
raw string = 'tesseract'
}
// image_to_string runs Tesseract on the image described by `t` and returns
// the recognised text.
// The temporary text file written by Tesseract is read back and then removed.
// Errors from the Tesseract run, from reading the file or from deleting it
// are propagated to the caller.
pub fn image_to_string(t Tesseract) ?string {
	// Launch the OCR; the result tells us where the text output was written
	ocr := extract_text_tesseract(t) ?
	txt_path := ocr.output_filename
	// Load the recognised text, then clean up the temporary file
	text := os.read_file(txt_path) ?
	os.rm(txt_path) ?
	// The last two bytes are dropped - presumably the trailing newline and
	// page separator appended by Tesseract (TODO confirm). Anything of
	// length 0 or 1 is treated as "nothing recognised".
	if text.len > 1 {
		return text[..text.len - 2]
	}
	return ''
}
// get_language_map returns a set-like map whose keys are the language codes
// printed by `tesseract --list-langs` (every value is `true`).
// The first output line is skipped (presumably the "List of available
// languages" header - confirm against your Tesseract build) and empty lines
// are ignored.
// Errors from running Tesseract are propagated to the caller.
fn get_language_map() ?map[string]bool {
	// Ask tesseract for its installed languages
	t_result := run_tesseract(['--list-langs']) or { return err }
	mut langs_supported := map[string]bool{}
	content := t_result.split('\n')
	// Skip first line
	for i in 1 .. content.len {
		// Strip surrounding whitespace so that CRLF output does not leave a
		// trailing '\r' glued to the language code (which would make every
		// lookup such as `'eng' in map` fail)
		code := content[i].trim_space()
		// Filter empty lines
		if code.len > 0 {
			langs_supported[code] = true
		}
	}
	return langs_supported
}
// get_languages returns the language codes installed for Tesseract-OCR.
// The codes are the keys of the map built by `get_language_map`, so
// Tesseract is invoked exactly once.
// Errors from running Tesseract are propagated to the caller.
pub fn get_languages() ?[]string {
	// Get language map (this is what actually runs `tesseract --list-langs`)
	lang_map := get_language_map() or { return err }
	// Flatten the map keys into a list
	mut langs_supported := []string{}
	for code, _ in lang_map {
		langs_supported << code
	}
	return langs_supported
}
// is_language_code_supported reports whether `code` is one of the languages
// known to the installed Tesseract.
// It deliberately returns a plain bool instead of an optional to keep call
// sites simple: any failure to query Tesseract (error, not installed) yields
// `false`.
pub fn is_language_code_supported(code string) bool {
	// Only look the code up when the language map could be obtained
	if supported := get_language_map() {
		return code in supported
	}
	return false
}
// get_tesseract_version runs `tesseract --version` and parses the first
// output line (expected shape: `tesseract 4.1.1`).
// Returns a Tesseract_version where:
// - `raw` is the first output line (without a trailing '\r'),
// - `str` is the second space-separated token of that line,
// - `major`/`minor`/`patch` are the dot-separated numeric components of
//   `str`; components that are absent (e.g. `4.1`) are reported as 0.
// Returns an error if Tesseract cannot be run or if the first line does not
// contain a version token.
pub fn get_tesseract_version() ?Tesseract_version {
	// Get tesseract version
	t_result := run_tesseract(['--version']) or { return err }
	// Get tesseract string
	lines := t_result.split('\n')
	t_version_raw := lines[0].trim('\r')
	// Extract version string - ex: 4.1.1
	tokens := t_version_raw.split(' ')
	if tokens.len < 2 {
		// Indexing tokens[1] blindly would panic on unexpected output
		return error('vesseract: unable to parse Tesseract version: ' + t_version_raw)
	}
	t_version_str := tokens[1]
	// Some builds print the version with a leading 'v' (e.g. `v5.0.0`);
	// drop it for the numeric parsing only, `str` keeps the original token
	t_version_num := t_version_str.trim_left('v').split('.')
	// Extract major/minor/patch, tolerating versions with fewer components
	mut numbers := [0, 0, 0]
	for i in 0 .. numbers.len {
		if i < t_version_num.len {
			numbers[i] = int(t_version_num[i].u32())
		}
	}
	// Set values into struct
	return Tesseract_version{
		major: numbers[0]
		minor: numbers[1]
		patch: numbers[2]
		str: t_version_str
		raw: t_version_raw
	}
}
// image_to_alto_xml runs Tesseract on `t.image` with ALTO output enabled
// (`-c tessedit_create_alto=1`) and returns the generated XML document.
// The temporary `<id>.xml` file is read back and then removed.
// Returns an error if the installed Tesseract is older than 4.1.0, if
// Tesseract cannot be run, or if the output file cannot be read or deleted.
// NOTE(review): `t.lang` is not forwarded here, unlike the other entry
// points that go through `extract_text_tesseract` - confirm this is intended.
pub fn image_to_alto_xml(t Tesseract) ?string {
	// Check version for alto support
	ver := get_tesseract_version() or { return err }
	// Reject everything below 4.1.0: any 0.x-3.x release as well as 4.0.x.
	// (Testing `major <= 4 && minor < 1` would wrongly accept e.g. 3.5.0.)
	if ver.major < 4 || (ver.major == 4 && ver.minor < 1) {
		return error('vesseract: Alto export require Tesseract >= 4.1.0')
	}
	// Generate result id; tesseract writes its output to `<id>.xml`
	id := generate_id()
	xml_filename := id + '.xml'
	// Run tesseract
	run_tesseract([t.image, id, '-c tessedit_create_alto=1', t.args]) or { return err }
	// Read output
	xml := os.read_file(xml_filename) or { return err }
	// Delete the temporary file
	os.rm(xml_filename) ?
	// Get XML
	return xml
}
// image_to_boxes runs Tesseract with bounding box detection
// (`batch.nochop makebox` appended to `t.args`) and returns one
// Tesseract_box per well-formed line of the generated `<id>.box` file.
// Lines that do not have exactly six space-separated fields are skipped.
// The box file is removed after reading; the text output file, if Tesseract
// produced one, is removed on a best-effort basis.
// Returns an error if Tesseract cannot be run or if the box file cannot be
// read or deleted.
pub fn image_to_boxes(t Tesseract) ?[]Tesseract_box {
	// Run tesseract with bounding box detection
	result := extract_text_tesseract(
		image: t.image
		lang: t.lang
		args: t.args + ' batch.nochop makebox'
	) or { return err }
	// Load box file
	box_path := result.id + '.box'
	box_file := os.read_file(box_path) or { return err }
	// Delete "box" file
	os.rm(box_path) ?
	// Delete the txt output too when present. Whether it exists depends on
	// the Tesseract version/config, so a missing file is not an error.
	if os.exists(result.output_filename) {
		os.rm(result.output_filename) or {}
	}
	// Hold results
	mut boxes := []Tesseract_box{}
	// Parse
	for raw_line in box_file.split('\n') {
		// Letter, x1, y1, x2, y2, page
		// Example: H 68 206 91 235 0
		// Strip '\r' first so CRLF output does not corrupt the last field
		content := raw_line.trim('\r').split(' ')
		// Skip malformed lines
		if content.len != 6 {
			continue
		}
		boxes << Tesseract_box{
			letter: content[0]
			x1: content[1].u32()
			y1: content[2].u32()
			x2: content[3].u32()
			y2: content[4].u32()
			page: content[5].u32()
		}
	}
	return boxes
}