From 2794410c9b5095f1b6607f68584b1fe5b7a32f7c Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Sun, 30 Apr 2017 11:27:17 +0200 Subject: [PATCH] doc: Remove generated files and add rules to build manpages Those files can be built by doc/generate_manpages.sh. The manpages are needed for the installation, so add Makefile rules for them. Git must ignore the generated manpages. Signed-off-by: Stefan Weil --- .gitignore | 8 +- doc/Makefile.am | 12 + doc/ambiguous_words.1 | 46 -- doc/ambiguous_words.1.html | 790 -------------------- doc/ambiguous_words.1.xml | 43 -- doc/cntraining.1 | 54 -- doc/cntraining.1.html | 805 --------------------- doc/cntraining.1.xml | 58 -- doc/combine_tessdata.1 | 205 ------ doc/combine_tessdata.1.html | 996 ------------------------- doc/combine_tessdata.1.xml | 259 ------- doc/dawg2wordlist.1 | 55 -- doc/dawg2wordlist.1.html | 802 --------------------- doc/dawg2wordlist.1.xml | 53 -- doc/mftraining.1 | 94 --- doc/mftraining.1.html | 847 ---------------------- doc/mftraining.1.xml | 102 --- doc/shapeclustering.1 | 94 --- doc/shapeclustering.1.html | 850 ---------------------- doc/shapeclustering.1.xml | 105 --- doc/tesseract.1 | 282 -------- doc/tesseract.1.html | 1202 ------------------------------- doc/tesseract.1.xml | 468 ------------ doc/unicharambigs.5 | 120 --- doc/unicharambigs.5.html | 875 ---------------------- doc/unicharambigs.5.xml | 126 ---- doc/unicharset.5 | 220 ------ doc/unicharset.5.html | 965 ------------------------- doc/unicharset.5.xml | 219 ------ doc/unicharset_extractor.1 | 69 -- doc/unicharset_extractor.1.html | 815 --------------------- doc/unicharset_extractor.1.xml | 63 -- doc/wordlist2dawg.1 | 72 -- doc/wordlist2dawg.1.html | 820 --------------------- doc/wordlist2dawg.1.xml | 69 -- 35 files changed, 17 insertions(+), 12646 deletions(-) delete mode 100644 doc/ambiguous_words.1 delete mode 100644 doc/ambiguous_words.1.html delete mode 100644 doc/ambiguous_words.1.xml delete mode 100644 doc/cntraining.1 delete mode 
100644 doc/cntraining.1.html delete mode 100644 doc/cntraining.1.xml delete mode 100644 doc/combine_tessdata.1 delete mode 100644 doc/combine_tessdata.1.html delete mode 100644 doc/combine_tessdata.1.xml delete mode 100644 doc/dawg2wordlist.1 delete mode 100644 doc/dawg2wordlist.1.html delete mode 100644 doc/dawg2wordlist.1.xml delete mode 100644 doc/mftraining.1 delete mode 100644 doc/mftraining.1.html delete mode 100644 doc/mftraining.1.xml delete mode 100644 doc/shapeclustering.1 delete mode 100644 doc/shapeclustering.1.html delete mode 100644 doc/shapeclustering.1.xml delete mode 100644 doc/tesseract.1 delete mode 100644 doc/tesseract.1.html delete mode 100644 doc/tesseract.1.xml delete mode 100644 doc/unicharambigs.5 delete mode 100644 doc/unicharambigs.5.html delete mode 100644 doc/unicharambigs.5.xml delete mode 100644 doc/unicharset.5 delete mode 100644 doc/unicharset.5.html delete mode 100644 doc/unicharset.5.xml delete mode 100644 doc/unicharset_extractor.1 delete mode 100644 doc/unicharset_extractor.1.html delete mode 100644 doc/unicharset_extractor.1.xml delete mode 100644 doc/wordlist2dawg.1 delete mode 100644 doc/wordlist2dawg.1.html delete mode 100644 doc/wordlist2dawg.1.xml diff --git a/.gitignore b/.gitignore index 1aff930edf..b933321678 100644 --- a/.gitignore +++ b/.gitignore @@ -20,8 +20,6 @@ vs2010/LIB_Release/* vs2010/LIB_OpenCL_Release/* vs2010/LIB_OpenCL_Debug/* - - # Linux # ignore local configuration config.* @@ -36,7 +34,11 @@ libtool stamp-h1 tesseract.pc config_auto.h -doc/html/* +/doc/html/* +/doc/*.1 +/doc/*.5 +/doc/*.html +/doc/*.xml api/tesseract training/ambiguous_words training/classifier_tester diff --git a/doc/Makefile.am b/doc/Makefile.am index beaf408019..8790697e23 100644 --- a/doc/Makefile.am +++ b/doc/Makefile.am @@ -1,4 +1,16 @@ +asciidoc=asciidoc -d manpage + man_MANS = cntraining.1 combine_tessdata.1 mftraining.1 tesseract.1 \ unicharset_extractor.1 wordlist2dawg.1 unicharambigs.5 \ unicharset.5 ambiguous_words.1 
shapeclustering.1 dawg2wordlist.1 + +if MAINTAINER_MODE + EXTRA_DIST = $(man_MANS) Doxyfile + +%: %.asc + $(asciidoc) -o $@ $< + +endif # MAINTAINER_MODE + +MAINTAINERCLEANFILES = $(man_MANS) Doxyfile diff --git a/doc/ambiguous_words.1 b/doc/ambiguous_words.1 deleted file mode 100644 index 1a1761ca3d..0000000000 --- a/doc/ambiguous_words.1 +++ /dev/null @@ -1,46 +0,0 @@ -'\" t -.\" Title: ambiguous_words -.\" Author: [see the "AUTHOR" section] -.\" Generator: DocBook XSL Stylesheets v1.78.1 -.\" Date: 06/12/2015 -.\" Manual: \ \& -.\" Source: \ \& -.\" Language: English -.\" -.TH "AMBIGUOUS_WORDS" "1" "06/12/2015" "\ \&" "\ \&" -.\" ----------------------------------------------------------------- -.\" * Define some portability stuff -.\" ----------------------------------------------------------------- -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.\" http://bugs.debian.org/507673 -.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.ie \n(.g .ds Aq \(aq -.el .ds Aq ' -.\" ----------------------------------------------------------------- -.\" * set default formatting -.\" ----------------------------------------------------------------- -.\" disable hyphenation -.nh -.\" disable justification (adjust text to left margin only) -.ad l -.\" ----------------------------------------------------------------- -.\" * MAIN CONTENT STARTS HERE * -.\" ----------------------------------------------------------------- -.SH "NAME" -ambiguous_words \- generate sets of words Tesseract is likely to find ambiguous -.SH "SYNOPSIS" -.sp -\fBambiguous_words\fR [\-l lang] \fITESSDATADIR\fR \fIWORDLIST\fR \fIAMBIGUOUSFILE\fR -.SH "DESCRIPTION" -.sp -ambiguous_words(1) runs Tesseract in a special mode, and for each word in word list, produces a set of words which Tesseract thinks might be ambiguous with it\&. 
\fITESSDATADIR\fR must be set to the absolute path of a directory containing \fItessdata/lang\&.traineddata\fR\&. -.SH "SEE ALSO" -.sp -tesseract(1) -.SH "COPYING" -.sp -Copyright (C) 2012 Google, Inc\&. Licensed under the Apache License, Version 2\&.0 -.SH "AUTHOR" -.sp -The Tesseract OCR engine was written by Ray Smith and his research groups at Hewlett Packard (1985\-1995) and Google (2006\-present)\&. diff --git a/doc/ambiguous_words.1.html b/doc/ambiguous_words.1.html deleted file mode 100644 index be74b62d0d..0000000000 --- a/doc/ambiguous_words.1.html +++ /dev/null @@ -1,790 +0,0 @@ - - - - - -AMBIGUOUS_WORDS(1) - - - - - -
-
-

SYNOPSIS

-
-

ambiguous_words [-l lang] TESSDATADIR WORDLIST AMBIGUOUSFILE

-
-
-
-

DESCRIPTION

-
-

ambiguous_words(1) runs Tesseract in a special mode, and for each word -in word list, produces a set of words which Tesseract thinks might be -ambiguous with it. TESSDATADIR must be set to the absolute path of -a directory containing tessdata/lang.traineddata.

-
-
-
-

SEE ALSO

-
-

tesseract(1)

-
-
-
-

COPYING

-
-

Copyright (C) 2012 Google, Inc. -Licensed under the Apache License, Version 2.0

-
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - diff --git a/doc/ambiguous_words.1.xml b/doc/ambiguous_words.1.xml deleted file mode 100644 index 4900c6eb93..0000000000 --- a/doc/ambiguous_words.1.xml +++ /dev/null @@ -1,43 +0,0 @@ - - - - - - - AMBIGUOUS_WORDS(1) - - -ambiguous_words -1 -  -  - - - ambiguous_words - generate sets of words Tesseract is likely to find ambiguous - - -ambiguous_words [-l lang] TESSDATADIR WORDLIST AMBIGUOUSFILE - - -DESCRIPTION -ambiguous_words(1) runs Tesseract in a special mode, and for each word -in word list, produces a set of words which Tesseract thinks might be -ambiguous with it. TESSDATADIR must be set to the absolute path of -a directory containing tessdata/lang.traineddata. - - -SEE ALSO -tesseract(1) - - -COPYING -Copyright (C) 2012 Google, Inc. -Licensed under the Apache License, Version 2.0 - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). - - diff --git a/doc/cntraining.1 b/doc/cntraining.1 deleted file mode 100644 index 332655e513..0000000000 --- a/doc/cntraining.1 +++ /dev/null @@ -1,54 +0,0 @@ -'\" t -.\" Title: cntraining -.\" Author: [see the "AUTHOR" section] -.\" Generator: DocBook XSL Stylesheets v1.78.1 -.\" Date: 06/12/2015 -.\" Manual: \ \& -.\" Source: \ \& -.\" Language: English -.\" -.TH "CNTRAINING" "1" "06/12/2015" "\ \&" "\ \&" -.\" ----------------------------------------------------------------- -.\" * Define some portability stuff -.\" ----------------------------------------------------------------- -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.\" http://bugs.debian.org/507673 -.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.ie \n(.g .ds Aq \(aq -.el .ds Aq ' -.\" ----------------------------------------------------------------- -.\" * set default formatting -.\" ----------------------------------------------------------------- 
-.\" disable hyphenation -.nh -.\" disable justification (adjust text to left margin only) -.ad l -.\" ----------------------------------------------------------------- -.\" * MAIN CONTENT STARTS HERE * -.\" ----------------------------------------------------------------- -.SH "NAME" -cntraining \- character normalization training for Tesseract -.SH "SYNOPSIS" -.sp -\fBcntraining\fR [\-D \fIdir\fR] \fIFILE\fR\&... -.SH "DESCRIPTION" -.sp -cntraining takes a list of \&.tr files, from which it generates the \fBnormproto\fR data file (the character normalization sensitivity prototypes)\&. -.SH "OPTIONS" -.PP -\-D \fIdir\fR -.RS 4 -Directory to write output files to\&. -.RE -.SH "SEE ALSO" -.sp -tesseract(1), shapeclustering(1), mftraining(1) -.sp -\m[blue]\fBhttps://github\&.com/tesseract\-ocr/tesseract/wiki/TrainingTesseract\fR\m[] -.SH "COPYING" -.sp -Copyright (c) Hewlett\-Packard Company, 1988 Licensed under the Apache License, Version 2\&.0 -.SH "AUTHOR" -.sp -The Tesseract OCR engine was written by Ray Smith and his research groups at Hewlett Packard (1985\-1995) and Google (2006\-present)\&. diff --git a/doc/cntraining.1.html b/doc/cntraining.1.html deleted file mode 100644 index 7653061e1e..0000000000 --- a/doc/cntraining.1.html +++ /dev/null @@ -1,805 +0,0 @@ - - - - - -CNTRAINING(1) - - - - - -
-
-

SYNOPSIS

-
-

cntraining [-D dir] FILE

-
-
-
-

DESCRIPTION

-
-

cntraining takes a list of .tr files, from which it generates the -normproto data file (the character normalization sensitivity -prototypes).

-
-
-
-

OPTIONS

-
-
-
--D dir -
-
-

- Directory to write output files to. -

-
-
-
-
-
-

SEE ALSO

-
-

tesseract(1), shapeclustering(1), mftraining(1)

- -
-
-
-

COPYING

-
-

Copyright (c) Hewlett-Packard Company, 1988 -Licensed under the Apache License, Version 2.0

-
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - diff --git a/doc/cntraining.1.xml b/doc/cntraining.1.xml deleted file mode 100644 index 6efc99be1d..0000000000 --- a/doc/cntraining.1.xml +++ /dev/null @@ -1,58 +0,0 @@ - - - - - - - CNTRAINING(1) - - -cntraining -1 -  -  - - - cntraining - character normalization training for Tesseract - - -cntraining [-D dir] FILE - - -DESCRIPTION -cntraining takes a list of .tr files, from which it generates the -normproto data file (the character normalization sensitivity -prototypes). - - -OPTIONS - - - --D dir - - - - Directory to write output files to. - - - - - - -SEE ALSO -tesseract(1), shapeclustering(1), mftraining(1) -https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract - - -COPYING -Copyright (c) Hewlett-Packard Company, 1988 -Licensed under the Apache License, Version 2.0 - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). - - diff --git a/doc/combine_tessdata.1 b/doc/combine_tessdata.1 deleted file mode 100644 index 7f29bad422..0000000000 --- a/doc/combine_tessdata.1 +++ /dev/null @@ -1,205 +0,0 @@ -'\" t -.\" Title: combine_tessdata -.\" Author: [see the "AUTHOR" section] -.\" Generator: DocBook XSL Stylesheets v1.78.1 -.\" Date: 06/12/2015 -.\" Manual: \ \& -.\" Source: \ \& -.\" Language: English -.\" -.TH "COMBINE_TESSDATA" "1" "06/12/2015" "\ \&" "\ \&" -.\" ----------------------------------------------------------------- -.\" * Define some portability stuff -.\" ----------------------------------------------------------------- -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.\" http://bugs.debian.org/507673 -.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.ie \n(.g .ds Aq \(aq -.el .ds Aq ' -.\" ----------------------------------------------------------------- -.\" * set default formatting -.\" 
----------------------------------------------------------------- -.\" disable hyphenation -.nh -.\" disable justification (adjust text to left margin only) -.ad l -.\" ----------------------------------------------------------------- -.\" * MAIN CONTENT STARTS HERE * -.\" ----------------------------------------------------------------- -.SH "NAME" -combine_tessdata \- combine/extract/overwrite Tesseract data -.SH "SYNOPSIS" -.sp -\fBcombine_tessdata\fR [\fIOPTION\fR] \fIFILE\fR\&... -.SH "DESCRIPTION" -.sp -combine_tessdata(1) is the main program to combine/extract/overwrite tessdata components in [lang]\&.traineddata files\&. -.sp -To combine all the individual tessdata components (unicharset, DAWGs, classifier templates, ambiguities, language configs) located at, say, /home/$USER/temp/eng\&.* run: -.sp -.if n \{\ -.RS 4 -.\} -.nf -combine_tessdata /home/$USER/temp/eng\&. -.fi -.if n \{\ -.RE -.\} -.sp -The result will be a combined tessdata file /home/$USER/temp/eng\&.traineddata -.sp -Specify option \-e if you would like to extract individual components from a combined traineddata file\&. For example, to extract language config file and the unicharset from tessdata/eng\&.traineddata run: -.sp -.if n \{\ -.RS 4 -.\} -.nf -combine_tessdata \-e tessdata/eng\&.traineddata \e - /home/$USER/temp/eng\&.config /home/$USER/temp/eng\&.unicharset -.fi -.if n \{\ -.RE -.\} -.sp -The desired config file and unicharset will be written to /home/$USER/temp/eng\&.config /home/$USER/temp/eng\&.unicharset -.sp -Specify option \-o to overwrite individual components of the given [lang]\&.traineddata file\&. 
For example, to overwrite language config and unichar ambiguities files in tessdata/eng\&.traineddata use: -.sp -.if n \{\ -.RS 4 -.\} -.nf -combine_tessdata \-o tessdata/eng\&.traineddata \e - /home/$USER/temp/eng\&.config /home/$USER/temp/eng\&.unicharambigs -.fi -.if n \{\ -.RE -.\} -.sp -As a result, tessdata/eng\&.traineddata will contain the new language config and unichar ambigs, plus all the original DAWGs, classifier templates, etc\&. -.sp -Note: the file names of the files to extract to and to overwrite from should have the appropriate file suffixes (extensions) indicating their tessdata component type (\&.unicharset for the unicharset, \&.unicharambigs for unichar ambigs, etc)\&. See k*FileSuffix variable in ccutil/tessdatamanager\&.h\&. -.sp -Specify option \-u to unpack all the components to the specified path: -.sp -.if n \{\ -.RS 4 -.\} -.nf -combine_tessdata \-u tessdata/eng\&.traineddata /home/$USER/temp/eng\&. -.fi -.if n \{\ -.RE -.\} -.sp -This will create /home/$USER/temp/eng\&.* files with individual tessdata components from tessdata/eng\&.traineddata\&. -.SH "OPTIONS" -.sp -\fB\-e\fR \fI\&.traineddata\fR \fIFILE\fR\&...: Extracts the specified components from the \&.traineddata file -.sp -\fB\-o\fR \fI\&.traineddata\fR \fIFILE\fR\&...: Overwrites the specified components of the \&.traineddata file with those provided on the comand line\&. -.sp -\fB\-u\fR \fI\&.traineddata\fR \fIPATHPREFIX\fR Unpacks the \&.traineddata using the provided prefix\&. -.SH "CAVEATS" -.sp -\fIPrefix\fR refers to the full file prefix, including period (\&.) -.SH "COMPONENTS" -.sp -The components in a Tesseract lang\&.traineddata file as of Tesseract 3\&.02 are briefly described below; For more information on many of these files, see \m[blue]\fBhttps://github\&.com/tesseract\-ocr/tesseract/wiki/TrainingTesseract\fR\m[] -.PP -lang\&.config -.RS 4 -(Optional) Language\-specific overrides to default config variables\&. 
-.RE -.PP -lang\&.unicharset -.RS 4 -(Required) The list of symbols that Tesseract recognizes, with properties\&. See unicharset(5)\&. -.RE -.PP -lang\&.unicharambigs -.RS 4 -(Optional) This file contains information on pairs of recognized symbols which are often confused\&. For example, -\fIrn\fR -and -\fIm\fR\&. -.RE -.PP -lang\&.inttemp -.RS 4 -(Required) Character shape templates for each unichar\&. Produced by mftraining(1)\&. -.RE -.PP -lang\&.pffmtable -.RS 4 -(Required) The number of features expected for each unichar\&. Produced by mftraining(1) from -\fB\&.tr\fR -files\&. -.RE -.PP -lang\&.normproto -.RS 4 -(Required) Character normalization prototypes generated by cntraining(1) from -\fB\&.tr\fR -files\&. -.RE -.PP -lang\&.punc\-dawg -.RS 4 -(Optional) A dawg made from punctuation patterns found around words\&. The "word" part is replaced by a single space\&. -.RE -.PP -lang\&.word\-dawg -.RS 4 -(Optional) A dawg made from dictionary words from the language\&. -.RE -.PP -lang\&.number\-dawg -.RS 4 -(Optional) A dawg made from tokens which originally contained digits\&. Each digit is replaced by a space character\&. -.RE -.PP -lang\&.freq\-dawg -.RS 4 -(Optional) A dawg made from the most frequent words which would have gone into word\-dawg\&. -.RE -.PP -lang\&.fixed\-length\-dawgs -.RS 4 -(Optional) Several dawgs of different fixed lengths \(em useful for languages like Chinese\&. -.RE -.PP -lang\&.shapetable -.RS 4 -(Optional) When present, a shapetable is an extra layer between the character classifier and the word recognizer that allows the character classifier to return a collection of unichar ids and fonts instead of a single unichar\-id and font\&. -.RE -.PP -lang\&.bigram\-dawg -.RS 4 -(Optional) A dawg of word bigrams where the words are separated by a space and each digit is replaced by a -\fI?\fR\&. -.RE -.PP -lang\&.unambig\-dawg -.RS 4 -(Optional) TODO: Describe\&. -.RE -.PP -lang\&.params\-training\-model -.RS 4 -(Optional) TODO: Describe\&. 
-.RE -.SH "HISTORY" -.sp -combine_tessdata(1) first appeared in version 3\&.00 of Tesseract -.SH "SEE ALSO" -.sp -tesseract(1), wordlist2dawg(1), cntraining(1), mftraining(1), unicharset(5), unicharambigs(5) -.SH "COPYING" -.sp -Copyright (C) 2009, Google Inc\&. Licensed under the Apache License, Version 2\&.0 -.SH "AUTHOR" -.sp -The Tesseract OCR engine was written by Ray Smith and his research groups at Hewlett Packard (1985\-1995) and Google (2006\-present)\&. diff --git a/doc/combine_tessdata.1.html b/doc/combine_tessdata.1.html deleted file mode 100644 index 2fc45b0855..0000000000 --- a/doc/combine_tessdata.1.html +++ /dev/null @@ -1,996 +0,0 @@ - - - - - -COMBINE_TESSDATA(1) - - - - - -
-
-

SYNOPSIS

-
-

combine_tessdata [OPTION] FILE

-
-
-
-

DESCRIPTION

-
-

combine_tessdata(1) is the main program to combine/extract/overwrite -tessdata components in [lang].traineddata files.

-

To combine all the individual tessdata components (unicharset, DAWGs, -classifier templates, ambiguities, language configs) located at, say, -/home/$USER/temp/eng.* run:

-
-
-
combine_tessdata /home/$USER/temp/eng.
-
-

The result will be a combined tessdata file /home/$USER/temp/eng.traineddata

-

Specify option -e if you would like to extract individual components -from a combined traineddata file. For example, to extract language config -file and the unicharset from tessdata/eng.traineddata run:

-
-
-
combine_tessdata -e tessdata/eng.traineddata \
-  /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
-
-

The desired config file and unicharset will be written to -/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset

-

Specify option -o to overwrite individual components of the given -[lang].traineddata file. For example, to overwrite language config -and unichar ambiguities files in tessdata/eng.traineddata use:

-
-
-
combine_tessdata -o tessdata/eng.traineddata \
-  /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs
-
-

As a result, tessdata/eng.traineddata will contain the new language config -and unichar ambigs, plus all the original DAWGs, classifier templates, etc.

-

Note: the file names of the files to extract to and to overwrite from should -have the appropriate file suffixes (extensions) indicating their tessdata -component type (.unicharset for the unicharset, .unicharambigs for unichar -ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.

-

Specify option -u to unpack all the components to the specified path:

-
-
-
combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.
-
-

This will create /home/$USER/temp/eng.* files with individual tessdata -components from tessdata/eng.traineddata.

-
-
-
-

OPTIONS

-
-

-e .traineddata FILE…: - Extracts the specified components from the .traineddata file

-

-o .traineddata FILE…: - Overwrites the specified components of the .traineddata file - with those provided on the comand line.

-

-u .traineddata PATHPREFIX - Unpacks the .traineddata using the provided prefix.

-
-
-
-

CAVEATS

-
-

Prefix refers to the full file prefix, including period (.)

-
-
-
-

COMPONENTS

-
-

The components in a Tesseract lang.traineddata file as of -Tesseract 3.02 are briefly described below; For more information on -many of these files, see -https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract

-
-
-lang.config -
-
-

- (Optional) Language-specific overrides to default config variables. -

-
-
-lang.unicharset -
-
-

- (Required) The list of symbols that Tesseract recognizes, with properties. - See unicharset(5). -

-
-
-lang.unicharambigs -
-
-

- (Optional) This file contains information on pairs of recognized symbols - which are often confused. For example, rn and m. -

-
-
-lang.inttemp -
-
-

- (Required) Character shape templates for each unichar. Produced by - mftraining(1). -

-
-
-lang.pffmtable -
-
-

- (Required) The number of features expected for each unichar. - Produced by mftraining(1) from .tr files. -

-
-
-lang.normproto -
-
-

- (Required) Character normalization prototypes generated by cntraining(1) - from .tr files. -

-
-
-lang.punc-dawg -
-
-

- (Optional) A dawg made from punctuation patterns found around words. - The "word" part is replaced by a single space. -

-
-
-lang.word-dawg -
-
-

- (Optional) A dawg made from dictionary words from the language. -

-
-
-lang.number-dawg -
-
-

- (Optional) A dawg made from tokens which originally contained digits. - Each digit is replaced by a space character. -

-
-
-lang.freq-dawg -
-
-

- (Optional) A dawg made from the most frequent words which would have - gone into word-dawg. -

-
-
-lang.fixed-length-dawgs -
-
-

- (Optional) Several dawgs of different fixed lengths — useful for - languages like Chinese. -

-
-
-lang.shapetable -
-
-

- (Optional) When present, a shapetable is an extra layer between the character - classifier and the word recognizer that allows the character classifier to - return a collection of unichar ids and fonts instead of a single unichar-id - and font. -

-
-
-lang.bigram-dawg -
-
-

- (Optional) A dawg of word bigrams where the words are separated by a space - and each digit is replaced by a ?. -

-
-
-lang.unambig-dawg -
-
-

- (Optional) TODO: Describe. -

-
-
-lang.params-training-model -
-
-

- (Optional) TODO: Describe. -

-
-
-
-
-
-

HISTORY

-
-

combine_tessdata(1) first appeared in version 3.00 of Tesseract

-
-
-
-

SEE ALSO

-
-

tesseract(1), wordlist2dawg(1), cntraining(1), mftraining(1), unicharset(5), -unicharambigs(5)

-
-
-
-

COPYING

-
-

Copyright (C) 2009, Google Inc. -Licensed under the Apache License, Version 2.0

-
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - diff --git a/doc/combine_tessdata.1.xml b/doc/combine_tessdata.1.xml deleted file mode 100644 index d11bac8fb5..0000000000 --- a/doc/combine_tessdata.1.xml +++ /dev/null @@ -1,259 +0,0 @@ - - - - - - - COMBINE_TESSDATA(1) - - -combine_tessdata -1 -  -  - - - combine_tessdata - combine/extract/overwrite Tesseract data - - -combine_tessdata [OPTION] FILE - - -DESCRIPTION -combine_tessdata(1) is the main program to combine/extract/overwrite -tessdata components in [lang].traineddata files. -To combine all the individual tessdata components (unicharset, DAWGs, -classifier templates, ambiguities, language configs) located at, say, -/home/$USER/temp/eng.* run: -combine_tessdata /home/$USER/temp/eng. -The result will be a combined tessdata file /home/$USER/temp/eng.traineddata -Specify option -e if you would like to extract individual components -from a combined traineddata file. For example, to extract language config -file and the unicharset from tessdata/eng.traineddata run: -combine_tessdata -e tessdata/eng.traineddata \ - /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset -The desired config file and unicharset will be written to -/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset -Specify option -o to overwrite individual components of the given -[lang].traineddata file. For example, to overwrite language config -and unichar ambiguities files in tessdata/eng.traineddata use: -combine_tessdata -o tessdata/eng.traineddata \ - /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs -As a result, tessdata/eng.traineddata will contain the new language config -and unichar ambigs, plus all the original DAWGs, classifier templates, etc. -Note: the file names of the files to extract to and to overwrite from should -have the appropriate file suffixes (extensions) indicating their tessdata -component type (.unicharset for the unicharset, .unicharambigs for unichar -ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h. 
-Specify option -u to unpack all the components to the specified path: -combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng. -This will create /home/$USER/temp/eng.* files with individual tessdata -components from tessdata/eng.traineddata. - - -OPTIONS --e .traineddata FILE…: - Extracts the specified components from the .traineddata file --o .traineddata FILE…: - Overwrites the specified components of the .traineddata file - with those provided on the comand line. --u .traineddata PATHPREFIX - Unpacks the .traineddata using the provided prefix. - - -CAVEATS -Prefix refers to the full file prefix, including period (.) - - -COMPONENTS -The components in a Tesseract lang.traineddata file as of -Tesseract 3.02 are briefly described below; For more information on -many of these files, see -https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract - - - -lang.config - - - - (Optional) Language-specific overrides to default config variables. - - - - - -lang.unicharset - - - - (Required) The list of symbols that Tesseract recognizes, with properties. - See unicharset(5). - - - - - -lang.unicharambigs - - - - (Optional) This file contains information on pairs of recognized symbols - which are often confused. For example, rn and m. - - - - - -lang.inttemp - - - - (Required) Character shape templates for each unichar. Produced by - mftraining(1). - - - - - -lang.pffmtable - - - - (Required) The number of features expected for each unichar. - Produced by mftraining(1) from .tr files. - - - - - -lang.normproto - - - - (Required) Character normalization prototypes generated by cntraining(1) - from .tr files. - - - - - -lang.punc-dawg - - - - (Optional) A dawg made from punctuation patterns found around words. - The "word" part is replaced by a single space. - - - - - -lang.word-dawg - - - - (Optional) A dawg made from dictionary words from the language. - - - - - -lang.number-dawg - - - - (Optional) A dawg made from tokens which originally contained digits. 
- Each digit is replaced by a space character. - - - - - -lang.freq-dawg - - - - (Optional) A dawg made from the most frequent words which would have - gone into word-dawg. - - - - - -lang.fixed-length-dawgs - - - - (Optional) Several dawgs of different fixed lengths — useful for - languages like Chinese. - - - - - -lang.shapetable - - - - (Optional) When present, a shapetable is an extra layer between the character - classifier and the word recognizer that allows the character classifier to - return a collection of unichar ids and fonts instead of a single unichar-id - and font. - - - - - -lang.bigram-dawg - - - - (Optional) A dawg of word bigrams where the words are separated by a space - and each digit is replaced by a ?. - - - - - -lang.unambig-dawg - - - - (Optional) TODO: Describe. - - - - - -lang.params-training-model - - - - (Optional) TODO: Describe. - - - - - - -HISTORY -combine_tessdata(1) first appeared in version 3.00 of Tesseract - - -SEE ALSO -tesseract(1), wordlist2dawg(1), cntraining(1), mftraining(1), unicharset(5), -unicharambigs(5) - - -COPYING -Copyright (C) 2009, Google Inc. -Licensed under the Apache License, Version 2.0 - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). 
- - diff --git a/doc/dawg2wordlist.1 b/doc/dawg2wordlist.1 deleted file mode 100644 index 5fb50b522b..0000000000 --- a/doc/dawg2wordlist.1 +++ /dev/null @@ -1,55 +0,0 @@ -'\" t -.\" Title: dawg2wordlist -.\" Author: [see the "AUTHOR" section] -.\" Generator: DocBook XSL Stylesheets v1.78.1 -.\" Date: 06/12/2015 -.\" Manual: \ \& -.\" Source: \ \& -.\" Language: English -.\" -.TH "DAWG2WORDLIST" "1" "06/12/2015" "\ \&" "\ \&" -.\" ----------------------------------------------------------------- -.\" * Define some portability stuff -.\" ----------------------------------------------------------------- -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.\" http://bugs.debian.org/507673 -.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.ie \n(.g .ds Aq \(aq -.el .ds Aq ' -.\" ----------------------------------------------------------------- -.\" * set default formatting -.\" ----------------------------------------------------------------- -.\" disable hyphenation -.nh -.\" disable justification (adjust text to left margin only) -.ad l -.\" ----------------------------------------------------------------- -.\" * MAIN CONTENT STARTS HERE * -.\" ----------------------------------------------------------------- -.SH "NAME" -dawg2wordlist \- convert a Tesseract DAWG to a wordlist -.SH "SYNOPSIS" -.sp -\fBdawg2wordlist\fR \fIUNICHARSET\fR \fIDAWG\fR \fIWORDLIST\fR -.SH "DESCRIPTION" -.sp -dawg2wordlist(1) converts a Tesseract Directed Acyclic Word Graph (DAWG) to a list of words using a unicharset as key\&. -.SH "OPTIONS" -.sp -\fIUNICHARSET\fR The unicharset of the language\&. This is the unicharset generated by mftraining(1)\&. 
-.sp -\fIDAWG\fR The input DAWG, created by wordlist2dawg(1) -.sp -\fIWORDLIST\fR Plain text (output) file in UTF\-8, one word per line -.SH "SEE ALSO" -.sp -tesseract(1), mftraining(1), wordlist2dawg(1), unicharset(5), combine_tessdata(1) -.sp -\m[blue]\fBhttps://github\&.com/tesseract\-ocr/tesseract/wiki/TrainingTesseract\fR\m[] -.SH "COPYING" -.sp -Copyright (C) 2012 Google, Inc\&. Licensed under the Apache License, Version 2\&.0 -.SH "AUTHOR" -.sp -The Tesseract OCR engine was written by Ray Smith and his research groups at Hewlett Packard (1985\-1995) and Google (2006\-present)\&. diff --git a/doc/dawg2wordlist.1.html b/doc/dawg2wordlist.1.html deleted file mode 100644 index 0b2645dfb7..0000000000 --- a/doc/dawg2wordlist.1.html +++ /dev/null @@ -1,802 +0,0 @@ - - - - - -DAWG2WORDLIST(1) - - - - - -
-
-

SYNOPSIS

-
-

dawg2wordlist UNICHARSET DAWG WORDLIST

-
-
-
-

DESCRIPTION

-
-

dawg2wordlist(1) converts a Tesseract Directed Acyclic Word -Graph (DAWG) to a list of words using a unicharset as key.

-
-
-
-

OPTIONS

-
-

UNICHARSET - The unicharset of the language. This is the unicharset - generated by mftraining(1).

-

DAWG - The input DAWG, created by wordlist2dawg(1)

-

WORDLIST - Plain text (output) file in UTF-8, one word per line

-
-
-
-

SEE ALSO

-
-

tesseract(1), mftraining(1), wordlist2dawg(1), unicharset(5), -combine_tessdata(1)

- -
-
-
-

COPYING

-
-

Copyright (C) 2012 Google, Inc. -Licensed under the Apache License, Version 2.0

-
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - diff --git a/doc/dawg2wordlist.1.xml b/doc/dawg2wordlist.1.xml deleted file mode 100644 index ee960ad9fc..0000000000 --- a/doc/dawg2wordlist.1.xml +++ /dev/null @@ -1,53 +0,0 @@ - - - - - - - DAWG2WORDLIST(1) - - -dawg2wordlist -1 -  -  - - - dawg2wordlist - convert a Tesseract DAWG to a wordlist - - -dawg2wordlist UNICHARSET DAWG WORDLIST - - -DESCRIPTION -dawg2wordlist(1) converts a Tesseract Directed Acyclic Word -Graph (DAWG) to a list of words using a unicharset as key. - - -OPTIONS -UNICHARSET - The unicharset of the language. This is the unicharset - generated by mftraining(1). -DAWG - The input DAWG, created by wordlist2dawg(1) -WORDLIST - Plain text (output) file in UTF-8, one word per line - - -SEE ALSO -tesseract(1), mftraining(1), wordlist2dawg(1), unicharset(5), -combine_tessdata(1) -https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract - - -COPYING -Copyright (C) 2012 Google, Inc. -Licensed under the Apache License, Version 2.0 - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). 
- - diff --git a/doc/mftraining.1 b/doc/mftraining.1 deleted file mode 100644 index 1901850ada..0000000000 --- a/doc/mftraining.1 +++ /dev/null @@ -1,94 +0,0 @@ -'\" t -.\" Title: mftraining -.\" Author: [see the "AUTHOR" section] -.\" Generator: DocBook XSL Stylesheets v1.78.1 -.\" Date: 06/12/2015 -.\" Manual: \ \& -.\" Source: \ \& -.\" Language: English -.\" -.TH "MFTRAINING" "1" "06/12/2015" "\ \&" "\ \&" -.\" ----------------------------------------------------------------- -.\" * Define some portability stuff -.\" ----------------------------------------------------------------- -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.\" http://bugs.debian.org/507673 -.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.ie \n(.g .ds Aq \(aq -.el .ds Aq ' -.\" ----------------------------------------------------------------- -.\" * set default formatting -.\" ----------------------------------------------------------------- -.\" disable hyphenation -.nh -.\" disable justification (adjust text to left margin only) -.ad l -.\" ----------------------------------------------------------------- -.\" * MAIN CONTENT STARTS HERE * -.\" ----------------------------------------------------------------- -.SH "NAME" -mftraining \- feature training for Tesseract -.SH "SYNOPSIS" -.sp -mftraining \-U \fIunicharset\fR \-O \fIlang\&.unicharset\fR \fIFILE\fR\&... -.SH "DESCRIPTION" -.sp -mftraining takes a list of \&.tr files, from which it generates the files \fBinttemp\fR (the shape prototypes), \fBshapetable\fR, and \fBpffmtable\fR (the number of expected features for each character)\&. (A fourth file called Microfeat is also written by this program, but it is not used\&.) 
-.SH "OPTIONS" -.PP -\-U \fIFILE\fR -.RS 4 -(Input) The unicharset generated by unicharset_extractor(1) -.RE -.PP -\-F \fIfont_properties_file\fR -.RS 4 -(Input) font properties file, each line is of the following form, where each field other than the font name is 0 or 1: -.sp -.if n \{\ -.RS 4 -.\} -.nf -*font_name* *italic* *bold* *fixed_pitch* *serif* *fraktur* -.fi -.if n \{\ -.RE -.\} -.RE -.PP -\-X \fIxheights_file\fR -.RS 4 -(Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi\&. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] -.sp -.if n \{\ -.RS 4 -.\} -.nf -*font_name* *xheight* -.fi -.if n \{\ -.RE -.\} -.RE -.PP -\-D \fIdir\fR -.RS 4 -Directory to write output files to\&. -.RE -.PP -\-O \fIFILE\fR -.RS 4 -(Output) The output unicharset that will be given to combine_tessdata(1) -.RE -.SH "SEE ALSO" -.sp -tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1), shapeclustering(1), unicharset(5) -.sp -\m[blue]\fBhttps://github\&.com/tesseract\-ocr/tesseract/wiki/TrainingTesseract\fR\m[] -.SH "COPYING" -.sp -Copyright (C) Hewlett\-Packard Company, 1988 Licensed under the Apache License, Version 2\&.0 -.SH "AUTHOR" -.sp -The Tesseract OCR engine was written by Ray Smith and his research groups at Hewlett Packard (1985\-1995) and Google (2006\-present)\&. diff --git a/doc/mftraining.1.html b/doc/mftraining.1.html deleted file mode 100644 index 41a3804457..0000000000 --- a/doc/mftraining.1.html +++ /dev/null @@ -1,847 +0,0 @@ - - - - - -MFTRAINING(1) - - - - - -
-
-

SYNOPSIS

-
-

mftraining -U unicharset -O lang.unicharset FILE

-
-
-
-

DESCRIPTION

-
-

mftraining takes a list of .tr files, from which it generates the -files inttemp (the shape prototypes), shapetable, and pffmtable -(the number of expected features for each character). (A fourth file -called Microfeat is also written by this program, but it is not used.)

-
-
-
-

OPTIONS

-
-
-
--U FILE -
-
-

- (Input) The unicharset generated by unicharset_extractor(1) -

-
-
--F font_properties_file -
-
-

- (Input) font properties file, each line is of the following form, where each field other than the font name is 0 or 1: -

-
-
-
*font_name* *italic* *bold* *fixed_pitch* *serif* *fraktur*
-
-
-
--X xheights_file -
-
-

- (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] -

-
-
-
*font_name* *xheight*
-
-
-
--D dir -
-
-

- Directory to write output files to. -

-
-
--O FILE -
-
-

- (Output) The output unicharset that will be given to combine_tessdata(1) -

-
-
-
-
-
-

SEE ALSO

-
-

tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1), -shapeclustering(1), unicharset(5)

- -
-
-
-

COPYING

-
-

Copyright (C) Hewlett-Packard Company, 1988 -Licensed under the Apache License, Version 2.0

-
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - diff --git a/doc/mftraining.1.xml b/doc/mftraining.1.xml deleted file mode 100644 index 10b3c6d2e5..0000000000 --- a/doc/mftraining.1.xml +++ /dev/null @@ -1,102 +0,0 @@ - - - - - - - MFTRAINING(1) - - -mftraining -1 -  -  - - - mftraining - feature training for Tesseract - - -mftraining -U unicharset -O lang.unicharset FILE - - -DESCRIPTION -mftraining takes a list of .tr files, from which it generates the -files inttemp (the shape prototypes), shapetable, and pffmtable -(the number of expected features for each character). (A fourth file -called Microfeat is also written by this program, but it is not used.) - - -OPTIONS - - - --U FILE - - - - (Input) The unicharset generated by unicharset_extractor(1) - - - - - --F font_properties_file - - - - (Input) font properties file, each line is of the following form, where each field other than the font name is 0 or 1: - -*font_name* *italic* *bold* *fixed_pitch* *serif* *fraktur* - - - - --X xheights_file - - - - (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] - -*font_name* *xheight* - - - - --D dir - - - - Directory to write output files to. - - - - - --O FILE - - - - (Output) The output unicharset that will be given to combine_tessdata(1) - - - - - - -SEE ALSO -tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1), -shapeclustering(1), unicharset(5) -https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract - - -COPYING -Copyright (C) Hewlett-Packard Company, 1988 -Licensed under the Apache License, Version 2.0 - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). 
- - diff --git a/doc/shapeclustering.1 b/doc/shapeclustering.1 deleted file mode 100644 index f1f9fbdea6..0000000000 --- a/doc/shapeclustering.1 +++ /dev/null @@ -1,94 +0,0 @@ -'\" t -.\" Title: shapeclustering -.\" Author: [see the "AUTHOR" section] -.\" Generator: DocBook XSL Stylesheets v1.78.1 -.\" Date: 06/12/2015 -.\" Manual: \ \& -.\" Source: \ \& -.\" Language: English -.\" -.TH "SHAPECLUSTERING" "1" "06/12/2015" "\ \&" "\ \&" -.\" ----------------------------------------------------------------- -.\" * Define some portability stuff -.\" ----------------------------------------------------------------- -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.\" http://bugs.debian.org/507673 -.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.ie \n(.g .ds Aq \(aq -.el .ds Aq ' -.\" ----------------------------------------------------------------- -.\" * set default formatting -.\" ----------------------------------------------------------------- -.\" disable hyphenation -.nh -.\" disable justification (adjust text to left margin only) -.ad l -.\" ----------------------------------------------------------------- -.\" * MAIN CONTENT STARTS HERE * -.\" ----------------------------------------------------------------- -.SH "NAME" -shapeclustering \- shape clustering training for Tesseract -.SH "SYNOPSIS" -.sp -shapeclustering \-D \fIoutput_dir\fR \-U \fIunicharset\fR \-O \fImfunicharset\fR \-F \fIfont_props\fR \-X \fIxheights\fR \fIFILE\fR\&... -.SH "DESCRIPTION" -.sp -shapeclustering(1) takes extracted feature \&.tr files (generated by tesseract(1) run in a special mode from box files) and produces a file \fBshapetable\fR and an enhanced unicharset\&. This program is still experimental, and is not required (yet) for training Tesseract\&. -.SH "OPTIONS" -.PP -\-U \fIFILE\fR -.RS 4 -The unicharset generated by unicharset_extractor(1)\&. 
-.RE -.PP -\-D \fIdir\fR -.RS 4 -Directory to write output files to\&. -.RE -.PP -\-F \fIfont_properties_file\fR -.RS 4 -(Input) font properties file, where each line is of the following form, where each field other than the font name is 0 or 1: -.sp -.if n \{\ -.RS 4 -.\} -.nf -\*(Aqfont_name\*(Aq \*(Aqitalic\*(Aq \*(Aqbold\*(Aq \*(Aqfixed_pitch\*(Aq \*(Aqserif\*(Aq \*(Aqfraktur\*(Aq -.fi -.if n \{\ -.RE -.\} -.RE -.PP -\-X \fIxheights_file\fR -.RS 4 -(Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi\&. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] -.sp -.if n \{\ -.RS 4 -.\} -.nf -\*(Aqfont_name\*(Aq \*(Aqxheight\*(Aq -.fi -.if n \{\ -.RE -.\} -.RE -.PP -\-O \fIFILE\fR -.RS 4 -The output unicharset that will be given to combine_tessdata(1)\&. -.RE -.SH "SEE ALSO" -.sp -tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1), unicharset(5) -.sp -\m[blue]\fBhttps://github\&.com/tesseract\-ocr/tesseract/wiki/TrainingTesseract\fR\m[] -.SH "COPYING" -.sp -Copyright (C) Google, 2011 Licensed under the Apache License, Version 2\&.0 -.SH "AUTHOR" -.sp -The Tesseract OCR engine was written by Ray Smith and his research groups at Hewlett Packard (1985\-1995) and Google (2006\-present)\&. diff --git a/doc/shapeclustering.1.html b/doc/shapeclustering.1.html deleted file mode 100644 index 5fca944fc8..0000000000 --- a/doc/shapeclustering.1.html +++ /dev/null @@ -1,850 +0,0 @@ - - - - - -SHAPECLUSTERING(1) - - - - - -
-
-

SYNOPSIS

-
-

shapeclustering -D output_dir - -U unicharset -O mfunicharset - -F font_props -X xheights - FILE

-
-
-
-

DESCRIPTION

-
-

shapeclustering(1) takes extracted feature .tr files (generated by -tesseract(1) run in a special mode from box files) and produces a -file shapetable and an enhanced unicharset. This program is still -experimental, and is not required (yet) for training Tesseract.

-
-
-
-

OPTIONS

-
-
-
--U FILE -
-
-

- The unicharset generated by unicharset_extractor(1). -

-
-
--D dir -
-
-

- Directory to write output files to. -

-
-
--F font_properties_file -
-
-

- (Input) font properties file; each line is of the following form, where each field other than the font name is 0 or 1: -

-
-
-
'font_name' 'italic' 'bold' 'fixed_pitch' 'serif' 'fraktur'
-
-
-
--X xheights_file -
-
-

- (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] -

-
-
-
'font_name' 'xheight'
-
-
-
--O FILE -
-
-

- The output unicharset that will be given to combine_tessdata(1). -

-
-
-
-
-
-

SEE ALSO

-
-

tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1), -unicharset(5)

- -
-
-
-

COPYING

-
-

Copyright (C) Google, 2011 -Licensed under the Apache License, Version 2.0

-
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - diff --git a/doc/shapeclustering.1.xml b/doc/shapeclustering.1.xml deleted file mode 100644 index 933789ad3c..0000000000 --- a/doc/shapeclustering.1.xml +++ /dev/null @@ -1,105 +0,0 @@ - - - - - - - SHAPECLUSTERING(1) - - -shapeclustering -1 -  -  - - - shapeclustering - shape clustering training for Tesseract - - -shapeclustering -D output_dir - -U unicharset -O mfunicharset - -F font_props -X xheights - FILE - - -DESCRIPTION -shapeclustering(1) takes extracted feature .tr files (generated by -tesseract(1) run in a special mode from box files) and produces a -file shapetable and an enhanced unicharset. This program is still -experimental, and is not required (yet) for training Tesseract. - - -OPTIONS - - - --U FILE - - - - The unicharset generated by unicharset_extractor(1). - - - - - --D dir - - - - Directory to write output files to. - - - - - --F font_properties_file - - - - (Input) font properties file, where each line is of the following form, where each field other than the font name is 0 or 1: - -'font_name' 'italic' 'bold' 'fixed_pitch' 'serif' 'fraktur' - - - - --X xheights_file - - - - (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] - -'font_name' 'xheight' - - - - --O FILE - - - - The output unicharset that will be given to combine_tessdata(1). - - - - - - -SEE ALSO -tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1), -unicharset(5) -https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract - - -COPYING -Copyright (C) Google, 2011 -Licensed under the Apache License, Version 2.0 - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). 
- - diff --git a/doc/tesseract.1 b/doc/tesseract.1 deleted file mode 100644 index fdf7cdac5c..0000000000 --- a/doc/tesseract.1 +++ /dev/null @@ -1,282 +0,0 @@ -'\" t -.\" Title: tesseract -.\" Author: [see the "AUTHOR" section] -.\" Generator: DocBook XSL Stylesheets v1.78.1 -.\" Date: 03/23/2017 -.\" Manual: \ \& -.\" Source: \ \& -.\" Language: English -.\" -.TH "TESSERACT" "1" "03/23/2017" "\ \&" "\ \&" -.\" ----------------------------------------------------------------- -.\" * Define some portability stuff -.\" ----------------------------------------------------------------- -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.\" http://bugs.debian.org/507673 -.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.ie \n(.g .ds Aq \(aq -.el .ds Aq ' -.\" ----------------------------------------------------------------- -.\" * set default formatting -.\" ----------------------------------------------------------------- -.\" disable hyphenation -.nh -.\" disable justification (adjust text to left margin only) -.ad l -.\" ----------------------------------------------------------------- -.\" * MAIN CONTENT STARTS HERE * -.\" ----------------------------------------------------------------- -.SH "NAME" -tesseract \- command\-line OCR engine -.SH "SYNOPSIS" -.sp -\fBtesseract\fR \fIimagename\fR|\fIstdin\fR \fIoutputbase\fR|\fIstdout\fR [options\&...] [configfile\&...] -.SH "DESCRIPTION" -.sp -tesseract(1) is a commercial quality OCR engine originally developed at HP between 1985 and 1995\&. In 1995, this engine was among the top 3 evaluated by UNLV\&. It was open\-sourced by HP and UNLV in 2005, and has been developed at Google since then\&. -.SH "IN/OUT ARGUMENTS" -.PP -\fIimagename\fR -.RS 4 -The name of the input image\&. Most image file formats (anything readable by Leptonica) are supported\&. 
-.RE -.PP -\fIstdin\fR -.RS 4 -Instruction to read data from standard input -.RE -.PP -\fIoutputbase\fR -.RS 4 -The basename of the output file (to which the appropriate extension will be appended)\&. By default the output will be named -\fIoutbase\&.txt\fR\&. -.RE -.PP -\fIstdout\fR -.RS 4 -Instruction to sent output data to standard output -.RE -.SH "OPTIONS" -.PP -\fI\-\-tessdata\-dir /path\fR -.RS 4 -Specify the location of tessdata path -.RE -.PP -\fI\-\-user\-words /path/to/file\fR -.RS 4 -Specify the location of user words file -.RE -.PP -\fI\-\-user\-patterns /path/to/file specify\fR -.RS 4 -The location of user patterns file -.RE -.PP -\fI\-c configvar=value\fR -.RS 4 -Set value for control parameter\&. Multiple \-c arguments are allowed\&. -.RE -.PP -\fI\-l lang\fR -.RS 4 -The language to use\&. If none is specified, English is assumed\&. Multiple languages may be specified, separated by plus characters\&. Tesseract uses 3\-character ISO 639\-2 language codes\&. (See LANGUAGES) -.RE -.PP -\fI\-\-psm N\fR -.RS 4 -Set Tesseract to only run a subset of layout analysis and assume a certain form of image\&. The options for -\fBN\fR -are: -.sp -.if n \{\ -.RS 4 -.\} -.nf -0 = Orientation and script detection (OSD) only\&. -1 = Automatic page segmentation with OSD\&. -2 = Automatic page segmentation, but no OSD, or OCR\&. -3 = Fully automatic page segmentation, but no OSD\&. (Default) -4 = Assume a single column of text of variable sizes\&. -5 = Assume a single uniform block of vertically aligned text\&. -6 = Assume a single uniform block of text\&. -7 = Treat the image as a single text line\&. -8 = Treat the image as a single word\&. -9 = Treat the image as a single word in a circle\&. -10 = Treat the image as a single character\&. -.fi -.if n \{\ -.RE -.\} -.RE -.PP -\fI\-\-oem N\fR -.RS 4 -Specify OCR Engine mode\&. The options for -\fBN\fR -are: -.sp -.if n \{\ -.RS 4 -.\} -.nf -0 = Original Tesseract only\&. -1 = Neural nets LSTM only\&. 
-2 = Tesseract + LSTM\&. -3 = Default, based on what is available\&. -.fi -.if n \{\ -.RE -.\} -.RE -.PP -\fIconfigfile\fR -.RS 4 -The name of a config to use\&. A config is a plaintext file which contains a list of variables and their values, one per line, with a space separating variable from value\&. Interesting config files include: - -.sp -.RS 4 -.ie n \{\ -\h'-04'\(bu\h'+03'\c -.\} -.el \{\ -.sp -1 -.IP \(bu 2.3 -.\} -hocr \- Output in hOCR format instead of as a text file\&. -.RE -.sp -.RS 4 -.ie n \{\ -\h'-04'\(bu\h'+03'\c -.\} -.el \{\ -.sp -1 -.IP \(bu 2.3 -.\} -pdf \- Output in pdf instead of a text file\&. -.RE -.RE -.sp -\fBNota Bene:\fR The options \fI\-l lang\fR and \fI\-\-psm N\fR must occur before any \fIconfigfile\fR\&. -.SH "SINGLE OPTIONS" -.PP -\fI\-h, \-\-help\fR -.RS 4 -Show help message\&. -.RE -.PP -\fI\-\-help\-psm\fR -.RS 4 -Show page segmentation modes\&. -.RE -.PP -\fI\-\-help\-oem\fR -.RS 4 -Show OCR Engine modes\&. -.RE -.PP -\fI\-v, \-\-version\fR -.RS 4 -Returns the current version of the tesseract(1) executable\&. -.RE -.PP -\fI\-\-list\-langs\fR -.RS 4 -List available languages for tesseract engine\&. Can be used with \-\-tessdata\-dir\&. -.RE -.PP -\fI\-\-print\-parameters\fR -.RS 4 -Print tesseract parameters\&. 
-.RE -.SH "LANGUAGES" -.sp -There are currently language packs available for the following languages (in \m[blue]\fBhttps://github\&.com/tesseract\-ocr/tessdata\fR\m[]): -.sp -\fBafr\fR (Afrikaans) \fBamh\fR (Amharic) \fBara\fR (Arabic) \fBasm\fR (Assamese) \fBaze\fR (Azerbaijani) \fBaze_cyrl\fR (Azerbaijani \- Cyrilic) \fBbel\fR (Belarusian) \fBben\fR (Bengali) \fBbod\fR (Tibetan) \fBbos\fR (Bosnian) \fBbul\fR (Bulgarian) \fBcat\fR (Catalan; Valencian) \fBceb\fR (Cebuano) \fBces\fR (Czech) \fBchi_sim\fR (Chinese \- Simplified) \fBchi_tra\fR (Chinese \- Traditional) \fBchr\fR (Cherokee) \fBcym\fR (Welsh) \fBdan\fR (Danish) \fBdan_frak\fR (Danish \- Fraktur) \fBdeu\fR (German) \fBdeu_frak\fR (German \- Fraktur) \fBdzo\fR (Dzongkha) \fBell\fR (Greek, Modern (1453\-)) \fBeng\fR (English) \fBenm\fR (English, Middle (1100\-1500)) \fBepo\fR (Esperanto) \fBequ\fR (Math / equation detection module) \fBest\fR (Estonian) \fBeus\fR (Basque) \fBfas\fR (Persian) \fBfin\fR (Finnish) \fBfra\fR (French) \fBfrk\fR (Frankish) \fBfrm\fR (French, Middle (ca\&.1400\-1600)) \fBgle\fR (Irish) \fBglg\fR (Galician) \fBgrc\fR (Greek, Ancient (to 1453)) \fBguj\fR (Gujarati) \fBhat\fR (Haitian; Haitian Creole) \fBheb\fR (Hebrew) \fBhin\fR (Hindi) \fBhrv\fR (Croatian) \fBhun\fR (Hungarian) \fBiku\fR (Inuktitut) \fBind\fR (Indonesian) \fBisl\fR (Icelandic) \fBita\fR (Italian) \fBita_old\fR (Italian \- Old) \fBjav\fR (Javanese) \fBjpn\fR (Japanese) \fBkan\fR (Kannada) \fBkat\fR (Georgian) \fBkat_old\fR (Georgian \- Old) \fBkaz\fR (Kazakh) \fBkhm\fR (Central Khmer) \fBkir\fR (Kirghiz; Kyrgyz) \fBkor\fR (Korean) \fBkur\fR (Kurdish) \fBlao\fR (Lao) \fBlat\fR (Latin) \fBlav\fR (Latvian) \fBlit\fR (Lithuanian) \fBmal\fR (Malayalam) \fBmar\fR (Marathi) \fBmkd\fR (Macedonian) \fBmlt\fR (Maltese) \fBmsa\fR (Malay) \fBmya\fR (Burmese) \fBnep\fR (Nepali) \fBnld\fR (Dutch; Flemish) \fBnor\fR (Norwegian) \fBori\fR (Oriya) \fBosd\fR (Orientation and script detection module) \fBpan\fR (Panjabi; Punjabi) 
\fBpol\fR (Polish) \fBpor\fR (Portuguese) \fBpus\fR (Pushto; Pashto) \fBron\fR (Romanian; Moldavian; Moldovan) \fBrus\fR (Russian) \fBsan\fR (Sanskrit) \fBsin\fR (Sinhala; Sinhalese) \fBslk\fR (Slovak) \fBslk_frak\fR (Slovak \- Fraktur) \fBslv\fR (Slovenian) \fBspa\fR (Spanish; Castilian) \fBspa_old\fR (Spanish; Castilian \- Old) \fBsqi\fR (Albanian) \fBsrp\fR (Serbian) \fBsrp_latn\fR (Serbian \- Latin) \fBswa\fR (Swahili) \fBswe\fR (Swedish) \fBsyr\fR (Syriac) \fBtam\fR (Tamil) \fBtel\fR (Telugu) \fBtgk\fR (Tajik) \fBtgl\fR (Tagalog) \fBtha\fR (Thai) \fBtir\fR (Tigrinya) \fBtur\fR (Turkish) \fBuig\fR (Uighur; Uyghur) \fBukr\fR (Ukrainian) \fBurd\fR (Urdu) \fBuzb\fR (Uzbek) \fBuzb_cyrl\fR (Uzbek \- Cyrilic) \fBvie\fR (Vietnamese) \fByid\fR (Yiddish) -.sp -To use a non\-standard language pack named \fBfoo\&.traineddata\fR, set the \fBTESSDATA_PREFIX\fR environment variable so the file can be found at \fBTESSDATA_PREFIX\fR/tessdata/\fBfoo\fR\&.traineddata and give Tesseract the argument \fI\-l foo\fR\&. -.SH "CONFIG FILES AND AUGMENTING WITH USER DATA" -.sp -Tesseract config files consist of lines with variable\-value pairs (space separated)\&. The variables are documented as flags in the source code like the following one in tesseractclass\&.h: -.sp -STRING_VAR_H(tessedit_char_blacklist, "", "Blacklist of chars not to recognize"); -.sp -These variables may enable or disable various features of the engine, and may cause it to load (or not load) various data\&. For instance, let\(cqs suppose you want to OCR in English, but suppress the normal dictionary and load an alternative word list and an alternative list of patterns \(em these two files are the most commonly used extra data files\&. 
-.sp -If your language pack is in /path/to/eng\&.traineddata and the hocr config is in /path/to/configs/hocr then create three new files: -.sp -/path/to/eng\&.user\-words: -.sp -.if n \{\ -.RS 4 -.\} -.nf -the -quick -brown -fox -jumped -.fi -.if n \{\ -.RE -.\} -.sp -/path/to/eng\&.user\-patterns: -.sp -.if n \{\ -.RS 4 -.\} -.nf -1\-\ed\ed\ed\-GOOG\-411 -www\&.\en\e\e\e*\&.com -.fi -.if n \{\ -.RE -.\} -.sp -/path/to/configs/bazaar: -.sp -.if n \{\ -.RS 4 -.\} -.nf -load_system_dawg F -load_freq_dawg F -user_words_suffix user\-words -user_patterns_suffix user\-patterns -.fi -.if n \{\ -.RE -.\} -.sp -Now, if you pass the word \fIbazaar\fR as a trailing command line parameter to Tesseract, Tesseract will not bother loading the system dictionary nor the dictionary of frequent words and will load and use the eng\&.user\-words and eng\&.user\-patterns files you provided\&. The former is a simple word list, one per line\&. The format of the latter is documented in dict/trie\&.h on read_pattern_list()\&. -.SH "HISTORY" -.sp -The engine was developed at Hewlett Packard Laboratories Bristol and at Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more changes made in 1996 to port to Windows, and some C++izing in 1998\&. A lot of the code was written in C, and then some more was written in C++\&. The C++ code makes heavy use of a list system using macros\&. This predates stl, was portable before stl, and is more efficient than stl lists, but has the big negative that if you do get a segmentation violation, it is hard to debug\&. -.sp -Version 2\&.00 brought Unicode (UTF\-8) support, six languages, and the ability to train Tesseract\&. -.sp -Tesseract was included in UNLV\(cqs Fourth Annual Test of OCR Accuracy\&. See \m[blue]\fBhttps://github\&.com/tesseract\-ocr/docs/blob/master/AT\-1995\&.pdf\fR\m[]\&. With Tesseract 2\&.00, scripts are now included to allow anyone to reproduce some of these tests\&. 
See \m[blue]\fBhttps://github\&.com/tesseract\-ocr/tesseract/wiki/TestingTesseract\fR\m[] for more details\&. -.sp -Tesseract 3\&.00 adds a number of new languages, including Chinese, Japanese, and Korean\&. It also introduces a new, single\-file based system of managing language data\&. -.sp -Tesseract 3\&.02 adds BiDirectional text support, the ability to recognize multiple languages in a single image, and improved layout analysis\&. -.sp -For further details, see the file ReleaseNotes included with the distribution\&. -.SH "RESOURCES" -.sp -Main web site: \m[blue]\fBhttps://github\&.com/tesseract\-ocr\fR\m[] Information on training: \m[blue]\fBhttps://github\&.com/tesseract\-ocr/tesseract/wiki/TrainingTesseract\fR\m[] -.SH "SEE ALSO" -.sp -ambiguous_words(1), cntraining(1), combine_tessdata(1), dawg2wordlist(1), shape_training(1), mftraining(1), unicharambigs(5), unicharset(5), unicharset_extractor(1), wordlist2dawg(1) -.SH "AUTHOR" -.sp -Tesseract development was led at Hewlett\-Packard and Google by Ray Smith\&. The development team has included: -.sp -Ahmad Abdulkader, Chris Newton, Dan Johnson, Dar\-Shyang Lee, David Eger, Eric Wiseblatt, Faisal Shafait, Hiroshi Takenaka, Joe Liu, Joern Wanke, Mark Seaman, Mickey Namiki, Nicholas Beato, Oded Fuhrmann, Phil Cheatle, Pingping Xiu, Pong Eksombatchai (Chantat), Ranjith Unnikrishnan, Raquel Romano, Ray Smith, Rika Antonova, Robert Moss, Samuel Charron, Sheelagh Lloyd, Shobhit Saxena, and Thomas Kielbus\&. -.SH "COPYING" -.sp -Licensed under the Apache License, Version 2\&.0 diff --git a/doc/tesseract.1.html b/doc/tesseract.1.html deleted file mode 100644 index d9dbcc0b7c..0000000000 --- a/doc/tesseract.1.html +++ /dev/null @@ -1,1202 +0,0 @@ - - - - - -TESSERACT(1) - - - - - -
-
-

SYNOPSIS

-
-

tesseract imagename|stdin outputbase|stdout [options…] [configfile…]

-
-
-
-

DESCRIPTION

-
-

tesseract(1) is a commercial quality OCR engine originally developed at HP -between 1985 and 1995. In 1995, this engine was among the top 3 evaluated by -UNLV. It was open-sourced by HP and UNLV in 2005, and has been developed -at Google since then.

-
-
-
-

IN/OUT ARGUMENTS

-
-
-
-imagename -
-
-

- The name of the input image. Most image file formats (anything - readable by Leptonica) are supported. -

-
-
-stdin -
-
-

- Instruction to read data from standard input -

-
-
-outputbase -
-
-

- The basename of the output file (to which the appropriate extension - will be appended). By default the output will be named outbase.txt. -

-
-
-stdout -
-
-

- Instruction to send output data to standard output -

-
-
-
-
-
-

OPTIONS

-
-
-
---tessdata-dir /path -
-
-

- Specify the location of tessdata path -

-
-
---user-words /path/to/file -
-
-

- Specify the location of user words file -

-
-
---user-patterns /path/to/file -
-
-

- Specify the location of user patterns file -

-
-
--c configvar=value -
-
-

- Set value for control parameter. Multiple -c arguments are allowed. -

-
-
--l lang -
-
-

- The language to use. If none is specified, English is assumed. - Multiple languages may be specified, separated by plus characters. - Tesseract uses 3-character ISO 639-2 language codes. (See LANGUAGES) -

-
-
---psm N -
-
-

- Set Tesseract to only run a subset of layout analysis and assume - a certain form of image. The options for N are: -

-
-
-
0 = Orientation and script detection (OSD) only.
-1 = Automatic page segmentation with OSD.
-2 = Automatic page segmentation, but no OSD, or OCR.
-3 = Fully automatic page segmentation, but no OSD. (Default)
-4 = Assume a single column of text of variable sizes.
-5 = Assume a single uniform block of vertically aligned text.
-6 = Assume a single uniform block of text.
-7 = Treat the image as a single text line.
-8 = Treat the image as a single word.
-9 = Treat the image as a single word in a circle.
-10 = Treat the image as a single character.
-
-
-
---oem N -
-
-

- Specify OCR Engine mode. The options for N are: -

-
-
-
0 = Original Tesseract only.
-1 = Neural nets LSTM only.
-2 = Tesseract + LSTM.
-3 = Default, based on what is available.
-
-
-
-configfile -
-
-

- The name of a config to use. A config is a plaintext file which - contains a list of variables and their values, one per line, with a - space separating variable from value. Interesting config files - include:
-

-
    -
  • -

    -hocr - Output in hOCR format instead of as a text file. -

    -
  • -
  • -

    -pdf - Output in pdf instead of a text file. -

    -
  • -
-
-
-

Nota Bene: The options -l lang and --psm N must occur -before any configfile.

-
-
-
-

SINGLE OPTIONS

-
-
-
--h, --help -
-
-

- Show help message. -

-
-
---help-psm -
-
-

- Show page segmentation modes. -

-
-
---help-oem -
-
-

- Show OCR Engine modes. -

-
-
--v, --version -
-
-

- Returns the current version of the tesseract(1) executable. -

-
-
---list-langs -
-
-

- List available languages for tesseract engine. Can be used with --tessdata-dir. -

-
-
---print-parameters -
-
-

- Print tesseract parameters. -

-
-
-
-
-
-

LANGUAGES

-
-

There are currently language packs available for the following languages -(in https://github.com/tesseract-ocr/tessdata):

-

afr (Afrikaans) -amh (Amharic) -ara (Arabic) -asm (Assamese) -aze (Azerbaijani) -aze_cyrl (Azerbaijani - Cyrillic) -bel (Belarusian) -ben (Bengali) -bod (Tibetan) -bos (Bosnian) -bul (Bulgarian) -cat (Catalan; Valencian) -ceb (Cebuano) -ces (Czech) -chi_sim (Chinese - Simplified) -chi_tra (Chinese - Traditional) -chr (Cherokee) -cym (Welsh) -dan (Danish) -dan_frak (Danish - Fraktur) -deu (German) -deu_frak (German - Fraktur) -dzo (Dzongkha) -ell (Greek, Modern (1453-)) -eng (English) -enm (English, Middle (1100-1500)) -epo (Esperanto) -equ (Math / equation detection module) -est (Estonian) -eus (Basque) -fas (Persian) -fin (Finnish) -fra (French) -frk (Frankish) -frm (French, Middle (ca.1400-1600)) -gle (Irish) -glg (Galician) -grc (Greek, Ancient (to 1453)) -guj (Gujarati) -hat (Haitian; Haitian Creole) -heb (Hebrew) -hin (Hindi) -hrv (Croatian) -hun (Hungarian) -iku (Inuktitut) -ind (Indonesian) -isl (Icelandic) -ita (Italian) -ita_old (Italian - Old) -jav (Javanese) -jpn (Japanese) -kan (Kannada) -kat (Georgian) -kat_old (Georgian - Old) -kaz (Kazakh) -khm (Central Khmer) -kir (Kirghiz; Kyrgyz) -kor (Korean) -kur (Kurdish) -lao (Lao) -lat (Latin) -lav (Latvian) -lit (Lithuanian) -mal (Malayalam) -mar (Marathi) -mkd (Macedonian) -mlt (Maltese) -msa (Malay) -mya (Burmese) -nep (Nepali) -nld (Dutch; Flemish) -nor (Norwegian) -ori (Oriya) -osd (Orientation and script detection module) -pan (Panjabi; Punjabi) -pol (Polish) -por (Portuguese) -pus (Pushto; Pashto) -ron (Romanian; Moldavian; Moldovan) -rus (Russian) -san (Sanskrit) -sin (Sinhala; Sinhalese) -slk (Slovak) -slk_frak (Slovak - Fraktur) -slv (Slovenian) -spa (Spanish; Castilian) -spa_old (Spanish; Castilian - Old) -sqi (Albanian) -srp (Serbian) -srp_latn (Serbian - Latin) -swa (Swahili) -swe (Swedish) -syr (Syriac) -tam (Tamil) -tel (Telugu) -tgk (Tajik) -tgl (Tagalog) -tha (Thai) -tir (Tigrinya) -tur (Turkish) -uig (Uighur; Uyghur) -ukr (Ukrainian) -urd (Urdu) -uzb (Uzbek) -uzb_cyrl (Uzbek - Cyrillic) -vie 
(Vietnamese) -yid (Yiddish)

-

To use a non-standard language pack named foo.traineddata, set the -TESSDATA_PREFIX environment variable so the file can be found at -TESSDATA_PREFIX/tessdata/foo.traineddata and give Tesseract the -argument -l foo.

-
-
-
-

CONFIG FILES AND AUGMENTING WITH USER DATA

-
-

Tesseract config files consist of lines with variable-value pairs (space -separated). The variables are documented as flags in the source code like -the following one in tesseractclass.h:

-

STRING_VAR_H(tessedit_char_blacklist, "", - "Blacklist of chars not to recognize");

-

These variables may enable or disable various features of the engine, and -may cause it to load (or not load) various data. For instance, let’s suppose -you want to OCR in English, but suppress the normal dictionary and load an -alternative word list and an alternative list of patterns — these two files -are the most commonly used extra data files.

-

If your language pack is in /path/to/eng.traineddata and the hocr config -is in /path/to/configs/hocr then create three new files:

-

/path/to/eng.user-words:

-
-
the
-quick
-brown
-fox
-jumped
-
-
-

/path/to/eng.user-patterns:

-
-
1-\d\d\d-GOOG-411
-www.\n\\\*.com
-
-
-

/path/to/configs/bazaar:

-
-
load_system_dawg     F
-load_freq_dawg       F
-user_words_suffix    user-words
-user_patterns_suffix user-patterns
-
-
-

Now, if you pass the word bazaar as a trailing command line parameter -to Tesseract, Tesseract will not bother loading the system dictionary nor -the dictionary of frequent words and will load and use the eng.user-words -and eng.user-patterns files you provided. The former is a simple word list, -one per line. The format of the latter is documented in dict/trie.h -on read_pattern_list().

-
-
-
-

HISTORY

-
-

The engine was developed at Hewlett Packard Laboratories Bristol and at -Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more -changes made in 1996 to port to Windows, and some C++izing in 1998. A -lot of the code was written in C, and then some more was written in C++. -The C++ code makes heavy use of a list system using macros. This predates -stl, was portable before stl, and is more efficient than stl lists, but has -the big negative that if you do get a segmentation violation, it is hard to -debug.

-

Version 2.00 brought Unicode (UTF-8) support, six languages, and the ability -to train Tesseract.

-

Tesseract was included in UNLV’s Fourth Annual Test of OCR Accuracy. -See https://github.com/tesseract-ocr/docs/blob/master/AT-1995.pdf. With Tesseract 2.00, -scripts are now included to allow anyone to reproduce some of these tests. -See https://github.com/tesseract-ocr/tesseract/wiki/TestingTesseract for more -details.

-

Tesseract 3.00 adds a number of new languages, including Chinese, Japanese, -and Korean. It also introduces a new, single-file based system of managing -language data.

-

Tesseract 3.02 adds BiDirectional text support, the ability to recognize -multiple languages in a single image, and improved layout analysis.

-

For further details, see the file ReleaseNotes included with the distribution.

-
-
- -
-

SEE ALSO

-
-

ambiguous_words(1), cntraining(1), combine_tessdata(1), dawg2wordlist(1), -shapeclustering(1), mftraining(1), unicharambigs(5), unicharset(5), -unicharset_extractor(1), wordlist2dawg(1)

-
-
-
-

AUTHOR

-
-

Tesseract development was led at Hewlett-Packard and Google by Ray Smith. -The development team has included:

-

Ahmad Abdulkader, Chris Newton, Dan Johnson, Dar-Shyang Lee, David Eger, -Eric Wiseblatt, Faisal Shafait, Hiroshi Takenaka, Joe Liu, Joern Wanke, -Mark Seaman, Mickey Namiki, Nicholas Beato, Oded Fuhrmann, Phil Cheatle, -Pingping Xiu, Pong Eksombatchai (Chantat), Ranjith Unnikrishnan, Raquel -Romano, Ray Smith, Rika Antonova, Robert Moss, Samuel Charron, Sheelagh -Lloyd, Shobhit Saxena, and Thomas Kielbus.

-
-
-
-

COPYING

-
-

Licensed under the Apache License, Version 2.0

-
-
-
-

- - - diff --git a/doc/tesseract.1.xml b/doc/tesseract.1.xml deleted file mode 100644 index 941caa5bbc..0000000000 --- a/doc/tesseract.1.xml +++ /dev/null @@ -1,468 +0,0 @@ - - - - - - - TESSERACT(1) - - -tesseract -1 -  -  - - - tesseract - command-line OCR engine - - -tesseract imagename|stdin outputbase|stdout [options…] [configfile…] - - -DESCRIPTION -tesseract(1) is a commercial quality OCR engine originally developed at HP -between 1985 and 1995. In 1995, this engine was among the top 3 evaluated by -UNLV. It was open-sourced by HP and UNLV in 2005, and has been developed -at Google since then. - - -IN/OUT ARGUMENTS - - - -imagename - - - - The name of the input image. Most image file formats (anything - readable by Leptonica) are supported. - - - - - -stdin - - - - Instruction to read data from standard input - - - - - -outputbase - - - - The basename of the output file (to which the appropriate extension - will be appended). By default the output will be named outbase.txt. - - - - - -stdout - - - - Instruction to sent output data to standard output - - - - - - -OPTIONS - - - ---tessdata-dir /path - - - - Specify the location of tessdata path - - - - - ---user-words /path/to/file - - - - Specify the location of user words file - - - - - ---user-patterns /path/to/file specify - - - - The location of user patterns file - - - - - --c configvar=value - - - - Set value for control parameter. Multiple -c arguments are allowed. - - - - - --l lang - - - - The language to use. If none is specified, English is assumed. - Multiple languages may be specified, separated by plus characters. - Tesseract uses 3-character ISO 639-2 language codes. (See LANGUAGES) - - - - - ---psm N - - - - Set Tesseract to only run a subset of layout analysis and assume - a certain form of image. The options for N are: - -0 = Orientation and script detection (OSD) only. -1 = Automatic page segmentation with OSD. -2 = Automatic page segmentation, but no OSD, or OCR. 
-3 = Fully automatic page segmentation, but no OSD. (Default) -4 = Assume a single column of text of variable sizes. -5 = Assume a single uniform block of vertically aligned text. -6 = Assume a single uniform block of text. -7 = Treat the image as a single text line. -8 = Treat the image as a single word. -9 = Treat the image as a single word in a circle. -10 = Treat the image as a single character. - - - - ---oem N - - - - Specify OCR Engine mode. The options for N are: - -0 = Original Tesseract only. -1 = Neural nets LSTM only. -2 = Tesseract + LSTM. -3 = Default, based on what is available. - - - - -configfile - - - - The name of a config to use. A config is a plaintext file which - contains a list of variables and their values, one per line, with a - space separating variable from value. Interesting config files - include: - - - - -hocr - Output in hOCR format instead of as a text file. - - - - -pdf - Output in pdf instead of a text file. - - - - - - -Nota Bene: The options -l lang and --psm N must occur -before any configfile. - - -SINGLE OPTIONS - - - --h, --help - - - - Show help message. - - - - - ---help-psm - - - - Show page segmentation modes. - - - - - ---help-oem - - - - Show OCR Engine modes. - - - - - --v, --version - - - - Returns the current version of the tesseract(1) executable. - - - - - ---list-langs - - - - List available languages for tesseract engine. Can be used with --tessdata-dir. - - - - - ---print-parameters - - - - Print tesseract parameters. 
- - - - - - -LANGUAGES -There are currently language packs available for the following languages -(in https://github.com/tesseract-ocr/tessdata): -afr (Afrikaans) -amh (Amharic) -ara (Arabic) -asm (Assamese) -aze (Azerbaijani) -aze_cyrl (Azerbaijani - Cyrilic) -bel (Belarusian) -ben (Bengali) -bod (Tibetan) -bos (Bosnian) -bul (Bulgarian) -cat (Catalan; Valencian) -ceb (Cebuano) -ces (Czech) -chi_sim (Chinese - Simplified) -chi_tra (Chinese - Traditional) -chr (Cherokee) -cym (Welsh) -dan (Danish) -dan_frak (Danish - Fraktur) -deu (German) -deu_frak (German - Fraktur) -dzo (Dzongkha) -ell (Greek, Modern (1453-)) -eng (English) -enm (English, Middle (1100-1500)) -epo (Esperanto) -equ (Math / equation detection module) -est (Estonian) -eus (Basque) -fas (Persian) -fin (Finnish) -fra (French) -frk (Frankish) -frm (French, Middle (ca.1400-1600)) -gle (Irish) -glg (Galician) -grc (Greek, Ancient (to 1453)) -guj (Gujarati) -hat (Haitian; Haitian Creole) -heb (Hebrew) -hin (Hindi) -hrv (Croatian) -hun (Hungarian) -iku (Inuktitut) -ind (Indonesian) -isl (Icelandic) -ita (Italian) -ita_old (Italian - Old) -jav (Javanese) -jpn (Japanese) -kan (Kannada) -kat (Georgian) -kat_old (Georgian - Old) -kaz (Kazakh) -khm (Central Khmer) -kir (Kirghiz; Kyrgyz) -kor (Korean) -kur (Kurdish) -lao (Lao) -lat (Latin) -lav (Latvian) -lit (Lithuanian) -mal (Malayalam) -mar (Marathi) -mkd (Macedonian) -mlt (Maltese) -msa (Malay) -mya (Burmese) -nep (Nepali) -nld (Dutch; Flemish) -nor (Norwegian) -ori (Oriya) -osd (Orientation and script detection module) -pan (Panjabi; Punjabi) -pol (Polish) -por (Portuguese) -pus (Pushto; Pashto) -ron (Romanian; Moldavian; Moldovan) -rus (Russian) -san (Sanskrit) -sin (Sinhala; Sinhalese) -slk (Slovak) -slk_frak (Slovak - Fraktur) -slv (Slovenian) -spa (Spanish; Castilian) -spa_old (Spanish; Castilian - Old) -sqi (Albanian) -srp (Serbian) -srp_latn (Serbian - Latin) -swa (Swahili) -swe (Swedish) -syr (Syriac) -tam (Tamil) -tel (Telugu) -tgk (Tajik) -tgl 
(Tagalog) -tha (Thai) -tir (Tigrinya) -tur (Turkish) -uig (Uighur; Uyghur) -ukr (Ukrainian) -urd (Urdu) -uzb (Uzbek) -uzb_cyrl (Uzbek - Cyrilic) -vie (Vietnamese) -yid (Yiddish) -To use a non-standard language pack named foo.traineddata, set the -TESSDATA_PREFIX environment variable so the file can be found at -TESSDATA_PREFIX/tessdata/foo.traineddata and give Tesseract the -argument -l foo. - - -CONFIG FILES AND AUGMENTING WITH USER DATA -Tesseract config files consist of lines with variable-value pairs (space -separated). The variables are documented as flags in the source code like -the following one in tesseractclass.h: -STRING_VAR_H(tessedit_char_blacklist, "", - "Blacklist of chars not to recognize"); -These variables may enable or disable various features of the engine, and -may cause it to load (or not load) various data. For instance, let’s suppose -you want to OCR in English, but suppress the normal dictionary and load an -alternative word list and an alternative list of patterns — these two files -are the most commonly used extra data files. -If your language pack is in /path/to/eng.traineddata and the hocr config -is in /path/to/configs/hocr then create three new files: -/path/to/eng.user-words: -
-the -quick -brown -fox -jumped -
-/path/to/eng.user-patterns: -
-1-\d\d\d-GOOG-411 -www.\n\\\*.com -
-/path/to/configs/bazaar: -
-load_system_dawg F -load_freq_dawg F -user_words_suffix user-words -user_patterns_suffix user-patterns -
-Now, if you pass the word bazaar as a trailing command line parameter -to Tesseract, Tesseract will not bother loading the system dictionary nor -the dictionary of frequent words and will load and use the eng.user-words -and eng.user-patterns files you provided. The former is a simple word list, -one per line. The format of the latter is documented in dict/trie.h -on read_pattern_list(). -
- -HISTORY -The engine was developed at Hewlett Packard Laboratories Bristol and at -Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more -changes made in 1996 to port to Windows, and some C++izing in 1998. A -lot of the code was written in C, and then some more was written in C++. -The C++ code makes heavy use of a list system using macros. This predates -stl, was portable before stl, and is more efficient than stl lists, but has -the big negative that if you do get a segmentation violation, it is hard to -debug. -Version 2.00 brought Unicode (UTF-8) support, six languages, and the ability -to train Tesseract. -Tesseract was included in UNLV’s Fourth Annual Test of OCR Accuracy. -See https://github.com/tesseract-ocr/docs/blob/master/AT-1995.pdf. With Tesseract 2.00, -scripts are now included to allow anyone to reproduce some of these tests. -See https://github.com/tesseract-ocr/tesseract/wiki/TestingTesseract for more -details. -Tesseract 3.00 adds a number of new languages, including Chinese, Japanese, -and Korean. It also introduces a new, single-file based system of managing -language data. -Tesseract 3.02 adds BiDirectional text support, the ability to recognize -multiple languages in a single image, and improved layout analysis. -For further details, see the file ReleaseNotes included with the distribution. - - -RESOURCES -Main web site: https://github.com/tesseract-ocr -Information on training: https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract - - -SEE ALSO -ambiguous_words(1), cntraining(1), combine_tessdata(1), dawg2wordlist(1), -shape_training(1), mftraining(1), unicharambigs(5), unicharset(5), -unicharset_extractor(1), wordlist2dawg(1) - - -AUTHOR -Tesseract development was led at Hewlett-Packard and Google by Ray Smith. 
-The development team has included: -Ahmad Abdulkader, Chris Newton, Dan Johnson, Dar-Shyang Lee, David Eger, -Eric Wiseblatt, Faisal Shafait, Hiroshi Takenaka, Joe Liu, Joern Wanke, -Mark Seaman, Mickey Namiki, Nicholas Beato, Oded Fuhrmann, Phil Cheatle, -Pingping Xiu, Pong Eksombatchai (Chantat), Ranjith Unnikrishnan, Raquel -Romano, Ray Smith, Rika Antonova, Robert Moss, Samuel Charron, Sheelagh -Lloyd, Shobhit Saxena, and Thomas Kielbus. - - -COPYING -Licensed under the Apache License, Version 2.0 - -
diff --git a/doc/unicharambigs.5 b/doc/unicharambigs.5 deleted file mode 100644 index b126433a27..0000000000 --- a/doc/unicharambigs.5 +++ /dev/null @@ -1,120 +0,0 @@ -'\" t -.\" Title: unicharambigs -.\" Author: [see the "AUTHOR" section] -.\" Generator: DocBook XSL Stylesheets v1.78.1 -.\" Date: 06/12/2015 -.\" Manual: \ \& -.\" Source: \ \& -.\" Language: English -.\" -.TH "UNICHARAMBIGS" "5" "06/12/2015" "\ \&" "\ \&" -.\" ----------------------------------------------------------------- -.\" * Define some portability stuff -.\" ----------------------------------------------------------------- -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.\" http://bugs.debian.org/507673 -.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.ie \n(.g .ds Aq \(aq -.el .ds Aq ' -.\" ----------------------------------------------------------------- -.\" * set default formatting -.\" ----------------------------------------------------------------- -.\" disable hyphenation -.nh -.\" disable justification (adjust text to left margin only) -.ad l -.\" ----------------------------------------------------------------- -.\" * MAIN CONTENT STARTS HERE * -.\" ----------------------------------------------------------------- -.SH "NAME" -unicharambigs \- Tesseract unicharset ambiguities -.SH "DESCRIPTION" -.sp -The unicharambigs file (a component of traineddata, see combine_tessdata(1) ) is used by Tesseract to represent possible ambiguities between characters, or groups of characters\&. -.sp -The file contains a number of lines, laid out as follow: -.sp -.if n \{\ -.RS 4 -.\} -.nf -[num] [char(s)] [num] [char(s)] [num] -.fi -.if n \{\ -.RE -.\} -.sp -.TS -tab(:); -lt lt -lt lt -lt lt -lt lt -lt lt. 
-T{ -.sp -Field one -T}:T{ -.sp -the number of characters contained in field two -T} -T{ -.sp -Field two -T}:T{ -.sp -the character sequence to be replaced -T} -T{ -.sp -Field three -T}:T{ -.sp -the number of characters contained in field four -T} -T{ -.sp -Field four -T}:T{ -.sp -the character sequence used to replace field two -T} -T{ -.sp -Field five -T}:T{ -.sp -contains either 1 or 0\&. 1 denotes a mandatory replacement, 0 denotes an optional replacement\&. -T} -.TE -.sp 1 -.sp -Characters appearing in fields two and four should appear in unicharset\&. The numbers in fields one and three refer to the number of unichars (not bytes)\&. -.SH "EXAMPLE" -.sp -.if n \{\ -.RS 4 -.\} -.nf -2 \*(Aq \*(Aq 1 " 1 -1 m 2 r n 0 -3 i i i 1 m 0 -.fi -.if n \{\ -.RE -.\} -.sp -In this example, all instances of the \fI2\fR character sequence \fI\*(Aq\fR\*(Aq will \fBalways\fR be replaced by the \fI1\fR character sequence \fI"\fR; a \fI1\fR character sequence \fIm\fR \fBmay\fR be replaced by the \fI2\fR character sequence \fIrn\fR, and the \fI3\fR character sequence \fBmay\fR be replaced by the \fI1\fR character sequence \fIm\fR\&. -.SH "HISTORY" -.sp -The unicharambigs file first appeared in Tesseract 3\&.00; prior to that, a similar format, called DangAmbigs (\fIdangerous ambiguities\fR) was used: the format was almost identical, except only mandatory replacements could be specified, and field 5 was absent\&. -.SH "BUGS" -.sp -This is a documentation "bug": it\(cqs not currently clear what should be done in the case of ligatures (such as \fIfi\fR) which may also appear as regular letters in the unicharset\&. -.SH "SEE ALSO" -.sp -tesseract(1), unicharset(5) -.SH "AUTHOR" -.sp -The Tesseract OCR engine was written by Ray Smith and his research groups at Hewlett Packard (1985\-1995) and Google (2006\-present)\&. 
diff --git a/doc/unicharambigs.5.html b/doc/unicharambigs.5.html deleted file mode 100644 index bb9fb291a3..0000000000 --- a/doc/unicharambigs.5.html +++ /dev/null @@ -1,875 +0,0 @@ - - - - - -UNICHARAMBIGS(5) - - - - - -
-
-

DESCRIPTION

-
-

The unicharambigs file (a component of traineddata, see combine_tessdata(1) ) -is used by Tesseract to represent possible ambiguities between characters, -or groups of characters.

-

The file contains a number of lines, laid out as follows:

-
-
-
[num] <TAB> [char(s)] <TAB> [num] <TAB> [char(s)] <TAB> [num]
-
-
- - - - - - - - - - - - - - - - - - - - -
-Field one -
-
-

-the number of characters contained in field two -

-
-Field two -
-
-

-the character sequence to be replaced -

-
-Field three -
-
-

-the number of characters contained in field four -

-
-Field four -
-
-

-the character sequence used to replace field two -

-
-Field five -
-
-

-contains either 1 or 0. 1 denotes a mandatory -replacement, 0 denotes an optional replacement. -

-
-

Characters appearing in fields two and four should appear in -unicharset. The numbers in fields one and three refer to the -number of unichars (not bytes).

-
-
-
-

EXAMPLE

-
-
-
-
2       ' '     1       "     1
-1       m       2       r n   0
-3       i i i   1       m     0
-
-

In this example, all instances of the 2 character sequence '' will -always be replaced by the 1 character sequence "; a 1 character -sequence m may be replaced by the 2 character sequence rn, and -the 3 character sequence iii may be replaced by the 1 character -sequence m.

-
-
-
-

HISTORY

-
-

The unicharambigs file first appeared in Tesseract 3.00; prior to that, a -similar format, called DangAmbigs (dangerous ambiguities) was used: the -format was almost identical, except only mandatory replacements could be -specified, and field 5 was absent.

-
-
-
-

BUGS

-
-

This is a documentation "bug": it’s not currently clear what should be done -in the case of ligatures (such as fi) which may also appear as regular -letters in the unicharset.

-
-
-
-

SEE ALSO

-
-

tesseract(1), unicharset(5)

-
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - diff --git a/doc/unicharambigs.5.xml b/doc/unicharambigs.5.xml deleted file mode 100644 index cbc0f50e50..0000000000 --- a/doc/unicharambigs.5.xml +++ /dev/null @@ -1,126 +0,0 @@ - - - - - - - UNICHARAMBIGS(5) - - -unicharambigs -5 -  -  - - - unicharambigs - Tesseract unicharset ambiguities - - -DESCRIPTION -The unicharambigs file (a component of traineddata, see combine_tessdata(1) ) -is used by Tesseract to represent possible ambiguities between characters, -or groups of characters. -The file contains a number of lines, laid out as follow: -[num] <TAB> [char(s)] <TAB> [num] <TAB> [char(s)] <TAB> [num] - - - - -Field one - - - - -the number of characters contained in field two - - - - - - -Field two - - - - -the character sequence to be replaced - - - - - - -Field three - - - - -the number of characters contained in field four - - - - - - -Field four - - - - -the character sequence used to replace field two - - - - - - -Field five - - - - -contains either 1 or 0. 1 denotes a mandatory -replacement, 0 denotes an optional replacement. - - - - -Characters appearing in fields two and four should appear in -unicharset. The numbers in fields one and three refer to the -number of unichars (not bytes). - - -EXAMPLE -2 ' ' 1 " 1 -1 m 2 r n 0 -3 i i i 1 m 0 -In this example, all instances of the 2 character sequence '' will -always be replaced by the 1 character sequence "; a 1 character -sequence m may be replaced by the 2 character sequence rn, and -the 3 character sequence may be replaced by the 1 character -sequence m. - - -HISTORY -The unicharambigs file first appeared in Tesseract 3.00; prior to that, a -similar format, called DangAmbigs (dangerous ambiguities) was used: the -format was almost identical, except only mandatory replacements could be -specified, and field 5 was absent. 
- - -BUGS -This is a documentation "bug": it’s not currently clear what should be done -in the case of ligatures (such as fi) which may also appear as regular -letters in the unicharset. - - -SEE ALSO -tesseract(1), unicharset(5) - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). - - diff --git a/doc/unicharset.5 b/doc/unicharset.5 deleted file mode 100644 index a5924db6e8..0000000000 --- a/doc/unicharset.5 +++ /dev/null @@ -1,220 +0,0 @@ -'\" t -.\" Title: unicharset -.\" Author: [see the "AUTHOR" section] -.\" Generator: DocBook XSL Stylesheets v1.78.1 -.\" Date: 06/12/2015 -.\" Manual: \ \& -.\" Source: \ \& -.\" Language: English -.\" -.TH "UNICHARSET" "5" "06/12/2015" "\ \&" "\ \&" -.\" ----------------------------------------------------------------- -.\" * Define some portability stuff -.\" ----------------------------------------------------------------- -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.\" http://bugs.debian.org/507673 -.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.ie \n(.g .ds Aq \(aq -.el .ds Aq ' -.\" ----------------------------------------------------------------- -.\" * set default formatting -.\" ----------------------------------------------------------------- -.\" disable hyphenation -.nh -.\" disable justification (adjust text to left margin only) -.ad l -.\" ----------------------------------------------------------------- -.\" * MAIN CONTENT STARTS HERE * -.\" ----------------------------------------------------------------- -.SH "NAME" -unicharset \- character properties file used by tesseract(1) -.SH "DESCRIPTION" -.sp -Tesseract\(cqs unicharset file contains information on each symbol (unichar) the Tesseract OCR engine is trained to recognize\&. -.sp -A unicharset file (i\&.e\&. 
\fIeng\&.unicharset\fR) is distributed as part of a Tesseract language pack (i\&.e\&. \fIeng\&.traineddata\fR)\&. For information on extracting the unicharset file, see combine_tessdata(1)\&. -.sp -The first line of a unicharset file contains the number of unichars in the file\&. After this line, each subsequent line provides information for a single unichar\&. The first such line contains a placeholder reserved for the space character\&. Each unichar is referred to within Tesseract by its Unichar ID, which is the line number (minus 1) within the unicharset file\&. Therefore, space gets unichar 0\&. -.sp -Each unichar line in the unicharset file (v2+) may have four space\-separated fields: -.sp -.if n \{\ -.RS 4 -.\} -.nf -\*(Aqcharacter\*(Aq \*(Aqproperties\*(Aq \*(Aqscript\*(Aq \*(Aqid\*(Aq -.fi -.if n \{\ -.RE -.\} -.sp -Starting with Tesseract v3\&.02, more information may be given for each unichar: -.sp -.if n \{\ -.RS 4 -.\} -.nf -\*(Aqcharacter\*(Aq \*(Aqproperties\*(Aq \*(Aqglyph_metrics\*(Aq \*(Aqscript\*(Aq \*(Aqother_case\*(Aq \*(Aqdirection\*(Aq \*(Aqmirror\*(Aq \*(Aqnormed_form\*(Aq -.fi -.if n \{\ -.RE -.\} -.sp -Entries: -.PP -\fIcharacter\fR -.RS 4 -The UTF\-8 encoded string to be produced for this unichar\&. -.RE -.PP -\fIproperties\fR -.RS 4 -An integer mask of character properties, one per bit\&. From least to most significant bit, these are: isalpha, islower, isupper, isdigit, ispunctuation\&. -.RE -.PP -\fIglyph_metrics\fR -.RS 4 -Ten comma\-separated integers representing various standards for where this glyph is to be found within a baseline\-normalized coordinate system where 128 is normalized to x\-height\&. -.sp -.RS 4 -.ie n \{\ -\h'-04'\(bu\h'+03'\c -.\} -.el \{\ -.sp -1 -.IP \(bu 2.3 -.\} -min_bottom, max_bottom: the ranges where the bottom of the character can be found\&. 
-.RE -.sp -.RS 4 -.ie n \{\ -\h'-04'\(bu\h'+03'\c -.\} -.el \{\ -.sp -1 -.IP \(bu 2.3 -.\} -min_top, max_top: the ranges where the top of the character may be found\&. -.RE -.sp -.RS 4 -.ie n \{\ -\h'-04'\(bu\h'+03'\c -.\} -.el \{\ -.sp -1 -.IP \(bu 2.3 -.\} -min_width, max_width: horizontal width of the character\&. -.RE -.sp -.RS 4 -.ie n \{\ -\h'-04'\(bu\h'+03'\c -.\} -.el \{\ -.sp -1 -.IP \(bu 2.3 -.\} -min_bearing, max_bearing: how far from the usual start position does the leftmost part of the character begin\&. -.RE -.sp -.RS 4 -.ie n \{\ -\h'-04'\(bu\h'+03'\c -.\} -.el \{\ -.sp -1 -.IP \(bu 2.3 -.\} -min_advance, max_advance: how far from the printer\(cqs cell left do we advance to begin the next character\&. -.RE -.RE -.PP -\fIscript\fR -.RS 4 -Name of the script (Latin, Common, Greek, Cyrillic, Han, null)\&. -.RE -.PP -\fIother_case\fR -.RS 4 -The Unichar ID of the other case version of this character (upper or lower)\&. -.RE -.PP -\fIdirection\fR -.RS 4 -The Unicode BiDi direction of this character, as defined by ICU\(cqs enum UCharDirection\&. (0 = Left to Right, 1 = Right to Left, 2 = European Number\&...) -.RE -.PP -\fImirror\fR -.RS 4 -The Unichar ID of the BiDirectional mirror of this character\&. For example the mirror of open paren is close paren, but Latin Capital C has no mirror, so it remains a Latin Capital C\&. -.RE -.PP -\fInormed_form\fR -.RS 4 -The UTF\-8 representation of a "normalized form" of this unichar for the purpose of blaming a module for errors given ground truth text\&. For instance, a left or right single quote may normalize to an ASCII quote\&. -.RE -.SH "EXAMPLE (V2)" -.sp -.if n \{\ -.RS 4 -.\} -.nf -; 10 Common 46 -b 3 Latin 59 -W 5 Latin 40 -7 8 Common 66 -= 0 Common 93 -.fi -.if n \{\ -.RE -.\} -.sp -";" is a punctuation character\&. Its properties are thus represented by the binary number 10000 (10 in hexadecimal)\&. -.sp -"b" is an alphabetic character and a lower case character\&. 
Its properties are thus represented by the binary number 00011 (3 in hexadecimal)\&. -.sp -"W" is an alphabetic character and an upper case character\&. Its properties are thus represented by the binary number 00101 (5 in hexadecimal)\&. -.sp -"7" is just a digit\&. Its properties are thus represented by the binary number 01000 (8 in hexadecimal)\&. -.sp -"=" is not punctuation nor a digit nor an alphabetic character\&. Its properties are thus represented by the binary number 00000 (0 in hexadecimal)\&. -.sp -Japanese or Chinese alphabetic character properties are represented by the binary number 00001 (1 in hexadecimal): they are alphabetic, but neither upper nor lower case\&. -.SH "EXAMPLE (V3.02)" -.sp -.if n \{\ -.RS 4 -.\} -.nf -110 -NULL 0 NULL 0 -N 5 59,68,216,255,87,236,0,27,104,227 Latin 11 0 1 N -Y 5 59,68,216,255,91,205,0,47,91,223 Latin 33 0 2 Y -1 8 59,69,203,255,45,128,0,66,74,173 Common 3 2 3 1 -9 8 18,66,203,255,89,156,0,39,104,173 Common 4 2 4 9 -a 3 58,65,186,198,85,164,0,26,97,185 Latin 56 0 5 a -\&. \&. \&. -.fi -.if n \{\ -.RE -.\} -.SH "CAVEATS" -.sp -Although the unicharset reader maintains the ability to read unicharsets of older formats and will assign default values to missing fields, the accuracy will be degraded\&. -.sp -Further, most other data files are indexed by the unicharset file, so changing it without re\-generating the others is likely to have dire consequences\&. -.SH "HISTORY" -.sp -The unicharset format first appeared with Tesseract 2\&.00, which was the first version to support languages other than English\&. The unicharset file contained only the first two fields, and the "ispunctuation" property was absent (punctuation was regarded as "0", as "=" is in the above example\&. 
-.SH "SEE ALSO" -.sp -tesseract(1), combine_tessdata(1), unicharset_extractor(1) -.sp -\m[blue]\fBhttps://github\&.com/tesseract\-ocr/tesseract/wiki/TrainingTesseract\fR\m[] -.SH "AUTHOR" -.sp -The Tesseract OCR engine was written by Ray Smith and his research groups at Hewlett Packard (1985\-1995) and Google (2006\-present)\&. diff --git a/doc/unicharset.5.html b/doc/unicharset.5.html deleted file mode 100644 index f3c3e7a9fc..0000000000 --- a/doc/unicharset.5.html +++ /dev/null @@ -1,965 +0,0 @@ - - - - - -UNICHARSET(5) - - - - - -
-
-

DESCRIPTION

-
-

Tesseract’s unicharset file contains information on each symbol -(unichar) the Tesseract OCR engine is trained to recognize.

-

A unicharset file (e.g. eng.unicharset) is distributed as part of a -Tesseract language pack (e.g. eng.traineddata). For information on -extracting the unicharset file, see combine_tessdata(1).

-

The first line of a unicharset file contains the number of unichars in -the file. After this line, each subsequent line provides information for -a single unichar. The first such line contains a placeholder reserved for -the space character. Each unichar is referred to within Tesseract by its -Unichar ID, which is the line number (minus 1) within the unicharset file. -Therefore, space gets unichar 0.

-

Each unichar line in the unicharset file (v2+) may have four space-separated fields:

-
-
-
'character' 'properties' 'script' 'id'
-
-

Starting with Tesseract v3.02, more information may be given for each unichar:

-
-
-
'character' 'properties' 'glyph_metrics' 'script' 'other_case' 'direction' 'mirror' 'normed_form'
-
-

Entries:

-
-
-character -
-
-

-The UTF-8 encoded string to be produced for this unichar. -

-
-
-properties -
-
-

-An integer mask of character properties, one per bit. - From least to most significant bit, these are: isalpha, islower, isupper, - isdigit, ispunctuation. -

-
-
-glyph_metrics -
-
-

-Ten comma-separated integers representing various standards - for where this glyph is to be found within a baseline-normalized coordinate - system where 128 is normalized to x-height. -

-
    -
  • -

    -min_bottom, max_bottom: the ranges where the bottom of the character can - be found. -

    -
  • -
  • -

    -min_top, max_top: the ranges where the top of the character may be found. -

    -
  • -
  • -

    -min_width, max_width: horizontal width of the character. -

    -
  • -
  • -

    -min_bearing, max_bearing: how far from the usual start position does the - leftmost part of the character begin. -

    -
  • -
  • -

    -min_advance, max_advance: how far from the printer’s cell left do we - advance to begin the next character. -

    -
  • -
-
-
-script -
-
-

-Name of the script (Latin, Common, Greek, Cyrillic, Han, null). -

-
-
-other_case -
-
-

-The Unichar ID of the other case version of this character - (upper or lower). -

-
-
-direction -
-
-

-The Unicode BiDi direction of this character, as defined by - ICU’s enum UCharDirection. (0 = Left to Right, 1 = Right to Left, - 2 = European Number…) -

-
-
-mirror -
-
-

-The Unichar ID of the BiDirectional mirror of this character. - For example the mirror of open paren is close paren, but Latin Capital C - has no mirror, so it remains a Latin Capital C. -

-
-
-normed_form -
-
-

-The UTF-8 representation of a "normalized form" of this unichar - for the purpose of blaming a module for errors given ground truth text. - For instance, a left or right single quote may normalize to an ASCII quote. -

-
-
-
-
-
-

EXAMPLE (v2)

-
-
-
-
; 10 Common 46
-b 3 Latin 59
-W 5 Latin 40
-7 8 Common 66
-= 0 Common 93
-
-

";" is a punctuation character. Its properties are thus represented by the -binary number 10000 (10 in hexadecimal).

-

"b" is an alphabetic character and a lower case character. Its properties are -thus represented by the binary number 00011 (3 in hexadecimal).

-

"W" is an alphabetic character and an upper case character. Its properties are -thus represented by the binary number 00101 (5 in hexadecimal).

-

"7" is just a digit. Its properties are thus represented by the binary number -01000 (8 in hexadecimal).

-

"=" is not punctuation nor a digit nor an alphabetic character. Its properties -are thus represented by the binary number 00000 (0 in hexadecimal).

-

Japanese or Chinese alphabetic character properties are represented by the -binary number 00001 (1 in hexadecimal): they are alphabetic, but neither -upper nor lower case.

-
-
-
-

EXAMPLE (v3.02)

-
-
-
-
110
-NULL 0 NULL 0
-N 5 59,68,216,255,87,236,0,27,104,227 Latin 11 0 1 N
-Y 5 59,68,216,255,91,205,0,47,91,223 Latin 33 0 2 Y
-1 8 59,69,203,255,45,128,0,66,74,173 Common 3 2 3 1
-9 8 18,66,203,255,89,156,0,39,104,173 Common 4 2 4 9
-a 3 58,65,186,198,85,164,0,26,97,185 Latin 56 0 5 a
-. . .
-
-
-
-
-

CAVEATS

-
-

Although the unicharset reader maintains the ability to read unicharsets -of older formats and will assign default values to missing fields, -the accuracy will be degraded.

-

Further, most other data files are indexed by the unicharset file, -so changing it without re-generating the others is likely to have dire -consequences.

-
-
-
-

HISTORY

-
-

The unicharset format first appeared with Tesseract 2.00, which was the -first version to support languages other than English. The unicharset file -contained only the first two fields, and the "ispunctuation" property was -absent (punctuation was regarded as "0", as "=" is in the above example).

-
-
-
-

SEE ALSO

-
-

tesseract(1), combine_tessdata(1), unicharset_extractor(1)

- -
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - diff --git a/doc/unicharset.5.xml b/doc/unicharset.5.xml deleted file mode 100644 index 40e03c6eea..0000000000 --- a/doc/unicharset.5.xml +++ /dev/null @@ -1,219 +0,0 @@ - - - - - - - UNICHARSET(5) - - -unicharset -5 -  -  - - - unicharset - character properties file used by tesseract(1) - - -DESCRIPTION -Tesseract’s unicharset file contains information on each symbol -(unichar) the Tesseract OCR engine is trained to recognize. -A unicharset file (i.e. eng.unicharset) is distributed as part of a -Tesseract language pack (i.e. eng.traineddata). For information on -extracting the unicharset file, see combine_tessdata(1). -The first line of a unicharset file contains the number of unichars in -the file. After this line, each subsequent line provides information for -a single unichar. The first such line contains a placeholder reserved for -the space character. Each unichar is referred to within Tesseract by its -Unichar ID, which is the line number (minus 1) within the unicharset file. -Therefore, space gets unichar 0. -Each unichar line in the unicharset file (v2+) may have four space-separated fields: -'character' 'properties' 'script' 'id' -Starting with Tesseract v3.02, more information may be given for each unichar: -'character' 'properties' 'glyph_metrics' 'script' 'other_case' 'direction' 'mirror' 'normed_form' -Entries: - - - -character - - - -The UTF-8 encoded string to be produced for this unichar. - - - - - -properties - - - -An integer mask of character properties, one per bit. - From least to most significant bit, these are: isalpha, islower, isupper, - isdigit, ispunctuation. - - - - - -glyph_metrics - - - -Ten comma-separated integers representing various standards - for where this glyph is to be found within a baseline-normalized coordinate - system where 128 is normalized to x-height. - - - - -min_bottom, max_bottom: the ranges where the bottom of the character can - be found. 
- - - - -min_top, max_top: the ranges where the top of the character may be found. - - - - -min_width, max_width: horizontal width of the character. - - - - -min_bearing, max_bearing: how far from the usual start position does the - leftmost part of the character begin. - - - - -min_advance, max_advance: how far from the printer’s cell left do we - advance to begin the next character. - - - - - - - -script - - - -Name of the script (Latin, Common, Greek, Cyrillic, Han, null). - - - - - -other_case - - - -The Unichar ID of the other case version of this character - (upper or lower). - - - - - -direction - - - -The Unicode BiDi direction of this character, as defined by - ICU’s enum UCharDirection. (0 = Left to Right, 1 = Right to Left, - 2 = European Number…) - - - - - -mirror - - - -The Unichar ID of the BiDirectional mirror of this character. - For example the mirror of open paren is close paren, but Latin Capital C - has no mirror, so it remains a Latin Capital C. - - - - - -normed_form - - - -The UTF-8 representation of a "normalized form" of this unichar - for the purpose of blaming a module for errors given ground truth text. - For instance, a left or right single quote may normalize to an ASCII quote. - - - - - - -EXAMPLE (v2) -; 10 Common 46 -b 3 Latin 59 -W 5 Latin 40 -7 8 Common 66 -= 0 Common 93 -";" is a punctuation character. Its properties are thus represented by the -binary number 10000 (10 in hexadecimal). -"b" is an alphabetic character and a lower case character. Its properties are -thus represented by the binary number 00011 (3 in hexadecimal). -"W" is an alphabetic character and an upper case character. Its properties are -thus represented by the binary number 00101 (5 in hexadecimal). -"7" is just a digit. Its properties are thus represented by the binary number -01000 (8 in hexadecimal). -"=" is not punctuation nor a digit nor an alphabetic character. Its properties -are thus represented by the binary number 00000 (0 in hexadecimal). 
-Japanese or Chinese alphabetic character properties are represented by the -binary number 00001 (1 in hexadecimal): they are alphabetic, but neither -upper nor lower case. - - -EXAMPLE (v3.02) -110 -NULL 0 NULL 0 -N 5 59,68,216,255,87,236,0,27,104,227 Latin 11 0 1 N -Y 5 59,68,216,255,91,205,0,47,91,223 Latin 33 0 2 Y -1 8 59,69,203,255,45,128,0,66,74,173 Common 3 2 3 1 -9 8 18,66,203,255,89,156,0,39,104,173 Common 4 2 4 9 -a 3 58,65,186,198,85,164,0,26,97,185 Latin 56 0 5 a -. . . - - -CAVEATS -Although the unicharset reader maintains the ability to read unicharsets -of older formats and will assign default values to missing fields, -the accuracy will be degraded. -Further, most other data files are indexed by the unicharset file, -so changing it without re-generating the others is likely to have dire -consequences. - - -HISTORY -The unicharset format first appeared with Tesseract 2.00, which was the -first version to support languages other than English. The unicharset file -contained only the first two fields, and the "ispunctuation" property was -absent (punctuation was regarded as "0", as "=" is in the above example. - - -SEE ALSO -tesseract(1), combine_tessdata(1), unicharset_extractor(1) -https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). 
- - diff --git a/doc/unicharset_extractor.1 b/doc/unicharset_extractor.1 deleted file mode 100644 index ed2040dbfc..0000000000 --- a/doc/unicharset_extractor.1 +++ /dev/null @@ -1,69 +0,0 @@ -'\" t -.\" Title: unicharset_extractor -.\" Author: [see the "AUTHOR" section] -.\" Generator: DocBook XSL Stylesheets v1.78.1 -.\" Date: 06/12/2015 -.\" Manual: \ \& -.\" Source: \ \& -.\" Language: English -.\" -.TH "UNICHARSET_EXTRACTOR" "1" "06/12/2015" "\ \&" "\ \&" -.\" ----------------------------------------------------------------- -.\" * Define some portability stuff -.\" ----------------------------------------------------------------- -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.\" http://bugs.debian.org/507673 -.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.ie \n(.g .ds Aq \(aq -.el .ds Aq ' -.\" ----------------------------------------------------------------- -.\" * set default formatting -.\" ----------------------------------------------------------------- -.\" disable hyphenation -.nh -.\" disable justification (adjust text to left margin only) -.ad l -.\" ----------------------------------------------------------------- -.\" * MAIN CONTENT STARTS HERE * -.\" ----------------------------------------------------------------- -.SH "NAME" -unicharset_extractor \- extract unicharset from Tesseract boxfiles -.SH "SYNOPSIS" -.sp -\fBunicharset_extractor\fR \fI[\-D dir]\fR \fIFILE\fR\&... -.SH "DESCRIPTION" -.sp -Tesseract needs to know the set of possible characters it can output\&. To generate the unicharset data file, use the unicharset_extractor program on the same training pages bounding box files as used for clustering: -.sp -.if n \{\ -.RS 4 -.\} -.nf -unicharset_extractor fontfile_1\&.box fontfile_2\&.box \&.\&.\&. 
-.fi -.if n \{\ -.RE -.\} -.sp -The unicharset will be put into the file \fIdir/unicharset\fR, or simply \fI\&./unicharset\fR if no output directory is provided\&. -.sp -Tesseract also needs to have access to character properties isalpha, isdigit, isupper, islower, ispunctuation\&. all of this auxilury data and more is encoded in this file\&. (See unicharset(5)) -.sp -If your system supports the wctype functions, these values will be set automatically by unicharset_extractor and there is no need to edit the unicharset file\&. On some older systems (eg Windows 95), the unicharset file must be edited by hand to add these property description codes\&. -.sp -\fBNOTE\fR The unicharset file must be regenerated whenever inttemp, normproto and pffmtable are generated (i\&.e\&. they must all be recreated when the box file is changed) as they have to be in sync\&. This is made easier than in previous versions by running unicharset_extractor before mftraining and cntraining, and giving the unicharset to mftraining\&. -.SH "SEE ALSO" -.sp -tesseract(1), unicharset(5) -.sp -\m[blue]\fBhttps://github\&.com/tesseract\-ocr/tesseract/wiki/TrainingTesseract\fR\m[] -.SH "HISTORY" -.sp -unicharset_extractor first appeared in Tesseract 2\&.00\&. -.SH "COPYING" -.sp -Copyright (C) 2006, Google Inc\&. Licensed under the Apache License, Version 2\&.0 -.SH "AUTHOR" -.sp -The Tesseract OCR engine was written by Ray Smith and his research groups at Hewlett Packard (1985\-1995) and Google (2006\-present)\&. diff --git a/doc/unicharset_extractor.1.html b/doc/unicharset_extractor.1.html deleted file mode 100644 index 6fdeb5e953..0000000000 --- a/doc/unicharset_extractor.1.html +++ /dev/null @@ -1,815 +0,0 @@ - - - - - -UNICHARSET_EXTRACTOR(1) - - - - - -
-
-

SYNOPSIS

-
-

unicharset_extractor [-D dir] FILE

-
-
-
-

DESCRIPTION

-
-

Tesseract needs to know the set of possible characters it can output. -To generate the unicharset data file, use the unicharset_extractor -program on the same training pages bounding box files as used for -clustering:

-
-
-
unicharset_extractor fontfile_1.box fontfile_2.box ...
-
-

The unicharset will be put into the file dir/unicharset, or simply -./unicharset if no output directory is provided.

-

Tesseract also needs to have access to character properties isalpha, -isdigit, isupper, islower, ispunctuation. All of this auxiliary data -and more is encoded in this file. (See unicharset(5))

-

If your system supports the wctype functions, these values will be set -automatically by unicharset_extractor and there is no need to edit the -unicharset file. On some older systems (e.g. Windows 95), the unicharset -file must be edited by hand to add these property description codes.

-

NOTE The unicharset file must be regenerated whenever inttemp, normproto -and pffmtable are generated (i.e. they must all be recreated when the box -file is changed) as they have to be in sync. This is made easier than in -previous versions by running unicharset_extractor before mftraining and -cntraining, and giving the unicharset to mftraining.

-
-
- -
-

HISTORY

-
-

unicharset_extractor first appeared in Tesseract 2.00.

-
-
-
-

COPYING

-
-

Copyright (C) 2006, Google Inc. -Licensed under the Apache License, Version 2.0

-
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - diff --git a/doc/unicharset_extractor.1.xml b/doc/unicharset_extractor.1.xml deleted file mode 100644 index 45087a8c64..0000000000 --- a/doc/unicharset_extractor.1.xml +++ /dev/null @@ -1,63 +0,0 @@ - - - - - - - UNICHARSET_EXTRACTOR(1) - - -unicharset_extractor -1 -  -  - - - unicharset_extractor - extract unicharset from Tesseract boxfiles - - -unicharset_extractor [-D dir] FILE - - -DESCRIPTION -Tesseract needs to know the set of possible characters it can output. -To generate the unicharset data file, use the unicharset_extractor -program on the same training pages bounding box files as used for -clustering: -unicharset_extractor fontfile_1.box fontfile_2.box ... -The unicharset will be put into the file dir/unicharset, or simply -./unicharset if no output directory is provided. -Tesseract also needs to have access to character properties isalpha, -isdigit, isupper, islower, ispunctuation. all of this auxilury data -and more is encoded in this file. (See unicharset(5)) -If your system supports the wctype functions, these values will be set -automatically by unicharset_extractor and there is no need to edit the -unicharset file. On some older systems (eg Windows 95), the unicharset -file must be edited by hand to add these property description codes. -NOTE The unicharset file must be regenerated whenever inttemp, normproto -and pffmtable are generated (i.e. they must all be recreated when the box -file is changed) as they have to be in sync. This is made easier than in -previous versions by running unicharset_extractor before mftraining and -cntraining, and giving the unicharset to mftraining. - - -SEE ALSO -tesseract(1), unicharset(5) -https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract - - -HISTORY -unicharset_extractor first appeared in Tesseract 2.00. - - -COPYING -Copyright (C) 2006, Google Inc. 
-Licensed under the Apache License, Version 2.0 - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). - - diff --git a/doc/wordlist2dawg.1 b/doc/wordlist2dawg.1 deleted file mode 100644 index 4c8cd19e04..0000000000 --- a/doc/wordlist2dawg.1 +++ /dev/null @@ -1,72 +0,0 @@ -'\" t -.\" Title: wordlist2dawg -.\" Author: [see the "AUTHOR" section] -.\" Generator: DocBook XSL Stylesheets v1.78.1 -.\" Date: 06/12/2015 -.\" Manual: \ \& -.\" Source: \ \& -.\" Language: English -.\" -.TH "WORDLIST2DAWG" "1" "06/12/2015" "\ \&" "\ \&" -.\" ----------------------------------------------------------------- -.\" * Define some portability stuff -.\" ----------------------------------------------------------------- -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.\" http://bugs.debian.org/507673 -.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html -.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.ie \n(.g .ds Aq \(aq -.el .ds Aq ' -.\" ----------------------------------------------------------------- -.\" * set default formatting -.\" ----------------------------------------------------------------- -.\" disable hyphenation -.nh -.\" disable justification (adjust text to left margin only) -.ad l -.\" ----------------------------------------------------------------- -.\" * MAIN CONTENT STARTS HERE * -.\" ----------------------------------------------------------------- -.SH "NAME" -wordlist2dawg \- convert a wordlist to a DAWG for Tesseract -.SH "SYNOPSIS" -.sp -\fBwordlist2dawg\fR \fIWORDLIST\fR \fIDAWG\fR \fIlang\&.unicharset\fR -.sp -\fBwordlist2dawg\fR \-t \fIWORDLIST\fR \fIDAWG\fR \fIlang\&.unicharset\fR -.sp -\fBwordlist2dawg\fR \-r 1 \fIWORDLIST\fR \fIDAWG\fR \fIlang\&.unicharset\fR -.sp -\fBwordlist2dawg\fR \-r 2 \fIWORDLIST\fR \fIDAWG\fR \fIlang\&.unicharset\fR -.sp -\fBwordlist2dawg\fR \-l \fIWORDLIST\fR \fIDAWG\fR 
\fIlang\&.unicharset\fR -.SH "DESCRIPTION" -.sp -wordlist2dawg(1) converts a wordlist to a Directed Acyclic Word Graph (DAWG) for use with Tesseract\&. A DAWG is a compressed, space and time efficient representation of a word list\&. -.SH "OPTIONS" -.sp -\-t Verify that a given dawg file is equivalent to a given wordlist\&. -.sp -\-r 1 Reverse a word if it contains an RTL character\&. -.sp -\-r 2 Reverse all words\&. -.sp -\-l Produce a file with several dawgs in it, one each for words of length , ,\&... -.SH "ARGUMENTS" -.sp -\fIWORDLIST\fR A plain text file in UTF\-8, one word per line\&. -.sp -\fIDAWG\fR The output DAWG to write\&. -.sp -\fIlang\&.unicharset\fR The unicharset of the language\&. This is the unicharset generated by mftraining(1)\&. -.SH "SEE ALSO" -.sp -tesseract(1), combine_tessdata(1), dawg2wordlist(1) -.sp -\m[blue]\fBhttps://github\&.com/tesseract\-ocr/tesseract/wiki/TrainingTesseract\fR\m[] -.SH "COPYING" -.sp -Copyright (C) 2006 Google, Inc\&. Licensed under the Apache License, Version 2\&.0 -.SH "AUTHOR" -.sp -The Tesseract OCR engine was written by Ray Smith and his research groups at Hewlett Packard (1985\-1995) and Google (2006\-present)\&. diff --git a/doc/wordlist2dawg.1.html b/doc/wordlist2dawg.1.html deleted file mode 100644 index 733570511a..0000000000 --- a/doc/wordlist2dawg.1.html +++ /dev/null @@ -1,820 +0,0 @@ - - - - - -WORDLIST2DAWG(1) - - - - - -
-
-

SYNOPSIS

-
-

wordlist2dawg WORDLIST DAWG lang.unicharset

-

wordlist2dawg -t WORDLIST DAWG lang.unicharset

-

wordlist2dawg -r 1 WORDLIST DAWG lang.unicharset

-

wordlist2dawg -r 2 WORDLIST DAWG lang.unicharset

-

wordlist2dawg -l <short> <long> WORDLIST DAWG lang.unicharset

-
-
-
-

DESCRIPTION

-
-

wordlist2dawg(1) converts a wordlist to a Directed Acyclic Word Graph -(DAWG) for use with Tesseract. A DAWG is a compressed, space and time -efficient representation of a word list.

-
-
-
-

OPTIONS

-
-

-t - Verify that a given dawg file is equivalent to a given wordlist.

-

-r 1 - Reverse a word if it contains an RTL character.

-

-r 2 - Reverse all words.

-

-l <short> <long> - Produce a file with several dawgs in it, one each for words - of length <short>, <short+1>,… <long>

-
-
-
-

ARGUMENTS

-
-

WORDLIST - A plain text file in UTF-8, one word per line.

-

DAWG - The output DAWG to write.

-

lang.unicharset - The unicharset of the language. This is the unicharset - generated by mftraining(1).

-
-
-
-

SEE ALSO

-
-

tesseract(1), combine_tessdata(1), dawg2wordlist(1)

- -
-
-
-

COPYING

-
-

Copyright (C) 2006 Google, Inc. -Licensed under the Apache License, Version 2.0

-
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - diff --git a/doc/wordlist2dawg.1.xml b/doc/wordlist2dawg.1.xml deleted file mode 100644 index bad256fe70..0000000000 --- a/doc/wordlist2dawg.1.xml +++ /dev/null @@ -1,69 +0,0 @@ - - - - - - - WORDLIST2DAWG(1) - - -wordlist2dawg -1 -  -  - - - wordlist2dawg - convert a wordlist to a DAWG for Tesseract - - -wordlist2dawg WORDLIST DAWG lang.unicharset -wordlist2dawg -t WORDLIST DAWG lang.unicharset -wordlist2dawg -r 1 WORDLIST DAWG lang.unicharset -wordlist2dawg -r 2 WORDLIST DAWG lang.unicharset -wordlist2dawg -l <short> <long> WORDLIST DAWG lang.unicharset - - -DESCRIPTION -wordlist2dawg(1) converts a wordlist to a Directed Acyclic Word Graph -(DAWG) for use with Tesseract. A DAWG is a compressed, space and time -efficient representation of a word list. - - -OPTIONS --t - Verify that a given dawg file is equivalent to a given wordlist. --r 1 - Reverse a word if it contains an RTL character. --r 2 - Reverse all words. --l <short> <long> - Produce a file with several dawgs in it, one each for words - of length <short>, <short+1>,… <long> - - -ARGUMENTS -WORDLIST - A plain text file in UTF-8, one word per line. -DAWG - The output DAWG to write. -lang.unicharset - The unicharset of the language. This is the unicharset - generated by mftraining(1). - - -SEE ALSO -tesseract(1), combine_tessdata(1), dawg2wordlist(1) -https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract - - -COPYING -Copyright (C) 2006 Google, Inc. -Licensed under the Apache License, Version 2.0 - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). - -