From 92922b421c49892542dd3191b9ed9fe4f02a41dc Mon Sep 17 00:00:00 2001
From: Shree Devi Kumar <shreeshrii@gmail.com>
Date: Thu, 30 Aug 2018 14:28:34 +0000
Subject: [PATCH] Add langtests framework with frk example

---
 Makefile.am                              |   2 +-
 configure.ac                             |   1 +
 langtests/.gitignore                     |   2 +
 langtests/Makefile.am                    |   8 ++
 langtests/README.md                      |  98 ++++++++++++++++++++++
 langtests/counttestset.sh                |  52 ++++++++++++
 langtests/frk_setup.sh                   |  24 ++++++
 langtests/frk_test.sh                    |  13 +++
 langtests/reports/4_best_frk.summary     |   2 +
 langtests/reports/4_best_int_frk.summary |   2 +
 langtests/reports/4_fast_Fraktur.summary |   2 +
 langtests/reports/4_fast_frk.summary     |   2 +
 langtests/runlangtests.sh                | 100 +++++++++++++++++++++++
 langtests/runtestset.sh                  |  60 ++++++++++++++
 14 files changed, 367 insertions(+), 1 deletion(-)
 create mode 100644 langtests/.gitignore
 create mode 100644 langtests/Makefile.am
 create mode 100644 langtests/README.md
 create mode 100755 langtests/counttestset.sh
 create mode 100644 langtests/frk_setup.sh
 create mode 100644 langtests/frk_test.sh
 create mode 100644 langtests/reports/4_best_frk.summary
 create mode 100644 langtests/reports/4_best_int_frk.summary
 create mode 100644 langtests/reports/4_fast_Fraktur.summary
 create mode 100644 langtests/reports/4_fast_frk.summary
 create mode 100755 langtests/runlangtests.sh
 create mode 100755 langtests/runtestset.sh

diff --git a/Makefile.am b/Makefile.am
index 947ccb5b7f..de6af2ebda 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -24,7 +24,7 @@ SUBDIRS += src/ccmain src/api . tessdata doc unittest
 
 EXTRA_DIST = README.md\
 	aclocal.m4 config configure.ac autogen.sh contrib \
-	tesseract.pc.in $(TRAINING_SUBDIR) java doc unlvtests
+	tesseract.pc.in $(TRAINING_SUBDIR) java doc langtests unlvtests
 
 DIST_SUBDIRS = $(SUBDIRS) $(TRAINING_SUBDIR)
 
diff --git a/configure.ac b/configure.ac
index e74e5f1784..d27f5ce380 100644
--- a/configure.ac
+++ b/configure.ac
@@ -466,6 +466,7 @@ fi
 
 # Output files
 AC_CONFIG_FILES([Makefile tesseract.pc])
+AC_CONFIG_FILES([langtests/Makefile])
 AC_CONFIG_FILES([src/api/Makefile])
 AC_CONFIG_FILES([src/api/tess_version.h])
 AC_CONFIG_FILES([src/arch/Makefile])
diff --git a/langtests/.gitignore b/langtests/.gitignore
new file mode 100644
index 0000000000..d9f9b7fa10
--- /dev/null
+++ b/langtests/.gitignore
@@ -0,0 +1,2 @@
+#
+results/*
diff --git a/langtests/Makefile.am b/langtests/Makefile.am
new file mode 100644
index 0000000000..2103eeef8e
--- /dev/null
+++ b/langtests/Makefile.am
@@ -0,0 +1,8 @@
+
+EXTRA_DIST = README.md
+EXTRA_DIST += frk_setup.sh
+EXTRA_DIST += frk_test.sh
+EXTRA_DIST += counttestset.sh
+EXTRA_DIST += runlangtests.sh
+EXTRA_DIST += runtestset.sh
+EXTRA_DIST += reports/*
diff --git a/langtests/README.md b/langtests/README.md
new file mode 100644
index 0000000000..af4b6095cc
--- /dev/null
+++ b/langtests/README.md
@@ -0,0 +1,98 @@
+# How to run Language tests.
+
+The scripts in this directory make it possible to test Accuracy of Tesseract 
+for different languages. 
+
+### Step 1: If not already installed, download the modified ISRI toolkit, 
+make and install the tools in /usr/local/bin.
+
+```
+git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git
+cd ocr-evaluation-tools
+sudo make install
+```
+
+### Step 2: If not already installed, build tesseract.
+
+## Testing for Fraktur - frk and script/Fraktur
+
+### Step 3: download the images and groundtruth
+
+```
+mkdir -p ~/lang-downloads
+cd ~/lang-downloads
+wget -O frk-jbarth-ubhd.zip http://digi.ub.uni-heidelberg.de/diglitData/v/abbyy11r8-vs-tesseract4.zip
+wget -O frk-stweil-gt.zip https://digi.bib.uni-mannheim.de/~stweil/fraktur-gt.zip
+```
+
+### Step 4: extract the files. 
+It doesn't really matter where in your filesystem you put them, 
+but they must go under a common root, for example, ~/lang-files
+
+```
+mkdir -p ~/lang-files
+cd ~/lang-files
+unzip ~/lang-downloads/frk-jbarth-ubhd.zip -d frk
+unzip ~/lang-downloads/frk-stweil-gt.zip -d frk
+mkdir -p ./frk-ligatures
+cp ./frk/abbyy-vs-tesseract/*.tif ./frk-ligatures/
+cp ./frk/gt/*.txt ./frk-ligatures/
+
+cd ./frk-ligatures/
+ls -1 *.tif >pages
+sed -i -e 's/\.tif$//' pages
+cat pages
+```
+
+```
+mkdir -p ~/lang-stopwords
+cd ~/lang-stopwords
+wget -O frk.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt
+```
+Edit ~/lang-stopwords/frk.stopwords.txt as
+wordacc uses a space delimited stopwords file, not line delimited.
+
+```
+sed -i -e ':a;N;$!ba;s/\n/ /g' frk.stopwords.txt
+cat frk.stopwords.txt
+```
+
+### Step 5: run langtests/runlangtests.sh with the root ISRI data dir, testname, tessdata-dir, language code:
+
+```
+cd ~/tesseract
+langtests/runlangtests.sh ~/lang-files 4_fast_Fraktur ../tessdata_fast/script Fraktur
+langtests/runlangtests.sh ~/lang-files 4_fast_frk ../tessdata_fast frk
+langtests/runlangtests.sh ~/lang-files 4_best_int_frk ../tessdata frk
+langtests/runlangtests.sh ~/lang-files 4_best_frk ../tessdata_best frk
+
+
+
+
+langtests/runlangtests.sh ~/lang-files 4_shreetest_frk-Fraktur /home/ubuntu/tessdata_frk/frk-finetune-impact frk
+langtests/runlangtests.sh ~/lang-files 4_shreetest_frk-frk /home/ubuntu/tessdata_frk/frk-finetune-frk frk
+```
+and go to the gym, have lunch etc. It takes a while to run.
+
+### Step 6: There should be a RELEASE.summary file
+
+For example *langtests/reports/4_fast_frk.summary*, which contains the final
+summarized accuracy for the testname `4_fast_frk`.
+
+#### Notes from Nick White regarding wordacc
+
+If you just want to remove all lines which have 100% recognition,
+you can add an 'awk' command like this:
+
+    ocrevalutf8 wordacc ground.txt ocr.txt |
+      awk '$3 != 100 {print $0}' > results.txt
+
+or if you've already got a results file you want to change, you can do this:
+
+    awk '$3 != 100 {print $0}' results.txt > newresults.txt
+
+If you only want the last sections where things are broken down by
+word, you can add a sed command, like this:
+
+    ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^   Count   Missed %Right   $/,$ !d' |
+      awk '$3 != 100 {print $0}' > results.txt
diff --git a/langtests/counttestset.sh b/langtests/counttestset.sh
new file mode 100755
index 0000000000..d9ef4ce010
--- /dev/null
+++ b/langtests/counttestset.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# File:        counttestset.sh
+# Description: Script to count the errors on a single UNLV set.
+# Author:      Ray Smith
+# Created:     Wed Jun 13 11:58:01 PDT 2007
+#
+# (C) Copyright 2007, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Usage: counttestset.sh pagesfile langcode
+# Scores the OCR output of one test set against the ground truth text files.
+if [ $# -ne 2 ]
+then
+  echo "Usage:$0 pagesfile langcode"
+  exit 1
+fi
+pages=$1
+langcode=$2
+
+imdir=${pages%/pages}   # directory that contains the pages file
+setname=${imdir##*/}    # its last path component names the test set
+resdir=langtests/results/$setname
+mkdir -p langtests/reports
+echo "Counting on set $setname in directory $imdir to $resdir"
+# Arrays keep every report path a single word, even if a path contains blanks.
+accfiles=()
+wafiles=()
+# -r keeps backslashes in page names literal.
+while read -r page dir
+do
+  # An entry is "page [subdir]"; without a subdir the files sit in $imdir.
+  if [ "$dir" ]; then srcdir="$imdir/$dir"; else srcdir="$imdir"; fi
+  echo "$srcdir/$page.tif"
+  # Count character errors.
+  ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.acc"
+  accfiles+=("$resdir/$page.acc")
+  # Count word errors.
+  ocrevalutf8 wordacc -S"$resdir/$langcode.stopwords" "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.wa"
+  wafiles+=("$resdir/$page.wa")
+done <"$pages"
+
+# Combine the per-page reports into one character and one word report per set.
+accsum "${accfiles[@]}" >"langtests/results/$setname.characc"
+wordaccsum "${wafiles[@]}" >"langtests/results/$setname.wordacc"
diff --git a/langtests/frk_setup.sh b/langtests/frk_setup.sh
new file mode 100644
index 0000000000..e86b6109f0
--- /dev/null
+++ b/langtests/frk_setup.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Download the Fraktur images, ground truth and German stopwords for langtests.
+# Every cd is checked so a failure cannot make later steps run in the wrong place.
+mkdir -p ~/lang-downloads
+cd ~/lang-downloads || exit 1
+wget -O frk-jbarth-ubhd.zip http://digi.ub.uni-heidelberg.de/diglitData/v/abbyy11r8-vs-tesseract4.zip
+wget -O frk-stweil-gt.zip https://digi.bib.uni-mannheim.de/~stweil/fraktur-gt.zip
+
+mkdir -p ~/lang-files
+cd ~/lang-files || exit 1
+unzip ~/lang-downloads/frk-jbarth-ubhd.zip -d frk
+unzip ~/lang-downloads/frk-stweil-gt.zip -d frk
+mkdir -p ./frk-ligatures
+cp ./frk/abbyy-vs-tesseract/*.tif ./frk-ligatures/
+cp ./frk/gt/*.txt ./frk-ligatures/
+
+cd ./frk-ligatures/ || exit 1
+# One page name per line: the image names with only the trailing ".tif" removed.
+ls -1 *.tif | sed -e 's/\.tif$//' >pages
+
+mkdir -p ~/lang-stopwords
+cd ~/lang-stopwords || exit 1
+wget -O frk.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt
+echo "Edit ~/lang-stopwords/frk.stopwords.txt as wordacc uses a space delimited stopwords file, not line delimited."
diff --git a/langtests/frk_test.sh b/langtests/frk_test.sh
new file mode 100644
index 0000000000..83078ca96c
--- /dev/null
+++ b/langtests/frk_test.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+#
+# run langtests/runlangtests.sh with the root ISRI data dir, testname, tessdata-dir, language code:
+# The relative paths below need the source tree in ~/tesseract; stop if it is missing.
+cd ~/tesseract || exit 1
+langtests/runlangtests.sh ~/lang-files 4_fast_Fraktur ../tessdata_fast/script Fraktur
+
+langtests/runlangtests.sh ~/lang-files 4_fast_frk ../tessdata_fast frk
+langtests/runlangtests.sh ~/lang-files 4_best_int_frk ../tessdata frk
+langtests/runlangtests.sh ~/lang-files 4_best_frk ../tessdata_best frk
+
+### It takes a while to run.
+
diff --git a/langtests/reports/4_best_frk.summary b/langtests/reports/4_best_frk.summary
new file mode 100644
index 0000000000..0b963f682e
--- /dev/null
+++ b/langtests/reports/4_best_frk.summary
@@ -0,0 +1,2 @@
+RELEASE		TestSet	CharErrors	Accuracy	WordErrors	Accuracy	NonStopWErrors	Accuracy	TimeTaken
+4_best_frk	frk-ligatures	178		94.73%		100		81.31%		74			75.17		94.29s
diff --git a/langtests/reports/4_best_int_frk.summary b/langtests/reports/4_best_int_frk.summary
new file mode 100644
index 0000000000..20df4cd8e8
--- /dev/null
+++ b/langtests/reports/4_best_int_frk.summary
@@ -0,0 +1,2 @@
+RELEASE		TestSet	CharErrors	Accuracy	WordErrors	Accuracy	NonStopWErrors	Accuracy	TimeTaken
+4_best_int_frk	frk-ligatures	244		92.78%		109		79.63%		80			73.15		89.80s
diff --git a/langtests/reports/4_fast_Fraktur.summary b/langtests/reports/4_fast_Fraktur.summary
new file mode 100644
index 0000000000..b8f8e81b72
--- /dev/null
+++ b/langtests/reports/4_fast_Fraktur.summary
@@ -0,0 +1,2 @@
+RELEASE		TestSet	CharErrors	Accuracy	WordErrors	Accuracy	NonStopWErrors	Accuracy	TimeTaken
+4_fast_Fraktur	frk-ligatures	265		92.16%		116		78.32%		82			72.48		91.29s
diff --git a/langtests/reports/4_fast_frk.summary b/langtests/reports/4_fast_frk.summary
new file mode 100644
index 0000000000..42ce1bcd34
--- /dev/null
+++ b/langtests/reports/4_fast_frk.summary
@@ -0,0 +1,2 @@
+RELEASE		TestSet	CharErrors	Accuracy	WordErrors	Accuracy	NonStopWErrors	Accuracy	TimeTaken
+4_fast_frk	frk-ligatures	244		92.78%		109		79.63%		80			73.15		89.98s
diff --git a/langtests/runlangtests.sh b/langtests/runlangtests.sh
new file mode 100755
index 0000000000..0af075cb30
--- /dev/null
+++ b/langtests/runlangtests.sh
@@ -0,0 +1,100 @@
+#!/bin/bash
+##############################################################################
+# File:        runlangtests.sh
+# Description: Script to run a set of language test sets (currently Fraktur).
+#                      based on runalltests.sh by Ray Smith
+# Author:      Shree Devi Kumar
+# Created:     June 09, 2018
+#
+# (C) Copyright 2007, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##############################################################################
+# Require exactly four arguments; otherwise print the usage line and exit 1.
+if [ $# -ne 4 ]
+then
+   echo "Usage:$0 unlv-data-dir version-id tessdata-dir langcode"
+   exit 1
+fi
+# $3 and $4 are the tessdata-dir and langcode named in the usage line.
+tessdata=$3
+lang=$4
+# timesum FILE: sum column 2 of FILE (the time recorded per page) and print it as %.2f.
+timesum() {
+awk ' BEGIN {
+total = 0.0;
+}
+{
+  total += $2;
+}
+END {
+  printf("%.2f\n", total);
+}' "$1"
+}
+
+imdir="$1"
+vid="$2"
+# Directory part of $0; the helper scripts are expected next to this script.
+bindir=${0%/*}
+if [ "$bindir" = "$0" ]; then
+    bindir="./"
+fi
+rdir=langtests/reports
+mkdir -p "$rdir"
+# Only the Fraktur test set is defined so far. Stop on any other language code
+# instead of running no tests and writing an empty summary.
+if [ "$lang" = "frk" ] || [ "$lang" = "Fraktur" ]; then
+    testsets="frk-ligatures"
+else
+    echo "$0: no test sets defined for language code '$lang'" >&2
+    exit 1
+fi
+
+# testsets is a blank separated list, so it is left unquoted on purpose.
+for set in $testsets
+do
+    resdir=langtests/results/$set
+    mkdir -p "$resdir"
+    cp ~/lang-stopwords/frk.stopwords.txt "$resdir/$lang.stopwords"
+    if [ -r "$imdir/$set/pages" ]; then
+        # Run tesseract on all the pages.
+        "$bindir/runtestset.sh" "$imdir/$set/pages" "$tessdata" "$lang"
+        # Count the errors on all the pages.
+        "$bindir/counttestset.sh" "$imdir/$set/pages" "$lang"
+        # Get the new character word and nonstop word errors and accuracy.
+        cherrs=$(head -4 "langtests/results/$set.characc" |tail -1 |cut -c1-9 |
+            tr -d '[:blank:]')
+        chacc=$(head -5 "langtests/results/$set.characc" |tail -1 |cut -c1-9 |
+            tr -d '[:blank:]')
+        wderrs=$(head -4 "langtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
+            tr -d '[:blank:]')
+        wdacc=$(head -5 "langtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
+            tr -d '[:blank:]')
+        nswderrs=$(grep Total "langtests/results/$set.wordacc" |head -2 |tail -1 |
+            cut -c10-17 |tr -d '[:blank:]')
+        nswdacc=$(grep Total "langtests/results/$set.wordacc" |head -2 |tail -1 |
+            cut -c19-26 |tr -d '[:blank:]')
+        sumfile=$rdir/$vid.$set.sum
+        if [ -r "langtests/results/$set.times" ]; then
+            total_time=$(timesum "langtests/results/$set.times")
+        else
+            total_time='0.0'
+        fi
+        # printf with \t escapes writes the same tab layout as the report files.
+        printf 'RELEASE\t\tTestSet\tCharErrors\tAccuracy\tWordErrors\tAccuracy\tNonStopWErrors\tAccuracy\tTimeTaken\n' >"$sumfile"
+        printf '%s\t%s\t%s\t\t%s\t\t%s\t\t%s\t\t%s\t\t\t%s\t\t%ss\n' "$vid" "$set" "$cherrs" "$chacc" "$wderrs" "$wdacc" "$nswderrs" "$nswdacc" "$total_time" >>"$sumfile"
+    else
+        echo "$0: cannot read $imdir/$set/pages, skipping set $set" >&2
+    fi
+done
+
+cat "$rdir/$vid".*.sum >"$rdir/$vid".summary
+mv "$rdir/$vid".*.sum langtests/results/
+cat "$rdir/$vid".summary
diff --git a/langtests/runtestset.sh b/langtests/runtestset.sh
new file mode 100755
index 0000000000..fc12f40eba
--- /dev/null
+++ b/langtests/runtestset.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+# File:        runtestset.sh
+# Description: Script to run tesseract on a single UNLV set.
+# Author:      Ray Smith
+# Created:     Wed Jun 13 10:13:01 PDT 2007
+#
+# (C) Copyright 2007, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# -ne 3 ]; then
+  echo "Usage:$0 pagesfile tessdata-dir langcode "
+  exit 1
+fi
+
+# "time" reached through an expansion is not the bash keyword, so the external
+# time program runs; its -f and -o options are GNU time features.
+tess=(time -f %U -o times.txt ./src/api/tesseract)
+pages=$1
+tessdata=$2
+langcode=$3
+imdir=${pages%/pages}
+setname=${imdir##*/}
+config=()   # extra tesseract arguments, none by default
+resdir=langtests/results/$setname
+
+echo -e "Testing on set $setname in directory $imdir to $resdir\n"
+mkdir -p "$resdir"
+rm -f "langtests/results/$setname.times"
+while read -r page dir
+do
+  # A pages file may be a list of files with subdirs or maybe just
+  # a plain list of files so accommodate both.
+  if [ "$dir" ]; then
+     srcdir="$imdir/$dir"
+  else
+     srcdir="$imdir"
+  fi
+  echo "$srcdir/$page.tif"
+  # Drop a stale times.txt so a failed run cannot reuse the previous page's time.
+  rm -f times.txt
+  "${tess[@]}" "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir "$tessdata" --oem 1 -l "$langcode" --psm 6 "${config[@]}" \
+    2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
+  if [ -r times.txt ]; then
+    read -r t <times.txt
+    echo "$page $t" >>"langtests/results/$setname.times"
+    echo -e "\033M$page $t"
+    # This text in times.txt means the run was interrupted (signal 2); stop the set.
+    if [ "$t" = "Command terminated by signal 2" ]; then
+      exit 0
+    fi
+  fi
+done <"$pages"