update Spanish UNLV test, use spa.stopwords, iconv to UTF-8
Shreeshrii committed Jun 9, 2018
1 parent 481d777 commit 6559af0
Showing 5 changed files with 156 additions and 30 deletions.
30 changes: 26 additions & 4 deletions unlvtests/README.md
@@ -34,11 +34,15 @@ tar xzvf ~/isri-downloads/doe3.3B.tar.gz
tar xzvf ~/isri-downloads/mag.3B.tar.gz
tar xzvf ~/isri-downloads/news.3B.tar.gz
tar xzvf ~/isri-downloads/spn.3B.tar.gz
mkdir -p stopwords
cd stopwords
wget -O spa.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-es/master/stopwords-es.txt
```
Edit ~/ISRI-OCRtk/stopwords/spa.stopwords.txt:
wordacc expects a space-delimited stopwords file, not a line-delimited one.
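
One way to make that edit is to let `tr` collapse the newlines (a minimal sketch, assuming the file sits at the path used above):
```
tr '\n' ' ' < ~/ISRI-OCRtk/stopwords/spa.stopwords.txt > /tmp/spa.stopwords.txt && \
  mv /tmp/spa.stopwords.txt ~/ISRI-OCRtk/stopwords/spa.stopwords.txt
```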

Edit *~/ISRI-OCRtk/spn.3B/pages*
delete the line containing the following image name, as it crashes tesseract:

7733_005.3B.tif
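
One way to drop that line (a sketch; `sed -i` edits the pages file in place, so keep a copy if you want the original):
```
sed -i '/7733_005.3B/d' ~/ISRI-OCRtk/spn.3B/pages
```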

### Step 3: Download the modified ISRI toolkit, make and install the tools:
@@ -52,10 +56,10 @@ sudo make install

### Step 4: cd back to your main tesseract-ocr dir and build tesseract.

### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir and language:
### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir:

```
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast eng
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast
```
and go to the gym, have lunch, etc. It takes a while to run.

@@ -66,5 +70,23 @@ report and comparison with the 1995 results.
### Step 7: run the test for Spanish.

```
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast spa
unlvtests/runalltests_spa.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast
```
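
When the Spanish run finishes, the script concatenates the per-set reports into a summary file; a quick way to inspect it (assuming the 4_fast_spa version-id used above):
```
cat unlvtests/reports/4_fast_spa.summary
```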

#### Notes from Nick White regarding wordacc

If you just want to remove all lines that have 100% recognition,
you can add an 'awk' command like this:

ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' > results.txt

or if you've already got a results file you want to change, you can do this:

awk '$3 != 100 {print $0}' results.txt > newresults.txt

If you only want the last sections where things are broken down by
word, you can add a sed command, like this:

ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$ !d' | awk '$3 != 100 {print $0}' > results.txt
22 changes: 15 additions & 7 deletions unlvtests/counttestset.sh
@@ -15,9 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

if [ $# -ne 1 ]
if [ $# -ne 2 ]
then
echo "Usage:$0 pagesfile"
echo "Usage:$0 pagesfile langcode"
exit 1
fi
if [ ! -d src/api ]
@@ -27,6 +27,7 @@ then
fi

pages=$1
langcode=$2

imdir=${pages%/pages}
setname=${imdir##*/}
@@ -45,15 +46,22 @@
fi
#echo "$srcdir/$page.tif"
# Count character errors.
ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.acc"
iconv -f ISO8859-1 -t UTF-8 "$resdir/$page.unlv" >"$resdir/$page.text"
iconv -f ISO8859-1 -t UTF-8 "$srcdir/$page.txt" >"$srcdir/$page.text"
ocrevalutf8 accuracy "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.acc"
accfiles="$accfiles $resdir/$page.acc"
# Count word errors.
ocrevalutf8 wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.wa"
#langcode should be either eng or spa
if [ "$langcode" = "eng" ]
then
ocrevalutf8 wordacc "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
else
cp /home/ubuntu/ISRI-OCRtk/stopwords/spa.stopwords.txt "$resdir/spa.stopwords"
ocrevalutf8 wordacc -S"$resdir/spa.stopwords" "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
fi
wafiles="$wafiles $resdir/$page.wa"
done <"$pages"

#echo "$accfiles"
#echo "$wafiles"

accsum $accfiles >"unlvtests/results/$setname.characc"
wordaccsum $wafiles >"unlvtests/results/$setname.wordacc"
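
For reference, the per-page scoring that the updated counttestset.sh performs boils down to roughly this (a sketch with illustrative file names, not the script itself):
```
# The UNLV ground truth and the .unlv OCR output are ISO8859-1; convert both to UTF-8 first.
iconv -f ISO8859-1 -t UTF-8 page.txt  > page_gt.text
iconv -f ISO8859-1 -t UTF-8 page.unlv > page_ocr.text
# Character accuracy report.
ocrevalutf8 accuracy page_gt.text page_ocr.text > page.acc
# Word accuracy; for Spanish, pass the space-delimited stopword list via -S.
ocrevalutf8 wordacc -Sspa.stopwords page_gt.text page_ocr.text > page.wa
```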

1 change: 0 additions & 1 deletion unlvtests/reports/1995.spn.3B.sum

This file was deleted.

24 changes: 6 additions & 18 deletions unlvtests/runalltests.sh
@@ -1,6 +1,6 @@
#!/bin/bash
# File: runalltests.sh
# Description: Script to run a set of UNLV test sets.
# Description: Script to run a set of UNLV test sets for English.
# Author: Ray Smith
# Created: Thu Jun 14 08:21:01 PDT 2007
#
@@ -15,9 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

if [ $# -ne 4 ]
if [ $# -ne 3 ]
then
echo "Usage:$0 unlv-data-dir version-id tessdata-dir lang "
echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
exit 1
fi
if [ ! -d src/api ]
@@ -31,7 +31,6 @@ then
exit 1
fi
tessdata=$3
lang=$4

#deltapc new old calculates the %change from old to new
deltapc() {
@@ -62,19 +61,8 @@ then
fi
rdir=unlvtests/reports

if [ "$lang" = "eng" ]
then
testsets="bus.3B doe3.3B mag.3B news.3B"
#testsets="bus.3B"
else
if [ "$lang" = "spa" ]
then
testsets="spn.3B"
else
echo "Language has to be eng or spa"
exit 1
fi
fi
testsets="bus.3B doe3.3B mag.3B news.3B"
#testsets="bus.3B"

totalerrs=0
totalwerrs=0
@@ -87,7 +75,7 @@ do
if [ -r "$imdir/$set/pages" ]
then
# Run tesseract on all the pages.
$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "$lang"
$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "eng"
# Count the errors on all the pages.
$bindir/counttestset.sh "$imdir/$set/pages"
# Get the old character word and nonstop word errors.
109 changes: 109 additions & 0 deletions unlvtests/runalltests_spa.sh
@@ -0,0 +1,109 @@
#!/bin/bash
##############################################################################
# File: runalltests_spa.sh
# Description: Script to run a set of UNLV test sets for Spanish.
# based on runalltests.sh by Ray Smith
# Author: Shree Devi Kumar
# Created: June 09, 2018
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
if [ $# -ne 3 ]
then
echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
exit 1
fi
if [ ! -d src/api ]
then
echo "Run $0 from the tesseract-ocr root directory!"
exit 1
fi
if [ ! -r src/api/tesseract ] && [ ! -r tesseract.exe ]
then
echo "Please build tesseract before running $0"
exit 1
fi
tessdata=$3
lang=$4

#timesum computes the total cpu time
timesum() {
awk ' BEGIN {
total = 0.0;
}
{
total += $2;
}
END {
printf("%.2f\n", total);
}' "$1"
}

imdir="$1"
vid="$2"
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
bindir="./"
fi
rdir=unlvtests/reports

testsets="spn.3B"

totalerrs=0
totalwerrs=0
totalnswerrs=0
for set in $testsets
do
if [ -r "$imdir/$set/pages" ]
then
# Run tesseract on all the pages.
$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "spa"
# Count the errors on all the pages.
$bindir/counttestset.sh "$imdir/$set/pages" "spa"
# Get the new character word and nonstop word errors and accuracy.
cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
tr -d '[:blank:]')
chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
tr -d '[:blank:]')
wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
tr -d '[:blank:]')
wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
tr -d '[:blank:]')
nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
cut -c10-17 |tr -d '[:blank:]')
nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
cut -c19-26 |tr -d '[:blank:]')

sumfile=$rdir/$vid.$set.sum
if [ -r "unlvtests/results/$set.times" ]
then
total_time=$(timesum "unlvtests/results/$set.times")
if [ -r "unlvtests/results/prev/$set.times" ]
then
paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" |
awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta"
fi
else
total_time='0.0'
fi
echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy\
NonStopWordErrors Accuracy TimeTaken">"$sumfile"
echo "$vid $set $cherrs $chacc $wderrs $wdacc\
$nswderrs $nswdacc ${total_time}s" >>"$sumfile"
fi
done

cat "$rdir/$vid".*.sum >"$rdir/$vid".summary

mv "$rdir/$vid".*.sum unlvtests/results/
cat "$rdir/$vid".summary
