update Spanish UNLV test, use spa.stopwords, iconv to UTF-8
Shreeshrii committed Jun 9, 2018
1 parent 481d777 commit 6559af0
Showing 5 changed files with 156 additions and 30 deletions.
30 changes: 26 additions & 4 deletions unlvtests/README.md
@@ -34,11 +34,15 @@ tar xzvf ~/isri-downloads/doe3.3B.tar.gz
tar xzvf ~/isri-downloads/mag.3B.tar.gz
tar xzvf ~/isri-downloads/news.3B.tar.gz
tar xzvf ~/isri-downloads/spn.3B.tar.gz
mkdir -p stopwords
cd stopwords
wget -O spa.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-es/master/stopwords-es.txt
```
Edit ~/ISRI-OCRtk/stopwords/spa.stopwords.txt:
wordacc expects a space-delimited stopwords file, not a line-delimited one.
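
One way to make that edit is to let `tr` collapse the newlines (a minimal sketch, assuming the file sits at the path used above):
```
tr '\n' ' ' < ~/ISRI-OCRtk/stopwords/spa.stopwords.txt > /tmp/spa.stopwords.txt && \
  mv /tmp/spa.stopwords.txt ~/ISRI-OCRtk/stopwords/spa.stopwords.txt
```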

Edit *~/ISRI-OCRtk/spn.3B/pages*
delete the line containing the following image name, as it crashes tesseract:

7733_005.3B.tif
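
One way to drop that line (a sketch; `sed -i` edits the pages file in place, so keep a copy if you want the original):
```
sed -i '/7733_005.3B/d' ~/ISRI-OCRtk/spn.3B/pages
```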

### Step 3: Download the modified ISRI toolkit, make and install the tools:
@@ -52,10 +56,10 @@ sudo make install

### Step 4: cd back to your main tesseract-ocr dir and build tesseract.

### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir and language:
### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir:

```
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast eng
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast
```
and go to the gym, have lunch, etc. It takes a while to run.

@@ -66,5 +70,23 @@ report and comparison with the 1995 results.
### Step 7: run the test for Spanish.

```
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast spa
unlvtests/runalltests_spa.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast
```
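
When the Spanish run finishes, the script concatenates the per-set reports into a summary file; a quick way to inspect it (assuming the 4_fast_spa version-id used above):
```
cat unlvtests/reports/4_fast_spa.summary
```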

#### Notes from Nick White regarding wordacc

If you just want to remove all lines that have 100% recognition,
you can add an 'awk' command like this:

ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' > results.txt

or if you've already got a results file you want to change, you can do this:

awk '$3 != 100 {print $0}' results.txt > newresults.txt

If you only want the last sections where things are broken down by
word, you can add a sed command, like this:

ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$ !d' | awk '$3 != 100 {print $0}' > results.txt
22 changes: 15 additions & 7 deletions unlvtests/counttestset.sh
@@ -15,9 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

if [ $# -ne 1 ]
if [ $# -ne 2 ]
then
echo "Usage:$0 pagesfile"
echo "Usage:$0 pagesfile langcode"
exit 1
fi
if [ ! -d src/api ]
@@ -27,6 +27,7 @@ then
fi

pages=$1
langcode=$2

imdir=${pages%/pages}
setname=${imdir##*/}
@@ -45,15 +46,22 @@
fi
#echo "$srcdir/$page.tif"
# Count character errors.
ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.acc"
iconv -f ISO8859-1 -t UTF-8 "$resdir/$page.unlv" >"$resdir/$page.text"
iconv -f ISO8859-1 -t UTF-8 "$srcdir/$page.txt" >"$srcdir/$page.text"
ocrevalutf8 accuracy "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.acc"
accfiles="$accfiles $resdir/$page.acc"
# Count word errors.
ocrevalutf8 wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.wa"
#langcode should be either eng or spa
if [ "$langcode" = "eng" ]
then
ocrevalutf8 wordacc "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
else
cp /home/ubuntu/ISRI-OCRtk/stopwords/spa.stopwords.txt "$resdir/spa.stopwords"
ocrevalutf8 wordacc -S"$resdir/spa.stopwords" "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
fi
wafiles="$wafiles $resdir/$page.wa"
done <"$pages"

#echo "$accfiles"
#echo "$wafiles"

accsum $accfiles >"unlvtests/results/$setname.characc"
wordaccsum $wafiles >"unlvtests/results/$setname.wordacc"
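
For reference, the per-page scoring that the updated counttestset.sh performs boils down to roughly this (a sketch with illustrative file names, not the script itself):
```
# The UNLV ground truth and the .unlv OCR output are ISO8859-1; convert both to UTF-8 first.
iconv -f ISO8859-1 -t UTF-8 page.txt  > page_gt.text
iconv -f ISO8859-1 -t UTF-8 page.unlv > page_ocr.text
# Character accuracy report.
ocrevalutf8 accuracy page_gt.text page_ocr.text > page.acc
# Word accuracy; for Spanish, pass the space-delimited stopword list via -S.
ocrevalutf8 wordacc -Sspa.stopwords page_gt.text page_ocr.text > page.wa
```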

1 change: 0 additions & 1 deletion unlvtests/reports/1995.spn.3B.sum

This file was deleted.

24 changes: 6 additions & 18 deletions unlvtests/runalltests.sh
@@ -1,6 +1,6 @@
#!/bin/bash
# File: runalltests.sh
# Description: Script to run a set of UNLV test sets.
# Description: Script to run a set of UNLV test sets for English.
# Author: Ray Smith
# Created: Thu Jun 14 08:21:01 PDT 2007
#
@@ -15,9 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

if [ $# -ne 4 ]
if [ $# -ne 3 ]
then
echo "Usage:$0 unlv-data-dir version-id tessdata-dir lang "
echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
exit 1
fi
if [ ! -d src/api ]
@@ -31,7 +31,6 @@ then
exit 1
fi
tessdata=$3
lang=$4

#deltapc new old calculates the %change from old to new
deltapc() {
@@ -62,19 +61,8 @@ then
fi
rdir=unlvtests/reports

if [ "$lang" = "eng" ]
then
testsets="bus.3B doe3.3B mag.3B news.3B"
#testsets="bus.3B"
else
if [ "$lang" = "spa" ]
then
testsets="spn.3B"
else
echo "Language has to be eng or spa"
exit 1
fi
fi
testsets="bus.3B doe3.3B mag.3B news.3B"
#testsets="bus.3B"

totalerrs=0
totalwerrs=0
@@ -87,7 +75,7 @@ do
if [ -r "$imdir/$set/pages" ]
then
# Run tesseract on all the pages.
$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "$lang"
$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "eng"
# Count the errors on all the pages.
$bindir/counttestset.sh "$imdir/$set/pages"
# Get the old character word and nonstop word errors.
109 changes: 109 additions & 0 deletions unlvtests/runalltests_spa.sh
@@ -0,0 +1,109 @@
#!/bin/bash
##############################################################################
# File: runalltests_spa.sh
# Description: Script to run a set of UNLV test sets for Spanish.
# based on runalltests.sh by Ray Smith
# Author: Shree Devi Kumar
# Created: June 09, 2018
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
if [ $# -ne 3 ]
then
echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
exit 1
fi
if [ ! -d src/api ]
then
echo "Run $0 from the tesseract-ocr root directory!"
exit 1
fi
if [ ! -r src/api/tesseract ] && [ ! -r tesseract.exe ]
then
echo "Please build tesseract before running $0"
exit 1
fi
tessdata=$3
lang=$4

#timesum computes the total cpu time
timesum() {
awk ' BEGIN {
total = 0.0;
}
{
total += $2;
}
END {
printf("%.2f\n", total);
}' "$1"
}

imdir="$1"
vid="$2"
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
bindir="./"
fi
rdir=unlvtests/reports

testsets="spn.3B"

totalerrs=0
totalwerrs=0
totalnswerrs=0
for set in $testsets
do
if [ -r "$imdir/$set/pages" ]
then
# Run tesseract on all the pages.
$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "spa"
# Count the errors on all the pages.
$bindir/counttestset.sh "$imdir/$set/pages" "spa"
# Get the new character word and nonstop word errors and accuracy.
cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
tr -d '[:blank:]')
chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
tr -d '[:blank:]')
wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
tr -d '[:blank:]')
wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
tr -d '[:blank:]')
nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
cut -c10-17 |tr -d '[:blank:]')
nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
cut -c19-26 |tr -d '[:blank:]')

sumfile=$rdir/$vid.$set.sum
if [ -r "unlvtests/results/$set.times" ]
then
total_time=$(timesum "unlvtests/results/$set.times")
if [ -r "unlvtests/results/prev/$set.times" ]
then
paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" |
awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta"
fi
else
total_time='0.0'
fi
echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy\
NonStopWordErrors Accuracy TimeTaken">"$sumfile"
echo "$vid $set $cherrs $chacc $wderrs $wdacc\
$nswderrs $nswdacc ${total_time}s" >>"$sumfile"
fi
done

cat "$rdir/$vid".*.sum >"$rdir/$vid".summary

mv "$rdir/$vid".*.sum unlvtests/results/
cat "$rdir/$vid".summary
