crossvalidate

#!/bin/bash

# Cross-validates the model by doing the following:
# For each line i in the range 1-N, where N is the total number of lines of labeled data:
#   use line i as the test corpus and the remaining lines as the training corpus
#   train the model, appending the tagged output (of line i) to a file
# Then test, comparing the gold corpus to the total tagged output of all of the above iterations,
#   and display the accuracy score and confusion matrix

# Usage: ./crossvalidate [tagger_args]
# where tagger_args excludes --train, --test, --output and --lang (since those are set by this script)
# Tagging language defaults to Sanskrit, but can be specified as an environment variable e.g.
#   MODEL_LANG="EN" ./crossvalidate [...]

# Fall back to Sanskrit when MODEL_LANG is unset or empty.
: "${MODEL_LANG:=SANS}" # DEFAULT TAGGING LANGUAGE

# Map the language code to its data directory; unknown codes abort with status 2.
case "$MODEL_LANG" in
  SANS)
    DATADIR="data/sans"
    echo "Tagging Sanskrit text..."
    ;;
  EN)
    DATADIR="data/en"
    echo "Tagging English text..."
    ;;
  *)
    echo "Invalid language option!"
    exit 2
    ;;
esac

# Input and output locations. GOLD and TAGSET are looked up inside the
# per-language data directory; TMP and OUTPUT are fixed paths under data/.
GOLD="${DATADIR}/TaggedCorpus.txt"
TAGSET="${DATADIR}/tagset.txt"
TMP="data/output.txt" # each iteration writes (without appending) tagged output to this file
OUTPUT="data/tagged.txt" # where all the tagged output is appended to

# Fail early when the labeled corpus is missing: otherwise the line count below
# would be empty and the per-line loop would break on its arithmetic.
if [[ ! -f "${GOLD}" || ! -r "${GOLD}" ]]; then
  printf 'Cannot read labeled corpus: %s\n' "${GOLD}" >&2
  exit 1
fi

# get size of labeled corpus (reading from stdin keeps the file name out of wc's output)
size=$(wc -l < "${GOLD}") || exit 1
size=$((size)) # normalize to a bare integer in case wc pads the count with spaces

# An empty corpus gives nothing to cross-validate.
if ((size < 1)); then
  printf 'Labeled corpus has no lines: %s\n' "${GOLD}" >&2
  exit 1
fi

#######################################
# Asks before deleting an existing file and deletes it when confirmed.
# Arguments: $1 - path of the file to remove
# Inputs:    one line from stdin; only "y" or "yes" confirms
# Outputs:   the prompt "remove <path>? " on stdout (only when the file exists)
# Returns:   0 if the file does not exist or was removed;
#            non-zero if the user declined or the removal failed
#######################################
confirm_remove() {
  local target=$1
  local response=''

  [[ -e "${target}" ]] || return 0

  # The path is passed as an argument, not as part of the format string, so a
  # '%' or backslash in it is printed literally.
  printf 'remove %s? ' "${target}"

  # -r keeps backslashes literal. A failed read (e.g. closed stdin) leaves the
  # answer as read so far, which is treated like any other reply.
  read -r response || true

  case "${response}" in
    y | yes) rm -f -- "${target}" ;;
    *) return 1 ;;
  esac
}

# remove tagged output file:
confirm_remove "${OUTPUT}" || exit 1

#######################################
# Blanks out the last N characters written to the terminal and leaves the
# cursor where they started.
# Arguments: $1 - number of characters to erase
# Outputs:   N backspaces, N spaces, then N backspaces on stdout
#######################################
erase_chars() {
  local count=$1
  local blanks backs
  printf -v blanks '%*s' "${count}" ''
  backs=${blanks// /$'\b'}
  printf '%s%s%s' "${backs}" "${blanks}" "${backs}"
}

# run for each line in labeled corpus
printf 'Progress: '
for ((i = 1; i <= size; i++)); do
  progress=$((i * 100 / size))
  label="${progress}%"
  printf '%s' "${label}"

  # make the train/test files; abort on failure so the tagger is never run
  # against files left over from a previous iteration
  if ! tools/corpus.py -l "${i}" "${GOLD}" "${DATADIR}"; then
    printf '\nFailed to build train/test files for line %d\n' "${i}" >&2
    exit 1
  fi

  # "$@" is quoted so every extra tagger argument is forwarded as one word,
  # even if it contains spaces or glob characters
  if ! ./tagger.py --lang "${MODEL_LANG}" --train "${DATADIR}/train.txt" --test "${DATADIR}/test.txt" --output "${TMP}" "$@"; then
    exit 1 # there was an error
  fi
  cat "${TMP}" >> "${OUTPUT}"

  # erase exactly what was printed (2 to 4 characters, e.g. "5%" or "100%")
  erase_chars "${#label}"
done
# The whole percentage has been erased, so the message starts right after "Progress: ".
printf 'tagging complete.\n'

# finally, score model:
# Every expansion is quoted so each value reaches score.py as exactly one
# argument, regardless of spaces or glob characters in the paths.
./score.py -v -l "${MODEL_LANG}" -ab "${GOLD}" "${OUTPUT}" "${TAGSET}"