-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcrossvalidate
executable file
·70 lines (59 loc) · 2.09 KB
/
crossvalidate
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/bin/bash
# Cross-validates the model by doing the following:
#   For each line i in the range 1-N, where N is the total number of lines of labeled data:
#     use line i as the test corpus and the remaining lines as the training corpus
#     train the model and append the tagged output (of line i) to a file
#   test by comparing the gold corpus to the combined tagged output of all the above iterations
#   display the accuracy score and confusion matrix
# Usage: ./crossvalidate [tagger_args]
# where tagger_args excludes --train, --test, --output and --lang (since those are set by this script)
# Tagging language defaults to Sanskrit, but can be specified as an environment variable e.g.
# MODEL_LANG="EN" ./crossvalidate [...]
# Resolve the tagging language. MODEL_LANG may come from the environment;
# when it is unset or empty it falls back to Sanskrit. The language code is
# then mapped onto the directory that holds that language's data files.
: "${MODEL_LANG:=SANS}" # DEFAULT TAGGING LANGUAGE
case "${MODEL_LANG}" in
  SANS)
    DATADIR="data/sans"
    echo "Tagging Sanskrit text..."
    ;;
  EN)
    DATADIR="data/en"
    echo "Tagging English text..."
    ;;
  *)
    # Any other code is rejected; exit status 2 signals a bad language option.
    echo "Invalid language option!"
    exit 2
    ;;
esac
# Input and output locations. GOLD and TAGSET live in the per-language data
# directory (DATADIR); TMP and OUTPUT are scratch files under data/.
GOLD="${DATADIR}/TaggedCorpus.txt"
TAGSET="${DATADIR}/tagset.txt"
TMP="data/output.txt" # each iteration writes (without appending) tagged output to this file
OUTPUT="data/tagged.txt" # where all the tagged output is appended to

# Fail early with a clear message when the labeled corpus cannot be read;
# otherwise the line count would be empty and every later step would break.
if [ ! -r "${GOLD}" ]; then
  echo "Cannot read labeled corpus: ${GOLD}" >&2
  exit 1
fi

# get size of labeled corpus (wc -l counts newline characters).
# Feeding the file on stdin makes wc print only the count, with no file name
# to split off; the arithmetic expansion drops the leading blanks that some
# wc implementations pad the count with, leaving a plain integer.
size=$(( $(wc -l < "${GOLD}") ))
if (( size < 1 )); then
  echo "Labeled corpus is empty: ${GOLD}" >&2
  exit 1
fi
# remove tagged output file:
# Tagged output is appended to OUTPUT, so a file left over from an earlier
# run has to go before starting. Ask first; only the replies "y" and "yes"
# delete it, anything else aborts the script with status 1.
if [ -e "${OUTPUT}" ]; then
  # Pass the path as an argument instead of embedding it in the format
  # string, so a '%' or '\' in the path is printed literally.
  printf 'remove %s? ' "${OUTPUT}"
  # -r keeps backslashes in the reply literal.
  read -r response
  if [ "$response" == "y" ] || [ "$response" == "yes" ]; then
    rm -f -- "${OUTPUT}"
  else
    # Say why the script stops instead of exiting silently.
    echo "Aborted: ${OUTPUT} was not removed." >&2
    exit 1
  fi
fi
# Erase the last N characters printed on the current terminal line by
# backing up over them, overwriting them with blanks and backing up again.
# Arguments: $1 - number of characters to erase (defaults to 0)
# Outputs:   writes the backspace/blank sequence to stdout
erase_chars() {
  local count=${1:-0} k
  for (( k = 0; k < count; k++ )); do printf '\b'; done
  printf '%*s' "${count}" ''
  for (( k = 0; k < count; k++ )); do printf '\b'; done
}

# run for each line in labeled corpus
# Each pass asks tools/corpus.py to build the train/test files for line i,
# runs the tagger on them and appends the tagger's output (TMP) to OUTPUT.
printf "Progress: "
# A C-style loop needs no external seq and never iterates when size is 0.
for (( i = 1; i <= size; i++ )); do
  progress=$(( i * 100 / size ))
  label="${progress}%"
  printf '%s' "${label}"
  # make the train/test files; stop when that fails so the tagger is never
  # run against stale files left behind by the previous iteration
  if ! tools/corpus.py -l "${i}" "${GOLD}" "${DATADIR}"; then
    echo "tools/corpus.py failed for line ${i}" >&2
    exit 1
  fi
  # "$@" forwards the caller's tagger arguments unchanged, so an argument
  # containing spaces or glob characters stays a single argument.
  if ! ./tagger.py --lang "${MODEL_LANG}" --train "${DATADIR}/train.txt" --test "${DATADIR}/test.txt" --output "${TMP}" "$@"; then
    exit 1 # there was an error
  fi
  cat "${TMP}" >> "${OUTPUT}"
  # Wipe exactly what was printed ("7%", "42%" or "100%") so the next
  # percentage, or the final message, starts in the same column.
  erase_chars "${#label}"
done
printf "tagging complete.\n"
# finally, score model:
# Hand the gold corpus, the accumulated tagged output and the tagset to
# score.py. Every expansion is quoted so each value reaches score.py as
# exactly one argument, even if it contains spaces or glob characters.
./score.py -v -l "${MODEL_LANG}" -ab "${GOLD}" "${OUTPUT}" "${TAGSET}"