#!/bin/bash
##################################################################################
# Depending on the task:
# commoncrawl and europarl-v7 are the same for all tasks
# http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz
# http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz
#
# WMT14 http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz
# WMT15 http://www.statmt.org/wmt15/training-parallel-nc-v10.tgz
# WMT16 http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz
# WMT17 http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz
# Note: the differences are very small, but each year adds a few sentences
# new for WMT17: http://data.statmt.org/wmt17/translation-task/rapid2016.tgz
#
# For WMT16, Rico Sennrich released some news back-translation data
# http://data.statmt.org/rsennrich/wmt16_backtranslations/en-de/
##################################################################################
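# Configuration: SP_PATH must point to the directory containing the SentencePiece
# binaries (spm_train, spm_encode); DATA_PATH holds the raw corpora listed below.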
vocab_size=32000
SP_PATH=~/sentencepiece/src
DATA_PATH=/mosesgpu4-hd1TB/datasets/de_DE-en_EN/public
sl=en
tl=de
corpus[1]=commoncrawl.de-en
corpus[2]=europarl-v7.de-en
corpus[3]=News-Commentary11.de-en
#corpus[3]=news-commentary-v12.de-en
#corpus[4]=news.bt.en-de
#corpus[5]=rapid2016.de-en
validset=newstest2014-deen
testset=newstest2017-ende
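# The corpora above are expected as plain-text parallel files named
# $DATA_PATH/<corpus>.$sl and $DATA_PATH/<corpus>.$tl (one sentence per line),
# with the SGML validation/test sets under $DATA_PATH/test/.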
##################################################################################
# Starting from here, the original files are expected to be in $DATA_PATH.
# A data folder will be created in scripts/wmt.
##################################################################################
# Data preparation using SentencePiece
# First we concatenate all the datasets to train the SentencePiece model
if true; then
mkdir -p data
echo "$0: Training sentencepiece model"
[ -f data/train.txt ] && rm data/train.txt
for ((i=1; i<= ${#corpus[@]}; i++))
do
for f in $DATA_PATH/${corpus[$i]}.$sl $DATA_PATH/${corpus[$i]}.$tl
do
cat $f >> data/train.txt
done
done
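# spm_train flags: --model_prefix names the output .model/.vocab files,
# --vocab_size sets the subword vocabulary size, and --character_coverage=1
# keeps every character seen in the training text.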
$SP_PATH/spm_train --input=data/train.txt --model_prefix=wmt$sl$tl --vocab_size=$vocab_size --character_coverage=1
rm data/train.txt
fi
# Second we use the trained model to tokenize all the files
if true; then
echo "$0: Tokenizing with sentencepiece model"
[ -f data/train.txt ] && rm data/train.txt
for ((i=1; i<= ${#corpus[@]}; i++))
do
for f in $DATA_PATH/${corpus[$i]}.$sl $DATA_PATH/${corpus[$i]}.$tl
do
file=$(basename $f)
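# spm_encode applies the trained subword model; each corpus side is written
# to data/<filename>.sp with one tokenized sentence per line.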
$SP_PATH/spm_encode --model=wmt$sl$tl.model < $f > data/$file.sp
done
done
fi
# We concatenate the training sets into two (source/target) tokenized files
if true; then
cat data/*.$sl.sp > data/train.$sl
cat data/*.$tl.sp > data/train.$tl
fi
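# Both globs above expand in the same lexical order, so train.$sl and train.$tl
# stay line-aligned as a parallel corpus.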
# We apply the same tokenization to the validation set (and the test set)
if true; then
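# input-from-sgm.perl extracts the plain-text segments from the WMT SGML
# wrappers before they are piped through spm_encode.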
validsetfile=$(basename $validset-src.$sl.sgm)
../../third_party/input-from-sgm.perl < $DATA_PATH/test/$validset-src.$sl.sgm \
| $SP_PATH/spm_encode --model=wmt$sl$tl.model > data/valid.$sl
validsetfile=$(basename $validset-ref.$tl.sgm)
../../third_party/input-from-sgm.perl < $DATA_PATH/test/$validset-ref.$tl.sgm \
| $SP_PATH/spm_encode --model=wmt$sl$tl.model > data/valid.$tl
testsetfile=$(basename $testset-src.$sl.sgm)
../../third_party/input-from-sgm.perl < $DATA_PATH/test/$testset-src.$sl.sgm \
| $SP_PATH/spm_encode --model=wmt$sl$tl.model > data/test.$sl
testsetfile=$(basename $testset-ref.$tl.sgm)
../../third_party/input-from-sgm.perl < $DATA_PATH/test/$testset-ref.$tl.sgm \
| $SP_PATH/spm_encode --model=wmt$sl$tl.model > data/test.$tl
fi
# Let's finish and clean up
mv wmt$sl$tl.model data/wmt$sl$tl.model
# We keep the first field of the vocab file generated by SentencePiece and remove its first line (<unk>)
cut -f 1 wmt$sl$tl.vocab | tail -n +2 > data/wmt$sl$tl.vocab.tmp
# We add the <blank> token in first position, as required by OpenNMT-tf
sed -i '1i<blank>' data/wmt$sl$tl.vocab.tmp
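# With default SentencePiece settings (<unk>, <s>, </s> first), the resulting
# vocab now starts with <blank>, <s>, </s>, followed by the learned subwords.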
# Last tweak: restore the entry for the literal tab character, which the cut above left as an empty line
perl -pe '$/=""; s/\n\n/\n\t\n/;' data/wmt$sl$tl.vocab.tmp > data/wmt$sl$tl.vocab
rm data/wmt$sl$tl.vocab.tmp
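# Final outputs under data/: train.$sl and train.$tl (tokenized training data),
# valid.* and test.* (tokenized validation and test sets), plus the SentencePiece
# model wmt$sl$tl.model and the vocabulary wmt$sl$tl.vocab.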