forked from shangjingbo1226/AutoPhrase
-
Notifications
You must be signed in to change notification settings - Fork 0
/
auto_phrase_stanford.sh
executable file
·137 lines (93 loc) · 3.96 KB
/
auto_phrase_stanford.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/bin/bash
# AutoPhrase driver (Stanford-tokenizer variant).
# Every parameter below may be overridden via the environment, e.g.
#   MODEL=models/MINE RAW_TRAIN=corpus.txt ./auto_phrase_stanford.sh

# Directory where the trained model and results are saved.
# NOTE: the previous `${MODEL:- "models/DBLP"}` spelling embedded a LEADING
# SPACE in the default value (only masked later by unquoted word splitting);
# `${MODEL:-models/DBLP}` is the correct form.
MODEL=${MODEL:-models/DBLP}
# RAW_TRAIN is the input of AutoPhrase, where each line is a single document.
# Defaults to the first positional argument.
RAW_TRAIN=${RAW_TRAIN:-$1}
# When FIRST_RUN is set to 1, AutoPhrase will run all preprocessing.
# Otherwise, AutoPhrase directly starts from the current preprocessed data in the tmp/ folder.
FIRST_RUN=${FIRST_RUN:-1}
# When ENABLE_POS_TAGGING is set to 1, AutoPhrase will utilize the POS tagging in the phrase mining.
# Otherwise, a simple length penalty mode as the same as SegPhrase will be used.
ENABLE_POS_TAGGING=${ENABLE_POS_TAGGING:-1}
# A hard threshold of raw frequency is specified for frequent phrase mining, which will generate a candidate set.
MIN_SUP=${MIN_SUP:-10}
# You can also specify how many threads can be used for AutoPhrase
THREAD=${THREAD:-10}
### Begin: Suggested Parameters ###
# Cap on auto-selected positive labels; -1 means no cap.
MAX_POSITIVES=-1
# Distant-supervision labeling strategy passed to segphrase_train.
LABEL_METHOD=DPDN
# Optional file of expert-provided labels (empty = none).
RAW_LABEL_FILE=${RAW_LABEL_FILE:-""}
### End: Suggested Parameters ###
# Terminal colors for section banners. Guarded so a missing or dumb TERM
# (e.g. CI logs, cron) yields empty strings instead of an error message.
green=$(tput setaf 2 2>/dev/null || true)
reset=$(tput sgr0 2>/dev/null || true)

echo "${green}===Compilation===${reset}"
# Set COMPILE=0 to skip rebuilding the binaries.
# (The previous `${COMPILE:- 1}` spelling put a leading space in the default.)
COMPILE=${COMPILE:-1}
if [ "$COMPILE" -eq 1 ]; then
bash compile.sh
fi

# Scratch directory for intermediate files, plus the model output directory.
mkdir -p tmp
# ${MODEL} is deliberately left UNQUOTED: the legacy parameter defaults may
# carry a leading space that word splitting strips. Do not quote without
# also fixing the `${VAR:- default}` spellings above.
mkdir -p ${MODEL}

# Download the toy DBLP corpus on first use, transliterating it to ASCII
# since the downstream tokenizer pipeline expects ASCII input.
# $RAW_TRAIN is deliberately unquoted for the same leading-space reason.
if [ $RAW_TRAIN == "data/DBLP.txt" ] && [ ! -e data/DBLP.txt ]; then
echo "${green}===Downloading Toy Dataset===${reset}"
curl http://dmserv2.cs.illinois.edu/data/DBLP.txt.gz --output data/DBLP.txt.gz
gzip -d data/DBLP.txt.gz -f
iconv -f UTF8 -t ASCII//TRANSLIT data/DBLP.txt > data/DBLP_ascii.txt
rm data/DBLP.txt
mv data/DBLP_ascii.txt data/DBLP.txt
fi
### END Compilation###
echo "${green}===Tokenization===${reset}"
# Java tokenizer invocation, kept as a plain string that is expanded UNQUOTED
# everywhere so it word-splits into the -cp <classpath> option and class name.
TOKENIZER="-cp .:tools/tokenizer/lib/*:tools/tokenizer/resources/:tools/tokenizer/build/ Tokenizer"
TOKENIZED_TRAIN=tmp/tokenized_train.txt
CASE=tmp/case_tokenized_train.txt
TOKEN_MAPPING=tmp/token_mapping.txt

# Ensure the scratch directory exists even when the compilation section was
# skipped or edited away; the redirect below fails otherwise.
mkdir -p tmp
# This variant hardcodes English instead of auto-detecting the language.
rm -f tmp/language.txt
echo "EN" > tmp/language.txt
LANGUAGE=$(cat tmp/language.txt)
printf 'Detected Language: %s\033[0K\n' "$LANGUAGE"

TOKENIZED_STOPWORDS=tmp/tokenized_stopwords.txt
TOKENIZED_ALL=tmp/tokenized_all.txt
TOKENIZED_QUALITY=tmp/tokenized_quality.txt
STOPWORDS=data/$LANGUAGE/stopwords.txt
ALL_WIKI_ENTITIES=data/$LANGUAGE/wiki_all.txt
QUALITY_WIKI_ENTITIES=data/$LANGUAGE/wiki_quality.txt
LABEL_FILE=tmp/labels.txt

# New tokenization & POS tagging. Pass RAW_TRAIN (which defaults to $1) so an
# environment override of RAW_TRAIN is honored, instead of always using $1.
# Deliberately unquoted: legacy defaults may carry a leading space that word
# splitting strips.
source tokenize_raw.sh $RAW_TRAIN
### END Tokenization ###
# Tokenize expert-provided labels when RAW_LABEL_FILE contains at least one
# non-blank character; otherwise continue with distant supervision only.
# (The RHS pattern must stay unquoted so [[ ]] treats it as a glob.)
if [[ $RAW_LABEL_FILE = *[!\ ]* ]]; then
  printf 'Current step: Tokenizing expert labels...\033[0K\n'
  java $TOKENIZER -m test -i $RAW_LABEL_FILE -o $LABEL_FILE -t $TOKEN_MAPPING -c N -thread $THREAD
else
  printf 'No provided expert labels.\033[0K\n'
fi
### END Part-Of-Speech Tagging ###
echo ${green}===AutoPhrasing===${reset}
# Train the phrase segmenter. All expansions here are deliberately unquoted:
# the legacy `${VAR:- default}` parameter defaults may carry a leading space
# that only word splitting strips. NOTE(review): assumes ./bin/segphrase_train
# was produced by compile.sh — confirm compile step ran.
if [ $ENABLE_POS_TAGGING -eq 1 ]; then
# POS-guided mining: additionally prunes candidates whose POS tag sequences
# are listed in data/BAD_POS_TAGS.txt.
time ./bin/segphrase_train \
	--pos_tag \
	--thread $THREAD \
	--pos_prune data/BAD_POS_TAGS.txt \
	--label_method $LABEL_METHOD \
	--label $LABEL_FILE \
	--max_positives $MAX_POSITIVES \
	--min_sup $MIN_SUP
else
# Length-penalty mode (same behavior as SegPhrase), no POS information.
time ./bin/segphrase_train \
	--thread $THREAD \
	--label_method $LABEL_METHOD \
	--label $LABEL_FILE \
	--max_positives $MAX_POSITIVES \
	--min_sup $MIN_SUP
fi
echo ${green}===Saving Model and Results===${reset}
# Persist the trained segmentation model together with the token mapping and
# language marker needed to apply the model later.
cp tmp/segmentation.model ${MODEL}/segmentation.model
cp tmp/token_mapping.txt ${MODEL}/token_mapping.txt
cp tmp/language.txt ${MODEL}/language.txt
### END AutoPhrasing ###
echo ${green}===Generating Output===${reset}
# Translate internal token IDs back into human-readable phrase lists.
# $TOKENIZER is expanded unquoted on purpose: it word-splits into the
# -cp <classpath> option and the Tokenizer class name.
# Multi-word quality phrases only.
java $TOKENIZER -m translate -i tmp/final_quality_multi-words.txt -o ${MODEL}/AutoPhrase_multi-words.txt -t $TOKEN_MAPPING -c N -thread $THREAD
# Single-word quality phrases only.
java $TOKENIZER -m translate -i tmp/final_quality_unigrams.txt -o ${MODEL}/AutoPhrase_single-word.txt -t $TOKEN_MAPPING -c N -thread $THREAD
# Combined salient-phrase ranking — the primary AutoPhrase output.
java $TOKENIZER -m translate -i tmp/final_quality_salient.txt -o ${MODEL}/AutoPhrase.txt -t $TOKEN_MAPPING -c N -thread $THREAD
# java $TOKENIZER -m translate -i tmp/distant_training_only_salient.txt -o results/DistantTraning.txt -t $TOKEN_MAPPING -c N -thread $THREAD
### END Generating Output for Checking Quality ###