See the chinese-wikipedia-corpus-creator project for more details.
- input:
token_cleaned_plain_files/*
- output:
WORDS_FREQ.txt
- script:
compute_words_freq.bash
- input:
token_cleaned_plain_files/*
- output:
WORDS.txt
- script:
merge_all_text_files.bash
The official software used for this step is brown-cluster.
- input: -
- output: ``
- script:
download_and_compile_brown_cluster.bash
- input:
WORDS.txt
- output:
WORDS-c1000-p1.out/*
- script:
compute_brown_cluster.bash
- input:
token_cleaned_plain_files/*
- output:
WORDS_VECS.txt
- script:
compute_plain_word_vec.bash
- input:
./WORDS-c1000-p1.out/paths WORDS_VECS.txt WORDS_FREQ.txt
- output:
zh_wiki_core/**/*
- script:
create_init_model.bash
- input: -
- output:
corpus/UD_Chinese-GSd.zip
- script:
download_UD_Chinese-GSD_corpus.bash
- input:
corpus/UD_Chinese-GSd.zip
- output:
corpus/UD_Chinese-GSd
- script:
extract_UD_Chinese-GSD_corpus.bash
- input:
corpus/UD_Chinese-GSd/zh-ud-*.conllu
- output:
corpus/UD_Chinese-GSd/zh-simplified-ud-*.conllu
- script:
convert_UD_Chinese-GSD_corpus.bash
- input:
./corpus/UD_Chinese-GSd/zh-simplified-ud-*.conllu
- output:
corpus/spacy/zh-simplified-ud-*.conllu
- script:
format_convertor.bash
- input:
WORDS_FREQ.txt WORDS-c1000-p1.out/paths WORDS_VECS.txt
- output:
zh_model/*
- script:
init_model.bash
- input:
zh_model corpus/spacy/zh-simplified-ud-*.conllu
- output:
dependency_model
- script:
train_model.bash
- input:
TODO
- output:
TODO
- script:
onto_to_spacy_json.bash
- input:
zh_model china_ner_train.json china_ner_eval.json
- output:
ner_model
- script:
train_ner.bash