From 1b7e0449850cac4ef753b0e89822c5666ee53dbf Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 6 Jun 2018 21:26:41 -0700 Subject: [PATCH 01/10] Adapt the decoder to the new label --- fluid/DeepASR/decoder/post_decode_faster.cc | 145 --------------- .../decoder/post_latgen_faster_mapped.cc | 172 ++++++++++++++++++ ...e_faster.h => post_latgen_faster_mapped.h} | 22 ++- fluid/DeepASR/decoder/pybind.cc | 10 +- fluid/DeepASR/decoder/setup.py | 8 +- fluid/DeepASR/infer_by_ckpt.py | 41 ++++- 6 files changed, 231 insertions(+), 167 deletions(-) delete mode 100644 fluid/DeepASR/decoder/post_decode_faster.cc create mode 100644 fluid/DeepASR/decoder/post_latgen_faster_mapped.cc rename fluid/DeepASR/decoder/{post_decode_faster.h => post_latgen_faster_mapped.h} (75%) diff --git a/fluid/DeepASR/decoder/post_decode_faster.cc b/fluid/DeepASR/decoder/post_decode_faster.cc deleted file mode 100644 index ce2b45bc6c..0000000000 --- a/fluid/DeepASR/decoder/post_decode_faster.cc +++ /dev/null @@ -1,145 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "post_decode_faster.h" - -typedef kaldi::int32 int32; -using fst::SymbolTable; -using fst::VectorFst; -using fst::StdArc; - -Decoder::Decoder(std::string word_syms_filename, - std::string fst_in_filename, - std::string logprior_rxfilename, - kaldi::BaseFloat acoustic_scale) { - const char* usage = - "Decode, reading log-likelihoods (of transition-ids or whatever symbol " - "is on the graph) as matrices."; - - kaldi::ParseOptions po(usage); - binary = true; - this->acoustic_scale = acoustic_scale; - allow_partial = true; - kaldi::FasterDecoderOptions decoder_opts; - decoder_opts.Register(&po, true); // true == include obscure settings. - po.Register("binary", &binary, "Write output in binary mode"); - po.Register("allow-partial", - &allow_partial, - "Produce output even when final state was not reached"); - po.Register("acoustic-scale", - &acoustic_scale, - "Scaling factor for acoustic likelihoods"); - - word_syms = NULL; - if (word_syms_filename != "") { - word_syms = fst::SymbolTable::ReadText(word_syms_filename); - if (!word_syms) - KALDI_ERR << "Could not read symbol table from file " - << word_syms_filename; - } - - std::ifstream is_logprior(logprior_rxfilename); - logprior.Read(is_logprior, false); - - // It's important that we initialize decode_fst after loglikes_reader, as it - // can prevent crashes on systems installed without enough virtual memory. - // It has to do with what happens on UNIX systems if you call fork() on a - // large process: the page-table entries are duplicated, which requires a - // lot of virtual memory. - decode_fst = fst::ReadFstKaldi(fst_in_filename); - - decoder = new kaldi::FasterDecoder(*decode_fst, decoder_opts); -} - - -Decoder::~Decoder() { - if (!word_syms) delete word_syms; - delete decode_fst; - delete decoder; -} - -std::string Decoder::decode( - std::string key, - const std::vector>& log_probs) { - size_t num_frames = log_probs.size(); - size_t dim_label = log_probs[0].size(); - - kaldi::Matrix loglikes( - num_frames, dim_label, kaldi::kSetZero, kaldi::kStrideEqualNumCols); - for (size_t i = 0; i < num_frames; ++i) { - memcpy(loglikes.Data() + i * dim_label, - log_probs[i].data(), - sizeof(kaldi::BaseFloat) * dim_label); - } - - return decode(key, loglikes); -} - - -std::vector Decoder::decode(std::string posterior_rspecifier) { - kaldi::SequentialBaseFloatMatrixReader posterior_reader(posterior_rspecifier); - std::vector decoding_results; - - for (; !posterior_reader.Done(); posterior_reader.Next()) { - std::string key = posterior_reader.Key(); - kaldi::Matrix loglikes(posterior_reader.Value()); - - decoding_results.push_back(decode(key, loglikes)); - } - - return decoding_results; -} - - -std::string Decoder::decode(std::string key, - kaldi::Matrix& loglikes) { - std::string decoding_result; - - if (loglikes.NumRows() == 0) { - KALDI_WARN << "Zero-length utterance: " << key; - } - KALDI_ASSERT(loglikes.NumCols() == logprior.Dim()); - - loglikes.ApplyLog(); - loglikes.AddVecToRows(-1.0, logprior); - - kaldi::DecodableMatrixScaled decodable(loglikes, acoustic_scale); - decoder->Decode(&decodable); - - VectorFst decoded; // linear FST. - - if ((allow_partial || decoder->ReachedFinal()) && - decoder->GetBestPath(&decoded)) { - if (!decoder->ReachedFinal()) - KALDI_WARN << "Decoder did not reach end-state, outputting partial " - "traceback."; - - std::vector alignment; - std::vector words; - kaldi::LatticeWeight weight; - - GetLinearSymbolSequence(decoded, &alignment, &words, &weight); - - if (word_syms != NULL) { - for (size_t i = 0; i < words.size(); i++) { - std::string s = word_syms->Find(words[i]); - decoding_result += s; - if (s == "") - KALDI_ERR << "Word-id " << words[i] << " not in symbol table."; - } - } - } - - return decoding_result; -} diff --git a/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc b/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc new file mode 100644 index 0000000000..19d5dbea83 --- /dev/null +++ b/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc @@ -0,0 +1,172 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "post_latgen_faster_mapped.h" + +using namespace kaldi; +typedef kaldi::int32 int32; +using fst::SymbolTable; +using fst::Fst; +using fst::StdArc; + +Decoder::Decoder(std::string trans_model_in_filename, + std::string word_syms_filename, + std::string fst_in_filename, + std::string logprior_in_filename, + kaldi::BaseFloat acoustic_scale) { + const char *usage = + "Generate lattices using neural net model.\n" + "Usage: post-latgen-faster-mapped [options] " + " " + " [ [] " + "]\n"; + ParseOptions po(usage); + allow_partial = false; + this->acoustic_scale = acoustic_scale; + LatticeFasterDecoderConfig config; + + config.Register(&po); + int32 beam = 11; + po.Register("beam", &beam, "Beam size"); + po.Register("acoustic-scale", + &acoustic_scale, + "Scaling factor for acoustic likelihoods"); + po.Register("word-symbol-table", + &word_syms_filename, + "Symbol table for words [for debug output]"); + po.Register("allow-partial", + &allow_partial, + "If true, produce output even if end state was not reached."); + + // int argc = 2; + // char *argv[] = {"post-latgen-faster-mapped", "--beam=11"}; + // po.Read(argc, argv); + + std::ifstream is_logprior(logprior_in_filename); + logprior.Read(is_logprior, false); + + { + bool binary; + Input ki(trans_model_in_filename, &binary); + this->trans_model.Read(ki.Stream(), binary); + } + + this->determinize = config.determinize_lattice; + + this->word_syms = NULL; + if (word_syms_filename != "") { + if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename))) { + KALDI_ERR << "Could not read symbol table from file " + << word_syms_filename; + } + } + + // Input FST is just one FST, not a table of FSTs. + this->decode_fst = fst::ReadFstKaldiGeneric(fst_in_filename); + + this->decoder = new LatticeFasterDecoder(*decode_fst, config); + + std::string lattice_wspecifier = + "ark:|gzip -c > mapped_decoder_data/lat.JOB.gz"; + if (!(determinize ? compact_lattice_writer.Open(lattice_wspecifier) + : lattice_writer.Open(lattice_wspecifier))) + KALDI_ERR << "Could not open table for writing lattices: "; + // << lattice_wspecifier; + + words_writer = new Int32VectorWriter(""); + alignment_writer = new Int32VectorWriter(""); +} + +Decoder::~Decoder() { + if (!this->word_syms) delete this->word_syms; + delete this->decode_fst; + delete this->decoder; + delete words_writer; + delete alignment_writer; +} + + +std::string Decoder::decode(std::string key, + kaldi::Matrix &loglikes) { + std::string decoding_result; + if (loglikes.NumRows() == 0) { + KALDI_WARN << "Zero-length utterance: " << key; + // num_fail++; + } + KALDI_ASSERT(loglikes.NumCols() == logprior.Dim()); + + loglikes.ApplyLog(); + loglikes.AddVecToRows(-1.0, logprior); + + DecodableMatrixScaledMapped matrix_decodable( + trans_model, loglikes, acoustic_scale); + double like; + + if (DecodeUtteranceLatticeFaster(*decoder, + matrix_decodable, + trans_model, + word_syms, + key, + acoustic_scale, + determinize, + allow_partial, + alignment_writer, + words_writer, + &compact_lattice_writer, + &lattice_writer, + &like)) { + // tot_like += like; + // frame_count += loglikes.NumRows(); + // num_success++; + decoding_result = "succeed!"; + } else { // else num_fail++; + decoding_result = "fail!"; + } + return decoding_result; +} + +std::vector Decoder::decode(std::string posterior_rspecifier) { + std::vector ret; + + try { + double tot_like = 0.0; + kaldi::int64 frame_count = 0; + // int num_success = 0, num_fail = 0; + + KALDI_ASSERT(ClassifyRspecifier(fst_in_filename, NULL, NULL) == + kNoRspecifier); + SequentialBaseFloatMatrixReader posterior_reader("ark:" + + posterior_rspecifier); + + Timer timer; + timer.Reset(); + + { + for (; !posterior_reader.Done(); posterior_reader.Next()) { + std::string utt = posterior_reader.Key(); + Matrix &loglikes(posterior_reader.Value()); + KALDI_LOG << utt << " " << loglikes.NumRows() << " x " + << loglikes.NumCols(); + ret.push_back(decode(utt, loglikes)); + } + } + + double elapsed = timer.Elapsed(); + return ret; + } catch (const std::exception &e) { + std::cerr << e.what(); + // ret.push_back("error"); + return ret; + } +} diff --git a/fluid/DeepASR/decoder/post_decode_faster.h b/fluid/DeepASR/decoder/post_latgen_faster_mapped.h similarity index 75% rename from fluid/DeepASR/decoder/post_decode_faster.h rename to fluid/DeepASR/decoder/post_latgen_faster_mapped.h index 8bade8d698..4adbf6ba22 100644 --- a/fluid/DeepASR/decoder/post_decode_faster.h +++ b/fluid/DeepASR/decoder/post_latgen_faster_mapped.h @@ -17,19 +17,18 @@ limitations under the License. */ #include "base/kaldi-common.h" #include "base/timer.h" #include "decoder/decodable-matrix.h" -#include "decoder/faster-decoder.h" -#include "fstext/fstext-lib.h" +#include "decoder/decoder-wrappers.h" +#include "fstext/kaldi-fst-io.h" #include "hmm/transition-model.h" -#include "lat/kaldi-lattice.h" // for {Compact}LatticeArc #include "tree/context-dep.h" #include "util/common-utils.h" - class Decoder { public: - Decoder(std::string word_syms_filename, + Decoder(std::string trans_model_in_filename, + std::string word_syms_filename, std::string fst_in_filename, - std::string logprior_rxfilename, + std::string logprior_in_filename, kaldi::BaseFloat acoustic_scale); ~Decoder(); @@ -48,11 +47,18 @@ class Decoder { kaldi::Matrix &loglikes); fst::SymbolTable *word_syms; - fst::VectorFst *decode_fst; - kaldi::FasterDecoder *decoder; + fst::Fst *decode_fst; + kaldi::LatticeFasterDecoder *decoder; kaldi::Vector logprior; + kaldi::TransitionModel trans_model; + + kaldi::CompactLatticeWriter compact_lattice_writer; + kaldi::LatticeWriter lattice_writer; + kaldi::Int32VectorWriter *words_writer; + kaldi::Int32VectorWriter *alignment_writer; bool binary; + bool determinize; kaldi::BaseFloat acoustic_scale; bool allow_partial; }; diff --git a/fluid/DeepASR/decoder/pybind.cc b/fluid/DeepASR/decoder/pybind.cc index 90ea38ffb5..e99050e68d 100644 --- a/fluid/DeepASR/decoder/pybind.cc +++ b/fluid/DeepASR/decoder/pybind.cc @@ -15,15 +15,19 @@ limitations under the License. */ #include #include -#include "post_decode_faster.h" +#include "post_latgen_faster_mapped.h" namespace py = pybind11; -PYBIND11_MODULE(post_decode_faster, m) { +PYBIND11_MODULE(post_latgen_faster_mapped, m) { m.doc() = "Decoder for Deep ASR model"; py::class_(m, "Decoder") - .def(py::init()) + .def(py::init()) .def("decode", (std::vector (Decoder::*)(std::string)) & Decoder::decode, diff --git a/fluid/DeepASR/decoder/setup.py b/fluid/DeepASR/decoder/setup.py index a98c0b4cc1..74e8aa00fb 100644 --- a/fluid/DeepASR/decoder/setup.py +++ b/fluid/DeepASR/decoder/setup.py @@ -49,8 +49,8 @@ ext_modules = [ Extension( - 'post_decode_faster', - ['pybind.cc', 'post_decode_faster.cc'], + 'post_latgen_faster_mapped', + ['pybind.cc', 'post_latgen_faster_mapped.cc'], include_dirs=[ 'pybind11/include', '.', os.path.join(kaldi_root, 'src'), os.path.join(kaldi_root, 'tools/openfst/src/include') @@ -63,8 +63,8 @@ ] setup( - name='post_decode_faster', - version='0.0.1', + name='post_latgen_faster_mapped', + version='0.1.0', author='Paddle', author_email='', description='Decoder for Deep ASR model', diff --git a/fluid/DeepASR/infer_by_ckpt.py b/fluid/DeepASR/infer_by_ckpt.py index 831581924e..36681e9a2b 100644 --- a/fluid/DeepASR/infer_by_ckpt.py +++ b/fluid/DeepASR/infer_by_ckpt.py @@ -14,7 +14,7 @@ import data_utils.augmentor.trans_splice as trans_splice import data_utils.augmentor.trans_delay as trans_delay import data_utils.async_data_reader as reader -from decoder.post_decode_faster import Decoder +from decoder.post_latgen_faster_mapped import Decoder from data_utils.util import lodtensor_to_ndarray from model_utils.model import stacked_lstmp_model from data_utils.util import split_infer_result @@ -98,20 +98,25 @@ def parse_args(): type=str, default='./checkpoint', help="The checkpoint path to init model. (default: %(default)s)") + parser.add_argument( + '--trans_model', + type=str, + default='./graph/trans_model', + help="The path to vocabulary. (default: %(default)s)") parser.add_argument( '--vocabulary', type=str, - default='./decoder/graph/words.txt', + default='./graph/words.txt', help="The path to vocabulary. (default: %(default)s)") parser.add_argument( '--graphs', type=str, - default='./decoder/graph/TLG.fst', + default='./graph/TLG.fst', help="The path to TLG graphs for decoding. (default: %(default)s)") parser.add_argument( '--log_prior', type=str, - default="./decoder/logprior", + default="./logprior", help="The log prior probs for training data. (default: %(default)s)") parser.add_argument( '--acoustic_scale', @@ -123,6 +128,11 @@ def parse_args(): type=str, default="./decoder/target_trans.txt", help="The path to target transcription. (default: %(default)s)") + parser.add_argument( + '--post_matrix_path', + type=str, + default=None, + help="The path to output post prob matrix. (default: %(default)s)") args = parser.parse_args() return args @@ -146,6 +156,16 @@ def get_trg_trans(args): return trans_dict +def out_post_matrix(key, prob): + with open(args.post_matrix_path, "a") as post_matrix: + post_matrix.write(key + " [\n") + for i in range(prob.shape[0]): + for j in range(prob.shape[1]): + post_matrix.write(str(prob[i][j]) + " ") + post_matrix.write("\n") + post_matrix.write("]\n") + + def infer_from_ckpt(args): """Inference by using checkpoint.""" @@ -174,13 +194,13 @@ def infer_from_ckpt(args): fluid.io.load_persistables(exe, args.checkpoint) # init decoder - decoder = Decoder(args.vocabulary, args.graphs, args.log_prior, - args.acoustic_scale) + decoder = Decoder(args.trans_model, args.vocabulary, args.graphs, + args.log_prior, args.acoustic_scale) ltrans = [ trans_add_delta.TransAddDelta(2, 2), trans_mean_variance_norm.TransMeanVarianceNorm(args.mean_var), - trans_splice.TransSplice(), trans_delay.TransDelay(5) + trans_splice.TransSplice(5, 5), trans_delay.TransDelay(5) ] feature_t = fluid.LoDTensor() @@ -197,6 +217,8 @@ def infer_from_ckpt(args): args.minimum_batch_size)): # load_data (features, labels, lod, name_lst) = batch_data + features = np.reshape(features, (-1, 11, 3, args.frame_dim)) + features = np.transpose(features, (0, 2, 1, 3)) feature_t.set(features, place) feature_t.set_lod([lod]) label_t.set(labels, place) @@ -216,6 +238,9 @@ def infer_from_ckpt(args): for index, sample in enumerate(infer_batch): key = name_lst[index] ref = trg_trans[key] + if args.post_matrix_path is not None: + out_post_matrix(key, sample) + ''' hyp = decoder.decode(key, sample) edit_dist, ref_len = char_errors(ref.decode("utf8"), hyp) total_edit_dist += edit_dist @@ -223,6 +248,8 @@ def infer_from_ckpt(args): print(key + "|Ref:", ref) print(key + "|Hyp:", hyp.encode("utf8")) print("Instance CER: ", edit_dist / ref_len) + ''' + print("batch: ", batch_id) print("Total CER = %f" % (total_edit_dist / total_ref_len)) From 28515f698c70a5e16d059f1cca3cb09d6cf7de87 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 6 Jun 2018 21:28:00 -0700 Subject: [PATCH 02/10] Add infer by checkpoint script --- fluid/DeepASR/examples/aishell/infer_by_ckpt.sh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 fluid/DeepASR/examples/aishell/infer_by_ckpt.sh diff --git a/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh b/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh new file mode 100644 index 0000000000..de049d0221 --- /dev/null +++ b/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh @@ -0,0 +1,16 @@ +export CUDA_VISIBLE_DEVICES=0,1 +python -u ../../infer_by_ckpt.py --batch_size 64 \ + --checkpoint deep_asr.pass_20.checkpoint \ + --infer_feature_lst data/test_feature.lst \ + --infer_label_lst data/test_label.lst \ + --mean_var data/aishell/global_mean_var \ + --frame_dim 80 \ + --class_num 3040 \ + --post_matrix_path post_matrix.decoded \ + --target_trans data/text.test \ + --trans_model mapped_decoder_data/exp/tri5a/final.mdl \ + --log_prior mapped_decoder_data/logprior \ + --vocabulary mapped_decoder_data/exp/tri5a/graph/words.txt \ + --graphs mapped_decoder_data/exp/tri5a/graph/HCLG.fst \ + --acoustic_scale 0.059 \ + --parallel From b3ba7fda4e1ca2ed1629540e5e7210790c23c6ab Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Thu, 7 Jun 2018 04:39:25 -0700 Subject: [PATCH 03/10] Add missing defined decoding function --- .../decoder/post_latgen_faster_mapped.cc | 89 +++++++++++-------- 1 file changed, 54 insertions(+), 35 deletions(-) diff --git a/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc b/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc index 19d5dbea83..c9176fb26c 100644 --- a/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc +++ b/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc @@ -97,6 +97,60 @@ Decoder::~Decoder() { } +std::vector Decoder::decode(std::string posterior_rspecifier) { + std::vector ret; + + try { + double tot_like = 0.0; + kaldi::int64 frame_count = 0; + // int num_success = 0, num_fail = 0; + + KALDI_ASSERT(ClassifyRspecifier(fst_in_filename, NULL, NULL) == + kNoRspecifier); + SequentialBaseFloatMatrixReader posterior_reader("ark:" + + posterior_rspecifier); + + Timer timer; + timer.Reset(); + + { + for (; !posterior_reader.Done(); posterior_reader.Next()) { + std::string utt = posterior_reader.Key(); + Matrix &loglikes(posterior_reader.Value()); + KALDI_LOG << utt << " " << loglikes.NumRows() << " x " + << loglikes.NumCols(); + ret.push_back(decode(utt, loglikes)); + } + } + + double elapsed = timer.Elapsed(); + return ret; + } catch (const std::exception &e) { + std::cerr << e.what(); + // ret.push_back("error"); + return ret; + } +} + + +std::string Decoder::decode( + std::string key, + const std::vector> &log_probs) { + size_t num_frames = log_probs.size(); + size_t dim_label = log_probs[0].size(); + + kaldi::Matrix loglikes( + num_frames, dim_label, kaldi::kSetZero, kaldi::kStrideEqualNumCols); + for (size_t i = 0; i < num_frames; ++i) { + memcpy(loglikes.Data() + i * dim_label, + log_probs[i].data(), + sizeof(kaldi::BaseFloat) * dim_label); + } + + return decode(key, loglikes); +} + + std::string Decoder::decode(std::string key, kaldi::Matrix &loglikes) { std::string decoding_result; @@ -135,38 +189,3 @@ std::string Decoder::decode(std::string key, } return decoding_result; } - -std::vector Decoder::decode(std::string posterior_rspecifier) { - std::vector ret; - - try { - double tot_like = 0.0; - kaldi::int64 frame_count = 0; - // int num_success = 0, num_fail = 0; - - KALDI_ASSERT(ClassifyRspecifier(fst_in_filename, NULL, NULL) == - kNoRspecifier); - SequentialBaseFloatMatrixReader posterior_reader("ark:" + - posterior_rspecifier); - - Timer timer; - timer.Reset(); - - { - for (; !posterior_reader.Done(); posterior_reader.Next()) { - std::string utt = posterior_reader.Key(); - Matrix &loglikes(posterior_reader.Value()); - KALDI_LOG << utt << " " << loglikes.NumRows() << " x " - << loglikes.NumCols(); - ret.push_back(decode(utt, loglikes)); - } - } - - double elapsed = timer.Elapsed(); - return ret; - } catch (const std::exception &e) { - std::cerr << e.what(); - // ret.push_back("error"); - return ret; - } -} From 84152a09cf14219e5d351cf0569922a416320b75 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Sat, 9 Jun 2018 03:58:19 -0700 Subject: [PATCH 04/10] Disable splitting long sentence in infer --- fluid/DeepASR/data_utils/async_data_reader.py | 8 +++++++- fluid/DeepASR/infer_by_ckpt.py | 7 +++++-- fluid/DeepASR/train.py | 2 +- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/fluid/DeepASR/data_utils/async_data_reader.py b/fluid/DeepASR/data_utils/async_data_reader.py index 731c55de71..0c8d010755 100644 --- a/fluid/DeepASR/data_utils/async_data_reader.py +++ b/fluid/DeepASR/data_utils/async_data_reader.py @@ -185,6 +185,9 @@ class AsyncDataReader(object): corresponding description file. drop_frame_len (int): Samples whose label length above the value will be dropped.(Using '-1' to disable the policy) + split_sentence_threshold(int): Sentence whose length larger than + the value will trigger split operation. + (Assign -1 to disable split) proc_num (int): Number of processes for processing data. sample_buffer_size (int): Buffer size to indicate the maximum samples cached. @@ -204,6 +207,7 @@ def __init__(self, feature_file_list, label_file_list="", drop_frame_len=512, + split_sentence_threshold=512, proc_num=10, sample_buffer_size=1024, sample_info_buffer_size=1024, @@ -214,6 +218,7 @@ def __init__(self, self._feature_file_list = feature_file_list self._label_file_list = label_file_list self._drop_frame_len = drop_frame_len + self._split_sentence_threshold = split_sentence_threshold self._shuffle_block_num = shuffle_block_num self._block_info_list = None self._rng = random.Random(random_seed) @@ -262,7 +267,8 @@ def generate_bucket_list(self, is_shuffle): map(lambda info: info[0], bucket_block_info), map(lambda info: info[1], bucket_block_info), map(lambda info: info[2], bucket_block_info), - map(lambda info: info[3], bucket_block_info))) + map(lambda info: info[3], bucket_block_info), + split_sentence_threshold=self._split_sentence_threshold)) # @TODO make this configurable def set_transformers(self, transformers): diff --git a/fluid/DeepASR/infer_by_ckpt.py b/fluid/DeepASR/infer_by_ckpt.py index 36681e9a2b..554dd7223d 100644 --- a/fluid/DeepASR/infer_by_ckpt.py +++ b/fluid/DeepASR/infer_by_ckpt.py @@ -207,8 +207,11 @@ def infer_from_ckpt(args): label_t = fluid.LoDTensor() # infer data reader - infer_data_reader = reader.AsyncDataReader(args.infer_feature_lst, - args.infer_label_lst) + infer_data_reader = reader.AsyncDataReader( + args.infer_feature_lst, + args.infer_label_lst, + drop_frame_len=-1, + split_sentence_threshold=-1) infer_data_reader.set_transformers(ltrans) infer_costs, infer_accs = [], [] total_edit_dist, total_ref_len = 0.0, 0 diff --git a/fluid/DeepASR/train.py b/fluid/DeepASR/train.py index 8373c0e04f..6073db0d07 100644 --- a/fluid/DeepASR/train.py +++ b/fluid/DeepASR/train.py @@ -187,7 +187,7 @@ def test(exe): return -1.0, -1.0 # test data reader test_data_reader = reader.AsyncDataReader(args.val_feature_lst, - args.val_label_lst) + args.val_label_lst, -1) test_data_reader.set_transformers(ltrans) test_costs, test_accs = [], [] for batch_id, batch_data in enumerate( From 989e6cd58379d89a6959821ab0eb67da978bb2c5 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 13 Jun 2018 15:11:41 -0700 Subject: [PATCH 05/10] Return decoding result instead of output directly --- .../decoder/post_latgen_faster_mapped.cc | 110 ++++++++++++++---- .../decoder/post_latgen_faster_mapped.h | 9 ++ .../DeepASR/examples/aishell/infer_by_ckpt.sh | 2 +- 3 files changed, 97 insertions(+), 24 deletions(-) diff --git a/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc b/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc index c9176fb26c..448ec358ea 100644 --- a/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc +++ b/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc @@ -117,8 +117,6 @@ std::vector Decoder::decode(std::string posterior_rspecifier) { for (; !posterior_reader.Done(); posterior_reader.Next()) { std::string utt = posterior_reader.Key(); Matrix &loglikes(posterior_reader.Value()); - KALDI_LOG << utt << " " << loglikes.NumRows() << " x " - << loglikes.NumCols(); ret.push_back(decode(utt, loglikes)); } } @@ -127,11 +125,20 @@ std::vector Decoder::decode(std::string posterior_rspecifier) { return ret; } catch (const std::exception &e) { std::cerr << e.what(); - // ret.push_back("error"); return ret; } } +std::vector Decoder::decode_batch( + std::vector keys, + const std::vector>> + &log_probs_batch) { + std::vector decoding_results; + for (size_t i = 0; i < keys.size(); ++i) { + decoding_results.push_back(decode(keys[i], log_probs_batch[i])); + } + return decoding_results; +} std::string Decoder::decode( std::string key, @@ -167,25 +174,82 @@ std::string Decoder::decode(std::string key, trans_model, loglikes, acoustic_scale); double like; - if (DecodeUtteranceLatticeFaster(*decoder, - matrix_decodable, - trans_model, - word_syms, - key, - acoustic_scale, - determinize, - allow_partial, - alignment_writer, - words_writer, - &compact_lattice_writer, - &lattice_writer, - &like)) { - // tot_like += like; - // frame_count += loglikes.NumRows(); - // num_success++; - decoding_result = "succeed!"; - } else { // else num_fail++; - decoding_result = "fail!"; + return this->DecodeUtteranceLatticeFaster(matrix_decodable, key, &like); +} + + +// Takes care of output. Returns true on success. +std::string Decoder::DecodeUtteranceLatticeFaster( + DecodableInterface &decodable, // not const but is really an input. + std::string utt, + double *like_ptr) { // puts utterance's like in like_ptr on success. + using fst::VectorFst; + + if (!decoder->Decode(&decodable)) { + KALDI_WARN << "Failed to decode file " << utt; + return false; + } + if (!decoder->ReachedFinal()) { + if (allow_partial) { + KALDI_WARN << "Outputting partial output for utterance " << utt + << " since no final-state reached\n"; + } else { + KALDI_WARN << "Not producing output for utterance " << utt + << " since no final-state reached and " + << "--allow-partial=false.\n"; + return false; + } + } + + double likelihood; + LatticeWeight weight; + int32 num_frames; + std::string ret = utt + ' '; + { // First do some stuff with word-level traceback... + VectorFst decoded; + if (!decoder->GetBestPath(&decoded)) + // Shouldn't really reach this point as already checked success. + KALDI_ERR << "Failed to get traceback for utterance " << utt; + + std::vector alignment; + std::vector words; + GetLinearSymbolSequence(decoded, &alignment, &words, &weight); + num_frames = alignment.size(); + if (alignment_writer->IsOpen()) alignment_writer->Write(utt, alignment); + if (word_syms != NULL) { + for (size_t i = 0; i < words.size(); i++) { + std::string s = word_syms->Find(words[i]); + ret += s + ' '; + } + } + likelihood = -(weight.Value1() + weight.Value2()); + } + + // Get lattice, and do determinization if requested. + Lattice lat; + decoder->GetRawLattice(&lat); + if (lat.NumStates() == 0) + KALDI_ERR << "Unexpected problem getting lattice for utterance " << utt; + fst::Connect(&lat); + if (determinize) { + CompactLattice clat; + if (!DeterminizeLatticePhonePrunedWrapper( + trans_model, + &lat, + decoder->GetOptions().lattice_beam, + &clat, + decoder->GetOptions().det_opts)) + KALDI_WARN << "Determinization finished earlier than the beam for " + << "utterance " << utt; + // We'll write the lattice without acoustic scaling. + if (acoustic_scale != 0.0) + fst::ScaleLattice(fst::AcousticLatticeScale(1.0 / acoustic_scale), &clat); + compact_lattice_writer.Write(utt, clat); + } else { + // We'll write the lattice without acoustic scaling. + if (acoustic_scale != 0.0) + fst::ScaleLattice(fst::AcousticLatticeScale(1.0 / acoustic_scale), &lat); + lattice_writer.Write(utt, lat); } - return decoding_result; + return ret; } diff --git a/fluid/DeepASR/decoder/post_latgen_faster_mapped.h b/fluid/DeepASR/decoder/post_latgen_faster_mapped.h index 4adbf6ba22..a3a9e7d293 100644 --- a/fluid/DeepASR/decoder/post_latgen_faster_mapped.h +++ b/fluid/DeepASR/decoder/post_latgen_faster_mapped.h @@ -41,10 +41,19 @@ class Decoder { std::string key, const std::vector> &log_probs); + // Accept the scores of utterances in batch and return the decoding results + std::vector decode_batch( + std::vector key, + const std::vector>> + &log_probs_batch); + private: // For decoding one utterance std::string decode(std::string key, kaldi::Matrix &loglikes); + std::string DecodeUtteranceLatticeFaster(kaldi::DecodableInterface &decodable, + std::string utt, + double *like_ptr); fst::SymbolTable *word_syms; fst::Fst *decode_fst; diff --git a/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh b/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh index de049d0221..ba7a8fed5f 100644 --- a/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh +++ b/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh @@ -1,5 +1,5 @@ export CUDA_VISIBLE_DEVICES=0,1 -python -u ../../infer_by_ckpt.py --batch_size 64 \ +python -u ../../infer_by_ckpt.py --batch_size 48 \ --checkpoint deep_asr.pass_20.checkpoint \ --infer_feature_lst data/test_feature.lst \ --infer_label_lst data/test_label.lst \ From e1d90fc013b267dc5a61d08141d6f376b26f7071 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 20 Jun 2018 08:41:30 -0700 Subject: [PATCH 06/10] Use thread pool for parallel decoding --- .../decoder/post_latgen_faster_mapped.cc | 135 ++++++++++++------ .../decoder/post_latgen_faster_mapped.h | 22 +-- fluid/DeepASR/decoder/pybind.cc | 17 ++- fluid/DeepASR/decoder/setup.py | 4 +- fluid/DeepASR/decoder/setup.sh | 5 + .../DeepASR/examples/aishell/infer_by_ckpt.sh | 1 - fluid/DeepASR/infer_by_ckpt.py | 12 +- 7 files changed, 131 insertions(+), 65 deletions(-) diff --git a/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc b/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc index 448ec358ea..87791d5131 100644 --- a/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc +++ b/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "post_latgen_faster_mapped.h" +#include +#include "ThreadPool.h" using namespace kaldi; typedef kaldi::int32 int32; @@ -34,11 +36,9 @@ Decoder::Decoder(std::string trans_model_in_filename, ParseOptions po(usage); allow_partial = false; this->acoustic_scale = acoustic_scale; - LatticeFasterDecoderConfig config; config.Register(&po); int32 beam = 11; - po.Register("beam", &beam, "Beam size"); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); @@ -49,10 +49,13 @@ Decoder::Decoder(std::string trans_model_in_filename, &allow_partial, "If true, produce output even if end state was not reached."); - // int argc = 2; - // char *argv[] = {"post-latgen-faster-mapped", "--beam=11"}; - // po.Read(argc, argv); + int argc = 2; + char *argv[] = {(char *)"post-latgen-faster-mapped", + (char *)("--beam=" + std::string("11")).c_str()}; + po.Read(argc, argv); + + po.PrintConfig(std::cout); std::ifstream is_logprior(logprior_in_filename); logprior.Read(is_logprior, false); @@ -75,14 +78,16 @@ Decoder::Decoder(std::string trans_model_in_filename, // Input FST is just one FST, not a table of FSTs. this->decode_fst = fst::ReadFstKaldiGeneric(fst_in_filename); - this->decoder = new LatticeFasterDecoder(*decode_fst, config); + kaldi::LatticeFasterDecoder *decoder = + new LatticeFasterDecoder(*decode_fst, config); + decoder_pool.emplace_back(decoder); std::string lattice_wspecifier = "ark:|gzip -c > mapped_decoder_data/lat.JOB.gz"; if (!(determinize ? compact_lattice_writer.Open(lattice_wspecifier) : lattice_writer.Open(lattice_wspecifier))) - KALDI_ERR << "Could not open table for writing lattices: "; - // << lattice_wspecifier; + KALDI_ERR << "Could not open table for writing lattices: " + << lattice_wspecifier; words_writer = new Int32VectorWriter(""); alignment_writer = new Int32VectorWriter(""); @@ -91,15 +96,16 @@ Decoder::Decoder(std::string trans_model_in_filename, Decoder::~Decoder() { if (!this->word_syms) delete this->word_syms; delete this->decode_fst; - delete this->decoder; + for (size_t i = 0; i < decoder_pool.size(); ++i) { + delete decoder_pool[i]; + } delete words_writer; delete alignment_writer; } -std::vector Decoder::decode(std::string posterior_rspecifier) { - std::vector ret; - +void Decoder::decode_from_file(std::string posterior_rspecifier, + size_t num_processes) { try { double tot_like = 0.0; kaldi::int64 frame_count = 0; @@ -112,40 +118,41 @@ std::vector Decoder::decode(std::string posterior_rspecifier) { Timer timer; timer.Reset(); + double elapsed = 0.0; + + for (size_t n = decoder_pool.size(); n < num_processes; ++n) { + kaldi::LatticeFasterDecoder *decoder = + new LatticeFasterDecoder(*decode_fst, config); + decoder_pool.emplace_back(decoder); + } + elapsed = timer.Elapsed(); + ThreadPool thread_pool(num_processes); - { - for (; !posterior_reader.Done(); posterior_reader.Next()) { + while (!posterior_reader.Done()) { + timer.Reset(); + std::vector> que; + for (size_t i = 0; i < num_processes && !posterior_reader.Done(); ++i) { std::string utt = posterior_reader.Key(); Matrix &loglikes(posterior_reader.Value()); - ret.push_back(decode(utt, loglikes)); + que.emplace_back(thread_pool.enqueue(std::bind( + &Decoder::decode_internal, this, decoder_pool[i], utt, loglikes))); + posterior_reader.Next(); + } + timer.Reset(); + for (size_t i = 0; i < que.size(); ++i) { + std::cout << que[i].get() << std::endl; } } - double elapsed = timer.Elapsed(); - return ret; } catch (const std::exception &e) { std::cerr << e.what(); - return ret; } } -std::vector Decoder::decode_batch( - std::vector keys, - const std::vector>> - &log_probs_batch) { - std::vector decoding_results; - for (size_t i = 0; i < keys.size(); ++i) { - decoding_results.push_back(decode(keys[i], log_probs_batch[i])); - } - return decoding_results; -} - -std::string Decoder::decode( - std::string key, +inline kaldi::Matrix vector2kaldi_mat( const std::vector> &log_probs) { size_t num_frames = log_probs.size(); size_t dim_label = log_probs[0].size(); - kaldi::Matrix loglikes( num_frames, dim_label, kaldi::kSetZero, kaldi::kStrideEqualNumCols); for (size_t i = 0; i < num_frames; ++i) { @@ -153,14 +160,56 @@ std::string Decoder::decode( log_probs[i].data(), sizeof(kaldi::BaseFloat) * dim_label); } + return loglikes; +} + +std::vector Decoder::decode_batch( + std::vector keys, + const std::vector>> + &log_probs_batch, + size_t num_processes) { + ThreadPool thread_pool(num_processes); + std::vector decoding_results; //(keys.size(), ""); + + for (size_t n = decoder_pool.size(); n < num_processes; ++n) { + kaldi::LatticeFasterDecoder *decoder = + new LatticeFasterDecoder(*decode_fst, config); + decoder_pool.emplace_back(decoder); + } - return decode(key, loglikes); + size_t index = 0; + while (index < keys.size()) { + std::vector> res_in_que; + for (size_t t = 0; t < num_processes && index < keys.size(); ++t) { + kaldi::Matrix loglikes = + vector2kaldi_mat(log_probs_batch[index]); + res_in_que.emplace_back( + thread_pool.enqueue(std::bind(&Decoder::decode_internal, + this, + decoder_pool[t], + keys[index], + loglikes))); + index++; + } + for (size_t i = 0; i < res_in_que.size(); ++i) { + decoding_results.emplace_back(res_in_que[i].get()); + } + } + return decoding_results; } +std::string Decoder::decode( + std::string key, + const std::vector> &log_probs) { + kaldi::Matrix loglikes = vector2kaldi_mat(log_probs); + return decode_internal(decoder_pool[0], key, loglikes); +} -std::string Decoder::decode(std::string key, - kaldi::Matrix &loglikes) { - std::string decoding_result; + +std::string Decoder::decode_internal( + LatticeFasterDecoder *decoder, + std::string key, + kaldi::Matrix &loglikes) { if (loglikes.NumRows() == 0) { KALDI_WARN << "Zero-length utterance: " << key; // num_fail++; @@ -173,21 +222,22 @@ std::string Decoder::decode(std::string key, DecodableMatrixScaledMapped matrix_decodable( trans_model, loglikes, acoustic_scale); double like; - - return this->DecodeUtteranceLatticeFaster(matrix_decodable, key, &like); + return this->DecodeUtteranceLatticeFaster( + decoder, matrix_decodable, key, &like); } -// Takes care of output. Returns true on success. std::string Decoder::DecodeUtteranceLatticeFaster( + LatticeFasterDecoder *decoder, DecodableInterface &decodable, // not const but is really an input. std::string utt, double *like_ptr) { // puts utterance's like in like_ptr on success. using fst::VectorFst; + std::string ret = utt + ' '; if (!decoder->Decode(&decodable)) { KALDI_WARN << "Failed to decode file " << utt; - return false; + return ret; } if (!decoder->ReachedFinal()) { if (allow_partial) { @@ -197,14 +247,13 @@ std::string Decoder::DecodeUtteranceLatticeFaster( KALDI_WARN << "Not producing output for utterance " << utt << " since no final-state reached and " << "--allow-partial=false.\n"; - return false; + return ret; } } double likelihood; LatticeWeight weight; int32 num_frames; - std::string ret = utt + ' '; { // First do some stuff with word-level traceback... VectorFst decoded; if (!decoder->GetBestPath(&decoded)) @@ -215,7 +264,7 @@ std::string Decoder::DecodeUtteranceLatticeFaster( std::vector words; GetLinearSymbolSequence(decoded, &alignment, &words, &weight); num_frames = alignment.size(); - if (alignment_writer->IsOpen()) alignment_writer->Write(utt, alignment); + // if (alignment_writer->IsOpen()) alignment_writer->Write(utt, alignment); if (word_syms != NULL) { for (size_t i = 0; i < words.size(); i++) { std::string s = word_syms->Find(words[i]); diff --git a/fluid/DeepASR/decoder/post_latgen_faster_mapped.h b/fluid/DeepASR/decoder/post_latgen_faster_mapped.h index a3a9e7d293..0bbb93065a 100644 --- a/fluid/DeepASR/decoder/post_latgen_faster_mapped.h +++ b/fluid/DeepASR/decoder/post_latgen_faster_mapped.h @@ -32,9 +32,10 @@ class Decoder { kaldi::BaseFloat acoustic_scale); ~Decoder(); - // Interface to accept the scores read from specifier and return - // the batch decoding results - std::vector decode(std::string posterior_rspecifier); + // Interface to accept the scores read from specifier and print + // the decoding results directly + void decode_from_file(std::string posterior_rspecifier, + size_t num_processes = 1); // Accept the scores of one utterance and return the decoding result std::string decode( @@ -45,21 +46,26 @@ class Decoder { std::vector decode_batch( std::vector key, const std::vector>> - &log_probs_batch); + &log_probs_batch, + size_t num_processes = 1); private: // For decoding one utterance - std::string decode(std::string key, - kaldi::Matrix &loglikes); - std::string DecodeUtteranceLatticeFaster(kaldi::DecodableInterface &decodable, + std::string decode_internal(kaldi::LatticeFasterDecoder *decoder, + std::string key, + kaldi::Matrix &loglikes); + + std::string DecodeUtteranceLatticeFaster(kaldi::LatticeFasterDecoder *decoder, + kaldi::DecodableInterface &decodable, std::string utt, double *like_ptr); fst::SymbolTable *word_syms; fst::Fst *decode_fst; - kaldi::LatticeFasterDecoder *decoder; + std::vector decoder_pool; kaldi::Vector logprior; kaldi::TransitionModel trans_model; + kaldi::LatticeFasterDecoderConfig config; kaldi::CompactLatticeWriter compact_lattice_writer; kaldi::LatticeWriter lattice_writer; diff --git a/fluid/DeepASR/decoder/pybind.cc b/fluid/DeepASR/decoder/pybind.cc index e99050e68d..93605d214c 100644 --- a/fluid/DeepASR/decoder/pybind.cc +++ b/fluid/DeepASR/decoder/pybind.cc @@ -28,16 +28,23 @@ PYBIND11_MODULE(post_latgen_faster_mapped, m) { std::string, std::string, kaldi::BaseFloat>()) - .def("decode", - (std::vector (Decoder::*)(std::string)) & - Decoder::decode, + .def("decode_from_file", + (void (Decoder::*)(std::string, size_t)) & Decoder::decode_from_file, "Decode for the probability matrices in specifier " - "and return the transcriptions.") + "and print the transcriptions.") .def( "decode", (std::string (Decoder::*)( std::string, const std::vector>&)) & Decoder::decode, "Decode one input probability matrix " - "and return the transcription."); + "and return the transcription.") + .def("decode_batch", + (std::vector (Decoder::*)( + std::string, + const std::vector>>&, + size_t num_processes)) & + Decoder::decode_batch, + "Decode one batch of probability matrices " + "and return the transcriptions."); } diff --git a/fluid/DeepASR/decoder/setup.py b/fluid/DeepASR/decoder/setup.py index 74e8aa00fb..81fc857cce 100644 --- a/fluid/DeepASR/decoder/setup.py +++ b/fluid/DeepASR/decoder/setup.py @@ -24,7 +24,7 @@ "install kaldi and export KALDI_ROOT= .") args = [ - '-std=c++11', '-Wno-sign-compare', '-Wno-unused-variable', + '-std=c++11', '-fopenmp', '-Wno-sign-compare', '-Wno-unused-variable', '-Wno-unused-local-typedefs', '-Wno-unused-but-set-variable', '-Wno-deprecated-declarations', '-Wno-unused-function' ] @@ -53,7 +53,7 @@ ['pybind.cc', 'post_latgen_faster_mapped.cc'], include_dirs=[ 'pybind11/include', '.', os.path.join(kaldi_root, 'src'), - os.path.join(kaldi_root, 'tools/openfst/src/include') + os.path.join(kaldi_root, 'tools/openfst/src/include'), 'ThreadPool' ], language='c++', libraries=LIBS, diff --git a/fluid/DeepASR/decoder/setup.sh b/fluid/DeepASR/decoder/setup.sh index 1471f85f41..238cc64986 100644 --- a/fluid/DeepASR/decoder/setup.sh +++ b/fluid/DeepASR/decoder/setup.sh @@ -4,4 +4,9 @@ if [ ! -d pybind11 ]; then git clone https://github.com/pybind/pybind11.git fi +if [ ! -d ThreadPool ]; then + git clone https://github.com/progschj/ThreadPool.git + echo -e "\n" +fi + python setup.py build_ext -i diff --git a/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh b/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh index ba7a8fed5f..e8e199f923 100644 --- a/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh +++ b/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh @@ -6,7 +6,6 @@ python -u ../../infer_by_ckpt.py --batch_size 48 \ --mean_var data/aishell/global_mean_var \ --frame_dim 80 \ --class_num 3040 \ - --post_matrix_path post_matrix.decoded \ --target_trans data/text.test \ --trans_model mapped_decoder_data/exp/tri5a/final.mdl \ --log_prior mapped_decoder_data/logprior \ diff --git a/fluid/DeepASR/infer_by_ckpt.py b/fluid/DeepASR/infer_by_ckpt.py index 554dd7223d..0498e19aa5 100644 --- a/fluid/DeepASR/infer_by_ckpt.py +++ b/fluid/DeepASR/infer_by_ckpt.py @@ -238,10 +238,10 @@ def infer_from_ckpt(args): probs, lod = lodtensor_to_ndarray(results[0]) infer_batch = split_infer_result(probs, lod) - for index, sample in enumerate(infer_batch): - key = name_lst[index] - ref = trg_trans[key] - if args.post_matrix_path is not None: + decoder.decode_batch(name_lst, infer_batch) + if args.post_matrix_path is not None: + for index, sample in enumerate(infer_batch): + key = name_lst[index] out_post_matrix(key, sample) ''' hyp = decoder.decode(key, sample) @@ -252,9 +252,9 @@ def infer_from_ckpt(args): print(key + "|Hyp:", hyp.encode("utf8")) print("Instance CER: ", edit_dist / ref_len) ''' - print("batch: ", batch_id) + #print("batch: ", batch_id) - print("Total CER = %f" % (total_edit_dist / total_ref_len)) + #print("Total CER = %f" % (total_edit_dist / total_ref_len)) if __name__ == '__main__': From 175f36f9e02b53413def7daf3b616d1e58aacbfe Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 20 Jun 2018 21:15:13 -0700 Subject: [PATCH 07/10] Expose number of threads for decoding --- fluid/DeepASR/decoder/pybind.cc | 2 +- fluid/DeepASR/examples/aishell/infer_by_ckpt.sh | 6 +++--- fluid/DeepASR/infer_by_ckpt.py | 11 +++++++++-- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/fluid/DeepASR/decoder/pybind.cc b/fluid/DeepASR/decoder/pybind.cc index 93605d214c..151eae3b83 100644 --- a/fluid/DeepASR/decoder/pybind.cc +++ b/fluid/DeepASR/decoder/pybind.cc @@ -41,7 +41,7 @@ PYBIND11_MODULE(post_latgen_faster_mapped, m) { "and return the transcription.") .def("decode_batch", (std::vector (Decoder::*)( - std::string, + std::vector, const std::vector>>&, size_t num_processes)) & Decoder::decode_batch, diff --git a/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh b/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh index e8e199f923..77eb4ce9cd 100644 --- a/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh +++ b/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh @@ -1,6 +1,6 @@ -export CUDA_VISIBLE_DEVICES=0,1 -python -u ../../infer_by_ckpt.py --batch_size 48 \ - --checkpoint deep_asr.pass_20.checkpoint \ +export CUDA_VISIBLE_DEVICES=2,3,4,5 +python -u ../../infer_by_ckpt.py --batch_size 96 \ + --checkpoint checkpoints/deep_asr.pass_20.checkpoint \ --infer_feature_lst data/test_feature.lst \ --infer_label_lst data/test_label.lst \ --mean_var data/aishell/global_mean_var \ diff --git a/fluid/DeepASR/infer_by_ckpt.py b/fluid/DeepASR/infer_by_ckpt.py index 0498e19aa5..07e2d6fc56 100644 --- a/fluid/DeepASR/infer_by_ckpt.py +++ b/fluid/DeepASR/infer_by_ckpt.py @@ -59,6 +59,11 @@ def parse_args(): type=int, default=1749, help='Number of classes in label. (default: %(default)d)') + parser.add_argument( + '--num_threads', + type=int, + default=10, + help='The number of threads for decoding. (default: %(default)d)') parser.add_argument( '--learning_rate', type=float, @@ -189,7 +194,7 @@ def infer_from_ckpt(args): exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - trg_trans = get_trg_trans(args) + #trg_trans = get_trg_trans(args) # load checkpoint. fluid.io.load_persistables(exe, args.checkpoint) @@ -238,7 +243,9 @@ def infer_from_ckpt(args): probs, lod = lodtensor_to_ndarray(results[0]) infer_batch = split_infer_result(probs, lod) - decoder.decode_batch(name_lst, infer_batch) + decoded = decoder.decode_batch(name_lst, infer_batch, args.num_threads) + for res in decoded: + print(res.encode("utf8")) if args.post_matrix_path is not None: for index, sample in enumerate(infer_batch): key = name_lst[index] From c462ab1ae5cbb628a20e6ddd62697ef01f45968a Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Mon, 2 Jul 2018 01:57:21 -0700 Subject: [PATCH 08/10] Refine infer_by_ckpt: code clean & move out cer scoring --- .../DeepASR/examples/aishell/infer_by_ckpt.sh | 6 +- fluid/DeepASR/examples/aishell/score_cer.sh | 4 + fluid/DeepASR/infer_by_ckpt.py | 130 ++++++++---------- fluid/DeepASR/score_error_rate.py | 68 +++++++++ 4 files changed, 137 insertions(+), 71 deletions(-) create mode 100644 fluid/DeepASR/examples/aishell/score_cer.sh create mode 100644 fluid/DeepASR/score_error_rate.py diff --git a/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh b/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh index 77eb4ce9cd..990daee375 100644 --- a/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh +++ b/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh @@ -1,12 +1,14 @@ +decode_to_path=./decoding_result.txt + export CUDA_VISIBLE_DEVICES=2,3,4,5 python -u ../../infer_by_ckpt.py --batch_size 96 \ --checkpoint checkpoints/deep_asr.pass_20.checkpoint \ --infer_feature_lst data/test_feature.lst \ - --infer_label_lst data/test_label.lst \ --mean_var data/aishell/global_mean_var \ --frame_dim 80 \ --class_num 3040 \ - --target_trans data/text.test \ + --num_threads 24 \ + --decode_to_path $decode_to_path \ --trans_model mapped_decoder_data/exp/tri5a/final.mdl \ --log_prior mapped_decoder_data/logprior \ --vocabulary mapped_decoder_data/exp/tri5a/graph/words.txt \ diff --git a/fluid/DeepASR/examples/aishell/score_cer.sh b/fluid/DeepASR/examples/aishell/score_cer.sh new file mode 100644 index 0000000000..6c60d196a7 --- /dev/null +++ b/fluid/DeepASR/examples/aishell/score_cer.sh @@ -0,0 +1,4 @@ +ref_txt=data/text.test +hyp_txt=decoding_result.txt + +python ../../score_error_rate.py --error_rate_type cer --ref $ref_txt --hyp $hyp_txt diff --git a/fluid/DeepASR/infer_by_ckpt.py b/fluid/DeepASR/infer_by_ckpt.py index 2461852cd4..881b5ba225 100644 --- a/fluid/DeepASR/infer_by_ckpt.py +++ b/fluid/DeepASR/infer_by_ckpt.py @@ -14,10 +14,9 @@ import data_utils.augmentor.trans_splice as trans_splice import data_utils.augmentor.trans_delay as trans_delay import data_utils.async_data_reader as reader -from decoder.post_latgen_faster_mapped import Decoder -from data_utils.util import lodtensor_to_ndarray +from data_utils.util import lodtensor_to_ndarray, split_infer_result from model_utils.model import stacked_lstmp_model -from data_utils.util import split_infer_result +from decoder.post_latgen_faster_mapped import Decoder from tools.error_rate import char_errors @@ -64,11 +63,6 @@ def parse_args(): type=int, default=10, help='The number of threads for decoding. (default: %(default)d)') - parser.add_argument( - '--learning_rate', - type=float, - default=0.00016, - help='Learning rate used to train. (default: %(default)f)') parser.add_argument( '--device', type=str, @@ -80,7 +74,7 @@ def parse_args(): parser.add_argument( '--mean_var', type=str, - default='data/global_mean_var_search26kHr', + default='data/global_mean_var', help="The path for feature's global mean and variance. " "(default: %(default)s)") parser.add_argument( @@ -88,16 +82,6 @@ def parse_args(): type=str, default='data/infer_feature.lst', help='The feature list path for inference. (default: %(default)s)') - parser.add_argument( - '--infer_label_lst', - type=str, - default='data/infer_label.lst', - help='The label list path for inference. (default: %(default)s)') - parser.add_argument( - '--ref_txt', - type=str, - default='data/text.test', - help='The reference text for decoding. (default: %(default)s)') parser.add_argument( '--checkpoint', type=str, @@ -128,16 +112,17 @@ def parse_args(): type=float, default=0.2, help="Scaling factor for acoustic likelihoods. (default: %(default)f)") - parser.add_argument( - '--target_trans', - type=str, - default="./decoder/target_trans.txt", - help="The path to target transcription. (default: %(default)s)") parser.add_argument( '--post_matrix_path', type=str, default=None, help="The path to output post prob matrix. (default: %(default)s)") + parser.add_argument( + '--decode_to_path', + type=str, + default='./decoding_result.txt', + required=True, + help="The path to output the decoding result. (default: %(default)s)") args = parser.parse_args() return args @@ -149,26 +134,47 @@ def print_arguments(args): print('------------------------------------------------') -def get_trg_trans(args): - trans_dict = {} - with open(args.target_trans) as trg_trans: - line = trg_trans.readline() - while line: - items = line.strip().split() - key = items[0] - trans_dict[key] = ''.join(items[1:]) - line = trg_trans.readline() - return trans_dict +class PostMatrixWriter: + """ The writer for outputing the post probability matrix + """ + + def __init__(self, to_path): + self._to_path = to_path + with open(self._to_path, "w") as post_matrix: + post_matrix.seek(0) + post_matrix.truncate() + + def write(self, keys, probs): + with open(self._to_path, "a") as post_matrix: + if isinstance(keys, str): + keys, probs = [keys], [probs] + + for key, prob in zip(keys, probs): + post_matrix.write(key + " [\n") + for i in range(prob.shape[0]): + for j in range(prob.shape[1]): + post_matrix.write(str(prob[i][j]) + " ") + post_matrix.write("\n") + post_matrix.write("]\n") + +class DecodingResultWriter: + """ The writer for writing out decoding results + """ -def out_post_matrix(key, prob): - with open(args.post_matrix_path, "a") as post_matrix: - post_matrix.write(key + " [\n") - for i in range(prob.shape[0]): - for j in range(prob.shape[1]): - post_matrix.write(str(prob[i][j]) + " ") - post_matrix.write("\n") - post_matrix.write("]\n") + def __init__(self, to_path): + self._to_path = to_path + with open(self._to_path, "w") as decoding_result: + decoding_result.seek(0) + decoding_result.truncate() + + def write(self, results): + with open(self._to_path, "a") as decoding_result: + if isinstance(results, str): + decoding_result.write(results.encode("utf8") + "\n") + else: + for result in results: + decoding_result.write(result.encode("utf8") + "\n") def infer_from_ckpt(args): @@ -187,9 +193,10 @@ def infer_from_ckpt(args): infer_program = fluid.default_main_program().clone() + # optimizer, placeholder optimizer = fluid.optimizer.Adam( learning_rate=fluid.layers.exponential_decay( - learning_rate=args.learning_rate, + learning_rate=0.0001, decay_steps=1879, decay_rate=1 / 1.2, staircase=True)) @@ -199,7 +206,6 @@ def infer_from_ckpt(args): exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - #trg_trans = get_trg_trans(args) # load checkpoint. fluid.io.load_persistables(exe, args.checkpoint) @@ -218,13 +224,13 @@ def infer_from_ckpt(args): # infer data reader infer_data_reader = reader.AsyncDataReader( - args.infer_feature_lst, - args.infer_label_lst, - drop_frame_len=-1, - split_sentence_threshold=-1) + args.infer_feature_lst, drop_frame_len=-1, split_sentence_threshold=-1) infer_data_reader.set_transformers(ltrans) - infer_costs, infer_accs = [], [] - total_edit_dist, total_ref_len = 0.0, 0 + + decoding_result_writer = DecodingResultWriter(args.decode_to_path) + post_matrix_writer = None if args.post_matrix_path is None \ + else PostMatrixWriter(args.post_matrix_path) + for batch_id, batch_data in enumerate( infer_data_reader.batch_iterator(args.batch_size, args.minimum_batch_size)): @@ -242,31 +248,17 @@ def infer_from_ckpt(args): "label": label_t}, fetch_list=[prediction, avg_cost, accuracy], return_numpy=False) - infer_costs.append(lodtensor_to_ndarray(results[1])[0]) - infer_accs.append(lodtensor_to_ndarray(results[2])[0]) probs, lod = lodtensor_to_ndarray(results[0]) infer_batch = split_infer_result(probs, lod) + print("Decoding batch %d ..." % batch_id) decoded = decoder.decode_batch(name_lst, infer_batch, args.num_threads) - for res in decoded: - print(res.encode("utf8")) + + decoding_result_writer.write(decoded) + if args.post_matrix_path is not None: - for index, sample in enumerate(infer_batch): - key = name_lst[index] - out_post_matrix(key, sample) - ''' - hyp = decoder.decode(key, sample) - edit_dist, ref_len = char_errors(ref.decode("utf8"), hyp) - total_edit_dist += edit_dist - total_ref_len += ref_len - print(key + "|Ref:", ref) - print(key + "|Hyp:", hyp.encode("utf8")) - print("Instance CER: ", edit_dist / ref_len) - ''' - #print("batch: ", batch_id) - - #print("Total CER = %f" % (total_edit_dist / total_ref_len)) + post_matrix_writer.write(name_lst, infer_batch) if __name__ == '__main__': diff --git a/fluid/DeepASR/score_error_rate.py b/fluid/DeepASR/score_error_rate.py new file mode 100644 index 0000000000..5ecbca0862 --- /dev/null +++ b/fluid/DeepASR/score_error_rate.py @@ -0,0 +1,68 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +from tools.error_rate import char_errors, word_errors + + +def parse_args(): + parser = argparse.ArgumentParser( + "Score word/character error rate (WER/CER) " + "for decoding result.") + parser.add_argument( + '--error_rate_type', + type=str, + default='cer', + choices=['cer', 'wer'], + help="Error rate type. (default: %(default)s)") + parser.add_argument( + '--ref', type=str, required=True, help="The ground truth text.") + parser.add_argument( + '--hyp', type=str, required=True, help="The decoding result.") + args = parser.parse_args() + return args + + +if __name__ == '__main__': + + args = parse_args() + ref_dict = {} + sum_errors, sum_ref_len = 0.0, 0 + sent_cnt, not_in_ref_cnt = 0, 0 + + with open(args.ref, "r") as ref_txt: + line = ref_txt.readline() + while line: + del_pos = line.find(" ") + key, sent = line[0:del_pos], line[del_pos + 1:-1].strip() + ref_dict[key] = sent + line = ref_txt.readline() + + with open(args.hyp, "r") as hyp_txt: + line = hyp_txt.readline() + while line: + del_pos = line.find(" ") + key, sent = line[0:del_pos], line[del_pos + 1:-1].strip() + sent_cnt += 1 + line = hyp_txt.readline() + if key not in ref_dict: + not_in_ref_cnt += 1 + continue + + if args.error_rate_type == 'cer': + errors, ref_len = char_errors( + ref_dict[key].decode("utf8"), + sent.decode("utf8"), + remove_space=True) + else: + errors, ref_len = word_errors(ref_dict[key].decode("utf8"), + sent.decode("utf8")) + sum_errors += errors + sum_ref_len += ref_len + + print("Error rate[%s] = %f (%d/%d)," % + (args.error_rate_type, sum_errors / sum_ref_len, int(sum_errors), + sum_ref_len)) + print("total %d sentences in hyp, %d not presented in ref." % + (sent_cnt, not_in_ref_cnt)) From b88f95a2d2fcb8ba70336824245f1832451c5ce1 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Mon, 2 Jul 2018 04:13:31 -0700 Subject: [PATCH 09/10] Expose beam size in decoder --- fluid/DeepASR/decoder/post_latgen_faster_mapped.cc | 3 ++- fluid/DeepASR/decoder/post_latgen_faster_mapped.h | 1 + fluid/DeepASR/decoder/pybind.cc | 1 + fluid/DeepASR/examples/aishell/infer_by_ckpt.sh | 3 ++- fluid/DeepASR/infer_by_ckpt.py | 7 ++++++- 5 files changed, 12 insertions(+), 3 deletions(-) diff --git a/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc b/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc index 87791d5131..f83730ce51 100644 --- a/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc +++ b/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc @@ -26,6 +26,7 @@ Decoder::Decoder(std::string trans_model_in_filename, std::string word_syms_filename, std::string fst_in_filename, std::string logprior_in_filename, + size_t beam_size, kaldi::BaseFloat acoustic_scale) { const char *usage = "Generate lattices using neural net model.\n" @@ -51,7 +52,7 @@ Decoder::Decoder(std::string trans_model_in_filename, int argc = 2; char *argv[] = {(char *)"post-latgen-faster-mapped", - (char *)("--beam=" + std::string("11")).c_str()}; + (char *)("--beam=" + std::to_string(beam_size)).c_str()}; po.Read(argc, argv); diff --git a/fluid/DeepASR/decoder/post_latgen_faster_mapped.h b/fluid/DeepASR/decoder/post_latgen_faster_mapped.h index 0bbb93065a..9c234b8681 100644 --- a/fluid/DeepASR/decoder/post_latgen_faster_mapped.h +++ b/fluid/DeepASR/decoder/post_latgen_faster_mapped.h @@ -29,6 +29,7 @@ class Decoder { std::string word_syms_filename, std::string fst_in_filename, std::string logprior_in_filename, + size_t beam_size, kaldi::BaseFloat acoustic_scale); ~Decoder(); diff --git a/fluid/DeepASR/decoder/pybind.cc b/fluid/DeepASR/decoder/pybind.cc index 151eae3b83..4a9b27d4cf 100644 --- a/fluid/DeepASR/decoder/pybind.cc +++ b/fluid/DeepASR/decoder/pybind.cc @@ -27,6 +27,7 @@ PYBIND11_MODULE(post_latgen_faster_mapped, m) { std::string, std::string, std::string, + size_t, kaldi::BaseFloat>()) .def("decode_from_file", (void (Decoder::*)(std::string, size_t)) & Decoder::decode_from_file, diff --git a/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh b/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh index 990daee375..60a48ba5da 100644 --- a/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh +++ b/fluid/DeepASR/examples/aishell/infer_by_ckpt.sh @@ -4,10 +4,11 @@ export CUDA_VISIBLE_DEVICES=2,3,4,5 python -u ../../infer_by_ckpt.py --batch_size 96 \ --checkpoint checkpoints/deep_asr.pass_20.checkpoint \ --infer_feature_lst data/test_feature.lst \ - --mean_var data/aishell/global_mean_var \ + --mean_var data/global_mean_var \ --frame_dim 80 \ --class_num 3040 \ --num_threads 24 \ + --beam_size 11 \ --decode_to_path $decode_to_path \ --trans_model mapped_decoder_data/exp/tri5a/final.mdl \ --log_prior mapped_decoder_data/logprior \ diff --git a/fluid/DeepASR/infer_by_ckpt.py b/fluid/DeepASR/infer_by_ckpt.py index 881b5ba225..1e0fb15c6d 100644 --- a/fluid/DeepASR/infer_by_ckpt.py +++ b/fluid/DeepASR/infer_by_ckpt.py @@ -27,6 +27,11 @@ def parse_args(): type=int, default=32, help='The sequence number of a batch data. (default: %(default)d)') + parser.add_argument( + '--beam_size', + type=int, + default=11, + help='The beam size for decoding. (default: %(default)d)') parser.add_argument( '--minimum_batch_size', type=int, @@ -211,7 +216,7 @@ def infer_from_ckpt(args): # init decoder decoder = Decoder(args.trans_model, args.vocabulary, args.graphs, - args.log_prior, args.acoustic_scale) + args.log_prior, args.beam_size, args.acoustic_scale) ltrans = [ trans_add_delta.TransAddDelta(2, 2), From db42a954cdd605b3ecf521e086bd5e7cc00a036f Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Mon, 2 Jul 2018 05:44:40 -0700 Subject: [PATCH 10/10] Disable output config in cpp end --- fluid/DeepASR/decoder/post_latgen_faster_mapped.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc b/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc index f83730ce51..ad8aaa8480 100644 --- a/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc +++ b/fluid/DeepASR/decoder/post_latgen_faster_mapped.cc @@ -56,7 +56,6 @@ Decoder::Decoder(std::string trans_model_in_filename, po.Read(argc, argv); - po.PrintConfig(std::cout); std::ifstream is_logprior(logprior_in_filename); logprior.Read(is_logprior, false); @@ -294,12 +293,13 @@ std::string Decoder::DecodeUtteranceLatticeFaster( // We'll write the lattice without acoustic scaling. if (acoustic_scale != 0.0) fst::ScaleLattice(fst::AcousticLatticeScale(1.0 / acoustic_scale), &clat); - compact_lattice_writer.Write(utt, clat); + // disable output lattice temporarily + // compact_lattice_writer.Write(utt, clat); } else { // We'll write the lattice without acoustic scaling. if (acoustic_scale != 0.0) fst::ScaleLattice(fst::AcousticLatticeScale(1.0 / acoustic_scale), &lat); - lattice_writer.Write(utt, lat); + // lattice_writer.Write(utt, lat); } return ret; }