Merge pull request #1676 from SmileGoat/add_aishell_eg

[speechx]add aishell test script & json parser & no db norm linear feature & json2kaldi type cmvn
PaddlePaddle · Apr 9, 2022 · 664cc9c · 664cc9c
2 parents 2f97b81 + f0c5bd6
commit 664cc9c
Show file tree

Hide file tree

Showing 18 changed files with 43,192 additions and 10 deletions.
diff --git a/speechx/examples/aishell/local/compute-wer.py b/speechx/examples/aishell/local/compute-wer.py
diff --git a/speechx/examples/aishell/local/split_data.sh b/speechx/examples/aishell/local/split_data.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+data=$1
+feat_scp=$2
+split_feat_name=$3
+numsplit=$4
+
+
+if ! [ "$numsplit" -gt 0 ]; then
+  echo "Invalid num-split argument";
+  exit 1;
+fi
+
+directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done)
+feat_split_scp=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_feat_name}; done)
+echo $feat_split_scp
+# if this mkdir fails due to argument-list being too long, iterate.
+if ! mkdir -p $directories >&/dev/null; then
+  for n in `seq $numsplit`; do
+    mkdir -p $data/split${numsplit}/$n
+  done
+fi
+
+utils/split_scp.pl $feat_scp $feat_split_scp
diff --git a/speechx/examples/aishell/path.sh b/speechx/examples/aishell/path.sh
@@ -0,0 +1,14 @@
+# This contains the locations of binarys build required for running the examples.
+
+SPEECHX_ROOT=$PWD/../..
+SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
+
+SPEECHX_TOOLS=$SPEECHX_ROOT/tools
+TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
+
+[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
+
+export LC_AL=C
+
+SPEECHX_BIN=$SPEECHX_EXAMPLES/decoder:$SPEECHX_EXAMPLES/feat
+export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
diff --git a/speechx/examples/aishell/run.sh b/speechx/examples/aishell/run.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+set +x
+set -e
+
+. path.sh
+
+# 1. compile
+if [ ! -d ${SPEECHX_EXAMPLES} ]; then
+    pushd ${SPEECHX_ROOT} 
+    bash build.sh
+    popd
+fi
+
+
+# 2. download model
+if [ ! -d ../paddle_asr_model ]; then
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/paddle_asr_model.tar.gz
+    tar xzfv paddle_asr_model.tar.gz
+    mv ./paddle_asr_model ../
+    # produce wav scp
+    echo "utt1 " $PWD/../paddle_asr_model/BAC009S0764W0290.wav > ../paddle_asr_model/wav.scp
+fi
+
+mkdir -p data
+data=$PWD/data
+aishell_wav_scp=aishell_test.scp
+if [ ! -d $data/test ]; then
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
+    unzip -d $data aishell_test.zip
+    realpath $data/test/*/*.wav > $data/wavlist
+    awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
+    paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
+fi
+
+model_dir=$PWD/aishell_ds2_online_model
+if [ ! -d $model_dir ]; then
+    mkdir -p $model_dir 
+    wget -P $model_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    tar xzfv $model_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $model_dir
+fi
+
+# 3. make feature
+aishell_online_model=$model_dir/exp/deepspeech2_online/checkpoints
+lm_model_dir=../paddle_asr_model
+label_file=./aishell_result
+wer=./aishell_wer
+
+nj=40
+export GLOG_logtostderr=1
+
+./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
+
+data=$PWD/data
+# 3. gen linear feat
+cmvn=$PWD/cmvn.ark
+cmvn_json2binary_main --json_file=$model_dir/data/mean_std.json --cmvn_write_path=$cmvn
+
+utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat_log \
+linear_spectrogram_without_db_norm_main \
+    --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
+    --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
+    --cmvn_file=$cmvn \
+    --streaming_chunk=0.36
+
+text=$data/test/text
+
+# 4. recognizer
+utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log \
+  offline_decoder_sliding_chunk_main \
+    --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
+    --model_path=$aishell_online_model/avg_1.jit.pdmodel \
+    --param_path=$aishell_online_model/avg_1.jit.pdiparams \
+    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
+    --dict_file=$lm_model_dir/vocab.txt \
+    --lm_path=$lm_model_dir/avg_1.jit.klm \
+    --result_wspecifier=ark,t:$data/split${nj}/JOB/result
+
+cat $data/split${nj}/*/result > $label_file
+
+local/compute-wer.py --char=1 --v=1 $label_file $text > $wer
+tail $wer
diff --git a/speechx/examples/aishell/utils b/speechx/examples/aishell/utils
@@ -0,0 +1 @@
+../../../utils
diff --git a/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc b/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc
@@ -22,7 +22,8 @@
 #include "nnet/decodable.h"
 #include "nnet/paddle_nnet.h"
 
-DEFINE_string(feature_respecifier, "", "test feature rspecifier");
+DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
+DEFINE_string(result_wspecifier, "", "test result wspecifier");
 DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
 DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
 DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
@@ -33,6 +34,12 @@ DEFINE_int32(receptive_field_length,
 DEFINE_int32(downsampling_rate,
              4,
              "two CNN(kernel=5) module downsampling rate.");
+DEFINE_string(model_output_names,
+              "save_infer_model/scale_0.tmp_1,save_infer_model/"
+              "scale_1.tmp_1,save_infer_model/scale_2.tmp_1,save_infer_model/"
+              "scale_3.tmp_1",
+              "model output names");
+DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names");
 
 using kaldi::BaseFloat;
 using kaldi::Matrix;
@@ -45,7 +52,8 @@ int main(int argc, char* argv[]) {
     google::InitGoogleLogging(argv[0]);
 
     kaldi::SequentialBaseFloatMatrixReader feature_reader(
-        FLAGS_feature_respecifier);
+        FLAGS_feature_rspecifier);
+    kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
     std::string model_graph = FLAGS_model_path;
     std::string model_params = FLAGS_param_path;
     std::string dict_file = FLAGS_dict_file;
@@ -66,7 +74,8 @@ int main(int argc, char* argv[]) {
     ppspeech::ModelOptions model_opts;
     model_opts.model_path = model_graph;
     model_opts.params_path = model_params;
-    model_opts.cache_shape = "5-1-1024,5-1-1024";
+    model_opts.cache_shape = FLAGS_model_cache_names;
+    model_opts.output_names = FLAGS_model_output_names;
     std::shared_ptr<ppspeech::PaddleNnet> nnet(
         new ppspeech::PaddleNnet(model_opts));
     std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
@@ -130,6 +139,7 @@ int main(int argc, char* argv[]) {
         std::string result;
         result = decoder.GetFinalBestPath();
         KALDI_LOG << " the result of " << utt << " is " << result;
+        result_writer.Write(utt, result);
         decodable->Reset();
         decoder.Reset();
         ++num_done;

diff --git a/speechx/examples/feat/CMakeLists.txt b/speechx/examples/feat/CMakeLists.txt
@@ -7,4 +7,12 @@ target_link_libraries(mfcc-test kaldi-mfcc)
 
 add_executable(linear_spectrogram_main ${CMAKE_CURRENT_SOURCE_DIR}/linear_spectrogram_main.cc)
 target_include_directories(linear_spectrogram_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat-common gflags glog)
+target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat-common gflags glog)
+
+add_executable(linear_spectrogram_without_db_norm_main ${CMAKE_CURRENT_SOURCE_DIR}/linear_spectrogram_without_db_norm_main.cc)
+target_include_directories(linear_spectrogram_without_db_norm_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(linear_spectrogram_without_db_norm_main frontend kaldi-util kaldi-feat-common gflags glog)
+
+add_executable(cmvn_json2binary_main ${CMAKE_CURRENT_SOURCE_DIR}/cmvn_json2binary_main.cc)
+target_include_directories(cmvn_json2binary_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(cmvn_json2binary_main utils kaldi-util kaldi-matrix gflags glog)
diff --git a/speechx/examples/feat/cmvn_json2binary_main.cc b/speechx/examples/feat/cmvn_json2binary_main.cc
@@ -0,0 +1,58 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/flags.h"
+#include "base/log.h"
+#include "kaldi/matrix/kaldi-matrix.h"
+#include "kaldi/util/kaldi-io.h"
+#include "utils/file_utils.h"
+#include "utils/simdjson.h"
+
+DEFINE_string(json_file, "", "cmvn json file");
+DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");
+DEFINE_bool(binary, true, "write cmvn in binary (true) or text(false)");
+
+using namespace simdjson;
+
+int main(int argc, char* argv[]) {
+    gflags::ParseCommandLineFlags(&argc, &argv, false);
+    google::InitGoogleLogging(argv[0]);
+
+    ondemand::parser parser;
+    padded_string json = padded_string::load(FLAGS_json_file);
+    ondemand::document val = parser.iterate(json);
+    ondemand::object doc = val;
+    kaldi::int32 frame_num = uint64_t(doc["frame_num"]);
+    auto mean_stat = doc["mean_stat"];
+    std::vector<kaldi::BaseFloat> mean_stat_vec;
+    for (double x : mean_stat) {
+        mean_stat_vec.push_back(x);
+    }
+    auto var_stat = doc["var_stat"];
+    std::vector<kaldi::BaseFloat> var_stat_vec;
+    for (double x : var_stat) {
+        var_stat_vec.push_back(x);
+    }
+
+    size_t mean_size = mean_stat_vec.size();
+    kaldi::Matrix<double> cmvn_stats(2, mean_size + 1);
+    for (size_t idx = 0; idx < mean_size; ++idx) {
+        cmvn_stats(0, idx) = mean_stat_vec[idx];
+        cmvn_stats(1, idx) = var_stat_vec[idx];
+    }
+    cmvn_stats(0, mean_size) = frame_num;
+    kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, FLAGS_binary);
+    LOG(INFO) << "the json file have write into " << FLAGS_cmvn_write_path;
+    return 0;
+}
diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc
@@ -30,6 +30,7 @@
 DEFINE_string(wav_rspecifier, "", "test wav scp path");
 DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
 DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");
+DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
 
 
 std::vector<float> mean_{
@@ -181,6 +182,7 @@ int main(int argc, char* argv[]) {
     ppspeech::LinearSpectrogramOptions opt;
     opt.frame_opts.frame_length_ms = 20;
     opt.frame_opts.frame_shift_ms = 10;
+    opt.streaming_chunk = FLAGS_streaming_chunk;
     opt.frame_opts.dither = 0.0;
     opt.frame_opts.remove_dc_offset = false;
     opt.frame_opts.window_type = "hanning";
@@ -198,7 +200,7 @@ int main(int argc, char* argv[]) {
     LOG(INFO) << "feat dim: " << feature_cache.Dim();
 
     int sample_rate = 16000;
-    float streaming_chunk = 0.36;
+    float streaming_chunk = FLAGS_streaming_chunk;
     int chunk_sample_size = streaming_chunk * sample_rate;
     LOG(INFO) << "sr: " << sample_rate;
     LOG(INFO) << "chunk size (s): " << streaming_chunk;
@@ -256,6 +258,7 @@ int main(int argc, char* argv[]) {
             }
         }
         feat_writer.Write(utt, features);
+        feature_cache.Reset();
 
         if (num_done % 50 == 0 && num_done != 0)
             KALDI_VLOG(2) << "Processed " << num_done << " utterances";