Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[speechx]add aishell test script & json parser & no db norm linear feature & json2kaldi type cmvn #1676

Merged
merged 6 commits into from
Apr 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
500 changes: 500 additions & 0 deletions speechx/examples/aishell/local/compute-wer.py

Large diffs are not rendered by default.

24 changes: 24 additions & 0 deletions speechx/examples/aishell/local/split_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env bash

data=$1
feat_scp=$2
split_feat_name=$3
numsplit=$4


if ! [ "$numsplit" -gt 0 ]; then
echo "Invalid num-split argument";
exit 1;
fi

directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done)
feat_split_scp=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_feat_name}; done)
echo $feat_split_scp
# if this mkdir fails due to argument-list being too long, iterate.
if ! mkdir -p $directories >&/dev/null; then
for n in `seq $numsplit`; do
mkdir -p $data/split${numsplit}/$n
done
fi

utils/split_scp.pl $feat_scp $feat_split_scp
14 changes: 14 additions & 0 deletions speechx/examples/aishell/path.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# This contains the locations of binarys build required for running the examples.

SPEECHX_ROOT=$PWD/../..
SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples

SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin

[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }

export LC_AL=C

SPEECHX_BIN=$SPEECHX_EXAMPLES/decoder:$SPEECHX_EXAMPLES/feat
export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
81 changes: 81 additions & 0 deletions speechx/examples/aishell/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/bin/bash
set +x
set -e

. path.sh

# 1. compile
if [ ! -d ${SPEECHX_EXAMPLES} ]; then
pushd ${SPEECHX_ROOT}
bash build.sh
popd
fi


# 2. download model
if [ ! -d ../paddle_asr_model ]; then
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/paddle_asr_model.tar.gz
tar xzfv paddle_asr_model.tar.gz
mv ./paddle_asr_model ../
# produce wav scp
echo "utt1 " $PWD/../paddle_asr_model/BAC009S0764W0290.wav > ../paddle_asr_model/wav.scp
fi

mkdir -p data
data=$PWD/data
aishell_wav_scp=aishell_test.scp
if [ ! -d $data/test ]; then
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
unzip -d $data aishell_test.zip
realpath $data/test/*/*.wav > $data/wavlist
awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
fi

model_dir=$PWD/aishell_ds2_online_model
if [ ! -d $model_dir ]; then
mkdir -p $model_dir
wget -P $model_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
tar xzfv $model_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $model_dir
fi

# 3. make feature
aishell_online_model=$model_dir/exp/deepspeech2_online/checkpoints
lm_model_dir=../paddle_asr_model
label_file=./aishell_result
wer=./aishell_wer

nj=40
export GLOG_logtostderr=1

./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj

data=$PWD/data
# 3. gen linear feat
cmvn=$PWD/cmvn.ark
cmvn_json2binary_main --json_file=$model_dir/data/mean_std.json --cmvn_write_path=$cmvn

utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat_log \
linear_spectrogram_without_db_norm_main \
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
--feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
--cmvn_file=$cmvn \
--streaming_chunk=0.36

text=$data/test/text

# 4. recognizer
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log \
offline_decoder_sliding_chunk_main \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--model_path=$aishell_online_model/avg_1.jit.pdmodel \
--param_path=$aishell_online_model/avg_1.jit.pdiparams \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--dict_file=$lm_model_dir/vocab.txt \
--lm_path=$lm_model_dir/avg_1.jit.klm \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result

cat $data/split${nj}/*/result > $label_file

local/compute-wer.py --char=1 --v=1 $label_file $text > $wer
tail $wer
1 change: 1 addition & 0 deletions speechx/examples/aishell/utils
16 changes: 13 additions & 3 deletions speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@
#include "nnet/decodable.h"
#include "nnet/paddle_nnet.h"

DEFINE_string(feature_respecifier, "", "test feature rspecifier");
DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
DEFINE_string(result_wspecifier, "", "test result wspecifier");
DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
Expand All @@ -33,6 +34,12 @@ DEFINE_int32(receptive_field_length,
DEFINE_int32(downsampling_rate,
4,
"two CNN(kernel=5) module downsampling rate.");
DEFINE_string(model_output_names,
"save_infer_model/scale_0.tmp_1,save_infer_model/"
"scale_1.tmp_1,save_infer_model/scale_2.tmp_1,save_infer_model/"
"scale_3.tmp_1",
"model output names");
DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names");

using kaldi::BaseFloat;
using kaldi::Matrix;
Expand All @@ -45,7 +52,8 @@ int main(int argc, char* argv[]) {
google::InitGoogleLogging(argv[0]);

kaldi::SequentialBaseFloatMatrixReader feature_reader(
FLAGS_feature_respecifier);
FLAGS_feature_rspecifier);
kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
std::string model_graph = FLAGS_model_path;
std::string model_params = FLAGS_param_path;
std::string dict_file = FLAGS_dict_file;
Expand All @@ -66,7 +74,8 @@ int main(int argc, char* argv[]) {
ppspeech::ModelOptions model_opts;
model_opts.model_path = model_graph;
model_opts.params_path = model_params;
model_opts.cache_shape = "5-1-1024,5-1-1024";
model_opts.cache_shape = FLAGS_model_cache_names;
model_opts.output_names = FLAGS_model_output_names;
std::shared_ptr<ppspeech::PaddleNnet> nnet(
new ppspeech::PaddleNnet(model_opts));
std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
Expand Down Expand Up @@ -130,6 +139,7 @@ int main(int argc, char* argv[]) {
std::string result;
result = decoder.GetFinalBestPath();
KALDI_LOG << " the result of " << utt << " is " << result;
result_writer.Write(utt, result);
decodable->Reset();
decoder.Reset();
++num_done;
Expand Down
10 changes: 9 additions & 1 deletion speechx/examples/feat/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,12 @@ target_link_libraries(mfcc-test kaldi-mfcc)

add_executable(linear_spectrogram_main ${CMAKE_CURRENT_SOURCE_DIR}/linear_spectrogram_main.cc)
target_include_directories(linear_spectrogram_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat-common gflags glog)
target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat-common gflags glog)

add_executable(linear_spectrogram_without_db_norm_main ${CMAKE_CURRENT_SOURCE_DIR}/linear_spectrogram_without_db_norm_main.cc)
target_include_directories(linear_spectrogram_without_db_norm_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
target_link_libraries(linear_spectrogram_without_db_norm_main frontend kaldi-util kaldi-feat-common gflags glog)

add_executable(cmvn_json2binary_main ${CMAKE_CURRENT_SOURCE_DIR}/cmvn_json2binary_main.cc)
target_include_directories(cmvn_json2binary_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
target_link_libraries(cmvn_json2binary_main utils kaldi-util kaldi-matrix gflags glog)
58 changes: 58 additions & 0 deletions speechx/examples/feat/cmvn_json2binary_main.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "base/flags.h"
#include "base/log.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "kaldi/util/kaldi-io.h"
#include "utils/file_utils.h"
#include "utils/simdjson.h"

DEFINE_string(json_file, "", "cmvn json file");
DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");
DEFINE_bool(binary, true, "write cmvn in binary (true) or text(false)");

using namespace simdjson;

int main(int argc, char* argv[]) {
gflags::ParseCommandLineFlags(&argc, &argv, false);
google::InitGoogleLogging(argv[0]);

ondemand::parser parser;
padded_string json = padded_string::load(FLAGS_json_file);
ondemand::document val = parser.iterate(json);
ondemand::object doc = val;
kaldi::int32 frame_num = uint64_t(doc["frame_num"]);
auto mean_stat = doc["mean_stat"];
std::vector<kaldi::BaseFloat> mean_stat_vec;
for (double x : mean_stat) {
mean_stat_vec.push_back(x);
}
auto var_stat = doc["var_stat"];
std::vector<kaldi::BaseFloat> var_stat_vec;
for (double x : var_stat) {
var_stat_vec.push_back(x);
}

size_t mean_size = mean_stat_vec.size();
kaldi::Matrix<double> cmvn_stats(2, mean_size + 1);
for (size_t idx = 0; idx < mean_size; ++idx) {
cmvn_stats(0, idx) = mean_stat_vec[idx];
cmvn_stats(1, idx) = var_stat_vec[idx];
}
cmvn_stats(0, mean_size) = frame_num;
kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, FLAGS_binary);
LOG(INFO) << "the json file have write into " << FLAGS_cmvn_write_path;
return 0;
}
5 changes: 4 additions & 1 deletion speechx/examples/feat/linear_spectrogram_main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
DEFINE_string(wav_rspecifier, "", "test wav scp path");
DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");
DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");


std::vector<float> mean_{
Expand Down Expand Up @@ -181,6 +182,7 @@ int main(int argc, char* argv[]) {
ppspeech::LinearSpectrogramOptions opt;
opt.frame_opts.frame_length_ms = 20;
opt.frame_opts.frame_shift_ms = 10;
opt.streaming_chunk = FLAGS_streaming_chunk;
opt.frame_opts.dither = 0.0;
opt.frame_opts.remove_dc_offset = false;
opt.frame_opts.window_type = "hanning";
Expand All @@ -198,7 +200,7 @@ int main(int argc, char* argv[]) {
LOG(INFO) << "feat dim: " << feature_cache.Dim();

int sample_rate = 16000;
float streaming_chunk = 0.36;
float streaming_chunk = FLAGS_streaming_chunk;
int chunk_sample_size = streaming_chunk * sample_rate;
LOG(INFO) << "sr: " << sample_rate;
LOG(INFO) << "chunk size (s): " << streaming_chunk;
Expand Down Expand Up @@ -256,6 +258,7 @@ int main(int argc, char* argv[]) {
}
}
feat_writer.Write(utt, features);
feature_cache.Reset();

if (num_done % 50 == 0 && num_done != 0)
KALDI_VLOG(2) << "Processed " << num_done << " utterances";
Expand Down
Loading