From 2e769fb36ce88dabfa984e8b81e8cb1c90799c95 Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Fri, 07 Apr 2023 15:54:09 +0800
Subject: [PATCH] Merge branch 'main' into dev_cmz2

---
 funasr/runtime/grpc/Readme.md                                                                                            |   62 +++
 funasr/train/trainer.py                                                                                                  |    7 
 egs/librispeech/conformer/conf/decode_asr_transformer.yaml                                                               |    6 
 funasr/export/export_model.py                                                                                            |    8 
 funasr/runtime/python/libtorch/README.md                                                                                 |    2 
 funasr/bin/asr_inference_paraformer.py                                                                                   |    2 
 funasr/bin/asr_inference_uniasr_vad.py                                                                                   |    4 
 funasr/bin/asr_inference_paraformer_vad_punc.py                                                                          |    2 
 funasr/runtime/python/libtorch/funasr_torch/paraformer_bin.py                                                            |   10 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py |    2 
 funasr/runtime/grpc/paraformer_server.cc                                                                                 |   26 
 funasr/bin/asr_inference_paraformer_vad.py                                                                               |    2 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py                   |    2 
 funasr/export/models/modules/multihead_att.py                                                                            |    4 
 funasr/tasks/abs_task.py                                                                                                 |   21 
 egs/librispeech/conformer/conf/train_asr_conformer_uttnorm.yaml                                                          |   80 ++++
 funasr/bin/asr_inference_uniasr.py                                                                                       |    4 
 egs/librispeech/conformer/conf/train_asr_conformer.yaml                                                                  |   80 ++++
 funasr/datasets/large_datasets/utils/tokenize.py                                                                         |    2 
 funasr/modules/embedding.py                                                                                              |   13 
 funasr/version.txt                                                                                                       |    2 
 egs/librispeech/conformer/path.sh                                                                                        |    5 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh                |    4 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py      |    2 
 funasr/bin/asr_inference_rnnt.py                                                                                         |    4 
 egs/aishell/transformer/utils/cmvn_converter.py                                                                          |   53 +++
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py    |    2 
 funasr/models/decoder/sanm_decoder.py                                                                                    |   13 
 egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py                  |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer.py                      |    2 
 funasr/runtime/python/onnxruntime/demo.py                                                                                |   16 
 egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer.py                              |    2 
 funasr/bin/asr_inference_paraformer_streaming.py                                                                         |  105 ++++-
 egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py               |    2 
 funasr/models/e2e_asr_paraformer.py                                                                                      |   90 ++++
 egs/librispeech/conformer/local/data_prep_librispeech.sh                                                                 |   58 +++
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.sh                   |    4 
 funasr/models/decoder/contextual_decoder.py                                                                              |    6 
 funasr/tasks/asr.py                                                                                                      |    6 
 funasr/runtime/python/libtorch/demo.py                                                                                   |   15 
 egs/librispeech/conformer/run.sh                                                                                         |  262 +++++++++++++++
 egs/librispeech/conformer/utils                                                                                          |    1 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py                     |    2 
 egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer.py                                 |    2 
 funasr/runtime/python/onnxruntime/README.md                                                                              |    2 
 egs/aishell/transformer/utils/compute_wer.py                                                                             |    4 
 funasr/runtime/python/grpc/grpc_server.py                                                                                |    2 
 funasr/utils/compute_wer.py                                                                                              |    4 
 funasr/models/predictor/cif.py                                                                                           |    4 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py                |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py       |    2 
 51 files changed, 887 insertions(+), 132 deletions(-)

diff --git a/egs/aishell/transformer/utils/cmvn_converter.py b/egs/aishell/transformer/utils/cmvn_converter.py
new file mode 100644
index 0000000..cb978af
--- /dev/null
+++ b/egs/aishell/transformer/utils/cmvn_converter.py
@@ -0,0 +1,53 @@
+import argparse
+import json
+import numpy as np
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        description="cmvn converter",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--cmvn-json",
+        "-c",
+        default=False,
+        required=True,
+        type=str,
+        help="cmvn json file",
+    )
+    parser.add_argument(
+        "--am-mvn",
+        "-a",
+        default=False,
+        required=True,
+        type=str,
+        help="am mvn file",
+    )
+    return parser
+
+def main():
+    parser = get_parser()
+    args = parser.parse_args()
+
+    with open(args.cmvn_json, "r") as fin:
+        cmvn_dict = json.load(fin)
+
+    mean_stats = np.array(cmvn_dict["mean_stats"])
+    var_stats = np.array(cmvn_dict["var_stats"])
+    total_frame = np.array(cmvn_dict["total_frames"])
+
+    mean = -1.0 * mean_stats / total_frame
+    var = 1.0 / np.sqrt(var_stats / total_frame - mean * mean)
+    dims = mean.shape[0]
+    with open(args.am_mvn, 'w') as fout:
+        fout.write("<Nnet>" + "\n" + "<Splice> " + str(dims) + " " + str(dims) + '\n' + "[ 0 ]" + "\n" + "<AddShift> " + str(dims) + " " + str(dims) + "\n")
+        mean_str = str(list(mean)).replace(',', '').replace('[', '[ ').replace(']', ' ]')
+        fout.write("<LearnRateCoef> 0 " + mean_str + '\n')
+        fout.write("<Rescale> " + str(dims) + " " + str(dims) + '\n')
+        var_str = str(list(var)).replace(',', '').replace('[', '[ ').replace(']', ' ]')
+        fout.write("<LearnRateCoef> 0 " + var_str + '\n')
+        fout.write("</Nnet>" + '\n')
+
+if __name__ == '__main__':
+    main()
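
Note: the converter above follows the standard accumulated-statistics form of CMVN: the Kaldi-style <AddShift> coefficients are the negated feature means and the <Rescale> coefficients are inverse standard deviations, so applying shift then scale standardizes each dimension. A minimal NumPy sketch checking that arithmetic (toy statistics, not produced by the script):

    import numpy as np

    # Toy accumulated stats over N frames of a 2-dim feature.
    feats = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]])
    total_frames = feats.shape[0]
    mean_stats = feats.sum(axis=0)        # sum(x)
    var_stats = (feats ** 2).sum(axis=0)  # sum(x^2)

    # Same formulas as cmvn_converter.py: negated mean for <AddShift>,
    # inverse std 1/sqrt(E[x^2] - E[x]^2) for <Rescale>.
    shift = -mean_stats / total_frames
    scale = 1.0 / np.sqrt(var_stats / total_frames - (mean_stats / total_frames) ** 2)

    normalized = (feats + shift) * scale
    print(normalized.mean(axis=0))  # ~[0, 0]
    print(normalized.std(axis=0))   # ~[1, 1]
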
diff --git a/egs/aishell/transformer/utils/compute_wer.py b/egs/aishell/transformer/utils/compute_wer.py
index 349a3f6..26a9f49 100755
--- a/egs/aishell/transformer/utils/compute_wer.py
+++ b/egs/aishell/transformer/utils/compute_wer.py
@@ -45,8 +45,8 @@
            if out_item['wrong'] > 0:
                rst['wrong_sentences'] += 1
            cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
-           cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
-           cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')
+           cer_detail_writer.write("ref:" + '\t' + " ".join(list(map(lambda x: x.lower(), ref_dict[hyp_key]))) + '\n')
+           cer_detail_writer.write("hyp:" + '\t' + " ".join(list(map(lambda x: x.lower(), hyp_dict[hyp_key]))) + '\n')
 
     if rst['Wrd'] > 0:
         rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)
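
Note: the change above switches the ref/hyp dump from a concatenated token string to a lower-cased, space-separated rendering, which is what whitespace-splitting English/BPE scoring expects. A tiny illustration (tokens are made up):

    ref_tokens = ["HELLO", "WORLD"]
    old_line = "".join(ref_tokens)                       # "HELLOWORLD"
    new_line = " ".join(t.lower() for t in ref_tokens)   # "hello world"
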
diff --git a/egs/librispeech/conformer/conf/decode_asr_transformer.yaml b/egs/librispeech/conformer/conf/decode_asr_transformer.yaml
new file mode 100644
index 0000000..a147fa7
--- /dev/null
+++ b/egs/librispeech/conformer/conf/decode_asr_transformer.yaml
@@ -0,0 +1,6 @@
+beam_size: 10
+penalty: 0.0
+maxlenratio: 0.0
+minlenratio: 0.0
+ctc_weight: 0.5
+lm_weight: 0.7
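
Note: ctc_weight and lm_weight here drive the usual joint CTC/attention decoding with shallow LM fusion. Assuming the standard ESPnet-style combination (the exact logic lives in the beam search implementation), a hypothesis is scored roughly as:

    # Hedged sketch of the joint score implied by the config above.
    ctc_weight, lm_weight = 0.5, 0.7

    def hyp_score(att_score, ctc_score, lm_score):
        # (1 - w_ctc) * attention + w_ctc * CTC + w_lm * language model
        return (1.0 - ctc_weight) * att_score + ctc_weight * ctc_score + lm_weight * lm_score
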
diff --git a/egs/librispeech/conformer/conf/train_asr_conformer.yaml b/egs/librispeech/conformer/conf/train_asr_conformer.yaml
new file mode 100644
index 0000000..68b127f
--- /dev/null
+++ b/egs/librispeech/conformer/conf/train_asr_conformer.yaml
@@ -0,0 +1,80 @@
+encoder: conformer
+encoder_conf:
+    output_size: 512
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 12
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    input_layer: conv2d
+    normalize_before: true
+    macaron_style: true
+    rel_pos_type: latest
+    pos_enc_layer_type: rel_pos
+    selfattention_layer_type: rel_selfattn
+    activation_type: swish
+    use_cnn_module: true
+    cnn_module_kernel: 31
+
+decoder: transformer
+decoder_conf:
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.1
+    src_attention_dropout_rate: 0.1
+
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1
+    length_normalized_loss: false
+
+accum_grad: 2
+max_epoch: 50
+patience: none
+init: none
+best_model_criterion:
+-   - valid
+    - acc
+    - max
+keep_nbest_models: 10
+
+optim: adam
+optim_conf:
+    lr: 0.0025
+    weight_decay: 0.000001
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 40000
+
+specaug: specaug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 27
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_ratio_range:
+    - 0.
+    - 0.05
+    num_time_mask: 10
+
+dataset_conf:
+    shuffle: True
+    shuffle_conf:
+        shuffle_size: 1024
+        sort_size: 500
+    batch_conf:
+        batch_type: token
+        batch_size: 10000
+    num_workers: 8
+
+log_interval: 50
+normalize: none
\ No newline at end of file
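
Note: the optim/scheduler block above corresponds to a Noam-style warmup schedule. Assuming the usual warmuplr definition (the peak rate lr is reached at warmup_steps, then decays as an inverse square root), the per-step rate looks like:

    # Hedged sketch of "warmuplr" for lr=0.0025, warmup_steps=40000.
    base_lr, warmup_steps = 0.0025, 40000

    def lr_at(step):
        return base_lr * warmup_steps ** 0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

    print(lr_at(1000))          # still warming up
    print(lr_at(warmup_steps))  # peak, equals base_lr
    print(lr_at(160000))        # decayed to base_lr / 2
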
diff --git a/egs/librispeech/conformer/conf/train_asr_conformer_uttnorm.yaml b/egs/librispeech/conformer/conf/train_asr_conformer_uttnorm.yaml
new file mode 100644
index 0000000..16b7cc0
--- /dev/null
+++ b/egs/librispeech/conformer/conf/train_asr_conformer_uttnorm.yaml
@@ -0,0 +1,80 @@
+encoder: conformer
+encoder_conf:
+    output_size: 512
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 12
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    input_layer: conv2d
+    normalize_before: true
+    macaron_style: true
+    rel_pos_type: latest
+    pos_enc_layer_type: rel_pos
+    selfattention_layer_type: rel_selfattn
+    activation_type: swish
+    use_cnn_module: true
+    cnn_module_kernel: 31
+
+decoder: transformer
+decoder_conf:
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.1
+    src_attention_dropout_rate: 0.1
+
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1
+    length_normalized_loss: false
+
+accum_grad: 2
+max_epoch: 50
+patience: none
+init: none
+best_model_criterion:
+-   - valid
+    - acc
+    - max
+keep_nbest_models: 10
+
+optim: adam
+optim_conf:
+    lr: 0.0025
+    weight_decay: 0.000001
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 40000
+
+specaug: specaug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 27
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_ratio_range:
+    - 0.
+    - 0.05
+    num_time_mask: 10
+
+dataset_conf:
+    shuffle: True
+    shuffle_conf:
+        shuffle_size: 1024
+        sort_size: 500
+    batch_conf:
+        batch_type: token
+        batch_size: 10000
+    num_workers: 8
+
+log_interval: 50
+normalize: utterance_mvn
\ No newline at end of file
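
Note: this config is identical to train_asr_conformer.yaml except for `normalize: utterance_mvn`, i.e. each utterance is standardized by its own statistics instead of the global CMVN computed in stage 1. A rough sketch of what utterance-level MVN computes (illustrative, not the funasr implementation):

    import numpy as np

    def utterance_mvn(feats, eps=1e-20):
        # feats: (num_frames, feat_dim); normalize with this utterance's own stats.
        mean = feats.mean(axis=0, keepdims=True)
        std = feats.std(axis=0, keepdims=True)
        return (feats - mean) / np.maximum(std, eps)
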
diff --git a/egs/librispeech/conformer/local/data_prep_librispeech.sh b/egs/librispeech/conformer/local/data_prep_librispeech.sh
new file mode 100755
index 0000000..c939b5f
--- /dev/null
+++ b/egs/librispeech/conformer/local/data_prep_librispeech.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+
+# Copyright 2014  Vassil Panayotov
+#           2014  Johns Hopkins University (author: Daniel Povey)
+# Apache 2.0
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 <src-dir> <dst-dir>"
+  echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean"
+  exit 1
+fi
+
+src=$1
+dst=$2
+
+# all utterances are FLAC compressed
+if ! which flac >&/dev/null; then
+   echo "Please install 'flac' on ALL worker nodes!"
+   exit 1
+fi
+
+spk_file=$src/../SPEAKERS.TXT
+
+mkdir -p $dst || exit 1
+
+[ ! -d $src ] && echo "$0: no such directory $src" && exit 1
+[ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1
+
+
+wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp
+trans=$dst/text; [[ -f "$trans" ]] && rm $trans
+
+for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do
+  reader=$(basename $reader_dir)
+  if ! [ $reader -eq $reader ]; then  # not integer.
+    echo "$0: unexpected subdirectory name $reader"
+    exit 1
+  fi
+
+  for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do
+    chapter=$(basename $chapter_dir)
+    if ! [ "$chapter" -eq "$chapter" ]; then
+      echo "$0: unexpected chapter-subdirectory name $chapter"
+      exit 1
+    fi
+
+    find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \
+      awk -v "dir=$chapter_dir" '{printf "%s %s/%s.flac\n", $0, dir, $0}' >> $wav_scp || exit 1
+
+    chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt
+    [ ! -f  $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1
+    cat $chapter_trans >>$trans
+  done
+done
+
+echo "$0: successfully prepared data in $dst"
+
+exit 0
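
Note: the script emits the two Kaldi-style files consumed by later stages, one entry per utterance (schematic, derived from the awk/cat lines above):

    wav.scp   <reader>-<chapter>-<utt> <chapter_dir>/<reader>-<chapter>-<utt>.flac
    text      <reader>-<chapter>-<utt> <TRANSCRIPT FROM *.trans.txt>
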
diff --git a/egs/librispeech/conformer/path.sh b/egs/librispeech/conformer/path.sh
new file mode 100755
index 0000000..7972642
--- /dev/null
+++ b/egs/librispeech/conformer/path.sh
@@ -0,0 +1,5 @@
+export FUNASR_DIR=$PWD/../../..
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PATH=$FUNASR_DIR/funasr/bin:$PATH
diff --git a/egs/librispeech/conformer/run.sh b/egs/librispeech/conformer/run.sh
new file mode 100755
index 0000000..93d1b46
--- /dev/null
+++ b/egs/librispeech/conformer/run.sh
@@ -0,0 +1,262 @@
+#!/usr/bin/env bash
+
+. ./path.sh || exit 1;
+
+# machines configuration
+CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+gpu_num=8
+count=1
+gpu_inference=true  # Whether to perform gpu decoding, set false for cpu decoding
+# for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
+njob=5
+train_cmd=utils/run.pl
+infer_cmd=utils/run.pl
+
+# general configuration
+feats_dir="../DATA" # feature output directory
+exp_dir="."
+lang=en
+dumpdir=dump/fbank
+feats_type=fbank
+token_type=bpe
+dataset_type=large
+scp=feats.scp
+type=kaldi_ark
+stage=3
+stop_stage=4
+
+# feature configuration
+feats_dim=80
+sample_frequency=16000
+nj=100
+speed_perturb="0.9,1.0,1.1"
+
+# data
+data_librispeech=
+
+# bpe model
+nbpe=5000
+bpemode=unigram
+
+# exp tag
+tag=""
+
+. utils/parse_options.sh || exit 1;
+
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+set -u
+set -o pipefail
+
+train_set=train_960
+valid_set=dev
+test_sets="test_clean test_other dev_clean dev_other"
+
+asr_config=conf/train_asr_conformer.yaml
+#asr_config=conf/train_asr_conformer_uttnorm.yaml
+model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
+
+inference_config=conf/decode_asr_transformer.yaml
+#inference_config=conf/decode_asr_transformer_beam60_ctc0.3.yaml
+inference_asr_model=valid.acc.ave_10best.pth
+
+# you can set gpu num for decoding here
+gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
+ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
+
+if ${gpu_inference}; then
+    inference_nj=$[${ngpu}*${njob}]
+    _ngpu=1
+else
+    inference_nj=$njob
+    _ngpu=0
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    echo "stage 0: Data preparation"
+    # Data preparation
+    for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
+        local/data_prep_librispeech.sh ${data_librispeech}/LibriSpeech/${x} ${feats_dir}/data/${x//-/_}
+    done
+fi
+
+feat_train_dir=${feats_dir}/${dumpdir}/$train_set; mkdir -p ${feat_train_dir}
+feat_dev_clean_dir=${feats_dir}/${dumpdir}/dev_clean; mkdir -p ${feat_dev_clean_dir}
+feat_dev_other_dir=${feats_dir}/${dumpdir}/dev_other; mkdir -p ${feat_dev_other_dir}
+feat_test_clean_dir=${feats_dir}/${dumpdir}/test_clean; mkdir -p ${feat_test_clean_dir}
+feat_test_other_dir=${feats_dir}/${dumpdir}/test_other; mkdir -p ${feat_test_other_dir}
+feat_dev_dir=${feats_dir}/${dumpdir}/$valid_set; mkdir -p ${feat_dev_dir}
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "stage 1: Feature Generation"
+    # compute fbank features
+    fbankdir=${feats_dir}/fbank
+    for x in dev_clean dev_other test_clean test_other; do
+        utils/compute_fbank.sh --cmd "$train_cmd" --nj 1 --max_lengths 3000 --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} \
+            ${feats_dir}/data/${x} ${exp_dir}/exp/make_fbank/${x} ${fbankdir}/${x}
+        utils/fix_data_feat.sh ${fbankdir}/${x}
+    done
+
+    mkdir ${feats_dir}/data/$train_set
+    train_sets="train_clean_100 train_clean_360 train_other_500"
+    for file in wav.scp text; do
+        ( for f in $train_sets; do cat $feats_dir/data/$f/$file; done ) | sort -k1 > $feats_dir/data/$train_set/$file || exit 1;
+    done
+    utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --max_lengths 3000 --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} --speed_perturb ${speed_perturb} \
+    ${feats_dir}/data/$train_set ${exp_dir}/exp/make_fbank/$train_set ${fbankdir}/$train_set
+    utils/fix_data_feat.sh ${fbankdir}/$train_set
+
+    # compute global cmvn
+    utils/compute_cmvn.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} \
+        ${fbankdir}/$train_set ${exp_dir}/exp/make_fbank/$train_set
+
+    # apply cmvn
+    utils/apply_cmvn.sh --cmd "$train_cmd" --nj $nj \
+        ${fbankdir}/$train_set ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/$train_set ${feat_train_dir}
+    utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \
+        ${fbankdir}/dev_clean ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/dev_clean ${feat_dev_clean_dir}
+    utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \
+        ${fbankdir}/dev_other ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/dev_other ${feat_dev_other_dir}
+    utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \
+        ${fbankdir}/test_clean ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/test_clean ${feat_test_clean_dir}
+    utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \
+        ${fbankdir}/test_other ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/test_other ${feat_test_other_dir}
+
+    cp ${fbankdir}/$train_set/text ${fbankdir}/$train_set/speech_shape ${fbankdir}/$train_set/text_shape ${feat_train_dir}
+    cp ${fbankdir}/dev_clean/text ${fbankdir}/dev_clean/speech_shape ${fbankdir}/dev_clean/text_shape ${feat_dev_clean_dir}
+    cp ${fbankdir}/dev_other/text ${fbankdir}/dev_other/speech_shape ${fbankdir}/dev_other/text_shape ${feat_dev_other_dir}
+    cp ${fbankdir}/test_clean/text ${fbankdir}/test_clean/speech_shape ${fbankdir}/test_clean/text_shape ${feat_test_clean_dir}
+    cp ${fbankdir}/test_other/text ${fbankdir}/test_other/speech_shape ${fbankdir}/test_other/text_shape ${feat_test_other_dir}
+
+    dev_sets="dev_clean dev_other"
+    for file in feats.scp text speech_shape text_shape; do
+        ( for f in $dev_sets; do cat $feats_dir/${dumpdir}/$f/$file; done ) | sort -k1 > $feat_dev_dir/$file || exit 1;
+    done
+
+    #generate ark list
+    utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_train_dir} ${fbankdir}/${train_set} ${feat_train_dir}
+    utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_dev_dir} ${fbankdir}/${valid_set} ${feat_dev_dir}
+fi
+
+dict=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
+bpemodel=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}
+echo "dictionary: ${dict}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    ### Task dependent. You have to check non-linguistic symbols used in the corpus.
+    echo "stage 2: Dictionary and Json Data Preparation"
+    mkdir -p ${feats_dir}/data/lang_char/
+    echo "<blank>" > ${dict}
+    echo "<s>" >> ${dict}
+    echo "</s>" >> ${dict}
+    cut -f 2- -d" " ${feats_dir}/data/${train_set}/text > ${feats_dir}/data/lang_char/input.txt
+    spm_train --input=${feats_dir}/data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
+    spm_encode --model=${bpemodel}.model --output_format=piece < ${feats_dir}/data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0}' >> ${dict}
+    echo "<unk>" >> ${dict}
+    wc -l ${dict}
+
+    vocab_size=$(cat ${dict} | wc -l)
+    awk -v v=,${vocab_size} '{print $0v}' ${feat_train_dir}/text_shape > ${feat_train_dir}/text_shape.char
+    awk -v v=,${vocab_size} '{print $0v}' ${feat_dev_dir}/text_shape > ${feat_dev_dir}/text_shape.char
+    mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/$train_set
+    mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/$valid_set
+    cp ${feat_train_dir}/speech_shape ${feat_train_dir}/text_shape ${feat_train_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/$train_set
+    cp ${feat_dev_dir}/speech_shape ${feat_dev_dir}/text_shape ${feat_dev_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/$valid_set
+fi
+
+
+# Training Stage
+world_size=$gpu_num  # run on one machine
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "stage 3: Training"
+    mkdir -p ${exp_dir}/exp/${model_dir}
+    mkdir -p ${exp_dir}/exp/${model_dir}/log
+    INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init
+    if [ -f $INIT_FILE ];then
+        rm -f $INIT_FILE
+    fi
+    init_method=file://$(readlink -f $INIT_FILE)
+    echo "$0: init method is $init_method"
+    for ((i = 0; i < $gpu_num; ++i)); do
+        {
+            rank=$i
+            local_rank=$i
+            gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
+            asr_train.py \
+                --gpu_id $gpu_id \
+                --use_preprocessor true \
+                --split_with_space false \
+                --bpemodel ${bpemodel}.model \
+                --token_type $token_type \
+                --dataset_type $dataset_type \
+                --token_list $dict \
+                --train_data_file $feats_dir/$dumpdir/${train_set}/ark_txt.scp \
+                --valid_data_file $feats_dir/$dumpdir/${valid_set}/ark_txt.scp \
+                --resume true \
+                --output_dir ${exp_dir}/exp/${model_dir} \
+                --config $asr_config \
+                --input_size $feats_dim \
+                --ngpu $gpu_num \
+                --num_worker_count $count \
+                --multiprocessing_distributed true \
+                --dist_init_method $init_method \
+                --dist_world_size $world_size \
+                --dist_rank $rank \
+                --local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
+        } &
+    done
+    wait
+fi
+
+# Testing Stage
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "stage 4: Inference"
+    for dset in ${test_sets}; do
+        asr_exp=${exp_dir}/exp/${model_dir}
+        inference_tag="$(basename "${inference_config}" .yaml)"
+        _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
+        _logdir="${_dir}/logdir"
+        if [ -d ${_dir} ]; then
+            echo "${_dir} already exists. If you want to decode again, please delete this dir first."
+            exit 0
+        fi
+        mkdir -p "${_logdir}"
+        _data="${feats_dir}/${dumpdir}/${dset}"
+        key_file=${_data}/${scp}
+        num_scp_file="$(<${key_file} wc -l)"
+        _nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file")
+        split_scps=
+        for n in $(seq "${_nj}"); do
+            split_scps+=" ${_logdir}/keys.${n}.scp"
+        done
+        # shellcheck disable=SC2086
+        utils/split_scp.pl "${key_file}" ${split_scps}
+        _opts=
+        if [ -n "${inference_config}" ]; then
+            _opts+="--config ${inference_config} "
+        fi
+        ${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
+            python -m funasr.bin.asr_inference_launch \
+                --batch_size 1 \
+                --ngpu "${_ngpu}" \
+                --njob ${njob} \
+                --gpuid_list ${gpuid_list} \
+                --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
+                --key_file "${_logdir}"/keys.JOB.scp \
+                --asr_train_config "${asr_exp}"/config.yaml \
+                --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
+                --output_dir "${_logdir}"/output.JOB \
+                --mode asr \
+                ${_opts}
+
+        for f in token token_int score text; do
+            if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
+                for i in $(seq "${_nj}"); do
+                    cat "${_logdir}/output.${i}/1best_recog/${f}"
+                done | sort -k1 >"${_dir}/${f}"
+            fi
+        done
+        python utils/compute_wer.py ${_data}/text ${_dir}/text ${_dir}/text.cer
+        tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
+        cat ${_dir}/text.cer.txt
+    done
+fi
\ No newline at end of file
diff --git a/egs/librispeech/conformer/utils b/egs/librispeech/conformer/utils
new file mode 120000
index 0000000..fe070dd
--- /dev/null
+++ b/egs/librispeech/conformer/utils
@@ -0,0 +1 @@
+../../aishell/transformer/utils
\ No newline at end of file
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer.py b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer.py
index c016c19..77b2cbd 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer.py
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer.py
@@ -74,7 +74,7 @@
     # If text exists, compute CER
     text_in = os.path.join(params["data_dir"], "text")
     if os.path.exists(text_in):
-        text_proc_file = os.path.join(best_recog_path, "token")
+        text_proc_file = os.path.join(best_recog_path, "text")
         compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
 
 
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
index b326067..488936c 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
@@ -38,7 +38,7 @@
     # compute CER if GT text is set
     text_in = os.path.join(params["data_dir"], "text")
     if os.path.exists(text_in):
-        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+        text_proc_file = os.path.join(decoding_path, "1best_recog/text")
         compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
 
 
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer.py b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer.py
index 54cfec0..0d06377 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer.py
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer.py
@@ -74,7 +74,7 @@
     # If text exists, compute CER
     text_in = os.path.join(params["data_dir"], "text")
     if os.path.exists(text_in):
-        text_proc_file = os.path.join(best_recog_path, "token")
+        text_proc_file = os.path.join(best_recog_path, "text")
         compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
 
 
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
index 2f038a8..c94f685 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
@@ -38,7 +38,7 @@
     # compute CER if GT text is set
     text_in = os.path.join(params["data_dir"], "text")
     if os.path.exists(text_in):
-        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+        text_proc_file = os.path.join(decoding_path, "1best_recog/text")
         compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
 
 
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
index 6726a41..9f280d5 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
@@ -17,7 +17,7 @@
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
-    parser.add_argument('--audio_in', type=str, default="./data/test")
+    parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
     parser.add_argument('--output_dir', type=str, default="./results/")
     parser.add_argument('--batch_size', type=int, default=64)
     parser.add_argument('--gpuid', type=str, default="0")
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
index f080257..221479d 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
@@ -63,8 +63,8 @@
 
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
     echo "Computing WER ..."
-    python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
-    python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref
+    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
+    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
     python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
     tail -n 3 ${output_dir}/1best_recog/text.cer
 fi
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
index 295c95d..2d311dd 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
@@ -34,7 +34,7 @@
     # compute CER if GT text is set
     text_in = os.path.join(params["data_dir"], "text")
     if os.path.exists(text_in):
-        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+        text_proc_file = os.path.join(decoding_path, "1best_recog/text")
         compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
 
 
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py
index b4f633a..7cc71e7 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py
@@ -17,7 +17,7 @@
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument('--model', type=str, default="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1")
-    parser.add_argument('--audio_in', type=str, default="./data/test")
+    parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
     parser.add_argument('--output_dir', type=str, default="./results/")
     parser.add_argument('--batch_size', type=int, default=64)
     parser.add_argument('--gpuid', type=str, default="0")
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.sh
index cdf81dc..6daf7d4 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.sh
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.sh
@@ -63,8 +63,8 @@
 
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
     echo "Computing WER ..."
-    python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
-    python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref
+    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
+    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
     python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
     tail -n 3 ${output_dir}/1best_recog/text.cer
 fi
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
index e8fee02..747b49f 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
@@ -34,7 +34,7 @@
     # compute CER if GT text is set
     text_in = os.path.join(params["data_dir"], "text")
     if os.path.exists(text_in):
-        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+        text_proc_file = os.path.join(decoding_path, "1best_recog/text")
         compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
 
 
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py
index 5d74837..96db5f9 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py
@@ -75,7 +75,7 @@
     # If text exists, compute CER
     text_in = os.path.join(params["data_dir"], "text")
     if os.path.exists(text_in):
-        text_proc_file = os.path.join(best_recog_path, "token")
+        text_proc_file = os.path.join(best_recog_path, "text")
         compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
 
 
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
index 861fefb..74691f0 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
@@ -39,7 +39,7 @@
     # compute CER if GT text is set
     text_in = os.path.join(params["data_dir"], "text")
     if os.path.exists(text_in):
-        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+        text_proc_file = os.path.join(decoding_path, "1best_recog/text")
         compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
 
 
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer.py
index 5c62362..8b4a04d 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer.py
@@ -75,7 +75,7 @@
     # If text exists, compute CER
     text_in = os.path.join(params["data_dir"], "text")
     if os.path.exists(text_in):
-        text_proc_file = os.path.join(best_recog_path, "token")
+        text_proc_file = os.path.join(best_recog_path, "text")
         compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
 
 
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
index d73cae2..fd124ff 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
@@ -39,7 +39,7 @@
     # compute CER if GT text is set
     text_in = os.path.join(params["data_dir"], "text")
     if os.path.exists(text_in):
-        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+        text_proc_file = os.path.join(decoding_path, "1best_recog/text")
         compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
 
 
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index 2eeffcd..8cbd419 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -797,7 +797,7 @@
                         finish_count += 1
                         # asr_utils.print_progress(finish_count / file_count)
                         if writer is not None:
-                            ibest_writer["text"][key] = text_postprocessed
+                            ibest_writer["text"][key] = " ".join(word_lists)
 
                     logging.info("decoding, utt: {}, predictions: {}".format(key, text))
         rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor))
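
Note: the same `" ".join(word_lists)` substitution recurs in the VAD, RNN-T, and UniASR entry points below. Assuming sentence_postprocess returns both a display string and a list of scoring units, the point is to keep one whitespace-separated unit per element so compute_wer can split the line. Hedged illustration with made-up values (the real outputs come from funasr.utils.postprocess_utils.sentence_postprocess):

    text_postprocessed = "你好world"    # display form, characters run together
    word_lists = ["你", "好", "world"]  # one scoring unit per element
    print(" ".join(word_lists))         # "你 好 world" -- what "text" now stores
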
diff --git a/funasr/bin/asr_inference_paraformer_streaming.py b/funasr/bin/asr_inference_paraformer_streaming.py
index 907f190..66dec39 100644
--- a/funasr/bin/asr_inference_paraformer_streaming.py
+++ b/funasr/bin/asr_inference_paraformer_streaming.py
@@ -42,6 +42,7 @@
 from funasr.models.frontend.wav_frontend import WavFrontend
 from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
 from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
+np.set_printoptions(threshold=np.inf)
 
 class Speech2Text:
     """Speech2Text class
@@ -203,7 +204,6 @@
         # Input as audio signal
         if isinstance(speech, np.ndarray):
             speech = torch.tensor(speech)
-
         if self.frontend is not None:
             feats, feats_len = self.frontend.forward(speech, speech_lengths)
             feats = to_device(feats, device=self.device)
@@ -213,13 +213,16 @@
             feats = speech
             feats_len = speech_lengths
         lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
+        feats_len = cache["encoder"]["stride"] + cache["encoder"]["pad_left"] + cache["encoder"]["pad_right"]
+        feats = feats[:,cache["encoder"]["start_idx"]:cache["encoder"]["start_idx"]+feats_len,:]
+        feats_len = torch.tensor([feats_len])
         batch = {"speech": feats, "speech_lengths": feats_len, "cache": cache}
 
         # a. To device
         batch = to_device(batch, device=self.device)
 
         # b. Forward Encoder
-        enc, enc_len = self.asr_model.encode_chunk(**batch)
+        enc, enc_len = self.asr_model.encode_chunk(feats, feats_len, cache)
         if isinstance(enc, tuple):
             enc = enc[0]
         # assert len(enc) == 1, len(enc)
@@ -578,7 +581,22 @@
         speech2text = Speech2TextExport(**speech2text_kwargs)
     else:
         speech2text = Speech2Text(**speech2text_kwargs)
+        
+    def _load_bytes(input):
+        middle_data = np.frombuffer(input, dtype=np.int16)
+        middle_data = np.asarray(middle_data)
+        if middle_data.dtype.kind not in 'iu':
+            raise TypeError("'middle_data' must be an array of integers")
+        dtype = np.dtype('float32')
+        if dtype.kind != 'f':
+            raise TypeError("'dtype' must be a floating point type")
 
+        i = np.iinfo(middle_data.dtype)
+        abs_max = 2 ** (i.bits - 1)
+        offset = i.min + abs_max
+        array = (middle_data.astype(dtype) - offset) / abs_max
+        return array
+    
     def _forward(
             data_path_and_name_and_type,
             raw_inputs: Union[np.ndarray, torch.Tensor] = None,
@@ -589,10 +607,12 @@
     ):
 
         # 3. Build data-iterator
+        if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "bytes":
+            raw_inputs = _load_bytes(data_path_and_name_and_type[0])
+            raw_inputs = torch.tensor(raw_inputs)
         if data_path_and_name_and_type is None and raw_inputs is not None:
             if isinstance(raw_inputs, np.ndarray):
                 raw_inputs = torch.tensor(raw_inputs)
-
         is_final = False
         if param_dict is not None and "cache" in param_dict:
             cache = param_dict["cache"]
@@ -605,62 +625,87 @@
         asr_result = ""
         wait = True
         if len(cache) == 0:
-            cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None}
+            cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None, "is_final": is_final, "left": 0, "right": 0}
             cache_de = {"decode_fsmn": None}
             cache["decoder"] = cache_de
             cache["first_chunk"] = True
             cache["speech"] = []
-            cache["chunk_index"] = 0
-            cache["speech_chunk"] = []
+            cache["accum_speech"] = 0
 
         if raw_inputs is not None:
             if len(cache["speech"]) == 0:
                 cache["speech"] = raw_inputs
             else:
                 cache["speech"] = torch.cat([cache["speech"], raw_inputs], dim=0)
-            if len(cache["speech_chunk"]) == 0:
-                cache["speech_chunk"] = raw_inputs
-            else:
-                cache["speech_chunk"] = torch.cat([cache["speech_chunk"], raw_inputs], dim=0)
-            while len(cache["speech_chunk"]) >= 960:
+            cache["accum_speech"] += len(raw_inputs)
+            while cache["accum_speech"] >= 960:
                 if cache["first_chunk"]:
-                    if len(cache["speech_chunk"]) >= 14400:
-                        speech = torch.unsqueeze(cache["speech_chunk"][0:14400], axis=0)
-                        speech_length = torch.tensor([14400])
+                    if cache["accum_speech"] >= 14400:
+                        speech = torch.unsqueeze(cache["speech"], axis=0)
+                        speech_length = torch.tensor([len(cache["speech"])])
+                        cache["encoder"]["pad_left"] = 5 
+                        cache["encoder"]["pad_right"] = 5 
+                        cache["encoder"]["stride"] = 10
+                        cache["encoder"]["left"] = 5
+                        cache["encoder"]["right"] = 0
                         results = speech2text(cache, speech, speech_length)
-                        cache["speech_chunk"]= cache["speech_chunk"][4800:]
+                        cache["accum_speech"] -= 4800
                         cache["first_chunk"] = False
                         cache["encoder"]["start_idx"] = -5
+                        cache["encoder"]["is_final"] = False
                         wait = False
                     else:
                         if is_final:
-                            cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960
+                            cache["encoder"]["stride"] = len(cache["speech"]) // 960
+                            cache["encoder"]["pad_left"] = 0
                             cache["encoder"]["pad_right"] = 0
-                            speech = torch.unsqueeze(cache["speech_chunk"], axis=0)
-                            speech_length = torch.tensor([len(cache["speech_chunk"])])
+                            speech = torch.unsqueeze(cache["speech"], axis=0)
+                            speech_length = torch.tensor([len(cache["speech"])])
                             results = speech2text(cache, speech, speech_length)
-                            cache["speech_chunk"] = []
+                            cache["accum_speech"] = 0
                             wait = False
                         else:
                             break
                 else:
-                    if len(cache["speech_chunk"]) >= 19200:
+                    if cache["accum_speech"] >= 19200:
                         cache["encoder"]["start_idx"] += 10
+                        cache["encoder"]["stride"] = 10
                         cache["encoder"]["pad_left"] = 5
-                        speech = torch.unsqueeze(cache["speech_chunk"][:19200], axis=0)
-                        speech_length = torch.tensor([19200])
+                        cache["encoder"]["pad_right"] = 5
+                        cache["encoder"]["left"] = 0
+                        cache["encoder"]["right"] = 0
+                        speech = torch.unsqueeze(cache["speech"], axis=0)
+                        speech_length = torch.tensor([len(cache["speech"])])
                         results = speech2text(cache, speech, speech_length)
-                        cache["speech_chunk"] = cache["speech_chunk"][9600:]
+                        cache["accum_speech"] -= 9600
                         wait = False
                     else:
                         if is_final:
-                            cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960
-                            cache["encoder"]["pad_right"] = 0
-                            speech = torch.unsqueeze(cache["speech_chunk"], axis=0)
-                            speech_length = torch.tensor([len(cache["speech_chunk"])])
-                            results = speech2text(cache, speech, speech_length)
-                            cache["speech_chunk"] = []
-                            wait = False
+                            cache["encoder"]["is_final"] = True
+                            if cache["accum_speech"] >= 14400:
+                                cache["encoder"]["start_idx"] += 10
+                                cache["encoder"]["stride"] = 10
+                                cache["encoder"]["pad_left"] = 5
+                                cache["encoder"]["pad_right"] = 5
+                                cache["encoder"]["left"] = 0
+                                cache["encoder"]["right"] = cache["accum_speech"] // 960 - 15
+                                speech = torch.unsqueeze(cache["speech"], axis=0)
+                                speech_length = torch.tensor([len(cache["speech"])])
+                                results = speech2text(cache, speech, speech_length)
+                                cache["accum_speech"] -= 9600
+                                wait = False
+                            else:
+                                cache["encoder"]["start_idx"] += 10
+                                cache["encoder"]["stride"] = cache["accum_speech"] // 960 - 5
+                                cache["encoder"]["pad_left"] = 5
+                                cache["encoder"]["pad_right"] = 0
+                                cache["encoder"]["left"] = 0
+                                cache["encoder"]["right"] = 0
+                                speech = torch.unsqueeze(cache["speech"], axis=0)
+                                speech_length = torch.tensor([len(cache["speech"])])
+                                results = speech2text(cache, speech, speech_length)
+                                cache["accum_speech"] = 0
+                                wait = False
                         else:
                             break
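
Note: the chunk constants in this hunk are 16 kHz sample counts. Under that assumption, 960 samples is one 60 ms encoder step, the first chunk fires once 14400 samples (900 ms) have accumulated and consumes 4800 (300 ms), and steady-state chunks fire at 19200 samples (1200 ms) and consume 9600 (600 ms), matching the stride/pad frame counts kept in cache["encoder"]. A sketch of the cadence:

    # Hedged sketch of the streaming cadence implied by the constants above
    # (assumes 16 kHz input; 960 samples = 60 ms = one encoder frame).
    SAMPLE_RATE = 16000
    for samples, label in [(960, "one encoder frame"),
                           (14400, "first-chunk trigger"),
                           (4800, "first-chunk advance"),
                           (19200, "steady-state trigger"),
                           (9600, "steady-state advance")]:
        print(f"{label}: {samples} samples = {1000 * samples // SAMPLE_RATE} ms")
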
                 
diff --git a/funasr/bin/asr_inference_paraformer_vad.py b/funasr/bin/asr_inference_paraformer_vad.py
index a0dc0aa..1548f9f 100644
--- a/funasr/bin/asr_inference_paraformer_vad.py
+++ b/funasr/bin/asr_inference_paraformer_vad.py
@@ -338,7 +338,7 @@
                     ibest_writer["token"][key] = " ".join(token)
                     ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                     ibest_writer["vad"][key] = "{}".format(vadsegments)
-                    ibest_writer["text"][key] = text_postprocessed
+                    ibest_writer["text"][key] = " ".join(word_lists)
                     ibest_writer["text_with_punc"][key] = text_postprocessed_punc
                     if time_stamp_postprocessed is not None:
                         ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)
diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py
index ab3e1e3..9dc0b79 100644
--- a/funasr/bin/asr_inference_paraformer_vad_punc.py
+++ b/funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -670,7 +670,7 @@
                     ibest_writer["token"][key] = " ".join(token)
                     ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                     ibest_writer["vad"][key] = "{}".format(vadsegments)
-                    ibest_writer["text"][key] = text_postprocessed
+                    ibest_writer["text"][key] = " ".join(word_lists)
                     ibest_writer["text_with_punc"][key] = text_postprocessed_punc
                     if time_stamp_postprocessed is not None:
                         ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)
diff --git a/funasr/bin/asr_inference_rnnt.py b/funasr/bin/asr_inference_rnnt.py
index 4a9ff0b..2189a71 100644
--- a/funasr/bin/asr_inference_rnnt.py
+++ b/funasr/bin/asr_inference_rnnt.py
@@ -738,13 +738,13 @@
                         ibest_writer["rtf"][key] = rtf_cur
 
                     if text is not None:
-                        text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
+                        text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
                         item = {'key': key, 'value': text_postprocessed}
                         asr_result_list.append(item)
                         finish_count += 1
                         # asr_utils.print_progress(finish_count / file_count)
                         if writer is not None:
-                            ibest_writer["text"][key] = text_postprocessed
+                            ibest_writer["text"][key] = " ".join(word_lists)
 
                     logging.info("decoding, utt: {}, predictions: {}".format(key, text))
         rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor))
diff --git a/funasr/bin/asr_inference_uniasr.py b/funasr/bin/asr_inference_uniasr.py
index 1286bc2..4aea720 100644
--- a/funasr/bin/asr_inference_uniasr.py
+++ b/funasr/bin/asr_inference_uniasr.py
@@ -504,13 +504,13 @@
                     ibest_writer["score"][key] = str(hyp.score)
     
                 if text is not None:
-                    text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
+                    text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
                     item = {'key': key, 'value': text_postprocessed}
                     asr_result_list.append(item)
                     finish_count += 1
                     asr_utils.print_progress(finish_count / file_count)
                     if writer is not None:
-                        ibest_writer["text"][key] = text_postprocessed
+                        ibest_writer["text"][key] = " ".join(word_lists)
         return asr_result_list
     
     return _forward
diff --git a/funasr/bin/asr_inference_uniasr_vad.py b/funasr/bin/asr_inference_uniasr_vad.py
index 3164d0d..52c29b8 100644
--- a/funasr/bin/asr_inference_uniasr_vad.py
+++ b/funasr/bin/asr_inference_uniasr_vad.py
@@ -507,13 +507,13 @@
                     ibest_writer["score"][key] = str(hyp.score)
     
                 if text is not None:
-                    text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
+                    text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
                     item = {'key': key, 'value': text_postprocessed}
                     asr_result_list.append(item)
                     finish_count += 1
                     asr_utils.print_progress(finish_count / file_count)
                     if writer is not None:
-                        ibest_writer["text"][key] = text_postprocessed
+                        ibest_writer["text"][key] = " ".join(word_lists)
         return asr_result_list
     
     return _forward
diff --git a/funasr/datasets/large_datasets/utils/tokenize.py b/funasr/datasets/large_datasets/utils/tokenize.py
index 3f20c5f..d8ceff2 100644
--- a/funasr/datasets/large_datasets/utils/tokenize.py
+++ b/funasr/datasets/large_datasets/utils/tokenize.py
@@ -37,7 +37,7 @@
     vad = -2
 
     if bpe_tokenizer is not None:
-        text = bpe_tokenizer.text2tokens(text)
+        text = bpe_tokenizer.text2tokens("".join(text))
 
     if seg_dict is not None:
         assert isinstance(seg_dict, dict)
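
A note on the `"".join(text)` fix above: `text` evidently arrives here as a sequence of string pieces, while `text2tokens` expects a single string, hence the join. A minimal sketch (values are illustrative only):

```python
# Illustrative only: `text` as a sequence of string pieces.
text = ["你", "好"]
joined = "".join(text)  # "你好": a single string, as text2tokens expects
```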
diff --git a/funasr/export/export_model.py b/funasr/export/export_model.py
index 444ccf4..b69eeee 100644
--- a/funasr/export/export_model.py
+++ b/funasr/export/export_model.py
@@ -19,6 +19,7 @@
         self,
         cache_dir: Union[Path, str] = None,
         onnx: bool = True,
+        device: str = "cpu",
         quant: bool = True,
         fallback_num: int = 0,
         audio_in: str = None,
@@ -36,6 +37,7 @@
         )
         print("output dir: {}".format(self.cache_dir))
         self.onnx = onnx
+        self.device = device
         self.quant = quant
         self.fallback_num = fallback_num
         self.frontend = None
@@ -111,6 +113,10 @@
             dummy_input = model.get_dummy_inputs(enc_size)
         else:
             dummy_input = model.get_dummy_inputs()
+
+        if self.device == 'cuda':
+            model = model.cuda()
+            dummy_input = tuple([i.cuda() for i in dummy_input])
 
         # model_script = torch.jit.script(model)
         model_script = torch.jit.trace(model, dummy_input)
@@ -260,6 +266,7 @@
     parser.add_argument('--model-name', type=str, required=True)
     parser.add_argument('--export-dir', type=str, required=True)
     parser.add_argument('--type', type=str, default='onnx', help='["onnx", "torch"]')
+    parser.add_argument('--device', type=str, default='cpu', help='["cpu", "cuda"]')
     parser.add_argument('--quantize', type=str2bool, default=False, help='export quantized model')
     parser.add_argument('--fallback-num', type=int, default=0, help='amp fallback number')
     parser.add_argument('--audio_in', type=str, default=None, help='["wav", "wav.scp"]')
@@ -269,6 +276,7 @@
     export_model = ModelExport(
         cache_dir=args.export_dir,
         onnx=args.type == 'onnx',
+        device=args.device,
         quant=args.quantize,
         fallback_num=args.fallback_num,
         audio_in=args.audio_in,
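
For reference, a hedged sketch of driving the new option from Python; the paths and flag values are placeholders, and any `ModelExport` arguments not shown in this patch are assumed to keep their defaults:

```python
from funasr.export.export_model import ModelExport

# Placeholder values; mirrors `--export-dir ./export --type torch --device cuda`.
export_model = ModelExport(
    cache_dir="./export",
    onnx=False,        # TorchScript export; the cuda branch guards torch.jit.trace
    device="cuda",     # new in this patch: model and dummy inputs are moved to GPU
    quant=False,
    fallback_num=0,
    audio_in=None,
)
```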
diff --git a/funasr/export/models/modules/multihead_att.py b/funasr/export/models/modules/multihead_att.py
index 1983db8..6fce851 100644
--- a/funasr/export/models/modules/multihead_att.py
+++ b/funasr/export/models/modules/multihead_att.py
@@ -75,8 +75,8 @@
     return x, cache
 
 
-torch_version = float(".".join(torch.__version__.split(".")[:2]))
-if torch_version >= 1.8:
+torch_version = tuple([int(i) for i in torch.__version__.split(".")[:2]])
+if torch_version >= (1, 8):
     import torch.fx
     torch.fx.wrap('preprocess_for_attn')
 
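
The tuple comparison above fixes a real ordering bug: parsing the version as a float makes torch 1.10 look older than 1.8. A quick sketch:

```python
# Float parsing collapses "1.10" to 1.1, which compares as < 1.8.
float(".".join("1.10.1".split(".")[:2]))           # 1.1
# Tuple parsing preserves numeric ordering of the minor version.
tuple(int(i) for i in "1.10.1".split(".")[:2])     # (1, 10) >= (1, 8)
```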
diff --git a/funasr/models/decoder/contextual_decoder.py b/funasr/models/decoder/contextual_decoder.py
index 32f550a..3b462e7 100644
--- a/funasr/models/decoder/contextual_decoder.py
+++ b/funasr/models/decoder/contextual_decoder.py
@@ -74,7 +74,7 @@
         return x, tgt_mask, x_self_attn, x_src_attn
 
 
-class ContexutalBiasDecoder(nn.Module):
+class ContextualBiasDecoder(nn.Module):
     def __init__(
         self,
         size,
@@ -83,7 +83,7 @@
         normalize_before=True,
     ):
         """Construct an DecoderLayer object."""
-        super(ContexutalBiasDecoder, self).__init__()
+        super(ContextualBiasDecoder, self).__init__()
         self.size = size
         self.src_attn = src_attn
         if src_attn is not None:
@@ -186,7 +186,7 @@
             ),
         )
         self.dropout = nn.Dropout(dropout_rate)
-        self.bias_decoder = ContexutalBiasDecoder(
+        self.bias_decoder = ContextualBiasDecoder(
             size=attention_dim,
             src_attn=MultiHeadedAttentionCrossAtt(
                 attention_heads, attention_dim, src_attention_dropout_rate
diff --git a/funasr/models/decoder/sanm_decoder.py b/funasr/models/decoder/sanm_decoder.py
index 3bfcffc..463918a 100644
--- a/funasr/models/decoder/sanm_decoder.py
+++ b/funasr/models/decoder/sanm_decoder.py
@@ -104,7 +104,6 @@
 
             x = residual + self.dropout(self.src_attn(x, memory, memory_mask))
 
-
         return x, tgt_mask, memory, memory_mask, cache
 
     def forward_chunk(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):
@@ -400,7 +399,7 @@
         for i in range(self.att_layer_num):
             decoder = self.decoders[i]
             c = cache[i]
-            x, tgt_mask, memory, memory_mask, c_ret = decoder(
+            x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
                 x, tgt_mask, memory, memory_mask, cache=c
             )
             new_cache.append(c_ret)
@@ -410,13 +409,13 @@
                 j = i + self.att_layer_num
                 decoder = self.decoders2[i]
                 c = cache[j]
-                x, tgt_mask, memory, memory_mask, c_ret = decoder(
+                x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
                     x, tgt_mask, memory, memory_mask, cache=c
                 )
                 new_cache.append(c_ret)
 
         for decoder in self.decoders3:
-            x, tgt_mask, memory, memory_mask, _ = decoder(
+            x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk(
                 x, tgt_mask, memory, None, cache=None
             )
 
@@ -1077,7 +1076,7 @@
         for i in range(self.att_layer_num):
             decoder = self.decoders[i]
             c = cache[i]
-            x, tgt_mask, memory, memory_mask, c_ret = decoder(
+            x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
                 x, tgt_mask, memory, None, cache=c
             )
             new_cache.append(c_ret)
@@ -1087,14 +1086,14 @@
                 j = i + self.att_layer_num
                 decoder = self.decoders2[i]
                 c = cache[j]
-                x, tgt_mask, memory, memory_mask, c_ret = decoder(
+                x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
                     x, tgt_mask, memory, None, cache=c
                 )
                 new_cache.append(c_ret)
 
         for decoder in self.decoders3:
 
-            x, tgt_mask, memory, memory_mask, _ = decoder(
+            x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk(
                 x, tgt_mask, memory, None, cache=None
             )
 
diff --git a/funasr/models/e2e_asr_paraformer.py b/funasr/models/e2e_asr_paraformer.py
index 02f60af..f1bb2bf 100644
--- a/funasr/models/e2e_asr_paraformer.py
+++ b/funasr/models/e2e_asr_paraformer.py
@@ -370,19 +370,10 @@
                 encoder_out, encoder_out_lens
             )
 
-        assert encoder_out.size(0) == speech.size(0), (
-            encoder_out.size(),
-            speech.size(0),
-        )
-        assert encoder_out.size(1) <= encoder_out_lens.max(), (
-            encoder_out.size(),
-            encoder_out_lens.max(),
-        )
-
         if intermediate_outs is not None:
             return (encoder_out, intermediate_outs), encoder_out_lens
 
-        return encoder_out, encoder_out_lens
+        return encoder_out, torch.tensor([encoder_out.size(1)])
 
     def calc_predictor(self, encoder_out, encoder_out_lens):
 
@@ -1034,16 +1025,76 @@
 
         # 1. Encoder
         encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
+        intermediate_outs = None
+        if isinstance(encoder_out, tuple):
+            intermediate_outs = encoder_out[1]
+            encoder_out = encoder_out[0]
 
+        loss_att, acc_att, cer_att, wer_att = None, None, None, None
+        loss_ctc, cer_ctc = None, None
+        loss_pre = None
         stats = dict()
+
+        # 1. CTC branch
+        if self.ctc_weight != 0.0:
+            loss_ctc, cer_ctc = self._calc_ctc_loss(
+                encoder_out, encoder_out_lens, text, text_lengths
+            )
+
+            # Collect CTC branch stats
+            stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None
+            stats["cer_ctc"] = cer_ctc
+
+        # Intermediate CTC (optional)
+        loss_interctc = 0.0
+        if self.interctc_weight != 0.0 and intermediate_outs is not None:
+            for layer_idx, intermediate_out in intermediate_outs:
+                # we assume intermediate_out has the same length & padding
+                # as those of encoder_out
+                loss_ic, cer_ic = self._calc_ctc_loss(
+                    intermediate_out, encoder_out_lens, text, text_lengths
+                )
+                loss_interctc = loss_interctc + loss_ic
+
+                # Collect intermediate CTC stats
+                stats["loss_interctc_layer{}".format(layer_idx)] = (
+                    loss_ic.detach() if loss_ic is not None else None
+                )
+                stats["cer_interctc_layer{}".format(layer_idx)] = cer_ic
+
+            loss_interctc = loss_interctc / len(intermediate_outs)
+
+            # calculate whole encoder loss
+            loss_ctc = (
+                1 - self.interctc_weight
+            ) * loss_ctc + self.interctc_weight * loss_interctc
+
+        # 2b. Attention decoder branch
+        if self.ctc_weight != 1.0:
+            loss_att, acc_att, cer_att, wer_att, loss_pre = self._calc_att_loss(
+                encoder_out, encoder_out_lens, text, text_lengths
+            )
 
         loss_pre2 = self._calc_pre2_loss(
             encoder_out, encoder_out_lens, text, text_lengths
         )
 
-        loss = loss_pre2
+        # 3. CTC-Att loss definition
+        if self.ctc_weight == 0.0:
+            loss = loss_att + loss_pre * self.predictor_weight + loss_pre2 * self.predictor_weight * 0.5
+        elif self.ctc_weight == 1.0:
+            loss = loss_ctc
+        else:
+            loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight + loss_pre2 * self.predictor_weight * 0.5
 
+        # Collect Attn branch stats
+        stats["loss_att"] = loss_att.detach() if loss_att is not None else None
+        stats["acc"] = acc_att
+        stats["cer"] = cer_att
+        stats["wer"] = wer_att
+        stats["loss_pre"] = loss_pre.detach().cpu() if loss_pre is not None else None
         stats["loss_pre2"] = loss_pre2.detach().cpu()
+
         stats["loss"] = torch.clone(loss.detach())
 
         # force_gatherable: to-device and to-tensor if scalar for DataParallel
@@ -1094,6 +1145,7 @@
             inner_dim: int = 256,
             bias_encoder_type: str = 'lstm',
             label_bracket: bool = False,
+            use_decoder_embedding: bool = False,
     ):
         assert check_argument_types()
         assert 0.0 <= ctc_weight <= 1.0, ctc_weight
@@ -1147,6 +1199,7 @@
             self.hotword_buffer = None
             self.length_record = []
             self.current_buffer_length = 0
+        self.use_decoder_embedding = use_decoder_embedding
 
     def forward(
             self,
@@ -1288,7 +1341,10 @@
                     hw_list.append(hw_tokens)
         # padding
         hw_list_pad = pad_list(hw_list, 0)
-        hw_embed = self.decoder.embed(hw_list_pad)
+        if self.use_decoder_embedding:
+            hw_embed = self.decoder.embed(hw_list_pad)
+        else:
+            hw_embed = self.bias_embed(hw_list_pad)
         hw_embed, (_, _) = self.bias_encoder(hw_embed)
         _ind = np.arange(0, len(hw_list)).tolist()
         # update self.hotword_buffer, throw a part if oversize
@@ -1404,13 +1460,19 @@
             # default hotword list
             hw_list = [torch.Tensor([self.sos]).long().to(encoder_out.device)]  # empty hotword list
             hw_list_pad = pad_list(hw_list, 0)
-            hw_embed = self.bias_embed(hw_list_pad)
+            if self.use_decoder_embedding:
+                hw_embed = self.decoder.embed(hw_list_pad)
+            else:
+                hw_embed = self.bias_embed(hw_list_pad)
             _, (h_n, _) = self.bias_encoder(hw_embed)
             contextual_info = h_n.squeeze(0).repeat(encoder_out.shape[0], 1, 1)
         else:
             hw_lengths = [len(i) for i in hw_list]
             hw_list_pad = pad_list([torch.Tensor(i).long() for i in hw_list], 0).to(encoder_out.device)
-            hw_embed = self.bias_embed(hw_list_pad)
+            if self.use_decoder_embedding:
+                hw_embed = self.decoder.embed(hw_list_pad)
+            else:
+                hw_embed = self.bias_embed(hw_list_pad)
             hw_embed = torch.nn.utils.rnn.pack_padded_sequence(hw_embed, hw_lengths, batch_first=True,
                                                                enforce_sorted=False)
             _, (h_n, _) = self.bias_encoder(hw_embed)
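
To make the restored loss definition concrete, here is a small numeric sketch of the weighting (all loss values below are made up):

```python
# Made-up values; mirrors the CTC-Att combination above.
ctc_weight, predictor_weight = 0.3, 1.0
loss_ctc, loss_att, loss_pre, loss_pre2 = 2.0, 1.5, 0.2, 0.1

loss = (ctc_weight * loss_ctc
        + (1 - ctc_weight) * loss_att
        + loss_pre * predictor_weight
        + loss_pre2 * predictor_weight * 0.5)
print(round(loss, 4))  # 0.6 + 1.05 + 0.2 + 0.05 = 1.9
```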
diff --git a/funasr/models/predictor/cif.py b/funasr/models/predictor/cif.py
index 74f3e68..e80a915 100644
--- a/funasr/models/predictor/cif.py
+++ b/funasr/models/predictor/cif.py
@@ -200,6 +200,7 @@
         return acoustic_embeds, token_num, alphas, cif_peak
 
     def forward_chunk(self, hidden, cache=None):
+        b, t, d = hidden.size()
         h = hidden
         context = h.transpose(1, 2)
         queries = self.pad(context)
@@ -220,6 +221,8 @@
             alphas = alphas * mask_chunk_predictor
       
         if cache is not None:
+            if cache["is_final"]:
+                alphas[:, cache["stride"] + cache["pad_left"] - 1] += 0.45
             if cache["cif_hidden"] is not None:
                 hidden = torch.cat((cache["cif_hidden"], hidden), 1)
             if cache["cif_alphas"] is not None:
@@ -241,7 +244,6 @@
                 mask_chunk_peak_predictor[:, :pre_alphas_length] = 1.0
             mask_chunk_peak_predictor[:, pre_alphas_length + cache["pad_left"]:pre_alphas_length + cache["stride"] + cache["pad_left"]] = 1.0
             
-
         if mask_chunk_peak_predictor is not None:
             cif_peak = cif_peak * mask_chunk_peak_predictor.squeeze(-1)
         
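
On the `+= 0.45` boost above: assuming the usual CIF convention that a token fires once the accumulated alpha crosses 1.0, a trailing remainder on the final chunk would otherwise never fire. A toy illustration:

```python
# Toy numbers: the tail of the final chunk has accumulated only 0.6 alpha,
# so no token would fire; the 0.45 boost pushes it over the 1.0 threshold.
tail_alpha = 0.6
tail_alpha += 0.45   # 1.05 -> the last token fires
```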
diff --git a/funasr/modules/embedding.py b/funasr/modules/embedding.py
index e4f9bff..79ca0b2 100644
--- a/funasr/modules/embedding.py
+++ b/funasr/modules/embedding.py
@@ -8,7 +8,7 @@
 
 import math
 import torch
-
+import torch.nn.functional as F
 
 def _pre_hook(
     state_dict,
@@ -409,9 +409,18 @@
 
     def forward_chunk(self, x, cache=None):
         start_idx = 0
+        pad_left = 0
+        pad_right = 0
         batch_size, timesteps, input_dim = x.size()
         if cache is not None:
             start_idx = cache["start_idx"]
+            pad_left = cache["left"]
+            pad_right = cache["right"]
         positions = torch.arange(1, timesteps+start_idx+1)[None, :]
         position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
-        return x + position_encoding[:, start_idx: start_idx + timesteps]
+        outputs = x + position_encoding[:, start_idx: start_idx + timesteps]
+        outputs = outputs.transpose(1, 2)
+        outputs = F.pad(outputs, (pad_left, pad_right))
+        outputs = outputs.transpose(1, 2)
+        return outputs
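
The padding added above works on the transposed tensor, so the last-dimension padding of `F.pad` lands on the time axis. A self-contained shape check (shapes are illustrative):

```python
import torch
import torch.nn.functional as F

x = torch.randn(1, 10, 4)   # (batch, time, dim)
pad_left, pad_right = 2, 3
y = F.pad(x.transpose(1, 2), (pad_left, pad_right)).transpose(1, 2)
print(y.shape)              # torch.Size([1, 15, 4]): time padded on both sides
```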
diff --git a/funasr/runtime/grpc/Readme.md b/funasr/runtime/grpc/Readme.md
index 6e3516a..82347be 100644
--- a/funasr/runtime/grpc/Readme.md
+++ b/funasr/runtime/grpc/Readme.md
@@ -53,6 +53,68 @@
 python grpc_main_client_mic.py  --host $server_ip --port 10108
 ```
 
+The `grpc_main_client_mic.py` client follows the [original design](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/grpc#workflow-in-desgin) by sending audio_data in chunks. If you want to send all audio_data in a single request, here is an example:
+
+```python
+import asyncio
+import json
+import queue
+import time
+
+import grpc
+import soundfile as sf
+
+# go to ../python/grpc to find this package
+import paraformer_pb2
+
+
+class RecognizeStub:
+    def __init__(self, channel):
+        self.Recognize = channel.stream_stream(
+                '/paraformer.ASR/Recognize',
+                request_serializer=paraformer_pb2.Request.SerializeToString,
+                response_deserializer=paraformer_pb2.Response.FromString,
+                )
+
+
+async def send(channel, data, speaking, isEnd):
+    stub = RecognizeStub(channel)
+    req = paraformer_pb2.Request()
+    if data:
+        req.audio_data = data
+    req.user = 'zz'
+    req.language = 'zh-CN'
+    req.speaking = speaking
+    req.isEnd = isEnd
+    q = queue.SimpleQueue()
+    q.put(req)
+    return stub.Recognize(iter(q.get, None))
+
+# send the audio data once
+async def grpc_rec(data, grpc_uri):
+    with grpc.insecure_channel(grpc_uri) as channel:
+        b = time.time()
+        response = await send(channel, data, False, False)
+        resp = next(response)
+        text = ''
+        if 'decoding' == resp.action:
+            resp = next(response)
+            if 'finish' == resp.action:
+                text = json.loads(resp.sentence)['text']
+        response = await send(channel, None, False, True)
+        return {
+                'text': text,
+                'time': time.time() - b,
+                }
+
+async def test():
+    # fc = FunAsrGrpcClient('127.0.0.1', 9900)
+    # t = await fc.rec(wav.tobytes())
+    # print(t)
+    wav, _ = sf.read('z-10s.wav', dtype='int16')
+    uri = '127.0.0.1:9900'
+    res = await grpc_rec(wav.tobytes(), uri)
+    print(res)
+
+
+if __name__ == '__main__':
+    asyncio.run(test())
+
+```
+
+
 ## Acknowledge
 1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
 2. We acknowledge [DeepScience](https://www.deepscience.cn) for contributing the grpc service.
diff --git a/funasr/runtime/grpc/paraformer_server.cc b/funasr/runtime/grpc/paraformer_server.cc
index 69ce903..f2ab4e0 100644
--- a/funasr/runtime/grpc/paraformer_server.cc
+++ b/funasr/runtime/grpc/paraformer_server.cc
@@ -88,7 +88,7 @@
             res.set_language(req.language());
             stream->Write(res);
         } else if (!req.speaking()) {
-            if (client_buffers.count(req.user()) == 0) {
+            if (client_buffers.count(req.user()) == 0 && req.audio_data().size() == 0) {
                 Response res;
                 res.set_sentence(
                     R"({"success": true, "detail": "waiting_for_voice"})"
@@ -99,14 +99,18 @@
                 stream->Write(res);
             }else {
                 auto begin_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+                if (req.audio_data().size() > 0) {
+                  auto& buf = client_buffers[req.user()];
+                  buf.insert(buf.end(), req.audio_data().begin(), req.audio_data().end());
+                }
                 std::string tmp_data = this->client_buffers[req.user()];
                 this->clear_states(req.user());
-                
+
                 Response res;
                 res.set_sentence(
                     R"({"success": true, "detail": "decoding data: " + std::to_string(tmp_data.length()) + " bytes"})"
                 );
-		int data_len_int = tmp_data.length();
+                int data_len_int = tmp_data.length();
                 std::string data_len = std::to_string(data_len_int);
                 std::stringstream ss;
                 ss << R"({"success": true, "detail": "decoding data: )" << data_len << R"( bytes")"  << R"("})";
@@ -129,18 +133,18 @@
                     res.set_user(req.user());
                     res.set_action("finish");
                     res.set_language(req.language());
-                    
-                    
-                    
+
+
+
                     stream->Write(res);
                 }
                 else {
-                    RPASR_RESULT Result= RapidAsrRecogPCMBuffer(AsrHanlde, tmp_data.c_str(), data_len_int, RASR_NONE, NULL);   
+                    RPASR_RESULT Result= RapidAsrRecogPCMBuffer(AsrHanlde, tmp_data.c_str(), data_len_int, RASR_NONE, NULL);
                     std::string asr_result = ((RPASR_RECOG_RESULT*)Result)->msg;
 
                     auto end_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
                     std::string delay_str = std::to_string(end_time - begin_time);
-                    
+
                     std::cout << "user: " << req.user() << " , delay(ms): " << delay_str << ", text: " << asr_result << std::endl;
                     Response res;
                     std::stringstream ss;
@@ -150,8 +154,8 @@
                     res.set_user(req.user());
                     res.set_action("finish");
                     res.set_language(req.language());
-                    
-                    
+
+
                     stream->Write(res);
                 }
             }
@@ -165,7 +169,7 @@
             res.set_language(req.language());
             stream->Write(res);
         }
-    }    
+    }
     return Status::OK;
 }
 
diff --git a/funasr/runtime/python/grpc/grpc_server.py b/funasr/runtime/python/grpc/grpc_server.py
index d0be6f0..4fd4f95 100644
--- a/funasr/runtime/python/grpc/grpc_server.py
+++ b/funasr/runtime/python/grpc/grpc_server.py
@@ -109,7 +109,7 @@
                             else:
                                 asr_result = ""
                         elif self.backend == "onnxruntime":
-                            from rapid_paraformer.utils.frontend import load_bytes
+                            from funasr_onnx.utils.frontend import load_bytes
                             array = load_bytes(tmp_data)
                             asr_result = self.inference_16k_pipeline(array)[0]
                         end_time = int(round(time.time() * 1000))
diff --git a/funasr/runtime/python/libtorch/README.md b/funasr/runtime/python/libtorch/README.md
index 1912bbe..aeb2eae 100644
--- a/funasr/runtime/python/libtorch/README.md
+++ b/funasr/runtime/python/libtorch/README.md
@@ -31,7 +31,7 @@
 
     ```shell
     git clone https://github.com/alibaba/FunASR.git && cd FunASR
-    cd funasr/runtime/python/funasr_torch
+    cd funasr/runtime/python/libtorch
     python setup.py build
     python setup.py install
     ```
diff --git a/funasr/runtime/python/libtorch/demo.py b/funasr/runtime/python/libtorch/demo.py
index 58285a7..1a9d9e9 100644
--- a/funasr/runtime/python/libtorch/demo.py
+++ b/funasr/runtime/python/libtorch/demo.py
@@ -1,10 +1,15 @@
-
 from funasr_torch import Paraformer
 
-model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
-model = Paraformer(model_dir, batch_size=1)
 
-wav_path = ['/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
+model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+
+model = Paraformer(model_dir, batch_size=1)  # cpu
+# model = Paraformer(model_dir, batch_size=1, device_id=0)  # gpu
+
+# when using the paraformer-large-vad-punc model, you can set plot_timestamp_to="./xx.png" to get an alignment figure in addition to the timestamps
+# model = Paraformer(model_dir, batch_size=1, plot_timestamp_to="test.png")
+
+wav_path = "YourPath/xx.wav"
 
 result = model(wav_path)
-print(result)
\ No newline at end of file
+print(result)
diff --git a/funasr/runtime/python/libtorch/funasr_torch/paraformer_bin.py b/funasr/runtime/python/libtorch/funasr_torch/paraformer_bin.py
index 3c0606d..e169087 100644
--- a/funasr/runtime/python/libtorch/funasr_torch/paraformer_bin.py
+++ b/funasr/runtime/python/libtorch/funasr_torch/paraformer_bin.py
@@ -46,6 +46,7 @@
         )
         self.ort_infer = torch.jit.load(model_file)
         self.batch_size = batch_size
+        self.device_id = device_id
         self.plot_timestamp_to = plot_timestamp_to
         self.pred_bias = pred_bias
 
@@ -58,8 +59,13 @@
             end_idx = min(waveform_nums, beg_idx + self.batch_size)
             feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
             try:
-                outputs = self.ort_infer(feats, feats_len)
-                am_scores, valid_token_lens = outputs[0], outputs[1]
+                with torch.no_grad():
+                    if int(self.device_id) == -1:
+                        outputs = self.ort_infer(feats, feats_len)
+                        am_scores, valid_token_lens = outputs[0], outputs[1]
+                    else:
+                        outputs = self.ort_infer(feats.cuda(), feats_len.cuda())
+                        am_scores, valid_token_lens = outputs[0].cpu(), outputs[1].cpu()
                 if len(outputs) == 4:
                     # for BiCifParaformer Inference
                     us_alphas, us_peaks = outputs[2], outputs[3]
diff --git a/funasr/runtime/python/onnxruntime/README.md b/funasr/runtime/python/onnxruntime/README.md
index 7bf30b1..e19e3a2 100644
--- a/funasr/runtime/python/onnxruntime/README.md
+++ b/funasr/runtime/python/onnxruntime/README.md
@@ -32,7 +32,7 @@
 
 ```shell
 git clone https://github.com/alibaba/FunASR.git && cd FunASR
-cd funasr/runtime/python/funasr_onnx
+cd funasr/runtime/python/onnxruntime
 python setup.py build
 python setup.py install
 ```
diff --git a/funasr/runtime/python/onnxruntime/demo.py b/funasr/runtime/python/onnxruntime/demo.py
index 48d54e9..8fc82f1 100644
--- a/funasr/runtime/python/onnxruntime/demo.py
+++ b/funasr/runtime/python/onnxruntime/demo.py
@@ -1,13 +1,15 @@
-
 from funasr_onnx import Paraformer
 
-model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch"
 
-# if you use paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch, you should set pred_bias=0
-# plot_timestamp_to works only when using speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch
-model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0) 
+model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
 
-wav_path = "/Users/shixian/code/funasr/export/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/example/asr_example.wav"
+model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0)  # cpu
+# model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0, device_id=0)  # gpu
+
+# when using the paraformer-large-vad-punc model, you can set plot_timestamp_to="./xx.png" to get an alignment figure in addition to the timestamps
+# model = Paraformer(model_dir, batch_size=1, plot_timestamp_to="test.png")
+
+wav_path = "YourPath/xx.wav"
 
 result = model(wav_path)
-print(result)
\ No newline at end of file
+print(result)
diff --git a/funasr/tasks/abs_task.py b/funasr/tasks/abs_task.py
index c8e408b..775cba8 100644
--- a/funasr/tasks/abs_task.py
+++ b/funasr/tasks/abs_task.py
@@ -464,6 +464,12 @@
             default=sys.maxsize,
             help="The maximum number update step to train",
         )
+        parser.add_argument(
+            "--batch_interval",
+            type=int,
+            default=10000,
+            help="The batch interval for saving model.",
+        )
         group.add_argument(
             "--patience",
             type=int_or_none,
@@ -1355,15 +1361,15 @@
                 from funasr.datasets.large_datasets.build_dataloader import ArkDataLoader
                 train_iter_factory = ArkDataLoader(args.train_data_file, args.token_list, args.dataset_conf,
                                                    frontend_conf=args.frontend_conf if hasattr(args, "frontend_conf") else None,
-                                                   seg_dict_file=args.seg_dict_file if hasattr(args,
-                                                                                               "seg_dict_file") else None,
+                                                   seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
                                                    punc_dict_file=args.punc_list if hasattr(args, "punc_list") else None,
+                                                   bpemodel_file=args.bpemodel if hasattr(args, "bpemodel") else None,
                                                    mode="train")
                 valid_iter_factory = ArkDataLoader(args.valid_data_file, args.token_list, args.dataset_conf, 
                                                    frontend_conf=args.frontend_conf if hasattr(args, "frontend_conf") else None,
-                                                   seg_dict_file=args.seg_dict_file if hasattr(args,
-                                                                                               "seg_dict_file") else None,
+                                                   seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
                                                    punc_dict_file=args.punc_list if hasattr(args, "punc_list") else None,
+                                                   bpemodel_file=args.bpemodel if hasattr(args, "bpemodel") else None,
                                                    mode="eval")
             elif args.dataset_type == "small":
                 train_iter_factory = cls.build_iter_factory(
@@ -1576,13 +1582,18 @@
     ) -> AbsIterFactory:
         assert check_argument_types()
 
+        if args.frontend_conf is not None and "fs" in args.frontend_conf:
+            dest_sample_rate = args.frontend_conf["fs"]
+        else:
+            dest_sample_rate = 16000
+
         dataset = ESPnetDataset(
             iter_options.data_path_and_name_and_type,
             float_dtype=args.train_dtype,
             preprocess=iter_options.preprocess_fn,
             max_cache_size=iter_options.max_cache_size,
             max_cache_fd=iter_options.max_cache_fd,
-            dest_sample_rate=args.frontend_conf["fs"],
+            dest_sample_rate=dest_sample_rate,
         )
         cls.check_task_requirements(
             dataset, args.allow_variable_data_keys, train=iter_options.train
diff --git a/funasr/tasks/asr.py b/funasr/tasks/asr.py
index 6e0f16a..e151473 100644
--- a/funasr/tasks/asr.py
+++ b/funasr/tasks/asr.py
@@ -412,12 +412,6 @@
             default="13_15",
             help="The range of noise decibel level.",
         )
-        parser.add_argument(
-            "--batch_interval",
-            type=int,
-            default=10000,
-            help="The batch interval for saving model.",
-        )
 
         for class_choices in cls.class_choices_list:
             # Append --<name> and --<name>_conf.
diff --git a/funasr/train/trainer.py b/funasr/train/trainer.py
index 4fbdcd9..b12bded 100644
--- a/funasr/train/trainer.py
+++ b/funasr/train/trainer.py
@@ -579,9 +579,10 @@
             reporter.measure_iter_time(iterator, "iter_time"), 1
         ):
             assert isinstance(batch, dict), type(batch)
-        
-            if rank == 0 and hasattr(model.module, "num_updates"):
-                num_batch_updates = model.module.get_num_updates()
+
+            if rank == 0:
+                if hasattr(model, "num_updates") or (hasattr(model, "module") and hasattr(model.module, "num_updates")):
+                    num_batch_updates = model.get_num_updates() if hasattr(model, "num_updates") else model.module.get_num_updates()
                 if (num_batch_updates%batch_interval == 0) and (options.oss_bucket is not None) and options.use_pai:
                     buffer = BytesIO()
                     torch.save(model.state_dict(), buffer)
diff --git a/funasr/utils/compute_wer.py b/funasr/utils/compute_wer.py
index 349a3f6..26a9f49 100755
--- a/funasr/utils/compute_wer.py
+++ b/funasr/utils/compute_wer.py
@@ -45,8 +45,8 @@
            if out_item['wrong'] > 0:
                rst['wrong_sentences'] += 1
            cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
-           cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
-           cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')
+           cer_detail_writer.write("ref:" + '\t' + " ".join(list(map(lambda x: x.lower(), ref_dict[hyp_key]))) + '\n')
+           cer_detail_writer.write("hyp:" + '\t' + " ".join(list(map(lambda x: x.lower(), hyp_dict[hyp_key]))) + '\n')
 
     if rst['Wrd'] > 0:
         rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)
diff --git a/funasr/version.txt b/funasr/version.txt
index d15723f..1c09c74 100644
--- a/funasr/version.txt
+++ b/funasr/version.txt
@@ -1 +1 @@
-0.3.2
+0.3.3

--
Gitblit v1.9.1