zhifu gao
2023-04-07  2e769fb36ce88dabfa984e8b81e8cb1c90799c95
Merge branch 'main' into dev_cmz2
43 files modified, 8 files added, 979 lines changed:
egs/aishell/transformer/utils/cmvn_converter.py  53
egs/aishell/transformer/utils/compute_wer.py  4
egs/librispeech/conformer/conf/decode_asr_transformer.yaml  6
egs/librispeech/conformer/conf/train_asr_conformer.yaml  80
egs/librispeech/conformer/conf/train_asr_conformer_uttnorm.yaml  80
egs/librispeech/conformer/local/data_prep_librispeech.sh  58
egs/librispeech/conformer/path.sh  5
egs/librispeech/conformer/run.sh  262
egs/librispeech/conformer/utils  1
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer.py  2
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py  2
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer.py  2
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py  2
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py  2
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh  4
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py  2
egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py  2
egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.sh  4
egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py  2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py  2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py  2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer.py  2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py  2
funasr/bin/asr_inference_paraformer.py  2
funasr/bin/asr_inference_paraformer_streaming.py  101
funasr/bin/asr_inference_paraformer_vad.py  2
funasr/bin/asr_inference_paraformer_vad_punc.py  2
funasr/bin/asr_inference_rnnt.py  4
funasr/bin/asr_inference_uniasr.py  4
funasr/bin/asr_inference_uniasr_vad.py  4
funasr/datasets/large_datasets/utils/tokenize.py  2
funasr/export/export_model.py  8
funasr/export/models/modules/multihead_att.py  4
funasr/models/decoder/contextual_decoder.py  6
funasr/models/decoder/sanm_decoder.py  13
funasr/models/e2e_asr_paraformer.py  84
funasr/models/predictor/cif.py  4
funasr/modules/embedding.py  13
funasr/runtime/grpc/Readme.md  62
funasr/runtime/grpc/paraformer_server.cc  6
funasr/runtime/python/grpc/grpc_server.py  2
funasr/runtime/python/libtorch/README.md  2
funasr/runtime/python/libtorch/demo.py  13
funasr/runtime/python/libtorch/funasr_torch/paraformer_bin.py  6
funasr/runtime/python/onnxruntime/README.md  2
funasr/runtime/python/onnxruntime/demo.py  14
funasr/tasks/abs_task.py  21
funasr/tasks/asr.py  6
funasr/train/trainer.py  5
funasr/utils/compute_wer.py  4
funasr/version.txt  2
egs/aishell/transformer/utils/cmvn_converter.py
New file
@@ -0,0 +1,53 @@
import argparse
import json
import numpy as np
def get_parser():
    parser = argparse.ArgumentParser(
        description="cmvn converter",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--cmvn-json",
        "-c",
        default=False,
        required=True,
        type=str,
        help="cmvn json file",
    )
    parser.add_argument(
        "--am-mvn",
        "-a",
        default=False,
        required=True,
        type=str,
        help="am mvn file",
    )
    return parser
def main():
    parser = get_parser()
    args = parser.parse_args()
    with open(args.cmvn_json, "r") as fin:
        cmvn_dict = json.load(fin)
    mean_stats = np.array(cmvn_dict["mean_stats"])
    var_stats = np.array(cmvn_dict["var_stats"])
    total_frame = np.array(cmvn_dict["total_frames"])
    # Store the negated mean (applied by Kaldi's <AddShift>) and the inverse
    # standard deviation (applied by <Rescale>); (-mean)**2 == mean**2, so the
    # variance formula is unaffected by the sign flip.
    mean = -1.0 * mean_stats / total_frame
    var = 1.0 / np.sqrt(var_stats / total_frame - mean * mean)
    dims = mean.shape[0]
    with open(args.am_mvn, 'w') as fout:
        fout.write("<Nnet>" + "\n")
        fout.write("<Splice> " + str(dims) + " " + str(dims) + "\n" + "[ 0 ]" + "\n")
        fout.write("<AddShift> " + str(dims) + " " + str(dims) + "\n")
        mean_str = str(list(mean)).replace(',', '').replace('[', '[ ').replace(']', ' ]')
        fout.write("<LearnRateCoef> 0 " + mean_str + '\n')
        fout.write("<Rescale> " + str(dims) + " " + str(dims) + '\n')
        var_str = str(list(var)).replace(',', '').replace('[', '[ ').replace(']', ' ]')
        fout.write("<LearnRateCoef> 0 " + var_str + '\n')
        fout.write("</Nnet>" + '\n')
if __name__ == '__main__':
    main()
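The converter writes the negated mean as `<AddShift>` coefficients and the inverse standard deviation as `<Rescale>` coefficients, so applying the two Kaldi nnet1 layers in order standardizes each feature dimension. A quick numpy sanity check with made-up statistics:

```python
import numpy as np

# Made-up accumulated statistics for a 2-dim feature (illustrative only)
mean_stats = np.array([10.0, -4.0])   # sum of feature values over all frames
var_stats = np.array([60.0, 10.0])    # sum of squared feature values
total_frame = 2.0

shift = -1.0 * mean_stats / total_frame                       # <AddShift> row
scale = 1.0 / np.sqrt(var_stats / total_frame
                      - (mean_stats / total_frame) ** 2)      # <Rescale> row

x = mean_stats / total_frame          # a frame equal to the per-dim mean
print((x + shift) * scale)            # -> [0. 0.]: the mean maps to zero
```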
egs/aishell/transformer/utils/compute_wer.py
@@ -45,8 +45,8 @@
           if out_item['wrong'] > 0:
               rst['wrong_sentences'] += 1
           cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
           cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
           cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')
           cer_detail_writer.write("ref:" + '\t' + " ".join(list(map(lambda x: x.lower(), ref_dict[hyp_key]))) + '\n')
           cer_detail_writer.write("hyp:" + '\t' + " ".join(list(map(lambda x: x.lower(), hyp_dict[hyp_key]))) + '\n')
    if rst['Wrd'] > 0:
        rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)
egs/librispeech/conformer/conf/decode_asr_transformer.yaml
New file
@@ -0,0 +1,6 @@
beam_size: 10
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc_weight: 0.5
lm_weight: 0.7
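For orientation, these weights typically enter hybrid CTC/attention beam search as a log-linear combination. A minimal sketch of the per-hypothesis score, assuming the usual ESPnet-style formulation (names are illustrative):

```python
def joint_score(att_logp, ctc_logp, lm_logp, length,
                ctc_weight=0.5, lm_weight=0.7, penalty=0.0):
    # Hedged sketch: combine attention, CTC, and LM log-probs plus a length bonus.
    return ((1 - ctc_weight) * att_logp
            + ctc_weight * ctc_logp
            + lm_weight * lm_logp
            + penalty * length)
```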
egs/librispeech/conformer/conf/train_asr_conformer.yaml
New file
@@ -0,0 +1,80 @@
encoder: conformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    linear_units: 2048
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    normalize_before: true
    macaron_style: true
    rel_pos_type: latest
    pos_enc_layer_type: rel_pos
    selfattention_layer_type: rel_selfattn
    activation_type: swish
    use_cnn_module: true
    cnn_module_kernel: 31
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false
accum_grad: 2
max_epoch: 50
patience: none
init: none
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10
optim: adam
optim_conf:
    lr: 0.0025
    weight_decay: 0.000001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 40000
specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 27
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_ratio_range:
    - 0.
    - 0.05
    num_time_mask: 10
dataset_conf:
    shuffle: True
    shuffle_conf:
        shuffle_size: 1024
        sort_size: 500
    batch_conf:
        batch_type: token
        batch_size: 10000
    num_workers: 8
log_interval: 50
normalize: None
egs/librispeech/conformer/conf/train_asr_conformer_uttnorm.yaml
New file
@@ -0,0 +1,80 @@
encoder: conformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    linear_units: 2048
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    normalize_before: true
    macaron_style: true
    rel_pos_type: latest
    pos_enc_layer_type: rel_pos
    selfattention_layer_type: rel_selfattn
    activation_type: swish
    use_cnn_module: true
    cnn_module_kernel: 31
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false
accum_grad: 2
max_epoch: 50
patience: none
init: none
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10
optim: adam
optim_conf:
    lr: 0.0025
    weight_decay: 0.000001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 40000
specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 27
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_ratio_range:
    - 0.
    - 0.05
    num_time_mask: 10
dataset_conf:
    shuffle: True
    shuffle_conf:
        shuffle_size: 1024
        sort_size: 500
    batch_conf:
        batch_type: token
        batch_size: 10000
    num_workers: 8
log_interval: 50
normalize: utterance_mvn
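The only difference from `train_asr_conformer.yaml` is the final line: `utterance_mvn` normalizes each utterance with its own statistics instead of a pre-computed global CMVN. A minimal sketch of that normalization, assuming per-utterance mean/variance over the time axis (not FunASR's exact implementation):

```python
import torch

def utterance_mvn(feats: torch.Tensor, eps: float = 1e-10) -> torch.Tensor:
    # feats: (time, dim); normalize each dim with this utterance's own stats.
    mean = feats.mean(dim=0, keepdim=True)
    std = feats.std(dim=0, keepdim=True)
    return (feats - mean) / (std + eps)
```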
egs/librispeech/conformer/local/data_prep_librispeech.sh
New file
@@ -0,0 +1,58 @@
#!/usr/bin/env bash
# Copyright 2014  Vassil Panayotov
#           2014  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <src-dir> <dst-dir>"
  echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean"
  exit 1
fi
src=$1
dst=$2
# all utterances are FLAC compressed
if ! which flac >&/dev/null; then
   echo "Please install 'flac' on ALL worker nodes!"
   exit 1
fi
spk_file=$src/../SPEAKERS.TXT
mkdir -p $dst || exit 1
[ ! -d $src ] && echo "$0: no such directory $src" && exit 1
[ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1
wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp
trans=$dst/text; [[ -f "$trans" ]] && rm $trans
for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do
  reader=$(basename $reader_dir)
  if ! [ $reader -eq $reader ]; then  # not integer.
    echo "$0: unexpected subdirectory name $reader"
    exit 1
  fi
  for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do
    chapter=$(basename $chapter_dir)
    if ! [ "$chapter" -eq "$chapter" ]; then
      echo "$0: unexpected chapter-subdirectory name $chapter"
      exit 1
    fi
    find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \
      awk -v "dir=$chapter_dir" '{printf "%s %s/%s.flac \n", $0, dir, $0}' >>$wav_scp|| exit 1
    chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt
    [ ! -f  $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1
    cat $chapter_trans >>$trans
  done
done
echo "$0: successfully prepared data in $dst"
exit 0
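This script writes Kaldi-style `wav.scp` (utterance-id to FLAC path) and `text` (utterance-id to transcript) files, with LibriSpeech IDs following a reader-chapter-utterance pattern. Hypothetical entries, for illustration only:

```
# wav.scp: <utt-id> <flac-path>
1272-128104-0000 /export/data/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac
# text: <utt-id> <transcript>
1272-128104-0000 SOME ILLUSTRATIVE TRANSCRIPT IN UPPER CASE
```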
egs/librispeech/conformer/path.sh
New file
@@ -0,0 +1,5 @@
export FUNASR_DIR=$PWD/../../..
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PATH=$FUNASR_DIR/funasr/bin:$PATH
egs/librispeech/conformer/run.sh
New file
@@ -0,0 +1,262 @@
#!/usr/bin/env bash
. ./path.sh || exit 1;
# machines configuration
CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
gpu_num=8
count=1
gpu_inference=true  # Whether to perform gpu decoding, set false for cpu decoding
# for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
njob=5
train_cmd=utils/run.pl
infer_cmd=utils/run.pl
# general configuration
feats_dir="../DATA" #feature output dictionary
exp_dir="."
lang=en
dumpdir=dump/fbank
feats_type=fbank
token_type=bpe
dataset_type=large
scp=feats.scp
type=kaldi_ark
stage=3
stop_stage=4
# feature configuration
feats_dim=80
sample_frequency=16000
nj=100
speed_perturb="0.9,1.0,1.1"
# data
data_librispeech=
# bpe model
nbpe=5000
bpemode=unigram
# exp tag
tag=""
. utils/parse_options.sh || exit 1;
# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
set -u
set -o pipefail
train_set=train_960
valid_set=dev
test_sets="test_clean test_other dev_clean dev_other"
asr_config=conf/train_asr_conformer.yaml
#asr_config=conf/train_asr_conformer_uttnorm.yaml
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
#inference_config=conf/decode_asr_transformer_beam60_ctc0.3.yaml
inference_asr_model=valid.acc.ave_10best.pth
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
if ${gpu_inference}; then
    inference_nj=$((ngpu * njob))
    _ngpu=1
else
    inference_nj=$njob
    _ngpu=0
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    echo "stage 0: Data preparation"
    # Data preparation
    for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
        local/data_prep_librispeech.sh ${data_librispeech}/LibriSpeech/${x} ${feats_dir}/data/${x//-/_}
    done
fi
feat_train_dir=${feats_dir}/${dumpdir}/$train_set; mkdir -p ${feat_train_dir}
feat_dev_clean_dir=${feats_dir}/${dumpdir}/dev_clean; mkdir -p ${feat_dev_clean_dir}
feat_dev_other_dir=${feats_dir}/${dumpdir}/dev_other; mkdir -p ${feat_dev_other_dir}
feat_test_clean_dir=${feats_dir}/${dumpdir}/test_clean; mkdir -p ${feat_test_clean_dir}
feat_test_other_dir=${feats_dir}/${dumpdir}/test_other; mkdir -p ${feat_test_other_dir}
feat_dev_dir=${feats_dir}/${dumpdir}/$valid_set; mkdir -p ${feat_dev_dir}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: Feature Generation"
    # compute fbank features
    fbankdir=${feats_dir}/fbank
    for x in dev_clean dev_other test_clean test_other; do
        utils/compute_fbank.sh --cmd "$train_cmd" --nj 1 --max_lengths 3000 --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} \
            ${feats_dir}/data/${x} ${exp_dir}/exp/make_fbank/${x} ${fbankdir}/${x}
        utils/fix_data_feat.sh ${fbankdir}/${x}
    done
    mkdir -p ${feats_dir}/data/$train_set
    train_sets="train_clean_100 train_clean_360 train_other_500"
    for file in wav.scp text; do
        ( for f in $train_sets; do cat $feats_dir/data/$f/$file; done ) | sort -k1 > $feats_dir/data/$train_set/$file || exit 1;
    done
    utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --max_lengths 3000 --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} --speed_perturb ${speed_perturb} \
    ${feats_dir}/data/$train_set ${exp_dir}/exp/make_fbank/$train_set ${fbankdir}/$train_set
    utils/fix_data_feat.sh ${fbankdir}/$train_set
    # compute global cmvn
    utils/compute_cmvn.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} \
        ${fbankdir}/$train_set ${exp_dir}/exp/make_fbank/$train_set
    # apply cmvn
    utils/apply_cmvn.sh --cmd "$train_cmd" --nj $nj \
        ${fbankdir}/$train_set ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/$train_set ${feat_train_dir}
    utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \
        ${fbankdir}/dev_clean ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/dev_clean ${feat_dev_clean_dir}
    utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \
        ${fbankdir}/dev_other ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/dev_other ${feat_dev_other_dir}
    utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \
        ${fbankdir}/test_clean ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/test_clean ${feat_test_clean_dir}
    utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \
        ${fbankdir}/test_other ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/test_other ${feat_test_other_dir}
    cp ${fbankdir}/$train_set/text ${fbankdir}/$train_set/speech_shape ${fbankdir}/$train_set/text_shape ${feat_train_dir}
    cp ${fbankdir}/dev_clean/text ${fbankdir}/dev_clean/speech_shape ${fbankdir}/dev_clean/text_shape ${feat_dev_clean_dir}
    cp ${fbankdir}/dev_other/text ${fbankdir}/dev_other/speech_shape ${fbankdir}/dev_other/text_shape ${feat_dev_other_dir}
    cp ${fbankdir}/test_clean/text ${fbankdir}/test_clean/speech_shape ${fbankdir}/test_clean/text_shape ${feat_test_clean_dir}
    cp ${fbankdir}/test_other/text ${fbankdir}/test_other/speech_shape ${fbankdir}/test_other/text_shape ${feat_test_other_dir}
    dev_sets="dev_clean dev_other"
    for file in feats.scp text speech_shape text_shape; do
        ( for f in $dev_sets; do cat $feats_dir/${dumpdir}/$f/$file; done ) | sort -k1 > $feat_dev_dir/$file || exit 1;
    done
    #generate ark list
    utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_train_dir} ${fbankdir}/${train_set} ${feat_train_dir}
    utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_dev_dir} ${fbankdir}/${valid_set} ${feat_dev_dir}
fi
dict=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    ### Task dependent. You have to check non-linguistic symbols used in the corpus.
    echo "stage 2: Dictionary and Json Data Preparation"
    mkdir -p ${feats_dir}/data/lang_char/
    echo "<blank>" > ${dict}
    echo "<s>" >> ${dict}
    echo "</s>" >> ${dict}
    cut -f 2- -d" " ${feats_dir}/data/${train_set}/text > ${feats_dir}/data/lang_char/input.txt
    spm_train --input=${feats_dir}/data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
    spm_encode --model=${bpemodel}.model --output_format=piece < ${feats_dir}/data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0}' >> ${dict}
    echo "<unk>" >> ${dict}
    wc -l ${dict}
    vocab_size=$(cat ${dict} | wc -l)
    awk -v v=,${vocab_size} '{print $0v}' ${feat_train_dir}/text_shape > ${feat_train_dir}/text_shape.char
    awk -v v=,${vocab_size} '{print $0v}' ${feat_dev_dir}/text_shape > ${feat_dev_dir}/text_shape.char
    mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/$train_set
    mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/$valid_set
    cp ${feat_train_dir}/speech_shape ${feat_train_dir}/text_shape ${feat_train_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/$train_set
    cp ${feat_dev_dir}/speech_shape ${feat_dev_dir}/text_shape ${feat_dev_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/$valid_set
fi
# Training Stage
world_size=$gpu_num  # run on one machine
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "stage 3: Training"
    mkdir -p ${exp_dir}/exp/${model_dir}
    mkdir -p ${exp_dir}/exp/${model_dir}/log
    INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init
    if [ -f $INIT_FILE ];then
        rm -f $INIT_FILE
    fi
    init_method=file://$(readlink -f $INIT_FILE)
    echo "$0: init method is $init_method"
    for ((i = 0; i < $gpu_num; ++i)); do
        {
            rank=$i
            local_rank=$i
            gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i + 1)))
            asr_train.py \
                --gpu_id $gpu_id \
                --use_preprocessor true \
                --split_with_space false \
                --bpemodel ${bpemodel}.model \
                --token_type $token_type \
                --dataset_type $dataset_type \
                --token_list $dict \
                --train_data_file $feats_dir/$dumpdir/${train_set}/ark_txt.scp \
                --valid_data_file $feats_dir/$dumpdir/${valid_set}/ark_txt.scp \
                --resume true \
                --output_dir ${exp_dir}/exp/${model_dir} \
                --config $asr_config \
                --input_size $feats_dim \
                --ngpu $gpu_num \
                --num_worker_count $count \
                --multiprocessing_distributed true \
                --dist_init_method $init_method \
                --dist_world_size $world_size \
                --dist_rank $rank \
                --local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
        } &
        done
        wait
fi
# Testing Stage
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "stage 4: Inference"
    for dset in ${test_sets}; do
        asr_exp=${exp_dir}/exp/${model_dir}
        inference_tag="$(basename "${inference_config}" .yaml)"
        _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
        _logdir="${_dir}/logdir"
        if [ -d ${_dir} ]; then
            echo "${_dir} is already exists. if you want to decode again, please delete this dir first."
            exit 0
        fi
        mkdir -p "${_logdir}"
        _data="${feats_dir}/${dumpdir}/${dset}"
        key_file=${_data}/${scp}
        num_scp_file="$(<${key_file} wc -l)"
        _nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file")
        split_scps=
        for n in $(seq "${_nj}"); do
            split_scps+=" ${_logdir}/keys.${n}.scp"
        done
        # shellcheck disable=SC2086
        utils/split_scp.pl "${key_file}" ${split_scps}
        _opts=
        if [ -n "${inference_config}" ]; then
            _opts+="--config ${inference_config} "
        fi
        ${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
            python -m funasr.bin.asr_inference_launch \
                --batch_size 1 \
                --ngpu "${_ngpu}" \
                --njob ${njob} \
                --gpuid_list ${gpuid_list} \
                --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
                --key_file "${_logdir}"/keys.JOB.scp \
                --asr_train_config "${asr_exp}"/config.yaml \
                --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
                --output_dir "${_logdir}"/output.JOB \
                --mode asr \
                ${_opts}
        for f in token token_int score text; do
            if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
                for i in $(seq "${_nj}"); do
                    cat "${_logdir}/output.${i}/1best_recog/${f}"
                done | sort -k1 >"${_dir}/${f}"
            fi
        done
        python utils/compute_wer.py ${_data}/text ${_dir}/text ${_dir}/text.cer
        tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
        cat ${_dir}/text.cer.txt
    done
fi
egs/librispeech/conformer/utils
New file
@@ -0,0 +1 @@
../../aishell/transformer/utils
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer.py
@@ -74,7 +74,7 @@
    # If text exists, compute CER
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-        text_proc_file = os.path.join(best_recog_path, "token")
+        text_proc_file = os.path.join(best_recog_path, "text")
        compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
@@ -38,7 +38,7 @@
    # compute CER if GT text is set
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+        text_proc_file = os.path.join(decoding_path, "1best_recog/text")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer.py
@@ -74,7 +74,7 @@
    # If text exists, compute CER
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-        text_proc_file = os.path.join(best_recog_path, "token")
+        text_proc_file = os.path.join(best_recog_path, "text")
        compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
@@ -38,7 +38,7 @@
    # compute CER if GT text is set
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+        text_proc_file = os.path.join(decoding_path, "1best_recog/text")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
@@ -17,7 +17,7 @@
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
-    parser.add_argument('--audio_in', type=str, default="./data/test")
+    parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
    parser.add_argument('--output_dir', type=str, default="./results/")
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--gpuid', type=str, default="0")
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
@@ -63,8 +63,8 @@
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
    echo "Computing WER ..."
-    python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
-    python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref
+    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
+    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
    tail -n 3 ${output_dir}/1best_recog/text.cer
fi
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
@@ -34,7 +34,7 @@
    # compute CER if GT text is set
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+        text_proc_file = os.path.join(decoding_path, "1best_recog/text")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py
@@ -17,7 +17,7 @@
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1")
-    parser.add_argument('--audio_in', type=str, default="./data/test")
+    parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
    parser.add_argument('--output_dir', type=str, default="./results/")
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--gpuid', type=str, default="0")
egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.sh
@@ -63,8 +63,8 @@
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
    echo "Computing WER ..."
-    python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
-    python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref
+    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
+    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
    tail -n 3 ${output_dir}/1best_recog/text.cer
fi
egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
@@ -34,7 +34,7 @@
    # compute CER if GT text is set
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+        text_proc_file = os.path.join(decoding_path, "1best_recog/text")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py
@@ -75,7 +75,7 @@
    # If text exists, compute CER
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-        text_proc_file = os.path.join(best_recog_path, "token")
+        text_proc_file = os.path.join(best_recog_path, "text")
        compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
@@ -39,7 +39,7 @@
    # compute CER if GT text is set
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+        text_proc_file = os.path.join(decoding_path, "1best_recog/text")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer.py
@@ -75,7 +75,7 @@
    # If text exists, compute CER
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-        text_proc_file = os.path.join(best_recog_path, "token")
+        text_proc_file = os.path.join(best_recog_path, "text")
        compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
@@ -39,7 +39,7 @@
    # compute CER if GT text is set
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+        text_proc_file = os.path.join(decoding_path, "1best_recog/text")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
funasr/bin/asr_inference_paraformer.py
@@ -797,7 +797,7 @@
                        finish_count += 1
                        # asr_utils.print_progress(finish_count / file_count)
                        if writer is not None:
                            ibest_writer["text"][key] = text_postprocessed
                            ibest_writer["text"][key] = " ".join(word_lists)
                    logging.info("decoding, utt: {}, predictions: {}".format(key, text))
        rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor))
funasr/bin/asr_inference_paraformer_streaming.py
@@ -42,6 +42,7 @@
from funasr.models.frontend.wav_frontend import WavFrontend
from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
np.set_printoptions(threshold=np.inf)
class Speech2Text:
    """Speech2Text class
@@ -203,7 +204,6 @@
        # Input as audio signal
        if isinstance(speech, np.ndarray):
            speech = torch.tensor(speech)
        if self.frontend is not None:
            feats, feats_len = self.frontend.forward(speech, speech_lengths)
            feats = to_device(feats, device=self.device)
@@ -213,13 +213,16 @@
            feats = speech
            feats_len = speech_lengths
        lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
+        feats_len = cache["encoder"]["stride"] + cache["encoder"]["pad_left"] + cache["encoder"]["pad_right"]
+        feats = feats[:,cache["encoder"]["start_idx"]:cache["encoder"]["start_idx"]+feats_len,:]
+        feats_len = torch.tensor([feats_len])
        batch = {"speech": feats, "speech_lengths": feats_len, "cache": cache}
        # a. To device
        batch = to_device(batch, device=self.device)
        # b. Forward Encoder
-        enc, enc_len = self.asr_model.encode_chunk(**batch)
+        enc, enc_len = self.asr_model.encode_chunk(feats, feats_len, cache)
        if isinstance(enc, tuple):
            enc = enc[0]
        # assert len(enc) == 1, len(enc)
@@ -579,6 +582,21 @@
    else:
        speech2text = Speech2Text(**speech2text_kwargs)
+    def _load_bytes(input):
+        # Interpret raw PCM bytes as int16 samples.
+        middle_data = np.frombuffer(input, dtype=np.int16)
+        middle_data = np.asarray(middle_data)
+        if middle_data.dtype.kind not in 'iu':
+            raise TypeError("'middle_data' must be an array of integers")
+        dtype = np.dtype('float32')
+        if dtype.kind != 'f':
+            raise TypeError("'dtype' must be a floating point type")
+        # Rescale from the int16 range to float32 in [-1.0, 1.0).
+        i = np.iinfo(middle_data.dtype)
+        abs_max = 2 ** (i.bits - 1)
+        offset = i.min + abs_max
+        array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32)
+        return array
    def _forward(
            data_path_and_name_and_type,
            raw_inputs: Union[np.ndarray, torch.Tensor] = None,
@@ -589,10 +607,12 @@
    ):
        # 3. Build data-iterator
        if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "bytes":
+            raw_inputs = _load_bytes(data_path_and_name_and_type[0])
+            raw_inputs = torch.tensor(raw_inputs)
        if data_path_and_name_and_type is None and raw_inputs is not None:
            if isinstance(raw_inputs, np.ndarray):
                raw_inputs = torch.tensor(raw_inputs)
        is_final = False
        if param_dict is not None and "cache" in param_dict:
            cache = param_dict["cache"]
@@ -605,61 +625,86 @@
        asr_result = ""
        wait = True
        if len(cache) == 0:
            cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None}
            cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None, "is_final": is_final, "left": 0, "right": 0}
            cache_de = {"decode_fsmn": None}
            cache["decoder"] = cache_de
            cache["first_chunk"] = True
            cache["speech"] = []
            cache["chunk_index"] = 0
            cache["speech_chunk"] = []
            cache["accum_speech"] = 0
        if raw_inputs is not None:
            if len(cache["speech"]) == 0:
                cache["speech"] = raw_inputs
            else:
                cache["speech"] = torch.cat([cache["speech"], raw_inputs], dim=0)
            if len(cache["speech_chunk"]) == 0:
                cache["speech_chunk"] = raw_inputs
            else:
                cache["speech_chunk"] = torch.cat([cache["speech_chunk"], raw_inputs], dim=0)
            while len(cache["speech_chunk"]) >= 960:
            cache["accum_speech"] += len(raw_inputs)
            while cache["accum_speech"] >= 960:
                if cache["first_chunk"]:
                    if len(cache["speech_chunk"]) >= 14400:
                        speech = torch.unsqueeze(cache["speech_chunk"][0:14400], axis=0)
                        speech_length = torch.tensor([14400])
                    if cache["accum_speech"] >= 14400:
                        speech = torch.unsqueeze(cache["speech"], axis=0)
                        speech_length = torch.tensor([len(cache["speech"])])
                        cache["encoder"]["pad_left"] = 5
                        cache["encoder"]["pad_right"] = 5
                        cache["encoder"]["stride"] = 10
                        cache["encoder"]["left"] = 5
                        cache["encoder"]["right"] = 0
                        results = speech2text(cache, speech, speech_length)
                        cache["speech_chunk"]= cache["speech_chunk"][4800:]
                        cache["accum_speech"] -= 4800
                        cache["first_chunk"] = False
                        cache["encoder"]["start_idx"] = -5
                        cache["encoder"]["is_final"] = False
                        wait = False
                    else:
                        if is_final:
                            cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960
                            cache["encoder"]["stride"] = len(cache["speech"]) // 960
                            cache["encoder"]["pad_left"] = 0
                            cache["encoder"]["pad_right"] = 0
                            speech = torch.unsqueeze(cache["speech_chunk"], axis=0)
                            speech_length = torch.tensor([len(cache["speech_chunk"])])
                            speech = torch.unsqueeze(cache["speech"], axis=0)
                            speech_length = torch.tensor([len(cache["speech"])])
                            results = speech2text(cache, speech, speech_length)
                            cache["speech_chunk"] = []
                            cache["accum_speech"] = 0
                            wait = False
                        else:
                            break
                else:
                    if len(cache["speech_chunk"]) >= 19200:
                    if cache["accum_speech"] >= 19200:
                        cache["encoder"]["start_idx"] += 10
                        cache["encoder"]["stride"] = 10
                        cache["encoder"]["pad_left"] = 5
                        speech = torch.unsqueeze(cache["speech_chunk"][:19200], axis=0)
                        speech_length = torch.tensor([19200])
                        cache["encoder"]["pad_right"] = 5
                        cache["encoder"]["left"] = 0
                        cache["encoder"]["right"] = 0
                        speech = torch.unsqueeze(cache["speech"], axis=0)
                        speech_length = torch.tensor([len(cache["speech"])])
                        results = speech2text(cache, speech, speech_length)
                        cache["speech_chunk"] = cache["speech_chunk"][9600:]
                        cache["accum_speech"] -= 9600
                        wait = False
                    else:
                        if is_final:
-                            cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960
-                            cache["encoder"]["pad_right"] = 0
-                            speech = torch.unsqueeze(cache["speech_chunk"], axis=0)
-                            speech_length = torch.tensor([len(cache["speech_chunk"])])
-                            results = speech2text(cache, speech, speech_length)
-                            cache["speech_chunk"] = []
-                            wait = False
+                            cache["encoder"]["is_final"] = True
+                            if cache["accum_speech"] >= 14400:
+                                cache["encoder"]["start_idx"] += 10
+                                cache["encoder"]["stride"] = 10
+                                cache["encoder"]["pad_left"] = 5
+                                cache["encoder"]["pad_right"] = 5
+                                cache["encoder"]["left"] = 0
+                                cache["encoder"]["right"] = cache["accum_speech"] // 960 - 15
+                                speech = torch.unsqueeze(cache["speech"], axis=0)
+                                speech_length = torch.tensor([len(cache["speech"])])
+                                results = speech2text(cache, speech, speech_length)
+                                cache["accum_speech"] -= 9600
+                                wait = False
+                            else:
+                                cache["encoder"]["start_idx"] += 10
+                                cache["encoder"]["stride"] = cache["accum_speech"] // 960 - 5
+                                cache["encoder"]["pad_left"] = 5
+                                cache["encoder"]["pad_right"] = 0
+                                cache["encoder"]["left"] = 0
+                                cache["encoder"]["right"] = 0
+                                speech = torch.unsqueeze(cache["speech"], axis=0)
+                                speech_length = torch.tensor([len(cache["speech"])])
+                                results = speech2text(cache, speech, speech_length)
+                                cache["accum_speech"] = 0
+                                wait = False
                        else:
                            break
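The sample-count constants in this loop line up with the frame-level `stride`/`pad_*` settings if one encoder input frame covers 60 ms, i.e. 960 samples at 16 kHz (an assumption consistent with Paraformer's low-frame-rate frontend; the sketch below is illustrative):

```python
# Chunk bookkeeping, assuming 16 kHz audio and 60 ms (960-sample) LFR frames.
FRAME = 960
print(14400 // FRAME)  # 15 -> first chunk: stride 10 + pad_right 5 frames
print(19200 // FRAME)  # 20 -> steady state: pad_left 5 + stride 10 + pad_right 5
print(9600 // FRAME)   # 10 -> steady-state hop: one full 10-frame stride (600 ms)
print(4800 // FRAME)   # 5  -> shorter hop after the first, left-pad-less chunk
```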
funasr/bin/asr_inference_paraformer_vad.py
@@ -338,7 +338,7 @@
                    ibest_writer["token"][key] = " ".join(token)
                    ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                    ibest_writer["vad"][key] = "{}".format(vadsegments)
                    ibest_writer["text"][key] = text_postprocessed
                    ibest_writer["text"][key] = " ".join(word_lists)
                    ibest_writer["text_with_punc"][key] = text_postprocessed_punc
                    if time_stamp_postprocessed is not None:
                        ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)
funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -670,7 +670,7 @@
                    ibest_writer["token"][key] = " ".join(token)
                    ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                    ibest_writer["vad"][key] = "{}".format(vadsegments)
                    ibest_writer["text"][key] = text_postprocessed
                    ibest_writer["text"][key] = " ".join(word_lists)
                    ibest_writer["text_with_punc"][key] = text_postprocessed_punc
                    if time_stamp_postprocessed is not None:
                        ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)
funasr/bin/asr_inference_rnnt.py
@@ -738,13 +738,13 @@
                        ibest_writer["rtf"][key] = rtf_cur
                    if text is not None:
-                        text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
+                        text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
                        item = {'key': key, 'value': text_postprocessed}
                        asr_result_list.append(item)
                        finish_count += 1
                        # asr_utils.print_progress(finish_count / file_count)
                        if writer is not None:
                            ibest_writer["text"][key] = text_postprocessed
                            ibest_writer["text"][key] = " ".join(word_lists)
                    logging.info("decoding, utt: {}, predictions: {}".format(key, text))
        rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor))
funasr/bin/asr_inference_uniasr.py
@@ -504,13 +504,13 @@
                    ibest_writer["score"][key] = str(hyp.score)
    
                if text is not None:
-                    text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
+                    text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
                    item = {'key': key, 'value': text_postprocessed}
                    asr_result_list.append(item)
                    finish_count += 1
                    asr_utils.print_progress(finish_count / file_count)
                    if writer is not None:
                        ibest_writer["text"][key] = text_postprocessed
                        ibest_writer["text"][key] = " ".join(word_lists)
        return asr_result_list
    
    return _forward
funasr/bin/asr_inference_uniasr_vad.py
@@ -507,13 +507,13 @@
                    ibest_writer["score"][key] = str(hyp.score)
    
                if text is not None:
-                    text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
+                    text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
                    item = {'key': key, 'value': text_postprocessed}
                    asr_result_list.append(item)
                    finish_count += 1
                    asr_utils.print_progress(finish_count / file_count)
                    if writer is not None:
                        ibest_writer["text"][key] = text_postprocessed
                        ibest_writer["text"][key] = " ".join(word_lists)
        return asr_result_list
    
    return _forward
funasr/datasets/large_datasets/utils/tokenize.py
@@ -37,7 +37,7 @@
    vad = -2
    if bpe_tokenizer is not None:
-        text = bpe_tokenizer.text2tokens(text)
+        text = bpe_tokenizer.text2tokens("".join(text))
    if seg_dict is not None:
        assert isinstance(seg_dict, dict)
funasr/export/export_model.py
@@ -19,6 +19,7 @@
        self,
        cache_dir: Union[Path, str] = None,
        onnx: bool = True,
        device: str = "cpu",
        quant: bool = True,
        fallback_num: int = 0,
        audio_in: str = None,
@@ -36,6 +37,7 @@
        )
        print("output dir: {}".format(self.cache_dir))
        self.onnx = onnx
        self.device = device
        self.quant = quant
        self.fallback_num = fallback_num
        self.frontend = None
@@ -111,6 +113,10 @@
            dummy_input = model.get_dummy_inputs(enc_size)
        else:
            dummy_input = model.get_dummy_inputs()
        if self.device == 'cuda':
            model = model.cuda()
            dummy_input = tuple([i.cuda() for i in dummy_input])
        # model_script = torch.jit.script(model)
        model_script = torch.jit.trace(model, dummy_input)
@@ -260,6 +266,7 @@
    parser.add_argument('--model-name', type=str, required=True)
    parser.add_argument('--export-dir', type=str, required=True)
    parser.add_argument('--type', type=str, default='onnx', help='["onnx", "torch"]')
    parser.add_argument('--device', type=str, default='cpu', help='["cpu", "cuda"]')
    parser.add_argument('--quantize', type=str2bool, default=False, help='export quantized model')
    parser.add_argument('--fallback-num', type=int, default=0, help='amp fallback number')
    parser.add_argument('--audio_in', type=str, default=None, help='["wav", "wav.scp"]')
@@ -269,6 +276,7 @@
    export_model = ModelExport(
        cache_dir=args.export_dir,
        onnx=args.type == 'onnx',
        device=args.device,
        quant=args.quantize,
        fallback_num=args.fallback_num,
        audio_in=args.audio_in,
funasr/export/models/modules/multihead_att.py
@@ -75,8 +75,8 @@
    return x, cache
-torch_version = float(".".join(torch.__version__.split(".")[:2]))
-if torch_version >= 1.8:
+torch_version = tuple([int(i) for i in torch.__version__.split(".")[:2]])
+if torch_version >= (1, 8):
    import torch.fx
    torch.fx.wrap('preprocess_for_attn')
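The switch from float to tuple comparison matters once PyTorch minor versions reach two digits; a quick illustration with a hypothetical version string:

```python
ver = "1.13.1"
as_float = float(".".join(ver.split(".")[:2]))        # 1.13
as_tuple = tuple(int(i) for i in ver.split(".")[:2])  # (1, 13)

assert not (as_float >= 1.8)  # float compare wrongly treats 1.13 as older than 1.8
assert as_tuple >= (1, 8)     # tuple compare handles multi-digit minors correctly
```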
funasr/models/decoder/contextual_decoder.py
@@ -74,7 +74,7 @@
        return x, tgt_mask, x_self_attn, x_src_attn
-class ContexutalBiasDecoder(nn.Module):
+class ContextualBiasDecoder(nn.Module):
    def __init__(
        self,
        size,
@@ -83,7 +83,7 @@
        normalize_before=True,
    ):
        """Construct an DecoderLayer object."""
-        super(ContexutalBiasDecoder, self).__init__()
+        super(ContextualBiasDecoder, self).__init__()
        self.size = size
        self.src_attn = src_attn
        if src_attn is not None:
@@ -186,7 +186,7 @@
            ),
        )
        self.dropout = nn.Dropout(dropout_rate)
-        self.bias_decoder = ContexutalBiasDecoder(
+        self.bias_decoder = ContextualBiasDecoder(
            size=attention_dim,
            src_attn=MultiHeadedAttentionCrossAtt(
                attention_heads, attention_dim, src_attention_dropout_rate
funasr/models/decoder/sanm_decoder.py
@@ -104,7 +104,6 @@
            x = residual + self.dropout(self.src_attn(x, memory, memory_mask))
        return x, tgt_mask, memory, memory_mask, cache
    def forward_chunk(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):
@@ -400,7 +399,7 @@
        for i in range(self.att_layer_num):
            decoder = self.decoders[i]
            c = cache[i]
-            x, tgt_mask, memory, memory_mask, c_ret = decoder(
+            x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
                x, tgt_mask, memory, memory_mask, cache=c
            )
            new_cache.append(c_ret)
@@ -410,13 +409,13 @@
                j = i + self.att_layer_num
                decoder = self.decoders2[i]
                c = cache[j]
-                x, tgt_mask, memory, memory_mask, c_ret = decoder(
+                x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
                    x, tgt_mask, memory, memory_mask, cache=c
                )
                new_cache.append(c_ret)
        for decoder in self.decoders3:
-            x, tgt_mask, memory, memory_mask, _ = decoder(
+            x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk(
                x, tgt_mask, memory, None, cache=None
            )
@@ -1077,7 +1076,7 @@
        for i in range(self.att_layer_num):
            decoder = self.decoders[i]
            c = cache[i]
-            x, tgt_mask, memory, memory_mask, c_ret = decoder(
+            x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
                x, tgt_mask, memory, None, cache=c
            )
            new_cache.append(c_ret)
@@ -1087,14 +1086,14 @@
                j = i + self.att_layer_num
                decoder = self.decoders2[i]
                c = cache[j]
-                x, tgt_mask, memory, memory_mask, c_ret = decoder(
+                x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
                    x, tgt_mask, memory, None, cache=c
                )
                new_cache.append(c_ret)
        for decoder in self.decoders3:
-            x, tgt_mask, memory, memory_mask, _ = decoder(
+            x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk(
                x, tgt_mask, memory, None, cache=None
            )
funasr/models/e2e_asr_paraformer.py
@@ -370,19 +370,10 @@
                encoder_out, encoder_out_lens
            )
-        assert encoder_out.size(0) == speech.size(0), (
-            encoder_out.size(),
-            speech.size(0),
-        )
-        assert encoder_out.size(1) <= encoder_out_lens.max(), (
-            encoder_out.size(),
-            encoder_out_lens.max(),
-        )
        if intermediate_outs is not None:
            return (encoder_out, intermediate_outs), encoder_out_lens
-        return encoder_out, encoder_out_lens
+        return encoder_out, torch.tensor([encoder_out.size(1)])
    def calc_predictor(self, encoder_out, encoder_out_lens):
@@ -1034,16 +1025,76 @@
        # 1. Encoder
        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
        intermediate_outs = None
        if isinstance(encoder_out, tuple):
            intermediate_outs = encoder_out[1]
            encoder_out = encoder_out[0]
        loss_att, acc_att, cer_att, wer_att = None, None, None, None
        loss_ctc, cer_ctc = None, None
        loss_pre = None
        stats = dict()
        # 1. CTC branch
        if self.ctc_weight != 0.0:
            loss_ctc, cer_ctc = self._calc_ctc_loss(
                encoder_out, encoder_out_lens, text, text_lengths
            )
            # Collect CTC branch stats
            stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None
            stats["cer_ctc"] = cer_ctc
        # Intermediate CTC (optional)
        loss_interctc = 0.0
        if self.interctc_weight != 0.0 and intermediate_outs is not None:
            for layer_idx, intermediate_out in intermediate_outs:
                # we assume intermediate_out has the same length & padding
                # as those of encoder_out
                loss_ic, cer_ic = self._calc_ctc_loss(
                    intermediate_out, encoder_out_lens, text, text_lengths
                )
                loss_interctc = loss_interctc + loss_ic
                # Collect Intermediate CTC stats
                stats["loss_interctc_layer{}".format(layer_idx)] = (
                    loss_ic.detach() if loss_ic is not None else None
                )
                stats["cer_interctc_layer{}".format(layer_idx)] = cer_ic
            loss_interctc = loss_interctc / len(intermediate_outs)
            # calculate whole encoder loss
            loss_ctc = (
                1 - self.interctc_weight
            ) * loss_ctc + self.interctc_weight * loss_interctc
        # 2b. Attention decoder branch
        if self.ctc_weight != 1.0:
            loss_att, acc_att, cer_att, wer_att, loss_pre = self._calc_att_loss(
                encoder_out, encoder_out_lens, text, text_lengths
            )
        loss_pre2 = self._calc_pre2_loss(
            encoder_out, encoder_out_lens, text, text_lengths
        )
        loss = loss_pre2
        # 3. CTC-Att loss definition
        if self.ctc_weight == 0.0:
            loss = loss_att + loss_pre * self.predictor_weight + loss_pre2 * self.predictor_weight * 0.5
        elif self.ctc_weight == 1.0:
            loss = loss_ctc
        else:
            loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight + loss_pre2 * self.predictor_weight * 0.5
        # Collect Attn branch stats
        stats["loss_att"] = loss_att.detach() if loss_att is not None else None
        stats["acc"] = acc_att
        stats["cer"] = cer_att
        stats["wer"] = wer_att
        stats["loss_pre"] = loss_pre.detach().cpu() if loss_pre is not None else None
        stats["loss_pre2"] = loss_pre2.detach().cpu()
        stats["loss"] = torch.clone(loss.detach())
        # force_gatherable: to-device and to-tensor if scalar for DataParallel
@@ -1094,6 +1145,7 @@
            inner_dim: int = 256,
            bias_encoder_type: str = 'lstm',
            label_bracket: bool = False,
            use_decoder_embedding: bool = False,
    ):
        assert check_argument_types()
        assert 0.0 <= ctc_weight <= 1.0, ctc_weight
@@ -1147,6 +1199,7 @@
            self.hotword_buffer = None
            self.length_record = []
            self.current_buffer_length = 0
        self.use_decoder_embedding = use_decoder_embedding
    def forward(
            self,
@@ -1288,7 +1341,10 @@
                    hw_list.append(hw_tokens)
        # padding
        hw_list_pad = pad_list(hw_list, 0)
        if self.use_decoder_embedding:
            hw_embed = self.decoder.embed(hw_list_pad)
        else:
            hw_embed = self.bias_embed(hw_list_pad)
        hw_embed, (_, _) = self.bias_encoder(hw_embed)
        _ind = np.arange(0, len(hw_list)).tolist()
        # update self.hotword_buffer, throw a part if oversize
@@ -1404,12 +1460,18 @@
            # default hotword list
            hw_list = [torch.Tensor([self.sos]).long().to(encoder_out.device)]  # empty hotword list
            hw_list_pad = pad_list(hw_list, 0)
            if self.use_decoder_embedding:
                hw_embed = self.decoder.embed(hw_list_pad)
            else:
                hw_embed = self.bias_embed(hw_list_pad)
            _, (h_n, _) = self.bias_encoder(hw_embed)
            contextual_info = h_n.squeeze(0).repeat(encoder_out.shape[0], 1, 1)
        else:
            hw_lengths = [len(i) for i in hw_list]
            hw_list_pad = pad_list([torch.Tensor(i).long() for i in hw_list], 0).to(encoder_out.device)
            if self.use_decoder_embedding:
                hw_embed = self.decoder.embed(hw_list_pad)
            else:
                hw_embed = self.bias_embed(hw_list_pad)
            hw_embed = torch.nn.utils.rnn.pack_padded_sequence(hw_embed, hw_lengths, batch_first=True,
                                                               enforce_sorted=False)
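The three hunks above all follow the same pattern: hotword token ids are embedded either with the decoder's own embedding table (`use_decoder_embedding=True`) or with the dedicated `bias_embed` layer, and the result is fed to the LSTM `bias_encoder`. A minimal sketch of that pattern as a standalone helper; the function name and the explicit module arguments are hypothetical:

```python
from torch.nn.utils.rnn import pack_padded_sequence

def encode_hotwords(hw_list_pad, bias_encoder, bias_embed=None,
                    decoder_embed=None, hw_lengths=None):
    if decoder_embed is not None:            # use_decoder_embedding=True path
        hw_embed = decoder_embed(hw_list_pad)
    else:                                    # default: dedicated hotword table
        hw_embed = bias_embed(hw_list_pad)
    if hw_lengths is not None:               # pack variable-length hotwords, as in the last hunk
        hw_embed = pack_padded_sequence(hw_embed, hw_lengths,
                                        batch_first=True, enforce_sorted=False)
    _, (h_n, _) = bias_encoder(hw_embed)     # final hidden state summarizes each hotword
    return h_n
```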
funasr/models/predictor/cif.py
@@ -200,6 +200,7 @@
        return acoustic_embeds, token_num, alphas, cif_peak
    def forward_chunk(self, hidden, cache=None):
        b, t, d = hidden.size()
        h = hidden
        context = h.transpose(1, 2)
        queries = self.pad(context)
@@ -220,6 +221,8 @@
            alphas = alphas * mask_chunk_predictor
      
        if cache is not None:
            if cache["is_final"]:
                alphas[:, cache["stride"] + cache["pad_left"] - 1] += 0.45
            if cache["cif_hidden"] is not None:
                hidden = torch.cat((cache["cif_hidden"], hidden), 1)
            if cache["cif_alphas"] is not None:
@@ -240,7 +243,6 @@
                pre_alphas_length = cache["cif_alphas"].size(-1)
                mask_chunk_peak_predictor[:, :pre_alphas_length] = 1.0
            mask_chunk_peak_predictor[:, pre_alphas_length + cache["pad_left"]:pre_alphas_length + cache["stride"] + cache["pad_left"]] = 1.0
        if mask_chunk_peak_predictor is not None:
            cif_peak = cif_peak * mask_chunk_peak_predictor.squeeze(-1)
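For context on the `+= 0.45` nudge: CIF integrates the per-frame weights `alphas` and fires a token embedding whenever the running sum crosses 1.0, so on the final chunk the last in-stride frame is boosted to make a trailing partial accumulation fire instead of being dropped. A toy sketch of the fire rule itself (no streaming cache), assuming each alpha stays below the threshold:

```python
import torch

def cif_fire(hidden, alphas, threshold=1.0):
    """Toy CIF: hidden (T, D), alphas (T,) -> one embedding per fired token."""
    fired, acc = [], 0.0
    frame_acc = torch.zeros(hidden.size(1))
    for h, a in zip(hidden, alphas):
        a = float(a)
        if acc + a < threshold:
            acc += a
            frame_acc = frame_acc + a * h
        else:
            r = threshold - acc              # weight needed to complete this token
            fired.append(frame_acc + r * h)  # fire
            acc = a - r                      # leftover opens the next token
            frame_acc = acc * h
    return torch.stack(fired) if fired else hidden.new_zeros(0, hidden.size(1))
```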
funasr/modules/embedding.py
@@ -8,7 +8,7 @@
import math
import torch
import torch.nn.functional as F
def _pre_hook(
    state_dict,
@@ -409,9 +409,18 @@
    def forward_chunk(self, x, cache=None):
        start_idx = 0
        pad_left = 0
        pad_right = 0
        batch_size, timesteps, input_dim = x.size()
        if cache is not None:
            start_idx = cache["start_idx"]
            pad_left = cache["left"]
            pad_right = cache["right"]
        positions = torch.arange(1, timesteps+start_idx+1)[None, :]
        position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
        outputs = x + position_encoding[:, start_idx: start_idx + timesteps]
        outputs = outputs.transpose(1,2)
        outputs = F.pad(outputs, (pad_left, pad_right))
        outputs = outputs.transpose(1,2)
        return outputs
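The rewritten `forward_chunk` offsets the sinusoidal positions by `start_idx`, so a streamed chunk continues the absolute position sequence of the previous chunks, and then zero-pads `pad_left`/`pad_right` frames on the time axis. A minimal sketch of the same logic; `sinusoidal` is a hypothetical stand-in for `self.encode` and assumes an even `input_dim`:

```python
import math
import torch
import torch.nn.functional as F

def sinusoidal(positions, dim):
    inv = torch.exp(torch.arange(0, dim, 2).float() * (-math.log(10000.0) / dim))
    angles = positions.float().unsqueeze(-1) * inv           # (1, T, dim/2)
    return torch.stack((angles.sin(), angles.cos()), -1).flatten(-2)  # (1, T, dim)

def add_pe_chunk(x, start_idx=0, pad_left=0, pad_right=0):
    b, t, d = x.size()
    positions = torch.arange(1, t + start_idx + 1)[None, :]  # absolute positions so far
    pe = sinusoidal(positions, d).to(x.dtype)
    out = x + pe[:, start_idx:start_idx + t]                 # slice this chunk's span
    out = F.pad(out.transpose(1, 2), (pad_left, pad_right))  # pad the time axis
    return out.transpose(1, 2)
```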
funasr/runtime/grpc/Readme.md
@@ -53,6 +53,68 @@
python grpc_main_client_mic.py  --host $server_ip --port 10108
```
The `grpc_main_client_mic.py` follows the [original design](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/grpc#workflow-in-desgin) by sending audio_data in chunks. If you want to send all the audio_data in one request, here is an example:
```
import asyncio
import json
import queue
import time

import grpc
import soundfile as sf

# go to ../python/grpc to find this package
import paraformer_pb2
class RecognizeStub:
    def __init__(self, channel):
        self.Recognize = channel.stream_stream(
                '/paraformer.ASR/Recognize',
                request_serializer=paraformer_pb2.Request.SerializeToString,
                response_deserializer=paraformer_pb2.Response.FromString,
                )
async def send(channel, data, speaking, isEnd):
    stub = RecognizeStub(channel)
    req = paraformer_pb2.Request()
    if data:
        req.audio_data = data
    req.user = 'zz'
    req.language = 'zh-CN'
    req.speaking = speaking
    req.isEnd = isEnd
    q = queue.SimpleQueue()
    q.put(req)
    return stub.Recognize(iter(q.get, None))
# send the audio data once
async def grpc_rec(data, grpc_uri):
    with grpc.insecure_channel(grpc_uri) as channel:
        b = time.time()
        response = await send(channel, data, False, False)
        resp = response.next()
        text = ''
        if 'decoding' == resp.action:
            resp = response.next()
            if 'finish' == resp.action:
                text = json.loads(resp.sentence)['text']
        response = await send(channel, None, False, True)
        return {
                'text': text,
                'time': time.time() - b,
                }
async def test():
    # fc = FunAsrGrpcClient('127.0.0.1', 9900)
    # t = await fc.rec(wav.tobytes())
    # print(t)
    wav, _ = sf.read('z-10s.wav', dtype='int16')
    uri = '127.0.0.1:9900'
    res = await grpc_rec(wav.tobytes(), uri)
    print(res)
if __name__ == '__main__':
    asyncio.run(test())
```
## Acknowledgements
1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
2. We acknowledge [DeepScience](https://www.deepscience.cn) for contributing the grpc service.
funasr/runtime/grpc/paraformer_server.cc
@@ -88,7 +88,7 @@
            res.set_language(req.language());
            stream->Write(res);
        } else if (!req.speaking()) {
            if (client_buffers.count(req.user()) == 0 && req.audio_data().size() == 0) {
                Response res;
                res.set_sentence(
                    R"({"success": true, "detail": "waiting_for_voice"})"
@@ -99,6 +99,10 @@
                stream->Write(res);
            }else {
                auto begin_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
                if (req.audio_data().size() > 0) {
                  auto& buf = client_buffers[req.user()];
                  buf.insert(buf.end(), req.audio_data().begin(), req.audio_data().end());
                }
                std::string tmp_data = this->client_buffers[req.user()];
                this->clear_states(req.user());
                
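The server change above makes a one-shot request work: audio bytes arriving on the final (`speaking == false`) message are appended to the per-user buffer before it is consumed, and the `waiting_for_voice` reply is sent only when there is neither buffered nor inline audio. A minimal Python sketch of that rule (the names are hypothetical, not the C++ API):

```python
client_buffers = {}  # user -> bytearray of pending PCM bytes

def on_stop_speaking(user, audio_data):
    if user not in client_buffers and not audio_data:
        return {"success": True, "detail": "waiting_for_voice"}  # nothing to decode yet
    if audio_data:  # audio carried by the final request is buffered first
        client_buffers.setdefault(user, bytearray()).extend(audio_data)
    pcm = bytes(client_buffers.pop(user, b""))  # consume the buffer and clear state
    return {"decode": pcm}  # placeholder: hand off to the recognizer
```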
funasr/runtime/python/grpc/grpc_server.py
@@ -109,7 +109,7 @@
                            else:
                                asr_result = ""
                        elif self.backend == "onnxruntime":
                            from funasr_onnx.utils.frontend import load_bytes
                            array = load_bytes(tmp_data)
                            asr_result = self.inference_16k_pipeline(array)[0]
                        end_time = int(round(time.time() * 1000))
funasr/runtime/python/libtorch/README.md
@@ -31,7 +31,7 @@
    ```shell
    git clone https://github.com/alibaba/FunASR.git && cd FunASR
    cd funasr/runtime/python/libtorch
    python setup.py build
    python setup.py install
    ```
funasr/runtime/python/libtorch/demo.py
@@ -1,10 +1,15 @@
from funasr_torch import Paraformer
model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1)
wav_path = ['/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1)  # cpu
# model = Paraformer(model_dir, batch_size=1, device_id=0)  # gpu
# when using the paraformer-large-vad-punc model, you can set plot_timestamp_to="./xx.png" to get a figure of the alignment in addition to the timestamps
# model = Paraformer(model_dir, batch_size=1, plot_timestamp_to="test.png")
wav_path = "YourPath/xx.wav"
result = model(wav_path)
print(result)
funasr/runtime/python/libtorch/funasr_torch/paraformer_bin.py
@@ -46,6 +46,7 @@
        )
        self.ort_infer = torch.jit.load(model_file)
        self.batch_size = batch_size
        self.device_id = device_id
        self.plot_timestamp_to = plot_timestamp_to
        self.pred_bias = pred_bias
@@ -58,8 +59,13 @@
            end_idx = min(waveform_nums, beg_idx + self.batch_size)
            feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
            try:
                with torch.no_grad():
                    if int(self.device_id) == -1:
                        outputs = self.ort_infer(feats, feats_len)
                        am_scores, valid_token_lens = outputs[0], outputs[1]
                    else:
                        outputs = self.ort_infer(feats.cuda(), feats_len.cuda())
                        am_scores, valid_token_lens = outputs[0].cpu(), outputs[1].cpu()
                if len(outputs) == 4:
                    # for BiCifParaformer Inference
                    us_alphas, us_peaks = outputs[2], outputs[3]
funasr/runtime/python/onnxruntime/README.md
@@ -32,7 +32,7 @@
```shell
git clone https://github.com/alibaba/FunASR.git && cd FunASR
cd funasr/runtime/python/onnxruntime
python setup.py build
python setup.py install
```
funasr/runtime/python/onnxruntime/demo.py
@@ -1,13 +1,15 @@
from funasr_onnx import Paraformer
model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch"
# if you use paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch, you should set pred_bias=0
# plot_timestamp_to works only when using speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch
model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0)
model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
wav_path = "/Users/shixian/code/funasr/export/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/example/asr_example.wav"
model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0)  # cpu
# model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0, device_id=0)  # gpu
# when using the paraformer-large-vad-punc model, you can set plot_timestamp_to="./xx.png" to get a figure of the alignment in addition to the timestamps
# model = Paraformer(model_dir, batch_size=1, plot_timestamp_to="test.png")
wav_path = "YourPath/xx.wav"
result = model(wav_path)
print(result)
funasr/tasks/abs_task.py
@@ -464,6 +464,12 @@
            default=sys.maxsize,
            help="The maximum number update step to train",
        )
        parser.add_argument(
            "--batch_interval",
            type=int,
            default=10000,
            help="The batch interval for saving model.",
        )
        group.add_argument(
            "--patience",
            type=int_or_none,
@@ -1355,15 +1361,15 @@
                from funasr.datasets.large_datasets.build_dataloader import ArkDataLoader
                train_iter_factory = ArkDataLoader(args.train_data_file, args.token_list, args.dataset_conf,
                                                   frontend_conf=args.frontend_conf if hasattr(args, "frontend_conf") else None,
                                                   seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
                                                   punc_dict_file=args.punc_list if hasattr(args, "punc_list") else None,
                                                   bpemodel_file=args.bpemodel if hasattr(args, "bpemodel") else None,
                                                   mode="train")
                valid_iter_factory = ArkDataLoader(args.valid_data_file, args.token_list, args.dataset_conf, 
                                                   frontend_conf=args.frontend_conf if hasattr(args, "frontend_conf") else None,
                                                   seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
                                                   punc_dict_file=args.punc_list if hasattr(args, "punc_list") else None,
                                                   bpemodel_file=args.bpemodel if hasattr(args, "bpemodel") else None,
                                                   mode="eval")
            elif args.dataset_type == "small":
                train_iter_factory = cls.build_iter_factory(
@@ -1576,13 +1582,18 @@
    ) -> AbsIterFactory:
        assert check_argument_types()
        if args.frontend_conf is not None and "fs" in args.frontend_conf:
            dest_sample_rate = args.frontend_conf["fs"]
        else:
            dest_sample_rate = 16000
        dataset = ESPnetDataset(
            iter_options.data_path_and_name_and_type,
            float_dtype=args.train_dtype,
            preprocess=iter_options.preprocess_fn,
            max_cache_size=iter_options.max_cache_size,
            max_cache_fd=iter_options.max_cache_fd,
            dest_sample_rate=dest_sample_rate,
        )
        cls.check_task_requirements(
            dataset, args.allow_variable_data_keys, train=iter_options.train
funasr/tasks/asr.py
@@ -412,12 +412,6 @@
            default="13_15",
            help="The range of noise decibel level.",
        )
        parser.add_argument(
            "--batch_interval",
            type=int,
            default=10000,
            help="The batch interval for saving model.",
        )
        for class_choices in cls.class_choices_list:
            # Append --<name> and --<name>_conf.
funasr/train/trainer.py
@@ -580,8 +580,9 @@
        ):
            assert isinstance(batch, dict), type(batch)
        
            if rank == 0:
                if hasattr(model, "num_updates") or (hasattr(model, "module") and hasattr(model.module, "num_updates")):
                    num_batch_updates = model.get_num_updates() if hasattr(model, "num_updates") else model.module.get_num_updates()
                if (num_batch_updates % batch_interval == 0) and (options.oss_bucket is not None) and options.use_pai:
                    buffer = BytesIO()
                    torch.save(model.state_dict(), buffer)
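The trainer hunk does two things: it resolves `num_updates` whether or not the model is wrapped in DistributedDataParallel (which adds a `.module` attribute), and it saves an in-memory checkpoint every `batch_interval` updates, the new flag registered in `abs_task.py` above. A minimal sketch with a hypothetical `save_fn` standing in for the OSS upload:

```python
from io import BytesIO
import torch

def maybe_save_checkpoint(model, batch_interval, save_fn, rank=0):
    if rank != 0:
        return
    target = model.module if hasattr(model, "module") else model  # unwrap DDP
    if not hasattr(target, "num_updates"):
        return
    num_batch_updates = target.get_num_updates()
    if num_batch_updates % batch_interval == 0:
        buffer = BytesIO()
        torch.save(model.state_dict(), buffer)  # serialize without touching disk
        save_fn(buffer.getvalue(), num_batch_updates)
```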
funasr/utils/compute_wer.py
@@ -45,8 +45,8 @@
           if out_item['wrong'] > 0:
               rst['wrong_sentences'] += 1
           cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
           cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
           cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')
           cer_detail_writer.write("ref:" + '\t' + " ".join(list(map(lambda x: x.lower(), ref_dict[hyp_key]))) + '\n')
           cer_detail_writer.write("hyp:" + '\t' + " ".join(list(map(lambda x: x.lower(), hyp_dict[hyp_key]))) + '\n')
    if rst['Wrd'] > 0:
        rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)
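The compute_wer change only affects the human-readable detail file: ref/hyp tokens are now lower-cased and space-joined rather than concatenated, so English words stay separated and casing is normalized for inspection. A tiny sketch of the new formatting, assuming token lists like `ref_dict[hyp_key]`:

```python
def format_tokens(tokens):
    # lower-case each token and keep token boundaries visible
    return " ".join(t.lower() for t in tokens)

assert format_tokens(["Hello", "WORLD"]) == "hello world"
```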
funasr/version.txt
@@ -1 +1 @@
0.3.2
0.3.3