python/FunASR-XL.git

parent: 2eb69485 | 补丁 | 提交 | ignore whitespace

嘉渊

2023-05-11 5dd5332fd50ed3f81ac34b375f71144f27fd2711

update repo

1个文件已删除

1个文件已修改

1个文件已添加

	egs/aishell/transformer/local/download_and_untar.sh	105 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
	egs/aishell/transformer/local/prepare_data.sh	53 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
	egs/aishell/transformer/run.sh	69 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史

 egs/aishell/transformer/local/download_and_untar.sh

New file
@@ -0,0 +1,105 @@
#!/usr/bin/env bash

# Copyright   2014  Johns Hopkins University (author: Daniel Povey)
#             2017  Xingyu Na
# Apache 2.0

# Downloads the archive <url-base>/<corpus-part>.tgz into <data-base> with
# wget, unpacks it there and, for data_aishell, also unpacks every *.tar.gz
# found in <data-base>/data_aishell/wav.  A marker file
# <data-base>/<corpus-part>/.complete is written on success so that later
# invocations return immediately.
#
# Usage: download_and_untar.sh [--remove-archive] <data-base> <url-base> <corpus-part>
#   --remove-archive  delete the .tgz once it has been unpacked
#   <data-base>       existing directory that receives the archive and data
#   <url-base>        URL prefix; "/<corpus-part>.tgz" is appended to it
#   <corpus-part>     data_aishell or resource_aishell
# Exit status: 0 on success or when already complete, 1 on any error.

remove_archive=false

if [ "$1" = --remove-archive ]; then
  remove_archive=true
  shift
fi

if [ $# -ne 3 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
  # Stop here: continuing with a wrong argument count would act on
  # misinterpreted positional parameters.
  exit 1
fi

data=$1
url=$2
part=$3

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data"
  exit 1
fi

# Work with an absolute path from here on.  The script changes directory more
# than once, so a relative <data-base> would otherwise stop resolving after
# the first cd.  CDPATH is cleared so cd neither searches elsewhere nor prints.
data=$(CDPATH= cd -- "$data" && pwd) || exit 1

part_ok=false
list="data_aishell resource_aishell"
# $list is intentionally unquoted: it is a space-separated word list.
for x in $list; do
  if [ "$part" = "$x" ]; then part_ok=true; fi
done
if ! $part_ok; then
  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
  exit 1
fi

if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1
fi

if [ -f "$data/$part/.complete" ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0
fi

archive=$data/$part.tgz

# sizes of the archive files in bytes.
sizes="15582913665 1246920"

# An archive left over from an earlier run is only reused when its size equals
# one of the known archive sizes; otherwise it is deleted and fetched again.
if [ -f "$archive" ]; then
  size=$(/bin/ls -l "$archive" | awk '{print $5}')
  size_ok=false
  for s in $sizes; do
    if [ "$s" = "$size" ]; then size_ok=true; fi
  done
  if ! $size_ok; then
    echo "$0: removing existing file $archive because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm -- "$archive"
  else
    echo "$archive exists and appears to be complete."
  fi
fi

if [ ! -f "$archive" ]; then
  if ! command -v wget >/dev/null; then
    echo "$0: wget is not installed."
    exit 1
  fi
  full_url=$url/$part.tgz
  echo "$0: downloading data from $full_url.  This may take some time, please be patient."

  # wget stores the file in the current directory.
  cd "$data" || exit 1
  # NOTE(review): --no-check-certificate disables TLS verification; kept
  # because existing callers may rely on it.
  if ! wget --no-check-certificate "$full_url"; then
    echo "$0: error executing wget $full_url"
    exit 1
  fi
fi

cd "$data" || exit 1

if ! tar -xvzf "$part.tgz"; then
  echo "$0: error un-tarring archive $archive"
  exit 1
fi

if [ "$part" = "data_aishell" ]; then
  cd "$data/$part/wav" || exit 1
  for wav in ./*.tar.gz; do
    # An unmatched glob stays literal; skip it instead of passing it to tar.
    [ -e "$wav" ] || continue
    echo "Extracting wav from $wav"
    if ! tar -zxf "$wav"; then
      echo "$0: error un-tarring archive $data/$part/wav/${wav#./}"
      exit 1
    fi
    rm -- "$wav"
  done
fi

# Written last, so an interrupted or failed extraction is never recorded as
# complete and will be retried on the next invocation.
touch "$data/$part/.complete"

echo "$0: Successfully downloaded and un-tarred $archive"

if $remove_archive; then
  echo "$0: removing $archive file since --remove-archive option was supplied."
  rm -- "$archive"
fi

exit 0

 egs/aishell/transformer/local/prepare_data.sh

File was deleted

 egs/aishell/transformer/run.sh

@@ -3,12 +3,12 @@
. ./path.sh || exit 1;

# machines configuration
CUDA_VISIBLE_DEVICES="2,3"
CUDA_VISIBLE_DEVICES="0,1"
gpu_num=2
count=1
gpu_inference=true  # Whether to perform gpu decoding, set false for cpu decoding
# for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
njob=1
njob=5
train_cmd=utils/run.pl
infer_cmd=utils/run.pl

@@ -16,13 +16,11 @@
feats_dir="../DATA" #feature output dictionary
exp_dir="."
lang=zh
dumpdir=dump/fbank
feats_type=fbank
token_type=char
scp=wav.scp
type=sound
scp=wav.scp
stage=3
stop_stage=3
stop_stage=4

# feature configuration
feats_dim=80
@@ -48,7 +46,7 @@
test_sets="dev test"

asr_config=conf/train_asr_transformer.yaml
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}"

inference_config=conf/decode_asr_transformer.yaml
inference_asr_model=valid.acc.ave_10best.pb
@@ -143,4 +141,61 @@
        } &
        done
        wait
fi

# Testing Stage
# Stage 4 decodes every set listed in ${test_sets} and scores the result.
# It runs only when ${stage} <= 4 <= ${stop_stage}.
# NOTE(review): _ngpu, inference_nj, gpuid_list and train_set are not defined
# in the visible hunks; presumably they are set earlier in run.sh - confirm.
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "stage 4: Inference"
    for dset in ${test_sets}; do
        # Output layout: <exp_dir>/exp/<model_dir>/<decode config name>/<model file>/<dset>
        asr_exp=${exp_dir}/exp/${model_dir}
        inference_tag="$(basename "${inference_config}" .yaml)"
        _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
        _logdir="${_dir}/logdir"
        # An existing result directory is never overwritten.  Note that this
        # exits the whole script, so any remaining test sets are not decoded.
        if [ -d ${_dir} ]; then
            echo "${_dir} is already exists. if you want to decode again, please delete this dir first."
            exit 0
        fi
        mkdir -p "${_logdir}"
        _data="${feats_dir}/data/${dset}"
        key_file=${_data}/${scp}
        # Number of lines in the key file.
        num_scp_file="$(<${key_file} wc -l)"
        # Job count is the smaller of inference_nj and the key-file line count.
        _nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file")
        # Build the space-separated list of per-job key files keys.1.scp .. keys.<_nj>.scp.
        split_scps=
        for n in $(seq "${_nj}"); do
            split_scps+=" ${_logdir}/keys.${n}.scp"
        done
        # ${split_scps} is left unquoted on purpose so that each path becomes
        # its own argument; presumably split_scp.pl divides the key file
        # across them - confirm against utils/split_scp.pl.
        # shellcheck disable=SC2086
        utils/split_scp.pl "${key_file}" ${split_scps}
        # Pass --config to the decoder only when a decode config is set.
        _opts=
        if [ -n "${inference_config}" ]; then
            _opts+="--config ${inference_config} "
        fi
        # Launch the decoding jobs through ${infer_cmd}.  JOB=1:<_nj> is handed
        # to ${infer_cmd}; presumably it substitutes the job index for "JOB" in
        # the log, key-file and output paths - confirm against utils/run.pl.
        ${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
            python -m funasr.bin.asr_inference_launch \
                --batch_size 1 \
                --ngpu "${_ngpu}" \
                --njob ${njob} \
                --gpuid_list ${gpuid_list} \
                --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
                --cmvn_file ${feats_dir}/data/${train_set}/cmvn/cmvn.mvn \
                --key_file "${_logdir}"/keys.JOB.scp \
                --asr_train_config "${asr_exp}"/config.yaml \
                --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
                --output_dir "${_logdir}"/output.JOB \
                --mode asr \
                ${_opts}

        # Merge the per-job result files into one sorted file per kind.  A kind
        # is merged only if job 1 produced it.
        for f in token token_int score text; do
            if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
                for i in $(seq "${_nj}"); do
                    cat "${_logdir}/output.${i}/1best_recog/${f}"
                done | sort -k1 >"${_dir}/${f}"
            fi
        done
        # Scoring: both the hypothesis text and the reference text are passed
        # through proce_text.py, then compared by compute_wer.py.  The
        # processed reference is written into the data directory itself.
        # NOTE(review): what the two helper scripts do exactly is not visible here.
        python utils/proce_text.py ${_dir}/text ${_dir}/text.proc
        python utils/proce_text.py ${_data}/text ${_data}/text.proc
        python utils/compute_wer.py ${_data}/text.proc ${_dir}/text.proc ${_dir}/text.cer
        # Keep the last three lines of the report as a summary and print them.
        tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
        cat ${_dir}/text.cer.txt
    done
fi

New file
			@@ -0,0 +1,105 @@
			#!/usr/bin/env bash

			# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
			# 2017 Xingyu Na
			# Apache 2.0

			remove_archive=false

			if [ "$1" == --remove-archive ]; then
			remove_archive=true
			shift
			fi

			if [ $# -ne 3 ]; then
			echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
			echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
			echo "With --remove-archive it will remove the archive after successfully un-tarring it."
			echo "<corpus-part> can be one of: data_aishell, resource_aishell."
			fi

			data=$1
			url=$2
			part=$3

			if [ ! -d "$data" ]; then
			echo "$0: no such directory $data"
			exit 1;
			fi

			part_ok=false
			list="data_aishell resource_aishell"
			for x in $list; do
			if [ "$part" == $x ]; then part_ok=true; fi
			done
			if ! $part_ok; then
			echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
			exit 1;
			fi

			if [ -z "$url" ]; then
			echo "$0: empty URL base."
			exit 1;
			fi

			if [ -f $data/$part/.complete ]; then
			echo "$0: data part $part was already successfully extracted, nothing to do."
			exit 0;
			fi

			# sizes of the archive files in bytes.
			sizes="15582913665 1246920"

			if [ -f $data/$part.tgz ]; then
			size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
			size_ok=false
			for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
			if ! $size_ok; then
			echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
			echo "does not equal the size of one of the archives."
			rm $data/$part.tgz
			else
			echo "$data/$part.tgz exists and appears to be complete."
			fi
			fi

			if [ ! -f $data/$part.tgz ]; then
			if ! command -v wget >/dev/null; then
			echo "$0: wget is not installed."
			exit 1;
			fi
			full_url=$url/$part.tgz
			echo "$0: downloading data from $full_url. This may take some time, please be patient."

			cd $data || exit 1
			if ! wget --no-check-certificate $full_url; then
			echo "$0: error executing wget $full_url"
			exit 1;
			fi
			fi

			cd $data || exit 1

			if ! tar -xvzf $part.tgz; then
			echo "$0: error un-tarring archive $data/$part.tgz"
			exit 1;
			fi

			touch $data/$part/.complete

			if [ $part == "data_aishell" ]; then
			cd $data/$part/wav || exit 1
			for wav in ./*.tar.gz; do
			echo "Extracting wav from $wav"
			tar -zxf $wav && rm $wav
			done
			fi

			echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"

			if $remove_archive; then
			echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
			rm $data/$part.tgz
			fi

			exit 0;

			@@ -3,12 +3,12 @@
			. ./path.sh || exit 1;

			# machines configuration
			CUDA_VISIBLE_DEVICES="2,3"
			CUDA_VISIBLE_DEVICES="0,1"
			gpu_num=2
			count=1
			gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding
			# for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
			njob=1
			njob=5
			train_cmd=utils/run.pl
			infer_cmd=utils/run.pl

			@@ -16,13 +16,11 @@
			feats_dir="../DATA" #feature output dictionary
			exp_dir="."
			lang=zh
			dumpdir=dump/fbank
			feats_type=fbank
			token_type=char
			scp=wav.scp
			type=sound
			scp=wav.scp
			stage=3
			stop_stage=3
			stop_stage=4

			# feature configuration
			feats_dim=80
			@@ -48,7 +46,7 @@
			test_sets="dev test"

			asr_config=conf/train_asr_transformer.yaml
			model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
			model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}"

			inference_config=conf/decode_asr_transformer.yaml
			inference_asr_model=valid.acc.ave_10best.pb
			@@ -143,4 +141,61 @@
			} &
			done
			wait
			fi

			# Testing Stage
			if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
			echo "stage 4: Inference"
			for dset in ${test_sets}; do
			asr_exp=${exp_dir}/exp/${model_dir}
			inference_tag="$(basename "${inference_config}" .yaml)"
			_dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
			_logdir="${_dir}/logdir"
			if [ -d ${_dir} ]; then
			echo "${_dir} is already exists. if you want to decode again, please delete this dir first."
			exit 0
			fi
			mkdir -p "${_logdir}"
			_data="${feats_dir}/data/${dset}"
			key_file=${_data}/${scp}
			num_scp_file="$(<${key_file} wc -l)"
			_nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file")
			split_scps=
			for n in $(seq "${_nj}"); do
			split_scps+=" ${_logdir}/keys.${n}.scp"
			done
			# shellcheck disable=SC2086
			utils/split_scp.pl "${key_file}" ${split_scps}
			_opts=
			if [ -n "${inference_config}" ]; then
			_opts+="--config ${inference_config} "
			fi
			${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
			python -m funasr.bin.asr_inference_launch \
			--batch_size 1 \
			--ngpu "${_ngpu}" \
			--njob ${njob} \
			--gpuid_list ${gpuid_list} \
			--data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
			--cmvn_file ${feats_dir}/data/${train_set}/cmvn/cmvn.mvn \
			--key_file "${_logdir}"/keys.JOB.scp \
			--asr_train_config "${asr_exp}"/config.yaml \
			--asr_model_file "${asr_exp}"/"${inference_asr_model}" \
			--output_dir "${_logdir}"/output.JOB \
			--mode asr \
			${_opts}

			for f in token token_int score text; do
			if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
			for i in $(seq "${_nj}"); do
			cat "${_logdir}/output.${i}/1best_recog/${f}"
			done | sort -k1 >"${_dir}/${f}"
			fi
			done
			python utils/proce_text.py ${_dir}/text ${_dir}/text.proc
			python utils/proce_text.py ${_data}/text ${_data}/text.proc
			python utils/compute_wer.py ${_data}/text.proc ${_dir}/text.proc ${_dir}/text.cer
			tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
			cat ${_dir}/text.cer.txt
			done
			fi