From 0acf868dbb02642348e3addfec8e56974facfc2f Mon Sep 17 00:00:00 2001
From: hnluo <haoneng.lhn@alibaba-inc.com>
Date: Mon, 29 May 2023 10:40:21 +0800
Subject: [PATCH] Merge pull request #558 from alibaba-damo-academy/dev_wjm2
---
egs/wenetspeech/conformer/local/data.sh | 102 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 102 insertions(+), 0 deletions(-)
diff --git a/egs/wenetspeech/conformer/local/data.sh b/egs/wenetspeech/conformer/local/data.sh
new file mode 100755
index 0000000..2b0a4be
--- /dev/null
+++ b/egs/wenetspeech/conformer/local/data.sh
@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+# Strict mode: stop the script as soon as something goes wrong.
+#   -e           exit when a command returns a non-zero status
+#   -u           treat the expansion of an unset variable as an error
+#   -o pipefail  a pipeline fails if any of its stages fails
+set -euo pipefail
+
+log() {
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+SECONDS=0           # reset bash's built-in timer; read again for the final elapsed-time message
+
+# Option defaults. They are set before utils/parse_options.sh is sourced; presumably that helper
+# overrides them from "--name value" command-line arguments -- confirm against the helper.
+stage=2             # stages numbered below this one are skipped
+stop_stage=100      # stages numbered above this one are skipped
+nj=10               # upper bound on the number of parallel jobs in stage 2
+set=L               # training subset: passed as --train-subset and lower-cased into train_<set>
+data_dir=data       # directory that holds the per-set data directories
+WENETSPEECH=        # corpus root; checked for audio/ and WenetSpeech.json, handed to stage 1
+train_cmd=          # command prefix that launches the JOB=1:N array in stage 2
+log "$0 $*"
+. utils/parse_options.sh
+# Abort unless BOTH the audio/ directory and the WenetSpeech.json metadata file are present.
+if [ ! -d "${WENETSPEECH}/audio" ] || [ ! -f "${WENETSPEECH}/WenetSpeech.json" ]; then
+    echo "Valid WENETSPEECH data not found in ${WENETSPEECH}."
+    echo "Please follow the instruction in https://wenet-e2e.github.io/WenetSpeech/"
+    echo "and re-construct the data."
+    exit 1
+fi
+# Names of the per-set data directories: train_<lower-cased subset>, dev, and the two test sets.
+dev_set=dev
+test_sets="test_net test_meeting"
+train_set="train_$(echo "${set}" | tr '[:upper:]' '[:lower:]')"
+# Stage 1: create the Kaldi-format data directories, then derive utt2spk / spk2utt for each set.
+if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
+    log "data preparation"
+    mkdir -p "${data_dir}"
+    abs_data_dir=$(readlink -f "${data_dir}")  # the prep script is handed an absolute path
+    log "making Kaldi format data directory in ${abs_data_dir}"
+    local/wenetspeech_data_prep.sh \
+        --train-subset "${set}" \
+        --stage 1 \
+        "${WENETSPEECH}" \
+        "${abs_data_dir}"
+    # prepare utt2spk and spk2utt files: each utt2spk line is the first tab-separated field of
+    # `segments` written twice, i.e. every utterance id doubles as its own speaker id
+    for x in ${train_set} ${dev_set} ${test_sets}; do  # unquoted on purpose: test_sets is a word list
+        dir=${data_dir}/${x}
+        paste -d " " <(cut -f 1 "${dir}/segments") <(cut -f 1 "${dir}/segments") | \
+            sort -u > "${dir}/utt2spk"
+        utils/utt2spk_to_spk2utt.pl "${dir}/utt2spk" > "${dir}/spk2utt"
+    done
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ log "process the long term opus audio file, may take about 3 hours"
+ for x in ${train_set} ${dev_set} ${test_sets}; do
+ log "process audio for ${data_dir}/${x}"
+ dir=${data_dir}/${x}
+ mkdir -p ${dir}/logs
+
+ nutt=$(<${dir}/segments wc -l)
+ nj=$((nj<nutt?nj:nutt))
+
+ split_scps=""
+ for n in $(seq ${nj}); do
+ split_scps="${split_scps} ${dir}/logs/segments.${n}"
+ done
+ utils/split_scp.pl ${dir}/segments ${split_scps}
+
+ ${train_cmd} "JOB=1:${nj}" "${dir}/logs/process_audio.JOB.log"\
+ python3 local/process_opus.py \
+ ${dir}/wav.scp \
+ ${dir}/logs/segments.JOB \
+ ${dir}/logs/wav.JOB.scp
+
+ # modify the `wav.scp` file and rename the `segments` file
+ # rename the `segments` file to avoid the audio file formatting process in stage 3 of `asr.sh`
+ mv ${dir}/wav.scp ${dir}/wav.scp.org
+ mv ${dir}/segments ${dir}/segments.org
+ for n in $(seq ${nj}); do
+ cat ${dir}/logs/wav.${n}.scp || exit 1;
+ done | sort -u > ${dir}/wav.scp
+ done
+fi
+# Stage 3: rewrite `text` with normalized transcripts; the previous file is kept as text.org.
+if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
+    log "format text file"
+    for x in ${train_set} ${dev_set} ${test_sets}; do  # unquoted on purpose: test_sets is a word list
+        log "format text for ${data_dir}/${x}"
+        dir=${data_dir}/${x}
+        mv "${dir}/text" "${dir}/text.org"
+        paste -d " " <(cut -f 1 "${dir}/text.org") \
+            <(cut -f 2- "${dir}/text.org" | local/text_normalize.pl) | \
+            sort -u > "${dir}/text"
+        utils/fix_data_dir.sh "${dir}"
+    done
+fi
+
+log "Successfully finished. [elapsed=${SECONDS}s]"
--
Gitblit v1.9.1