#!/usr/bin/env bash
# Strict mode: abort on the first failing command (-e), on any reference to an
# unset variable (-u), and when any stage of a pipeline fails (-o pipefail).
# Command tracing (-x) is not enabled here; run the script with `bash -x` to get it.
set -euo pipefail

#######################################
# Print a timestamped message tagged with the caller's location.
# Globals:   BASH_SOURCE, BASH_LINENO, FUNCNAME (read)
# Arguments: message words; they are joined via "$*"
# Outputs:   "<timestamp> (<file>:<line>:<function>) <message>" on stdout
#######################################
log() {
  # Defaults keep `set -u` from aborting the logger itself when the call
  # stack is shallower than expected.
  local src=${BASH_SOURCE[1]:-${BASH_SOURCE[0]:-}}
  local fname=${src##*/}
  local stamp
  stamp=$(date '+%Y-%m-%dT%H:%M:%S')
  # printf '%s' prints the message literally; `echo -e` would expand
  # backslash sequences (\t, \n, ...) that happen to occur in logged data.
  printf '%s (%s:%s:%s) %s\n' \
    "${stamp}" "${fname}" "${BASH_LINENO[0]:-0}" "${FUNCNAME[1]:-main}" "$*"
}
# Reset bash's elapsed-seconds counter; it is reported in the final log line.
SECONDS=0

# general configuration
# NOTE(review): these variables look like the command-line options handled by
# utils/parse_options.sh (sourced below) — confirm against that script.
# Upper bound on the number of parallel jobs in stage 2.
nj=10
# First and last stage to run.
# NOTE(review): the default stage=2 skips the stage-1 data preparation — confirm
# that this is intended.
stage=2
stop_stage=100
# Training subset; forwarded to local/wenetspeech_data_prep.sh as --train-subset.
set=L
# Directory in which the per-set data directories are created.
data_dir="data"
# Corpus root; expected to contain audio/ and WenetSpeech.json.
WENETSPEECH=
# Job-launcher command used in stage 2 (called as: <train_cmd> JOB=1:N <log> <command...>).
train_cmd=

log "$0 $*"
. utils/parse_options.sh

# The corpus is rejected as soon as EITHER the audio directory or the metadata
# file is missing (an empty WENETSPEECH therefore fails here as well).
if [ ! -d "${WENETSPEECH}/audio" ] || [ ! -f "${WENETSPEECH}/WenetSpeech.json" ]; then
  {
    echo "Valid WENETSPEECH data not found in ${WENETSPEECH}."
    echo "Please follow the instruction in https://wenet-e2e.github.io/WenetSpeech/"
    echo "and re-construct the data."
  } >&2
  exit 1
fi

# Lower-case the subset name, e.g. set=L -> train_l.
train_set=train_"$(printf '%s' "${set}" | tr "[:upper:]" "[:lower:]")"
dev_set=dev
# Space-separated list; consumers rely on word splitting.
test_sets="test_net test_meeting"

# Stage 1: create the Kaldi-format data directories and derive utt2spk/spk2utt.
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  log "data preparation"
  mkdir -p "${data_dir}"
  # The preparation script is given an absolute path.
  abs_data_dir=$(readlink -f "${data_dir}")
  log "making Kaldi format data directory in ${abs_data_dir}"
  local/wenetspeech_data_prep.sh \
    --train-subset "${set}" \
    --stage 1 \
    "${WENETSPEECH}" \
    "${abs_data_dir}"

  # prepare utt2spk and spk2utt files
  # test_sets is a space-separated list, so it is word-split on purpose.
  # shellcheck disable=SC2086
  for x in "${train_set}" "${dev_set}" ${test_sets}; do
    dir=${data_dir}/${x}
    # Each utt2spk line is "<field1> <field1>", where field1 is the first
    # TAB-separated column of segments (presumably the utterance id, i.e. every
    # utterance acts as its own speaker — confirm against the prep script).
    # A single awk inside the pipeline lets `set -e`/pipefail catch a missing
    # or unreadable segments file.
    awk -F'\t' '{print $1 " " $1}' "${dir}/segments" \
      | sort -u > "${dir}/utt2spk"
    utils/utt2spk_to_spk2utt.pl "${dir}/utt2spk" > "${dir}/spk2utt"
  done
fi

# Stage 2: run local/process_opus.py over every data set in parallel and
# replace wav.scp with the entries it produces.
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  log "process the long term opus audio file, may take about 3 hours"
  # Without a launcher the job line below would try to execute "JOB=1:N".
  if [ -z "${train_cmd}" ]; then
    log "Error: train_cmd is empty; stage 2 needs a job-launcher command"
    exit 1
  fi
  # test_sets is a space-separated list, so it is word-split on purpose.
  # shellcheck disable=SC2086
  for x in "${train_set}" "${dev_set}" ${test_sets}; do
    log "process audio for ${data_dir}/${x}"
    dir=${data_dir}/${x}
    mkdir -p "${dir}/logs"

    # Never start more jobs than there are segments. A per-set value is used so
    # that a small set does not lower nj for the sets processed after it.
    nutt=$(wc -l < "${dir}/segments")
    _nj=$((nj < nutt ? nj : nutt))
    if [ "${_nj}" -lt 1 ]; then
      log "Error: nothing to process for ${dir} (nj=${nj}, segments=${nutt})"
      exit 1
    fi

    # One segments shard and one expected wav.scp shard per job.
    split_scps=()
    wav_parts=()
    for ((n = 1; n <= _nj; n++)); do
      split_scps+=("${dir}/logs/segments.${n}")
      wav_parts+=("${dir}/logs/wav.${n}.scp")
    done
    utils/split_scp.pl "${dir}/segments" "${split_scps[@]}"

    # train_cmd may carry its own options, so it is word-split on purpose.
    # shellcheck disable=SC2086
    ${train_cmd} "JOB=1:${_nj}" "${dir}/logs/process_audio.JOB.log" \
      python3 local/process_opus.py \
        "${dir}/wav.scp" \
        "${dir}/logs/segments.JOB" \
        "${dir}/logs/wav.JOB.scp"

    # modify the `wav.scp` file and rename the `segments` file
    # rename the `segments` file to avoid the audio file formatting process in stage 3 of `asr.sh`
    # The merged list is built first, so a missing shard aborts the script
    # before the original files have been renamed.
    sort -u "${wav_parts[@]}" > "${dir}/wav.scp.tmp"
    mv "${dir}/wav.scp" "${dir}/wav.scp.org"
    mv "${dir}/segments" "${dir}/segments.org"
    mv "${dir}/wav.scp.tmp" "${dir}/wav.scp"
  done
fi

# Stage 3: normalize the transcripts of every data set.
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
  log "format text file"
  # test_sets is a space-separated list, so it is word-split on purpose.
  # shellcheck disable=SC2086
  for x in "${train_set}" "${dev_set}" ${test_sets}; do
    log "format text for ${data_dir}/${x}"
    dir=${data_dir}/${x}
    # Keep the untouched transcript as text.org.
    mv "${dir}/text" "${dir}/text.org"
    # Column 1 (TAB-separated) is passed through unchanged; the remaining
    # columns go through local/text_normalize.pl. The two streams are re-joined
    # line by line with a single space.
    paste -d " " \
      <(cut -f 1 "${dir}/text.org") \
      <(cut -f 2- "${dir}/text.org" | local/text_normalize.pl) \
      | sort -u > "${dir}/text"
    utils/fix_data_dir.sh "${dir}"
  done
fi

log "Successfully finished. [elapsed=${SECONDS}s]"