#!/usr/bin/env bash
# Prepare Kaldi-style data directories for WenetSpeech.
#
# Stages (selected with --stage / --stop_stage):
#   1: run local/wenetspeech_data_prep.sh and derive utt2spk / spk2utt
#   2: process the long opus recordings and regenerate wav.scp
#   3: normalize the transcripts and run utils/fix_data_dir.sh
#
# Options are parsed by utils/parse_options.sh, which is expected to let the
# variables defined in the "general configuration" section be overridden from
# the command line (e.g. --WENETSPEECH <dir>, --set <subset>).
#
# Strict mode: exit on
#   -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'.
set -e
set -u
set -o pipefail

# Print a timestamped message tagged with the caller's file, line and function.
log() {
    local fname=${BASH_SOURCE[1]##*/}
    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
# Reset bash's built-in timer so the final log line reports elapsed seconds.
SECONDS=0

# general configuration
nj=10
stage=2
stop_stage=100
set=L
data_dir="data"

WENETSPEECH=
train_cmd=

log "$0 $*"
. utils/parse_options.sh

# The dataset is usable only if BOTH the audio directory and the metadata
# file exist, so fail when either one is missing.
if [ ! -d "${WENETSPEECH}/audio" ] || [ ! -f "${WENETSPEECH}/WenetSpeech.json" ]; then
    echo "Valid WENETSPEECH data not found in ${WENETSPEECH}." >&2
    echo "Please follow the instruction in https://wenet-e2e.github.io/WenetSpeech/" >&2
    echo "and re-construct the data." >&2
    exit 1
fi

train_set=train_"$(echo "${set}" | tr "[:upper:]" "[:lower:]")"
dev_set=dev
test_sets="test_net test_meeting"

# Every data set handled by the stages below.
# test_sets is a space-separated list, so it is expanded unquoted on purpose.
# shellcheck disable=SC2206
all_sets=("${train_set}" "${dev_set}" ${test_sets})

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    log "data preparation"
    mkdir -p "${data_dir}"
    abs_data_dir=$(readlink -f "${data_dir}")
    log "making Kaldi format data directory in ${abs_data_dir}"
    local/wenetspeech_data_prep.sh \
        --train-subset "${set}" \
        --stage 1 \
        "${WENETSPEECH}" \
        "${abs_data_dir}"

    # prepare utt2spk and spk2utt files
    # Each utterance id (first tab-separated field of segments) is used as its
    # own speaker id.
    for x in "${all_sets[@]}"; do
        dir=${data_dir}/${x}
        paste -d " " <(cut -f 1 "${dir}/segments") <(cut -f 1 "${dir}/segments") \
            | sort -u > "${dir}/utt2spk"
        utils/utt2spk_to_spk2utt.pl "${dir}/utt2spk" > "${dir}/spk2utt"
    done
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    log "process the long term opus audio file, may take about 3 hours"
    for x in "${all_sets[@]}"; do
        log "process audio for ${data_dir}/${x}"
        dir=${data_dir}/${x}
        mkdir -p "${dir}/logs"

        nutt=$(wc -l < "${dir}/segments")
        # Never start more jobs than there are segments. A per-set variable is
        # used so a small set does not permanently lower the global nj for the
        # sets processed after it.
        job_count=$((nj < nutt ? nj : nutt))

        # NOTE(review): in the reviewed copy of this file the text between
        # "nj=$((nj" and "${dir}/wav.scp" was missing (it looks like everything
        # between a '<' and a '>' was stripped). The remainder of this loop
        # body is a reconstruction; the helper names (utils/split_scp.pl,
        # local/process_opus.py) and their arguments are assumptions -- TODO
        # confirm against the project's version history before relying on it.
        split_scps=()
        for n in $(seq "${job_count}"); do
            split_scps+=("${dir}/logs/segments.${n}")
        done
        utils/split_scp.pl "${dir}/segments" "${split_scps[@]}"

        # train_cmd may carry its own options, so it must word-split.
        ${train_cmd} "JOB=1:${job_count}" "${dir}/logs/process_audio.JOB.log" \
            python3 local/process_opus.py \
                "${dir}/wav.scp" \
                "${dir}/logs/segments.JOB" \
                "${dir}/logs/wav.JOB.scp"

        # Keep the original wav.scp and replace it with the merged per-job lists.
        mv "${dir}/wav.scp" "${dir}/wav.scp.org"
        for n in $(seq "${job_count}"); do
            cat "${dir}/logs/wav.${n}.scp" || exit 1
        done | sort -u > "${dir}/wav.scp"
    done
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    log "format text file"
    for x in "${all_sets[@]}"; do
        log "format text for ${data_dir}/${x}"
        dir=${data_dir}/${x}
        # Keep the raw transcripts, then rebuild text as
        # "<first field> <normalized remaining fields>".
        mv "${dir}/text" "${dir}/text.org"
        paste -d " " <(cut -f 1 "${dir}/text.org") \
            <(cut -f 2- "${dir}/text.org" | local/text_normalize.pl) \
            | sort -u > "${dir}/text"
        utils/fix_data_dir.sh "${dir}"
    done
fi

log "Successfully finished. [elapsed=${SECONDS}s]"