| egs/wenetspeech/conformer/local/data.sh | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
| egs/wenetspeech/conformer/local/extract_meta.py | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
| egs/wenetspeech/conformer/local/path.sh | 补丁 | 查看 | 原始文档 | blame | 历史 | |
| egs/wenetspeech/conformer/local/process_opus.py | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
| egs/wenetspeech/conformer/local/text_normalize.pl | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
| egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
| egs/wenetspeech/conformer/run.sh | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 |
egs/wenetspeech/conformer/local/data.sh
New file @@ -0,0 +1,110 @@
#!/usr/bin/env bash
# Data preparation entry point for the WenetSpeech conformer recipe.
# Stage 1: build Kaldi-format data directories; Stage 2: cut the long opus
# recordings into per-utterance wav files; Stage 3: normalize the text.
# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
set -u
set -o pipefail

# Log helper: prefixes each message with a timestamp and the calling
# file:line:function, so staged runs are easy to trace in long logs.
log() {
    local fname=${BASH_SOURCE[1]##*/}
    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
SECONDS=0

# general configuration
nj=10              # parallel jobs (capped by the segment count in stage 2)
stage=1
stop_stage=100
set=L              # WenetSpeech training subset label (e.g. S/M/L)
data_dir="data"

log "$0 $*"
. utils/parse_options.sh

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;
. ./db.sh || exit 1;

# WENETSPEECH is defined in db.sh and must point at the downloaded corpus.
if [ ! -e "${WENETSPEECH}" ]; then
    log "Fill the value of 'WENETSPEECH' of db.sh"
    log "or download the data set follwing the instruction in https://wenet-e2e.github.io/WenetSpeech/"
    exit 1
fi

# Sanity-check the corpus layout before doing any work.
if [ ! -d "${WENETSPEECH}/audio" ] && [ ! -f "${WENETSPEECH}/WenetSpeech.json" ]; then
    echo "Valid WENETSPEECH data not found in ${WENETSPEECH}."
    echo "Please follow the instruction in https://wenet-e2e.github.io/WenetSpeech/"
    echo "and re-construct the data."
    exit 1
fi

# e.g. set=L -> train_set=train_l
train_set=train_"$(echo "${set}" | tr "[:upper:]" "[:lower:]")"
dev_set=dev
test_sets="test_net test_meeting"

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    log "data preparation"
    mkdir -p ${data_dir}
    abs_data_dir=$(readlink -f ${data_dir})
    log "making Kaldi format data directory in ${abs_data_dir}"
    local/wenetspeech_data_prep.sh \
        --train-subset ${set} \
        --stage 1 \
        ${WENETSPEECH} \
        ${abs_data_dir}

    # prepare utt2spk and spk2utt files
    # (no speaker labels exist, so each utterance id doubles as its own
    # speaker id: column 1 of segments is pasted against itself)
    for x in ${train_set} ${dev_set} ${test_sets}; do
        dir=${data_dir}/${x}
        paste -d " " <(cut -f 1 ${dir}/segments) <(cut -f 1 ${dir}/segments) | \
            sort -u > ${dir}/utt2spk
        utils/utt2spk_to_spk2utt.pl ${dir}/utt2spk > ${dir}/spk2utt
    done
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    log "process the long term opus audio file, may take about 3 hours"
    for x in ${train_set} ${dev_set} ${test_sets}; do
        log "process audio for ${data_dir}/${x}"
        dir=${data_dir}/${x}
        mkdir -p ${dir}/logs
        # never launch more jobs than there are segments
        nutt=$(<${dir}/segments wc -l)
        nj=$((nj<nutt?nj:nutt))
        split_scps=""
        for n in $(seq ${nj}); do
            split_scps="${split_scps} ${dir}/logs/segments.${n}"
        done
        utils/split_scp.pl ${dir}/segments ${split_scps}
        ${train_cmd} "JOB=1:${nj}" "${dir}/logs/process_audio.JOB.log" \
            python3 local/process_opus.py \
                ${dir}/wav.scp \
                ${dir}/logs/segments.JOB \
                ${dir}/logs/wav.JOB.scp
        # modify the `wav.scp` file and rename the `segments` file
        # rename the `segments` file to avoid the audio file formatting process in stage 3 of `asr.sh`
        mv ${dir}/wav.scp ${dir}/wav.scp.org
        mv ${dir}/segments ${dir}/segments.org
        for n in $(seq ${nj}); do
            cat ${dir}/logs/wav.${n}.scp || exit 1;
        done | sort -u > ${dir}/wav.scp
    done
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    log "format text file"
    for x in ${train_set} ${dev_set} ${test_sets}; do
        log "format text for ${data_dir}/${x}"
        dir=${data_dir}/${x}
        mv ${dir}/text ${dir}/text.org
        # keep column 1 (utt-id) untouched, normalize only the transcription
        paste -d " " <(cut -f 1 ${dir}/text.org) \
            <(cut -f 2- ${dir}/text.org | local/text_normalize.pl) | \
            sort -u > ${dir}/text
        utils/fix_data_dir.sh ${dir}
    done
fi

log "Successfully finished. [elapsed=${SECONDS}s]"
egs/wenetspeech/conformer/local/extract_meta.py
@@ -13,20 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. import sys import os import argparse import json import os import sys def get_args(): parser = argparse.ArgumentParser(description=""" parser = argparse.ArgumentParser( description=""" This script is used to process raw json dataset of WenetSpeech, where the long wav is splitinto segments and data of wenet format is generated. """) parser.add_argument('input_json', help="""Input json file of WenetSpeech""") parser.add_argument('output_dir', help="""Output dir for prepared data""") """ ) parser.add_argument("input_json", help="""Input json file of WenetSpeech""") parser.add_argument("output_dir", help="""Output dir for prepared data""") args = parser.parse_args() return args @@ -39,58 +41,68 @@ os.makedirs(output_dir) try: with open(input_json, 'r') as injson: with open(input_json, "r") as injson: json_data = json.load(injson) except Exception: sys.exit(f'Failed to load input json file: {input_json}') sys.exit(f"Failed to load input json file: {input_json}") else: if json_data['audios'] is not None: with open(f'{output_dir}/text', 'w') as utt2text, \ open(f'{output_dir}/segments', 'w') as segments, \ open(f'{output_dir}/utt2dur', 'w') as utt2dur, \ open(f'{output_dir}/wav.scp', 'w') as wavscp, \ open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \ open(f'{output_dir}/reco2dur', 'w') as reco2dur: for long_audio in json_data['audios']: if json_data["audios"] is not None: with open(f"{output_dir}/text", "w") as utt2text, open( f"{output_dir}/segments", "w" ) as segments, open(f"{output_dir}/utt2dur", "w") as utt2dur, open( f"{output_dir}/wav.scp", "w" ) as wavscp, open( f"{output_dir}/utt2subsets", "w" ) as utt2subsets, open( f"{output_dir}/reco2dur", "w" ) as reco2dur: for long_audio in json_data["audios"]: try: long_audio_path = os.path.realpath( os.path.join(input_dir, long_audio['path'])) aid = long_audio['aid'] segments_lists = 
long_audio['segments'] duration = long_audio['duration'] assert (os.path.exists(long_audio_path)) os.path.join(input_dir, long_audio["path"]) ) aid = long_audio["aid"] segments_lists = long_audio["segments"] duration = long_audio["duration"] assert os.path.exists(long_audio_path) except AssertionError: print(f'''Warning: {aid} something is wrong, maybe AssertionError, skipped''') print( f"""Warning: {aid} something is wrong, maybe AssertionError, skipped""" ) continue except Exception: print(f'''Warning: {aid} something is wrong, maybe the error path: {long_audio_path}, skipped''') print( f"""Warning: {aid} something is wrong, maybe the error path: {long_audio_path}, skipped""" ) continue else: wavscp.write(f'{aid}\t{long_audio_path}\n') reco2dur.write(f'{aid}\t{duration}\n') wavscp.write(f"{aid}\t{long_audio_path}\n") reco2dur.write(f"{aid}\t{duration}\n") for segment_file in segments_lists: try: sid = segment_file['sid'] start_time = segment_file['begin_time'] end_time = segment_file['end_time'] sid = segment_file["sid"] start_time = segment_file["begin_time"] end_time = segment_file["end_time"] dur = end_time - start_time text = segment_file['text'] text = segment_file["text"] segment_subsets = segment_file["subsets"] except Exception: print(f'''Warning: {segment_file} something is wrong, skipped''') print( f"""Warning: {segment_file} something is wrong, skipped""" ) continue else: utt2text.write(f'{sid}\t{text}\n') utt2text.write(f"{sid}\t{text}\n") segments.write( f'{sid}\t{aid}\t{start_time}\t{end_time}\n' f"{sid}\t{aid}\t{start_time}\t{end_time}\n" ) utt2dur.write(f'{sid}\t{dur}\n') utt2dur.write(f"{sid}\t{dur}\n") segment_sub_names = " ".join(segment_subsets) utt2subsets.write( f'{sid}\t{segment_sub_names}\n') utt2subsets.write(f"{sid}\t{segment_sub_names}\n") def main(): args = get_args() @@ -98,5 +110,5 @@ meta_analysis(args.input_json, args.output_dir) if __name__ == '__main__': if __name__ == "__main__": main() egs/wenetspeech/conformer/local/path.sh
egs/wenetspeech/conformer/local/process_opus.py
@@ -16,14 +16,15 @@ # usage: python3 process_opus.py wav.scp segments output_wav.scp from pydub import AudioSegment import sys import os import sys from pydub import AudioSegment def read_file(wav_scp, segments): wav_scp_dict = {} with open(wav_scp, 'r', encoding='UTF-8') as fin: with open(wav_scp, "r", encoding="UTF-8") as fin: for line_str in fin: wav_id, path = line_str.strip().split() wav_scp_dict[wav_id] = path @@ -32,7 +33,7 @@ seg_path_list = [] start_time_list = [] end_time_list = [] with open(segments, 'r', encoding='UTF-8') as fin: with open(segments, "r", encoding="UTF-8") as fin: for line_str in fin: arr = line_str.strip().split() assert len(arr) == 4 @@ -44,30 +45,27 @@ # TODO(Qijie): Fix the process logic def output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list): def output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list): num_utts = len(utt_list) step = int(num_utts * 0.01) with open(output_wav_scp, 'w', encoding='UTF-8') as fout: with open(output_wav_scp, "w", encoding="UTF-8") as fout: previous_wav_path = "" for i in range(num_utts): utt_id = utt_list[i] current_wav_path = seg_path_list[i] output_dir = (os.path.dirname(current_wav_path)) \ .replace("audio", 'audio_seg') seg_wav_path = os.path.join(output_dir, utt_id + '.wav') output_dir = (os.path.dirname(current_wav_path)).replace( "audio", "audio_seg" ) seg_wav_path = os.path.join(output_dir, utt_id + ".wav") # if not os.path.exists(output_dir): # os.makedirs(output_dir) os.makedirs(output_dir, exist_ok=True) if current_wav_path != previous_wav_path: source_wav = AudioSegment.from_file(current_wav_path) previous_wav_path = current_wav_path start = int(start_time_list[i] * 1000) end = int(end_time_list[i] * 1000) target_audio = source_wav[start:end].set_frame_rate(16000) \ .set_sample_width(2) target_audio = source_wav[start:end].set_frame_rate(16000) target_audio.export(seg_wav_path, format="wav") fout.write("{} {}\n".format(utt_id, seg_wav_path)) 
@@ -80,11 +78,11 @@ segments = sys.argv[2] output_wav_scp = sys.argv[3] utt_list, seg_path_list, start_time_list, end_time_list \ = read_file(wav_scp, segments) output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list) utt_list, seg_path_list, start_time_list, end_time_list = read_file( wav_scp, segments ) output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list) if __name__ == '__main__': if __name__ == "__main__": main() egs/wenetspeech/conformer/local/text_normalize.pl
New file @@ -0,0 +1,24 @@
#!/usr/bin/env perl
# Text normalization filter for WenetSpeech transcripts (stdin -> stdout):
# strip irregular whitespace, uppercase Latin letters, join adjacent English
# words with "_" so that ALL remaining whitespace can then be deleted (CJK
# text needs no spaces), and finally restore "_" as a single normal space.
use utf8;
use open qw(:std :utf8);
use warnings;

while (<STDIN>) {
    chomp;
    # remove non UTF-8 whitespace character
    # NOTE(review): these two substitutions are meant to target special
    # (non-ASCII) space characters; the literal bytes may have been mangled
    # in transit -- verify they are the intended U+00A0-style characters.
    if ($_ =~ / /) {$_ =~ s: ::g;}
    if ($_ =~ / /) {$_ =~ s: ::g;}
    # upper letters
    # FIX: the original "$_ =~ uc $_;" used the binding operator, which only
    # *matches* $_ against the uppercased string in void context and never
    # modifies $_; plain assignment is required to actually uppercase.
    if ($_ =~ /[a-zA-Z]/) {$_ = uc $_;}
    # add "_" before and after each English word; the rule runs twice because
    # s///g cannot join overlapping pairs ("A B C" -> "A_B C" -> "A_B_C")
    if ($_ =~ /([A-Z]+)\s+([A-Z]+)/) {$_ =~ s/([A-Z]+)\s+([A-Z]+)/$1\_$2/g;}
    if ($_ =~ /([A-Z]+)\s+([A-Z]+)/) {$_ =~ s/([A-Z]+)\s+([A-Z]+)/$1\_$2/g;}
    if ($_ =~ m/([A-Z]+)(\p{Han}+)/) {$_ =~ s/([A-Z]+)(\p{Han}+)/$1\_$2/g;}
    if ($_ =~ m/(\p{Han}+)([A-Z]+)/) {$_ =~ s/(\p{Han}+)([A-Z]+)/$1\_$2/g;}
    # remove UTF-8 whitespace charcter
    if ($_ =~ /\s+/) {$_ =~ s:\s+::g;}
    # replace "_" with a normal whitespace
    if ($_ =~ /\_/) {$_ =~ s:\_: :g;}
    print "$_\n";
}
egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh
@@ -24,7 +24,7 @@ prefix= train_subset=L . ./utils/parse_options.sh || exit 1; . utils/parse_options.sh || exit 1; filter_by_id () { idlist=$1 egs/wenetspeech/conformer/run.sh
@@ -41,6 +41,7 @@ set -u set -o pipefail set=L train_set=train_l valid_set=dev test_sets="dev test_net test_meeting" @@ -71,15 +72,15 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then echo "stage 0: Data preparation" # Data preparation local/wenetspeech_data_prep.sh $raw_data $feats_dir mkdir $feats_dir/data mv $feats_dir/$train_set $feats_dir/data/$train_set for x in $test_sets; do mv $feats_dir/$x $feats_dir/data/ done local/data.sh "--set ${set}" # mkdir $feats_dir/data # mv $feats_dir/$train_set $feats_dir/data/$train_set # for x in $test_sets; do # mv $feats_dir/$x $feats_dir/data/ # done fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then echo "stage 1: Feature and CMVN Generation" utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 0.1 fi #if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # echo "stage 1: Feature and CMVN Generation" # utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 0.1 #fi