From ca79f9c404e68fcb55a09200abbe8547474605fe Mon Sep 17 00:00:00 2001
From: 嘉渊 <wangjiaming.wjm@alibaba-inc.com>
Date: 星期四, 25 五月 2023 17:02:24 +0800
Subject: [PATCH] update repo
---
egs/wenetspeech/conformer/local/process_opus.py | 38 +++----
egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh | 2
egs/wenetspeech/conformer/run.sh | 21 ++--
egs/wenetspeech/conformer/local/extract_meta.py | 90 ++++++++++-------
egs/wenetspeech/conformer/local/data.sh | 110 ++++++++++++++++++++++
egs/wenetspeech/conformer/local/path.sh | 0
egs/wenetspeech/conformer/local/text_normalize.pl | 24 ++++
7 files changed, 215 insertions(+), 70 deletions(-)
diff --git a/egs/wenetspeech/conformer/local/data.sh b/egs/wenetspeech/conformer/local/data.sh
new file mode 100755
index 0000000..dcfba5f
--- /dev/null
+++ b/egs/wenetspeech/conformer/local/data.sh
@@ -0,0 +1,110 @@
+#!/usr/bin/env bash
+# Set bash to strict mode; the script exits on:
+# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'
+set -e
+set -u
+set -o pipefail
+
+log() {  # print a timestamped message tagged with the caller's file, line number and function
+ local caller=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${caller}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+SECONDS=0  # restart bash's elapsed-time counter; it is reported in the final log line
+
+# general configuration
+nj=10  # upper bound on parallel jobs used to cut the audio in stage 2
+stage=1  # first stage to run
+stop_stage=100  # last stage to run
+set=L  # WenetSpeech training subset, forwarded as --train-subset in stage 1
+data_dir="data"  # output root; one sub-directory per data set is created below it
+
+log "$0 $*"
+. utils/parse_options.sh  # command-line options (e.g. --set) are expected to override the defaults above
+
+. ./path.sh || exit 1;
+. ./cmd.sh || exit 1;
+. ./db.sh || exit 1;
+
+if [ ! -e "${WENETSPEECH}" ]; then  # the corpus root must be configured and must exist
+ log "Fill the value of 'WENETSPEECH' of db.sh"
+ log "or download the data set follwing the instruction in https://wenet-e2e.github.io/WenetSpeech/"
+ exit 1
+fi
+
+if [ ! -d "${WENETSPEECH}/audio" ] || [ ! -f "${WENETSPEECH}/WenetSpeech.json" ]; then  # both the audio directory and the metadata JSON are required
+ echo "Valid WENETSPEECH data not found in ${WENETSPEECH}."
+ echo "Please follow the instruction in https://wenet-e2e.github.io/WenetSpeech/"
+ echo "and re-construct the data."
+ exit 1
+fi
+
+train_set=train_"$(echo "${set}" | tr "[:upper:]" "[:lower:]")"  # e.g. set=L gives train_l
+dev_set=dev
+test_sets="test_net test_meeting"
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then  # stage 1: build the Kaldi-style data directories
+ log "data preparation"
+ mkdir -p ${data_dir}
+ abs_data_dir=$(readlink -f ${data_dir})  # the prep script below is given an absolute output path
+ log "making Kaldi format data directory in ${abs_data_dir}"
+ local/wenetspeech_data_prep.sh \
+ --train-subset ${set} \
+ --stage 1 \
+ ${WENETSPEECH} \
+ ${abs_data_dir}
+
+ # prepare utt2spk and spk2utt files; column 1 of `segments` (the utterance id) is reused as the speaker id
+ for x in ${train_set} ${dev_set} ${test_sets}; do
+ dir=${data_dir}/${x}
+ paste -d " " <(cut -f 1 ${dir}/segments) <(cut -f 1 ${dir}/segments) | \
+ sort -u > ${dir}/utt2spk
+ utils/utt2spk_to_spk2utt.pl ${dir}/utt2spk > ${dir}/spk2utt
+ done
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then  # stage 2: cut the long recordings into per-utterance wav files
+ log "process the long term opus audio file, may take about 3 hours"
+ for x in ${train_set} ${dev_set} ${test_sets}; do
+ log "process audio for ${data_dir}/${x}"
+ dir=${data_dir}/${x}
+ mkdir -p ${dir}/logs
+
+ nutt=$(<${dir}/segments wc -l)  # number of utterances in this set
+ _nj=$((nj<nutt?nj:nutt))  # per-set job count; the global nj stays untouched for the next set
+
+ split_scps=""
+ for n in $(seq ${_nj}); do
+ split_scps="${split_scps} ${dir}/logs/segments.${n}"
+ done
+ utils/split_scp.pl ${dir}/segments ${split_scps}
+
+ ${train_cmd} "JOB=1:${_nj}" "${dir}/logs/process_audio.JOB.log"\
+ python3 local/process_opus.py \
+ ${dir}/wav.scp \
+ ${dir}/logs/segments.JOB \
+ ${dir}/logs/wav.JOB.scp
+
+ # modify the `wav.scp` file and rename the `segments` file
+ # rename the `segments` file to avoid the audio file formatting process in stage 3 of `asr.sh`
+ mv ${dir}/wav.scp ${dir}/wav.scp.org
+ mv ${dir}/segments ${dir}/segments.org
+ for n in $(seq ${_nj}); do
+ cat ${dir}/logs/wav.${n}.scp || exit 1;
+ done | sort -u > ${dir}/wav.scp  # merge the per-job lists into the new per-utterance wav.scp
+ done
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then  # stage 3: normalize the transcripts
+ log "format text file"
+ for x in ${train_set} ${dev_set} ${test_sets}; do
+ log "format text for ${data_dir}/${x}"
+ dir=${data_dir}/${x}
+ mv ${dir}/text ${dir}/text.org  # NOTE(review): re-running this stage replaces text.org with the already-normalized text
+ paste -d " " <(cut -f 1 ${dir}/text.org) \
+ <(cut -f 2- ${dir}/text.org | local/text_normalize.pl) | \
+ sort -u > ${dir}/text
+ utils/fix_data_dir.sh ${dir}
+ done
+fi
+
+log "Successfully finished. [elapsed=${SECONDS}s]"
diff --git a/egs/wenetspeech/conformer/local/extract_meta.py b/egs/wenetspeech/conformer/local/extract_meta.py
index ce2871d..6074162 100755
--- a/egs/wenetspeech/conformer/local/extract_meta.py
+++ b/egs/wenetspeech/conformer/local/extract_meta.py
@@ -13,20 +13,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import sys
-import os
import argparse
import json
+import os
+import sys
def get_args():
- parser = argparse.ArgumentParser(description="""
+ parser = argparse.ArgumentParser(
+ description="""
This script is used to process raw json dataset of WenetSpeech,
where the long wav is splitinto segments and
data of wenet format is generated.
- """)
- parser.add_argument('input_json', help="""Input json file of WenetSpeech""")
- parser.add_argument('output_dir', help="""Output dir for prepared data""")
+ """
+ )
+ parser.add_argument("input_json", help="""Input json file of WenetSpeech""")
+ parser.add_argument("output_dir", help="""Output dir for prepared data""")
args = parser.parse_args()
return args
@@ -39,58 +41,68 @@
os.makedirs(output_dir)
try:
- with open(input_json, 'r') as injson:
+ with open(input_json, "r") as injson:
json_data = json.load(injson)
except Exception:
- sys.exit(f'Failed to load input json file: {input_json}')
+ sys.exit(f"Failed to load input json file: {input_json}")
else:
- if json_data['audios'] is not None:
- with open(f'{output_dir}/text', 'w') as utt2text, \
- open(f'{output_dir}/segments', 'w') as segments, \
- open(f'{output_dir}/utt2dur', 'w') as utt2dur, \
- open(f'{output_dir}/wav.scp', 'w') as wavscp, \
- open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
- open(f'{output_dir}/reco2dur', 'w') as reco2dur:
- for long_audio in json_data['audios']:
+ if json_data["audios"] is not None:
+ with open(f"{output_dir}/text", "w") as utt2text, open(
+ f"{output_dir}/segments", "w"
+ ) as segments, open(f"{output_dir}/utt2dur", "w") as utt2dur, open(
+ f"{output_dir}/wav.scp", "w"
+ ) as wavscp, open(
+ f"{output_dir}/utt2subsets", "w"
+ ) as utt2subsets, open(
+ f"{output_dir}/reco2dur", "w"
+ ) as reco2dur:
+ for long_audio in json_data["audios"]:
try:
long_audio_path = os.path.realpath(
- os.path.join(input_dir, long_audio['path']))
- aid = long_audio['aid']
- segments_lists = long_audio['segments']
- duration = long_audio['duration']
- assert (os.path.exists(long_audio_path))
+ os.path.join(input_dir, long_audio["path"])
+ )
+ aid = long_audio["aid"]
+ segments_lists = long_audio["segments"]
+ duration = long_audio["duration"]
+ assert os.path.exists(long_audio_path)
except AssertionError:
- print(f'''Warning: {aid} something is wrong,
- maybe AssertionError, skipped''')
+ print(
+ f"""Warning: {aid} something is wrong,
+ maybe AssertionError, skipped"""
+ )
continue
except Exception:
- print(f'''Warning: {aid} something is wrong, maybe the
- error path: {long_audio_path}, skipped''')
+ print(
+ f"""Warning: {aid} something is wrong, maybe the
+ error path: {long_audio_path}, skipped"""
+ )
continue
else:
- wavscp.write(f'{aid}\t{long_audio_path}\n')
- reco2dur.write(f'{aid}\t{duration}\n')
+ wavscp.write(f"{aid}\t{long_audio_path}\n")
+ reco2dur.write(f"{aid}\t{duration}\n")
for segment_file in segments_lists:
try:
- sid = segment_file['sid']
- start_time = segment_file['begin_time']
- end_time = segment_file['end_time']
+ sid = segment_file["sid"]
+ start_time = segment_file["begin_time"]
+ end_time = segment_file["end_time"]
dur = end_time - start_time
- text = segment_file['text']
+ text = segment_file["text"]
segment_subsets = segment_file["subsets"]
except Exception:
- print(f'''Warning: {segment_file} something
- is wrong, skipped''')
+ print(
+ f"""Warning: {segment_file} something
+ is wrong, skipped"""
+ )
continue
else:
- utt2text.write(f'{sid}\t{text}\n')
+ utt2text.write(f"{sid}\t{text}\n")
segments.write(
- f'{sid}\t{aid}\t{start_time}\t{end_time}\n'
+ f"{sid}\t{aid}\t{start_time}\t{end_time}\n"
)
- utt2dur.write(f'{sid}\t{dur}\n')
+ utt2dur.write(f"{sid}\t{dur}\n")
segment_sub_names = " ".join(segment_subsets)
- utt2subsets.write(
- f'{sid}\t{segment_sub_names}\n')
+ utt2subsets.write(f"{sid}\t{segment_sub_names}\n")
+
def main():
args = get_args()
@@ -98,5 +110,5 @@
meta_analysis(args.input_json, args.output_dir)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/egs/wenetspeech/conformer/local/path.sh b/egs/wenetspeech/conformer/local/path.sh
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/egs/wenetspeech/conformer/local/path.sh
diff --git a/egs/wenetspeech/conformer/local/process_opus.py b/egs/wenetspeech/conformer/local/process_opus.py
index 9f71eb1..044d183 100755
--- a/egs/wenetspeech/conformer/local/process_opus.py
+++ b/egs/wenetspeech/conformer/local/process_opus.py
@@ -16,14 +16,15 @@
# usage: python3 process_opus.py wav.scp segments output_wav.scp
-from pydub import AudioSegment
-import sys
import os
+import sys
+
+from pydub import AudioSegment
def read_file(wav_scp, segments):
wav_scp_dict = {}
- with open(wav_scp, 'r', encoding='UTF-8') as fin:
+ with open(wav_scp, "r", encoding="UTF-8") as fin:
for line_str in fin:
wav_id, path = line_str.strip().split()
wav_scp_dict[wav_id] = path
@@ -32,7 +33,7 @@
seg_path_list = []
start_time_list = []
end_time_list = []
- with open(segments, 'r', encoding='UTF-8') as fin:
+ with open(segments, "r", encoding="UTF-8") as fin:
for line_str in fin:
arr = line_str.strip().split()
assert len(arr) == 4
@@ -44,30 +45,27 @@
# TODO(Qijie): Fix the process logic
-def output(output_wav_scp, utt_list, seg_path_list, start_time_list,
- end_time_list):
+def output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list):
num_utts = len(utt_list)
step = int(num_utts * 0.01)
- with open(output_wav_scp, 'w', encoding='UTF-8') as fout:
+ with open(output_wav_scp, "w", encoding="UTF-8") as fout:
previous_wav_path = ""
for i in range(num_utts):
utt_id = utt_list[i]
current_wav_path = seg_path_list[i]
- output_dir = (os.path.dirname(current_wav_path)) \
- .replace("audio", 'audio_seg')
- seg_wav_path = os.path.join(output_dir, utt_id + '.wav')
+ output_dir = (os.path.dirname(current_wav_path)).replace(
+ "audio", "audio_seg"
+ )
+ seg_wav_path = os.path.join(output_dir, utt_id + ".wav")
- # if not os.path.exists(output_dir):
- # os.makedirs(output_dir)
-
+ os.makedirs(output_dir, exist_ok=True)
if current_wav_path != previous_wav_path:
source_wav = AudioSegment.from_file(current_wav_path)
previous_wav_path = current_wav_path
start = int(start_time_list[i] * 1000)
end = int(end_time_list[i] * 1000)
- target_audio = source_wav[start:end].set_frame_rate(16000) \
- .set_sample_width(2)
+ target_audio = source_wav[start:end].set_frame_rate(16000)
target_audio.export(seg_wav_path, format="wav")
fout.write("{} {}\n".format(utt_id, seg_wav_path))
@@ -80,11 +78,11 @@
segments = sys.argv[2]
output_wav_scp = sys.argv[3]
- utt_list, seg_path_list, start_time_list, end_time_list \
- = read_file(wav_scp, segments)
- output(output_wav_scp, utt_list, seg_path_list, start_time_list,
- end_time_list)
+ utt_list, seg_path_list, start_time_list, end_time_list = read_file(
+ wav_scp, segments
+ )
+ output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/egs/wenetspeech/conformer/local/text_normalize.pl b/egs/wenetspeech/conformer/local/text_normalize.pl
new file mode 100755
index 0000000..55b35e2
--- /dev/null
+++ b/egs/wenetspeech/conformer/local/text_normalize.pl
@@ -0,0 +1,24 @@
+#!/usr/bin/env perl
+use utf8;
+use open qw(:std :utf8);
+use warnings;
+# Normalize each STDIN line: upper-case letters, keep one space between English words and at English/Han boundaries.
+while (<STDIN>) {
+ chomp;
+ # remove ideographic (U+3000) and no-break (U+00A0) spaces outright, before words are joined
+ s/\x{3000}//g;
+ s/\x{00A0}//g;
+ # upper-case letters; the result must be assigned (`$_ =~ uc $_` only pattern-matched and changed nothing)
+ if ($_ =~ /[a-zA-Z]/) {$_ = uc $_;}
+ # mark word boundaries with "_": English-English, English-Han and Han-English
+ s/([A-Z]+)\s+([A-Z]+)/$1\_$2/g;
+ s/([A-Z]+)\s+([A-Z]+)/$1\_$2/g;  # second pass: /g cannot reuse a word consumed by the previous match ("A B C")
+ s/([A-Z]+)(\p{Han}+)/$1\_$2/g;
+ s/(\p{Han}+)([A-Z]+)/$1\_$2/g;
+ # remove all remaining whitespace characters
+ s/\s+//g;
+ # replace "_" with a normal whitespace
+ s/\_/ /g;
+
+ print "$_\n";
+}
diff --git a/egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh b/egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh
index baa2b32..4959328 100755
--- a/egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh
+++ b/egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh
@@ -24,7 +24,7 @@
prefix=
train_subset=L
-. ./utils/parse_options.sh || exit 1;
+. utils/parse_options.sh || exit 1;
filter_by_id () {
idlist=$1
diff --git a/egs/wenetspeech/conformer/run.sh b/egs/wenetspeech/conformer/run.sh
index 006c0b9..2ccafd7 100644
--- a/egs/wenetspeech/conformer/run.sh
+++ b/egs/wenetspeech/conformer/run.sh
@@ -41,6 +41,7 @@
set -u
set -o pipefail
+set=L
train_set=train_l
valid_set=dev
test_sets="dev test_net test_meeting"
@@ -71,15 +72,15 @@
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "stage 0: Data preparation"
# Data preparation
- local/wenetspeech_data_prep.sh $raw_data $feats_dir
- mkdir $feats_dir/data
- mv $feats_dir/$train_set $feats_dir/data/$train_set
- for x in $test_sets; do
- mv $feats_dir/$x $feats_dir/data/
- done
+ local/data.sh --set "${set}"  # option name and value must be separate words for parse_options.sh
+# mkdir $feats_dir/data
+# mv $feats_dir/$train_set $feats_dir/data/$train_set
+# for x in $test_sets; do
+# mv $feats_dir/$x $feats_dir/data/
+# done
fi
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- echo "stage 1: Feature and CMVN Generation"
- utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 0.1
-fi
+#if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+# echo "stage 1: Feature and CMVN Generation"
+# utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 0.1
+#fi
--
Gitblit v1.9.1