From 327e91183f3eb434e1ec682ebfa616f9628381c7 Mon Sep 17 00:00:00 2001
From: 嘉渊 <wangjiaming.wjm@alibaba-inc.com>
Date: Thu, 25 May 2023 11:43:37 +0800
Subject: [PATCH] update repo
---
egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh | 135 +++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 135 insertions(+), 0 deletions(-)
diff --git a/egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh b/egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh
new file mode 100644
index 0000000..8585305
--- /dev/null
+++ b/egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+
+# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
+# Seasalt AI, Inc (Author: Guoguo Chen)
+# Mobvoi Inc(Author: Di Wu, Binbin Zhang)
+# NPU, ASLP Group (Author: Qijie Shao)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Stop at the first failing command; a pipeline fails if any stage fails.
+set -eo pipefail
+
+# Option defaults; presumably overridden by the option parser sourced below.
+stage=1
+prefix=
+train_subset=L
+source ./tools/parse_options.sh || exit 1
+
+# filter_by_id <idlist> <input> <output> [<field>]: write to <output> the
+# <input> lines whose <field>-th (1-based, default 1) whitespace-separated
+# column matches the first column of an <idlist> line; exit 1 on failure.
+filter_by_id () {
+  local idlist=$1 input=$2 output=$3 field=1
+  if [ $# -eq 4 ]; then field=$4; fi
+  # perl -s turns -idlist=... / -field=... after "--" into $idlist / $field.
+  if ! perl -se '
+    open(F, "<", $idlist) || die "Could not open id-list file $idlist";
+    while(<F>) {
+      @A = split;
+      @A>=1 || die "Invalid id-list file line $_";
+      $seen{$A[0]} = 1;
+    }
+    while(<>) {
+      @A = split;
+      @A > 0 || die "Invalid file line $_";
+      @A >= $field || die "Invalid file line $_";
+      if ($seen{$A[$field-1]}) {
+        print $_;
+      }
+    }' -- -idlist="$idlist" -field="$field" < "$input" > "$output"; then
+    echo "$0: filter_by_id() error: $input" >&2; exit 1
+  fi
+}
+
+# subset_data_dir <utt_list> <src_dir> <dest_dir>: create <dest_dir> holding
+# the utt2dur, text and segments entries of <src_dir> whose first column is
+# listed in <utt_list>, plus the wav.scp entries whose first column appears
+# as column 2 of the kept segments lines. Exits with status 1 on failure.
+subset_data_dir () {
+  local utt_list=$1 src_dir=$2 dest_dir=$3 name
+  mkdir -p "$dest_dir" || exit 1
+  for name in utt2dur text segments; do
+    filter_by_id "$utt_list" "$src_dir/$name" "$dest_dir/$name" || {
+      echo "$0: subset_data_dir() error: $src_dir/$name" >&2; exit 1; }
+  done
+  # wav.scp is filtered by segments column 2, not by the utterance list.
+  awk '{print $2}' "$dest_dir/segments" | sort -u > "$dest_dir/reco"
+  filter_by_id "$dest_dir/reco" "$src_dir/wav.scp" "$dest_dir/wav.scp" || {
+    echo "$0: subset_data_dir() error: $src_dir/wav.scp" >&2; exit 1; }
+  rm -f "$dest_dir/reco"
+}
+
+# Exactly two positional arguments are required; otherwise print the usage.
+if (( $# != 2 )); then
+  printf '%s\n' "Usage: $0 [options] <wenetspeech-dataset-dir> <data-dir>"
+  printf '%s\n' " e.g.: $0 --train-subset L /disk1/audio_data/wenetspeech/ data/" ""
+  printf '%s\n' "This script takes the WenetSpeech source directory, and prepares the"
+  printf '%s\n' "WeNet format data directory."
+  printf '%s\n' " --prefix <prefix> # Prefix for output data directory."
+  printf '%s\n' " --stage <stage> # Processing stage."
+  printf '%s\n' " --train-subset <L|M|S|W> # Train subset to be created."
+  exit 1
+fi
+
+# Positional arguments, in the order given in the usage text.
+wenetspeech_dir=$1
+data_dir=$2
+
+# Subset label -> name of the data directory built for that subset
+# (the normalised prefix below is put in front of each name when used).
+declare -A subsets=(
+  [L]=train_l [M]=train_m [S]=train_s [W]=train_w
+  [DEV]=dev
+  [TEST_NET]=test_net [TEST_MEETING]=test_meeting
+)
+
+# A non-empty prefix becomes "<prefix>_"; an empty or unset one becomes "".
+if [ -n "$prefix" ]; then prefix=${prefix}_; else prefix=; fi
+
+corpus_dir=$data_dir/${prefix}corpus/
+# Stage 1: check the corpus layout and extract its metadata into $corpus_dir.
+if [ "$stage" -le 1 ]; then
+  echo "$0: Extract meta into $corpus_dir"
+  if [ ! -f "$wenetspeech_dir/WenetSpeech.json" ]; then
+    echo "$0: Please download $wenetspeech_dir/WenetSpeech.json!" >&2; exit 1
+  fi
+  if [ ! -d "$wenetspeech_dir/audio" ]; then
+    echo "$0: Please download $wenetspeech_dir/audio!" >&2; exit 1
+  fi
+  mkdir -p "$corpus_dir"
+  # Stage 2 reads wav.scp, text, segments, utt2dur and utt2subsets from
+  # $corpus_dir; extract_meta.py is presumably what writes them there.
+  python3 local/extract_meta.py \
+    "$wenetspeech_dir/WenetSpeech.json" "$corpus_dir" || exit 1
+fi
+
+# Stage 2: build one data directory per requested subset from $corpus_dir.
+if [ "$stage" -le 2 ]; then
+  echo "$0: Split data to train, dev, test_net, and test_meeting"
+  [ -f "$corpus_dir/utt2subsets" ] || {
+    echo "$0: No such file $corpus_dir/utt2subsets!" >&2; exit 1; }
+  # $train_subset is deliberately unquoted: an empty value adds no label.
+  for label in $train_subset DEV TEST_NET TEST_MEETING; do
+    [ -n "${subsets[$label]+set}" ] || {
+      echo "$0: Subset $label is not defined in WenetSpeech.json." >&2; exit 1; }
+    subset=${subsets[$label]}
+    utt_list=$corpus_dir/${prefix}${subset}_utt_list
+    # Keep utt2subsets lines carrying $label in any column after the first.
+    awk -v s="$label" '{for (i=2;i<=NF;i++) if($i==s) print $0;}' \
+      "$corpus_dir/utt2subsets" > "$utt_list" || exit 1
+    subset_data_dir "$utt_list" "$corpus_dir" "$data_dir/${prefix}$subset" || exit 1
+  done
+fi
+
+echo "$0: Done"
\ No newline at end of file
--
Gitblit v1.9.1