From c5b15732d4a90c9a6e051474110ddf7aa1f94615 Mon Sep 17 00:00:00 2001
From: 嘉渊 <wangjiaming.wjm@alibaba-inc.com>
Date: Wed, 10 May 2023 19:29:40 +0800
Subject: [PATCH] update repo
---
egs/librispeech_100h/conformer/local/data_prep.sh | 85 ++++++++++++++++++++++++++++++++++++++++++
egs/librispeech_100h/conformer/run.sh | 2
2 files changed, 86 insertions(+), 1 deletions(-)
diff --git a/egs/librispeech_100h/conformer/local/data_prep.sh b/egs/librispeech_100h/conformer/local/data_prep.sh
new file mode 100755
index 0000000..c903d45
--- /dev/null
+++ b/egs/librispeech_100h/conformer/local/data_prep.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+
+# Copyright 2014 Vassil Panayotov
+#           2014 Johns Hopkins University (author: Daniel Povey)
+# Apache 2.0
+#
+# Writes wav.scp, text, utt2spk, spk2gender and spk2utt for <src-dir> into <dst-dir>.
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 <src-dir> <dst-dir>"
+  echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean"
+  exit 1
+fi
+
+src=$1
+dst=$2
+
+# all utterances are FLAC compressed ('command -v' is the portable tool lookup)
+if ! command -v flac >/dev/null 2>&1; then
+  echo "Please install 'flac' on ALL worker nodes!"
+  exit 1
+fi
+
+spk_file=$src/../SPEAKERS.TXT
+# Validate the inputs first so a bad <src-dir> does not leave an empty <dst-dir>.
+[ ! -d "$src" ] && echo "$0: no such directory $src" && exit 1
+[ ! -f "$spk_file" ] && echo "$0: expected file $spk_file to exist" && exit 1
+mkdir -p "$dst" || exit 1
+# The output files are appended to later on, so drop leftovers from earlier runs.
+wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm "$wav_scp"
+trans=$dst/text; [[ -f "$trans" ]] && rm "$trans"
+utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm "$utt2spk"
+spk2gender=$dst/spk2gender; [[ -f "$spk2gender" ]] && rm "$spk2gender"
+
+# Walk <src>/<reader>/<chapter>/; both names must be integers. Paths are read line by line so they are never word-split.
+while IFS= read -r reader_dir; do
+  reader=$(basename "$reader_dir")
+  if ! [ "$reader" -eq "$reader" ] 2>/dev/null; then  # not integer.
+    echo "$0: unexpected subdirectory name $reader"
+    exit 1
+  fi
+  # Take the 2nd '|'-separated field (spaces removed, lowercased) of the reader's line in $spk_file.
+  reader_gender=$(grep -E "^$reader[ ]+\|" "$spk_file" | awk -F'|' '{gsub(/[ ]+/, ""); print tolower($2)}')
+  if [ "$reader_gender" != 'm' ] && [ "$reader_gender" != 'f' ]; then
+    echo "Unexpected gender: '$reader_gender'"
+    exit 1
+  fi
+
+  while IFS= read -r chapter_dir; do
+    chapter=$(basename "$chapter_dir")
+    if ! [ "$chapter" -eq "$chapter" ] 2>/dev/null; then
+      echo "$0: unexpected chapter-subdirectory name $chapter"
+      exit 1
+    fi
+    # One wav.scp line per .flac file: "<basename> flac -c -d -s <chapter_dir>/<basename>.flac |"
+    find -L "$chapter_dir/" -iname "*.flac" | sort | xargs -I% basename % .flac | \
+      awk -v "dir=$chapter_dir" '{printf "%s flac -c -d -s %s/%s.flac |\n", $0, dir, $0}' >>"$wav_scp" || exit 1
+
+    chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt
+    [ ! -f "$chapter_trans" ] && echo "$0: expected file $chapter_trans to exist" && exit 1
+    cat "$chapter_trans" >>"$trans"
+
+    # NOTE: For now we are using per-chapter utt2spk. That is each chapter is considered
+    #       to be a different speaker. This is done for simplicity and because we want
+    #       e.g. the CMVN to be calculated per-chapter
+    awk -v "reader=$reader" -v "chapter=$chapter" '{printf "%s %s-%s\n", $1, reader, chapter}' \
+      <"$chapter_trans" >>"$utt2spk" || exit 1
+    # reader -> gender map (again using per-chapter granularity)
+    echo "${reader}-${chapter} $reader_gender" >>"$spk2gender"
+  done < <(find -L "$reader_dir/" -mindepth 1 -maxdepth 1 -type d | sort)
+done < <(find -L "$src" -mindepth 1 -maxdepth 1 -type d | sort)
+
+# Build spk2utt from utt2spk with the utils/ helper (path is relative to the cwd).
+spk2utt=$dst/spk2utt
+utils/utt2spk_to_spk2utt.pl <"$utt2spk" >"$spk2utt" || exit 1
+
+# utt2spk gets one line per transcript line, so the two line counts must match.
+ntrans=$(wc -l <"$trans")
+nutt2spk=$(wc -l <"$utt2spk")
+! [ "$ntrans" -eq "$nutt2spk" ] && \
+  echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1
+
+utils/validate_data_dir.sh --no-feats "$dst" || exit 1
+echo "$0: successfully prepared data in $dst"
+exit 0
diff --git a/egs/librispeech_100h/conformer/run.sh b/egs/librispeech_100h/conformer/run.sh
index 18c4233..05369b7 100755
--- a/egs/librispeech_100h/conformer/run.sh
+++ b/egs/librispeech_100h/conformer/run.sh
@@ -80,7 +80,7 @@
echo "stage 0: Data preparation"
# Data preparation
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
- local/data_prep_librispeech.sh ${raw_data}/LibriSpeech/${x} ${feats_dir}/data/${x//-/_}
+ local/data_prep.sh ${raw_data}/LibriSpeech/${x} ${feats_dir}/data/${x//-/_}
done
fi
--
Gitblit v1.9.1