From 4e0fcee2a915641e7f39d62c389bee561d849e19 Mon Sep 17 00:00:00 2001
From: jmwang66 <wangjiaming.wjm@alibaba-inc.com>
Date: Mon, 19 Jun 2023 20:28:23 +0800
Subject: [PATCH] Merge branch 'main' into dev_wjm_infer
---
egs/alimeeting/sa_asr/local/alimeeting_data_prep.sh | 50 ++++++++++++++++++++++++--------------------------
1 file changed, 24 insertions(+), 26 deletions(-)
diff --git a/egs/alimeeting/sa-asr/local/alimeeting_data_prep.sh b/egs/alimeeting/sa_asr/local/alimeeting_data_prep.sh
similarity index 74%
rename from egs/alimeeting/sa-asr/local/alimeeting_data_prep.sh
rename to egs/alimeeting/sa_asr/local/alimeeting_data_prep.sh
index c13ee42..fd76837 100755
--- a/egs/alimeeting/sa-asr/local/alimeeting_data_prep.sh
+++ b/egs/alimeeting/sa_asr/local/alimeeting_data_prep.sh
@@ -21,6 +21,8 @@
SECONDS=0
tgt=Train #Train or Eval
+min_wav_duration=0.1
+max_wav_duration=20
log "$0 $*"
@@ -57,27 +59,24 @@
stop_stage=4
mkdir -p $far_dir
mkdir -p $near_dir
+mkdir -p data/org
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
log "stage 1:process alimeeting near dir"
find -L $near_raw_dir/audio_dir -iname "*.wav" | sort > $near_dir/wavlist
- awk -F '/' '{print $NF}' $near_dir/wavlist | awk -F '.' '{print $1}' > $near_dir/uttid
- find -L $near_raw_dir/textgrid_dir -iname "*.TextGrid" | sort > $near_dir/textgrid.flist
+ awk -F '/' '{print $NF}' $near_dir/wavlist | awk -F '.' '{print $1}' | sort > $near_dir/uttid
+ find -L $near_raw_dir/textgrid_dir -iname "*.TextGrid" > $near_dir/textgrid.flist
n1_wav=$(wc -l < $near_dir/wavlist)
n2_text=$(wc -l < $near_dir/textgrid.flist)
log near file found $n1_wav wav and $n2_text text.
- paste $near_dir/uttid $near_dir/wavlist > $near_dir/wav_raw.scp
-
- # cat $near_dir/wav_raw.scp | awk '{printf("%s sox -t wav %s -r 16000 -b 16 -c 1 -t wav - |\n", $1, $2)}' > $near_dir/wav.scp
- cat $near_dir/wav_raw.scp | awk '{printf("%s sox -t wav %s -r 16000 -b 16 -t wav - |\n", $1, $2)}' > $near_dir/wav.scp
+ paste $near_dir/uttid $near_dir/wavlist -d " " > $near_dir/wav.scp
python local/alimeeting_process_textgrid.py --path $near_dir --no-overlap False
cat $near_dir/text_all | local/text_normalize.pl | local/text_format.pl | sort -u > $near_dir/text
utils/filter_scp.pl -f 1 $near_dir/text $near_dir/utt2spk_all | sort -u > $near_dir/utt2spk
- #sed -e 's/ [a-z,A-Z,_,0-9,-]\+SPK/ SPK/' $near_dir/utt2spk_old >$near_dir/tmp1
- #sed -e 's/-[a-z,A-Z,0-9]\+$//' $near_dir/tmp1 | sort -u > $near_dir/utt2spk
+
local/utt2spk_to_spk2utt.pl $near_dir/utt2spk > $near_dir/spk2utt
utils/filter_scp.pl -f 1 $near_dir/text $near_dir/segments_all | sort -u > $near_dir/segments
sed -e 's/ $//g' $near_dir/text> $near_dir/tmp1
@@ -97,9 +96,7 @@
n2_text=$(wc -l < $far_dir/textgrid.flist)
log far file found $n1_wav wav and $n2_text text.
- paste $far_dir/uttid $far_dir/wavlist > $far_dir/wav_raw.scp
-
- cat $far_dir/wav_raw.scp | awk '{printf("%s sox -t wav %s -r 16000 -b 16 -t wav - |\n", $1, $2)}' > $far_dir/wav.scp
+ paste $far_dir/uttid $far_dir/wavlist -d " " > $far_dir/wav.scp
python local/alimeeting_process_overlap_force.py --path $far_dir \
--no-overlap false --mars True \
@@ -119,28 +116,28 @@
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
- log "stage 3: finali data process"
+ log "stage 3: final data process"
local/fix_data_dir.sh $near_dir
local/fix_data_dir.sh $far_dir
- local/copy_data_dir.sh $near_dir data/${tgt}_Ali_near
- local/copy_data_dir.sh $far_dir data/${tgt}_Ali_far
+ local/copy_data_dir.sh $near_dir data/org/${tgt}_Ali_near
+ local/copy_data_dir.sh $far_dir data/org/${tgt}_Ali_far
- sort $far_dir/utt2spk_all_fifo > data/${tgt}_Ali_far/utt2spk_all_fifo
- sed -i "s/src/$/g" data/${tgt}_Ali_far/utt2spk_all_fifo
+ sort $far_dir/utt2spk_all_fifo > data/org/${tgt}_Ali_far/utt2spk_all_fifo
+ sed -i "s/src/$/g" data/org/${tgt}_Ali_far/utt2spk_all_fifo
# remove space in text
for x in ${tgt}_Ali_near ${tgt}_Ali_far; do
- cp data/${x}/text data/${x}/text.org
- paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \
- > data/${x}/text
- rm data/${x}/text.org
+ cp data/org/${x}/text data/org/${x}/text.org
+ paste -d " " <(cut -f 1 -d" " data/org/${x}/text.org) <(cut -f 2- -d" " data/org/${x}/text.org | tr -d " ") \
+ > data/org/${x}/text
+ rm data/org/${x}/text.org
done
log "Successfully finished. [elapsed=${SECONDS}s]"
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
- log "stage 4: process alimeeting far dir (single speaker by oracle time strap)"
+ log "stage 4: process alimeeting far dir (single speaker by oracle time stamp)"
cp -r $far_dir/* $far_single_speaker_dir
mv $far_single_speaker_dir/textgrid.flist $far_single_speaker_dir/textgrid_oldpath
paste -d " " $far_single_speaker_dir/uttid $far_single_speaker_dir/textgrid_oldpath > $far_single_speaker_dir/textgrid.flist
@@ -150,14 +147,15 @@
local/utt2spk_to_spk2utt.pl $far_single_speaker_dir/utt2spk > $far_single_speaker_dir/spk2utt
./local/fix_data_dir.sh $far_single_speaker_dir
- local/copy_data_dir.sh $far_single_speaker_dir data/${tgt}_Ali_far_single_speaker
+ local/copy_data_dir.sh $far_single_speaker_dir data/org/${tgt}_Ali_far_single_speaker
# remove space in text
for x in ${tgt}_Ali_far_single_speaker; do
- cp data/${x}/text data/${x}/text.org
- paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \
- > data/${x}/text
- rm data/${x}/text.org
+ cp data/org/${x}/text data/org/${x}/text.org
+ paste -d " " <(cut -f 1 -d" " data/org/${x}/text.org) <(cut -f 2- -d" " data/org/${x}/text.org | tr -d " ") \
+ > data/org/${x}/text
+ rm data/org/${x}/text.org
done
+ rm -rf data/local
log "Successfully finished. [elapsed=${SECONDS}s]"
fi
\ No newline at end of file
--
Gitblit v1.9.1