From 4e0fcee2a915641e7f39d62c389bee561d849e19 Mon Sep 17 00:00:00 2001
From: jmwang66 <wangjiaming.wjm@alibaba-inc.com>
Date: 星期一, 19 六月 2023 20:28:23 +0800
Subject: [PATCH] Merge branch 'main' into dev_wjm_infer

---
 egs/alimeeting/sa_asr/local/alimeeting_data_prep.sh |   50 ++++++++++++++++++++++++--------------------------
 1 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/egs/alimeeting/sa-asr/local/alimeeting_data_prep.sh b/egs/alimeeting/sa_asr/local/alimeeting_data_prep.sh
similarity index 74%
rename from egs/alimeeting/sa-asr/local/alimeeting_data_prep.sh
rename to egs/alimeeting/sa_asr/local/alimeeting_data_prep.sh
index c13ee42..fd76837 100755
--- a/egs/alimeeting/sa-asr/local/alimeeting_data_prep.sh
+++ b/egs/alimeeting/sa_asr/local/alimeeting_data_prep.sh
@@ -21,6 +21,8 @@
 
 SECONDS=0
 tgt=Train #Train or Eval
+min_wav_duration=0.1
+max_wav_duration=20
 
 
 log "$0 $*"
@@ -57,27 +59,24 @@
 stop_stage=4
 mkdir -p $far_dir
 mkdir -p $near_dir
+mkdir -p data/org
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then 
     log "stage 1:process alimeeting near dir"
     
     find -L $near_raw_dir/audio_dir -iname "*.wav" | sort >  $near_dir/wavlist
-    awk -F '/' '{print $NF}' $near_dir/wavlist | awk -F '.' '{print $1}' > $near_dir/uttid   
-    find -L $near_raw_dir/textgrid_dir  -iname "*.TextGrid" | sort > $near_dir/textgrid.flist
+    awk -F '/' '{print $NF}' $near_dir/wavlist | awk -F '.' '{print $1}' | sort > $near_dir/uttid   
+    find -L $near_raw_dir/textgrid_dir  -iname "*.TextGrid" > $near_dir/textgrid.flist
     n1_wav=$(wc -l < $near_dir/wavlist)
     n2_text=$(wc -l < $near_dir/textgrid.flist)
     log  near file found $n1_wav wav and $n2_text text.
 
-    paste $near_dir/uttid $near_dir/wavlist > $near_dir/wav_raw.scp
-
-    # cat $near_dir/wav_raw.scp | awk '{printf("%s sox -t wav  %s -r 16000 -b 16 -c 1 -t wav  - |\n", $1, $2)}'  > $near_dir/wav.scp
-    cat $near_dir/wav_raw.scp | awk '{printf("%s sox -t wav  %s -r 16000 -b 16 -t wav  - |\n", $1, $2)}'  > $near_dir/wav.scp
+    paste $near_dir/uttid $near_dir/wavlist -d " " > $near_dir/wav.scp
     
     python local/alimeeting_process_textgrid.py --path $near_dir --no-overlap False
     cat $near_dir/text_all | local/text_normalize.pl | local/text_format.pl | sort -u > $near_dir/text
     utils/filter_scp.pl -f 1 $near_dir/text $near_dir/utt2spk_all | sort -u > $near_dir/utt2spk
-    #sed -e 's/ [a-z,A-Z,_,0-9,-]\+SPK/ SPK/'  $near_dir/utt2spk_old >$near_dir/tmp1
-    #sed -e 's/-[a-z,A-Z,0-9]\+$//' $near_dir/tmp1 | sort -u > $near_dir/utt2spk
+
     local/utt2spk_to_spk2utt.pl $near_dir/utt2spk > $near_dir/spk2utt
     utils/filter_scp.pl -f 1 $near_dir/text $near_dir/segments_all | sort -u > $near_dir/segments
     sed -e 's/ $//g' $near_dir/text> $near_dir/tmp1
@@ -97,9 +96,7 @@
     n2_text=$(wc -l < $far_dir/textgrid.flist)
     log  far file found $n1_wav wav and $n2_text text.
 
-    paste $far_dir/uttid $far_dir/wavlist > $far_dir/wav_raw.scp
-
-    cat $far_dir/wav_raw.scp | awk '{printf("%s sox -t wav  %s -r 16000 -b 16 -t wav  - |\n", $1, $2)}'  > $far_dir/wav.scp
+    paste $far_dir/uttid $far_dir/wavlist -d " " > $far_dir/wav.scp
 
     python local/alimeeting_process_overlap_force.py  --path $far_dir \
         --no-overlap false --mars True \
@@ -119,28 +116,28 @@
 
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-    log "stage 3: finali data process"
+    log "stage 3: final data process"
     local/fix_data_dir.sh $near_dir
     local/fix_data_dir.sh $far_dir
-    local/copy_data_dir.sh $near_dir data/${tgt}_Ali_near
-    local/copy_data_dir.sh $far_dir data/${tgt}_Ali_far
+    local/copy_data_dir.sh $near_dir data/org/${tgt}_Ali_near
+    local/copy_data_dir.sh $far_dir data/org/${tgt}_Ali_far
 
-    sort $far_dir/utt2spk_all_fifo > data/${tgt}_Ali_far/utt2spk_all_fifo
-    sed -i "s/src/$/g" data/${tgt}_Ali_far/utt2spk_all_fifo
+    sort $far_dir/utt2spk_all_fifo > data/org/${tgt}_Ali_far/utt2spk_all_fifo
+    sed -i "s/src/$/g" data/org/${tgt}_Ali_far/utt2spk_all_fifo
 
     # remove space in text
     for x in ${tgt}_Ali_near ${tgt}_Ali_far; do
-        cp data/${x}/text data/${x}/text.org
-        paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \
-        > data/${x}/text
-        rm data/${x}/text.org
+        cp data/org/${x}/text data/org/${x}/text.org
+        paste -d " " <(cut -f 1 -d" " data/org/${x}/text.org) <(cut -f 2- -d" " data/org/${x}/text.org | tr -d " ") \
+        > data/org/${x}/text
+        rm data/org/${x}/text.org
     done
 
     log "Successfully finished. [elapsed=${SECONDS}s]"
 fi
 
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-    log "stage 4: process alimeeting far dir (single speaker by oracle time strap)"
+    log "stage 4: process alimeeting far dir (single speaker by oracle time stamp)"
     cp -r $far_dir/* $far_single_speaker_dir 
     mv $far_single_speaker_dir/textgrid.flist  $far_single_speaker_dir/textgrid_oldpath
     paste -d " " $far_single_speaker_dir/uttid $far_single_speaker_dir/textgrid_oldpath > $far_single_speaker_dir/textgrid.flist
@@ -150,14 +147,15 @@
     local/utt2spk_to_spk2utt.pl $far_single_speaker_dir/utt2spk > $far_single_speaker_dir/spk2utt
 
     ./local/fix_data_dir.sh $far_single_speaker_dir 
-    local/copy_data_dir.sh $far_single_speaker_dir data/${tgt}_Ali_far_single_speaker
+    local/copy_data_dir.sh $far_single_speaker_dir data/org/${tgt}_Ali_far_single_speaker
 
     # remove space in text
     for x in ${tgt}_Ali_far_single_speaker; do
-        cp data/${x}/text data/${x}/text.org
-        paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \
-        > data/${x}/text
-        rm data/${x}/text.org
+        cp data/org/${x}/text data/org/${x}/text.org
+        paste -d " " <(cut -f 1 -d" " data/org/${x}/text.org) <(cut -f 2- -d" " data/org/${x}/text.org | tr -d " ") \
+        > data/org/${x}/text
+        rm data/org/${x}/text.org
     done
+    rm -rf data/local
     log "Successfully finished. [elapsed=${SECONDS}s]"
 fi
\ No newline at end of file

--
Gitblit v1.9.1