From a0ffc9ba6b86780aa68ceca8b41bce464cf2a066 Mon Sep 17 00:00:00 2001
From: 志浩 <neo.dzh@alibaba-inc.com>
Date: Wed, 02 Aug 2023 14:18:54 +0800
Subject: [PATCH] TOLD/SOND: modify data preprocessing

---
 egs/callhome/diarization/sond/conf/fbank.conf |    4 ++++
 egs/callhome/diarization/sond/finetune.sh     |   29 +++++++++++++++++++++++------
 egs/callhome/diarization/sond/run.sh          |    3 ++-
 3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/egs/callhome/diarization/sond/conf/fbank.conf b/egs/callhome/diarization/sond/conf/fbank.conf
new file mode 100644
index 0000000..ef2df7f
--- /dev/null
+++ b/egs/callhome/diarization/sond/conf/fbank.conf
@@ -0,0 +1,4 @@
+--sample-frequency=8000
+--num-mel-bins=80
+--frame-length=25
+--snip-edges=false
\ No newline at end of file
diff --git a/egs/callhome/diarization/sond/finetune.sh b/egs/callhome/diarization/sond/finetune.sh
index f2428ad..cd887b8 100644
--- a/egs/callhome/diarization/sond/finetune.sh
+++ b/egs/callhome/diarization/sond/finetune.sh
@@ -92,8 +92,21 @@
 
 # Prepare datasets
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-  echp "Stage 0: Prepare callhome data."
+  echo "Stage 0: Prepare callhome data."
   local/make_callhome.sh ${callhome_root} ${datadir}/
+
+  # Split fullref.rttm into one ref.rttm per subset; paths use ${datadir} (the directory passed to make_callhome.sh), not a hard-coded data/.
+  for dset in callhome1 callhome2; do
+    rm -f "${datadir}/${dset}/ref.rttm"
+    for name in $(awk '{print $1}' "${datadir}/${dset}/wav.scp"); do
+      # Match RTTM field 2 exactly: the filter below compares that same field with the wav.scp key, so substring/regex hits are unwanted.
+      awk -v rec="${name}" '$2 == rec' "${datadir}/callhome/fullref.rttm" >> "${datadir}/${dset}/ref.rttm"
+    done
+    # Filter out records which don't have rttm labels: uttid holds the unique field-2 values found in ref.rttm.
+    awk '{print $2}' "${datadir}/${dset}/ref.rttm" | sort -u > "${datadir}/${dset}/uttid"
+    mv "${datadir}/${dset}/wav.scp" "${datadir}/${dset}/wav.scp.bak"
+    awk 'NR == FNR {keep[$1] = 1; next} ($1 in keep)' "${datadir}/${dset}/uttid" "${datadir}/${dset}/wav.scp.bak" > "${datadir}/${dset}/wav.scp"
+  done
 fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
@@ -123,10 +136,10 @@
     mkdir -p ${dumpdir}/${dset}/nonoverlap_0s
     python -Wignore script/extract_nonoverlap_segments.py \
       ${datadir}/${dset}/wav.scp ${datadir}/${dset}/ref.rttm ${dumpdir}/${dset}/nonoverlap_0s \
-      --min_dur 0 --max_spk_num 8 --sr ${sr} --no_pbar --nj ${nj}
+      --min_dur 0.1 --max_spk_num 8 --sr ${sr} --no_pbar --nj ${nj}
 
     mkdir -p ${datadir}/${dset}/nonoverlap_0s
-    find `pwd`/${dumpdir}/${dset}/nonoverlap_0s | sort | awk -F'[/.]' '{print $(NF-1),$0}' > ${datadir}/${dset}/nonoverlap_0s/wav.scp
+    find ${dumpdir}/${dset}/nonoverlap_0s/ -iname "*.wav" | sort | awk -F'[/.]' '{print $(NF-1),$0}' > ${datadir}/${dset}/nonoverlap_0s/wav.scp
     awk -F'[/.]' '{print $(NF-1),$(NF-2)}' ${datadir}/${dset}/nonoverlap_0s/wav.scp > ${datadir}/${dset}/nonoverlap_0s/utt2spk
     echo "Done."
   done
@@ -134,13 +147,17 @@
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Stage 3: Generate fbank features"
-  home_path=`pwd`
+  home_path=$(pwd)
   cd ${kaldi_root}/egs/callhome_diarization/v2 || exit
 
-  . ./cmd.sh
+  export train_cmd="run.pl"
+  export cmd="run.pl"
   . ./path.sh
+  cd $home_path || exit
 
+  ln -s ${kaldi_root}/egs/callhome_diarization/v2/steps ./
   for dset in callhome1 callhome2; do
+    mv ${datadir}/${dset}/segments ${datadir}/${dset}/segs
     steps/make_fbank.sh --write-utt2num-frames true --fbank-config conf/fbank.conf --nj ${nj} --cmd "$train_cmd" \
         ${datadir}/${dset} ${expdir}/make_fbank/${dset} ${dumpdir}/${dset}/fbank
     utils/fix_data_dir.sh ${datadir}/${dset}
@@ -151,8 +168,8 @@
         ${datadir}/${dset} ${expdir}/make_fbank/${dset} ${dumpdir}/${dset}/fbank
     utils/fix_data_dir.sh ${datadir}/${dset}
   done
+  rm -f steps
 
-  cd ${home_path} || exit
 fi
 
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
diff --git a/egs/callhome/diarization/sond/run.sh b/egs/callhome/diarization/sond/run.sh
index c0ecd35..5c5cbd4 100644
--- a/egs/callhome/diarization/sond/run.sh
+++ b/egs/callhome/diarization/sond/run.sh
@@ -48,6 +48,7 @@
 
 # dataset related
 data_root=
+callhome_root=path/to/NIST/LDC2001S97
 
 # experiment configuration
 lang=en
@@ -124,7 +125,7 @@
   utils/fix_data_dir.sh ${datadir}/swbd_sre
 
   # 3. Prepare the Callhome portion of NIST SRE 2000.
-  local/make_callhome.sh /nfs/wangjiaming.wjm/speech-data/NIST/LDC2001S97 ${datadir}/
+  local/make_callhome.sh ${callhome_root} ${datadir}/
 
 fi
 

--
Gitblit v1.9.1