From a0ffc9ba6b86780aa68ceca8b41bce464cf2a066 Mon Sep 17 00:00:00 2001
From: 志浩 <neo.dzh@alibaba-inc.com>
Date: Wed, 02 Aug 2023 14:18:54 +0800
Subject: [PATCH] TOLD/SOND: modify data preprocessing
---
egs/callhome/diarization/sond/conf/fbank.conf | 4 ++++
egs/callhome/diarization/sond/finetune.sh | 29 +++++++++++++++++++++++------
egs/callhome/diarization/sond/run.sh | 3 ++-
3 files changed, 29 insertions(+), 7 deletions(-)
diff --git a/egs/callhome/diarization/sond/conf/fbank.conf b/egs/callhome/diarization/sond/conf/fbank.conf
new file mode 100644
index 0000000..ef2df7f
--- /dev/null
+++ b/egs/callhome/diarization/sond/conf/fbank.conf
@@ -0,0 +1,4 @@
+--sample-frequency=8000
+--num-mel-bins=80
+--frame-length=25
+--snip-edges=false
\ No newline at end of file
diff --git a/egs/callhome/diarization/sond/finetune.sh b/egs/callhome/diarization/sond/finetune.sh
index f2428ad..cd887b8 100644
--- a/egs/callhome/diarization/sond/finetune.sh
+++ b/egs/callhome/diarization/sond/finetune.sh
@@ -92,8 +92,21 @@
# Prepare datasets
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
- echp "Stage 0: Prepare callhome data."
+ echo "Stage 0: Prepare callhome data."
local/make_callhome.sh ${callhome_root} ${datadir}/
+
+ # split ref.rttm
+ for dset in callhome1 callhome2; do
+   # Select RTTM rows by exact match on field 2 (recording id); a bare grep also hits substrings.
+   for name in $(awk '{print $1}' "${datadir}/${dset}/wav.scp"); do
+     awk -v rec="${name}" '$2 == rec' "${datadir}/callhome/fullref.rttm"
+   done > "${datadir}/${dset}/ref.rttm"
+
+   # filter out records which don't have rttm labels.
+   awk '{print $2}' "${datadir}/${dset}/ref.rttm" | sort -u > "${datadir}/${dset}/uttid"
+   mv "${datadir}/${dset}/wav.scp" "${datadir}/${dset}/wav.scp.bak"
+   awk 'NR==FNR{a[$1]=1; next} ($1 in a)' "${datadir}/${dset}/uttid" "${datadir}/${dset}/wav.scp.bak" > "${datadir}/${dset}/wav.scp"
+ done
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
@@ -123,10 +136,10 @@
mkdir -p ${dumpdir}/${dset}/nonoverlap_0s
python -Wignore script/extract_nonoverlap_segments.py \
${datadir}/${dset}/wav.scp ${datadir}/${dset}/ref.rttm ${dumpdir}/${dset}/nonoverlap_0s \
- --min_dur 0 --max_spk_num 8 --sr ${sr} --no_pbar --nj ${nj}
+ --min_dur 0.1 --max_spk_num 8 --sr ${sr} --no_pbar --nj ${nj}
mkdir -p ${datadir}/${dset}/nonoverlap_0s
- find `pwd`/${dumpdir}/${dset}/nonoverlap_0s | sort | awk -F'[/.]' '{print $(NF-1),$0}' > ${datadir}/${dset}/nonoverlap_0s/wav.scp
+ find ${dumpdir}/${dset}/nonoverlap_0s/ -iname "*.wav" | sort | awk -F'[/.]' '{print $(NF-1),$0}' > ${datadir}/${dset}/nonoverlap_0s/wav.scp
awk -F'[/.]' '{print $(NF-1),$(NF-2)}' ${datadir}/${dset}/nonoverlap_0s/wav.scp > ${datadir}/${dset}/nonoverlap_0s/utt2spk
echo "Done."
done
@@ -134,13 +147,17 @@
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Stage 3: Generate fbank features"
- home_path=`pwd`
+ home_path=$(pwd)
cd ${kaldi_root}/egs/callhome_diarization/v2 || exit
- . ./cmd.sh
+ export train_cmd="run.pl"
+ export cmd="run.pl"
. ./path.sh
+ cd $home_path || exit
+ ln -s ${kaldi_root}/egs/callhome_diarization/v2/steps ./
for dset in callhome1 callhome2; do
+ mv ${datadir}/${dset}/segments ${datadir}/${dset}/segs
steps/make_fbank.sh --write-utt2num-frames true --fbank-config conf/fbank.conf --nj ${nj} --cmd "$train_cmd" \
${datadir}/${dset} ${expdir}/make_fbank/${dset} ${dumpdir}/${dset}/fbank
utils/fix_data_dir.sh ${datadir}/${dset}
@@ -151,8 +168,8 @@
${datadir}/${dset} ${expdir}/make_fbank/${dset} ${dumpdir}/${dset}/fbank
utils/fix_data_dir.sh ${datadir}/${dset}
done
+ rm -f steps
- cd ${home_path} || exit
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
diff --git a/egs/callhome/diarization/sond/run.sh b/egs/callhome/diarization/sond/run.sh
index c0ecd35..5c5cbd4 100644
--- a/egs/callhome/diarization/sond/run.sh
+++ b/egs/callhome/diarization/sond/run.sh
@@ -48,6 +48,7 @@
# dataset related
data_root=
+callhome_root=path/to/NIST/LDC2001S97
# experiment configuration
lang=en
@@ -124,7 +125,7 @@
utils/fix_data_dir.sh ${datadir}/swbd_sre
# 3. Prepare the Callhome portion of NIST SRE 2000.
- local/make_callhome.sh /nfs/wangjiaming.wjm/speech-data/NIST/LDC2001S97 ${datadir}/
+ local/make_callhome.sh ${callhome_root} ${datadir}/
fi
--
Gitblit v1.9.1