#!/bin/bash

# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita, Shota Horiguchi)
# Licensed under the MIT license.
#
# This script prepares kaldi-style data sets shared with different experiments
#   - data/xxxx
#     callhome, sre, swb2, and swb_cellular datasets
#   - data/simu_${simu_outputs}
#     simulation mixtures generated with various options

# First stage to run: each stage below is guarded by `[ $stage -le N ]`.
stage=0

# Modify corpus directories
#   - callhome_dir
#     CALLHOME (LDC2001S97)
#   - swb2_phase1_train
#     Switchboard-2 Phase 1 (LDC98S75)
#   - data_root
#     LDC99S79, LDC2002S06, LDC2001S13, LDC2004S07,
#     LDC2006S44, LDC2011S01, LDC2011S04, LDC2011S09,
#     LDC2011S10, LDC2012S01, LDC2011S05, LDC2011S08
#   - musan_root
#     MUSAN corpus (https://www.openslr.org/17/)
callhome_dir=
swb2_phase1_train=
data_root=
musan_root=

# Modify simulated data storage area.
# This script distributes simulated data under these directories
# (the split jobs are assigned to them in round-robin order).
simu_actual_dirs=(
    "./s05/$USER/diarization-data"
    "./s08/$USER/diarization-data"
    "./s09/$USER/diarization-data"
)

# data preparation options
# NOTE(review): max_jobs_run is not referenced anywhere in the visible script;
# the mixture generation step hard-codes its own --max-jobs-run value.
max_jobs_run=4
# Number of parallel jobs (--nj) for speech activity detection.
sad_num_jobs=30
# Extra options for detect_speech_activity.sh; expanded unquoted on purpose so
# that each option becomes a separate argument.
sad_opts="--extra-left-context 79 --extra-right-context 21 --frames-per-chunk 150 --extra-left-context-initial 0 --extra-right-context-final 0 --acwt 0.3"
# Passed as a single argument to --graph-opts.
sad_graph_opts="--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0"
# Passed as a single argument to --transform-probs-opts.
sad_priors_opts="--sil-scale=0.1"

# simulation options
# NOTE(review): simu_opts_overlap, simu_opts_min_utts and simu_opts_max_utts
# are not referenced anywhere in the visible script.
simu_opts_overlap=yes
# The two arrays are read pairwise: element i of each array defines one
# simulation condition (number of speakers, silence scale).
simu_opts_num_speaker_array=(1 2 3 4)
simu_opts_sil_scale_array=(2 2 5 9)
simu_opts_rvb_prob=0.5
# Number of mixtures generated from the training subset (swb_sre_tr).
simu_opts_num_train=100000
simu_opts_min_utts=10
simu_opts_max_utts=20

# Job launchers: train_cmd runs the random mixture generation and simu_cmd
# runs the parallel mixture rendering.
simu_cmd="run.pl"
train_cmd="run.pl"
# NOTE: both of the following are unconditionally reassigned at the start of
# the simulation stage, so the values given here (or on the command line)
# are not the ones that get used.
random_mixture_cmd="run.pl"
make_mixture_cmd="run.pl"

# Source the option parser so that the defaults assigned above can be
# overridden from the command line; abort if sourcing it fails.
# NOTE(review): parse_options.sh is looked up without a directory component,
# so it presumably has to be reachable through PATH - confirm.
. parse_options.sh || exit

# Stage 0: prepare the kaldi-style source data directories.  Every sub-step is
# skipped when its output directory already passes validate_data_dir.sh.
if [ "$stage" -le 0 ]; then
    echo "prepare kaldi-style datasets"
    # Prepare CALLHOME dataset. This will be used for evaluation.
    if ! validate_data_dir.sh --no-text --no-feats data/callhome1_spkall \
        || ! validate_data_dir.sh --no-text --no-feats data/callhome2_spkall; then
        # imported from https://github.com/kaldi-asr/kaldi/blob/master/egs/callhome_diarization/v1
        local/make_callhome.sh $callhome_dir data
        # Build the *_spkall sets (no filtering on the number of speakers
        # is applied here)
        for dset in callhome1 callhome2; do
            # Start from a copy of the whole subset
            copy_data_dir.sh data/${dset} data/${dset}_spkall
            # Regenerate segments file from fullref.rttm
            #  $2: recid, $4: start_time, $5: duration, $8: speakerid
            # The rttm covers all of CALLHOME; fix_data_dir.sh is run right
            # after to clean the directory up.
            awk '{printf "%s_%s_%07d_%07d %s %.2f %.2f\n", \
                 $2, $8, $4*100, ($4+$5)*100, $2, $4, $4+$5}' \
                data/callhome/fullref.rttm | sort > data/${dset}_spkall/segments
            utils/fix_data_dir.sh data/${dset}_spkall
            # Speaker ID is '[recid]_[speakerid]'
            awk '{split($1,A,"_"); printf "%s %s_%s\n", $1, A[1], A[2]}' \
                data/${dset}_spkall/segments > data/${dset}_spkall/utt2spk
            utils/fix_data_dir.sh data/${dset}_spkall
            # Generate rttm files for scoring
            steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \
                data/${dset}_spkall/utt2spk data/${dset}_spkall/segments \
                data/${dset}_spkall/rttm
            utils/data/get_reco2dur.sh data/${dset}_spkall
        done
    fi
    # Prepare a collection of NIST SRE and SWB data. After segmentation it is
    # split into the swb_sre_tr / swb_sre_cv sets at the end of this stage.
    if ! validate_data_dir.sh --no-text --no-feats data/swb_sre_comb; then
        local/make_sre.sh $data_root data
        # Prepare SWB for x-vector DNN training.
        local/make_swbd2_phase1.pl $swb2_phase1_train \
            data/swbd2_phase1_train
        local/make_swbd2_phase2.pl $data_root/LDC99S79 \
            data/swbd2_phase2_train
        local/make_swbd2_phase3.pl $data_root/LDC2002S06 \
            data/swbd2_phase3_train
        local/make_swbd_cellular1.pl $data_root/LDC2001S13 \
            data/swbd_cellular1_train
        local/make_swbd_cellular2.pl $data_root/LDC2004S07 \
            data/swbd_cellular2_train
        # Combine swb and sre data
        utils/combine_data.sh data/swb_sre_comb \
            data/swbd_cellular1_train data/swbd_cellular2_train \
            data/swbd2_phase1_train \
            data/swbd2_phase2_train data/swbd2_phase3_train data/sre
    fi
    # MUSAN data: the "background" noise set, restricted to the recordings
    # listed in noise/free-sound/ANNOTATIONS (its first line is skipped).
    if ! validate_data_dir.sh --no-text --no-feats data/musan_noise_bg; then
        local/make_musan.sh $musan_root data
        utils/copy_data_dir.sh data/musan_noise data/musan_noise_bg
        awk '{if(NR>1) print $1,$1}' $musan_root/noise/free-sound/ANNOTATIONS > data/musan_noise_bg/utt2spk
        utils/fix_data_dir.sh data/musan_noise_bg
    fi
    # simu rirs 8k
    if ! validate_data_dir.sh --no-text --no-feats data/simu_rirs_8k; then
        mkdir -p data/simu_rirs_8k
        # The archive must already be in the current directory; the download
        # is left commented out:
        # if [ ! -e sim_rir_8k.zip ]; then
        #     wget --no-check-certificate http://www.openslr.org/resources/26/sim_rir_8k.zip
        # fi
        unzip sim_rir_8k.zip -d data/sim_rir_8k
        # Recording id = two path components (the 3rd from last and the file
        # stem) joined by "_"; the path is split on "/" and ".".
        find $PWD/data/sim_rir_8k -iname "*.wav" \
            | awk '{n=split($1,A,/[\/\.]/); print A[n-3]"_"A[n-1], $1}' \
            | sort > data/simu_rirs_8k/wav.scp
        awk '{print $1, $1}' data/simu_rirs_8k/wav.scp > data/simu_rirs_8k/utt2spk
        utils/fix_data_dir.sh data/simu_rirs_8k
    fi
    # Automatic segmentation using pretrained SAD model
    #  it will take one day using 30 CPU jobs:
    #  make_mfcc: 1 hour, compute_output: 18 hours, decode: 0.5 hours
    sad_nnet_dir=exp/segmentation_1a/tdnn_stats_asr_sad_1a
    sad_work_dir=exp/segmentation_1a/tdnn_stats_asr_sad_1a
    if ! validate_data_dir.sh --no-text $sad_work_dir/swb_sre_comb_seg; then
        if [ ! -d exp/segmentation_1a ]; then
            # The model archive must already be in the current directory:
            # wget http://kaldi-asr.org/models/4/0004_tdnn_stats_asr_sad_1a.tar.gz
            tar zxf 0004_tdnn_stats_asr_sad_1a.tar.gz
        fi
        # $sad_opts is intentionally unquoted: it holds several options that
        # must be split into separate arguments.
        steps/segmentation/detect_speech_activity.sh \
            --nj $sad_num_jobs \
            --graph-opts "$sad_graph_opts" \
            --transform-probs-opts "$sad_priors_opts" $sad_opts \
            data/swb_sre_comb $sad_nnet_dir mfcc_hires $sad_work_dir \
            $sad_work_dir/swb_sre_comb || exit 1
    fi
    # Extract >1.5 sec segments and split into train/valid sets
    if ! validate_data_dir.sh --no-text --no-feats data/swb_sre_cv; then
        copy_data_dir.sh data/swb_sre_comb data/swb_sre_comb_seg
        awk '$4-$3>1.5{print;}' $sad_work_dir/swb_sre_comb_seg/segments > data/swb_sre_comb_seg/segments
        cp $sad_work_dir/swb_sre_comb_seg/{utt2spk,spk2utt} data/swb_sre_comb_seg
        fix_data_dir.sh data/swb_sre_comb_seg
        utils/subset_data_dir_tr_cv.sh data/swb_sre_comb_seg data/swb_sre_tr data/swb_sre_cv
    fi
fi

# Copy the regular files found directly under directory $1 into directory $2,
# keeping their names.
#   $1: source data directory
#   $2: destination data directory (must already exist)
#   $3: "yes" to overwrite the destination files, anything else (default) to
#       append to them
# Sub-directories and hidden files of $1 are ignored.
append_data_dir_files() {
    local src=$1 dst=$2 reset=${3:-no} f
    for f in "$src"/*; do
        [ -f "$f" ] || continue
        if [ "$reset" = yes ]; then
            cat "$f" > "$dst/${f##*/}"
        else
            cat "$f" >> "$dst/${f##*/}"
        fi
    done
}

simudir=data/simu
# Stage 1: generate simulated mixtures for every (number of speakers, silence
# scale) pair, for both the training and the validation subset, and gather
# all conditions of a subset into one concatenated data directory.
if [ "$stage" -le 1 ]; then
    echo "simulation of mixture"
    mkdir -p $simudir/.work
    # These assignments replace whatever was configured for the two variables
    # before this point.
    random_mixture_cmd=local/random_mixture.py
    make_mixture_cmd=local/make_mixture.py

    for ((i=0; i<${#simu_opts_sil_scale_array[@]}; ++i)); do
        simu_opts_num_speaker=${simu_opts_num_speaker_array[i]}
        simu_opts_sil_scale=${simu_opts_sil_scale_array[i]}
        for dset in swb_sre_tr swb_sre_cv; do
            if [ "$dset" == "swb_sre_tr" ]; then
                n_mixtures=${simu_opts_num_train}
            else
                n_mixtures=500
            fi
            simuid=${dset}_ns${simu_opts_num_speaker}_beta${simu_opts_sil_scale}_${n_mixtures}
            # check if you have the simulation
            if ! validate_data_dir.sh --no-text --no-feats $simudir/data/$simuid; then
                # random mixture generation
                # The escaped '\>' hands the redirection over to $train_cmd as
                # an argument instead of letting this shell perform it.
                $train_cmd $simudir/.work/random_mixture_$simuid.log \
                    $random_mixture_cmd --n_speakers $simu_opts_num_speaker --n_mixtures $n_mixtures \
                    --speech_rvb_probability $simu_opts_rvb_prob \
                    --sil_scale $simu_opts_sil_scale \
                    data/$dset data/musan_noise_bg data/simu_rirs_8k \
                    \> $simudir/.work/mixture_$simuid.scp
                nj=64
                mkdir -p $simudir/wav/$simuid
                # distribute simulated data to $simu_actual_dir
                split_scps=
                for n in $(seq $nj); do
                    split_scps="$split_scps $simudir/.work/mixture_$simuid.$n.scp"
                    mkdir -p $simudir/.work/data_$simuid.$n
                    actual=${simu_actual_dirs[($n-1)%${#simu_actual_dirs[@]}]}/$simudir/wav/$simuid/$n
                    mkdir -p "$actual"
                    # A relative symlink target is resolved from the directory
                    # holding the link, not from the current directory, so the
                    # storage path is made absolute before linking.
                    actual=$(cd -- "$actual" >/dev/null && pwd) || exit 1
                    ln -nfs "$actual" $simudir/wav/$simuid/$n
                done
                utils/split_scp.pl $simudir/.work/mixture_$simuid.scp $split_scps || exit 1

                # NOTE(review): the job limit is hard-coded here; the
                # max_jobs_run option of this script is not consulted.
                $simu_cmd --max-jobs-run 64 JOB=1:$nj $simudir/.work/make_mixture_$simuid.JOB.log \
                    $make_mixture_cmd --rate=8000 \
                    $simudir/.work/mixture_$simuid.JOB.scp \
                    $simudir/.work/data_$simuid.JOB $simudir/wav/$simuid/JOB
                utils/combine_data.sh $simudir/data/$simuid $simudir/.work/data_$simuid.*
                steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \
                    $simudir/data/$simuid/utt2spk $simudir/data/$simuid/segments \
                    $simudir/data/$simuid/rttm
                utils/data/get_reco2dur.sh $simudir/data/$simuid
            fi
            # Name of the directory gathering all conditions of this subset:
            # the array values are joined with the letter "n".
            simuid_concat=${dset}_ns"$(IFS="n"; echo "${simu_opts_num_speaker_array[*]}")"_beta"$(IFS="n"; echo "${simu_opts_sil_scale_array[*]}")"_${n_mixtures}
            # With a single condition both names coincide and there is
            # nothing to concatenate (a file must not be appended to itself).
            if [ "$simuid_concat" != "$simuid" ]; then
                mkdir -p $simudir/data/$simuid_concat
                # The first condition overwrites the concatenated files so that
                # re-running this stage does not duplicate their content.
                if [ "$i" -eq 0 ]; then
                    concat_reset=yes
                else
                    concat_reset=no
                fi
                append_data_dir_files $simudir/data/$simuid $simudir/data/$simuid_concat $concat_reset
            fi
        done
    done
fi

# Stage 3: build self-contained copies of the two CALLHOME sets under
# data/eval, with the audio written out as wav files under wav/eval.
if [ "$stage" -le 3 ]; then
    # compose eval/callhome2_spkall
    eval_set=data/eval/callhome2_spkall
    if ! validate_data_dir.sh --no-text --no-feats $eval_set; then
        utils/copy_data_dir.sh data/callhome2_spkall $eval_set
        cp data/callhome2_spkall/rttm $eval_set/rttm
        # New wav.scp: same recording ids, each pointing to <dstdir>/<recid>.wav
        awk -v dstdir=wav/eval/callhome2_spkall '{print $1, dstdir"/"$1".wav"}' data/callhome2_spkall/wav.scp > $eval_set/wav.scp
        mkdir -p wav/eval/callhome2_spkall
        # Write the audio of the source wav.scp to the paths listed in the new one
        wav-copy scp:data/callhome2_spkall/wav.scp scp:$eval_set/wav.scp
        utils/data/get_reco2dur.sh $eval_set
    fi

    # compose eval/callhome1_spkall
    adapt_set=data/eval/callhome1_spkall
    if ! validate_data_dir.sh --no-text --no-feats $adapt_set; then
        utils/copy_data_dir.sh data/callhome1_spkall $adapt_set
        cp data/callhome1_spkall/rttm $adapt_set/rttm
        # New wav.scp: same recording ids, each pointing to <dstdir>/<recid>.wav
        awk -v dstdir=wav/eval/callhome1_spkall '{print $1, dstdir"/"$1".wav"}' data/callhome1_spkall/wav.scp > $adapt_set/wav.scp
        mkdir -p wav/eval/callhome1_spkall
        # Write the audio of the source wav.scp to the paths listed in the new one
        wav-copy scp:data/callhome1_spkall/wav.scp scp:$adapt_set/wav.scp
        utils/data/get_reco2dur.sh $adapt_set
    fi
fi