| File was renamed from egs/alimeeting/sa-asr/local/alimeeting_data_prep.sh |
| | |
| | | |
| | | SECONDS=0 |
| | | tgt=Train #Train or Eval |
| | | min_wav_duration=0.1 |
| | | max_wav_duration=20 |
| | | |
| | | |
| | | log "$0 $*" |
| | |
| | | stop_stage=4 |
| | | mkdir -p $far_dir |
| | | mkdir -p $near_dir |
| | | mkdir -p data/org |
| | | |
| | | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then |
| | | log "stage 1:process alimeeting near dir" |
| | | |
| | | find -L $near_raw_dir/audio_dir -iname "*.wav" | sort > $near_dir/wavlist |
| | | awk -F '/' '{print $NF}' $near_dir/wavlist | awk -F '.' '{print $1}' > $near_dir/uttid |
| | | find -L $near_raw_dir/textgrid_dir -iname "*.TextGrid" | sort > $near_dir/textgrid.flist |
| | | awk -F '/' '{print $NF}' $near_dir/wavlist | awk -F '.' '{print $1}' | sort > $near_dir/uttid |
| | | find -L $near_raw_dir/textgrid_dir -iname "*.TextGrid" > $near_dir/textgrid.flist |
| | | n1_wav=$(wc -l < $near_dir/wavlist) |
| | | n2_text=$(wc -l < $near_dir/textgrid.flist) |
| | | log near file found $n1_wav wav and $n2_text text. |
| | | |
| | | paste $near_dir/uttid $near_dir/wavlist > $near_dir/wav_raw.scp |
| | | |
| | | # cat $near_dir/wav_raw.scp | awk '{printf("%s sox -t wav %s -r 16000 -b 16 -c 1 -t wav - |\n", $1, $2)}' > $near_dir/wav.scp |
| | | cat $near_dir/wav_raw.scp | awk '{printf("%s sox -t wav %s -r 16000 -b 16 -t wav - |\n", $1, $2)}' > $near_dir/wav.scp |
| | | paste $near_dir/uttid $near_dir/wavlist -d " " > $near_dir/wav.scp |
| | | |
| | | python local/alimeeting_process_textgrid.py --path $near_dir --no-overlap False |
| | | cat $near_dir/text_all | local/text_normalize.pl | local/text_format.pl | sort -u > $near_dir/text |
| | | utils/filter_scp.pl -f 1 $near_dir/text $near_dir/utt2spk_all | sort -u > $near_dir/utt2spk |
| | | #sed -e 's/ [a-z,A-Z,_,0-9,-]\+SPK/ SPK/' $near_dir/utt2spk_old >$near_dir/tmp1 |
| | | #sed -e 's/-[a-z,A-Z,0-9]\+$//' $near_dir/tmp1 | sort -u > $near_dir/utt2spk |
| | | |
| | | local/utt2spk_to_spk2utt.pl $near_dir/utt2spk > $near_dir/spk2utt |
| | | utils/filter_scp.pl -f 1 $near_dir/text $near_dir/segments_all | sort -u > $near_dir/segments |
| | | sed -e 's/ $//g' $near_dir/text> $near_dir/tmp1 |
| | |
| | | n2_text=$(wc -l < $far_dir/textgrid.flist) |
| | | log far file found $n1_wav wav and $n2_text text. |
| | | |
| | | paste $far_dir/uttid $far_dir/wavlist > $far_dir/wav_raw.scp |
| | | |
| | | cat $far_dir/wav_raw.scp | awk '{printf("%s sox -t wav %s -r 16000 -b 16 -t wav - |\n", $1, $2)}' > $far_dir/wav.scp |
| | | paste $far_dir/uttid $far_dir/wavlist -d " " > $far_dir/wav.scp |
| | | |
| | | python local/alimeeting_process_overlap_force.py --path $far_dir \ |
| | | --no-overlap false --mars True \ |
| | |
| | | |
| | | |
| | | if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then |
| | | log "stage 3: finali data process" |
| | | log "stage 3: final data process" |
| | | local/fix_data_dir.sh $near_dir |
| | | local/fix_data_dir.sh $far_dir |
| | | local/copy_data_dir.sh $near_dir data/${tgt}_Ali_near |
| | | local/copy_data_dir.sh $far_dir data/${tgt}_Ali_far |
| | | local/copy_data_dir.sh $near_dir data/org/${tgt}_Ali_near |
| | | local/copy_data_dir.sh $far_dir data/org/${tgt}_Ali_far |
| | | |
| | | sort $far_dir/utt2spk_all_fifo > data/${tgt}_Ali_far/utt2spk_all_fifo |
| | | sed -i "s/src/$/g" data/${tgt}_Ali_far/utt2spk_all_fifo |
| | | sort $far_dir/utt2spk_all_fifo > data/org/${tgt}_Ali_far/utt2spk_all_fifo |
| | | sed -i "s/src/$/g" data/org/${tgt}_Ali_far/utt2spk_all_fifo |
| | | |
| | | # remove space in text |
| | | for x in ${tgt}_Ali_near ${tgt}_Ali_far; do |
| | | cp data/${x}/text data/${x}/text.org |
| | | paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \ |
| | | > data/${x}/text |
| | | rm data/${x}/text.org |
| | | cp data/org/${x}/text data/org/${x}/text.org |
| | | paste -d " " <(cut -f 1 -d" " data/org/${x}/text.org) <(cut -f 2- -d" " data/org/${x}/text.org | tr -d " ") \ |
| | | > data/org/${x}/text |
| | | rm data/org/${x}/text.org |
| | | done |
| | | |
| | | log "Successfully finished. [elapsed=${SECONDS}s]" |
| | | fi |
| | | |
| | | if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then |
| | | log "stage 4: process alimeeting far dir (single speaker by oracle time strap)" |
| | | log "stage 4: process alimeeting far dir (single speaker by oracle time stamp)" |
| | | cp -r $far_dir/* $far_single_speaker_dir |
| | | mv $far_single_speaker_dir/textgrid.flist $far_single_speaker_dir/textgrid_oldpath |
| | | paste -d " " $far_single_speaker_dir/uttid $far_single_speaker_dir/textgrid_oldpath > $far_single_speaker_dir/textgrid.flist |
| | |
| | | local/utt2spk_to_spk2utt.pl $far_single_speaker_dir/utt2spk > $far_single_speaker_dir/spk2utt |
| | | |
| | | ./local/fix_data_dir.sh $far_single_speaker_dir |
| | | local/copy_data_dir.sh $far_single_speaker_dir data/${tgt}_Ali_far_single_speaker |
| | | local/copy_data_dir.sh $far_single_speaker_dir data/org/${tgt}_Ali_far_single_speaker |
| | | |
| | | # remove space in text |
| | | for x in ${tgt}_Ali_far_single_speaker; do |
| | | cp data/${x}/text data/${x}/text.org |
| | | paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \ |
| | | > data/${x}/text |
| | | rm data/${x}/text.org |
| | | cp data/org/${x}/text data/org/${x}/text.org |
| | | paste -d " " <(cut -f 1 -d" " data/org/${x}/text.org) <(cut -f 2- -d" " data/org/${x}/text.org | tr -d " ") \ |
| | | > data/org/${x}/text |
| | | rm data/org/${x}/text.org |
| | | done |
| | | rm -rf data/local |
| | | log "Successfully finished. [elapsed=${SECONDS}s]" |
| | | fi |