#!/usr/bin/env bash
# Prepare data directories (wav.scp, text, utt2spk, spk2utt, segments) from the
# AliMeeting corpus expected under ./dataset, for both the near-field and the
# far-field recordings of the selected subset (${tgt}).
#
# Stages:
#   1: process the near-field directory
#   2: process the far-field directory
#   3: copy both into data/ and strip spaces from the transcripts
#   4: build the far-field "single speaker" directory from the TextGrid files
#
# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
set -u
set -o pipefail

# Print a timestamped message tagged with the caller's file, line and function.
log() {
    local fname=${BASH_SOURCE[1]##*/}
    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

# List the wav and TextGrid files found under a raw corpus directory and derive
# uttid, wav_raw.scp and wav.scp from that listing.
# Arguments: $1 - raw corpus dir (contains audio_dir/ and textgrid_dir/)
#            $2 - output dir for the generated lists
list_raw_files() {
    local raw_dir=$1
    local out_dir=$2

    find -L "${raw_dir}/audio_dir" -iname "*.wav" > "${out_dir}/wavlist"
    # Utterance id = file basename up to the first '.'.
    awk -F '/' '{print $NF}' "${out_dir}/wavlist" \
        | awk -F '.' '{print $1}' > "${out_dir}/uttid"
    find -L "${raw_dir}/textgrid_dir" -iname "*.TextGrid" > "${out_dir}/textgrid.flist"

    paste "${out_dir}/uttid" "${out_dir}/wavlist" > "${out_dir}/wav_raw.scp"
    # Each wav.scp entry is a sox pipe resampling to 16 kHz / 16 bit. The
    # channel count is left untouched (an earlier variant forced "-c 1").
    awk '{printf("%s sox -t wav %s -r 16000 -b 16 -t wav - |\n", $1, $2)}' \
        "${out_dir}/wav_raw.scp" > "${out_dir}/wav.scp"
}

# Normalize text_all into text, then keep only the utt2spk / segments entries
# whose utterance id survived normalization, and rebuild spk2utt.
# Arguments: $1 - directory holding text_all, utt2spk_all and segments_all
filter_to_normalized_text() {
    local dir=$1

    local/text_normalize.pl < "${dir}/text_all" \
        | local/text_format.pl \
        | sort -u > "${dir}/text"
    utils/filter_scp.pl -f 1 "${dir}/text" "${dir}/utt2spk_all" \
        | sort -u > "${dir}/utt2spk"
    local/utt2spk_to_spk2utt.pl "${dir}/utt2spk" > "${dir}/spk2utt"
    utils/filter_scp.pl -f 1 "${dir}/text" "${dir}/segments_all" \
        | sort -u > "${dir}/segments"
}

# Remove every space from the transcript field of <dir>/text while keeping the
# single space that separates the utterance id from the transcript.
# Arguments: $1 - data directory containing a "text" file
remove_text_spaces() {
    local data_dir=$1

    cp "${data_dir}/text" "${data_dir}/text.org"
    paste -d " " \
        <(cut -f 1 -d" " "${data_dir}/text.org") \
        <(cut -f 2- -d" " "${data_dir}/text.org" | tr -d " ") \
        > "${data_dir}/text"
    rm "${data_dir}/text.org"
}

# NOTE(review): no "no_overlap" variable is defined in this script, so whether
# --no_overlap is actually accepted depends on utils/parse_options.sh - confirm.
help_message=$(cat << EOF
Usage: $0

Options:
    --no_overlap (bool): Whether to ignore the overlapping utterance in the training set.
    --tgt (string): Which set to process, Train or Eval.
EOF
)

SECONDS=0
tgt=Train # Train or Eval

# Stage range defaults. They are assigned before parse_options.sh is sourced so
# that it has a chance to override them; this assumes a Kaldi-style
# parse_options.sh that only sets variables which already exist - TODO confirm.
stage=1
stop_stage=4

log "$0 $*"
echo "${tgt}"
. ./utils/parse_options.sh

. ./path.sh

AliMeeting="${PWD}/dataset"

if [[ $# -gt 2 ]]; then
    log "${help_message}"
    exit 2
fi

if [[ ! -d "${AliMeeting}" ]]; then
    log "Error: ${AliMeeting} does not exist or is not a directory."
    exit 2
fi

# To absolute path
AliMeeting=$(cd "${AliMeeting}" && pwd)
echo "${AliMeeting}"

# The trailing slash is kept so the paths written to wavlist / wav.scp stay
# identical to what earlier runs produced.
far_raw_dir=${AliMeeting}/${tgt}_Ali_far/
near_raw_dir=${AliMeeting}/${tgt}_Ali_near/

far_dir=data/local/${tgt}_Ali_far
near_dir=data/local/${tgt}_Ali_near
far_single_speaker_dir=data/local/${tgt}_Ali_far_correct_single_speaker

mkdir -p "${far_single_speaker_dir}"
mkdir -p "${far_dir}"
mkdir -p "${near_dir}"

if [[ ${stage} -le 1 && ${stop_stage} -ge 1 ]]; then
    log "stage 1:process alimeeting near dir"

    list_raw_files "${near_raw_dir}" "${near_dir}"
    n1_wav=$(wc -l < "${near_dir}/wavlist")
    n2_text=$(wc -l < "${near_dir}/textgrid.flist")
    log "near file found ${n1_wav} wav and ${n2_text} text."

    python local/alimeeting_process_textgrid.py --path "${near_dir}" --no-overlap False
    filter_to_normalized_text "${near_dir}"

    # Strip a trailing space and every '!' / '?' from the transcripts.
    sed -e 's/ $//g' -e 's/!//g' -e 's/?//g' "${near_dir}/text" > "${near_dir}/tmp1"
    mv "${near_dir}/tmp1" "${near_dir}/text"
fi

if [[ ${stage} -le 2 && ${stop_stage} -ge 2 ]]; then
    log "stage 2:process alimeeting far dir"

    list_raw_files "${far_raw_dir}" "${far_dir}"
    n1_wav=$(wc -l < "${far_dir}/wavlist")
    n2_text=$(wc -l < "${far_dir}/textgrid.flist")
    log "far file found ${n1_wav} wav and ${n2_text} text."

    python local/alimeeting_process_overlap_force.py --path "${far_dir}" \
        --no-overlap false --mars True \
        --overlap_length 0.8 --max_length 7
    filter_to_normalized_text "${far_dir}"

    # Replace the literal token "SRC" with a literal '$', then strip a trailing
    # space and every '!' / '?'. The expression order matters and is preserved.
    sed -e 's/SRC/$/g' -e 's/ $//g' -e 's/!//g' -e 's/?//g' \
        "${far_dir}/text" > "${far_dir}/tmp1"
    mv "${far_dir}/tmp1" "${far_dir}/text"
fi

if [[ ${stage} -le 3 && ${stop_stage} -ge 3 ]]; then
    log "stage 3: finali data process"

    local/copy_data_dir.sh "${near_dir}" "data/${tgt}_Ali_near"
    local/copy_data_dir.sh "${far_dir}" "data/${tgt}_Ali_far"

    sort "${far_dir}/utt2spk_all_fifo" > "data/${tgt}_Ali_far/utt2spk_all_fifo"
    # Replace the literal token "src" with a literal '$'.
    sed -i 's/src/$/g' "data/${tgt}_Ali_far/utt2spk_all_fifo"

    # remove space in text
    for x in "${tgt}_Ali_near" "${tgt}_Ali_far"; do
        remove_text_spaces "data/${x}"
    done

    log "Successfully finished. [elapsed=${SECONDS}s]"
fi

if [[ ${stage} -le 4 && ${stop_stage} -ge 4 ]]; then
    log "stage 4: process alimeeting far dir (single speaker by oracle time strap)"

    cp -r "${far_dir}"/* "${far_single_speaker_dir}"
    mv "${far_single_speaker_dir}/textgrid.flist" "${far_single_speaker_dir}/textgrid_oldpath"
    # NOTE(review): uttid comes from the wav listing and textgrid_oldpath from a
    # separate find over textgrid_dir; this paste pairs them line by line and so
    # relies on both listings having the same order - confirm.
    paste -d " " "${far_single_speaker_dir}/uttid" "${far_single_speaker_dir}/textgrid_oldpath" \
        > "${far_single_speaker_dir}/textgrid.flist"

    python local/process_textgrid_to_single_speaker_wav.py --path "${far_single_speaker_dir}"

    # In this directory "text" is simply a copy of utt2spk.
    cp "${far_single_speaker_dir}/utt2spk" "${far_single_speaker_dir}/text"
    local/utt2spk_to_spk2utt.pl "${far_single_speaker_dir}/utt2spk" \
        > "${far_single_speaker_dir}/spk2utt"

    ./local/fix_data_dir.sh "${far_single_speaker_dir}"
    local/copy_data_dir.sh "${far_single_speaker_dir}" "data/${tgt}_Ali_far_single_speaker"

    # remove space in text
    for x in "${tgt}_Ali_far_single_speaker"; do
        remove_text_spaces "data/${x}"
    done

    log "Successfully finished. [elapsed=${SECONDS}s]"
fi