#!/usr/bin/env bash
#
# Prepare Kaldi-style data directories for one AliMeeting subset.
#
# Reads   ${PWD}/dataset/${tgt}_Ali_{near,far}/{audio_dir,textgrid_dir}
# Writes  data/org/${tgt}_Ali_near, data/org/${tgt}_Ali_far and
#         data/org/${tgt}_Ali_far_single_speaker
# Scratch data/local (deleted at the end of stage 4)
#
# Must be run from a directory that contains ./utils, ./local and ./path.sh.

# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -euo pipefail

# Print a timestamped message tagged with the caller's file, line and function.
log() {
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

# Strip every space from the transcript column (fields 2..) of "<dir>/text",
# keeping the utterance id in field 1 untouched.
# Arguments: $1 - data directory containing a "text" file
remove_text_spaces() {
  local text_file=$1/text
  cp "${text_file}" "${text_file}.org"
  paste -d " " \
    <(cut -f 1 -d " " "${text_file}.org") \
    <(cut -f 2- -d " " "${text_file}.org" | tr -d " ") \
    > "${text_file}"
  rm "${text_file}.org"
}

# The variable name must match the ${help_message} reference below; with
# `set -u` a mismatch aborts the script instead of printing the usage text.
help_message=$(cat << EOF
Usage: $0

Options:
    --no_overlap (bool): Whether to ignore the overlapping utterance in the training set.
    --tgt (string): Which set to process, test or train.
EOF
)

SECONDS=0
tgt=Train #Train or Eval
# NOTE(review): the two duration limits are not referenced anywhere in this
# script; presumably kept so they stay accepted as options — confirm.
min_wav_duration=0.1
max_wav_duration=20

log "$0 $*"
echo "${tgt}"
. ./utils/parse_options.sh

. ./path.sh

AliMeeting="${PWD}/dataset"

if [[ $# -gt 2 ]]; then
  log "${help_message}"
  exit 2
fi

if [[ ! -d "${AliMeeting}" ]]; then
  log "Error: ${AliMeeting} is empty."
  exit 2
fi

# To absolute path
AliMeeting=$(cd "${AliMeeting}" && pwd)
echo "${AliMeeting}"

far_raw_dir=${AliMeeting}/${tgt}_Ali_far/
near_raw_dir=${AliMeeting}/${tgt}_Ali_near/

far_dir=data/local/${tgt}_Ali_far
near_dir=data/local/${tgt}_Ali_near
far_single_speaker_dir=data/local/${tgt}_Ali_far_correct_single_speaker
mkdir -p "${far_single_speaker_dir}"

# NOTE(review): assigned after parse_options.sh has been sourced, so whatever
# that script may have set for these names is overwritten here — confirm that
# always running stages 1-4 is intended.
stage=1
stop_stage=4
mkdir -p "${far_dir}"
mkdir -p "${near_dir}"
mkdir -p data/org

if [[ ${stage} -le 1 && ${stop_stage} -ge 1 ]]; then
  log "stage 1:process alimeeting near dir"

  find -L "${near_raw_dir}/audio_dir" -iname "*.wav" | sort > "${near_dir}/wavlist"
  # uttid is derived from wavlist WITHOUT being re-sorted: the two files are
  # joined line by line below, so they must share one ordering. Sorting the
  # ids independently can reorder them relative to the paths.
  awk -F '/' '{print $NF}' "${near_dir}/wavlist" \
    | awk -F '.' '{print $1}' > "${near_dir}/uttid"
  find -L "${near_raw_dir}/textgrid_dir" -iname "*.TextGrid" > "${near_dir}/textgrid.flist"
  n1_wav=$(wc -l < "${near_dir}/wavlist")
  n2_text=$(wc -l < "${near_dir}/textgrid.flist")
  log "near file found ${n1_wav} wav and ${n2_text} text."

  paste -d " " "${near_dir}/uttid" "${near_dir}/wavlist" > "${near_dir}/wav.scp"

  python local/alimeeting_process_textgrid.py --path "${near_dir}" --no-overlap False
  local/text_normalize.pl < "${near_dir}/text_all" \
    | local/text_format.pl \
    | sort -u > "${near_dir}/text"
  utils/filter_scp.pl -f 1 "${near_dir}/text" "${near_dir}/utt2spk_all" \
    | sort -u > "${near_dir}/utt2spk"
  local/utt2spk_to_spk2utt.pl "${near_dir}/utt2spk" > "${near_dir}/spk2utt"
  utils/filter_scp.pl -f 1 "${near_dir}/text" "${near_dir}/segments_all" \
    | sort -u > "${near_dir}/segments"

  # Drop a trailing space, then every '!' and '?', from the transcripts.
  sed -e 's/ $//g' "${near_dir}/text" > "${near_dir}/tmp1"
  sed -e 's/!//g' "${near_dir}/tmp1" > "${near_dir}/tmp2"
  sed -e 's/?//g' "${near_dir}/tmp2" > "${near_dir}/text"
fi

if [[ ${stage} -le 2 && ${stop_stage} -ge 2 ]]; then
  log "stage 2:process alimeeting far dir"

  find -L "${far_raw_dir}/audio_dir" -iname "*.wav" | sort > "${far_dir}/wavlist"
  awk -F '/' '{print $NF}' "${far_dir}/wavlist" \
    | awk -F '.' '{print $1}' > "${far_dir}/uttid"
  find -L "${far_raw_dir}/textgrid_dir" -iname "*.TextGrid" | sort > "${far_dir}/textgrid.flist"
  n1_wav=$(wc -l < "${far_dir}/wavlist")
  n2_text=$(wc -l < "${far_dir}/textgrid.flist")
  log "far file found ${n1_wav} wav and ${n2_text} text."

  paste -d " " "${far_dir}/uttid" "${far_dir}/wavlist" > "${far_dir}/wav.scp"

  python local/alimeeting_process_overlap_force.py --path "${far_dir}" \
    --no-overlap false --mars True \
    --overlap_length 0.8 --max_length 7

  local/text_normalize.pl < "${far_dir}/text_all" \
    | local/text_format.pl \
    | sort -u > "${far_dir}/text"
  utils/filter_scp.pl -f 1 "${far_dir}/text" "${far_dir}/utt2spk_all" \
    | sort -u > "${far_dir}/utt2spk"
  #sed -e 's/ [a-z,A-Z,_,0-9,-]\+SPK/ SPK/' $far_dir/utt2spk_old >$far_dir/utt2spk

  local/utt2spk_to_spk2utt.pl "${far_dir}/utt2spk" > "${far_dir}/spk2utt"
  utils/filter_scp.pl -f 1 "${far_dir}/text" "${far_dir}/segments_all" \
    | sort -u > "${far_dir}/segments"

  # Replace the literal token SRC with '$', then drop a trailing space and
  # every '!' and '?' from the transcripts.
  sed -e 's/SRC/$/g' "${far_dir}/text" > "${far_dir}/tmp1"
  sed -e 's/ $//g' "${far_dir}/tmp1" > "${far_dir}/tmp2"
  sed -e 's/!//g' "${far_dir}/tmp2" > "${far_dir}/tmp3"
  sed -e 's/?//g' "${far_dir}/tmp3" > "${far_dir}/text"
fi

if [[ ${stage} -le 3 && ${stop_stage} -ge 3 ]]; then
  log "stage 3: final data process"

  local/fix_data_dir.sh "${near_dir}"
  local/fix_data_dir.sh "${far_dir}"

  local/copy_data_dir.sh "${near_dir}" "data/org/${tgt}_Ali_near"
  local/copy_data_dir.sh "${far_dir}" "data/org/${tgt}_Ali_far"

  sort "${far_dir}/utt2spk_all_fifo" > "data/org/${tgt}_Ali_far/utt2spk_all_fifo"
  # Replaces the literal token src with '$' (the '$' is not a shell expansion).
  sed -i 's/src/$/g' "data/org/${tgt}_Ali_far/utt2spk_all_fifo"

  # remove space in text
  for x in "${tgt}_Ali_near" "${tgt}_Ali_far"; do
    remove_text_spaces "data/org/${x}"
  done

  log "Successfully finished. [elapsed=${SECONDS}s]"
fi

if [[ ${stage} -le 4 && ${stop_stage} -ge 4 ]]; then
  log "stage 4: process alimeeting far dir (single speaker by oracle time stamp)"

  cp -r "${far_dir}"/* "${far_single_speaker_dir}"
  # Rebuild textgrid.flist as "<uttid> <textgrid path>" pairs.
  mv "${far_single_speaker_dir}/textgrid.flist" "${far_single_speaker_dir}/textgrid_oldpath"
  paste -d " " "${far_single_speaker_dir}/uttid" "${far_single_speaker_dir}/textgrid_oldpath" \
    > "${far_single_speaker_dir}/textgrid.flist"
  python local/process_textgrid_to_single_speaker_wav.py --path "${far_single_speaker_dir}"

  # The utt2spk file is reused verbatim as the "text" file of this directory.
  cp "${far_single_speaker_dir}/utt2spk" "${far_single_speaker_dir}/text"
  local/utt2spk_to_spk2utt.pl "${far_single_speaker_dir}/utt2spk" \
    > "${far_single_speaker_dir}/spk2utt"

  ./local/fix_data_dir.sh "${far_single_speaker_dir}"
  local/copy_data_dir.sh "${far_single_speaker_dir}" "data/org/${tgt}_Ali_far_single_speaker"

  # remove space in text
  for x in "${tgt}_Ali_far_single_speaker"; do
    remove_text_spaces "data/org/${x}"
  done

  # All intermediate files live under data/local; only data/org is kept.
  rm -rf data/local
  log "Successfully finished. [elapsed=${SECONDS}s]"
fi