| | |
| | | fi |
| | | |
| | | corpus=$1 |
| | | # dict_dir=$2 is disabled; the second positional argument is read into tmp on the next line |
| | | tmp=$2 |
| | | dir=$3 |
| | | |
| | |
| | | # validate utt-key list: take field 1 of wav.scp (dropping lines matching IC0803W0380, a bad utterance) and field 1 of trans.txt, then run filter_scp.pl on the two key lists to produce utt.list (presumably the keys present in both -- confirm against filter_scp.pl). NOTE(review): the tools/ and utils/ rows below are otherwise identical and write the same file, so the later row overwrites the earlier one -- this looks like both sides of a diff; confirm which path is intended |
| | | awk '{print $1}' $corpus/wav.scp | grep -v 'IC0803W0380' > $tmp/wav_utt.list |
| | | awk '{print $1}' $corpus/trans.txt > $tmp/trans_utt.list |
| | | tools/filter_scp.pl -f 1 $tmp/wav_utt.list $tmp/trans_utt.list > $tmp/utt.list |
| | | utils/filter_scp.pl -f 1 $tmp/wav_utt.list $tmp/trans_utt.list > $tmp/utt.list |
| | | |
| | | # wav.scp: rewrite each tab-separated line of the corpus wav.scp as "<field 1><TAB><corpus>/<field 2>", then pass it through filter_scp.pl with utt.list, sort it (sort -k 1) and drop adjacent duplicate lines before writing $tmp/wav.scp |
| | | awk -F'\t' -v path_prefix=$corpus '{printf("%s\t%s/%s\n",$1,path_prefix,$2)}' $corpus/wav.scp > $tmp/tmp_wav.scp |
| | | tools/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_wav.scp | sort -k 1 | uniq > $tmp/wav.scp |
| | | utils/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_wav.scp | sort -k 1 | uniq > $tmp/wav.scp |
| | | |
| | | # text: pass the corpus trans.txt through filter_scp.pl with utt.list, then sort and de-duplicate it the same way into $tmp/text |
| | | tools/filter_scp.pl -f 1 $tmp/utt.list $corpus/trans.txt | sort -k 1 | uniq > $tmp/text |
| | | utils/filter_scp.pl -f 1 $tmp/utt.list $corpus/trans.txt | sort -k 1 | uniq > $tmp/text |
| | | |
| | | # copy prepared resources from tmp_dir to target dir (the target dir is created first if it does not exist) |
| | | mkdir -p $dir |