#!/usr/bin/env bash

# Tokenize sharded text files in parallel and merge the per-job results.
#
# Usage:
#   <script> [--nj <nj>] [--cmd <cmd>] <text_dir> <seg_file> <logdir> <output_dir>
#
# Positional arguments:
#   text_dir    directory holding the input shards <text_dir>/txt/text.<JOB>.txt
#   seg_file    file handed to utils/text_tokenize.py through its -s option
#   logdir      directory receiving one log per job (text_tokenize.<JOB>.log)
#   output_dir  directory receiving the merged `text` and `text_shape` files;
#               per-job outputs are written under <output_dir>/txt
#
# The script must be started from a directory that contains `utils/`, because
# parse_options.sh, the job launcher and text_tokenize.py are all referenced by
# relative path.
#
# NOTE(review): --nj/--cmd are assumed to be handled by utils/parse_options.sh
# overriding the variables of the same name below (Kaldi convention) - confirm.

# Begin configuration section.
nj=32
cmd=utils/run.pl

# Log the full invocation; "$*" prints the same text as the arguments joined
# by spaces without mixing a string and an array expansion.
echo "$0 $*"

. utils/parse_options.sh || exit 1;

# Refuse to continue without all four positional arguments: an empty
# output_dir would otherwise make the script write to /txt, /text and
# /text_shape.
if (( $# < 4 )); then
  echo "Usage: $0 [--nj <nj>] [--cmd <cmd>] <text_dir> <seg_file> <logdir> <output_dir>" >&2
  exit 1
fi

# nj is used as a job range and as a loop bound below, so it has to be a
# positive integer.
if ! [[ "${nj}" =~ ^[1-9][0-9]*$ ]]; then
  echo "$0: --nj must be a positive integer, got '${nj}'" >&2
  exit 1
fi

# tokenize configuration
text_dir=$1
seg_file=$2
logdir=$3
output_dir=$4

txt_dir=${output_dir}/txt
mkdir -p "${txt_dir}" || exit 1
mkdir -p "${logdir}" || exit 1

# Run one tokenizer job per shard. The literal JOB token is left for the
# launcher to fill in.
# $cmd is intentionally unquoted so that a value carrying extra launcher
# arguments is split into separate words.
# shellcheck disable=SC2086
$cmd JOB=1:"${nj}" "${logdir}/text_tokenize.JOB.log" \
  python utils/text_tokenize.py -t "${text_dir}/txt/text.JOB.txt" \
  -s "${seg_file}" -i JOB -o "${txt_dir}" \
  || exit 1;

# concatenate the text files together, in job order.
for ((n = 1; n <= nj; n++)); do
  cat "${txt_dir}/text.${n}.txt" || exit 1
done > "${output_dir}/text" || exit 1

# concatenate the per-job len.<n> files into text_shape, in job order.
for ((n = 1; n <= nj; n++)); do
  cat "${txt_dir}/len.${n}" || exit 1
done > "${output_dir}/text_shape" || exit 1

echo "$0: Succeeded text tokenize"