| egs/aishell/conformer/local/download_and_untar.sh | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
| egs/aishell/conformer/local/prepare_data.sh | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
| egs/aishell/conformer/run.sh | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
| egs/librispeech_100h/conformer/local/download_and_untar.sh | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
| egs/librispeech_100h/conformer/run.sh | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 |
egs/aishell/conformer/local/download_and_untar.sh
New file @@ -0,0 +1,105 @@
#!/usr/bin/env bash

# Copyright   2014  Johns Hopkins University (author: Daniel Povey)
#             2017  Xingyu Na
# Apache 2.0

# Downloads <url-base>/<corpus-part>.tgz into <data-base> (unless an archive
# with one of the expected sizes is already there) and un-tars it in place.
# For data_aishell, every *.tar.gz found under <data-base>/data_aishell/wav is
# extracted as well and then deleted.  On success the marker file
# <data-base>/<corpus-part>/.complete is created so that later runs are no-ops.
#
# Usage: download_and_untar.sh [--remove-archive] <data-base> <url-base> <corpus-part>
# Exit status: 0 on success (or if already extracted), 1 on any error.

remove_archive=false

if [ "$1" == --remove-archive ]; then
  remove_archive=true
  shift
fi

if [ $# -ne 3 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
  # Stop here: continuing with missing positional arguments only produces a
  # second, misleading error further down.
  exit 1
fi

data=$1
url=$2
part=$3

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data"
  exit 1
fi

part_ok=false
list="data_aishell resource_aishell"
# $list is intentionally unquoted: it is a space-separated list of names.
for x in $list; do
  if [ "$part" == "$x" ]; then part_ok=true; fi
done
if ! $part_ok; then
  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
  exit 1
fi

if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1
fi

if [ -f "$data/$part/.complete" ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0
fi

archive="$data/$part.tgz"

# sizes of the archive files in bytes.
sizes="15582913665 1246920"

if [ -f "$archive" ]; then
  size=$(/bin/ls -l -- "$archive" | awk '{print $5}')
  size_ok=false
  for s in $sizes; do if [ "$s" == "$size" ]; then size_ok=true; fi; done
  if ! $size_ok; then
    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm -- "$archive"
  else
    echo "$data/$part.tgz exists and appears to be complete."
  fi
fi

if [ ! -f "$archive" ]; then
  if ! command -v wget >/dev/null; then
    echo "$0: wget is not installed."
    exit 1
  fi
  full_url=$url/$part.tgz
  echo "$0: downloading data from $full_url. This may take some time, please be patient."

  # Download straight into $data instead of cd-ing there: changing directory
  # would make every later "$data/..." path wrong when $data is relative.
  if ! wget -P "$data" --no-check-certificate "$full_url"; then
    echo "$0: error executing wget $full_url"
    exit 1
  fi
fi

if ! tar -C "$data" -xvzf "$archive"; then
  echo "$0: error un-tarring archive $data/$part.tgz"
  exit 1
fi

if [ "$part" == "data_aishell" ]; then
  wav_dir="$data/$part/wav"
  if [ ! -d "$wav_dir" ]; then
    echo "$0: no such directory $wav_dir"
    exit 1
  fi
  for wav in "$wav_dir"/*.tar.gz; do
    # An unmatched glob stays literal; skip it instead of handing it to tar.
    [ -e "$wav" ] || continue
    echo "Extracting wav from $wav"
    if ! tar -C "$wav_dir" -zxf "$wav"; then
      echo "$0: error un-tarring archive $wav"
      exit 1
    fi
    rm -- "$wav"
  done
fi

# Only mark the part as complete once every extraction step has succeeded, so
# an interrupted run is retried rather than silently treated as finished.
touch "$data/$part/.complete"

echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"

if $remove_archive; then
  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
  rm -- "$archive"
fi

exit 0;
egs/aishell/conformer/local/prepare_data.sh
File was deleted egs/aishell/conformer/run.sh
@@ -8,7 +8,7 @@ count=1 gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding # for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob njob=1 njob=5 train_cmd=utils/run.pl infer_cmd=utils/run.pl @@ -16,10 +16,9 @@ feats_dir="../DATA" #feature output dictionary exp_dir="." lang=zh feats_type=fbank token_type=char scp=wav.scp type=sound scp=wav.scp stage=3 stop_stage=4 @@ -47,7 +46,7 @@ test_sets="dev test" asr_config=conf/train_asr_conformer.yaml model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}" model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}" inference_config=conf/decode_asr_transformer.yaml inference_asr_model=valid.acc.ave_10best.pb egs/librispeech_100h/conformer/local/download_and_untar.sh
New file @@ -0,0 +1,97 @@
#!/usr/bin/env bash

# Copyright   2014  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# Downloads <url-base>/<corpus-part>.tar.gz into <data-base> (unless an archive
# with one of the known sizes is already there) and un-tars it under
# <data-base>.  On success the marker file
# <data-base>/LibriSpeech/<corpus-part>/.complete is created so that later runs
# are no-ops.
#
# Usage: download_and_untar.sh [--remove-archive] <data-base> <url-base> <corpus-part>
# Exit status: 0 on success (or if already extracted), 1 on any error.

remove_archive=false

if [ "$1" == --remove-archive ]; then
  remove_archive=true
  shift
fi

if [ $# -ne 3 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
  echo "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/11 dev-clean"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  echo "<corpus-part> can be one of: dev-clean, test-clean, dev-other, test-other,"
  echo " train-clean-100, train-clean-360, train-other-500."
  exit 1
fi

data=$1
url=$2
part=$3

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data"
  exit 1
fi

part_ok=false
list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500"
# $list is intentionally unquoted: it is a space-separated list of names.
for x in $list; do
  if [ "$part" == "$x" ]; then part_ok=true; fi
done
if ! $part_ok; then
  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
  exit 1
fi

if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1
fi

if [ -f "$data/LibriSpeech/$part/.complete" ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0
fi

archive="$data/$part.tar.gz"

# Sizes of the archive files in bytes.  These are from some older versions.
sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128"
# sizes_new is the archive file sizes of the final release.  Some of these sizes are of
# things we probably won't download.
sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606"

if [ -f "$archive" ]; then
  size=$(/bin/ls -l -- "$archive" | awk '{print $5}')
  size_ok=false
  for s in $sizes_old $sizes_new; do if [ "$s" == "$size" ]; then size_ok=true; fi; done
  if ! $size_ok; then
    echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm -- "$archive"
  else
    echo "$data/$part.tar.gz exists and appears to be complete."
  fi
fi

if [ ! -f "$archive" ]; then
  # command -v is the portable way to test for a program; `which` is an
  # external tool whose exit status and output differ between systems.
  if ! command -v wget >/dev/null; then
    echo "$0: wget is not installed."
    exit 1
  fi
  full_url=$url/$part.tar.gz
  echo "$0: downloading data from $full_url. This may take some time, please be patient."

  if ! wget -P "$data" --no-check-certificate "$full_url"; then
    echo "$0: error executing wget $full_url"
    exit 1
  fi
fi

if ! tar -C "$data" -xvzf "$archive"; then
  echo "$0: error un-tarring archive $data/$part.tar.gz"
  exit 1
fi

# Written only after a successful extraction; its presence short-circuits
# subsequent runs for this part.
touch "$data/LibriSpeech/$part/.complete"

echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"

if $remove_archive; then
  echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
  rm -- "$archive"
fi
egs/librispeech_100h/conformer/run.sh
@@ -3,8 +3,8 @@ . ./path.sh || exit 1; # machines configuration CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" gpu_num=8 CUDA_VISIBLE_DEVICES="0,1" gpu_num=2 count=1 gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding # for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob @@ -16,30 +16,26 @@ feats_dir="../DATA" #feature output dictionary exp_dir="." lang=en dumpdir=dump/fbank feats_type=fbank token_type=bpe dataset_type=large scp=feats.scp type=kaldi_ark stage=3 stop_stage=4 type=sound scp=wav.scp stage=1 stop_stage=1 # feature configuration feats_dim=80 sample_frequency=16000 nj=100 speed_perturb="0.9,1.0,1.1" nj=64 # data data_librispeech= raw_data= data_url=www.openslr.org/resources/12 # bpe model nbpe=5000 bpemode=unigram # exp tag tag="" tag="exp1" . utils/parse_options.sh || exit 1; @@ -54,8 +50,7 @@ test_sets="test_clean test_other dev_clean dev_other" asr_config=conf/train_asr_conformer.yaml #asr_config=conf/train_asr_conformer_uttnorm.yaml model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}" model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}" inference_config=conf/decode_asr_transformer.yaml #inference_config=conf/decode_asr_transformer_beam60_ctc0.3.yaml @@ -73,6 +68,14 @@ _ngpu=0 fi if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then echo "stage -1: Data Download" for part in dev-clean test-clean dev-other test-other train-clean-100; do local/download_and_untar.sh ${raw_data} ${data_url} ${part} done fi if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then echo "stage 0: Data preparation" # Data preparation