Changed files in this patch:
  egs/aishell2/transformerLM/conf/train_lm_transformer.yaml
  egs/aishell2/transformerLM/path.sh
  egs/aishell2/transformerLM/run.sh
  egs/aishell2/transformerLM/utils/parse_options.sh
  egs/aishell2/transformerLM/utils/run.pl
  egs/aishell2/transformerLM/utils/split_scp.pl
# ---- file: egs/aishell2/transformerLM/conf/train_lm_transformer.yaml ----
# Transformer language-model training configuration.

lm: transformer
lm_conf:
    pos_enc: null
    embed_unit: 128
    att_unit: 512
    head: 8
    unit: 2048
    layer: 16
    dropout_rate: 0.1

# optimization related
grad_clip: 5.0
batch_type: numel
batch_bins: 6000000
accum_grad: 1
max_epoch: 15       # 15 epochs are enough
optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000

best_model_criterion:
-   - valid
    - loss
    - min
keep_nbest_models: 10   # averaging the 10 best checkpoints works well
log_interval: 50

# ---- file: egs/aishell2/transformerLM/path.sh (content follows) ----
# Environment setup for the transformerLM recipe: locate the FunASR
# checkout three levels up and put its code on PYTHONPATH/PATH.
export FUNASR_DIR="${PWD}/../../.."

# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH="../../../:${PYTHONPATH}"
export PATH="${FUNASR_DIR}/funasr/bin:${PATH}"
# ---- file: egs/aishell2/transformerLM/run.sh (content follows) ----
#!/usr/bin/env bash
# ---- file: egs/aishell2/transformerLM/run.sh ----
# Transformer LM training / perplexity evaluation recipe for AISHELL-2.

. ./path.sh || exit 1;

# machines configuration
CUDA_VISIBLE_DEVICES="0,1"
gpu_num=2
count=1
train_cmd=utils/run.pl
infer_cmd=utils/run.pl

# general configuration
lang=zh
nlsyms_txt=none       # Non-linguistic symbol list if existing.
cleaner=none          # Text cleaner.
g2p=none              # g2p method (needed if token_type=phn).
lm_fold_length=150    # fold_length for LM training.
word_vocab_size=10000 # Size of word vocabulary.
token_type=char
lm_token_list=
nj=10

## path to AISHELL2 transcripts (must be provided, e.g. --lm_train_text <path>)
lm_train_text=
lm_dev_text=
lm_test_text=

# The following are *derived* from the options above.  They are declared
# here (so parse_options.sh accepts explicit overrides) but their defaults
# are filled in only AFTER option parsing; computing them before
# parse_options.sh ran would freeze the empty initial values and silently
# ignore e.g. "--lm_train_text foo" or "--tag bar" (bug in the original).
train_data_path_and_name_and_type=
train_shape_file=
valid_data_path_and_name_and_type=
valid_shape_file=
model_dir=
lm_exp=

lm_config=conf/train_lm_transformer.yaml
exp_dir=./data
tag=exp1
inference_lm=valid.loss.ave.pth # Language model path for decoding.

stage=0
stop_stage=3

. utils/parse_options.sh || exit 1;

# Fill in derived defaults now that all options have their final values.
[ -z "${train_data_path_and_name_and_type}" ] && train_data_path_and_name_and_type="${lm_train_text},text,text"
[ -z "${valid_data_path_and_name_and_type}" ] && valid_data_path_and_name_and_type="${lm_dev_text},text,text"
[ -z "${model_dir}" ] && model_dir="baseline_$(basename "${lm_config}" .yaml)_${lang}_${token_type}_${tag}"
[ -z "${lm_exp}" ] && lm_exp=${exp_dir}/exp/${model_dir}

# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline'
set -e
set -u
set -o pipefail

# Echo the smallest of its (integer) arguments.
min() {
    local a b
    a=$1
    for b in "$@"; do
        if [ "${b}" -le "${a}" ]; then
            a="${b}"
        fi
    done
    echo "${a}"
}

# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
ngpu=$(echo "$gpuid_list" | awk -F "," '{print NF}')

mkdir -p "${exp_dir}/exp/${model_dir}"
token_list=${exp_dir}/exp/${model_dir}/vocab.txt

blank="<blank>" # CTC blank symbol
sos="<s>"       # start-of-sentence symbol
eos="</s>"      # end-of-sentence symbol
oov="<unk>"     # Out of vocabulary symbol.
# Stage 0: build the token list (vocabulary) from the training transcript.
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    if [ "${token_type}" = char ] || [ "${token_type}" = word ]; then
        echo "Stage 0: Generate character level token_list from ${lm_train_text}"
        # The first symbol in token_list must be "<blank>":
        # 0 is reserved for CTC-blank for ASR and also used as ignore-index in the other task
        python -m funasr.bin.tokenize_text \
            --token_type "${token_type}" \
            --input "${lm_train_text}" \
            --output "${token_list}" \
            --non_linguistic_symbols "${nlsyms_txt}" \
            --field 2- \
            --cleaner "${cleaner}" \
            --g2p "${g2p}" \
            --write_vocabulary true \
            --add_symbol "${blank}:0" \
            --add_symbol "${sos}:1" \
            --add_symbol "${eos}:2" \
            --add_symbol "${oov}:-1"
    else
        echo "Error: not supported --token_type '${token_type}'"
        exit 2
    fi
    # (A commented-out word-level LM token-list branch was removed here;
    #  see git history if word-LM support is ever needed.)
    lm_token_list="${token_list}"
fi

# Stage 1: collect shape statistics needed for batch-bins batching.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: Data preparation"

    # 1. Split the key file
    _logdir="${exp_dir}/exp/${model_dir}/log"
    mkdir -p "${_logdir}"
    # Get the minimum number among ${nj} and the number of lines of input files
    _nj=$(min "${nj}" "$(<${lm_train_text} wc -l)" "$(<${lm_dev_text} wc -l)")

    key_file="${lm_train_text}"
    split_scps=""
    for n in $(seq "${_nj}"); do
        split_scps+=" ${_logdir}/train.${n}.scp"
    done
    # shellcheck disable=SC2086
    utils/split_scp.pl "${key_file}" ${split_scps}

    key_file="${lm_dev_text}"
    split_scps=""
    for n in $(seq "${_nj}"); do
        split_scps+=" ${_logdir}/dev.${n}.scp"
    done
    # shellcheck disable=SC2086
    utils/split_scp.pl "${key_file}" ${split_scps}

    # 2. Submit jobs (one stats-collection job per split)
    ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \
        python -m funasr.bin.lm_train \
            --collect_stats true \
            --use_preprocessor true \
            --token_type "${token_type}" \
            --token_list "${lm_token_list}" \
            --non_linguistic_symbols "${nlsyms_txt}" \
            --cleaner "${cleaner}" \
            --g2p "${g2p}" \
            --train_data_path_and_name_and_type "${lm_train_text},text,text" \
            --valid_data_path_and_name_and_type "${lm_dev_text},text,text" \
            --train_shape_file "${_logdir}/train.JOB.scp" \
            --valid_shape_file "${_logdir}/dev.JOB.scp" \
            --output_dir "${_logdir}/stats.JOB" \
            --config ${lm_config} || { cat "${_logdir}"/stats.*.log; exit 1; }

    # 3. Aggregate shape files
    _opts=
    for i in $(seq "${_nj}"); do
        _opts+="--input_dir ${_logdir}/stats.${i} "
    done
    lm_stats_dir=${exp_dir}/exp/${model_dir}
    # shellcheck disable=SC2086
    python -m funasr.bin.aggregate_stats_dirs ${_opts} --output_dir "${lm_stats_dir}"

    # Append the num-tokens at the last dimensions. This is used for batch-bins count
    <"${lm_stats_dir}/train/text_shape" \
        awk -v N="$(<${lm_token_list} wc -l)" '{ print $0 "," N }' \
        >"${lm_stats_dir}/train/text_shape.${token_type}"
    <"${lm_stats_dir}/valid/text_shape" \
        awk -v N="$(<${lm_token_list} wc -l)" '{ print $0 "," N }' \
        >"${lm_stats_dir}/valid/text_shape.${token_type}"

    train_shape_file=${lm_stats_dir}/train/text_shape.${token_type}
    valid_shape_file=${lm_stats_dir}/valid/text_shape.${token_type}
fi

# Training Stage
world_size=$gpu_num # run on one machine
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "stage 2: Training"
    mkdir -p ${lm_exp}
    mkdir -p ${lm_exp}/log
    # DDP rendezvous via a shared file; remove any stale one first.
    INIT_FILE=${lm_exp}/ddp_init
    if [ -f $INIT_FILE ]; then rm -f $INIT_FILE; fi
    init_method=file://$(readlink -f $INIT_FILE)
    echo "$0: init method is $init_method"
    # Launch one training process per GPU.
    for ((i = 0; i < $gpu_num; ++i)); do
        {
            rank=$i
            local_rank=$i
            # NOTE: $((i+1)) replaces the deprecated $[...] arithmetic form.
            gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i+1)))
            python ../../../funasr/bin/lm_train.py \
                --gpu_id ${gpu_id} \
                --use_preprocessor true \
                --token_type "${token_type}" \
                --token_list "${lm_token_list}" \
                --non_linguistic_symbols "${nlsyms_txt}" \
                --cleaner "${cleaner}" \
                --train_data_path_and_name_and_type "${train_data_path_and_name_and_type}" \
                --train_shape_file "${train_shape_file}" \
                --valid_data_path_and_name_and_type "${valid_data_path_and_name_and_type}" \
                --valid_shape_file "${valid_shape_file}" \
                --fold_length "${lm_fold_length}" \
                --resume true \
                --output_dir "${lm_exp}" \
                --config ${lm_config} \
                --ngpu ${gpu_num} \
                --num_worker_count ${count} \
                --multiprocessing_distributed true \
                --dist_init_method ${init_method} \
                --dist_world_size ${world_size} \
                --dist_rank ${rank} \
                --local_rank ${local_rank} 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
        } &
    done
    wait
fi

# Testing Stage
gpu_num=1
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "Stage 3: Calc perplexity: ${lm_test_text}"
    python ../../../funasr/bin/lm_inference.py \
        --output_dir "${lm_exp}/perplexity_test" \
        --ngpu "${gpu_num}" \
        --batch_size 1 \
        --train_config "${lm_exp}"/config.yaml \
        --model_file "${lm_exp}/${inference_lm}" \
        --data_path_and_name_and_type "${lm_test_text},text,text" \
        --num_workers 1 \
        --split_with_space false
fi
# ---- file: egs/aishell2/transformerLM/utils/parse_options.sh (content follows) ----
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
#                 Arnab Ghoshal, Karel Vesely

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).

###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###

# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
  if [ "${!argpos}" == "--config" ]; then
    argpos_plus1=$((argpos+1))
    config=${!argpos_plus1}
    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
    . $config # source the config file.
  fi
done

###
### Now we process the command line options
###
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    # If the enclosing script is called with --help option, print the help
    # message and exit.  Scripts should put help messages in $help_message
    --help|-h)
      if [ -z "$help_message" ]; then echo "No help found." 1>&2;
      else printf "$help_message\n" 1>&2; fi;
      exit 0 ;;
    --*=*)
      echo "$0: options to scripts must be of the form --name value, got '$1'"
      exit 1 ;;
    # If the first command-line argument begins with "--" (e.g. --foo-bar),
    # then work out the variable name as $name, which will equal "foo_bar".
    --*)
      name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
      # Next we test whether the variable in question is undefned-- if so it's
      # an invalid option and we die.  Note: $0 evaluates to the name of the
      # enclosing script.
      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
      # is undefined.  We then have to wrap this test inside "eval" because
      # foo_bar is itself inside a variable ($name).
      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;

      oldval="`eval echo \\$$name`";
      # Work out whether we seem to be expecting a Boolean argument.
      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval $name=\"$2\";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;
    *) break;
  esac
done

# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;

true; # so this script returns exit code 0.
# ---- file: egs/aishell2/transformerLM/utils/run.pl (content follows) ----
#!/usr/bin/env perl
# ---- file: egs/aishell2/transformerLM/utils/run.pl ----
use warnings; #sed replacement for -w perl parameter

# In general, doing
#  run.pl some.log a b c is like running the command a b c in
# the bash shell, and putting the standard error and output into some.log.
# To run parallel jobs (backgrounded on the host machine), you can do (e.g.)
#  run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB
# and putting it in some.JOB.log, for each one. [Note: JOB can be any identifier].
# If any of the jobs fails, this script will fail.

# A typical example is:
#  run.pl some.log my-prog "--opt=foo bar" foo \|  other-prog baz
# and run.pl will run something like:
# ( my-prog '--opt=foo bar' foo |  other-prog baz ) >& some.log
#
# Basically it takes the command-line arguments, quotes them
# as necessary to preserve spaces, and evaluates them with bash.
# In addition it puts the command line at the top of the log, and
# the start and end times of the command at the beginning and end.
# The reason why this is useful is so that we can create a different
# version of this program that uses a queueing system instead.

#use Data::Dumper;

@ARGV < 2 && die "usage: run.pl log-file command-line arguments...";

#print STDERR "COMMAND-LINE: " . Dumper(\@ARGV) . "\n";

$job_pick = 'all';
$max_jobs_run = -1;
$jobstart = 1;
$jobend = 1;
$ignored_opts = ""; # These will be ignored.

# First parse an option like JOB=1:4, and any
# options that would normally be given to
# queue.pl, which we will just discard.

for (my $x = 1; $x <= 2; $x++) { # This for-loop is to
  # allow the JOB=1:n option to be interleaved with the
  # options to qsub.
  while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) {
    # parse any options that would normally go to qsub, but which will be ignored here.
    my $switch = shift @ARGV;
    if ($switch eq "-V") {
      $ignored_opts .= "-V ";
    } elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") {
      # we do support the option --max-jobs-run n, and its GridEngine form -tc n.
      # if the command appears multiple times uses the smallest option.
      if ( $max_jobs_run <= 0 ) {
        $max_jobs_run = shift @ARGV;
      } else {
        my $new_constraint = shift @ARGV;
        if ( ($new_constraint < $max_jobs_run) ) {
          $max_jobs_run = $new_constraint;
        }
      }
      if (! ($max_jobs_run > 0)) {
        die "run.pl: invalid option --max-jobs-run $max_jobs_run";
      }
    } else {
      my $argument = shift @ARGV;
      if ($argument =~ m/^--/) {
        print STDERR "run.pl: WARNING: suspicious argument '$argument' to $switch; starts with '-'\n";
      }
      if ($switch eq "-sync" && $argument =~ m/^[yY]/) {
        $ignored_opts .= "-sync "; # Note: in the
        # corresponding code in queue.pl it says instead, just "$sync = 1;".
      } elsif ($switch eq "-pe") { # e.g. -pe smp 5
        my $argument2 = shift @ARGV;
        $ignored_opts .= "$switch $argument $argument2 ";
      } elsif ($switch eq "--gpu") {
        $using_gpu = $argument;
      } elsif ($switch eq "--pick") {
        if($argument =~ m/^(all|failed|incomplete)$/) {
          $job_pick = $argument;
        } else {
          print STDERR "run.pl: ERROR: --pick argument must be one of 'all', 'failed' or 'incomplete'"
        }
      } else { # Ignore option.
        $ignored_opts .= "$switch $argument ";
      }
    }
  }
  if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20
    $jobname = $1;
    $jobstart = $2;
    $jobend = $3;
    if ($jobstart > $jobend) {
      die "run.pl: invalid job range $ARGV[0]";
    }
    if ($jobstart <= 0) {
      die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility).";
    }
    shift;
  } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1.
    $jobname = $1;
    $jobstart = $2;
    $jobend = $2;
    shift;
  } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
    print STDERR "run.pl: Warning: suspicious first argument to run.pl: $ARGV[0]\n";
  }
}

# Users found this message confusing so we are removing it.
# if ($ignored_opts ne "") {
#   print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n";
# }

if ($max_jobs_run == -1) { # If --max-jobs-run option not set,
  # then work out the number of processors if possible,
  # and set it based on that.
  $max_jobs_run = 0;
  if ($using_gpu) {
    # One job per visible GPU.
    if (open(P, "nvidia-smi -L |")) {
      $max_jobs_run++ while (<P>);
      close(P);
    }
    if ($max_jobs_run == 0) {
      $max_jobs_run = 1;
      print STDERR "run.pl: Warning: failed to detect number of GPUs from nvidia-smi, using ${max_jobs_run}\n";
    }
  } elsif (open(P, "</proc/cpuinfo")) { # Linux
    while (<P>) { if (m/^processor/) { $max_jobs_run++; } }
    if ($max_jobs_run == 0) {
      print STDERR "run.pl: Warning: failed to detect any processors from /proc/cpuinfo\n";
      $max_jobs_run = 10; # reasonable default.
    }
    close(P);
  } elsif (open(P, "sysctl -a |")) { # BSD/Darwin
    while (<P>) {
      if (m/hw\.ncpu\s*[:=]\s*(\d+)/) { # hw.ncpu = 4, or hw.ncpu: 4
        $max_jobs_run = $1;
        last;
      }
    }
    close(P);
    if ($max_jobs_run == 0) {
      print STDERR "run.pl: Warning: failed to detect any processors from sysctl -a\n";
      $max_jobs_run = 10; # reasonable default.
    }
  } else {
    # allow at most 32 jobs at once, on non-UNIX systems; change this code
    # if you need to change this default.
    $max_jobs_run = 32;
  }
  # The just-computed value of $max_jobs_run is just the number of processors
  # (or our best guess); and if it happens that the number of jobs we need to
  # run is just slightly above $max_jobs_run, it will make sense to increase
  # $max_jobs_run to equal the number of jobs, so we don't have a small number
  # of leftover jobs.
  $num_jobs = $jobend - $jobstart + 1;
  if (!$using_gpu &&
      $num_jobs > $max_jobs_run &&
      $num_jobs < 1.4 * $max_jobs_run) {
    $max_jobs_run = $num_jobs;
  }
}

sub pick_or_exit {
  # pick_or_exit ( $logfile )
  # Invoked before each job is started helps to run jobs selectively.
  #
  # Given the name of the output logfile decides whether the job must be
  # executed (by returning from the subroutine) or not (by terminating the
  # process calling exit)
  #
  # PRE: $job_pick is a global variable set by command line switch --pick
  # and indicates which class of jobs must be executed.
  #
  # 1) If a failed job is not executed the process exit code will indicate
  #    failure, just as if the task was just executed and failed.
  #
  # 2) If a task is incomplete it will be executed. Incomplete may be either
  #    a job whose log file does not contain the accounting notes in the end,
  #    or a job whose log file does not exist.
  #
  # 3) If the $job_pick is set to 'all' (default behavior) a task will be
  #    executed regardless of the result of previous attempts.
  #
  # This logic could have been implemented in the main execution loop
  # but a subroutine to preserve the current level of readability of
  # that part of the code.
  #
  # Alexandre Felipe, (o.alexandre.felipe@gmail.com) 14th of August of 2020
  #
  if($job_pick eq 'all'){
    return; # no need to bother with the previous log
  }
  open my $fh, "<", $_[0] or return; # job not executed yet
  my $log_line;
  my $cur_line;
  # Remember only the LAST "# Ended (code ...)" line in the log.
  while ($cur_line = <$fh>) {
    if( $cur_line =~ m/# Ended \(code .*/ ) {
      $log_line = $cur_line;
    }
  }
  close $fh;
  if (! defined($log_line)){
    return; # incomplete
  }
  if ( $log_line =~ m/# Ended \(code 0\).*/ ) {
    exit(0); # complete
  } elsif ( $log_line =~ m/# Ended \(code \d+(; signal \d+)?\).*/ ){
    if ($job_pick !~ m/^(failed|all)$/) {
      exit(1); # failed but not going to run
    } else {
      return; # failed
    }
  } elsif ( $log_line =~ m/.*\S.*/ ) {
    return; # incomplete jobs are always run
  }
}

$logfile = shift @ARGV;

if (defined $jobname && $logfile !~ m/$jobname/ &&
    $jobend > $jobstart) {
  print STDERR "run.pl: you are trying to run a parallel job but "
    . "you are putting the output into just one log file ($logfile)\n";
  exit(1);
}

$cmd = "";

# Re-quote the command so it survives the round-trip through bash below.
foreach $x (@ARGV) {
  if ($x =~ m/^\S+$/) { $cmd .= $x . " "; }
  elsif ($x =~ m:\":) { $cmd .= "'$x' "; }
  else { $cmd .= "\"$x\" "; }
}

#$Data::Dumper::Indent=0;
$ret = 0;
$numfail = 0;
%active_pids=();

use POSIX ":sys_wait_h";
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  if (scalar(keys %active_pids) >= $max_jobs_run) {

    # Lets wait for a change in any child's status
    # Then we have to work out which child finished
    $r = waitpid(-1, 0);
    $code = $?;
    if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen.
    if ( defined $active_pids{$r} ) {
      $jid=$active_pids{$r};
      $fail[$jid]=$code;
      if ($code !=0) { $numfail++;}
      delete $active_pids{$r};
      # print STDERR "Finished: $r/$jid " .  Dumper(\%active_pids) . "\n";
    } else {
      die "run.pl: Cannot find the PID of the child process that just finished.";
    }

    # In theory we could do a non-blocking waitpid over all jobs running just
    # to find out if only one or more jobs finished during the previous waitpid()
    # However, we just omit this and will reap the next one in the next pass
    # through the for(;;)  cycle
  }
  $childpid = fork();
  if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; }
  if ($childpid == 0) { # We're in the child... this branch
    # executes the job and returns (possibly with an error status).
    if (defined $jobname) {
      $cmd =~ s/$jobname/$jobid/g;
      $logfile =~ s/$jobname/$jobid/g;
    }
    # exit if the job does not need to be executed
    pick_or_exit( $logfile );
    system("mkdir -p `dirname $logfile` 2>/dev/null");
    open(F, ">$logfile") || die "run.pl: Error opening log file $logfile";
    print F "# " . $cmd . "\n";
    print F "# Started at " . `date`;
    $starttime = `date +'%s'`;
    print F "#\n";
    close(F);

    # Pipe into bash.. make sure we're not using any other shell.
    open(B, "|bash") || die "run.pl: Error opening shell command";
    print B "( " . $cmd . ") 2>>$logfile >> $logfile";
    close(B);                   # If there was an error, exit status is in $?
    $ret = $?;

    $lowbits = $ret & 127;
    $highbits = $ret >> 8;
    if ($lowbits != 0) { $return_str = "code $highbits; signal $lowbits" }
    else { $return_str = "code $highbits"; }

    $endtime = `date +'%s'`;
    open(F, ">>$logfile") || die "run.pl: Error opening log file $logfile (again)";
    $enddate = `date`;
    chop $enddate;
    print F "# Accounting: time=" . ($endtime - $starttime) . " threads=1\n";
    print F "# Ended ($return_str) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n";
    close(F);
    exit($ret == 0 ? 0 : 1);
  } else {
    $pid[$jobid] = $childpid;
    $active_pids{$childpid} = $jobid;
    # print STDERR "Queued: " .  Dumper(\%active_pids) . "\n";
  }
}

# Now we have submitted all the jobs, lets wait until all the jobs finish
foreach $child (keys %active_pids) {
  $jobid=$active_pids{$child};
  $r = waitpid($pid[$jobid], 0);
  $code = $?;
  if ($r == -1) { die "run.pl: Error waiting for child process"; } # should never happen.
  if ($r != 0) { $fail[$jobid]=$code; $numfail++ if $code!=0; } # Completed successfully
}

# Some sanity checks:
# The $fail array should not contain undefined codes
# The number of non-zeros in that array  should be equal to $numfail
# We cannot do foreach() here, as the JOB ids do not start at zero
$failed_jids=0;
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  $job_return = $fail[$jobid];
  if (not defined $job_return ) {
    # print Dumper(\@fail);
    die "run.pl: Sanity check failed: we have indication that some jobs are running " .
      "even after we waited for all jobs to finish" ;
  }
  if ($job_return != 0 ){ $failed_jids++;}
}
if ($failed_jids != $numfail) {
  die "run.pl: Sanity check failed: cannot find out how many jobs failed ($failed_jids x $numfail)."
}
if ($numfail > 0) { $ret = 1; }

if ($ret != 0) {
  $njobs = $jobend - $jobstart + 1;
  if ($njobs == 1) {
    if (defined $jobname) {
      $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with
                                         # that job.
    }
    print STDERR "run.pl: job failed, log is in $logfile\n";
    if ($logfile =~ m/JOB/) {
      print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script.";
    }
  } else {
    $logfile =~ s/$jobname/*/g;
    print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n";
  }
}

exit ($ret);
# ---- file: egs/aishell2/transformerLM/utils/split_scp.pl (content follows) ----
#!/usr/bin/env perl

# Copyright 2010-2011 Microsoft Corporation

# See ../../COPYING for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This program splits up any kind of .scp or archive-type file.
# If there is no utt2spk option it will work on any text file and
# will split it up with an approximately equal number of lines in
# each but.
# With the --utt2spk option it will work on anything that has the
# utterance-id as the first entry on each line; the utt2spk file is
# of the form "utterance speaker" (on each line).
# It splits it into equal size chunks as far as it can. If you use the utt2spk
# option it will make sure these chunks coincide with speaker boundaries. In
# this case, if there are more chunks than speakers (and in some other
# circumstances), some of the resulting chunks will be empty and it will print
# an error message and exit with nonzero status.
# You will normally call this like:
# split_scp.pl scp scp.1 scp.2 scp.3 ...
# or
# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
# Note that you can use this script to split the utt2spk file itself,
# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...

# You can also call the scripts like:
# split_scp.pl -j 3 0 scp scp.0
# [note: with this option, it assumes zero-based indexing of the split parts,
# i.e. the second number must be 0 <= n < num-jobs.]

use warnings;

$num_jobs = 0;
$job_id = 0;
$utt2spk_file = "";
$one_based = 0;

# Options may appear in any order within the first three argument slots.
for ($x = 1; $x <= 3 && @ARGV > 0; $x++) {
  if ($ARGV[0] eq "-j") {
    shift @ARGV;
    $num_jobs = shift @ARGV;
    $job_id = shift @ARGV;
  }
  if ($ARGV[0] =~ /--utt2spk=(.+)/) {
    $utt2spk_file=$1;
    shift;
  }
  if ($ARGV[0] eq '--one-based') {
    $one_based = 1;
    shift @ARGV;
  }
}

if ($num_jobs != 0 && ($num_jobs < 0 || $job_id - $one_based < 0 ||
                       $job_id - $one_based >= $num_jobs)) {
  die "$0: Invalid job number/index values for '-j $num_jobs $job_id" .
      ($one_based ? " --one-based" : "") . "'\n"
}

# Internally job ids are always zero-based.
$one_based
  and $job_id--;

if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
  # NOTE: fixed typo in the original usage text ("<-" -> "<=").
  die
"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...
 ... or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]
 ... where 0 <= job-id < num-jobs, or 1 <= job-id <= num-jobs if --one-based.\n";
}

$error = 0;
$inscp = shift @ARGV;
if ($num_jobs == 0) { # without -j option
  @OUTPUTS = @ARGV;
} else {
  # With -j, only the selected split goes to a real file (or stdout);
  # every other split is discarded via /dev/null.
  for ($j = 0; $j < $num_jobs; $j++) {
    if ($j == $job_id) {
      if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
      else { push @OUTPUTS, "-"; }
    } else {
      push @OUTPUTS, "/dev/null";
    }
  }
}

if ($utt2spk_file ne "") {  # We have the --utt2spk option...
  open($u_fh, '<', $utt2spk_file) || die "$0: Error opening utt2spk file $utt2spk_file: $!\n";
  while(<$u_fh>) {
    @A = split;
    @A == 2 || die "$0: Bad line $_ in utt2spk file $utt2spk_file\n";
    ($u,$s) = @A;
    $utt2spk{$u} = $s;
  }
  close $u_fh;
  open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
  @spkrs = ();
  while(<$i_fh>) {
    @A = split;
    if(@A == 0) { die "$0: Empty or space-only line in scp file $inscp\n"; }
    $u = $A[0];
    $s = $utt2spk{$u};
    defined $s || die "$0: No utterance $u in utt2spk file $utt2spk_file\n";
    if(!defined $spk_count{$s}) {
      push @spkrs, $s;
      $spk_count{$s} = 0;
      $spk_data{$s} = [];  # ref to new empty array.
    }
    $spk_count{$s}++;
    push @{$spk_data{$s}}, $_;
  }
  # Now split as equally as possible ..
  # First allocate spks to files by allocating an approximately
  # equal number of speakers.
  $numspks = @spkrs;   # number of speakers.
  $numscps = @OUTPUTS; # number of output files.
  if ($numspks < $numscps) {
    die "$0: Refusing to split data because number of speakers $numspks " .
        "is less than the number of output .scp files $numscps\n";
  }
  for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
    $scparray[$scpidx] = []; # [] is array reference.
  }
  for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
    $scpidx = int(($spkidx*$numscps) / $numspks);
    $spk = $spkrs[$spkidx];
    push @{$scparray[$scpidx]}, $spk;
    $scpcount[$scpidx] += $spk_count{$spk};
  }

  # Now will try to reassign beginning + ending speakers
  # to different scp's and see if it gets more balanced.
  # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
  # We can show that if considering changing just 2 scp's, we minimize
  # this by minimizing the squared difference in sizes.  This is
  # equivalent to minimizing the absolute difference in sizes.  This
  # shows this method is bound to converge.
  $changed = 1;
  while($changed) {
    $changed = 0;
    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
      # First try to reassign ending spk of this scp.
      if($scpidx < $numscps-1) {
        $sz = @{$scparray[$scpidx]};
        if($sz > 0) {
          $spk = $scparray[$scpidx]->[$sz-1];
          $count = $spk_count{$spk};
          $nutt1 = $scpcount[$scpidx];
          $nutt2 = $scpcount[$scpidx+1];
          if( abs( ($nutt2+$count) - ($nutt1-$count))
              < abs($nutt2 - $nutt1))  { # Would decrease
            # size-diff by reassigning spk...
            $scpcount[$scpidx+1] += $count;
            $scpcount[$scpidx] -= $count;
            pop @{$scparray[$scpidx]};
            unshift @{$scparray[$scpidx+1]}, $spk;
            $changed = 1;
          }
        }
      }
      if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
        $spk = $scparray[$scpidx]->[0];
        $count = $spk_count{$spk};
        $nutt1 = $scpcount[$scpidx-1];
        $nutt2 = $scpcount[$scpidx];
        if( abs( ($nutt2-$count) - ($nutt1+$count))
            < abs($nutt2 - $nutt1))  { # Would decrease
          # size-diff by reassigning spk...
          $scpcount[$scpidx-1] += $count;
          $scpcount[$scpidx] -= $count;
          shift @{$scparray[$scpidx]};
          push @{$scparray[$scpidx-1]}, $spk;
          $changed = 1;
        }
      }
    }
  }
  # Now print out the files...
  for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
    $scpfile = $OUTPUTS[$scpidx];
    ($scpfile ne '-' ? open($f_fh, '>', $scpfile)
                     : open($f_fh, '>&', \*STDOUT)) ||
      die "$0: Could not open scp file $scpfile for writing: $!\n";
    $count = 0;
    if(@{$scparray[$scpidx]} == 0) {
      # NOTE: fixed typo in the original message ("eError" -> "error").
      print STDERR "$0: error: split_scp.pl producing empty .scp file " .
                   "$scpfile (too many splits and too few speakers?)\n";
      $error = 1;
    } else {
      foreach $spk ( @{$scparray[$scpidx]} ) {
        print $f_fh @{$spk_data{$spk}};
        $count += $spk_count{$spk};
      }
      $count == $scpcount[$scpidx] || die "Count mismatch [code error]";
    }
    close($f_fh);
  }
} else {
  # This block is the "normal" case where there is no --utt2spk
  # option and we just break into equal size chunks.
  open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
  $numscps = @OUTPUTS;  # size of array.
  @F = ();
  while(<$i_fh>) {
    push @F, $_;
  }
  $numlines = @F;
  if($numlines == 0) {
    print STDERR "$0: error: empty input scp file $inscp\n";
    $error = 1;
  }
  $linesperscp = int( $numlines / $numscps); # the "whole part"..
  $linesperscp >= 1 || die "$0: You are splitting into too many pieces! [reduce \$nj ($numscps) to be smaller than the number of lines ($numlines) in $inscp]\n";
  $remainder = $numlines - ($linesperscp * $numscps);
  ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder";
  # [just doing int() rounds down].
  $n = 0;
  for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
    $scpfile = $OUTPUTS[$scpidx];
    ($scpfile ne '-' ? open($o_fh, '>', $scpfile)
                     : open($o_fh, '>&', \*STDOUT)) ||
      die "$0: Could not open scp file $scpfile for writing: $!\n";
    # The first $remainder splits each take one extra line.
    for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) {
      print $o_fh $F[$n++];
    }
    # NOTE: fixed typo in the original message ("Eror" -> "Error").
    close($o_fh) || die "$0: Error closing scp file $scpfile: $!\n";
  }
  $n == $numlines || die "$n != $numlines [code error]";
}

exit ($error);