From 837dc3758a5364fd720bb44f497c82aebe4f7dab Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Tue, 21 Feb 2023 16:36:15 +0800
Subject: [PATCH] Merge branch 'main' of github.com:alibaba-damo-academy/FunASR add
---
egs/aishell2/transformerLM/conf/train_lm_transformer.yaml | 31 +
egs/aishell2/transformerLM/path.sh | 6
egs/aishell2/transformerLM/run.sh | 245 ++++++++++++++
egs/aishell2/transformerLM/utils/run.pl | 356 ++++++++++++++++++++
egs/aishell2/transformerLM/utils/split_scp.pl | 246 ++++++++++++++
egs/aishell2/transformerLM/utils/parse_options.sh | 97 +++++
6 files changed, 981 insertions(+), 0 deletions(-)
diff --git a/egs/aishell2/transformerLM/conf/train_lm_transformer.yaml b/egs/aishell2/transformerLM/conf/train_lm_transformer.yaml
new file mode 100644
index 0000000..c256295
--- /dev/null
+++ b/egs/aishell2/transformerLM/conf/train_lm_transformer.yaml
@@ -0,0 +1,31 @@
+lm: transformer
+lm_conf:
+ pos_enc: null
+ embed_unit: 128
+ att_unit: 512
+ head: 8
+ unit: 2048
+ layer: 16
+ dropout_rate: 0.1
+
+# optimization related
+grad_clip: 5.0
+batch_type: numel
+batch_bins: 6000000
+accum_grad: 1
+max_epoch: 15 # 15 epochs is enough
+
+optim: adam
+optim_conf:
+ lr: 0.001
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 25000
+
+best_model_criterion:
+- - valid
+ - loss
+ - min
+keep_nbest_models: 10 # 10 is good.
+
+log_interval: 50
diff --git a/egs/aishell2/transformerLM/path.sh b/egs/aishell2/transformerLM/path.sh
new file mode 100755
index 0000000..ea3c0be
--- /dev/null
+++ b/egs/aishell2/transformerLM/path.sh
@@ -0,0 +1,6 @@
+export FUNASR_DIR=$PWD/../../..
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=../../../:$PYTHONPATH
+export PATH=$FUNASR_DIR/funasr/bin:$PATH
diff --git a/egs/aishell2/transformerLM/run.sh b/egs/aishell2/transformerLM/run.sh
new file mode 100755
index 0000000..28e3762
--- /dev/null
+++ b/egs/aishell2/transformerLM/run.sh
@@ -0,0 +1,245 @@
+#!/usr/bin/env bash
+
+. ./path.sh || exit 1;
+
+# machines configuration
+CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=2
+count=1
+train_cmd=utils/run.pl
+infer_cmd=utils/run.pl
+
+# general configuration
+lang=zh
+nlsyms_txt=none # Non-linguistic symbol list if existing.
+cleaner=none # Text cleaner.
+g2p=none # g2p method (needed if token_type=phn).
+lm_fold_length=150 # fold_length for LM training.
+word_vocab_size=10000 # Size of word vocabulary.
+token_type=char
+lm_token_list=
+
+nj=10
+## path to AISHELL2 trans
+lm_train_text=
+lm_dev_text=
+lm_test_text=
+
+train_data_path_and_name_and_type=${lm_train_text},text,text
+train_shape_file=
+valid_data_path_and_name_and_type=${lm_dev_text},text,text
+valid_shape_file=
+lm_config=conf/train_lm_transformer.yaml
+exp_dir=./data
+tag=exp1
+model_dir="baseline_$(basename "${lm_config}" .yaml)_${lang}_${token_type}_${tag}"
+lm_exp=${exp_dir}/exp/${model_dir}
+inference_lm=valid.loss.ave.pth # Language model path for decoding.
+
+stage=0
+stop_stage=3
+
+. utils/parse_options.sh || exit 1;
+
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+set -u
+set -o pipefail
+
+min() {
+ local a b
+ a=$1
+ for b in "$@"; do
+ if [ "${b}" -le "${a}" ]; then
+ a="${b}"
+ fi
+ done
+ echo "${a}"
+}
+
+# you can set gpu num for decoding here
+gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
+ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
+
+mkdir -p ${exp_dir}/exp/${model_dir}
+token_list=${exp_dir}/exp/${model_dir}/vocab.txt
+blank="<blank>" # CTC blank symbol
+sos="<s>" # sos symbol
+eos="</s>" # eos symbol
+oov="<unk>" # Out of vocabulary symbol.
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ if [ "${token_type}" = char ] || [ "${token_type}" = word ]; then
+ echo "Stage 0: Generate character level token_list from ${lm_train_text}"
+
+ # The first symbol in token_list must be "<blank>":
+ # 0 is reserved for CTC-blank for ASR and also used as ignore-index in the other task
+ python -m funasr.bin.tokenize_text \
+ --token_type "${token_type}" \
+ --input "${lm_train_text}" \
+ --output "${token_list}" \
+ --non_linguistic_symbols "${nlsyms_txt}" \
+ --field 2- \
+ --cleaner "${cleaner}" \
+ --g2p "${g2p}" \
+ --write_vocabulary true \
+ --add_symbol "${blank}:0" \
+ --add_symbol "${sos}:1" \
+ --add_symbol "${eos}:2" \
+ --add_symbol "${oov}:-1"
+
+ else
+ echo "Error: not supported --token_type '${token_type}'"
+ exit 2
+ fi
+
+ ## use_word_lm=false
+ ## # Create word-list for word-LM training
+ ## if ${use_word_lm} && [ "${token_type}" != word ]; then
+ ## echo "Generate word level token_list from ${lm_train_text}"
+ ## python -m funasr.bin.tokenize_text \
+ ## --token_type word \
+ ## --input "${lm_train_text}" \
+ ## --output "${token_list}" \
+ ## --field 2- \
+ ## --cleaner "${cleaner}" \
+ ## --g2p "${g2p}" \
+ ## --write_vocabulary true \
+ ## --vocabulary_size "${word_vocab_size}" \
+ ## --add_symbol "${blank}:0" \
+ ## --add_symbol "${sos}:1" \
+ ## --add_symbol "${eos}:2" \
+ ## --add_symbol "${oov}:-1"
+ ## fi
+
+ lm_token_list="${token_list}"
+
+fi
+
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ echo "stage 1: Data preparation"
+
+ # 1. Split the key file
+ _logdir="${exp_dir}/exp/${model_dir}/log"
+ mkdir -p "${_logdir}"
+ # Get the minimum number among ${nj} and the number lines of input files
+ _nj=$(min "${nj}" "$(<${lm_train_text} wc -l)" "$(<${lm_dev_text} wc -l)")
+
+ key_file="${lm_train_text}"
+ split_scps=""
+ for n in $(seq ${_nj}); do
+ split_scps+=" ${_logdir}/train.${n}.scp"
+ done
+ # shellcheck disable=SC2086
+ utils/split_scp.pl "${key_file}" ${split_scps}
+
+ key_file="${lm_dev_text}"
+ split_scps=""
+ for n in $(seq ${_nj}); do
+ split_scps+=" ${_logdir}/dev.${n}.scp"
+ done
+ # shellcheck disable=SC2086
+ utils/split_scp.pl "${key_file}" ${split_scps}
+
+ # 2. Submit jobs
+ ## python ../../funasr/bin/lm_train.py \
+ ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \
+ python -m funasr.bin.lm_train \
+ --collect_stats true \
+ --use_preprocessor true \
+ --token_type "${token_type}" \
+ --token_list "${lm_token_list}" \
+ --non_linguistic_symbols "${nlsyms_txt}" \
+ --cleaner "${cleaner}" \
+ --g2p "${g2p}" \
+ --train_data_path_and_name_and_type "${lm_train_text},text,text" \
+ --valid_data_path_and_name_and_type "${lm_dev_text},text,text" \
+ --train_shape_file "${_logdir}/train.JOB.scp" \
+ --valid_shape_file "${_logdir}/dev.JOB.scp" \
+ --output_dir "${_logdir}/stats.JOB" \
+ --config ${lm_config} || { cat "${_logdir}"/stats.*.log; exit 1; }
+
+ # 3. Aggregate shape files
+ _opts=
+ for i in $(seq "${_nj}"); do
+ _opts+="--input_dir ${_logdir}/stats.${i} "
+ done
+ lm_stats_dir=${exp_dir}/exp/${model_dir}
+ # shellcheck disable=SC2086
+ python -m funasr.bin.aggregate_stats_dirs ${_opts} --output_dir "${lm_stats_dir}"
+
+ # Append the num-tokens at the last dimensions. This is used for batch-bins count
+ <"${lm_stats_dir}/train/text_shape" \
+ awk -v N="$(<${lm_token_list} wc -l)" '{ print $0 "," N }' \
+ >"${lm_stats_dir}/train/text_shape.${token_type}"
+
+ <"${lm_stats_dir}/valid/text_shape" \
+ awk -v N="$(<${lm_token_list} wc -l)" '{ print $0 "," N }' \
+ >"${lm_stats_dir}/valid/text_shape.${token_type}"
+
+ train_shape_file=${lm_stats_dir}/train/text_shape.${token_type}
+ valid_shape_file=${lm_stats_dir}/valid/text_shape.${token_type}
+
+fi
+
+# Training Stage
+world_size=$gpu_num # run on one machine
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ echo "stage 2: Training"
+ mkdir -p ${lm_exp}
+ mkdir -p ${lm_exp}/log
+ INIT_FILE=${lm_exp}/ddp_init
+ if [ -f $INIT_FILE ];then
+ rm -f $INIT_FILE
+ fi
+ init_method=file://$(readlink -f $INIT_FILE)
+ echo "$0: init method is $init_method"
+ for ((i = 0; i < $gpu_num; ++i)); do
+ {
+ rank=$i
+ local_rank=$i
+ gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
+ python ../../../funasr/bin/lm_train.py \
+ --gpu_id ${gpu_id} \
+ --use_preprocessor true \
+ --token_type "${token_type}" \
+ --token_list "${lm_token_list}" \
+ --non_linguistic_symbols "${nlsyms_txt}" \
+ --cleaner "${cleaner}" \
+ --train_data_path_and_name_and_type "${train_data_path_and_name_and_type}" \
+ --train_shape_file "${train_shape_file}" \
+ --valid_data_path_and_name_and_type "${valid_data_path_and_name_and_type}" \
+ --valid_shape_file "${valid_shape_file}" \
+ --fold_length "${lm_fold_length}" \
+ --resume true \
+ --output_dir "${lm_exp}" \
+ --config ${lm_config} \
+ --ngpu ${gpu_num} \
+ --num_worker_count ${count} \
+ --multiprocessing_distributed true \
+ --dist_init_method ${init_method} \
+ --dist_world_size ${world_size} \
+ --dist_rank ${rank} \
+ --local_rank ${local_rank} 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
+ } &
+ done
+ wait
+fi
+
+# Testing Stage
+gpu_num=1
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ echo "Stage 3: Calc perplexity: ${lm_test_text}"
+
+ python ../../../funasr/bin/lm_inference.py \
+ --output_dir "${lm_exp}/perplexity_test" \
+ --ngpu "${gpu_num}" \
+ --batch_size 1 \
+ --train_config "${lm_exp}"/config.yaml \
+ --model_file "${lm_exp}/${inference_lm}" \
+ --data_path_and_name_and_type "${lm_test_text},text,text" \
+ --num_workers 1 \
+ --split_with_space false
+fi
+
diff --git a/egs/aishell2/transformerLM/utils/parse_options.sh b/egs/aishell2/transformerLM/utils/parse_options.sh
new file mode 100755
index 0000000..71fb9e5
--- /dev/null
+++ b/egs/aishell2/transformerLM/utils/parse_options.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+
+# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
+# Arnab Ghoshal, Karel Vesely
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Parse command-line options.
+# To be sourced by another script (as in ". parse_options.sh").
+# Option format is: --option-name arg
+# and shell variable "option_name" gets set to value "arg."
+# The exception is --help, which takes no arguments, but prints the
+# $help_message variable (if defined).
+
+
+###
+### The --config file options have lower priority to command line
+### options, so we need to import them first...
+###
+
+# Now import all the configs specified by command-line, in left-to-right order
+for ((argpos=1; argpos<$#; argpos++)); do
+ if [ "${!argpos}" == "--config" ]; then
+ argpos_plus1=$((argpos+1))
+ config=${!argpos_plus1}
+ [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
+ . $config # source the config file.
+ fi
+done
+
+
+###
+### Now we process the command line options
+###
+while true; do
+ [ -z "${1:-}" ] && break; # break if there are no arguments
+ case "$1" in
+ # If the enclosing script is called with --help option, print the help
+ # message and exit. Scripts should put help messages in $help_message
+ --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
+ else printf "$help_message\n" 1>&2 ; fi;
+ exit 0 ;;
+ --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
+ exit 1 ;;
+ # If the first command-line argument begins with "--" (e.g. --foo-bar),
+ # then work out the variable name as $name, which will equal "foo_bar".
+ --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
+            # Next we test whether the variable in question is undefined-- if so it's
+ # an invalid option and we die. Note: $0 evaluates to the name of the
+ # enclosing script.
+ # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
+ # is undefined. We then have to wrap this test inside "eval" because
+ # foo_bar is itself inside a variable ($name).
+ eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+
+ oldval="`eval echo \\$$name`";
+ # Work out whether we seem to be expecting a Boolean argument.
+ if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
+ was_bool=true;
+ else
+ was_bool=false;
+ fi
+
+ # Set the variable to the right value-- the escaped quotes make it work if
+ # the option had spaces, like --cmd "queue.pl -sync y"
+ eval $name=\"$2\";
+
+ # Check that Boolean-valued arguments are really Boolean.
+ if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+ echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+ exit 1;
+ fi
+ shift 2;
+ ;;
+ *) break;
+ esac
+done
+
+
+# Check for an empty argument to the --cmd option, which can easily occur as a
+# result of scripting errors.
+[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
+
+
+true; # so this script returns exit code 0.
diff --git a/egs/aishell2/transformerLM/utils/run.pl b/egs/aishell2/transformerLM/utils/run.pl
new file mode 100755
index 0000000..483f95b
--- /dev/null
+++ b/egs/aishell2/transformerLM/utils/run.pl
@@ -0,0 +1,356 @@
+#!/usr/bin/env perl
+use warnings; #sed replacement for -w perl parameter
+# In general, doing
+# run.pl some.log a b c is like running the command a b c in
+# the bash shell, and putting the standard error and output into some.log.
+# To run parallel jobs (backgrounded on the host machine), you can do (e.g.)
+# run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB
+# and putting it in some.JOB.log, for each one. [Note: JOB can be any identifier].
+# If any of the jobs fails, this script will fail.
+
+# A typical example is:
+# run.pl some.log my-prog "--opt=foo bar" foo \| other-prog baz
+# and run.pl will run something like:
+# ( my-prog '--opt=foo bar' foo | other-prog baz ) >& some.log
+#
+# Basically it takes the command-line arguments, quotes them
+# as necessary to preserve spaces, and evaluates them with bash.
+# In addition it puts the command line at the top of the log, and
+# the start and end times of the command at the beginning and end.
+# The reason why this is useful is so that we can create a different
+# version of this program that uses a queueing system instead.
+
+#use Data::Dumper;
+
+@ARGV < 2 && die "usage: run.pl log-file command-line arguments...";
+
+#print STDERR "COMMAND-LINE: " . Dumper(\@ARGV) . "\n";
+$job_pick = 'all';
+$max_jobs_run = -1;
+$jobstart = 1;
+$jobend = 1;
+$ignored_opts = ""; # These will be ignored.
+
+# First parse an option like JOB=1:4, and any
+# options that would normally be given to
+# queue.pl, which we will just discard.
+
+for (my $x = 1; $x <= 2; $x++) { # This for-loop is to
+ # allow the JOB=1:n option to be interleaved with the
+ # options to qsub.
+ while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) {
+ # parse any options that would normally go to qsub, but which will be ignored here.
+ my $switch = shift @ARGV;
+ if ($switch eq "-V") {
+ $ignored_opts .= "-V ";
+ } elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") {
+ # we do support the option --max-jobs-run n, and its GridEngine form -tc n.
+ # if the command appears multiple times uses the smallest option.
+ if ( $max_jobs_run <= 0 ) {
+ $max_jobs_run = shift @ARGV;
+ } else {
+ my $new_constraint = shift @ARGV;
+ if ( ($new_constraint < $max_jobs_run) ) {
+ $max_jobs_run = $new_constraint;
+ }
+ }
+
+ if (! ($max_jobs_run > 0)) {
+ die "run.pl: invalid option --max-jobs-run $max_jobs_run";
+ }
+ } else {
+ my $argument = shift @ARGV;
+ if ($argument =~ m/^--/) {
+ print STDERR "run.pl: WARNING: suspicious argument '$argument' to $switch; starts with '-'\n";
+ }
+ if ($switch eq "-sync" && $argument =~ m/^[yY]/) {
+ $ignored_opts .= "-sync "; # Note: in the
+ # corresponding code in queue.pl it says instead, just "$sync = 1;".
+ } elsif ($switch eq "-pe") { # e.g. -pe smp 5
+ my $argument2 = shift @ARGV;
+ $ignored_opts .= "$switch $argument $argument2 ";
+ } elsif ($switch eq "--gpu") {
+ $using_gpu = $argument;
+ } elsif ($switch eq "--pick") {
+ if($argument =~ m/^(all|failed|incomplete)$/) {
+ $job_pick = $argument;
+ } else {
+ print STDERR "run.pl: ERROR: --pick argument must be one of 'all', 'failed' or 'incomplete'"
+ }
+ } else {
+ # Ignore option.
+ $ignored_opts .= "$switch $argument ";
+ }
+ }
+ }
+ if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20
+ $jobname = $1;
+ $jobstart = $2;
+ $jobend = $3;
+ if ($jobstart > $jobend) {
+ die "run.pl: invalid job range $ARGV[0]";
+ }
+ if ($jobstart <= 0) {
+ die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility).";
+ }
+ shift;
+ } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1.
+ $jobname = $1;
+ $jobstart = $2;
+ $jobend = $2;
+ shift;
+ } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
+ print STDERR "run.pl: Warning: suspicious first argument to run.pl: $ARGV[0]\n";
+ }
+}
+
+# Users found this message confusing so we are removing it.
+# if ($ignored_opts ne "") {
+# print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n";
+# }
+
+if ($max_jobs_run == -1) { # If --max-jobs-run option not set,
+ # then work out the number of processors if possible,
+ # and set it based on that.
+ $max_jobs_run = 0;
+ if ($using_gpu) {
+ if (open(P, "nvidia-smi -L |")) {
+ $max_jobs_run++ while (<P>);
+ close(P);
+ }
+ if ($max_jobs_run == 0) {
+ $max_jobs_run = 1;
+ print STDERR "run.pl: Warning: failed to detect number of GPUs from nvidia-smi, using ${max_jobs_run}\n";
+ }
+ } elsif (open(P, "</proc/cpuinfo")) { # Linux
+ while (<P>) { if (m/^processor/) { $max_jobs_run++; } }
+ if ($max_jobs_run == 0) {
+ print STDERR "run.pl: Warning: failed to detect any processors from /proc/cpuinfo\n";
+ $max_jobs_run = 10; # reasonable default.
+ }
+ close(P);
+ } elsif (open(P, "sysctl -a |")) { # BSD/Darwin
+ while (<P>) {
+ if (m/hw\.ncpu\s*[:=]\s*(\d+)/) { # hw.ncpu = 4, or hw.ncpu: 4
+ $max_jobs_run = $1;
+ last;
+ }
+ }
+ close(P);
+ if ($max_jobs_run == 0) {
+ print STDERR "run.pl: Warning: failed to detect any processors from sysctl -a\n";
+ $max_jobs_run = 10; # reasonable default.
+ }
+ } else {
+ # allow at most 32 jobs at once, on non-UNIX systems; change this code
+ # if you need to change this default.
+ $max_jobs_run = 32;
+ }
+ # The just-computed value of $max_jobs_run is just the number of processors
+ # (or our best guess); and if it happens that the number of jobs we need to
+ # run is just slightly above $max_jobs_run, it will make sense to increase
+ # $max_jobs_run to equal the number of jobs, so we don't have a small number
+ # of leftover jobs.
+ $num_jobs = $jobend - $jobstart + 1;
+ if (!$using_gpu &&
+ $num_jobs > $max_jobs_run && $num_jobs < 1.4 * $max_jobs_run) {
+ $max_jobs_run = $num_jobs;
+ }
+}
+
+sub pick_or_exit {
+ # pick_or_exit ( $logfile )
+ # Invoked before each job is started helps to run jobs selectively.
+ #
+ # Given the name of the output logfile decides whether the job must be
+ # executed (by returning from the subroutine) or not (by terminating the
+ # process calling exit)
+ #
+ # PRE: $job_pick is a global variable set by command line switch --pick
+ # and indicates which class of jobs must be executed.
+ #
+ # 1) If a failed job is not executed the process exit code will indicate
+ # failure, just as if the task was just executed and failed.
+ #
+ # 2) If a task is incomplete it will be executed. Incomplete may be either
+ # a job whose log file does not contain the accounting notes in the end,
+ # or a job whose log file does not exist.
+ #
+ # 3) If the $job_pick is set to 'all' (default behavior) a task will be
+ # executed regardless of the result of previous attempts.
+ #
+  # This logic could have been implemented in the main execution loop,
+  # but a subroutine is used to preserve the current level of readability
+  # of that part of the code.
+ #
+ # Alexandre Felipe, (o.alexandre.felipe@gmail.com) 14th of August of 2020
+ #
+ if($job_pick eq 'all'){
+ return; # no need to bother with the previous log
+ }
+ open my $fh, "<", $_[0] or return; # job not executed yet
+ my $log_line;
+ my $cur_line;
+ while ($cur_line = <$fh>) {
+ if( $cur_line =~ m/# Ended \(code .*/ ) {
+ $log_line = $cur_line;
+ }
+ }
+ close $fh;
+ if (! defined($log_line)){
+ return; # incomplete
+ }
+ if ( $log_line =~ m/# Ended \(code 0\).*/ ) {
+ exit(0); # complete
+ } elsif ( $log_line =~ m/# Ended \(code \d+(; signal \d+)?\).*/ ){
+ if ($job_pick !~ m/^(failed|all)$/) {
+ exit(1); # failed but not going to run
+ } else {
+ return; # failed
+ }
+ } elsif ( $log_line =~ m/.*\S.*/ ) {
+ return; # incomplete jobs are always run
+ }
+}
+
+
+$logfile = shift @ARGV;
+
+if (defined $jobname && $logfile !~ m/$jobname/ &&
+ $jobend > $jobstart) {
+ print STDERR "run.pl: you are trying to run a parallel job but "
+ . "you are putting the output into just one log file ($logfile)\n";
+ exit(1);
+}
+
+$cmd = "";
+
+foreach $x (@ARGV) {
+ if ($x =~ m/^\S+$/) { $cmd .= $x . " "; }
+ elsif ($x =~ m:\":) { $cmd .= "'$x' "; }
+ else { $cmd .= "\"$x\" "; }
+}
+
+#$Data::Dumper::Indent=0;
+$ret = 0;
+$numfail = 0;
+%active_pids=();
+
+use POSIX ":sys_wait_h";
+for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
+ if (scalar(keys %active_pids) >= $max_jobs_run) {
+
+ # Lets wait for a change in any child's status
+ # Then we have to work out which child finished
+ $r = waitpid(-1, 0);
+ $code = $?;
+ if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen.
+ if ( defined $active_pids{$r} ) {
+ $jid=$active_pids{$r};
+ $fail[$jid]=$code;
+ if ($code !=0) { $numfail++;}
+ delete $active_pids{$r};
+ # print STDERR "Finished: $r/$jid " . Dumper(\%active_pids) . "\n";
+ } else {
+ die "run.pl: Cannot find the PID of the child process that just finished.";
+ }
+
+ # In theory we could do a non-blocking waitpid over all jobs running just
+ # to find out if only one or more jobs finished during the previous waitpid()
+ # However, we just omit this and will reap the next one in the next pass
+ # through the for(;;) cycle
+ }
+ $childpid = fork();
+ if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; }
+ if ($childpid == 0) { # We're in the child... this branch
+ # executes the job and returns (possibly with an error status).
+ if (defined $jobname) {
+ $cmd =~ s/$jobname/$jobid/g;
+ $logfile =~ s/$jobname/$jobid/g;
+ }
+ # exit if the job does not need to be executed
+ pick_or_exit( $logfile );
+
+ system("mkdir -p `dirname $logfile` 2>/dev/null");
+ open(F, ">$logfile") || die "run.pl: Error opening log file $logfile";
+ print F "# " . $cmd . "\n";
+ print F "# Started at " . `date`;
+ $starttime = `date +'%s'`;
+ print F "#\n";
+ close(F);
+
+ # Pipe into bash.. make sure we're not using any other shell.
+ open(B, "|bash") || die "run.pl: Error opening shell command";
+ print B "( " . $cmd . ") 2>>$logfile >> $logfile";
+ close(B); # If there was an error, exit status is in $?
+ $ret = $?;
+
+ $lowbits = $ret & 127;
+ $highbits = $ret >> 8;
+ if ($lowbits != 0) { $return_str = "code $highbits; signal $lowbits" }
+ else { $return_str = "code $highbits"; }
+
+ $endtime = `date +'%s'`;
+ open(F, ">>$logfile") || die "run.pl: Error opening log file $logfile (again)";
+ $enddate = `date`;
+ chop $enddate;
+ print F "# Accounting: time=" . ($endtime - $starttime) . " threads=1\n";
+ print F "# Ended ($return_str) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n";
+ close(F);
+ exit($ret == 0 ? 0 : 1);
+ } else {
+ $pid[$jobid] = $childpid;
+ $active_pids{$childpid} = $jobid;
+ # print STDERR "Queued: " . Dumper(\%active_pids) . "\n";
+ }
+}
+
+# Now we have submitted all the jobs, lets wait until all the jobs finish
+foreach $child (keys %active_pids) {
+ $jobid=$active_pids{$child};
+ $r = waitpid($pid[$jobid], 0);
+ $code = $?;
+ if ($r == -1) { die "run.pl: Error waiting for child process"; } # should never happen.
+ if ($r != 0) { $fail[$jobid]=$code; $numfail++ if $code!=0; } # Completed successfully
+}
+
+# Some sanity checks:
+# The $fail array should not contain undefined codes
+# The number of non-zeros in that array should be equal to $numfail
+# We cannot do foreach() here, as the JOB ids do not start at zero
+$failed_jids=0;
+for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
+ $job_return = $fail[$jobid];
+ if (not defined $job_return ) {
+ # print Dumper(\@fail);
+
+ die "run.pl: Sanity check failed: we have indication that some jobs are running " .
+ "even after we waited for all jobs to finish" ;
+ }
+ if ($job_return != 0 ){ $failed_jids++;}
+}
+if ($failed_jids != $numfail) {
+ die "run.pl: Sanity check failed: cannot find out how many jobs failed ($failed_jids x $numfail)."
+}
+if ($numfail > 0) { $ret = 1; }
+
+if ($ret != 0) {
+ $njobs = $jobend - $jobstart + 1;
+ if ($njobs == 1) {
+ if (defined $jobname) {
+ $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with
+ # that job.
+ }
+ print STDERR "run.pl: job failed, log is in $logfile\n";
+ if ($logfile =~ m/JOB/) {
+ print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script.";
+ }
+ }
+ else {
+ $logfile =~ s/$jobname/*/g;
+ print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n";
+ }
+}
+
+
+exit ($ret);
diff --git a/egs/aishell2/transformerLM/utils/split_scp.pl b/egs/aishell2/transformerLM/utils/split_scp.pl
new file mode 100755
index 0000000..0876dcb
--- /dev/null
+++ b/egs/aishell2/transformerLM/utils/split_scp.pl
@@ -0,0 +1,246 @@
+#!/usr/bin/env perl
+
+# Copyright 2010-2011 Microsoft Corporation
+
+# See ../../COPYING for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This program splits up any kind of .scp or archive-type file.
+# If there is no utt2spk option it will work on any text file and
+# will split it up with an approximately equal number of lines in
+# each one.
+# With the --utt2spk option it will work on anything that has the
+# utterance-id as the first entry on each line; the utt2spk file is
+# of the form "utterance speaker" (on each line).
+# It splits it into equal size chunks as far as it can. If you use the utt2spk
+# option it will make sure these chunks coincide with speaker boundaries. In
+# this case, if there are more chunks than speakers (and in some other
+# circumstances), some of the resulting chunks will be empty and it will print
+# an error message and exit with nonzero status.
+# You will normally call this like:
+# split_scp.pl scp scp.1 scp.2 scp.3 ...
+# or
+# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
+# Note that you can use this script to split the utt2spk file itself,
+# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...
+
+# You can also call the scripts like:
+# split_scp.pl -j 3 0 scp scp.0
+# [note: with this option, it assumes zero-based indexing of the split parts,
+# i.e. the second number must be 0 <= n < num-jobs.]
+
+use warnings;
+
+$num_jobs = 0;
+$job_id = 0;
+$utt2spk_file = "";
+$one_based = 0;
+
+for ($x = 1; $x <= 3 && @ARGV > 0; $x++) {
+ if ($ARGV[0] eq "-j") {
+ shift @ARGV;
+ $num_jobs = shift @ARGV;
+ $job_id = shift @ARGV;
+ }
+ if ($ARGV[0] =~ /--utt2spk=(.+)/) {
+ $utt2spk_file=$1;
+ shift;
+ }
+ if ($ARGV[0] eq '--one-based') {
+ $one_based = 1;
+ shift @ARGV;
+ }
+}
+
+if ($num_jobs != 0 && ($num_jobs < 0 || $job_id - $one_based < 0 ||
+ $job_id - $one_based >= $num_jobs)) {
+ die "$0: Invalid job number/index values for '-j $num_jobs $job_id" .
+ ($one_based ? " --one-based" : "") . "'\n"
+}
+
+$one_based
+ and $job_id--;
+
+if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
+ die
+"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...
+ or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]
+ ... where 0 <= job-id < num-jobs, or 1 <= job-id <- num-jobs if --one-based.\n";
+}
+
+$error = 0;
+$inscp = shift @ARGV;
+if ($num_jobs == 0) { # without -j option
+ @OUTPUTS = @ARGV;
+} else {
+ for ($j = 0; $j < $num_jobs; $j++) {
+ if ($j == $job_id) {
+ if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
+ else { push @OUTPUTS, "-"; }
+ } else {
+ push @OUTPUTS, "/dev/null";
+ }
+ }
+}
+
+if ($utt2spk_file ne "") { # We have the --utt2spk option...
+ open($u_fh, '<', $utt2spk_file) || die "$0: Error opening utt2spk file $utt2spk_file: $!\n";
+ while(<$u_fh>) {
+ @A = split;
+ @A == 2 || die "$0: Bad line $_ in utt2spk file $utt2spk_file\n";
+ ($u,$s) = @A;
+ $utt2spk{$u} = $s;
+ }
+ close $u_fh;
+ open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
+ @spkrs = ();
+ while(<$i_fh>) {
+ @A = split;
+ if(@A == 0) { die "$0: Empty or space-only line in scp file $inscp\n"; }
+ $u = $A[0];
+ $s = $utt2spk{$u};
+ defined $s || die "$0: No utterance $u in utt2spk file $utt2spk_file\n";
+ if(!defined $spk_count{$s}) {
+ push @spkrs, $s;
+ $spk_count{$s} = 0;
+ $spk_data{$s} = []; # ref to new empty array.
+ }
+ $spk_count{$s}++;
+ push @{$spk_data{$s}}, $_;
+ }
+ # Now split as equally as possible ..
+ # First allocate spks to files by allocating an approximately
+ # equal number of speakers.
+ $numspks = @spkrs; # number of speakers.
+ $numscps = @OUTPUTS; # number of output files.
+ if ($numspks < $numscps) {
+ die "$0: Refusing to split data because number of speakers $numspks " .
+ "is less than the number of output .scp files $numscps\n";
+ }
+ for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+ $scparray[$scpidx] = []; # [] is array reference.
+ }
+ for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
+ $scpidx = int(($spkidx*$numscps) / $numspks);
+ $spk = $spkrs[$spkidx];
+ push @{$scparray[$scpidx]}, $spk;
+ $scpcount[$scpidx] += $spk_count{$spk};
+ }
+
+ # Now will try to reassign beginning + ending speakers
+ # to different scp's and see if it gets more balanced.
+ # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
+ # We can show that if considering changing just 2 scp's, we minimize
+ # this by minimizing the squared difference in sizes. This is
+ # equivalent to minimizing the absolute difference in sizes. This
+ # shows this method is bound to converge.
+
+ $changed = 1;
+ while($changed) {
+ $changed = 0;
+ for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+ # First try to reassign ending spk of this scp.
+ if($scpidx < $numscps-1) {
+ $sz = @{$scparray[$scpidx]};
+ if($sz > 0) {
+ $spk = $scparray[$scpidx]->[$sz-1];
+ $count = $spk_count{$spk};
+ $nutt1 = $scpcount[$scpidx];
+ $nutt2 = $scpcount[$scpidx+1];
+ if( abs( ($nutt2+$count) - ($nutt1-$count))
+ < abs($nutt2 - $nutt1)) { # Would decrease
+ # size-diff by reassigning spk...
+ $scpcount[$scpidx+1] += $count;
+ $scpcount[$scpidx] -= $count;
+ pop @{$scparray[$scpidx]};
+ unshift @{$scparray[$scpidx+1]}, $spk;
+ $changed = 1;
+ }
+ }
+ }
+ if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
+ $spk = $scparray[$scpidx]->[0];
+ $count = $spk_count{$spk};
+ $nutt1 = $scpcount[$scpidx-1];
+ $nutt2 = $scpcount[$scpidx];
+ if( abs( ($nutt2-$count) - ($nutt1+$count))
+ < abs($nutt2 - $nutt1)) { # Would decrease
+ # size-diff by reassigning spk...
+ $scpcount[$scpidx-1] += $count;
+ $scpcount[$scpidx] -= $count;
+ shift @{$scparray[$scpidx]};
+ push @{$scparray[$scpidx-1]}, $spk;
+ $changed = 1;
+ }
+ }
+ }
+ }
+ # Now print out the files...
+ for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+ $scpfile = $OUTPUTS[$scpidx];
+ ($scpfile ne '-' ? open($f_fh, '>', $scpfile)
+ : open($f_fh, '>&', \*STDOUT)) ||
+ die "$0: Could not open scp file $scpfile for writing: $!\n";
+ $count = 0;
+ if(@{$scparray[$scpidx]} == 0) {
+ print STDERR "$0: eError: split_scp.pl producing empty .scp file " .
+ "$scpfile (too many splits and too few speakers?)\n";
+ $error = 1;
+ } else {
+ foreach $spk ( @{$scparray[$scpidx]} ) {
+ print $f_fh @{$spk_data{$spk}};
+ $count += $spk_count{$spk};
+ }
+ $count == $scpcount[$scpidx] || die "Count mismatch [code error]";
+ }
+ close($f_fh);
+ }
+} else {
+ # This block is the "normal" case where there is no --utt2spk
+ # option and we just break into equal size chunks.
+
+ open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
+
+ $numscps = @OUTPUTS; # size of array.
+ @F = ();
+ while(<$i_fh>) {
+ push @F, $_;
+ }
+ $numlines = @F;
+ if($numlines == 0) {
+ print STDERR "$0: error: empty input scp file $inscp\n";
+ $error = 1;
+ }
+ $linesperscp = int( $numlines / $numscps); # the "whole part"..
+ $linesperscp >= 1 || die "$0: You are splitting into too many pieces! [reduce \$nj ($numscps) to be smaller than the number of lines ($numlines) in $inscp]\n";
+ $remainder = $numlines - ($linesperscp * $numscps);
+ ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder";
+ # [just doing int() rounds down].
+ $n = 0;
+ for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
+ $scpfile = $OUTPUTS[$scpidx];
+ ($scpfile ne '-' ? open($o_fh, '>', $scpfile)
+ : open($o_fh, '>&', \*STDOUT)) ||
+ die "$0: Could not open scp file $scpfile for writing: $!\n";
+ for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) {
+ print $o_fh $F[$n++];
+ }
+ close($o_fh) || die "$0: Eror closing scp file $scpfile: $!\n";
+ }
+ $n == $numlines || die "$n != $numlines [code error]";
+}
+
+exit ($error);
--
Gitblit v1.9.1