Changed files in this patch:
  egs/aishell2/transformerLM/conf/train_lm_transformer.yaml
  egs/aishell2/transformerLM/path.sh
  egs/aishell2/transformerLM/run.sh
  egs/aishell2/transformerLM/utils/parse_options.sh
  egs/aishell2/transformerLM/utils/run.pl
  egs/aishell2/transformerLM/utils/split_scp.pl
# ---- file: egs/aishell2/transformerLM/conf/train_lm_transformer.yaml ----
# Transformer language-model training configuration.

lm: transformer
lm_conf:
    pos_enc: null
    embed_unit: 128
    att_unit: 512
    head: 8
    unit: 2048
    layer: 16
    dropout_rate: 0.1

# optimization related
grad_clip: 5.0
batch_type: numel
batch_bins: 6000000
accum_grad: 1
max_epoch: 15       # 15 epochs are enough
optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000

best_model_criterion:
-   - valid
    - loss
    - min
keep_nbest_models: 10   # averaging the 10 best checkpoints works well
log_interval: 50

# ---- file: egs/aishell2/transformerLM/path.sh (content follows) ----
# Environment setup for the transformerLM recipe: locate the FunASR
# checkout three levels up and put its code on PYTHONPATH/PATH.
export FUNASR_DIR="${PWD}/../../.."

# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH="../../../:${PYTHONPATH}"
export PATH="${FUNASR_DIR}/funasr/bin:${PATH}"
# ---- file: egs/aishell2/transformerLM/run.sh (content follows) ----
#!/usr/bin/env bash
# ---- file: egs/aishell2/transformerLM/run.sh ----
# Transformer LM training / perplexity evaluation recipe for AISHELL-2.

. ./path.sh || exit 1;

# machines configuration
CUDA_VISIBLE_DEVICES="0,1"
gpu_num=2
count=1
train_cmd=utils/run.pl
infer_cmd=utils/run.pl

# general configuration
lang=zh
nlsyms_txt=none       # Non-linguistic symbol list if existing.
cleaner=none          # Text cleaner.
g2p=none              # g2p method (needed if token_type=phn).
lm_fold_length=150    # fold_length for LM training.
word_vocab_size=10000 # Size of word vocabulary.
token_type=char
lm_token_list=
nj=10

## path to AISHELL2 transcripts (must be provided, e.g. --lm_train_text <path>)
lm_train_text=
lm_dev_text=
lm_test_text=

# The following are *derived* from the options above.  They are declared
# here (so parse_options.sh accepts explicit overrides) but their defaults
# are filled in only AFTER option parsing; computing them before
# parse_options.sh ran would freeze the empty initial values and silently
# ignore e.g. "--lm_train_text foo" or "--tag bar" (bug in the original).
train_data_path_and_name_and_type=
train_shape_file=
valid_data_path_and_name_and_type=
valid_shape_file=
model_dir=
lm_exp=

lm_config=conf/train_lm_transformer.yaml
exp_dir=./data
tag=exp1
inference_lm=valid.loss.ave.pth # Language model path for decoding.

stage=0
stop_stage=3

. utils/parse_options.sh || exit 1;

# Fill in derived defaults now that all options have their final values.
[ -z "${train_data_path_and_name_and_type}" ] && train_data_path_and_name_and_type="${lm_train_text},text,text"
[ -z "${valid_data_path_and_name_and_type}" ] && valid_data_path_and_name_and_type="${lm_dev_text},text,text"
[ -z "${model_dir}" ] && model_dir="baseline_$(basename "${lm_config}" .yaml)_${lang}_${token_type}_${tag}"
[ -z "${lm_exp}" ] && lm_exp=${exp_dir}/exp/${model_dir}

# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline'
set -e
set -u
set -o pipefail

# Echo the smallest of its (integer) arguments.
min() {
    local a b
    a=$1
    for b in "$@"; do
        if [ "${b}" -le "${a}" ]; then
            a="${b}"
        fi
    done
    echo "${a}"
}

# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
ngpu=$(echo "$gpuid_list" | awk -F "," '{print NF}')

mkdir -p "${exp_dir}/exp/${model_dir}"
token_list=${exp_dir}/exp/${model_dir}/vocab.txt

blank="<blank>" # CTC blank symbol
sos="<s>"       # start-of-sentence symbol
eos="</s>"      # end-of-sentence symbol
oov="<unk>"     # Out of vocabulary symbol.
# Stage 0: build the token list (vocabulary) from the training transcript.
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    if [ "${token_type}" = char ] || [ "${token_type}" = word ]; then
        echo "Stage 0: Generate character level token_list from ${lm_train_text}"
        # The first symbol in token_list must be "<blank>":
        # 0 is reserved for CTC-blank for ASR and also used as ignore-index in the other task
        python -m funasr.bin.tokenize_text \
            --token_type "${token_type}" \
            --input "${lm_train_text}" \
            --output "${token_list}" \
            --non_linguistic_symbols "${nlsyms_txt}" \
            --field 2- \
            --cleaner "${cleaner}" \
            --g2p "${g2p}" \
            --write_vocabulary true \
            --add_symbol "${blank}:0" \
            --add_symbol "${sos}:1" \
            --add_symbol "${eos}:2" \
            --add_symbol "${oov}:-1"
    else
        echo "Error: not supported --token_type '${token_type}'"
        exit 2
    fi
    # (A commented-out word-level LM token-list branch was removed here;
    #  see git history if word-LM support is ever needed.)
    lm_token_list="${token_list}"
fi

# Stage 1: collect shape statistics needed for batch-bins batching.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: Data preparation"

    # 1. Split the key file
    _logdir="${exp_dir}/exp/${model_dir}/log"
    mkdir -p "${_logdir}"
    # Get the minimum number among ${nj} and the number of lines of input files
    _nj=$(min "${nj}" "$(<${lm_train_text} wc -l)" "$(<${lm_dev_text} wc -l)")

    key_file="${lm_train_text}"
    split_scps=""
    for n in $(seq "${_nj}"); do
        split_scps+=" ${_logdir}/train.${n}.scp"
    done
    # shellcheck disable=SC2086
    utils/split_scp.pl "${key_file}" ${split_scps}

    key_file="${lm_dev_text}"
    split_scps=""
    for n in $(seq "${_nj}"); do
        split_scps+=" ${_logdir}/dev.${n}.scp"
    done
    # shellcheck disable=SC2086
    utils/split_scp.pl "${key_file}" ${split_scps}

    # 2. Submit jobs (one stats-collection job per split)
    ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \
        python -m funasr.bin.lm_train \
            --collect_stats true \
            --use_preprocessor true \
            --token_type "${token_type}" \
            --token_list "${lm_token_list}" \
            --non_linguistic_symbols "${nlsyms_txt}" \
            --cleaner "${cleaner}" \
            --g2p "${g2p}" \
            --train_data_path_and_name_and_type "${lm_train_text},text,text" \
            --valid_data_path_and_name_and_type "${lm_dev_text},text,text" \
            --train_shape_file "${_logdir}/train.JOB.scp" \
            --valid_shape_file "${_logdir}/dev.JOB.scp" \
            --output_dir "${_logdir}/stats.JOB" \
            --config ${lm_config} || { cat "${_logdir}"/stats.*.log; exit 1; }

    # 3. Aggregate shape files
    _opts=
    for i in $(seq "${_nj}"); do
        _opts+="--input_dir ${_logdir}/stats.${i} "
    done
    lm_stats_dir=${exp_dir}/exp/${model_dir}
    # shellcheck disable=SC2086
    python -m funasr.bin.aggregate_stats_dirs ${_opts} --output_dir "${lm_stats_dir}"

    # Append the num-tokens at the last dimensions. This is used for batch-bins count
    <"${lm_stats_dir}/train/text_shape" \
        awk -v N="$(<${lm_token_list} wc -l)" '{ print $0 "," N }' \
        >"${lm_stats_dir}/train/text_shape.${token_type}"
    <"${lm_stats_dir}/valid/text_shape" \
        awk -v N="$(<${lm_token_list} wc -l)" '{ print $0 "," N }' \
        >"${lm_stats_dir}/valid/text_shape.${token_type}"

    train_shape_file=${lm_stats_dir}/train/text_shape.${token_type}
    valid_shape_file=${lm_stats_dir}/valid/text_shape.${token_type}
fi

# Training Stage
world_size=$gpu_num # run on one machine
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "stage 2: Training"
    mkdir -p ${lm_exp}
    mkdir -p ${lm_exp}/log
    # DDP rendezvous via a shared file; remove any stale one first.
    INIT_FILE=${lm_exp}/ddp_init
    if [ -f $INIT_FILE ]; then rm -f $INIT_FILE; fi
    init_method=file://$(readlink -f $INIT_FILE)
    echo "$0: init method is $init_method"
    # Launch one training process per GPU.
    for ((i = 0; i < $gpu_num; ++i)); do
        {
            rank=$i
            local_rank=$i
            # NOTE: $((i+1)) replaces the deprecated $[...] arithmetic form.
            gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i+1)))
            python ../../../funasr/bin/lm_train.py \
                --gpu_id ${gpu_id} \
                --use_preprocessor true \
                --token_type "${token_type}" \
                --token_list "${lm_token_list}" \
                --non_linguistic_symbols "${nlsyms_txt}" \
                --cleaner "${cleaner}" \
                --train_data_path_and_name_and_type "${train_data_path_and_name_and_type}" \
                --train_shape_file "${train_shape_file}" \
                --valid_data_path_and_name_and_type "${valid_data_path_and_name_and_type}" \
                --valid_shape_file "${valid_shape_file}" \
                --fold_length "${lm_fold_length}" \
                --resume true \
                --output_dir "${lm_exp}" \
                --config ${lm_config} \
                --ngpu ${gpu_num} \
                --num_worker_count ${count} \
                --multiprocessing_distributed true \
                --dist_init_method ${init_method} \
                --dist_world_size ${world_size} \
                --dist_rank ${rank} \
                --local_rank ${local_rank} 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
        } &
    done
    wait
fi

# Testing Stage
gpu_num=1
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "Stage 3: Calc perplexity: ${lm_test_text}"
    python ../../../funasr/bin/lm_inference.py \
        --output_dir "${lm_exp}/perplexity_test" \
        --ngpu "${gpu_num}" \
        --batch_size 1 \
        --train_config "${lm_exp}"/config.yaml \
        --model_file "${lm_exp}/${inference_lm}" \
        --data_path_and_name_and_type "${lm_test_text},text,text" \
        --num_workers 1 \
        --split_with_space false
fi
# ---- file: egs/aishell2/transformerLM/utils/parse_options.sh (content follows) ----
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
#                 Arnab Ghoshal, Karel Vesely

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).

###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###

# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
  if [ "${!argpos}" == "--config" ]; then
    argpos_plus1=$((argpos+1))
    config=${!argpos_plus1}
    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
    . $config # source the config file.
  fi
done

###
### Now we process the command line options
###
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    # If the enclosing script is called with --help option, print the help
    # message and exit.  Scripts should put help messages in $help_message
    --help|-h)
      if [ -z "$help_message" ]; then echo "No help found." 1>&2;
      else printf "$help_message\n" 1>&2; fi;
      exit 0 ;;
    --*=*)
      echo "$0: options to scripts must be of the form --name value, got '$1'"
      exit 1 ;;
    # If the first command-line argument begins with "--" (e.g. --foo-bar),
    # then work out the variable name as $name, which will equal "foo_bar".
    --*)
      name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
      # Next we test whether the variable in question is undefned-- if so it's
      # an invalid option and we die.  Note: $0 evaluates to the name of the
      # enclosing script.
      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
      # is undefined.  We then have to wrap this test inside "eval" because
      # foo_bar is itself inside a variable ($name).
      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;

      oldval="`eval echo \\$$name`";
      # Work out whether we seem to be expecting a Boolean argument.
      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval $name=\"$2\";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;
    *) break;
  esac
done

# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;

true; # so this script returns exit code 0.
# ---- file: egs/aishell2/transformerLM/utils/run.pl (content follows) ----
#!/usr/bin/env perl
# ---- file: egs/aishell2/transformerLM/utils/run.pl ----
use warnings; #sed replacement for -w perl parameter

# In general, doing
#  run.pl some.log a b c is like running the command a b c in
# the bash shell, and putting the standard error and output into some.log.
# To run parallel jobs (backgrounded on the host machine), you can do (e.g.)
#  run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB
# and putting it in some.JOB.log, for each one. [Note: JOB can be any identifier].
# If any of the jobs fails, this script will fail.

# A typical example is:
#  run.pl some.log my-prog "--opt=foo bar" foo \|  other-prog baz
# and run.pl will run something like:
# ( my-prog '--opt=foo bar' foo |  other-prog baz ) >& some.log
#
# Basically it takes the command-line arguments, quotes them
# as necessary to preserve spaces, and evaluates them with bash.
# In addition it puts the command line at the top of the log, and
# the start and end times of the command at the beginning and end.
# The reason why this is useful is so that we can create a different
# version of this program that uses a queueing system instead.

#use Data::Dumper;

@ARGV < 2 && die "usage: run.pl log-file command-line arguments...";

#print STDERR "COMMAND-LINE: " . Dumper(\@ARGV) . "\n";

$job_pick = 'all';
$max_jobs_run = -1;
$jobstart = 1;
$jobend = 1;
$ignored_opts = ""; # These will be ignored.

# First parse an option like JOB=1:4, and any
# options that would normally be given to
# queue.pl, which we will just discard.

for (my $x = 1; $x <= 2; $x++) { # This for-loop is to
  # allow the JOB=1:n option to be interleaved with the
  # options to qsub.
  while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) {
    # parse any options that would normally go to qsub, but which will be ignored here.
    my $switch = shift @ARGV;
    if ($switch eq "-V") {
      $ignored_opts .= "-V ";
    } elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") {
      # we do support the option --max-jobs-run n, and its GridEngine form -tc n.
      # if the command appears multiple times uses the smallest option.
      if ( $max_jobs_run <= 0 ) {
        $max_jobs_run = shift @ARGV;
      } else {
        my $new_constraint = shift @ARGV;
        if ( ($new_constraint < $max_jobs_run) ) {
          $max_jobs_run = $new_constraint;
        }
      }
      if (! ($max_jobs_run > 0)) {
        die "run.pl: invalid option --max-jobs-run $max_jobs_run";
      }
    } else {
      my $argument = shift @ARGV;
      if ($argument =~ m/^--/) {
        print STDERR "run.pl: WARNING: suspicious argument '$argument' to $switch; starts with '-'\n";
      }
      if ($switch eq "-sync" && $argument =~ m/^[yY]/) {
        $ignored_opts .= "-sync "; # Note: in the
        # corresponding code in queue.pl it says instead, just "$sync = 1;".
      } elsif ($switch eq "-pe") { # e.g. -pe smp 5
        my $argument2 = shift @ARGV;
        $ignored_opts .= "$switch $argument $argument2 ";
      } elsif ($switch eq "--gpu") {
        $using_gpu = $argument;
      } elsif ($switch eq "--pick") {
        if($argument =~ m/^(all|failed|incomplete)$/) {
          $job_pick = $argument;
        } else {
          print STDERR "run.pl: ERROR: --pick argument must be one of 'all', 'failed' or 'incomplete'"
        }
      } else { # Ignore option.
        $ignored_opts .= "$switch $argument ";
      }
    }
  }
  if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20
    $jobname = $1;
    $jobstart = $2;
    $jobend = $3;
    if ($jobstart > $jobend) {
      die "run.pl: invalid job range $ARGV[0]";
    }
    if ($jobstart <= 0) {
      die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility).";
    }
    shift;
  } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1.
    $jobname = $1;
    $jobstart = $2;
    $jobend = $2;
    shift;
  } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
    print STDERR "run.pl: Warning: suspicious first argument to run.pl: $ARGV[0]\n";
  }
}

# Users found this message confusing so we are removing it.
# if ($ignored_opts ne "") {
#   print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n";
# }

if ($max_jobs_run == -1) { # If --max-jobs-run option not set,
  # then work out the number of processors if possible,
  # and set it based on that.
  $max_jobs_run = 0;
  if ($using_gpu) {
    # One job per visible GPU.
    if (open(P, "nvidia-smi -L |")) {
      $max_jobs_run++ while (<P>);
      close(P);
    }
    if ($max_jobs_run == 0) {
      $max_jobs_run = 1;
      print STDERR "run.pl: Warning: failed to detect number of GPUs from nvidia-smi, using ${max_jobs_run}\n";
    }
  } elsif (open(P, "</proc/cpuinfo")) { # Linux
    while (<P>) { if (m/^processor/) { $max_jobs_run++; } }
    if ($max_jobs_run == 0) {
      print STDERR "run.pl: Warning: failed to detect any processors from /proc/cpuinfo\n";
      $max_jobs_run = 10; # reasonable default.
    }
    close(P);
  } elsif (open(P, "sysctl -a |")) { # BSD/Darwin
    while (<P>) {
      if (m/hw\.ncpu\s*[:=]\s*(\d+)/) { # hw.ncpu = 4, or hw.ncpu: 4
        $max_jobs_run = $1;
        last;
      }
    }
    close(P);
    if ($max_jobs_run == 0) {
      print STDERR "run.pl: Warning: failed to detect any processors from sysctl -a\n";
      $max_jobs_run = 10; # reasonable default.
    }
  } else {
    # allow at most 32 jobs at once, on non-UNIX systems; change this code
    # if you need to change this default.
    $max_jobs_run = 32;
  }
  # The just-computed value of $max_jobs_run is just the number of processors
  # (or our best guess); and if it happens that the number of jobs we need to
  # run is just slightly above $max_jobs_run, it will make sense to increase
  # $max_jobs_run to equal the number of jobs, so we don't have a small number
  # of leftover jobs.
  $num_jobs = $jobend - $jobstart + 1;
  if (!$using_gpu &&
      $num_jobs > $max_jobs_run &&
      $num_jobs < 1.4 * $max_jobs_run) {
    $max_jobs_run = $num_jobs;
  }
}

sub pick_or_exit {
  # pick_or_exit ( $logfile )
  # Invoked before each job is started helps to run jobs selectively.
  #
  # Given the name of the output logfile decides whether the job must be
  # executed (by returning from the subroutine) or not (by terminating the
  # process calling exit)
  #
  # PRE: $job_pick is a global variable set by command line switch --pick
  # and indicates which class of jobs must be executed.
  #
  # 1) If a failed job is not executed the process exit code will indicate
  #    failure, just as if the task was just executed and failed.
  #
  # 2) If a task is incomplete it will be executed. Incomplete may be either
  #    a job whose log file does not contain the accounting notes in the end,
  #    or a job whose log file does not exist.
  #
  # 3) If the $job_pick is set to 'all' (default behavior) a task will be
  #    executed regardless of the result of previous attempts.
  #
  # This logic could have been implemented in the main execution loop
  # but a subroutine to preserve the current level of readability of
  # that part of the code.
  #
  # Alexandre Felipe, (o.alexandre.felipe@gmail.com) 14th of August of 2020
  #
  if($job_pick eq 'all'){
    return; # no need to bother with the previous log
  }
  open my $fh, "<", $_[0] or return; # job not executed yet
  my $log_line;
  my $cur_line;
  # Remember only the LAST "# Ended (code ...)" line in the log.
  while ($cur_line = <$fh>) {
    if( $cur_line =~ m/# Ended \(code .*/ ) {
      $log_line = $cur_line;
    }
  }
  close $fh;
  if (! defined($log_line)){
    return; # incomplete
  }
  if ( $log_line =~ m/# Ended \(code 0\).*/ ) {
    exit(0); # complete
  } elsif ( $log_line =~ m/# Ended \(code \d+(; signal \d+)?\).*/ ){
    if ($job_pick !~ m/^(failed|all)$/) {
      exit(1); # failed but not going to run
    } else {
      return; # failed
    }
  } elsif ( $log_line =~ m/.*\S.*/ ) {
    return; # incomplete jobs are always run
  }
}

$logfile = shift @ARGV;

if (defined $jobname && $logfile !~ m/$jobname/ &&
    $jobend > $jobstart) {
  print STDERR "run.pl: you are trying to run a parallel job but "
    . "you are putting the output into just one log file ($logfile)\n";
  exit(1);
}

$cmd = "";

# Re-quote the command so it survives the round-trip through bash below.
foreach $x (@ARGV) {
  if ($x =~ m/^\S+$/) { $cmd .= $x . " "; }
  elsif ($x =~ m:\":) { $cmd .= "'$x' "; }
  else { $cmd .= "\"$x\" "; }
}

#$Data::Dumper::Indent=0;
$ret = 0;
$numfail = 0;
%active_pids=();

use POSIX ":sys_wait_h";
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  if (scalar(keys %active_pids) >= $max_jobs_run) {

    # Lets wait for a change in any child's status
    # Then we have to work out which child finished
    $r = waitpid(-1, 0);
    $code = $?;
    if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen.
    if ( defined $active_pids{$r} ) {
      $jid=$active_pids{$r};
      $fail[$jid]=$code;
      if ($code !=0) { $numfail++;}
      delete $active_pids{$r};
      # print STDERR "Finished: $r/$jid " .  Dumper(\%active_pids) . "\n";
    } else {
      die "run.pl: Cannot find the PID of the child process that just finished.";
    }

    # In theory we could do a non-blocking waitpid over all jobs running just
    # to find out if only one or more jobs finished during the previous waitpid()
    # However, we just omit this and will reap the next one in the next pass
    # through the for(;;)  cycle
  }
  $childpid = fork();
  if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; }
  if ($childpid == 0) { # We're in the child... this branch
    # executes the job and returns (possibly with an error status).
    if (defined $jobname) {
      $cmd =~ s/$jobname/$jobid/g;
      $logfile =~ s/$jobname/$jobid/g;
    }
    # exit if the job does not need to be executed
    pick_or_exit( $logfile );
    system("mkdir -p `dirname $logfile` 2>/dev/null");
    open(F, ">$logfile") || die "run.pl: Error opening log file $logfile";
    print F "# " . $cmd . "\n";
    print F "# Started at " . `date`;
    $starttime = `date +'%s'`;
    print F "#\n";
    close(F);

    # Pipe into bash.. make sure we're not using any other shell.
    open(B, "|bash") || die "run.pl: Error opening shell command";
    print B "( " . $cmd . ") 2>>$logfile >> $logfile";
    close(B);                   # If there was an error, exit status is in $?
    $ret = $?;

    $lowbits = $ret & 127;
    $highbits = $ret >> 8;
    if ($lowbits != 0) { $return_str = "code $highbits; signal $lowbits" }
    else { $return_str = "code $highbits"; }

    $endtime = `date +'%s'`;
    open(F, ">>$logfile") || die "run.pl: Error opening log file $logfile (again)";
    $enddate = `date`;
    chop $enddate;
    print F "# Accounting: time=" . ($endtime - $starttime) . " threads=1\n";
    print F "# Ended ($return_str) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n";
    close(F);
    exit($ret == 0 ? 0 : 1);
  } else {
    $pid[$jobid] = $childpid;
    $active_pids{$childpid} = $jobid;
    # print STDERR "Queued: " .  Dumper(\%active_pids) . "\n";
  }
}

# Now we have submitted all the jobs, lets wait until all the jobs finish
foreach $child (keys %active_pids) {
  $jobid=$active_pids{$child};
  $r = waitpid($pid[$jobid], 0);
  $code = $?;
  if ($r == -1) { die "run.pl: Error waiting for child process"; } # should never happen.
  if ($r != 0) { $fail[$jobid]=$code; $numfail++ if $code!=0; } # Completed successfully
}

# Some sanity checks:
# The $fail array should not contain undefined codes
# The number of non-zeros in that array  should be equal to $numfail
# We cannot do foreach() here, as the JOB ids do not start at zero
$failed_jids=0;
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  $job_return = $fail[$jobid];
  if (not defined $job_return ) {
    # print Dumper(\@fail);
    die "run.pl: Sanity check failed: we have indication that some jobs are running " .
      "even after we waited for all jobs to finish" ;
  }
  if ($job_return != 0 ){ $failed_jids++;}
}
if ($failed_jids != $numfail) {
  die "run.pl: Sanity check failed: cannot find out how many jobs failed ($failed_jids x $numfail)."
}
if ($numfail > 0) { $ret = 1; }

if ($ret != 0) {
  $njobs = $jobend - $jobstart + 1;
  if ($njobs == 1) {
    if (defined $jobname) {
      $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with
                                         # that job.
    }
    print STDERR "run.pl: job failed, log is in $logfile\n";
    if ($logfile =~ m/JOB/) {
      print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script.";
    }
  } else {
    $logfile =~ s/$jobname/*/g;
    print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n";
  }
}

exit ($ret);
# ---- file: egs/aishell2/transformerLM/utils/split_scp.pl (content follows) ----
#!/usr/bin/env perl

# Copyright 2010-2011 Microsoft Corporation

# See ../../COPYING for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This program splits up any kind of .scp or archive-type file.
# If there is no utt2spk option it will work on any text file and
# will split it up with an approximately equal number of lines in
# each but.
# With the --utt2spk option it will work on anything that has the
# utterance-id as the first entry on each line; the utt2spk file is
# of the form "utterance speaker" (on each line).
# It splits it into equal size chunks as far as it can. If you use the utt2spk
# option it will make sure these chunks coincide with speaker boundaries. In
# this case, if there are more chunks than speakers (and in some other
# circumstances), some of the resulting chunks will be empty and it will print
# an error message and exit with nonzero status.
# You will normally call this like:
# split_scp.pl scp scp.1 scp.2 scp.3 ...
# or
# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
# Note that you can use this script to split the utt2spk file itself,
# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...

# You can also call the scripts like:
# split_scp.pl -j 3 0 scp scp.0
# [note: with this option, it assumes zero-based indexing of the split parts,
# i.e. the second number must be 0 <= n < num-jobs.]

use warnings;

$num_jobs = 0;
$job_id = 0;
$utt2spk_file = "";
$one_based = 0;

# Options may appear in any order within the first three argument slots.
for ($x = 1; $x <= 3 && @ARGV > 0; $x++) {
  if ($ARGV[0] eq "-j") {
    shift @ARGV;
    $num_jobs = shift @ARGV;
    $job_id = shift @ARGV;
  }
  if ($ARGV[0] =~ /--utt2spk=(.+)/) {
    $utt2spk_file=$1;
    shift;
  }
  if ($ARGV[0] eq '--one-based') {
    $one_based = 1;
    shift @ARGV;
  }
}

if ($num_jobs != 0 && ($num_jobs < 0 || $job_id - $one_based < 0 ||
                       $job_id - $one_based >= $num_jobs)) {
  die "$0: Invalid job number/index values for '-j $num_jobs $job_id" .
      ($one_based ? " --one-based" : "") . "'\n"
}

# Internally job ids are always zero-based.
$one_based
  and $job_id--;

if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
  # NOTE: fixed typo in the original usage text ("<-" -> "<=").
  die
"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...
 ... or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]
 ... where 0 <= job-id < num-jobs, or 1 <= job-id <= num-jobs if --one-based.\n";
}

$error = 0;
$inscp = shift @ARGV;
if ($num_jobs == 0) { # without -j option
  @OUTPUTS = @ARGV;
} else {
  # With -j, only the selected split goes to a real file (or stdout);
  # every other split is discarded via /dev/null.
  for ($j = 0; $j < $num_jobs; $j++) {
    if ($j == $job_id) {
      if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
      else { push @OUTPUTS, "-"; }
    } else {
      push @OUTPUTS, "/dev/null";
    }
  }
}

if ($utt2spk_file ne "") {  # We have the --utt2spk option...
  open($u_fh, '<', $utt2spk_file) || die "$0: Error opening utt2spk file $utt2spk_file: $!\n";
  while(<$u_fh>) {
    @A = split;
    @A == 2 || die "$0: Bad line $_ in utt2spk file $utt2spk_file\n";
    ($u,$s) = @A;
    $utt2spk{$u} = $s;
  }
  close $u_fh;
  open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
  @spkrs = ();
  while(<$i_fh>) {
    @A = split;
    if(@A == 0) { die "$0: Empty or space-only line in scp file $inscp\n"; }
    $u = $A[0];
    $s = $utt2spk{$u};
    defined $s || die "$0: No utterance $u in utt2spk file $utt2spk_file\n";
    if(!defined $spk_count{$s}) {
      push @spkrs, $s;
      $spk_count{$s} = 0;
      $spk_data{$s} = [];  # ref to new empty array.
    }
    $spk_count{$s}++;
    push @{$spk_data{$s}}, $_;
  }
  # Now split as equally as possible ..
  # First allocate spks to files by allocating an approximately
  # equal number of speakers.
  $numspks = @spkrs;   # number of speakers.
  $numscps = @OUTPUTS; # number of output files.
  if ($numspks < $numscps) {
    die "$0: Refusing to split data because number of speakers $numspks " .
        "is less than the number of output .scp files $numscps\n";
  }
  for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
    $scparray[$scpidx] = []; # [] is array reference.
  }
  for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
    $scpidx = int(($spkidx*$numscps) / $numspks);
    $spk = $spkrs[$spkidx];
    push @{$scparray[$scpidx]}, $spk;
    $scpcount[$scpidx] += $spk_count{$spk};
  }

  # Now will try to reassign beginning + ending speakers
  # to different scp's and see if it gets more balanced.
  # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
  # We can show that if considering changing just 2 scp's, we minimize
  # this by minimizing the squared difference in sizes.  This is
  # equivalent to minimizing the absolute difference in sizes.  This
  # shows this method is bound to converge.
  $changed = 1;
  while($changed) {
    $changed = 0;
    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
      # First try to reassign ending spk of this scp.
      if($scpidx < $numscps-1) {
        $sz = @{$scparray[$scpidx]};
        if($sz > 0) {
          $spk = $scparray[$scpidx]->[$sz-1];
          $count = $spk_count{$spk};
          $nutt1 = $scpcount[$scpidx];
          $nutt2 = $scpcount[$scpidx+1];
          if( abs( ($nutt2+$count) - ($nutt1-$count))
              < abs($nutt2 - $nutt1))  { # Would decrease
            # size-diff by reassigning spk...
            $scpcount[$scpidx+1] += $count;
            $scpcount[$scpidx] -= $count;
            pop @{$scparray[$scpidx]};
            unshift @{$scparray[$scpidx+1]}, $spk;
            $changed = 1;
          }
        }
      }
      if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
        $spk = $scparray[$scpidx]->[0];
        $count = $spk_count{$spk};
        $nutt1 = $scpcount[$scpidx-1];
        $nutt2 = $scpcount[$scpidx];
        if( abs( ($nutt2-$count) - ($nutt1+$count))
            < abs($nutt2 - $nutt1))  { # Would decrease
          # size-diff by reassigning spk...
          $scpcount[$scpidx-1] += $count;
          $scpcount[$scpidx] -= $count;
          shift @{$scparray[$scpidx]};
          push @{$scparray[$scpidx-1]}, $spk;
          $changed = 1;
        }
      }
    }
  }
  # Now print out the files...
  for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
    $scpfile = $OUTPUTS[$scpidx];
    ($scpfile ne '-' ? open($f_fh, '>', $scpfile)
                     : open($f_fh, '>&', \*STDOUT)) ||
      die "$0: Could not open scp file $scpfile for writing: $!\n";
    $count = 0;
    if(@{$scparray[$scpidx]} == 0) {
      # NOTE: fixed typo in the original message ("eError" -> "error").
      print STDERR "$0: error: split_scp.pl producing empty .scp file " .
                   "$scpfile (too many splits and too few speakers?)\n";
      $error = 1;
    } else {
      foreach $spk ( @{$scparray[$scpidx]} ) {
        print $f_fh @{$spk_data{$spk}};
        $count += $spk_count{$spk};
      }
      $count == $scpcount[$scpidx] || die "Count mismatch [code error]";
    }
    close($f_fh);
  }
} else {
  # This block is the "normal" case where there is no --utt2spk
  # option and we just break into equal size chunks.
  open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
  $numscps = @OUTPUTS;  # size of array.
  @F = ();
  while(<$i_fh>) {
    push @F, $_;
  }
  $numlines = @F;
  if($numlines == 0) {
    print STDERR "$0: error: empty input scp file $inscp\n";
    $error = 1;
  }
  $linesperscp = int( $numlines / $numscps); # the "whole part"..
  $linesperscp >= 1 || die "$0: You are splitting into too many pieces! [reduce \$nj ($numscps) to be smaller than the number of lines ($numlines) in $inscp]\n";
  $remainder = $numlines - ($linesperscp * $numscps);
  ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder";
  # [just doing int() rounds down].
  $n = 0;
  for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
    $scpfile = $OUTPUTS[$scpidx];
    ($scpfile ne '-' ? open($o_fh, '>', $scpfile)
                     : open($o_fh, '>&', \*STDOUT)) ||
      die "$0: Could not open scp file $scpfile for writing: $!\n";
    # The first $remainder splits each take one extra line.
    for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) {
      print $o_fh $F[$n++];
    }
    # NOTE: fixed typo in the original message ("Eror" -> "Error").
    close($o_fh) || die "$0: Error closing scp file $scpfile: $!\n";
  }
  $n == $numlines || die "$n != $numlines [code error]";
}

exit ($error);