From f8d1c79fe355efb18ae49e4363307dfec3ab89ce Mon Sep 17 00:00:00 2001
From: 雾聪 <wucong.lyb@alibaba-inc.com>
Date: 星期一, 07 八月 2023 16:14:11 +0800
Subject: [PATCH] Merge branch 'main' of https://github.com/alibaba-damo-academy/FunASR into main
---
funasr/build_utils/build_args.py | 12
funasr/modules/eend_ola/utils/feature.py | 286 +
egs/callhome/eend_ola/local/split.py | 117
egs/librispeech/e_branchformer/conf/train_asr_e_branchformer.yaml | 105
egs/callhome/sond/sond_fbank.yaml | 2739 ++++++++++++++++
egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_2spkr.yaml | 52
egs/callhome/eend_ola/local/make_swbd_cellular2.pl | 83
egs/callhome/eend_ola/local/make_sre.pl | 63
egs/librispeech/branchformer/conf/train_asr_branchformer.yaml | 104
egs/callhome/eend_ola/path.sh | 13
funasr/modules/eend_ola/utils/losses.py | 77
egs/librispeech/branchformer/local/spm_encode.py | 98
funasr/models/data2vec.py | 4
egs/callhome/sond/unit_test.py | 97
egs/librispeech/e_branchformer/local/data_prep.sh | 58
egs/librispeech/branchformer/conf/decode_asr_transformer_beam10_ctc0.3.yaml | 6
egs/librispeech/e_branchformer/local/download_and_untar.sh | 97
egs/callhome/eend_ola/local/random_mixture.py | 145
egs/librispeech/e_branchformer/local/spm_train.py | 12
egs/librispeech/branchformer/run.sh | 223 +
egs/callhome/eend_ola/local/gen_feats_scp.py | 25
egs/callhome/sond/sond.yaml | 2739 ++++++++++++++++
egs/callhome/eend_ola/local/make_musan.sh | 37
funasr/modules/eend_ola/eend_ola_dataloader.py | 57
egs/callhome/eend_ola/local/infer.py | 138
egs/librispeech/e_branchformer/conf/decode_asr_transformer_beam10_ctc0.3.yaml | 6
egs/callhome/eend_ola/local/parse_options.sh | 97
egs/librispeech/e_branchformer/utils | 1
egs/callhome/eend_ola/conf/train_diar_eend_ola_callhome_chunk2000.yaml | 45
egs/callhome/eend_ola/local/make_swbd2_phase3.pl | 102
funasr/models/e2e_diar_eend_ola.py | 165
egs/librispeech/e_branchformer/run.sh | 223 +
egs/callhome/eend_ola/local/make_callhome.sh | 73
funasr/modules/eend_ola/encoder.py | 20
egs/librispeech/e_branchformer/local/spm_encode.py | 98
funasr/build_utils/build_diar_model.py | 30
egs/librispeech/branchformer/utils | 1
egs/callhome/eend_ola/local/model_averaging.py | 28
funasr/datasets/small_datasets/sequence_iter_factory.py | 4
egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr_chunk2000.yaml | 44
egs/callhome/eend_ola/run.sh | 324 +
egs/aishell2/data2vec_pretrain/run.sh | 4
egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr.yaml | 52
egs/callhome/eend_ola/local/dump_feature.py | 144
egs/callhome/eend_ola/local/make_swbd2_phase2.pl | 107
egs/librispeech/branchformer/path.sh | 5
funasr/build_utils/build_pretrain_model.py | 7
egs/librispeech/branchformer/local/spm_train.py | 12
egs/librispeech/branchformer/local/download_and_untar.sh | 97
egs/callhome/eend_ola/local/make_mixture.py | 120
egs/callhome/eend_ola/local/make_sre.sh | 48
funasr/utils/prepare_data.py | 9
egs/callhome/eend_ola/local/make_swbd_cellular1.pl | 83
egs/callhome/eend_ola/local/run_prepare_shared_eda.sh | 235 +
funasr/build_utils/build_dataloader.py | 17
egs/librispeech/e_branchformer/path.sh | 5
funasr/modules/eend_ola/utils/kaldi_data.py | 162
egs/callhome/eend_ola/local/make_musan.py | 123
egs/callhome/eend_ola/local/make_swbd2_phase1.pl | 106
egs/librispeech/branchformer/local/data_prep.sh | 58
60 files changed, 9,848 insertions(+), 194 deletions(-)
diff --git a/egs/aishell2/data2vec_pretrain/run.sh b/egs/aishell2/data2vec_pretrain/run.sh
index f07deb5..3df39af 100755
--- a/egs/aishell2/data2vec_pretrain/run.sh
+++ b/egs/aishell2/data2vec_pretrain/run.sh
@@ -20,7 +20,6 @@
stop_stage=3
# feature configuration
-feats_dim=80
nj=64
# data
@@ -42,7 +41,7 @@
valid_set=dev_ios
asr_config=conf/train_pretrain_transformer.yaml
-model_dir="baseline_$(basename "${asr_config}" .yaml) _${lang}_${token_type}_${tag}"
+model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "stage 0: Data preparation"
@@ -115,7 +114,6 @@
--resume true \
--output_dir ${exp_dir}/exp/${model_dir} \
--config $asr_config \
- --input_size $feats_dim \
--ngpu $gpu_num \
--num_worker_count $count \
--multiprocessing_distributed true \
diff --git a/egs/callhome/eend_ola/conf/train_diar_eend_ola_callhome_chunk2000.yaml b/egs/callhome/eend_ola/conf/train_diar_eend_ola_callhome_chunk2000.yaml
new file mode 100644
index 0000000..cd143f7
--- /dev/null
+++ b/egs/callhome/eend_ola/conf/train_diar_eend_ola_callhome_chunk2000.yaml
@@ -0,0 +1,45 @@
+# network architecture
+# encoder related
+encoder: eend_ola_transformer
+encoder_conf:
+ idim: 345
+ n_layers: 4
+ n_units: 256
+
+# encoder-decoder attractor related
+encoder_decoder_attractor: eda
+encoder_decoder_attractor_conf:
+ n_units: 256
+
+# model related
+model: eend_ola
+model_conf:
+ attractor_loss_weight: 0.01
+ max_n_speaker: 8
+
+# optimization related
+accum_grad: 1
+grad_clip: 5
+max_epoch: 100
+val_scheduler_criterion:
+ - valid
+ - loss
+best_model_criterion:
+- - valid
+ - loss
+ - min
+keep_nbest_models: 100
+
+optim: adam
+optim_conf:
+ lr: 0.00001
+
+dataset_conf:
+ data_names: speech_speaker_labels
+ data_types: kaldi_ark
+ batch_conf:
+ batch_type: unsorted
+ batch_size: 8
+ num_workers: 8
+
+log_interval: 50
\ No newline at end of file
diff --git a/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_2spkr.yaml b/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_2spkr.yaml
new file mode 100644
index 0000000..47316fe
--- /dev/null
+++ b/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_2spkr.yaml
@@ -0,0 +1,52 @@
+# network architecture
+# encoder related
+encoder: eend_ola_transformer
+encoder_conf:
+ idim: 345
+ n_layers: 4
+ n_units: 256
+
+# encoder-decoder attractor related
+encoder_decoder_attractor: eda
+encoder_decoder_attractor_conf:
+ n_units: 256
+
+# model related
+model: eend_ola
+model_conf:
+ max_n_speaker: 8
+
+# optimization related
+accum_grad: 1
+grad_clip: 5
+max_epoch: 100
+val_scheduler_criterion:
+ - valid
+ - loss
+best_model_criterion:
+- - valid
+ - loss
+ - min
+keep_nbest_models: 100
+
+optim: adam
+optim_conf:
+ lr: 1.0
+ betas:
+ - 0.9
+ - 0.98
+ eps: 1.0e-9
+scheduler: noamlr
+scheduler_conf:
+ model_size: 256
+ warmup_steps: 100000
+
+dataset_conf:
+ data_names: speech_speaker_labels
+ data_types: kaldi_ark
+ batch_conf:
+ batch_type: unsorted
+ batch_size: 64
+ num_workers: 8
+
+log_interval: 50
\ No newline at end of file
diff --git a/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr.yaml b/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr.yaml
new file mode 100644
index 0000000..f55e148
--- /dev/null
+++ b/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr.yaml
@@ -0,0 +1,52 @@
+# network architecture
+# encoder related
+encoder: eend_ola_transformer
+encoder_conf:
+ idim: 345
+ n_layers: 4
+ n_units: 256
+
+# encoder-decoder attractor related
+encoder_decoder_attractor: eda
+encoder_decoder_attractor_conf:
+ n_units: 256
+
+# model related
+model: eend_ola
+model_conf:
+ max_n_speaker: 8
+
+# optimization related
+accum_grad: 1
+grad_clip: 5
+max_epoch: 25
+val_scheduler_criterion:
+ - valid
+ - loss
+best_model_criterion:
+- - valid
+ - loss
+ - min
+keep_nbest_models: 100
+
+optim: adam
+optim_conf:
+ lr: 1.0
+ betas:
+ - 0.9
+ - 0.98
+ eps: 1.0e-9
+scheduler: noamlr
+scheduler_conf:
+ model_size: 256
+ warmup_steps: 100000
+
+dataset_conf:
+ data_names: speech_speaker_labels
+ data_types: kaldi_ark
+ batch_conf:
+ batch_type: unsorted
+ batch_size: 64
+ num_workers: 8
+
+log_interval: 50
\ No newline at end of file
diff --git a/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr_chunk2000.yaml b/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr_chunk2000.yaml
new file mode 100644
index 0000000..d21d467
--- /dev/null
+++ b/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr_chunk2000.yaml
@@ -0,0 +1,44 @@
+# network architecture
+# encoder related
+encoder: eend_ola_transformer
+encoder_conf:
+ idim: 345
+ n_layers: 4
+ n_units: 256
+
+# encoder-decoder attractor related
+encoder_decoder_attractor: eda
+encoder_decoder_attractor_conf:
+ n_units: 256
+
+# model related
+model: eend_ola
+model_conf:
+ max_n_speaker: 8
+
+# optimization related
+accum_grad: 1
+grad_clip: 5
+max_epoch: 1
+val_scheduler_criterion:
+ - valid
+ - loss
+best_model_criterion:
+- - valid
+ - loss
+ - min
+keep_nbest_models: 100
+
+optim: adam
+optim_conf:
+ lr: 0.00001
+
+dataset_conf:
+ data_names: speech_speaker_labels
+ data_types: kaldi_ark
+ batch_conf:
+ batch_type: unsorted
+ batch_size: 8
+ num_workers: 8
+
+log_interval: 50
\ No newline at end of file
diff --git a/egs/callhome/eend_ola/local/dump_feature.py b/egs/callhome/eend_ola/local/dump_feature.py
new file mode 100644
index 0000000..5d7a061
--- /dev/null
+++ b/egs/callhome/eend_ola/local/dump_feature.py
@@ -0,0 +1,144 @@
+import argparse
+import os
+
+from kaldiio import WriteHelper
+
+import funasr.modules.eend_ola.utils.feature as feature
+from funasr.modules.eend_ola.utils.kaldi_data import load_segments_rechash, load_utt2spk, load_wav_scp, load_reco2dur, \
+ load_spk2utt, load_wav
+
+
+def _count_frames(data_len, size, step):
+ return int((data_len - size + step) / step)
+
+
+def _gen_frame_indices(
+ data_length, size=2000, step=2000,
+ use_last_samples=False,
+ label_delay=0,
+ subsampling=1):
+ i = -1
+ for i in range(_count_frames(data_length, size, step)):
+ yield i * step, i * step + size
+ if use_last_samples and i * step + size < data_length:
+ if data_length - (i + 1) * step - subsampling * label_delay > 0:
+ yield (i + 1) * step, data_length
+
+
+class KaldiData:
+ def __init__(self, data_dir, idx):
+ self.data_dir = data_dir
+ segment_file = os.path.join(self.data_dir, 'segments.{}'.format(idx))
+ self.segments = load_segments_rechash(segment_file)
+
+ utt2spk_file = os.path.join(self.data_dir, 'utt2spk.{}'.format(idx))
+ self.utt2spk = load_utt2spk(utt2spk_file)
+
+ wav_file = os.path.join(self.data_dir, 'wav.scp.{}'.format(idx))
+ self.wavs = load_wav_scp(wav_file)
+
+ reco2dur_file = os.path.join(self.data_dir, 'reco2dur.{}'.format(idx))
+ self.reco2dur = load_reco2dur(reco2dur_file)
+
+ spk2utt_file = os.path.join(self.data_dir, 'spk2utt.{}'.format(idx))
+ self.spk2utt = load_spk2utt(spk2utt_file)
+
+ def load_wav(self, recid, start=0, end=None):
+ data, rate = load_wav(self.wavs[recid], start, end)
+ return data, rate
+
+
+class KaldiDiarizationDataset():
+ def __init__(
+ self,
+ data_dir,
+ index,
+ chunk_size=2000,
+ context_size=0,
+ frame_size=1024,
+ frame_shift=256,
+ subsampling=1,
+ rate=16000,
+ input_transform=None,
+ use_last_samples=False,
+ label_delay=0,
+ n_speakers=None,
+ ):
+ self.data_dir = data_dir
+ self.index = index
+ self.chunk_size = chunk_size
+ self.context_size = context_size
+ self.frame_size = frame_size
+ self.frame_shift = frame_shift
+ self.subsampling = subsampling
+ self.input_transform = input_transform
+ self.n_speakers = n_speakers
+ self.chunk_indices = []
+ self.label_delay = label_delay
+
+ self.data = KaldiData(self.data_dir, index)
+
+ for rec, path in self.data.wavs.items():
+ data_len = int(self.data.reco2dur[rec] * rate / frame_shift)
+ data_len = int(data_len / self.subsampling)
+ for st, ed in _gen_frame_indices(
+ data_len, chunk_size, chunk_size, use_last_samples,
+ label_delay=self.label_delay,
+ subsampling=self.subsampling):
+ self.chunk_indices.append(
+ (rec, path, st * self.subsampling, ed * self.subsampling))
+ print(len(self.chunk_indices), " chunks")
+
+
+def convert(args):
+ dataset = KaldiDiarizationDataset(
+ data_dir=args.data_dir,
+ index=args.index,
+ chunk_size=args.num_frames,
+ context_size=args.context_size,
+ input_transform="logmel23_mn",
+ frame_size=args.frame_size,
+ frame_shift=args.frame_shift,
+ subsampling=args.subsampling,
+ rate=8000,
+ use_last_samples=True,
+ )
+
+ feature_ark_file = os.path.join(args.output_dir, "feature.ark.{}".format(args.index))
+ feature_scp_file = os.path.join(args.output_dir, "feature.scp.{}".format(args.index))
+ label_ark_file = os.path.join(args.output_dir, "label.ark.{}".format(args.index))
+ label_scp_file = os.path.join(args.output_dir, "label.scp.{}".format(args.index))
+ with WriteHelper('ark,scp:{},{}'.format(feature_ark_file, feature_scp_file)) as feature_writer, \
+ WriteHelper('ark,scp:{},{}'.format(label_ark_file, label_scp_file)) as label_writer:
+ for idx, (rec, path, st, ed) in enumerate(dataset.chunk_indices):
+ Y, T = feature.get_labeledSTFT(
+ dataset.data,
+ rec,
+ st,
+ ed,
+ dataset.frame_size,
+ dataset.frame_shift,
+ dataset.n_speakers)
+ Y = feature.transform(Y, dataset.input_transform)
+ Y_spliced = feature.splice(Y, dataset.context_size)
+ Y_ss, T_ss = feature.subsample(Y_spliced, T, dataset.subsampling)
+ st = '{:0>7d}'.format(st)
+ ed = '{:0>7d}'.format(ed)
+ key = "{}_{}_{}".format(rec, st, ed)
+ feature_writer(key, Y_ss)
+ label_writer(key, T_ss.reshape(-1))
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--data_dir", type=str)
+ parser.add_argument("--output_dir", type=str)
+ parser.add_argument("--index", type=str)
+ parser.add_argument("--num_frames", type=int, default=500)
+ parser.add_argument("--context_size", type=int, default=7)
+ parser.add_argument("--frame_size", type=int, default=200)
+ parser.add_argument("--frame_shift", type=int, default=80)
+ parser.add_argument("--subsampling", type=int, default=10)
+
+ args = parser.parse_args()
+ convert(args)
diff --git a/egs/callhome/eend_ola/local/gen_feats_scp.py b/egs/callhome/eend_ola/local/gen_feats_scp.py
new file mode 100644
index 0000000..88a94f2
--- /dev/null
+++ b/egs/callhome/eend_ola/local/gen_feats_scp.py
@@ -0,0 +1,25 @@
+import os
+import argparse
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--root_path", type=str)
+ parser.add_argument("--out_path", type=str)
+ parser.add_argument("--split_num", type=int, default=64)
+ args = parser.parse_args()
+ root_path = args.root_path
+ out_path = args.out_path
+ split_num = args.split_num
+
+ with open(os.path.join(out_path, "feats.scp"), "w") as out_f:
+ for i in range(split_num):
+ idx = str(i + 1)
+ feature_file = os.path.join(root_path, "feature.scp.{}".format(idx))
+ label_file = os.path.join(root_path, "label.scp.{}".format(idx))
+ with open(feature_file) as ff, open(label_file) as fl:
+ ff_lines = ff.readlines()
+ fl_lines = fl.readlines()
+ for ff_line, fl_line in zip(ff_lines, fl_lines):
+ sample_name, f_path = ff_line.strip().split()
+ _, l_path = fl_line.strip().split()
+ out_f.write("{} {} {}\n".format(sample_name, f_path, l_path))
\ No newline at end of file
diff --git a/egs/callhome/eend_ola/local/infer.py b/egs/callhome/eend_ola/local/infer.py
new file mode 100644
index 0000000..23e1d52
--- /dev/null
+++ b/egs/callhome/eend_ola/local/infer.py
@@ -0,0 +1,138 @@
+import argparse
+import os
+
+import numpy as np
+import soundfile as sf
+import torch
+import yaml
+from scipy.signal import medfilt
+
+import funasr.models.frontend.eend_ola_feature as eend_ola_feature
+from funasr.build_utils.build_model_from_file import build_model_from_file
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--config_file",
+ type=str,
+ help="model config file",
+ )
+ parser.add_argument(
+ "--model_file",
+ type=str,
+ help="model path",
+ )
+ parser.add_argument(
+ "--output_rttm_file",
+ type=str,
+ help="output rttm path",
+ )
+ parser.add_argument(
+ "--wav_scp_file",
+ type=str,
+ default="wav.scp",
+ help="input data path",
+ )
+ parser.add_argument(
+ "--frame_shift",
+ type=int,
+ default=80,
+ help="frame shift",
+ )
+ parser.add_argument(
+ "--frame_size",
+ type=int,
+ default=200,
+ help="frame size",
+ )
+ parser.add_argument(
+ "--context_size",
+ type=int,
+ default=7,
+ help="context size",
+ )
+ parser.add_argument(
+ "--sampling_rate",
+ type=int,
+ default=8000,
+ help="sampling rate",
+ )
+ parser.add_argument(
+ "--subsampling",
+ type=int,
+ default=10,
+ help="setting subsampling",
+ )
+ parser.add_argument(
+ "--shuffle",
+ type=bool,
+ default=True,
+ help="shuffle speech in time",
+ )
+ parser.add_argument(
+ "--attractor_threshold",
+ type=float,
+ default=0.5,
+ help="threshold for selecting attractors",
+ )
+ parser.add_argument(
+ "--device",
+ type=str,
+ default="cuda",
+ )
+ args = parser.parse_args()
+
+ with open(args.config_file) as f:
+ configs = yaml.safe_load(f)
+ for k, v in configs.items():
+ if not hasattr(args, k):
+ setattr(args, k, v)
+
+ np.random.seed(args.seed)
+ torch.manual_seed(args.seed)
+ torch.cuda.manual_seed(args.seed)
+ os.environ['PYTORCH_SEED'] = str(args.seed)
+
+ model, _ = build_model_from_file(config_file=args.config_file, model_file=args.model_file, task_name="diar",
+ device=args.device)
+ model.eval()
+
+ with open(args.wav_scp_file) as f:
+ wav_lines = [line.strip().split() for line in f.readlines()]
+ wav_items = {x[0]: x[1] for x in wav_lines}
+
+ print("Start inference")
+ with open(args.output_rttm_file, "w") as wf:
+ for wav_id in wav_items.keys():
+ print("Process wav: {}".format(wav_id))
+ data, rate = sf.read(wav_items[wav_id])
+ speech = eend_ola_feature.stft(data, args.frame_size, args.frame_shift)
+ speech = eend_ola_feature.transform(speech)
+ speech = eend_ola_feature.splice(speech, context_size=args.context_size)
+ speech = speech[::args.subsampling] # sampling
+ speech = torch.from_numpy(speech)
+
+ with torch.no_grad():
+ speech = speech.to(args.device)
+ ys, _, _, _ = model.estimate_sequential(
+ [speech],
+ n_speakers=None,
+ th=args.attractor_threshold,
+ shuffle=args.shuffle
+ )
+
+ a = ys[0].cpu().numpy()
+ a = medfilt(a, (11, 1))
+ rst = []
+ for spkr_id, frames in enumerate(a.T):
+ frames = np.pad(frames, (1, 1), 'constant')
+ changes, = np.where(np.diff(frames, axis=0) != 0)
+ fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} <NA> <NA> {:s} <NA>"
+ for s, e in zip(changes[::2], changes[1::2]):
+ st = s * args.frame_shift * args.subsampling / args.sampling_rate
+ dur = (e - s) * args.frame_shift * args.subsampling / args.sampling_rate
+ print(fmt.format(
+ wav_id,
+ st,
+ dur,
+ wav_id + "_" + str(spkr_id)), file=wf)
\ No newline at end of file
diff --git a/egs/callhome/eend_ola/local/make_callhome.sh b/egs/callhome/eend_ola/local/make_callhome.sh
new file mode 100755
index 0000000..caa8f67
--- /dev/null
+++ b/egs/callhome/eend_ola/local/make_callhome.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+# Copyright 2017 David Snyder
+# Apache 2.0.
+#
+# This script prepares the Callhome portion of the NIST SRE 2000
+# corpus (LDC2001S97). It is the evaluation dataset used in the
+# callhome_diarization recipe.
+
+if [ $# -ne 2 ]; then
+ echo "Usage: $0 <callhome-speech> <out-data-dir>"
+ echo "e.g.: $0 /mnt/data/LDC2001S97 data/"
+ exit 1;
+fi
+
+src_dir=$1
+data_dir=$2
+
+tmp_dir=$data_dir/callhome/.tmp/
+mkdir -p $tmp_dir
+
+# Download some metadata that wasn't provided in the LDC release
+if [ ! -d "$tmp_dir/sre2000-key" ]; then
+ wget --no-check-certificate -P $tmp_dir/ \
+ http://www.openslr.org/resources/10/sre2000-key.tar.gz
+ tar -xvf $tmp_dir/sre2000-key.tar.gz -C $tmp_dir/
+fi
+
+# The list of 500 recordings
+awk '{print $1}' $tmp_dir/sre2000-key/reco2num > $tmp_dir/reco.list
+
+# Create wav.scp file
+count=0
+missing=0
+while read reco; do
+ path=$(find $src_dir -name "$reco.sph")
+ if [ -z "${path// }" ]; then
+ >&2 echo "$0: Missing Sphere file for $reco"
+ missing=$((missing+1))
+ else
+ echo "$reco sph2pipe -f wav -p $path |"
+ fi
+ count=$((count+1))
+done < $tmp_dir/reco.list > $data_dir/callhome/wav.scp
+
+if [ $missing -gt 0 ]; then
+ echo "$0: Missing $missing out of $count recordings"
+fi
+
+cp $tmp_dir/sre2000-key/segments $data_dir/callhome/
+awk '{print $1, $2}' $data_dir/callhome/segments > $data_dir/callhome/utt2spk
+utils/utt2spk_to_spk2utt.pl $data_dir/callhome/utt2spk > $data_dir/callhome/spk2utt
+cp $tmp_dir/sre2000-key/reco2num $data_dir/callhome/reco2num_spk
+cp $tmp_dir/sre2000-key/fullref.rttm $data_dir/callhome/
+
+utils/validate_data_dir.sh --no-text --no-feats $data_dir/callhome
+utils/fix_data_dir.sh $data_dir/callhome
+
+utils/copy_data_dir.sh $data_dir/callhome $data_dir/callhome1
+utils/copy_data_dir.sh $data_dir/callhome $data_dir/callhome2
+
+utils/shuffle_list.pl $data_dir/callhome/wav.scp | head -n 250 \
+ | utils/filter_scp.pl - $data_dir/callhome/wav.scp \
+ > $data_dir/callhome1/wav.scp
+utils/fix_data_dir.sh $data_dir/callhome1
+utils/filter_scp.pl --exclude $data_dir/callhome1/wav.scp \
+ $data_dir/callhome/wav.scp > $data_dir/callhome2/wav.scp
+utils/fix_data_dir.sh $data_dir/callhome2
+utils/filter_scp.pl $data_dir/callhome1/wav.scp $data_dir/callhome/reco2num_spk \
+ > $data_dir/callhome1/reco2num_spk
+utils/filter_scp.pl $data_dir/callhome2/wav.scp $data_dir/callhome/reco2num_spk \
+ > $data_dir/callhome2/reco2num_spk
+
+rm -rf $tmp_dir 2> /dev/null
diff --git a/egs/callhome/eend_ola/local/make_mixture.py b/egs/callhome/eend_ola/local/make_mixture.py
new file mode 100755
index 0000000..6b15903
--- /dev/null
+++ b/egs/callhome/eend_ola/local/make_mixture.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+
+# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita)
+# Licensed under the MIT license.
+#
+# This script generates simulated multi-talker mixtures for diarization
+#
+# local/make_mixture.py \
+# mixture.scp \
+# data/mixture \
+# wav/mixture
+
+
+import argparse
+import os
+from funasr.modules.eend_ola.utils import kaldi_data
+import numpy as np
+import math
+import soundfile as sf
+import json
+
+parser = argparse.ArgumentParser()
+parser.add_argument('script',
+ help='list of json')
+parser.add_argument('out_data_dir',
+ help='output data dir of mixture')
+parser.add_argument('out_wav_dir',
+ help='output mixture wav files are stored here')
+parser.add_argument('--rate', type=int, default=16000,
+ help='sampling rate')
+args = parser.parse_args()
+
+# open output data files
+segments_f = open(args.out_data_dir + '/segments', 'w')
+utt2spk_f = open(args.out_data_dir + '/utt2spk', 'w')
+wav_scp_f = open(args.out_data_dir + '/wav.scp', 'w')
+
+# "-R" forces the default random seed for reproducibility
+resample_cmd = "sox -R -t wav - -t wav - rate {}".format(args.rate)
+
+for line in open(args.script):
+ recid, jsonstr = line.strip().split(None, 1)
+ indata = json.loads(jsonstr)
+ wavfn = indata['recid']
+ # recid now includes out_wav_dir, with '/' replaced by '_'
+ recid = os.path.join(args.out_wav_dir, wavfn).replace('/','_')
+ noise = indata['noise']
+ noise_snr = indata['snr']
+ mixture = []
+ for speaker in indata['speakers']:
+ spkid = speaker['spkid']
+ utts = speaker['utts']
+ intervals = speaker['intervals']
+ rir = speaker['rir']
+ data = []
+ pos = 0
+ for interval, utt in zip(intervals, utts):
+ # append silence interval data
+ silence = np.zeros(int(interval * args.rate))
+ data.append(silence)
+ # utterance is reverberated using room impulse response
+ preprocess = "wav-reverberate --print-args=false " \
+ " --impulse-response={} - -".format(rir)
+ if isinstance(utt, list):
+ rec, st, et = utt
+ st = np.rint(st * args.rate).astype(int)
+ et = np.rint(et * args.rate).astype(int)
+ else:
+ rec = utt
+ st = 0
+ et = None
+ if rir is not None:
+ wav_rxfilename = kaldi_data.process_wav(rec, preprocess)
+ else:
+ wav_rxfilename = rec
+ wav_rxfilename = kaldi_data.process_wav(
+ wav_rxfilename, resample_cmd)
+ speech, _ = kaldi_data.load_wav(wav_rxfilename, st, et)
+ data.append(speech)
+ # calculate start/end position in samples
+ startpos = pos + len(silence)
+ endpos = startpos + len(speech)
+ # write segments and utt2spk
+ uttid = '{}_{}_{:07d}_{:07d}'.format(
+ spkid, recid, int(startpos / args.rate * 100),
+ int(endpos / args.rate * 100))
+ print(uttid, recid,
+ startpos / args.rate, endpos / args.rate, file=segments_f)
+ print(uttid, spkid, file=utt2spk_f)
+ # update position for next utterance
+ pos = endpos
+ data = np.concatenate(data)
+ mixture.append(data)
+
+ # zero-pad each speaker's data to the maximum length, then mix all speakers
+ maxlen = max(len(x) for x in mixture)
+ mixture = [np.pad(x, (0, maxlen - len(x)), 'constant') for x in mixture]
+ mixture = np.sum(mixture, axis=0)
+ # noise is repeated (wrapped) or truncated to fit the mixture data length
+ noise_resampled = kaldi_data.process_wav(noise, resample_cmd)
+ noise_data, _ = kaldi_data.load_wav(noise_resampled)
+ if maxlen > len(noise_data):
+ noise_data = np.pad(noise_data, (0, maxlen - len(noise_data)), 'wrap')
+ else:
+ noise_data = noise_data[:maxlen]
+ # noise power is scaled according to selected SNR, then mixed
+ signal_power = np.sum(mixture**2) / len(mixture)
+ noise_power = np.sum(noise_data**2) / len(noise_data)
+ scale = math.sqrt(
+ math.pow(10, - noise_snr / 10) * signal_power / noise_power)
+ mixture += noise_data * scale
+ # output the wav file and write wav.scp
+ outfname = '{}.wav'.format(wavfn)
+ outpath = os.path.join(args.out_wav_dir, outfname)
+ sf.write(outpath, mixture, args.rate)
+ print(recid, os.path.abspath(outpath), file=wav_scp_f)
+
+wav_scp_f.close()
+segments_f.close()
+utt2spk_f.close()
diff --git a/egs/callhome/eend_ola/local/make_musan.py b/egs/callhome/eend_ola/local/make_musan.py
new file mode 100755
index 0000000..833da06
--- /dev/null
+++ b/egs/callhome/eend_ola/local/make_musan.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+# Copyright 2015 David Snyder
+# 2018 Ewald Enzinger
+# Apache 2.0.
+#
+# Modified version of egs/sre16/v1/local/make_musan.py (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8).
+# This version uses the raw MUSAN audio files (16 kHz) and does not use sox to resample at 8 kHz.
+#
+# This file is meant to be invoked by make_musan.sh.
+
+import os, sys
+
+def process_music_annotations(path):
+ utt2spk = {}
+ utt2vocals = {}
+ lines = open(path, 'r').readlines()
+ for line in lines:
+ utt, genres, vocals, musician = line.rstrip().split()[:4]
+ # For this application, the musician ID isn't important
+ utt2spk[utt] = utt
+ utt2vocals[utt] = vocals == "Y"
+ return utt2spk, utt2vocals
+
+def prepare_music(root_dir, use_vocals):
+ utt2vocals = {}
+ utt2spk = {}
+ utt2wav = {}
+ num_good_files = 0
+ num_bad_files = 0
+ music_dir = os.path.join(root_dir, "music")
+ for root, dirs, files in os.walk(music_dir):
+ for file in files:
+ file_path = os.path.join(root, file)
+ if file.endswith(".wav"):
+ utt = str(file).replace(".wav", "")
+ utt2wav[utt] = file_path
+ elif str(file) == "ANNOTATIONS":
+ utt2spk_part, utt2vocals_part = process_music_annotations(file_path)
+ utt2spk.update(utt2spk_part)
+ utt2vocals.update(utt2vocals_part)
+ utt2spk_str = ""
+ utt2wav_str = ""
+ for utt in utt2vocals:
+ if utt in utt2wav:
+ if use_vocals or not utt2vocals[utt]:
+ utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
+ utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
+ num_good_files += 1
+ else:
+ print("Missing file {}".format(utt))
+ num_bad_files += 1
+ print("In music directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files))
+ return utt2spk_str, utt2wav_str
+
+def prepare_speech(root_dir):
+ utt2spk = {}
+ utt2wav = {}
+ num_good_files = 0
+ num_bad_files = 0
+ speech_dir = os.path.join(root_dir, "speech")
+ for root, dirs, files in os.walk(speech_dir):
+ for file in files:
+ file_path = os.path.join(root, file)
+ if file.endswith(".wav"):
+ utt = str(file).replace(".wav", "")
+ utt2wav[utt] = file_path
+ utt2spk[utt] = utt
+ utt2spk_str = ""
+ utt2wav_str = ""
+ for utt in utt2spk:
+ if utt in utt2wav:
+ utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
+ utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
+ num_good_files += 1
+ else:
+ print("Missing file {}".format(utt))
+ num_bad_files += 1
+ print("In speech directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files))
+ return utt2spk_str, utt2wav_str
+
+def prepare_noise(root_dir):
+ utt2spk = {}
+ utt2wav = {}
+ num_good_files = 0
+ num_bad_files = 0
+ noise_dir = os.path.join(root_dir, "noise")
+ for root, dirs, files in os.walk(noise_dir):
+ for file in files:
+ file_path = os.path.join(root, file)
+ if file.endswith(".wav"):
+ utt = str(file).replace(".wav", "")
+ utt2wav[utt] = file_path
+ utt2spk[utt] = utt
+ utt2spk_str = ""
+ utt2wav_str = ""
+ for utt in utt2spk:
+ if utt in utt2wav:
+ utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
+ utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
+ num_good_files += 1
+ else:
+ print("Missing file {}".format(utt))
+ num_bad_files += 1
+ print("In noise directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files))
+ return utt2spk_str, utt2wav_str
+
+def main():
+ in_dir = sys.argv[1]
+ out_dir = sys.argv[2]
+ use_vocals = sys.argv[3] == "Y"
+ utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals)
+ utt2spk_speech, utt2wav_speech = prepare_speech(in_dir)
+ utt2spk_noise, utt2wav_noise = prepare_noise(in_dir)
+ utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise
+ utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise
+ wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w')
+ wav_fi.write(utt2wav)
+ utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w')
+ utt2spk_fi.write(utt2spk)
+
+
+if __name__=="__main__":
+ main()
diff --git a/egs/callhome/eend_ola/local/make_musan.sh b/egs/callhome/eend_ola/local/make_musan.sh
new file mode 100755
index 0000000..694940a
--- /dev/null
+++ b/egs/callhome/eend_ola/local/make_musan.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Copyright 2015 David Snyder
+# Apache 2.0.
+#
+# This script, called by ../run.sh, creates the MUSAN
+# data directory. The required dataset is freely available at
+# http://www.openslr.org/17/
+
+set -e
+in_dir=$1
+data_dir=$2
+use_vocals='Y'
+
+mkdir -p local/musan.tmp
+
+echo "Preparing ${data_dir}/musan..."
+mkdir -p ${data_dir}/musan
+local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals}
+
+utils/fix_data_dir.sh ${data_dir}/musan
+
+grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music
+grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech
+grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise
+utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \
+ ${data_dir}/musan ${data_dir}/musan_music
+utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \
+ ${data_dir}/musan ${data_dir}/musan_speech
+utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \
+ ${data_dir}/musan ${data_dir}/musan_noise
+
+utils/fix_data_dir.sh ${data_dir}/musan_music
+utils/fix_data_dir.sh ${data_dir}/musan_speech
+utils/fix_data_dir.sh ${data_dir}/musan_noise
+
+rm -rf local/musan.tmp
+
diff --git a/egs/callhome/eend_ola/local/make_sre.pl b/egs/callhome/eend_ola/local/make_sre.pl
new file mode 100755
index 0000000..b86fa7e
--- /dev/null
+++ b/egs/callhome/eend_ola/local/make_sre.pl
@@ -0,0 +1,63 @@
+#!/usr/bin/perl
+#
+# Copyright 2015 David Snyder
+# Apache 2.0.
+# Usage: make_sre.pl <path-to-data> <name-of-source> <sre-ref> <output-dir>
+
+if (@ARGV != 4) {
+ print STDERR "Usage: $0 <path-to-data> <name-of-source> <sre-ref> <output-dir>\n";
+ print STDERR "e.g. $0 /export/corpora5/LDC/LDC2006S44 sre2004 sre_ref data/sre2004\n";
+ exit(1);
+}
+
+($db_base, $sre_name, $sre_ref_filename, $out_dir) = @ARGV;
+%utt2sph = ();
+%spk2gender = ();
+
+$tmp_dir = "$out_dir/tmp";
+if (system("mkdir -p $tmp_dir") != 0) {
+ die "Error making directory $tmp_dir";
+}
+
+if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) {
+ die "Error getting list of sph files";
+}
+open(WAVLIST, "<", "$tmp_dir/sph.list") or die "cannot open wav list";
+
+while(<WAVLIST>) {
+ chomp;
+ $sph = $_;
+ @A1 = split("/",$sph);
+ @A2 = split("[./]",$A1[$#A1]);
+ $uttId=$A2[0];
+ $utt2sph{$uttId} = $sph;
+}
+
+open(GNDR,">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender";
+open(SPKR,">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
+open(WAV,">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
+open(SRE_REF, "<", $sre_ref_filename) or die "Cannot open SRE reference.";
+while (<SRE_REF>) {
+ chomp;
+ ($speaker, $gender, $other_sre_name, $utt_id, $channel) = split(" ", $_);
+ $channel_num = "1";
+ if ($channel eq "A") {
+ $channel_num = "1";
+ } else {
+ $channel_num = "2";
+ }
+ if (($other_sre_name eq $sre_name) and (exists $utt2sph{$utt_id})) {
+ $full_utt_id = "$speaker-$gender-$sre_name-$utt_id-$channel";
+ $spk2gender{"$speaker-$gender"} = $gender;
+ print WAV "$full_utt_id"," sph2pipe -f wav -p -c $channel_num $utt2sph{$utt_id} |\n";
+ print SPKR "$full_utt_id $speaker-$gender","\n";
+ }
+}
+foreach $speaker (keys %spk2gender) {
+ print GNDR "$speaker $spk2gender{$speaker}\n";
+}
+
+close(GNDR) || die;
+close(SPKR) || die;
+close(WAV) || die;
+close(SRE_REF) || die;
diff --git a/egs/callhome/eend_ola/local/make_sre.sh b/egs/callhome/eend_ola/local/make_sre.sh
new file mode 100755
index 0000000..bef4e06
--- /dev/null
+++ b/egs/callhome/eend_ola/local/make_sre.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+# Copyright 2015 David Snyder
+# Apache 2.0.
+#
+# See README.txt for more info on data required.
+#
+# Usage: local/make_sre.sh <data-root> <data-dir>
+# Downloads the speaker list from openslr.org, runs local/make_sre.pl on each
+# LDC release below (looked up under <data-root>) and combines the resulting
+# directories into <data-dir>/sre.
+
+set -e
+
+# Without both arguments the paths below would resolve to /LDC... and /sre...
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <data-root> <data-dir>" 1>&2
+  exit 1
+fi
+
+data_root=$1
+data_dir=$2
+
+wget -P data/local/ http://www.openslr.org/resources/15/speaker_list.tgz
+tar -C data/local/ -xvf data/local/speaker_list.tgz
+sre_ref=data/local/speaker_list
+
+local/make_sre.pl "$data_root/LDC2006S44/" \
+  sre2004 $sre_ref "$data_dir/sre2004"
+
+local/make_sre.pl "$data_root/LDC2011S01" \
+  sre2005 $sre_ref "$data_dir/sre2005_train"
+
+local/make_sre.pl "$data_root/LDC2011S04" \
+  sre2005 $sre_ref "$data_dir/sre2005_test"
+
+local/make_sre.pl "$data_root/LDC2011S09" \
+  sre2006 $sre_ref "$data_dir/sre2006_train"
+
+local/make_sre.pl "$data_root/LDC2011S10" \
+  sre2006 $sre_ref "$data_dir/sre2006_test_1"
+
+local/make_sre.pl "$data_root/LDC2012S01" \
+  sre2006 $sre_ref "$data_dir/sre2006_test_2"
+
+local/make_sre.pl "$data_root/LDC2011S05" \
+  sre2008 $sre_ref "$data_dir/sre2008_train"
+
+local/make_sre.pl "$data_root/LDC2011S08" \
+  sre2008 $sre_ref "$data_dir/sre2008_test"
+
+utils/combine_data.sh "$data_dir/sre" \
+  "$data_dir/sre2004" "$data_dir/sre2005_train" \
+  "$data_dir/sre2005_test" "$data_dir/sre2006_train" \
+  "$data_dir/sre2006_test_1" "$data_dir/sre2006_test_2" \
+  "$data_dir/sre2008_train" "$data_dir/sre2008_test"
+
+utils/validate_data_dir.sh --no-text --no-feats "$data_dir/sre"
+utils/fix_data_dir.sh "$data_dir/sre"
+rm data/local/speaker_list.*
diff --git a/egs/callhome/eend_ola/local/make_swbd2_phase1.pl b/egs/callhome/eend_ola/local/make_swbd2_phase1.pl
new file mode 100755
index 0000000..71b26b5
--- /dev/null
+++ b/egs/callhome/eend_ola/local/make_swbd2_phase1.pl
@@ -0,0 +1,119 @@
+#!/usr/bin/perl
+use warnings; #sed replacement for -w perl parameter
+#
+# Copyright 2017 David Snyder
+# Apache 2.0
+#
+# Writes wav.scp, utt2spk and spk2gender into the output directory, one
+# utterance per channel for every callstat.tbl row whose .sph file is found
+# under the corpus directory, then builds spk2utt and fixes/validates the result.
+
+if (@ARGV != 2) {
+  print STDERR "Usage: $0 <path-to-LDC98S75> <path-to-output>\n";
+  print STDERR "e.g. $0 /export/corpora3/LDC/LDC98S75 data/swbd2_phase1_train\n";
+  exit(1);
+}
+($db_base, $out_dir) = @ARGV;
+
+if (system("mkdir -p $out_dir")) {
+  die "Error making directory $out_dir";
+}
+
+open(CS, "<$db_base/doc/callstat.tbl") || die "Could not open $db_base/doc/callstat.tbl";
+open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender";
+open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk";
+open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp";
+
+@badAudio = ("3", "4");
+
+$tmp_dir = "$out_dir/tmp";
+if (system("mkdir -p $tmp_dir") != 0) {
+  die "Error making directory $tmp_dir";
+}
+
+if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) {
+  die "Error getting list of sph files";
+}
+
+open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list";
+
+# Map each .sph basename (the text before its first '.') to the file path.
+%wavs = ();
+while(<WAVLIST>) {
+  chomp;
+  $sph = $_;
+  @t = split("/",$sph);
+  @t1 = split("[./]",$t[$#t]);
+  $uttId = $t1[0];
+  $wavs{$uttId} = $sph;
+}
+close(WAVLIST);
+
+while (<CS>) {
+  $line = $_ ;
+  @A = split(",", $line);
+  @A1 = split("[./]",$A[0]);
+  $wav = $A1[0];
+  # Skip the row when an entry of @badAudio matches /$wav/i. This is the test
+  # the smartmatch "/$wav/i ~~ @badAudio" performed; smartmatch is experimental
+  # and warns under "use warnings" on perl 5.18+, so grep is used instead.
+  my $wav_re = qr/$wav/i;
+  if (grep { $_ =~ $wav_re } @badAudio) {
+    # do nothing
+    print "Bad Audio = $wav";
+  } else {
+    $spkr1= "sw_" . $A[2];
+    $spkr2= "sw_" . $A[3];
+    $gender1 = $A[5];
+    $gender2 = $A[6];
+    if ($gender1 eq "M") {
+      $gender1 = "m";
+    } elsif ($gender1 eq "F") {
+      $gender1 = "f";
+    } else {
+      die "Unknown Gender in $line";
+    }
+    if ($gender2 eq "M") {
+      $gender2 = "m";
+    } elsif ($gender2 eq "F") {
+      $gender2 = "f";
+    } else {
+      die "Unknown Gender in $line";
+    }
+    # Fall back to "" when no .sph file was listed for this id, so that -e and
+    # the message below do not trigger "uninitialized value" warnings.
+    my $sph_path = defined $wavs{$wav} ? $wavs{$wav} : "";
+    if (-e $sph_path) {
+      $uttId = $spkr1 ."_" . $wav ."_1";
+      if (!$spk2gender{$spkr1}) {
+        $spk2gender{$spkr1} = $gender1;
+        print GNDR "$spkr1"," $gender1\n";
+      }
+      print WAV "$uttId"," sph2pipe -f wav -p -c 1 $sph_path |\n";
+      print SPKR "$uttId"," $spkr1","\n";
+
+      $uttId = $spkr2 . "_" . $wav ."_2";
+      if (!$spk2gender{$spkr2}) {
+        $spk2gender{$spkr2} = $gender2;
+        print GNDR "$spkr2"," $gender2\n";
+      }
+      print WAV "$uttId"," sph2pipe -f wav -p -c 2 $sph_path |\n";
+      print SPKR "$uttId"," $spkr2","\n";
+    } else {
+      print STDERR "Missing $sph_path for $wav\n";
+    }
+  }
+}
+
+close(WAV) || die;
+close(SPKR) || die;
+close(GNDR) || die;
+if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
+  die "Error creating spk2utt file in directory $out_dir";
+}
+if (system("utils/fix_data_dir.sh $out_dir") != 0) {
+  die "Error fixing data dir $out_dir";
+}
+if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
+  die "Error validating directory $out_dir";
+}
diff --git a/egs/callhome/eend_ola/local/make_swbd2_phase2.pl b/egs/callhome/eend_ola/local/make_swbd2_phase2.pl
new file mode 100755
index 0000000..337ab9d
--- /dev/null
+++ b/egs/callhome/eend_ola/local/make_swbd2_phase2.pl
@@ -0,0 +1,122 @@
+#!/usr/bin/perl
+use warnings; #sed replacement for -w perl parameter
+#
+# Copyright 2013 Daniel Povey
+# Apache 2.0
+#
+# Writes wav.scp, utt2spk and spk2gender into the output directory, one
+# utterance per channel for every callstat.tbl row whose .sph file is found
+# under the corpus directory, then builds spk2utt and fixes/validates the result.
+
+if (@ARGV != 2) {
+  print STDERR "Usage: $0 <path-to-LDC99S79> <path-to-output>\n";
+  print STDERR "e.g. $0 /export/corpora5/LDC/LDC99S79 data/swbd2_phase2_train\n";
+  exit(1);
+}
+($db_base, $out_dir) = @ARGV;
+
+if (system("mkdir -p $out_dir")) {
+  die "Error making directory $out_dir";
+}
+
+open(CS, "<$db_base/DISC1/doc/callstat.tbl") || die "Could not open $db_base/DISC1/doc/callstat.tbl";
+open(CI, "<$db_base/DISC1/doc/callinfo.tbl") || die "Could not open $db_base/DISC1/doc/callinfo.tbl";
+open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender";
+open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk";
+open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp";
+
+@badAudio = ("3", "4");
+
+$tmp_dir = "$out_dir/tmp";
+if (system("mkdir -p $tmp_dir") != 0) {
+  die "Error making directory $tmp_dir";
+}
+
+if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) {
+  die "Error getting list of sph files";
+}
+
+open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list";
+
+# Map each .sph basename (the text before its first '.') to the file path.
+while(<WAVLIST>) {
+  chomp;
+  $sph = $_;
+  @t = split("/",$sph);
+  @t1 = split("[./]",$t[$#t]);
+  $uttId=$t1[0];
+  $wav{$uttId} = $sph;
+}
+close(WAVLIST);
+
+while (<CS>) {
+  $line = $_ ;
+  # Two callinfo.tbl lines are read per callstat.tbl row; the first field of
+  # the second one is used as the recording id.
+  $ci = <CI>;
+  $ci = <CI>;
+  @ci = split(",",$ci);
+  $wav = $ci[0];
+  @A = split(",", $line);
+  # Skip the row when an entry of @badAudio matches /$wav/i. This is the test
+  # the smartmatch "/$wav/i ~~ @badAudio" performed; smartmatch is experimental
+  # and warns under "use warnings" on perl 5.18+, so grep is used instead.
+  my $wav_re = qr/$wav/i;
+  if (grep { $_ =~ $wav_re } @badAudio) {
+    # do nothing
+  } else {
+    $spkr1= "sw_" . $A[2];
+    $spkr2= "sw_" . $A[3];
+    $gender1 = $A[4];
+    $gender2 = $A[5];
+    if ($gender1 eq "M") {
+      $gender1 = "m";
+    } elsif ($gender1 eq "F") {
+      $gender1 = "f";
+    } else {
+      die "Unknown Gender in $line";
+    }
+    if ($gender2 eq "M") {
+      $gender2 = "m";
+    } elsif ($gender2 eq "F") {
+      $gender2 = "f";
+    } else {
+      die "Unknown Gender in $line";
+    }
+    # Fall back to "" when no .sph file was listed for this id, so that -e and
+    # the message below do not trigger "uninitialized value" warnings.
+    my $sph_path = defined $wav{$wav} ? $wav{$wav} : "";
+    if (-e $sph_path) {
+      $uttId = $spkr1 ."_" . $wav ."_1";
+      if (!$spk2gender{$spkr1}) {
+        $spk2gender{$spkr1} = $gender1;
+        print GNDR "$spkr1"," $gender1\n";
+      }
+      print WAV "$uttId"," sph2pipe -f wav -p -c 1 $sph_path |\n";
+      print SPKR "$uttId"," $spkr1","\n";
+
+      $uttId = $spkr2 . "_" . $wav ."_2";
+      if (!$spk2gender{$spkr2}) {
+        $spk2gender{$spkr2} = $gender2;
+        print GNDR "$spkr2"," $gender2\n";
+      }
+      print WAV "$uttId"," sph2pipe -f wav -p -c 2 $sph_path |\n";
+      print SPKR "$uttId"," $spkr2","\n";
+    } else {
+      print STDERR "Missing $sph_path for $wav\n";
+    }
+  }
+}
+
+close(WAV) || die;
+close(SPKR) || die;
+close(GNDR) || die;
+if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
+  die "Error creating spk2utt file in directory $out_dir";
+}
+if (system("utils/fix_data_dir.sh $out_dir") != 0) {
+  die "Error fixing data dir $out_dir";
+}
+if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
+  die "Error validating directory $out_dir";
+}
diff --git a/egs/callhome/eend_ola/local/make_swbd2_phase3.pl b/egs/callhome/eend_ola/local/make_swbd2_phase3.pl
new file mode 100755
index 0000000..f278534
--- /dev/null
+++ b/egs/callhome/eend_ola/local/make_swbd2_phase3.pl
@@ -0,0 +1,115 @@
+#!/usr/bin/perl
+use warnings; #sed replacement for -w perl parameter
+#
+# Copyright 2013 Daniel Povey
+# Apache 2.0
+#
+# Writes wav.scp, utt2spk and spk2gender into the output directory, one
+# utterance per channel for every callstat.tbl row whose .sph file is found
+# under the corpus directory, then builds spk2utt and fixes/validates the result.
+
+if (@ARGV != 2) {
+  print STDERR "Usage: $0 <path-to-LDC2002S06> <path-to-output>\n";
+  print STDERR "e.g. $0 /export/corpora5/LDC/LDC2002S06 data/swbd2_phase3_train\n";
+  exit(1);
+}
+($db_base, $out_dir) = @ARGV;
+
+if (system("mkdir -p $out_dir")) {
+  die "Error making directory $out_dir";
+}
+
+open(CS, "<$db_base/DISC1/docs/callstat.tbl") || die "Could not open $db_base/DISC1/docs/callstat.tbl";
+open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender";
+open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk";
+open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp";
+
+@badAudio = ("3", "4");
+
+$tmp_dir = "$out_dir/tmp";
+if (system("mkdir -p $tmp_dir") != 0) {
+  die "Error making directory $tmp_dir";
+}
+
+if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) {
+  die "Error getting list of sph files";
+}
+
+open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list";
+# Map each .sph basename (the text before its first '.') to the file path.
+while(<WAVLIST>) {
+  chomp;
+  $sph = $_;
+  @t = split("/",$sph);
+  @t1 = split("[./]",$t[$#t]);
+  $uttId=$t1[0];
+  $wav{$uttId} = $sph;
+}
+close(WAVLIST);
+
+while (<CS>) {
+  $line = $_ ;
+  @A = split(",", $line);
+  $wav = "sw_" . $A[0] ;
+  # Skip the row when an entry of @badAudio matches /$wav/i. This is the test
+  # the smartmatch "/$wav/i ~~ @badAudio" performed; smartmatch is experimental
+  # and warns under "use warnings" on perl 5.18+, so grep is used instead.
+  my $wav_re = qr/$wav/i;
+  if (grep { $_ =~ $wav_re } @badAudio) {
+    # do nothing
+  } else {
+    $spkr1= "sw_" . $A[3];
+    $spkr2= "sw_" . $A[4];
+    $gender1 = $A[5];
+    $gender2 = $A[6];
+    if ($gender1 eq "M") {
+      $gender1 = "m";
+    } elsif ($gender1 eq "F") {
+      $gender1 = "f";
+    } else {
+      die "Unknown Gender in $line";
+    }
+    if ($gender2 eq "M") {
+      $gender2 = "m";
+    } elsif ($gender2 eq "F") {
+      $gender2 = "f";
+    } else {
+      die "Unknown Gender in $line";
+    }
+    # Fall back to "" when no .sph file was listed for this id, so that -e and
+    # the message below do not trigger "uninitialized value" warnings.
+    my $sph_path = defined $wav{$wav} ? $wav{$wav} : "";
+    if (-e $sph_path) {
+      $uttId = $spkr1 ."_" . $wav ."_1";
+      if (!$spk2gender{$spkr1}) {
+        $spk2gender{$spkr1} = $gender1;
+        print GNDR "$spkr1"," $gender1\n";
+      }
+      print WAV "$uttId"," sph2pipe -f wav -p -c 1 $sph_path |\n";
+      print SPKR "$uttId"," $spkr1","\n";
+
+      $uttId = $spkr2 . "_" . $wav ."_2";
+      if (!$spk2gender{$spkr2}) {
+        $spk2gender{$spkr2} = $gender2;
+        print GNDR "$spkr2"," $gender2\n";
+      }
+      print WAV "$uttId"," sph2pipe -f wav -p -c 2 $sph_path |\n";
+      print SPKR "$uttId"," $spkr2","\n";
+    } else {
+      print STDERR "Missing $sph_path for $wav\n";
+    }
+  }
+}
+
+close(WAV) || die;
+close(SPKR) || die;
+close(GNDR) || die;
+if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
+  die "Error creating spk2utt file in directory $out_dir";
+}
+if (system("utils/fix_data_dir.sh $out_dir") != 0) {
+  die "Error fixing data dir $out_dir";
+}
+if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
+  die "Error validating directory $out_dir";
+}
diff --git a/egs/callhome/eend_ola/local/make_swbd_cellular1.pl b/egs/callhome/eend_ola/local/make_swbd_cellular1.pl
new file mode 100644
index 0000000..ede6cc2
--- /dev/null
+++ b/egs/callhome/eend_ola/local/make_swbd_cellular1.pl
@@ -0,0 +1,91 @@
+#!/usr/bin/perl
+use warnings; #sed replacement for -w perl parameter
+#
+# Copyright 2013 Daniel Povey
+# Apache 2.0
+#
+# Writes wav.scp, utt2spk and spk2gender into the output directory, one
+# utterance per channel for every swb_callstats.tbl row whose data/sw_<id>.sph
+# file exists, then builds spk2utt and fixes/validates the result.
+
+if (@ARGV != 2) {
+  print STDERR "Usage: $0 <path-to-LDC2001S13> <path-to-output>\n";
+  print STDERR "e.g. $0 /export/corpora5/LDC/LDC2001S13 data/swbd_cellular1_train\n";
+  exit(1);
+}
+($db_base, $out_dir) = @ARGV;
+
+if (system("mkdir -p $out_dir")) {
+  die "Error making directory $out_dir";
+}
+
+open(CS, "<$db_base/doc/swb_callstats.tbl") || die "Could not open $db_base/doc/swb_callstats.tbl";
+open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender";
+open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk";
+open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp";
+
+@badAudio = ("40019", "45024", "40022");
+
+while (<CS>) {
+  $line = $_ ;
+  @A = split(",", $line);
+  # Skip the row when an entry of @badAudio matches /$A[0]/i. This is the test
+  # the smartmatch "/$A[0]/i ~~ @badAudio" performed; smartmatch is experimental
+  # and warns under "use warnings" on perl 5.18+, so grep is used instead.
+  my $id_re = qr/$A[0]/i;
+  if (grep { $_ =~ $id_re } @badAudio) {
+    # do nothing
+  } else {
+    $wav = "sw_" . $A[0];
+    $spkr1= "sw_" . $A[1];
+    $spkr2= "sw_" . $A[2];
+    $gender1 = $A[3];
+    $gender2 = $A[4];
+    if ($A[3] eq "M") {
+      $gender1 = "m";
+    } elsif ($A[3] eq "F") {
+      $gender1 = "f";
+    } else {
+      die "Unknown Gender in $line";
+    }
+    if ($A[4] eq "M") {
+      $gender2 = "m";
+    } elsif ($A[4] eq "F") {
+      $gender2 = "f";
+    } else {
+      die "Unknown Gender in $line";
+    }
+    if (-e "$db_base/data/$wav.sph") {
+      $uttId = $spkr1 . "-swbdc_" . $wav ."_1";
+      if (!$spk2gender{$spkr1}) {
+        $spk2gender{$spkr1} = $gender1;
+        print GNDR "$spkr1"," $gender1\n";
+      }
+      print WAV "$uttId"," sph2pipe -f wav -p -c 1 $db_base/data/$wav.sph |\n";
+      print SPKR "$uttId"," $spkr1","\n";
+
+      $uttId = $spkr2 . "-swbdc_" . $wav ."_2";
+      if (!$spk2gender{$spkr2}) {
+        $spk2gender{$spkr2} = $gender2;
+        print GNDR "$spkr2"," $gender2\n";
+      }
+      print WAV "$uttId"," sph2pipe -f wav -p -c 2 $db_base/data/$wav.sph |\n";
+      print SPKR "$uttId"," $spkr2","\n";
+    } else {
+      print STDERR "Missing $db_base/data/$wav.sph\n";
+    }
+  }
+}
+
+close(WAV) || die;
+close(SPKR) || die;
+close(GNDR) || die;
+if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
+  die "Error creating spk2utt file in directory $out_dir";
+}
+if (system("utils/fix_data_dir.sh $out_dir") != 0) {
+  die "Error fixing data dir $out_dir";
+}
+if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
+  die "Error validating directory $out_dir";
+}
diff --git a/egs/callhome/eend_ola/local/make_swbd_cellular2.pl b/egs/callhome/eend_ola/local/make_swbd_cellular2.pl
new file mode 100755
index 0000000..4de954c
--- /dev/null
+++ b/egs/callhome/eend_ola/local/make_swbd_cellular2.pl
@@ -0,0 +1,91 @@
+#!/usr/bin/perl
+use warnings; #sed replacement for -w perl parameter
+#
+# Copyright 2013 Daniel Povey
+# Apache 2.0
+#
+# Writes wav.scp, utt2spk and spk2gender into the output directory, one
+# utterance per channel for every swb_callstats.tbl row whose data/sw_<id>.sph
+# file exists, then builds spk2utt and fixes/validates the result.
+
+if (@ARGV != 2) {
+  print STDERR "Usage: $0 <path-to-LDC2004S07> <path-to-output>\n";
+  print STDERR "e.g. $0 /export/corpora5/LDC/LDC2004S07 data/swbd_cellular2_train\n";
+  exit(1);
+}
+($db_base, $out_dir) = @ARGV;
+
+if (system("mkdir -p $out_dir")) {
+  die "Error making directory $out_dir";
+}
+
+open(CS, "<$db_base/docs/swb_callstats.tbl") || die "Could not open $db_base/docs/swb_callstats.tbl";
+open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender";
+open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk";
+open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp";
+
+@badAudio=("45024", "40022");
+
+while (<CS>) {
+  $line = $_ ;
+  @A = split(",", $line);
+  # Skip the row when an entry of @badAudio matches /$A[0]/i. This is the test
+  # the smartmatch "/$A[0]/i ~~ @badAudio" performed; smartmatch is experimental
+  # and warns under "use warnings" on perl 5.18+, so grep is used instead.
+  my $id_re = qr/$A[0]/i;
+  if (grep { $_ =~ $id_re } @badAudio) {
+    # do nothing
+  } else {
+    $wav = "sw_" . $A[0];
+    $spkr1= "sw_" . $A[1];
+    $spkr2= "sw_" . $A[2];
+    $gender1 = $A[3];
+    $gender2 = $A[4];
+    if ($A[3] eq "M") {
+      $gender1 = "m";
+    } elsif ($A[3] eq "F") {
+      $gender1 = "f";
+    } else {
+      die "Unknown Gender in $line";
+    }
+    if ($A[4] eq "M") {
+      $gender2 = "m";
+    } elsif ($A[4] eq "F") {
+      $gender2 = "f";
+    } else {
+      die "Unknown Gender in $line";
+    }
+    if (-e "$db_base/data/$wav.sph") {
+      $uttId = $spkr1 . "-swbdc_" . $wav ."_1";
+      if (!$spk2gender{$spkr1}) {
+        $spk2gender{$spkr1} = $gender1;
+        print GNDR "$spkr1"," $gender1\n";
+      }
+      print WAV "$uttId"," sph2pipe -f wav -p -c 1 $db_base/data/$wav.sph |\n";
+      print SPKR "$uttId"," $spkr1","\n";
+
+      $uttId = $spkr2 . "-swbdc_" . $wav ."_2";
+      if (!$spk2gender{$spkr2}) {
+        $spk2gender{$spkr2} = $gender2;
+        print GNDR "$spkr2"," $gender2\n";
+      }
+      print WAV "$uttId"," sph2pipe -f wav -p -c 2 $db_base/data/$wav.sph |\n";
+      print SPKR "$uttId"," $spkr2","\n";
+    } else {
+      print STDERR "Missing $db_base/data/$wav.sph\n";
+    }
+  }
+}
+
+close(WAV) || die;
+close(SPKR) || die;
+close(GNDR) || die;
+if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
+  die "Error creating spk2utt file in directory $out_dir";
+}
+if (system("utils/fix_data_dir.sh $out_dir") != 0) {
+  die "Error fixing data dir $out_dir";
+}
+if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
+  die "Error validating directory $out_dir";
+}
diff --git a/egs/callhome/eend_ola/local/model_averaging.py b/egs/callhome/eend_ola/local/model_averaging.py
new file mode 100755
index 0000000..1871cd9
--- /dev/null
+++ b/egs/callhome/eend_ola/local/model_averaging.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+"""Average the parameters of several checkpoints into a single checkpoint."""
+
+import argparse
+
+import torch
+
+
+def average_model(input_files, output_file):
+    """Average same-named entries over checkpoints and save the result.
+
+    Args:
+        input_files: Paths of checkpoints readable by ``torch.load``; each one
+            is expected to hold a mapping from name to tensor.
+        output_file: Path the averaged mapping is written to.
+    """
+    num_models = len(input_files)
+    output_model = {}
+    for ckpt_path in input_files:
+        model_params = torch.load(ckpt_path, map_location="cpu")
+        for key, value in model_params.items():
+            if key in output_model:
+                # Out-of-place add: entries sharing storage (e.g. tied
+                # weights) would otherwise be accumulated more than once.
+                output_model[key] = output_model[key] + value
+            else:
+                output_model[key] = value
+    for key, total in output_model.items():
+        if torch.is_tensor(total) and not (
+            torch.is_floating_point(total) or torch.is_complex(total)
+        ):
+            # True division cannot be stored in an integer tensor, so such
+            # entries keep their dtype and are floor-divided instead.
+            output_model[key] = torch.div(
+                total, num_models, rounding_mode="floor"
+            )
+        else:
+            output_model[key] = total / num_models
+    torch.save(output_model, output_file)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("output_file")
+    parser.add_argument("input_files", nargs='+')
+    args = parser.parse_args()
+
+    average_model(args.input_files, args.output_file)
\ No newline at end of file
diff --git a/egs/callhome/eend_ola/local/parse_options.sh b/egs/callhome/eend_ola/local/parse_options.sh
new file mode 100755
index 0000000..71fb9e5
--- /dev/null
+++ b/egs/callhome/eend_ola/local/parse_options.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+
+# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
+# Arnab Ghoshal, Karel Vesely
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Parse command-line options.
+# To be sourced by another script (as in ". parse_options.sh").
+# Option format is: --option-name arg
+# and shell variable "option_name" gets set to value "arg."
+# The exception is --help, which takes no arguments, but prints the
+# $help_message variable (if defined).
+
+
+###
+### The --config file options have lower priority to command line
+### options, so we need to import them first...
+###
+
+# Now import all the configs specified by command-line, in left-to-right order
+for ((argpos=1; argpos<$#; argpos++)); do # the last argument is never tested: --config needs a value after it
+ if [ "${!argpos}" == "--config" ]; then
+ argpos_plus1=$((argpos+1))
+ config=${!argpos_plus1}
+ [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
+ . $config # source the config file.
+ fi
+done
+
+
+###
+### Now we process the command line options
+###
+while true; do
+ [ -z "${1:-}" ] && break; # break if there are no arguments
+ case "$1" in
+ # If the enclosing script is called with --help option, print the help
+ # message and exit. Scripts should put help messages in $help_message
+ --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
+ else printf "$help_message\n" 1>&2 ; fi;
+ exit 0 ;;
+ --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
+ exit 1 ;;
+ # If the first command-line argument begins with "--" (e.g. --foo-bar),
+ # then work out the variable name as $name, which will equal "foo_bar".
+ --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
+ # Next we test whether the variable in question is undefined-- if so it's
+ # an invalid option and we die. Note: $0 evaluates to the name of the
+ # enclosing script.
+ # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
+ # is undefined. We then have to wrap this test inside "eval" because
+ # foo_bar is itself inside a variable ($name).
+ eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+
+ oldval="`eval echo \\$$name`";
+ # Work out whether we seem to be expecting a Boolean argument.
+ if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
+ was_bool=true;
+ else
+ was_bool=false;
+ fi
+
+ # Set the variable to the right value-- the escaped quotes make it work if
+ # the option had spaces, like --cmd "queue.pl -sync y"
+ eval $name=\"$2\";
+
+ # Check that Boolean-valued arguments are really Boolean.
+ if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+ echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+ exit 1;
+ fi
+ shift 2; # drop the option name and its value
+ ;;
+ *) break;
+ esac
+done
+
+
+# Check for an empty argument to the --cmd option, which can easily occur as a
+# result of scripting errors.
+[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
+
+
+true; # so this script returns exit code 0.
diff --git a/egs/callhome/eend_ola/local/random_mixture.py b/egs/callhome/eend_ola/local/random_mixture.py
new file mode 100755
index 0000000..05d7828
--- /dev/null
+++ b/egs/callhome/eend_ola/local/random_mixture.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+
+# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita)
+# Licensed under the MIT license.
+
+r"""
+This script generates random multi-talker mixtures for diarization.
+It generates scp-like output: lines of "[recid] [json]".
+    recid: recording id of mixture
+        serial numbers like mix_0000001, mix_0000002, ...
+    json: mixture configuration formatted in "one-line"
+The json format is as follows:
+{
+    'speakers':[                     # list of speakers
+        {
+            'spkid': 'Name',         # speaker id
+            'rir': '/rirdir/rir.wav',  # wav_rxfilename of room impulse response
+            'utts': [                # list of wav_rxfilenames of utterances
+                '/wavdir/utt1.wav',
+                '/wavdir/utt2.wav',...],
+            'intervals': [1.2, 3.4, ...]  # list of silence durations before utterances
+        }, ... ],
+    'noise': '/noisedir/noise.wav',  # wav_rxfilename of background noise
+    'snr': 15.0,                     # SNR for mixing background noise
+    'recid': 'mix_0000001'           # recording id of the mixture
+}
+
+Usage:
+    local/random_mixture.py \
+        --n_mixtures=10000 \         # number of mixtures
+        data/voxceleb1_train \       # kaldi-style data dir of utterances
+        data/musan_noise_bg \        # background noises
+        data/simu_rirs \             # room impulse responses
+        > mixture.scp                # output scp-like file
+
+The actual data dir and wav files are generated using make_mixture.py:
+    local/make_mixture.py \
+        mixture.scp \                # scp-like file for mixture
+        data/mixture \               # output data dir
+        wav/mixture                  # output wav dir
+"""
+
+import argparse
+import os
+from funasr.modules.eend_ola.utils import kaldi_data
+import random
+import numpy as np
+import json
+import itertools
+
+parser = argparse.ArgumentParser()
+parser.add_argument('data_dir',
+                    help='data dir of single-speaker recordings')
+parser.add_argument('noise_dir',
+                    help='data dir of background noise recordings')
+parser.add_argument('rir_dir',
+                    help='data dir of room impulse responses')
+parser.add_argument('--n_mixtures', type=int, default=10,
+                    help='number of mixture recordings')
+parser.add_argument('--n_speakers', type=int, default=4,
+                    help='number of speakers in a mixture')
+parser.add_argument('--min_utts', type=int, default=10,
+                    help='minimum number of uttenraces per speaker')
+parser.add_argument('--max_utts', type=int, default=20,
+                    help='maximum number of utterances per speaker')
+parser.add_argument('--sil_scale', type=float, default=10.0,
+                    help='average silence time')
+parser.add_argument('--noise_snrs', default="10:15:20",
+                    help='colon-delimited SNRs for background noises')
+parser.add_argument('--random_seed', type=int, default=777,
+                    help='random seed')
+parser.add_argument('--speech_rvb_probability', type=float, default=1,
+                    help='reverb probability')
+args = parser.parse_args()
+
+# Both generators are used below, so both are seeded from --random_seed.
+random.seed(args.random_seed)
+np.random.seed(args.random_seed)
+
+# load list of wav files from kaldi-style data dirs
+wavs = kaldi_data.load_wav_scp(os.path.join(args.data_dir, 'wav.scp'))
+noises = kaldi_data.load_wav_scp(os.path.join(args.noise_dir, 'wav.scp'))
+rirs = kaldi_data.load_wav_scp(os.path.join(args.rir_dir, 'wav.scp'))
+
+# spk2utt is used for counting number of utterances per speaker
+spk2utt = kaldi_data.load_spk2utt(os.path.join(args.data_dir, 'spk2utt'))
+
+segments = kaldi_data.load_segments_hash(
+    os.path.join(args.data_dir, 'segments'))
+
+# choice lists for random sampling
+all_speakers = list(spk2utt.keys())
+all_noises = list(noises.keys())
+all_rirs = list(rirs.keys())
+noise_snrs = [float(x) for x in args.noise_snrs.split(':')]
+
+for it in range(args.n_mixtures):
+    # recording ids are mix_0000001, mix_0000002, ...
+    recid = 'mix_{:07d}'.format(it + 1)
+    # randomly select speakers, a background noise and a SNR
+    speakers = random.sample(all_speakers, args.n_speakers)
+    noise = random.choice(all_noises)
+    noise_snr = random.choice(noise_snrs)
+    mixture = {'speakers': []}
+    for speaker in speakers:
+        speaker_utts = spk2utt[speaker]
+        # randomly select the number of utterances
+        n_utts = np.random.randint(args.min_utts, args.max_utts + 1)
+        # take n_utts consecutive utterances from a random start position,
+        # wrapping around to the beginning of the speaker's list if needed
+        roll = int(np.random.randint(0, len(speaker_utts)))
+        utts = list(itertools.islice(
+            itertools.cycle(speaker_utts), roll, roll + int(n_utts)))
+        # randomly select wait time before appending utterance
+        intervals = np.random.exponential(args.sil_scale, size=n_utts)
+        # randomly select a room impulse response
+        if random.random() < args.speech_rvb_probability:
+            rir = rirs[random.choice(all_rirs)]
+        else:
+            rir = None
+        if segments is not None:
+            # replace each utterance id by (wav of its recording, st, et)
+            utts = [segments[utt] for utt in utts]
+            utts = [(wavs[rec], st, et) for (rec, st, et) in utts]
+        else:
+            utts = [wavs[utt] for utt in utts]
+        mixture['speakers'].append({
+            'spkid': speaker,
+            'rir': rir,
+            'utts': utts,
+            'intervals': intervals.tolist()
+        })
+    mixture['noise'] = noises[noise]
+    mixture['snr'] = noise_snr
+    mixture['recid'] = recid
+    print(recid, json.dumps(mixture))
diff --git a/egs/callhome/eend_ola/local/run_prepare_shared_eda.sh b/egs/callhome/eend_ola/local/run_prepare_shared_eda.sh
new file mode 100755
index 0000000..f1019d6
--- /dev/null
+++ b/egs/callhome/eend_ola/local/run_prepare_shared_eda.sh
@@ -0,0 +1,235 @@
+#!/bin/bash
+
+# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita, Shota Horiguchi)
+# Licensed under the MIT license.
+#
+# This script prepares kaldi-style data sets shared with different experiments
+# - data/xxxx
+# callhome, sre, swb2, and swb_cellular datasets
+# - data/simu_${simu_outputs}
+# simulation mixtures generated with various options
+
+stage=0
+
+# Modify corpus directories
+# - callhome_dir
+# CALLHOME (LDC2001S97)
+# - swb2_phase1_train
+# Switchboard-2 Phase 1 (LDC98S75)
+# - data_root
+# LDC99S79, LDC2002S06, LDC2001S13, LDC2004S07,
+# LDC2006S44, LDC2011S01, LDC2011S04, LDC2011S09,
+# LDC2011S10, LDC2012S01, LDC2011S05, LDC2011S08
+# - musan_root
+# MUSAN corpus (https://www.openslr.org/17/)
+callhome_dir=
+swb2_phase1_train=
+data_root=
+musan_root=
+# Modify simulated data storage area.
+# This script distributes simulated data under these directories
+simu_actual_dirs=(
+./s05/$USER/diarization-data
+./s08/$USER/diarization-data
+./s09/$USER/diarization-data
+)
+
+# data preparation options
+max_jobs_run=4
+sad_num_jobs=30
+sad_opts="--extra-left-context 79 --extra-right-context 21 --frames-per-chunk 150 --extra-left-context-initial 0 --extra-right-context-final 0 --acwt 0.3"
+sad_graph_opts="--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0"
+sad_priors_opts="--sil-scale=0.1"
+
+# simulation options
+simu_opts_overlap=yes
+simu_opts_num_speaker_array=(1 2 3 4)
+simu_opts_sil_scale_array=(2 2 5 9)
+simu_opts_rvb_prob=0.5
+simu_opts_num_train=100000
+simu_opts_min_utts=10
+simu_opts_max_utts=20
+
+simu_cmd="run.pl"
+train_cmd="run.pl"
+random_mixture_cmd="run.pl"
+make_mixture_cmd="run.pl"
+
+. parse_options.sh || exit
+
+if [ $stage -le 0 ]; then
+ echo "prepare kaldi-style datasets"
+ # Prepare CALLHOME dataset. This will be used for evaluation.
+ if ! validate_data_dir.sh --no-text --no-feats data/callhome1_spkall \
+ || ! validate_data_dir.sh --no-text --no-feats data/callhome2_spkall; then
+ # imported from https://github.com/kaldi-asr/kaldi/blob/master/egs/callhome_diarization/v1
+ local/make_callhome.sh $callhome_dir data
+ # Generate two-speaker subsets
+ for dset in callhome1 callhome2; do
+ # Extract two-speaker recordings in wav.scp
+ copy_data_dir.sh data/${dset} data/${dset}_spkall
+ # Regenerate segments file from fullref.rttm
+ # $2: recid, $4: start_time, $5: duration, $8: speakerid
+ awk '{printf "%s_%s_%07d_%07d %s %.2f %.2f\n", \
+ $2, $8, $4*100, ($4+$5)*100, $2, $4, $4+$5}' \
+ data/callhome/fullref.rttm | sort > data/${dset}_spkall/segments
+ utils/fix_data_dir.sh data/${dset}_spkall
+ # Speaker ID is '[recid]_[speakerid]'
+ awk '{split($1,A,"_"); printf "%s %s_%s\n", $1, A[1], A[2]}' \
+ data/${dset}_spkall/segments > data/${dset}_spkall/utt2spk
+ utils/fix_data_dir.sh data/${dset}_spkall
+ # Generate rttm files for scoring
+ steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \
+ data/${dset}_spkall/utt2spk data/${dset}_spkall/segments \
+ data/${dset}_spkall/rttm
+ utils/data/get_reco2dur.sh data/${dset}_spkall
+ done
+ fi
+ # Prepare a collection of NIST SRE and SWB data. This will be used for training.
+ if ! validate_data_dir.sh --no-text --no-feats data/swb_sre_comb; then
+ local/make_sre.sh $data_root data
+ # Prepare SWB for x-vector DNN training.
+ local/make_swbd2_phase1.pl $swb2_phase1_train \
+ data/swbd2_phase1_train
+ local/make_swbd2_phase2.pl $data_root/LDC99S79 \
+ data/swbd2_phase2_train
+ local/make_swbd2_phase3.pl $data_root/LDC2002S06 \
+ data/swbd2_phase3_train
+ local/make_swbd_cellular1.pl $data_root/LDC2001S13 \
+ data/swbd_cellular1_train
+ local/make_swbd_cellular2.pl $data_root/LDC2004S07 \
+ data/swbd_cellular2_train
+ # Combine swb and sre data
+ utils/combine_data.sh data/swb_sre_comb \
+ data/swbd_cellular1_train data/swbd_cellular2_train \
+ data/swbd2_phase1_train \
+ data/swbd2_phase2_train data/swbd2_phase3_train data/sre
+ fi
+ # MUSAN data: background noise
+ if ! validate_data_dir.sh --no-text --no-feats data/musan_noise_bg; then
+ local/make_musan.sh $musan_root data
+ utils/copy_data_dir.sh data/musan_noise data/musan_noise_bg
+ awk '{if(NR>1) print $1,$1}' $musan_root/noise/free-sound/ANNOTATIONS > data/musan_noise_bg/utt2spk
+ utils/fix_data_dir.sh data/musan_noise_bg
+ fi
+ # simu rirs 8k
+ if ! validate_data_dir.sh --no-text --no-feats data/simu_rirs_8k; then
+ mkdir -p data/simu_rirs_8k
+# if [ ! -e sim_rir_8k.zip ]; then
+# wget --no-check-certificate http://www.openslr.org/resources/26/sim_rir_8k.zip
+# fi
+ unzip sim_rir_8k.zip -d data/sim_rir_8k
+ find $PWD/data/sim_rir_8k -iname "*.wav" \
+ | awk '{n=split($1,A,/[\/\.]/); print A[n-3]"_"A[n-1], $1}' \
+ | sort > data/simu_rirs_8k/wav.scp
+ awk '{print $1, $1}' data/simu_rirs_8k/wav.scp > data/simu_rirs_8k/utt2spk
+ utils/fix_data_dir.sh data/simu_rirs_8k
+ fi
+ # Automatic segmentation using pretrained SAD model
+ # it will take one day using 30 CPU jobs:
+ # make_mfcc: 1 hour, compute_output: 18 hours, decode: 0.5 hours
+ sad_nnet_dir=exp/segmentation_1a/tdnn_stats_asr_sad_1a
+ sad_work_dir=exp/segmentation_1a/tdnn_stats_asr_sad_1a
+ if ! validate_data_dir.sh --no-text $sad_work_dir/swb_sre_comb_seg; then
+ if [ ! -d exp/segmentation_1a ]; then
+# wget http://kaldi-asr.org/models/4/0004_tdnn_stats_asr_sad_1a.tar.gz
+ tar zxf 0004_tdnn_stats_asr_sad_1a.tar.gz
+ fi
+ steps/segmentation/detect_speech_activity.sh \
+ --nj $sad_num_jobs \
+ --graph-opts "$sad_graph_opts" \
+ --transform-probs-opts "$sad_priors_opts" $sad_opts \
+ data/swb_sre_comb $sad_nnet_dir mfcc_hires $sad_work_dir \
+ $sad_work_dir/swb_sre_comb || exit 1
+ fi
+ # Extract >1.5 sec segments and split into train/valid sets
+ if ! validate_data_dir.sh --no-text --no-feats data/swb_sre_cv; then
+ copy_data_dir.sh data/swb_sre_comb data/swb_sre_comb_seg
+ awk '$4-$3>1.5{print;}' $sad_work_dir/swb_sre_comb_seg/segments > data/swb_sre_comb_seg/segments
+ cp $sad_work_dir/swb_sre_comb_seg/{utt2spk,spk2utt} data/swb_sre_comb_seg
+ fix_data_dir.sh data/swb_sre_comb_seg
+ utils/subset_data_dir_tr_cv.sh data/swb_sre_comb_seg data/swb_sre_tr data/swb_sre_cv
+ fi
+fi
+
+simudir=data/simu
+if [ $stage -le 1 ]; then
+ echo "simulation of mixture"
+ mkdir -p $simudir/.work
+ random_mixture_cmd=local/random_mixture.py
+ make_mixture_cmd=local/make_mixture.py
+
+ for ((i=0; i<${#simu_opts_sil_scale_array[@]}; ++i)); do
+ simu_opts_num_speaker=${simu_opts_num_speaker_array[i]}
+ simu_opts_sil_scale=${simu_opts_sil_scale_array[i]}
+ for dset in swb_sre_tr swb_sre_cv; do
+ if [ "$dset" == "swb_sre_tr" ]; then
+ n_mixtures=${simu_opts_num_train}
+ else
+ n_mixtures=500
+ fi
+ simuid=${dset}_ns${simu_opts_num_speaker}_beta${simu_opts_sil_scale}_${n_mixtures}
+ # skip the simulation when a valid data dir for $simuid already exists
+ if ! validate_data_dir.sh --no-text --no-feats $simudir/data/$simuid; then
+ # generate the random mixture recipe; its stdout is redirected (escaped \>) to mixture_$simuid.scp
+ $train_cmd $simudir/.work/random_mixture_$simuid.log \
+ $random_mixture_cmd --n_speakers $simu_opts_num_speaker --n_mixtures $n_mixtures \
+ --speech_rvb_probability $simu_opts_rvb_prob \
+ --sil_scale $simu_opts_sil_scale \
+ data/$dset data/musan_noise_bg data/simu_rirs_8k \
+ \> $simudir/.work/mixture_$simuid.scp
+ nj=64
+ mkdir -p $simudir/wav/$simuid
+ # spread the per-job wav dirs over $simu_actual_dirs (round-robin) and symlink them under $simudir/wav/$simuid
+ split_scps=
+ for n in $(seq $nj); do
+ split_scps="$split_scps $simudir/.work/mixture_$simuid.$n.scp"
+ mkdir -p $simudir/.work/data_$simuid.$n
+ actual=${simu_actual_dirs[($n-1)%${#simu_actual_dirs[@]}]}/$simudir/wav/$simuid/$n
+ mkdir -p $actual
+ ln -nfs $actual $simudir/wav/$simuid/$n
+ done
+ utils/split_scp.pl $simudir/.work/mixture_$simuid.scp $split_scps || exit 1
+
+ $simu_cmd --max-jobs-run 64 JOB=1:$nj $simudir/.work/make_mixture_$simuid.JOB.log \
+ $make_mixture_cmd --rate=8000 \
+ $simudir/.work/mixture_$simuid.JOB.scp \
+ $simudir/.work/data_$simuid.JOB $simudir/wav/$simuid/JOB
+ utils/combine_data.sh $simudir/data/$simuid $simudir/.work/data_$simuid.*
+ steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \
+ $simudir/data/$simuid/utt2spk $simudir/data/$simuid/segments \
+ $simudir/data/$simuid/rttm
+ utils/data/get_reco2dur.sh $simudir/data/$simuid
+ fi
+ simuid_concat=${dset}_ns"$(IFS="n"; echo "${simu_opts_num_speaker_array[*]}")"_beta"$(IFS="n"; echo "${simu_opts_sil_scale_array[*]}")"_${n_mixtures}
+ mkdir -p $simudir/data/$simuid_concat # each per-speaker-count set is appended (>>) below; a re-run appends again
+ for f in `ls -F $simudir/data/$simuid | grep -v "/"`; do
+ cat $simudir/data/$simuid/$f >> $simudir/data/$simuid_concat/$f
+ done
+ done
+ done
+fi
+
+if [ $stage -le 3 ]; then
+ # compose eval/callhome2_spkall: copy the data dir and rttm, then write the audio to wav/eval/callhome2_spkall/<id>.wav
+ eval_set=data/eval/callhome2_spkall
+ if ! validate_data_dir.sh --no-text --no-feats $eval_set; then
+ utils/copy_data_dir.sh data/callhome2_spkall $eval_set
+ cp data/callhome2_spkall/rttm $eval_set/rttm
+ awk -v dstdir=wav/eval/callhome2_spkall '{print $1, dstdir"/"$1".wav"}' data/callhome2_spkall/wav.scp > $eval_set/wav.scp
+ mkdir -p wav/eval/callhome2_spkall
+ wav-copy scp:data/callhome2_spkall/wav.scp scp:$eval_set/wav.scp
+ utils/data/get_reco2dur.sh $eval_set
+ fi
+
+ # compose eval/callhome1_spkall (the adaptation set): same steps as above, audio goes to wav/eval/callhome1_spkall/<id>.wav
+ adapt_set=data/eval/callhome1_spkall
+ if ! validate_data_dir.sh --no-text --no-feats $adapt_set; then
+ utils/copy_data_dir.sh data/callhome1_spkall $adapt_set
+ cp data/callhome1_spkall/rttm $adapt_set/rttm
+ awk -v dstdir=wav/eval/callhome1_spkall '{print $1, dstdir"/"$1".wav"}' data/callhome1_spkall/wav.scp > $adapt_set/wav.scp
+ mkdir -p wav/eval/callhome1_spkall
+ wav-copy scp:data/callhome1_spkall/wav.scp scp:$adapt_set/wav.scp
+ utils/data/get_reco2dur.sh $adapt_set
+ fi
+fi
diff --git a/egs/callhome/eend_ola/local/split.py b/egs/callhome/eend_ola/local/split.py
new file mode 100644
index 0000000..7ad1bad
--- /dev/null
+++ b/egs/callhome/eend_ola/local/split.py
@@ -0,0 +1,82 @@
+"""Split the data files of a directory into per-job pieces.
+
+For every scp split ``<name>.<idx>`` found in ``<root_path>/.work``, write
+``reco2dur.<idx>``, ``spk2utt.<idx>``, ``segments.<idx>`` and ``utt2spk.<idx>``
+next to it, restricted to the recordings listed in that split.
+"""
+import argparse
+import os
+
+# Files this script itself writes into .work; on a re-run they must not be
+# mistaken for input scp splits (their keys are not recording ids).
+OUTPUT_PREFIXES = ('reco2dur.', 'spk2utt.', 'segments.', 'utt2spk.')
+
+
+def read_fields(path):
+    """Return the whitespace-split fields of every non-blank line of ``path``."""
+    with open(path) as f:
+        return [line.split() for line in f if line.strip()]
+
+
+def utt_to_rec(utt):
+    """Derive the recording id from an utterance id.
+
+    Takes the second piece of ``utt.split('data')``, drops its first character
+    and its last two ``_``-separated fields, and prefixes ``data_``.
+    """
+    tail = utt.split('data')[1][1:]
+    return 'data_' + '_'.join(tail.split('_')[:-2])
+
+
+def write_lines(path, lines):
+    """Write ``lines`` separated by newlines, with no trailing newline."""
+    with open(path, 'w') as f:
+        f.write('\n'.join(lines))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('root_path', help='raw data path')
+    args = parser.parse_args()
+
+    root_path = args.root_path
+    work_path = os.path.join(root_path, ".work")
+
+    # recording id -> duration (kept as the original string)
+    reco2dur_dict = {}
+    for parts in read_fields(os.path.join(root_path, 'reco2dur')):
+        reco2dur_dict[parts[0]] = parts[1]
+
+    # recording id -> [(speaker, utterance), ...]
+    spk2utt_dict = {}
+    for parts in read_fields(os.path.join(root_path, 'spk2utt')):
+        for utt in parts[1:]:
+            spk2utt_dict.setdefault(utt_to_rec(utt), []).append((parts[0], utt))
+
+    # recording id -> [(utterance, start, end), ...]
+    segment_dict = {}
+    for parts in read_fields(os.path.join(root_path, 'segments')):
+        segment_dict.setdefault(parts[1], []).append((parts[0], parts[2], parts[3]))
+
+    # recording id -> [(utterance, speaker), ...]
+    utt2spk_dict = {}
+    for parts in read_fields(os.path.join(root_path, 'utt2spk')):
+        utt2spk_dict.setdefault(utt_to_rec(parts[0]), []).append((parts[0], parts[1]))
+
+    for name in sorted(os.listdir(work_path)):
+        scp_file = os.path.join(work_path, name)
+        if name.startswith(OUTPUT_PREFIXES) or not os.path.isfile(scp_file):
+            continue
+        idx = name.split('.')[-1]
+        keys = [parts[0] for parts in read_fields(scp_file)]
+
+        # A recording missing from any table raises KeyError, as before.
+        write_lines(os.path.join(work_path, 'reco2dur.{}'.format(idx)),
+                    [key + ' ' + reco2dur_dict[key] for key in keys])
+        write_lines(os.path.join(work_path, 'spk2utt.{}'.format(idx)),
+                    [' '.join(item) for key in keys for item in spk2utt_dict[key]])
+        write_lines(os.path.join(work_path, 'segments.{}'.format(idx)),
+                    [' '.join((utt, key, start, end))
+                     for key in keys for utt, start, end in segment_dict[key]])
+        write_lines(os.path.join(work_path, 'utt2spk.{}'.format(idx)),
+                    [' '.join(item) for key in keys for item in utt2spk_dict[key]])
diff --git a/egs/callhome/eend_ola/path.sh b/egs/callhome/eend_ola/path.sh
new file mode 100755
index 0000000..e1906b7
--- /dev/null
+++ b/egs/callhome/eend_ola/path.sh
@@ -0,0 +1,13 @@
+export FUNASR_DIR=$PWD/../../..  # three levels above the directory this file is sourced from
+
+# kaldi-related: fill in KALDI_ROOT; sourcing exits with status 1 when $KALDI_ROOT/tools/config/common_path.sh is missing
+export KALDI_ROOT=
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=../../../:$PYTHONPATH
+export PATH=$FUNASR_DIR/funasr/bin:$PATH
diff --git a/egs/callhome/eend_ola/run.sh b/egs/callhome/eend_ola/run.sh
new file mode 100644
index 0000000..aa441bf
--- /dev/null
+++ b/egs/callhome/eend_ola/run.sh
@@ -0,0 +1,324 @@
+#!/usr/bin/env bash
+
+. ./path.sh || exit 1;
+
+# machine configuration: comma-separated GPU ids; one training process is started per id
+CUDA_VISIBLE_DEVICES="0"
+gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+count=1
+
+# general configuration: command wrapper and job count used for feature dumping in stage 0
+dump_cmd=utils/run.pl
+nj=64
+
+# feature configuration: where stage 0 writes the dumped features and which sub-directories the training stages read
+data_dir="./data"
+simu_feats_dir=$data_dir/ark_data/dump/simu_data/data
+simu_feats_dir_chunk2000=$data_dir/ark_data/dump/simu_data_chunk2000/data
+callhome_feats_dir_chunk2000=$data_dir/ark_data/dump/callhome_chunk2000/data
+simu_train_dataset=train
+simu_valid_dataset=dev
+callhome_train_dataset=callhome1_spkall
+callhome_valid_dataset=callhome2_spkall
+
+# model average: first and last epoch of the <N>epoch.pb checkpoints averaged after a training stage
+simu_average_2spkr_start=91
+simu_average_2spkr_end=100
+simu_average_allspkr_start=16
+simu_average_allspkr_end=25
+callhome_average_start=91
+callhome_average_end=100
+
+exp_dir="."
+input_size=345
+stage=1
+stop_stage=5
+
+# exp tag: suffix appended to every model directory name
+tag="exp1"
+
+. local/parse_options.sh || exit 1;
+
+# Make bash strict from here on; it will exit on:
+# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline' (-x is not enabled)
+set -e
+set -u
+set -o pipefail
+
+simu_2spkr_diar_config=conf/train_diar_eend_ola_simu_2spkr.yaml
+simu_allspkr_diar_config=conf/train_diar_eend_ola_simu_allspkr.yaml
+simu_allspkr_chunk2000_diar_config=conf/train_diar_eend_ola_simu_allspkr_chunk2000.yaml
+callhome_diar_config=conf/train_diar_eend_ola_callhome_chunk2000.yaml
+simu_2spkr_model_dir="baseline_$(basename "${simu_2spkr_diar_config}" .yaml)_${tag}"
+simu_allspkr_model_dir="baseline_$(basename "${simu_allspkr_diar_config}" .yaml)_${tag}"
+simu_allspkr_chunk2000_model_dir="baseline_$(basename "${simu_allspkr_chunk2000_diar_config}" .yaml)_${tag}"
+callhome_model_dir="baseline_$(basename "${callhome_diar_config}" .yaml)_${tag}"
+
+# Stage -1: simulate mixture data for training and inference (needs a compiled Kaldi; KALDI_ROOT is taken from path.sh)
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "stage -1: Simulate mixture data for training and inference"
+    echo "The detail can be found in https://github.com/hitachi-speech/EEND"
+    echo "Before running this step, you should download and compile kaldi and set KALDI_ROOT in path.sh"
+    echo "This stage may take a long time, please waiting..."
+    : "${KALDI_ROOT:?set KALDI_ROOT in path.sh}"  # use the value exported by path.sh; resetting it to empty made the links dangle
+    [ -e steps ] || [ -L steps ] || ln -s "$KALDI_ROOT/egs/wsj/s5/steps" steps  # skip existing links so a re-run survives set -e
+    [ -e utils ] || [ -L utils ] || ln -s "$KALDI_ROOT/egs/wsj/s5/utils" utils
+    local/run_prepare_shared_eda.sh
+fi
+
+# Prepare data for training and inference
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ echo "stage 0: Prepare data for training and inference"
+ simu_opts_num_speaker_array=(1 2 3 4)
+ simu_opts_sil_scale_array=(2 2 5 9)
+ simu_opts_num_train=100000
+
+ # simulated data: split the combined data dir into $nj jobs, then dump features for chunk sizes 500 and 2000
+ for dset in swb_sre_cv swb_sre_tr; do
+ if [ "$dset" == "swb_sre_tr" ]; then
+ n_mixtures=${simu_opts_num_train}
+ dataset=train
+ else
+ n_mixtures=500
+ dataset=dev
+ fi
+ simu_data_dir=${dset}_ns"$(IFS="n"; echo "${simu_opts_num_speaker_array[*]}")"_beta"$(IFS="n"; echo "${simu_opts_sil_scale_array[*]}")"_${n_mixtures}
+ mkdir -p ${data_dir}/simu/data/${simu_data_dir}/.work
+ split_scps=
+ for n in $(seq $nj); do
+ split_scps="$split_scps ${data_dir}/simu/data/${simu_data_dir}/.work/wav.scp.$n"
+ done
+ utils/split_scp.pl "${data_dir}/simu/data/${simu_data_dir}/wav.scp" $split_scps || exit 1
+ python local/split.py ${data_dir}/simu/data/${simu_data_dir}
+ # chunk_size=500 (no --num_frames is passed; presumably the default of dump_feature.py -- confirm there)
+ output_dir=${data_dir}/ark_data/dump/simu_data/$dataset
+ mkdir -p $output_dir/.logs
+ $dump_cmd --max-jobs-run $nj JOB=1:$nj $output_dir/.logs/dump.JOB.log \
+ python local/dump_feature.py \
+ --data_dir ${data_dir}/simu/data/${simu_data_dir}/.work \
+ --output_dir $output_dir \
+ --index JOB
+ mkdir -p ${data_dir}/ark_data/dump/simu_data/data/$dataset
+ cat ${data_dir}/ark_data/dump/simu_data/$dataset/feature.scp.* > ${data_dir}/ark_data/dump/simu_data/data/$dataset/feature.scp
+ cat ${data_dir}/ark_data/dump/simu_data/$dataset/label.scp.* > ${data_dir}/ark_data/dump/simu_data/data/$dataset/label.scp
+ paste -d" " ${data_dir}/ark_data/dump/simu_data/data/$dataset/feature.scp <(cut -f2 -d" " ${data_dir}/ark_data/dump/simu_data/data/$dataset/label.scp) > ${data_dir}/ark_data/dump/simu_data/data/$dataset/feats.scp
+ grep "ns2" ${data_dir}/ark_data/dump/simu_data/data/$dataset/feats.scp > ${data_dir}/ark_data/dump/simu_data/data/$dataset/feats_2spkr.scp
+ # chunk_size=2000 (--num_frames 2000)
+ output_dir=${data_dir}/ark_data/dump/simu_data_chunk2000/$dataset
+ mkdir -p $output_dir/.logs
+ $dump_cmd --max-jobs-run $nj JOB=1:$nj $output_dir/.logs/dump.JOB.log \
+ python local/dump_feature.py \
+ --data_dir ${data_dir}/simu/data/${simu_data_dir}/.work \
+ --output_dir $output_dir \
+ --index JOB \
+ --num_frames 2000
+ mkdir -p ${data_dir}/ark_data/dump/simu_data_chunk2000/data/$dataset
+ cat ${data_dir}/ark_data/dump/simu_data_chunk2000/$dataset/feature.scp.* > ${data_dir}/ark_data/dump/simu_data_chunk2000/data/$dataset/feature.scp
+ cat ${data_dir}/ark_data/dump/simu_data_chunk2000/$dataset/label.scp.* > ${data_dir}/ark_data/dump/simu_data_chunk2000/data/$dataset/label.scp
+ paste -d" " ${data_dir}/ark_data/dump/simu_data_chunk2000/data/$dataset/feature.scp <(cut -f2 -d" " ${data_dir}/ark_data/dump/simu_data_chunk2000/data/$dataset/label.scp) > ${data_dir}/ark_data/dump/simu_data_chunk2000/data/$dataset/feats.scp
+ done
+
+ # callhome data: single job; every data file is copied to <name>.1, presumably so dump_feature.py can read it with --index 1
+ for dset in callhome1_spkall callhome2_spkall; do
+ find $data_dir/eval/$dset -maxdepth 1 -type f -exec cp {} {}.1 \;
+ output_dir=${data_dir}/ark_data/dump/callhome_chunk2000/$dset
+ mkdir -p $output_dir
+ python local/dump_feature.py \
+ --data_dir $data_dir/eval/$dset \
+ --output_dir $output_dir \
+ --index 1 \
+ --num_frames 2000
+ mkdir -p ${data_dir}/ark_data/dump/callhome_chunk2000/data/$dset
+ paste -d" " ${data_dir}/ark_data/dump/callhome_chunk2000/$dset/feature.scp.1 <(cut -f2 -d" " ${data_dir}/ark_data/dump/callhome_chunk2000/$dset/label.scp.1) > ${data_dir}/ark_data/dump/callhome_chunk2000/data/$dset/feats.scp
+ done
+fi
+
+# Stage 1: train on simulated two-speaker data (one background train.py per GPU), then average the selected epochs
+world_size=$gpu_num
+simu_2spkr_ave_id=avg${simu_average_2spkr_start}-${simu_average_2spkr_end}
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "stage 1: Training on simulated two-speaker data"
+    mkdir -p ${exp_dir}/exp/${simu_2spkr_model_dir}/log  # -p also creates the model directory itself
+    INIT_FILE=${exp_dir}/exp/${simu_2spkr_model_dir}/ddp_init
+    rm -f "$INIT_FILE"  # drop a stale init file from a previous run (no-op when absent)
+    init_method=file://$(readlink -f $INIT_FILE)
+    echo "$0: init method is $init_method"
+    pids=()
+    for ((i = 0; i < $gpu_num; ++i)); do
+        {
+            rank=$i
+            local_rank=$i
+            gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i + 1)))
+            train.py \
+                --task_name diar \
+                --gpu_id $gpu_id \
+                --use_preprocessor false \
+                --input_size $input_size \
+                --data_dir ${simu_feats_dir} \
+                --train_set ${simu_train_dataset} \
+                --valid_set ${simu_valid_dataset} \
+                --data_file_names "feats_2spkr.scp" \
+                --resume true \
+                --output_dir ${exp_dir}/exp/${simu_2spkr_model_dir} \
+                --config $simu_2spkr_diar_config \
+                --ngpu $gpu_num \
+                --num_worker_count $count \
+                --dist_init_method $init_method \
+                --dist_world_size $world_size \
+                --dist_rank $rank \
+                --local_rank $local_rank 1> ${exp_dir}/exp/${simu_2spkr_model_dir}/log/train.log.$i 2>&1
+        } &
+        pids+=("$!")  # remember each rank's PID so its exit status can be checked
+    done
+    status=0; for pid in "${pids[@]}"; do wait "$pid" || status=1; done  # a bare `wait` always returns 0 and hides failures
+    [ $status -eq 0 ] || { echo "$0: training failed, see ${exp_dir}/exp/${simu_2spkr_model_dir}/log/train.log.*" >&2; exit 1; }
+    echo "averaging model parameters into ${exp_dir}/exp/$simu_2spkr_model_dir/$simu_2spkr_ave_id.pb"
+    models=`eval echo ${exp_dir}/exp/${simu_2spkr_model_dir}/{$simu_average_2spkr_start..$simu_average_2spkr_end}epoch.pb`
+    python local/model_averaging.py ${exp_dir}/exp/${simu_2spkr_model_dir}/$simu_2spkr_ave_id.pb $models
+fi
+
+# Stage 2: train on simulated all-speaker data, initialised from the stage-1 averaged model, then average the selected epochs
+world_size=$gpu_num
+simu_allspkr_ave_id=avg${simu_average_allspkr_start}-${simu_average_allspkr_end}
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "stage 2: Training on simulated all-speaker data"
+    mkdir -p ${exp_dir}/exp/${simu_allspkr_model_dir}/log  # -p also creates the model directory itself
+    INIT_FILE=${exp_dir}/exp/${simu_allspkr_model_dir}/ddp_init
+    rm -f "$INIT_FILE"  # drop a stale init file from a previous run (no-op when absent)
+    init_method=file://$(readlink -f $INIT_FILE)
+    echo "$0: init method is $init_method"
+    pids=()
+    for ((i = 0; i < $gpu_num; ++i)); do
+        {
+            rank=$i
+            local_rank=$i
+            gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i + 1)))
+            train.py \
+                --task_name diar \
+                --gpu_id $gpu_id \
+                --use_preprocessor false \
+                --input_size $input_size \
+                --data_dir ${simu_feats_dir} \
+                --train_set ${simu_train_dataset} \
+                --valid_set ${simu_valid_dataset} \
+                --data_file_names "feats.scp" \
+                --resume true \
+                --init_param ${exp_dir}/exp/${simu_2spkr_model_dir}/$simu_2spkr_ave_id.pb \
+                --output_dir ${exp_dir}/exp/${simu_allspkr_model_dir} \
+                --config $simu_allspkr_diar_config \
+                --ngpu $gpu_num \
+                --num_worker_count $count \
+                --dist_init_method $init_method \
+                --dist_world_size $world_size \
+                --dist_rank $rank \
+                --local_rank $local_rank 1> ${exp_dir}/exp/${simu_allspkr_model_dir}/log/train.log.$i 2>&1
+        } &
+        pids+=("$!")  # remember each rank's PID so its exit status can be checked
+    done
+    status=0; for pid in "${pids[@]}"; do wait "$pid" || status=1; done  # a bare `wait` always returns 0 and hides failures
+    [ $status -eq 0 ] || { echo "$0: training failed, see ${exp_dir}/exp/${simu_allspkr_model_dir}/log/train.log.*" >&2; exit 1; }
+    echo "averaging model parameters into ${exp_dir}/exp/$simu_allspkr_model_dir/$simu_allspkr_ave_id.pb"
+    models=`eval echo ${exp_dir}/exp/${simu_allspkr_model_dir}/{$simu_average_allspkr_start..$simu_average_allspkr_end}epoch.pb`
+    python local/model_averaging.py ${exp_dir}/exp/${simu_allspkr_model_dir}/$simu_allspkr_ave_id.pb $models
+fi
+
+# Stage 3: train on simulated all-speaker data with chunk_size 2000, initialised from the stage-2 averaged model (no averaging here)
+world_size=$gpu_num
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "stage 3: Training on simulated all-speaker data with chunk_size 2000"
+    mkdir -p ${exp_dir}/exp/${simu_allspkr_chunk2000_model_dir}/log  # -p also creates the model directory itself
+    INIT_FILE=${exp_dir}/exp/${simu_allspkr_chunk2000_model_dir}/ddp_init
+    rm -f "$INIT_FILE"  # drop a stale init file from a previous run (no-op when absent)
+    init_method=file://$(readlink -f $INIT_FILE)
+    echo "$0: init method is $init_method"
+    pids=()
+    for ((i = 0; i < $gpu_num; ++i)); do
+        {
+            rank=$i
+            local_rank=$i
+            gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i + 1)))
+            train.py \
+                --task_name diar \
+                --gpu_id $gpu_id \
+                --use_preprocessor false \
+                --input_size $input_size \
+                --data_dir ${simu_feats_dir_chunk2000} \
+                --train_set ${simu_train_dataset} \
+                --valid_set ${simu_valid_dataset} \
+                --data_file_names "feats.scp" \
+                --resume true \
+                --init_param ${exp_dir}/exp/${simu_allspkr_model_dir}/$simu_allspkr_ave_id.pb \
+                --output_dir ${exp_dir}/exp/${simu_allspkr_chunk2000_model_dir} \
+                --config $simu_allspkr_chunk2000_diar_config \
+                --ngpu $gpu_num \
+                --num_worker_count $count \
+                --dist_init_method $init_method \
+                --dist_world_size $world_size \
+                --dist_rank $rank \
+                --local_rank $local_rank 1> ${exp_dir}/exp/${simu_allspkr_chunk2000_model_dir}/log/train.log.$i 2>&1
+        } &
+        pids+=("$!")  # remember each rank's PID so its exit status can be checked
+    done
+    status=0; for pid in "${pids[@]}"; do wait "$pid" || status=1; done  # a bare `wait` always returns 0 and hides failures
+    [ $status -eq 0 ] || { echo "$0: training failed, see ${exp_dir}/exp/${simu_allspkr_chunk2000_model_dir}/log/train.log.*" >&2; exit 1; }
+fi
+
+# Stage 4: train on callhome all-speaker data with chunk_size 2000, initialised from stage 3's 1epoch.pb, then average the selected epochs
+world_size=$gpu_num
+callhome_ave_id=avg${callhome_average_start}-${callhome_average_end}
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "stage 4: Training on callhome all-speaker data with chunk_size 2000"
+    mkdir -p ${exp_dir}/exp/${callhome_model_dir}/log  # -p also creates the model directory itself
+    INIT_FILE=${exp_dir}/exp/${callhome_model_dir}/ddp_init
+    rm -f "$INIT_FILE"  # drop a stale init file from a previous run (no-op when absent)
+    init_method=file://$(readlink -f $INIT_FILE)
+    echo "$0: init method is $init_method"
+    pids=()
+    for ((i = 0; i < $gpu_num; ++i)); do
+        {
+            rank=$i
+            local_rank=$i
+            gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i + 1)))
+            train.py \
+                --task_name diar \
+                --gpu_id $gpu_id \
+                --use_preprocessor false \
+                --input_size $input_size \
+                --data_dir ${callhome_feats_dir_chunk2000} \
+                --train_set ${callhome_train_dataset} \
+                --valid_set ${callhome_valid_dataset} \
+                --data_file_names "feats.scp" \
+                --resume true \
+                --init_param ${exp_dir}/exp/${simu_allspkr_chunk2000_model_dir}/1epoch.pb \
+                --output_dir ${exp_dir}/exp/${callhome_model_dir} \
+                --config $callhome_diar_config \
+                --ngpu $gpu_num \
+                --num_worker_count $count \
+                --dist_init_method $init_method \
+                --dist_world_size $world_size \
+                --dist_rank $rank \
+                --local_rank $local_rank 1> ${exp_dir}/exp/${callhome_model_dir}/log/train.log.$i 2>&1
+        } &
+        pids+=("$!")  # remember each rank's PID so its exit status can be checked
+    done
+    status=0; for pid in "${pids[@]}"; do wait "$pid" || status=1; done  # a bare `wait` always returns 0 and hides failures
+    [ $status -eq 0 ] || { echo "$0: training failed, see ${exp_dir}/exp/${callhome_model_dir}/log/train.log.*" >&2; exit 1; }
+    echo "averaging model parameters into ${exp_dir}/exp/$callhome_model_dir/$callhome_ave_id.pb"
+    models=`eval echo ${exp_dir}/exp/${callhome_model_dir}/{$callhome_average_start..$callhome_average_end}epoch.pb`
+    python local/model_averaging.py ${exp_dir}/exp/${callhome_model_dir}/$callhome_ave_id.pb $models
+fi
+
+# inference and compute DER: run local/infer.py with the averaged callhome model on callhome2_spkall, then score its rttm against the reference with md-eval.pl
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+ echo "Inference"
+ mkdir -p ${exp_dir}/exp/${callhome_model_dir}/inference/log
+ CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES python local/infer.py \
+ --config_file ${exp_dir}/exp/${callhome_model_dir}/config.yaml \
+ --model_file ${exp_dir}/exp/${callhome_model_dir}/$callhome_ave_id.pb \
+ --output_rttm_file ${exp_dir}/exp/${callhome_model_dir}/inference/rttm \
+ --wav_scp_file $data_dir/eval/callhome2_spkall/wav.scp \
+ 1> ${exp_dir}/exp/${callhome_model_dir}/inference/log/infer.log 2>&1
+ md-eval.pl -c 0.25 \
+ -r ${data_dir}/eval/${callhome_valid_dataset}/rttm \
+ -s ${exp_dir}/exp/${callhome_model_dir}/inference/rttm > ${exp_dir}/exp/${callhome_model_dir}/inference/result_med11_collar0.25 2>/dev/null || exit
+fi
\ No newline at end of file
diff --git a/egs/callhome/sond/sond.yaml b/egs/callhome/sond/sond.yaml
new file mode 100644
index 0000000..868163f
--- /dev/null
+++ b/egs/callhome/sond/sond.yaml
@@ -0,0 +1,2739 @@
+config: finetune.yaml
+print_config: false
+log_level: INFO
+dry_run: false
+iterator_type: sequence
+output_dir: exp/sond
+ngpu: 1
+seed: 0
+num_workers: 16
+num_att_plot: 0
+dist_backend: nccl
+dist_init_method: env://
+dist_world_size: null
+dist_rank: null
+local_rank: 0
+dist_master_addr: null
+dist_master_port: null
+dist_launcher: null
+multiprocessing_distributed: true
+distributed: false
+unused_parameters: true
+sharded_ddp: false
+ddp_backend: pytorch_ddp
+cudnn_enabled: true
+cudnn_benchmark: false
+cudnn_deterministic: true
+collect_stats: false
+write_collected_feats: false
+max_epoch: 50
+patience: null
+val_scheduler_criterion:
+- valid
+- acc
+early_stopping_criterion:
+- valid
+- loss
+- min
+best_model_criterion:
+- - valid
+ - acc
+ - max
+keep_nbest_models: 10
+nbest_averaging_interval: 0
+grad_clip: 5
+grad_clip_type: 2.0
+grad_noise: false
+accum_grad: 1
+no_forward_run: false
+resume: true
+train_dtype: float32
+use_amp: false
+log_interval: 50
+use_matplotlib: false
+use_tensorboard: true
+use_wandb: false
+wandb_project: null
+wandb_id: null
+wandb_entity: null
+wandb_name: null
+wandb_model_log_interval: -1
+use_pai: true
+detect_anomaly: false
+pretrain_path: null
+init_param: []
+ignore_init_mismatch: false
+freeze_param: []
+num_iters_per_epoch: null
+batch_size: 20
+valid_batch_size: null
+batch_bins: 10000
+valid_batch_bins: null
+train_shape_file:
+- /data/volume1/youyan/aishell/ark/train/speech_shape.1
+- /data/volume1/youyan/aishell/ark/train/text_shape.1
+valid_shape_file:
+- /data/volume1/youyan/aishell/ark/dev/speech_shape.1
+- /data/volume1/youyan/aishell/ark/dev/text_shape.1
+batch_type: length
+valid_batch_type: null
+fold_length:
+- 512
+- 150
+sort_in_batch: descending
+sort_batch: descending
+multiple_iterator: false
+chunk_length: 500
+chunk_shift_ratio: 0.5
+num_cache_chunks: 1024
+train_data_path_and_name_and_type:
+- - /data/volume1/youyan/aishell/ark/train/data.scp
+ - speech
+ - kaldi_ark
+- - /data/volume1/youyan/aishell/ark/train/data.text.1
+ - text
+ - text
+valid_data_path_and_name_and_type:
+- - /data/volume1/youyan/aishell/ark/dev/data.scp
+ - speech
+ - kaldi_ark
+- - /data/volume1/youyan/aishell/ark/dev/data.text.1
+ - text
+ - text
+allow_variable_data_keys: false
+max_cache_size: 0.0
+max_cache_fd: 32
+valid_max_cache_size: null
+optim: adam
+optim_conf:
+ lr: 0.0005
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 30000
+token_list:
+- '0'
+- '1'
+- '2'
+- '3'
+- '4'
+- '5'
+- '6'
+- '7'
+- '8'
+- '9'
+- '10'
+- '11'
+- '12'
+- '13'
+- '14'
+- '15'
+- '16'
+- '17'
+- '18'
+- '19'
+- '20'
+- '21'
+- '22'
+- '23'
+- '24'
+- '25'
+- '26'
+- '27'
+- '28'
+- '29'
+- '30'
+- '32'
+- '33'
+- '34'
+- '35'
+- '36'
+- '37'
+- '38'
+- '39'
+- '40'
+- '41'
+- '42'
+- '43'
+- '44'
+- '45'
+- '46'
+- '48'
+- '49'
+- '50'
+- '51'
+- '52'
+- '53'
+- '54'
+- '56'
+- '57'
+- '58'
+- '60'
+- '64'
+- '65'
+- '66'
+- '67'
+- '68'
+- '69'
+- '70'
+- '71'
+- '72'
+- '73'
+- '74'
+- '75'
+- '76'
+- '77'
+- '78'
+- '80'
+- '81'
+- '82'
+- '83'
+- '84'
+- '85'
+- '86'
+- '88'
+- '89'
+- '90'
+- '92'
+- '96'
+- '97'
+- '98'
+- '99'
+- '100'
+- '101'
+- '102'
+- '104'
+- '105'
+- '106'
+- '108'
+- '112'
+- '113'
+- '114'
+- '116'
+- '120'
+- '128'
+- '129'
+- '130'
+- '131'
+- '132'
+- '133'
+- '134'
+- '135'
+- '136'
+- '137'
+- '138'
+- '139'
+- '140'
+- '141'
+- '142'
+- '144'
+- '145'
+- '146'
+- '147'
+- '148'
+- '149'
+- '150'
+- '152'
+- '153'
+- '154'
+- '156'
+- '160'
+- '161'
+- '162'
+- '163'
+- '164'
+- '165'
+- '166'
+- '168'
+- '169'
+- '170'
+- '172'
+- '176'
+- '177'
+- '178'
+- '180'
+- '184'
+- '192'
+- '193'
+- '194'
+- '195'
+- '196'
+- '197'
+- '198'
+- '200'
+- '201'
+- '202'
+- '204'
+- '208'
+- '209'
+- '210'
+- '212'
+- '216'
+- '224'
+- '225'
+- '226'
+- '228'
+- '232'
+- '240'
+- '256'
+- '257'
+- '258'
+- '259'
+- '260'
+- '261'
+- '262'
+- '263'
+- '264'
+- '265'
+- '266'
+- '267'
+- '268'
+- '269'
+- '270'
+- '272'
+- '273'
+- '274'
+- '275'
+- '276'
+- '277'
+- '278'
+- '280'
+- '281'
+- '282'
+- '284'
+- '288'
+- '289'
+- '290'
+- '291'
+- '292'
+- '293'
+- '294'
+- '296'
+- '297'
+- '298'
+- '300'
+- '304'
+- '305'
+- '306'
+- '308'
+- '312'
+- '320'
+- '321'
+- '322'
+- '323'
+- '324'
+- '325'
+- '326'
+- '328'
+- '329'
+- '330'
+- '332'
+- '336'
+- '337'
+- '338'
+- '340'
+- '344'
+- '352'
+- '353'
+- '354'
+- '356'
+- '360'
+- '368'
+- '384'
+- '385'
+- '386'
+- '387'
+- '388'
+- '389'
+- '390'
+- '392'
+- '393'
+- '394'
+- '396'
+- '400'
+- '401'
+- '402'
+- '404'
+- '408'
+- '416'
+- '417'
+- '418'
+- '420'
+- '424'
+- '432'
+- '448'
+- '449'
+- '450'
+- '452'
+- '456'
+- '464'
+- '480'
+- '512'
+- '513'
+- '514'
+- '515'
+- '516'
+- '517'
+- '518'
+- '519'
+- '520'
+- '521'
+- '522'
+- '523'
+- '524'
+- '525'
+- '526'
+- '528'
+- '529'
+- '530'
+- '531'
+- '532'
+- '533'
+- '534'
+- '536'
+- '537'
+- '538'
+- '540'
+- '544'
+- '545'
+- '546'
+- '547'
+- '548'
+- '549'
+- '550'
+- '552'
+- '553'
+- '554'
+- '556'
+- '560'
+- '561'
+- '562'
+- '564'
+- '568'
+- '576'
+- '577'
+- '578'
+- '579'
+- '580'
+- '581'
+- '582'
+- '584'
+- '585'
+- '586'
+- '588'
+- '592'
+- '593'
+- '594'
+- '596'
+- '600'
+- '608'
+- '609'
+- '610'
+- '612'
+- '616'
+- '624'
+- '640'
+- '641'
+- '642'
+- '643'
+- '644'
+- '645'
+- '646'
+- '648'
+- '649'
+- '650'
+- '652'
+- '656'
+- '657'
+- '658'
+- '660'
+- '664'
+- '672'
+- '673'
+- '674'
+- '676'
+- '680'
+- '688'
+- '704'
+- '705'
+- '706'
+- '708'
+- '712'
+- '720'
+- '736'
+- '768'
+- '769'
+- '770'
+- '771'
+- '772'
+- '773'
+- '774'
+- '776'
+- '777'
+- '778'
+- '780'
+- '784'
+- '785'
+- '786'
+- '788'
+- '792'
+- '800'
+- '801'
+- '802'
+- '804'
+- '808'
+- '816'
+- '832'
+- '833'
+- '834'
+- '836'
+- '840'
+- '848'
+- '864'
+- '896'
+- '897'
+- '898'
+- '900'
+- '904'
+- '912'
+- '928'
+- '960'
+- '1024'
+- '1025'
+- '1026'
+- '1027'
+- '1028'
+- '1029'
+- '1030'
+- '1031'
+- '1032'
+- '1033'
+- '1034'
+- '1035'
+- '1036'
+- '1037'
+- '1038'
+- '1040'
+- '1041'
+- '1042'
+- '1043'
+- '1044'
+- '1045'
+- '1046'
+- '1048'
+- '1049'
+- '1050'
+- '1052'
+- '1056'
+- '1057'
+- '1058'
+- '1059'
+- '1060'
+- '1061'
+- '1062'
+- '1064'
+- '1065'
+- '1066'
+- '1068'
+- '1072'
+- '1073'
+- '1074'
+- '1076'
+- '1080'
+- '1088'
+- '1089'
+- '1090'
+- '1091'
+- '1092'
+- '1093'
+- '1094'
+- '1096'
+- '1097'
+- '1098'
+- '1100'
+- '1104'
+- '1105'
+- '1106'
+- '1108'
+- '1112'
+- '1120'
+- '1121'
+- '1122'
+- '1124'
+- '1128'
+- '1136'
+- '1152'
+- '1153'
+- '1154'
+- '1155'
+- '1156'
+- '1157'
+- '1158'
+- '1160'
+- '1161'
+- '1162'
+- '1164'
+- '1168'
+- '1169'
+- '1170'
+- '1172'
+- '1176'
+- '1184'
+- '1185'
+- '1186'
+- '1188'
+- '1192'
+- '1200'
+- '1216'
+- '1217'
+- '1218'
+- '1220'
+- '1224'
+- '1232'
+- '1248'
+- '1280'
+- '1281'
+- '1282'
+- '1283'
+- '1284'
+- '1285'
+- '1286'
+- '1288'
+- '1289'
+- '1290'
+- '1292'
+- '1296'
+- '1297'
+- '1298'
+- '1300'
+- '1304'
+- '1312'
+- '1313'
+- '1314'
+- '1316'
+- '1320'
+- '1328'
+- '1344'
+- '1345'
+- '1346'
+- '1348'
+- '1352'
+- '1360'
+- '1376'
+- '1408'
+- '1409'
+- '1410'
+- '1412'
+- '1416'
+- '1424'
+- '1440'
+- '1472'
+- '1536'
+- '1537'
+- '1538'
+- '1539'
+- '1540'
+- '1541'
+- '1542'
+- '1544'
+- '1545'
+- '1546'
+- '1548'
+- '1552'
+- '1553'
+- '1554'
+- '1556'
+- '1560'
+- '1568'
+- '1569'
+- '1570'
+- '1572'
+- '1576'
+- '1584'
+- '1600'
+- '1601'
+- '1602'
+- '1604'
+- '1608'
+- '1616'
+- '1632'
+- '1664'
+- '1665'
+- '1666'
+- '1668'
+- '1672'
+- '1680'
+- '1696'
+- '1728'
+- '1792'
+- '1793'
+- '1794'
+- '1796'
+- '1800'
+- '1808'
+- '1824'
+- '1856'
+- '1920'
+- '2048'
+- '2049'
+- '2050'
+- '2051'
+- '2052'
+- '2053'
+- '2054'
+- '2055'
+- '2056'
+- '2057'
+- '2058'
+- '2059'
+- '2060'
+- '2061'
+- '2062'
+- '2064'
+- '2065'
+- '2066'
+- '2067'
+- '2068'
+- '2069'
+- '2070'
+- '2072'
+- '2073'
+- '2074'
+- '2076'
+- '2080'
+- '2081'
+- '2082'
+- '2083'
+- '2084'
+- '2085'
+- '2086'
+- '2088'
+- '2089'
+- '2090'
+- '2092'
+- '2096'
+- '2097'
+- '2098'
+- '2100'
+- '2104'
+- '2112'
+- '2113'
+- '2114'
+- '2115'
+- '2116'
+- '2117'
+- '2118'
+- '2120'
+- '2121'
+- '2122'
+- '2124'
+- '2128'
+- '2129'
+- '2130'
+- '2132'
+- '2136'
+- '2144'
+- '2145'
+- '2146'
+- '2148'
+- '2152'
+- '2160'
+- '2176'
+- '2177'
+- '2178'
+- '2179'
+- '2180'
+- '2181'
+- '2182'
+- '2184'
+- '2185'
+- '2186'
+- '2188'
+- '2192'
+- '2193'
+- '2194'
+- '2196'
+- '2200'
+- '2208'
+- '2209'
+- '2210'
+- '2212'
+- '2216'
+- '2224'
+- '2240'
+- '2241'
+- '2242'
+- '2244'
+- '2248'
+- '2256'
+- '2272'
+- '2304'
+- '2305'
+- '2306'
+- '2307'
+- '2308'
+- '2309'
+- '2310'
+- '2312'
+- '2313'
+- '2314'
+- '2316'
+- '2320'
+- '2321'
+- '2322'
+- '2324'
+- '2328'
+- '2336'
+- '2337'
+- '2338'
+- '2340'
+- '2344'
+- '2352'
+- '2368'
+- '2369'
+- '2370'
+- '2372'
+- '2376'
+- '2384'
+- '2400'
+- '2432'
+- '2433'
+- '2434'
+- '2436'
+- '2440'
+- '2448'
+- '2464'
+- '2496'
+- '2560'
+- '2561'
+- '2562'
+- '2563'
+- '2564'
+- '2565'
+- '2566'
+- '2568'
+- '2569'
+- '2570'
+- '2572'
+- '2576'
+- '2577'
+- '2578'
+- '2580'
+- '2584'
+- '2592'
+- '2593'
+- '2594'
+- '2596'
+- '2600'
+- '2608'
+- '2624'
+- '2625'
+- '2626'
+- '2628'
+- '2632'
+- '2640'
+- '2656'
+- '2688'
+- '2689'
+- '2690'
+- '2692'
+- '2696'
+- '2704'
+- '2720'
+- '2752'
+- '2816'
+- '2817'
+- '2818'
+- '2820'
+- '2824'
+- '2832'
+- '2848'
+- '2880'
+- '2944'
+- '3072'
+- '3073'
+- '3074'
+- '3075'
+- '3076'
+- '3077'
+- '3078'
+- '3080'
+- '3081'
+- '3082'
+- '3084'
+- '3088'
+- '3089'
+- '3090'
+- '3092'
+- '3096'
+- '3104'
+- '3105'
+- '3106'
+- '3108'
+- '3112'
+- '3120'
+- '3136'
+- '3137'
+- '3138'
+- '3140'
+- '3144'
+- '3152'
+- '3168'
+- '3200'
+- '3201'
+- '3202'
+- '3204'
+- '3208'
+- '3216'
+- '3232'
+- '3264'
+- '3328'
+- '3329'
+- '3330'
+- '3332'
+- '3336'
+- '3344'
+- '3360'
+- '3392'
+- '3456'
+- '3584'
+- '3585'
+- '3586'
+- '3588'
+- '3592'
+- '3600'
+- '3616'
+- '3648'
+- '3712'
+- '3840'
+- '4096'
+- '4097'
+- '4098'
+- '4099'
+- '4100'
+- '4101'
+- '4102'
+- '4103'
+- '4104'
+- '4105'
+- '4106'
+- '4107'
+- '4108'
+- '4109'
+- '4110'
+- '4112'
+- '4113'
+- '4114'
+- '4115'
+- '4116'
+- '4117'
+- '4118'
+- '4120'
+- '4121'
+- '4122'
+- '4124'
+- '4128'
+- '4129'
+- '4130'
+- '4131'
+- '4132'
+- '4133'
+- '4134'
+- '4136'
+- '4137'
+- '4138'
+- '4140'
+- '4144'
+- '4145'
+- '4146'
+- '4148'
+- '4152'
+- '4160'
+- '4161'
+- '4162'
+- '4163'
+- '4164'
+- '4165'
+- '4166'
+- '4168'
+- '4169'
+- '4170'
+- '4172'
+- '4176'
+- '4177'
+- '4178'
+- '4180'
+- '4184'
+- '4192'
+- '4193'
+- '4194'
+- '4196'
+- '4200'
+- '4208'
+- '4224'
+- '4225'
+- '4226'
+- '4227'
+- '4228'
+- '4229'
+- '4230'
+- '4232'
+- '4233'
+- '4234'
+- '4236'
+- '4240'
+- '4241'
+- '4242'
+- '4244'
+- '4248'
+- '4256'
+- '4257'
+- '4258'
+- '4260'
+- '4264'
+- '4272'
+- '4288'
+- '4289'
+- '4290'
+- '4292'
+- '4296'
+- '4304'
+- '4320'
+- '4352'
+- '4353'
+- '4354'
+- '4355'
+- '4356'
+- '4357'
+- '4358'
+- '4360'
+- '4361'
+- '4362'
+- '4364'
+- '4368'
+- '4369'
+- '4370'
+- '4372'
+- '4376'
+- '4384'
+- '4385'
+- '4386'
+- '4388'
+- '4392'
+- '4400'
+- '4416'
+- '4417'
+- '4418'
+- '4420'
+- '4424'
+- '4432'
+- '4448'
+- '4480'
+- '4481'
+- '4482'
+- '4484'
+- '4488'
+- '4496'
+- '4512'
+- '4544'
+- '4608'
+- '4609'
+- '4610'
+- '4611'
+- '4612'
+- '4613'
+- '4614'
+- '4616'
+- '4617'
+- '4618'
+- '4620'
+- '4624'
+- '4625'
+- '4626'
+- '4628'
+- '4632'
+- '4640'
+- '4641'
+- '4642'
+- '4644'
+- '4648'
+- '4656'
+- '4672'
+- '4673'
+- '4674'
+- '4676'
+- '4680'
+- '4688'
+- '4704'
+- '4736'
+- '4737'
+- '4738'
+- '4740'
+- '4744'
+- '4752'
+- '4768'
+- '4800'
+- '4864'
+- '4865'
+- '4866'
+- '4868'
+- '4872'
+- '4880'
+- '4896'
+- '4928'
+- '4992'
+- '5120'
+- '5121'
+- '5122'
+- '5123'
+- '5124'
+- '5125'
+- '5126'
+- '5128'
+- '5129'
+- '5130'
+- '5132'
+- '5136'
+- '5137'
+- '5138'
+- '5140'
+- '5144'
+- '5152'
+- '5153'
+- '5154'
+- '5156'
+- '5160'
+- '5168'
+- '5184'
+- '5185'
+- '5186'
+- '5188'
+- '5192'
+- '5200'
+- '5216'
+- '5248'
+- '5249'
+- '5250'
+- '5252'
+- '5256'
+- '5264'
+- '5280'
+- '5312'
+- '5376'
+- '5377'
+- '5378'
+- '5380'
+- '5384'
+- '5392'
+- '5408'
+- '5440'
+- '5504'
+- '5632'
+- '5633'
+- '5634'
+- '5636'
+- '5640'
+- '5648'
+- '5664'
+- '5696'
+- '5760'
+- '5888'
+- '6144'
+- '6145'
+- '6146'
+- '6147'
+- '6148'
+- '6149'
+- '6150'
+- '6152'
+- '6153'
+- '6154'
+- '6156'
+- '6160'
+- '6161'
+- '6162'
+- '6164'
+- '6168'
+- '6176'
+- '6177'
+- '6178'
+- '6180'
+- '6184'
+- '6192'
+- '6208'
+- '6209'
+- '6210'
+- '6212'
+- '6216'
+- '6224'
+- '6240'
+- '6272'
+- '6273'
+- '6274'
+- '6276'
+- '6280'
+- '6288'
+- '6304'
+- '6336'
+- '6400'
+- '6401'
+- '6402'
+- '6404'
+- '6408'
+- '6416'
+- '6432'
+- '6464'
+- '6528'
+- '6656'
+- '6657'
+- '6658'
+- '6660'
+- '6664'
+- '6672'
+- '6688'
+- '6720'
+- '6784'
+- '6912'
+- '7168'
+- '7169'
+- '7170'
+- '7172'
+- '7176'
+- '7184'
+- '7200'
+- '7232'
+- '7296'
+- '7424'
+- '7680'
+- '8192'
+- '8193'
+- '8194'
+- '8195'
+- '8196'
+- '8197'
+- '8198'
+- '8199'
+- '8200'
+- '8201'
+- '8202'
+- '8203'
+- '8204'
+- '8205'
+- '8206'
+- '8208'
+- '8209'
+- '8210'
+- '8211'
+- '8212'
+- '8213'
+- '8214'
+- '8216'
+- '8217'
+- '8218'
+- '8220'
+- '8224'
+- '8225'
+- '8226'
+- '8227'
+- '8228'
+- '8229'
+- '8230'
+- '8232'
+- '8233'
+- '8234'
+- '8236'
+- '8240'
+- '8241'
+- '8242'
+- '8244'
+- '8248'
+- '8256'
+- '8257'
+- '8258'
+- '8259'
+- '8260'
+- '8261'
+- '8262'
+- '8264'
+- '8265'
+- '8266'
+- '8268'
+- '8272'
+- '8273'
+- '8274'
+- '8276'
+- '8280'
+- '8288'
+- '8289'
+- '8290'
+- '8292'
+- '8296'
+- '8304'
+- '8320'
+- '8321'
+- '8322'
+- '8323'
+- '8324'
+- '8325'
+- '8326'
+- '8328'
+- '8329'
+- '8330'
+- '8332'
+- '8336'
+- '8337'
+- '8338'
+- '8340'
+- '8344'
+- '8352'
+- '8353'
+- '8354'
+- '8356'
+- '8360'
+- '8368'
+- '8384'
+- '8385'
+- '8386'
+- '8388'
+- '8392'
+- '8400'
+- '8416'
+- '8448'
+- '8449'
+- '8450'
+- '8451'
+- '8452'
+- '8453'
+- '8454'
+- '8456'
+- '8457'
+- '8458'
+- '8460'
+- '8464'
+- '8465'
+- '8466'
+- '8468'
+- '8472'
+- '8480'
+- '8481'
+- '8482'
+- '8484'
+- '8488'
+- '8496'
+- '8512'
+- '8513'
+- '8514'
+- '8516'
+- '8520'
+- '8528'
+- '8544'
+- '8576'
+- '8577'
+- '8578'
+- '8580'
+- '8584'
+- '8592'
+- '8608'
+- '8640'
+- '8704'
+- '8705'
+- '8706'
+- '8707'
+- '8708'
+- '8709'
+- '8710'
+- '8712'
+- '8713'
+- '8714'
+- '8716'
+- '8720'
+- '8721'
+- '8722'
+- '8724'
+- '8728'
+- '8736'
+- '8737'
+- '8738'
+- '8740'
+- '8744'
+- '8752'
+- '8768'
+- '8769'
+- '8770'
+- '8772'
+- '8776'
+- '8784'
+- '8800'
+- '8832'
+- '8833'
+- '8834'
+- '8836'
+- '8840'
+- '8848'
+- '8864'
+- '8896'
+- '8960'
+- '8961'
+- '8962'
+- '8964'
+- '8968'
+- '8976'
+- '8992'
+- '9024'
+- '9088'
+- '9216'
+- '9217'
+- '9218'
+- '9219'
+- '9220'
+- '9221'
+- '9222'
+- '9224'
+- '9225'
+- '9226'
+- '9228'
+- '9232'
+- '9233'
+- '9234'
+- '9236'
+- '9240'
+- '9248'
+- '9249'
+- '9250'
+- '9252'
+- '9256'
+- '9264'
+- '9280'
+- '9281'
+- '9282'
+- '9284'
+- '9288'
+- '9296'
+- '9312'
+- '9344'
+- '9345'
+- '9346'
+- '9348'
+- '9352'
+- '9360'
+- '9376'
+- '9408'
+- '9472'
+- '9473'
+- '9474'
+- '9476'
+- '9480'
+- '9488'
+- '9504'
+- '9536'
+- '9600'
+- '9728'
+- '9729'
+- '9730'
+- '9732'
+- '9736'
+- '9744'
+- '9760'
+- '9792'
+- '9856'
+- '9984'
+- '10240'
+- '10241'
+- '10242'
+- '10243'
+- '10244'
+- '10245'
+- '10246'
+- '10248'
+- '10249'
+- '10250'
+- '10252'
+- '10256'
+- '10257'
+- '10258'
+- '10260'
+- '10264'
+- '10272'
+- '10273'
+- '10274'
+- '10276'
+- '10280'
+- '10288'
+- '10304'
+- '10305'
+- '10306'
+- '10308'
+- '10312'
+- '10320'
+- '10336'
+- '10368'
+- '10369'
+- '10370'
+- '10372'
+- '10376'
+- '10384'
+- '10400'
+- '10432'
+- '10496'
+- '10497'
+- '10498'
+- '10500'
+- '10504'
+- '10512'
+- '10528'
+- '10560'
+- '10624'
+- '10752'
+- '10753'
+- '10754'
+- '10756'
+- '10760'
+- '10768'
+- '10784'
+- '10816'
+- '10880'
+- '11008'
+- '11264'
+- '11265'
+- '11266'
+- '11268'
+- '11272'
+- '11280'
+- '11296'
+- '11328'
+- '11392'
+- '11520'
+- '11776'
+- '12288'
+- '12289'
+- '12290'
+- '12291'
+- '12292'
+- '12293'
+- '12294'
+- '12296'
+- '12297'
+- '12298'
+- '12300'
+- '12304'
+- '12305'
+- '12306'
+- '12308'
+- '12312'
+- '12320'
+- '12321'
+- '12322'
+- '12324'
+- '12328'
+- '12336'
+- '12352'
+- '12353'
+- '12354'
+- '12356'
+- '12360'
+- '12368'
+- '12384'
+- '12416'
+- '12417'
+- '12418'
+- '12420'
+- '12424'
+- '12432'
+- '12448'
+- '12480'
+- '12544'
+- '12545'
+- '12546'
+- '12548'
+- '12552'
+- '12560'
+- '12576'
+- '12608'
+- '12672'
+- '12800'
+- '12801'
+- '12802'
+- '12804'
+- '12808'
+- '12816'
+- '12832'
+- '12864'
+- '12928'
+- '13056'
+- '13312'
+- '13313'
+- '13314'
+- '13316'
+- '13320'
+- '13328'
+- '13344'
+- '13376'
+- '13440'
+- '13568'
+- '13824'
+- '14336'
+- '14337'
+- '14338'
+- '14340'
+- '14344'
+- '14352'
+- '14368'
+- '14400'
+- '14464'
+- '14592'
+- '14848'
+- '15360'
+- '16384'
+- '16385'
+- '16386'
+- '16387'
+- '16388'
+- '16389'
+- '16390'
+- '16391'
+- '16392'
+- '16393'
+- '16394'
+- '16395'
+- '16396'
+- '16397'
+- '16398'
+- '16400'
+- '16401'
+- '16402'
+- '16403'
+- '16404'
+- '16405'
+- '16406'
+- '16408'
+- '16409'
+- '16410'
+- '16412'
+- '16416'
+- '16417'
+- '16418'
+- '16419'
+- '16420'
+- '16421'
+- '16422'
+- '16424'
+- '16425'
+- '16426'
+- '16428'
+- '16432'
+- '16433'
+- '16434'
+- '16436'
+- '16440'
+- '16448'
+- '16449'
+- '16450'
+- '16451'
+- '16452'
+- '16453'
+- '16454'
+- '16456'
+- '16457'
+- '16458'
+- '16460'
+- '16464'
+- '16465'
+- '16466'
+- '16468'
+- '16472'
+- '16480'
+- '16481'
+- '16482'
+- '16484'
+- '16488'
+- '16496'
+- '16512'
+- '16513'
+- '16514'
+- '16515'
+- '16516'
+- '16517'
+- '16518'
+- '16520'
+- '16521'
+- '16522'
+- '16524'
+- '16528'
+- '16529'
+- '16530'
+- '16532'
+- '16536'
+- '16544'
+- '16545'
+- '16546'
+- '16548'
+- '16552'
+- '16560'
+- '16576'
+- '16577'
+- '16578'
+- '16580'
+- '16584'
+- '16592'
+- '16608'
+- '16640'
+- '16641'
+- '16642'
+- '16643'
+- '16644'
+- '16645'
+- '16646'
+- '16648'
+- '16649'
+- '16650'
+- '16652'
+- '16656'
+- '16657'
+- '16658'
+- '16660'
+- '16664'
+- '16672'
+- '16673'
+- '16674'
+- '16676'
+- '16680'
+- '16688'
+- '16704'
+- '16705'
+- '16706'
+- '16708'
+- '16712'
+- '16720'
+- '16736'
+- '16768'
+- '16769'
+- '16770'
+- '16772'
+- '16776'
+- '16784'
+- '16800'
+- '16832'
+- '16896'
+- '16897'
+- '16898'
+- '16899'
+- '16900'
+- '16901'
+- '16902'
+- '16904'
+- '16905'
+- '16906'
+- '16908'
+- '16912'
+- '16913'
+- '16914'
+- '16916'
+- '16920'
+- '16928'
+- '16929'
+- '16930'
+- '16932'
+- '16936'
+- '16944'
+- '16960'
+- '16961'
+- '16962'
+- '16964'
+- '16968'
+- '16976'
+- '16992'
+- '17024'
+- '17025'
+- '17026'
+- '17028'
+- '17032'
+- '17040'
+- '17056'
+- '17088'
+- '17152'
+- '17153'
+- '17154'
+- '17156'
+- '17160'
+- '17168'
+- '17184'
+- '17216'
+- '17280'
+- '17408'
+- '17409'
+- '17410'
+- '17411'
+- '17412'
+- '17413'
+- '17414'
+- '17416'
+- '17417'
+- '17418'
+- '17420'
+- '17424'
+- '17425'
+- '17426'
+- '17428'
+- '17432'
+- '17440'
+- '17441'
+- '17442'
+- '17444'
+- '17448'
+- '17456'
+- '17472'
+- '17473'
+- '17474'
+- '17476'
+- '17480'
+- '17488'
+- '17504'
+- '17536'
+- '17537'
+- '17538'
+- '17540'
+- '17544'
+- '17552'
+- '17568'
+- '17600'
+- '17664'
+- '17665'
+- '17666'
+- '17668'
+- '17672'
+- '17680'
+- '17696'
+- '17728'
+- '17792'
+- '17920'
+- '17921'
+- '17922'
+- '17924'
+- '17928'
+- '17936'
+- '17952'
+- '17984'
+- '18048'
+- '18176'
+- '18432'
+- '18433'
+- '18434'
+- '18435'
+- '18436'
+- '18437'
+- '18438'
+- '18440'
+- '18441'
+- '18442'
+- '18444'
+- '18448'
+- '18449'
+- '18450'
+- '18452'
+- '18456'
+- '18464'
+- '18465'
+- '18466'
+- '18468'
+- '18472'
+- '18480'
+- '18496'
+- '18497'
+- '18498'
+- '18500'
+- '18504'
+- '18512'
+- '18528'
+- '18560'
+- '18561'
+- '18562'
+- '18564'
+- '18568'
+- '18576'
+- '18592'
+- '18624'
+- '18688'
+- '18689'
+- '18690'
+- '18692'
+- '18696'
+- '18704'
+- '18720'
+- '18752'
+- '18816'
+- '18944'
+- '18945'
+- '18946'
+- '18948'
+- '18952'
+- '18960'
+- '18976'
+- '19008'
+- '19072'
+- '19200'
+- '19456'
+- '19457'
+- '19458'
+- '19460'
+- '19464'
+- '19472'
+- '19488'
+- '19520'
+- '19584'
+- '19712'
+- '19968'
+- '20480'
+- '20481'
+- '20482'
+- '20483'
+- '20484'
+- '20485'
+- '20486'
+- '20488'
+- '20489'
+- '20490'
+- '20492'
+- '20496'
+- '20497'
+- '20498'
+- '20500'
+- '20504'
+- '20512'
+- '20513'
+- '20514'
+- '20516'
+- '20520'
+- '20528'
+- '20544'
+- '20545'
+- '20546'
+- '20548'
+- '20552'
+- '20560'
+- '20576'
+- '20608'
+- '20609'
+- '20610'
+- '20612'
+- '20616'
+- '20624'
+- '20640'
+- '20672'
+- '20736'
+- '20737'
+- '20738'
+- '20740'
+- '20744'
+- '20752'
+- '20768'
+- '20800'
+- '20864'
+- '20992'
+- '20993'
+- '20994'
+- '20996'
+- '21000'
+- '21008'
+- '21024'
+- '21056'
+- '21120'
+- '21248'
+- '21504'
+- '21505'
+- '21506'
+- '21508'
+- '21512'
+- '21520'
+- '21536'
+- '21568'
+- '21632'
+- '21760'
+- '22016'
+- '22528'
+- '22529'
+- '22530'
+- '22532'
+- '22536'
+- '22544'
+- '22560'
+- '22592'
+- '22656'
+- '22784'
+- '23040'
+- '23552'
+- '24576'
+- '24577'
+- '24578'
+- '24579'
+- '24580'
+- '24581'
+- '24582'
+- '24584'
+- '24585'
+- '24586'
+- '24588'
+- '24592'
+- '24593'
+- '24594'
+- '24596'
+- '24600'
+- '24608'
+- '24609'
+- '24610'
+- '24612'
+- '24616'
+- '24624'
+- '24640'
+- '24641'
+- '24642'
+- '24644'
+- '24648'
+- '24656'
+- '24672'
+- '24704'
+- '24705'
+- '24706'
+- '24708'
+- '24712'
+- '24720'
+- '24736'
+- '24768'
+- '24832'
+- '24833'
+- '24834'
+- '24836'
+- '24840'
+- '24848'
+- '24864'
+- '24896'
+- '24960'
+- '25088'
+- '25089'
+- '25090'
+- '25092'
+- '25096'
+- '25104'
+- '25120'
+- '25152'
+- '25216'
+- '25344'
+- '25600'
+- '25601'
+- '25602'
+- '25604'
+- '25608'
+- '25616'
+- '25632'
+- '25664'
+- '25728'
+- '25856'
+- '26112'
+- '26624'
+- '26625'
+- '26626'
+- '26628'
+- '26632'
+- '26640'
+- '26656'
+- '26688'
+- '26752'
+- '26880'
+- '27136'
+- '27648'
+- '28672'
+- '28673'
+- '28674'
+- '28676'
+- '28680'
+- '28688'
+- '28704'
+- '28736'
+- '28800'
+- '28928'
+- '29184'
+- '29696'
+- '30720'
+- '32768'
+- '32769'
+- '32770'
+- '32771'
+- '32772'
+- '32773'
+- '32774'
+- '32775'
+- '32776'
+- '32777'
+- '32778'
+- '32779'
+- '32780'
+- '32781'
+- '32782'
+- '32784'
+- '32785'
+- '32786'
+- '32787'
+- '32788'
+- '32789'
+- '32790'
+- '32792'
+- '32793'
+- '32794'
+- '32796'
+- '32800'
+- '32801'
+- '32802'
+- '32803'
+- '32804'
+- '32805'
+- '32806'
+- '32808'
+- '32809'
+- '32810'
+- '32812'
+- '32816'
+- '32817'
+- '32818'
+- '32820'
+- '32824'
+- '32832'
+- '32833'
+- '32834'
+- '32835'
+- '32836'
+- '32837'
+- '32838'
+- '32840'
+- '32841'
+- '32842'
+- '32844'
+- '32848'
+- '32849'
+- '32850'
+- '32852'
+- '32856'
+- '32864'
+- '32865'
+- '32866'
+- '32868'
+- '32872'
+- '32880'
+- '32896'
+- '32897'
+- '32898'
+- '32899'
+- '32900'
+- '32901'
+- '32902'
+- '32904'
+- '32905'
+- '32906'
+- '32908'
+- '32912'
+- '32913'
+- '32914'
+- '32916'
+- '32920'
+- '32928'
+- '32929'
+- '32930'
+- '32932'
+- '32936'
+- '32944'
+- '32960'
+- '32961'
+- '32962'
+- '32964'
+- '32968'
+- '32976'
+- '32992'
+- '33024'
+- '33025'
+- '33026'
+- '33027'
+- '33028'
+- '33029'
+- '33030'
+- '33032'
+- '33033'
+- '33034'
+- '33036'
+- '33040'
+- '33041'
+- '33042'
+- '33044'
+- '33048'
+- '33056'
+- '33057'
+- '33058'
+- '33060'
+- '33064'
+- '33072'
+- '33088'
+- '33089'
+- '33090'
+- '33092'
+- '33096'
+- '33104'
+- '33120'
+- '33152'
+- '33153'
+- '33154'
+- '33156'
+- '33160'
+- '33168'
+- '33184'
+- '33216'
+- '33280'
+- '33281'
+- '33282'
+- '33283'
+- '33284'
+- '33285'
+- '33286'
+- '33288'
+- '33289'
+- '33290'
+- '33292'
+- '33296'
+- '33297'
+- '33298'
+- '33300'
+- '33304'
+- '33312'
+- '33313'
+- '33314'
+- '33316'
+- '33320'
+- '33328'
+- '33344'
+- '33345'
+- '33346'
+- '33348'
+- '33352'
+- '33360'
+- '33376'
+- '33408'
+- '33409'
+- '33410'
+- '33412'
+- '33416'
+- '33424'
+- '33440'
+- '33472'
+- '33536'
+- '33537'
+- '33538'
+- '33540'
+- '33544'
+- '33552'
+- '33568'
+- '33600'
+- '33664'
+- '33792'
+- '33793'
+- '33794'
+- '33795'
+- '33796'
+- '33797'
+- '33798'
+- '33800'
+- '33801'
+- '33802'
+- '33804'
+- '33808'
+- '33809'
+- '33810'
+- '33812'
+- '33816'
+- '33824'
+- '33825'
+- '33826'
+- '33828'
+- '33832'
+- '33840'
+- '33856'
+- '33857'
+- '33858'
+- '33860'
+- '33864'
+- '33872'
+- '33888'
+- '33920'
+- '33921'
+- '33922'
+- '33924'
+- '33928'
+- '33936'
+- '33952'
+- '33984'
+- '34048'
+- '34049'
+- '34050'
+- '34052'
+- '34056'
+- '34064'
+- '34080'
+- '34112'
+- '34176'
+- '34304'
+- '34305'
+- '34306'
+- '34308'
+- '34312'
+- '34320'
+- '34336'
+- '34368'
+- '34432'
+- '34560'
+- '34816'
+- '34817'
+- '34818'
+- '34819'
+- '34820'
+- '34821'
+- '34822'
+- '34824'
+- '34825'
+- '34826'
+- '34828'
+- '34832'
+- '34833'
+- '34834'
+- '34836'
+- '34840'
+- '34848'
+- '34849'
+- '34850'
+- '34852'
+- '34856'
+- '34864'
+- '34880'
+- '34881'
+- '34882'
+- '34884'
+- '34888'
+- '34896'
+- '34912'
+- '34944'
+- '34945'
+- '34946'
+- '34948'
+- '34952'
+- '34960'
+- '34976'
+- '35008'
+- '35072'
+- '35073'
+- '35074'
+- '35076'
+- '35080'
+- '35088'
+- '35104'
+- '35136'
+- '35200'
+- '35328'
+- '35329'
+- '35330'
+- '35332'
+- '35336'
+- '35344'
+- '35360'
+- '35392'
+- '35456'
+- '35584'
+- '35840'
+- '35841'
+- '35842'
+- '35844'
+- '35848'
+- '35856'
+- '35872'
+- '35904'
+- '35968'
+- '36096'
+- '36352'
+- '36864'
+- '36865'
+- '36866'
+- '36867'
+- '36868'
+- '36869'
+- '36870'
+- '36872'
+- '36873'
+- '36874'
+- '36876'
+- '36880'
+- '36881'
+- '36882'
+- '36884'
+- '36888'
+- '36896'
+- '36897'
+- '36898'
+- '36900'
+- '36904'
+- '36912'
+- '36928'
+- '36929'
+- '36930'
+- '36932'
+- '36936'
+- '36944'
+- '36960'
+- '36992'
+- '36993'
+- '36994'
+- '36996'
+- '37000'
+- '37008'
+- '37024'
+- '37056'
+- '37120'
+- '37121'
+- '37122'
+- '37124'
+- '37128'
+- '37136'
+- '37152'
+- '37184'
+- '37248'
+- '37376'
+- '37377'
+- '37378'
+- '37380'
+- '37384'
+- '37392'
+- '37408'
+- '37440'
+- '37504'
+- '37632'
+- '37888'
+- '37889'
+- '37890'
+- '37892'
+- '37896'
+- '37904'
+- '37920'
+- '37952'
+- '38016'
+- '38144'
+- '38400'
+- '38912'
+- '38913'
+- '38914'
+- '38916'
+- '38920'
+- '38928'
+- '38944'
+- '38976'
+- '39040'
+- '39168'
+- '39424'
+- '39936'
+- '40960'
+- '40961'
+- '40962'
+- '40963'
+- '40964'
+- '40965'
+- '40966'
+- '40968'
+- '40969'
+- '40970'
+- '40972'
+- '40976'
+- '40977'
+- '40978'
+- '40980'
+- '40984'
+- '40992'
+- '40993'
+- '40994'
+- '40996'
+- '41000'
+- '41008'
+- '41024'
+- '41025'
+- '41026'
+- '41028'
+- '41032'
+- '41040'
+- '41056'
+- '41088'
+- '41089'
+- '41090'
+- '41092'
+- '41096'
+- '41104'
+- '41120'
+- '41152'
+- '41216'
+- '41217'
+- '41218'
+- '41220'
+- '41224'
+- '41232'
+- '41248'
+- '41280'
+- '41344'
+- '41472'
+- '41473'
+- '41474'
+- '41476'
+- '41480'
+- '41488'
+- '41504'
+- '41536'
+- '41600'
+- '41728'
+- '41984'
+- '41985'
+- '41986'
+- '41988'
+- '41992'
+- '42000'
+- '42016'
+- '42048'
+- '42112'
+- '42240'
+- '42496'
+- '43008'
+- '43009'
+- '43010'
+- '43012'
+- '43016'
+- '43024'
+- '43040'
+- '43072'
+- '43136'
+- '43264'
+- '43520'
+- '44032'
+- '45056'
+- '45057'
+- '45058'
+- '45060'
+- '45064'
+- '45072'
+- '45088'
+- '45120'
+- '45184'
+- '45312'
+- '45568'
+- '46080'
+- '47104'
+- '49152'
+- '49153'
+- '49154'
+- '49155'
+- '49156'
+- '49157'
+- '49158'
+- '49160'
+- '49161'
+- '49162'
+- '49164'
+- '49168'
+- '49169'
+- '49170'
+- '49172'
+- '49176'
+- '49184'
+- '49185'
+- '49186'
+- '49188'
+- '49192'
+- '49200'
+- '49216'
+- '49217'
+- '49218'
+- '49220'
+- '49224'
+- '49232'
+- '49248'
+- '49280'
+- '49281'
+- '49282'
+- '49284'
+- '49288'
+- '49296'
+- '49312'
+- '49344'
+- '49408'
+- '49409'
+- '49410'
+- '49412'
+- '49416'
+- '49424'
+- '49440'
+- '49472'
+- '49536'
+- '49664'
+- '49665'
+- '49666'
+- '49668'
+- '49672'
+- '49680'
+- '49696'
+- '49728'
+- '49792'
+- '49920'
+- '50176'
+- '50177'
+- '50178'
+- '50180'
+- '50184'
+- '50192'
+- '50208'
+- '50240'
+- '50304'
+- '50432'
+- '50688'
+- '51200'
+- '51201'
+- '51202'
+- '51204'
+- '51208'
+- '51216'
+- '51232'
+- '51264'
+- '51328'
+- '51456'
+- '51712'
+- '52224'
+- '53248'
+- '53249'
+- '53250'
+- '53252'
+- '53256'
+- '53264'
+- '53280'
+- '53312'
+- '53376'
+- '53504'
+- '53760'
+- '54272'
+- '55296'
+- '57344'
+- '57345'
+- '57346'
+- '57348'
+- '57352'
+- '57360'
+- '57376'
+- '57408'
+- '57472'
+- '57600'
+- '57856'
+- '58368'
+- '59392'
+- '61440'
+init: null
+input_size: null
+cmvn_file: null
+ctc_conf:
+ dropout_rate: 0.0
+ ctc_type: builtin
+ reduce: true
+ ignore_nan_grad: true
+joint_net_conf: null
+use_preprocessor: true
+token_type: char
+bpemodel: null
+non_linguistic_symbols: null
+cleaner: null
+g2p: null
+speech_volume_normalize: null
+rir_scp: null
+rir_apply_prob: 1.0
+noise_scp: null
+noise_apply_prob: 1.0
+noise_db_range: '13_15'
+specaug: null
+specaug_conf: {}
+normalize: null
+normalize_conf: {}
+label_aggregator: null
+label_aggregator_conf: {}
+model: sond
+model_conf:
+ lsm_weight: 0.1
+ length_normalized_loss: true
+ max_spk_num: 16
+ normalize_speech_speaker: true
+# speech encoder
+encoder: resnet34_sp_l2reg
+encoder_conf:
+ # input_size is passed in by the model and equals the feature dimension
+ # input_size: 80
+ pooling_type: "window_shift"
+ batchnorm_momentum: 0.01
+ pool_size: 20
+ stride: 1
+ tf2torch_tensor_name_prefix_torch: encoder
+ tf2torch_tensor_name_prefix_tf: EAND/speech_encoder
+speaker_encoder: null
+speaker_encoder_conf: {}
+ci_scorer: conv
+ci_scorer_conf:
+ input_units: 512
+ num_layers: 3
+ num_units: 512
+ kernel_size: 1
+ dropout_rate: 0.0
+ position_encoder: null
+ out_units: 1
+ out_norm: false
+ auxiliary_states: false
+ tf2torch_tensor_name_prefix_torch: ci_scorer
+ tf2torch_tensor_name_prefix_tf: EAND/compute_distance_layer/ci_scorer
+cd_scorer: san
+cd_scorer_conf:
+ input_size: 512
+ output_size: 512
+ out_units: 1
+ attention_heads: 4
+ linear_units: 1024
+ num_blocks: 4
+ dropout_rate: 0.0
+ positional_dropout_rate: 0.0
+ attention_dropout_rate: 0.0
+ # use the quoted string "null" (not a bare YAML null) to remove the input layer
+ input_layer: "null"
+ pos_enc_class: null
+ normalize_before: true
+ tf2torch_tensor_name_prefix_torch: cd_scorer
+ tf2torch_tensor_name_prefix_tf: EAND/compute_distance_layer/cd_scorer
+# post net
+decoder: fsmn
+decoder_conf:
+ in_units: 32
+ out_units: 2517
+ filter_size: 31
+ fsmn_num_layers: 6
+ dnn_num_layers: 1
+ num_memory_units: 16
+ ffn_inner_dim: 512
+ dropout_rate: 0.0
+ tf2torch_tensor_name_prefix_torch: decoder
+ tf2torch_tensor_name_prefix_tf: EAND/post_net
+frontend: wav_frontend
+frontend_conf:
+ fs: 8000
+ window: povey
+ n_mels: 80
+ frame_length: 25
+ frame_shift: 10
+ filter_length_min: -1
+ filter_length_max: -1
+ lfr_m: 1
+ lfr_n: 1
+ dither: 0.0
+ snip_edges: false
+ upsacle_samples: false
+num_worker_count: 1
+required:
+- output_dir
+- token_list
+oss_bucket: 'null'
+version: 0.1.4
diff --git a/egs/callhome/sond/sond_fbank.yaml b/egs/callhome/sond/sond_fbank.yaml
new file mode 100644
index 0000000..fc76259
--- /dev/null
+++ b/egs/callhome/sond/sond_fbank.yaml
@@ -0,0 +1,2739 @@
+config: finetune.yaml
+print_config: false
+log_level: INFO
+dry_run: false
+iterator_type: sequence
+output_dir: exp/sond
+ngpu: 1
+seed: 0
+num_workers: 16
+num_att_plot: 0
+dist_backend: nccl
+dist_init_method: env://
+dist_world_size: null
+dist_rank: null
+local_rank: 0
+dist_master_addr: null
+dist_master_port: null
+dist_launcher: null
+multiprocessing_distributed: true
+distributed: false
+unused_parameters: true
+sharded_ddp: false
+ddp_backend: pytorch_ddp
+cudnn_enabled: true
+cudnn_benchmark: false
+cudnn_deterministic: true
+collect_stats: false
+write_collected_feats: false
+max_epoch: 50
+patience: null
+val_scheduler_criterion:
+- valid
+- acc
+early_stopping_criterion:
+- valid
+- loss
+- min
+best_model_criterion:
+- - valid
+ - acc
+ - max
+keep_nbest_models: 10
+nbest_averaging_interval: 0
+grad_clip: 5
+grad_clip_type: 2.0
+grad_noise: false
+accum_grad: 1
+no_forward_run: false
+resume: true
+train_dtype: float32
+use_amp: false
+log_interval: 50
+use_matplotlib: false
+use_tensorboard: true
+use_wandb: false
+wandb_project: null
+wandb_id: null
+wandb_entity: null
+wandb_name: null
+wandb_model_log_interval: -1
+use_pai: true
+detect_anomaly: false
+pretrain_path: null
+init_param: []
+ignore_init_mismatch: false
+freeze_param: []
+num_iters_per_epoch: null
+batch_size: 20
+valid_batch_size: null
+batch_bins: 10000
+valid_batch_bins: null
+train_shape_file:
+- /data/volume1/youyan/aishell/ark/train/speech_shape.1
+- /data/volume1/youyan/aishell/ark/train/text_shape.1
+valid_shape_file:
+- /data/volume1/youyan/aishell/ark/dev/speech_shape.1
+- /data/volume1/youyan/aishell/ark/dev/text_shape.1
+batch_type: length
+valid_batch_type: null
+fold_length:
+- 512
+- 150
+sort_in_batch: descending
+sort_batch: descending
+multiple_iterator: false
+chunk_length: 500
+chunk_shift_ratio: 0.5
+num_cache_chunks: 1024
+train_data_path_and_name_and_type:
+- - /data/volume1/youyan/aishell/ark/train/data.scp
+ - speech
+ - kaldi_ark
+- - /data/volume1/youyan/aishell/ark/train/data.text.1
+ - text
+ - text
+valid_data_path_and_name_and_type:
+- - /data/volume1/youyan/aishell/ark/dev/data.scp
+ - speech
+ - kaldi_ark
+- - /data/volume1/youyan/aishell/ark/dev/data.text.1
+ - text
+ - text
+allow_variable_data_keys: false
+max_cache_size: 0.0
+max_cache_fd: 32
+valid_max_cache_size: null
+optim: adam
+optim_conf:
+ lr: 0.0005
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 30000
+token_list:
+- '0'
+- '1'
+- '2'
+- '3'
+- '4'
+- '5'
+- '6'
+- '7'
+- '8'
+- '9'
+- '10'
+- '11'
+- '12'
+- '13'
+- '14'
+- '15'
+- '16'
+- '17'
+- '18'
+- '19'
+- '20'
+- '21'
+- '22'
+- '23'
+- '24'
+- '25'
+- '26'
+- '27'
+- '28'
+- '29'
+- '30'
+- '32'
+- '33'
+- '34'
+- '35'
+- '36'
+- '37'
+- '38'
+- '39'
+- '40'
+- '41'
+- '42'
+- '43'
+- '44'
+- '45'
+- '46'
+- '48'
+- '49'
+- '50'
+- '51'
+- '52'
+- '53'
+- '54'
+- '56'
+- '57'
+- '58'
+- '60'
+- '64'
+- '65'
+- '66'
+- '67'
+- '68'
+- '69'
+- '70'
+- '71'
+- '72'
+- '73'
+- '74'
+- '75'
+- '76'
+- '77'
+- '78'
+- '80'
+- '81'
+- '82'
+- '83'
+- '84'
+- '85'
+- '86'
+- '88'
+- '89'
+- '90'
+- '92'
+- '96'
+- '97'
+- '98'
+- '99'
+- '100'
+- '101'
+- '102'
+- '104'
+- '105'
+- '106'
+- '108'
+- '112'
+- '113'
+- '114'
+- '116'
+- '120'
+- '128'
+- '129'
+- '130'
+- '131'
+- '132'
+- '133'
+- '134'
+- '135'
+- '136'
+- '137'
+- '138'
+- '139'
+- '140'
+- '141'
+- '142'
+- '144'
+- '145'
+- '146'
+- '147'
+- '148'
+- '149'
+- '150'
+- '152'
+- '153'
+- '154'
+- '156'
+- '160'
+- '161'
+- '162'
+- '163'
+- '164'
+- '165'
+- '166'
+- '168'
+- '169'
+- '170'
+- '172'
+- '176'
+- '177'
+- '178'
+- '180'
+- '184'
+- '192'
+- '193'
+- '194'
+- '195'
+- '196'
+- '197'
+- '198'
+- '200'
+- '201'
+- '202'
+- '204'
+- '208'
+- '209'
+- '210'
+- '212'
+- '216'
+- '224'
+- '225'
+- '226'
+- '228'
+- '232'
+- '240'
+- '256'
+- '257'
+- '258'
+- '259'
+- '260'
+- '261'
+- '262'
+- '263'
+- '264'
+- '265'
+- '266'
+- '267'
+- '268'
+- '269'
+- '270'
+- '272'
+- '273'
+- '274'
+- '275'
+- '276'
+- '277'
+- '278'
+- '280'
+- '281'
+- '282'
+- '284'
+- '288'
+- '289'
+- '290'
+- '291'
+- '292'
+- '293'
+- '294'
+- '296'
+- '297'
+- '298'
+- '300'
+- '304'
+- '305'
+- '306'
+- '308'
+- '312'
+- '320'
+- '321'
+- '322'
+- '323'
+- '324'
+- '325'
+- '326'
+- '328'
+- '329'
+- '330'
+- '332'
+- '336'
+- '337'
+- '338'
+- '340'
+- '344'
+- '352'
+- '353'
+- '354'
+- '356'
+- '360'
+- '368'
+- '384'
+- '385'
+- '386'
+- '387'
+- '388'
+- '389'
+- '390'
+- '392'
+- '393'
+- '394'
+- '396'
+- '400'
+- '401'
+- '402'
+- '404'
+- '408'
+- '416'
+- '417'
+- '418'
+- '420'
+- '424'
+- '432'
+- '448'
+- '449'
+- '450'
+- '452'
+- '456'
+- '464'
+- '480'
+- '512'
+- '513'
+- '514'
+- '515'
+- '516'
+- '517'
+- '518'
+- '519'
+- '520'
+- '521'
+- '522'
+- '523'
+- '524'
+- '525'
+- '526'
+- '528'
+- '529'
+- '530'
+- '531'
+- '532'
+- '533'
+- '534'
+- '536'
+- '537'
+- '538'
+- '540'
+- '544'
+- '545'
+- '546'
+- '547'
+- '548'
+- '549'
+- '550'
+- '552'
+- '553'
+- '554'
+- '556'
+- '560'
+- '561'
+- '562'
+- '564'
+- '568'
+- '576'
+- '577'
+- '578'
+- '579'
+- '580'
+- '581'
+- '582'
+- '584'
+- '585'
+- '586'
+- '588'
+- '592'
+- '593'
+- '594'
+- '596'
+- '600'
+- '608'
+- '609'
+- '610'
+- '612'
+- '616'
+- '624'
+- '640'
+- '641'
+- '642'
+- '643'
+- '644'
+- '645'
+- '646'
+- '648'
+- '649'
+- '650'
+- '652'
+- '656'
+- '657'
+- '658'
+- '660'
+- '664'
+- '672'
+- '673'
+- '674'
+- '676'
+- '680'
+- '688'
+- '704'
+- '705'
+- '706'
+- '708'
+- '712'
+- '720'
+- '736'
+- '768'
+- '769'
+- '770'
+- '771'
+- '772'
+- '773'
+- '774'
+- '776'
+- '777'
+- '778'
+- '780'
+- '784'
+- '785'
+- '786'
+- '788'
+- '792'
+- '800'
+- '801'
+- '802'
+- '804'
+- '808'
+- '816'
+- '832'
+- '833'
+- '834'
+- '836'
+- '840'
+- '848'
+- '864'
+- '896'
+- '897'
+- '898'
+- '900'
+- '904'
+- '912'
+- '928'
+- '960'
+- '1024'
+- '1025'
+- '1026'
+- '1027'
+- '1028'
+- '1029'
+- '1030'
+- '1031'
+- '1032'
+- '1033'
+- '1034'
+- '1035'
+- '1036'
+- '1037'
+- '1038'
+- '1040'
+- '1041'
+- '1042'
+- '1043'
+- '1044'
+- '1045'
+- '1046'
+- '1048'
+- '1049'
+- '1050'
+- '1052'
+- '1056'
+- '1057'
+- '1058'
+- '1059'
+- '1060'
+- '1061'
+- '1062'
+- '1064'
+- '1065'
+- '1066'
+- '1068'
+- '1072'
+- '1073'
+- '1074'
+- '1076'
+- '1080'
+- '1088'
+- '1089'
+- '1090'
+- '1091'
+- '1092'
+- '1093'
+- '1094'
+- '1096'
+- '1097'
+- '1098'
+- '1100'
+- '1104'
+- '1105'
+- '1106'
+- '1108'
+- '1112'
+- '1120'
+- '1121'
+- '1122'
+- '1124'
+- '1128'
+- '1136'
+- '1152'
+- '1153'
+- '1154'
+- '1155'
+- '1156'
+- '1157'
+- '1158'
+- '1160'
+- '1161'
+- '1162'
+- '1164'
+- '1168'
+- '1169'
+- '1170'
+- '1172'
+- '1176'
+- '1184'
+- '1185'
+- '1186'
+- '1188'
+- '1192'
+- '1200'
+- '1216'
+- '1217'
+- '1218'
+- '1220'
+- '1224'
+- '1232'
+- '1248'
+- '1280'
+- '1281'
+- '1282'
+- '1283'
+- '1284'
+- '1285'
+- '1286'
+- '1288'
+- '1289'
+- '1290'
+- '1292'
+- '1296'
+- '1297'
+- '1298'
+- '1300'
+- '1304'
+- '1312'
+- '1313'
+- '1314'
+- '1316'
+- '1320'
+- '1328'
+- '1344'
+- '1345'
+- '1346'
+- '1348'
+- '1352'
+- '1360'
+- '1376'
+- '1408'
+- '1409'
+- '1410'
+- '1412'
+- '1416'
+- '1424'
+- '1440'
+- '1472'
+- '1536'
+- '1537'
+- '1538'
+- '1539'
+- '1540'
+- '1541'
+- '1542'
+- '1544'
+- '1545'
+- '1546'
+- '1548'
+- '1552'
+- '1553'
+- '1554'
+- '1556'
+- '1560'
+- '1568'
+- '1569'
+- '1570'
+- '1572'
+- '1576'
+- '1584'
+- '1600'
+- '1601'
+- '1602'
+- '1604'
+- '1608'
+- '1616'
+- '1632'
+- '1664'
+- '1665'
+- '1666'
+- '1668'
+- '1672'
+- '1680'
+- '1696'
+- '1728'
+- '1792'
+- '1793'
+- '1794'
+- '1796'
+- '1800'
+- '1808'
+- '1824'
+- '1856'
+- '1920'
+- '2048'
+- '2049'
+- '2050'
+- '2051'
+- '2052'
+- '2053'
+- '2054'
+- '2055'
+- '2056'
+- '2057'
+- '2058'
+- '2059'
+- '2060'
+- '2061'
+- '2062'
+- '2064'
+- '2065'
+- '2066'
+- '2067'
+- '2068'
+- '2069'
+- '2070'
+- '2072'
+- '2073'
+- '2074'
+- '2076'
+- '2080'
+- '2081'
+- '2082'
+- '2083'
+- '2084'
+- '2085'
+- '2086'
+- '2088'
+- '2089'
+- '2090'
+- '2092'
+- '2096'
+- '2097'
+- '2098'
+- '2100'
+- '2104'
+- '2112'
+- '2113'
+- '2114'
+- '2115'
+- '2116'
+- '2117'
+- '2118'
+- '2120'
+- '2121'
+- '2122'
+- '2124'
+- '2128'
+- '2129'
+- '2130'
+- '2132'
+- '2136'
+- '2144'
+- '2145'
+- '2146'
+- '2148'
+- '2152'
+- '2160'
+- '2176'
+- '2177'
+- '2178'
+- '2179'
+- '2180'
+- '2181'
+- '2182'
+- '2184'
+- '2185'
+- '2186'
+- '2188'
+- '2192'
+- '2193'
+- '2194'
+- '2196'
+- '2200'
+- '2208'
+- '2209'
+- '2210'
+- '2212'
+- '2216'
+- '2224'
+- '2240'
+- '2241'
+- '2242'
+- '2244'
+- '2248'
+- '2256'
+- '2272'
+- '2304'
+- '2305'
+- '2306'
+- '2307'
+- '2308'
+- '2309'
+- '2310'
+- '2312'
+- '2313'
+- '2314'
+- '2316'
+- '2320'
+- '2321'
+- '2322'
+- '2324'
+- '2328'
+- '2336'
+- '2337'
+- '2338'
+- '2340'
+- '2344'
+- '2352'
+- '2368'
+- '2369'
+- '2370'
+- '2372'
+- '2376'
+- '2384'
+- '2400'
+- '2432'
+- '2433'
+- '2434'
+- '2436'
+- '2440'
+- '2448'
+- '2464'
+- '2496'
+- '2560'
+- '2561'
+- '2562'
+- '2563'
+- '2564'
+- '2565'
+- '2566'
+- '2568'
+- '2569'
+- '2570'
+- '2572'
+- '2576'
+- '2577'
+- '2578'
+- '2580'
+- '2584'
+- '2592'
+- '2593'
+- '2594'
+- '2596'
+- '2600'
+- '2608'
+- '2624'
+- '2625'
+- '2626'
+- '2628'
+- '2632'
+- '2640'
+- '2656'
+- '2688'
+- '2689'
+- '2690'
+- '2692'
+- '2696'
+- '2704'
+- '2720'
+- '2752'
+- '2816'
+- '2817'
+- '2818'
+- '2820'
+- '2824'
+- '2832'
+- '2848'
+- '2880'
+- '2944'
+- '3072'
+- '3073'
+- '3074'
+- '3075'
+- '3076'
+- '3077'
+- '3078'
+- '3080'
+- '3081'
+- '3082'
+- '3084'
+- '3088'
+- '3089'
+- '3090'
+- '3092'
+- '3096'
+- '3104'
+- '3105'
+- '3106'
+- '3108'
+- '3112'
+- '3120'
+- '3136'
+- '3137'
+- '3138'
+- '3140'
+- '3144'
+- '3152'
+- '3168'
+- '3200'
+- '3201'
+- '3202'
+- '3204'
+- '3208'
+- '3216'
+- '3232'
+- '3264'
+- '3328'
+- '3329'
+- '3330'
+- '3332'
+- '3336'
+- '3344'
+- '3360'
+- '3392'
+- '3456'
+- '3584'
+- '3585'
+- '3586'
+- '3588'
+- '3592'
+- '3600'
+- '3616'
+- '3648'
+- '3712'
+- '3840'
+- '4096'
+- '4097'
+- '4098'
+- '4099'
+- '4100'
+- '4101'
+- '4102'
+- '4103'
+- '4104'
+- '4105'
+- '4106'
+- '4107'
+- '4108'
+- '4109'
+- '4110'
+- '4112'
+- '4113'
+- '4114'
+- '4115'
+- '4116'
+- '4117'
+- '4118'
+- '4120'
+- '4121'
+- '4122'
+- '4124'
+- '4128'
+- '4129'
+- '4130'
+- '4131'
+- '4132'
+- '4133'
+- '4134'
+- '4136'
+- '4137'
+- '4138'
+- '4140'
+- '4144'
+- '4145'
+- '4146'
+- '4148'
+- '4152'
+- '4160'
+- '4161'
+- '4162'
+- '4163'
+- '4164'
+- '4165'
+- '4166'
+- '4168'
+- '4169'
+- '4170'
+- '4172'
+- '4176'
+- '4177'
+- '4178'
+- '4180'
+- '4184'
+- '4192'
+- '4193'
+- '4194'
+- '4196'
+- '4200'
+- '4208'
+- '4224'
+- '4225'
+- '4226'
+- '4227'
+- '4228'
+- '4229'
+- '4230'
+- '4232'
+- '4233'
+- '4234'
+- '4236'
+- '4240'
+- '4241'
+- '4242'
+- '4244'
+- '4248'
+- '4256'
+- '4257'
+- '4258'
+- '4260'
+- '4264'
+- '4272'
+- '4288'
+- '4289'
+- '4290'
+- '4292'
+- '4296'
+- '4304'
+- '4320'
+- '4352'
+- '4353'
+- '4354'
+- '4355'
+- '4356'
+- '4357'
+- '4358'
+- '4360'
+- '4361'
+- '4362'
+- '4364'
+- '4368'
+- '4369'
+- '4370'
+- '4372'
+- '4376'
+- '4384'
+- '4385'
+- '4386'
+- '4388'
+- '4392'
+- '4400'
+- '4416'
+- '4417'
+- '4418'
+- '4420'
+- '4424'
+- '4432'
+- '4448'
+- '4480'
+- '4481'
+- '4482'
+- '4484'
+- '4488'
+- '4496'
+- '4512'
+- '4544'
+- '4608'
+- '4609'
+- '4610'
+- '4611'
+- '4612'
+- '4613'
+- '4614'
+- '4616'
+- '4617'
+- '4618'
+- '4620'
+- '4624'
+- '4625'
+- '4626'
+- '4628'
+- '4632'
+- '4640'
+- '4641'
+- '4642'
+- '4644'
+- '4648'
+- '4656'
+- '4672'
+- '4673'
+- '4674'
+- '4676'
+- '4680'
+- '4688'
+- '4704'
+- '4736'
+- '4737'
+- '4738'
+- '4740'
+- '4744'
+- '4752'
+- '4768'
+- '4800'
+- '4864'
+- '4865'
+- '4866'
+- '4868'
+- '4872'
+- '4880'
+- '4896'
+- '4928'
+- '4992'
+- '5120'
+- '5121'
+- '5122'
+- '5123'
+- '5124'
+- '5125'
+- '5126'
+- '5128'
+- '5129'
+- '5130'
+- '5132'
+- '5136'
+- '5137'
+- '5138'
+- '5140'
+- '5144'
+- '5152'
+- '5153'
+- '5154'
+- '5156'
+- '5160'
+- '5168'
+- '5184'
+- '5185'
+- '5186'
+- '5188'
+- '5192'
+- '5200'
+- '5216'
+- '5248'
+- '5249'
+- '5250'
+- '5252'
+- '5256'
+- '5264'
+- '5280'
+- '5312'
+- '5376'
+- '5377'
+- '5378'
+- '5380'
+- '5384'
+- '5392'
+- '5408'
+- '5440'
+- '5504'
+- '5632'
+- '5633'
+- '5634'
+- '5636'
+- '5640'
+- '5648'
+- '5664'
+- '5696'
+- '5760'
+- '5888'
+- '6144'
+- '6145'
+- '6146'
+- '6147'
+- '6148'
+- '6149'
+- '6150'
+- '6152'
+- '6153'
+- '6154'
+- '6156'
+- '6160'
+- '6161'
+- '6162'
+- '6164'
+- '6168'
+- '6176'
+- '6177'
+- '6178'
+- '6180'
+- '6184'
+- '6192'
+- '6208'
+- '6209'
+- '6210'
+- '6212'
+- '6216'
+- '6224'
+- '6240'
+- '6272'
+- '6273'
+- '6274'
+- '6276'
+- '6280'
+- '6288'
+- '6304'
+- '6336'
+- '6400'
+- '6401'
+- '6402'
+- '6404'
+- '6408'
+- '6416'
+- '6432'
+- '6464'
+- '6528'
+- '6656'
+- '6657'
+- '6658'
+- '6660'
+- '6664'
+- '6672'
+- '6688'
+- '6720'
+- '6784'
+- '6912'
+- '7168'
+- '7169'
+- '7170'
+- '7172'
+- '7176'
+- '7184'
+- '7200'
+- '7232'
+- '7296'
+- '7424'
+- '7680'
+- '8192'
+- '8193'
+- '8194'
+- '8195'
+- '8196'
+- '8197'
+- '8198'
+- '8199'
+- '8200'
+- '8201'
+- '8202'
+- '8203'
+- '8204'
+- '8205'
+- '8206'
+- '8208'
+- '8209'
+- '8210'
+- '8211'
+- '8212'
+- '8213'
+- '8214'
+- '8216'
+- '8217'
+- '8218'
+- '8220'
+- '8224'
+- '8225'
+- '8226'
+- '8227'
+- '8228'
+- '8229'
+- '8230'
+- '8232'
+- '8233'
+- '8234'
+- '8236'
+- '8240'
+- '8241'
+- '8242'
+- '8244'
+- '8248'
+- '8256'
+- '8257'
+- '8258'
+- '8259'
+- '8260'
+- '8261'
+- '8262'
+- '8264'
+- '8265'
+- '8266'
+- '8268'
+- '8272'
+- '8273'
+- '8274'
+- '8276'
+- '8280'
+- '8288'
+- '8289'
+- '8290'
+- '8292'
+- '8296'
+- '8304'
+- '8320'
+- '8321'
+- '8322'
+- '8323'
+- '8324'
+- '8325'
+- '8326'
+- '8328'
+- '8329'
+- '8330'
+- '8332'
+- '8336'
+- '8337'
+- '8338'
+- '8340'
+- '8344'
+- '8352'
+- '8353'
+- '8354'
+- '8356'
+- '8360'
+- '8368'
+- '8384'
+- '8385'
+- '8386'
+- '8388'
+- '8392'
+- '8400'
+- '8416'
+- '8448'
+- '8449'
+- '8450'
+- '8451'
+- '8452'
+- '8453'
+- '8454'
+- '8456'
+- '8457'
+- '8458'
+- '8460'
+- '8464'
+- '8465'
+- '8466'
+- '8468'
+- '8472'
+- '8480'
+- '8481'
+- '8482'
+- '8484'
+- '8488'
+- '8496'
+- '8512'
+- '8513'
+- '8514'
+- '8516'
+- '8520'
+- '8528'
+- '8544'
+- '8576'
+- '8577'
+- '8578'
+- '8580'
+- '8584'
+- '8592'
+- '8608'
+- '8640'
+- '8704'
+- '8705'
+- '8706'
+- '8707'
+- '8708'
+- '8709'
+- '8710'
+- '8712'
+- '8713'
+- '8714'
+- '8716'
+- '8720'
+- '8721'
+- '8722'
+- '8724'
+- '8728'
+- '8736'
+- '8737'
+- '8738'
+- '8740'
+- '8744'
+- '8752'
+- '8768'
+- '8769'
+- '8770'
+- '8772'
+- '8776'
+- '8784'
+- '8800'
+- '8832'
+- '8833'
+- '8834'
+- '8836'
+- '8840'
+- '8848'
+- '8864'
+- '8896'
+- '8960'
+- '8961'
+- '8962'
+- '8964'
+- '8968'
+- '8976'
+- '8992'
+- '9024'
+- '9088'
+- '9216'
+- '9217'
+- '9218'
+- '9219'
+- '9220'
+- '9221'
+- '9222'
+- '9224'
+- '9225'
+- '9226'
+- '9228'
+- '9232'
+- '9233'
+- '9234'
+- '9236'
+- '9240'
+- '9248'
+- '9249'
+- '9250'
+- '9252'
+- '9256'
+- '9264'
+- '9280'
+- '9281'
+- '9282'
+- '9284'
+- '9288'
+- '9296'
+- '9312'
+- '9344'
+- '9345'
+- '9346'
+- '9348'
+- '9352'
+- '9360'
+- '9376'
+- '9408'
+- '9472'
+- '9473'
+- '9474'
+- '9476'
+- '9480'
+- '9488'
+- '9504'
+- '9536'
+- '9600'
+- '9728'
+- '9729'
+- '9730'
+- '9732'
+- '9736'
+- '9744'
+- '9760'
+- '9792'
+- '9856'
+- '9984'
+- '10240'
+- '10241'
+- '10242'
+- '10243'
+- '10244'
+- '10245'
+- '10246'
+- '10248'
+- '10249'
+- '10250'
+- '10252'
+- '10256'
+- '10257'
+- '10258'
+- '10260'
+- '10264'
+- '10272'
+- '10273'
+- '10274'
+- '10276'
+- '10280'
+- '10288'
+- '10304'
+- '10305'
+- '10306'
+- '10308'
+- '10312'
+- '10320'
+- '10336'
+- '10368'
+- '10369'
+- '10370'
+- '10372'
+- '10376'
+- '10384'
+- '10400'
+- '10432'
+- '10496'
+- '10497'
+- '10498'
+- '10500'
+- '10504'
+- '10512'
+- '10528'
+- '10560'
+- '10624'
+- '10752'
+- '10753'
+- '10754'
+- '10756'
+- '10760'
+- '10768'
+- '10784'
+- '10816'
+- '10880'
+- '11008'
+- '11264'
+- '11265'
+- '11266'
+- '11268'
+- '11272'
+- '11280'
+- '11296'
+- '11328'
+- '11392'
+- '11520'
+- '11776'
+- '12288'
+- '12289'
+- '12290'
+- '12291'
+- '12292'
+- '12293'
+- '12294'
+- '12296'
+- '12297'
+- '12298'
+- '12300'
+- '12304'
+- '12305'
+- '12306'
+- '12308'
+- '12312'
+- '12320'
+- '12321'
+- '12322'
+- '12324'
+- '12328'
+- '12336'
+- '12352'
+- '12353'
+- '12354'
+- '12356'
+- '12360'
+- '12368'
+- '12384'
+- '12416'
+- '12417'
+- '12418'
+- '12420'
+- '12424'
+- '12432'
+- '12448'
+- '12480'
+- '12544'
+- '12545'
+- '12546'
+- '12548'
+- '12552'
+- '12560'
+- '12576'
+- '12608'
+- '12672'
+- '12800'
+- '12801'
+- '12802'
+- '12804'
+- '12808'
+- '12816'
+- '12832'
+- '12864'
+- '12928'
+- '13056'
+- '13312'
+- '13313'
+- '13314'
+- '13316'
+- '13320'
+- '13328'
+- '13344'
+- '13376'
+- '13440'
+- '13568'
+- '13824'
+- '14336'
+- '14337'
+- '14338'
+- '14340'
+- '14344'
+- '14352'
+- '14368'
+- '14400'
+- '14464'
+- '14592'
+- '14848'
+- '15360'
+- '16384'
+- '16385'
+- '16386'
+- '16387'
+- '16388'
+- '16389'
+- '16390'
+- '16391'
+- '16392'
+- '16393'
+- '16394'
+- '16395'
+- '16396'
+- '16397'
+- '16398'
+- '16400'
+- '16401'
+- '16402'
+- '16403'
+- '16404'
+- '16405'
+- '16406'
+- '16408'
+- '16409'
+- '16410'
+- '16412'
+- '16416'
+- '16417'
+- '16418'
+- '16419'
+- '16420'
+- '16421'
+- '16422'
+- '16424'
+- '16425'
+- '16426'
+- '16428'
+- '16432'
+- '16433'
+- '16434'
+- '16436'
+- '16440'
+- '16448'
+- '16449'
+- '16450'
+- '16451'
+- '16452'
+- '16453'
+- '16454'
+- '16456'
+- '16457'
+- '16458'
+- '16460'
+- '16464'
+- '16465'
+- '16466'
+- '16468'
+- '16472'
+- '16480'
+- '16481'
+- '16482'
+- '16484'
+- '16488'
+- '16496'
+- '16512'
+- '16513'
+- '16514'
+- '16515'
+- '16516'
+- '16517'
+- '16518'
+- '16520'
+- '16521'
+- '16522'
+- '16524'
+- '16528'
+- '16529'
+- '16530'
+- '16532'
+- '16536'
+- '16544'
+- '16545'
+- '16546'
+- '16548'
+- '16552'
+- '16560'
+- '16576'
+- '16577'
+- '16578'
+- '16580'
+- '16584'
+- '16592'
+- '16608'
+- '16640'
+- '16641'
+- '16642'
+- '16643'
+- '16644'
+- '16645'
+- '16646'
+- '16648'
+- '16649'
+- '16650'
+- '16652'
+- '16656'
+- '16657'
+- '16658'
+- '16660'
+- '16664'
+- '16672'
+- '16673'
+- '16674'
+- '16676'
+- '16680'
+- '16688'
+- '16704'
+- '16705'
+- '16706'
+- '16708'
+- '16712'
+- '16720'
+- '16736'
+- '16768'
+- '16769'
+- '16770'
+- '16772'
+- '16776'
+- '16784'
+- '16800'
+- '16832'
+- '16896'
+- '16897'
+- '16898'
+- '16899'
+- '16900'
+- '16901'
+- '16902'
+- '16904'
+- '16905'
+- '16906'
+- '16908'
+- '16912'
+- '16913'
+- '16914'
+- '16916'
+- '16920'
+- '16928'
+- '16929'
+- '16930'
+- '16932'
+- '16936'
+- '16944'
+- '16960'
+- '16961'
+- '16962'
+- '16964'
+- '16968'
+- '16976'
+- '16992'
+- '17024'
+- '17025'
+- '17026'
+- '17028'
+- '17032'
+- '17040'
+- '17056'
+- '17088'
+- '17152'
+- '17153'
+- '17154'
+- '17156'
+- '17160'
+- '17168'
+- '17184'
+- '17216'
+- '17280'
+- '17408'
+- '17409'
+- '17410'
+- '17411'
+- '17412'
+- '17413'
+- '17414'
+- '17416'
+- '17417'
+- '17418'
+- '17420'
+- '17424'
+- '17425'
+- '17426'
+- '17428'
+- '17432'
+- '17440'
+- '17441'
+- '17442'
+- '17444'
+- '17448'
+- '17456'
+- '17472'
+- '17473'
+- '17474'
+- '17476'
+- '17480'
+- '17488'
+- '17504'
+- '17536'
+- '17537'
+- '17538'
+- '17540'
+- '17544'
+- '17552'
+- '17568'
+- '17600'
+- '17664'
+- '17665'
+- '17666'
+- '17668'
+- '17672'
+- '17680'
+- '17696'
+- '17728'
+- '17792'
+- '17920'
+- '17921'
+- '17922'
+- '17924'
+- '17928'
+- '17936'
+- '17952'
+- '17984'
+- '18048'
+- '18176'
+- '18432'
+- '18433'
+- '18434'
+- '18435'
+- '18436'
+- '18437'
+- '18438'
+- '18440'
+- '18441'
+- '18442'
+- '18444'
+- '18448'
+- '18449'
+- '18450'
+- '18452'
+- '18456'
+- '18464'
+- '18465'
+- '18466'
+- '18468'
+- '18472'
+- '18480'
+- '18496'
+- '18497'
+- '18498'
+- '18500'
+- '18504'
+- '18512'
+- '18528'
+- '18560'
+- '18561'
+- '18562'
+- '18564'
+- '18568'
+- '18576'
+- '18592'
+- '18624'
+- '18688'
+- '18689'
+- '18690'
+- '18692'
+- '18696'
+- '18704'
+- '18720'
+- '18752'
+- '18816'
+- '18944'
+- '18945'
+- '18946'
+- '18948'
+- '18952'
+- '18960'
+- '18976'
+- '19008'
+- '19072'
+- '19200'
+- '19456'
+- '19457'
+- '19458'
+- '19460'
+- '19464'
+- '19472'
+- '19488'
+- '19520'
+- '19584'
+- '19712'
+- '19968'
+- '20480'
+- '20481'
+- '20482'
+- '20483'
+- '20484'
+- '20485'
+- '20486'
+- '20488'
+- '20489'
+- '20490'
+- '20492'
+- '20496'
+- '20497'
+- '20498'
+- '20500'
+- '20504'
+- '20512'
+- '20513'
+- '20514'
+- '20516'
+- '20520'
+- '20528'
+- '20544'
+- '20545'
+- '20546'
+- '20548'
+- '20552'
+- '20560'
+- '20576'
+- '20608'
+- '20609'
+- '20610'
+- '20612'
+- '20616'
+- '20624'
+- '20640'
+- '20672'
+- '20736'
+- '20737'
+- '20738'
+- '20740'
+- '20744'
+- '20752'
+- '20768'
+- '20800'
+- '20864'
+- '20992'
+- '20993'
+- '20994'
+- '20996'
+- '21000'
+- '21008'
+- '21024'
+- '21056'
+- '21120'
+- '21248'
+- '21504'
+- '21505'
+- '21506'
+- '21508'
+- '21512'
+- '21520'
+- '21536'
+- '21568'
+- '21632'
+- '21760'
+- '22016'
+- '22528'
+- '22529'
+- '22530'
+- '22532'
+- '22536'
+- '22544'
+- '22560'
+- '22592'
+- '22656'
+- '22784'
+- '23040'
+- '23552'
+- '24576'
+- '24577'
+- '24578'
+- '24579'
+- '24580'
+- '24581'
+- '24582'
+- '24584'
+- '24585'
+- '24586'
+- '24588'
+- '24592'
+- '24593'
+- '24594'
+- '24596'
+- '24600'
+- '24608'
+- '24609'
+- '24610'
+- '24612'
+- '24616'
+- '24624'
+- '24640'
+- '24641'
+- '24642'
+- '24644'
+- '24648'
+- '24656'
+- '24672'
+- '24704'
+- '24705'
+- '24706'
+- '24708'
+- '24712'
+- '24720'
+- '24736'
+- '24768'
+- '24832'
+- '24833'
+- '24834'
+- '24836'
+- '24840'
+- '24848'
+- '24864'
+- '24896'
+- '24960'
+- '25088'
+- '25089'
+- '25090'
+- '25092'
+- '25096'
+- '25104'
+- '25120'
+- '25152'
+- '25216'
+- '25344'
+- '25600'
+- '25601'
+- '25602'
+- '25604'
+- '25608'
+- '25616'
+- '25632'
+- '25664'
+- '25728'
+- '25856'
+- '26112'
+- '26624'
+- '26625'
+- '26626'
+- '26628'
+- '26632'
+- '26640'
+- '26656'
+- '26688'
+- '26752'
+- '26880'
+- '27136'
+- '27648'
+- '28672'
+- '28673'
+- '28674'
+- '28676'
+- '28680'
+- '28688'
+- '28704'
+- '28736'
+- '28800'
+- '28928'
+- '29184'
+- '29696'
+- '30720'
+- '32768'
+- '32769'
+- '32770'
+- '32771'
+- '32772'
+- '32773'
+- '32774'
+- '32775'
+- '32776'
+- '32777'
+- '32778'
+- '32779'
+- '32780'
+- '32781'
+- '32782'
+- '32784'
+- '32785'
+- '32786'
+- '32787'
+- '32788'
+- '32789'
+- '32790'
+- '32792'
+- '32793'
+- '32794'
+- '32796'
+- '32800'
+- '32801'
+- '32802'
+- '32803'
+- '32804'
+- '32805'
+- '32806'
+- '32808'
+- '32809'
+- '32810'
+- '32812'
+- '32816'
+- '32817'
+- '32818'
+- '32820'
+- '32824'
+- '32832'
+- '32833'
+- '32834'
+- '32835'
+- '32836'
+- '32837'
+- '32838'
+- '32840'
+- '32841'
+- '32842'
+- '32844'
+- '32848'
+- '32849'
+- '32850'
+- '32852'
+- '32856'
+- '32864'
+- '32865'
+- '32866'
+- '32868'
+- '32872'
+- '32880'
+- '32896'
+- '32897'
+- '32898'
+- '32899'
+- '32900'
+- '32901'
+- '32902'
+- '32904'
+- '32905'
+- '32906'
+- '32908'
+- '32912'
+- '32913'
+- '32914'
+- '32916'
+- '32920'
+- '32928'
+- '32929'
+- '32930'
+- '32932'
+- '32936'
+- '32944'
+- '32960'
+- '32961'
+- '32962'
+- '32964'
+- '32968'
+- '32976'
+- '32992'
+- '33024'
+- '33025'
+- '33026'
+- '33027'
+- '33028'
+- '33029'
+- '33030'
+- '33032'
+- '33033'
+- '33034'
+- '33036'
+- '33040'
+- '33041'
+- '33042'
+- '33044'
+- '33048'
+- '33056'
+- '33057'
+- '33058'
+- '33060'
+- '33064'
+- '33072'
+- '33088'
+- '33089'
+- '33090'
+- '33092'
+- '33096'
+- '33104'
+- '33120'
+- '33152'
+- '33153'
+- '33154'
+- '33156'
+- '33160'
+- '33168'
+- '33184'
+- '33216'
+- '33280'
+- '33281'
+- '33282'
+- '33283'
+- '33284'
+- '33285'
+- '33286'
+- '33288'
+- '33289'
+- '33290'
+- '33292'
+- '33296'
+- '33297'
+- '33298'
+- '33300'
+- '33304'
+- '33312'
+- '33313'
+- '33314'
+- '33316'
+- '33320'
+- '33328'
+- '33344'
+- '33345'
+- '33346'
+- '33348'
+- '33352'
+- '33360'
+- '33376'
+- '33408'
+- '33409'
+- '33410'
+- '33412'
+- '33416'
+- '33424'
+- '33440'
+- '33472'
+- '33536'
+- '33537'
+- '33538'
+- '33540'
+- '33544'
+- '33552'
+- '33568'
+- '33600'
+- '33664'
+- '33792'
+- '33793'
+- '33794'
+- '33795'
+- '33796'
+- '33797'
+- '33798'
+- '33800'
+- '33801'
+- '33802'
+- '33804'
+- '33808'
+- '33809'
+- '33810'
+- '33812'
+- '33816'
+- '33824'
+- '33825'
+- '33826'
+- '33828'
+- '33832'
+- '33840'
+- '33856'
+- '33857'
+- '33858'
+- '33860'
+- '33864'
+- '33872'
+- '33888'
+- '33920'
+- '33921'
+- '33922'
+- '33924'
+- '33928'
+- '33936'
+- '33952'
+- '33984'
+- '34048'
+- '34049'
+- '34050'
+- '34052'
+- '34056'
+- '34064'
+- '34080'
+- '34112'
+- '34176'
+- '34304'
+- '34305'
+- '34306'
+- '34308'
+- '34312'
+- '34320'
+- '34336'
+- '34368'
+- '34432'
+- '34560'
+- '34816'
+- '34817'
+- '34818'
+- '34819'
+- '34820'
+- '34821'
+- '34822'
+- '34824'
+- '34825'
+- '34826'
+- '34828'
+- '34832'
+- '34833'
+- '34834'
+- '34836'
+- '34840'
+- '34848'
+- '34849'
+- '34850'
+- '34852'
+- '34856'
+- '34864'
+- '34880'
+- '34881'
+- '34882'
+- '34884'
+- '34888'
+- '34896'
+- '34912'
+- '34944'
+- '34945'
+- '34946'
+- '34948'
+- '34952'
+- '34960'
+- '34976'
+- '35008'
+- '35072'
+- '35073'
+- '35074'
+- '35076'
+- '35080'
+- '35088'
+- '35104'
+- '35136'
+- '35200'
+- '35328'
+- '35329'
+- '35330'
+- '35332'
+- '35336'
+- '35344'
+- '35360'
+- '35392'
+- '35456'
+- '35584'
+- '35840'
+- '35841'
+- '35842'
+- '35844'
+- '35848'
+- '35856'
+- '35872'
+- '35904'
+- '35968'
+- '36096'
+- '36352'
+- '36864'
+- '36865'
+- '36866'
+- '36867'
+- '36868'
+- '36869'
+- '36870'
+- '36872'
+- '36873'
+- '36874'
+- '36876'
+- '36880'
+- '36881'
+- '36882'
+- '36884'
+- '36888'
+- '36896'
+- '36897'
+- '36898'
+- '36900'
+- '36904'
+- '36912'
+- '36928'
+- '36929'
+- '36930'
+- '36932'
+- '36936'
+- '36944'
+- '36960'
+- '36992'
+- '36993'
+- '36994'
+- '36996'
+- '37000'
+- '37008'
+- '37024'
+- '37056'
+- '37120'
+- '37121'
+- '37122'
+- '37124'
+- '37128'
+- '37136'
+- '37152'
+- '37184'
+- '37248'
+- '37376'
+- '37377'
+- '37378'
+- '37380'
+- '37384'
+- '37392'
+- '37408'
+- '37440'
+- '37504'
+- '37632'
+- '37888'
+- '37889'
+- '37890'
+- '37892'
+- '37896'
+- '37904'
+- '37920'
+- '37952'
+- '38016'
+- '38144'
+- '38400'
+- '38912'
+- '38913'
+- '38914'
+- '38916'
+- '38920'
+- '38928'
+- '38944'
+- '38976'
+- '39040'
+- '39168'
+- '39424'
+- '39936'
+- '40960'
+- '40961'
+- '40962'
+- '40963'
+- '40964'
+- '40965'
+- '40966'
+- '40968'
+- '40969'
+- '40970'
+- '40972'
+- '40976'
+- '40977'
+- '40978'
+- '40980'
+- '40984'
+- '40992'
+- '40993'
+- '40994'
+- '40996'
+- '41000'
+- '41008'
+- '41024'
+- '41025'
+- '41026'
+- '41028'
+- '41032'
+- '41040'
+- '41056'
+- '41088'
+- '41089'
+- '41090'
+- '41092'
+- '41096'
+- '41104'
+- '41120'
+- '41152'
+- '41216'
+- '41217'
+- '41218'
+- '41220'
+- '41224'
+- '41232'
+- '41248'
+- '41280'
+- '41344'
+- '41472'
+- '41473'
+- '41474'
+- '41476'
+- '41480'
+- '41488'
+- '41504'
+- '41536'
+- '41600'
+- '41728'
+- '41984'
+- '41985'
+- '41986'
+- '41988'
+- '41992'
+- '42000'
+- '42016'
+- '42048'
+- '42112'
+- '42240'
+- '42496'
+- '43008'
+- '43009'
+- '43010'
+- '43012'
+- '43016'
+- '43024'
+- '43040'
+- '43072'
+- '43136'
+- '43264'
+- '43520'
+- '44032'
+- '45056'
+- '45057'
+- '45058'
+- '45060'
+- '45064'
+- '45072'
+- '45088'
+- '45120'
+- '45184'
+- '45312'
+- '45568'
+- '46080'
+- '47104'
+- '49152'
+- '49153'
+- '49154'
+- '49155'
+- '49156'
+- '49157'
+- '49158'
+- '49160'
+- '49161'
+- '49162'
+- '49164'
+- '49168'
+- '49169'
+- '49170'
+- '49172'
+- '49176'
+- '49184'
+- '49185'
+- '49186'
+- '49188'
+- '49192'
+- '49200'
+- '49216'
+- '49217'
+- '49218'
+- '49220'
+- '49224'
+- '49232'
+- '49248'
+- '49280'
+- '49281'
+- '49282'
+- '49284'
+- '49288'
+- '49296'
+- '49312'
+- '49344'
+- '49408'
+- '49409'
+- '49410'
+- '49412'
+- '49416'
+- '49424'
+- '49440'
+- '49472'
+- '49536'
+- '49664'
+- '49665'
+- '49666'
+- '49668'
+- '49672'
+- '49680'
+- '49696'
+- '49728'
+- '49792'
+- '49920'
+- '50176'
+- '50177'
+- '50178'
+- '50180'
+- '50184'
+- '50192'
+- '50208'
+- '50240'
+- '50304'
+- '50432'
+- '50688'
+- '51200'
+- '51201'
+- '51202'
+- '51204'
+- '51208'
+- '51216'
+- '51232'
+- '51264'
+- '51328'
+- '51456'
+- '51712'
+- '52224'
+- '53248'
+- '53249'
+- '53250'
+- '53252'
+- '53256'
+- '53264'
+- '53280'
+- '53312'
+- '53376'
+- '53504'
+- '53760'
+- '54272'
+- '55296'
+- '57344'
+- '57345'
+- '57346'
+- '57348'
+- '57352'
+- '57360'
+- '57376'
+- '57408'
+- '57472'
+- '57600'
+- '57856'
+- '58368'
+- '59392'
+- '61440'
+init: null
+input_size: 80
+cmvn_file: null
+ctc_conf:
+ dropout_rate: 0.0
+ ctc_type: builtin
+ reduce: true
+ ignore_nan_grad: true
+joint_net_conf: null
+use_preprocessor: true
+token_type: char
+bpemodel: null
+non_linguistic_symbols: null
+cleaner: null
+g2p: null
+speech_volume_normalize: null
+rir_scp: null
+rir_apply_prob: 1.0
+noise_scp: null
+noise_apply_prob: 1.0
+noise_db_range: '13_15'
+specaug: null
+specaug_conf: {}
+normalize: null
+normalize_conf: {}
+label_aggregator: null
+label_aggregator_conf: {}
+model: sond
+model_conf:
+ lsm_weight: 0.1
+ length_normalized_loss: true
+ max_spk_num: 16
+ normalize_speech_speaker: true
+# speech encoder
+encoder: resnet34_sp_l2reg
+encoder_conf:
+ # pass by model, equal to feature dim
+ # input_size: 80
+ batchnorm_momentum: 0.01
+ pooling_type: "window_shift"
+ pool_size: 20
+ stride: 1
+ tf2torch_tensor_name_prefix_torch: encoder
+ tf2torch_tensor_name_prefix_tf: EAND/speech_encoder
+speaker_encoder: null
+speaker_encoder_conf: {}
+ci_scorer: conv
+ci_scorer_conf:
+ input_units: 512
+ num_layers: 3
+ num_units: 512
+ kernel_size: 1
+ dropout_rate: 0.0
+ position_encoder: null
+ out_units: 1
+ out_norm: false
+ auxiliary_states: false
+ tf2torch_tensor_name_prefix_torch: ci_scorer
+ tf2torch_tensor_name_prefix_tf: EAND/compute_distance_layer/ci_scorer
+cd_scorer: san
+cd_scorer_conf:
+ input_size: 512
+ output_size: 512
+ out_units: 1
+ attention_heads: 4
+ linear_units: 1024
+ num_blocks: 4
+ dropout_rate: 0.0
+ positional_dropout_rate: 0.0
+ attention_dropout_rate: 0.0
+ # use string "null" to remove input layer
+ input_layer: "null"
+ pos_enc_class: null
+ normalize_before: true
+ tf2torch_tensor_name_prefix_torch: cd_scorer
+ tf2torch_tensor_name_prefix_tf: EAND/compute_distance_layer/cd_scorer
+# post net
+decoder: fsmn
+decoder_conf:
+ in_units: 32
+ out_units: 2517
+ filter_size: 31
+ fsmn_num_layers: 6
+ dnn_num_layers: 1
+ num_memory_units: 16
+ ffn_inner_dim: 512
+ dropout_rate: 0.0
+ tf2torch_tensor_name_prefix_torch: decoder
+ tf2torch_tensor_name_prefix_tf: EAND/post_net
+frontend: null
+frontend_conf:
+ fs: 8000
+ window: povey
+ n_mels: 80
+ frame_length: 25
+ frame_shift: 10
+ filter_length_min: -1
+ filter_length_max: -1
+ lfr_m: 1
+ lfr_n: 1
+ dither: 0.0
+ snip_edges: false
+ upsacle_samples: false
+num_worker_count: 0
+required:
+- output_dir
+- token_list
+oss_bucket: 'null'
+version: 0.1.4
diff --git a/egs/callhome/sond/unit_test.py b/egs/callhome/sond/unit_test.py
new file mode 100644
index 0000000..a48eda1
--- /dev/null
+++ b/egs/callhome/sond/unit_test.py
@@ -0,0 +1,97 @@
+from funasr.bin.diar_inference_launch import inference_launch
+import os
+
+
+def test_fbank_cpu_infer():
+ diar_config_path = "sond_fbank.yaml"
+ diar_model_path = "sond.pb"
+ output_dir = "./outputs"
+ data_path_and_name_and_type = [
+ ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
+ ("data/unit_test/test_profile.scp", "profile", "kaldi_ark"),
+ ]
+ pipeline = inference_launch(
+ mode="sond",
+ diar_train_config=diar_config_path,
+ diar_model_file=diar_model_path,
+ output_dir=output_dir,
+ num_workers=0,
+ log_level="INFO",
+ )
+ results = pipeline(data_path_and_name_and_type)
+ print(results)
+
+
+def test_fbank_gpu_infer():
+ diar_config_path = "sond_fbank.yaml"
+ diar_model_path = "sond.pb"
+ output_dir = "./outputs"
+ data_path_and_name_and_type = [
+ ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
+ ("data/unit_test/test_profile.scp", "profile", "kaldi_ark"),
+ ]
+ pipeline = inference_launch(
+ mode="sond",
+ diar_train_config=diar_config_path,
+ diar_model_file=diar_model_path,
+ output_dir=output_dir,
+ ngpu=1,
+ num_workers=1,
+ log_level="INFO",
+ )
+ results = pipeline(data_path_and_name_and_type)
+ print(results)
+
+
+def test_wav_gpu_infer():
+ diar_config_path = "config.yaml"
+ diar_model_path = "sond.pb"
+ output_dir = "./outputs"
+ data_path_and_name_and_type = [
+ ("data/unit_test/test_wav.scp", "speech", "sound"),
+ ("data/unit_test/test_profile.scp", "profile", "kaldi_ark"),
+ ]
+ pipeline = inference_launch(
+ mode="sond",
+ diar_train_config=diar_config_path,
+ diar_model_file=diar_model_path,
+ output_dir=output_dir,
+ ngpu=1,
+ num_workers=1,
+ log_level="WARNING",
+ )
+ results = pipeline(data_path_and_name_and_type)
+ print(results)
+
+
+def test_without_profile_gpu_infer():
+ diar_config_path = "config.yaml"
+ diar_model_path = "sond.pb"
+ output_dir = "./outputs"
+ raw_inputs = [[
+ "data/unit_test/raw_inputs/record.wav",
+ "data/unit_test/raw_inputs/spk1.wav",
+ "data/unit_test/raw_inputs/spk2.wav",
+ "data/unit_test/raw_inputs/spk3.wav",
+ "data/unit_test/raw_inputs/spk4.wav"
+ ]]
+ pipeline = inference_launch(
+ mode="sond_demo",
+ diar_train_config=diar_config_path,
+ diar_model_file=diar_model_path,
+ output_dir=output_dir,
+ ngpu=1,
+ num_workers=1,
+ log_level="WARNING",
+ param_dict={},
+ )
+ results = pipeline(raw_inputs=raw_inputs)
+ print(results)
+
+
+if __name__ == '__main__':
+ os.environ["CUDA_VISIBLE_DEVICES"] = "7"
+ test_fbank_cpu_infer()
+ # test_fbank_gpu_infer()
+ # test_wav_gpu_infer()
+ # test_without_profile_gpu_infer()
diff --git a/egs/librispeech/branchformer/conf/decode_asr_transformer_beam10_ctc0.3.yaml b/egs/librispeech/branchformer/conf/decode_asr_transformer_beam10_ctc0.3.yaml
new file mode 100644
index 0000000..62745de
--- /dev/null
+++ b/egs/librispeech/branchformer/conf/decode_asr_transformer_beam10_ctc0.3.yaml
@@ -0,0 +1,6 @@
+beam_size: 10
+penalty: 0.0
+maxlenratio: 0.0
+minlenratio: 0.0
+ctc_weight: 0.3
+lm_weight: 0.0
diff --git a/egs/librispeech/branchformer/conf/train_asr_branchformer.yaml b/egs/librispeech/branchformer/conf/train_asr_branchformer.yaml
new file mode 100644
index 0000000..d77755c
--- /dev/null
+++ b/egs/librispeech/branchformer/conf/train_asr_branchformer.yaml
@@ -0,0 +1,104 @@
+# network architecture
+# encoder related
+encoder: branchformer
+encoder_conf:
+ output_size: 512
+ use_attn: true
+ attention_heads: 8
+ attention_layer_type: rel_selfattn
+ pos_enc_layer_type: rel_pos
+ rel_pos_type: latest
+ use_cgmlp: true
+ cgmlp_linear_units: 3072
+ cgmlp_conv_kernel: 31
+ use_linear_after_conv: false
+ gate_activation: identity
+ merge_method: concat
+ cgmlp_weight: 0.5 # used only if merge_method is "fixed_ave"
+ attn_branch_drop_rate: 0.0 # used only if merge_method is "learned_ave"
+ num_blocks: 18
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ attention_dropout_rate: 0.1
+ input_layer: conv2d
+ stochastic_depth_rate: 0.0
+
+# decoder related
+decoder: transformer
+decoder_conf:
+ attention_heads: 8
+ linear_units: 2048
+ num_blocks: 6
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.1
+ src_attention_dropout_rate: 0.1
+
+# frontend related
+frontend: wav_frontend
+frontend_conf:
+ fs: 16000
+ window: hamming
+ n_mels: 80
+ frame_length: 25
+ frame_shift: 10
+ lfr_m: 1
+ lfr_n: 1
+
+# hybrid CTC/attention
+model_conf:
+ ctc_weight: 0.3
+ lsm_weight: 0.1 # label smoothing option
+ length_normalized_loss: false
+
+# optimization related
+accum_grad: 2
+grad_clip: 5
+max_epoch: 210
+val_scheduler_criterion:
+ - valid
+ - acc
+best_model_criterion:
+- - valid
+ - acc
+ - max
+keep_nbest_models: 10
+
+optim: adam
+optim_conf:
+ lr: 0.0025
+ weight_decay: 0.000001
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 40000
+
+specaug: specaug
+specaug_conf:
+ apply_time_warp: true
+ time_warp_window: 5
+ time_warp_mode: bicubic
+ apply_freq_mask: true
+ freq_mask_width_range:
+ - 0
+ - 27
+ num_freq_mask: 2
+ apply_time_mask: true
+ time_mask_width_ratio_range:
+ - 0.
+ - 0.05
+ num_time_mask: 10
+
+dataset_conf:
+ data_names: speech,text
+ data_types: sound,text
+ shuffle: True
+ shuffle_conf:
+ shuffle_size: 2048
+ sort_size: 500
+ batch_conf:
+ batch_type: token
+ batch_size: 30000
+ num_workers: 8
+
+log_interval: 50
+normalize: None
\ No newline at end of file
diff --git a/egs/librispeech/branchformer/local/data_prep.sh b/egs/librispeech/branchformer/local/data_prep.sh
new file mode 100755
index 0000000..c939b5f
--- /dev/null
+++ b/egs/librispeech/branchformer/local/data_prep.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+
+# Copyright 2014 Vassil Panayotov
+# 2014 Johns Hopkins University (author: Daniel Povey)
+# Apache 2.0
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 <src-dir> <dst-dir>"
+  echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean"
+  exit 1
+fi
+
+src=$1
+dst=$2
+
+# all utterances are FLAC compressed
+if ! which flac >&/dev/null; then
+  echo "Please install 'flac' on ALL worker nodes!"
+  exit 1
+fi
+
+spk_file=$src/../SPEAKERS.TXT
+
+mkdir -p "$dst" || exit 1
+
+[ ! -d "$src" ] && echo "$0: no such directory $src" && exit 1
+[ ! -f "$spk_file" ] && echo "$0: expected file $spk_file to exist" && exit 1
+
+
+wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm "$wav_scp" # remove stale output; the loop below appends
+trans=$dst/text; [[ -f "$trans" ]] && rm "$trans" # remove stale output; the loop below appends
+
+for reader_dir in $(find -L "$src" -mindepth 1 -maxdepth 1 -type d | sort); do # NOTE: word-splitting, so directory names must not contain whitespace
+  reader=$(basename $reader_dir)
+  if ! [ "$reader" -eq "$reader" ]; then # not integer.
+    echo "$0: unexpected subdirectory name $reader"
+    exit 1
+  fi
+
+  for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do
+    chapter=$(basename $chapter_dir)
+    if ! [ "$chapter" -eq "$chapter" ]; then # not integer.
+      echo "$0: unexpected chapter-subdirectory name $chapter"
+      exit 1
+    fi
+
+    find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \
+      awk -v "dir=$chapter_dir" '{printf "%s %s/%s.flac \n", $0, dir, $0}' >>"$wav_scp" || exit 1
+
+    chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt
+    [ ! -f "$chapter_trans" ] && echo "$0: expected file $chapter_trans to exist" && exit 1
+    cat "$chapter_trans" >>"$trans"
+  done
+done
+
+echo "$0: successfully prepared data in $dst"
+
+exit 0
diff --git a/egs/librispeech/branchformer/local/download_and_untar.sh b/egs/librispeech/branchformer/local/download_and_untar.sh
new file mode 100755
index 0000000..fe322e4
--- /dev/null
+++ b/egs/librispeech/branchformer/local/download_and_untar.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+
+# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+ remove_archive=true
+ shift
+fi
+
+if [ $# -ne 3 ]; then
+ echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+ echo "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/11 dev-clean"
+ echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+ echo "<corpus-part> can be one of: dev-clean, test-clean, dev-other, test-other,"
+ echo " train-clean-100, train-clean-360, train-other-500."
+ exit 1
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+ echo "$0: no such directory $data"
+ exit 1
+fi
+
+part_ok=false
+list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500"
+for x in $list; do
+ if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+ echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+ exit 1
+fi
+
+if [ -z "$url" ]; then
+ echo "$0: empty URL base."
+ exit 1
+fi
+
+if [ -f $data/LibriSpeech/$part/.complete ]; then # marker file created by the 'touch' after a successful un-tar below
+ echo "$0: data part $part was already successfully extracted, nothing to do."
+ exit 0
+fi
+
+
+# Sizes of the archive files in bytes. These are from some older versions.
+sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128"
+# sizes_new holds the archive file sizes of the final release. Some of these sizes are of
+# things we probably won't download.
+sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606"
+
+if [ -f $data/$part.tar.gz ]; then
+ size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}')
+ size_ok=false
+ for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done
+ if ! $size_ok; then
+ echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
+ echo "does not equal the size of one of the archives."
+ rm $data/$part.tar.gz
+ else
+ echo "$data/$part.tar.gz exists and appears to be complete."
+ fi
+fi
+
+if [ ! -f $data/$part.tar.gz ]; then
+ if ! which wget >/dev/null; then
+ echo "$0: wget is not installed."
+ exit 1
+ fi
+ full_url=$url/$part.tar.gz
+ echo "$0: downloading data from $full_url. This may take some time, please be patient."
+
+ if ! wget -P $data --no-check-certificate $full_url; then # NOTE(review): certificate verification is disabled for this download
+ echo "$0: error executing wget $full_url"
+ exit 1
+ fi
+fi
+
+if ! tar -C $data -xvzf $data/$part.tar.gz; then
+ echo "$0: error un-tarring archive $data/$part.tar.gz"
+ exit 1
+fi
+
+touch $data/LibriSpeech/$part/.complete
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"
+
+if $remove_archive; then
+ echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
+ rm $data/$part.tar.gz
+fi
diff --git a/egs/librispeech/branchformer/local/spm_encode.py b/egs/librispeech/branchformer/local/spm_encode.py
new file mode 100755
index 0000000..9e1c15f
--- /dev/null
+++ b/egs/librispeech/branchformer/local/spm_encode.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in
+# https://github.com/pytorch/fairseq/blob/master/LICENSE
+
+
+import argparse
+import contextlib
+import sys
+
+import sentencepiece as spm
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", required=True,
+ help="sentencepiece model to use for encoding")
+ parser.add_argument("--inputs", nargs="+", default=['-'],
+ help="input files to filter/encode")
+ parser.add_argument("--outputs", nargs="+", default=['-'],
+ help="path to save encoded outputs")
+ parser.add_argument("--output_format", choices=["piece", "id"], default="piece")
+ parser.add_argument("--min-len", type=int, metavar="N",
+ help="filter sentence pairs with fewer than N tokens")
+ parser.add_argument("--max-len", type=int, metavar="N",
+ help="filter sentence pairs with more than N tokens")
+ args = parser.parse_args()
+
+ assert len(args.inputs) == len(args.outputs), \
+ "number of input and output paths should match"
+
+ sp = spm.SentencePieceProcessor()
+ sp.Load(args.model)
+
+ if args.output_format == "piece":
+ def encode(l):
+ return sp.EncodeAsPieces(l)
+ elif args.output_format == "id":
+ def encode(l):
+ return list(map(str, sp.EncodeAsIds(l)))
+ else:
+ raise NotImplementedError
+
+ if args.min_len is not None or args.max_len is not None:
+ def valid(line):
+ return (
+ (args.min_len is None or len(line) >= args.min_len) and
+ (args.max_len is None or len(line) <= args.max_len)
+ )
+ else:
+ def valid(lines):
+ return True
+
+ with contextlib.ExitStack() as stack:
+ inputs = [
+ stack.enter_context(open(input, "r", encoding="utf-8"))
+ if input != "-" else sys.stdin
+ for input in args.inputs
+ ]
+ outputs = [
+ stack.enter_context(open(output, "w", encoding="utf-8"))
+ if output != "-" else sys.stdout
+ for output in args.outputs
+ ]
+
+ stats = {
+ "num_empty": 0,
+ "num_filtered": 0,
+ }
+
+ def encode_line(line):
+ line = line.strip()
+ if len(line) > 0:
+ line = encode(line)
+ if valid(line):
+ return line
+ else:
+ stats["num_filtered"] += 1
+ else:
+ stats["num_empty"] += 1
+ return None
+
+ for i, lines in enumerate(zip(*inputs), start=1):
+ enc_lines = list(map(encode_line, lines))
+ if not any(enc_line is None for enc_line in enc_lines):
+ for enc_line, output_h in zip(enc_lines, outputs):
+ print(" ".join(enc_line), file=output_h)
+ if i % 10000 == 0:
+ print("processed {} lines".format(i), file=sys.stderr)
+
+ print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr)
+ print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/egs/librispeech/branchformer/local/spm_train.py b/egs/librispeech/branchformer/local/spm_train.py
new file mode 100755
index 0000000..134a0b1
--- /dev/null
+++ b/egs/librispeech/branchformer/local/spm_train.py
@@ -0,0 +1,12 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# https://github.com/pytorch/fairseq/blob/master/LICENSE
+import sys
+
+import sentencepiece as spm
+
+if __name__ == "__main__":
+ spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:]))
diff --git a/egs/librispeech/branchformer/path.sh b/egs/librispeech/branchformer/path.sh
new file mode 100755
index 0000000..7972642
--- /dev/null
+++ b/egs/librispeech/branchformer/path.sh
@@ -0,0 +1,5 @@
+export FUNASR_DIR=$PWD/../../..
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PATH=$FUNASR_DIR/funasr/bin:$PATH
diff --git a/egs/librispeech/branchformer/run.sh b/egs/librispeech/branchformer/run.sh
new file mode 100755
index 0000000..f925eb5
--- /dev/null
+++ b/egs/librispeech/branchformer/run.sh
@@ -0,0 +1,223 @@
+#!/usr/bin/env bash
+
+. ./path.sh || exit 1;
+
+# machines configuration
+CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+gpu_num=8
+count=1
+gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding
+# for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
+njob=5
+train_cmd=utils/run.pl
+infer_cmd=utils/run.pl
+
+# general configuration
+feats_dir="../DATA" # feature output directory
+exp_dir="."
+lang=en
+token_type=bpe
+type=sound
+scp=wav.scp
+speed_perturb="0.9 1.0 1.1"
+stage=0
+stop_stage=5
+
+# feature configuration
+feats_dim=80
+nj=64
+
+# data
+raw_data=
+data_url=www.openslr.org/resources/12
+
+# bpe model
+nbpe=5000
+bpemode=unigram
+
+# exp tag
+tag="exp1"
+
+. utils/parse_options.sh || exit 1;
+
+# Set bash to strict mode; the script will exit on:
+# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline' (-x 'print commands' is not enabled)
+set -e
+set -u
+set -o pipefail
+
+train_set=train_960
+valid_set=dev
+test_sets="test_clean test_other dev_clean dev_other"
+
+asr_config=conf/train_asr_branchformer.yaml
+model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}"
+
+inference_config=conf/decode_asr_transformer_beam10_ctc0.3.yaml
+inference_asr_model=valid.acc.ave_10best.pb
+
+# you can set gpu num for decoding here
+gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
+ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
+
+if ${gpu_inference}; then
+ inference_nj=$[${ngpu}*${njob}]
+ _ngpu=1
+else
+ inference_nj=$njob
+ _ngpu=0
+fi
+
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+ echo "stage -1: Data Download"
+ for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
+ local/download_and_untar.sh ${raw_data} ${data_url} ${part}
+ done
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    echo "stage 0: Data preparation"
+    # Prepare each LibriSpeech part, then merge wav.scp/text; 'mkdir -p' keeps this stage re-runnable under 'set -e'
+    for x in dev-clean dev-other test-clean test-other train-clean-100 train-clean-360 train-other-500; do
+        local/data_prep.sh ${raw_data}/LibriSpeech/${x} ${feats_dir}/data/${x//-/_}
+    done
+    mkdir -p $feats_dir/data/$valid_set
+    dev_sets="dev_clean dev_other"
+    for file in wav.scp text; do
+        ( for f in $dev_sets; do cat $feats_dir/data/$f/$file; done ) | sort -k1 > $feats_dir/data/$valid_set/$file || exit 1;
+    done
+    mkdir -p $feats_dir/data/$train_set
+    train_sets="train_clean_100 train_clean_360 train_other_500"
+    for file in wav.scp text; do
+        ( for f in $train_sets; do cat $feats_dir/data/$f/$file; done ) | sort -k1 > $feats_dir/data/$train_set/$file || exit 1;
+    done
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ echo "stage 1: Feature and CMVN Generation"
+ utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0
+fi
+
+token_list=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
+bpemodel=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}
+echo "dictionary: ${token_list}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ ### Task dependent. You have to check non-linguistic symbols used in the corpus.
+ echo "stage 2: Dictionary and Json Data Preparation"
+ mkdir -p ${feats_dir}/data/lang_char/
+ echo "<blank>" > ${token_list}
+ echo "<s>" >> ${token_list}
+ echo "</s>" >> ${token_list}
+ cut -f 2- -d" " ${feats_dir}/data/${train_set}/text > ${feats_dir}/data/lang_char/input.txt
+ local/spm_train.py --input=${feats_dir}/data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
+ local/spm_encode.py --model=${bpemodel}.model --output_format=piece < ${feats_dir}/data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0}' >> ${token_list}
+ echo "<unk>" >> ${token_list}
+fi
+
+# LM Training Stage
+world_size=$gpu_num # run on one machine
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ echo "stage 3: LM Training"
+fi
+
+# ASR Training Stage
+world_size=$gpu_num # run on one machine
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ echo "stage 4: ASR Training"
+ mkdir -p ${exp_dir}/exp/${model_dir}
+ mkdir -p ${exp_dir}/exp/${model_dir}/log
+ INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init
+ if [ -f $INIT_FILE ];then
+ rm -f $INIT_FILE
+ fi
+ init_method=file://$(readlink -f $INIT_FILE)
+ echo "$0: init method is $init_method"
+ for ((i = 0; i < $gpu_num; ++i)); do
+ {
+ rank=$i
+ local_rank=$i
+ gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
+ train.py \
+ --task_name asr \
+ --gpu_id $gpu_id \
+ --use_preprocessor true \
+ --split_with_space false \
+ --bpemodel ${bpemodel}.model \
+ --token_type $token_type \
+ --token_list $token_list \
+ --dataset_type large \
+ --data_dir ${feats_dir}/data \
+ --train_set ${train_set} \
+ --valid_set ${valid_set} \
+ --cmvn_file ${feats_dir}/data/${train_set}/cmvn/am.mvn \
+ --speed_perturb ${speed_perturb} \
+ --resume true \
+ --output_dir ${exp_dir}/exp/${model_dir} \
+ --config $asr_config \
+ --ngpu $gpu_num \
+ --num_worker_count $count \
+ --multiprocessing_distributed true \
+ --dist_init_method $init_method \
+ --dist_world_size $world_size \
+ --dist_rank $rank \
+ --local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
+ } &
+ done
+ wait
+fi
+
+# Testing Stage
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+ echo "stage 5: Inference"
+ for dset in ${test_sets}; do
+ asr_exp=${exp_dir}/exp/${model_dir}
+ inference_tag="$(basename "${inference_config}" .yaml)"
+ _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
+ _logdir="${_dir}/logdir"
+ if [ -d ${_dir} ]; then
+ echo "${_dir} is already exists. if you want to decode again, please delete this dir first."
+ exit 0
+ fi
+ mkdir -p "${_logdir}"
+ _data="${feats_dir}/data/${dset}"
+ key_file=${_data}/${scp}
+ num_scp_file="$(<${key_file} wc -l)"
+ _nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file")
+ split_scps=
+ for n in $(seq "${_nj}"); do
+ split_scps+=" ${_logdir}/keys.${n}.scp"
+ done
+ # shellcheck disable=SC2086
+ utils/split_scp.pl "${key_file}" ${split_scps}
+ _opts=
+ if [ -n "${inference_config}" ]; then
+ _opts+="--config ${inference_config} "
+ fi
+ ${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
+ python -m funasr.bin.asr_inference_launch \
+ --batch_size 1 \
+ --ngpu "${_ngpu}" \
+ --njob ${njob} \
+ --gpuid_list ${gpuid_list} \
+ --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
+ --cmvn_file ${feats_dir}/data/${train_set}/cmvn/am.mvn \
+ --key_file "${_logdir}"/keys.JOB.scp \
+ --asr_train_config "${asr_exp}"/config.yaml \
+ --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
+ --output_dir "${_logdir}"/output.JOB \
+ --mode asr \
+ ${_opts}
+
+ for f in token token_int score text; do
+ if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
+ for i in $(seq "${_nj}"); do
+ cat "${_logdir}/output.${i}/1best_recog/${f}"
+ done | sort -k1 >"${_dir}/${f}"
+ fi
+ done
+ python utils/compute_wer.py ${_data}/text ${_dir}/text ${_dir}/text.cer
+ tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
+ cat ${_dir}/text.cer.txt
+ done
+fi
\ No newline at end of file
diff --git a/egs/librispeech/branchformer/utils b/egs/librispeech/branchformer/utils
new file mode 120000
index 0000000..fe070dd
--- /dev/null
+++ b/egs/librispeech/branchformer/utils
@@ -0,0 +1 @@
+../../aishell/transformer/utils
\ No newline at end of file
diff --git a/egs/librispeech/e_branchformer/conf/decode_asr_transformer_beam10_ctc0.3.yaml b/egs/librispeech/e_branchformer/conf/decode_asr_transformer_beam10_ctc0.3.yaml
new file mode 100644
index 0000000..62745de
--- /dev/null
+++ b/egs/librispeech/e_branchformer/conf/decode_asr_transformer_beam10_ctc0.3.yaml
@@ -0,0 +1,6 @@
+beam_size: 10
+penalty: 0.0
+maxlenratio: 0.0
+minlenratio: 0.0
+ctc_weight: 0.3
+lm_weight: 0.0
diff --git a/egs/librispeech/e_branchformer/conf/train_asr_e_branchformer.yaml b/egs/librispeech/e_branchformer/conf/train_asr_e_branchformer.yaml
new file mode 100644
index 0000000..c3607ae
--- /dev/null
+++ b/egs/librispeech/e_branchformer/conf/train_asr_e_branchformer.yaml
@@ -0,0 +1,105 @@
+# network architecture
+# encoder related
+encoder: e_branchformer
+encoder_conf:
+ output_size: 512
+ attention_heads: 8
+ attention_layer_type: rel_selfattn
+ pos_enc_layer_type: rel_pos
+ rel_pos_type: latest
+ cgmlp_linear_units: 3072
+ cgmlp_conv_kernel: 31
+ use_linear_after_conv: false
+ gate_activation: identity
+ num_blocks: 17
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ attention_dropout_rate: 0.1
+ input_layer: conv2d
+ layer_drop_rate: 0.1
+ linear_units: 1024
+ positionwise_layer_type: linear
+ macaron_ffn: true
+ use_ffn: true
+ merge_conv_kernel: 31
+
+# decoder related
+decoder: transformer
+decoder_conf:
+ attention_heads: 8
+ linear_units: 2048
+ num_blocks: 6
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.1
+ src_attention_dropout_rate: 0.1
+ layer_drop_rate: 0.2
+
+# frontend related
+frontend: wav_frontend
+frontend_conf:
+ fs: 16000
+ window: hamming
+ n_mels: 80
+ frame_length: 25
+ frame_shift: 10
+ lfr_m: 1
+ lfr_n: 1
+
+# hybrid CTC/attention
+model_conf:
+ ctc_weight: 0.3
+ lsm_weight: 0.1 # label smoothing option
+ length_normalized_loss: false
+
+# optimization related
+accum_grad: 2
+grad_clip: 5
+max_epoch: 240
+val_scheduler_criterion:
+ - valid
+ - acc
+best_model_criterion:
+- - valid
+ - acc
+ - max
+keep_nbest_models: 10
+
+optim: adam
+optim_conf:
+ lr: 0.002
+ weight_decay: 0.000001
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 40000
+
+specaug: specaug
+specaug_conf:
+ apply_time_warp: true
+ time_warp_window: 5
+ time_warp_mode: bicubic
+ apply_freq_mask: true
+ freq_mask_width_range:
+ - 0
+ - 27
+ num_freq_mask: 2
+ apply_time_mask: true
+ time_mask_width_ratio_range:
+ - 0.
+ - 0.05
+ num_time_mask: 10
+
+dataset_conf:
+ data_names: speech,text
+ data_types: sound,text
+ shuffle: True
+ shuffle_conf:
+ shuffle_size: 2048
+ sort_size: 500
+ batch_conf:
+ batch_type: token
+ batch_size: 30000
+ num_workers: 8
+
+log_interval: 50
+normalize: None
\ No newline at end of file
diff --git a/egs/librispeech/e_branchformer/local/data_prep.sh b/egs/librispeech/e_branchformer/local/data_prep.sh
new file mode 100755
index 0000000..c939b5f
--- /dev/null
+++ b/egs/librispeech/e_branchformer/local/data_prep.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+
+# Copyright 2014 Vassil Panayotov
+# 2014 Johns Hopkins University (author: Daniel Povey)
+# Apache 2.0
+
+if [ "$#" -ne 2 ]; then
+ echo "Usage: $0 <src-dir> <dst-dir>"
+ echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean"
+ exit 1
+fi
+
+src=$1
+dst=$2
+
+# all utterances are FLAC compressed
+if ! which flac >&/dev/null; then
+ echo "Please install 'flac' on ALL worker nodes!"
+ exit 1
+fi
+
+spk_file=$src/../SPEAKERS.TXT
+
+mkdir -p $dst || exit 1
+
+[ ! -d $src ] && echo "$0: no such directory $src" && exit 1
+[ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1
+
+
+wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp
+trans=$dst/text; [[ -f "$trans" ]] && rm $trans
+
+for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do
+ reader=$(basename $reader_dir)
+ if ! [ $reader -eq $reader ]; then # not integer.
+ echo "$0: unexpected subdirectory name $reader"
+ exit 1
+ fi
+
+ for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do
+ chapter=$(basename $chapter_dir)
+ if ! [ "$chapter" -eq "$chapter" ]; then
+ echo "$0: unexpected chapter-subdirectory name $chapter"
+ exit 1
+ fi
+
+ find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \
+ awk -v "dir=$chapter_dir" '{printf "%s %s/%s.flac \n", $0, dir, $0}' >>$wav_scp|| exit 1
+
+ chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt
+ [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1
+ cat $chapter_trans >>$trans
+ done
+done
+
+echo "$0: successfully prepared data in $dst"
+
+exit 0
diff --git a/egs/librispeech/e_branchformer/local/download_and_untar.sh b/egs/librispeech/e_branchformer/local/download_and_untar.sh
new file mode 100755
index 0000000..fe322e4
--- /dev/null
+++ b/egs/librispeech/e_branchformer/local/download_and_untar.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+
+# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+ remove_archive=true
+ shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/12 dev-clean"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: dev-clean, test-clean, dev-other, test-other,"
+  echo "          train-clean-100, train-clean-360, train-other-500."
+  exit 1
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+ echo "$0: no such directory $data"
+ exit 1
+fi
+
+part_ok=false
+list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500"
+for x in $list; do
+ if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+ echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+ exit 1
+fi
+
+if [ -z "$url" ]; then
+ echo "$0: empty URL base."
+ exit 1
+fi
+
+if [ -f $data/LibriSpeech/$part/.complete ]; then
+ echo "$0: data part $part was already successfully extracted, nothing to do."
+ exit 0
+fi
+
+
+# sizes_old holds the sizes of the archive files in bytes for some older versions.
+sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128"
+# sizes_new holds the archive file sizes of the final release. Some of these sizes
+# belong to archives we probably won't download.
+sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606"
+
+if [ -f $data/$part.tar.gz ]; then
+ size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}')
+ size_ok=false
+ for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done
+ if ! $size_ok; then
+ echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
+ echo "does not equal the size of one of the archives."
+ rm $data/$part.tar.gz
+ else
+ echo "$data/$part.tar.gz exists and appears to be complete."
+ fi
+fi
+
+if [ ! -f $data/$part.tar.gz ]; then
+ if ! which wget >/dev/null; then
+ echo "$0: wget is not installed."
+ exit 1
+ fi
+ full_url=$url/$part.tar.gz
+ echo "$0: downloading data from $full_url. This may take some time, please be patient."
+
+ if ! wget -P $data --no-check-certificate $full_url; then
+ echo "$0: error executing wget $full_url"
+ exit 1
+ fi
+fi
+
+if ! tar -C $data -xvzf $data/$part.tar.gz; then
+ echo "$0: error un-tarring archive $data/$part.tar.gz"
+ exit 1
+fi
+
+touch $data/LibriSpeech/$part/.complete
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"
+
+if $remove_archive; then
+ echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
+ rm $data/$part.tar.gz
+fi
diff --git a/egs/librispeech/e_branchformer/local/spm_encode.py b/egs/librispeech/e_branchformer/local/spm_encode.py
new file mode 100755
index 0000000..9e1c15f
--- /dev/null
+++ b/egs/librispeech/e_branchformer/local/spm_encode.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in
+# https://github.com/pytorch/fairseq/blob/master/LICENSE
+
+
+import argparse
+import contextlib
+import sys
+
+import sentencepiece as spm
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", required=True,
+ help="sentencepiece model to use for encoding")
+ parser.add_argument("--inputs", nargs="+", default=['-'],
+ help="input files to filter/encode")
+ parser.add_argument("--outputs", nargs="+", default=['-'],
+ help="path to save encoded outputs")
+ parser.add_argument("--output_format", choices=["piece", "id"], default="piece")
+ parser.add_argument("--min-len", type=int, metavar="N",
+ help="filter sentence pairs with fewer than N tokens")
+ parser.add_argument("--max-len", type=int, metavar="N",
+ help="filter sentence pairs with more than N tokens")
+ args = parser.parse_args()
+
+ assert len(args.inputs) == len(args.outputs), \
+ "number of input and output paths should match"
+
+ sp = spm.SentencePieceProcessor()
+ sp.Load(args.model)
+
+ if args.output_format == "piece":
+ def encode(l):
+ return sp.EncodeAsPieces(l)
+ elif args.output_format == "id":
+ def encode(l):
+ return list(map(str, sp.EncodeAsIds(l)))
+ else:
+ raise NotImplementedError
+
+ if args.min_len is not None or args.max_len is not None:
+ def valid(line):
+ return (
+ (args.min_len is None or len(line) >= args.min_len) and
+ (args.max_len is None or len(line) <= args.max_len)
+ )
+ else:
+ def valid(lines):
+ return True
+
+ with contextlib.ExitStack() as stack:
+ inputs = [
+ stack.enter_context(open(input, "r", encoding="utf-8"))
+ if input != "-" else sys.stdin
+ for input in args.inputs
+ ]
+ outputs = [
+ stack.enter_context(open(output, "w", encoding="utf-8"))
+ if output != "-" else sys.stdout
+ for output in args.outputs
+ ]
+
+ stats = {
+ "num_empty": 0,
+ "num_filtered": 0,
+ }
+
+ def encode_line(line):
+ line = line.strip()
+ if len(line) > 0:
+ line = encode(line)
+ if valid(line):
+ return line
+ else:
+ stats["num_filtered"] += 1
+ else:
+ stats["num_empty"] += 1
+ return None
+
+ for i, lines in enumerate(zip(*inputs), start=1):
+ enc_lines = list(map(encode_line, lines))
+ if not any(enc_line is None for enc_line in enc_lines):
+ for enc_line, output_h in zip(enc_lines, outputs):
+ print(" ".join(enc_line), file=output_h)
+ if i % 10000 == 0:
+ print("processed {} lines".format(i), file=sys.stderr)
+
+ print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr)
+ print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/egs/librispeech/e_branchformer/local/spm_train.py b/egs/librispeech/e_branchformer/local/spm_train.py
new file mode 100755
index 0000000..134a0b1
--- /dev/null
+++ b/egs/librispeech/e_branchformer/local/spm_train.py
@@ -0,0 +1,12 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# https://github.com/pytorch/fairseq/blob/master/LICENSE
+import sys
+
+import sentencepiece as spm
+
+if __name__ == "__main__":
+ spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:]))
diff --git a/egs/librispeech/e_branchformer/path.sh b/egs/librispeech/e_branchformer/path.sh
new file mode 100755
index 0000000..7972642
--- /dev/null
+++ b/egs/librispeech/e_branchformer/path.sh
@@ -0,0 +1,5 @@
+export FUNASR_DIR=$PWD/../../..
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PATH=$FUNASR_DIR/funasr/bin:$PATH
diff --git a/egs/librispeech/e_branchformer/run.sh b/egs/librispeech/e_branchformer/run.sh
new file mode 100755
index 0000000..f1ffa0d
--- /dev/null
+++ b/egs/librispeech/e_branchformer/run.sh
@@ -0,0 +1,223 @@
+#!/usr/bin/env bash
+
+. ./path.sh || exit 1;
+
+# machines configuration
+CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+gpu_num=8
+count=1
+gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding
+# for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
+njob=5
+train_cmd=utils/run.pl
+infer_cmd=utils/run.pl
+
+# general configuration
+feats_dir="../DATA" # feature output directory
+exp_dir="."
+lang=en
+token_type=bpe
+type=sound
+scp=wav.scp
+speed_perturb="0.9 1.0 1.1"
+stage=0
+stop_stage=5
+
+# feature configuration
+feats_dim=80
+nj=64
+
+# data
+raw_data=
+data_url=www.openslr.org/resources/12
+
+# bpe model
+nbpe=5000
+bpemode=unigram
+
+# exp tag
+tag="exp1"
+
+. utils/parse_options.sh || exit 1;
+
+# Set bash to strict mode; the script will exit on:
+# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline' (-x 'print commands' is not enabled)
+set -e
+set -u
+set -o pipefail
+
+train_set=train_960
+valid_set=dev
+test_sets="test_clean test_other dev_clean dev_other"
+
+asr_config=conf/train_asr_e_branchformer.yaml
+model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}"
+
+inference_config=conf/decode_asr_transformer_beam10_ctc0.3.yaml
+inference_asr_model=valid.acc.ave_10best.pb
+
+# you can set gpu num for decoding here
+gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
+ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
+
+if ${gpu_inference}; then
+ inference_nj=$[${ngpu}*${njob}]
+ _ngpu=1
+else
+ inference_nj=$njob
+ _ngpu=0
+fi
+
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+ echo "stage -1: Data Download"
+ for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
+ local/download_and_untar.sh ${raw_data} ${data_url} ${part}
+ done
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    echo "stage 0: Data preparation"
+    # Prepare each LibriSpeech part, then merge wav.scp/text; 'mkdir -p' keeps this stage re-runnable under 'set -e'
+    for x in dev-clean dev-other test-clean test-other train-clean-100 train-clean-360 train-other-500; do
+        local/data_prep.sh ${raw_data}/LibriSpeech/${x} ${feats_dir}/data/${x//-/_}
+    done
+    mkdir -p $feats_dir/data/$valid_set
+    dev_sets="dev_clean dev_other"
+    for file in wav.scp text; do
+        ( for f in $dev_sets; do cat $feats_dir/data/$f/$file; done ) | sort -k1 > $feats_dir/data/$valid_set/$file || exit 1;
+    done
+    mkdir -p $feats_dir/data/$train_set
+    train_sets="train_clean_100 train_clean_360 train_other_500"
+    for file in wav.scp text; do
+        ( for f in $train_sets; do cat $feats_dir/data/$f/$file; done ) | sort -k1 > $feats_dir/data/$train_set/$file || exit 1;
+    done
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ echo "stage 1: Feature and CMVN Generation"
+ utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0
+fi
+
+token_list=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
+bpemodel=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}
+echo "dictionary: ${token_list}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ ### Task dependent. You have to check non-linguistic symbols used in the corpus.
+ echo "stage 2: Dictionary and Json Data Preparation"
+ mkdir -p ${feats_dir}/data/lang_char/
+ echo "<blank>" > ${token_list}
+ echo "<s>" >> ${token_list}
+ echo "</s>" >> ${token_list}
+ cut -f 2- -d" " ${feats_dir}/data/${train_set}/text > ${feats_dir}/data/lang_char/input.txt
+ local/spm_train.py --input=${feats_dir}/data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
+ local/spm_encode.py --model=${bpemodel}.model --output_format=piece < ${feats_dir}/data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0}' >> ${token_list}
+ echo "<unk>" >> ${token_list}
+fi
+
+# LM Training Stage
+world_size=$gpu_num # run on one machine
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ echo "stage 3: LM Training"
+fi
+
+# ASR Training Stage
+world_size=$gpu_num # run on one machine
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ echo "stage 4: ASR Training"
+ mkdir -p ${exp_dir}/exp/${model_dir}
+ mkdir -p ${exp_dir}/exp/${model_dir}/log
+ INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init
+ if [ -f $INIT_FILE ];then
+ rm -f $INIT_FILE
+ fi
+ init_method=file://$(readlink -f $INIT_FILE)
+ echo "$0: init method is $init_method"
+ for ((i = 0; i < $gpu_num; ++i)); do
+ {
+ rank=$i
+ local_rank=$i
+ gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
+ train.py \
+ --task_name asr \
+ --gpu_id $gpu_id \
+ --use_preprocessor true \
+ --split_with_space false \
+ --bpemodel ${bpemodel}.model \
+ --token_type $token_type \
+ --token_list $token_list \
+ --dataset_type large \
+ --data_dir ${feats_dir}/data \
+ --train_set ${train_set} \
+ --valid_set ${valid_set} \
+ --cmvn_file ${feats_dir}/data/${train_set}/cmvn/am.mvn \
+ --speed_perturb ${speed_perturb} \
+ --resume true \
+ --output_dir ${exp_dir}/exp/${model_dir} \
+ --config $asr_config \
+ --ngpu $gpu_num \
+ --num_worker_count $count \
+ --multiprocessing_distributed true \
+ --dist_init_method $init_method \
+ --dist_world_size $world_size \
+ --dist_rank $rank \
+ --local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
+ } &
+ done
+ wait
+fi
+
+# Testing Stage
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+ echo "stage 5: Inference"
+ for dset in ${test_sets}; do
+ asr_exp=${exp_dir}/exp/${model_dir}
+ inference_tag="$(basename "${inference_config}" .yaml)"
+ _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
+ _logdir="${_dir}/logdir"
+ if [ -d ${_dir} ]; then
+ echo "${_dir} is already exists. if you want to decode again, please delete this dir first."
+ exit 0
+ fi
+ mkdir -p "${_logdir}"
+ _data="${feats_dir}/data/${dset}"
+ key_file=${_data}/${scp}
+ num_scp_file="$(<${key_file} wc -l)"
+ _nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file")
+ split_scps=
+ for n in $(seq "${_nj}"); do
+ split_scps+=" ${_logdir}/keys.${n}.scp"
+ done
+ # shellcheck disable=SC2086
+ utils/split_scp.pl "${key_file}" ${split_scps}
+ _opts=
+ if [ -n "${inference_config}" ]; then
+ _opts+="--config ${inference_config} "
+ fi
+ ${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
+ python -m funasr.bin.asr_inference_launch \
+ --batch_size 1 \
+ --ngpu "${_ngpu}" \
+ --njob ${njob} \
+ --gpuid_list ${gpuid_list} \
+ --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
+ --cmvn_file ${feats_dir}/data/${train_set}/cmvn/am.mvn \
+ --key_file "${_logdir}"/keys.JOB.scp \
+ --asr_train_config "${asr_exp}"/config.yaml \
+ --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
+ --output_dir "${_logdir}"/output.JOB \
+ --mode asr \
+ ${_opts}
+
+ for f in token token_int score text; do
+ if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
+ for i in $(seq "${_nj}"); do
+ cat "${_logdir}/output.${i}/1best_recog/${f}"
+ done | sort -k1 >"${_dir}/${f}"
+ fi
+ done
+ python utils/compute_wer.py ${_data}/text ${_dir}/text ${_dir}/text.cer
+ tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
+ cat ${_dir}/text.cer.txt
+ done
+fi
\ No newline at end of file
diff --git a/egs/librispeech/e_branchformer/utils b/egs/librispeech/e_branchformer/utils
new file mode 120000
index 0000000..fe070dd
--- /dev/null
+++ b/egs/librispeech/e_branchformer/utils
@@ -0,0 +1 @@
+../../aishell/transformer/utils
\ No newline at end of file
diff --git a/funasr/build_utils/build_args.py b/funasr/build_utils/build_args.py
index 632c134..08018a7 100644
--- a/funasr/build_utils/build_args.py
+++ b/funasr/build_utils/build_args.py
@@ -54,6 +54,12 @@
default=None,
help="The number of input dimension of the feature",
)
+ task_parser.add_argument(
+ "--cmvn_file",
+ type=str_or_none,
+ default=None,
+ help="The path of cmvn file.",
+ )
elif args.task_name == "lm":
from funasr.build_utils.build_lm_model import class_choices_list
@@ -86,6 +92,12 @@
from funasr.build_utils.build_diar_model import class_choices_list
for class_choices in class_choices_list:
class_choices.add_arguments(task_parser)
+ task_parser.add_argument(
+ "--input_size",
+ type=int_or_none,
+ default=None,
+ help="The number of input dimension of the feature",
+ )
elif args.task_name == "sv":
from funasr.build_utils.build_sv_model import class_choices_list
diff --git a/funasr/build_utils/build_dataloader.py b/funasr/build_utils/build_dataloader.py
index c95c40d..473097e 100644
--- a/funasr/build_utils/build_dataloader.py
+++ b/funasr/build_utils/build_dataloader.py
@@ -4,8 +4,21 @@
def build_dataloader(args):
if args.dataset_type == "small":
- train_iter_factory = SequenceIterFactory(args, mode="train")
- valid_iter_factory = SequenceIterFactory(args, mode="valid")
+ if args.task_name == "diar" and args.model == "eend_ola":
+ from funasr.modules.eend_ola.eend_ola_dataloader import EENDOLADataLoader
+ train_iter_factory = EENDOLADataLoader(
+ data_file=args.train_data_path_and_name_and_type[0][0],
+ batch_size=args.dataset_conf["batch_conf"]["batch_size"],
+ num_workers=args.dataset_conf["num_workers"],
+ shuffle=True)
+ valid_iter_factory = EENDOLADataLoader(
+ data_file=args.valid_data_path_and_name_and_type[0][0],
+ batch_size=args.dataset_conf["batch_conf"]["batch_size"],
+ num_workers=0,
+ shuffle=False)
+ else:
+ train_iter_factory = SequenceIterFactory(args, mode="train")
+ valid_iter_factory = SequenceIterFactory(args, mode="valid")
elif args.dataset_type == "large":
train_iter_factory = LargeDataLoader(args, mode="train")
valid_iter_factory = LargeDataLoader(args, mode="valid")
diff --git a/funasr/build_utils/build_diar_model.py b/funasr/build_utils/build_diar_model.py
index 1aa0701..cf23dad 100644
--- a/funasr/build_utils/build_diar_model.py
+++ b/funasr/build_utils/build_diar_model.py
@@ -192,18 +192,22 @@
def build_diar_model(args):
# token_list
- if isinstance(args.token_list, str):
- with open(args.token_list, encoding="utf-8") as f:
- token_list = [line.rstrip() for line in f]
+ if args.token_list is not None:
+ if isinstance(args.token_list, str):
+ with open(args.token_list, encoding="utf-8") as f:
+ token_list = [line.rstrip() for line in f]
- # Overwriting token_list to keep it as "portable".
- args.token_list = list(token_list)
- elif isinstance(args.token_list, (tuple, list)):
- token_list = list(args.token_list)
+ # Overwriting token_list to keep it as "portable".
+ args.token_list = list(token_list)
+ elif isinstance(args.token_list, (tuple, list)):
+ token_list = list(args.token_list)
+ else:
+ raise RuntimeError("token_list must be str or list")
+ vocab_size = len(token_list)
+ logging.info(f"Vocabulary size: {vocab_size}")
else:
- raise RuntimeError("token_list must be str or list")
- vocab_size = len(token_list)
- logging.info(f"Vocabulary size: {vocab_size}")
+ token_list = None
+ vocab_size = None
# frontend
if args.input_size is None:
@@ -212,16 +216,14 @@
frontend = frontend_class(cmvn_file=args.cmvn_file, **args.frontend_conf)
else:
frontend = frontend_class(**args.frontend_conf)
- input_size = frontend.output_size()
else:
args.frontend = None
args.frontend_conf = {}
frontend = None
- input_size = args.input_size
# encoder
encoder_class = encoder_choices.get_class(args.encoder)
- encoder = encoder_class(input_size=input_size, **args.encoder_conf)
+ encoder = encoder_class(**args.encoder_conf)
if args.model == "sond":
# data augmentation for spectrogram
@@ -294,7 +296,7 @@
**args.model_conf,
)
- elif args.model_name == "eend_ola":
+ elif args.model == "eend_ola":
# encoder-decoder attractor
encoder_decoder_attractor_class = encoder_decoder_attractor_choices.get_class(args.encoder_decoder_attractor)
encoder_decoder_attractor = encoder_decoder_attractor_class(**args.encoder_decoder_attractor_conf)
diff --git a/funasr/build_utils/build_pretrain_model.py b/funasr/build_utils/build_pretrain_model.py
index 629937f..0784fb2 100644
--- a/funasr/build_utils/build_pretrain_model.py
+++ b/funasr/build_utils/build_pretrain_model.py
@@ -4,13 +4,18 @@
from funasr.models.encoder.data2vec_encoder import Data2VecEncoder
from funasr.models.frontend.default import DefaultFrontend
from funasr.models.frontend.windowing import SlidingWindow
+from funasr.models.frontend.wav_frontend import WavFrontend
from funasr.models.specaug.specaug import SpecAug
from funasr.torch_utils.initialize import initialize
from funasr.train.class_choices import ClassChoices
frontend_choices = ClassChoices(
name="frontend",
- classes=dict(default=DefaultFrontend, sliding_window=SlidingWindow),
+ classes=dict(
+ default=DefaultFrontend,
+ sliding_window=SlidingWindow,
+ wav_frontend=WavFrontend,
+ ),
default="default",
)
specaug_choices = ClassChoices(
diff --git a/funasr/datasets/small_datasets/sequence_iter_factory.py b/funasr/datasets/small_datasets/sequence_iter_factory.py
index 3ebcc5a..e748c3d 100644
--- a/funasr/datasets/small_datasets/sequence_iter_factory.py
+++ b/funasr/datasets/small_datasets/sequence_iter_factory.py
@@ -57,7 +57,7 @@
data_path_and_name_and_type,
preprocess=preprocess_fn,
dest_sample_rate=dest_sample_rate,
- speed_perturb=args.speed_perturb if mode=="train" else None,
+ speed_perturb=args.speed_perturb if mode == "train" else None,
)
# sampler
@@ -84,7 +84,7 @@
args.max_update = len(bs_list) * args.max_epoch
logging.info("Max update: {}".format(args.max_update))
- if args.distributed and mode=="train":
+ if args.distributed and mode == "train":
world_size = torch.distributed.get_world_size()
rank = torch.distributed.get_rank()
for batch in batches:
diff --git a/funasr/models/data2vec.py b/funasr/models/data2vec.py
index 92c95cc..425794d 100644
--- a/funasr/models/data2vec.py
+++ b/funasr/models/data2vec.py
@@ -12,12 +12,12 @@
import torch
from funasr.layers.abs_normalize import AbsNormalize
+from funasr.models.base_model import FunASRModel
from funasr.models.encoder.abs_encoder import AbsEncoder
from funasr.models.frontend.abs_frontend import AbsFrontend
from funasr.models.preencoder.abs_preencoder import AbsPreEncoder
from funasr.models.specaug.abs_specaug import AbsSpecAug
from funasr.torch_utils.device_funcs import force_gatherable
-from funasr.models.base_model import FunASRModel
if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
from torch.cuda.amp import autocast
@@ -36,8 +36,8 @@
frontend: Optional[AbsFrontend],
specaug: Optional[AbsSpecAug],
normalize: Optional[AbsNormalize],
- preencoder: Optional[AbsPreEncoder],
encoder: AbsEncoder,
+ preencoder: Optional[AbsPreEncoder] = None,
):
super().__init__()
diff --git a/funasr/models/e2e_diar_eend_ola.py b/funasr/models/e2e_diar_eend_ola.py
index ae3a436..a0b545a 100644
--- a/funasr/models/e2e_diar_eend_ola.py
+++ b/funasr/models/e2e_diar_eend_ola.py
@@ -1,21 +1,20 @@
-# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-
from contextlib import contextmanager
from distutils.version import LooseVersion
-from typing import Dict
-from typing import Tuple
+from typing import Dict, List, Tuple, Optional
import numpy as np
import torch
import torch.nn as nn
+import torch.nn.functional as F
+from funasr.models.base_model import FunASRModel
from funasr.models.frontend.wav_frontend import WavFrontendMel23
from funasr.modules.eend_ola.encoder import EENDOLATransformerEncoder
from funasr.modules.eend_ola.encoder_decoder_attractor import EncoderDecoderAttractor
+from funasr.modules.eend_ola.utils.losses import standard_loss, cal_power_loss, fast_batch_pit_n_speaker_loss
+from funasr.modules.eend_ola.utils.power import create_powerlabel
from funasr.modules.eend_ola.utils.power import generate_mapping_dict
from funasr.torch_utils.device_funcs import force_gatherable
-from funasr.models.base_model import FunASRModel
if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
pass
@@ -33,12 +32,35 @@
return att
def pad_labels(ts, out_size):
    """Zero-pad label tensors on the right so dim 1 reaches ``out_size``.

    The list is updated in place: an entry whose ``shape[1]`` is already at
    least ``out_size`` is left untouched, any other entry is replaced by a
    padded copy.

    Args:
        ts: list of tensors (assumed 2-D, so that dim 1 is the last dim that
            ``F.pad`` extends -- TODO confirm against callers).
        out_size: minimum size of dim 1 after padding.

    Returns:
        The same list object that was passed in.
    """
    for idx in range(len(ts)):
        missing = out_size - ts[idx].shape[1]
        if missing > 0:
            # Pad spec is (last-dim left, last-dim right, prev-dim left, prev-dim right).
            ts[idx] = F.pad(
                ts[idx],
                (0, missing, 0, 0),
                mode='constant',
                value=0.
            )
    return ts
+
+
def pad_results(ys, out_size):
    """Return the tensors of ``ys`` zero-padded on the right along dim 1.

    Unlike an in-place update, the input list is not modified: a new list is
    built in which every tensor with ``shape[1] < out_size`` is replaced by
    the concatenation of itself and a float32 block of zeros; tensors that
    are already wide enough are passed through as the same objects.

    Args:
        ys: list of 2-D tensors.
        out_size: minimum size of dim 1 after padding.

    Returns:
        A new list with one (possibly padded) tensor per input tensor.
    """
    ys_padded = []
    for y in ys:
        missing = out_size - y.shape[1]
        if missing > 0:
            # Allocate the filler directly on y's device instead of creating
            # it on the CPU and copying it over for every item.
            filler = torch.zeros(y.shape[0], missing, dtype=torch.float32, device=y.device)
            ys_padded.append(torch.cat([y, filler], dim=1))
        else:
            ys_padded.append(y)
    return ys_padded
+
+
class DiarEENDOLAModel(FunASRModel):
"""EEND-OLA diarization model"""
def __init__(
self,
- frontend: WavFrontendMel23,
+ frontend: Optional[WavFrontendMel23],
encoder: EENDOLATransformerEncoder,
encoder_decoder_attractor: EncoderDecoderAttractor,
n_units: int = 256,
@@ -47,11 +69,10 @@
mapping_dict=None,
**kwargs,
):
-
super().__init__()
self.frontend = frontend
self.enc = encoder
- self.eda = encoder_decoder_attractor
+ self.encoder_decoder_attractor = encoder_decoder_attractor
self.attractor_loss_weight = attractor_loss_weight
self.max_n_speaker = max_n_speaker
if mapping_dict is None:
@@ -74,7 +95,8 @@
def forward_post_net(self, logits, ilens):
maxlen = torch.max(ilens).to(torch.int).item()
logits = nn.utils.rnn.pad_sequence(logits, batch_first=True, padding_value=-1)
- logits = nn.utils.rnn.pack_padded_sequence(logits, ilens.cpu().to(torch.int64), batch_first=True, enforce_sorted=False)
+ logits = nn.utils.rnn.pack_padded_sequence(logits, ilens.cpu().to(torch.int64), batch_first=True,
+ enforce_sorted=False)
outputs, (_, _) = self.postnet(logits)
outputs = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True, padding_value=-1, total_length=maxlen)[0]
outputs = [output[:ilens[i].to(torch.int).item()] for i, output in enumerate(outputs)]
@@ -83,95 +105,45 @@
def forward(
self,
- speech: torch.Tensor,
- speech_lengths: torch.Tensor,
- text: torch.Tensor,
- text_lengths: torch.Tensor,
+ speech: List[torch.Tensor],
+ speaker_labels: List[torch.Tensor],
+ orders: torch.Tensor,
) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
- """Frontend + Encoder + Decoder + Calc loss
- Args:
- speech: (Batch, Length, ...)
- speech_lengths: (Batch, )
- text: (Batch, Length)
- text_lengths: (Batch,)
- """
- assert text_lengths.dim() == 1, text_lengths.shape
+
# Check that batch_size is unified
- assert (
- speech.shape[0]
- == speech_lengths.shape[0]
- == text.shape[0]
- == text_lengths.shape[0]
- ), (speech.shape, speech_lengths.shape, text.shape, text_lengths.shape)
- batch_size = speech.shape[0]
+ assert (len(speech) == len(speaker_labels)), (len(speech), len(speaker_labels))
+ speech_lengths = torch.tensor([len(sph) for sph in speech]).to(torch.int64)
+ speaker_labels_lengths = torch.tensor([spk.shape[-1] for spk in speaker_labels]).to(torch.int64)
+ batch_size = len(speech)
- # for data-parallel
- text = text[:, : text_lengths.max()]
+ # Encoder
+ encoder_out = self.forward_encoder(speech, speech_lengths)
- # 1. Encoder
- encoder_out, encoder_out_lens = self.enc(speech, speech_lengths)
- intermediate_outs = None
- if isinstance(encoder_out, tuple):
- intermediate_outs = encoder_out[1]
- encoder_out = encoder_out[0]
+ # Encoder-decoder attractor
+ attractor_loss, attractors = self.encoder_decoder_attractor([e[order] for e, order in zip(encoder_out, orders)],
+ speaker_labels_lengths)
+ speaker_logits = [torch.matmul(e, att.permute(1, 0)) for e, att in zip(encoder_out, attractors)]
- loss_att, acc_att, cer_att, wer_att = None, None, None, None
- loss_ctc, cer_ctc = None, None
+ # pit loss
+ pit_speaker_labels = fast_batch_pit_n_speaker_loss(speaker_logits, speaker_labels)
+ pit_loss = standard_loss(speaker_logits, pit_speaker_labels)
+
+ # pse loss
+ with torch.no_grad():
+ power_ts = [create_powerlabel(label.cpu().numpy(), self.mapping_dict, self.max_n_speaker).
+ to(encoder_out[0].device, non_blocking=True) for label in pit_speaker_labels]
+ pad_attractors = [pad_attractor(att, self.max_n_speaker) for att in attractors]
+ pse_speaker_logits = [torch.matmul(e, pad_att.permute(1, 0)) for e, pad_att in zip(encoder_out, pad_attractors)]
+ pse_speaker_logits = self.forward_post_net(pse_speaker_logits, speech_lengths)
+ pse_loss = cal_power_loss(pse_speaker_logits, power_ts)
+
+ loss = pse_loss + pit_loss + self.attractor_loss_weight * attractor_loss
+
stats = dict()
-
- # 1. CTC branch
- if self.ctc_weight != 0.0:
- loss_ctc, cer_ctc = self._calc_ctc_loss(
- encoder_out, encoder_out_lens, text, text_lengths
- )
-
- # Collect CTC branch stats
- stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None
- stats["cer_ctc"] = cer_ctc
-
- # Intermediate CTC (optional)
- loss_interctc = 0.0
- if self.interctc_weight != 0.0 and intermediate_outs is not None:
- for layer_idx, intermediate_out in intermediate_outs:
- # we assume intermediate_out has the same length & padding
- # as those of encoder_out
- loss_ic, cer_ic = self._calc_ctc_loss(
- intermediate_out, encoder_out_lens, text, text_lengths
- )
- loss_interctc = loss_interctc + loss_ic
-
- # Collect Intermedaite CTC stats
- stats["loss_interctc_layer{}".format(layer_idx)] = (
- loss_ic.detach() if loss_ic is not None else None
- )
- stats["cer_interctc_layer{}".format(layer_idx)] = cer_ic
-
- loss_interctc = loss_interctc / len(intermediate_outs)
-
- # calculate whole encoder loss
- loss_ctc = (
- 1 - self.interctc_weight
- ) * loss_ctc + self.interctc_weight * loss_interctc
-
- # 2b. Attention decoder branch
- if self.ctc_weight != 1.0:
- loss_att, acc_att, cer_att, wer_att = self._calc_att_loss(
- encoder_out, encoder_out_lens, text, text_lengths
- )
-
- # 3. CTC-Att loss definition
- if self.ctc_weight == 0.0:
- loss = loss_att
- elif self.ctc_weight == 1.0:
- loss = loss_ctc
- else:
- loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att
-
- # Collect Attn branch stats
- stats["loss_att"] = loss_att.detach() if loss_att is not None else None
- stats["acc"] = acc_att
- stats["cer"] = cer_att
- stats["wer"] = wer_att
+ stats["pse_loss"] = pse_loss.detach()
+ stats["pit_loss"] = pit_loss.detach()
+ stats["attractor_loss"] = attractor_loss.detach()
+ stats["batch_size"] = batch_size
# Collect total loss stats
stats["loss"] = torch.clone(loss.detach())
@@ -182,21 +154,20 @@
def estimate_sequential(self,
speech: torch.Tensor,
- speech_lengths: torch.Tensor,
n_speakers: int = None,
shuffle: bool = True,
threshold: float = 0.5,
**kwargs):
- speech = [s[:s_len] for s, s_len in zip(speech, speech_lengths)]
+ speech_lengths = torch.tensor([len(sph) for sph in speech]).to(torch.int64)
emb = self.forward_encoder(speech, speech_lengths)
if shuffle:
orders = [np.arange(e.shape[0]) for e in emb]
for order in orders:
np.random.shuffle(order)
- attractors, probs = self.eda.estimate(
+ attractors, probs = self.encoder_decoder_attractor.estimate(
[e[torch.from_numpy(order).to(torch.long).to(speech[0].device)] for e, order in zip(emb, orders)])
else:
- attractors, probs = self.eda.estimate(emb)
+ attractors, probs = self.encoder_decoder_attractor.estimate(emb)
attractors_active = []
for p, att, e in zip(probs, attractors, emb):
if n_speakers and n_speakers >= 0:
diff --git a/funasr/modules/eend_ola/eend_ola_dataloader.py b/funasr/modules/eend_ola/eend_ola_dataloader.py
new file mode 100644
index 0000000..2ee9272
--- /dev/null
+++ b/funasr/modules/eend_ola/eend_ola_dataloader.py
@@ -0,0 +1,57 @@
+import logging
+
+import kaldiio
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+from torch.utils.data import Dataset
+
+
def custom_collate(batch):
    """Collate ``(key, speech, speaker_label, order)`` samples into a batch.

    No padding or stacking is done: each field stays a per-sample list, with
    every numpy array copied and converted to a tensor.

    Args:
        batch: sequence of 4-tuples ``(key, speech, speaker_label, order)``
            whose last three items are numpy arrays.

    Returns:
        ``(keys, batch_dict)`` where ``keys`` is a tuple of the sample keys
        and ``batch_dict`` maps ``"speech"`` / ``"speaker_labels"`` to lists
        of float32 tensors and ``"orders"`` to a list of int64 tensors.
    """
    keys, speech_arrays, label_arrays, order_arrays = zip(*batch)

    def _as_tensors(arrays, dtype):
        # np.copy gives torch its own buffer, independent of the source array.
        return [torch.from_numpy(np.copy(arr)).to(dtype) for arr in arrays]

    collated = dict(speech=_as_tensors(speech_arrays, torch.float32),
                    speaker_labels=_as_tensors(label_arrays, torch.float32),
                    orders=_as_tensors(order_arrays, torch.int64))
    return keys, collated
+
+
class EENDOLADataset(Dataset):
    """Dataset backed by a text file listing one sample per line.

    Every non-blank line of ``data_file`` holds three whitespace-separated
    fields: ``key speech_path speaker_label_path``. Both paths are read with
    ``kaldiio.load_mat`` when the sample is fetched.
    """

    def __init__(
            self,
            data_file,
    ):
        """Read the sample list.

        Args:
            data_file: path of the list file (read as UTF-8).
        """
        self.data_file = data_file
        with open(data_file, encoding="utf-8") as f:
            # Blank lines are skipped: they would otherwise become empty
            # samples that only fail later, inside __getitem__.
            self.samples = [line.split() for line in f if line.strip()]
        logging.info("total samples: {}".format(len(self.samples)))

    def __len__(self):
        """Return the number of listed samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            ``(key, speech, speaker_label, order)`` where ``speech`` is the
            matrix loaded from ``speech_path``, ``speaker_label`` is the
            label matrix reshaped to ``(speech.shape[0], -1)`` and ``order``
            is a random permutation of ``range(speech.shape[0])``.
        """
        key, speech_path, speaker_label_path = self.samples[idx]
        speech = kaldiio.load_mat(speech_path)
        speaker_label = kaldiio.load_mat(speaker_label_path).reshape(speech.shape[0], -1)

        order = np.arange(speech.shape[0])
        np.random.shuffle(order)

        return key, speech, speaker_label, order
+
+
class EENDOLADataLoader():
    """Thin wrapper that builds a torch ``DataLoader`` over ``EENDOLADataset``."""

    def __init__(self, data_file, batch_size, shuffle=True, num_workers=8):
        """Create the dataset and its loader.

        Args:
            data_file: sample list file handed to ``EENDOLADataset``.
            batch_size: number of samples per batch.
            shuffle: forwarded to ``DataLoader``.
            num_workers: forwarded to ``DataLoader``.
        """
        loader_options = dict(batch_size=batch_size,
                              collate_fn=custom_collate,
                              shuffle=shuffle,
                              num_workers=num_workers)
        self.data_loader = DataLoader(EENDOLADataset(data_file), **loader_options)

    def build_iter(self, epoch):
        """Return the loader; ``epoch`` is accepted but not used."""
        return self.data_loader
\ No newline at end of file
diff --git a/funasr/modules/eend_ola/encoder.py b/funasr/modules/eend_ola/encoder.py
index 90a63f3..3065884 100644
--- a/funasr/modules/eend_ola/encoder.py
+++ b/funasr/modules/eend_ola/encoder.py
@@ -91,6 +91,7 @@
dropout_rate: float = 0.1,
use_pos_emb: bool = False):
super(EENDOLATransformerEncoder, self).__init__()
+ self.linear_in = nn.Linear(idim, n_units)
self.lnorm_in = nn.LayerNorm(n_units)
self.n_layers = n_layers
self.dropout = nn.Dropout(dropout_rate)
@@ -104,25 +105,10 @@
setattr(self, '{}{:d}'.format("ff_", i),
PositionwiseFeedForward(n_units, e_units, dropout_rate))
self.lnorm_out = nn.LayerNorm(n_units)
- if use_pos_emb:
- self.pos_enc = torch.nn.Sequential(
- torch.nn.Linear(idim, n_units),
- torch.nn.LayerNorm(n_units),
- torch.nn.Dropout(dropout_rate),
- torch.nn.ReLU(),
- PositionalEncoding(n_units, dropout_rate),
- )
- else:
- self.linear_in = nn.Linear(idim, n_units)
- self.pos_enc = None
def __call__(self, x, x_mask=None):
BT_size = x.shape[0] * x.shape[1]
- if self.pos_enc is not None:
- e = self.pos_enc(x)
- e = e.view(BT_size, -1)
- else:
- e = self.linear_in(x.reshape(BT_size, -1))
+ e = self.linear_in(x.reshape(BT_size, -1))
for i in range(self.n_layers):
e = getattr(self, '{}{:d}'.format("lnorm1_", i))(e)
s = getattr(self, '{}{:d}'.format("self_att_", i))(e, x.shape[0], x_mask)
@@ -130,4 +116,4 @@
e = getattr(self, '{}{:d}'.format("lnorm2_", i))(e)
s = getattr(self, '{}{:d}'.format("ff_", i))(e)
e = e + self.dropout(s)
- return self.lnorm_out(e)
+ return self.lnorm_out(e)
\ No newline at end of file
diff --git a/funasr/modules/eend_ola/utils/feature.py b/funasr/modules/eend_ola/utils/feature.py
new file mode 100644
index 0000000..544a352
--- /dev/null
+++ b/funasr/modules/eend_ola/utils/feature.py
@@ -0,0 +1,286 @@
+# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita)
+# Licensed under the MIT license.
+#
+# This module is for computing audio features
+
+import numpy as np
+import librosa
+
+
def get_input_dim(
        frame_size,
        context_size,
        transform_type,
):
    """Return the feature dimension after transform and context splicing.

    Args:
        frame_size: STFT window length in samples; only used when the
            transform is not a log-mel variant.
        context_size: number of context frames spliced on each side.
        transform_type: name of the feature transform. Names starting with
            ``'logmel23'`` give 23 bins, other names starting with
            ``'logmel'`` give 40 bins; anything else (including ``None`` or
            an empty string) gives ``fft_size // 2 + 1`` bins, where
            ``fft_size`` is ``frame_size`` rounded up to a power of two.

    Returns:
        ``(2 * context_size + 1) * bins`` as an int.
    """
    # None is a valid transform_type for transform(); treat it like any
    # other non-mel transform instead of failing on .startswith.
    kind = transform_type or ''
    if kind.startswith('logmel23'):
        feat_dim = 23
    elif kind.startswith('logmel'):
        feat_dim = 40
    else:
        fft_size = 1 << (frame_size - 1).bit_length()
        feat_dim = fft_size // 2 + 1
    return (2 * context_size + 1) * feat_dim
+
+
def _logmel(Y, sr, n_mels):
    """Map a magnitude spectrogram to log10 mel-filterbank energies."""
    n_fft = 2 * (Y.shape[1] - 1)
    # Keyword arguments: recent librosa releases reject positional use here.
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    Y = np.dot(Y ** 2, mel_basis.T)
    return np.log10(np.maximum(Y, 1e-10))


def transform(
        Y,
        transform_type=None,
        dtype=np.float32):
    """ Transform STFT feature

    Args:
        Y: STFT
            (n_frames, n_bins)-shaped np.complex array
        transform_type:
            None / empty: magnitude only
            "log": natural log of the magnitude
            "logmel": log10 of 40 mel bands (16 kHz filterbank)
            "logmel23": log10 of 23 mel bands (8 kHz filterbank)
            "logmel23_mn": "logmel23" with the per-bin mean removed
            "logmel23_swn": "logmel23" with the mean of the high-power
                frames removed
            "logmel23_mvn": "logmel23" with per-bin mean and variance
                normalization
        dtype: output data type
            np.float32 is expected
    Returns:
        Y (numpy.array): transformed feature
    Raises:
        ValueError: if transform_type is not one of the names above
    """
    Y = np.abs(Y)
    if not transform_type:
        pass
    elif transform_type == 'log':
        Y = np.log(np.maximum(Y, 1e-10))
    elif transform_type == 'logmel':
        Y = _logmel(Y, sr=16000, n_mels=40)
    elif transform_type == 'logmel23':
        Y = _logmel(Y, sr=8000, n_mels=23)
    elif transform_type == 'logmel23_mn':
        Y = _logmel(Y, sr=8000, n_mels=23)
        Y = Y - np.mean(Y, axis=0)
    elif transform_type == 'logmel23_swn':
        Y = _logmel(Y, sr=8000, n_mels=23)
        # Simple 2-means based thresholding: split frames into a low-power
        # and a high-power group, then use only the high-power frames to
        # estimate the mean.
        powers = np.sum(Y, axis=1)
        th = (np.max(powers) + np.min(powers)) / 2.0
        for _ in range(10):
            th = (np.mean(powers[powers >= th]) + np.mean(powers[powers < th])) / 2
        mean = np.mean(Y[powers > th, :], axis=0)
        Y = Y - mean
    elif transform_type == 'logmel23_mvn':
        Y = _logmel(Y, sr=8000, n_mels=23)
        Y = Y - np.mean(Y, axis=0)
        # Floor the deviation so constant bins do not divide by zero.
        std = np.maximum(np.std(Y, axis=0), 1e-10)
        Y = Y / std
    else:
        raise ValueError('Unknown transform_type: %s' % transform_type)
    return Y.astype(dtype)
+
+
def subsample(Y, T, subsampling=1):
    """ Frame subsampling

    Keeps every ``subsampling``-th entry along the first axis of both the
    features ``Y`` and the labels ``T``, starting with the first one.

    Returns:
        (Y_ss, T_ss): the subsampled features and labels
    """
    every_nth = slice(None, None, subsampling)
    return Y[every_nth], T[every_nth]
+
+
def splice(Y, context_size=0):
    """ Frame splicing

    Args:
        Y: feature
            (n_frames, n_featdim)-shaped numpy array
        context_size:
            number of neighbouring frames concatenated on each side;
            if context_size = 5, 11 frames are concatenated.
            Frames beyond either end are zero-filled.

    Returns:
        Y_spliced: spliced feature
            (n_frames, n_featdim * (2 * context_size + 1))-shaped
            read-only strided view over the zero-padded copy of Y
    """
    n_frames, n_featdim = Y.shape
    window = 2 * context_size + 1
    padded = np.ascontiguousarray(
        np.pad(Y, [(context_size, context_size), (0, 0)], 'constant'))
    # Consecutive output rows start one input frame apart, so each row is a
    # window of `window` consecutive padded frames laid out back to back.
    frame_stride = Y.itemsize * n_featdim
    return np.lib.stride_tricks.as_strided(
        padded,
        (n_frames, n_featdim * window),
        (frame_stride, Y.itemsize), writeable=False)
+
+
def stft(
        data,
        frame_size=1024,
        frame_shift=256):
    """ Compute STFT features

    Args:
        data: audio signal
            (n_samples,)-shaped np.float32 array
        frame_size: window length in samples; the FFT size is frame_size
            rounded up to the next power of two
        frame_shift: number of samples between frames

    Returns:
        stft: STFT frames
            (n_frames, n_bins)-shaped np.complex64 array
    """
    # round up to nearest power of 2
    fft_size = 1 << (frame_size - 1).bit_length()
    frames = librosa.stft(data, n_fft=fft_size, win_length=frame_size,
                          hop_length=frame_shift).T
    # HACK: when the signal length is a multiple of frame_shift, librosa.stft
    # yields one excess trailing frame, which is dropped here.
    if len(data) % frame_shift == 0:
        return frames[:-1]
    return frames
+
+
+def _count_frames(data_len, size, shift):
+ # HACK: Assuming librosa.stft(..., center=True)
+ n_frames = 1 + int(data_len / shift)
+ if data_len % shift == 0:
+ n_frames = n_frames - 1
+ return n_frames
+
+
def get_frame_labels(
        kaldi_obj,
        rec,
        start=0,
        end=None,
        frame_size=1024,
        frame_shift=256,
        n_speakers=None):
    """ Get frame-aligned labels of given recording
    Args:
        kaldi_obj (KaldiData)
        rec (str): recording id
        start (int): start frame index
        end (int): end frame index
            None means the last frame of recording
        frame_size (int): number of samples in a frame
        frame_shift (int): number of shift samples
        n_speakers (int): number of speakers
            if None, the value is given from data
    Returns:
        T: label
            (n_frames, n_speakers)-shaped np.int32 array
    """
    segments = kaldi_obj.segments
    if isinstance(segments, dict):
        # KaldiData stores segments as {rec: [segment, ...]}.
        filtered_segments = segments[rec]
    else:
        # Structured array with a 'rec' field, as returned by load_segments.
        filtered_segments = segments[segments['rec'] == rec]
    speakers = np.unique(
        [kaldi_obj.utt2spk[seg['utt']] for seg
         in filtered_segments]).tolist()
    if n_speakers is None:
        n_speakers = len(speakers)
    es = end * frame_shift if end is not None else None
    data, rate = kaldi_obj.load_wav(rec, start * frame_shift, es)
    n_frames = _count_frames(len(data), frame_size, frame_shift)
    T = np.zeros((n_frames, n_speakers), dtype=np.int32)
    if end is None:
        # n_frames counts frames from `start`, while segment boundaries below
        # are absolute frame indices, so the window ends at start + n_frames.
        end = start + n_frames

    for seg in filtered_segments:
        speaker_index = speakers.index(kaldi_obj.utt2spk[seg['utt']])
        start_frame = np.rint(
            seg['st'] * rate / frame_shift).astype(int)
        end_frame = np.rint(
            seg['et'] * rate / frame_shift).astype(int)
        rel_start = rel_end = None
        if start <= start_frame and start_frame < end:
            rel_start = start_frame - start
        if start < end_frame and end_frame <= end:
            rel_end = end_frame - start
        # A None bound means the segment extends past that edge of the
        # window, so the slice is left open on that side.
        if rel_start is not None or rel_end is not None:
            T[rel_start:rel_end, speaker_index] = 1
    return T
+
+
def get_labeledSTFT(
        kaldi_obj,
        rec, start, end, frame_size, frame_shift,
        n_speakers=None,
        use_speaker_id=False):
    """ Extracts STFT and corresponding labels

    Extracts STFT and corresponding diarization labels for
    given recording id and start/end times

    Args:
        kaldi_obj (KaldiData)
        rec (str): recording id
        start (int): start frame index
        end (int): end frame index
        frame_size (int): number of samples in a frame
        frame_shift (int): number of shift samples
        n_speakers (int): number of speakers
            if None, the value is given from data
        use_speaker_id (bool): also return labels indexed by the sorted
            keys of kaldi_obj.spk2utt
    Returns:
        Y: STFT
            (n_frames, n_bins)-shaped np.complex64 array,
        T: label
            (n_frames, n_speakers)-shaped np.int32 array,
        S: only when use_speaker_id is set;
            (n_frames, n_all_speakers)-shaped np.int32 array.
    """
    data, rate = kaldi_obj.load_wav(
        rec, start * frame_shift, end * frame_shift)
    Y = stft(data, frame_size, frame_shift)
    # kaldi_obj.segments is indexed by recording id here.
    filtered_segments = kaldi_obj.segments[rec]
    speakers = np.unique(
        [kaldi_obj.utt2spk[seg['utt']] for seg
         in filtered_segments]).tolist()
    if n_speakers is None:
        n_speakers = len(speakers)
    T = np.zeros((Y.shape[0], n_speakers), dtype=np.int32)

    if use_speaker_id:
        all_speakers = sorted(kaldi_obj.spk2utt.keys())
        S = np.zeros((Y.shape[0], len(all_speakers)), dtype=np.int32)

    for seg in filtered_segments:
        speaker = kaldi_obj.utt2spk[seg['utt']]
        speaker_index = speakers.index(speaker)
        if use_speaker_id:
            all_speaker_index = all_speakers.index(speaker)
        start_frame = np.rint(
            seg['st'] * rate / frame_shift).astype(int)
        end_frame = np.rint(
            seg['et'] * rate / frame_shift).astype(int)
        # A bound stays None when the segment crosses that edge of the
        # window; the slice is then open on that side.
        rel_start = start_frame - start if start <= start_frame < end else None
        rel_end = end_frame - start if start < end_frame <= end else None
        # NOTE(review): a segment that starts before `start` and ends after
        # `end` has both bounds None and is skipped -- confirm this is
        # intended.
        if rel_start is None and rel_end is None:
            continue
        T[rel_start:rel_end, speaker_index] = 1
        if use_speaker_id:
            S[rel_start:rel_end, all_speaker_index] = 1

    if use_speaker_id:
        return Y, T, S
    return Y, T
diff --git a/funasr/modules/eend_ola/utils/kaldi_data.py b/funasr/modules/eend_ola/utils/kaldi_data.py
new file mode 100644
index 0000000..42f6d5e
--- /dev/null
+++ b/funasr/modules/eend_ola/utils/kaldi_data.py
@@ -0,0 +1,162 @@
+# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita)
+# Licensed under the MIT license.
+#
+# This library provides utilities for kaldi-style data directory.
+
+
+from __future__ import print_function
+import os
+import sys
+import numpy as np
+import subprocess
+import soundfile as sf
+import io
+from functools import lru_cache
+
+
def load_segments(segments_file):
    """ load segments file as a structured array

    Returns None when the file does not exist; otherwise a 1-D array with
    fields 'utt', 'rec' (objects) and 'st', 'et' (floats).
    """
    if os.path.exists(segments_file):
        record_dtype = [('utt', 'object'),
                        ('rec', 'object'),
                        ('st', 'f'),
                        ('et', 'f')]
        # ndmin=1 keeps a one-line file from collapsing to a 0-d array.
        return np.loadtxt(segments_file, dtype=record_dtype, ndmin=1)
    return None
+
+
def load_segments_hash(segments_file):
    """ returns dictionary { uttid: (recid, start, end) }

    Each line of the file holds ``utt rec st et``; start and end are
    converted to float. Returns None when the file does not exist.
    """
    if not os.path.exists(segments_file):
        return None
    ret = {}
    # Context manager so the handle is closed even if a line is malformed.
    with open(segments_file) as f:
        for line in f:
            utt, rec, st, et = line.strip().split()
            ret[utt] = (rec, float(st), float(et))
    return ret
+
+
def load_segments_rechash(segments_file):
    """ returns dictionary { recid: list of {'utt', 'st', 'et'} dicts }

    Segments are grouped by recording id in file order; start and end are
    converted to float. Returns None when the file does not exist.
    """
    if not os.path.exists(segments_file):
        return None
    ret = {}
    # Context manager so the handle is closed even if a line is malformed.
    with open(segments_file) as f:
        for line in f:
            utt, rec, st, et = line.strip().split()
            ret.setdefault(rec, []).append(
                {'utt': utt, 'st': float(st), 'et': float(et)})
    return ret
+
+
def load_wav_scp(wav_scp_file):
    """ return dictionary { rec: wav_rxfilename }

    Each line is split once on whitespace, so the rxfilename may itself
    contain spaces (e.g. a piped command).
    """
    # Context manager so the handle is closed once the file is parsed.
    with open(wav_scp_file) as f:
        lines = [line.strip().split(None, 1) for line in f]
    return {x[0]: x[1] for x in lines}
+
+
@lru_cache(maxsize=1)
def load_wav(wav_rxfilename, start=0, end=None):
    """ This function reads an audio file and returns (data, samplerate).
    "lru_cache" holds the most recently loaded result so that the function
    can be called many times with the same arguments cheaply.
    OPTIMIZE: controls lru_cache size for random access,
    considering memory size

    Args:
        wav_rxfilename: a path, "-" for stdin, or a command ending in "|"
            whose standard output is the audio stream
        start: first sample to return
        end: sample index to stop at; None reads to the end
    """
    if wav_rxfilename.endswith('|'):
        # input piped command
        # NOTE: the command is run through the shell, so wav.scp entries
        # must come from a trusted source.
        # The context manager closes the pipe and waits for the child, so
        # no process or descriptor is left behind.
        with subprocess.Popen(wav_rxfilename[:-1], shell=True,
                              stdout=subprocess.PIPE) as p:
            raw = p.stdout.read()
        data, samplerate = sf.read(io.BytesIO(raw),
                                   dtype='float32')
        # cannot seek
        data = data[start:end]
    elif wav_rxfilename == '-':
        # stdin
        data, samplerate = sf.read(sys.stdin, dtype='float32')
        # cannot seek
        data = data[start:end]
    else:
        # normal wav file
        # NOTE(review): no dtype is passed here, unlike the float32 requested
        # in the other two branches -- confirm which one is intended.
        data, samplerate = sf.read(wav_rxfilename, start=start, stop=end)
    return data, samplerate
+
+
def load_utt2spk(utt2spk_file):
    """ returns dictionary { uttid: spkid } """
    # Context manager so the handle is closed once the file is parsed.
    with open(utt2spk_file) as f:
        lines = [line.strip().split(None, 1) for line in f]
    return {x[0]: x[1] for x in lines}
+
+
def load_spk2utt(spk2utt_file):
    """ returns dictionary { spkid: list of uttids }

    Returns None when the file does not exist.
    """
    if not os.path.exists(spk2utt_file):
        return None
    # Context manager so the handle is closed once the file is parsed.
    with open(spk2utt_file) as f:
        lines = [line.strip().split() for line in f]
    return {x[0]: x[1:] for x in lines}
+
+
def load_reco2dur(reco2dur_file):
    """ returns dictionary { recid: duration }

    Durations are converted to float. Returns None when the file does not
    exist.
    """
    if not os.path.exists(reco2dur_file):
        return None
    # Context manager so the handle is closed once the file is parsed.
    with open(reco2dur_file) as f:
        lines = [line.strip().split(None, 1) for line in f]
    return {x[0]: float(x[1]) for x in lines}
+
+
def process_wav(wav_rxfilename, process):
    """ This function returns preprocessed wav_rxfilename
    Args:
        wav_rxfilename: input
        process: command which can be connected via pipe,
                use stdin and stdout
    Returns:
        wav_rxfilename: output piped command
    """
    is_piped_command = wav_rxfilename.endswith('|')
    if not is_piped_command:
        # stdin "-" or normal file: feed it to the command through cat
        return "cat {} | {} |".format(wav_rxfilename, process)
    # already a piped command: chain the extra stage after it
    return wav_rxfilename + process + "|"
+
+
def extract_segments(wavs, segments=None):
    """ This function returns generator of segmented audio as
        (utterance id, numpy.float32 array)
        TODO?: sampling rate is not converted.

    Args:
        wavs: dictionary { rec: wav_rxfilename }
        segments: iterable of records with 'utt', 'rec', 'st', 'et';
            when None every entry of wavs is yielded whole, keyed by rec
    """
    if segments is None:
        # segments file not found,
        # wav.scp is used as segmented audio list
        for rec in wavs:
            data, samplerate = load_wav(wavs[rec])
            yield rec, data
        return

    # segments should be sorted by rec-id
    for seg in segments:
        data, samplerate = load_wav(wavs[seg['rec']])
        # Segment times are in seconds; convert to sample offsets.
        first_sample = np.rint(seg['st'] * samplerate).astype(int)
        last_sample = np.rint(seg['et'] * samplerate).astype(int)
        yield seg['utt'], data[first_sample:last_sample]
+
+
class KaldiData:
    """In-memory view of a Kaldi-style data directory.

    Reads ``segments``, ``utt2spk``, ``wav.scp``, ``reco2dur`` and
    ``spk2utt`` from ``data_dir``. ``segments`` is grouped by recording id;
    ``segments``, ``reco2dur`` and ``spk2utt`` are None when the
    corresponding file is missing.
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir

        def _in_dir(name):
            return os.path.join(self.data_dir, name)

        self.segments = load_segments_rechash(_in_dir('segments'))
        self.utt2spk = load_utt2spk(_in_dir('utt2spk'))
        self.wavs = load_wav_scp(_in_dir('wav.scp'))
        self.reco2dur = load_reco2dur(_in_dir('reco2dur'))
        self.spk2utt = load_spk2utt(_in_dir('spk2utt'))

    def load_wav(self, recid, start=0, end=None):
        """Load samples ``start:end`` of recording ``recid``.

        Returns:
            ``(data, rate)`` as produced by the module-level ``load_wav``.
        """
        samples, samplerate = load_wav(self.wavs[recid], start, end)
        return samples, samplerate
diff --git a/funasr/modules/eend_ola/utils/losses.py b/funasr/modules/eend_ola/utils/losses.py
index af0181d..756952d 100644
--- a/funasr/modules/eend_ola/utils/losses.py
+++ b/funasr/modules/eend_ola/utils/losses.py
@@ -1,11 +1,10 @@
import numpy as np
import torch
import torch.nn.functional as F
-from itertools import permutations
-from torch import nn
+from scipy.optimize import linear_sum_assignment
-def standard_loss(ys, ts, label_delay=0):
+def standard_loss(ys, ts):
losses = [F.binary_cross_entropy(torch.sigmoid(y), t) * len(y) for y, t in zip(ys, ts)]
loss = torch.sum(torch.stack(losses))
n_frames = torch.from_numpy(np.array(np.sum([t.shape[0] for t in ts]))).to(torch.float32).to(ys[0].device)
@@ -13,55 +12,29 @@
return loss
-def batch_pit_n_speaker_loss(ys, ts, n_speakers_list):
- max_n_speakers = ts[0].shape[1]
- olens = [y.shape[0] for y in ys]
- ys = nn.utils.rnn.pad_sequence(ys, batch_first=True, padding_value=-1)
- ys_mask = [torch.ones(olen).to(ys.device) for olen in olens]
- ys_mask = torch.nn.utils.rnn.pad_sequence(ys_mask, batch_first=True, padding_value=0).unsqueeze(-1)
+def fast_batch_pit_n_speaker_loss(ys, ts):
+ with torch.no_grad():
+ bs = len(ys)
+ indices = []
+ for b in range(bs):
+ y = ys[b].transpose(0, 1)
+ t = ts[b].transpose(0, 1)
+ C, _ = t.shape
+ y = y[:, None, :].repeat(1, C, 1)
+ t = t[None, :, :].repeat(C, 1, 1)
+ bce_loss = F.binary_cross_entropy(torch.sigmoid(y), t, reduction="none").mean(-1)
+ C = bce_loss.cpu()
+ indices.append(linear_sum_assignment(C))
+ labels_perm = [t[:, idx[1]] for t, idx in zip(ts, indices)]
- losses = []
- for shift in range(max_n_speakers):
- ts_roll = [torch.roll(t, -shift, dims=1) for t in ts]
- ts_roll = nn.utils.rnn.pad_sequence(ts_roll, batch_first=True, padding_value=-1)
- loss = F.binary_cross_entropy(torch.sigmoid(ys), ts_roll, reduction='none')
- if ys_mask is not None:
- loss = loss * ys_mask
- loss = torch.sum(loss, dim=1)
- losses.append(loss)
- losses = torch.stack(losses, dim=2)
+ return labels_perm
- perms = np.array(list(permutations(range(max_n_speakers)))).astype(np.float32)
- perms = torch.from_numpy(perms).to(losses.device)
- y_ind = torch.arange(max_n_speakers, dtype=torch.float32, device=losses.device)
- t_inds = torch.fmod(perms - y_ind, max_n_speakers).to(torch.long)
- losses_perm = []
- for t_ind in t_inds:
- losses_perm.append(
- torch.mean(losses[:, y_ind.to(torch.long), t_ind], dim=1))
- losses_perm = torch.stack(losses_perm, dim=1)
-
- def select_perm_indices(num, max_num):
- perms = list(permutations(range(max_num)))
- sub_perms = list(permutations(range(num)))
- return [
- [x[:num] for x in perms].index(perm)
- for perm in sub_perms]
-
- masks = torch.full_like(losses_perm, device=losses.device, fill_value=float('inf'))
- for i, t in enumerate(ts):
- n_speakers = n_speakers_list[i]
- indices = select_perm_indices(n_speakers, max_n_speakers)
- masks[i, indices] = 0
- losses_perm += masks
-
- min_loss = torch.sum(torch.min(losses_perm, dim=1)[0])
- n_frames = torch.from_numpy(np.array(np.sum([t.shape[0] for t in ts]))).to(losses.device)
- min_loss = min_loss / n_frames
-
- min_indices = torch.argmin(losses_perm, dim=1)
- labels_perm = [t[:, perms[idx].to(torch.long)] for t, idx in zip(ts, min_indices)]
- labels_perm = [t[:, :n_speakers] for t, n_speakers in zip(labels_perm, n_speakers_list)]
-
- return min_loss, labels_perm
def cal_power_loss(logits, power_ts):
    """Frame-weighted average cross-entropy over a batch of sequences.

    Args:
        logits: list of per-sequence logit tensors, one row per frame.
        power_ts: list of per-sequence class-index targets; cast to long
            before use.

    Returns:
        Scalar tensor: the per-sequence mean cross-entropies, each weighted
        by its number of logit rows, summed and divided by the total number
        of target frames.
    """
    weighted_losses = []
    for logit, power_t in zip(logits, power_ts):
        mean_ce = F.cross_entropy(input=logit, target=power_t.to(torch.long))
        # Undo the per-sequence mean so long sequences count proportionally.
        weighted_losses.append(mean_ce * len(logit))
    total = torch.sum(torch.stack(weighted_losses))
    frame_count = np.sum([power_t.shape[0] for power_t in power_ts])
    n_frames = torch.from_numpy(np.array(frame_count)).to(torch.float32).to(
        power_ts[0].device)
    return total / n_frames
diff --git a/funasr/utils/prepare_data.py b/funasr/utils/prepare_data.py
index 0e773bb..8d82a2f 100644
--- a/funasr/utils/prepare_data.py
+++ b/funasr/utils/prepare_data.py
@@ -196,12 +196,16 @@
def prepare_data(args, distributed_option):
distributed = distributed_option.distributed
+ data_names = args.dataset_conf.get("data_names", "speech,text").split(",")
+ data_types = args.dataset_conf.get("data_types", "sound,text").split(",")
+ file_names = args.data_file_names.split(",")
+ batch_type = args.dataset_conf["batch_conf"]["batch_type"]
if not distributed or distributed_option.dist_rank == 0:
if hasattr(args, "filter_input") and args.filter_input:
filter_wav_text(args.data_dir, args.train_set)
filter_wav_text(args.data_dir, args.valid_set)
- if args.dataset_type == "small":
+ if args.dataset_type == "small" and batch_type != "unsorted":
calc_shape(args, args.train_set)
calc_shape(args, args.valid_set)
@@ -209,9 +213,6 @@
generate_data_list(args, args.data_dir, args.train_set)
generate_data_list(args, args.data_dir, args.valid_set)
- data_names = args.dataset_conf.get("data_names", "speech,text").split(",")
- data_types = args.dataset_conf.get("data_types", "sound,text").split(",")
- file_names = args.data_file_names.split(",")
print("data_names: {}, data_types: {}, file_names: {}".format(data_names, data_types, file_names))
assert len(data_names) == len(data_types) == len(file_names)
if args.dataset_type == "small":
--
Gitblit v1.9.1