python/FunASR-XL.git

			@@ -1,54 +1,57 @@
			encoder: chunk_conformer
			encoder_conf:
			main_conf:
			pos_wise_act_type: swish
			pos_enc_dropout_rate: 0.3
			conv_mod_act_type: swish
			activation_type: swish
			positional_dropout_rate: 0.5
			time_reduction_factor: 2
			unified_model_training: true
			default_chunk_size: 16
			jitter_range: 4
			left_chunk_size: 1
			input_conf:
			block_type: conv2d
			conv_size: 512
			embed_vgg_like: false
			subsampling_factor: 4
			num_frame: 1
			body_conf:
			- block_type: conformer
			linear_size: 2048
			hidden_size: 512
			heads: 8
			dropout_rate: 0.3
			pos_wise_dropout_rate: 0.3
			att_dropout_rate: 0.3
			conv_mod_kernel_size: 15
			linear_units: 2048
			output_size: 512
			attention_heads: 8
			dropout_rate: 0.5
			positional_dropout_rate: 0.5
			attention_dropout_rate: 0.5
			cnn_module_kernel: 15
			num_blocks: 12

			# decoder related
			decoder: rnn
			decoder_conf:
			rnnt_decoder: rnnt
			rnnt_decoder_conf:
			embed_size: 512
			hidden_size: 512
			embed_dropout_rate: 0.2
			dropout_rate: 0.1

			embed_dropout_rate: 0.5
			dropout_rate: 0.5
			joint_network_conf:
			joint_space_size: 512

			# frontend related
			frontend: wav_frontend
			frontend_conf:
			fs: 16000
			window: hamming
			n_mels: 80
			frame_length: 25
			frame_shift: 10
			lfr_m: 1
			lfr_n: 1


			# Auxiliary CTC
			model: rnnt_unified
			model_conf:
			auxiliary_ctc_weight: 0.0

			# minibatch related
			use_amp: true
			batch_type: numel
			batch_bins: 1600000
			num_workers: 16

			# optimization related
			accum_grad: 1
			grad_clip: 5
			max_epoch: 80
			max_epoch: 120
			val_scheduler_criterion:
			- valid
			- loss
			@@ -56,16 +59,14 @@
			- - valid
			- cer_transducer_chunk
			- min
			keep_nbest_models: 5
			keep_nbest_models: 10

			optim: adam
			optim_conf:
			lr: 0.0003
			lr: 0.001
			scheduler: warmuplr
			scheduler_conf:
			warmup_steps: 25000

			normalize: None

			specaug: specaug
			specaug_conf:
			@@ -75,10 +76,24 @@
			apply_freq_mask: true
			freq_mask_width_range:
			- 0
			- 30
			- 40
			num_freq_mask: 2
			apply_time_mask: true
			time_mask_width_range:
			- 0
			- 40
			num_time_mask: 2
			- 50
			num_time_mask: 5

			dataset_conf:
			data_names: speech,text
			data_types: sound,text
			shuffle: True
			shuffle_conf:
			shuffle_size: 2048
			sort_size: 500
			batch_conf:
			batch_type: token
			batch_size: 16000
			num_workers: 8

			log_interval: 50