嘉渊
2023-05-25 5635bfec22948447387613b6c9d5a0c5dbbd5ac4
update repo
5个文件已添加
129 ■■■■■ 已修改文件
egs/wenetspeech/conformer/conf/decode_asr_transformer_5beam.yaml 6 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
egs/wenetspeech/conformer/conf/train_asr_conformer.yaml 104 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
egs/wenetspeech/conformer/path.sh 5 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
egs/wenetspeech/conformer/run.sh 13 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
egs/wenetspeech/conformer/utils 1 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
egs/wenetspeech/conformer/conf/decode_asr_transformer_5beam.yaml
New file
@@ -0,0 +1,6 @@
# Joint CTC/attention beam-search decoding configuration (5-beam).
beam_size: 5        # number of hypotheses kept per decoding step
penalty: 0.0        # per-token insertion penalty (0.0 = disabled)
maxlenratio: 0.0    # max output length as ratio of input length; 0.0 presumably means "let the decoder decide" -- confirm against the decoder
minlenratio: 0.0    # minimum output length as ratio of input length
ctc_weight: 0.5     # weight of the CTC score in joint CTC/attention scoring
lm_weight: 0.7      # weight of the external language-model score
egs/wenetspeech/conformer/conf/train_asr_conformer.yaml
New file
@@ -0,0 +1,104 @@
# Conformer-encoder / Transformer-decoder ASR training configuration
# (WenetSpeech recipe).

# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder architecture type
    normalize_before: true
    rel_pos_type: latest
    pos_enc_layer_type: rel_pos
    selfattention_layer_type: rel_selfattn
    activation_type: swish
    macaron_style: true
    use_cnn_module: true
    cnn_module_kernel: 15

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# CTC related
ctc_conf:
    ignore_nan_grad: true

# frontend related
frontend: wav_frontend
frontend_conf:
    fs: 16000
    window: hamming
    n_mels: 80
    frame_length: 25    # analysis window length in ms
    frame_shift: 10     # frame shift in ms
    lfr_m: 1            # low-frame-rate stacking factor (1 = disabled)
    lfr_n: 1            # low-frame-rate subsampling factor (1 = disabled)

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

# optimization related
accum_grad: 4
grad_clip: 5
# NOTE(review): plain `none` is parsed by YAML as the string "none", not
# null -- the trainer presumably special-cases this string; confirm.
patience: none
max_epoch: 30
val_scheduler_criterion:
    - valid
    - acc
# A list of (dataset, metric, direction) triples; the nested-list shape
# ([["valid", "acc", "max"]]) is intentional and preserved here.
best_model_criterion:
    -   - valid
        - acc
        - max
keep_nbest_models: 10
optim: adam
optim_conf:
    lr: 0.0015
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 30000

# SpecAugment data augmentation
specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2

# dataset / dataloader
dataset_conf:
    data_names: speech,text
    data_types: sound,text
    shuffle: true
    shuffle_conf:
        shuffle_size: 2048
        sort_size: 500
    batch_conf:
        batch_type: token   # batch by total token count rather than utterance count
        batch_size: 32000
    num_workers: 8

log_interval: 50
# NOTE(review): `None` is parsed by YAML as the string "None", not null --
# verify the consumer expects this spelling; otherwise prefer `null`.
normalize: None
egs/wenetspeech/conformer/path.sh
New file
@@ -0,0 +1,5 @@
# Environment setup for the FunASR recipe: locate the repository root and
# put its executable scripts on PATH. Quote all expansions so the script
# also works when the checkout path contains spaces.
export FUNASR_DIR="$PWD/../../.."
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Prepend FunASR's entry-point scripts so recipe stages resolve them first.
export PATH="$FUNASR_DIR/funasr/bin:$PATH"
egs/wenetspeech/conformer/run.sh
New file
@@ -0,0 +1,13 @@
#!/usr/bin/env bash
# WenetSpeech conformer recipe driver. Only the configuration header is
# visible in this view (the diff declares 13 lines; later stages are cut off).
. ./path.sh || exit 1;
# machines configuration
CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"  # GPUs exposed to training/decoding jobs
gpu_num=8  # number of GPUs used for training
count=1  # presumably a job/repeat count per GPU -- confirm against the later (unseen) stages
gpu_inference=true  # Whether to perform gpu decoding, set false for cpu decoding
# for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
njob=5
train_cmd=utils/run.pl  # Kaldi-style job launcher used for training
infer_cmd=utils/run.pl  # Kaldi-style job launcher used for decoding
egs/wenetspeech/conformer/utils
New file
@@ -0,0 +1 @@
../../aishell/transformer/utils