python/FunASR-XL.git

			@@ -69,7 +69,7 @@
			self.stochastic_depth_rate = stochastic_depth_rate
			self.dropout_rate = dropout_rate

			def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
			def forward(self, x, mask, cache=None, mask_shift_chunk=None, mask_att_chunk_encoder=None):
			"""Compute encoded features.

			Args:
			@@ -106,7 +106,7 @@
			self.self_attn(
			x,
			mask,
			mask_shfit_chunk=mask_shfit_chunk,
			mask_shift_chunk=mask_shift_chunk,
			mask_att_chunk_encoder=mask_att_chunk_encoder,
			),
			),
			@@ -122,7 +122,7 @@
			self.self_attn(
			x,
			mask,
			mask_shfit_chunk=mask_shfit_chunk,
			mask_shift_chunk=mask_shift_chunk,
			mask_att_chunk_encoder=mask_att_chunk_encoder,
			)
			)
			@@ -131,7 +131,7 @@
			self.self_attn(
			x,
			mask,
			mask_shfit_chunk=mask_shfit_chunk,
			mask_shift_chunk=mask_shift_chunk,
			mask_att_chunk_encoder=mask_att_chunk_encoder,
			)
			)
			@@ -145,7 +145,7 @@
			if not self.normalize_before:
			x = self.norm2(x)

			return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder
			return x, mask, cache, mask_shift_chunk, mask_att_chunk_encoder

			def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
			"""Compute encoded features.
			@@ -212,7 +212,7 @@
			interctc_layer_idx: List[int] = [],
			interctc_use_conditioning: bool = False,
			kernel_size: int = 11,
			sanm_shfit: int = 0,
			sanm_shift: int = 0,
			lora_list: List[str] = None,
			lora_rank: int = 8,
			lora_alpha: int = 16,
			@@ -299,7 +299,7 @@
			output_size,
			attention_dropout_rate,
			kernel_size,
			sanm_shfit,
			sanm_shift,
			lora_list,
			lora_rank,
			lora_alpha,
			@@ -312,7 +312,7 @@
			output_size,
			attention_dropout_rate,
			kernel_size,
			sanm_shfit,
			sanm_shift,
			lora_list,
			lora_rank,
			lora_alpha,
			@@ -523,6 +523,7 @@
			feats_dim=560,
			model_name="encoder",
			onnx: bool = True,
			ctc_linear: nn.Module = None,
			):
			super().__init__()
			self.embed = model.embed
			@@ -553,6 +554,8 @@
			self.num_heads = model.encoders[0].self_attn.h
			self.hidden_size = model.encoders[0].self_attn.linear_out.out_features

			self.ctc_linear = ctc_linear

			def prepare_mask(self, mask):
			mask_3d_btd = mask[:, :, None]
			if len(mask.shape) == 2:
			@@ -566,6 +569,7 @@
			def forward(self, speech: torch.Tensor, speech_lengths: torch.Tensor, online: bool = False):
			if not online:
			speech = speech * self._output_size**0.5

			mask = self.make_pad_mask(speech_lengths)
			mask = self.prepare_mask(mask)
			if self.embed is None:
			@@ -581,6 +585,10 @@

			xs_pad = self.model.after_norm(xs_pad)

			if self.ctc_linear is not None:
			xs_pad = self.ctc_linear(xs_pad)
			xs_pad = F.softmax(xs_pad, dim=2)

			return xs_pad, speech_lengths

			def get_output_size(self):