python/FunASR-XL.git

			@@ -29,9 +29,8 @@
			from funasr.modules.nets_utils import make_pad_mask, pad_list
			from funasr.modules.nets_utils import th_accuracy
			from funasr.torch_utils.device_funcs import force_gatherable
			from funasr.train.abs_espnet_model import AbsESPnetModel
			from funasr.models.base_model import FunASRModel
			from funasr.models.predictor.cif import CifPredictorV3


			if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
			from torch.cuda.amp import autocast
			@@ -42,7 +41,7 @@
			yield


			class Paraformer(AbsESPnetModel):
			class Paraformer(FunASRModel):
			"""
			Author: Speech Lab of DAMO Academy, Alibaba Group
			Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
			@@ -56,9 +55,7 @@
			frontend: Optional[AbsFrontend],
			specaug: Optional[AbsSpecAug],
			normalize: Optional[AbsNormalize],
			preencoder: Optional[AbsPreEncoder],
			encoder: AbsEncoder,
			postencoder: Optional[AbsPostEncoder],
			decoder: AbsDecoder,
			ctc: CTC,
			ctc_weight: float = 0.5,
			@@ -79,6 +76,9 @@
			predictor_bias: int = 0,
			sampling_ratio: float = 0.2,
			share_embedding: bool = False,
			preencoder: Optional[AbsPreEncoder] = None,
			postencoder: Optional[AbsPostEncoder] = None,
			use_1st_decoder_loss: bool = False,
			):
			assert check_argument_types()
			assert 0.0 <= ctc_weight <= 1.0, ctc_weight
			@@ -145,6 +145,8 @@
			if self.share_embedding:
			self.decoder.embed = None

			self.use_1st_decoder_loss = use_1st_decoder_loss

			def forward(
			self,
			speech: torch.Tensor,
			@@ -153,7 +155,455 @@
			text_lengths: torch.Tensor,
			) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
			"""Frontend + Encoder + Decoder + Calc loss
			Args:
			speech: (Batch, Length, ...)
			speech_lengths: (Batch, )
			text: (Batch, Length)
			text_lengths: (Batch,)
			"""
			assert text_lengths.dim() == 1, text_lengths.shape
			# Check that batch_size is unified
			assert (
			speech.shape[0]
			== speech_lengths.shape[0]
			== text.shape[0]
			== text_lengths.shape[0]
			), (speech.shape, speech_lengths.shape, text.shape, text_lengths.shape)
			batch_size = speech.shape[0]
			self.step_cur += 1
			# for data-parallel
			text = text[:, : text_lengths.max()]
			speech = speech[:, :speech_lengths.max()]

			# 1. Encoder
			encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
			intermediate_outs = None
			if isinstance(encoder_out, tuple):
			intermediate_outs = encoder_out[1]
			encoder_out = encoder_out[0]

			loss_att, pre_loss_att, acc_att, cer_att, wer_att = None, None, None, None, None
			loss_ctc, cer_ctc = None, None
			loss_pre = None
			stats = dict()

			# 1. CTC branch
			if self.ctc_weight != 0.0:
			loss_ctc, cer_ctc = self._calc_ctc_loss(
			encoder_out, encoder_out_lens, text, text_lengths
			)

			# Collect CTC branch stats
			stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None
			stats["cer_ctc"] = cer_ctc

			# Intermediate CTC (optional)
			loss_interctc = 0.0
			if self.interctc_weight != 0.0 and intermediate_outs is not None:
			for layer_idx, intermediate_out in intermediate_outs:
			# we assume intermediate_out has the same length & padding
			# as those of encoder_out
			loss_ic, cer_ic = self._calc_ctc_loss(
			intermediate_out, encoder_out_lens, text, text_lengths
			)
			loss_interctc = loss_interctc + loss_ic

			# Collect Intermediate CTC stats
			stats["loss_interctc_layer{}".format(layer_idx)] = (
			loss_ic.detach() if loss_ic is not None else None
			)
			stats["cer_interctc_layer{}".format(layer_idx)] = cer_ic

			loss_interctc = loss_interctc / len(intermediate_outs)

			# calculate whole encoder loss
			loss_ctc = (
			1 - self.interctc_weight
			) * loss_ctc + self.interctc_weight * loss_interctc

			# 2b. Attention decoder branch
			if self.ctc_weight != 1.0:
			loss_att, acc_att, cer_att, wer_att, loss_pre, pre_loss_att = self._calc_att_loss(
			encoder_out, encoder_out_lens, text, text_lengths
			)

			# 3. CTC-Att loss definition
			if self.ctc_weight == 0.0:
			loss = loss_att + loss_pre * self.predictor_weight
			elif self.ctc_weight == 1.0:
			loss = loss_ctc
			else:
			loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight

			if self.use_1st_decoder_loss and pre_loss_att is not None:
			loss = loss + pre_loss_att

			# Collect Attn branch stats
			stats["loss_att"] = loss_att.detach() if loss_att is not None else None
			stats["pre_loss_att"] = pre_loss_att.detach() if pre_loss_att is not None else None
			stats["acc"] = acc_att
			stats["cer"] = cer_att
			stats["wer"] = wer_att
			stats["loss_pre"] = loss_pre.detach().cpu() if loss_pre is not None else None

			stats["loss"] = torch.clone(loss.detach())

			# force_gatherable: to-device and to-tensor if scalar for DataParallel
			loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
			return loss, stats, weight

			def collect_feats(
			    self,
			    speech: torch.Tensor,
			    speech_lengths: torch.Tensor,
			    text: torch.Tensor,
			    text_lengths: torch.Tensor,
			) -> Dict[str, torch.Tensor]:
			    """Return the features and feature lengths used for statistics collection.

			    Args:
			        speech: input handed to ``_extract_feats`` (or returned untouched).
			        speech_lengths: lengths that accompany ``speech``.
			        text: not used by this method.
			        text_lengths: not used by this method.

			    Returns:
			        Dict with the keys ``"feats"`` and ``"feats_lengths"``.
			    """
			    if not self.extract_feats_in_collect_stats:
			        # Extraction is switched off: hand back the raw inputs as dummy stats.
			        logging.warning(
			            "Generating dummy stats for feats and feats_lengths, "
			            "because encoder_conf.extract_feats_in_collect_stats is "
			            f"{self.extract_feats_in_collect_stats}"
			        )
			        return {"feats": speech, "feats_lengths": speech_lengths}

			    extracted, extracted_lengths = self._extract_feats(speech, speech_lengths)
			    return {"feats": extracted, "feats_lengths": extracted_lengths}

			def encode(
			self, speech: torch.Tensor, speech_lengths: torch.Tensor
			) -> Tuple[torch.Tensor, torch.Tensor]:
			"""Frontend + Encoder. Note that this method is used by asr_inference.py

			Extracts features, optionally applies specaug (training only),
			normalization and the pre-encoder, runs the encoder, then the optional
			post-encoder.

			Args:
			speech: (Batch, Length, ...)
			speech_lengths: (Batch, )

			Returns:
			``(encoder_out, encoder_out_lens)``; when the encoder output is a
			tuple, ``((encoder_out, intermediate_outs), encoder_out_lens)``.
			"""
			# NOTE(review): indentation is flattened in this view, so which of the
			# steps below run inside autocast(False) cannot be determined here - confirm.
			with autocast(False):
			# 1. Extract feats
			feats, feats_lengths = self._extract_feats(speech, speech_lengths)

			# 2. Data augmentation (only while self.training is set)
			if self.specaug is not None and self.training:
			feats, feats_lengths = self.specaug(feats, feats_lengths)

			# 3. Normalization for feature: e.g. Global-CMVN, Utterance-CMVN
			if self.normalize is not None:
			feats, feats_lengths = self.normalize(feats, feats_lengths)

			# Pre-encoder, e.g. used for raw input data
			if self.preencoder is not None:
			feats, feats_lengths = self.preencoder(feats, feats_lengths)

			# 4. Forward encoder
			# feats: (Batch, Length, Dim)
			# -> encoder_out: (Batch, Length2, Dim2)
			# The CTC module is handed to the encoder only when the encoder
			# reports interctc_use_conditioning.
			if self.encoder.interctc_use_conditioning:
			encoder_out, encoder_out_lens, _ = self.encoder(
			feats, feats_lengths, ctc=self.ctc
			)
			else:
			encoder_out, encoder_out_lens, _ = self.encoder(feats, feats_lengths)
			# A tuple output carries the intermediate outputs in its second element.
			intermediate_outs = None
			if isinstance(encoder_out, tuple):
			intermediate_outs = encoder_out[1]
			encoder_out = encoder_out[0]

			# Post-encoder, e.g. NLU
			if self.postencoder is not None:
			encoder_out, encoder_out_lens = self.postencoder(
			encoder_out, encoder_out_lens
			)

			# Sanity check: the batch dimension must match the input batch.
			assert encoder_out.size(0) == speech.size(0), (
			encoder_out.size(),
			speech.size(0),
			)
			# Sanity check: dimension 1 must not exceed the largest reported length.
			assert encoder_out.size(1) <= encoder_out_lens.max(), (
			encoder_out.size(),
			encoder_out_lens.max(),
			)

			if intermediate_outs is not None:
			return (encoder_out, intermediate_outs), encoder_out_lens

			return encoder_out, encoder_out_lens

			def calc_predictor(self, encoder_out, encoder_out_lens):
			    """Run the predictor on the encoder output without target tokens.

			    Args:
			        encoder_out: encoder output; ``size(1)`` fixes the mask length.
			        encoder_out_lens: lengths used to build the encoder mask.

			    Returns:
			        ``(pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index)``
			        exactly as produced by ``self.predictor``.
			    """
			    pad_mask = make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))
			    # Inverted padding mask with a singleton middle axis, on the encoder's device.
			    encoder_out_mask = (~pad_mask[:, None, :]).to(encoder_out.device)
			    predictor_outs = self.predictor(
			        encoder_out, None, encoder_out_mask, ignore_id=self.ignore_id
			    )
			    pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs
			    return pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index

			def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens):
			    """Decode ``sematic_embeds`` and return log-probabilities.

			    Returns:
			        ``(decoder_out, ys_pad_lens)`` where ``decoder_out`` is the
			        log-softmax over the last dimension of the first decoder output.
			    """
			    first_out = self.decoder(
			        encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens
			    )[0]
			    return torch.log_softmax(first_out, dim=-1), ys_pad_lens

			def _extract_feats(
			self, speech: torch.Tensor, speech_lengths: torch.Tensor
			) -> Tuple[torch.Tensor, torch.Tensor]:
			assert speech_lengths.dim() == 1, speech_lengths.shape

			# for data-parallel
			speech = speech[:, : speech_lengths.max()]
			if self.frontend is not None:
			# Frontend
			# e.g. STFT and Feature extract
			# data_loader may send time-domain signal in this case
			# speech (Batch, NSamples) -> feats: (Batch, NFrames, Dim)
			feats, feats_lengths = self.frontend(speech, speech_lengths)
			else:
			# No frontend and no feature extract
			feats, feats_lengths = speech, speech_lengths
			return feats, feats_lengths

			def nll(
			    self,
			    encoder_out: torch.Tensor,
			    encoder_out_lens: torch.Tensor,
			    ys_pad: torch.Tensor,
			    ys_pad_lens: torch.Tensor,
			) -> torch.Tensor:
			    """Return the per-utterance negative log-likelihood from the decoder.

			    Normally called through ``batchify_nll``.

			    Args:
			        encoder_out: (Batch, Length, Dim)
			        encoder_out_lens: (Batch,)
			        ys_pad: (Batch, Length)
			        ys_pad_lens: (Batch,)

			    Returns:
			        Tensor holding one summed NLL value per batch entry.
			    """
			    ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)

			    # 1. Forward decoder; decoder input lengths are the target lengths plus one
			    decoder_out, _ = self.decoder(
			        encoder_out, encoder_out_lens, ys_in_pad, ys_pad_lens + 1
			    )  # [batch, seqlen, dim]
			    num_utts = decoder_out.size(0)
			    flat_logits = decoder_out.view(-1, decoder_out.size(2))

			    # Token-level NLL; positions equal to ignore_id contribute nothing.
			    token_nll = torch.nn.functional.cross_entropy(
			        flat_logits,
			        ys_out_pad.view(-1),
			        ignore_index=self.ignore_id,
			        reduction="none",
			    )
			    utt_nll = token_nll.view(num_utts, -1).sum(dim=1)
			    assert utt_nll.size(0) == num_utts
			    return utt_nll

			def batchify_nll(
			    self,
			    encoder_out: torch.Tensor,
			    encoder_out_lens: torch.Tensor,
			    ys_pad: torch.Tensor,
			    ys_pad_lens: torch.Tensor,
			    batch_size: int = 100,
			):
			    """Compute negative log likelihood (nll) from the transformer decoder.

			    To avoid OOM, this function separates the input into batches, calls
			    ``nll`` on each batch and concatenates the results.

			    Args:
			        encoder_out: (Batch, Length, Dim)
			        encoder_out_lens: (Batch,)
			        ys_pad: (Batch, Length)
			        ys_pad_lens: (Batch,)
			        batch_size: int, samples each batch contains when computing nll;
			            you may change this to avoid OOM or increase GPU memory usage.

			    Returns:
			        Tensor with one nll value per entry of ``encoder_out``.

			    Raises:
			        ValueError: if the input has to be split and ``batch_size`` is not
			            positive (the previous loop never terminated in that case).
			    """
			    total_num = encoder_out.size(0)
			    if total_num <= batch_size:
			        nll = self.nll(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens)
			    else:
			        if batch_size <= 0:
			            raise ValueError(f"batch_size must be positive, got {batch_size}")
			        # Slices past the end are clamped, so the last chunk may be shorter.
			        chunks = [
			            self.nll(
			                encoder_out[start:start + batch_size, :, :],
			                encoder_out_lens[start:start + batch_size],
			                ys_pad[start:start + batch_size, :],
			                ys_pad_lens[start:start + batch_size],
			            )
			            for start in range(0, total_num, batch_size)
			        ]
			        nll = torch.cat(chunks)
			    assert nll.size(0) == total_num
			    return nll

			def _calc_att_loss(
			self,
			encoder_out: torch.Tensor,
			encoder_out_lens: torch.Tensor,
			ys_pad: torch.Tensor,
			ys_pad_lens: torch.Tensor,
			):
			"""Compute the decoder loss, the predictor length loss and accuracy metrics.

			Args:
			encoder_out: encoder output; its size(1) sets the mask length.
			encoder_out_lens: lengths used to build the encoder mask.
			ys_pad: padded target token ids.
			ys_pad_lens: target lengths.

			Returns:
			Tuple ``(loss_att, acc_att, cer_att, wer_att, loss_pre, pre_loss_att)``.
			``cer_att``/``wer_att`` are None while training or without an error
			calculator; ``pre_loss_att`` stays None unless ``sampler_with_grad``
			is used.
			"""
			# NOTE(review): indentation is flattened in this view; the nesting of the
			# branches below is inferred from statement order - confirm against the file.
			# Inverted padding mask with a singleton middle axis, on the encoder's device.
			encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
			encoder_out.device)
			# With predictor_bias == 1 the targets are replaced by the second output of
			# add_sos_eos and the target lengths grow by the bias.
			if self.predictor_bias == 1:
			_, ys_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)
			ys_pad_lens = ys_pad_lens + self.predictor_bias
			pre_acoustic_embeds, pre_token_length, _, pre_peak_index = self.predictor(encoder_out, ys_pad, encoder_out_mask,
			ignore_id=self.ignore_id)

			# 0. sampler
			decoder_out_1st = None
			pre_loss_att = None
			if self.sampling_ratio > 0.0:
			# Log the sampler state only on the first step(s).
			if self.step_cur < 2:
			logging.info("enable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
			# sampler_with_grad additionally returns the loss of the first decoder pass.
			if self.use_1st_decoder_loss:
			sematic_embeds, decoder_out_1st, pre_loss_att = self.sampler_with_grad(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens,
			pre_acoustic_embeds)
			else:
			sematic_embeds, decoder_out_1st = self.sampler(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens,
			pre_acoustic_embeds)
			else:
			if self.step_cur < 2:
			logging.info("disable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
			# Without sampling the decoder consumes the predictor embeddings directly.
			sematic_embeds = pre_acoustic_embeds

			# 1. Forward decoder
			decoder_outs = self.decoder(
			encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens
			)
			decoder_out, _ = decoder_outs[0], decoder_outs[1]

			# Metrics use the first-pass output when the sampler produced one.
			if decoder_out_1st is None:
			decoder_out_1st = decoder_out
			# 2. Compute attention loss
			loss_att = self.criterion_att(decoder_out, ys_pad)
			acc_att = th_accuracy(
			decoder_out_1st.view(-1, self.vocab_size),
			ys_pad,
			ignore_label=self.ignore_id,
			)
			# Predictor loss compares the target lengths with the predicted token lengths.
			loss_pre = self.criterion_pre(ys_pad_lens.type_as(pre_token_length), pre_token_length)

			# Compute cer/wer using attention-decoder
			if self.training or self.error_calculator is None:
			cer_att, wer_att = None, None
			else:
			ys_hat = decoder_out_1st.argmax(dim=-1)
			cer_att, wer_att = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())

			return loss_att, acc_att, cer_att, wer_att, loss_pre, pre_loss_att

			def sampler(self, encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, pre_acoustic_embeds):
			    """Mix predictor embeddings with target-token embeddings after a first decoder pass.

			    A first decoder pass (without gradients) predicts tokens from
			    ``pre_acoustic_embeds``. For every sequence,
			    ``(num_targets - num_correct) * sampling_ratio`` randomly chosen positions
			    have their acoustic embedding replaced by the embedding of the target token.

			    Args:
			        encoder_out: encoder output passed to the decoder.
			        encoder_out_lens: encoder output lengths passed to the decoder.
			        ys_pad: padded target ids; entries equal to ``self.ignore_id`` are padding.
			        ys_pad_lens: target lengths.
			        pre_acoustic_embeds: embeddings produced by the predictor.

			    Returns:
			        ``(sematic_embeds * tgt_mask, decoder_out * tgt_mask)``: the mixed
			        embeddings and the first-pass decoder output, both masked by target length.
			    """
			    tgt_mask = (~make_pad_mask(ys_pad_lens, maxlen=ys_pad_lens.max())[:, :, None]).to(ys_pad.device)
			    # Zero the ids beyond each target length before the embedding lookup.
			    ys_pad_masked = ys_pad * tgt_mask[:, :, 0]
			    if self.share_embedding:
			        ys_pad_embed = self.decoder.output_layer.weight[ys_pad_masked]
			    else:
			        ys_pad_embed = self.decoder.embed(ys_pad_masked)

			    # NOTE(review): extent of the no_grad block reconstructed from a
			    # flattened source - confirm against the real file.
			    with torch.no_grad():
			        decoder_outs = self.decoder(
			            encoder_out, encoder_out_lens, pre_acoustic_embeds, ys_pad_lens
			        )
			        decoder_out, _ = decoder_outs[0], decoder_outs[1]
			        pred_tokens = decoder_out.argmax(-1)
			        nonpad_positions = ys_pad.ne(self.ignore_id)
			        seq_lens = nonpad_positions.sum(1)
			        same_num = ((pred_tokens == ys_pad) & nonpad_positions).sum(1)
			        input_mask = torch.ones_like(nonpad_positions)
			        bsz, _ = ys_pad.size()
			        for li in range(bsz):
			            target_num = ((seq_lens[li] - same_num[li]).float() * self.sampling_ratio).long()
			            if target_num > 0:
			                # Keep the index on the mask's own device instead of forcing CUDA,
			                # so this also works on CPU and on a non-current GPU.
			                chosen = torch.randperm(int(seq_lens[li]))[:target_num].to(input_mask.device)
			                input_mask[li].scatter_(dim=0, index=chosen, value=0)
			        input_mask = input_mask.eq(1)
			        input_mask = input_mask.masked_fill(~nonpad_positions, False)
			        input_mask_expand_dim = input_mask.unsqueeze(2).to(pre_acoustic_embeds.device)

			    # Acoustic embedding where the mask is set, target embedding elsewhere.
			    sematic_embeds = pre_acoustic_embeds.masked_fill(~input_mask_expand_dim, 0) + ys_pad_embed.masked_fill(
			        input_mask_expand_dim, 0)
			    return sematic_embeds * tgt_mask, decoder_out * tgt_mask

			def sampler_with_grad(self, encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, pre_acoustic_embeds):
			    """Variant of ``sampler`` whose first decoder pass also yields a loss.

			    The first decoder pass is not wrapped in ``torch.no_grad()`` here, and
			    its output is scored against ``ys_pad`` with ``self.criterion_att``.

			    Args:
			        encoder_out: encoder output passed to the decoder.
			        encoder_out_lens: encoder output lengths passed to the decoder.
			        ys_pad: padded target ids; entries equal to ``self.ignore_id`` are padding.
			        ys_pad_lens: target lengths.
			        pre_acoustic_embeds: embeddings produced by the predictor.

			    Returns:
			        ``(sematic_embeds * tgt_mask, decoder_out * tgt_mask, pre_loss_att)``.
			    """
			    tgt_mask = (~make_pad_mask(ys_pad_lens, maxlen=ys_pad_lens.max())[:, :, None]).to(ys_pad.device)
			    # Zero the ids beyond each target length before the embedding lookup.
			    ys_pad_masked = ys_pad * tgt_mask[:, :, 0]
			    if self.share_embedding:
			        ys_pad_embed = self.decoder.output_layer.weight[ys_pad_masked]
			    else:
			        ys_pad_embed = self.decoder.embed(ys_pad_masked)

			    decoder_outs = self.decoder(
			        encoder_out, encoder_out_lens, pre_acoustic_embeds, ys_pad_lens
			    )
			    # Loss of the first pass, returned to the caller as pre_loss_att.
			    pre_loss_att = self.criterion_att(decoder_outs[0], ys_pad)
			    decoder_out, _ = decoder_outs[0], decoder_outs[1]
			    pred_tokens = decoder_out.argmax(-1)
			    nonpad_positions = ys_pad.ne(self.ignore_id)
			    seq_lens = nonpad_positions.sum(1)
			    same_num = ((pred_tokens == ys_pad) & nonpad_positions).sum(1)
			    input_mask = torch.ones_like(nonpad_positions)
			    bsz, _ = ys_pad.size()
			    for li in range(bsz):
			        target_num = ((seq_lens[li] - same_num[li]).float() * self.sampling_ratio).long()
			        if target_num > 0:
			            # Keep the index on the mask's own device instead of forcing CUDA,
			            # so this also works on CPU and on a non-current GPU.
			            chosen = torch.randperm(int(seq_lens[li]))[:target_num].to(input_mask.device)
			            input_mask[li].scatter_(dim=0, index=chosen, value=0)
			    input_mask = input_mask.eq(1)
			    input_mask = input_mask.masked_fill(~nonpad_positions, False)
			    input_mask_expand_dim = input_mask.unsqueeze(2).to(pre_acoustic_embeds.device)

			    # Acoustic embedding where the mask is set, target embedding elsewhere.
			    sematic_embeds = pre_acoustic_embeds.masked_fill(~input_mask_expand_dim, 0) + ys_pad_embed.masked_fill(
			        input_mask_expand_dim, 0)

			    return sematic_embeds * tgt_mask, decoder_out * tgt_mask, pre_loss_att

			def _calc_ctc_loss(
			self,
			encoder_out: torch.Tensor,
			encoder_out_lens: torch.Tensor,
			ys_pad: torch.Tensor,
			ys_pad_lens: torch.Tensor,
			):
			# Calc CTC loss
			loss_ctc = self.ctc(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens)

			# Calc CER using CTC
			cer_ctc = None
			if not self.training and self.error_calculator is not None:
			ys_hat = self.ctc.argmax(encoder_out).data
			cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True)
			return loss_ctc, cer_ctc


			class ParaformerOnline(Paraformer):
			"""
			Author: Speech Lab, Alibaba Group, China
			Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
			https://arxiv.org/abs/2206.08317
			"""

			def __init__(
			    self, *args, **kwargs,
			):
			    """Forward every positional and keyword argument to ``Paraformer.__init__``."""
			    # ``args, *kwargs`` could not carry keyword arguments to the parent
			    # constructor; ``*args, **kwargs`` passes both kinds through unchanged.
			    super().__init__(*args, **kwargs)

			def forward(
			self,
			speech: torch.Tensor,
			speech_lengths: torch.Tensor,
			text: torch.Tensor,
			text_lengths: torch.Tensor,
			) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
			"""Frontend + Encoder + Decoder + Calc loss
			Args:
			speech: (Batch, Length, ...)
			speech_lengths: (Batch, )
			@@ -247,89 +697,14 @@
			loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
			return loss, stats, weight

			def collect_feats(
			    self,
			    speech: torch.Tensor,
			    speech_lengths: torch.Tensor,
			    text: torch.Tensor,
			    text_lengths: torch.Tensor,
			) -> Dict[str, torch.Tensor]:
			    """Return the features and feature lengths used for statistics collection.

			    Args:
			        speech: input handed to ``_extract_feats`` (or returned untouched).
			        speech_lengths: lengths that accompany ``speech``.
			        text: not used by this method.
			        text_lengths: not used by this method.

			    Returns:
			        Dict with the keys ``"feats"`` and ``"feats_lengths"``.
			    """
			    if not self.extract_feats_in_collect_stats:
			        # Extraction is switched off: hand back the raw inputs as dummy stats.
			        logging.warning(
			            "Generating dummy stats for feats and feats_lengths, "
			            "because encoder_conf.extract_feats_in_collect_stats is "
			            f"{self.extract_feats_in_collect_stats}"
			        )
			        return {"feats": speech, "feats_lengths": speech_lengths}

			    extracted, extracted_lengths = self._extract_feats(speech, speech_lengths)
			    return {"feats": extracted, "feats_lengths": extracted_lengths}

			def encode(
			self, speech: torch.Tensor, speech_lengths: torch.Tensor
			) -> Tuple[torch.Tensor, torch.Tensor]:
			"""Frontend + Encoder. Note that this method is used by asr_inference.py

			Extracts features, optionally applies specaug (training only),
			normalization and the pre-encoder, runs the encoder, then the optional
			post-encoder.

			Args:
			speech: (Batch, Length, ...)
			speech_lengths: (Batch, )

			Returns:
			``(encoder_out, encoder_out_lens)``; when the encoder output is a
			tuple, ``((encoder_out, intermediate_outs), encoder_out_lens)``.
			"""
			# NOTE(review): indentation is flattened in this view, so which of the
			# steps below run inside autocast(False) cannot be determined here - confirm.
			with autocast(False):
			# 1. Extract feats
			feats, feats_lengths = self._extract_feats(speech, speech_lengths)

			# 2. Data augmentation (only while self.training is set)
			if self.specaug is not None and self.training:
			feats, feats_lengths = self.specaug(feats, feats_lengths)

			# 3. Normalization for feature: e.g. Global-CMVN, Utterance-CMVN
			if self.normalize is not None:
			feats, feats_lengths = self.normalize(feats, feats_lengths)

			# Pre-encoder, e.g. used for raw input data
			if self.preencoder is not None:
			feats, feats_lengths = self.preencoder(feats, feats_lengths)

			# 4. Forward encoder
			# feats: (Batch, Length, Dim)
			# -> encoder_out: (Batch, Length2, Dim2)
			# The CTC module is handed to the encoder only when the encoder
			# reports interctc_use_conditioning.
			if self.encoder.interctc_use_conditioning:
			encoder_out, encoder_out_lens, _ = self.encoder(
			feats, feats_lengths, ctc=self.ctc
			)
			else:
			encoder_out, encoder_out_lens, _ = self.encoder(feats, feats_lengths)
			# A tuple output carries the intermediate outputs in its second element.
			intermediate_outs = None
			if isinstance(encoder_out, tuple):
			intermediate_outs = encoder_out[1]
			encoder_out = encoder_out[0]

			# Post-encoder, e.g. NLU
			if self.postencoder is not None:
			encoder_out, encoder_out_lens = self.postencoder(
			encoder_out, encoder_out_lens
			)

			# Sanity check: the batch dimension must match the input batch.
			assert encoder_out.size(0) == speech.size(0), (
			encoder_out.size(),
			speech.size(0),
			)
			# Sanity check: dimension 1 must not exceed the largest reported length.
			assert encoder_out.size(1) <= encoder_out_lens.max(), (
			encoder_out.size(),
			encoder_out_lens.max(),
			)

			if intermediate_outs is not None:
			return (encoder_out, intermediate_outs), encoder_out_lens

			return encoder_out, encoder_out_lens

			def encode_chunk(
			self, speech: torch.Tensor, speech_lengths: torch.Tensor, cache: dict = None
			) -> Tuple[torch.Tensor, torch.Tensor]:
			"""Frontend + Encoder. Note that this method is used by asr_inference.py
			Args:
			speech: (Batch, Length, ...)
			speech_lengths: (Batch, )
			@@ -375,27 +750,11 @@

			return encoder_out, torch.tensor([encoder_out.size(1)])

			def calc_predictor(self, encoder_out, encoder_out_lens):
			    """Run the predictor on the encoder output without target tokens.

			    Args:
			        encoder_out: encoder output; ``size(1)`` fixes the mask length.
			        encoder_out_lens: lengths used to build the encoder mask.

			    Returns:
			        ``(pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index)``
			        exactly as produced by ``self.predictor``.
			    """
			    pad_mask = make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))
			    # Inverted padding mask with a singleton middle axis, on the encoder's device.
			    encoder_out_mask = (~pad_mask[:, None, :]).to(encoder_out.device)
			    predictor_outs = self.predictor(
			        encoder_out, None, encoder_out_mask, ignore_id=self.ignore_id
			    )
			    pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs
			    return pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index

			def calc_predictor_chunk(self, encoder_out, cache=None):
			    """Run the predictor's chunk-wise forward pass.

			    Args:
			        encoder_out: encoder output for the current chunk.
			        cache: mapping whose ``"encoder"`` entry is handed to the predictor.

			    Returns:
			        ``(pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index)``.
			    """
			    chunk_outs = self.predictor.forward_chunk(encoder_out, cache["encoder"])
			    pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = chunk_outs
			    return pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index

			def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens):
			"""Decode ``sematic_embeds`` and return log-probabilities.

			Returns:
			``(decoder_out, ys_pad_lens)`` where ``decoder_out`` is the log-softmax
			over the last dimension of the first decoder output.
			"""

			decoder_outs = self.decoder(
			encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens
			)
			decoder_out = decoder_outs[0]
			decoder_out = torch.log_softmax(decoder_out, dim=-1)
			return decoder_out, ys_pad_lens
			# NOTE(review): the lines below follow the return above and use `cache`,
			# which is not a parameter of this method. They look like a body of
			# calc_predictor_chunk left here by a diff/merge - confirm and relocate.
			pre_acoustic_embeds, pre_token_length = \
			self.predictor.forward_chunk(encoder_out, cache["encoder"])
			return pre_acoustic_embeds, pre_token_length

			def cal_decoder_with_predictor_chunk(self, encoder_out, sematic_embeds, cache=None):
			decoder_outs = self.decoder.forward_chunk(
			@@ -404,210 +763,6 @@
			decoder_out = decoder_outs
			decoder_out = torch.log_softmax(decoder_out, dim=-1)
			return decoder_out

			def _extract_feats(
			self, speech: torch.Tensor, speech_lengths: torch.Tensor
			) -> Tuple[torch.Tensor, torch.Tensor]:
			assert speech_lengths.dim() == 1, speech_lengths.shape

			# for data-parallel
			speech = speech[:, : speech_lengths.max()]
			if self.frontend is not None:
			# Frontend
			# e.g. STFT and Feature extract
			# data_loader may send time-domain signal in this case
			# speech (Batch, NSamples) -> feats: (Batch, NFrames, Dim)
			feats, feats_lengths = self.frontend(speech, speech_lengths)
			else:
			# No frontend and no feature extract
			feats, feats_lengths = speech, speech_lengths
			return feats, feats_lengths

			def nll(
			    self,
			    encoder_out: torch.Tensor,
			    encoder_out_lens: torch.Tensor,
			    ys_pad: torch.Tensor,
			    ys_pad_lens: torch.Tensor,
			) -> torch.Tensor:
			    """Return the per-utterance negative log-likelihood from the decoder.

			    Normally called through ``batchify_nll``.

			    Args:
			        encoder_out: (Batch, Length, Dim)
			        encoder_out_lens: (Batch,)
			        ys_pad: (Batch, Length)
			        ys_pad_lens: (Batch,)

			    Returns:
			        Tensor holding one summed NLL value per batch entry.
			    """
			    ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)

			    # 1. Forward decoder; decoder input lengths are the target lengths plus one
			    decoder_out, _ = self.decoder(
			        encoder_out, encoder_out_lens, ys_in_pad, ys_pad_lens + 1
			    )  # [batch, seqlen, dim]
			    num_utts = decoder_out.size(0)
			    flat_logits = decoder_out.view(-1, decoder_out.size(2))

			    # Token-level NLL; positions equal to ignore_id contribute nothing.
			    token_nll = torch.nn.functional.cross_entropy(
			        flat_logits,
			        ys_out_pad.view(-1),
			        ignore_index=self.ignore_id,
			        reduction="none",
			    )
			    utt_nll = token_nll.view(num_utts, -1).sum(dim=1)
			    assert utt_nll.size(0) == num_utts
			    return utt_nll

			def batchify_nll(
			    self,
			    encoder_out: torch.Tensor,
			    encoder_out_lens: torch.Tensor,
			    ys_pad: torch.Tensor,
			    ys_pad_lens: torch.Tensor,
			    batch_size: int = 100,
			):
			    """Compute negative log likelihood (nll) from the transformer decoder.

			    To avoid OOM, this function separates the input into batches, calls
			    ``nll`` on each batch and concatenates the results.

			    Args:
			        encoder_out: (Batch, Length, Dim)
			        encoder_out_lens: (Batch,)
			        ys_pad: (Batch, Length)
			        ys_pad_lens: (Batch,)
			        batch_size: int, samples each batch contains when computing nll;
			            you may change this to avoid OOM or increase GPU memory usage.

			    Returns:
			        Tensor with one nll value per entry of ``encoder_out``.

			    Raises:
			        ValueError: if the input has to be split and ``batch_size`` is not
			            positive (the previous loop never terminated in that case).
			    """
			    total_num = encoder_out.size(0)
			    if total_num <= batch_size:
			        nll = self.nll(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens)
			    else:
			        if batch_size <= 0:
			            raise ValueError(f"batch_size must be positive, got {batch_size}")
			        # Slices past the end are clamped, so the last chunk may be shorter.
			        chunks = [
			            self.nll(
			                encoder_out[start:start + batch_size, :, :],
			                encoder_out_lens[start:start + batch_size],
			                ys_pad[start:start + batch_size, :],
			                ys_pad_lens[start:start + batch_size],
			            )
			            for start in range(0, total_num, batch_size)
			        ]
			        nll = torch.cat(chunks)
			    assert nll.size(0) == total_num
			    return nll

			def _calc_att_loss(
			self,
			encoder_out: torch.Tensor,
			encoder_out_lens: torch.Tensor,
			ys_pad: torch.Tensor,
			ys_pad_lens: torch.Tensor,
			):
			"""Compute the decoder loss, the predictor length loss and accuracy metrics.

			Args:
			encoder_out: encoder output; its size(1) sets the mask length.
			encoder_out_lens: lengths used to build the encoder mask.
			ys_pad: padded target token ids.
			ys_pad_lens: target lengths.

			Returns:
			Tuple ``(loss_att, acc_att, cer_att, wer_att, loss_pre)``.
			``cer_att``/``wer_att`` are None while training or without an error
			calculator.
			"""
			# NOTE(review): indentation is flattened in this view; the nesting of the
			# branches below is inferred from statement order - confirm against the file.
			# Inverted padding mask with a singleton middle axis, on the encoder's device.
			encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
			encoder_out.device)
			# With predictor_bias == 1 the targets are replaced by the second output of
			# add_sos_eos and the target lengths grow by the bias.
			if self.predictor_bias == 1:
			_, ys_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)
			ys_pad_lens = ys_pad_lens + self.predictor_bias
			pre_acoustic_embeds, pre_token_length, _, pre_peak_index = self.predictor(encoder_out, ys_pad, encoder_out_mask,
			ignore_id=self.ignore_id)

			# 0. sampler
			decoder_out_1st = None
			if self.sampling_ratio > 0.0:
			# Log the sampler state only on the first step(s).
			if self.step_cur < 2:
			logging.info("enable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
			sematic_embeds, decoder_out_1st = self.sampler(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens,
			pre_acoustic_embeds)
			else:
			if self.step_cur < 2:
			logging.info("disable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
			# Without sampling the decoder consumes the predictor embeddings directly.
			sematic_embeds = pre_acoustic_embeds

			# 1. Forward decoder
			decoder_outs = self.decoder(
			encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens
			)
			decoder_out, _ = decoder_outs[0], decoder_outs[1]

			# Metrics use the first-pass output when the sampler produced one.
			if decoder_out_1st is None:
			decoder_out_1st = decoder_out
			# 2. Compute attention loss
			loss_att = self.criterion_att(decoder_out, ys_pad)
			acc_att = th_accuracy(
			decoder_out_1st.view(-1, self.vocab_size),
			ys_pad,
			ignore_label=self.ignore_id,
			)
			# Predictor loss compares the target lengths with the predicted token lengths.
			loss_pre = self.criterion_pre(ys_pad_lens.type_as(pre_token_length), pre_token_length)

			# Compute cer/wer using attention-decoder
			if self.training or self.error_calculator is None:
			cer_att, wer_att = None, None
			else:
			ys_hat = decoder_out_1st.argmax(dim=-1)
			cer_att, wer_att = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())

			return loss_att, acc_att, cer_att, wer_att, loss_pre

			def sampler(self, encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, pre_acoustic_embeds):
			    """Mix predictor embeddings with target-token embeddings after a first decoder pass.

			    A first decoder pass (without gradients) predicts tokens from
			    ``pre_acoustic_embeds``. For every sequence,
			    ``(num_targets - num_correct) * sampling_ratio`` randomly chosen positions
			    have their acoustic embedding replaced by the embedding of the target token.

			    Args:
			        encoder_out: encoder output passed to the decoder.
			        encoder_out_lens: encoder output lengths passed to the decoder.
			        ys_pad: padded target ids; entries equal to ``self.ignore_id`` are padding.
			        ys_pad_lens: target lengths.
			        pre_acoustic_embeds: embeddings produced by the predictor.

			    Returns:
			        ``(sematic_embeds * tgt_mask, decoder_out * tgt_mask)``: the mixed
			        embeddings and the first-pass decoder output, both masked by target length.
			    """
			    tgt_mask = (~make_pad_mask(ys_pad_lens, maxlen=ys_pad_lens.max())[:, :, None]).to(ys_pad.device)
			    # Zero the ids beyond each target length before the embedding lookup.
			    ys_pad_masked = ys_pad * tgt_mask[:, :, 0]
			    if self.share_embedding:
			        ys_pad_embed = self.decoder.output_layer.weight[ys_pad_masked]
			    else:
			        ys_pad_embed = self.decoder.embed(ys_pad_masked)

			    # NOTE(review): extent of the no_grad block reconstructed from a
			    # flattened source - confirm against the real file.
			    with torch.no_grad():
			        decoder_outs = self.decoder(
			            encoder_out, encoder_out_lens, pre_acoustic_embeds, ys_pad_lens
			        )
			        decoder_out, _ = decoder_outs[0], decoder_outs[1]
			        pred_tokens = decoder_out.argmax(-1)
			        nonpad_positions = ys_pad.ne(self.ignore_id)
			        seq_lens = nonpad_positions.sum(1)
			        same_num = ((pred_tokens == ys_pad) & nonpad_positions).sum(1)
			        input_mask = torch.ones_like(nonpad_positions)
			        bsz, _ = ys_pad.size()
			        for li in range(bsz):
			            target_num = ((seq_lens[li] - same_num[li]).float() * self.sampling_ratio).long()
			            if target_num > 0:
			                # Keep the index on the mask's own device instead of forcing CUDA,
			                # so this also works on CPU and on a non-current GPU.
			                chosen = torch.randperm(int(seq_lens[li]))[:target_num].to(input_mask.device)
			                input_mask[li].scatter_(dim=0, index=chosen, value=0)
			        input_mask = input_mask.eq(1)
			        input_mask = input_mask.masked_fill(~nonpad_positions, False)
			        input_mask_expand_dim = input_mask.unsqueeze(2).to(pre_acoustic_embeds.device)

			    # Acoustic embedding where the mask is set, target embedding elsewhere.
			    sematic_embeds = pre_acoustic_embeds.masked_fill(~input_mask_expand_dim, 0) + ys_pad_embed.masked_fill(
			        input_mask_expand_dim, 0)
			    return sematic_embeds * tgt_mask, decoder_out * tgt_mask

			def _calc_ctc_loss(
			self,
			encoder_out: torch.Tensor,
			encoder_out_lens: torch.Tensor,
			ys_pad: torch.Tensor,
			ys_pad_lens: torch.Tensor,
			):
			# Calc CTC loss
			loss_ctc = self.ctc(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens)

			# Calc CER using CTC
			cer_ctc = None
			if not self.training and self.error_calculator is not None:
			ys_hat = self.ctc.argmax(encoder_out).data
			cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True)
			return loss_ctc, cer_ctc


			class ParaformerBert(Paraformer):
			@@ -623,9 +778,7 @@
			frontend: Optional[AbsFrontend],
			specaug: Optional[AbsSpecAug],
			normalize: Optional[AbsNormalize],
			preencoder: Optional[AbsPreEncoder],
			encoder: AbsEncoder,
			postencoder: Optional[AbsPostEncoder],
			decoder: AbsDecoder,
			ctc: CTC,
			ctc_weight: float = 0.5,
			@@ -648,6 +801,8 @@
			embeds_id: int = 2,
			embeds_loss_weight: float = 0.0,
			embed_dims: int = 768,
			preencoder: Optional[AbsPreEncoder] = None,
			postencoder: Optional[AbsPostEncoder] = None,
			):
			assert check_argument_types()
			assert 0.0 <= ctc_weight <= 1.0, ctc_weight
			@@ -779,7 +934,6 @@
			embed_lengths: torch.Tensor = None,
			) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
			"""Frontend + Encoder + Decoder + Calc loss

			Args:
			speech: (Batch, Length, ...)
			speech_lengths: (Batch, )
			@@ -798,9 +952,9 @@
			self.step_cur += 1
			# for data-parallel
			text = text[:, : text_lengths.max()]
			speech = speech[:, :speech_lengths.max(), :]
			speech = speech[:, :speech_lengths.max()]
			if embed is not None:
			embed = embed[:, :embed_lengths.max(), :]
			embed = embed[:, :embed_lengths.max()]

			# 1. Encoder
			encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
			@@ -888,74 +1042,73 @@


			class BiCifParaformer(Paraformer):

			"""
			Paraformer model with an extra cif predictor
			to conduct accurate timestamp prediction
			"""

			def __init__(
			    self,
			    vocab_size: int,
			    token_list: Union[Tuple[str, ...], List[str]],
			    frontend: Optional[AbsFrontend],
			    specaug: Optional[AbsSpecAug],
			    normalize: Optional[AbsNormalize],
			    encoder: AbsEncoder,
			    decoder: AbsDecoder,
			    ctc: CTC,
			    ctc_weight: float = 0.5,
			    interctc_weight: float = 0.0,
			    ignore_id: int = -1,
			    blank_id: int = 0,
			    sos: int = 1,
			    eos: int = 2,
			    lsm_weight: float = 0.0,
			    length_normalized_loss: bool = False,
			    report_cer: bool = True,
			    report_wer: bool = True,
			    sym_space: str = "<space>",
			    sym_blank: str = "<blank>",
			    extract_feats_in_collect_stats: bool = True,
			    predictor=None,
			    predictor_weight: float = 0.0,
			    predictor_bias: int = 0,
			    sampling_ratio: float = 0.2,
			    preencoder: Optional[AbsPreEncoder] = None,
			    postencoder: Optional[AbsPostEncoder] = None,
			):
			    """Build the model through ``Paraformer.__init__`` and check the predictor type.

			    All arguments are forwarded by keyword to the parent constructor.
			    ``preencoder`` and ``postencoder`` are optional trailing parameters,
			    as in the other model signatures in this file.

			    Raises:
			        AssertionError: if argument types or weight ranges are invalid, or
			            if the predictor is not a ``CifPredictorV3``.
			    """
			    # NOTE(review): the parameter list and the keyword list below each
			    # appeared twice in the source (a SyntaxError); one copy is kept -
			    # confirm that the trailing-default preencoder/postencoder form is intended.
			    assert check_argument_types()
			    assert 0.0 <= ctc_weight <= 1.0, ctc_weight
			    assert 0.0 <= interctc_weight < 1.0, interctc_weight

			    super().__init__(
			        vocab_size=vocab_size,
			        token_list=token_list,
			        frontend=frontend,
			        specaug=specaug,
			        normalize=normalize,
			        preencoder=preencoder,
			        encoder=encoder,
			        postencoder=postencoder,
			        decoder=decoder,
			        ctc=ctc,
			        ctc_weight=ctc_weight,
			        interctc_weight=interctc_weight,
			        ignore_id=ignore_id,
			        blank_id=blank_id,
			        sos=sos,
			        eos=eos,
			        lsm_weight=lsm_weight,
			        length_normalized_loss=length_normalized_loss,
			        report_cer=report_cer,
			        report_wer=report_wer,
			        sym_space=sym_space,
			        sym_blank=sym_blank,
			        extract_feats_in_collect_stats=extract_feats_in_collect_stats,
			        predictor=predictor,
			        predictor_weight=predictor_weight,
			        predictor_bias=predictor_bias,
			        sampling_ratio=sampling_ratio,
			    )
			    assert isinstance(self.predictor, CifPredictorV3), "BiCifParaformer should use CIFPredictorV3"

			@@ -1030,21 +1183,23 @@
			cer_att, wer_att = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())

			return loss_att, acc_att, cer_att, wer_att, loss_pre


			def calc_predictor(self, encoder_out, encoder_out_lens):
			    """Run the predictor once over the encoder output.

			    Args:
			        encoder_out: encoder output tensor; its size along dim 1 is
			            used as the mask length.
			        encoder_out_lens: per-utterance lengths passed to
			            ``make_pad_mask``.

			    Returns:
			        Tuple ``(pre_acoustic_embeds, pre_token_length, alphas,
			        pre_peak_index)``: the first four of the five values returned
			        by ``self.predictor``. The fifth value is discarded.
			    """
			    # Invert the padding mask and insert a singleton middle axis, then
			    # move it to the encoder output's device.
			    # NOTE(review): presumably True marks valid (non-padded) frames
			    # after the inversion; confirm against make_pad_mask.
			    encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
			        encoder_out.device)
			    # Single predictor invocation; calling it twice would only repeat
			    # the same computation and overwrite the same names.
			    pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index, _ = self.predictor(
			        encoder_out,
			        None,
			        encoder_out_mask,
			        ignore_id=self.ignore_id,
			    )
			    return pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index


			def calc_predictor_timestamp(self, encoder_out, encoder_out_lens, token_num):
			    """Query the predictor's upsampled timestamp outputs.

			    Args:
			        encoder_out: encoder output tensor; its size along dim 1 is
			            used as the mask length.
			        encoder_out_lens: per-utterance lengths passed to
			            ``make_pad_mask``.
			        token_num: forwarded unchanged to
			            ``self.predictor.get_upsample_timestamp``.

			    Returns:
			        Tuple ``(ds_alphas, ds_cif_peak, us_alphas, us_peaks)`` exactly
			        as returned by ``self.predictor.get_upsample_timestamp``.
			    """
			    # Invert the padding mask and insert a singleton middle axis, then
			    # move it to the encoder output's device.
			    encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
			        encoder_out.device)
			    ds_alphas, ds_cif_peak, us_alphas, us_peaks = self.predictor.get_upsample_timestamp(
			        encoder_out,
			        encoder_out_mask,
			        token_num,
			    )
			    return ds_alphas, ds_cif_peak, us_alphas, us_peaks

			def forward(
			@@ -1055,7 +1210,6 @@
			text_lengths: torch.Tensor,
			) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
			"""Frontend + Encoder + Decoder + Calc loss

			Args:
			speech: (Batch, Length, ...)
			speech_lengths: (Batch, )
			@@ -1138,7 +1292,8 @@
			elif self.ctc_weight == 1.0:
			loss = loss_ctc
			else:
			loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight + loss_pre2 * self.predictor_weight * 0.5
			loss = self.ctc_weight * loss_ctc + (
			1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight + loss_pre2 * self.predictor_weight * 0.5

			# Collect Attn branch stats
			stats["loss_att"] = loss_att.detach() if loss_att is not None else None
			@@ -1167,9 +1322,7 @@
			frontend: Optional[AbsFrontend],
			specaug: Optional[AbsSpecAug],
			normalize: Optional[AbsNormalize],
			preencoder: Optional[AbsPreEncoder],
			encoder: AbsEncoder,
			postencoder: Optional[AbsPostEncoder],
			decoder: AbsDecoder,
			ctc: CTC,
			ctc_weight: float = 0.5,
			@@ -1199,6 +1352,8 @@
			bias_encoder_type: str = 'lstm',
			label_bracket: bool = False,
			use_decoder_embedding: bool = False,
			preencoder: Optional[AbsPreEncoder] = None,
			postencoder: Optional[AbsPostEncoder] = None,
			):
			assert check_argument_types()
			assert 0.0 <= ctc_weight <= 1.0, ctc_weight
			@@ -1262,7 +1417,6 @@
			text_lengths: torch.Tensor,
			) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
			"""Frontend + Encoder + Decoder + Calc loss

			Args:
			speech: (Batch, Length, ...)
			speech_lengths: (Batch, )
			@@ -1646,4 +1800,4 @@
			"torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_tf,
			var_dict_tf[name_tf].shape))

			return var_dict_torch_update
			return var_dict_torch_update