python/FunASR-XL.git

			@@ -228,19 +228,13 @@
			if cache["cif_alphas"] is not None:
			alphas = torch.cat((cache["cif_alphas"], alphas), -1)

			#if cache["is_final"]:
			# tail_threshold = torch.tensor([self.tail_threshold], dtype=alphas.dtype).to(alphas.device)
			# tail_threshold = torch.reshape(tail_threshold, (1, 1))
			# alphas = torch.cat([alphas, tail_threshold], dim=1)
			# zeros_hidden = torch.zeros((b, 1, d), dtype=hidden.dtype).to(hidden.device)
			# hidden = torch.cat([hidden, zeros_hidden], dim=1)

			token_num = alphas.sum(-1)
			acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold)
			len_time = alphas.size(-1)
			last_fire_place = len_time - 1
			last_fire_remainds = 0.0
			pre_alphas_length = 0
			last_fire = False

			mask_chunk_peak_predictor = None
			if cache is not None:
			@@ -250,8 +244,6 @@
			pre_alphas_length = cache["cif_alphas"].size(-1)
			mask_chunk_peak_predictor[:, :pre_alphas_length] = 1.0
			mask_chunk_peak_predictor[:, pre_alphas_length + cache["pad_left"]:pre_alphas_length + cache["stride"] + cache["pad_left"]] = 1.0
			#if cache["is_final"]:
			# mask_chunk_peak_predictor[:, -1] = 1.0

			if mask_chunk_peak_predictor is not None:
			cif_peak = cif_peak * mask_chunk_peak_predictor.squeeze(-1)
			@@ -260,10 +252,15 @@
			if cif_peak[0][len_time - 1 - i] > self.threshold or cif_peak[0][len_time - 1 - i] == self.threshold:
			last_fire_place = len_time - 1 - i
			last_fire_remainds = cif_peak[0][len_time - 1 - i] - self.threshold
			last_fire = True
			break
			last_fire_remainds = torch.tensor([last_fire_remainds], dtype=alphas.dtype).to(alphas.device)
			cache["cif_hidden"] = hidden[:, last_fire_place:, :]
			cache["cif_alphas"] = torch.cat((last_fire_remainds.unsqueeze(0), alphas[:, last_fire_place+1:]), -1)
			if last_fire:
			last_fire_remainds = torch.tensor([last_fire_remainds], dtype=alphas.dtype).to(alphas.device)
			cache["cif_hidden"] = hidden[:, last_fire_place:, :]
			cache["cif_alphas"] = torch.cat((last_fire_remainds.unsqueeze(0), alphas[:, last_fire_place+1:]), -1)
			else:
			cache["cif_hidden"] = hidden
			cache["cif_alphas"] = alphas
			token_num_int = token_num.floor().type(torch.int32).item()
			return acoustic_embeds[:, 0:token_num_int, :], token_num, alphas, cif_peak