| | |
| | | return acoustic_embeds, token_num, alphas, cif_peak
|
| | |
|
# Chunk-wise forward step: reads `hidden` and an optional `cache` dict, updates
# cache["cif_hidden"] / cache["cif_alphas"], and returns the 4-tuple
# (acoustic_embeds[:, 0:token_num_int, :], token_num, alphas, cif_peak).
# Cache keys read in the visible lines: "is_final", "stride", "pad_left",
# "cif_hidden", "cif_alphas".
# NOTE(review): this text has lost its indentation and appears to be missing
# several statements, so the nesting of the `if` blocks below cannot be
# confirmed from this view -- check against the real file before editing.
| | | def forward_chunk(self, hidden, cache=None):
|
# `hidden` must be 3-D: its size is unpacked into exactly three values.
# b, t, d are not referenced again in the visible lines.
| | | b, t, d = hidden.size()
|
| | | h = hidden
|
# Swap dimensions 1 and 2 before handing the tensor to self.pad.
| | | context = h.transpose(1, 2)
|
# `queries` is not referenced again in the visible lines.
| | | queries = self.pad(context)
|
| | |
# NOTE(review): neither `alphas` nor `mask_chunk_predictor` is assigned in the
# visible lines above -- presumably defined on lines not shown; confirm.
| | | alphas = alphas * mask_chunk_predictor
|
| | |
|
| | | if cache is not None:
|
| | | if cache["is_final"]:
|
# Adds 0.45 to the alphas column at index stride + pad_left - 1.
# NOTE(review): the reason for the 0.45 constant is not stated here -- confirm.
| | | alphas[:, cache["stride"] + cache["pad_left"] - 1] += 0.45
|
| | | if cache["cif_hidden"] is not None:
|
# Prepend the cached hidden states along dimension 1.
| | | hidden = torch.cat((cache["cif_hidden"], hidden), 1)
|
# NOTE(review): no body for this `if` is visible in this view.
| | | if cache["cif_alphas"] is not None:
|
| | |
# NOTE(review): `len_time` is not assigned in the visible lines -- confirm
# where it comes from.
# Defaults used when the backward scan below finds no qualifying position.
| | | last_fire_place = len_time - 1
|
| | | last_fire_remainds = 0.0
|
| | | pre_alphas_length = 0
|
| | | last_fire = False
|
| | |
|
| | | mask_chunk_peak_predictor = None
|
| | | if cache is not None:
|
| | |
# NOTE(review): the only visible assignment to mask_chunk_peak_predictor is
# None, which cannot be slice-assigned; a tensor assignment is presumably on a
# line not shown -- confirm.
| | | mask_chunk_peak_predictor[:, :pre_alphas_length] = 1.0
|
# Set the columns in [pre_alphas_length + pad_left,
# pre_alphas_length + stride + pad_left) to 1.0.
| | | mask_chunk_peak_predictor[:, pre_alphas_length + cache["pad_left"]:pre_alphas_length + cache["stride"] + cache["pad_left"]] = 1.0
|
| | |
|
| | |
|
| | | if mask_chunk_peak_predictor is not None:
|
# NOTE(review): `cif_peak` is not assigned in the visible lines before this use.
| | | cif_peak = cif_peak * mask_chunk_peak_predictor.squeeze(-1)
|
| | |
|
| | |
# NOTE(review): `i` is not bound in the visible lines; the `break` below implies
# an enclosing loop whose header is not shown. The index len_time - 1 - i
# counts back from the last position as i grows. Only batch index 0 is read.
| | | if cif_peak[0][len_time - 1 - i] > self.threshold or cif_peak[0][len_time - 1 - i] == self.threshold:
|
| | | last_fire_place = len_time - 1 - i
|
# Amount by which the peak value exceeds self.threshold at that position.
| | | last_fire_remainds = cif_peak[0][len_time - 1 - i] - self.threshold
|
| | | last_fire = True
|
| | | break
|
# NOTE(review): the next three statements are textually identical to the ones
# under `if last_fire:` below. This looks like the before/after sides of a
# change shown together -- confirm which version the real file contains.
| | | last_fire_remainds = torch.tensor([last_fire_remainds], dtype=alphas.dtype).to(alphas.device)
|
| | | cache["cif_hidden"] = hidden[:, last_fire_place:, :]
|
| | | cache["cif_alphas"] = torch.cat((last_fire_remainds.unsqueeze(0), alphas[:, last_fire_place+1:]), -1)
|
| | | if last_fire:
|
# Wrap the remainder in a one-element tensor with alphas' dtype and device.
| | | last_fire_remainds = torch.tensor([last_fire_remainds], dtype=alphas.dtype).to(alphas.device)
|
# Cache hidden from last_fire_place onward; cache the remainder followed by
# the alphas after last_fire_place.
| | | cache["cif_hidden"] = hidden[:, last_fire_place:, :]
|
| | | cache["cif_alphas"] = torch.cat((last_fire_remainds.unsqueeze(0), alphas[:, last_fire_place+1:]), -1)
|
| | | else:
|
# No qualifying position found: cache hidden and alphas unchanged.
| | | cache["cif_hidden"] = hidden
|
| | | cache["cif_alphas"] = alphas
|
# NOTE(review): `token_num` and `acoustic_embeds` are not assigned in the
# visible lines. `.item()` needs a single-element tensor, so this presumably
# assumes a batch of one -- confirm.
| | | token_num_int = token_num.floor().type(torch.int32).item()
|
# Keep only the first token_num_int positions along dimension 1.
| | | return acoustic_embeds[:, 0:token_num_int, :], token_num, alphas, cif_peak
|
| | |
|