| | |
| | | CT-Transformer: Controllable time-delay transformer for real-time punctuation prediction and disfluency detection |
| | | https://arxiv.org/pdf/2003.01309.pdf |
| | | """ |
| | | |
| | | def __init__( |
| | | self, |
| | | *args, |
| | |
| | | ): |
| | | super().__init__(*args, **kwargs) |
| | | |
| | | |
| | | def punc_forward(self, text: torch.Tensor, text_lengths: torch.Tensor, vad_indexes: torch.Tensor, **kwargs): |
| | | def punc_forward( |
| | | self, text: torch.Tensor, text_lengths: torch.Tensor, vad_indexes: torch.Tensor, **kwargs |
| | | ): |
| | | """Compute loss value from buffer sequences. |
| | | |
| | | Args: |
| | |
| | | |
| | | def with_vad(self): |
| | | """Always return True. |
| | | |
| | | NOTE(review): presumably tells callers that this model expects VAD |
| | | indexes alongside the text -- confirm against the call sites. |
| | | """ |
| | | return True |
| | | |
| | | def inference(self, |
| | | data_in, |
| | | data_lengths=None, |
| | | key: list = None, |
| | | tokenizer=None, |
| | | frontend=None, |
| | | cache: dict = {}, |
| | | **kwargs, |
| | | ): |
| | | |
| | | def inference( |
| | | self, |
| | | data_in, |
| | | data_lengths=None, |
| | | key: list = None, |
| | | tokenizer=None, |
| | | frontend=None, |
| | | cache: dict = {}, |
| | | **kwargs, |
| | | ): |
| | | assert len(data_in) == 1 |
| | | |
| | | |
| | | if len(cache) == 0: |
| | | cache["pre_text"] = [] |
| | | text = load_audio_text_image_video(data_in, data_type=kwargs.get("kwargs", "text"))[0] |
| | | text = "".join(cache["pre_text"]) + " " + text |
| | | |
| | | |
| | | split_size = kwargs.get("split_size", 20) |
| | | |
| | |
| | | mini_sentences_id = split_to_mini_sentence(tokens_int, split_size) |
| | | assert len(mini_sentences) == len(mini_sentences_id) |
| | | cache_sent = [] |
| | | cache_sent_id = torch.from_numpy(np.array([], dtype='int32')) |
| | | cache_sent_id = torch.from_numpy(np.array([], dtype="int32")) |
| | | skip_num = 0 |
| | | sentence_punc_list = [] |
| | | sentence_words_list = [] |
| | |
| | | mini_sentence_id = np.concatenate((cache_sent_id, mini_sentence_id), axis=0) |
| | | data = { |
| | | "text": torch.unsqueeze(torch.from_numpy(mini_sentence_id), 0), |
| | | "text_lengths": torch.from_numpy(np.array([len(mini_sentence_id)], dtype='int32')), |
| | | "vad_indexes": torch.from_numpy(np.array([len(cache["pre_text"])], dtype='int32')), |
| | | "text_lengths": torch.from_numpy(np.array([len(mini_sentence_id)], dtype="int32")), |
| | | "vad_indexes": torch.from_numpy(np.array([len(cache["pre_text"])], dtype="int32")), |
| | | } |
| | | data = to_device(data, kwargs["device"]) |
| | | # y, _ = self.wrapped_model(**data) |
| | |
| | | sentenceEnd = -1 |
| | | last_comma_index = -1 |
| | | for i in range(len(punctuations) - 2, 1, -1): |
| | | if self.punc_list[punctuations[i]] == "。" or self.punc_list[punctuations[i]] == "?": |
| | | if ( |
| | | self.punc_list[punctuations[i]] == "。" |
| | | or self.punc_list[punctuations[i]] == "?" |
| | | ): |
| | | sentenceEnd = i |
| | | break |
| | | if last_comma_index < 0 and self.punc_list[punctuations[i]] == ",": |
| | | last_comma_index = i |
| | | |
| | | if sentenceEnd < 0 and len(mini_sentence) > cache_pop_trigger_limit and last_comma_index >= 0: |
| | | if ( |
| | | sentenceEnd < 0 |
| | | and len(mini_sentence) > cache_pop_trigger_limit |
| | | and last_comma_index >= 0 |
| | | ): |
| | | # The sentence is too long, cut off at a comma. |
| | | sentenceEnd = last_comma_index |
| | | punctuations[sentenceEnd] = self.sentence_end_id |
| | | cache_sent = mini_sentence[sentenceEnd + 1:] |
| | | cache_sent_id = mini_sentence_id[sentenceEnd + 1:] |
| | | mini_sentence = mini_sentence[0:sentenceEnd + 1] |
| | | punctuations = punctuations[0:sentenceEnd + 1] |
| | | cache_sent = mini_sentence[sentenceEnd + 1 :] |
| | | cache_sent_id = mini_sentence_id[sentenceEnd + 1 :] |
| | | mini_sentence = mini_sentence[0 : sentenceEnd + 1] |
| | | punctuations = punctuations[0 : sentenceEnd + 1] |
| | | |
| | | # if len(punctuations) == 0: |
| | | # continue |
| | |
| | | sentence_punc_list_out = [] |
| | | for i in range(0, len(sentence_words_list)): |
| | | if i > 0: |
| | | if len(sentence_words_list[i][0].encode()) == 1 and len(sentence_words_list[i - 1][-1].encode()) == 1: |
| | | if ( |
| | | len(sentence_words_list[i][0].encode()) == 1 |
| | | and len(sentence_words_list[i - 1][-1].encode()) == 1 |
| | | ): |
| | | sentence_words_list[i] = " " + sentence_words_list[i] |
| | | if skip_num < len(cache["pre_text"]): |
| | | skip_num += 1 |
| | |
| | | if sentence_punc_list[i] == "。" or sentence_punc_list[i] == "?": |
| | | sentenceEnd = i |
| | | break |
| | | cache["pre_text"] = sentence_words_list[sentenceEnd + 1:] |
| | | cache["pre_text"] = sentence_words_list[sentenceEnd + 1 :] |
| | | if sentence_out[-1] in self.punc_list: |
| | | sentence_out = sentence_out[:-1] |
| | | sentence_punc_list_out[-1] = "_" |
| | |
| | | punc_array = punctuations |
| | | else: |
| | | punc_array = torch.cat([punc_array, punctuations], dim=0) |
| | | |
| | | |
| | | result_i = {"key": key[0], "text": sentence_out, "punc_array": punc_array} |
| | | results.append(result_i) |
| | | |
| | | |
| | | return results, meta_data |
| | | |
| | | def export( |
| | | self, |
| | | **kwargs, |
| | | ): |
| | | """Switch this model into its export configuration and return it. |
| | | |
| | | Replaces ``self.encoder`` with the wrapper class looked up in |
| | | ``tables.encoder_classes`` under ``kwargs["encoder"] + "Export"`` and |
| | | rebinds ``self.forward`` to ``self._export_forward``. |
| | | |
| | | Args: |
| | | **kwargs: ``"encoder"`` is required (prefix of the registry key). |
| | | ``"type"`` is optional and defaults to ``"onnx"``; any other value |
| | | makes the wrapper receive ``onnx=False``. |
| | | |
| | | Returns: |
| | | This same model object, modified in place. |
| | | |
| | | Raises: |
| | | KeyError: If ``"encoder"`` is not present in ``kwargs``. |
| | | """ |
| | | |
| | | is_onnx = kwargs.get("type", "onnx") == "onnx" |
| | | # NOTE(review): .get() yields None for an unregistered key, which would |
| | | # only fail on the call below -- confirm the key is always registered. |
| | | encoder_class = tables.encoder_classes.get(kwargs["encoder"] + "Export") |
| | | self.encoder = encoder_class(self.encoder, onnx=is_onnx) |
| | | |
| | | # NOTE(review): the visible part of this file defines `export_forward` |
| | | # (no leading underscore); confirm `_export_forward` exists, e.g. on a |
| | | # base class. |
| | | self.forward = self._export_forward |
| | | |
| | | return self |
| | | def export(self, **kwargs): |
| | | |
| | | def export_forward(self, inputs: torch.Tensor, |
| | | text_lengths: torch.Tensor, |
| | | vad_indexes: torch.Tensor, |
| | | sub_masks: torch.Tensor, |
| | | ): |
| | | """Embed the inputs, run the encoder and return the decoder output. |
| | | from .export_meta import export_rebuild_model |
| | | |
| | | Args: |
| | | inputs (torch.Tensor): Passed to ``self.embed``; presumably token ids |
| | | of shape (batch, len) -- TODO confirm. |
| | | text_lengths (torch.Tensor): Forwarded unchanged to ``self.encoder``. |
| | | vad_indexes (torch.Tensor): Forwarded unchanged to ``self.encoder``. |
| | | NOTE(review): ``export_dummy_inputs`` / ``export_input_names`` |
| | | supply a 4-D mask called ``vad_masks`` in this position -- confirm |
| | | which form the export encoder expects. |
| | | sub_masks (torch.Tensor): Forwarded unchanged to ``self.encoder``. |
| | | |
| | | Returns: |
| | | The result of ``self.decoder`` applied to the first value returned |
| | | by the encoder; the encoder's second return value is discarded. |
| | | |
| | | """ |
| | | x = self.embed(inputs) |
| | | # mask = self._target_mask(input) |
| | | h, _ = self.encoder(x, text_lengths, vad_indexes, sub_masks) |
| | | y = self.decoder(h) |
| | | return y |
| | | |
| | | def export_dummy_inputs(self): |
| | | """Build example tensors for a single sequence of 120 positions. |
| | | |
| | | Returns: |
| | | A 4-tuple, in the same order as ``export_input_names``: |
| | | random int32 ids in ``[0, self.embed.num_embeddings)`` of shape |
| | | (1, 120); an int32 tensor ``[120]``; an all-ones float32 mask of |
| | | shape (1, 1, 120, 120); and a lower-triangular float32 mask of shape |
| | | (1, 1, 120, 120). |
| | | """ |
| | | length = 120 |
| | | text_indexes = torch.randint(0, self.embed.num_embeddings, (1, length)).type(torch.int32) |
| | | text_lengths = torch.tensor([length], dtype=torch.int32) |
| | | # All-ones mask with two leading singleton axes added. |
| | | vad_mask = torch.ones(length, length, dtype=torch.float32)[None, None, :, :] |
| | | sub_masks = torch.ones(length, length, dtype=torch.float32) |
| | | # Keep only the lower triangle (including the diagonal). |
| | | sub_masks = torch.tril(sub_masks).type(torch.float32) |
| | | return (text_indexes, text_lengths, vad_mask, sub_masks[None, None, :, :]) |
| | | |
| | | def export_input_names(self): |
| | | """Return the four input names, in the order used by ``export_dummy_inputs``.""" |
| | | return ['inputs', 'text_lengths', 'vad_masks', 'sub_masks'] |
| | | |
| | | def export_output_names(self): |
| | | """Return the single output name, ``'logits'``.""" |
| | | return ['logits'] |
| | | |
| | | def export_dynamic_axes(self): |
| | | """Return a mapping from tensor name to its variable-size axes. |
| | | |
| | | Axis 1 of ``inputs`` and ``logits`` and axes 2 and 3 of both masks are |
| | | listed; axis 0 is not listed for any tensor, and ``text_lengths`` has no |
| | | entry. ``vad_masks`` and ``sub_masks`` share the same axis labels. |
| | | |
| | | NOTE(review): presumably passed as ``dynamic_axes`` to the ONNX |
| | | exporter -- confirm at the call site. |
| | | """ |
| | | return { |
| | | 'inputs': { |
| | | 1: 'feats_length' |
| | | }, |
| | | 'vad_masks': { |
| | | 2: 'feats_length1', |
| | | 3: 'feats_length2' |
| | | }, |
| | | 'sub_masks': { |
| | | 2: 'feats_length1', |
| | | 3: 'feats_length2' |
| | | }, |
| | | 'logits': { |
| | | 1: 'logits_length' |
| | | }, |
| | | } |
| | | def export_name(self): |
| | | """Return the fixed string ``"model.onnx"``. |
| | | |
| | | NOTE(review): presumably the file name the exported graph is written |
| | | to -- confirm against the export driver. |
| | | """ |
| | | return "model.onnx" |
| | | models = export_rebuild_model(model=self, **kwargs) |
| | | return models |