kongdeqiang
2026-03-13 28ccfbfc51068a663a80764e14074df5edf2b5ba
funasr/models/ct_transformer_streaming/model.py
@@ -31,6 +31,7 @@
    CT-Transformer: Controllable time-delay transformer for real-time punctuation prediction and disfluency detection
    https://arxiv.org/pdf/2003.01309.pdf
    """
    def __init__(
        self,
        *args,
@@ -38,8 +39,9 @@
    ):
        super().__init__(*args, **kwargs)
    def punc_forward(self, text: torch.Tensor, text_lengths: torch.Tensor, vad_indexes: torch.Tensor, **kwargs):
    def punc_forward(
        self, text: torch.Tensor, text_lengths: torch.Tensor, vad_indexes: torch.Tensor, **kwargs
    ):
        """Compute loss value from buffer sequences.
        Args:
@@ -55,23 +57,23 @@
    def with_vad(self):
        return True
    def inference(self,
                 data_in,
                 data_lengths=None,
                 key: list = None,
                 tokenizer=None,
                 frontend=None,
                 cache: dict = {},
                 **kwargs,
                 ):
    def inference(
        self,
        data_in,
        data_lengths=None,
        key: list = None,
        tokenizer=None,
        frontend=None,
        cache: dict = {},
        **kwargs,
    ):
        assert len(data_in) == 1
        if len(cache) == 0:
            cache["pre_text"] = []
        text = load_audio_text_image_video(data_in, data_type=kwargs.get("kwargs", "text"))[0]
        text = "".join(cache["pre_text"]) + " " + text
        split_size = kwargs.get("split_size", 20)
@@ -82,7 +84,7 @@
        mini_sentences_id = split_to_mini_sentence(tokens_int, split_size)
        assert len(mini_sentences) == len(mini_sentences_id)
        cache_sent = []
        cache_sent_id = torch.from_numpy(np.array([], dtype='int32'))
        cache_sent_id = torch.from_numpy(np.array([], dtype="int32"))
        skip_num = 0
        sentence_punc_list = []
        sentence_words_list = []
@@ -97,8 +99,8 @@
            mini_sentence_id = np.concatenate((cache_sent_id, mini_sentence_id), axis=0)
            data = {
                "text": torch.unsqueeze(torch.from_numpy(mini_sentence_id), 0),
                "text_lengths": torch.from_numpy(np.array([len(mini_sentence_id)], dtype='int32')),
                "vad_indexes": torch.from_numpy(np.array([len(cache["pre_text"])], dtype='int32')),
                "text_lengths": torch.from_numpy(np.array([len(mini_sentence_id)], dtype="int32")),
                "vad_indexes": torch.from_numpy(np.array([len(cache["pre_text"])], dtype="int32")),
            }
            data = to_device(data, kwargs["device"])
            # y, _ = self.wrapped_model(**data)
@@ -114,20 +116,27 @@
                sentenceEnd = -1
                last_comma_index = -1
                for i in range(len(punctuations) - 2, 1, -1):
                    if self.punc_list[punctuations[i]] == "。" or self.punc_list[punctuations[i]] == "?":
                    if (
                        self.punc_list[punctuations[i]] == "。"
                        or self.punc_list[punctuations[i]] == "?"
                    ):
                        sentenceEnd = i
                        break
                    if last_comma_index < 0 and self.punc_list[punctuations[i]] == ",":
                        last_comma_index = i
                if sentenceEnd < 0 and len(mini_sentence) > cache_pop_trigger_limit and last_comma_index >= 0:
                if (
                    sentenceEnd < 0
                    and len(mini_sentence) > cache_pop_trigger_limit
                    and last_comma_index >= 0
                ):
                    # The sentence is too long, cut off at a comma.
                    sentenceEnd = last_comma_index
                    punctuations[sentenceEnd] = self.sentence_end_id
                cache_sent = mini_sentence[sentenceEnd + 1:]
                cache_sent_id = mini_sentence_id[sentenceEnd + 1:]
                mini_sentence = mini_sentence[0:sentenceEnd + 1]
                punctuations = punctuations[0:sentenceEnd + 1]
                cache_sent = mini_sentence[sentenceEnd + 1 :]
                cache_sent_id = mini_sentence_id[sentenceEnd + 1 :]
                mini_sentence = mini_sentence[0 : sentenceEnd + 1]
                punctuations = punctuations[0 : sentenceEnd + 1]
            # if len(punctuations) == 0:
            #    continue
@@ -141,7 +150,10 @@
        sentence_punc_list_out = []
        for i in range(0, len(sentence_words_list)):
            if i > 0:
                if len(sentence_words_list[i][0].encode()) == 1 and len(sentence_words_list[i - 1][-1].encode()) == 1:
                if (
                    len(sentence_words_list[i][0].encode()) == 1
                    and len(sentence_words_list[i - 1][-1].encode()) == 1
                ):
                    sentence_words_list[i] = " " + sentence_words_list[i]
            if skip_num < len(cache["pre_text"]):
                skip_num += 1
@@ -158,7 +170,7 @@
            if sentence_punc_list[i] == "。" or sentence_punc_list[i] == "?":
                sentenceEnd = i
                break
        cache["pre_text"] = sentence_words_list[sentenceEnd + 1:]
        cache["pre_text"] = sentence_words_list[sentenceEnd + 1 :]
        if sentence_out[-1] in self.punc_list:
            sentence_out = sentence_out[:-1]
            sentence_punc_list_out[-1] = "_"
@@ -167,74 +179,15 @@
            punc_array = punctuations
        else:
            punc_array = torch.cat([punc_array, punctuations], dim=0)
        result_i = {"key": key[0], "text": sentence_out, "punc_array": punc_array}
        results.append(result_i)
        return results, meta_data
    def export(
        self,
        **kwargs,
    ):
        is_onnx = kwargs.get("type", "onnx") == "onnx"
        encoder_class = tables.encoder_classes.get(kwargs["encoder"] + "Export")
        self.encoder = encoder_class(self.encoder, onnx=is_onnx)
        self.forward = self._export_forward
        return self
    def export(self, **kwargs):
    def export_forward(self, inputs: torch.Tensor,
                text_lengths: torch.Tensor,
                vad_indexes: torch.Tensor,
                sub_masks: torch.Tensor,
                ):
        """Compute loss value from buffer sequences.
        from .export_meta import export_rebuild_model
        Args:
            input (torch.Tensor): Input ids. (batch, len)
            hidden (torch.Tensor): Target ids. (batch, len)
        """
        x = self.embed(inputs)
        # mask = self._target_mask(input)
        h, _ = self.encoder(x, text_lengths, vad_indexes, sub_masks)
        y = self.decoder(h)
        return y
    def export_dummy_inputs(self):
        length = 120
        text_indexes = torch.randint(0, self.embed.num_embeddings, (1, length)).type(torch.int32)
        text_lengths = torch.tensor([length], dtype=torch.int32)
        vad_mask = torch.ones(length, length, dtype=torch.float32)[None, None, :, :]
        sub_masks = torch.ones(length, length, dtype=torch.float32)
        sub_masks = torch.tril(sub_masks).type(torch.float32)
        return (text_indexes, text_lengths, vad_mask, sub_masks[None, None, :, :])
    def export_input_names(self):
        return ['inputs', 'text_lengths', 'vad_masks', 'sub_masks']
    def export_output_names(self):
        return ['logits']
    def export_dynamic_axes(self):
        return {
            'inputs': {
                1: 'feats_length'
            },
            'vad_masks': {
                2: 'feats_length1',
                3: 'feats_length2'
            },
            'sub_masks': {
                2: 'feats_length1',
                3: 'feats_length2'
            },
            'logits': {
                1: 'logits_length'
            },
        }
    def export_name(self):
        return "model.onnx"
        models = export_rebuild_model(model=self, **kwargs)
        return models