维石
2024-06-17 9377eed41e5ed37cede49c7c8dc8225050b884f6
update code
17个文件已修改
43 ■■■■■ 已修改文件
funasr/datasets/llm_datasets/datasets.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/datasets/llm_datasets_qwenaudio/datasets.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/datasets/llm_datasets_vicuna/datasets.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/datasets/sense_voice_datasets/datasets.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/frontends/default.py 1 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/bicif_paraformer/cif_predictor.py 2 ●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/contextual_paraformer/decoder.py 1 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/lcbnet/model.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/llm_asr/model.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/llm_asr_nar/model.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/mfcca/mfcca_encoder.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/paraformer_streaming/model.py 4 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/sense_voice/model.py 8 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/transformer/model.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/whisper/model.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/whisper_lid/model.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
runtime/python/onnxruntime/funasr_onnx/paraformer_bin.py 5 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/datasets/llm_datasets/datasets.py
@@ -64,8 +64,6 @@
    def __getitem__(self, index):
        item = self.index_ds[index]
        # import pdb;
        # pdb.set_trace()
        source = item["source"]
        data_src = load_audio_text_image_video(source, fs=self.fs)
        if self.preprocessor_speech:
funasr/datasets/llm_datasets_qwenaudio/datasets.py
@@ -66,8 +66,6 @@
    def __getitem__(self, index):
        item = self.index_ds[index]
        # import pdb;
        # pdb.set_trace()
        source = item["source"]
        data_src = load_audio_text_image_video(source, fs=self.fs)
        if self.preprocessor_speech:
funasr/datasets/llm_datasets_vicuna/datasets.py
@@ -66,8 +66,6 @@
    def __getitem__(self, index):
        item = self.index_ds[index]
        # import pdb;
        # pdb.set_trace()
        source = item["source"]
        data_src = load_audio_text_image_video(source, fs=self.fs)
        if self.preprocessor_speech:
funasr/datasets/sense_voice_datasets/datasets.py
@@ -71,8 +71,6 @@
        return len(self.index_ds)
    def __getitem__(self, index):
        # import pdb;
        # pdb.set_trace()
        output = None
        for idx in range(self.retry):
funasr/frontends/default.py
@@ -235,7 +235,6 @@
        self, input: torch.Tensor, input_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # 1. Domain-conversion: e.g. Stft: time -> time-freq
        # import pdb;pdb.set_trace()
        if self.stft is not None:
            input_stft, feats_lens = self._compute_stft(input, input_lengths)
        else:
funasr/models/bicif_paraformer/cif_predictor.py
@@ -198,7 +198,7 @@
            output2 = self.upsample_cnn(_output)
            output2 = output2.transpose(1, 2)
            output2, _ = self.self_attn(output2, mask)
        # import pdb; pdb.set_trace()
        alphas2 = torch.sigmoid(self.cif_output2(output2))
        alphas2 = torch.nn.functional.relu(alphas2 * self.smooth_factor2 - self.noise_threshold2)
        # repeat the mask in T dimension to match the upsampled length
funasr/models/contextual_paraformer/decoder.py
@@ -424,7 +424,6 @@
        # contextual_mask = myutils.sequence_mask(contextual_length, device=memory.device)[:, None, :]
        contextual_mask = self.make_pad_mask(contextual_length)
        contextual_mask, _ = self.prepare_mask(contextual_mask)
        # import pdb; pdb.set_trace()
        contextual_mask = contextual_mask.transpose(2, 1).unsqueeze(1)
        cx, tgt_mask, _, _, _ = self.bias_decoder(
            x_self_attn, tgt_mask, bias_embed, memory_mask=contextual_mask
funasr/models/lcbnet/model.py
@@ -23,8 +23,6 @@
from funasr.utils.datadir_writer import DatadirWriter
from funasr.register import tables
import pdb
@tables.register("model_classes", "LCBNet")
class LCBNet(nn.Module):
funasr/models/llm_asr/model.py
@@ -164,8 +164,6 @@
                text: (Batch, Length)
                text_lengths: (Batch,)
        """
        # import pdb;
        # pdb.set_trace()
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
funasr/models/llm_asr_nar/model.py
@@ -166,8 +166,6 @@
                text: (Batch, Length)
                text_lengths: (Batch,)
        """
        # import pdb;
        # pdb.set_trace()
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
funasr/models/mfcca/mfcca_encoder.py
@@ -34,7 +34,6 @@
from funasr.models.transformer.utils.subsampling import TooShortUttError
from funasr.models.transformer.utils.subsampling import check_short_utt
from funasr.models.encoder.abs_encoder import AbsEncoder
import pdb
import math
@@ -363,7 +362,6 @@
        t_leng = xs_pad.size(1)
        d_dim = xs_pad.size(2)
        xs_pad = xs_pad.reshape(-1, channel_size, t_leng, d_dim)
        # pdb.set_trace()
        if channel_size < 8:
            repeat_num = math.ceil(8 / channel_size)
            xs_pad = xs_pad.repeat(1, repeat_num, 1, 1)[:, 0:8, :, :]
funasr/models/paraformer_streaming/model.py
@@ -50,8 +50,6 @@
        super().__init__(*args, **kwargs)
        # import pdb;
        # pdb.set_trace()
        self.sampling_ratio = kwargs.get("sampling_ratio", 0.2)
        self.scama_mask = None
@@ -83,8 +81,6 @@
                text: (Batch, Length)
                text_lengths: (Batch,)
        """
        # import pdb;
        # pdb.set_trace()
        decoding_ind = kwargs.get("decoding_ind")
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
funasr/models/sense_voice/model.py
@@ -73,8 +73,6 @@
    ):
        target_mask = kwargs.get("target_mask", None)
        # import pdb;
        # pdb.set_trace()
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
@@ -303,8 +301,6 @@
    ):
        target_mask = kwargs.get("target_mask", None)
        # import pdb;
        # pdb.set_trace()
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
@@ -648,8 +644,6 @@
    ):
        target_mask = kwargs.get("target_mask", None)
        # import pdb;
        # pdb.set_trace()
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
@@ -1052,8 +1046,6 @@
    ):
        target_mask = kwargs.get("target_mask", None)
        # import pdb;
        # pdb.set_trace()
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
funasr/models/transformer/model.py
@@ -145,8 +145,6 @@
                text: (Batch, Length)
                text_lengths: (Batch,)
        """
        # import pdb;
        # pdb.set_trace()
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
funasr/models/whisper/model.py
@@ -114,8 +114,6 @@
        
        result = whisper.decode(self.model, speech, language='english')
        # result = whisper.transcribe(self.model, speech)
        import pdb; pdb.set_trace()
        
        results = []
        result_i = {"key": key[0], "text": result.text}
funasr/models/whisper_lid/model.py
@@ -140,8 +140,6 @@
                text: (Batch, Length)
                text_lengths: (Batch,)
        """
        # import pdb;
        # pdb.set_trace()
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
runtime/python/onnxruntime/funasr_onnx/paraformer_bin.py
@@ -328,7 +328,6 @@
    ) -> List:
        # make hotword list
        hotwords, hotwords_length = self.proc_hotword(hotwords)
        # import pdb; pdb.set_trace()
        [bias_embed] = self.eb_infer(hotwords, hotwords_length)
        # index from bias_embed
        bias_embed = bias_embed.transpose(1, 0, 2)
@@ -376,10 +375,10 @@
            return np.array(hotwords)
        hotword_int = [word_map(i) for i in hotwords]
        # import pdb; pdb.set_trace()
        hotword_int.append(np.array([1]))
        hotwords = pad_list(hotword_int, pad_value=0, max_len=10)
        # import pdb; pdb.set_trace()
        return hotwords, hotwords_length
    def bb_infer(