| | |
| | | from funasr.models.frontend.wav_frontend import WavFrontend |
| | | from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer |
| | | from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export |
| | | np.set_printoptions(threshold=np.inf) |
| | | |
| | | class Speech2Text: |
| | | """Speech2Text class |
| | |
| | | # Input as audio signal |
| | | if isinstance(speech, np.ndarray): |
| | | speech = torch.tensor(speech) |
| | | |
| | | if self.frontend is not None: |
| | | feats, feats_len = self.frontend.forward(speech, speech_lengths) |
| | | feats = to_device(feats, device=self.device) |
| | |
| | | feats = speech |
| | | feats_len = speech_lengths |
| | | lfr_factor = max(1, (feats.size()[-1] // 80) - 1) |
| | | feats_len = cache["encoder"]["stride"] + cache["encoder"]["pad_left"] + cache["encoder"]["pad_right"] |
| | | feats = feats[:,cache["encoder"]["start_idx"]:cache["encoder"]["start_idx"]+feats_len,:] |
| | | feats_len = torch.tensor([feats_len]) |
| | | batch = {"speech": feats, "speech_lengths": feats_len, "cache": cache} |
| | | |
| | | # a. To device |
| | | batch = to_device(batch, device=self.device) |
| | | |
| | | # b. Forward Encoder |
| | | enc, enc_len = self.asr_model.encode_chunk(**batch) |
| | | enc, enc_len = self.asr_model.encode_chunk(feats, feats_len, cache) |
| | | if isinstance(enc, tuple): |
| | | enc = enc[0] |
| | | # assert len(enc) == 1, len(enc) |
| | |
| | | if data_path_and_name_and_type is None and raw_inputs is not None: |
| | | if isinstance(raw_inputs, np.ndarray): |
| | | raw_inputs = torch.tensor(raw_inputs) |
| | | |
| | | is_final = False |
| | | if param_dict is not None and "cache" in param_dict: |
| | | cache = param_dict["cache"] |
| | |
| | | asr_result = "" |
| | | wait = True |
| | | if len(cache) == 0: |
| | | cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None} |
| | | cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None, "is_final": is_final, "left": 0, "right": 0} |
| | | cache_de = {"decode_fsmn": None} |
| | | cache["decoder"] = cache_de |
| | | cache["first_chunk"] = True |
| | | cache["speech"] = [] |
| | | cache["chunk_index"] = 0 |
| | | cache["speech_chunk"] = [] |
| | | cache["accum_speech"] = 0 |
| | | |
| | | if raw_inputs is not None: |
| | | if len(cache["speech"]) == 0: |
| | | cache["speech"] = raw_inputs |
| | | else: |
| | | cache["speech"] = torch.cat([cache["speech"], raw_inputs], dim=0) |
| | | if len(cache["speech_chunk"]) == 0: |
| | | cache["speech_chunk"] = raw_inputs |
| | | else: |
| | | cache["speech_chunk"] = torch.cat([cache["speech_chunk"], raw_inputs], dim=0) |
| | | while len(cache["speech_chunk"]) >= 960: |
| | | cache["accum_speech"] += len(raw_inputs) |
| | | while cache["accum_speech"] >= 960: |
| | | if cache["first_chunk"]: |
| | | if len(cache["speech_chunk"]) >= 14400: |
| | | speech = torch.unsqueeze(cache["speech_chunk"][0:14400], axis=0) |
| | | speech_length = torch.tensor([14400]) |
| | | if cache["accum_speech"] >= 14400: |
| | | speech = torch.unsqueeze(cache["speech"], axis=0) |
| | | speech_length = torch.tensor([len(cache["speech"])]) |
| | | cache["encoder"]["pad_left"] = 5 |
| | | cache["encoder"]["pad_right"] = 5 |
| | | cache["encoder"]["stride"] = 10 |
| | | cache["encoder"]["left"] = 5 |
| | | cache["encoder"]["right"] = 0 |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["speech_chunk"]= cache["speech_chunk"][4800:] |
| | | cache["accum_speech"] -= 4800 |
| | | cache["first_chunk"] = False |
| | | cache["encoder"]["start_idx"] = -5 |
| | | cache["encoder"]["is_final"] = False |
| | | wait = False |
| | | else: |
| | | if is_final: |
| | | cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960 |
| | | cache["encoder"]["stride"] = len(cache["speech"]) // 960 |
| | | cache["encoder"]["pad_left"] = 0 |
| | | cache["encoder"]["pad_right"] = 0 |
| | | speech = torch.unsqueeze(cache["speech_chunk"], axis=0) |
| | | speech_length = torch.tensor([len(cache["speech_chunk"])]) |
| | | speech = torch.unsqueeze(cache["speech"], axis=0) |
| | | speech_length = torch.tensor([len(cache["speech"])]) |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["speech_chunk"] = [] |
| | | cache["accum_speech"] = 0 |
| | | wait = False |
| | | else: |
| | | break |
| | | else: |
| | | if len(cache["speech_chunk"]) >= 19200: |
| | | if cache["accum_speech"] >= 19200: |
| | | cache["encoder"]["start_idx"] += 10 |
| | | cache["encoder"]["stride"] = 10 |
| | | cache["encoder"]["pad_left"] = 5 |
| | | speech = torch.unsqueeze(cache["speech_chunk"][:19200], axis=0) |
| | | speech_length = torch.tensor([19200]) |
| | | cache["encoder"]["pad_right"] = 5 |
| | | cache["encoder"]["left"] = 0 |
| | | cache["encoder"]["right"] = 0 |
| | | speech = torch.unsqueeze(cache["speech"], axis=0) |
| | | speech_length = torch.tensor([len(cache["speech"])]) |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["speech_chunk"] = cache["speech_chunk"][9600:] |
| | | cache["accum_speech"] -= 9600 |
| | | wait = False |
| | | else: |
| | | if is_final: |
| | | cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960 |
| | | cache["encoder"]["pad_right"] = 0 |
| | | speech = torch.unsqueeze(cache["speech_chunk"], axis=0) |
| | | speech_length = torch.tensor([len(cache["speech_chunk"])]) |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["speech_chunk"] = [] |
| | | wait = False |
| | | cache["encoder"]["is_final"] = True |
| | | if cache["accum_speech"] >= 14400: |
| | | cache["encoder"]["start_idx"] += 10 |
| | | cache["encoder"]["stride"] = 10 |
| | | cache["encoder"]["pad_left"] = 5 |
| | | cache["encoder"]["pad_right"] = 5 |
| | | cache["encoder"]["left"] = 0 |
| | | cache["encoder"]["right"] = cache["accum_speech"] // 960 - 15 |
| | | speech = torch.unsqueeze(cache["speech"], axis=0) |
| | | speech_length = torch.tensor([len(cache["speech"])]) |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["accum_speech"] -= 9600 |
| | | wait = False |
| | | else: |
| | | cache["encoder"]["start_idx"] += 10 |
| | | cache["encoder"]["stride"] = cache["accum_speech"] // 960 - 5 |
| | | cache["encoder"]["pad_left"] = 5 |
| | | cache["encoder"]["pad_right"] = 0 |
| | | cache["encoder"]["left"] = 0 |
| | | cache["encoder"]["right"] = 0 |
| | | speech = torch.unsqueeze(cache["speech"], axis=0) |
| | | speech_length = torch.tensor([len(cache["speech"])]) |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["accum_speech"] = 0 |
| | | wait = False |
| | | else: |
| | | break |
| | | |