| | |
| | | import string |
| | | from funasr.register import tables |
| | | |
| | | from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank |
| | | from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank |
| | | from funasr.utils.vad_utils import slice_padding_audio_samples |
| | | from funasr.utils.timestamp_tools import time_stamp_sentence |
| | | from funasr.download.file import download_from_url |
| | | |
| | | def build_iter_for_infer(data_in, input_len=None, data_type=None, key=None): |
| | | def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None): |
| | | """ |
| | | |
| | | :param input: |
| | |
| | | filelist = [".scp", ".txt", ".json", ".jsonl"] |
| | | |
| | | chars = string.ascii_letters + string.digits |
| | | |
| | | if isinstance(data_in, str) and data_in.startswith('http'): # url |
| | | data_in = download_from_url(data_in) |
| | | if isinstance(data_in, str) and os.path.exists(data_in): # wav_path; filelist: wav.scp, file.jsonl;text.txt; |
| | | _, file_extension = os.path.splitext(data_in) |
| | | file_extension = file_extension.lower() |
| | |
| | | data_list = [data_in] |
| | | key_list = [key] |
| | | elif isinstance(data_in, (list, tuple)): |
| | | if data_type is not None and isinstance(data_type, (list, tuple)): |
| | | if data_type is not None and isinstance(data_type, (list, tuple)): # multiple inputs |
| | | data_list_tmp = [] |
| | | for data_in_i, data_type_i in zip(data_in, data_type): |
| | | key_list, data_list_i = build_iter_for_infer(data_in=data_in_i, data_type=data_type_i) |
| | | key_list, data_list_i = prepare_data_iterator(data_in=data_in_i, data_type=data_type_i) |
| | | data_list_tmp.append(data_list_i) |
| | | data_list = [] |
| | | for item in zip(*data_list_tmp): |
| | | data_list.append(item) |
| | | else: |
| | | # [audio sample point, fbank] |
| | | # [audio sample point, fbank, text] |
| | | data_list = data_in |
| | | key_list = ["rand_key_" + ''.join(random.choice(chars) for _ in range(13)) for _ in range(len(data_in))] |
| | | else: # raw text; audio sample point, fbank; bytes |
| | |
| | | kwargs = self.kwargs if kwargs is None else kwargs |
| | | kwargs.update(cfg) |
| | | model = self.model if model is None else model |
| | | |
| | | data_type = kwargs.get("data_type", "sound") |
| | | |
| | | batch_size = kwargs.get("batch_size", 1) |
| | | # if kwargs.get("device", "cpu") == "cpu": |
| | | # batch_size = 1 |
| | | |
| | | key_list, data_list = build_iter_for_infer(input, input_len=input_len, data_type=data_type, key=key) |
| | | key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key) |
| | | |
| | | speed_stats = {} |
| | | asr_result_list = [] |
| | |
| | | batch["data_lengths"] = input_len |
| | | |
| | | time1 = time.perf_counter() |
| | | results, meta_data = model.generate(**batch, **kwargs) |
| | | with torch.no_grad(): |
| | | results, meta_data = model.generate(**batch, **kwargs) |
| | | time2 = time.perf_counter() |
| | | |
| | | asr_result_list.extend(results) |
| | |
| | | batch_size = int(kwargs.get("batch_size_s", 300))*1000 |
| | | batch_size_threshold_ms = int(kwargs.get("batch_size_threshold_s", 60))*1000 |
| | | kwargs["batch_size"] = batch_size |
| | | data_type = kwargs.get("data_type", "sound") |
| | | key_list, data_list = build_iter_for_infer(input, input_len=input_len, data_type=data_type) |
| | | |
| | | key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=kwargs.get("data_type", None)) |
| | | results_ret_list = [] |
| | | time_speech_total_all_samples = 0.0 |
| | | |
| | |
| | | key = res[i]["key"] |
| | | vadsegments = res[i]["value"] |
| | | input_i = data_list[i] |
| | | speech = load_audio_and_text_image_video(input_i, fs=kwargs["frontend"].fs, audio_fs=kwargs.get("fs", 16000)) |
| | | speech = load_audio_text_image_video(input_i, fs=kwargs["frontend"].fs, audio_fs=kwargs.get("fs", 16000)) |
| | | speech_lengths = len(speech) |
| | | n = len(vadsegments) |
| | | data_with_index = [(vadsegments[i], i) for i in range(n)] |
| | |
| | | kwargs.update(cfg) |
| | | |
| | | |
| | | key_list, data_list = build_iter_for_infer(input, input_len=input_len) |
| | | key_list, data_list = prepare_data_iterator(input, input_len=input_len) |
| | | batch_size = kwargs.get("batch_size", 1) |
| | | device = kwargs.get("device", "cpu") |
| | | if device == "cpu": |
| | |
| | | |
| | | # extract fbank feats |
| | | time1 = time.perf_counter() |
| | | audio_sample_list = load_audio_and_text_image_video(data_batch, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000)) |
| | | audio_sample_list = load_audio_text_image_video(data_batch, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000)) |
| | | time2 = time.perf_counter() |
| | | meta_data["load_data"] = f"{time2 - time1:0.3f}" |
| | | speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), |