from funasr.utils.types import str2triple_str
from funasr.utils.types import str_or_none
from funasr.utils.vad_utils import slice_padding_fbank

from tqdm import tqdm

def inference_asr(
    maxlenratio: float,

    else:
        text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1]
    item = {'key': key, 'value': text_postprocessed}
    if timestamp_postprocessed != "":
        item['timestamp'] = timestamp_postprocessed
    asr_result_list.append(item)
    finish_count += 1

    batch_size_token_ms_cum = 0
    beg_idx = 0
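    # Segment durations (sorted_data[j][0][1] - sorted_data[j][0][0]) are accumulated
    # below until the batch budget batch_size_token_ms is reached; a following segment
    # longer than batch_size_token_threshold_s also closes the current batch early.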
    beg_asr_total = time.time()
    for j, _ in enumerate(tqdm(range(0, n))):
        batch_size_token_ms_cum += (sorted_data[j][0][1] - sorted_data[j][0][0])
        if j < n - 1 and (batch_size_token_ms_cum + sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0]) < batch_size_token_ms and (sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0]) < batch_size_token_threshold_s:
            continue

        beg_idx = end_idx
        batch = {"speech": speech_j, "speech_lengths": speech_lengths_j}
        batch = to_device(batch, device=device)
        # print("batch: ", speech_j.shape[0])
        beg_asr = time.time()
        results = speech2text(**batch)
        end_asr = time.time()
        # print("time cost asr: ", end_asr - beg_asr)

        if len(results) < 1:
            results = [["", [], [], [], [], [], []]]
        results_sorted.extend(results)

    end_asr_total = time.time()
    print("total time cost asr: ", end_asr_total - beg_asr_total)
    # restore batched results to the original utterance order
    restored_data = [0] * n
    for j in range(n):
        index = sorted_data[j][1]

    data = yaml.load(f, Loader=yaml.Loader)
    return data

def _prepare_cache(cache: dict = {}, chunk_size=[5, 10, 5], encoder_chunk_look_back=0,
                   decoder_chunk_look_back=0, batch_size=1):
    if len(cache) > 0:
        return cache
    config = _read_yaml(asr_train_config)
    enc_output_size = config["encoder_conf"]["output_size"]
    feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
    cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
                "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size,
                "encoder_chunk_look_back": encoder_chunk_look_back, "last_chunk": False, "opt": None,
                "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False}
    cache["encoder"] = cache_en

    cache_de = {"decode_fsmn": None, "decoder_chunk_look_back": decoder_chunk_look_back, "opt": None, "chunk_size": chunk_size}
    cache["decoder"] = cache_de

    return cache
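# Illustrative sketch (not part of the original file): what an initialised cache
# looks like; the look-back values below are example settings, not defaults.
#
#   cache = _prepare_cache({}, chunk_size=[5, 10, 5],
#                          encoder_chunk_look_back=4, decoder_chunk_look_back=1)
#   cache["encoder"]["feats"]          # zeros, shape (1, chunk_size[0] + chunk_size[2], feats_dims)
#   cache["decoder"]["decode_fsmn"]    # None at initialisation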

def _cache_reset(cache: dict = {}, chunk_size=[5, 10, 5], encoder_chunk_look_back=0,
                 decoder_chunk_look_back=0, batch_size=1):
    if len(cache) > 0:
        config = _read_yaml(asr_train_config)
        enc_output_size = config["encoder_conf"]["output_size"]
        feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
        cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
                    "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size,
                    "encoder_chunk_look_back": encoder_chunk_look_back, "last_chunk": False, "opt": None,
                    "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False}
        cache["encoder"] = cache_en

        cache_de = {"decode_fsmn": None, "decoder_chunk_look_back": decoder_chunk_look_back, "opt": None, "chunk_size": chunk_size}
        cache["decoder"] = cache_de

    return cache
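# Note: _prepare_cache() leaves an existing cache untouched (early return), while
# _cache_reset() rebuilds the encoder/decoder state whenever a cache is present;
# _forward() below calls it once is_final is reached so the next utterance starts
# from a clean state.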
def _forward(
        data_path_and_name_and_type,

    is_final = False
    cache = {}
    chunk_size = [5, 10, 5]
    encoder_chunk_look_back = 0
    decoder_chunk_look_back = 0
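    # param_dict may override these defaults: "cache" (streaming state carried
    # across calls), "is_final", "chunk_size", "encoder_chunk_look_back" and
    # "decoder_chunk_look_back" are the keys recognised below.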
    if param_dict is not None and "cache" in param_dict:
        cache = param_dict["cache"]
    if param_dict is not None and "is_final" in param_dict:
        is_final = param_dict["is_final"]
    if param_dict is not None and "chunk_size" in param_dict:
        chunk_size = param_dict["chunk_size"]
    if param_dict is not None and "encoder_chunk_look_back" in param_dict:
        encoder_chunk_look_back = param_dict["encoder_chunk_look_back"]
        if encoder_chunk_look_back > 0:
            # attention look-back supplies the left context, so the cached left
            # feature chunk is dropped
            chunk_size[0] = 0
    if param_dict is not None and "decoder_chunk_look_back" in param_dict:
        decoder_chunk_look_back = param_dict["decoder_chunk_look_back"]

    # 7. Start for-loop
    # FIXME(kamo): The output format should be discussed
    raw_inputs = torch.unsqueeze(raw_inputs, dim=0)
    asr_result_list = []
    cache = _prepare_cache(cache, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back,
                           decoder_chunk_look_back=decoder_chunk_look_back, batch_size=1)
    item = {}
    if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "sound":
        sample_offset = 0
        speech_length = raw_inputs.shape[1]
        stride_size = chunk_size[1] * 960
        cache = _prepare_cache(cache, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back,
                               decoder_chunk_look_back=decoder_chunk_look_back, batch_size=1)
        final_result = ""
        for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)):
            if sample_offset + stride_size >= speech_length - 1:

    asr_result_list.append(item)
    if is_final:
        cache = _cache_reset(cache, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back,
                             decoder_chunk_look_back=decoder_chunk_look_back, batch_size=1)
    return asr_result_list

return _forward  # returned from the enclosing inference launcher (definition elided above)
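# Illustrative sketch (not part of the original file): how the returned callable is
# typically driven chunk by chunk. The name "forward_fn", the waveform "wav", and the
# keyword-style call are assumptions for illustration; only the param_dict keys and
# the stride computation mirror the code above.
#
#   param_dict = {"cache": {}, "chunk_size": [5, 10, 5],
#                 "encoder_chunk_look_back": 4, "decoder_chunk_look_back": 1}
#   stride = param_dict["chunk_size"][1] * 960   # same stride as the sound branch above
#   for offset in range(0, len(wav), stride):
#       param_dict["is_final"] = offset + stride >= len(wav)
#       results = forward_fn(..., raw_inputs=wav[offset:offset + stride],
#                            param_dict=param_dict)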

    quantize_modules: Optional[List[str]] = None,
    quantize_dtype: Optional[str] = "float16",
    streaming: Optional[bool] = False,
    simu_streaming: Optional[bool] = False,
    fake_streaming: Optional[bool] = False,
    full_utt: Optional[bool] = False,
    chunk_size: Optional[int] = 16,
    left_context: Optional[int] = 16,

        quantize_modules=quantize_modules,
        quantize_dtype=quantize_dtype,
        streaming=streaming,
        simu_streaming=simu_streaming,
        fake_streaming=fake_streaming,
        full_utt=full_utt,
        chunk_size=chunk_size,
        left_context=left_context,

            final_hyps = speech2text.streaming_decode(
                speech[_end: len(speech)], is_final=True
            )
        elif speech2text.simu_streaming:
            final_hyps = speech2text.simu_streaming_decode(**batch)
        elif speech2text.fake_streaming:
            final_hyps = speech2text.fake_streaming_decode(**batch)
        elif speech2text.full_utt:
            final_hyps = speech2text.full_utt_decode(**batch)
        else:

    group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
    group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
    group.add_argument("--streaming", type=str2bool, default=False)
    group.add_argument("--simu_streaming", type=str2bool, default=False)
    group.add_argument("--fake_streaming", type=str2bool, default=False)
    group.add_argument("--full_utt", type=str2bool, default=False)
    group.add_argument("--chunk_size", type=int, default=16)
    group.add_argument("--left_context", type=int, default=16)