| | |
| | | timestamp_infer_config: Union[Path, str] = None, |
| | | timestamp_model_file: Union[Path, str] = None, |
| | | param_dict: dict = None, |
| | | decoding_ind: int = 0, |
| | | **kwargs, |
| | | ): |
| | | ncpu = kwargs.get("ncpu", 1) |
| | |
| | | nbest=nbest, |
| | | hotword_list_or_file=hotword_list_or_file, |
| | | clas_scale=clas_scale, |
| | | decoding_ind=decoding_ind, |
| | | ) |
| | | |
| | | speech2text = Speech2TextParaformer(**speech2text_kwargs) |
| | |
| | | **kwargs, |
| | | ): |
| | | |
| | | decoding_ind = None |
| | | hotword_list_or_file = None |
| | | if param_dict is not None: |
| | | hotword_list_or_file = param_dict.get('hotword') |
| | |
| | | hotword_list_or_file = kwargs['hotword'] |
| | | if hotword_list_or_file is not None or 'hotword' in kwargs: |
| | | speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file) |
| | | if param_dict is not None and "decoding_ind" in param_dict: |
| | | decoding_ind = param_dict["decoding_ind"] |
| | | |
| | | # 3. Build data-iterator |
| | | if data_path_and_name_and_type is None and raw_inputs is not None: |
| | |
| | | # N-best list of (text, token, token_int, hyp_object) |
| | | |
| | | time_beg = time.time() |
| | | batch["decoding_ind"] = decoding_ind |
| | | results = speech2text(**batch) |
| | | if len(results) < 1: |
| | | hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | |
| | | else: |
| | | text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1] |
| | | item = {'key': key, 'value': text_postprocessed} |
| | | if timestamp_postprocessed != "" or len(timestamp) == 0: |
| | | if timestamp_postprocessed != "": |
| | | item['timestamp'] = timestamp_postprocessed |
| | | asr_result_list.append(item) |
| | | finish_count += 1 |
| | |
| | | item = {'key': key, 'value': text_postprocessed_punc} |
| | | if text_postprocessed != "": |
| | | item['text_postprocessed'] = text_postprocessed |
| | | if time_stamp_postprocessed != "" or len(time_stamp) == 0: |
| | | if time_stamp_postprocessed != "": |
| | | item['time_stamp'] = time_stamp_postprocessed |
| | | |
| | | item['sentences'] = time_stamp_sentence(punc_id_list, time_stamp_postprocessed, text_postprocessed) |
| | |
| | | data = yaml.load(f, Loader=yaml.Loader) |
| | | return data |
| | | |
def _prepare_cache(cache: dict = None, chunk_size=None, encoder_chunk_look_back=0,
                   decoder_chunk_look_back=0, batch_size=1):
    """Return the streaming cache, building the initial state if it is empty.

    A non-empty ``cache`` is returned untouched. Otherwise the training
    config is read through ``_read_yaml(asr_train_config)`` and two
    sub-dicts are stored into ``cache`` under ``"encoder"`` and
    ``"decoder"``.

    Args:
        cache: Cache dict to fill in place. ``None`` (the default) creates a
            new dict on every call, so state is never shared between calls
            through the default argument.
        chunk_size: Three-element sequence; elements 0 and 2 size the second
            dimension of the zero-initialised ``"feats"`` tensor. Defaults to
            a fresh ``[5, 10, 5]``.
        encoder_chunk_look_back: Stored verbatim in the encoder cache.
        decoder_chunk_look_back: Stored verbatim in the decoder cache.
        batch_size: Leading dimension of every zero tensor created here.

    Returns:
        The (possibly newly populated) cache dict.
    """
    # Defaults are created per call: the original `{}` / `[5, 10, 5]`
    # defaults were shared objects, and `cache` is mutated below.
    if cache is None:
        cache = {}
    if chunk_size is None:
        chunk_size = [5, 10, 5]

    if len(cache) > 0:
        return cache

    config = _read_yaml(asr_train_config)
    enc_output_size = config["encoder_conf"]["output_size"]
    feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]

    cache["encoder"] = {
        "start_idx": 0,
        "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
        "cif_alphas": torch.zeros((batch_size, 1)),
        "chunk_size": chunk_size,
        "encoder_chunk_look_back": encoder_chunk_look_back,
        "last_chunk": False,
        "opt": None,
        "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)),
        "tail_chunk": False,
    }
    cache["decoder"] = {
        "decode_fsmn": None,
        "decoder_chunk_look_back": decoder_chunk_look_back,
        "opt": None,
    }
    return cache
| | | |
def _cache_reset(cache: dict = None, chunk_size=None, encoder_chunk_look_back=0,
                 decoder_chunk_look_back=0, batch_size=1):
    """Re-initialise an already populated streaming cache in place.

    When ``cache`` is non-empty, its ``"encoder"`` and ``"decoder"`` entries
    are replaced with freshly built initial state, using dimensions read via
    ``_read_yaml(asr_train_config)``. An empty cache is returned unchanged.

    Args:
        cache: Cache dict to reset. ``None`` (the default) is treated as a
            new empty dict, so no dict is shared through the default
            argument.
        chunk_size: Three-element sequence; elements 0 and 2 size the second
            dimension of the zero-initialised ``"feats"`` tensor. Defaults to
            a fresh ``[5, 10, 5]``.
        encoder_chunk_look_back: Stored verbatim in the encoder cache.
        decoder_chunk_look_back: Stored verbatim in the decoder cache.
        batch_size: Leading dimension of every zero tensor created here.

    Returns:
        The same cache dict that was passed in (or a new empty dict).
    """
    # Defaults are created per call instead of sharing one `{}` / list
    # object across every invocation.
    if cache is None:
        cache = {}
    if chunk_size is None:
        chunk_size = [5, 10, 5]

    # Nothing to reset for an empty cache; hand it back as-is.
    if len(cache) == 0:
        return cache

    config = _read_yaml(asr_train_config)
    enc_output_size = config["encoder_conf"]["output_size"]
    feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]

    cache["encoder"] = {
        "start_idx": 0,
        "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
        "cif_alphas": torch.zeros((batch_size, 1)),
        "chunk_size": chunk_size,
        "encoder_chunk_look_back": encoder_chunk_look_back,
        "last_chunk": False,
        "opt": None,
        "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)),
        "tail_chunk": False,
    }
    cache["decoder"] = {
        "decode_fsmn": None,
        "decoder_chunk_look_back": decoder_chunk_look_back,
        "opt": None,
    }
    return cache
| | | |
| | | #def _prepare_cache(cache: dict = {}, chunk_size=[5, 10, 5], batch_size=1): |
| | | # if len(cache) > 0: |
| | | # return cache |
| | | # config = _read_yaml(asr_train_config) |
| | | # enc_output_size = config["encoder_conf"]["output_size"] |
| | | # feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"] |
| | | # cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)), |
| | | # "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False, |
| | | # "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False} |
| | | # cache["encoder"] = cache_en |
| | | |
| | | # cache_de = {"decode_fsmn": None} |
| | | # cache["decoder"] = cache_de |
| | | |
| | | # return cache |
| | | |
| | | #def _cache_reset(cache: dict = {}, chunk_size=[5, 10, 5], batch_size=1): |
| | | # if len(cache) > 0: |
| | | # config = _read_yaml(asr_train_config) |
| | | # enc_output_size = config["encoder_conf"]["output_size"] |
| | | # feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"] |
| | | # cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)), |
| | | # "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False, |
| | | # "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), |
| | | # "tail_chunk": False} |
| | | # cache["encoder"] = cache_en |
| | | |
| | | # cache_de = {"decode_fsmn": None} |
| | | # cache["decoder"] = cache_de |
| | | |
| | | # return cache |
| | | |
| | | def _forward( |
| | | data_path_and_name_and_type, |
| | |
| | | is_final = False |
| | | cache = {} |
| | | chunk_size = [5, 10, 5] |
| | | encoder_chunk_look_back = 0 |
| | | decoder_chunk_look_back = 0 |
| | | if param_dict is not None and "cache" in param_dict: |
| | | cache = param_dict["cache"] |
| | | if param_dict is not None and "is_final" in param_dict: |
| | | is_final = param_dict["is_final"] |
| | | if param_dict is not None and "chunk_size" in param_dict: |
| | | chunk_size = param_dict["chunk_size"] |
| | | if param_dict is not None and "encoder_chunk_look_back" in param_dict: |
| | | encoder_chunk_look_back = param_dict["encoder_chunk_look_back"] |
| | | if encoder_chunk_look_back > 0: |
| | | chunk_size[0] = 0 |
| | | if param_dict is not None and "decoder_chunk_look_back" in param_dict: |
| | | decoder_chunk_look_back = param_dict["decoder_chunk_look_back"] |
| | | |
| | | # 7. Start for-loop |
| | | # FIXME(kamo): The output format still needs to be discussed |
| | | raw_inputs = torch.unsqueeze(raw_inputs, axis=0) |
| | | asr_result_list = [] |
| | | cache = _prepare_cache(cache, chunk_size=chunk_size, batch_size=1) |
| | | cache = _prepare_cache(cache, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, |
| | | decoder_chunk_look_back=decoder_chunk_look_back, batch_size=1) |
| | | item = {} |
| | | if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "sound": |
| | | sample_offset = 0 |
| | | speech_length = raw_inputs.shape[1] |
| | | stride_size = chunk_size[1] * 960 |
| | | cache = _prepare_cache(cache, chunk_size=chunk_size, batch_size=1) |
| | | cache = _prepare_cache(cache, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, |
| | | decoder_chunk_look_back=decoder_chunk_look_back, batch_size=1) |
| | | final_result = "" |
| | | for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)): |
| | | if sample_offset + stride_size >= speech_length - 1: |
| | |
| | | |
| | | asr_result_list.append(item) |
| | | if is_final: |
| | | cache = _cache_reset(cache, chunk_size=chunk_size, batch_size=1) |
| | | cache = _cache_reset(cache, chunk_size=chunk_size, batch_size=1, |
| | | encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back) |
| | | return asr_result_list |
| | | |
| | | return _forward |