    else:
        timestamp_list[-1][1] = num_frames * TIME_RATE
    assert len(new_char_list) == len(timestamp_list)
    # Human-readable result: "token begin end;" with times in seconds.
    res_str = ""
    for char, timestamp in zip(new_char_list, timestamp_list):
        res_str += "{} {} {};".format(char, str(timestamp[0] + 0.0005)[:5], str(timestamp[1] + 0.0005)[:5])
    # Machine-readable result: [begin_ms, end_ms] pairs, silence tokens dropped.
    res = []
    for char, timestamp in zip(char_list, timestamp_list):
        if char != '<sil>':
            res.append([int(timestamp[0] * 1000), int(timestamp[1] * 1000)])
    return res_str, res
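# Sketch of the two return values (token names and times are made up, not
# produced by this file): for two tokens spanning 0-0.48s and 0.48-1.02s,
#   res_str == "hello 0.000 0.480;world 0.480 1.020;"
#   res     == [[0, 480], [480, 1020]]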


class SpeechText2Timestamp:
    def __init__(
        self,
        timestamp_infer_config: Union[Path, str] = None,
        timestamp_model_file: Union[Path, str] = None,
        timestamp_cmvn_file: Union[Path, str] = None,
        device: str = "cpu",
        dtype: str = "float32",
        **kwargs,
    ):
        assert check_argument_types()
        # 1. Build ASR model
        tp_model, tp_train_args = ASRTask.build_model_from_file(
            timestamp_infer_config, timestamp_model_file, device
        )
        if 'cuda' in device:
            tp_model = tp_model.cuda()

        frontend = None
        if tp_train_args.frontend is not None:
            frontend = WavFrontend(cmvn_file=timestamp_cmvn_file, **tp_train_args.frontend_conf)

        logging.info("tp_model: {}".format(tp_model))
        logging.info("tp_train_args: {}".format(tp_train_args))

        # Input as audio signal
        if isinstance(speech, np.ndarray):
            speech = torch.tensor(speech)

        if self.frontend is not None:
            feats, feats_len = self.frontend.forward(speech, speech_lengths)
            feats = to_device(feats, device=self.device)
            feats_len = feats_len.int()
            # Features are precomputed here, so disable the model's own frontend.
            self.tp_model.frontend = None
        else:
            feats = speech
            feats_len = speech_lengths
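
# Usage sketch (paths below are placeholders, not from this file):
#   stt = SpeechText2Timestamp(
#       timestamp_infer_config="exp/tp/config.yaml",
#       timestamp_model_file="exp/tp/model.pb",
#       timestamp_cmvn_file="exp/tp/am.mvn",
#       device="cpu",
#   )
# The instance is then called once per utterance with the speech (and its
# lengths), following the frontend branch above.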

    ngpu: int,
    log_level: Union[int, str],
    data_path_and_name_and_type,
    timestamp_infer_config: Optional[str],
    timestamp_model_file: Optional[str],
    timestamp_cmvn_file: Optional[str] = None,
    raw_inputs: Union[np.ndarray, torch.Tensor] = None,
    key_file: Optional[str] = None,
    allow_variable_data_keys: bool = False,

        batch_size=batch_size,
        ngpu=ngpu,
        log_level=log_level,
        timestamp_infer_config=timestamp_infer_config,
        timestamp_model_file=timestamp_model_file,
        timestamp_cmvn_file=timestamp_cmvn_file,
        key_file=key_file,
        allow_variable_data_keys=allow_variable_data_keys,
        output_dir=output_dir,

    ngpu: int,
    log_level: Union[int, str],
    # data_path_and_name_and_type,
    timestamp_infer_config: Optional[str],
    timestamp_model_file: Optional[str],
    timestamp_cmvn_file: Optional[str] = None,
    # raw_inputs: Union[np.ndarray, torch.Tensor] = None,
    key_file: Optional[str] = None,
    allow_variable_data_keys: bool = False,

        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build speechtext2timestamp
    speechtext2timestamp_kwargs = dict(
        timestamp_infer_config=timestamp_infer_config,
        timestamp_model_file=timestamp_model_file,
        timestamp_cmvn_file=timestamp_cmvn_file,
        device=device,
        dtype=dtype,
    )
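
    # These kwargs are expanded into the helper class defined above; the actual
    # construction happens outside this excerpt, e.g. (sketch):
    #   speechtext2timestamp = SpeechText2Timestamp(**speechtext2timestamp_kwargs)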

        raw_inputs: Union[np.ndarray, torch.Tensor] = None,
        output_dir_v2: Optional[str] = None,
        fs: dict = None,
        param_dict: dict = None,
        **kwargs
    ):
        # 3. Build data-iterator
        if data_path_and_name_and_type is None and raw_inputs is not None:

        for batch_id in range(_bs):
            key = keys[batch_id]
            token = speechtext2timestamp.converter.ids2tokens(batch['text'][batch_id])
            ts_str, ts_list = time_stamp_lfr6_advance(us_alphas[batch_id], us_cif_peak[batch_id], token)
            logging.warning(ts_str)
            tp_result_list.append({'text': "".join([i for i in token if i != '<sil>']), 'timestamp': ts_list})
        return tp_result_list

    return _forward
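
# Each element of tp_result_list pairs the silence-stripped text with its
# [begin_ms, end_ms] spans (illustrative values, not real output):
#   {'text': 'hello world', 'timestamp': [[0, 480], [480, 1020]]}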


    group = parser.add_argument_group("The model configuration related")
    group.add_argument(
        "--timestamp_infer_config",
        type=str,
        help="Timestamp prediction infer configuration",
    )
    group.add_argument(
        "--timestamp_model_file",
        type=str,
        help="Timestamp prediction model parameter file",
    )
    group.add_argument(
        "--timestamp_cmvn_file",
        type=str,
        help="Global CMVN file",
    )
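
# Example invocation (script name and paths are assumptions, not from this file):
#   python tp_inference.py \
#       --timestamp_infer_config exp/tp/config.yaml \
#       --timestamp_model_file exp/tp/model.pb \
#       --timestamp_cmvn_file exp/tp/am.mvn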