    else:
        timestamp_list[-1][1] = num_frames * TIME_RATE
    assert len(new_char_list) == len(timestamp_list)
    # Human-readable result: "token begin end;" with times in seconds.
    res_str = ""
    for char, timestamp in zip(new_char_list, timestamp_list):
        res_str += "{} {} {};".format(char, str(timestamp[0] + 0.0005)[:5], str(timestamp[1] + 0.0005)[:5])
    # Machine-readable result: [begin_ms, end_ms] pairs, silence tokens dropped.
    res = []
    for char, timestamp in zip(char_list, timestamp_list):
        if char != '<sil>':
            res.append([int(timestamp[0] * 1000), int(timestamp[1] * 1000)])
    return res_str, res
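# Sketch of the two return values (token names and times are made up, not
# produced by this file): for two tokens spanning 0-0.48s and 0.48-1.02s,
#   res_str == "hello 0.000 0.480;world 0.480 1.020;"
#   res     == [[0, 480], [480, 1020]]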


class SpeechText2Timestamp:
    def __init__(
        self,
        timestamp_infer_config: Union[Path, str] = None,
        timestamp_model_file: Union[Path, str] = None,
        timestamp_cmvn_file: Union[Path, str] = None,
        device: str = "cpu",
        dtype: str = "float32",
        **kwargs,
    ):
        assert check_argument_types()
        # 1. Build ASR model
        tp_model, tp_train_args = ASRTask.build_model_from_file(
            timestamp_infer_config, timestamp_model_file, device
        )
        if 'cuda' in device:
            tp_model = tp_model.cuda()

        frontend = None
        if tp_train_args.frontend is not None:
            frontend = WavFrontend(cmvn_file=timestamp_cmvn_file, **tp_train_args.frontend_conf)

        logging.info("tp_model: {}".format(tp_model))
        logging.info("tp_train_args: {}".format(tp_train_args))

        # Input as audio signal
        if isinstance(speech, np.ndarray):
            speech = torch.tensor(speech)

        if self.frontend is not None:
            feats, feats_len = self.frontend.forward(speech, speech_lengths)
            feats = to_device(feats, device=self.device)
            feats_len = feats_len.int()
            # Features are precomputed here, so disable the model's own frontend.
            self.tp_model.frontend = None
        else:
            feats = speech
            feats_len = speech_lengths
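
# Usage sketch (paths below are placeholders, not from this file):
#   stt = SpeechText2Timestamp(
#       timestamp_infer_config="exp/tp/config.yaml",
#       timestamp_model_file="exp/tp/model.pb",
#       timestamp_cmvn_file="exp/tp/am.mvn",
#       device="cpu",
#   )
# The instance is then called once per utterance with the speech (and its
# lengths), following the frontend branch above.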

    ngpu: int,
    log_level: Union[int, str],
    data_path_and_name_and_type,
    timestamp_infer_config: Optional[str],
    timestamp_model_file: Optional[str],
    timestamp_cmvn_file: Optional[str] = None,
    raw_inputs: Union[np.ndarray, torch.Tensor] = None,
    key_file: Optional[str] = None,
    allow_variable_data_keys: bool = False,

        batch_size=batch_size,
        ngpu=ngpu,
        log_level=log_level,
        timestamp_infer_config=timestamp_infer_config,
        timestamp_model_file=timestamp_model_file,
        timestamp_cmvn_file=timestamp_cmvn_file,
        key_file=key_file,
        allow_variable_data_keys=allow_variable_data_keys,
        output_dir=output_dir,

    ngpu: int,
    log_level: Union[int, str],
    # data_path_and_name_and_type,
    timestamp_infer_config: Optional[str],
    timestamp_model_file: Optional[str],
    timestamp_cmvn_file: Optional[str] = None,
    # raw_inputs: Union[np.ndarray, torch.Tensor] = None,
    key_file: Optional[str] = None,
    allow_variable_data_keys: bool = False,

        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build speechtext2timestamp
    speechtext2timestamp_kwargs = dict(
        timestamp_infer_config=timestamp_infer_config,
        timestamp_model_file=timestamp_model_file,
        timestamp_cmvn_file=timestamp_cmvn_file,
        device=device,
        dtype=dtype,
    )
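
    # These kwargs are expanded into the helper class defined above; the actual
    # construction happens outside this excerpt, e.g. (sketch):
    #   speechtext2timestamp = SpeechText2Timestamp(**speechtext2timestamp_kwargs)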

        raw_inputs: Union[np.ndarray, torch.Tensor] = None,
        output_dir_v2: Optional[str] = None,
        fs: dict = None,
        param_dict: dict = None,
        **kwargs
    ):
        # 3. Build data-iterator
        if data_path_and_name_and_type is None and raw_inputs is not None:

        for batch_id in range(_bs):
            key = keys[batch_id]
            token = speechtext2timestamp.converter.ids2tokens(batch['text'][batch_id])
            ts_str, ts_list = time_stamp_lfr6_advance(us_alphas[batch_id], us_cif_peak[batch_id], token)
            logging.warning(ts_str)
            tp_result_list.append({'text': "".join([i for i in token if i != '<sil>']), 'timestamp': ts_list})
        return tp_result_list

    return _forward
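
# Each element of tp_result_list pairs the silence-stripped text with its
# [begin_ms, end_ms] spans (illustrative values, not real output):
#   {'text': 'hello world', 'timestamp': [[0, 480], [480, 1020]]}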


    group = parser.add_argument_group("The model configuration related")
    group.add_argument(
        "--timestamp_infer_config",
        type=str,
        help="Timestamp prediction infer configuration",
    )
    group.add_argument(
        "--timestamp_model_file",
        type=str,
        help="Timestamp prediction model parameter file",
    )
    group.add_argument(
        "--timestamp_cmvn_file",
        type=str,
        help="Global CMVN file",
    )
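
# Example invocation (script name and paths are assumptions, not from this file):
#   python tp_inference.py \
#       --timestamp_infer_config exp/tp/config.yaml \
#       --timestamp_model_file exp/tp/model.pb \
#       --timestamp_cmvn_file exp/tp/am.mvn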