python/FunASR-XL.git

			@@ -16,6 +16,7 @@

			import numpy as np
			import torch
			from scipy.signal import medfilt
			from typeguard import check_argument_types

			from funasr.models.frontend.wav_frontend import WavFrontendMel23
			@@ -26,6 +27,7 @@
			from funasr.utils.types import str2bool
			from funasr.utils.types import str2triple_str
			from funasr.utils.types import str_or_none


			class Speech2Diarization:
			"""Speech2Diarization class
			@@ -234,9 +236,23 @@
			# batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}

			results = speech2diar(**batch)

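			# Post-process: median-filter the output along its first axis, then turn
			# each column's value changes into "SPEAKER ..." segment lines.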
			a = results[0][0].cpu().numpy()
			a = medfilt(a, (11, 1))
			rst = []
			for spkid, frames in enumerate(a.T):
			frames = np.pad(frames, (1, 1), 'constant')
			changes, = np.where(np.diff(frames, axis=0) != 0)
			fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} <NA> <NA> {:s} <NA>"
			for s, e in zip(changes[::2], changes[1::2]):
			st = s / 10.
			dur = (e - s) / 10.
			rst.append(fmt.format(keys[0], st, dur, "{}_{}".format(keys[0], str(spkid))))

			# Only supporting batch_size==1
			key, value = keys[0], output_results_str(results, keys[0])
			item = {"key": key, "value": value}
			value = "\n".join(rst)
			item = {"key": keys[0], "value": value}
			result_list.append(item)
			if output_path is not None:
			output_writer.write(value)