zhifu gao
2024-01-26 65396eeeff96cdc21f939828e13a2e3d0127f2c6
vad streaming return [beg, -1], [], [-1, end], [beg, end] (#1306)

* fix add_file bug (#1296)

Co-authored-by: shixian.shi <shixian.shi@alibaba-inc.com>

* funasr1.0 uniasr

* funasr1.0 uniasr

* update with main (#1305)

* v1.0.3

* update clients for 2pass

* update download tools

---------

Co-authored-by: 雾聪 <wucong.lyb@alibaba-inc.com>

* vad streaming return [beg, -1], [], [-1, end], [beg, end]

---------

Co-authored-by: shixian.shi <shixian.shi@alibaba-inc.com>
Co-authored-by: 雾聪 <wucong.lyb@alibaba-inc.com>
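
The commit title describes the new per-chunk contract of the streaming VAD: res[0]["value"] is a list of [start_ms, end_ms] pairs in which -1 marks a boundary that is not known yet, so a chunk may yield [beg, -1] (a segment opened), [] (no boundary event), [-1, end] (the open segment closed), or [beg, end] (a whole segment inside one chunk). A minimal sketch of a consumer of this contract; the four shapes come from the commit, while the function name and its state handling are illustrative only:

def fold_vad_events(events, open_start, finished):
    """Fold one chunk's VAD events into completed (start, end) segments.

    events is res[0]["value"] as returned per chunk by the streaming VAD.
    """
    for beg, end in events:
        if end == -1:                  # [beg, -1]: segment opened, end unknown
            open_start = beg
        elif beg == -1:                # [-1, end]: close the open segment
            finished.append((open_start, end))
            open_start = None
        else:                          # [beg, end]: complete within this chunk
            finished.append((beg, end))
    return open_start, finished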
4 files changed, 59 lines modified:

examples/industrial_data_pretraining/fsmn_vad_streaming/demo.py |  6
examples/industrial_data_pretraining/uniasr/demo.py             |  5
funasr/models/fsmn_vad_streaming/model.py                       | 47
funasr/utils/load_utils.py                                      |  1
examples/industrial_data_pretraining/fsmn_vad_streaming/demo.py
@@ -6,10 +6,9 @@
 from funasr import AutoModel
 
 wav_file = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav"
-chunk_size = 60000 # ms
 model = AutoModel(model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", model_revision="v2.0.4")
-res = model.generate(input=wav_file, chunk_size=chunk_size, )
+res = model.generate(input=wav_file)
 print(res)
@@ -20,6 +19,7 @@
 wav_file = os.path.join(model.model_path, "example/vad_example.wav")
 speech, sample_rate = soundfile.read(wav_file)
+chunk_size = 200 # ms
 chunk_stride = int(chunk_size * sample_rate / 1000)
 
 cache = {}
@@ -32,6 +32,8 @@
                 cache=cache,
                 is_final=is_final,
+                chunk_size=chunk_size,
+                disable_pbar=True,
                 )
     # print(res)
     if len(res[0]["value"]):
         print(res)
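
Putting the streaming half of the demo together with the new contract; a hedged end-to-end sketch (model name, chunking, and res[0]["value"] mirror the demo above; the local wav path and the stitching logic are illustrative):

import soundfile
from funasr import AutoModel

model = AutoModel(model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", model_revision="v2.0.4")
speech, sample_rate = soundfile.read("vad_example.wav")  # assumed local copy
chunk_size = 200  # ms
chunk_stride = int(chunk_size * sample_rate / 1000)

cache, open_start, segments = {}, None, []
total_chunk_num = (len(speech) + chunk_stride - 1) // chunk_stride  # ceil division
for i in range(total_chunk_num):
    chunk = speech[i * chunk_stride:(i + 1) * chunk_stride]
    res = model.generate(input=chunk, cache=cache, is_final=(i == total_chunk_num - 1),
                         chunk_size=chunk_size, disable_pbar=True)
    for beg, end in res[0]["value"]:
        if end == -1:                  # [beg, -1]: a segment opened in this chunk
            open_start = beg
        else:                          # [-1, end] or [beg, end]: a segment closed
            segments.append((beg if beg != -1 else open_start, end))
print(segments)  # complete (start_ms, end_ms) speech segments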
examples/industrial_data_pretraining/uniasr/demo.py
@@ -5,8 +5,9 @@
 from funasr import AutoModel
 
-model = AutoModel(model="iic/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline", model_revision="v2.0.4",
-                  )
+model = AutoModel(model="iic/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline", model_revision="v2.0.4",)
 res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav")
 print(res)
funasr/models/fsmn_vad_streaming/model.py
@@ -482,13 +482,17 @@
         
         return frame_state
     
-    def forward(self, feats: torch.Tensor, waveform: torch.tensor, cache: dict = {},
-                is_final: bool = False
+    def forward(self, feats: torch.Tensor,
+                waveform: torch.tensor,
+                cache: dict = {},
+                is_final: bool = False,
+                **kwargs,
                 ):
         # if len(cache) == 0:
         #     self.AllResetDetection()
         # self.waveform = waveform  # compute decibel for each frame
         cache["stats"].waveform = waveform
+        is_streaming_input = kwargs.get("is_streaming_input", True)
         self.ComputeDecibel(cache=cache)
         self.ComputeScores(feats, cache=cache)
         if not is_final:
@@ -500,13 +504,32 @@
             segment_batch = []
             if len(cache["stats"].output_data_buf) > 0:
                 for i in range(cache["stats"].output_data_buf_offset, len(cache["stats"].output_data_buf)):
-                    if not is_final and (not cache["stats"].output_data_buf[i].contain_seg_start_point
-                                         or not cache["stats"].output_data_buf[i].contain_seg_end_point):
-                        continue
-                    segment = [cache["stats"].output_data_buf[i].start_ms, cache["stats"].output_data_buf[i].end_ms]
+                    if is_streaming_input:  # in this case, return [beg, -1], [], [-1, end], [beg, end]
+                        if not cache["stats"].output_data_buf[i].contain_seg_start_point:
+                            continue
+                        if not cache["stats"].next_seg and not cache["stats"].output_data_buf[i].contain_seg_end_point:
+                            continue
+                        start_ms = cache["stats"].output_data_buf[i].start_ms if cache["stats"].next_seg else -1
+                        if cache["stats"].output_data_buf[i].contain_seg_end_point:
+                            end_ms = cache["stats"].output_data_buf[i].end_ms
+                            cache["stats"].next_seg = True
+                            cache["stats"].output_data_buf_offset += 1
+                        else:
+                            end_ms = -1
+                            cache["stats"].next_seg = False
+                        segment = [start_ms, end_ms]
+                    else:  # in this case, return [beg, end]
+                        if not is_final and (not cache["stats"].output_data_buf[i].contain_seg_start_point
+                                             or not cache["stats"].output_data_buf[i].contain_seg_end_point):
+                            continue
+                        segment = [cache["stats"].output_data_buf[i].start_ms, cache["stats"].output_data_buf[i].end_ms]
+                        cache["stats"].output_data_buf_offset += 1  # need update this parameter
                     segment_batch.append(segment)
-                    cache["stats"].output_data_buf_offset += 1  # need update this parameter
             if segment_batch:
                 segments.append(segment_batch)
         # if is_final:
@@ -551,7 +574,8 @@
         chunk_stride_samples = int(chunk_size * frontend.fs / 1000)
         
         time1 = time.perf_counter()
-        cfg = {"is_final": kwargs.get("is_final", False)}
+        is_streaming_input = kwargs.get("is_streaming_input", False) if chunk_size >= 15000 else kwargs.get("is_streaming_input", True)
+        cfg = {"is_final": kwargs.get("is_final", False), "is_streaming_input": is_streaming_input}
         audio_sample_list = load_audio_text_image_video(data_in,
                                                         fs=frontend.fs,
                                                         audio_fs=kwargs.get("fs", 16000),
@@ -560,7 +584,7 @@
                                                         cache=cfg,
                                                         )
         _is_final = cfg["is_final"]  # if data_in is a file or url, set is_final=True
+        is_streaming_input = cfg["is_streaming_input"]
         time2 = time.perf_counter()
         meta_data["load_data"] = f"{time2 - time1:0.3f}"
         assert len(audio_sample_list) == 1, "batch_size must be set 1"
@@ -588,7 +612,8 @@
                 "feats": speech,
                 "waveform": cache["frontend"]["waveforms"],
                 "is_final": kwargs["is_final"],
-                "cache": cache
+                "cache": cache,
+                "is_streaming_input": is_streaming_input
             }
             segments_i = self.forward(**batch)
             if len(segments_i) > 0:
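
For readability, the is_streaming_input branch added above can be restated as a standalone function. This is a sketch only: the contain_seg_start_point / contain_seg_end_point / start_ms / end_ms fields mirror the diff's output_data_buf entries, everything else is illustrative. It shows how next_seg and the buffer offset interact: [beg, -1] is emitted once when a segment opens, the entry is re-examined on later chunks until its end point arrives, and the offset only advances past fully closed segments.

def pop_streaming_segments(bufs, offset, next_seg):
    """Emit [beg, -1], [-1, end], or [beg, end] per buffered VAD entry."""
    out = []
    for buf in bufs[offset:]:
        if not buf.contain_seg_start_point:
            continue
        if not next_seg and not buf.contain_seg_end_point:
            continue                   # [beg, -1] already emitted; wait for the end
        start_ms = buf.start_ms if next_seg else -1
        if buf.contain_seg_end_point:
            out.append([start_ms, buf.end_ms])
            next_seg, offset = True, offset + 1
        else:
            out.append([start_ms, -1])
            next_seg = False
    return out, offset, next_seg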
funasr/utils/load_utils.py
@@ -51,6 +51,7 @@
         # if data_in is a file or url, set is_final=True
         if "cache" in kwargs:
             kwargs["cache"]["is_final"] = True
+            kwargs["cache"]["is_streaming_input"] = False
     elif isinstance(data_or_path_or_list, str) and data_type == "text" and tokenizer is not None:
         data_or_path_or_list = tokenizer.encode(data_or_path_or_list)
     elif isinstance(data_or_path_or_list, np.ndarray):  # audio sample point
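
Taken together, the defaults are: file or URL input forces is_streaming_input=False (the load_utils change above), chunks of 15 s or longer default to False, and shorter chunks default to True. Since inference reads the flag via kwargs.get("is_streaming_input", ...), a caller should also be able to pin the mode explicitly; hypothetical usage, not shown in this commit:

# Force offline-style [beg, end] output even for short chunks.
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final,
                     chunk_size=chunk_size, is_streaming_input=False)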