From 0143122a4e2ee86cc27ba137b2bb0530577cbf12 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Fri, 12 Jan 2024 10:27:36 +0800
Subject: [PATCH] funasr1.0 streaming demo

---
 funasr/models/paraformer/model.py |   29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/funasr/models/paraformer/model.py b/funasr/models/paraformer/model.py
index 03a0bd2..78a72ec 100644
--- a/funasr/models/paraformer/model.py
+++ b/funasr/models/paraformer/model.py
@@ -22,13 +22,13 @@
 
 from torch.cuda.amp import autocast
 
-from funasr.datasets.audio_datasets.load_audio_extract_fbank import load_audio, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
 from funasr.utils import postprocess_utils
 from funasr.utils.datadir_writer import DatadirWriter
-from funasr.utils.register import register_class, registry_tables
+from funasr.register import tables
 from funasr.models.ctc.ctc import CTC
 
-@register_class("model_classes", "Paraformer")
+@tables.register("model_classes", "Paraformer")
 class Paraformer(nn.Module):
 	"""
 	Author: Speech Lab of DAMO Academy, Alibaba Group
@@ -79,17 +79,17 @@
 		super().__init__()
 
 		if specaug is not None:
-			specaug_class = registry_tables.specaug_classes.get(specaug.lower())
+			specaug_class = tables.specaug_classes.get(specaug.lower())
 			specaug = specaug_class(**specaug_conf)
 		if normalize is not None:
-			normalize_class = registry_tables.normalize_classes.get(normalize.lower())
+			normalize_class = tables.normalize_classes.get(normalize.lower())
 			normalize = normalize_class(**normalize_conf)
-		encoder_class = registry_tables.encoder_classes.get(encoder.lower())
+		encoder_class = tables.encoder_classes.get(encoder.lower())
 		encoder = encoder_class(input_size=input_size, **encoder_conf)
 		encoder_output_size = encoder.output_size()
 
 		if decoder is not None:
-			decoder_class = registry_tables.decoder_classes.get(decoder.lower())
+			decoder_class = tables.decoder_classes.get(decoder.lower())
 			decoder = decoder_class(
 				vocab_size=vocab_size,
 				encoder_output_size=encoder_output_size,
@@ -104,7 +104,7 @@
 				odim=vocab_size, encoder_output_size=encoder_output_size, **ctc_conf
 			)
 		if predictor is not None:
-			predictor_class = registry_tables.predictor_classes.get(predictor.lower())
+			predictor_class = tables.predictor_classes.get(predictor.lower())
 			predictor = predictor_class(**predictor_conf)
 		
 		# note that eos is the same as sos (equivalent ID)
@@ -447,7 +447,6 @@
              frontend=None,
              **kwargs,
              ):
-		
 		# init beamsearch
 		is_use_ctc = kwargs.get("decoding_ctc_weight", 0.0) > 0.00001 and self.ctc != None
 		is_use_lm = kwargs.get("lm_weight", 0.0) > 0.00001 and kwargs.get("lm_file", None) is not None
@@ -466,7 +465,7 @@
 		else:
 			# extract fbank feats
 			time1 = time.perf_counter()
-			audio_sample_list = load_audio(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
+			audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), data_type=kwargs.get("data_type", "sound"), tokenizer=tokenizer)
 			time2 = time.perf_counter()
 			meta_data["load_data"] = f"{time2 - time1:0.3f}"
 			speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend)
@@ -475,7 +474,6 @@
 			meta_data["batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000
 			
 		speech.to(device=kwargs["device"]), speech_lengths.to(device=kwargs["device"])
-
 		# Encoder
 		encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
 		if isinstance(encoder_out, tuple):
@@ -495,6 +493,8 @@
 
 		results = []
 		b, n, d = decoder_out.size()
+		if isinstance(key[0], (list, tuple)):
+			key = key[0]
 		for i in range(b):
 			x = encoder_out[i, :encoder_out_lens[i], :]
 			am_scores = decoder_out[i, :pre_token_length[i], :]
@@ -535,13 +535,14 @@
 					text = tokenizer.tokens2text(token)
 					
 					text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
-					result_i = {"key": key[i], "token": token, "text": text, "text_postprocessed": text_postprocessed}
+					
+					result_i = {"key": key[i], "text": text_postprocessed}
 
 					
 					if ibest_writer is not None:
 						ibest_writer["token"][key[i]] = " ".join(token)
-						ibest_writer["text"][key[i]] = text
-						ibest_writer["text_postprocessed"][key[i]] = text_postprocessed
+						# ibest_writer["text"][key[i]] = text
+						ibest_writer["text"][key[i]] = text_postprocessed
 				else:
 					result_i = {"key": key[i], "token_int": token_int}
 				results.append(result_i)

--
Gitblit v1.9.1