From 30bba1d30bb0352afae21dfac1021bf614663513 Mon Sep 17 00:00:00 2001
From: shixian.shi <shixian.shi@alibaba-inc.com>
Date: Tue, 11 Jul 2023 17:30:31 +0800
Subject: [PATCH] update demo for using tp model with long audio

---
 egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/demo_long.py |   52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+), 0 deletions(-)

diff --git a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/demo_long.py b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/demo_long.py
new file mode 100644
index 0000000..c04d985
--- /dev/null
+++ b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/demo_long.py
@@ -0,0 +1,59 @@
+# If you want to use an ASR model besides paraformer-bicif (e.g. the
+# contextual paraformer) to get ASR results for long audio as well as
+# timestamp prediction results, try this demo.
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+import os
+import librosa
+import soundfile as sf
+
+param_dict = dict()
+# NOTE(review): this hotword string looks mojibake-encoded -- confirm the
+# intended characters before shipping; left byte-identical here.
+param_dict['hotword'] = "淇¤"
+
+test_wav = '/Users/shixian/Downloads/tpdebug.wav'
+output_dir = './tmp'
+os.makedirs(output_dir, exist_ok=True)
+
+# VAD pipeline: splits the long recording into speech segments.
+vad_pipeline = pipeline(
+    task=Tasks.voice_activity_detection,
+    model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
+    model_revision=None,
+)
+# ASR pipeline (contextual paraformer) for transcribing each segment.
+asr_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
+    output_dir=output_dir)
+# Timestamp-prediction pipeline fed with the ASR transcripts.
+tp_pipeline = pipeline(
+    task=Tasks.speech_timestamp,
+    model='damo/speech_timestamp_prediction-v1-16k-offline',
+    output_dir=output_dir)
+
+# Run VAD; its 'text' field holds a list of [start_ms, end_ms] segments.
+vad_res = vad_pipeline(audio_in=test_wav)
+timestamps = vad_res['text']
+
+samples = librosa.load(test_wav, sr=16000)[0]
+wavseg_scp = "{}/wav.scp".format(output_dir)
+
+# Cut the waveform into per-segment wav files and write a kaldi-style scp.
+with open(wavseg_scp, 'w') as fout:
+    for i, timestamp in enumerate(timestamps):
+        # Convert milliseconds to sample indices at 16 kHz.
+        start = int(timestamp[0] / 1000 * 16000)
+        end = int(timestamp[1] / 1000 * 16000)
+        # No trailing space in uttid: each scp line must be "<uttid> <path>".
+        uttid = "wav_{}_{}".format(start, end)
+        wavpath = '{}/wavseg_{}.wav'.format(output_dir, i)
+        sf.write(wavpath, samples[start:end], 16000)
+        fout.write("{} {}\n".format(uttid, wavpath))
+print("Wav segment done: {}".format(wavseg_scp))
+
+asr_res = '{}/1best_recog/text'.format(output_dir)
+tp_res = '{}/timestamp_prediction/tp_sync'.format(output_dir)
+rec_result_asr = asr_pipeline(audio_in=wavseg_scp)
+rec_result_tp = tp_pipeline(audio_in=wavseg_scp, text_in=asr_res)
+print("Find your ASR results in {}, and timestamp prediction results in {}.".format(asr_res, tp_res))

--
Gitblit v1.9.1