# Demo: long-audio ASR plus timestamp prediction with an ASR model other than
# paraformer-bicif (for example, contextual paraformer).
#
# Steps:
#   1. Run VAD on the long recording to obtain speech segments.
#   2. Cut the recording into one wav file per segment and list them in wav.scp.
#   3. Run ASR on the segments, then pass the ASR text to the timestamp
#      prediction model.

import os

import librosa
import soundfile as sf
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Audio is loaded, sliced and written at this rate; the model ids used below
# are all "16k" variants.
SAMPLE_RATE = 16000

param_dict = dict()
param_dict['hotword'] = "你的热词"  # placeholder: replace with your own hotword(s)

test_wav = 'YOUR_LONG_WAV.wav'
output_dir = './tmp'
# Create the output directory without going through a shell, so it works on
# every platform and with paths containing spaces or shell metacharacters.
os.makedirs(output_dir, exist_ok=True)

vad_pipeline = pipeline(
    task=Tasks.voice_activity_detection,
    model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
    model_revision=None,
)
asr_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
    output_dir=output_dir,
    # Hand the hotword configuration to the contextual model; without this the
    # param_dict built above is never used.
    param_dict=param_dict,
)
tp_pipeline = pipeline(
    task=Tasks.speech_timestamp,
    model='damo/speech_timestamp_prediction-v1-16k-offline',
    output_dir=output_dir,
)

# Step 1: voice activity detection over the whole recording.
vad_res = vad_pipeline(audio_in=test_wav)
# Each entry is used below as a (start, end) pair expressed in milliseconds.
timestamps = vad_res['text']

# Step 2: cut the recording into per-segment wav files and index them.
samples = librosa.load(test_wav, sr=SAMPLE_RATE)[0]
wavseg_scp = "{}/wav.scp".format(output_dir)

with open(wavseg_scp, 'w', encoding='utf-8') as fout:
    for i, timestamp in enumerate(timestamps):
        # Convert milliseconds to sample indices.
        start = int(timestamp[0] / 1000 * SAMPLE_RATE)
        end = int(timestamp[1] / 1000 * SAMPLE_RATE)
        # No trailing space in the id: the separator is added when the
        # "<uttid> <path>" line is written.
        uttid = "wav_{}_{}".format(start, end)
        wavpath = '{}/wavseg_{}.wav'.format(output_dir, i)
        _samples = samples[start:end]
        sf.write(wavpath, _samples, SAMPLE_RATE)
        fout.write("{} {}\n".format(uttid, wavpath))
print("Wav segment done: {}".format(wavseg_scp))

# Step 3: recognition, then timestamp prediction driven by the ASR text.
# NOTE(review): these paths assume the pipelines write their results under
# output_dir with this layout - confirm against your modelscope/FunASR version.
asr_res = '{}/1best_recog/text'.format(output_dir)
tp_res = '{}/timestamp_prediction/tp_sync'.format(output_dir)
rec_result_asr = asr_pipeline(audio_in=wavseg_scp)
rec_result_tp = tp_pipeline(audio_in=wavseg_scp, text_in=asr_res)
print("Find your ASR results in {}, and timestamp prediction results in {}.".format(asr_res, tp_res))