# Demo: long-audio ASR with a model other than paraformer-bicif (e.g. the
# contextual paraformer) plus timestamp prediction.
# Flow: VAD -> cut audio into segments -> ASR on segments -> timestamp prediction.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
import os
import librosa
import soundfile as sf

# All three models below operate at 16 kHz.
SAMPLE_RATE = 16000

# Hotword biasing for the contextual ASR model.
param_dict = dict()
param_dict['hotword'] = "信诺"

test_wav = '/Users/shixian/Downloads/tpdebug.wav'
output_dir = './tmp'
# Portable directory creation instead of shelling out via os.system("mkdir -p ...").
os.makedirs(output_dir, exist_ok=True)

vad_pipeline = pipeline(
    task=Tasks.voice_activity_detection,
    model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
    model_revision=None,
)
asr_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
    output_dir=output_dir)
tp_pipeline = pipeline(
    task=Tasks.speech_timestamp,
    model='damo/speech_timestamp_prediction-v1-16k-offline',
    output_dir=output_dir)

# NOTE(review): vad_res['text'] is assumed to be a list of [start_ms, end_ms]
# pairs — confirm against the installed modelscope/FunASR version.
vad_res = vad_pipeline(audio_in=test_wav)
timestamps = vad_res['text']

samples = librosa.load(test_wav, sr=SAMPLE_RATE)[0]
wavseg_scp = "{}/wav.scp".format(output_dir)

# Cut the audio at VAD boundaries and write a Kaldi-style wav.scp
# ("<uttid> <path>" per line) consumed by the ASR and timestamp pipelines.
with open(wavseg_scp, 'w') as fout:
    for i, timestamp in enumerate(timestamps):
        start = int(timestamp[0] / 1000 * SAMPLE_RATE)  # ms -> sample index
        end = int(timestamp[1] / 1000 * SAMPLE_RATE)
        # Bug fix: the original id had a trailing space ("wav_{}_{} "), which
        # produced a double space in the scp line and a malformed uttid.
        uttid = "wav_{}_{}".format(start, end)
        wavpath = '{}/wavseg_{}.wav'.format(output_dir, i)
        sf.write(wavpath, samples[start:end], SAMPLE_RATE)
        fout.write("{} {}\n".format(uttid, wavpath))
print("Wav segment done: {}".format(wavseg_scp))

asr_res = '{}/1best_recog/text'.format(output_dir)
tp_res = '{}/timestamp_prediction/tp_sync'.format(output_dir)
# Bug fix: param_dict (the hotword, the point of this contextual demo) was
# built but never passed anywhere; hand it to the ASR pipeline so the
# hotword biasing actually takes effect.
rec_result_asr = asr_pipeline(audio_in=wavseg_scp, param_dict=param_dict)
rec_result_tp = tp_pipeline(audio_in=wavseg_scp, text_in=asr_res)
print("Find your ASR results in {}, and timestamp prediction results in {}.".format(asr_res, tp_res))