Across these examples, max_single_segment_time is moved to millisecond units, which is what the FSMN-VAD front end expects:

 model = AutoModel(model="iic/emotion2vec_base_finetuned", model_revision="v2.0.4",
                   # vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                   # vad_model_revision="v2.0.4",
-                  # vad_kwargs={"max_single_segment_time": 10},
+                  # vad_kwargs={"max_single_segment_time": 1000},
                   )

 wav_file = f"{model.model_path}/example/test.wav"
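For context, a minimal sketch of how this emotion-recognition model is driven end to end; the generate arguments (granularity, extract_embedding) follow the emotion2vec recipe and are assumptions here, not part of the diff:

from funasr import AutoModel

model = AutoModel(model="iic/emotion2vec_base_finetuned", model_revision="v2.0.4")
wav_file = f"{model.model_path}/example/test.wav"  # sample audio shipped with the model
# granularity="utterance" yields one emotion score set per file;
# extract_embedding=False skips returning the dense emotion embedding.
res = model.generate(wav_file, granularity="utterance", extract_embedding=False)
print(res)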
                   model_revision="v2.0.4",
                   vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                   vad_model_revision="v2.0.4",
-                  vad_kwargs={"max_single_segment_time": 60},
+                  vad_kwargs={"max_single_segment_time": 60000},
                   punc_model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
                   punc_model_revision="v2.0.4",
                   # spk_model="iic/speech_campplus_sv_zh-cn_16k-common",
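This hunk starts mid-call, so the enclosing AutoModel line is not shown. Below is a sketch of the presumable full pipeline; the ASR checkpoint name ("paraformer-zh") is an assumption, since only the keyword arguments appear above. The substantive point of the change is that max_single_segment_time is given in milliseconds, so 60000 caps each VAD segment at 60 seconds of audio:

from funasr import AutoModel

# Sketch of the ASR + VAD + punctuation pipeline this hunk configures.
# The checkpoint name "paraformer-zh" is an assumption, not from the diff.
model = AutoModel(model="paraformer-zh",
                  model_revision="v2.0.4",
                  vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                  vad_model_revision="v2.0.4",
                  # milliseconds: cap each VAD segment at 60 s of audio
                  vad_kwargs={"max_single_segment_time": 60000},
                  punc_model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
                  punc_model_revision="v2.0.4",
                  )
res = model.generate(input="example.wav")  # hypothetical input file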
 model = AutoModel(model="iic/Whisper-large-v3",
                   model_revision="v2.0.5",
                   vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
-                  vad_kwargs={"max_single_segment_time": 30},
+                  vad_kwargs={"max_single_segment_time": 30000},
                   )

 res = model.generate(
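The generate call is truncated in this hunk. A hedged completion follows; the input path and the language/use_itn/batch_size_s arguments are illustrative assumptions, not part of the diff. Note that 30000 ms matches Whisper's fixed 30-second input window, so each VAD segment fits in a single Whisper pass:

# Assumed completion of the truncated call above; all arguments here are
# illustrative and may differ across FunASR versions.
res = model.generate(
    input=f"{model.model_path}/example/en.mp3",  # assumed bundled sample
    language="auto",   # let the model detect the language
    use_itn=True,      # apply inverse text normalization to the output
    batch_size_s=60,   # dynamic batching: seconds of audio per batch
)
print(res)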
 # model = AutoModel(model="Whisper-large-v2", hub="openai")
 model = AutoModel(model="Whisper-large-v3",
                   vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
-                  vad_kwargs={"max_single_segment_time": 30},
+                  vad_kwargs={"max_single_segment_time": 30000},
                   hub="openai",
                   )
In the library's VAD-assisted batch-inference loop, the sample rate is now read defensively, falling back to 16 kHz when the frontend does not expose an fs attribute:

 key = res[i]["key"]
 vadsegments = res[i]["value"]
 input_i = data_list[i]
-speech = load_audio_text_image_video(input_i, fs=kwargs["frontend"].fs, audio_fs=kwargs.get("fs", 16000))
+fs = kwargs["frontend"].fs if hasattr(kwargs["frontend"], "fs") else 16000
+speech = load_audio_text_image_video(input_i, fs=fs, audio_fs=kwargs.get("fs", 16000))
 speech_lengths = len(speech)
 n = len(vadsegments)
 data_with_index = [(vadsegments[i], i) for i in range(n)]
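The (segment, index) pairs suggest that the loop later sorts segments by duration to batch similar lengths together, then uses the saved index to restore the original order. A self-contained sketch of that pattern, under that assumption (the sort key and the data are illustrative, not taken from the library):

# Illustrative data: VAD segments as [start_ms, end_ms] pairs.
vadsegments = [[0, 2350], [2350, 9100], [9100, 9800]]
data_with_index = [(seg, i) for i, seg in enumerate(vadsegments)]

# Sort by segment duration so similarly sized segments batch together.
sorted_data = sorted(data_with_index, key=lambda x: x[0][1] - x[0][0])

# Run per-segment inference (stand-in), then restore the original order.
results = [None] * len(vadsegments)
for seg, i in sorted_data:
    results[i] = f"transcript of {seg}"
print(results)  # back in original segment order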