### Speech Recognition (Non-streaming)

```python
from funasr import AutoModel

# the top-level ASR model is assumed to be paraformer-zh (the multi-functional
# model this snippet is built around); VAD, punctuation and speaker sub-models
# run alongside it
model = AutoModel(model="paraformer-zh", model_revision="v2.0.2",
                  vad_model="fsmn-vad", vad_model_revision="v2.0.2",
                  punc_model="ct-punc-c", punc_model_revision="v2.0.2",
                  spk_model="cam++", spk_model_revision="v2.0.2")
res = model.generate(input=f"{model.model_path}/example/asr_example.wav",
                     batch_size=64,
                     hotword='魔搭')
print(res)
```

### Speech Recognition (Streaming)

```python
# streaming ASR: feed the audio in 600 ms chunks with a rolling cache
# (model, speech, chunk_size, chunk_stride and the look-back settings are
# assumed to be set up before this loop)
for i in range(total_chunk_num):
    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
    is_final = i == total_chunk_num - 1
    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final,
                         chunk_size=chunk_size,
                         encoder_chunk_look_back=encoder_chunk_look_back,
                         decoder_chunk_look_back=decoder_chunk_look_back)
    print(res)
```
Note: `chunk_size` is the streaming-latency configuration. `[0,10,5]` means the real-time display granularity is `10*60=600ms` and the lookahead is `5*60=300ms`. Each inference input is `600ms` of audio (`16000*0.6=9600` sample points), and the output is the corresponding text. For the final speech segment, set `is_final=True` to force the last word to be output.
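To make the arithmetic concrete, here is a small sketch (the 60 ms chunk unit and the 16 kHz sample rate are the model's defaults; the variable names are illustrative):

```python
# How the [0, 10, 5] chunk_size config translates into samples (illustrative).
chunk_size = [0, 10, 5]   # [_, display granularity, lookahead] in 60 ms units
frame_ms = 60             # one chunk unit corresponds to 60 ms of audio
sample_rate = 16000       # the model expects 16 kHz input

display_ms = chunk_size[1] * frame_ms            # 10 * 60 = 600 ms per step
lookahead_ms = chunk_size[2] * frame_ms          # 5 * 60 = 300 ms of future context
chunk_stride = display_ms * sample_rate // 1000  # 600 ms -> 9600 samples

assert chunk_stride == chunk_size[1] * 960       # the per-unit stride is 960 samples
```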

### Voice Activity Detection (Non-streaming)

```python
from funasr import AutoModel

model = AutoModel(model="fsmn-vad", model_revision="v2.0.2")
wav_file = f"{model.model_path}/example/asr_example.wav"
res = model.generate(input=wav_file)
print(res)
```
### Voice Activity Detection (Streaming)

```python
# streaming VAD: feed audio chunks with a rolling cache and print a result
# only when the chunk yields a non-empty detection
for i in range(total_chunk_num):
    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
    is_final = i == total_chunk_num - 1
    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
    if len(res[0]["value"]):
        print(res)
```
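The result format is worth a note: per the FunASR model-zoo docs, `res[0]["value"]` holds `[beg_ms, end_ms]` segments, with `-1` marking a boundary that has not been detected yet. A minimal, illustrative way to consume it (the helper name is hypothetical):

```python
# Hypothetical helper: interpret one streaming VAD result (times in ms).
# Assumed format: [beg, end] is a complete segment, [beg, -1] a segment that
# is still open, [-1, end] a previously opened segment that just closed.
def handle_vad_result(segments):
    for beg, end in segments:
        if beg != -1 and end != -1:
            print(f"segment: {beg}-{end} ms")
        elif end == -1:
            print(f"speech started at {beg} ms")
        else:
            print(f"speech ended at {end} ms")
```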

### Punctuation Restoration

```python
from funasr import AutoModel

model = AutoModel(model="ct-punc", model_revision="v2.0.2")
res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
print(res)
```
### Timestamp Prediction

```python
from funasr import AutoModel

model = AutoModel(model="fa-zh", model_revision="v2.0.2")
wav_file = f"{model.model_path}/example/asr_example.wav"
text_file = f"{model.model_path}/example/text.txt"
res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
print(res)
```
[//]: # (FunASR supports inference and fine-tuning of models trained on industrial datasets of tens of thousands of hours. For more details, please refer to ([modelscope_egs](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_pipeline/quick_start.html)). It also supports training and fine-tuning of models on academic standard datasets. For more details, please refer to ([egs](https://alibaba-damo-academy.github.io/FunASR/en/academic_recipe/asr_recipe.html)). The models include speech recognition (ASR), voice activity detection (VAD), punctuation restoration, language models, speaker verification, speaker separation, and multi-party conversation speech recognition. For a detailed list of models, please refer to the [Model Zoo](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md):)

```bibtex
}
@inproceedings{gao22b_interspeech,
  author={Zhifu Gao and ShiLiang Zhang and Ian McLoughlin and Zhijie Yan},
  title={{Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition}},
  year={2022},
  booktitle={Proc. Interspeech 2022},
  pages={2063--2067},
  doi={10.21437/Interspeech.2022-9996}
}
@inproceedings{shi2023seaco,
  author={Xian Shi and Yexin Yang and Zerui Li and Yanni Chen and Zhifu Gao and Shiliang Zhang},
  title={{SeACo-Paraformer: A Non-Autoregressive ASR System with Flexible and Effective Hotword Customization Ability}},
  year={2023},
  booktitle={ICASSP 2024}
}
```
For more detailed usage, see the [examples](examples/industrial_data_pretraining).

                  spk_model_revision="v2.0.2",
                  )

res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_vad_punc_example.wav", batch_size_s=300, batch_size_threshold_s=60)
print(res)

                  model_revision="v2.0.2",
                  )

res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav")
print(res)


model = AutoModel(model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404", model_revision="v2.0.2")

res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
                     hotword='达摩院 魔搭')
print(res)


model = AutoModel(model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", model_revision="v2.0.2")

res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt")
print(res)


model = AutoModel(model="damo/punc_ct-transformer_cn-en-common-vocab471067-large", model_revision="v2.0.2")

res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt")
print(res)

rec_result_all = "outputs: "
cache = {}
for vad in vads:
    rec_result = model.generate(input=vad, cache=cache)
    print(rec_result)
    rec_result_all += rec_result[0]['text']


model = AutoModel(model="damo/emotion2vec_base", model_revision="v2.0.1")

res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", output_dir="./outputs")
print(res)

chunk_size = 60000  # ms
model = AutoModel(model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", model_revision="v2.0.2")

res = model.generate(input=wav_file, chunk_size=chunk_size)
print(res)

for i in range(total_chunk_num):
    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
    is_final = i == total_chunk_num - 1
    res = model.generate(input=speech_chunk,
                         cache=cache,
                         is_final=is_final,
                         chunk_size=chunk_size,


model = AutoModel(model="damo/speech_timestamp_prediction-v1-16k-offline", model_revision="v2.0.2")

res = model.generate(input=("https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
                            "欢迎大家来到魔搭社区进行体验"),
                     data_type=("sound", "text"),
                     batch_size=2,

                  spk_model_revision="v2.0.2"
                  )

res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
                     hotword='达摩院 魔搭')
print(res)

model = AutoModel(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.2")

res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav")
print(res)

fbanks = frontend(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", batch_size=2)

for batch_idx, fbank_dict in enumerate(fbanks):
    res = model.generate(**fbank_dict)
    print(res)

model = AutoModel(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online", model_revision="v2.0.2")
cache = {}
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
                     chunk_size=chunk_size,
                     encoder_chunk_look_back=encoder_chunk_look_back,
                     decoder_chunk_look_back=decoder_chunk_look_back,

for i in range(total_chunk_num):
    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
    is_final = i == total_chunk_num - 1
    res = model.generate(input=speech_chunk,
                         cache=cache,
                         is_final=is_final,
                         chunk_size=chunk_size,

                  spk_model_revision="v2.0.2",
                  )

res = model.generate(input=f"{model.model_path}/example/asr_example.wav",
                     hotword='达摩院 魔搭')
print(res)

# step.1: compute the vad model
self.vad_kwargs.update(cfg)
beg_vad = time.time()
res = self.inference(input, input_len=input_len, model=self.vad_model, kwargs=self.vad_kwargs, **cfg)
end_vad = time.time()
print(f"time cost vad: {end_vad - beg_vad:0.3f}")


batch_size_ms_cum = 0
end_idx = j + 1
speech_j, speech_lengths_j = slice_padding_audio_samples(speech, speech_lengths, sorted_data[beg_idx:end_idx])
results = self.inference(speech_j, input_len=None, model=model, kwargs=kwargs, **cfg)
if self.spk_model is not None:
    all_segments = []
    # compose vad segments: [[start_time_sec, end_time_sec, speech], [...]]

    segments = sv_chunk(vad_segments)
    all_segments.extend(segments)
    speech_b = [i[2] for i in segments]
    spk_res = self.inference(speech_b, input_len=None, model=self.spk_model, kwargs=kwargs, **cfg)
    results[_b]['spk_embedding'] = spk_res[0]['spk_embedding']
beg_idx = end_idx
if len(results) < 1:

# step.3: compute the punc model
if self.punc_model is not None:
    self.punc_kwargs.update(cfg)
    punc_res = self.inference(result["text"], model=self.punc_model, kwargs=self.punc_kwargs, **cfg)
    result["text_with_punc"] = punc_res[0]["text"]

# speaker embedding cluster after resorting

import json
import time
import torch
import hydra
import random
import string
import logging
import os.path
from tqdm import tqdm
from omegaconf import DictConfig, OmegaConf, ListConfig

from funasr.register import tables
from funasr.utils.load_utils import load_bytes
from funasr.download.file import download_from_url
from funasr.download.download_from_hub import download_model
from funasr.utils.vad_utils import slice_padding_audio_samples
from funasr.train_utils.set_all_random_seed import set_all_random_seed
from funasr.train_utils.load_pretrained_model import load_pretrained_model
from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.utils.timestamp_tools import timestamp_sentence
from funasr.models.campplus.utils import sv_chunk, postprocess, distribute_spk
from funasr.models.campplus.cluster_backend import ClusterBackend
from funasr.auto.auto_model import AutoModel


if kwargs.get("debug", False):
    import pdb; pdb.set_trace()
model = AutoModel(**kwargs)
res = model.generate(input=kwargs["input"])
print(res)


if __name__ == '__main__':