| | |
| | | |
| | | from funasr import AutoModel |
| | | |
| | | model = AutoModel(model="../modelscope_models/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch", |
| | | vad_model="../modelscope_models/speech_fsmn_vad_zh-cn-16k-common-pytorch", |
| | | punc_model="../modelscope_models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", |
| | | model = AutoModel(model="damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch", |
| | | model_revision="v2.0.0", |
| | | vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", |
| | | vad_model_revision="v2.0.0", |
| | | punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", |
| | | punc_model_revision="v2.0.0", |
| | | ) |
| | | |
| | | res = model(input="../modelscope_models/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav", batch_size_s=300, batch_size_threshold_s=60) |
| | | res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_vad_punc_example.wav", batch_size_s=300, batch_size_threshold_s=60) |
| | | print(res) |
| | |
| | | |
| | | # download model |
| | | local_path_root=../modelscope_models |
| | | mkdir -p ${local_path_root} |
| | | |
| | | local_path=${local_path_root}/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch |
| | | git clone https://www.modelscope.cn/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git ${local_path} |
| | | |
| | | local_path_vad=${local_path_root}/speech_fsmn_vad_zh-cn-16k-common-pytorch |
| | | git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git ${local_path_vad} |
| | | |
| | | local_path_punc=${local_path_root}/punc_ct-transformer_zh-cn-common-vocab272727-pytorch |
| | | git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git ${local_path_punc} |
| | | |
| | | model="damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | model_revision="v2.0.0" |
| | | vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" |
| | | vad_model_revision="v2.0.0" |
| | | punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" |
| | | punc_model_revision="v2.0.0" |
| | | |
| | | python funasr/bin/inference.py \ |
| | | +model="${local_path}" \ |
| | | +vad_model="${local_path_vad}" \ |
| | | +punc_model="${local_path_punc}" \ |
| | | +input="${local_path}/example/asr_example.wav" \ |
| | | +model=${model} \ |
| | | +model_revision=${model_revision} \ |
| | | +vad_model=${vad_model} \ |
| | | +vad_model_revision=${vad_model_revision} \ |
| | | +punc_model=${punc_model} \ |
| | | +punc_model_revision=${punc_model_revision} \ |
| | | +input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_vad_punc_example.wav" \ |
| | | +output_dir="./outputs/debug" \ |
| | | +device="cpu" \ |
| | | +batch_size_s=300 \ |
| | | +batch_size_threshold_s=60 \ |
| | | +debug="true" \ |
| | | +"hotword='达摩院 魔搭'" |
| | | +batch_size_threshold_s=60 |
| | | |
| | |
| | | |
| | | from funasr import AutoModel |
| | | |
| | | model = AutoModel(model="../modelscope_models/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404") |
| | | model = AutoModel(model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404", model_revision="v2.0.0") |
| | | |
| | | res = model(input="../modelscope_models/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/example/asr_example.wav", |
| | | res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", |
| | | hotword='达摩院 魔搭') |
| | | print(res) |
| | |
| | | |
| | | # download model |
| | | local_path_root=../modelscope_models |
| | | mkdir -p ${local_path_root} |
| | | local_path=${local_path_root}/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404 |
| | | git clone https://www.modelscope.cn/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404.git ${local_path} |
| | | |
| | | model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404" |
| | | model_revision="v2.0.0" |
| | | |
| | | python funasr/bin/inference.py \ |
| | | +model="${local_path}" \ |
| | | +input="${local_path}/example/asr_example.wav" \ |
| | | +model=${model} \ |
| | | +model_revision=${model_revision} \ |
| | | +input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \ |
| | | +output_dir="./outputs/debug" \ |
| | | +device="cpu" \ |
| | | +"hotword='达摩院 魔搭'" |
| | |
| | | |
| | | from funasr import AutoModel |
| | | |
| | | model = AutoModel(model="/Users/zhifu/Downloads/modelscope_models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch") |
| | | model = AutoModel(model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", model_revision="v2.0.0") |
| | | |
| | | res = model(input="/Users/zhifu/Downloads/modelscope_models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/example/punc_example.txt") |
| | | res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt") |
| | | print(res) |
| | |
| | | |
| | | # download model |
| | | local_path_root=../modelscope_models |
| | | mkdir -p ${local_path_root} |
| | | local_path=${local_path_root}/punc_ct-transformer_zh-cn-common-vocab272727-pytorch |
| | | git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git ${local_path} |
| | | |
| | | model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" |
| | | model_revision="v2.0.0" |
| | | |
| | | python funasr/bin/inference.py \ |
| | | +model="${local_path}" \ |
| | | +input="${local_path}/example/punc_example.txt" \ |
| | | +model=${model} \ |
| | | +model_revision=${model_revision} \ |
| | | +input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt" \ |
| | | +output_dir="./outputs/debug" \ |
| | | +device="cpu" |
| | |
| | | |
| | | from funasr import AutoModel |
| | | |
| | | model = AutoModel(model="../modelscope_models/emotion2vec_base") |
| | | model = AutoModel(model="damo/emotion2vec_base", model_revision="v2.0.0") |
| | | |
| | | res = model(input="../modelscope_models/emotion2vec_base/example/test.wav", output_dir="./outputs") |
| | | res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", output_dir="./outputs") |
| | | print(res) |
| | |
| | | |
| | | # download model |
| | | local_path_root=../modelscope_models |
| | | mkdir -p ${local_path_root} |
| | | local_path=${local_path_root}/emotion2vec_base |
| | | git clone https://www.modelscope.cn/damo/emotion2vec_base.git ${local_path} |
| | | #local_path=/Users/zhifu/Downloads/modelscope_models/emotion2vec_base |
| | | model="damo/emotion2vec_base" |
| | | model_revision="v2.0.0" |
| | | |
| | | python funasr/bin/inference.py \ |
| | | +model="${local_path}" \ |
| | | +input="${local_path}/example/test.wav" \ |
| | | +model=${model} \ |
| | | +model_revision=${model_revision} \ |
| | | +input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \ |
| | | +output_dir="./outputs/debug" \ |
| | | +device="cpu" \ |
| | | +debug=true |
| | |
| | | |
| | | from funasr import AutoModel |
| | | |
| | | model = AutoModel(model="../modelscope_models/speech_fsmn_vad_zh-cn-16k-common-pytorch") |
| | | model = AutoModel(model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", model_revision="v2.0.0") |
| | | |
| | | res = model(input="../modelscope_models/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav") |
| | | res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav") |
| | | print(res) |
| | |
| | | |
| | | # download model |
| | | local_path_root=../modelscope_models |
| | | mkdir -p ${local_path_root} |
| | | local_path=${local_path_root}/speech_fsmn_vad_zh-cn-16k-common-pytorch |
| | | git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git ${local_path} |
| | | |
| | | model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" |
| | | model_revision="v2.0.0" |
| | | |
| | | python funasr/bin/inference.py \ |
| | | +model="${local_path}" \ |
| | | +input="${local_path}/example/vad_example.wav" \ |
| | | +model=${model} \ |
| | | +model_revision=${model_revision} \ |
| | | +input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav" \ |
| | | +output_dir="./outputs/debug" \ |
| | | +device="cpu" \ |
| | |
| | | |
| | | from funasr import AutoModel |
| | | |
| | | model = AutoModel(model="../modelscope_models/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch") |
| | | model = AutoModel(model="damo/speech_timestamp_prediction-v1-16k-offline", model_revision="v2.0.0") |
| | | |
| | | res = model(input=("../modelscope_models/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav", |
| | | res = model(input=("https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", |
| | | "欢迎大家来到魔搭社区进行体验"), |
| | | data_type=("sound", "text"), |
| | | batch_size=2, |
| | |
| | | |
| | | # download model |
| | | local_path_root=../modelscope_models |
| | | mkdir -p ${local_path_root} |
| | | local_path=${local_path_root}/speech_timestamp_prediction-v1-16k-offline |
| | | git clone https://www.modelscope.cn/damo/speech_timestamp_prediction-v1-16k-offline.git ${local_path} |
| | | |
| | | model="damo/speech_timestamp_prediction-v1-16k-offline" |
| | | model_revision="v2.0.0" |
| | | |
| | | python funasr/bin/inference.py \ |
| | | +model="${local_path}" \ |
| | | +input='["../modelscope_models/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav", "欢迎大家来到魔搭社区进行体验"]' \ |
| | | +model=${model} \ |
| | | +model_revision=${model_revision} \ |
| | | +input='["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", "欢迎大家来到魔搭社区进行体验"]' \ |
| | | +data_type='["sound", "text"]' \ |
| | | +output_dir="../outputs/debug" \ |
| | | +device="cpu" \ |
| | |
| | | |
| | | from funasr import AutoModel |
| | | |
| | | model = AutoModel(model="../modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch") |
| | | model = AutoModel(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.0") |
| | | |
| | | res = model(input="../modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav") |
| | | res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav") |
| | | print(res) |
| | | |
| | | |
| | | from funasr import AutoFrontend |
| | | |
| | | frontend = AutoFrontend(model="../modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch") |
| | | frontend = AutoFrontend(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.0") |
| | | |
| | | fbanks = frontend(input="../modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav", batch_size=2) |
| | | fbanks = frontend(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", batch_size=2) |
| | | |
| | | for batch_idx, fbank_dict in enumerate(fbanks): |
| | | res = model(**fbank_dict) |
| | |
| | | |
| | | # download model |
| | | local_path_root=../modelscope_models |
| | | mkdir -p ${local_path_root} |
| | | local_path=${local_path_root}/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch |
| | | git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git ${local_path} |
| | | |
| | | model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | model_revision="v2.0.0" |
| | | |
| | | python funasr/bin/inference.py \ |
| | | +model="${local_path}" \ |
| | | +input="${local_path}/example/asr_example.wav" \ |
| | | +model=${model} \ |
| | | +model_revision=${model_revision} \ |
| | | +input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \ |
| | | +output_dir="./outputs/debug" \ |
| | | +device="cpu" \ |
| | | |
| | |
| | | |
| | | from funasr import AutoModel |
| | | |
| | | model = AutoModel(model="../modelscope_models/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", |
| | | model = AutoModel(model="damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", |
| | | model_revision="v2.0.0", |
| | | vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", |
| | | vad_model_revision="v2.0.0", |
| | | punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", |
| | | punc_model_revision="v2.0.0", |
| | | ) |
| | | |
| | | #vad_model="../modelscope_models/speech_fsmn_vad_zh-cn-16k-common-pytorch", |
| | | #punc_model="../modelscope_models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" |
| | | |
| | | res = model(input="../modelscope_models/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav", |
| | | res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", |
| | | hotword='达摩院 魔搭') |
| | | print(res) |
| | |
| | | |
| | | # download model |
| | | local_path_root=../modelscope_models |
| | | mkdir -p ${local_path_root} |
| | | |
| | | local_path=${local_path_root}/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch |
| | | git clone https://www.modelscope.cn/damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git ${local_path} |
| | | |
| | | #local_path_vad=${local_path_root}/speech_fsmn_vad_zh-cn-16k-common-pytorch |
| | | #git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git ${local_path_vad} |
| | | |
| | | #local_path_punc=${local_path_root}/punc_ct-transformer_zh-cn-common-vocab272727-pytorch |
| | | #git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git ${local_path_punc} |
| | | model="damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | model_revision="v2.0.0" |
| | | vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" |
| | | vad_model_revision="v2.0.0" |
| | | punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" |
| | | punc_model_revision="v2.0.0" |
| | | |
| | | python funasr/bin/inference.py \ |
| | | +model="${local_path}" \ |
| | | +input="${local_path}/example/asr_example.wav" \ |
| | | +model=${model} \ |
| | | +model_revision=${model_revision} \ |
| | | +vad_model=${vad_model} \ |
| | | +vad_model_revision=${vad_model_revision} \ |
| | | +punc_model=${punc_model} \ |
| | | +punc_model_revision=${punc_model_revision} \ |
| | | +input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \ |
| | | +output_dir="./outputs/debug" \ |
| | | +device="cpu" \ |
| | | +"hotword='达摩院 魔搭'" |
| | | |
| | | #+vad_model="${local_path_vad}" \ |
| | | #+punc_model="${local_path_punc}" \ |
| | |
| | | from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank |
| | | from funasr.utils.vad_utils import slice_padding_audio_samples |
| | | from funasr.utils.timestamp_tools import time_stamp_sentence |
| | | from funasr.download.file import download_from_url |
| | | |
| | | def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None): |
| | | """ |
| | |
| | | filelist = [".scp", ".txt", ".json", ".jsonl"] |
| | | |
| | | chars = string.ascii_letters + string.digits |
| | | |
| | | if isinstance(data_in, str) and data_in.startswith('http'): # url |
| | | data_in = download_from_url(data_in) |
| | | if isinstance(data_in, str) and os.path.exists(data_in): # wav_path; filelist: wav.scp, file.jsonl;text.txt; |
| | | _, file_extension = os.path.splitext(data_in) |
| | | file_extension = file_extension.lower() |
| | |
| | | data_list = [data_in] |
| | | key_list = [key] |
| | | elif isinstance(data_in, (list, tuple)): |
| | | if data_type is not None and isinstance(data_type, (list, tuple)): |
| | | if data_type is not None and isinstance(data_type, (list, tuple)): # multiple inputs |
| | | data_list_tmp = [] |
| | | for data_in_i, data_type_i in zip(data_in, data_type): |
| | | key_list, data_list_i = prepare_data_iterator(data_in=data_in_i, data_type=data_type_i) |
| | |
| | | for item in zip(*data_list_tmp): |
| | | data_list.append(item) |
| | | else: |
| | | # [audio sample point, fbank] |
| | | # [audio sample point, fbank, text] |
| | | data_list = data_in |
| | | key_list = ["rand_key_" + ''.join(random.choice(chars) for _ in range(13)) for _ in range(len(data_in))] |
| | | else: # raw text; audio sample point, fbank; bytes |
| | |
| | | kwargs.update(cfg) |
| | | model = self.model if model is None else model |
| | | |
| | | data_type = kwargs.get("data_type", "sound") |
| | | batch_size = kwargs.get("batch_size", 1) |
| | | # if kwargs.get("device", "cpu") == "cpu": |
| | | # batch_size = 1 |
| | | |
| | | key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=data_type, key=key) |
| | | key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key) |
| | | |
| | | speed_stats = {} |
| | | asr_result_list = [] |
| | |
| | | batch_size = int(kwargs.get("batch_size_s", 300))*1000 |
| | | batch_size_threshold_ms = int(kwargs.get("batch_size_threshold_s", 60))*1000 |
| | | kwargs["batch_size"] = batch_size |
| | | data_type = kwargs.get("data_type", "sound") |
| | | key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=data_type) |
| | | |
| | | key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=kwargs.get("data_type", None)) |
| | | results_ret_list = [] |
| | | time_speech_total_all_samples = 0.0 |
| | | |
| | |
| | | |
| | | config = os.path.join(model_or_path, "config.yaml") |
| | | if os.path.exists(config) and os.path.exists(os.path.join(model_or_path, "model.pb")): |
| | | # config = os.path.join(model_or_path, "config.yaml") |
| | | # assert os.path.exists(config), "{} is not exist!".format(config) |
| | | cfg = OmegaConf.load(config) |
| | | kwargs = OmegaConf.merge(cfg, kwargs) |
| | | init_param = os.path.join(model_or_path, "model.pb") |
| | |
| | | assert os.path.exists(os.path.join(model_or_path, "configuration.json")) |
| | | with open(os.path.join(model_or_path, "configuration.json"), 'r', encoding='utf-8') as f: |
| | | conf_json = json.load(f) |
| | | config = os.path.join(model_or_path, conf_json["model"]["model_config"]) |
| | | config = os.path.join(model_or_path, conf_json["model_config"]) |
| | | cfg = OmegaConf.load(config) |
| | | kwargs = OmegaConf.merge(cfg, kwargs) |
| | | init_param = os.path.join(model_or_path, conf_json["model"]["model_name"]) |
| | | init_param = os.path.join(model_or_path, conf_json["model_file"]) |
| | | kwargs["init_param"] = init_param |
| | | kwargs["model"] = cfg["model"] |
| | | return OmegaConf.to_container(kwargs, resolve=True) |
| | |
| | | from typing import Generator, Union |
| | | |
| | | import requests |
| | | from urllib.parse import urlparse |
| | | |
def download_from_url(url):
    """Download ``url`` into a fresh temporary directory and return the file path.

    The payload is fetched with ``HTTPStorage().read(url)`` and written in
    binary mode to a file named after the last component of the URL path.

    Args:
        url: Address of the resource; it must carry a scheme (e.g. ``https://``).

    Returns:
        Path of the downloaded file. The directory that holds it is not removed
        automatically; cleaning it up is left to the caller.

    Raises:
        AssertionError: If ``url`` has no scheme, so nothing can be downloaded.
    """
    result = urlparse(url)
    if not result.scheme:
        # Same exception type and message as the original ``assert``, but this
        # check is not stripped when Python runs with ``-O``.
        raise AssertionError(f"failed to download: {url}")

    # bytes
    data = HTTPStorage().read(url)

    # mkdtemp() creates a directory that persists. The earlier
    # ``TemporaryDirectory().name`` returned a path whose directory is deleted
    # once the unreferenced object is finalized, so it had to be re-created.
    work_dir = tempfile.mkdtemp()

    # Take the name from the URL *path* so that a query string or fragment does
    # not end up in the file name (callers inspect the file extension).
    file_name = os.path.basename(result.path) or os.path.basename(url)
    file_path = os.path.join(work_dir, file_name)
    with open(file_path, 'wb') as fb:
        fb.write(data)
    return file_path
| | | |
| | | class Storage(metaclass=ABCMeta): |
| | | """Abstract class of storage. |
| | |
| | | import torch |
| | | import torch.nn as nn |
| | | from funasr.models.ct_transformer.utils import split_to_mini_sentence, split_words |
| | | from funasr.utils.load_utils import load_audio_text_image_video |
| | | |
| | | from funasr.register import tables |
| | | |
| | |
| | | **kwargs, |
| | | ): |
| | | assert len(data_in) == 1 |
| | | |
| | | text = load_audio_text_image_video(data_in, data_type=kwargs.get("kwargs", "text"))[0] |
| | | vad_indexes = kwargs.get("vad_indexes", None) |
| | | text = data_in[0] |
| | | text_lengths = data_lengths[0] if data_lengths is not None else None |
| | | # text = data_in[0] |
| | | # text_lengths = data_lengths[0] if data_lengths is not None else None |
| | | split_size = kwargs.get("split_size", 20) |
| | | |
| | | tokens = split_words(text) |
| | |
| | | result_i = {"key": key[i], "text": text_postprocessed, |
| | | "timestamp": time_stamp_postprocessed, |
| | | } |
| | | results.append(result_i) |
| | | |
| | | if ibest_writer: |
| | | # ibest_writer["token"][key[i]] = " ".join(token) |
| | | ibest_writer["timestamp_list"][key[i]] = time_stamp_postprocessed |
| | | ibest_writer["timestamp_str"][key[i]] = timestamp_str |
| | | results.append(result_i) |
| | | |
| | | return results, meta_data |
| | |
| | | import logging |
| | | from torch.nn.utils.rnn import pad_sequence |
| | | try: |
| | | from urllib.parse import urlparse |
| | | from funasr.download.file import HTTPStorage |
| | | import tempfile |
| | | from funasr.download.file import download_from_url |
| | | except: |
| | | print("urllib is not installed, if you infer from url, please install it first.") |
| | | # def load_audio(data_or_path_or_list, fs: int=16000, audio_fs: int=16000): |
| | | # |
| | | # if isinstance(data_or_path_or_list, (list, tuple)): |
| | | # return [load_audio(audio, fs=fs, audio_fs=audio_fs) for audio in data_or_path_or_list] |
| | | # |
| | | # if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): |
| | | # data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) |
| | | # data_or_path_or_list = data_or_path_or_list[0, :] |
| | | # elif isinstance(data_or_path_or_list, np.ndarray): # audio sample point |
| | | # data_or_path_or_list = np.squeeze(data_or_path_or_list) #[n_samples,] |
| | | # |
| | | # if audio_fs != fs: |
| | | # resampler = torchaudio.transforms.Resample(audio_fs, fs) |
| | | # data_or_path_or_list = resampler(data_or_path_or_list[None, :])[0, :] |
| | | # return data_or_path_or_list |
| | | |
| | | |
| | | def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: int = 16000, data_type=None, tokenizer=None): |
| | | |
| | | def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: int = 16000, data_type="sound", tokenizer=None): |
| | | if isinstance(data_or_path_or_list, (list, tuple)): |
| | | if data_type is not None and isinstance(data_type, (list, tuple)): |
| | | |
| | |
| | | |
| | | return data_or_path_or_list_ret |
| | | else: |
| | | return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs) for audio in data_or_path_or_list] |
| | | return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs, data_type=data_type) for audio in data_or_path_or_list] |
| | | if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith('http'): |
| | | data_or_path_or_list = download_from_url(data_or_path_or_list) |
| | | if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): |
| | | if data_type is None or data_type == "sound": |
| | | data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list) |
| | | data_or_path_or_list = data_or_path_or_list[0, :] |
| | | # elif data_type == "text" and tokenizer is not None: |
| | | # data_or_path_or_list = tokenizer.encode(data_or_path_or_list) |
| | | elif isinstance(data_or_path_or_list, str) and data_type == "text" and tokenizer is not None: |
| | | data_or_path_or_list = tokenizer.encode(data_or_path_or_list) |
| | | elif isinstance(data_or_path_or_list, np.ndarray): # audio sample point |
| | | data_or_path_or_list = np.squeeze(data_or_path_or_list) # [n_samples,] |
| | | elif isinstance(data_or_path_or_list, str) and data_type is not None and data_type == "text" and tokenizer is not None: |
| | | data_or_path_or_list = tokenizer.encode(data_or_path_or_list) |
| | | else: |
| | | pass |
| | | # print(f"unsupport data type: {data_or_path_or_list}, return raw data") |
| | | |
| | | if audio_fs != fs and data_type != "text": |
| | | resampler = torchaudio.transforms.Resample(audio_fs, fs) |
| | |
| | | data_len = torch.tensor([data_len]) |
| | | return data.to(torch.float32), data_len.to(torch.int32) |
| | | |
def download_from_url(url):
    """Download ``url`` into a fresh temporary directory and return the file path.

    The payload is fetched with ``HTTPStorage().read(url)`` and written in
    binary mode to a file named after the last component of the URL path.

    Args:
        url: Address of the resource; it must carry a scheme (e.g. ``https://``).

    Returns:
        Path of the downloaded file. The directory that holds it is not removed
        automatically; cleaning it up is left to the caller.

    Raises:
        AssertionError: If ``url`` has no scheme, so nothing can be downloaded.
    """
    result = urlparse(url)
    if not result.scheme:
        # Same exception type and message as the original ``assert``, but this
        # check is not stripped when Python runs with ``-O``.
        raise AssertionError(f"failed to download: {url}")

    # bytes
    data = HTTPStorage().read(url)

    # mkdtemp() creates a directory that persists. The earlier
    # ``TemporaryDirectory().name`` returned a path whose directory is deleted
    # once the unreferenced object is finalized, so it had to be re-created.
    work_dir = tempfile.mkdtemp()

    # Take the name from the URL *path* so that a query string or fragment does
    # not end up in the file name (callers inspect the file extension).
    file_name = os.path.basename(result.path) or os.path.basename(url)
    file_path = os.path.join(work_dir, file_name)
    with open(file_path, 'wb') as fb:
        fb.write(data)
    return file_path