egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/demo.py
@@ -3,7 +3,7 @@
 
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
-    output_dir = None
+    output_dir = "./results"
     inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch",

egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/demo.py
@@ -3,7 +3,7 @@
 
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
-    output_dir = None
+    output_dir = "./results"
     inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch",

egs_modelscope/asr/paraformerbert/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
@@ -3,7 +3,7 @@
 
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
-    output_dir = None
+    output_dir = "./results"
     inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch",

egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online/infer.py
@@ -3,7 +3,7 @@
 
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
-    output_dir = None
+    output_dir = "./results"
     inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online",

egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online/infer.py
@@ -3,7 +3,7 @@
 
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
-    output_dir = None
+    output_dir = "./results"
     inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online",

egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online/infer.py
@@ -3,7 +3,7 @@
 
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
-    output_dir = None
+    output_dir = "./results"
     inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online",
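
Each of the hunks above makes the same two-line edit: the recipe demos used to discard their recognition results (output_dir = None) and now write them under ./results. For context, a minimal sketch of the full script that each seven-line hunk window comes from; the two modelscope imports and the closing lines are assumed from standard ModelScope pipeline usage, since the hunks do not show them:

    # Minimal sketch of one patched demo script (paraformer AISHELL-1 recipe).
    # The imports and the last two lines are assumed from standard ModelScope
    # pipeline usage; only the middle lines appear in the hunks above.
    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    if __name__ == '__main__':
        audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
        output_dir = "./results"  # was None: results were printed but not saved
        inference_pipeline = pipeline(
            task=Tasks.auto_speech_recognition,
            model="damo/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch",
            output_dir=output_dir,
        )
        rec_result = inference_pipeline(audio_in=audio_in)
        print(rec_result)
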

egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py
@@ -3,15 +3,14 @@
 
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav'
-    output_dir = None
+    output_dir = "./results"
     inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
         vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
         punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
         output_dir=output_dir,
-        batch_size=64,
     )
-    rec_result = inference_pipeline(audio_in=audio_in)
+    rec_result = inference_pipeline(audio_in=audio_in, batch_size_token=5000)
     print(rec_result)
 
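Note: the call above now passes batch_size_token=5000 in place of the removed batch_size=64. Per the asr_inference_launch.py change below, this is an audio budget rather than a segment count: batch_size_token * 60 = 5000 * 60 = 300,000 ms, so each ASR forward pass takes up to five minutes of speech, however many VAD segments that spans.
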

egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo.py
@@ -3,7 +3,7 @@
 
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav'
-    output_dir = None
+    output_dir = "./results"
     inference_pipeline = pipeline(
         task=Tasks.voice_activity_detection,
         model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",

egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo_online.py
@@ -7,7 +7,7 @@
 import soundfile
 
 if __name__ == '__main__':
-    output_dir = None
+    output_dir = "./results"
     inference_pipeline = pipeline(
         task=Tasks.voice_activity_detection,
         model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",

egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo.py
@@ -3,7 +3,7 @@
 
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example_8k.wav'
-    output_dir = None
+    output_dir = "./results"
     inference_pipeline = pipeline(
         task=Tasks.voice_activity_detection,
         model="damo/speech_fsmn_vad_zh-cn-8k-common",

egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo_online.py
@@ -7,7 +7,7 @@
 import soundfile
 
 if __name__ == '__main__':
-    output_dir = None
+    output_dir = "./results"
     inference_pipeline = pipeline(
         task=Tasks.voice_activity_detection,
         model="damo/speech_fsmn_vad_zh-cn-8k-common",

funasr/bin/asr_inference_launch.py
@@ -600,6 +600,9 @@
 
     if 'hotword' in kwargs:
         hotword_list_or_file = kwargs['hotword']
 
+    batch_size_token = kwargs.get("batch_size_token", 6000)
+    print("batch_size_token: ", batch_size_token)
+
     if speech2text.hotword_list is None:
         speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file)
@@ -642,8 +645,10 @@
 
         assert all(isinstance(s, str) for s in keys), keys
         _bs = len(next(iter(batch.values())))
        assert len(keys) == _bs, f"{len(keys)} != {_bs}"
-
+        beg_vad = time.time()
         vad_results = speech2vadsegment(**batch)
+        end_vad = time.time()
+        print("time cost vad: ", end_vad-beg_vad)
         _, vadsegments = vad_results[0], vad_results[1][0]
         speech, speech_lengths = batch["speech"], batch["speech_lengths"]
@@ -652,17 +657,29 @@
 
         data_with_index = [(vadsegments[i], i) for i in range(n)]
         sorted_data = sorted(data_with_index, key=lambda x: x[0][1] - x[0][0])
         results_sorted = []
-        for j, beg_idx in enumerate(range(0, n, batch_size)):
-            end_idx = min(n, beg_idx + batch_size)
+        batch_size_token_ms = batch_size_token*60
+        batch_size_token_ms_cum = 0
+        beg_idx = 0
+        for j, _ in enumerate(range(0, n)):
+            batch_size_token_ms_cum += (sorted_data[j][0][1] - sorted_data[j][0][0])
+            if j < n-1 and (batch_size_token_ms_cum + sorted_data[j+1][0][1] - sorted_data[j+1][0][0])<batch_size_token_ms:
+                continue
+            batch_size_token_ms_cum = 0
+            end_idx = j + 1
             speech_j, speech_lengths_j = slice_padding_fbank(speech, speech_lengths, sorted_data[beg_idx:end_idx])
+            beg_idx = end_idx
             batch = {"speech": speech_j, "speech_lengths": speech_lengths_j}
             batch = to_device(batch, device=device)
+            print("batch: ", speech_j.shape[0])
+            beg_asr = time.time()
             results = speech2text(**batch)
+            end_asr = time.time()
+            print("time cost asr: ", end_asr - beg_asr)
             if len(results) < 1:
                 results = [["", [], [], [], [], [], []]]
             results_sorted.extend(results)
 
         restored_data = [0] * n
         for j in range(n):
             index = sorted_data[j][1]
@@ -701,7 +718,10 @@
             text_postprocessed_punc = text_postprocessed
             punc_id_list = []
             if len(word_lists) > 0 and text2punc is not None:
+                beg_punc = time.time()
                 text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20)
+                end_punc = time.time()
+                print("time cost punc: ", end_punc-beg_punc)
 
             item = {'key': key, 'value': text_postprocessed_punc}
             if text_postprocessed != "":
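
The batching change above is the heart of this diff: instead of taking a fixed batch_size of VAD segments per ASR step, segments are sorted by duration and greedily packed until the next one would push the batch past batch_size_token * 60 ms of audio (with the default batch_size_token=6000, that is 360,000 ms, i.e. six minutes of speech per forward pass). A standalone sketch of that packing logic, using illustrative names rather than the helpers in asr_inference_launch.py:

    # Standalone sketch of the duration-aware batching introduced above.
    # `segments` holds (start_ms, end_ms) pairs as produced by VAD; the function
    # returns batches of indices into the duration-sorted list, mirroring the
    # beg_idx/end_idx bookkeeping in the diff. All names here are illustrative.
    def pack_by_duration(segments, batch_size_token=6000):
        budget_ms = batch_size_token * 60  # same heuristic as the diff
        # Sorting by duration keeps each batch's segments similar in length,
        # so padding waste after feature slicing stays low.
        order = sorted(range(len(segments)), key=lambda i: segments[i][1] - segments[i][0])
        batches, cur, cur_ms = [], [], 0
        for i in order:
            dur = segments[i][1] - segments[i][0]
            if cur and cur_ms + dur > budget_ms:
                batches.append(cur)  # flush before overflowing the budget
                cur, cur_ms = [], 0
            cur.append(i)
            cur_ms += dur
        if cur:
            batches.append(cur)
        return batches

    # Three segments of 1 s, 2 s and 4 min fit one 360 s budget:
    print(pack_by_duration([(0, 1000), (1500, 3500), (4000, 244000)]))  # [[0, 1, 2]]
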

funasr/bin/build_trainer.py
New file
@@ -0,0 +1,145 @@
+import os
+import yaml
+
+
+def update_dct(fin_configs, root):
+    if root == {}:
+        return {}
+    for root_key, root_value in root.items():
+        if not isinstance(root[root_key], dict):
+            fin_configs[root_key] = root[root_key]
+        else:
+            if root_key in fin_configs.keys():
+                result = update_dct(fin_configs[root_key], root[root_key])
+                fin_configs[root_key] = result
+            else:
+                fin_configs[root_key] = root[root_key]
+    return fin_configs
+
+
+def parse_args(mode):
+    if mode == "asr":
+        from funasr.tasks.asr import ASRTask as ASRTask
+    elif mode == "paraformer":
+        from funasr.tasks.asr import ASRTaskParaformer as ASRTask
+    elif mode == "paraformer_vad_punc":
+        from funasr.tasks.asr import ASRTaskParaformer as ASRTask
+    elif mode == "uniasr":
+        from funasr.tasks.asr import ASRTaskUniASR as ASRTask
+    elif mode == "mfcca":
+        from funasr.tasks.asr import ASRTaskMFCCA as ASRTask
+    elif mode == "tp":
+        from funasr.tasks.asr import ASRTaskAligner as ASRTask
+    else:
+        raise ValueError("Unknown mode: {}".format(mode))
+    parser = ASRTask.get_parser()
+    args = parser.parse_args()
+    return args, ASRTask
+
+
+def build_trainer(modelscope_dict,
+                  data_dir,
+                  output_dir,
+                  train_set="train",
+                  dev_set="validation",
+                  distributed=False,
+                  dataset_type="small",
+                  batch_bins=None,
+                  max_epoch=None,
+                  optim=None,
+                  lr=None,
+                  scheduler=None,
+                  scheduler_conf=None,
+                  specaug=None,
+                  specaug_conf=None,
+                  param_dict=None,
+                  **kwargs):
+    mode = modelscope_dict['mode']
+    args, ASRTask = parse_args(mode=mode)
+
+    # ddp related
+    if args.local_rank is not None:
+        distributed = True
+    else:
+        distributed = False
+    args.local_rank = args.local_rank if args.local_rank is not None else 0
+    local_rank = args.local_rank
+    if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
+        gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
+        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[args.local_rank])
+    else:
+        os.environ['CUDA_VISIBLE_DEVICES'] = str(args.local_rank)
+
+    config = modelscope_dict['am_model_config']
+    finetune_config = modelscope_dict['finetune_config']
+    init_param = modelscope_dict['init_model']
+    cmvn_file = modelscope_dict['cmvn_file']
+    seg_dict_file = modelscope_dict['seg_dict']
+
+    # overwrite parameters
+    with open(config) as f:
+        configs = yaml.safe_load(f)
+    with open(finetune_config) as f:
+        finetune_configs = yaml.safe_load(f)
+    # set data_types
+    if dataset_type == "large":
+        finetune_configs["dataset_conf"]["data_types"] = "sound,text"
+    finetune_configs = update_dct(configs, finetune_configs)
+    for key, value in finetune_configs.items():
+        if hasattr(args, key):
+            setattr(args, key, value)
+
+    # prepare data
+    args.dataset_type = dataset_type
+    if args.dataset_type == "small":
+        args.train_data_path_and_name_and_type = [["{}/{}/wav.scp".format(data_dir, train_set), "speech", "sound"],
+                                                  ["{}/{}/text".format(data_dir, train_set), "text", "text"]]
+        args.valid_data_path_and_name_and_type = [["{}/{}/wav.scp".format(data_dir, dev_set), "speech", "sound"],
+                                                  ["{}/{}/text".format(data_dir, dev_set), "text", "text"]]
+    elif args.dataset_type == "large":
+        args.train_data_file = None
+        args.valid_data_file = None
+    else:
+        raise ValueError(f"Not supported dataset_type={args.dataset_type}")
+    args.init_param = [init_param]
+    args.cmvn_file = cmvn_file
+    if os.path.exists(seg_dict_file):
+        args.seg_dict_file = seg_dict_file
+    else:
+        args.seg_dict_file = None
+    args.data_dir = data_dir
+    args.train_set = train_set
+    args.dev_set = dev_set
+    args.output_dir = output_dir
+    args.gpu_id = args.local_rank
+    args.config = finetune_config
+    if optim is not None:
+        args.optim = optim
+    if lr is not None:
+        args.optim_conf["lr"] = lr
+    if scheduler is not None:
+        args.scheduler = scheduler
+    if scheduler_conf is not None:
+        args.scheduler_conf = scheduler_conf
+    if specaug is not None:
+        args.specaug = specaug
+    if specaug_conf is not None:
+        args.specaug_conf = specaug_conf
+    if max_epoch is not None:
+        args.max_epoch = max_epoch
+    if batch_bins is not None:
+        if args.dataset_type == "small":
+            args.batch_bins = batch_bins
+        elif args.dataset_type == "large":
+            args.dataset_conf["batch_conf"]["batch_size"] = batch_bins
+        else:
+            raise ValueError(f"Not supported dataset_type={args.dataset_type}")
+    if args.normalize in ["null", "none", "None"]:
+        args.normalize = None
+    if args.patience in ["null", "none", "None"]:
+        args.patience = None
+    args.local_rank = local_rank
+    args.distributed = distributed
+    ASRTask.finetune_args = args
+
+    return ASRTask
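
The new funasr/bin/build_trainer.py glues a ModelScope-style model description to FunASR's training tasks: parse_args picks the Task class matching the model's mode, update_dct recursively overlays the finetune config onto the base acoustic-model config, command-line flags from ASRTask.get_parser() (e.g. --local_rank for DDP) are merged in, and the configured class is returned with everything attached as ASRTask.finetune_args. A hedged usage sketch; every path in modelscope_dict below is hypothetical (in practice these entries point into a model directory downloaded from ModelScope):

    # Hedged usage sketch for build_trainer. All paths are hypothetical; in
    # practice modelscope_dict is assembled from a downloaded ModelScope model
    # directory. Note that build_trainer calls parser.parse_args(), so it also
    # reads flags from sys.argv (e.g. --local_rank when launched under DDP).
    from funasr.bin.build_trainer import build_trainer

    modelscope_dict = {
        "mode": "paraformer",                                 # selects ASRTaskParaformer
        "am_model_config": "model_dir/config.yaml",           # base training config
        "finetune_config": "model_dir/finetune_config.yaml",  # overrides, merged by update_dct
        "init_model": "model_dir/model.pb",                   # checkpoint to warm-start from
        "cmvn_file": "model_dir/am.mvn",
        "seg_dict": "model_dir/seg_dict",
    }

    ASRTask = build_trainer(
        modelscope_dict,
        data_dir="data",           # expects data/{train,validation}/wav.scp and .../text
        output_dir="exp/finetune",
        dataset_type="small",
        lr=5e-5,                   # overrides optim_conf["lr"]
        max_epoch=20,
    )
    # The returned Task class carries the merged arguments as
    # ASRTask.finetune_args, ready for the training entry point.
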
funasr/version.txt
@@ -1 +1 @@
-0.5.4
+0.5.5