python/FunASR-XL.git

			@@ -1,5 +1,9 @@
			# Quick Start

			> Note:
			> The modelscope pipeline supports inference and fine-tuning for all the models in the [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope). Here we take typical models as examples to demonstrate the usage.


			## Inference with pipeline

			### Speech Recognition
			@@ -15,6 +19,7 @@

			rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
			print(rec_result)
			# {'text': '欢迎大家来体验达摩院推出的语音识别模型'}
			```

			### Voice Activity Detection
			@@ -34,6 +39,7 @@

			segments_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav')
			print(segments_result)
			# {'text': [[70, 2340], [2620, 6200], [6480, 23670], [23950, 26250], [26780, 28990], [29950, 31430], [31750, 37600], [38210, 46900], [47310, 49630], [49910, 56460], [56740, 59540], [59820, 70450]]}
			```

			### Punctuation Restoration
			@@ -49,6 +55,7 @@

			rec_result = inference_pipeline(text_in='我们都是木头人不会讲话不会动')
			print(rec_result)
			# {'text': '我们都是木头人，不会讲话，不会动。'}
			```

			### Timestamp Prediction
			@@ -65,6 +72,7 @@
			audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav',
			text_in='一个东太平洋国家为什么跑到西太平洋来了呢',)
			print(rec_result)
			# {'text': '<sil> 0.000 0.380;一 0.380 0.560;个 0.560 0.800;东 0.800 0.980;太 0.980 1.140;平 1.140 1.260;洋 1.260 1.440;国 1.440 1.680;家 1.680 1.920;<sil> 1.920 2.040;为 2.040 2.200;什 2.200 2.320;么 2.320 2.500;跑 2.500 2.680;到 2.680 2.860;西 2.860 3.040;太 3.040 3.200;平 3.200 3.380;洋 3.380 3.500;来 3.500 3.640;了 3.640 3.800;呢 3.800 4.150;<sil> 4.150 4.440;', 'timestamp': [[380, 560], [560, 800], [800, 980], [980, 1140], [1140, 1260], [1260, 1440], [1440, 1680], [1680, 1920], [2040, 2200], [2200, 2320], [2320, 2500], [2500, 2680], [2680, 2860], [2860, 3040], [3040, 3200], [3200, 3380], [3380, 3500], [3500, 3640], [3640, 3800], [3800, 4150]]}
			```

			### Speaker Verification
			@@ -85,6 +93,7 @@
			# speaker verification
			rec_result = inference_sv_pipline(audio_in=('https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav','https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_same.wav'))
			print(rec_result["scores"][0])
			# 0.8540499500025098
			```

			### Speaker Diarization
			@@ -99,8 +108,9 @@
			task=Tasks.speaker_diarization,
			diar_model_config="sond.yaml",
			model='damo/speech_diarization_sond-en-us-callhome-8k-n16k4-pytorch',
			model_revision="v1.0.3",
			sv_model="damo/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch",
			sv_model_revision="master",
			sv_model_revision="v1.0.0",
			)

			audio_list=[
			@@ -112,6 +122,7 @@

			results = inference_diar_pipline(audio_in=audio_list)
			print(results)
			# {'text': 'spk1 [(0.8, 1.84), (2.8, 6.16), (7.04, 10.64), (12.08, 12.8), (14.24, 15.6)]\nspk2 [(0.0, 1.12), (1.68, 3.2), (4.48, 7.12), (8.48, 9.04), (10.56, 14.48), (15.44, 16.0)]'}
			```

			### FAQ
			@@ -196,6 +207,13 @@
			```shell
			python finetune.py &> log.txt &
			```
			To check the training progress, run `tail log.txt`; the output looks like this:
			```
			[bach-gpu011024008134] 2023-04-23 18:59:13,976 (e2e_asr_paraformer:467) INFO: enable sampler in paraformer, sampling_ratio: 0.75
			[bach-gpu011024008134] 2023-04-23 18:59:48,924 (trainer:777) INFO: 2epoch:train:1-50batch:50num_updates: iter_time=0.008, forward_time=0.302, loss_att=0.186, acc=0.942, loss_pre=0.005, loss=0.192, backward_time=0.231, optim_step_time=0.117, optim0_lr0=7.484e-06, train_time=0.753
			[bach-gpu011024008134] 2023-04-23 19:00:23,869 (trainer:777) INFO: 2epoch:train:51-100batch:100num_updates: iter_time=1.152e-04, forward_time=0.275, loss_att=0.184, acc=0.945, loss_pre=0.005, loss=0.189, backward_time=0.234, optim_step_time=0.117, optim0_lr0=7.567e-06, train_time=0.699
			[bach-gpu011024008134] 2023-04-23 19:00:58,463 (trainer:777) INFO: 2epoch:train:101-150batch:150num_updates: iter_time=1.123e-04, forward_time=0.271, loss_att=0.204, acc=0.942, loss_pre=0.005, loss=0.210, backward_time=0.231, optim_step_time=0.116, optim0_lr0=7.651e-06, train_time=0.692
			```

			### FAQ
			### Multi-GPU training and distributed training