1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/FunAudioLLM/SenseVoice). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
|
| from funasr import AutoModel
| from funasr.utils.postprocess_utils import rich_transcription_postprocess
|
# Identifier of the checkpoint handed to AutoModel as its `model` argument.
model_dir = "iic/SenseVoiceSmall"


# Build the recogniser together with the "fsmn-vad" voice-activity-detection
# model. The VAD receives `max_single_segment_time=30000` (presumably
# milliseconds -- confirm against the FunASR docs), and the model is requested
# on the "cuda:0" device.
model = AutoModel(
    model=model_dir,
    vad_model="fsmn-vad",
    vad_kwargs=dict(max_single_segment_time=30000),
    device="cuda:0",
)
|
# Transcribe the example clips that ship inside the model directory
# (`<model.model_path>/example/<name>.mp3`), one after another.
#
# Every run uses identical decoding options. The two "with timestamp" runs
# additionally pass `output_timestamp=True` and print the raw result list
# before the post-processed text. Keeping the runs in a table (instead of one
# copy-pasted stanza per clip) means an option only has to be changed once.
#
# Each entry is (example clip name, request timestamps?); the order is the
# order in which the clips are processed and printed.
example_runs = (
    ("en", False),
    ("en", True),
    ("zh", False),
    ("zh", True),
    ("yue", False),
    ("ja", False),
    ("ko", False),
)

for clip_name, with_timestamp in example_runs:
    # The dict is rebuilt on every iteration so each call gets its own fresh
    # `cache` object, exactly as a literal `cache={}` per call would.
    generate_kwargs = dict(
        input=f"{model.model_path}/example/{clip_name}.mp3",
        cache={},
        language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
        use_itn=True,
        batch_size_s=60,
        merge_vad=True,
        merge_length_s=15,
    )
    # Only add the keyword when timestamps are wanted; the plain runs must not
    # pass `output_timestamp` at all, so they keep the library's default.
    if with_timestamp:
        generate_kwargs["output_timestamp"] = True

    res = model.generate(**generate_kwargs)
    if with_timestamp:
        # Show the raw result so the timestamp fields are visible.
        print(res)

    # The first result's "text" field is cleaned up for display.
    text = rich_transcription_postprocess(res[0]["text"])
    print(text)
|
|