From c2575f022df4d125c9bd1e2b25142417c0a277b5 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期一, 22 七月 2024 17:11:44 +0800
Subject: [PATCH] docs
---
examples/README.md | 9 +++++++++
examples/README_zh.md | 9 +++++++++
README_zh.md | 1 +
examples/industrial_data_pretraining/sense_voice/demo.py | 22 ++++------------------
README.md | 1 +
5 files changed, 24 insertions(+), 18 deletions(-)
diff --git a/README.md b/README.md
index 4374a2f..747b69b 100644
--- a/README.md
+++ b/README.md
@@ -163,6 +163,7 @@
- `use_itn`: Whether the output result includes punctuation and inverse text normalization.
- `batch_size_s`: Indicates the use of dynamic batching, where the total duration of audio in the batch is measured in seconds (s).
- `merge_vad`: Whether to merge short audio fragments segmented by the VAD model, with the merged length being `merge_length_s`, in seconds (s).
+- `ban_emo_unk`: Whether to ban the output of the `emo_unk` token.
#### Paraformer
```python
diff --git a/README_zh.md b/README_zh.md
index bd6d1cd..581bb4d 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -162,6 +162,7 @@
- `use_itn`：输出结果中是否包含标点与逆文本正则化。
- `batch_size_s` 表示采用动态batch，batch中总音频时长，单位为秒s。
- `merge_vad`：是否将 vad 模型切割的短音频碎片合成，合并后长度为`merge_length_s`，单位为秒s。
+- `ban_emo_unk`：禁用emo_unk标签，禁用后所有的句子都会被赋与情感标签。
#### Paraformer
```python
diff --git a/examples/README.md b/examples/README.md
index fe9a0ed..802b1a4 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -96,6 +96,15 @@
text = rich_transcription_postprocess(res[0]["text"])
print(text)
```
+Notes:
+- `model_dir`: The name of the model, or the path to the model on the local disk.
+- `vad_model`: This indicates the activation of VAD (Voice Activity Detection). The purpose of VAD is to split long audio into shorter clips. In this case, the inference time includes both VAD and SenseVoice total consumption, and represents the end-to-end latency. If you wish to test the SenseVoice model's inference time separately, the VAD model can be disabled.
+- `vad_kwargs`: Specifies the configurations for the VAD model. `max_single_segment_time`: denotes the maximum duration for audio segmentation by the `vad_model`, with the unit being milliseconds (ms).
+- `use_itn`: Whether the output result includes punctuation and inverse text normalization.
+- `batch_size_s`: Indicates the use of dynamic batching, where the total duration of audio in the batch is measured in seconds (s).
+- `merge_vad`: Whether to merge short audio fragments segmented by the VAD model, with the merged length being `merge_length_s`, in seconds (s).
+- `ban_emo_unk`: Whether to ban the output of the `emo_unk` token.
+
##### Paraformer
```python
from funasr import AutoModel
diff --git a/examples/README_zh.md b/examples/README_zh.md
index f95ee64..550b7f4 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -97,6 +97,15 @@
text = rich_transcription_postprocess(res[0]["text"])
print(text)
```
+参数说明：
+- `model_dir`：模型名称，或本地磁盘中的模型路径。
+- `vad_model`：表示开启VAD，VAD的作用是将长音频切割成短音频，此时推理耗时包括了VAD与SenseVoice总耗时，为链路耗时，如果需要单独测试SenseVoice模型耗时，可以关闭VAD模型。
+- `vad_kwargs`：表示VAD模型配置,`max_single_segment_time`: 表示`vad_model`最大切割音频时长, 单位是毫秒ms。
+- `use_itn`：输出结果中是否包含标点与逆文本正则化。
+- `batch_size_s` 表示采用动态batch，batch中总音频时长，单位为秒s。
+- `merge_vad`：是否将 vad 模型切割的短音频碎片合成，合并后长度为`merge_length_s`，单位为秒s。
+- `ban_emo_unk`：禁用emo_unk标签，禁用后所有的句子都会被赋与情感标签。
+
##### Paraformer
```python
from funasr import AutoModel
diff --git a/examples/industrial_data_pretraining/sense_voice/demo.py b/examples/industrial_data_pretraining/sense_voice/demo.py
index 07046d9..b8a10a8 100644
--- a/examples/industrial_data_pretraining/sense_voice/demo.py
+++ b/examples/industrial_data_pretraining/sense_voice/demo.py
@@ -1,13 +1,12 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
-# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+# Copyright FunASR (https://github.com/FunAudioLLM/SenseVoice). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
-
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
-model_dir = "/Users/zhifu/Downloads/modelscope_models/SenseVoiceSmall" # "iic/SenseVoiceSmall"
+model_dir = "iic/SenseVoiceSmall"
model = AutoModel(
@@ -19,30 +18,17 @@
# en
res = model.generate(
- input="/Users/zhifu/Downloads/8_output.wav",
+ input=f"{model.model_path}/example/en.mp3",
cache={},
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
use_itn=True,
batch_size_s=60,
merge_vad=True, #
- merge_length_s=0.1,
-)
-text = rich_transcription_postprocess(res[0]["text"])
-print(text)
-
-# en
-res = model.generate(
- input="/Users/zhifu/Downloads/8_output.wav",
- cache={},
- language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
- use_itn=True,
- batch_size_s=60,
- merge_vad=False, #
merge_length_s=15,
)
text = rich_transcription_postprocess(res[0]["text"])
print(text)
-raise "exit"
+
# zh
res = model.generate(
input=f"{model.model_path}/example/zh.mp3",
--
Gitblit v1.9.1