From 9070774ab6c9a7149d31240fb0d686485f30f8e0 Mon Sep 17 00:00:00 2001
From: 语帆 <yf352572@alibaba-inc.com>
Date: 星期一, 04 三月 2024 15:21:47 +0800
Subject: [PATCH] atsr

---
 examples/industrial_data_pretraining/lcbnet/README.md |  105 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 105 insertions(+), 0 deletions(-)

diff --git a/examples/industrial_data_pretraining/lcbnet/README.md b/examples/industrial_data_pretraining/lcbnet/README.md
new file mode 100644
index 0000000..4273ec0
--- /dev/null
+++ b/examples/industrial_data_pretraining/lcbnet/README.md
@@ -0,0 +1,105 @@
+---
+tasks:
+- audio-visual-speech-recognition 
+domain:
+- audio, visual
+model-type:
+- Autoregressive
+frameworks:
+- pytorch
+backbone:
+- transformer/conformer
+metrics:
+- WER/B-WER
+license: Apache License 2.0
+language: 
+- en
+tags:
+- FunASR
+- Alibaba
+- ICASSP 2024
+- Audio-Visual
+- Hotword
+- Long-Context Biasing
+datasets:
+  train:
+  - SlideSpeech corpus
+  test:
+  - dev and test of SlideSpeech corpus
+indexing:
+   results:
+   - task:
+       name: Audio-Visual Speech Recognition
+     dataset:
+       name: SlideSpeech corpus
+       type: audio    # optional
+       args: 16k sampling rate, 5002 bpe units  # optional
+     metrics:
+       - type: WER
+         value: 18.8%  # float
+         description: beam search, without lm, avg.
+         args: default
+
+widgets:
+  - task: audio-visual-speech-recognition 
+    inputs:
+      - type: audio
+        name: input
+        title: 音频
+      - type: text
+        name: input
+        title: OCR识别文本
+finetune-support: True
+---
+
+
+# Paraformer-large模型介绍
+
+## Highlights
+- 热词版本：[Paraformer-large热词版模型](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary)支持热词定制功能，基于提供的热词列表进行激励增强，提升热词的召回率和准确率。
+- 长音频版本：[Paraformer-large长音频模型](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)，集成VAD、ASR、标点与时间戳功能，可直接对时长为数小时音频进行识别，并输出带标点文字与时间戳。
+
+## <strong>[FunASR开源项目介绍](https://github.com/alibaba-damo-academy/FunASR)</strong>
+<strong>[FunASR](https://github.com/alibaba-damo-academy/FunASR)</strong>希望在语音识别的学术研究和工业应用之间架起一座桥梁。通过发布工业级语音识别模型的训练和微调，研究人员和开发人员可以更方便地进行语音识别模型的研究和生产，并推动语音识别生态的发展。让语音识别更有趣！
+
+[**github仓库**](https://github.com/alibaba-damo-academy/FunASR)
+| [**最新动态**](https://github.com/alibaba-damo-academy/FunASR#whats-new)
+| [**环境安装**](https://github.com/alibaba-damo-academy/FunASR#installation)
+| [**服务部署**](https://www.funasr.com)
+| [**模型库**](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo)
+| [**联系我们**](https://github.com/alibaba-damo-academy/FunASR#contact)
+
+
+## 模型原理介绍
+
+随着在线会议和课程越来越普遍，如何利用视频幻灯片中丰富的文本信息来改善语音识别（Automatic Speech Recognition，ASR）面临着新的挑战。视频中的幻灯片与语音实时同步，相比于统一的稀有词列表，能够提供更长的上下文相关信息。因此，我们提出了一种创新的长上下文偏置网络（LCB-net），用于音频-视觉语音识别（Audio-Visual Speech Recognition，AVSR），以更好地利用视频中的长时上下文信息。
+
+<p align="center">
+<img src="fig/lcbnet1.png" alt="AVSR整体流程框架"  width="500" /></p>
+<p align="center">
+<img src="fig/lcbnet2.png" alt="LCB-NET模型结构"  width="500" /></p>
+
+
+具体来说，我们首先使用OCR技术来检测和识别幻灯片中的文本内容，其次我们采用关键词提取技术来获取文本内容中的关键词短语。最后，我们将关键词拼接成长上下文文本和音频同时输入到我们的LCB-net模型中进行识别。而LCB-net模型采用了双编码器结构，同时建模音频和长上下文文本信息。此外，我们还引入了一个显式的偏置词预测模块，通过使用二元交叉熵（BCE）损失函数显式预测长上下文文本中在音频中出现的关键偏置词。此外，为增强LCB-net的泛化能力和稳健性，我们还采用了动态的关键词模拟策略。实验证明，我们提出的LCB-net热词模型，不仅能够提升关键词的识别效果，同时也能够提升非关键词的识别效果。具体实验结果如下所示：
+
+<p align="center">
+<img src="fig/lcbnet3.png" alt="实验结果"  width="500" /></p>
+
+
+更详细的细节见：
+- 论文： [LCB-net: Long-Context Biasing for Audio-Visual Speech Recognition](https://arxiv.org/abs/2401.06390)
+
+
+
+
+
+## 相关论文以及引用信息
+
+```BibTeX
+@inproceedings{yu2024lcbnet,
+  title={LCB-net: Long-Context Biasing for Audio-Visual Speech Recognition},
+  author={Fan Yu and Haoxu Wang and Xian Shi and Shiliang Zhang},
+  booktitle={ICASSP},
+  year={2024}
+}
+```
\ No newline at end of file

--
Gitblit v1.9.1