From 9070774ab6c9a7149d31240fb0d686485f30f8e0 Mon Sep 17 00:00:00 2001
From: 语帆 <yf352572@alibaba-inc.com>
Date: 星期一, 04 三月 2024 15:21:47 +0800
Subject: [PATCH] atsr
---
examples/industrial_data_pretraining/lcbnet/README.md | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 105 insertions(+), 0 deletions(-)
diff --git a/examples/industrial_data_pretraining/lcbnet/README.md b/examples/industrial_data_pretraining/lcbnet/README.md
new file mode 100644
index 0000000..4273ec0
--- /dev/null
+++ b/examples/industrial_data_pretraining/lcbnet/README.md
@@ -0,0 +1,105 @@
+---
+tasks:
+- audio-visual-speech-recognition
+domain:
+- audio, visual
+model-type:
+- Autoregressive
+frameworks:
+- pytorch
+backbone:
+- transformer/conformer
+metrics:
+- WER/B-WER
+license: Apache License 2.0
+language:
+- en
+tags:
+- FunASR
+- Alibaba
+- ICASSP 2024
+- Audio-Visual
+- Hotword
+- Long-Context Biasing
+datasets:
+ train:
+ - SlideSpeech corpus
+ test:
+ - dev and test of SlideSpeech corpus
+indexing:
+ results:
+ - task:
+ name: Audio-Visual Speech Recognition
+ dataset:
+ name: SlideSpeech corpus
+ type: audio # optional
+ args: 16k sampling rate, 5002 bpe units # optional
+ metrics:
+ - type: WER
+ value: 18.8% # float
+ description: beam search, without lm, avg.
+ args: default
+
+widgets:
+ - task: audio-visual-speech-recognition
+ inputs:
+ - type: audio
+ name: input
+ title: 音频
+ - type: text
+ name: input
+ title: OCR识别文本
+finetune-support: True
+---
+
+
+# Paraformer-large模型介绍
+
+## Highlights
+- 热词版本：[Paraformer-large热词版模型](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary)支持热词定制功能，基于提供的热词列表进行激励增强，提升热词的召回率和准确率。
+- 长音频版本：[Paraformer-large长音频模型](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)，集成VAD、ASR、标点与时间戳功能，可直接对时长为数小时音频进行识别，并输出带标点文字与时间戳。
+
+## <strong>[FunASR开源项目介绍](https://github.com/alibaba-damo-academy/FunASR)</strong>
+<strong>[FunASR](https://github.com/alibaba-damo-academy/FunASR)</strong>希望在语音识别的学术研究和工业应用之间架起一座桥梁。通过发布工业级语音识别模型的训练和微调，研究人员和开发人员可以更方便地进行语音识别模型的研究和生产，并推动语音识别生态的发展。让语音识别更有趣！
+
+[**github仓库**](https://github.com/alibaba-damo-academy/FunASR)
+| [**最新动态**](https://github.com/alibaba-damo-academy/FunASR#whats-new)
+| [**环境安装**](https://github.com/alibaba-damo-academy/FunASR#installation)
+| [**服务部署**](https://www.funasr.com)
+| [**模型库**](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo)
+| [**联系我们**](https://github.com/alibaba-damo-academy/FunASR#contact)
+
+
+## 模型原理介绍
+
+随着在线会议和课程越来越普遍，如何利用视频幻灯片中丰富的文本信息来改善语音识别（Automatic Speech Recognition，ASR）面临着新的挑战。视频中的幻灯片与语音实时同步，相比于统一的稀有词列表，能够提供更长的上下文相关信息。因此，我们提出了一种创新的长上下文偏置网络（LCB-net），用于音频-视觉语音识别（Audio-Visual Speech Recognition，AVSR），以更好地利用视频中的长时上下文信息。
+
+<p align="center">
+<img src="fig/lcbnet1.png" alt="AVSR整体流程框架" width="500" />
+<p align="center">
+<img src="fig/lcbnet2.png" alt="LCB-NET模型结构" width="500" />
+
+
+具体来说，我们首先使用OCR技术来检测和识别幻灯片中的文本内容，其次我们采用关键词提取技术来获取文本内容中的关键词短语。最后，我们将关键词拼接成长上下文文本和音频同时输入到我们的LCB-net模型中进行识别。而LCB-net模型采用了双编码器结构，同时建模音频和长上下文文本信息。此外，我们还引入了一个显式的偏置词预测模块，通过使用二元交叉熵（BCE）损失函数显式预测长上下文文本中在音频中出现的关键偏置词。此外，为增强LCB-net的泛化能力和稳健性，我们还采用了动态的关键词模拟策略。实验证明，我们提出的LCB-net热词模型，不仅能够提升关键词的识别效果，同时也能够提升非关键词的识别效果。具体实验结果如下所示：
+
+<p align="center">
+<img src="fig/lcbnet3.png" alt="实验结果" width="500" />
+
+
+更详细的细节见：
+- 论文： [LCB-net: Long-Context Biasing for Audio-Visual Speech Recognition](https://arxiv.org/abs/2401.06390)
+
+
+
+
+
+## 相关论文以及引用信息
+
+```BibTeX
+@inproceedings{yu2024lcbnet,
+ title={LCB-net: Long-Context Biasing for Audio-Visual Speech Recognition},
+ author={Fan Yu and Haoxu Wang and Xian Shi and Shiliang Zhang},
+ booktitle={ICASSP},
+ year={2024}
+}
+```
\ No newline at end of file
--
Gitblit v1.9.1