From 1367973f9818d8e15c7bf52ad6ffba4ddb6ac2b2 Mon Sep 17 00:00:00 2001
From: Rin Arakaki <rnarkkx@gmail.com>
Date: Tue, 24 Dec 2024 17:51:31 +0800
Subject: [PATCH] shfit to shift (#2266)
---
funasr/models/paraformer_streaming/model.py | 8
funasr/models/ct_transformer_streaming/template.yaml | 4
funasr/models/sond/encoder/fsmn_encoder.py | 6
funasr/models/contextual_paraformer/template.yaml | 6
funasr/models/uniasr/model.py | 16 +-
runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/EncoderConfEntity.cs | 4
funasr/models/sense_voice/model.py | 32 ++--
funasr/models/scama/template.yaml | 4
funasr/models/ct_transformer/template.yaml | 2
funasr/models/paraformer/template.yaml | 4
runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/DecoderConfEntity.cs | 6
funasr/models/scama/model.py | 8
funasr/models/sanm/decoder.py | 10
funasr/models/sanm/template.yaml | 4
funasr/models/paraformer_streaming/template.yaml | 4
funasr/models/ct_transformer_streaming/encoder.py | 16 +-
funasr/models/uniasr/template.yaml | 4
funasr/models/sanm/attention.py | 30 +-
funasr/models/scama/encoder.py | 30 +-
funasr/models/scama/chunk_utilis.py | 40 ++--
funasr/models/ct_transformer_streaming/attention.py | 4
funasr/models/monotonic_aligner/template.yaml | 4
examples/industrial_data_pretraining/sanm_kws_streaming/conf/sanm_6e_320_256_fdim40_t2602.yaml | 2
funasr/models/bicif_paraformer/template.yaml | 4
runtime/triton_gpu/model_repo_paraformer_large_online/feature_extractor/config.yaml | 4
funasr/models/scama/decoder.py | 10
docs/tutorial/Tables_zh.md | 56 +++---
examples/industrial_data_pretraining/sanm_kws/conf/sanm_6e_320_256_fdim40_t2602.yaml | 2
funasr/models/contextual_paraformer/decoder.py | 12
funasr/models/paraformer/decoder.py | 18 +-
runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/config.yaml | 2
docs/tutorial/Tables.md | 54 +++---
funasr/models/seaco_paraformer/template.yaml | 6
funasr/models/sanm/encoder.py | 16 +-
34 files changed, 216 insertions(+), 216 deletions(-)
diff --git a/docs/tutorial/Tables.md b/docs/tutorial/Tables.md
index 2cbfc62..831a3ac 100644
--- a/docs/tutorial/Tables.md
+++ b/docs/tutorial/Tables.md
@@ -5,13 +5,13 @@
The聽original聽intention聽of聽the聽funasr-1.x.x聽version聽is聽to聽make聽model聽integration聽easier.聽The聽core聽feature聽is聽the聽registry聽and聽AutoModel:
* The聽introduction聽of聽the聽registry聽enables聽the聽development聽of聽building聽blocks聽to聽access聽the聽model,聽compatible聽with聽a聽variety聽of聽tasks;
-
+
* The聽newly聽designed聽AutoModel聽interface聽unifies聽modelscope,聽huggingface,聽and聽funasr聽inference聽and聽training聽interfaces,聽and聽supports聽free聽download聽of聽repositories;
-
+
* Support聽model聽export,聽demo-level聽service聽deployment,聽and聽industrial-level聽multi-concurrent聽service聽deployment;
-
+
* Unify聽academic聽and聽industrial聽model聽inference聽training聽scripts;
-
+
# Quick聽to聽get聽started
@@ -51,19 +51,19 @@
```
* `model`(str):聽[Model聽Warehouse](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo)The聽model聽name聽in,聽or聽the聽model聽path聽in聽the聽local聽disk
-
+
* `device`(str):聽`cuda:0`(Default聽gpu0),聽using聽GPU聽for聽inference,聽specified.聽If`cpu`Then聽the聽CPU聽is聽used聽for聽inference
-
+
* `ncpu`(int):聽`4`(Default),聽set聽the聽number聽of聽threads聽used聽for聽CPU聽internal聽operation聽parallelism
-
+
* `output_dir`(str):聽`None`(Default)聽If聽set,聽the聽output聽path聽of聽the聽output聽result
-
+
* `batch_size`(int):聽`1`(Default),聽batch聽processing聽during聽decoding,聽number聽of聽samples
-
+
* `hub`(str)锛歚ms`(Default)聽to聽download聽the聽model聽from聽modelscope.聽If`hf`To聽download聽the聽model聽from聽huggingface.
-
+
* `**kwargs`(dict):聽All聽in`config.yaml`Parameters,聽which聽can聽be聽specified聽directly聽here,聽for聽example,聽the聽maximum聽cut聽length聽in聽the聽vad聽model.`max_single_segment_time=6000`(Milliseconds).
-
+
#### AutoModel聽reasoning
@@ -72,13 +72,13 @@
```
* * wav聽file聽path,聽for聽example:聽asr\_example.wav
-
+
* pcm聽file聽path,聽for聽example:聽asr\_example.pcm,聽you聽need聽to聽specify聽the聽audio聽sampling聽rate聽fs聽(default聽is聽16000)
-
+
* Audio聽byte聽stream,聽for聽example:聽microphone聽byte聽data
-
+
* wav.scp,kaldi-style聽wav聽list聽(`wav_id聽\t聽wav_path`),聽for聽example:
-
+
```plaintext
Asr_example1./audios/asr_example1.wav
@@ -89,13 +89,13 @@
In聽this聽input
* Audio聽sampling聽points,聽for聽example:`audio,聽rate聽=聽soundfile.read("asr_example_zh.wav")`Is聽numpy.ndarray.聽batch聽input聽is聽supported.聽The聽type聽is聽list:`[audio_sample1,聽audio_sample2,聽...,聽audio_sampleN]`
-
+
* fbank聽input,聽support聽group聽batch.聽shape聽is聽\[batch,聽frames,聽dim\],聽type聽is聽torch.Tensor,聽for聽example
-
+
* `output_dir`:聽None聽(default),聽if聽set,聽the聽output聽path聽of聽the聽output聽result
-
+
* `**kwargs`(dict):聽Model-related聽inference聽parameters,聽e.g,`beam_size=10`,`decoding_ctc_weight=0.1`.
-
+
Detailed聽documentation聽link:[https://github.com/modelscope/FunASR/blob/main/examples/README\_zh.md](https://github.com/modelscope/FunASR/blob/main/examples/README_zh.md)
@@ -128,7 +128,7 @@
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
selfattention_layer_type: sanm
@@ -208,10 +208,10 @@
"model": {"type" : "funasr"},
"pipeline": {"type":"funasr-pipeline"},
"model_name_in_hub": {
- "ms":"",
+ "ms":"",
"hf":""},
"file_path_metas": {
- "init_param":"model.pt",
+ "init_param":"model.pt",
"config":"config.yaml",
"tokenizer_conf": {"bpemodel": "chn_jpn_yue_eng_ko_spectok.bpe.model"},
"frontend_conf":{"cmvn_file": "am.mvn"}}
@@ -274,7 +274,7 @@
def forward(
self,
**kwargs,
- ):
+ ):
def inference(
self,
@@ -320,9 +320,9 @@
## Principles聽of聽Registration
* Model:聽models聽are聽independent聽of聽each聽other.聽Each聽Model聽needs聽to聽create聽a聽new聽Model聽directory聽under聽funasr/models/.聽Do聽not聽use聽class聽inheritance聽method!!!聽Do聽not聽import聽from聽other聽model聽directories,聽and聽put聽everything聽you聽need聽into聽your聽own聽model聽directory!!!聽Do聽not聽modify聽the聽existing聽model聽code!!!
-
+
* dataset,frontend,tokenizer,聽if聽you聽can聽reuse聽the聽existing聽one,聽reuse聽it聽directly,聽if聽you聽cannot聽reuse聽it,聽please聽register聽a聽new聽one,聽modify聽it聽again,聽and聽do聽not聽modify聽the聽original聽one!!!
-
+
# Independent聽warehouse
@@ -337,7 +337,7 @@
model = AutoModel (
model="iic/SenseVoiceSmall ",
trust_remote_code=True
-remote_code = "./model.py",
+remote_code = "./model.py",
)
```
@@ -360,4 +360,4 @@
print(text)
```
-Trim聽reference:[https://github.com/FunAudioLLM/SenseVoice/blob/main/finetune.sh](https://github.com/FunAudioLLM/SenseVoice/blob/main/finetune.sh)
\ No newline at end of file
+Trim聽reference:[https://github.com/FunAudioLLM/SenseVoice/blob/main/finetune.sh](https://github.com/FunAudioLLM/SenseVoice/blob/main/finetune.sh)
diff --git a/docs/tutorial/Tables_zh.md b/docs/tutorial/Tables_zh.md
index 72af82a..e9360e0 100644
--- a/docs/tutorial/Tables_zh.md
+++ b/docs/tutorial/Tables_zh.md
@@ -5,13 +5,13 @@
funasr-1.x.x聽鐗堟湰鐨勮璁″垵琛锋槸銆�**璁╂ā鍨嬮泦鎴愭洿绠�鍗�**銆戯紝鏍稿績feature涓烘敞鍐岃〃涓嶢utoModel锛�
* 娉ㄥ唽琛ㄧ殑寮曞叆锛屼娇寰楀紑鍙戜腑鍙互鐢ㄦ惌绉湪鐨勬柟寮忔帴鍏ユā鍨嬶紝鍏煎澶氱task锛�
-
+
* 鏂拌璁$殑AutoModel鎺ュ彛锛岀粺涓�modelscope銆乭uggingface涓巉unasr鎺ㄧ悊涓庤缁冩帴鍙o紝鏀寔鑷敱閫夋嫨涓嬭浇浠撳簱锛�
-
+
* 鏀寔妯″瀷瀵煎嚭锛宒emo绾у埆鏈嶅姟閮ㄧ讲锛屼互鍙婂伐涓氱骇鍒骞跺彂鏈嶅姟閮ㄧ讲锛�
-
+
* 缁熶竴瀛︽湳涓庡伐涓氭ā鍨嬫帹鐞嗚缁冭剼鏈紱
-
+
# 蹇�熶笂鎵�
@@ -51,19 +51,19 @@
```
* `model`(str):聽[妯″瀷浠撳簱](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo)聽涓殑妯″瀷鍚嶇О锛屾垨鏈湴纾佺洏涓殑妯″瀷璺緞
-
+
* `device`(str):聽`cuda:0`锛堥粯璁pu0锛夛紝浣跨敤聽GPU聽杩涜鎺ㄧ悊锛屾寚瀹氥�傚鏋滀负`cpu`锛屽垯浣跨敤聽CPU聽杩涜鎺ㄧ悊
-
+
* `ncpu`(int):聽`4`聽锛堥粯璁わ級锛岃缃敤浜幝燙PU聽鍐呴儴鎿嶄綔骞惰鎬х殑绾跨▼鏁�
-
+
* `output_dir`(str):聽`None`聽锛堥粯璁わ級锛屽鏋滆缃紝杈撳嚭缁撴灉鐨勮緭鍑鸿矾寰�
-
+
* `batch_size`(int):聽`1`聽锛堥粯璁わ級锛岃В鐮佹椂鐨勬壒澶勭悊锛屾牱鏈釜鏁�
-
+
* `hub`(str)锛歚ms`锛堥粯璁わ級锛屼粠modelscope涓嬭浇妯″瀷銆傚鏋滀负`hf`锛屼粠huggingface涓嬭浇妯″瀷銆�
-
+
* `**kwargs`(dict):聽鎵�鏈夊湪`config.yaml`涓弬鏁帮紝鍧囧彲浠ョ洿鎺ュ湪姝ゅ鎸囧畾锛屼緥濡傦紝vad妯″瀷涓渶澶у垏鍓查暱搴β燻max_single_segment_time=6000`聽锛堟绉掞級銆�
-
+
#### AutoModel聽鎺ㄧ悊
@@ -72,13 +72,13 @@
```
* * wav鏂囦欢璺緞,聽渚嬪:聽asr\_example.wav
-
+
* pcm鏂囦欢璺緞,聽渚嬪:聽asr\_example.pcm锛屾鏃堕渶瑕佹寚瀹氶煶棰戦噰鏍风巼fs锛堥粯璁や负16000锛�
-
+
* 闊抽瀛楄妭鏁版祦锛屼緥濡傦細楹﹀厠椋庣殑瀛楄妭鏁版暟鎹�
-
+
* wav.scp锛宬aldi聽鏍峰紡鐨劼爓av聽鍒楄〃聽(`wav_id聽\t聽wav_path`),聽渚嬪:
-
+
```plaintext
asr_example1 ./audios/asr_example1.wav
@@ -89,13 +89,13 @@
鍦ㄨ繖绉嶈緭鍏ヂ�
* 闊抽閲囨牱鐐癸紝渚嬪锛歚audio,聽rate聽=聽soundfile.read("asr_example_zh.wav")`,聽鏁版嵁绫诲瀷涓郝爊umpy.ndarray銆傛敮鎸乥atch杈撳叆锛岀被鍨嬩负list锛毬燻[audio_sample1,聽audio_sample2,聽...,聽audio_sampleN]`
-
+
* fbank杈撳叆锛屾敮鎸佺粍batch銆俿hape涓篭[batch,聽frames,聽dim\]锛岀被鍨嬩负torch.Tensor锛屼緥濡�
-
+
* `output_dir`:聽None聽锛堥粯璁わ級锛屽鏋滆缃紝杈撳嚭缁撴灉鐨勮緭鍑鸿矾寰�
-
+
* `**kwargs`(dict):聽涓庢ā鍨嬬浉鍏崇殑鎺ㄧ悊鍙傛暟锛屼緥濡傦紝`beam_size=10`锛宍decoding_ctc_weight=0.1`銆�
-
+
璇︾粏鏂囨。閾炬帴锛歔https://github.com/modelscope/FunASR/blob/main/examples/README\_zh.md](https://github.com/modelscope/FunASR/blob/main/examples/README_zh.md)
@@ -128,7 +128,7 @@
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
selfattention_layer_type: sanm
@@ -208,10 +208,10 @@
"model": {"type" : "funasr"},
"pipeline": {"type":"funasr-pipeline"},
"model_name_in_hub": {
- "ms":"",
+ "ms":"",
"hf":""},
"file_path_metas": {
- "init_param":"model.pt",
+ "init_param":"model.pt",
"config":"config.yaml",
"tokenizer_conf": {"bpemodel": "chn_jpn_yue_eng_ko_spectok.bpe.model"},
"frontend_conf":{"cmvn_file": "am.mvn"}}
@@ -274,7 +274,7 @@
def forward(
self,
**kwargs,
- ):
+ ):
def inference(
self,
@@ -320,9 +320,9 @@
## 娉ㄥ唽鍘熷垯
* Model锛氭ā鍨嬩箣闂翠簰鐩哥嫭绔嬶紝姣忎竴涓ā鍨嬶紝閮介渶瑕佸湪funasr/models/涓嬮潰鏂板缓涓�涓ā鍨嬬洰褰曪紝涓嶈閲囩敤绫荤殑缁ф壙鏂规硶锛侊紒锛佷笉瑕佷粠鍏朵粬妯″瀷鐩綍涓璱mport锛屾墍鏈夐渶瑕佺敤鍒扮殑閮藉崟鐙斁鍒拌嚜宸辩殑妯″瀷鐩綍涓紒锛侊紒涓嶈淇敼鐜版湁鐨勬ā鍨嬩唬鐮侊紒锛侊紒
-
+
* dataset锛宖rontend锛宼okenizer锛屽鏋滆兘澶嶇敤鐜版湁鐨勶紝鐩存帴澶嶇敤锛屽鏋滀笉鑳藉鐢紝璇锋敞鍐屼竴涓柊鐨勶紝鍐嶄慨鏀癸紝涓嶈淇敼鍘熸潵鐨勶紒锛侊紒
-
+
# 鐙珛浠撳簱
@@ -336,8 +336,8 @@
# trust_remote_code锛歚True` 琛ㄧず model 浠g爜瀹炵幇浠� `remote_code` 澶勫姞杞斤紝`remote_code` 鎸囧畾 `model` 鍏蜂綋浠g爜鐨勪綅缃紙渚嬪锛屽綋鍓嶇洰褰曚笅鐨� `model.py`锛夛紝鏀寔缁濆璺緞涓庣浉瀵硅矾寰勶紝浠ュ強缃戠粶 url銆�
model = AutoModel(
model="iic/SenseVoiceSmall",
- trust_remote_code=True,
- remote_code="./model.py",
+ trust_remote_code=True,
+ remote_code="./model.py",
)
```
@@ -360,4 +360,4 @@
print(text)
```
-寰皟鍙傝�冿細[https://github.com/FunAudioLLM/SenseVoice/blob/main/finetune.sh](https://github.com/FunAudioLLM/SenseVoice/blob/main/finetune.sh)
\ No newline at end of file
+寰皟鍙傝�冿細[https://github.com/FunAudioLLM/SenseVoice/blob/main/finetune.sh](https://github.com/FunAudioLLM/SenseVoice/blob/main/finetune.sh)
diff --git a/examples/industrial_data_pretraining/sanm_kws/conf/sanm_6e_320_256_fdim40_t2602.yaml b/examples/industrial_data_pretraining/sanm_kws/conf/sanm_6e_320_256_fdim40_t2602.yaml
index c4d8c18..4fa8f35 100644
--- a/examples/industrial_data_pretraining/sanm_kws/conf/sanm_6e_320_256_fdim40_t2602.yaml
+++ b/examples/industrial_data_pretraining/sanm_kws/conf/sanm_6e_320_256_fdim40_t2602.yaml
@@ -18,7 +18,7 @@
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
selfattention_layer_type: sanm
# frontend related
diff --git a/examples/industrial_data_pretraining/sanm_kws_streaming/conf/sanm_6e_320_256_fdim40_t2602.yaml b/examples/industrial_data_pretraining/sanm_kws_streaming/conf/sanm_6e_320_256_fdim40_t2602.yaml
index 664997c..28b3ccb 100644
--- a/examples/industrial_data_pretraining/sanm_kws_streaming/conf/sanm_6e_320_256_fdim40_t2602.yaml
+++ b/examples/industrial_data_pretraining/sanm_kws_streaming/conf/sanm_6e_320_256_fdim40_t2602.yaml
@@ -18,7 +18,7 @@
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
selfattention_layer_type: sanm
chunk_size:
- 16
diff --git a/funasr/models/bicif_paraformer/template.yaml b/funasr/models/bicif_paraformer/template.yaml
index db7ce55..710938c 100644
--- a/funasr/models/bicif_paraformer/template.yaml
+++ b/funasr/models/bicif_paraformer/template.yaml
@@ -30,7 +30,7 @@
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
selfattention_layer_type: sanm
# decoder
@@ -45,7 +45,7 @@
src_attention_dropout_rate: 0.1
att_layer_num: 16
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
predictor: CifPredictorV3
predictor_conf:
diff --git a/funasr/models/contextual_paraformer/decoder.py b/funasr/models/contextual_paraformer/decoder.py
index ba2ce9a..958c46b 100644
--- a/funasr/models/contextual_paraformer/decoder.py
+++ b/funasr/models/contextual_paraformer/decoder.py
@@ -137,7 +137,7 @@
concat_after: bool = False,
att_layer_num: int = 6,
kernel_size: int = 21,
- sanm_shfit: int = 0,
+ sanm_shift: int = 0,
):
super().__init__(
vocab_size=vocab_size,
@@ -179,14 +179,14 @@
self.att_layer_num = att_layer_num
self.num_blocks = num_blocks
- if sanm_shfit is None:
- sanm_shfit = (kernel_size - 1) // 2
+ if sanm_shift is None:
+ sanm_shift = (kernel_size - 1) // 2
self.decoders = repeat(
att_layer_num - 1,
lambda lnum: DecoderLayerSANM(
attention_dim,
MultiHeadedAttentionSANMDecoder(
- attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=sanm_shfit
+ attention_dim, self_attention_dropout_rate, kernel_size, sanm_shift=sanm_shift
),
MultiHeadedAttentionCrossAtt(
attention_heads, attention_dim, src_attention_dropout_rate
@@ -210,7 +210,7 @@
self.last_decoder = ContextualDecoderLayer(
attention_dim,
MultiHeadedAttentionSANMDecoder(
- attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=sanm_shfit
+ attention_dim, self_attention_dropout_rate, kernel_size, sanm_shift=sanm_shift
),
MultiHeadedAttentionCrossAtt(
attention_heads, attention_dim, src_attention_dropout_rate
@@ -228,7 +228,7 @@
lambda lnum: DecoderLayerSANM(
attention_dim,
MultiHeadedAttentionSANMDecoder(
- attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=0
+ attention_dim, self_attention_dropout_rate, kernel_size, sanm_shift=0
),
None,
PositionwiseFeedForwardDecoderSANM(attention_dim, linear_units, dropout_rate),
diff --git a/funasr/models/contextual_paraformer/template.yaml b/funasr/models/contextual_paraformer/template.yaml
index 2205250..17e542a 100644
--- a/funasr/models/contextual_paraformer/template.yaml
+++ b/funasr/models/contextual_paraformer/template.yaml
@@ -30,7 +30,7 @@
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
selfattention_layer_type: sanm
@@ -46,7 +46,7 @@
src_attention_dropout_rate: 0.1
att_layer_num: 16
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
predictor: CifPredictorV2
predictor_conf:
@@ -126,4 +126,4 @@
ctc_type: builtin
reduce: true
ignore_nan_grad: true
-normalize: null
\ No newline at end of file
+normalize: null
diff --git a/funasr/models/ct_transformer/template.yaml b/funasr/models/ct_transformer/template.yaml
index 2538e6b..7ad6e69 100644
--- a/funasr/models/ct_transformer/template.yaml
+++ b/funasr/models/ct_transformer/template.yaml
@@ -41,7 +41,7 @@
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
selfattention_layer_type: sanm
padding_idx: 0
diff --git a/funasr/models/ct_transformer_streaming/attention.py b/funasr/models/ct_transformer_streaming/attention.py
index 97e770b..be7113f 100644
--- a/funasr/models/ct_transformer_streaming/attention.py
+++ b/funasr/models/ct_transformer_streaming/attention.py
@@ -11,9 +11,9 @@
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
- def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
+ def forward(self, x, mask, mask_shift_chunk=None, mask_att_chunk_encoder=None):
q_h, k_h, v_h, v = self.forward_qkv(x)
- fsmn_memory = self.forward_fsmn(v, mask[0], mask_shfit_chunk)
+ fsmn_memory = self.forward_fsmn(v, mask[0], mask_shift_chunk)
q_h = q_h * self.d_k ** (-0.5)
scores = torch.matmul(q_h, k_h.transpose(-2, -1))
att_outs = self.forward_attention(v_h, scores, mask[1], mask_att_chunk_encoder)
diff --git a/funasr/models/ct_transformer_streaming/encoder.py b/funasr/models/ct_transformer_streaming/encoder.py
index a61319a..7d09875 100644
--- a/funasr/models/ct_transformer_streaming/encoder.py
+++ b/funasr/models/ct_transformer_streaming/encoder.py
@@ -56,7 +56,7 @@
self.stochastic_depth_rate = stochastic_depth_rate
self.dropout_rate = dropout_rate
- def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
+ def forward(self, x, mask, cache=None, mask_shift_chunk=None, mask_att_chunk_encoder=None):
"""Compute encoded features.
Args:
@@ -93,7 +93,7 @@
self.self_attn(
x,
mask,
- mask_shfit_chunk=mask_shfit_chunk,
+ mask_shift_chunk=mask_shift_chunk,
mask_att_chunk_encoder=mask_att_chunk_encoder,
),
),
@@ -109,7 +109,7 @@
self.self_attn(
x,
mask,
- mask_shfit_chunk=mask_shfit_chunk,
+ mask_shift_chunk=mask_shift_chunk,
mask_att_chunk_encoder=mask_att_chunk_encoder,
)
)
@@ -118,7 +118,7 @@
self.self_attn(
x,
mask,
- mask_shfit_chunk=mask_shfit_chunk,
+ mask_shift_chunk=mask_shift_chunk,
mask_att_chunk_encoder=mask_att_chunk_encoder,
)
)
@@ -132,7 +132,7 @@
if not self.normalize_before:
x = self.norm2(x)
- return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder
+ return x, mask, cache, mask_shift_chunk, mask_att_chunk_encoder
def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
"""Compute encoded features.
@@ -198,7 +198,7 @@
interctc_layer_idx: List[int] = [],
interctc_use_conditioning: bool = False,
kernel_size: int = 11,
- sanm_shfit: int = 0,
+ sanm_shift: int = 0,
selfattention_layer_type: str = "sanm",
):
super().__init__()
@@ -277,7 +277,7 @@
output_size,
attention_dropout_rate,
kernel_size,
- sanm_shfit,
+ sanm_shift,
)
encoder_selfattn_layer_args = (
@@ -286,7 +286,7 @@
output_size,
attention_dropout_rate,
kernel_size,
- sanm_shfit,
+ sanm_shift,
)
self.encoders0 = repeat(
diff --git a/funasr/models/ct_transformer_streaming/template.yaml b/funasr/models/ct_transformer_streaming/template.yaml
index 2477ac2..ae59b44 100644
--- a/funasr/models/ct_transformer_streaming/template.yaml
+++ b/funasr/models/ct_transformer_streaming/template.yaml
@@ -41,10 +41,10 @@
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
- sanm_shfit: 5
+ sanm_shift: 5
selfattention_layer_type: sanm
padding_idx: 0
tokenizer: CharTokenizer
tokenizer_conf:
- unk_symbol: <unk>
\ No newline at end of file
+ unk_symbol: <unk>
diff --git a/funasr/models/monotonic_aligner/template.yaml b/funasr/models/monotonic_aligner/template.yaml
index f8d5ded..82abf35 100644
--- a/funasr/models/monotonic_aligner/template.yaml
+++ b/funasr/models/monotonic_aligner/template.yaml
@@ -25,7 +25,7 @@
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
selfattention_layer_type: sanm
predictor: CifPredictorV3
@@ -111,5 +111,5 @@
ctc_type: builtin
reduce: true
ignore_nan_grad: true
-
+
normalize: null
diff --git a/funasr/models/paraformer/decoder.py b/funasr/models/paraformer/decoder.py
index 7edd91a..fafb8d4 100644
--- a/funasr/models/paraformer/decoder.py
+++ b/funasr/models/paraformer/decoder.py
@@ -248,7 +248,7 @@
concat_after: bool = False,
att_layer_num: int = 6,
kernel_size: int = 21,
- sanm_shfit: int = 0,
+ sanm_shift: int = 0,
lora_list: List[str] = None,
lora_rank: int = 8,
lora_alpha: int = 16,
@@ -298,14 +298,14 @@
self.att_layer_num = att_layer_num
self.num_blocks = num_blocks
- if sanm_shfit is None:
- sanm_shfit = (kernel_size - 1) // 2
+ if sanm_shift is None:
+ sanm_shift = (kernel_size - 1) // 2
self.decoders = repeat(
att_layer_num,
lambda lnum: DecoderLayerSANM(
attention_dim,
MultiHeadedAttentionSANMDecoder(
- attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=sanm_shfit
+ attention_dim, self_attention_dropout_rate, kernel_size, sanm_shift=sanm_shift
),
MultiHeadedAttentionCrossAtt(
attention_heads,
@@ -330,7 +330,7 @@
lambda lnum: DecoderLayerSANM(
attention_dim,
MultiHeadedAttentionSANMDecoder(
- attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=0
+ attention_dim, self_attention_dropout_rate, kernel_size, sanm_shift=0
),
None,
PositionwiseFeedForwardDecoderSANM(attention_dim, linear_units, dropout_rate),
@@ -785,20 +785,20 @@
for _ in range(cache_num)
]
return (tgt, memory, pre_acoustic_embeds, cache)
-
+
def is_optimizable(self):
return True
-
+
def get_input_names(self):
cache_num = len(self.model.decoders) + len(self.model.decoders2)
return ['tgt', 'memory', 'pre_acoustic_embeds'] \
+ ['cache_%d' % i for i in range(cache_num)]
-
+
def get_output_names(self):
cache_num = len(self.model.decoders) + len(self.model.decoders2)
return ['y'] \
+ ['out_cache_%d' % i for i in range(cache_num)]
-
+
def get_dynamic_axes(self):
ret = {
'tgt': {
diff --git a/funasr/models/paraformer/template.yaml b/funasr/models/paraformer/template.yaml
index 249e88c..170c10b 100644
--- a/funasr/models/paraformer/template.yaml
+++ b/funasr/models/paraformer/template.yaml
@@ -29,7 +29,7 @@
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
selfattention_layer_type: sanm
# decoder
@@ -44,7 +44,7 @@
src_attention_dropout_rate: 0.1
att_layer_num: 16
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
predictor: CifPredictorV2
predictor_conf:
diff --git a/funasr/models/paraformer_streaming/model.py b/funasr/models/paraformer_streaming/model.py
index 16021ce..bbc9668 100644
--- a/funasr/models/paraformer_streaming/model.py
+++ b/funasr/models/paraformer_streaming/model.py
@@ -198,10 +198,10 @@
mask_chunk_predictor = self.encoder.overlap_chunk_cls.get_mask_chunk_predictor(
None, device=encoder_out.device, batch_size=encoder_out.size(0)
)
- mask_shfit_chunk = self.encoder.overlap_chunk_cls.get_mask_shfit_chunk(
+ mask_shift_chunk = self.encoder.overlap_chunk_cls.get_mask_shift_chunk(
None, device=encoder_out.device, batch_size=encoder_out.size(0)
)
- encoder_out = encoder_out * mask_shfit_chunk
+ encoder_out = encoder_out * mask_shift_chunk
pre_acoustic_embeds, pre_token_length, pre_alphas, _ = self.predictor(
encoder_out,
ys_pad,
@@ -357,10 +357,10 @@
mask_chunk_predictor = self.encoder.overlap_chunk_cls.get_mask_chunk_predictor(
None, device=encoder_out.device, batch_size=encoder_out.size(0)
)
- mask_shfit_chunk = self.encoder.overlap_chunk_cls.get_mask_shfit_chunk(
+ mask_shift_chunk = self.encoder.overlap_chunk_cls.get_mask_shift_chunk(
None, device=encoder_out.device, batch_size=encoder_out.size(0)
)
- encoder_out = encoder_out * mask_shfit_chunk
+ encoder_out = encoder_out * mask_shift_chunk
pre_acoustic_embeds, pre_token_length, pre_alphas, pre_peak_index = self.predictor(
encoder_out,
None,
diff --git a/funasr/models/paraformer_streaming/template.yaml b/funasr/models/paraformer_streaming/template.yaml
index 889971a..44cbbc7 100644
--- a/funasr/models/paraformer_streaming/template.yaml
+++ b/funasr/models/paraformer_streaming/template.yaml
@@ -29,7 +29,7 @@
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
selfattention_layer_type: sanm
chunk_size:
- 12
@@ -59,7 +59,7 @@
src_attention_dropout_rate: 0.1
att_layer_num: 16
kernel_size: 11
- sanm_shfit: 5
+ sanm_shift: 5
predictor: CifPredictorV2
predictor_conf:
diff --git a/funasr/models/sanm/attention.py b/funasr/models/sanm/attention.py
index 47d60cb..a9bb70f 100644
--- a/funasr/models/sanm/attention.py
+++ b/funasr/models/sanm/attention.py
@@ -154,7 +154,7 @@
n_feat,
dropout_rate,
kernel_size,
- sanm_shfit=0,
+ sanm_shift=0,
lora_list=None,
lora_rank=8,
lora_alpha=16,
@@ -199,17 +199,17 @@
)
# padding
left_padding = (kernel_size - 1) // 2
- if sanm_shfit > 0:
- left_padding = left_padding + sanm_shfit
+ if sanm_shift > 0:
+ left_padding = left_padding + sanm_shift
right_padding = kernel_size - 1 - left_padding
self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0)
- def forward_fsmn(self, inputs, mask, mask_shfit_chunk=None):
+ def forward_fsmn(self, inputs, mask, mask_shift_chunk=None):
b, t, d = inputs.size()
if mask is not None:
mask = torch.reshape(mask, (b, -1, 1))
- if mask_shfit_chunk is not None:
- mask = mask * mask_shfit_chunk
+ if mask_shift_chunk is not None:
+ mask = mask * mask_shift_chunk
inputs = inputs * mask
x = inputs.transpose(1, 2)
@@ -289,7 +289,7 @@
return self.linear_out(x) # (batch, time1, d_model)
- def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
+ def forward(self, x, mask, mask_shift_chunk=None, mask_att_chunk_encoder=None):
"""Compute scaled dot product attention.
Args:
@@ -304,7 +304,7 @@
"""
q_h, k_h, v_h, v = self.forward_qkv(x)
- fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk)
+ fsmn_memory = self.forward_fsmn(v, mask, mask_shift_chunk)
q_h = q_h * self.d_k ** (-0.5)
scores = torch.matmul(q_h, k_h.transpose(-2, -1))
att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder)
@@ -478,7 +478,7 @@
"""
- def __init__(self, n_feat, dropout_rate, kernel_size, sanm_shfit=0):
+ def __init__(self, n_feat, dropout_rate, kernel_size, sanm_shift=0):
"""Construct an MultiHeadedAttention object."""
super().__init__()
@@ -490,13 +490,13 @@
# padding
# padding
left_padding = (kernel_size - 1) // 2
- if sanm_shfit > 0:
- left_padding = left_padding + sanm_shfit
+ if sanm_shift > 0:
+ left_padding = left_padding + sanm_shift
right_padding = kernel_size - 1 - left_padding
self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0)
self.kernel_size = kernel_size
- def forward(self, inputs, mask, cache=None, mask_shfit_chunk=None):
+ def forward(self, inputs, mask, cache=None, mask_shift_chunk=None):
"""
:param x: (#batch, time1, size).
:param mask: Mask tensor (#batch, 1, time)
@@ -509,9 +509,9 @@
if mask is not None:
mask = torch.reshape(mask, (b, -1, 1))
# logging.info("in fsmn, mask: {}, {}".format(mask.size(), mask[0:100:50, :, :]))
- if mask_shfit_chunk is not None:
- # logging.info("in fsmn, mask_fsmn: {}, {}".format(mask_shfit_chunk.size(), mask_shfit_chunk[0:100:50, :, :]))
- mask = mask * mask_shfit_chunk
+ if mask_shift_chunk is not None:
+ # logging.info("in fsmn, mask_fsmn: {}, {}".format(mask_shift_chunk.size(), mask_shift_chunk[0:100:50, :, :]))
+ mask = mask * mask_shift_chunk
# logging.info("in fsmn, mask_after_fsmn: {}, {}".format(mask.size(), mask[0:100:50, :, :]))
# print("in fsmn, mask", mask.size())
# print("in fsmn, inputs", inputs.size())
diff --git a/funasr/models/sanm/decoder.py b/funasr/models/sanm/decoder.py
index 1a4fb26..01a5f0e 100644
--- a/funasr/models/sanm/decoder.py
+++ b/funasr/models/sanm/decoder.py
@@ -226,7 +226,7 @@
concat_after: bool = False,
att_layer_num: int = 6,
kernel_size: int = 21,
- sanm_shfit: int = None,
+ sanm_shift: int = None,
concat_embeds: bool = False,
attention_dim: int = None,
tf2torch_tensor_name_prefix_torch: str = "decoder",
@@ -271,14 +271,14 @@
self.att_layer_num = att_layer_num
self.num_blocks = num_blocks
- if sanm_shfit is None:
- sanm_shfit = (kernel_size - 1) // 2
+ if sanm_shift is None:
+ sanm_shift = (kernel_size - 1) // 2
self.decoders = repeat(
att_layer_num,
lambda lnum: DecoderLayerSANM(
attention_dim,
MultiHeadedAttentionSANMDecoder(
- attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=sanm_shfit
+ attention_dim, self_attention_dropout_rate, kernel_size, sanm_shift=sanm_shift
),
MultiHeadedAttentionCrossAtt(
attention_heads,
@@ -303,7 +303,7 @@
attention_dim,
self_attention_dropout_rate,
kernel_size,
- sanm_shfit=sanm_shfit,
+ sanm_shift=sanm_shift,
),
None,
PositionwiseFeedForwardDecoderSANM(attention_dim, linear_units, dropout_rate),
diff --git a/funasr/models/sanm/encoder.py b/funasr/models/sanm/encoder.py
index 0d39ca7..b590e24 100644
--- a/funasr/models/sanm/encoder.py
+++ b/funasr/models/sanm/encoder.py
@@ -69,7 +69,7 @@
self.stochastic_depth_rate = stochastic_depth_rate
self.dropout_rate = dropout_rate
- def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
+ def forward(self, x, mask, cache=None, mask_shift_chunk=None, mask_att_chunk_encoder=None):
"""Compute encoded features.
Args:
@@ -106,7 +106,7 @@
self.self_attn(
x,
mask,
- mask_shfit_chunk=mask_shfit_chunk,
+ mask_shift_chunk=mask_shift_chunk,
mask_att_chunk_encoder=mask_att_chunk_encoder,
),
),
@@ -122,7 +122,7 @@
self.self_attn(
x,
mask,
- mask_shfit_chunk=mask_shfit_chunk,
+ mask_shift_chunk=mask_shift_chunk,
mask_att_chunk_encoder=mask_att_chunk_encoder,
)
)
@@ -131,7 +131,7 @@
self.self_attn(
x,
mask,
- mask_shfit_chunk=mask_shfit_chunk,
+ mask_shift_chunk=mask_shift_chunk,
mask_att_chunk_encoder=mask_att_chunk_encoder,
)
)
@@ -145,7 +145,7 @@
if not self.normalize_before:
x = self.norm2(x)
- return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder
+ return x, mask, cache, mask_shift_chunk, mask_att_chunk_encoder
def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
"""Compute encoded features.
@@ -212,7 +212,7 @@
interctc_layer_idx: List[int] = [],
interctc_use_conditioning: bool = False,
kernel_size: int = 11,
- sanm_shfit: int = 0,
+ sanm_shift: int = 0,
lora_list: List[str] = None,
lora_rank: int = 8,
lora_alpha: int = 16,
@@ -299,7 +299,7 @@
output_size,
attention_dropout_rate,
kernel_size,
- sanm_shfit,
+ sanm_shift,
lora_list,
lora_rank,
lora_alpha,
@@ -312,7 +312,7 @@
output_size,
attention_dropout_rate,
kernel_size,
- sanm_shfit,
+ sanm_shift,
lora_list,
lora_rank,
lora_alpha,
diff --git a/funasr/models/sanm/template.yaml b/funasr/models/sanm/template.yaml
index 316fe75..987fec2 100644
--- a/funasr/models/sanm/template.yaml
+++ b/funasr/models/sanm/template.yaml
@@ -26,7 +26,7 @@
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
selfattention_layer_type: sanm
# decoder
@@ -41,7 +41,7 @@
src_attention_dropout_rate: 0.1
att_layer_num: 16
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
diff --git a/funasr/models/scama/chunk_utilis.py b/funasr/models/scama/chunk_utilis.py
index 2fe3fa4..d9b4aa9 100644
--- a/funasr/models/scama/chunk_utilis.py
+++ b/funasr/models/scama/chunk_utilis.py
@@ -21,7 +21,7 @@
stride: tuple = (10,),
pad_left: tuple = (0,),
encoder_att_look_back_factor: tuple = (1,),
- shfit_fsmn: int = 0,
+ shift_fsmn: int = 0,
decoder_att_look_back_factor: tuple = (1,),
):
@@ -45,11 +45,11 @@
encoder_att_look_back_factor,
decoder_att_look_back_factor,
)
- self.shfit_fsmn = shfit_fsmn
+ self.shift_fsmn = shift_fsmn
self.x_add_mask = None
self.x_rm_mask = None
self.x_len = None
- self.mask_shfit_chunk = None
+ self.mask_shift_chunk = None
self.mask_chunk_predictor = None
self.mask_att_chunk_encoder = None
self.mask_shift_att_chunk_decoder = None
@@ -88,7 +88,7 @@
stride,
pad_left,
encoder_att_look_back_factor,
- chunk_size + self.shfit_fsmn,
+ chunk_size + self.shift_fsmn,
decoder_att_look_back_factor,
)
return (
@@ -118,13 +118,13 @@
chunk_size, stride, pad_left, encoder_att_look_back_factor, chunk_size_pad_shift = (
self.get_chunk_size(ind)
)
- shfit_fsmn = self.shfit_fsmn
+ shift_fsmn = self.shift_fsmn
pad_right = chunk_size - stride - pad_left
chunk_num_batch = np.ceil(x_len / stride).astype(np.int32)
x_len_chunk = (
(chunk_num_batch - 1) * chunk_size_pad_shift
- + shfit_fsmn
+ + shift_fsmn
+ pad_left
+ 0
+ x_len
@@ -138,13 +138,13 @@
max_len_for_x_mask_tmp = max(chunk_size, x_len_max + pad_left)
x_add_mask = np.zeros([0, max_len_for_x_mask_tmp], dtype=dtype)
x_rm_mask = np.zeros([max_len_for_x_mask_tmp, 0], dtype=dtype)
- mask_shfit_chunk = np.zeros([0, num_units], dtype=dtype)
+ mask_shift_chunk = np.zeros([0, num_units], dtype=dtype)
mask_chunk_predictor = np.zeros([0, num_units_predictor], dtype=dtype)
mask_shift_att_chunk_decoder = np.zeros([0, 1], dtype=dtype)
mask_att_chunk_encoder = np.zeros([0, chunk_num * chunk_size_pad_shift], dtype=dtype)
for chunk_ids in range(chunk_num):
# x_mask add
- fsmn_padding = np.zeros((shfit_fsmn, max_len_for_x_mask_tmp), dtype=dtype)
+ fsmn_padding = np.zeros((shift_fsmn, max_len_for_x_mask_tmp), dtype=dtype)
x_mask_cur = np.diag(np.ones(chunk_size, dtype=np.float32))
x_mask_pad_left = np.zeros((chunk_size, chunk_ids * stride), dtype=dtype)
x_mask_pad_right = np.zeros((chunk_size, max_len_for_x_mask_tmp), dtype=dtype)
@@ -154,7 +154,7 @@
x_add_mask = np.concatenate([x_add_mask, x_add_mask_fsmn], axis=0)
# x_mask rm
- fsmn_padding = np.zeros((max_len_for_x_mask_tmp, shfit_fsmn), dtype=dtype)
+ fsmn_padding = np.zeros((max_len_for_x_mask_tmp, shift_fsmn), dtype=dtype)
padding_mask_left = np.zeros((max_len_for_x_mask_tmp, pad_left), dtype=dtype)
padding_mask_right = np.zeros((max_len_for_x_mask_tmp, pad_right), dtype=dtype)
x_mask_cur = np.diag(np.ones(stride, dtype=dtype))
@@ -170,13 +170,13 @@
x_rm_mask = np.concatenate([x_rm_mask, x_rm_mask_cur_fsmn], axis=1)
# fsmn_padding_mask
- pad_shfit_mask = np.zeros([shfit_fsmn, num_units], dtype=dtype)
+ pad_shift_mask = np.zeros([shift_fsmn, num_units], dtype=dtype)
ones_1 = np.ones([chunk_size, num_units], dtype=dtype)
- mask_shfit_chunk_cur = np.concatenate([pad_shfit_mask, ones_1], axis=0)
- mask_shfit_chunk = np.concatenate([mask_shfit_chunk, mask_shfit_chunk_cur], axis=0)
+ mask_shift_chunk_cur = np.concatenate([pad_shift_mask, ones_1], axis=0)
+ mask_shift_chunk = np.concatenate([mask_shift_chunk, mask_shift_chunk_cur], axis=0)
# predictor mask
- zeros_1 = np.zeros([shfit_fsmn + pad_left, num_units_predictor], dtype=dtype)
+ zeros_1 = np.zeros([shift_fsmn + pad_left, num_units_predictor], dtype=dtype)
ones_2 = np.ones([stride, num_units_predictor], dtype=dtype)
zeros_3 = np.zeros(
[chunk_size - stride - pad_left, num_units_predictor], dtype=dtype
@@ -188,13 +188,13 @@
)
# encoder att mask
- zeros_1_top = np.zeros([shfit_fsmn, chunk_num * chunk_size_pad_shift], dtype=dtype)
+ zeros_1_top = np.zeros([shift_fsmn, chunk_num * chunk_size_pad_shift], dtype=dtype)
zeros_2_num = max(chunk_ids - encoder_att_look_back_factor, 0)
zeros_2 = np.zeros([chunk_size, zeros_2_num * chunk_size_pad_shift], dtype=dtype)
encoder_att_look_back_num = max(chunk_ids - zeros_2_num, 0)
- zeros_2_left = np.zeros([chunk_size, shfit_fsmn], dtype=dtype)
+ zeros_2_left = np.zeros([chunk_size, shift_fsmn], dtype=dtype)
ones_2_mid = np.ones([stride, stride], dtype=dtype)
zeros_2_bottom = np.zeros([chunk_size - stride, stride], dtype=dtype)
zeros_2_right = np.zeros([chunk_size, chunk_size - stride], dtype=dtype)
@@ -202,7 +202,7 @@
ones_2 = np.concatenate([zeros_2_left, ones_2, zeros_2_right], axis=1)
ones_2 = np.tile(ones_2, [1, encoder_att_look_back_num])
- zeros_3_left = np.zeros([chunk_size, shfit_fsmn], dtype=dtype)
+ zeros_3_left = np.zeros([chunk_size, shift_fsmn], dtype=dtype)
ones_3_right = np.ones([chunk_size, chunk_size], dtype=dtype)
ones_3 = np.concatenate([zeros_3_left, ones_3_right], axis=1)
@@ -218,7 +218,7 @@
)
# decoder fsmn_shift_att_mask
- zeros_1 = np.zeros([shfit_fsmn, 1])
+ zeros_1 = np.zeros([shift_fsmn, 1])
ones_1 = np.ones([chunk_size, 1])
mask_shift_att_chunk_decoder_cur = np.concatenate([zeros_1, ones_1], axis=0)
mask_shift_att_chunk_decoder = np.concatenate(
@@ -229,7 +229,7 @@
self.x_len_chunk = x_len_chunk
self.x_rm_mask = x_rm_mask[:x_len_max, :x_len_chunk_max]
self.x_len = x_len
- self.mask_shfit_chunk = mask_shfit_chunk[:x_len_chunk_max, :]
+ self.mask_shift_chunk = mask_shift_chunk[:x_len_chunk_max, :]
self.mask_chunk_predictor = mask_chunk_predictor[:x_len_chunk_max, :]
self.mask_att_chunk_encoder = mask_att_chunk_encoder[:x_len_chunk_max, :x_len_chunk_max]
self.mask_shift_att_chunk_decoder = mask_shift_att_chunk_decoder[:x_len_chunk_max, :]
@@ -238,7 +238,7 @@
self.x_len_chunk,
self.x_rm_mask,
self.x_len,
- self.mask_shfit_chunk,
+ self.mask_shift_chunk,
self.mask_chunk_predictor,
self.mask_att_chunk_encoder,
self.mask_shift_att_chunk_decoder,
@@ -309,7 +309,7 @@
x = torch.from_numpy(x).type(dtype).to(device)
return x
- def get_mask_shfit_chunk(
+ def get_mask_shift_chunk(
self, chunk_outs=None, device="cpu", batch_size=1, num_units=1, idx=4, dtype=torch.float32
):
with torch.no_grad():
diff --git a/funasr/models/scama/decoder.py b/funasr/models/scama/decoder.py
index 31b2357..f457b75 100644
--- a/funasr/models/scama/decoder.py
+++ b/funasr/models/scama/decoder.py
@@ -226,7 +226,7 @@
concat_after: bool = False,
att_layer_num: int = 6,
kernel_size: int = 21,
- sanm_shfit: int = None,
+ sanm_shift: int = None,
concat_embeds: bool = False,
attention_dim: int = None,
tf2torch_tensor_name_prefix_torch: str = "decoder",
@@ -271,14 +271,14 @@
self.att_layer_num = att_layer_num
self.num_blocks = num_blocks
- if sanm_shfit is None:
- sanm_shfit = (kernel_size - 1) // 2
+ if sanm_shift is None:
+ sanm_shift = (kernel_size - 1) // 2
self.decoders = repeat(
att_layer_num,
lambda lnum: DecoderLayerSANM(
attention_dim,
MultiHeadedAttentionSANMDecoder(
- attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=sanm_shfit
+ attention_dim, self_attention_dropout_rate, kernel_size, sanm_shift=sanm_shift
),
MultiHeadedAttentionCrossAtt(
attention_heads,
@@ -303,7 +303,7 @@
attention_dim,
self_attention_dropout_rate,
kernel_size,
- sanm_shfit=sanm_shfit,
+ sanm_shift=sanm_shift,
),
None,
PositionwiseFeedForwardDecoderSANM(attention_dim, linear_units, dropout_rate),
diff --git a/funasr/models/scama/encoder.py b/funasr/models/scama/encoder.py
index e1fe924..0c871e1 100644
--- a/funasr/models/scama/encoder.py
+++ b/funasr/models/scama/encoder.py
@@ -69,7 +69,7 @@
self.stochastic_depth_rate = stochastic_depth_rate
self.dropout_rate = dropout_rate
- def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
+ def forward(self, x, mask, cache=None, mask_shift_chunk=None, mask_att_chunk_encoder=None):
"""Compute encoded features.
Args:
@@ -106,7 +106,7 @@
self.self_attn(
x,
mask,
- mask_shfit_chunk=mask_shfit_chunk,
+ mask_shift_chunk=mask_shift_chunk,
mask_att_chunk_encoder=mask_att_chunk_encoder,
),
),
@@ -122,7 +122,7 @@
self.self_attn(
x,
mask,
- mask_shfit_chunk=mask_shfit_chunk,
+ mask_shift_chunk=mask_shift_chunk,
mask_att_chunk_encoder=mask_att_chunk_encoder,
)
)
@@ -131,7 +131,7 @@
self.self_attn(
x,
mask,
- mask_shfit_chunk=mask_shfit_chunk,
+ mask_shift_chunk=mask_shift_chunk,
mask_att_chunk_encoder=mask_att_chunk_encoder,
)
)
@@ -145,7 +145,7 @@
if not self.normalize_before:
x = self.norm2(x)
- return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder
+ return x, mask, cache, mask_shift_chunk, mask_att_chunk_encoder
def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
"""Compute encoded features.
@@ -212,7 +212,7 @@
interctc_layer_idx: List[int] = [],
interctc_use_conditioning: bool = False,
kernel_size: int = 11,
- sanm_shfit: int = 0,
+ sanm_shift: int = 0,
selfattention_layer_type: str = "sanm",
chunk_size: Union[int, Sequence[int]] = (16,),
stride: Union[int, Sequence[int]] = (10,),
@@ -299,7 +299,7 @@
output_size,
attention_dropout_rate,
kernel_size,
- sanm_shfit,
+ sanm_shift,
)
encoder_selfattn_layer_args = (
@@ -308,7 +308,7 @@
output_size,
attention_dropout_rate,
kernel_size,
- sanm_shfit,
+ sanm_shift,
)
self.encoders0 = repeat(
1,
@@ -343,12 +343,12 @@
assert 0 < min(interctc_layer_idx) and max(interctc_layer_idx) < num_blocks
self.interctc_use_conditioning = interctc_use_conditioning
self.conditioning_layer = None
- shfit_fsmn = (kernel_size - 1) // 2
+ shift_fsmn = (kernel_size - 1) // 2
self.overlap_chunk_cls = overlap_chunk(
chunk_size=chunk_size,
stride=stride,
pad_left=pad_left,
- shfit_fsmn=shfit_fsmn,
+ shift_fsmn=shift_fsmn,
encoder_att_look_back_factor=encoder_att_look_back_factor,
decoder_att_look_back_factor=decoder_att_look_back_factor,
)
@@ -397,31 +397,31 @@
else:
xs_pad = self.embed(xs_pad)
- mask_shfit_chunk, mask_att_chunk_encoder = None, None
+ mask_shift_chunk, mask_att_chunk_encoder = None, None
if self.overlap_chunk_cls is not None:
ilens = masks.squeeze(1).sum(1)
chunk_outs = self.overlap_chunk_cls.gen_chunk_mask(ilens, ind)
xs_pad, ilens = self.overlap_chunk_cls.split_chunk(xs_pad, ilens, chunk_outs=chunk_outs)
masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device)
- mask_shfit_chunk = self.overlap_chunk_cls.get_mask_shfit_chunk(
+ mask_shift_chunk = self.overlap_chunk_cls.get_mask_shift_chunk(
chunk_outs, xs_pad.device, xs_pad.size(0), dtype=xs_pad.dtype
)
mask_att_chunk_encoder = self.overlap_chunk_cls.get_mask_att_chunk_encoder(
chunk_outs, xs_pad.device, xs_pad.size(0), dtype=xs_pad.dtype
)
- encoder_outs = self.encoders0(xs_pad, masks, None, mask_shfit_chunk, mask_att_chunk_encoder)
+ encoder_outs = self.encoders0(xs_pad, masks, None, mask_shift_chunk, mask_att_chunk_encoder)
xs_pad, masks = encoder_outs[0], encoder_outs[1]
intermediate_outs = []
if len(self.interctc_layer_idx) == 0:
encoder_outs = self.encoders(
- xs_pad, masks, None, mask_shfit_chunk, mask_att_chunk_encoder
+ xs_pad, masks, None, mask_shift_chunk, mask_att_chunk_encoder
)
xs_pad, masks = encoder_outs[0], encoder_outs[1]
else:
for layer_idx, encoder_layer in enumerate(self.encoders):
encoder_outs = encoder_layer(
- xs_pad, masks, None, mask_shfit_chunk, mask_att_chunk_encoder
+ xs_pad, masks, None, mask_shift_chunk, mask_att_chunk_encoder
)
xs_pad, masks = encoder_outs[0], encoder_outs[1]
if layer_idx + 1 in self.interctc_layer_idx:
diff --git a/funasr/models/scama/model.py b/funasr/models/scama/model.py
index c15f435..4a28435 100644
--- a/funasr/models/scama/model.py
+++ b/funasr/models/scama/model.py
@@ -321,10 +321,10 @@
mask_chunk_predictor = self.encoder.overlap_chunk_cls.get_mask_chunk_predictor(
None, device=encoder_out.device, batch_size=encoder_out.size(0)
)
- mask_shfit_chunk = self.encoder.overlap_chunk_cls.get_mask_shfit_chunk(
+ mask_shift_chunk = self.encoder.overlap_chunk_cls.get_mask_shift_chunk(
None, device=encoder_out.device, batch_size=encoder_out.size(0)
)
- encoder_out = encoder_out * mask_shfit_chunk
+ encoder_out = encoder_out * mask_shift_chunk
pre_acoustic_embeds, pre_token_length, pre_alphas, _ = self.predictor(
encoder_out,
ys_out_pad,
@@ -415,10 +415,10 @@
mask_chunk_predictor = self.encoder.overlap_chunk_cls.get_mask_chunk_predictor(
None, device=encoder_out.device, batch_size=encoder_out.size(0)
)
- mask_shfit_chunk = self.encoder.overlap_chunk_cls.get_mask_shfit_chunk(
+ mask_shift_chunk = self.encoder.overlap_chunk_cls.get_mask_shift_chunk(
None, device=encoder_out.device, batch_size=encoder_out.size(0)
)
- encoder_out = encoder_out * mask_shfit_chunk
+ encoder_out = encoder_out * mask_shift_chunk
pre_acoustic_embeds, pre_token_length, pre_alphas, _ = self.predictor(
encoder_out,
ys_out_pad,
diff --git a/funasr/models/scama/template.yaml b/funasr/models/scama/template.yaml
index bc2e210..8e14cd3 100644
--- a/funasr/models/scama/template.yaml
+++ b/funasr/models/scama/template.yaml
@@ -26,7 +26,7 @@
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
selfattention_layer_type: sanm
# decoder
@@ -41,7 +41,7 @@
src_attention_dropout_rate: 0.1
att_layer_num: 16
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
predictor: CifPredictorV2
predictor_conf:
diff --git a/funasr/models/seaco_paraformer/template.yaml b/funasr/models/seaco_paraformer/template.yaml
index fcaf524..2bf0825 100644
--- a/funasr/models/seaco_paraformer/template.yaml
+++ b/funasr/models/seaco_paraformer/template.yaml
@@ -36,7 +36,7 @@
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
selfattention_layer_type: sanm
# decoder
@@ -51,7 +51,7 @@
src_attention_dropout_rate: 0.1
att_layer_num: 16
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
# seaco decoder
seaco_decoder: ParaformerSANMDecoder
@@ -64,7 +64,7 @@
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
kernel_size: 21
- sanm_shfit: 0
+ sanm_shift: 0
use_output_layer: false
wo_input_layer: true
diff --git a/funasr/models/sense_voice/model.py b/funasr/models/sense_voice/model.py
index 9d8ef84..ca0c40a 100644
--- a/funasr/models/sense_voice/model.py
+++ b/funasr/models/sense_voice/model.py
@@ -95,7 +95,7 @@
n_feat,
dropout_rate,
kernel_size,
- sanm_shfit=0,
+ sanm_shift=0,
lora_list=None,
lora_rank=8,
lora_alpha=16,
@@ -121,17 +121,17 @@
)
# padding
left_padding = (kernel_size - 1) // 2
- if sanm_shfit > 0:
- left_padding = left_padding + sanm_shfit
+ if sanm_shift > 0:
+ left_padding = left_padding + sanm_shift
right_padding = kernel_size - 1 - left_padding
self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0)
- def forward_fsmn(self, inputs, mask, mask_shfit_chunk=None):
+ def forward_fsmn(self, inputs, mask, mask_shift_chunk=None):
b, t, d = inputs.size()
if mask is not None:
mask = torch.reshape(mask, (b, -1, 1))
- if mask_shfit_chunk is not None:
- mask = mask * mask_shfit_chunk
+ if mask_shift_chunk is not None:
+ mask = mask * mask_shift_chunk
inputs = inputs * mask
x = inputs.transpose(1, 2)
@@ -211,7 +211,7 @@
return self.linear_out(x) # (batch, time1, d_model)
- def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
+ def forward(self, x, mask, mask_shift_chunk=None, mask_att_chunk_encoder=None):
"""Compute scaled dot product attention.
Args:
@@ -226,7 +226,7 @@
"""
q_h, k_h, v_h, v = self.forward_qkv(x)
- fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk)
+ fsmn_memory = self.forward_fsmn(v, mask, mask_shift_chunk)
q_h = q_h * self.d_k ** (-0.5)
scores = torch.matmul(q_h, k_h.transpose(-2, -1))
att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder)
@@ -326,7 +326,7 @@
self.stochastic_depth_rate = stochastic_depth_rate
self.dropout_rate = dropout_rate
- def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
+ def forward(self, x, mask, cache=None, mask_shift_chunk=None, mask_att_chunk_encoder=None):
"""Compute encoded features.
Args:
@@ -363,7 +363,7 @@
self.self_attn(
x,
mask,
- mask_shfit_chunk=mask_shfit_chunk,
+ mask_shift_chunk=mask_shift_chunk,
mask_att_chunk_encoder=mask_att_chunk_encoder,
),
),
@@ -379,7 +379,7 @@
self.self_attn(
x,
mask,
- mask_shfit_chunk=mask_shfit_chunk,
+ mask_shift_chunk=mask_shift_chunk,
mask_att_chunk_encoder=mask_att_chunk_encoder,
)
)
@@ -388,7 +388,7 @@
self.self_attn(
x,
mask,
- mask_shfit_chunk=mask_shfit_chunk,
+ mask_shift_chunk=mask_shift_chunk,
mask_att_chunk_encoder=mask_att_chunk_encoder,
)
)
@@ -402,7 +402,7 @@
if not self.normalize_before:
x = self.norm2(x)
- return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder
+ return x, mask, cache, mask_shift_chunk, mask_att_chunk_encoder
def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
"""Compute encoded features.
@@ -469,7 +469,7 @@
positionwise_conv_kernel_size: int = 1,
padding_idx: int = -1,
kernel_size: int = 11,
- sanm_shfit: int = 0,
+ sanm_shift: int = 0,
selfattention_layer_type: str = "sanm",
**kwargs,
):
@@ -494,7 +494,7 @@
output_size,
attention_dropout_rate,
kernel_size,
- sanm_shfit,
+ sanm_shift,
)
encoder_selfattn_layer_args = (
attention_heads,
@@ -502,7 +502,7 @@
output_size,
attention_dropout_rate,
kernel_size,
- sanm_shfit,
+ sanm_shift,
)
self.encoders0 = nn.ModuleList(
diff --git a/funasr/models/sond/encoder/fsmn_encoder.py b/funasr/models/sond/encoder/fsmn_encoder.py
index 9ec9912..5bccf43 100644
--- a/funasr/models/sond/encoder/fsmn_encoder.py
+++ b/funasr/models/sond/encoder/fsmn_encoder.py
@@ -36,12 +36,12 @@
right_padding = kernel_size - 1 - left_padding
self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0)
- def forward(self, inputs, mask, mask_shfit_chunk=None):
+ def forward(self, inputs, mask, mask_shift_chunk=None):
b, t, d = inputs.size()
if mask is not None:
mask = torch.reshape(mask, (b, -1, 1))
- if mask_shfit_chunk is not None:
- mask = mask * mask_shfit_chunk
+ if mask_shift_chunk is not None:
+ mask = mask * mask_shift_chunk
inputs = inputs * mask
x = inputs.transpose(1, 2)
diff --git a/funasr/models/uniasr/model.py b/funasr/models/uniasr/model.py
index bde6377..002dcdd 100644
--- a/funasr/models/uniasr/model.py
+++ b/funasr/models/uniasr/model.py
@@ -521,10 +521,10 @@
mask_chunk_predictor = self.encoder.overlap_chunk_cls.get_mask_chunk_predictor(
None, device=encoder_out.device, batch_size=encoder_out.size(0)
)
- mask_shfit_chunk = self.encoder.overlap_chunk_cls.get_mask_shfit_chunk(
+ mask_shift_chunk = self.encoder.overlap_chunk_cls.get_mask_shift_chunk(
None, device=encoder_out.device, batch_size=encoder_out.size(0)
)
- encoder_out = encoder_out * mask_shfit_chunk
+ encoder_out = encoder_out * mask_shift_chunk
pre_acoustic_embeds, pre_token_length, pre_alphas, _ = self.predictor(
encoder_out,
ys_out_pad,
@@ -622,10 +622,10 @@
mask_chunk_predictor = self.encoder2.overlap_chunk_cls.get_mask_chunk_predictor(
None, device=encoder_out.device, batch_size=encoder_out.size(0)
)
- mask_shfit_chunk = self.encoder2.overlap_chunk_cls.get_mask_shfit_chunk(
+ mask_shift_chunk = self.encoder2.overlap_chunk_cls.get_mask_shift_chunk(
None, device=encoder_out.device, batch_size=encoder_out.size(0)
)
- encoder_out = encoder_out * mask_shfit_chunk
+ encoder_out = encoder_out * mask_shift_chunk
pre_acoustic_embeds, pre_token_length, pre_alphas, _ = self.predictor2(
encoder_out,
ys_out_pad,
@@ -724,10 +724,10 @@
mask_chunk_predictor = self.encoder.overlap_chunk_cls.get_mask_chunk_predictor(
None, device=encoder_out.device, batch_size=encoder_out.size(0)
)
- mask_shfit_chunk = self.encoder.overlap_chunk_cls.get_mask_shfit_chunk(
+ mask_shift_chunk = self.encoder.overlap_chunk_cls.get_mask_shift_chunk(
None, device=encoder_out.device, batch_size=encoder_out.size(0)
)
- encoder_out = encoder_out * mask_shfit_chunk
+ encoder_out = encoder_out * mask_shift_chunk
pre_acoustic_embeds, pre_token_length, pre_alphas, _ = self.predictor(
encoder_out,
ys_out_pad,
@@ -806,10 +806,10 @@
mask_chunk_predictor = self.encoder2.overlap_chunk_cls.get_mask_chunk_predictor(
None, device=encoder_out.device, batch_size=encoder_out.size(0)
)
- mask_shfit_chunk = self.encoder2.overlap_chunk_cls.get_mask_shfit_chunk(
+ mask_shift_chunk = self.encoder2.overlap_chunk_cls.get_mask_shift_chunk(
None, device=encoder_out.device, batch_size=encoder_out.size(0)
)
- encoder_out = encoder_out * mask_shfit_chunk
+ encoder_out = encoder_out * mask_shift_chunk
pre_acoustic_embeds, pre_token_length, pre_alphas, _ = self.predictor2(
encoder_out,
ys_out_pad,
diff --git a/funasr/models/uniasr/template.yaml b/funasr/models/uniasr/template.yaml
index 43d55fc..c4a4deb 100644
--- a/funasr/models/uniasr/template.yaml
+++ b/funasr/models/uniasr/template.yaml
@@ -33,7 +33,7 @@
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
selfattention_layer_type: sanm
chunk_size:
- 20
@@ -89,7 +89,7 @@
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 21
- sanm_shfit: 0
+ sanm_shift: 0
selfattention_layer_type: sanm
chunk_size:
- 45
diff --git a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/DecoderConfEntity.cs b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/DecoderConfEntity.cs
index d5c0057..97b1d90 100644
--- a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/DecoderConfEntity.cs
+++ b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/DecoderConfEntity.cs
@@ -19,7 +19,7 @@
private float _src_attention_dropout_rate = 0.1F;
private int _att_layer_num = 16;
private int _kernel_size = 11;
- private int _sanm_shfit = 0;
+ private int _sanm_shift = 0;
public int attention_heads { get => _attention_heads; set => _attention_heads = value; }
public int linear_units { get => _linear_units; set => _linear_units = value; }
@@ -30,7 +30,7 @@
public float src_attention_dropout_rate { get => _src_attention_dropout_rate; set => _src_attention_dropout_rate = value; }
public int att_layer_num { get => _att_layer_num; set => _att_layer_num = value; }
public int kernel_size { get => _kernel_size; set => _kernel_size = value; }
- public int sanm_shfit { get => _sanm_shfit; set => _sanm_shfit = value; }
-
+ public int sanm_shift { get => _sanm_shift; set => _sanm_shift = value; }
+
}
}
diff --git a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/EncoderConfEntity.cs b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/EncoderConfEntity.cs
index ffe505e..5d01266 100644
--- a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/EncoderConfEntity.cs
+++ b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/EncoderConfEntity.cs
@@ -21,7 +21,7 @@
private string _pos_enc_class = "SinusoidalPositionEncoder";
private bool _normalize_before = true;
private int _kernel_size = 11;
- private int _sanm_shfit = 0;
+ private int _sanm_shift = 0;
private string _selfattention_layer_type = "sanm";
public int output_size { get => _output_size; set => _output_size = value; }
@@ -35,7 +35,7 @@
public string pos_enc_class { get => _pos_enc_class; set => _pos_enc_class = value; }
public bool normalize_before { get => _normalize_before; set => _normalize_before = value; }
public int kernel_size { get => _kernel_size; set => _kernel_size = value; }
- public int sanm_shfit { get => _sanm_shfit; set => _sanm_shfit = value; }
+ public int sanm_shift { get => _sanm_shift; set => _sanm_shift = value; }
public string selfattention_layer_type { get => _selfattention_layer_type; set => _selfattention_layer_type = value; }
}
}
diff --git a/runtime/triton_gpu/model_repo_paraformer_large_online/feature_extractor/config.yaml b/runtime/triton_gpu/model_repo_paraformer_large_online/feature_extractor/config.yaml
index 9b2266f..cf57504 100755
--- a/runtime/triton_gpu/model_repo_paraformer_large_online/feature_extractor/config.yaml
+++ b/runtime/triton_gpu/model_repo_paraformer_large_online/feature_extractor/config.yaml
@@ -8593,7 +8593,7 @@
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
selfattention_layer_type: sanm
chunk_size:
- 12
@@ -8623,7 +8623,7 @@
src_attention_dropout_rate: 0.1
att_layer_num: 16
kernel_size: 11
- sanm_shfit: 5
+ sanm_shift: 5
predictor: cif_predictor_v2
predictor_conf:
idim: 512
diff --git a/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/config.yaml b/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/config.yaml
index 26bb9d3..a66f1ca 100644
--- a/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/config.yaml
+++ b/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/config.yaml
@@ -12,7 +12,7 @@
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
- sanm_shfit: 0
+ sanm_shift: 0
selfattention_layer_type: sanm
--
Gitblit v1.9.1