From 3cee2214b77eda865ddfd7990a1a7197e757e8bb Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期一, 27 十一月 2023 13:53:07 +0800
Subject: [PATCH] Merge branch 'dev_gzf_funasr2' of github.com:alibaba-damo-academy/FunASR into dev_gzf_funasr2 add

---
 funasr/utils/speaker_utils.py                  |    3 
 funasr/models/e2e_asr_contextual_paraformer.py |    5 -
 runtime/onnxruntime/src/fsmn-vad-online.cpp    |    1 
 runtime/onnxruntime/src/paraformer-online.cpp  |    1 
 MODEL_LICENSE                                  |   71 ++++++++++++-----------
 funasr/bin/asr_inference_launch.py             |   41 +++++++------
 runtime/onnxruntime/src/fsmn-vad.cpp           |    1 
 runtime/onnxruntime/src/paraformer.cpp         |    1 
 8 files changed, 67 insertions(+), 57 deletions(-)

diff --git a/MODEL_LICENSE b/MODEL_LICENSE
index 3d9e410..09ee9eb 100644
--- a/MODEL_LICENSE
+++ b/MODEL_LICENSE
@@ -1,3 +1,39 @@
+FunASR Model Open Source License
+Version 1.0
+
+Copyright (C) [2023-2028] Alibaba Group. All rights reserved.
+
+Thank you for choosing the FunASR open source models. The FunASR open source models contain a series of open-source models that allow everyone to use, modify, share, and learn from it.
+
+To ensure better community collaboration, we have developed the following agreement and hope that you carefully read and abide by it.
+
+1 Definitions
+In this agreement, [FunASR software] refers to the FunASR open source model, and its derivatives, including fine-tuned models. [You] refer to individuals or organizations who use, modify, share, and learn from [FunASR software].
+
+2 License and Restrictions
+
+2.1 License
+You are free to use, copy, modify, and share [FunASR software] under the conditions of this agreement.
+
+2.2 Restrictions
+You should indicate the code and model source and author information when using, copying, modifying and sharing [FunASR software]. You should keep the relevant names of models in [FunASR software].
+
+3 Responsibility and Risk
+[FunASR software] is for reference and learning purposes only and is not responsible for any direct or indirect losses caused by your use or modification of [FunASR software]. You should take responsibility and risks for your use and modification of [FunASR software].
+
+4 Termination
+If you violate any terms of this agreement, your license will be automatically terminated, and you must stop using, copying, modifying, and sharing [FunASR software].
+
+5 Revision
+This agreement may be updated and revised from time to time. The revised agreement will be published in the FunASR official repository and automatically take effect. If you continue to use, copy, modify, and share [FunASR software], it means you agree to the revised agreement.
+
+6 Other Provisions
+This agreement is subject to the laws of [Country/Region]. If any provisions are found to be illegal, invalid, or unenforceable, they shall be deemed deleted from this agreement, and the remaining provisions shall remain valid and binding.
+
+If you have any questions or comments about this agreement, please contact us.
+
+Copyright (c) [2023-2028] Alibaba Group. All rights reserved.
+
 FunASR 模型开源协议
 
 版本号：1.0
@@ -36,38 +72,3 @@
 
 版权所有 © [2023-2028] [阿里巴巴集团]。保留所有权利。
 
-FunASR Model Open Source License
-Version 1.0
-
-Copyright (C) [2023-2028] Alibaba Group. All rights reserved.
-
-Thank you for choosing the FunASR open source models. The FunASR open source models contain a series of open-source models that allow everyone to use, modify, share, and learn from it.
-
-To ensure better community collaboration, we have developed the following agreement and hope that you carefully read and abide by it.
-
-1 Definitions
-In this agreement, [FunASR software] refers to the FunASR open source model, and its derivatives, including fine-tuned models. [You] refer to individuals or organizations who use, modify, share, and learn from [FunASR software].
-
-2 License and Restrictions
-
-2.1 License
-You are free to use, copy, modify, and share [FunASR software] under the conditions of this agreement.
-
-2.2 Restrictions
-You should indicate the code and model source and author information when using, copying, modifying and sharing [FunASR software]. You should keep the relevant names of models in [FunASR software].
-
-3 Responsibility and Risk
-[FunASR software] is for reference and learning purposes only and is not responsible for any direct or indirect losses caused by your use or modification of [FunASR software]. You should take responsibility and risks for your use and modification of [FunASR software].
-
-4 Termination
-If you violate any terms of this agreement, your license will be automatically terminated, and you must stop using, copying, modifying, and sharing [FunASR software].
-
-5 Revision
-This agreement may be updated and revised from time to time. The revised agreement will be published in the FunASR official repository and automatically take effect. If you continue to use, copy, modify, and share [FunASR software], it means you agree to the revised agreement.
-
-6 Other Provisions
-This agreement is subject to the laws of [Country/Region]. If any provisions are found to be illegal, invalid, or unenforceable, they shall be deemed deleted from this agreement, and the remaining provisions shall remain valid and binding.
-
-If you have any questions or comments about this agreement, please contact us.
-
-Copyright (c) [2023-2028] Alibaba Group. All rights reserved.
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index 7dd27fc..f61c085 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -956,24 +956,29 @@
                 ed = int(vadsegment[1]) / 1000
                 vad_segments.append(
                     [st, ed, audio[int(st * 16000):int(ed * 16000)]])
-            check_audio_list(vad_segments)
-            # sv pipeline
-            segments = sv_chunk(vad_segments)
-            embeddings = []
-            for s in segments:
-                #_, embs = self.sv_pipeline([s[2]], output_emb=True)
-                # embeddings.append(embs)
-                wavs = sv_preprocess([s[2]])
-                # embs = self.forward(wavs)
-                embs = []
-                for x in wavs:
-                    x = extract_feature([x])
-                    embs.append(sv_model(x))
-                embs = torch.cat(embs)
-                embeddings.append(embs.detach().numpy())
-            embeddings = np.concatenate(embeddings)
-            labels = cb_model(embeddings)
-            sv_output = postprocess(segments, vad_segments, labels, embeddings)
+            audio_dur = check_audio_list(vad_segments)
+            if audio_dur > 5:
+                # sv pipeline
+                segments = sv_chunk(vad_segments)
+                embeddings = []
+                for s in segments:
+                    #_, embs = self.sv_pipeline([s[2]], output_emb=True)
+                    # embeddings.append(embs)
+                    wavs = sv_preprocess([s[2]])
+                    # embs = self.forward(wavs)
+                    embs = []
+                    for x in wavs:
+                        x = extract_feature([x])
+                        embs.append(sv_model(x))
+                    embs = torch.cat(embs)
+                    embeddings.append(embs.detach().numpy())
+                embeddings = np.concatenate(embeddings)
+                labels = cb_model(embeddings)
+                sv_output = postprocess(segments, vad_segments, labels, embeddings)
+            else:
+                # fake speaker res for too short utterance
+                sv_output = [[0.0, vadsegments[-1][-1]/1000.0, 0]]
+                logging.warning("Too short utterance found: {}, return default speaker results.".format(keys))
 
             speech, speech_lengths = batch["speech"], batch["speech_lengths"]
 
diff --git a/funasr/models/e2e_asr_contextual_paraformer.py b/funasr/models/e2e_asr_contextual_paraformer.py
index 64e0f8d..a2f7078 100644
--- a/funasr/models/e2e_asr_contextual_paraformer.py
+++ b/funasr/models/e2e_asr_contextual_paraformer.py
@@ -134,7 +134,7 @@
             text_lengths: torch.Tensor,
             hotword_pad: torch.Tensor,
             hotword_lengths: torch.Tensor,
-            ideal_attn: torch.Tensor,
+            dha_pad: torch.Tensor,
     ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
         """Frontend + Encoder + Decoder + Calc loss
 
@@ -207,7 +207,7 @@
         # 2b. Attention decoder branch
         if self.ctc_weight != 1.0:
             loss_att, acc_att, cer_att, wer_att, loss_pre, loss_ideal = self._calc_att_clas_loss(
-                encoder_out, encoder_out_lens, text, text_lengths, hotword_pad, hotword_lengths, ideal_attn
+                encoder_out, encoder_out_lens, text, text_lengths, hotword_pad, hotword_lengths
             )
 
         # 3. CTC-Att loss definition
@@ -242,7 +242,6 @@
             ys_pad_lens: torch.Tensor,
             hotword_pad: torch.Tensor,
             hotword_lengths: torch.Tensor,
-            ideal_attn: torch.Tensor,
     ):
         encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
             encoder_out.device)
diff --git a/funasr/utils/speaker_utils.py b/funasr/utils/speaker_utils.py
index 38ef11c..edaf58b 100644
--- a/funasr/utils/speaker_utils.py
+++ b/funasr/utils/speaker_utils.py
@@ -35,7 +35,8 @@
             assert seg[0] >= audio[
                 i - 1][1], 'modelscope error: Wrong time stamps.'
         audio_dur += seg[1] - seg[0]
-    assert audio_dur > 5, 'modelscope error: The effective audio duration is too short.'
+    return audio_dur
+    # assert audio_dur > 5, 'modelscope error: The effective audio duration is too short.'
 
 
 def sv_preprocess(inputs: Union[np.ndarray, list]):
diff --git a/runtime/onnxruntime/src/fsmn-vad-online.cpp b/runtime/onnxruntime/src/fsmn-vad-online.cpp
index b9cda4b..a8cc5d8 100644
--- a/runtime/onnxruntime/src/fsmn-vad-online.cpp
+++ b/runtime/onnxruntime/src/fsmn-vad-online.cpp
@@ -110,6 +110,7 @@
                     p.insert(p.end(), vad_feats[vad_feats.size() - 1].begin(), vad_feats[vad_feats.size() - 1].end());
                 }
                 out_feats.emplace_back(p);
+                p.clear();
             } else {
                 lfr_splice_frame_idxs = i;
                 break;
diff --git a/runtime/onnxruntime/src/fsmn-vad.cpp b/runtime/onnxruntime/src/fsmn-vad.cpp
index db633b0..c832274 100644
--- a/runtime/onnxruntime/src/fsmn-vad.cpp
+++ b/runtime/onnxruntime/src/fsmn-vad.cpp
@@ -264,6 +264,7 @@
                 p.insert(p.end(), vad_feats[vad_feats.size() - 1].begin(), vad_feats[vad_feats.size() - 1].end());
             }
             out_feats.emplace_back(p);
+            p.clear();
         }
     }
     // Apply cmvn
diff --git a/runtime/onnxruntime/src/paraformer-online.cpp b/runtime/onnxruntime/src/paraformer-online.cpp
index 3b629c5..d08b57e 100644
--- a/runtime/onnxruntime/src/paraformer-online.cpp
+++ b/runtime/onnxruntime/src/paraformer-online.cpp
@@ -164,6 +164,7 @@
                     p.insert(p.end(), wav_feats[wav_feats.size() - 1].begin(), wav_feats[wav_feats.size() - 1].end());
                 }
                 out_feats.emplace_back(p);
+                p.clear();
             } else {
                 lfr_splice_frame_idxs = i;
                 break;
diff --git a/runtime/onnxruntime/src/paraformer.cpp b/runtime/onnxruntime/src/paraformer.cpp
index 9e8e336..4e89ea2 100644
--- a/runtime/onnxruntime/src/paraformer.cpp
+++ b/runtime/onnxruntime/src/paraformer.cpp
@@ -436,6 +436,7 @@
                 p.insert(p.end(), asr_feats[asr_feats.size() - 1].begin(), asr_feats[asr_feats.size() - 1].end());
             }
             out_feats.emplace_back(p);
+            p.clear();
         }
     }
     // Apply cmvn

--
Gitblit v1.9.1