From adcee8828ef5d78b575043954deb662a35e318f7 Mon Sep 17 00:00:00 2001
From: huangmingming <huangmingming@deepscience.cn>
Date: Mon, 30 Jan 2023 16:02:54 +0800
Subject: [PATCH] update the minimum size of audio

---
 funasr/bin/asr_inference_paraformer_vad_punc.py |   42 ++++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py
index 7752ea9..7a539e4 100644
--- a/funasr/bin/asr_inference_paraformer_vad_punc.py
+++ b/funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -3,6 +3,7 @@
 import logging
 import sys
 import time
+import json
 from pathlib import Path
 from typing import Optional
 from typing import Sequence
@@ -100,10 +101,13 @@
         # logging.info("asr_train_args: {}".format(asr_train_args))
         asr_model.to(dtype=getattr(torch, dtype)).eval()
 
-        ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+        if asr_model.ctc != None:
+            ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+            scorers.update(
+                ctc=ctc
+            )
         token_list = asr_model.token_list
         scorers.update(
-            ctc=ctc,
             length_bonus=LengthBonus(len(token_list)),
         )
 
@@ -171,7 +175,7 @@
         self.converter = converter
         self.tokenizer = tokenizer
         is_use_lm = lm_weight != 0.0 and lm_file is not None
-        if ctc_weight == 0.0 and not is_use_lm:
+        if (ctc_weight == 0.0 or asr_model.ctc == None) and not is_use_lm:
             beam_search = None
         self.beam_search = beam_search
         logging.info(f"Beam_search: {self.beam_search}")
@@ -562,6 +566,7 @@
         length_total = 0.0
         finish_count = 0
         file_count = 1
+        lfr_factor = 6
         # 7 .Start for-loop
         asr_result_list = []
         output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
@@ -597,7 +602,7 @@
                     results = speech2text(**batch)
                     if len(results) < 1:
                         hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-                        results = [[" ", ["<space>"], [2], 10, 6]] * nbest
+                        results = [[" ", ["<space>"], [2], 0, 1, 6]] * nbest
                     time_end = time.time()
                     forward_time = time_end - time_beg
                     lfr_factor = results[0][-1]
@@ -615,7 +620,8 @@
                 
                 key = keys[0]
                 result = result_segments[0]
-                text, token, token_int, time_stamp = result
+                text, token, token_int = result[0], result[1], result[2]
+                time_stamp = None if len(result) < 4 else result[3]
                 
                 # Create a directory: outdir/{n}best_recog
                 if writer is not None:
@@ -630,15 +636,23 @@
                         text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \
                                                                                    postprocessed_result[1], \
                                                                                    postprocessed_result[2]
-                        text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20)
-                        text_postprocessed_punc_time_stamp = "predictions: {}  time_stamp: {}".format(
-                            text_postprocessed_punc, time_stamp_postprocessed)
+                        if len(word_lists) > 0: 
+                            text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20)
+                            text_postprocessed_punc_time_stamp = json.dumps({"predictions": text_postprocessed_punc,
+                                                                             "time_stamp": time_stamp_postprocessed},
+                                                                            ensure_ascii=False)
+                        else:
+                            text_postprocessed_punc = ""
+                            punc_id_list = []
+                            text_postprocessed_punc_time_stamp = ""
+                            
                     else:
-                        text_postprocessed = postprocessed_result
-                        time_stamp_postprocessed = None
-                        word_lists = None
-                        text_postprocessed_punc_time_stamp = None
-                        punc_id_list = None
+                        text_postprocessed = ""
+                        time_stamp_postprocessed = ""
+                        word_lists = ""
+                        text_postprocessed_punc_time_stamp = ""
+                        punc_id_list = ""
+                        text_postprocessed_punc = ""
 
                     item = {'key': key, 'value': text_postprocessed_punc_time_stamp, 'text': text_postprocessed,
                             'time_stamp': time_stamp_postprocessed, 'punc': punc_id_list, 'token': token}
@@ -660,7 +674,7 @@
                                                                                          time_stamp_postprocessed))
         
         logging.info("decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".
-                     format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor)))
+                     format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor+1e-6)))
         return asr_result_list
     return _forward
 

--
Gitblit v1.9.1