From a1a33c2a20b9de7114d42e07a2ad7d6a1d2e0f8f Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Thu, 16 Feb 2023 16:46:52 +0800
Subject: [PATCH] readme

---
 funasr/bin/asr_inference_paraformer.py |   53 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index 709c5bf..be35e78 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -6,6 +6,8 @@
 import copy
 import os
 import codecs
+import tempfile
+import requests
 from pathlib import Path
 from typing import Optional
 from typing import Sequence
@@ -175,10 +177,24 @@
         self.converter = converter
         self.tokenizer = tokenizer
 
-        # 6. [Optional] Build hotword list from file or str
+        # 6. [Optional] Build the hotword list from a string, a local file, or a URL
+        # Case 1: no hotwords were supplied
         if hotword_list_or_file is None:
             self.hotword_list = None
+        # Case 2: not an existing path and not an http(s) URL -- treat as whitespace-separated hotwords
+        elif not os.path.exists(hotword_list_or_file) and not hotword_list_or_file.startswith('http'):
+            logging.info("Attempting to parse hotwords as str...")
+            self.hotword_list = []
+            hotword_str_list = []
+            for hw in hotword_list_or_file.strip().split():
+                hotword_str_list.append(hw)
+                self.hotword_list.append(self.converter.tokens2ids([i for i in hw]))
+            self.hotword_list.append([self.asr_model.sos])
+            hotword_str_list.append('<s>')
+            logging.info("Hotword list: {}.".format(hotword_str_list))
+        # Case 3: path to an existing local text file of hotwords
         elif os.path.exists(hotword_list_or_file):
+            logging.info("Attempting to parse hotwords from local txt...")
             self.hotword_list = []
             hotword_str_list = []
             with codecs.open(hotword_list_or_file, 'r') as fin:
@@ -186,20 +202,31 @@
                     hw = line.strip()
                     hotword_str_list.append(hw)
                     self.hotword_list.append(self.converter.tokens2ids([i for i in hw]))
-                self.hotword_list.append([1])
+                self.hotword_list.append([self.asr_model.sos])
                 hotword_str_list.append('<s>')
             logging.info("Initialized hotword list from file: {}, hotword list: {}."
                 .format(hotword_list_or_file, hotword_str_list))
+        # Case 4: URL -- download it to a temporary text file, then read one hotword per line
         else:
-            logging.info("Attempting to parse hotwords as str...")
+            logging.info("Attempting to parse hotwords from url...")
+            work_dir = tempfile.TemporaryDirectory().name
+            if not os.path.exists(work_dir):
+                os.makedirs(work_dir)
+            text_file_path = os.path.join(work_dir, os.path.basename(hotword_list_or_file))
+            local_file = requests.get(hotword_list_or_file)
+            open(text_file_path, "wb").write(local_file.content)
+            hotword_list_or_file = text_file_path
             self.hotword_list = []
             hotword_str_list = []
-            for hw in hotword_list_or_file.strip().split():
-                hotword_str_list.append(hw)
-                self.hotword_list.append(self.converter.tokens2ids([i for i in hw]))
-            self.hotword_list.append([1])
-            hotword_str_list.append('<s>')
-            logging.info("Hotword list: {}.".format(hotword_str_list))
+            with codecs.open(hotword_list_or_file, 'r') as fin:
+                for line in fin.readlines():
+                    hw = line.strip()
+                    hotword_str_list.append(hw)
+                    self.hotword_list.append(self.converter.tokens2ids([i for i in hw]))
+                self.hotword_list.append([self.asr_model.sos])
+                hotword_str_list.append('<s>')
+            logging.info("Initialized hotword list from file: {}, hotword list: {}."
+                .format(hotword_list_or_file, hotword_str_list))
 
 
         is_use_lm = lm_weight != 0.0 and lm_file is not None
@@ -428,7 +455,11 @@
         format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
     )
 
-    hotword_list_or_file = param_dict['hotword']
+    if param_dict is not None:
+        hotword_list_or_file = param_dict.get('hotword')
+    else:
+        hotword_list_or_file = None
+
     if ngpu >= 1 and torch.cuda.is_available():
         device = "cuda"
     else:
@@ -539,7 +570,7 @@
                         ibest_writer["rtf"][key] = rtf_cur
 
                     if text is not None:
-                        text_postprocessed = postprocess_utils.sentence_postprocess(token)
+                        text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
                         item = {'key': key, 'value': text_postprocessed}
                         asr_result_list.append(item)
                         finish_count += 1

--
Gitblit v1.9.1