From 9ee67cfbea90a89e44aaa9dfef7290548b1bbddd Mon Sep 17 00:00:00 2001
From: haoneng.lhn <haoneng.lhn@alibaba-inc.com>
Date: Wed, 31 May 2023 17:43:26 +0800
Subject: [PATCH] add paraformer online infer and finetune recipe
---
funasr/bin/asr_infer.py | 18 ++++++++++++++----
1 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/funasr/bin/asr_infer.py b/funasr/bin/asr_infer.py
index acb5fd8..f9d6bf7 100644
--- a/funasr/bin/asr_infer.py
+++ b/funasr/bin/asr_infer.py
@@ -9,6 +9,7 @@
import time
import copy
import os
+import re
import codecs
import tempfile
import requests
@@ -304,6 +305,7 @@
nbest: int = 1,
frontend_conf: dict = None,
hotword_list_or_file: str = None,
+ decoding_ind: int = 0,
**kwargs,
):
assert check_argument_types()
@@ -414,6 +416,7 @@
self.nbest = nbest
self.frontend = frontend
self.encoder_downsampling_factor = 1
+ self.decoding_ind = decoding_ind
if asr_train_args.encoder == "data2vec_encoder" or asr_train_args.encoder_conf["input_layer"] == "conv2d":
self.encoder_downsampling_factor = 4
@@ -451,7 +454,7 @@
batch = to_device(batch, device=self.device)
# b. Forward Encoder
- enc, enc_len = self.asr_model.encode(**batch)
+ enc, enc_len = self.asr_model.encode(**batch, ind=self.decoding_ind)
if isinstance(enc, tuple):
enc = enc[0]
# assert len(enc) == 1, len(enc)
@@ -828,9 +831,16 @@
# Change integer-ids to tokens
token = self.converter.ids2tokens(token_int)
- token = " ".join(token)
-
- results.append(token)
+ postprocessed_result = ""
+ for item in token:
+ if item.endswith('@@'):
+ postprocessed_result += item[:-2]
+ elif re.match('^[a-zA-Z]+$', item):
+ postprocessed_result += item + " "
+ else:
+ postprocessed_result += item
+
+ results.append(postprocessed_result)
# assert check_return_type(results)
return results
--
Gitblit v1.9.1