From 2f9685797b0c8a420574c2a459c242f90efdf3ee Mon Sep 17 00:00:00 2001
From: aky15 <ankeyuthu@gmail.com>
Date: Wed, 24 May 2023 14:04:54 +0800
Subject: [PATCH] support resume model from pai (#544)
---
funasr/bin/asr_infer.py | 62 ++++++++++++++++++++++---------
1 file changed, 44 insertions(+), 18 deletions(-)
diff --git a/funasr/bin/asr_infer.py b/funasr/bin/asr_infer.py
index 03145f8..760fd07 100644
--- a/funasr/bin/asr_infer.py
+++ b/funasr/bin/asr_infer.py
@@ -9,6 +9,7 @@
import time
import copy
import os
+import re
import codecs
import tempfile
import requests
@@ -488,15 +489,20 @@
nbest_hyps = nbest_hyps[: self.nbest]
else:
- yseq = am_scores.argmax(dim=-1)
- score = am_scores.max(dim=-1)[0]
- score = torch.sum(score, dim=-1)
- # pad with mask tokens to ensure compatibility with sos/eos tokens
- yseq = torch.tensor(
- [self.asr_model.sos] + yseq.tolist() + [self.asr_model.eos], device=yseq.device
- )
+ if pre_token_length[i] == 0:
+ yseq = torch.tensor(
+ [self.asr_model.sos] + [self.asr_model.eos], device=yseq.device
+ )
+ score = torch.tensor(0.0, device=yseq.device)
+ else:
+ yseq = am_scores.argmax(dim=-1)
+ score = am_scores.max(dim=-1)[0]
+ score = torch.sum(score, dim=-1)
+ # pad with mask tokens to ensure compatibility with sos/eos tokens
+ yseq = torch.tensor(
+ [self.asr_model.sos] + yseq.tolist() + [self.asr_model.eos], device=yseq.device
+ )
nbest_hyps = [Hypothesis(yseq=yseq, score=score)]
-
for hyp in nbest_hyps:
assert isinstance(hyp, (Hypothesis)), type(hyp)
@@ -749,10 +755,13 @@
feats = cache_en["feats"]
feats_len = torch.tensor([feats.shape[1]])
self.asr_model.frontend = None
+ self.frontend.cache_reset()
results = self.infer(feats, feats_len, cache)
return results
else:
if self.frontend is not None:
+ if cache_en["start_idx"] == 0:
+ self.frontend.cache_reset()
feats, feats_len = self.frontend.forward(speech, speech_lengths, cache_en["is_final"])
feats = to_device(feats, device=self.device)
feats_len = feats_len.int()
@@ -820,9 +829,16 @@
# Change integer-ids to tokens
token = self.converter.ids2tokens(token_int)
- token = " ".join(token)
-
- results.append(token)
+ postprocessed_result = ""
+ for item in token:
+ if item.endswith('@@'):
+ postprocessed_result += item[:-2]
+ elif re.match('^[a-zA-Z]+$', item):
+ postprocessed_result += item + " "
+ else:
+ postprocessed_result += item
+
+ results.append(postprocessed_result)
# assert check_return_type(results)
return results
@@ -1494,8 +1510,13 @@
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
- feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
- feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
+ if self.frontend is not None:
+ speech = torch.unsqueeze(speech, axis=0)
+ speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
+ feats, feats_lengths = self.frontend(speech, speech_lengths)
+ else:
+ feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
+ feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
if self.asr_model.normalize is not None:
feats, feats_lengths = self.asr_model.normalize(feats, feats_lengths)
@@ -1520,14 +1541,19 @@
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
-
- feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
- feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
+
+ if self.frontend is not None:
+ speech = torch.unsqueeze(speech, axis=0)
+ speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
+ feats, feats_lengths = self.frontend(speech, speech_lengths)
+ else:
+ feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
+ feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
feats = to_device(feats, device=self.device)
feats_lengths = to_device(feats_lengths, device=self.device)
- enc_out, _ = self.asr_model.encoder(feats, feats_lengths)
+ enc_out, _, _ = self.asr_model.encoder(feats, feats_lengths)
nbest_hyps = self.beam_search(enc_out[0])
@@ -1581,7 +1607,7 @@
d = ModelDownloader()
kwargs.update(**d.download_and_unpack(model_tag))
- return Speech2Text(**kwargs)
+ return Speech2TextTransducer(**kwargs)
class Speech2TextSAASR:
--
Gitblit v1.9.1