From adcee8828ef5d78b575043954deb662a35e318f7 Mon Sep 17 00:00:00 2001
From: huangmingming <huangmingming@deepscience.cn>
Date: Mon, 30 Jan 2023 16:02:54 +0800
Subject: [PATCH] update the minimum size of audio
---
funasr/bin/asr_inference_uniasr.py | 397 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 337 insertions(+), 60 deletions(-)
diff --git a/funasr/bin/asr_inference_uniasr.py b/funasr/bin/asr_inference_uniasr.py
old mode 100755
new mode 100644
index 796c5b3..d386ff1
--- a/funasr/bin/asr_inference_uniasr.py
+++ b/funasr/bin/asr_inference_uniasr.py
@@ -8,6 +8,8 @@
from typing import Sequence
from typing import Tuple
from typing import Union
+from typing import Dict
+from typing import Any
import numpy as np
import torch
@@ -31,6 +33,12 @@
from funasr.utils.types import str2bool
from funasr.utils.types import str2triple_str
from funasr.utils.types import str_or_none
+from funasr.utils import asr_utils, wav_utils, postprocess_utils
+from funasr.models.frontend.wav_frontend import WavFrontend
+
+
+header_colors = '\033[95m'
+end_colors = '\033[0m'
class Speech2Text:
@@ -49,6 +57,7 @@
self,
asr_train_config: Union[Path, str] = None,
asr_model_file: Union[Path, str] = None,
+ cmvn_file: Union[Path, str] = None,
lm_train_config: Union[Path, str] = None,
lm_file: Union[Path, str] = None,
token_type: str = None,
@@ -66,6 +75,7 @@
token_num_relax: int = 1,
decoding_ind: int = 0,
decoding_mode: str = "model1",
+ frontend_conf: dict = None,
**kwargs,
):
assert check_argument_types()
@@ -73,19 +83,27 @@
# 1. Build ASR model
scorers = {}
asr_model, asr_train_args = ASRTask.build_model_from_file(
- asr_train_config, asr_model_file, device
+ asr_train_config, asr_model_file, cmvn_file, device
)
+ frontend = None
+ if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
+ frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
+
+ logging.info("asr_train_args: {}".format(asr_train_args))
asr_model.to(dtype=getattr(torch, dtype)).eval()
if decoding_mode == "model1":
decoder = asr_model.decoder
else:
decoder = asr_model.decoder2
- ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+ if asr_model.ctc != None:
+ ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+ scorers.update(
+ ctc=ctc
+ )
token_list = asr_model.token_list
scorers.update(
decoder=decoder,
- ctc=ctc,
length_bonus=LengthBonus(len(token_list)),
)
@@ -127,7 +145,7 @@
for scorer in scorers.values():
if isinstance(scorer, torch.nn.Module):
scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
- logging.info(f"Beam_search: {beam_search}")
+ # logging.info(f"Beam_search: {beam_search}")
logging.info(f"Decoding device={device}, dtype={dtype}")
# 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
@@ -162,10 +180,11 @@
self.token_num_relax = token_num_relax
self.decoding_ind = decoding_ind
self.decoding_mode = decoding_mode
+ self.frontend = frontend
@torch.no_grad()
def __call__(
- self, speech: Union[torch.Tensor, np.ndarray]
+ self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None
) -> List[
Tuple[
Optional[str],
@@ -177,7 +196,7 @@
"""Inference
Args:
- data: Input speech data
+ speech: Input speech data
Returns:
text, token, token_int, hyp
@@ -188,24 +207,29 @@
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
- # data: (Nsamples,) -> (1, Nsamples)
- speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
- # lengths: (1,)
- lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
- batch = {"speech": speech, "speech_lengths": lengths}
+ if self.frontend is not None:
+ feats, feats_len = self.frontend.forward(speech, speech_lengths)
+ feats = to_device(feats, device=self.device)
+ feats_len = feats_len.int()
+ self.asr_model.frontend = None
+ else:
+ feats = speech
+ feats_len = speech_lengths
+ lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
+ feats_raw = feats.clone().to(self.device)
+ batch = {"speech": feats, "speech_lengths": feats_len}
# a. To device
batch = to_device(batch, device=self.device)
# b. Forward Encoder
- speech_raw = speech.clone().to(self.device)
- enc, enc_len = self.asr_model.encode(**batch, ind=self.decoding_ind)
+ _, enc, enc_len = self.asr_model.encode(**batch, ind=self.decoding_ind)
if isinstance(enc, tuple):
enc = enc[0]
assert len(enc) == 1, len(enc)
if self.decoding_mode == "model1":
predictor_outs = self.asr_model.calc_predictor_mask(enc, enc_len)
else:
- enc, enc_len = self.asr_model.encode2(enc, enc_len, speech_raw, lengths, ind=self.decoding_ind)
+ enc, enc_len = self.asr_model.encode2(enc, enc_len, feats_raw, feats_len, ind=self.decoding_ind)
predictor_outs = self.asr_model.calc_predictor_mask2(enc, enc_len)
scama_mask = predictor_outs[4]
@@ -248,34 +272,252 @@
return results
+# def inference(
+# maxlenratio: float,
+# minlenratio: float,
+# batch_size: int,
+# beam_size: int,
+# ngpu: int,
+# ctc_weight: float,
+# lm_weight: float,
+# penalty: float,
+# log_level: Union[int, str],
+# data_path_and_name_and_type,
+# asr_train_config: Optional[str],
+# asr_model_file: Optional[str],
+# ngram_file: Optional[str] = None,
+# cmvn_file: Optional[str] = None,
+# raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+# lm_train_config: Optional[str] = None,
+# lm_file: Optional[str] = None,
+# token_type: Optional[str] = None,
+# key_file: Optional[str] = None,
+# word_lm_train_config: Optional[str] = None,
+# bpemodel: Optional[str] = None,
+# allow_variable_data_keys: bool = False,
+# streaming: bool = False,
+# output_dir: Optional[str] = None,
+# dtype: str = "float32",
+# seed: int = 0,
+# ngram_weight: float = 0.9,
+# nbest: int = 1,
+# num_workers: int = 1,
+# token_num_relax: int = 1,
+# decoding_ind: int = 0,
+# decoding_mode: str = "model1",
+# **kwargs,
+# ):
+# assert check_argument_types()
+# if batch_size > 1:
+# raise NotImplementedError("batch decoding is not implemented")
+# if word_lm_train_config is not None:
+# raise NotImplementedError("Word LM is not implemented")
+# if ngpu > 1:
+# raise NotImplementedError("only single GPU decoding is supported")
+#
+# logging.basicConfig(
+# level=log_level,
+# format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+# )
+#
+# if ngpu >= 1 and torch.cuda.is_available():
+# device = "cuda"
+# else:
+# device = "cpu"
+#
+# # 1. Set random-seed
+# set_all_random_seed(seed)
+#
+# # 2. Build speech2text
+# speech2text_kwargs = dict(
+# asr_train_config=asr_train_config,
+# asr_model_file=asr_model_file,
+# cmvn_file=cmvn_file,
+# lm_train_config=lm_train_config,
+# lm_file=lm_file,
+# ngram_file=ngram_file,
+# token_type=token_type,
+# bpemodel=bpemodel,
+# device=device,
+# maxlenratio=maxlenratio,
+# minlenratio=minlenratio,
+# dtype=dtype,
+# beam_size=beam_size,
+# ctc_weight=ctc_weight,
+# lm_weight=lm_weight,
+# ngram_weight=ngram_weight,
+# penalty=penalty,
+# nbest=nbest,
+# streaming=streaming,
+# token_num_relax=token_num_relax,
+# decoding_ind=decoding_ind,
+# decoding_mode=decoding_mode,
+# )
+# speech2text = Speech2Text(**speech2text_kwargs)
+#
+# # 3. Build data-iterator
+# loader = ASRTask.build_streaming_iterator(
+# data_path_and_name_and_type,
+# dtype=dtype,
+# batch_size=batch_size,
+# key_file=key_file,
+# num_workers=num_workers,
+# preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
+# collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
+# allow_variable_data_keys=allow_variable_data_keys,
+# inference=True,
+# )
+#
+# finish_count = 0
+# file_count = 1
+# # 7 .Start for-loop
+# # FIXME(kamo): The output format should be discussed about
+# asr_result_list = []
+# if output_dir is not None:
+# writer = DatadirWriter(output_dir)
+# else:
+# writer = None
+#
+# for keys, batch in loader:
+# assert isinstance(batch, dict), type(batch)
+# assert all(isinstance(s, str) for s in keys), keys
+# _bs = len(next(iter(batch.values())))
+# assert len(keys) == _bs, f"{len(keys)} != {_bs}"
+# #batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
+#
+# # N-best list of (text, token, token_int, hyp_object)
+# try:
+# results = speech2text(**batch)
+# except TooShortUttError as e:
+# logging.warning(f"Utterance {keys} {e}")
+# hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
+# results = [[" ", ["<space>"], [2], hyp]] * nbest
+#
+# # Only supporting batch_size==1
+# key = keys[0]
+# logging.info(f"Utterance: {key}")
+# for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results):
+# # Create a directory: outdir/{n}best_recog
+# if writer is not None:
+# ibest_writer = writer[f"{n}best_recog"]
+#
+# # Write the result to each file
+# ibest_writer["token"][key] = " ".join(token)
+# ibest_writer["token_int"][key] = " ".join(map(str, token_int))
+# ibest_writer["score"][key] = str(hyp.score)
+#
+# if text is not None:
+# text_postprocessed = postprocess_utils.sentence_postprocess(token)
+# item = {'key': key, 'value': text_postprocessed}
+# asr_result_list.append(item)
+# finish_count += 1
+# asr_utils.print_progress(finish_count / file_count)
+# if writer is not None:
+# ibest_writer["text"][key] = text
+# return asr_result_list
+
def inference(
- output_dir: str,
maxlenratio: float,
minlenratio: float,
batch_size: int,
- dtype: str,
beam_size: int,
ngpu: int,
- seed: int,
ctc_weight: float,
lm_weight: float,
- ngram_weight: float,
penalty: float,
- nbest: int,
- num_workers: int,
log_level: Union[int, str],
- data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
- key_file: Optional[str],
+ data_path_and_name_and_type,
asr_train_config: Optional[str],
asr_model_file: Optional[str],
- lm_train_config: Optional[str],
- lm_file: Optional[str],
- word_lm_train_config: Optional[str],
- ngram_file: Optional[str],
- token_type: Optional[str],
- bpemodel: Optional[str],
- allow_variable_data_keys: bool,
- streaming: bool,
+ ngram_file: Optional[str] = None,
+ cmvn_file: Optional[str] = None,
+ raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+ lm_train_config: Optional[str] = None,
+ lm_file: Optional[str] = None,
+ token_type: Optional[str] = None,
+ key_file: Optional[str] = None,
+ word_lm_train_config: Optional[str] = None,
+ bpemodel: Optional[str] = None,
+ allow_variable_data_keys: bool = False,
+ streaming: bool = False,
+ output_dir: Optional[str] = None,
+ dtype: str = "float32",
+ seed: int = 0,
+ ngram_weight: float = 0.9,
+ nbest: int = 1,
+ num_workers: int = 1,
+ token_num_relax: int = 1,
+ decoding_ind: int = 0,
+ decoding_mode: str = "model1",
+ **kwargs,
+):
+ inference_pipeline = inference_modelscope(
+ maxlenratio=maxlenratio,
+ minlenratio=minlenratio,
+ batch_size=batch_size,
+ beam_size=beam_size,
+ ngpu=ngpu,
+ ctc_weight=ctc_weight,
+ lm_weight=lm_weight,
+ penalty=penalty,
+ log_level=log_level,
+ asr_train_config=asr_train_config,
+ asr_model_file=asr_model_file,
+ cmvn_file=cmvn_file,
+ raw_inputs=raw_inputs,
+ lm_train_config=lm_train_config,
+ lm_file=lm_file,
+ token_type=token_type,
+ key_file=key_file,
+ word_lm_train_config=word_lm_train_config,
+ bpemodel=bpemodel,
+ allow_variable_data_keys=allow_variable_data_keys,
+ streaming=streaming,
+ output_dir=output_dir,
+ dtype=dtype,
+ seed=seed,
+ ngram_weight=ngram_weight,
+ ngram_file=ngram_file,
+ nbest=nbest,
+ num_workers=num_workers,
+ token_num_relax=token_num_relax,
+ decoding_ind=decoding_ind,
+ decoding_mode=decoding_mode,
+ **kwargs,
+ )
+ return inference_pipeline(data_path_and_name_and_type, raw_inputs)
+
+
+def inference_modelscope(
+ maxlenratio: float,
+ minlenratio: float,
+ batch_size: int,
+ beam_size: int,
+ ngpu: int,
+ ctc_weight: float,
+ lm_weight: float,
+ penalty: float,
+ log_level: Union[int, str],
+ # data_path_and_name_and_type,
+ asr_train_config: Optional[str],
+ asr_model_file: Optional[str],
+ ngram_file: Optional[str] = None,
+ cmvn_file: Optional[str] = None,
+ # raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+ lm_train_config: Optional[str] = None,
+ lm_file: Optional[str] = None,
+ token_type: Optional[str] = None,
+ key_file: Optional[str] = None,
+ word_lm_train_config: Optional[str] = None,
+ bpemodel: Optional[str] = None,
+ allow_variable_data_keys: bool = False,
+ streaming: bool = False,
+ output_dir: Optional[str] = None,
+ dtype: str = "float32",
+ seed: int = 0,
+ ngram_weight: float = 0.9,
+ nbest: int = 1,
+ num_workers: int = 1,
token_num_relax: int = 1,
decoding_ind: int = 0,
decoding_mode: str = "model1",
@@ -294,7 +536,7 @@
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
- if ngpu >= 1:
+ if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
@@ -306,6 +548,7 @@
speech2text_kwargs = dict(
asr_train_config=asr_train_config,
asr_model_file=asr_model_file,
+ cmvn_file=cmvn_file,
lm_train_config=lm_train_config,
lm_file=lm_file,
ngram_file=ngram_file,
@@ -327,30 +570,46 @@
decoding_mode=decoding_mode,
)
speech2text = Speech2Text(**speech2text_kwargs)
-
- # 3. Build data-iterator
- loader = ASRTask.build_streaming_iterator(
- data_path_and_name_and_type,
- dtype=dtype,
- batch_size=batch_size,
- key_file=key_file,
- num_workers=num_workers,
- preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
- collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
- allow_variable_data_keys=allow_variable_data_keys,
- inference=True,
- )
-
- # 7 .Start for-loop
- # FIXME(kamo): The output format should be discussed about
- with DatadirWriter(output_dir) as writer:
+
+ def _forward(data_path_and_name_and_type,
+ raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+ output_dir_v2: Optional[str] = None,
+ ):
+ # 3. Build data-iterator
+ if data_path_and_name_and_type is None and raw_inputs is not None:
+ if isinstance(raw_inputs, torch.Tensor):
+ raw_inputs = raw_inputs.numpy()
+ data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+ loader = ASRTask.build_streaming_iterator(
+ data_path_and_name_and_type,
+ dtype=dtype,
+ batch_size=batch_size,
+ key_file=key_file,
+ num_workers=num_workers,
+ preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
+ collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
+ allow_variable_data_keys=allow_variable_data_keys,
+ inference=True,
+ )
+
+ finish_count = 0
+ file_count = 1
+ # 7 .Start for-loop
+ # FIXME(kamo): The output format should be discussed about
+ asr_result_list = []
+ output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
+ if output_path is not None:
+ writer = DatadirWriter(output_path)
+ else:
+ writer = None
+
for keys, batch in loader:
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
assert len(keys) == _bs, f"{len(keys)} != {_bs}"
- batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
-
+ #batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
+
# N-best list of (text, token, token_int, hyp_object)
try:
results = speech2text(**batch)
@@ -358,21 +617,32 @@
logging.warning(f"Utterance {keys} {e}")
hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
results = [[" ", ["<space>"], [2], hyp]] * nbest
-
+
# Only supporting batch_size==1
key = keys[0]
logging.info(f"Utterance: {key}")
for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results):
# Create a directory: outdir/{n}best_recog
- ibest_writer = writer[f"{n}best_recog"]
-
- # Write the result to each file
- ibest_writer["token"][key] = " ".join(token)
- ibest_writer["token_int"][key] = " ".join(map(str, token_int))
- ibest_writer["score"][key] = str(hyp.score)
-
+ if writer is not None:
+ ibest_writer = writer[f"{n}best_recog"]
+
+ # Write the result to each file
+ ibest_writer["token"][key] = " ".join(token)
+ # ibest_writer["token_int"][key] = " ".join(map(str, token_int))
+ ibest_writer["score"][key] = str(hyp.score)
+
if text is not None:
- ibest_writer["text"][key] = text
+ text_postprocessed = postprocess_utils.sentence_postprocess(token)
+ item = {'key': key, 'value': text_postprocessed}
+ asr_result_list.append(item)
+ finish_count += 1
+ asr_utils.print_progress(finish_count / file_count)
+ if writer is not None:
+ ibest_writer["text"][key] = text
+ return asr_result_list
+
+ return _forward
+
def get_parser():
@@ -416,9 +686,11 @@
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
- required=True,
+ required=False,
action="append",
)
+ group.add_argument("--raw_inputs", type=list, default=None)
+ # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
@@ -434,6 +706,11 @@
help="ASR model parameter file",
)
group.add_argument(
+ "--cmvn_file",
+ type=str,
+ help="Global cmvn file",
+ )
+ group.add_argument(
"--lm_train_config",
type=str,
help="LM training configuration",
--
Gitblit v1.9.1