From 9b4e9cc8a0311e5243d69b73ed073e7ea441982e Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Wed, 27 Mar 2024 16:05:29 +0800
Subject: [PATCH] train update
---
funasr/models/paraformer_streaming/model.py | 31 ++++++++++++++++++-------------
1 file changed, 18 insertions(+), 13 deletions(-)
diff --git a/funasr/models/paraformer_streaming/model.py b/funasr/models/paraformer_streaming/model.py
index 9bf5d39..499b487 100644
--- a/funasr/models/paraformer_streaming/model.py
+++ b/funasr/models/paraformer_streaming/model.py
@@ -235,8 +235,7 @@
decoder_out_1st = None
pre_loss_att = None
if self.sampling_ratio > 0.0:
- if self.step_cur < 2:
- logging.info("enable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
+
if self.use_1st_decoder_loss:
sematic_embeds, decoder_out_1st, pre_loss_att = \
self.sampler_with_grad(encoder_out, encoder_out_lens, ys_pad,
@@ -246,8 +245,6 @@
self.sampler(encoder_out, encoder_out_lens, ys_pad,
ys_pad_lens, pre_acoustic_embeds, scama_mask)
else:
- if self.step_cur < 2:
- logging.info("disable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
sematic_embeds = pre_acoustic_embeds
# 1. Forward decoder
@@ -534,10 +531,14 @@
for i in range(n):
kwargs["is_final"] = _is_final and i == n -1
audio_sample_i = audio_sample[i*chunk_stride_samples:(i+1)*chunk_stride_samples]
-
- # extract fbank feats
- speech, speech_lengths = extract_fbank([audio_sample_i], data_type=kwargs.get("data_type", "sound"),
- frontend=frontend, cache=cache["frontend"], is_final=kwargs["is_final"])
+ if kwargs["is_final"] and len(audio_sample_i) < 960:
+ cache["encoder"]["tail_chunk"] = True
+ speech = cache["encoder"]["feats"]
+ speech_lengths = torch.tensor([speech.shape[1]], dtype=torch.int64).to(speech.device)
+ else:
+ # extract fbank feats
+ speech, speech_lengths = extract_fbank([audio_sample_i], data_type=kwargs.get("data_type", "sound"),
+ frontend=frontend, cache=cache["frontend"], is_final=kwargs["is_final"])
time3 = time.perf_counter()
meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
meta_data["batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000
@@ -556,11 +557,15 @@
self.init_cache(cache, **kwargs)
if kwargs.get("output_dir"):
- writer = DatadirWriter(kwargs.get("output_dir"))
- ibest_writer = writer[f"{1}best_recog"]
+ if not hasattr(self, "writer"):
+ self.writer = DatadirWriter(kwargs.get("output_dir"))
+ ibest_writer = self.writer[f"{1}best_recog"]
ibest_writer["token"][key[0]] = " ".join(tokens)
ibest_writer["text"][key[0]] = text_postprocessed
-
+
return result, meta_data
-
-
+
+ def export(self, **kwargs):
+ from .export_meta import export_rebuild_model
+ models = export_rebuild_model(model=self, **kwargs)
+ return models
\ No newline at end of file
--
Gitblit v1.9.1