From b28f3c9da94ae72a3a0b7bb5982b587be7cf4cd6 Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Thu, 18 Jan 2024 22:00:58 +0800
Subject: [PATCH] fsmn-vad bugfix (#1270)
---
funasr/models/uniasr/model.py | 118 ++++++++++++++++++++++++++---------------------------------
 1 file changed, 52 insertions(+), 66 deletions(-)
diff --git a/funasr/models/uniasr/e2e_uni_asr.py b/funasr/models/uniasr/model.py
similarity index 95%
rename from funasr/models/uniasr/e2e_uni_asr.py
rename to funasr/models/uniasr/model.py
index 390d274..de80d4a 100644
--- a/funasr/models/uniasr/e2e_uni_asr.py
+++ b/funasr/models/uniasr/model.py
@@ -1,85 +1,73 @@
-import logging
-from contextlib import contextmanager
-from distutils.version import LooseVersion
-from typing import Dict
-from typing import List
-from typing import Optional
-from typing import Tuple
-from typing import Union
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+# MIT License (https://opensource.org/licenses/MIT)
+import time
import torch
+import logging
+from torch.cuda.amp import autocast
+from typing import Union, Dict, List, Tuple, Optional
-from funasr.models.e2e_asr_common import ErrorCalculator
+from funasr.register import tables
+from funasr.models.ctc.ctc import CTC
+from funasr.utils import postprocess_utils
from funasr.metrics.compute_acc import th_accuracy
-from funasr.models.transformer.utils.add_sos_eos import add_sos_eos
-from funasr.losses.label_smoothing_loss import (
- LabelSmoothingLoss, # noqa: H301
-)
-from funasr.models.ctc import CTC
-from funasr.models.decoder.abs_decoder import AbsDecoder
-from funasr.models.encoder.abs_encoder import AbsEncoder
-from funasr.frontends.abs_frontend import AbsFrontend
-from funasr.models.postencoder.abs_postencoder import AbsPostEncoder
-from funasr.models.preencoder.abs_preencoder import AbsPreEncoder
-from funasr.models.specaug.abs_specaug import AbsSpecAug
-from funasr.layers.abs_normalize import AbsNormalize
-from funasr.train_utils.device_funcs import force_gatherable
-from funasr.models.base_model import FunASRModel
-from funasr.models.scama.chunk_utilis import sequence_mask
+from funasr.utils.datadir_writer import DatadirWriter
+from funasr.models.paraformer.search import Hypothesis
from funasr.models.paraformer.cif_predictor import mae_loss
-
-if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
- from torch.cuda.amp import autocast
-else:
- # Nothing to do if torch<1.6.0
- @contextmanager
- def autocast(enabled=True):
- yield
+from funasr.train_utils.device_funcs import force_gatherable
+from funasr.losses.label_smoothing_loss import LabelSmoothingLoss
+from funasr.models.transformer.utils.add_sos_eos import add_sos_eos
+from funasr.models.transformer.utils.nets_utils import make_pad_mask, pad_list
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
-class UniASR(FunASRModel):
+@tables.register("model_classes", "UniASR")
+class UniASR(torch.nn.Module):
"""
Author: Speech Lab of DAMO Academy, Alibaba Group
"""
def __init__(
self,
- vocab_size: int,
- token_list: Union[Tuple[str, ...], List[str]],
- frontend: Optional[AbsFrontend],
- specaug: Optional[AbsSpecAug],
- normalize: Optional[AbsNormalize],
- encoder: AbsEncoder,
- decoder: AbsDecoder,
- ctc: CTC,
+ specaug: Optional[str] = None,
+ specaug_conf: Optional[Dict] = None,
+ normalize: str = None,
+ normalize_conf: Optional[Dict] = None,
+ encoder: str = None,
+ encoder_conf: Optional[Dict] = None,
+ decoder: str = None,
+ decoder_conf: Optional[Dict] = None,
+ ctc: str = None,
+ ctc_conf: Optional[Dict] = None,
+ predictor: str = None,
+ predictor_conf: Optional[Dict] = None,
ctc_weight: float = 0.5,
- interctc_weight: float = 0.0,
+ input_size: int = 80,
+ vocab_size: int = -1,
ignore_id: int = -1,
+ blank_id: int = 0,
+ sos: int = 1,
+ eos: int = 2,
lsm_weight: float = 0.0,
length_normalized_loss: bool = False,
- report_cer: bool = True,
- report_wer: bool = True,
- sym_space: str = "<space>",
- sym_blank: str = "<blank>",
- extract_feats_in_collect_stats: bool = True,
- predictor=None,
+ # report_cer: bool = True,
+ # report_wer: bool = True,
+ # sym_space: str = "<space>",
+ # sym_blank: str = "<blank>",
+ # extract_feats_in_collect_stats: bool = True,
+ # predictor=None,
predictor_weight: float = 0.0,
- decoder_attention_chunk_type: str = 'chunk',
- encoder2: AbsEncoder = None,
- decoder2: AbsDecoder = None,
- ctc2: CTC = None,
- ctc_weight2: float = 0.5,
- interctc_weight2: float = 0.0,
- predictor2=None,
- predictor_weight2: float = 0.0,
- decoder_attention_chunk_type2: str = 'chunk',
- stride_conv=None,
- loss_weight_model1: float = 0.5,
- enable_maas_finetune: bool = False,
- freeze_encoder2: bool = False,
- preencoder: Optional[AbsPreEncoder] = None,
- postencoder: Optional[AbsPostEncoder] = None,
+ predictor_bias: int = 0,
+ sampling_ratio: float = 0.2,
+ share_embedding: bool = False,
+ # preencoder: Optional[AbsPreEncoder] = None,
+ # postencoder: Optional[AbsPostEncoder] = None,
+ use_1st_decoder_loss: bool = False,
encoder1_encoder2_joint_training: bool = True,
+ **kwargs,
+
):
assert 0.0 <= ctc_weight <= 1.0, ctc_weight
assert 0.0 <= interctc_weight < 1.0, interctc_weight
@@ -443,10 +431,8 @@
# force_gatherable: to-device and to-tensor if scalar for DataParallel
if self.length_normalized_loss:
batch_size = int((text_lengths + 1).sum())
-<<<<<<< HEAD:funasr/models/uniasr/e2e_uni_asr.py
-=======
->>>>>>> main:funasr/models/e2e_uni_asr.py
+
loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
return loss, stats, weight
--
Gitblit v1.9.1