| | |
| | | from typing import Union |
| | | |
| | | import torch |
| | | from typeguard import check_argument_types |
| | | |
| | | from funasr.models.e2e_asr_common import ErrorCalculator |
| | | from funasr.modules.nets_utils import th_accuracy |
| | |
| | | from funasr.models.specaug.abs_specaug import AbsSpecAug |
| | | from funasr.layers.abs_normalize import AbsNormalize |
| | | from funasr.torch_utils.device_funcs import force_gatherable |
| | | from funasr.train.abs_espnet_model import AbsESPnetModel |
| | | from funasr.models.base_model import FunASRModel |
| | | from funasr.modules.streaming_utils.chunk_utilis import sequence_mask |
| | | from funasr.models.predictor.cif import mae_loss |
| | | |
| | |
| | | yield |
| | | |
| | | |
| | | class UniASR(AbsESPnetModel): |
| | | class UniASR(FunASRModel): |
| | | """ |
| | | Author: Speech Lab, Alibaba Group, China |
| | | Author: Speech Lab of DAMO Academy, Alibaba Group |
| | | """ |
| | | |
| | | def __init__( |
| | |
| | | frontend: Optional[AbsFrontend], |
| | | specaug: Optional[AbsSpecAug], |
| | | normalize: Optional[AbsNormalize], |
| | | preencoder: Optional[AbsPreEncoder], |
| | | encoder: AbsEncoder, |
| | | postencoder: Optional[AbsPostEncoder], |
| | | decoder: AbsDecoder, |
| | | ctc: CTC, |
| | | ctc_weight: float = 0.5, |
| | |
| | | loss_weight_model1: float = 0.5, |
| | | enable_maas_finetune: bool = False, |
| | | freeze_encoder2: bool = False, |
| | | preencoder: Optional[AbsPreEncoder] = None, |
| | | postencoder: Optional[AbsPostEncoder] = None, |
| | | encoder1_encoder2_joint_training: bool = True, |
| | | ): |
| | | assert check_argument_types() |
| | | assert 0.0 <= ctc_weight <= 1.0, ctc_weight |
| | | assert 0.0 <= interctc_weight < 1.0, interctc_weight |
| | | |
| | |
| | | decoding_ind: int = None, |
| | | ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]: |
| | | """Frontend + Encoder + Decoder + Calc loss |
| | | |
| | | Args: |
| | | speech: (Batch, Length, ...) |
| | | speech_lengths: (Batch, ) |
| | |
| | | with torch.no_grad(): |
| | | speech_raw, encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, ind=ind) |
| | | else: |
| | | speech_raw, encoder_out_lens = self.encode(speech, speech_lengths, ind=ind) |
| | | speech_raw, encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, ind=ind) |
| | | |
| | | intermediate_outs = None |
| | | if isinstance(encoder_out, tuple): |
| | |
| | | self, speech: torch.Tensor, speech_lengths: torch.Tensor, ind: int = 0, |
| | | ) -> Tuple[torch.Tensor, torch.Tensor]: |
| | | """Frontend + Encoder. Note that this method is used by asr_inference.py |
| | | |
| | | Args: |
| | | speech: (Batch, Length, ...) |
| | | speech_lengths: (Batch, ) |
| | |
| | | ind: int = 0, |
| | | ) -> Tuple[torch.Tensor, torch.Tensor]: |
| | | """Frontend + Encoder. Note that this method is used by asr_inference.py |
| | | |
| | | Args: |
| | | speech: (Batch, Length, ...) |
| | | speech_lengths: (Batch, ) |
| | |
| | | ys_pad_lens: torch.Tensor, |
| | | ) -> torch.Tensor: |
| | | """Compute negative log likelihood(nll) from transformer-decoder |
| | | |
| | | Normally, this function is called in batchify_nll. |
| | | |
| | | Args: |
| | | encoder_out: (Batch, Length, Dim) |
| | | encoder_out_lens: (Batch,) |
| | |
| | | batch_size: int = 100, |
| | | ): |
| | | """Compute negative log likelihood(nll) from transformer-decoder |
| | | |
| | | To avoid OOM, this fuction seperate the input into batches. |
| | | Then call nll for each batch and combine and return results. |
| | | Args: |
| | |
| | | ys_hat = self.ctc2.argmax(encoder_out).data |
| | | cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True) |
| | | return loss_ctc, cer_ctc |
| | | |