| | |
| | | LabelSmoothingLoss, # noqa: H301 |
| | | ) |
| | | from funasr.models.ctc import CTC |
| | | from funasr.models.decoder.abs_decoder import AbsDecoder |
| | | from funasr.models.encoder.abs_encoder import AbsEncoder |
| | | from funasr.models.frontend.abs_frontend import AbsFrontend |
| | | from funasr.models.decoder.abs_decoder import AbsDecoder |
| | | from funasr.models.postencoder.abs_postencoder import AbsPostEncoder |
| | | from funasr.models.preencoder.abs_preencoder import AbsPreEncoder |
| | | from funasr.models.specaug.abs_specaug import AbsSpecAug |
| | | from funasr.layers.abs_normalize import AbsNormalize |
| | | from funasr.torch_utils.device_funcs import force_gatherable |
| | | from funasr.train.abs_espnet_model import AbsESPnetModel |
| | | from funasr.models.base_model import FunASRModel |
| | | from funasr.modules.streaming_utils.chunk_utilis import sequence_mask |
| | | from funasr.models.predictor.cif import mae_loss |
| | | |
| | |
| | | yield |
| | | |
| | | |
| | | class UniASR(AbsESPnetModel): |
| | | class UniASR(FunASRModel): |
| | | """ |
| | | Author: Speech Lab, Alibaba Group, China |
| | | """ |
| | |
| | | self, |
| | | vocab_size: int, |
| | | token_list: Union[Tuple[str, ...], List[str]], |
| | | frontend: Optional[AbsFrontend], |
| | | specaug: Optional[AbsSpecAug], |
| | | normalize: Optional[AbsNormalize], |
| | | frontend: Optional[torch.nn.Module], |
| | | specaug: Optional[torch.nn.Module], |
| | | normalize: Optional[torch.nn.Module], |
| | | preencoder: Optional[AbsPreEncoder], |
| | | encoder: AbsEncoder, |
| | | encoder: torch.nn.Module, |
| | | postencoder: Optional[AbsPostEncoder], |
| | | decoder: AbsDecoder, |
| | | ctc: CTC, |
| | |
| | | |
| | | # for data-parallel |
| | | text = text[:, : text_lengths.max()] |
| | | speech = speech[:, :speech_lengths.max(), :] |
| | | speech = speech[:, :speech_lengths.max()] |
| | | |
| | | ind = self.encoder.overlap_chunk_cls.random_choice(self.training, decoding_ind) |
| | | speech_raw = speech.clone().to(speech.device) |
| | | # 1. Encoder |
| | | if self.enable_maas_finetune: |
| | | with torch.no_grad(): |
| | | encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, ind=ind) |
| | | speech_raw, encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, ind=ind) |
| | | else: |
| | | encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, ind=ind) |
| | | speech_raw, encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, ind=ind) |
| | | |
| | | intermediate_outs = None |
| | | if isinstance(encoder_out, tuple): |
| | |
| | | # 3. Normalization for feature: e.g. Global-CMVN, Utterance-CMVN |
| | | if self.normalize is not None: |
| | | feats, feats_lengths = self.normalize(feats, feats_lengths) |
| | | |
| | | speech_raw = feats.clone().to(feats.device) |
| | | # Pre-encoder, e.g. used for raw input data |
| | | if self.preencoder is not None: |
| | | feats, feats_lengths = self.preencoder(feats, feats_lengths) |
| | |
| | | if intermediate_outs is not None: |
| | | return (encoder_out, intermediate_outs), encoder_out_lens |
| | | |
| | | return encoder_out, encoder_out_lens |
| | | return speech_raw, encoder_out, encoder_out_lens |
| | | |
| | | def encode2( |
| | | self, |