python/FunASR-XL.git

			@@ -1,3 +1,7 @@
			"""
			Author: Speech Lab, Alibaba Group, China
			"""

			import logging
			from contextlib import contextmanager
			from distutils.version import LooseVersion
			@@ -8,7 +12,6 @@
			from typing import Union

			import torch
			from typeguard import check_argument_types

			from funasr.layers.abs_normalize import AbsNormalize
			from funasr.losses.label_smoothing_loss import (
			@@ -25,7 +28,7 @@
			from funasr.modules.e2e_asr_common import ErrorCalculator
			from funasr.modules.nets_utils import th_accuracy
			from funasr.torch_utils.device_funcs import force_gatherable
			from funasr.train.abs_espnet_model import AbsESPnetModel
			from funasr.models.base_model import FunASRModel

			if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
			from torch.cuda.amp import autocast
			@@ -36,7 +39,7 @@
			yield


			class ESPnetSVModel(AbsESPnetModel):
			class ESPnetSVModel(FunASRModel):
			"""CTC-attention hybrid Encoder-Decoder model"""

			def __init__(
			@@ -52,7 +55,6 @@
			pooling_layer: torch.nn.Module,
			decoder: AbsDecoder,
			):
			assert check_argument_types()

			super().__init__()
			# note that eos is the same as sos (equivalent ID)
			@@ -76,7 +78,6 @@
			text_lengths: torch.Tensor,
			) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
			"""Frontend + Encoder + Decoder + Calc loss

			Args:
			speech: (Batch, Length, ...)
			speech_lengths: (Batch, )
			@@ -217,7 +218,6 @@
			self, speech: torch.Tensor, speech_lengths: torch.Tensor
			) -> Tuple[torch.Tensor, torch.Tensor]:
			"""Frontend + Encoder. Note that this method is used by asr_inference.py

			Args:
			speech: (Batch, Length, ...)
			speech_lengths: (Batch, )
			@@ -267,4 +267,4 @@
			else:
			# No frontend and no feature extract
			feats, feats_lengths = speech, speech_lengths
			return feats, feats_lengths
			return feats, feats_lengths