zhifu gao
2023-04-23 d5784e3444ff891b92c681d866f1d527a25cb299
funasr/modules/embedding.py
@@ -407,7 +407,24 @@
        return x + position_encoding
    def forward_chunk(self, x, cache=None):
class StreamSinusoidalPositionEncoder(torch.nn.Module):
    """Sinusoidal position encoder for streaming (chunk-wise) inference."""

    def __init__(self, d_model=80, dropout_rate=0.1):
        super().__init__()
    def encode(self, positions: torch.Tensor = None, depth: int = None, dtype: torch.dtype = torch.float32):
        """Compute sinusoidal encodings for the given absolute positions."""
        positions = positions.type(dtype)
        # Inverse timescales form a geometric progression from 1 down to 1/10000.
        log_timescale_increment = torch.log(torch.tensor([10000], dtype=dtype)) / (depth / 2 - 1)
        inv_timescales = torch.exp(torch.arange(depth / 2).type(dtype) * (-log_timescale_increment))
        # (1, T, 1) * (1, 1, depth / 2) -> (1, T, depth / 2)
        scaled_time = torch.reshape(positions, [1, -1, 1]) * torch.reshape(inv_timescales, [1, 1, -1])
        # Concatenate the sin and cos halves along the feature axis -> (1, T, depth).
        encoding = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=2)
        return encoding.type(dtype)
    def forward(self, x, cache=None):
        start_idx = 0
        pad_left = 0
        pad_right = 0
@@ -419,8 +436,83 @@
        # Absolute positions are 1-based; start_idx offsets into the running stream.
        positions = torch.arange(1, timesteps + start_idx + 1)[None, :]
        position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
        outputs = x + position_encoding[:, start_idx: start_idx + timesteps]
        # Pad along the time axis: (B, T, D) -> (B, D, T), pad, then transpose back.
        outputs = outputs.transpose(1, 2)
        outputs = F.pad(outputs, (pad_left, pad_right))
        outputs = outputs.transpose(1, 2)
        return outputs
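# A minimal usage sketch (not from the original source; shapes are assumptions
# chosen for illustration):
#
#     encoder = StreamSinusoidalPositionEncoder()
#     x = torch.randn(2, 100, 80)   # (batch, time, feature)
#     y = encoder(x)                # same shape, with sinusoidal PE added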
class StreamingRelPositionalEncoding(torch.nn.Module):
    """Relative positional encoding.
    Args:
        size: Module size.
        max_len: Maximum input length.
        dropout_rate: Dropout rate.
    """
    def __init__(
        self, size: int, dropout_rate: float = 0.0, max_len: int = 5000
    ) -> None:
        """Construct a RelativePositionalEncoding object."""
        super().__init__()
        self.size = size
        self.pe = None
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.extend_pe(torch.tensor(0.0).expand(1, max_len))
        self._register_load_state_dict_pre_hook(_pre_hook)
    def extend_pe(self, x: torch.Tensor, left_context: int = 0) -> None:
        """Reset positional encoding.
        Args:
            x: Input sequences. (B, T, ?)
            left_context: Number of frames in left context.
        """
        time1 = x.size(1) + left_context
        if self.pe is not None:
            if self.pe.size(1) >= time1 * 2 - 1:
                if self.pe.dtype != x.dtype or self.pe.device != x.device:
                    self.pe = self.pe.to(device=x.device, dtype=x.dtype)
                return
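        # Build encodings for positive offsets [time1 - 1, ..., 0] (flipped) and
        # negative offsets [-1, ..., -(time1 - 1)], concatenated so that index
        # time1 - 1 of self.pe corresponds to relative position 0.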
        pe_positive = torch.zeros(time1, self.size)
        pe_negative = torch.zeros(time1, self.size)
        position = torch.arange(0, time1, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.size, 2, dtype=torch.float32)
            * -(math.log(10000.0) / self.size)
        )
        pe_positive[:, 0::2] = torch.sin(position * div_term)
        pe_positive[:, 1::2] = torch.cos(position * div_term)
        pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
        pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
        pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
        pe_negative = pe_negative[1:].unsqueeze(0)
        self.pe = torch.cat([pe_positive, pe_negative], dim=1).to(
            dtype=x.dtype, device=x.device
        )
    def forward(self, x: torch.Tensor, left_context: int = 0) -> torch.Tensor:
        """Compute positional encoding.
        Args:
            x: Input sequences. (B, T, ?)
            left_context: Number of frames in left context.
        Returns:
            pos_enc: Positional embedding sequences. (B, 2 * T - 1 + left_context, ?)
        """
        self.extend_pe(x, left_context=left_context)
        time1 = x.size(1) + left_context
        pos_enc = self.pe[
            :, self.pe.size(1) // 2 - time1 + 1 : self.pe.size(1) // 2 + x.size(1)
        ]
        pos_enc = self.dropout(pos_enc)
        return pos_enc
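# A minimal usage sketch (not from the original source; tensor sizes are
# assumptions chosen for illustration):
#
#     rel_pos = StreamingRelPositionalEncoding(size=256)
#     x = torch.randn(1, 20, 256)            # (B, T, size)
#     pos_emb = rel_pos(x, left_context=4)   # (1, 2 * 20 - 1 + 4, 256)
#     # pos_emb is then consumed by a relative-position multi-head attention.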