From 4e2fe544ae37174a3e09dfcdbbdae5abfe711e53 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Wed, 05 Jul 2023 16:57:21 +0800
Subject: [PATCH] funasr sdk

---
 funasr/modules/embedding.py |   39 +++++++++++++++++++++++++--------------
 1 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/funasr/modules/embedding.py b/funasr/modules/embedding.py
index e0070de..374eba4 100644
--- a/funasr/modules/embedding.py
+++ b/funasr/modules/embedding.py
@@ -393,8 +393,9 @@
     def encode(self, positions: torch.Tensor = None, depth: int = None, dtype: torch.dtype = torch.float32):
         batch_size = positions.size(0)
         positions = positions.type(dtype)
-        log_timescale_increment = torch.log(torch.tensor([10000], dtype=dtype)) / (depth / 2 - 1)
-        inv_timescales = torch.exp(torch.arange(depth / 2).type(dtype) * (-log_timescale_increment))
+        device = positions.device
+        log_timescale_increment = torch.log(torch.tensor([10000], dtype=dtype, device=device)) / (depth / 2 - 1)
+        inv_timescales = torch.exp(torch.arange(depth / 2, device=device).type(dtype) * (-log_timescale_increment))
         inv_timescales = torch.reshape(inv_timescales, [batch_size, -1])
         scaled_time = torch.reshape(positions, [1, -1, 1]) * torch.reshape(inv_timescales, [1, 1, -1])
         encoding = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=2)
@@ -402,27 +403,37 @@
 
     def forward(self, x):
         batch_size, timesteps, input_dim = x.size()
-        positions = torch.arange(1, timesteps+1)[None, :]
+        positions = torch.arange(1, timesteps+1, device=x.device)[None, :]  # 1-based positions, created on x's device
         position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
 
         return x + position_encoding
 
-    def forward_chunk(self, x, cache=None):
-        start_idx = 0
-        pad_left = 0
-        pad_right = 0
+class StreamSinusoidalPositionEncoder(torch.nn.Module):
+    '''
+    Sinusoidal position encoder for chunk-wise input: forward() offsets positions by cache["start_idx"] and advances it by the chunk length.
+    '''
+    def __init__(self, d_model=80, dropout_rate=0.1):
+        super().__init__()  # d_model and dropout_rate are accepted but not used by this class
+
+    def encode(self, positions: torch.Tensor = None, depth: int = None, dtype: torch.dtype = torch.float32):
+        batch_size = positions.size(0)
+        positions = positions.type(dtype)
+        log_timescale_increment = torch.log(torch.tensor([10000], dtype=dtype, device=positions.device)) / (depth / 2 - 1)
+        inv_timescales = torch.exp(torch.arange(depth / 2, device=positions.device).type(dtype) * (-log_timescale_increment))
+        inv_timescales = torch.reshape(inv_timescales, [batch_size, -1])
+        scaled_time = torch.reshape(positions, [1, -1, 1]) * torch.reshape(inv_timescales, [1, 1, -1])
+        encoding = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=2)
+        return encoding.type(dtype)
+
+    def forward(self, x, cache=None):
         batch_size, timesteps, input_dim = x.size()
+        start_idx = 0
         if cache is not None:
             start_idx = cache["start_idx"]
-            pad_left = cache["left"]
-            pad_right = cache["right"]
+            cache["start_idx"] += timesteps  # next chunk continues where this one ends
-        positions = torch.arange(1, timesteps+start_idx+1)[None, :]
+        positions = torch.arange(1, timesteps+start_idx+1, device=x.device)[None, :]
         position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
-        outputs = x + position_encoding[:, start_idx: start_idx + timesteps]
-        outputs = outputs.transpose(1,2)
-        outputs = F.pad(outputs, (pad_left, pad_right))
-        outputs = outputs.transpose(1,2)
-        return outputs
+        return x + position_encoding[:, start_idx: start_idx + timesteps]
 
 class StreamingRelPositionalEncoding(torch.nn.Module):
     """Relative positional encoding.

--
Gitblit v1.9.1