funasr/models/sense_voice/model.py
@@ -555,7 +555,8 @@ ilens: torch.Tensor, ): """Embed positions in tensor.""" masks = sequence_mask(ilens, device=ilens.device)[:, None, :] maxlen = xs_pad.shape[1] masks = sequence_mask(ilens, maxlen = maxlen, device=ilens.device)[:, None, :] xs_pad *= self.output_size() ** 0.5