python/FunASR-XL.git

parent: 27850286 | 补丁 | 提交 | ignore whitespace

Merge pull request #970 from alibaba-damo-academy/dev_lhn

hnluo

2023-09-19 8516d3e850671a35c0031b55b1884074453c331e

Merge pull request #970 from alibaba-damo-academy/dev_lhn

Dev lhn

7个文件已修改

	funasr/bin/asr_inference_launch.py	68 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/datasets/large_datasets/datapipes/batch.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/datasets/large_datasets/datapipes/filter.py	4 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/datasets/large_datasets/datapipes/map.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/decoder/sanm_decoder.py	146 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/encoder/sanm_encoder.py	118 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/modules/attention.py	67 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史

 funasr/bin/asr_inference_launch.py

@@ -842,37 +842,72 @@
            data = yaml.load(f, Loader=yaml.Loader)
        return data

    def _prepare_cache(cache: dict = {}, chunk_size=[5, 10, 5], batch_size=1):
    def _prepare_cache(cache: dict = {}, chunk_size=[5, 10, 5], encoder_chunk_look_back=0,
                       decoder_chunk_look_back=0, batch_size=1):
        if len(cache) > 0:
            return cache
        config = _read_yaml(asr_train_config)
        enc_output_size = config["encoder_conf"]["output_size"]
        feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
        cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
                    "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False,
                    "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size,
                    "encoder_chunk_look_back": encoder_chunk_look_back, "last_chunk": False, "opt": None,
                    "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False}
        cache["encoder"] = cache_en

        cache_de = {"decode_fsmn": None}
        cache_de = {"decode_fsmn": None, "decoder_chunk_look_back": decoder_chunk_look_back, "opt": None, "chunk_size": chunk_size}
        cache["decoder"] = cache_de

        return cache

    def _cache_reset(cache: dict = {}, chunk_size=[5, 10, 5], batch_size=1):
    def _cache_reset(cache: dict = {}, chunk_size=[5, 10, 5], encoder_chunk_look_back=0,
                     decoder_chunk_look_back=0, batch_size=1):
        if len(cache) > 0:
            config = _read_yaml(asr_train_config)
            enc_output_size = config["encoder_conf"]["output_size"]
            feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
            cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
                        "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False,
                        "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)),
                        "tail_chunk": False}
                        "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size,
                        "encoder_chunk_look_back": encoder_chunk_look_back, "last_chunk": False, "opt": None,
                        "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False}
            cache["encoder"] = cache_en

            cache_de = {"decode_fsmn": None}
            cache_de = {"decode_fsmn": None, "decoder_chunk_look_back": decoder_chunk_look_back, "opt": None, "chunk_size": chunk_size}
            cache["decoder"] = cache_de

        return cache

    #def _prepare_cache(cache: dict = {}, chunk_size=[5, 10, 5], batch_size=1):
    #    if len(cache) > 0:
    #        return cache
    #    config = _read_yaml(asr_train_config)
    #    enc_output_size = config["encoder_conf"]["output_size"]
    #    feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
    #    cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
    #                "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False,
    #                "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False}
    #    cache["encoder"] = cache_en

    #    cache_de = {"decode_fsmn": None}
    #    cache["decoder"] = cache_de

    #    return cache

    #def _cache_reset(cache: dict = {}, chunk_size=[5, 10, 5], batch_size=1):
    #    if len(cache) > 0:
    #        config = _read_yaml(asr_train_config)
    #        enc_output_size = config["encoder_conf"]["output_size"]
    #        feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
    #        cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
    #                    "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False,
    #                    "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)),
    #                    "tail_chunk": False}
    #        cache["encoder"] = cache_en

    #        cache_de = {"decode_fsmn": None}
    #        cache["decoder"] = cache_de

    #    return cache

    def _forward(
            data_path_and_name_and_type,
@@ -901,24 +936,34 @@
        is_final = False
        cache = {}
        chunk_size = [5, 10, 5]
        encoder_chunk_look_back = 0
        decoder_chunk_look_back = 0
        if param_dict is not None and "cache" in param_dict:
            cache = param_dict["cache"]
        if param_dict is not None and "is_final" in param_dict:
            is_final = param_dict["is_final"]
        if param_dict is not None and "chunk_size" in param_dict:
            chunk_size = param_dict["chunk_size"]
        if param_dict is not None and "encoder_chunk_look_back" in param_dict:
            encoder_chunk_look_back = param_dict["encoder_chunk_look_back"]
            if encoder_chunk_look_back > 0:
                chunk_size[0] = 0
        if param_dict is not None and "decoder_chunk_look_back" in param_dict:
            decoder_chunk_look_back = param_dict["decoder_chunk_look_back"]

        # 7 .Start for-loop
        # FIXME(kamo): The output format should be discussed about
        raw_inputs = torch.unsqueeze(raw_inputs, axis=0)
        asr_result_list = []
        cache = _prepare_cache(cache, chunk_size=chunk_size, batch_size=1)
        cache = _prepare_cache(cache, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, 
                               decoder_chunk_look_back=decoder_chunk_look_back, batch_size=1)
        item = {}
        if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "sound":
            sample_offset = 0
            speech_length = raw_inputs.shape[1]
            stride_size = chunk_size[1] * 960
            cache = _prepare_cache(cache, chunk_size=chunk_size, batch_size=1)
            cache = _prepare_cache(cache, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, 
                                   decoder_chunk_look_back=decoder_chunk_look_back, batch_size=1)
            final_result = ""
            for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)):
                if sample_offset + stride_size >= speech_length - 1:
@@ -939,7 +984,8 @@

        asr_result_list.append(item)
        if is_final:
            cache = _cache_reset(cache, chunk_size=chunk_size, batch_size=1)
            cache = _cache_reset(cache, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, 
                                 decoder_chunk_look_back=decoder_chunk_look_back, batch_size=1)
        return asr_result_list

    return _forward

 funasr/datasets/large_datasets/datapipes/batch.py

@@ -39,7 +39,7 @@
        self.batch_mode = batch_mode

    def set_epoch(self, epoch):
        self.epoch = epoch
        self.datapipe.set_epoch(epoch)

    def __iter__(self):
        buffer = []

 funasr/datasets/large_datasets/datapipes/filter.py

@@ -13,7 +13,7 @@
        self.fn = fn

    def set_epoch(self, epoch):
        self.epoch = epoch
        self.datapipe.set_epoch(epoch)

    def __iter__(self):
        assert callable(self.fn)
@@ -21,4 +21,4 @@
            if self.fn(data):
                yield data
            else:
                continue
                continue

 funasr/datasets/large_datasets/datapipes/map.py

@@ -14,7 +14,7 @@
        self.fn = fn

    def set_epoch(self, epoch):
        self.epoch = epoch
        self.datapipe.set_epoch(epoch)

    def __iter__(self):
        assert callable(self.fn)

 funasr/models/decoder/sanm_decoder.py

@@ -105,7 +105,50 @@

        return x, tgt_mask, memory, memory_mask, cache

    def forward_chunk(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):
    #def forward_chunk(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):
    #    """Compute decoded features.

    #    Args:
    #        tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
    #        tgt_mask (torch.Tensor): Mask for input tensor (#batch, maxlen_out).
    #        memory (torch.Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
    #        memory_mask (torch.Tensor): Encoded memory mask (#batch, maxlen_in).
    #        cache (List[torch.Tensor]): List of cached tensors.
    #            Each tensor shape should be (#batch, maxlen_out - 1, size).

    #    Returns:
    #        torch.Tensor: Output tensor(#batch, maxlen_out, size).
    #        torch.Tensor: Mask for output tensor (#batch, maxlen_out).
    #        torch.Tensor: Encoded memory (#batch, maxlen_in, size).
    #        torch.Tensor: Encoded memory mask (#batch, maxlen_in).

    #    """
    #    # tgt = self.dropout(tgt)
    #    residual = tgt
    #    if self.normalize_before:
    #        tgt = self.norm1(tgt)
    #    tgt = self.feed_forward(tgt)

    #    x = tgt
    #    if self.self_attn:
    #        if self.normalize_before:
    #            tgt = self.norm2(tgt)
    #        if self.training:
    #            cache = None
    #        x, cache = self.self_attn(tgt, tgt_mask, cache=cache)
    #        x = residual + self.dropout(x)

    #    if self.src_attn is not None:
    #        residual = x
    #        if self.normalize_before:
    #            x = self.norm3(x)

    #        x = residual + self.dropout(self.src_attn(x, memory, memory_mask))


    #    return x, tgt_mask, memory, memory_mask, cache

    def forward_chunk(self, tgt, memory, fsmn_cache=None, opt_cache=None, chunk_size=None, look_back=0):
        """Compute decoded features.

        Args:
@@ -123,7 +166,6 @@
            torch.Tensor: Encoded memory mask (#batch, maxlen_in).

        """
        # tgt = self.dropout(tgt)
        residual = tgt
        if self.normalize_before:
            tgt = self.norm1(tgt)
@@ -133,9 +175,7 @@
        if self.self_attn:
            if self.normalize_before:
                tgt = self.norm2(tgt)
            if self.training:
                cache = None
            x, cache = self.self_attn(tgt, tgt_mask, cache=cache)
            x, fsmn_cache = self.self_attn(tgt, None, fsmn_cache)
            x = residual + self.dropout(x)

        if self.src_attn is not None:
@@ -143,10 +183,11 @@
            if self.normalize_before:
                x = self.norm3(x)

            x = residual + self.dropout(self.src_attn(x, memory, memory_mask))
            x, opt_cache = self.src_attn.forward_chunk(x, memory, opt_cache, chunk_size, look_back)
            x = residual + x

        return x, memory, fsmn_cache, opt_cache

        return x, tgt_mask, memory, memory_mask, cache

class FsmnDecoderSCAMAOpt(BaseTransformerDecoder):
    """
@@ -992,6 +1033,65 @@
        )
        return logp.squeeze(0), state

    #def forward_chunk(
    #    self,
    #    memory: torch.Tensor,
    #    tgt: torch.Tensor,
    #    cache: dict = None,
    #) -> Tuple[torch.Tensor, torch.Tensor]:
    #    """Forward decoder.

    #    Args:
    #        hs_pad: encoded memory, float32  (batch, maxlen_in, feat)
    #        hlens: (batch)
    #        ys_in_pad:
    #            input token ids, int64 (batch, maxlen_out)
    #            if input_layer == "embed"
    #            input tensor (batch, maxlen_out, #mels) in the other cases
    #        ys_in_lens: (batch)
    #    Returns:
    #        (tuple): tuple containing:

    #        x: decoded token score before softmax (batch, maxlen_out, token)
    #            if use_output_layer is True,
    #        olens: (batch, )
    #    """
    #    x = tgt
    #    if cache["decode_fsmn"] is None:
    #        cache_layer_num = len(self.decoders)
    #        if self.decoders2 is not None:
    #            cache_layer_num += len(self.decoders2)
    #        new_cache = [None] * cache_layer_num
    #    else:
    #        new_cache = cache["decode_fsmn"]
    #    for i in range(self.att_layer_num):
    #        decoder = self.decoders[i]
    #        x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
    #            x, None, memory, None, cache=new_cache[i]
    #        )
    #        new_cache[i] = c_ret

    #    if self.num_blocks - self.att_layer_num > 1:
    #        for i in range(self.num_blocks - self.att_layer_num):
    #            j = i + self.att_layer_num
    #            decoder = self.decoders2[i]
    #            x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
    #                x, None, memory, None, cache=new_cache[j]
    #            )
    #            new_cache[j] = c_ret

    #    for decoder in self.decoders3:

    #        x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk(
    #            x, None, memory, None, cache=None
    #        )
    #    if self.normalize_before:
    #        x = self.after_norm(x)
    #    if self.output_layer is not None:
    #        x = self.output_layer(x)
    #    cache["decode_fsmn"] = new_cache
    #    return x

    def forward_chunk(
        self,
        memory: torch.Tensor,
@@ -1020,35 +1120,43 @@
            cache_layer_num = len(self.decoders)
            if self.decoders2 is not None:
                cache_layer_num += len(self.decoders2)
            new_cache = [None] * cache_layer_num
            fsmn_cache = [None] * cache_layer_num
        else:
            new_cache = cache["decode_fsmn"]
            fsmn_cache = cache["decode_fsmn"]

        if cache["opt"] is None:
            cache_layer_num = len(self.decoders)
            opt_cache = [None] * cache_layer_num
        else:
            opt_cache = cache["opt"]

        for i in range(self.att_layer_num):
            decoder = self.decoders[i]
            x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
                x, None, memory, None, cache=new_cache[i]
            x, memory, fsmn_cache[i], opt_cache[i] = decoder.forward_chunk(
                x, memory, fsmn_cache=fsmn_cache[i], opt_cache=opt_cache[i],
                chunk_size=cache["chunk_size"], look_back=cache["decoder_chunk_look_back"]
            )
            new_cache[i] = c_ret

        if self.num_blocks - self.att_layer_num > 1:
            for i in range(self.num_blocks - self.att_layer_num):
                j = i + self.att_layer_num
                decoder = self.decoders2[i]
                x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
                    x, None, memory, None, cache=new_cache[j]
                x, memory, fsmn_cache[j], _  = decoder.forward_chunk(
                    x, memory, fsmn_cache=fsmn_cache[j]
                )
                new_cache[j] = c_ret

        for decoder in self.decoders3:

            x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk(
                x, None, memory, None, cache=None
            x, memory, _, _ = decoder.forward_chunk(
                x, memory
            )
        if self.normalize_before:
            x = self.after_norm(x)
        if self.output_layer is not None:
            x = self.output_layer(x)
        cache["decode_fsmn"] = new_cache

        cache["decode_fsmn"] = fsmn_cache
        if cache["decoder_chunk_look_back"] > 0 or cache["decoder_chunk_look_back"] == -1:
            cache["opt"] = opt_cache
        return x

    def forward_one_step(

 funasr/models/encoder/sanm_encoder.py

@@ -114,8 +114,44 @@
        if not self.normalize_before:
            x = self.norm2(x)


        return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder

    def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
        """Compute encoded features.

        Args:
            x_input (torch.Tensor): Input tensor (#batch, time, size).
            mask (torch.Tensor): Mask tensor for the input (#batch, time).
            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (#batch, time).

        """

        residual = x
        if self.normalize_before:
            x = self.norm1(x)

        if self.in_size == self.size:
            attn, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back)
            x = residual + attn
        else:
            x, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back)

        if not self.normalize_before:
            x = self.norm1(x)

        residual = x
        if self.normalize_before:
            x = self.norm2(x)
        x = residual + self.feed_forward(x)
        if not self.normalize_before:
            x = self.norm2(x)

        return x, cache


class SANMEncoder(AbsEncoder):
    """
@@ -837,11 +873,56 @@
        cache["feats"] = overlap_feats[:, -(cache["chunk_size"][0] + cache["chunk_size"][2]):, :]
        return overlap_feats

    #def forward_chunk(self,
    #                  xs_pad: torch.Tensor,
    #                  ilens: torch.Tensor,
    #                  cache: dict = None,
    #                  ctc: CTC = None,
    #                  ):
    #    xs_pad *= self.output_size() ** 0.5
    #    if self.embed is None:
    #        xs_pad = xs_pad
    #    else:
    #        xs_pad = self.embed(xs_pad, cache)
    #    if cache["tail_chunk"]:
    #        xs_pad = to_device(cache["feats"], device=xs_pad.device)
    #    else:
    #        xs_pad = self._add_overlap_chunk(xs_pad, cache)
    #    encoder_outs = self.encoders0(xs_pad, None, None, None, None)
    #    xs_pad, masks = encoder_outs[0], encoder_outs[1]
    #    intermediate_outs = []
    #    if len(self.interctc_layer_idx) == 0:
    #        encoder_outs = self.encoders(xs_pad, None, None, None, None)
    #        xs_pad, masks = encoder_outs[0], encoder_outs[1]
    #    else:
    #        for layer_idx, encoder_layer in enumerate(self.encoders):
    #            encoder_outs = encoder_layer(xs_pad, None, None, None, None)
    #            xs_pad, masks = encoder_outs[0], encoder_outs[1]
    #            if layer_idx + 1 in self.interctc_layer_idx:
    #                encoder_out = xs_pad

    #                # intermediate outputs are also normalized
    #                if self.normalize_before:
    #                    encoder_out = self.after_norm(encoder_out)

    #                intermediate_outs.append((layer_idx + 1, encoder_out))

    #                if self.interctc_use_conditioning:
    #                    ctc_out = ctc.softmax(encoder_out)
    #                    xs_pad = xs_pad + self.conditioning_layer(ctc_out)

    #    if self.normalize_before:
    #        xs_pad = self.after_norm(xs_pad)

    #    if len(intermediate_outs) > 0:
    #        return (xs_pad, intermediate_outs), None, None
    #    return xs_pad, ilens, None


    def forward_chunk(self,
                      xs_pad: torch.Tensor,
                      ilens: torch.Tensor,
                      cache: dict = None,
                      ctc: CTC = None,
                      ):
        xs_pad *= self.output_size() ** 0.5
        if self.embed is None:
@@ -852,34 +933,25 @@
            xs_pad = to_device(cache["feats"], device=xs_pad.device)
        else:
            xs_pad = self._add_overlap_chunk(xs_pad, cache)
        encoder_outs = self.encoders0(xs_pad, None, None, None, None)
        xs_pad, masks = encoder_outs[0], encoder_outs[1]
        intermediate_outs = []
        if len(self.interctc_layer_idx) == 0:
            encoder_outs = self.encoders(xs_pad, None, None, None, None)
            xs_pad, masks = encoder_outs[0], encoder_outs[1]
        if cache["opt"] is None:
            cache_layer_num = len(self.encoders0) + len(self.encoders)
            new_cache = [None] * cache_layer_num
        else:
            for layer_idx, encoder_layer in enumerate(self.encoders):
                encoder_outs = encoder_layer(xs_pad, None, None, None, None)
                xs_pad, masks = encoder_outs[0], encoder_outs[1]
                if layer_idx + 1 in self.interctc_layer_idx:
                    encoder_out = xs_pad
            new_cache = cache["opt"]

                    # intermediate outputs are also normalized
                    if self.normalize_before:
                        encoder_out = self.after_norm(encoder_out)
        for layer_idx, encoder_layer in enumerate(self.encoders0):
            encoder_outs = encoder_layer.forward_chunk(xs_pad, new_cache[layer_idx], cache["chunk_size"], cache["encoder_chunk_look_back"])
            xs_pad, new_cache[0] = encoder_outs[0], encoder_outs[1]

                    intermediate_outs.append((layer_idx + 1, encoder_out))

                    if self.interctc_use_conditioning:
                        ctc_out = ctc.softmax(encoder_out)
                        xs_pad = xs_pad + self.conditioning_layer(ctc_out)
        for layer_idx, encoder_layer in enumerate(self.encoders):
            encoder_outs = encoder_layer.forward_chunk(xs_pad, new_cache[layer_idx+len(self.encoders0)], cache["chunk_size"], cache["encoder_chunk_look_back"])
            xs_pad, new_cache[layer_idx+len(self.encoders0)] = encoder_outs[0], encoder_outs[1]

        if self.normalize_before:
            xs_pad = self.after_norm(xs_pad)
        if cache["encoder_chunk_look_back"] > 0 or cache["encoder_chunk_look_back"] == -1:
            cache["opt"] = new_cache

        if len(intermediate_outs) > 0:
            return (xs_pad, intermediate_outs), None, None
        return xs_pad, ilens, None

    def gen_tf2torch_map_dict(self):

 funasr/modules/attention.py

@@ -456,6 +456,44 @@
        att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder)
        return att_outs + fsmn_memory

    def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
        """Compute scaled dot product attention.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).

        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).

        """
        q_h, k_h, v_h, v = self.forward_qkv(x)
        if chunk_size is not None and look_back > 0 or look_back == -1:
            if cache is not None:
                k_h_stride = k_h[:, :, :-(chunk_size[2]), :]
                v_h_stride = v_h[:, :, :-(chunk_size[2]), :]
                k_h = torch.cat((cache["k"], k_h), dim=2)
                v_h = torch.cat((cache["v"], v_h), dim=2)

                cache["k"] = torch.cat((cache["k"], k_h_stride), dim=2)
                cache["v"] = torch.cat((cache["v"], v_h_stride), dim=2)
                if look_back != -1:
                    cache["k"] = cache["k"][:, :, -(look_back * chunk_size[1]):, :]
                    cache["v"] = cache["v"][:, :, -(look_back * chunk_size[1]):, :]
            else:
                cache_tmp = {"k": k_h[:, :, :-(chunk_size[2]), :],
                             "v": v_h[:, :, :-(chunk_size[2]), :]}
                cache = cache_tmp
        fsmn_memory = self.forward_fsmn(v, None)
        q_h = q_h * self.d_k ** (-0.5)
        scores = torch.matmul(q_h, k_h.transpose(-2, -1))
        att_outs = self.forward_attention(v_h, scores, None)
        return att_outs + fsmn_memory, cache


class MultiHeadedAttentionSANMwithMask(MultiHeadedAttentionSANM):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
@@ -667,6 +705,35 @@
        scores = torch.matmul(q_h, k_h.transpose(-2, -1))
        return self.forward_attention(v_h, scores, memory_mask)

    def forward_chunk(self, x, memory, cache=None, chunk_size=None, look_back=0):
        """Compute scaled dot product attention.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).

        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).

        """
        q_h, k_h, v_h = self.forward_qkv(x, memory)
        if chunk_size is not None and look_back > 0:
            if cache is not None:
                k_h = torch.cat((cache["k"], k_h), dim=2)
                v_h = torch.cat((cache["v"], v_h), dim=2)
                cache["k"] = k_h[:, :, -(look_back * chunk_size[1]):, :]
                cache["v"] = v_h[:, :, -(look_back * chunk_size[1]):, :]
            else:
                cache_tmp = {"k": k_h[:, :, -(look_back * chunk_size[1]):, :],
                             "v": v_h[:, :, -(look_back * chunk_size[1]):, :]}
                cache = cache_tmp
        q_h = q_h * self.d_k ** (-0.5)
        scores = torch.matmul(q_h, k_h.transpose(-2, -1))
        return self.forward_attention(v_h, scores, None), cache


class MultiHeadSelfAttention(nn.Module):
    """Multi-Head Attention layer.

			@@ -842,37 +842,72 @@
			data = yaml.load(f, Loader=yaml.Loader)
			return data

			def _prepare_cache(cache: dict = {}, chunk_size=[5, 10, 5], batch_size=1):
			def _prepare_cache(cache: dict = {}, chunk_size=[5, 10, 5], encoder_chunk_look_back=0,
			decoder_chunk_look_back=0, batch_size=1):
			if len(cache) > 0:
			return cache
			config = _read_yaml(asr_train_config)
			enc_output_size = config["encoder_conf"]["output_size"]
			feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
			cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
			"cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False,
			"cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size,
			"encoder_chunk_look_back": encoder_chunk_look_back, "last_chunk": False, "opt": None,
			"feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False}
			cache["encoder"] = cache_en

			cache_de = {"decode_fsmn": None}
			cache_de = {"decode_fsmn": None, "decoder_chunk_look_back": decoder_chunk_look_back, "opt": None, "chunk_size": chunk_size}
			cache["decoder"] = cache_de

			return cache

			def _cache_reset(cache: dict = {}, chunk_size=[5, 10, 5], batch_size=1):
			def _cache_reset(cache: dict = {}, chunk_size=[5, 10, 5], encoder_chunk_look_back=0,
			decoder_chunk_look_back=0, batch_size=1):
			if len(cache) > 0:
			config = _read_yaml(asr_train_config)
			enc_output_size = config["encoder_conf"]["output_size"]
			feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
			cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
			"cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False,
			"feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)),
			"tail_chunk": False}
			"cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size,
			"encoder_chunk_look_back": encoder_chunk_look_back, "last_chunk": False, "opt": None,
			"feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False}
			cache["encoder"] = cache_en

			cache_de = {"decode_fsmn": None}
			cache_de = {"decode_fsmn": None, "decoder_chunk_look_back": decoder_chunk_look_back, "opt": None, "chunk_size": chunk_size}
			cache["decoder"] = cache_de

			return cache

			#def _prepare_cache(cache: dict = {}, chunk_size=[5, 10, 5], batch_size=1):
			# if len(cache) > 0:
			# return cache
			# config = _read_yaml(asr_train_config)
			# enc_output_size = config["encoder_conf"]["output_size"]
			# feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
			# cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
			# "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False,
			# "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False}
			# cache["encoder"] = cache_en

			# cache_de = {"decode_fsmn": None}
			# cache["decoder"] = cache_de

			# return cache

			#def _cache_reset(cache: dict = {}, chunk_size=[5, 10, 5], batch_size=1):
			# if len(cache) > 0:
			# config = _read_yaml(asr_train_config)
			# enc_output_size = config["encoder_conf"]["output_size"]
			# feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
			# cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
			# "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False,
			# "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)),
			# "tail_chunk": False}
			# cache["encoder"] = cache_en

			# cache_de = {"decode_fsmn": None}
			# cache["decoder"] = cache_de

			# return cache

			def _forward(
			data_path_and_name_and_type,
			@@ -901,24 +936,34 @@
			is_final = False
			cache = {}
			chunk_size = [5, 10, 5]
			encoder_chunk_look_back = 0
			decoder_chunk_look_back = 0
			if param_dict is not None and "cache" in param_dict:
			cache = param_dict["cache"]
			if param_dict is not None and "is_final" in param_dict:
			is_final = param_dict["is_final"]
			if param_dict is not None and "chunk_size" in param_dict:
			chunk_size = param_dict["chunk_size"]
			if param_dict is not None and "encoder_chunk_look_back" in param_dict:
			encoder_chunk_look_back = param_dict["encoder_chunk_look_back"]
			if encoder_chunk_look_back > 0:
			chunk_size[0] = 0
			if param_dict is not None and "decoder_chunk_look_back" in param_dict:
			decoder_chunk_look_back = param_dict["decoder_chunk_look_back"]

			# 7 .Start for-loop
			# FIXME(kamo): The output format should be discussed about
			raw_inputs = torch.unsqueeze(raw_inputs, axis=0)
			asr_result_list = []
			cache = _prepare_cache(cache, chunk_size=chunk_size, batch_size=1)
			cache = _prepare_cache(cache, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back,
			decoder_chunk_look_back=decoder_chunk_look_back, batch_size=1)
			item = {}
			if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "sound":
			sample_offset = 0
			speech_length = raw_inputs.shape[1]
			stride_size = chunk_size[1] * 960
			cache = _prepare_cache(cache, chunk_size=chunk_size, batch_size=1)
			cache = _prepare_cache(cache, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back,
			decoder_chunk_look_back=decoder_chunk_look_back, batch_size=1)
			final_result = ""
			for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)):
			if sample_offset + stride_size >= speech_length - 1:
			@@ -939,7 +984,8 @@

			asr_result_list.append(item)
			if is_final:
			cache = _cache_reset(cache, chunk_size=chunk_size, batch_size=1)
			cache = _cache_reset(cache, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back,
			decoder_chunk_look_back=decoder_chunk_look_back, batch_size=1)
			return asr_result_list

			return _forward

			@@ -39,7 +39,7 @@
			self.batch_mode = batch_mode

			def set_epoch(self, epoch):
			self.epoch = epoch
			self.datapipe.set_epoch(epoch)

			def __iter__(self):
			buffer = []

			@@ -13,7 +13,7 @@
			self.fn = fn

			def set_epoch(self, epoch):
			self.epoch = epoch
			self.datapipe.set_epoch(epoch)

			def __iter__(self):
			assert callable(self.fn)
			@@ -21,4 +21,4 @@
			if self.fn(data):
			yield data
			else:
			continue
			continue

			@@ -14,7 +14,7 @@
			self.fn = fn

			def set_epoch(self, epoch):
			self.epoch = epoch
			self.datapipe.set_epoch(epoch)

			def __iter__(self):
			assert callable(self.fn)

			@@ -105,7 +105,50 @@

			return x, tgt_mask, memory, memory_mask, cache

			def forward_chunk(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):
			#def forward_chunk(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):
			# """Compute decoded features.

			# Args:
			# tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
			# tgt_mask (torch.Tensor): Mask for input tensor (#batch, maxlen_out).
			# memory (torch.Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
			# memory_mask (torch.Tensor): Encoded memory mask (#batch, maxlen_in).
			# cache (List[torch.Tensor]): List of cached tensors.
			# Each tensor shape should be (#batch, maxlen_out - 1, size).

			# Returns:
			# torch.Tensor: Output tensor(#batch, maxlen_out, size).
			# torch.Tensor: Mask for output tensor (#batch, maxlen_out).
			# torch.Tensor: Encoded memory (#batch, maxlen_in, size).
			# torch.Tensor: Encoded memory mask (#batch, maxlen_in).

			# """
			# # tgt = self.dropout(tgt)
			# residual = tgt
			# if self.normalize_before:
			# tgt = self.norm1(tgt)
			# tgt = self.feed_forward(tgt)

			# x = tgt
			# if self.self_attn:
			# if self.normalize_before:
			# tgt = self.norm2(tgt)
			# if self.training:
			# cache = None
			# x, cache = self.self_attn(tgt, tgt_mask, cache=cache)
			# x = residual + self.dropout(x)

			# if self.src_attn is not None:
			# residual = x
			# if self.normalize_before:
			# x = self.norm3(x)

			# x = residual + self.dropout(self.src_attn(x, memory, memory_mask))


			# return x, tgt_mask, memory, memory_mask, cache

			def forward_chunk(self, tgt, memory, fsmn_cache=None, opt_cache=None, chunk_size=None, look_back=0):
			"""Compute decoded features.

			Args:
			@@ -123,7 +166,6 @@
			torch.Tensor: Encoded memory mask (#batch, maxlen_in).

			"""
			# tgt = self.dropout(tgt)
			residual = tgt
			if self.normalize_before:
			tgt = self.norm1(tgt)
			@@ -133,9 +175,7 @@
			if self.self_attn:
			if self.normalize_before:
			tgt = self.norm2(tgt)
			if self.training:
			cache = None
			x, cache = self.self_attn(tgt, tgt_mask, cache=cache)
			x, fsmn_cache = self.self_attn(tgt, None, fsmn_cache)
			x = residual + self.dropout(x)

			if self.src_attn is not None:
			@@ -143,10 +183,11 @@
			if self.normalize_before:
			x = self.norm3(x)

			x = residual + self.dropout(self.src_attn(x, memory, memory_mask))
			x, opt_cache = self.src_attn.forward_chunk(x, memory, opt_cache, chunk_size, look_back)
			x = residual + x

			return x, memory, fsmn_cache, opt_cache

			return x, tgt_mask, memory, memory_mask, cache

			class FsmnDecoderSCAMAOpt(BaseTransformerDecoder):
			"""
			@@ -992,6 +1033,65 @@
			)
			return logp.squeeze(0), state

			#def forward_chunk(
			# self,
			# memory: torch.Tensor,
			# tgt: torch.Tensor,
			# cache: dict = None,
			#) -> Tuple[torch.Tensor, torch.Tensor]:
			# """Forward decoder.

			# Args:
			# hs_pad: encoded memory, float32 (batch, maxlen_in, feat)
			# hlens: (batch)
			# ys_in_pad:
			# input token ids, int64 (batch, maxlen_out)
			# if input_layer == "embed"
			# input tensor (batch, maxlen_out, #mels) in the other cases
			# ys_in_lens: (batch)
			# Returns:
			# (tuple): tuple containing:

			# x: decoded token score before softmax (batch, maxlen_out, token)
			# if use_output_layer is True,
			# olens: (batch, )
			# """
			# x = tgt
			# if cache["decode_fsmn"] is None:
			# cache_layer_num = len(self.decoders)
			# if self.decoders2 is not None:
			# cache_layer_num += len(self.decoders2)
			# new_cache = [None] * cache_layer_num
			# else:
			# new_cache = cache["decode_fsmn"]
			# for i in range(self.att_layer_num):
			# decoder = self.decoders[i]
			# x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
			# x, None, memory, None, cache=new_cache[i]
			# )
			# new_cache[i] = c_ret

			# if self.num_blocks - self.att_layer_num > 1:
			# for i in range(self.num_blocks - self.att_layer_num):
			# j = i + self.att_layer_num
			# decoder = self.decoders2[i]
			# x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
			# x, None, memory, None, cache=new_cache[j]
			# )
			# new_cache[j] = c_ret

			# for decoder in self.decoders3:

			# x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk(
			# x, None, memory, None, cache=None
			# )
			# if self.normalize_before:
			# x = self.after_norm(x)
			# if self.output_layer is not None:
			# x = self.output_layer(x)
			# cache["decode_fsmn"] = new_cache
			# return x

			def forward_chunk(
			self,
			memory: torch.Tensor,
			@@ -1020,35 +1120,43 @@
			cache_layer_num = len(self.decoders)
			if self.decoders2 is not None:
			cache_layer_num += len(self.decoders2)
			new_cache = [None] * cache_layer_num
			fsmn_cache = [None] * cache_layer_num
			else:
			new_cache = cache["decode_fsmn"]
			fsmn_cache = cache["decode_fsmn"]

			if cache["opt"] is None:
			cache_layer_num = len(self.decoders)
			opt_cache = [None] * cache_layer_num
			else:
			opt_cache = cache["opt"]

			for i in range(self.att_layer_num):
			decoder = self.decoders[i]
			x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
			x, None, memory, None, cache=new_cache[i]
			x, memory, fsmn_cache[i], opt_cache[i] = decoder.forward_chunk(
			x, memory, fsmn_cache=fsmn_cache[i], opt_cache=opt_cache[i],
			chunk_size=cache["chunk_size"], look_back=cache["decoder_chunk_look_back"]
			)
			new_cache[i] = c_ret

			if self.num_blocks - self.att_layer_num > 1:
			for i in range(self.num_blocks - self.att_layer_num):
			j = i + self.att_layer_num
			decoder = self.decoders2[i]
			x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
			x, None, memory, None, cache=new_cache[j]
			x, memory, fsmn_cache[j], _ = decoder.forward_chunk(
			x, memory, fsmn_cache=fsmn_cache[j]
			)
			new_cache[j] = c_ret

			for decoder in self.decoders3:

			x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk(
			x, None, memory, None, cache=None
			x, memory, _, _ = decoder.forward_chunk(
			x, memory
			)
			if self.normalize_before:
			x = self.after_norm(x)
			if self.output_layer is not None:
			x = self.output_layer(x)
			cache["decode_fsmn"] = new_cache

			cache["decode_fsmn"] = fsmn_cache
			if cache["decoder_chunk_look_back"] > 0 or cache["decoder_chunk_look_back"] == -1:
			cache["opt"] = opt_cache
			return x

			def forward_one_step(

			@@ -114,8 +114,44 @@
			if not self.normalize_before:
			x = self.norm2(x)


			return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder

			def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
			"""Run one streaming chunk through this encoder layer.

			The layer applies self-attention (through ``self.self_attn.forward_chunk``)
			followed by the feed-forward module, each wrapped by ``norm1`` / ``norm2``
			either before or after the sub-module depending on ``self.normalize_before``.

			Args:
			x (torch.Tensor): Input chunk; presumably (#batch, time, size) - TODO confirm.
			cache: Attention cache handed to and returned by ``self.self_attn.forward_chunk``.
			chunk_size: Forwarded unchanged to ``self.self_attn.forward_chunk``.
			look_back (int): Forwarded unchanged to ``self.self_attn.forward_chunk``.

			Returns:
			tuple: ``(x, cache)`` - the layer output and the cache returned by the attention module.

			"""

			residual = x
			if self.normalize_before:
			x = self.norm1(x)

			# The residual connection around attention is used only when the
			# layer's input and output sizes match.
			if self.in_size == self.size:
			attn, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back)
			x = residual + attn
			else:
			x, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back)

			if not self.normalize_before:
			x = self.norm1(x)

			# Feed-forward sub-block, always with a residual connection.
			residual = x
			if self.normalize_before:
			x = self.norm2(x)
			x = residual + self.feed_forward(x)
			if not self.normalize_before:
			x = self.norm2(x)

			return x, cache


			class SANMEncoder(AbsEncoder):
			"""
			@@ -837,11 +873,56 @@
			cache["feats"] = overlap_feats[:, -(cache["chunk_size"][0] + cache["chunk_size"][2]):, :]
			return overlap_feats

			#def forward_chunk(self,
			# xs_pad: torch.Tensor,
			# ilens: torch.Tensor,
			# cache: dict = None,
			# ctc: CTC = None,
			# ):
			# xs_pad = self.output_size() * 0.5
			# if self.embed is None:
			# xs_pad = xs_pad
			# else:
			# xs_pad = self.embed(xs_pad, cache)
			# if cache["tail_chunk"]:
			# xs_pad = to_device(cache["feats"], device=xs_pad.device)
			# else:
			# xs_pad = self._add_overlap_chunk(xs_pad, cache)
			# encoder_outs = self.encoders0(xs_pad, None, None, None, None)
			# xs_pad, masks = encoder_outs[0], encoder_outs[1]
			# intermediate_outs = []
			# if len(self.interctc_layer_idx) == 0:
			# encoder_outs = self.encoders(xs_pad, None, None, None, None)
			# xs_pad, masks = encoder_outs[0], encoder_outs[1]
			# else:
			# for layer_idx, encoder_layer in enumerate(self.encoders):
			# encoder_outs = encoder_layer(xs_pad, None, None, None, None)
			# xs_pad, masks = encoder_outs[0], encoder_outs[1]
			# if layer_idx + 1 in self.interctc_layer_idx:
			# encoder_out = xs_pad

			# # intermediate outputs are also normalized
			# if self.normalize_before:
			# encoder_out = self.after_norm(encoder_out)

			# intermediate_outs.append((layer_idx + 1, encoder_out))

			# if self.interctc_use_conditioning:
			# ctc_out = ctc.softmax(encoder_out)
			# xs_pad = xs_pad + self.conditioning_layer(ctc_out)

			# if self.normalize_before:
			# xs_pad = self.after_norm(xs_pad)

			# if len(intermediate_outs) > 0:
			# return (xs_pad, intermediate_outs), None, None
			# return xs_pad, ilens, None


			def forward_chunk(self,
			xs_pad: torch.Tensor,
			ilens: torch.Tensor,
			cache: dict = None,
			ctc: CTC = None,
			):
			xs_pad = self.output_size() * 0.5
			if self.embed is None:
			@@ -852,34 +933,25 @@
			xs_pad = to_device(cache["feats"], device=xs_pad.device)
			else:
			xs_pad = self._add_overlap_chunk(xs_pad, cache)
			encoder_outs = self.encoders0(xs_pad, None, None, None, None)
			xs_pad, masks = encoder_outs[0], encoder_outs[1]
			intermediate_outs = []
			if len(self.interctc_layer_idx) == 0:
			encoder_outs = self.encoders(xs_pad, None, None, None, None)
			xs_pad, masks = encoder_outs[0], encoder_outs[1]
			if cache["opt"] is None:
			cache_layer_num = len(self.encoders0) + len(self.encoders)
			new_cache = [None] * cache_layer_num
			else:
			for layer_idx, encoder_layer in enumerate(self.encoders):
			encoder_outs = encoder_layer(xs_pad, None, None, None, None)
			xs_pad, masks = encoder_outs[0], encoder_outs[1]
			if layer_idx + 1 in self.interctc_layer_idx:
			encoder_out = xs_pad
			new_cache = cache["opt"]

			# intermediate outputs are also normalized
			if self.normalize_before:
			encoder_out = self.after_norm(encoder_out)
			for layer_idx, encoder_layer in enumerate(self.encoders0):
			encoder_outs = encoder_layer.forward_chunk(xs_pad, new_cache[layer_idx], cache["chunk_size"], cache["encoder_chunk_look_back"])
			xs_pad, new_cache[0] = encoder_outs[0], encoder_outs[1]

			intermediate_outs.append((layer_idx + 1, encoder_out))

			if self.interctc_use_conditioning:
			ctc_out = ctc.softmax(encoder_out)
			xs_pad = xs_pad + self.conditioning_layer(ctc_out)
			for layer_idx, encoder_layer in enumerate(self.encoders):
			encoder_outs = encoder_layer.forward_chunk(xs_pad, new_cache[layer_idx+len(self.encoders0)], cache["chunk_size"], cache["encoder_chunk_look_back"])
			xs_pad, new_cache[layer_idx+len(self.encoders0)] = encoder_outs[0], encoder_outs[1]

			if self.normalize_before:
			xs_pad = self.after_norm(xs_pad)
			if cache["encoder_chunk_look_back"] > 0 or cache["encoder_chunk_look_back"] == -1:
			cache["opt"] = new_cache

			if len(intermediate_outs) > 0:
			return (xs_pad, intermediate_outs), None, None
			return xs_pad, ilens, None

			def gen_tf2torch_map_dict(self):

			@@ -456,6 +456,44 @@
			att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder)
			return att_outs + fsmn_memory

			def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
			    """Apply self-attention plus the FSMN memory branch to one streaming chunk.

			    When look-back is enabled, the projected keys/values of earlier chunks
			    are prepended to those of the current chunk before attention, and the
			    cache is then extended with the current chunk.

			    Args:
			        x (torch.Tensor): Input chunk, handed as-is to ``self.forward_qkv``.
			        cache (dict, optional): Holds the ``"k"`` and ``"v"`` tensors of
			            earlier chunks; ``None`` for the first chunk. Updated in place.
			        chunk_size (sequence of int, optional): Only indices 1 and 2 are
			            read here: the last ``chunk_size[2]`` frames of the chunk are
			            left out of the cache, and at most
			            ``look_back * chunk_size[1]`` frames of history are retained.
			            Look-back is disabled when this is ``None``.
			        look_back (int): ``0`` disables the cache, a positive value bounds
			            the retained history, ``-1`` retains everything.

			    Returns:
			        tuple: ``(output, cache)``. ``output`` is the attention result
			        plus the FSMN memory. ``cache`` is the updated (or newly created)
			        key/value cache, or the incoming ``cache`` untouched when
			        look-back is disabled.
			    """
			    q_h, k_h, v_h, v = self.forward_qkv(x)
			    # Parenthesised on purpose: without the parentheses ``look_back == -1``
			    # entered this branch even with ``chunk_size=None`` and then failed on
			    # ``chunk_size[2]``.
			    if chunk_size is not None and (look_back > 0 or look_back == -1):
			        # Use an explicit end index rather than ``[:-n]``: ``[:-0]`` is an
			        # empty slice, which would keep the cache empty forever when
			        # ``chunk_size[2] == 0``.
			        stride_end = max(k_h.size(2) - chunk_size[2], 0)
			        k_h_stride = k_h[:, :, :stride_end, :]
			        v_h_stride = v_h[:, :, :stride_end, :]
			        if cache is not None:
			            # Attend over history + the full current chunk ...
			            k_h = torch.cat((cache["k"], k_h), dim=2)
			            v_h = torch.cat((cache["v"], v_h), dim=2)
			            # ... but remember only the part of the chunk before the
			            # trailing ``chunk_size[2]`` frames.
			            cache["k"] = torch.cat((cache["k"], k_h_stride), dim=2)
			            cache["v"] = torch.cat((cache["v"], v_h_stride), dim=2)
			            if look_back != -1:
			                history = look_back * chunk_size[1]
			                cache["k"] = cache["k"][:, :, -history:, :]
			                cache["v"] = cache["v"][:, :, -history:, :]
			        else:
			            cache = {"k": k_h_stride, "v": v_h_stride}
			    fsmn_memory = self.forward_fsmn(v, None)
			    q_h = q_h * self.d_k ** (-0.5)
			    scores = torch.matmul(q_h, k_h.transpose(-2, -1))
			    att_outs = self.forward_attention(v_h, scores, None)
			    return att_outs + fsmn_memory, cache


			class MultiHeadedAttentionSANMwithMask(MultiHeadedAttentionSANM):
			def __init__(self, *args, **kwargs):
			    """Forward every positional and keyword argument unchanged to the parent constructor.

			    The variadic markers are restored (``*args, **kwargs``): with the
			    previous ``args, *kwargs`` signature keyword arguments could not be
			    passed through to the parent class at all.
			    """
			    super().__init__(*args, **kwargs)
			@@ -667,6 +705,35 @@
			scores = torch.matmul(q_h, k_h.transpose(-2, -1))
			return self.forward_attention(v_h, scores, memory_mask)

			def forward_chunk(self, x, memory, cache=None, chunk_size=None, look_back=0):
			"""Attend from ``x`` to ``memory`` for one streaming chunk.

			With look-back enabled, keys/values cached from earlier chunks are
			prepended to those of the current chunk before attention.

			Args:
			x (torch.Tensor): Passed to ``self.forward_qkv`` together with ``memory``.
			memory (torch.Tensor): Passed to ``self.forward_qkv`` together with ``x``.
			cache (dict, optional): ``"k"`` / ``"v"`` tensors of earlier chunks;
			``None`` for the first chunk. Updated in place.
			chunk_size (sequence of int, optional): Only ``chunk_size[1]`` is read;
			``look_back * chunk_size[1]`` trailing frames are kept in the cache.
			look_back (int): Caching is active only for values ``> 0``
			(``-1`` is not treated specially by this method).

			Returns:
			tuple: ``(attention_output, cache)``; ``cache`` is returned as received
			when caching is off.

			"""
			q_h, k_h, v_h = self.forward_qkv(x, memory)
			# Caching needs both a chunk layout and a positive look-back.
			if chunk_size is not None and look_back > 0:
			if cache is not None:
			# Attend over history + current, then retain only the newest frames.
			k_h = torch.cat((cache["k"], k_h), dim=2)
			v_h = torch.cat((cache["v"], v_h), dim=2)
			cache["k"] = k_h[:, :, -(look_back * chunk_size[1]):, :]
			cache["v"] = v_h[:, :, -(look_back * chunk_size[1]):, :]
			else:
			# First chunk: start the cache from the current keys/values.
			cache_tmp = {"k": k_h[:, :, -(look_back * chunk_size[1]):, :],
			"v": v_h[:, :, -(look_back * chunk_size[1]):, :]}
			cache = cache_tmp
			q_h = q_h * self.d_k ** (-0.5)
			scores = torch.matmul(q_h, k_h.transpose(-2, -1))
			return self.forward_attention(v_h, scores, None), cache


			class MultiHeadSelfAttention(nn.Module):
			"""Multi-Head Attention layer.