Merge branch 'main' of github.com:alibaba-damo-academy/FunASR
merge
10个文件已修改
1个文件已重命名
1个文件已删除
| | |
| | | #git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git ${local_path} |
| | | |
| | | ## generate jsonl from wav.scp and text.txt |
| | | #python funasr/datasets/audio_datasets/scp2jsonl.py \ |
| | | #python -m funasr.datasets.audio_datasets.scp2jsonl \ |
| | | #++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \ |
| | | #++data_type_list='["source", "target"]' \ |
| | | #++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl |
| | |
| | | |
| | | @hydra.main(config_name=None, version_base=None) |
| | | def main_hydra(cfg: DictConfig): |
| | | """ |
| | | python funasr/datasets/audio_datasets/scp2jsonl.py \ |
| | | ++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \ |
| | | ++data_type_list='["source", "target"]' \ |
| | | ++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl |
| | | |
| | | """ |
| | | |
| | | |
| | | kwargs = OmegaConf.to_container(cfg, resolve=True) |
| | | |
| | | scp_file_list = kwargs.get("scp_file_list", ("/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt")) |
| | |
| | | gen_jsonl_from_wav_text_list(scp_file_list, data_type_list=data_type_list, jsonl_file_out=jsonl_file_out) |
| | | |
| | | |
| | | """ |
| | | python -m funasr.datasets.audio_datasets.scp2jsonl \ |
| | | ++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \ |
| | | ++data_type_list='["source", "target"]' \ |
| | | ++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl |
| | | """ |
| | | |
| | | if __name__ == "__main__": |
| | | main_hydra() |
| | | |
| | |
| | | try: |
| | | from rotary_embedding_torch import RotaryEmbedding |
| | | except: |
| | | print("Please install rotary_embedding_torch by: \n pip install -U rotary_embedding_torch") |
| | | print("If you want use mossformer, lease install rotary_embedding_torch by: \n pip install -U rotary_embedding_torch") |
| | | from funasr.models.transformer.layer_norm import GlobalLayerNorm, CumulativeLayerNorm, ScaleNorm |
| | | from funasr.models.transformer.embedding import ScaledSinuEmbedding |
| | | from funasr.models.transformer.mossformer import FLASH_ShareA_FFConvM |
| | |
| | | speech, speech_lengths = data_in, data_lengths |
| | | if len(speech.shape) < 3: |
| | | speech = speech[None, :, :] |
| | | if speech_lengths is None: |
| | | if speech_lengths is not None: |
| | | speech_lengths = speech_lengths.squeeze(-1) |
| | | else: |
| | | speech_lengths = speech.shape[1] |
| | | else: |
| | | # extract fbank feats |
| | |
| | | """Receptance Weighted Key Value (RWKV) block definition. |
| | | |
| | | Based/modified from https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4/src/model.py |
| | | |
| | | """ |
| | | |
| | | from typing import Dict, Optional, Tuple |
| | | #!/usr/bin/env python3 |
| | | # -*- encoding: utf-8 -*- |
| | | # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. |
| | | # MIT License (https://opensource.org/licenses/MIT) |
| | | |
| | | import torch |
| | | from typing import Dict, Optional, Tuple |
| | | |
| | | from funasr.models.rwkv_bat.rwkv_attention import EncoderSelfAttention, DecoderSelfAttention |
| | | from funasr.models.rwkv_bat.rwkv_feed_forward import FeedForward |
| | | from funasr.models.transformer.layer_norm import LayerNorm |
| | | from funasr.models.rwkv_bat.rwkv_feed_forward import FeedForward |
| | | from funasr.models.rwkv_bat.rwkv_attention import EncoderSelfAttention, DecoderSelfAttention |
| | | |
| | | |
| | | class RWKV(torch.nn.Module): |
| | | """RWKV module. |
| | |
| | | """Attention (time mixing) modules for RWKV block. |
| | | |
| | | Based/Modified from https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4/src/model.py. |
| | | |
| | | Some variables are renamed according to https://github.com/huggingface/transformers/blob/main/src/transformers/models/rwkv/modeling_rwkv.py. |
| | | |
| | | """ # noqa |
| | | #!/usr/bin/env python3 |
| | | # -*- encoding: utf-8 -*- |
| | | # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. |
| | | # MIT License (https://opensource.org/licenses/MIT) |
| | | |
| | | import math |
| | | from importlib.util import find_spec |
| | | import torch |
| | | from pathlib import Path |
| | | from importlib.util import find_spec |
| | | from typing import List, Optional, Tuple, Union |
| | | |
| | | import torch |
| | | |
| | | wkv_kernel_encoder = None |
| | | wkv_kernel_decoder = None |
| | |
| | | """RWKV encoder definition for Transducer models.""" |
| | | |
| | | import math |
| | | from typing import Dict, List, Optional, Tuple |
| | | #!/usr/bin/env python3 |
| | | # -*- encoding: utf-8 -*- |
| | | # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. |
| | | # MIT License (https://opensource.org/licenses/MIT) |
| | | |
| | | import torch |
| | | from typing import Dict, List, Optional, Tuple |
| | | |
| | | from funasr.models.encoder.abs_encoder import AbsEncoder |
| | | from funasr.register import tables |
| | | from funasr.models.rwkv_bat.rwkv import RWKV |
| | | from funasr.models.transformer.layer_norm import LayerNorm |
| | | from funasr.models.rwkv_bat.rwkv_subsampling import RWKVConvInput |
| | | from funasr.models.transformer.utils.nets_utils import make_source_mask |
| | | from funasr.models.rwkv_bat.rwkv_subsampling import RWKVConvInput |
| | | |
| | | class RWKVEncoder(AbsEncoder): |
| | | |
| | | @tables.register("encoder_classes", "RWKVEncoder") |
| | | class RWKVEncoder(torch.nn.Module): |
| | | """RWKV encoder module. |
| | | |
| | | Based on https://arxiv.org/pdf/2305.13048.pdf. |
| | |
| | | subsampling_factor: int =4, |
| | | time_reduction_factor: int = 1, |
| | | kernel: int = 3, |
| | | **kwargs, |
| | | ) -> None: |
| | | """Construct a RWKVEncoder object.""" |
| | | super().__init__() |
| | |
| | | """Feed-forward (channel mixing) module for RWKV block. |
| | | |
| | | Based/Modified from https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4/src/model.py |
| | | |
| | | Some variables are renamed according to https://github.com/huggingface/transformers/blob/main/src/transformers/models/rwkv/modeling_rwkv.py. |
| | | |
| | | """ # noqa |
| | | |
| | | from typing import List, Optional, Tuple |
| | | #!/usr/bin/env python3 |
| | | # -*- encoding: utf-8 -*- |
| | | # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. |
| | | # MIT License (https://opensource.org/licenses/MIT) |
| | | |
| | | import torch |
| | | from typing import List, Optional, Tuple |
| | | |
| | | |
| | | class FeedForward(torch.nn.Module): |
| | |
| | | #!/usr/bin/env python3 |
| | | # -*- coding: utf-8 -*- |
| | | # -*- encoding: utf-8 -*- |
| | | # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. |
| | | # MIT License (https://opensource.org/licenses/MIT) |
| | | |
| | | # Copyright 2019 Shigeki Karita |
| | | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) |
| | | |
| | | """Subsampling layer definition.""" |
| | | import numpy as np |
| | | import torch |
| | | import torch.nn.functional as F |
| | | from funasr.models.transformer.embedding import PositionalEncoding |
| | | import logging |
| | | from funasr.models.scama.utils import sequence_mask |
| | | from funasr.models.transformer.utils.nets_utils import sub_factor_to_params, pad_to_len |
| | | from typing import Optional, Tuple, Union |
| | | import math |
| | | import torch |
| | | from typing import Optional, Tuple, Union |
| | | from funasr.models.transformer.utils.nets_utils import pad_to_len |
| | | |
| | | |
| | | class TooShortUttError(Exception): |
| | | """Raised when the utt is too short for subsampling. |
| | |
| | | ) |
| | | pbar.set_description(description) |
| | | if self.writer: |
| | | self.writer.add_scalar(f'rank{self.local_rank}_Loss/train', loss.item(), |
| | | epoch*len(self.dataloader_train) + batch_idx) |
| | | self.writer.add_scalar(f'rank{self.local_rank}_Loss/train', loss.item(), self.batch_total) |
| | | self.writer.add_scalar(f'rank{self.local_rank}_lr/train', lr, self.batch_total) |
| | | for key, var in stats.items(): |
| | | self.writer.add_scalar(f'rank{self.local_rank}_{key}/train', var.item(), |
| | | epoch * len(self.dataloader_train) + batch_idx) |
| | | self.writer.add_scalar(f'rank{self.local_rank}_{key}/train', var.item(), self.batch_total) |
| | | for key, var in speed_stats.items(): |
| | | self.writer.add_scalar(f'rank{self.local_rank}_{key}/train', eval(var), |
| | | epoch * len(self.dataloader_train) + batch_idx) |
| | | |
| | | # if batch_idx == 2: |
| | | # break |
| | | self.writer.add_scalar(f'rank{self.local_rank}_{key}/train', eval(var), self.batch_total) |
| | | |
| | | |
| | | pbar.close() |
| | | |
| | | def _validate_epoch(self, epoch): |
| | |
| | | |
| | | if (batch_idx+1) % self.log_interval == 0 or (batch_idx+1) == len(self.dataloader_val): |
| | | pbar.update(self.log_interval) |
| | | time_now = datetime.now() |
| | | time_now = time_now.strftime("%Y-%m-%d %H:%M:%S") |
| | | description = ( |
| | | f"{time_now}, " |
| | | f"rank: {self.local_rank}, " |
| | | f"validation epoch: {epoch}/{self.max_epoch}, " |
| | | f"step: {batch_idx+1}/{len(self.dataloader_val)}, " |